├── .github └── workflows │ ├── features-and-predictions.yml │ ├── fraud-batch-inference-pipeline.yml │ └── fraud-feature-pipelines.yml ├── .gitignore ├── LICENSE ├── README.md ├── assets ├── README.md ├── actual_iris.png ├── confusion_matrix.png ├── credit_cards.parquet ├── df_recent.png ├── images │ ├── card_horizontal.jpg │ └── serverless-ml-architecture.svg ├── latest_iris.png ├── profiles.parquet └── transactions.parquet ├── requirements.txt └── src ├── 00-intro ├── Feature-Store-Intro.ipynb ├── Pandas-Intro.ipynb ├── green-apples-vs-oranges.ipynb ├── red-and-green-apples-vs-oranges.ipynb └── streamlit-example.py ├── 01-module ├── assets │ ├── Setosa.png │ ├── Versicolor.png │ ├── Virginica.png │ ├── confusion-matrix.png │ └── iris.png ├── auto-commit-and-push.ipynb ├── iris-batch-inference-pipeline.ipynb ├── iris-feature-pipeline.ipynb ├── iris-train-pipeline.ipynb ├── iris_end_to_end_ml_pipeline.ipynb └── scripts │ └── run-feature-and-prediction-pipelines.sh ├── 02-module ├── 1_backfill_cc_feature_groups.ipynb ├── 2_cc_feature_pipeline.ipynb ├── scripts │ └── run-fraud-feature-pipelines.sh ├── sml │ ├── cc_features.py │ └── synthetic_data.py └── test_sml │ └── test_sml.py └── 03-module ├── 3_model_training.ipynb ├── 4_batch_predictions.ipynb ├── iris_with_sklearn_transformer.ipynb └── scripts └── run-fraud-batch-inference.sh /.github/workflows/features-and-predictions.yml: -------------------------------------------------------------------------------- 1 | name: iris-feature-and-prediction-pipelines 2 | 3 | on: 4 | workflow_dispatch: 5 | # schedule: 6 | # - cron: '11 11 * * *' 7 | 8 | jobs: 9 | test_schedule: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: checkout repo content 13 | uses: actions/checkout@v2 14 | 15 | - name: setup python 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: '3.8.1' 19 | 20 | - name: install python packages 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r requirements.txt 24 | 25 | - name: execute python workflows from bash script 26 | env: 27 | HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }} 28 | run: ./src/01-module/scripts/run-feature-and-prediction-pipelines.sh 29 | 30 | - name: publish github pages 31 | uses: stefanzweifel/git-auto-commit-action@v4 32 | with: 33 | commit_message: "Automated graph published" 34 | 35 | # Optional. Local and remote branch name where commit is going to be pushed 36 | # to. Defaults to the current branch. 37 | # You might need to set `create_branch: true` if the branch does not exist. 38 | branch: main 39 | 40 | # Optional. Options used by `git-commit`. 41 | # See https://git-scm.com/docs/git-commit#_options 42 | commit_options: '--no-verify --signoff' 43 | 44 | # Optional glob pattern of files which should be added to the commit 45 | # Defaults to all (.) 46 | file_pattern: assets/latest_iris.png assets/actual_iris.png assets/confusion_matrix.png assets/df_recent.png 47 | 48 | # Optional. Local file path to the repository. 49 | # Defaults to the root of the repository. 50 | repository: . 51 | 52 | # Optional commit user and author settings 53 | commit_user_name: My GitHub Actions Bot # defaults to "github-actions[bot]" 54 | commit_user_email: my-github-actions-bot@example.org # defaults to "github-actions[bot]@users.noreply.github.com" 55 | commit_author: Author # defaults to author of the commit that triggered the run 56 | 57 | # Optional. Tag name being created in the local repository and 58 | # pushed to remote repository and defined branch. 
59 | #tagging_message: 'v1.0.0' 60 | 61 | # Optional. Option used by `git-status` to determine if the repository is 62 | # dirty. See https://git-scm.com/docs/git-status#_options 63 | #status_options: '--untracked-files=no' 64 | 65 | # Optional. Options used by `git-add`. 66 | # See https://git-scm.com/docs/git-add#_options 67 | #add_options: '-u' 68 | 69 | # Optional. Options used by `git-push`. 70 | # See https://git-scm.com/docs/git-push#_options 71 | #push_options: '--force' 72 | 73 | # Optional. Disable dirty check and always try to create a commit and push 74 | skip_dirty_check: true 75 | 76 | # Optional. Skip internal call to `git fetch` 77 | skip_fetch: false 78 | 79 | # Optional. Skip internal call to `git checkout` 80 | skip_checkout: false 81 | 82 | # Optional. Prevents the shell from expanding filenames. 83 | # Details: https://www.gnu.org/software/bash/manual/html_node/Filename-Expansion.html 84 | disable_globbing: true 85 | 86 | # Optional. Create given branch name in local and remote repository. 87 | create_branch: false 88 | 89 | -------------------------------------------------------------------------------- /.github/workflows/fraud-batch-inference-pipeline.yml: -------------------------------------------------------------------------------- 1 | name: fraud-batch-inference-pipeline 2 | 3 | on: 4 | workflow_dispatch: 5 | # schedule: 6 | # - cron: '11 11 * * *' 7 | 8 | jobs: 9 | test_schedule: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: checkout repo content 13 | uses: actions/checkout@v2 14 | 15 | - name: setup python 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: '3.8.1' 19 | 20 | - name: install python packages 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r requirements.txt 24 | 25 | - name: execute python workflows from bash script 26 | env: 27 | HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }} 28 | run: ./src/03-module/scripts/run-fraud-batch-inference.sh 29 | 30 | -------------------------------------------------------------------------------- /.github/workflows/fraud-feature-pipelines.yml: -------------------------------------------------------------------------------- 1 | name: fraud-feature-pipelines 2 | 3 | on: 4 | workflow_dispatch: 5 | # schedule: 6 | # - cron: '11 11 * * *' 7 | 8 | jobs: 9 | test_schedule: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: checkout repo content 13 | uses: actions/checkout@v2 14 | 15 | - name: setup python 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: '3.8.1' 19 | 20 | - name: install python packages 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r requirements.txt 24 | 25 | - name: execute python workflows from bash script 26 | env: 27 | HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }} 28 | run: ./src/02-module/scripts/run-fraud-feature-pipelines.sh 29 | 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | *.nbconvert.ipynb 7 | *~ 8 | 9 | .hw_api_key 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | 
# PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 
27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. 
Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![readme header](/assets//images/card_horizontal.jpg) 2 | 3 | 4 | 5 | 6 | 7 | 8 | # **[Beyond Notebooks - Serverless ML](https://www.serverless-ml.org)** 9 | ***Build Batch and Real-Time Prediction Services with Python*** 10 | 11 | ![serverless architecture](/assets/images/serverless-ml-architecture.svg "Serverless Architecture") 12 | 13 | # **Overview** 14 | You should not need to be an expert in Kubernetes or cloud computing to build an end-to-end service that makes intelligent decisions with the help of a ML model. 
Serverless ML makes it easy to build a system that uses ML models to make predictions. 15 | 16 | You do not need to install, upgrade, or operate any systems. You only need to be able to write Python programs that can be scheduled to run as pipelines. The features and models your pipelines produce are managed by a serverless feature store / model registry. We will also show you how to build a UI for your prediction service by writing Python and some HTML. 17 | 18 | **Prerequisites:** Python - Pandas - Github 19 | 20 | # **Modules** 21 | - ## **Module 00** - Introduction and optional content. 22 | - Why Serverless ML: [Video](https://www.youtube.com/watch?v=zM2_m898P5g) | [Slides](https://drive.google.com/file/d/15gwryDoHq88tgxu8CoCbTqr5L9YN9O5p/view?usp=sharing) 23 | - Introduction to the course: [Video](https://www.youtube.com/watch?v=FM1YkIl1wXI&list=PLMeDf8qRRqgU_-erq30v-k8_it4pOqhoQ&index=3) | [slides](https://drive.google.com/file/d/1a5uZHhVSUyxxjrESFea9vONovKROra4L/view?usp=sharing) 24 | - Development Environment & Platforms [Video](https://www.youtube.com/watch?v=9kNjky0MQtc&list=PLMeDf8qRRqgU_-erq30v-k8_it4pOqhoQ&index=3) | [slides](https://drive.google.com/file/d/1LTTHkwV8RirYaz1MeZtoYgTc9TRSrBwr/view?usp=sharing) 25 | 26 | - ***Introduction to Machine Learning (ML 101)*** [Video](https://www.youtube.com/watch?v=RmAGTZ7dy58&list=PLMeDf8qRRqgU_-erq30v-k8_it4pOqhoQ&index=4) | [slides](https://drive.google.com/file/d/1HXsrSRPcBMW53lgnBnYb95m5eS9oLqRk/view?usp=sharing) 27 | 28 | - ## **Module 01** - Pandas and ML Pipelines in Python. Write your first serverless App. 29 | - Full Lecture: [Video](https://www.youtube.com/watch?v=j-XnCflCc0I) | [Slides](https://drive.google.com/file/d/1L8DHGC5xo0NlNe8xfh4xf4NZV1CEGBA6/view?usp=sharing) 30 | 31 | - [Lab](https://www.youtube.com/watch?v=zAD3miW0Og0) | [Slides](https://drive.google.com/file/d/1hve9nVrImRhNE8lE26zPcr3X1DDDk7uD/view?usp=sharing) | [Homework form](https://forms.gle/2p5odBdpAqvavH1T7) 32 | 33 | - ## **Module 02** - Data modeling and the Feature Store. The Credit-card fraud prediction service. 34 | - Full Lecture: [Video](https://youtu.be/tpxZh8lbcBk) | [Slides](https://drive.google.com/file/d/1HgAKsHnOms1XCtl_KIEuELudTLtDkhxk/view?usp=sharing) 35 | 36 | - [Lab](https://www.youtube.com/watch?v=niPayagVxFg) | [Slides](https://drive.google.com/file/d/1_1oDN5nfpWSUpKNlls45HLllQ75yAWd-/view?usp=sharing) | [Homework form](https://forms.gle/5g9XtaeBEigKEirGA) 37 | - ## **Module 03** - Training Pipelines, Inference Pipelines, and the Model Registry. 38 | - Full lecture: [Video](https://youtu.be/BD1UOJs1Bvo) | [Slides](https://drive.google.com/file/d/1XhfnH7DzwDqQKS6WxDVqWFFas0fi_jnJ/view?usp=sharing) 39 | 40 | - [Lab](https://youtu.be/QfzrKgLqEXc) | [Slides](https://drive.google.com/file/d/1jITx5HGh2uM5vAeknvCaeN6ZPOc2i8AS/view?usp=sharing) 41 | - #### **Module 04** - Bring a Prediction Service to Life with a User Interface. 42 | - #### **Module 05** - Automated Testing and Versioning of features and models. 43 | - #### **Module 06** - Real-time serverless machine learning systems. Project presentation. 
44 | 45 | --- 46 | 47 | ## **Learning Outcomes:** 48 | - Learn to develop and operate AI-enabled (prediction) services on serverless infrastructure 49 | - Develop and run serverless feature pipelines 50 | - Deploy features and models to serverless infrastructure 51 | - Train models and run batch inference pipelines 52 | - Develop a serverless UI for your prediction service 53 | - Learn MLOps fundamentals: versioning, testing, data validation, and operations 54 | - Develop and run a real-time serverless machine learning system 55 | 56 | ## **Course Contents:** 57 | - Pandas and ML Pipelines in Python. Write your first serverless App. 58 | - The Feature Store for Machine Learning. Feature engineering for a credit-card fraud serverless App. 59 | - Training Pipelines and Inference Pipelines 60 | - Bring a Prediction Service to Life with a User Interface (Gradio, GitHub Pages, Streamlit) 61 | - Automated Testing and Versioning of features and models 62 | - Real-time serverless machine learning systems. Project presentation. 63 | 64 | ## **Who is the target audience?** 65 | You have taken a course in machine learning (ML) and you can program in Python. You want to take the next step beyond training models on static datasets in notebooks. You want to be able to build a prediction service around your model. Maybe you work at an enterprise and want to demonstrate your models’ value to stakeholders in their own language. Maybe you want to include ML in an existing application or system. 66 | 67 | ## **Why is this course different?** 68 | You don’t need any operations experience beyond using GitHub and writing Python code. You will learn the essentials of MLOps: versioning artifacts, testing artifacts, validating artifacts, and monitoring and upgrading running systems. You will work with raw and live data - you will need to engineer features in pipelines. You will learn how to select, extract, compute, and transform features. 69 | 70 | ## **Will this course cost me money?** 71 | No. You will become a serverless machine learning engineer without having to pay to run your serverless pipelines or to manage your features/models/user-interface. We will use GitHub Actions and Hopsworks, which both have generous time-unlimited free tiers. 72 | 73 | **Register now at [Serverless ML Course](https://www.serverless-ml.org/register)** 74 | 75 | ## **Timeline** 76 | _Self-paced_ 77 | 78 | ## **Requirements** 79 | - A **Python** environment including a notebook (Jupyter or Colaboratory) 80 | - https://github.com account 81 | - https://hopsworks.ai account 82 | 83 | # **Key Technologies** 84 | 85 | ## **Development environment** 86 | You can write, test, debug, and train your models in a Python IDE. We will focus on notebooks and Python programs. You can use Jupyter notebooks or Colaboratory. 87 | 88 | ## **GitHub** 89 | GitHub manages your code, GitHub Actions runs your workflows, and GitHub Pages hosts the user interface for non-interactive applications. GitHub Actions offers a free tier of 500 MB of storage and 2,000 minutes per month to run your pipelines. 90 | https://docs.github.com/en/billing/managing-billing-for-github-actions/about-billing-for-github-actions 91 | 92 | ## **Hopsworks** 93 | [Hopsworks.ai](https://app.hopsworks.ai) has a free tier of 10 GB of storage. 94 |
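A minimal sketch of how the pipelines in this repository authenticate against Hopsworks (assuming the `HOPSWORKS_API_KEY` environment variable is set, as the GitHub Actions workflows above do via a repository secret):

```python
import hopsworks

# hopsworks.login() picks up the HOPSWORKS_API_KEY environment variable in CI,
# or interactively prompts for an API key when run locally.
project = hopsworks.login()
fs = project.get_feature_store()
```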
95 | 96 | --- 97 | 98 | ## **Useful Resources** 99 | | Name | Description | Link | 100 | |------|-------------|------| 101 | |**Awesome MLOps**| A collection of links and resources for MLOps.| https://github.com/visenger/awesome-mlops| 102 | |**Machine Learning Ops**| A collection of resources on how to facilitate Machine Learning Ops with GitHub.| https://mlops.githubapp.com/| 103 | |**MLOps Toys**| A curated list of MLOps projects.|https://mlops.toys/| 104 | |**MLOps Zoomcamp**| Teaches practical aspects of productionizing ML services.|https://github.com/DataTalksClub/mlops-zoomcamp| 105 | |**PYSLACKERS**|A large open community for Python programming enthusiasts.|https://pyslackers.com/web| 106 | |**Feature Store Org**|An open community for everything feature stores.|https://www.featurestore.org| 107 | 108 | 109 | ## **Other MLOps Courses** 110 | | Name | Description | Link | 111 | |------|-------------|------| 112 | |**MLOps Zoomcamp**| A DevOps-style course with Python and Docker as prerequisites.| https://github.com/DataTalksClub/mlops-zoomcamp | 113 | |**Full Stack Deep Learning**| This course shares best practices for the full stack; topics range from problem selection to dataset management to monitoring.| https://fullstackdeeplearning.com/| 114 | |**MLOps course**| A series of lessons teaching how to apply ML to build production-grade products (by Goku Mohandas).|https://github.com/GokuMohandas/mlops-course | 115 | 116 | # **Support and Partners** 117 |
118 | [FSorg](https://www.featurestore.org) 119 | 120 | [Hopsworks](https://hopsworks.ai)
129 | -------------------------------------------------------------------------------- /assets/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/serverless-ml-course/81f75deaa7581d2fee33c6c969ff507ea482d720/assets/README.md -------------------------------------------------------------------------------- /assets/actual_iris.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/serverless-ml-course/81f75deaa7581d2fee33c6c969ff507ea482d720/assets/actual_iris.png -------------------------------------------------------------------------------- /assets/confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/serverless-ml-course/81f75deaa7581d2fee33c6c969ff507ea482d720/assets/confusion_matrix.png -------------------------------------------------------------------------------- /assets/credit_cards.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/serverless-ml-course/81f75deaa7581d2fee33c6c969ff507ea482d720/assets/credit_cards.parquet -------------------------------------------------------------------------------- /assets/df_recent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/serverless-ml-course/81f75deaa7581d2fee33c6c969ff507ea482d720/assets/df_recent.png -------------------------------------------------------------------------------- /assets/images/card_horizontal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/serverless-ml-course/81f75deaa7581d2fee33c6c969ff507ea482d720/assets/images/card_horizontal.jpg -------------------------------------------------------------------------------- /assets/latest_iris.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/serverless-ml-course/81f75deaa7581d2fee33c6c969ff507ea482d720/assets/latest_iris.png -------------------------------------------------------------------------------- /assets/profiles.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/serverless-ml-course/81f75deaa7581d2fee33c6c969ff507ea482d720/assets/profiles.parquet -------------------------------------------------------------------------------- /assets/transactions.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/serverless-ml-course/81f75deaa7581d2fee33c6c969ff507ea482d720/assets/transactions.parquet -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | faker 2 | parsedatetime 3 | hopsworks 4 | nbconvert 5 | scikit-learn 6 | plotly 7 | Pillow 8 | seaborn 9 | dataframe-image 10 | -------------------------------------------------------------------------------- /src/00-intro/Feature-Store-Intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "076a7f63", 6 | "metadata": {}, 7 | "source": [ 8 | "## Feature Store Basics\n", 9 | "\n", 10 
| "This notebook introduces the basics of working with the Hopsworks API and Pandas DataFrames.\n", 11 | "\n", 12 | "First, we will define a Pandas DataFrame with 4 credit card transactions in 3 different cities with the same credit card. The last 2 credit card transactions are labeled as 'fraud', while the first 2 transactions are labeled as 'not fraud'." 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "id": "9411b6f0", 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "
" 85 | ], 86 | "text/plain": [ 87 | " credit_card_number trans_datetime amount location fraud\n", 88 | "0 1111 2222 3333 4444 2022-01-01 08:44:00 142.34 Sao Paolo False\n", 89 | "1 1111 2222 3333 4444 2022-01-02 19:44:00 12.34 Rio De Janeiro False\n", 90 | "2 1111 2222 3333 4444 2022-01-02 20:44:00 66.29 Stockholm True\n", 91 | "3 1111 2222 3333 4444 2022-01-02 20:55:00 112.33 Stockholm True" 92 | ] 93 | }, 94 | "execution_count": 1, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "import pandas as pd\n", 101 | "\n", 102 | "data = { \n", 103 | " 'credit_card_number': ['1111 2222 3333 4444', '1111 2222 3333 4444','1111 2222 3333 4444',\n", 104 | " '1111 2222 3333 4444'],\n", 105 | " 'trans_datetime': ['2022-01-01 08:44', '2022-01-02 19:44', '2022-01-02 20:44', '2022-01-02 20:55'],\n", 106 | " 'amount': [142.34, 12.34, 66.29, 112.33],\n", 107 | " 'location': ['Sao Paolo', 'Rio De Janeiro', 'Stockholm', 'Stockholm'],\n", 108 | " 'fraud': [False, False, True, True] \n", 109 | "}\n", 110 | "\n", 111 | "df = pd.DataFrame.from_dict(data)\n", 112 | "df['trans_datetime']= pd.to_datetime(df['trans_datetime'])\n", 113 | "df" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "304e9f2d", 119 | "metadata": {}, 120 | "source": [ 121 | "## Connect to Hopsworks\n", 122 | "\n", 123 | "You need an API key to connect. First, log in to Hopsworks, then run this code. It will provide a link to get your API key, which you then copy and paste into the text box that appears below this cell.\n", 124 | "\n", 125 | "It is good practice to save this API key somewhere safe so you don't have to create a new one every time you use Hopsworks. If you run this code on your laptop, a copy of the API key will be cached locally in this directory in a file with restricted permissions, so you don't always have to re-enter the API key." 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 2, 131 | "id": "c855e3c0", 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "name": "stdout", 136 | "output_type": "stream", 137 | "text": [ 138 | "Connected. Call `.close()` to terminate connection gracefully.\n", 139 | "\n", 140 | "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/398\n", 141 | "Connected. Call `.close()` to terminate connection gracefully.\n" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "import hopsworks\n", 147 | "proj = hopsworks.login()\n", 148 | "fs = proj.get_feature_store()" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "id": "e1d38175", 154 | "metadata": {}, 155 | "source": [ 156 | "### Create a Feature Group\n", 157 | "\n", 158 | "A feature group is a table of features that are computed together in the same feature pipeline and written as a DataFrame to the Feature Store. You should have a unique identifier for each row, which may be one or more columns, and which you define as the `primary_key`. You may also have a column that represents the timestamp or datetime for when row values were observed. If so, you should specify the `event_time` column when creating the Feature Group.\n", 159 | "\n", 160 | "Hopsworks has comprehensive documentation on Feature Groups. 
Click on these links to learn more.\n", 161 | "\n", 162 | "* [Feature Group Concept](https://docs.hopsworks.ai/3.0/concepts/fs/feature_group/fg_overview/)\n", 163 | "* [Feature Group Creation Guide](https://docs.hopsworks.ai/3.0/user_guides/fs/feature_group/create/)\n", 164 | "* [Feature Group API Docs](https://docs.hopsworks.ai/feature-store-api/3.0/generated/api/feature_group_api/)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 3, 170 | "id": "3ea7a88a", 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "fg = fs.get_or_create_feature_group(\n", 175 | " name=\"credit_card_transactions\",\n", 176 | " version=1,\n", 177 | " description=\"Credit Card Transaction data\",\n", 178 | " primary_key=['credit_card_number'],\n", 179 | " event_time='trans_datetime'\n", 180 | ") " 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "id": "990c0972", 186 | "metadata": {}, 187 | "source": [ 188 | "### Write your DataFrame to the Feature Group\n", 189 | "When you write your DataFrame to the feature group, first the DataFrame is copied to Hopsworks. \n", 190 | "Then a backfill ingestion job is run on Hopsworks to insert/append the DataFrame to the Feature Group. \n", 191 | "The job is a Spark job, and the data is stored in an Apache Hudi table in Hopsworks.\n", 192 | "\n", 193 | "It will take about 1 minute for the ingestion job to complete.\n", 194 | "If you don't want to wait 1 minute, you can make the ingestion job run in the background with:\n", 195 | "\n", 196 | "\n", 197 | " fg.insert(df, write_options={\"wait_for_job\": False})" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 4, 203 | "id": "3380610c", 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "data": { 208 | "application/vnd.jupyter.widget-view+json": { 209 | "model_id": "41c290a8882f4dbba18d2598b478aea6", 210 | "version_major": 2, 211 | "version_minor": 0 212 | }, 213 | "text/plain": [ 214 | "Uploading Dataframe: 0.00% | | Rows 0/4 | Elapsed Time: 00:00 | Remaining Time: ?" 215 | ] 216 | }, 217 | "metadata": {}, 218 | "output_type": "display_data" 219 | }, 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "Launching offline feature group backfill job...\n", 225 | "Backfill Job started successfully, you can follow the progress at \n", 226 | "https://c.app.hopsworks.ai/p/398/jobs/named/credit_card_transactions_1_offline_fg_backfill/executions\n" 227 | ] 228 | }, 229 | { 230 | "data": { 231 | "text/plain": [ 232 | "(, None)" 233 | ] 234 | }, 235 | "execution_count": 4, 236 | "metadata": {}, 237 | "output_type": "execute_result" 238 | } 239 | ], 240 | "source": [ 241 | "fg.insert(df)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "id": "0ef2bc45", 247 | "metadata": {}, 248 | "source": [ 249 | "## Read using Feature Views\n", 250 | "\n", 251 | "When you want to use features to train or serve models, you create a Feature View by first selecting features from Feature Groups. Here, we only have 1 Feature Group, and we select 3 features from it, returning a `query` object. The `query` object defines the set of features (or schema) for a Feature View. \n", 252 | "\n", 253 | "You create a Feature View with a `query` object (specifying the features and any extra columns that might be needed for inference (but not training)), providing a name and version, and specifying the columns that are `labels`, that is, the target your machine learning algorithm will try to optimize."
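, "\n", "If a feature view with this name and version already exists from an earlier run, `create_feature_view` fails; the committed output below shows exactly this `RestAPIError`. A minimal sketch of a guard, assuming only the `get_feature_view` call that is visible in that traceback:\n", "\n", "```python\n", "try:\n", "    fv = fs.create_feature_view(name=\"credit_card_transactions\", version=1,\n", "                                description=\"Features from the credit_card_transactions FG\",\n", "                                labels=[\"fraud\"], query=query)\n", "except Exception:\n", "    # reuse the existing feature view instead of failing\n", "    fv = fs.get_feature_view(name=\"credit_card_transactions\", version=1)\n", "```"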
254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 5, 259 | "id": "ac7e3a12", 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "ename": "RestAPIError", 264 | "evalue": "Metadata operation error: (url: https://c.app.hopsworks.ai/hopsworks-api/api/project/398/featurestores/335/featureview). Server response: \nHTTP code: 400, HTTP reason: Bad Request, error code: 270179, error msg: The provided feature view name and version already exists, user msg: Feature view: credit_card_transactions, version: 1", 265 | "output_type": "error", 266 | "traceback": [ 267 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 268 | "\u001b[0;31mRestAPIError\u001b[0m Traceback (most recent call last)", 269 | "\u001b[0;32m/tmp/ipykernel_3638955/1751349647.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mquery\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"amount\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"location\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"fraud\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m fv = fs.create_feature_view(name=\"credit_card_transactions\",\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mversion\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mdescription\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"Features from the credit_card_transactions FG\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 270 | "\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/hsfs/feature_store.py\u001b[0m in \u001b[0;36mcreate_feature_view\u001b[0;34m(self, name, query, version, description, labels, transformation_functions)\u001b[0m\n\u001b[1;32m 947\u001b[0m \u001b[0mtransformation_functions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransformation_functions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 948\u001b[0m )\n\u001b[0;32m--> 949\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_feature_view_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfeat_view\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 950\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 951\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_feature_view\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mversion\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 271 | "\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/hsfs/core/feature_view_engine.py\u001b[0m in \u001b[0;36msave\u001b[0;34m(self, feature_view_obj)\u001b[0m\n\u001b[1;32m 67\u001b[0m ]\n\u001b[1;32m 68\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_transformation_function_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattach_transformation_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfeature_view_obj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 69\u001b[0;31m \u001b[0mupdated_fv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_feature_view_api\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpost\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfeature_view_obj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 70\u001b[0m print(\n\u001b[1;32m 71\u001b[0m \u001b[0;34m\"Feature view created successfully, explore it at \\n\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 272 | "\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/hsfs/core/feature_view_api.py\u001b[0m in \u001b[0;36mpost\u001b[0;34m(self, feature_view_obj)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[0mheaders\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m\"content-type\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"application/json\"\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 53\u001b[0m return feature_view_obj.update_from_response_json(\n\u001b[0;32m---> 54\u001b[0;31m self._client._send_request(\n\u001b[0m\u001b[1;32m 55\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_POST\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_base_path\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 273 | "\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/hsfs/decorators.py\u001b[0m in \u001b[0;36mif_connected\u001b[0;34m(inst, *args, **kwargs)\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0minst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_connected\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mNoHopsworksConnectionError\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 35\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minst\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 36\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mif_connected\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 274 | "\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/hsfs/client/base.py\u001b[0m in \u001b[0;36m_send_request\u001b[0;34m(self, method, path_params, query_params, headers, data, stream, files)\u001b[0m\n\u001b[1;32m 169\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 170\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstatus_code\u001b[0m \u001b[0;34m//\u001b[0m \u001b[0;36m100\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 171\u001b[0;31m \u001b[0;32mraise\u001b[0m 
\u001b[0mexceptions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mRestAPIError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 172\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 173\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mstream\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 275 | "\u001b[0;31mRestAPIError\u001b[0m: Metadata operation error: (url: https://c.app.hopsworks.ai/hopsworks-api/api/project/398/featurestores/335/featureview). Server response: \nHTTP code: 400, HTTP reason: Bad Request, error code: 270179, error msg: The provided feature view name and version already exists, user msg: Feature view: credit_card_transactions, version: 1" 276 | ] 277 | } 278 | ], 279 | "source": [ 280 | "query = fg.select([\"amount\", \"location\", \"fraud\"])\n", 281 | "\n", 282 | "fv = fs.create_feature_view(name=\"credit_card_transactions\",\n", 283 | " version=1,\n", 284 | " description=\"Features from the credit_card_transactions FG\",\n", 285 | " labels=[\"fraud\"],\n", 286 | " query=query)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "id": "548b2099", 292 | "metadata": {}, 293 | "source": [ 294 | "### Splitting into Train/Test sets\n", 295 | "\n", 296 | "With a Feature View, you can read train and test sets directly as Pandas DataFrames - similar to scikit-learn.\n", 297 | "Here, \n", 298 | "\n", 299 | "* `X_train` is the features of our train set, \n", 300 | "* `y_train` is the labels of our train set, \n", 301 | "* `X_test` is the features of our test set, \n", 302 | "* `y_test` is the labels of our test set." 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "id": "792da473", 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "X_train, X_test, y_train, y_test = fv.train_test_split(0.5)\n", 313 | "X_train" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "id": "5fef54b2", 319 | "metadata": {}, 320 | "source": [ 321 | "### Saving training data as files\n", 322 | "Sometimes, if you have a large volume of training data, it is better to save training data as files. Then read the files in your training pipeline. You can create training data as CSV files that is randomly split into train/test sets as follows (the `td_version` is the version of the training data for this feature view, and you can track the progress of the job used to create the training data using the `td_job` object)." 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "id": "578e424f", 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "td_version, td_job = fv.create_train_test_split(\n", 333 | " description = 'Transactions fraud batch training dataset',\n", 334 | " data_format = 'csv',\n", 335 | " test_size = 0.5,\n", 336 | " write_options = {'wait_for_job': True},\n", 337 | " coalesce = True,\n", 338 | ")" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "id": "186b958f", 344 | "metadata": {}, 345 | "source": [ 346 | "## Training Data as files\n", 347 | "The training data is now stored as a CSV file on Hopsworks under `Project Settings` -> `File Browser` -> _Training_Datasets.\n", 348 | " \n", 349 | "You can read the training data as split train/test sets with the following. Note the parameter `td_version` we pass here. 
A feature view can have many training datasets, so you need to supply the version you want. " 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "id": "11e53364", 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "X_train, X_test, y_train, y_test = fv.get_train_test_split(td_version)\n", 360 | "X_train" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "id": "43f6d416", 366 | "metadata": {}, 367 | "source": [ 368 | "### Aggregations\n", 369 | "\n", 370 | "Compute the total amount spent on the credit card by first grouping all the rows together with the same `credit_card_number` and then summing up their amounts. \n", 371 | "\n", 372 | "The code first creates a new DataFrame with only the `credit_card_number` and `amount` columns, then the logic of the group-by can be described as \n", 373 | "\n", 374 | "    for-each (`credit_card_number`) do sum(amount)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "id": "e4be1e90", 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "df2 = df[[\"credit_card_number\", \"amount\"]].groupby(\"credit_card_number\").sum()\n", 385 | "df2.rename(columns={\"amount\": \"total_spent\"}, inplace=True)\n", 386 | "df2.info()" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "id": "be0be838", 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "df2" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "id": "d187468e", 402 | "metadata": {}, 403 | "source": [ 404 | "We might also want to know at what point in time that total was valid, and add a column with the datetime of the last (most recent) credit card transaction." 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "id": "04244e4b", 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "df2[\"as_of_datetime\"] = df[[\"credit_card_number\", \"trans_datetime\"]].groupby(\"credit_card_number\").max()\n", 415 | "df2" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "id": "ad27a872", 421 | "metadata": {}, 422 | "source": [ 423 | "The `groupby` operation sets `credit_card_number` as the index of our DataFrame.\n", 424 | "We want `credit_card_number` as a column, as Pandas indexes are not written to the Feature Group.\n", 425 | "We can move the index to a column using `reset_index`." 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "id": "4ec0104f", 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "df2.reset_index(inplace=True)\n", 436 | "df2" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "id": "a8253595", 442 | "metadata": {}, 443 | "source": [ 444 | "We create a feature group to store the contents of `df2` with our aggregated credit card spending information."
445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "id": "67c321ed", 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "fg2 = fs.get_or_create_feature_group(\n", 455 | " name=\"credit_card_spending\",\n", 456 | " version=1,\n", 457 | " description=\"Credit Card Spending\",\n", 458 | " primary_key=['credit_card_number'],\n", 459 | " event_time='as_of_datetime'\n", 460 | ") " 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": null, 466 | "id": "712f283f", 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "fg2.insert(df2, write_options={\"wait_for_job\": False})" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "id": "d0b7aedf", 476 | "metadata": {}, 477 | "source": [ 478 | "Let's add some more data to our original feature group." 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "id": "7ac3a4ec", 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "more_data = { \n", 489 | " 'credit_card_number': ['9999 8888 7777 6666', '9999 8888 7777 6666','9999 8888 7777 6666',\n", 490 | " '9999 8888 7777 6666'],\n", 491 | " 'trans_datetime': ['2022-01-02 04:11', '2022-01-03 07:24', '2022-01-05 10:33', '2022-01-05 11:50'],\n", 492 | " 'amount': [55.67, 84, 77.95, 183],\n", 493 | " 'location': ['San Francisco', 'San Francisco', 'Dublin', 'Dublin'],\n", 494 | " 'fraud': [False, False, False, False] \n", 495 | "}\n", 496 | "\n", 497 | "df3 = pd.DataFrame.from_dict(more_data)\n", 498 | "df3['trans_datetime']= pd.to_datetime(df3['trans_datetime'])\n", 499 | "\n", 500 | "fg = fs.get_feature_group(name=\"credit_card_transactions\", version=1)\n", 501 | "\n", 502 | "fg.insert(df3, write_options={\"wait_for_job\": False})" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "id": "74a1d883", 508 | "metadata": {}, 509 | "source": [ 510 | "Now let's compute how much money has been spent on the card since the last time we computed the amount spent." 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "id": "e7a2dc55", 516 | "metadata": {}, 517 | "source": [ 518 | "## Time Series: Window Aggregations\n", 519 | "\n", 520 | "Compute the amount of money spent per day (using a window length of '1D').\n", 521 | "We will need to set the `event_time` column as the index in order to use Pandas built-in window aggregations."
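, "\n", "Note that the feature group now holds transactions from two different credit cards, so a plain rolling window over the whole DataFrame mixes cards together. A hedged sketch of a per-card variant (illustrative only, assuming the same `df5` built below), grouping before rolling:\n", "\n", "```python\n", "# 1-day rolling max per credit card; needs a sorted datetime index\n", "per_card_max = (df5.groupby('credit_card_number')\n", "                   .rolling('1D')['amount'].max()\n", "                   .reset_index(level=0, drop=True))\n", "```"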
522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "id": "e2baba5c", 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "df5 = fg.read()\n", 532 | "df5" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "id": "864dc8b6", 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "df5 = df5.set_index('trans_datetime')" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "id": "8b325eb1", 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "df5 = df5.sort_index()" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": null, 558 | "id": "d51e0b89", 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [ 562 | " df5['rolling_max_1d'] = df5.rolling('1D').amount.max()\n", 563 | " df5" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "id": "1dec426d", 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "df5['rolling_mean_1d'] = df5.rolling('1D').amount.mean()\n", 574 | "df5" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": null, 580 | "id": "683a0337", 581 | "metadata": {}, 582 | "outputs": [], 583 | "source": [ 584 | "df5.reset_index(inplace=True)" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "id": "114b74d5", 591 | "metadata": {}, 592 | "outputs": [], 593 | "source": [ 594 | "fg_agg = fs.get_or_create_feature_group(\n", 595 | " name=\"credit_card_rolling_windows\",\n", 596 | " version=1,\n", 597 | " description=\"Daily Credit Card Spending\",\n", 598 | " primary_key=['credit_card_number'],\n", 599 | " event_time='trans_datetime'\n", 600 | ") " 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": null, 606 | "id": "f9b05b09", 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "fg_agg.insert(df5)" 611 | ] 612 | }, 613 | { 614 | "cell_type": "markdown", 615 | "id": "31174a83", 616 | "metadata": {}, 617 | "source": [ 618 | "### Create a Feature View using features from multiple Feature Groups\n", 619 | "\n", 620 | "We want to create a model that uses features from multiple feature groups. \n", 621 | "We will select features from the different feature groups and join them together to create a query object. \n", 622 | "We can read the data in the query object as a DataFrame to inspect it before we create the feature view. \n", 623 | "We will use the feature view to read the training data for the model." 
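, "\n", "By default, hsfs joins feature groups on the largest matching subset of their primary keys (here `credit_card_number`). A hedged sketch of making the join key explicit, assuming the `on` argument of `Query.join`:\n", "\n", "```python\n", "query = fg.select_all().join(\n", "    fg_agg.select(['rolling_max_1d', 'rolling_mean_1d']),\n", "    on=['credit_card_number'],  # assumption: explicit join key column\n", ")\n", "```"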
624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": null, 629 | "id": "faf25f44", 630 | "metadata": {}, 631 | "outputs": [], 632 | "source": [ 633 | "query = fg.select_all().join(fg_agg.select(['rolling_max_1d', 'rolling_mean_1d']))\n", 634 | "\n", 635 | "training_data = query.read()\n", 636 | "training_data.head()" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": null, 642 | "id": "caa4cbd7", 643 | "metadata": {}, 644 | "outputs": [], 645 | "source": [ 646 | "fv = fs.create_feature_view(name=\"credit_card_fraud_rolling\",\n", 647 | " description=\"Features for a model to predict credit card fraud, including rolling windows\",\n", 648 | " version=1,\n", 649 | " query=query)" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": null, 655 | "id": "48a6e33f", 656 | "metadata": {}, 657 | "outputs": [], 658 | "source": [ 659 | "X_train, X_test, y_train, y_test = fv.train_test_split(0.5)\n", 660 | "X_train" 661 | ] 662 | }, 663 | { 664 | "cell_type": "markdown", 665 | "id": "c7576002", 666 | "metadata": {}, 667 | "source": [ 668 | "### Read from Feature Groups\n", 669 | "\n", 670 | "You are also able to read data from Feature Groups as DataFrames." 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": null, 676 | "id": "6e1717fc", 677 | "metadata": {}, 678 | "outputs": [], 679 | "source": [ 680 | "fg = fs.get_feature_group(name=\"credit_card_transactions\", version=1)\n", 681 | "read_df = fg.read()" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": null, 687 | "id": "67029b1e", 688 | "metadata": {}, 689 | "outputs": [], 690 | "source": [ 691 | "read_df" 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "id": "3e5aa606", 697 | "metadata": {}, 698 | "source": [ 699 | "### Filters\n", 700 | "You can use filters on the `query` object or on the Feature Groups when reading from them. Here, we read all rows where the transaction amount is greater than 100." 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": null, 706 | "id": "5c4d9483", 707 | "metadata": {}, 708 | "outputs": [], 709 | "source": [ 710 | "from hsfs.feature import Feature\n", 711 | "\n", 712 | "big_amounts_df = fg.filter(Feature(\"amount\") > 100).read()\n", 713 | "big_amounts_df" 714 | ] 715 | } 716 | ], 717 | "metadata": { 718 | "kernelspec": { 719 | "display_name": "Python 3 (ipykernel)", 720 | "language": "python", 721 | "name": "python3" 722 | }, 723 | "language_info": { 724 | "codemirror_mode": { 725 | "name": "ipython", 726 | "version": 3 727 | }, 728 | "file_extension": ".py", 729 | "mimetype": "text/x-python", 730 | "name": "python", 731 | "nbconvert_exporter": "python", 732 | "pygments_lexer": "ipython3", 733 | "version": "3.9.7" 734 | } 735 | }, 736 | "nbformat": 4, 737 | "nbformat_minor": 5 738 | } 739 | -------------------------------------------------------------------------------- /src/00-intro/Pandas-Intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "f66fadac", 6 | "metadata": {}, 7 | "source": [ 8 | "## Pandas in 2 mins\n", 9 | "You can't learn Pandas in 2 mins, but here are some of the basics needed for this course.\n", 10 | "\n", 11 | "First, you can define a dict containing credit card payments, labeled as fraud or not-fraud, and create a Pandas DataFrame from it."
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "27b01f37", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import pandas as pd\n", 22 | "\n", 23 | "data = { \n", 24 | " 'credit_card_number': ['1111 2222 3333 4444', '1111 2222 3333 4444','1111 2222 3333 4444',\n", 25 | " '1111 2222 3333 4444'],\n", 26 | " 'trans_datetime': ['2022-01-01 08:44', '2022-01-01 19:44', '2022-01-01 20:44', '2022-01-01 20:55'],\n", 27 | " 'amount': [142.34, 12.34, 66.29, 112.33],\n", 28 | " 'location': ['Sao Paolo', 'Rio De Janeiro', 'Stockholm', 'Stockholm'],\n", 29 | " 'fraud': [False, False, True, True] \n", 30 | "}\n", 31 | "\n", 32 | "df = pd.DataFrame.from_dict(data)\n", 33 | "df['trans_datetime']= pd.to_datetime(df['trans_datetime'])\n", 34 | "df" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "d0146eac", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "df" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "id": "dd7889c9", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "df.info()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "id": "ecc3bb3b", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "df['trans_datetime']= pd.to_datetime(df['trans_datetime'])\n", 65 | "df.info()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "id": "280b5ebb", 71 | "metadata": {}, 72 | "source": [ 73 | "### Lambda functions\n", 74 | "\n", 75 | "We will now apply a lambda function to the column `amount` and save the result in a new column `is_big` in our DataFrame `df`." 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "id": "73ba75de", 82 | "metadata": { 83 | "scrolled": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "df['is_big'] = df['amount'].apply(lambda amount: amount > 100)\n", 88 | "df" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "id": "f845b92e", 94 | "metadata": {}, 95 | "source": [ 96 | "### Apply and UDFs\n", 97 | "\n", 98 | "We will now apply a user-defined function (UDF), `is_small`, to each row in the DataFrame `df`. \n", 99 | "The result is a series that we store in a new column in `df` called 'is_small'." 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "36cf67ef", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "def is_small(row):\n", 110 | " return row['amount'] < 100\n", 111 | " \n", 112 | "df['is_small'] = df.apply(is_small, axis=1)\n", 113 | "df" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "c678d9ba", 119 | "metadata": {}, 120 | "source": [ 121 | "## Rolling Windows\n", 122 | "\n", 123 | "We will compute a rolling window over a 1-day window." 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "id": "8bc7a844", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "df_rolling = df.set_index('trans_datetime')\n", 134 | "df_rolling" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "4b3b6d2d", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "df_rolling['rolling_max_1d'] = df_rolling.rolling('1D').amount.max()\n", 145 | "df_rolling" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "id": "12d55895", 151 | "metadata": {}, 152 | "source": [ 153 | "Let's create a new DataFrame, `df2`, with new data."
154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "f38554ad", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "import numpy as np\n", 164 | "import timeit \n", 165 | "\n", 166 | "df2 = pd.DataFrame({\n", 167 | " 'a':np.random.randint(1,100, size=100000),\n", 168 | " 'b':np.random.randint(100,1000, size=100000),\n", 169 | " 'c':np.random.random(100000)\n", 170 | "})\n", 171 | "# expected shape: (100000, 3)\n", 172 | "df2.shape" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "id": "36e93895", 178 | "metadata": {}, 179 | "source": [ 180 | "### Vectorized operations are faster than \"apply\" with UDFs\n", 181 | "\n", 182 | "We will see that apply is approximately 50 times slower than the equivalent vectorized operation on 100k rows.\n", 183 | "\n" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "b35aa5a2", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "%%timeit\n", 194 | "df2['a'].apply(lambda x: x**2)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "id": "622dc43c", 200 | "metadata": {}, 201 | "source": [ 202 | "This vectorized operation is much faster" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "id": "de746618", 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "%%timeit\n", 213 | "df2['a'] ** 2" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "id": "4aededa8", 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "df2.describe()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "id": "c40d50fe", 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "df.trans_datetime.unique()" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "id": "361d75ee", 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "df.credit_card_number.nunique()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "id": "8f7de134", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "df.isnull().sum()" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "id": "b66d799c", 259 | "metadata": {}, 260 | "source": [ 261 | "## Transformations\n", 262 | "\n", 263 | "Plot a histogram with a long tail.\n", 264 | "Use numpy to seed the random number generator and generate a univariate data sample.\n" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "id": "32ebde28", 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "import seaborn as sns\n", 275 | "\n", 276 | "from numpy.random import seed\n", 277 | "from numpy.random import randn\n", 278 | "from numpy.random import rand\n", 279 | "from numpy import append\n", 280 | "seed(1)\n", 281 | "array = 5 * randn(100) + 10\n", 282 | "tail = 10 + (rand(50) * 100)\n", 283 | "array = append(array, tail)\n", 284 | "sns.histplot(array)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "id": "262bf19c", 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "columns = ['amount']\n", 295 | "df_exp = pd.DataFrame(data = array, columns = columns)\n", 296 | " \n", 297 | "df_exp.describe()" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "id": "bb560fa4", 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "df_exp" 308 | ] 309
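A quick note on the `%%timeit` comparison above: the `timeit` module imported alongside numpy (but not otherwise used in this notebook) can produce the same measurement programmatically. A minimal sketch; absolute numbers and the exact speedup are machine-dependent:

```python
# Sketch: time apply vs. the vectorized equivalent with the stdlib timeit
# module. df2 is the 100k-row DataFrame created above.
import timeit

n_runs = 10
t_apply = timeit.timeit(lambda: df2['a'].apply(lambda x: x**2), number=n_runs)
t_vec = timeit.timeit(lambda: df2['a'] ** 2, number=n_runs)
print(f"apply: {t_apply / n_runs:.4f}s per run, "
      f"vectorized: {t_vec / n_runs:.4f}s per run, "
      f"speedup: {t_apply / t_vec:.0f}x")
```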
| }, 310 | { 311 | "cell_type": "markdown", 312 | "id": "31a8bac9", 313 | "metadata": {}, 314 | "source": [ 315 | "## Min-Max Scaling in Vectorized Pandas\n", 316 | "\n", 317 | "This is an efficient way to transform our input Pandas column into the range [0.0, 1.0]." 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "id": "ae928d6c", 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "# Min-Max Normalization in Pandas\n", 328 | "df_norm = (df_exp-df_exp.min())/(df_exp.max()-df_exp.min())\n", 329 | "df_norm.head()" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "id": "bca3a9f9", 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "sns.histplot(df_norm)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "id": "ff81e054", 345 | "metadata": {}, 346 | "source": [ 347 | "## Power Transformer in Scikit-Learn\n", 348 | "\n", 349 | "Scikit-Learn has many different transformers.\n", 350 | "For heavy-tailed distributions, it is often recommended to perform a [power transformation](\n", 351 | "https://towardsdatascience.com/how-to-differentiate-between-scaling-normalization-and-log-transformations-69873d365a94)\n", 352 | "\n", 353 | "As we can see in the histogram, this produces a more Gaussian (normal) distribution than the min-max scaler." 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "id": "85f5e6d6", 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "from sklearn.preprocessing import PowerTransformer\n", 364 | "\n", 365 | "pt = PowerTransformer()\n", 366 | "\n", 367 | "df_power = pd.DataFrame(\n", 368 | " pt.fit_transform(df_exp[[\"amount\"]]), columns=[\"amount\"]\n", 369 | ")\n", 370 | "\n", 371 | "sns.histplot(df_power)" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "id": "1ced0dce", 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [] 381 | } 382 | ], 383 | "metadata": { 384 | "kernelspec": { 385 | "display_name": "Python 3 (ipykernel)", 386 | "language": "python", 387 | "name": "python3" 388 | }, 389 | "language_info": { 390 | "codemirror_mode": { 391 | "name": "ipython", 392 | "version": 3 393 | }, 394 | "file_extension": ".py", 395 | "mimetype": "text/x-python", 396 | "name": "python", 397 | "nbconvert_exporter": "python", 398 | "pygments_lexer": "ipython3", 399 | "version": "3.9.7" 400 | } 401 | }, 402 | "nbformat": 4, 403 | "nbformat_minor": 5 404 | } 405 | -------------------------------------------------------------------------------- /src/00-intro/green-apples-vs-oranges.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "6b138a28", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "[0 1]\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import sklearn \n", 19 | "from sklearn.linear_model import LogisticRegression\n", 20 | "from sklearn import tree \n", 21 | "\n", 22 | "# 4 examples of features with [red-color, green-color]\n", 23 | "features = [[0,120], [0, 110], [250, 150], [255, 163]]\n", 24 | "# green apples == 0; oranges == 1\n", 25 | "labels = [0, 0, 1, 1]\n", 26 | "\n", 27 | "clf = tree.DecisionTreeClassifier()\n", 28 | "clf = clf.fit(features, labels)\n", 29 | "\n", 30 | "test_fruits = [[0,128], [249, 155]]\n", 31 | "test_labels = [0, 1] \n", 32 |
"pred_labels = clf.predict(test_fruits)\n", 33 | "print(pred_labels)\n" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "id": "b40db72f", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [] 43 | } 44 | ], 45 | "metadata": { 46 | "kernelspec": { 47 | "display_name": "Python 3 (ipykernel)", 48 | "language": "python", 49 | "name": "python3" 50 | }, 51 | "language_info": { 52 | "codemirror_mode": { 53 | "name": "ipython", 54 | "version": 3 55 | }, 56 | "file_extension": ".py", 57 | "mimetype": "text/x-python", 58 | "name": "python", 59 | "nbconvert_exporter": "python", 60 | "pygments_lexer": "ipython3", 61 | "version": "3.9.7" 62 | } 63 | }, 64 | "nbformat": 4, 65 | "nbformat_minor": 5 66 | } 67 | -------------------------------------------------------------------------------- /src/00-intro/red-and-green-apples-vs-oranges.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "4948e813", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "[0 1 1 2]\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import sklearn \n", 19 | "from sklearn.linear_model import LogisticRegression\n", 20 | "\n", 21 | "# [green_apple(0,120), green_apple(0,110), orange(250,150), orange(255, 163), red_apple(255,0), red_apple(240,0)]\n", 22 | "features = [[0,120], [75, 40], [60, 60], [255, 163], [255, 0], [240, 0]]\n", 23 | "\n", 24 | "# [green_apple, green_apple, orange, orange, red_apple, red_apple]\n", 25 | "labels = [0, 0, 1, 1, 2, 2]\n", 26 | "\n", 27 | "clf = LogisticRegression()\n", 28 | "clf = clf.fit(features, labels)\n", 29 | "\n", 30 | "# (66,66) is labelled as a green apple\n", 31 | "test_features = [[0,110], [66, 66], [249, 155], [245, 0]]\n", 32 | "test_labels = [0, 1, 0, 2] \n", 33 | "pred_labels = clf.predict(test_features)\n", 34 | "\n", 35 | "print(pred_labels)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "id": "f3ad083c", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [] 45 | } 46 | ], 47 | "metadata": { 48 | "kernelspec": { 49 | "display_name": "Python 3 (ipykernel)", 50 | "language": "python", 51 | "name": "python3" 52 | }, 53 | "language_info": { 54 | "codemirror_mode": { 55 | "name": "ipython", 56 | "version": 3 57 | }, 58 | "file_extension": ".py", 59 | "mimetype": "text/x-python", 60 | "name": "python", 61 | "nbconvert_exporter": "python", 62 | "pygments_lexer": "ipython3", 63 | "version": "3.9.7" 64 | } 65 | }, 66 | "nbformat": 4, 67 | "nbformat_minor": 5 68 | } 69 | -------------------------------------------------------------------------------- /src/00-intro/streamlit-example.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import streamlit as st 3 | import numpy as np 4 | 5 | st.title("Streamlit for ServerlessML") 6 | st.header("Easy UI in Python with Streamlit") 7 | 8 | chart_data = pd.DataFrame(np.random.randn(30, 3), 9 | columns=["Data Engineers", "Data Scientists", "ML Engineers"]) 10 | 11 | st.bar_chart(chart_data) 12 | -------------------------------------------------------------------------------- /src/01-module/assets/Setosa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/serverless-ml-course/81f75deaa7581d2fee33c6c969ff507ea482d720/src/01-module/assets/Setosa.png 
-------------------------------------------------------------------------------- /src/01-module/assets/Versicolor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/serverless-ml-course/81f75deaa7581d2fee33c6c969ff507ea482d720/src/01-module/assets/Versicolor.png -------------------------------------------------------------------------------- /src/01-module/assets/Virginica.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/serverless-ml-course/81f75deaa7581d2fee33c6c969ff507ea482d720/src/01-module/assets/Virginica.png -------------------------------------------------------------------------------- /src/01-module/assets/confusion-matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/serverless-ml-course/81f75deaa7581d2fee33c6c969ff507ea482d720/src/01-module/assets/confusion-matrix.png -------------------------------------------------------------------------------- /src/01-module/assets/iris.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/serverless-ml-course/81f75deaa7581d2fee33c6c969ff507ea482d720/src/01-module/assets/iris.png -------------------------------------------------------------------------------- /src/01-module/auto-commit-and-push.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "fef79a2b-3616-40f0-8b08-a4ce2381474c", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# get the environment variable for the token\n", 11 | "import os\n", 12 | "secret = os.environ['GIT_TOKEN']" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "id": "e0bc7741-4b28-4c33-a261-20b97dec0267", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from datetime import datetime\n", 23 | "from git import Repo\n", 24 | "import git\n", 25 | "\n", 26 | "# move to the branch for pages\n", 27 | "repo = git.Repo('/project-dir/')\n", 28 | "repo.git.checkout('gh-pages', force=True)\n", 29 | "\n", 30 | "# List remotes\n", 31 | "# print('Remotes:')\n", 32 | "# for remote in repo.remotes:\n", 33 | "# print(f'- {remote.name} {remote.url}')" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "id": "76887f5c-dffe-413c-8a3c-7af077a46747", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# Setting up \n", 44 | "current = datetime.now()\n", 45 | "full_local_path = \"/project-dir/\"\n", 46 | "\n", 47 | "account = \"username\"\n", 48 | "repo_url = \"serverless-ml-course\"\n", 49 | "remote = f\"https://{secret}@github.com/{account}/{repo_url}.git\"\n", 50 | " \n", 51 | " \n", 52 | "repo = Repo(full_local_path)\n", 53 | "origin = repo.remote(name=\"origin\") \n", 54 | "if origin.url != remote:\n", 55 | " origin.set_url(remote, origin.url)\n", 56 | "\n", 57 | "origin.pull()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "id": "773faf93-b612-4e53-90ad-931937f32386", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# Preparing the commit and what we want in it\n", 68 | "repo.git.add('assets/latest_iris.png', 'assets/actual_iris.png', 'assets/confusion_matrix.png', 'assets/df_recent.png')\n", 69 | "repo.index.commit(f'New prediction!
time and date: {current}')" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "e61fbfa5-d894-4cf3-811d-1428d335afcc", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "# Push to the main repository\n", 80 | "origin.push()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "id": "f1e87803-f50e-4fbe-b15c-6974955de0d2", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# Remove last commit if needed\n", 91 | "# repo.head.reset('HEAD~1', index=True, working_tree=True)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "id": "976e58c2-524a-4dba-8888-4ae0f619c030", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# Going back to the main branch\n", 102 | "repo.git.checkout('main', force=True)" 103 | ] 104 | } 105 | ], 106 | "metadata": { 107 | "kernelspec": { 108 | "display_name": "Python 3.10.6 64-bit", 109 | "language": "python", 110 | "name": "python3" 111 | }, 112 | "language_info": { 113 | "codemirror_mode": { 114 | "name": "ipython", 115 | "version": 3 116 | }, 117 | "file_extension": ".py", 118 | "mimetype": "text/x-python", 119 | "name": "python", 120 | "nbconvert_exporter": "python", 121 | "pygments_lexer": "ipython3", 122 | "version": "3.10.6" 123 | }, 124 | "vscode": { 125 | "interpreter": { 126 | "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" 127 | } 128 | } 129 | }, 130 | "nbformat": 4, 131 | "nbformat_minor": 5 132 | } 133 | -------------------------------------------------------------------------------- /src/01-module/iris-batch-inference-pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "d2kLrOh-bpGy" 7 | }, 8 | "source": [ 9 | "# Iris Flower - Batch Prediction\n", 10 | "\n", 11 | "\n", 12 | "In this notebook we will, \n", 13 | "\n", 14 | "1. Load the batch inference data that arrived in the last 24 hours\n", 15 | "2. Predict the first Iris Flower found in the batch\n", 16 | "3. Write the ouput png of the Iris flower predicted, to be displayed in Github Pages." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "id": "xRtpj-psbpG8" 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "import pandas as pd\n", 28 | "import hopsworks\n", 29 | "import joblib\n", 30 | "\n", 31 | "project = hopsworks.login()\n", 32 | "fs = project.get_feature_store()" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "mr = project.get_model_registry()\n", 42 | "model = mr.get_model(\"iris\", version=1)\n", 43 | "model_dir = model.download()\n", 44 | "model = joblib.load(model_dir + \"/iris_model.pkl\")" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "We are downloading the 'raw' iris data. We explicitly do not want transformed data, reading for training. \n", 52 | "\n", 53 | "So, let's download the iris dataset, and preview some rows. \n", 54 | "\n", 55 | "Note, that it is 'tabular data'. There are 5 columns: 4 of them are \"features\", and the \"variety\" column is the **target** (what we are trying to predict using the 4 feature values in the target's row)." 
56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "colab": { 63 | "base_uri": "https://localhost:8080/", 64 | "height": 206 65 | }, 66 | "id": "nRmFM7vcbpHA", 67 | "outputId": "d920d168-9818-40c5-c292-4cf0afcbbcfd" 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "feature_view = fs.get_feature_view(name=\"iris\", version=1)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "Now we will do some **Batch Inference**. \n", 79 | "\n", 80 | "We will read all the input features that have arrived in the last 24 hours, and score them." 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": { 87 | "id": "uHuAD3ttP8Ep" 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "import datetime\n", 92 | "from PIL import Image\n", 93 | "\n", 94 | "batch_data = feature_view.get_batch_data()\n", 95 | "\n", 96 | "y_pred = model.predict(batch_data)\n", 97 | "\n", 98 | "y_pred" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "batch_data" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "Batch prediction output is the last entry in the batch - it is output as a file 'latest_iris.png'" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "flower = y_pred[y_pred.size-1]\n", 124 | "flower_img = \"assets/\" + flower + \".png\"\n", 125 | "img = Image.open(flower_img) \n", 126 | "\n", 127 | "img.save(\"../../assets/latest_iris.png\")" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "iris_fg = fs.get_feature_group(name=\"iris\", version=1)\n", 137 | "df = iris_fg.read()\n", 138 | "df" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "label = df.iloc[-1][\"variety\"]\n", 148 | "label" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "label_flower = \"assets/\" + label + \".png\"\n", 158 | "\n", 159 | "img = Image.open(label_flower) \n", 160 | "\n", 161 | "img.save(\"../../assets/actual_iris.png\")" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "import pandas as pd\n", 171 | "\n", 172 | "monitor_fg = fs.get_or_create_feature_group(name=\"iris_predictions\",\n", 173 | " version=1,\n", 174 | " primary_key=[\"datetime\"],\n", 175 | " description=\"Iris flower Prediction/Outcome Monitoring\"\n", 176 | " )" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "from datetime import datetime\n", 186 | "now = datetime.now().strftime(\"%m/%d/%Y, %H:%M:%S\")\n", 187 | "\n", 188 | "data = {\n", 189 | " 'prediction': [flower],\n", 190 | " 'label': [label],\n", 191 | " 'datetime': [now],\n", 192 | "}\n", 193 | "monitor_df = pd.DataFrame(data)\n", 194 | "monitor_fg.insert(monitor_df)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "history_df = 
monitor_fg.read()\n", 204 | "history_df" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "import dataframe_image as dfi\n", 214 | "\n", 215 | "df_recent = history_df.tail(5)\n", 216 | " \n", 217 | "# If you exclude this image, you may have the same latest_iris.png and actual_iris.png files\n", 218 | "# If no files have changed, the GH-action 'git commit/push' stage fails, failing your GH action (last step)\n", 219 | "# This image, however, is always new, ensuring git commit/push will succeed.\n", 220 | "dfi.export(df_recent, '../../assets/df_recent.png', table_conversion = 'matplotlib')" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "from sklearn.metrics import confusion_matrix\n", 230 | "\n", 231 | "predictions = history_df[['prediction']]\n", 232 | "labels = history_df[['label']]\n", 233 | "\n", 234 | "results = confusion_matrix(labels, predictions)\n", 235 | "print(results)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "from matplotlib import pyplot\n", 245 | "import seaborn as sns\n", 246 | "\n", 247 | "# Only create the confusion matrix when our iris_predictions feature group has examples of all 3 iris flowers\n", 248 | "if results.shape == (3,3):\n", 249 | "\n", 250 | " df_cm = pd.DataFrame(results, ['True Setosa', 'True Versicolor', 'True Virginica'],\n", 251 | " ['Pred Setosa', 'Pred Versicolor', 'Pred Virginica'])\n", 252 | "\n", 253 | " cm = sns.heatmap(df_cm, annot=True)\n", 254 | "\n", 255 | " fig = cm.get_figure()\n", 256 | " fig.savefig(\"../../assets/confusion_matrix.png\") \n", 257 | " df_cm\n", 258 | "else:\n", 259 | " print(\"Run the batch inference pipeline more times until you get 3 different iris flowers\") " 260 | ] 261 | } 262 | ], 263 | "metadata": { 264 | "colab": { 265 | "collapsed_sections": [], 266 | "provenance": [] 267 | }, 268 | "kernelspec": { 269 | "display_name": "Python 3.10.6 64-bit", 270 | "language": "python", 271 | "name": "python3" 272 | }, 273 | "language_info": { 274 | "codemirror_mode": { 275 | "name": "ipython", 276 | "version": 3 277 | }, 278 | "file_extension": ".py", 279 | "mimetype": "text/x-python", 280 | "name": "python", 281 | "nbconvert_exporter": "python", 282 | "pygments_lexer": "ipython3", 283 | "version": "3.10.6" 284 | }, 285 | "vscode": { 286 | "interpreter": { 287 | "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" 288 | } 289 | } 290 | }, 291 | "nbformat": 4, 292 | "nbformat_minor": 1 293 | } 294 | -------------------------------------------------------------------------------- /src/01-module/iris-feature-pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "d2kLrOh-bpGy" 7 | }, 8 | "source": [ 9 | "# Iris Flower - Feature Pipeline\n", 10 | "\n", 11 | "In this notebook we will, \n", 12 | "\n", 13 | "1. Run in either \"Backfill\" or \"Normal\" operation. \n", 14 | "2. IF *BACKFILL==True*, we will load our DataFrame with data from the iris.csv file \n", 15 | "\n", 16 | " ELSE *BACKFILL==False*, we will load our DataFrame with one synthetic Iris Flower sample \n", 17 | "3. 
Write our DataFrame to a Feature Group" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "!pip install -U hopsworks --quiet" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "Set **BACKFILL=True** if you want to create features from the iris.csv file containing historical data." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import random\n", 43 | "import pandas as pd\n", 44 | "import hopsworks\n", 45 | "\n", 46 | "BACKFILL=False" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "### Synthetic Data Functions\n", 54 | "\n", 55 | "These synthetic data functions can be used to create a DataFrame containing a single Iris Flower sample." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "colab": { 63 | "base_uri": "https://localhost:8080/", 64 | "height": 206 65 | }, 66 | "id": "nRmFM7vcbpHA", 67 | "outputId": "d920d168-9818-40c5-c292-4cf0afcbbcfd" 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "def generate_flower(name, sepal_len_max, sepal_len_min, sepal_width_max, sepal_width_min, \n", 72 | " petal_len_max, petal_len_min, petal_width_max, petal_width_min):\n", 73 | " \"\"\"\n", 74 | " Returns a single iris flower as a single row in a DataFrame\n", 75 | " \"\"\"\n", 76 | " df = pd.DataFrame({ \"sepal_length\": [random.uniform(sepal_len_max, sepal_len_min)],\n", 77 | " \"sepal_width\": [random.uniform(sepal_width_max, sepal_width_min)],\n", 78 | " \"petal_length\": [random.uniform(petal_len_max, petal_len_min)],\n", 79 | " \"petal_width\": [random.uniform(petal_width_max, petal_width_min)]\n", 80 | " })\n", 81 | " df['variety'] = name\n", 82 | " return df\n", 83 | "\n", 84 | "\n", 85 | "def get_random_iris_flower():\n", 86 | " \"\"\"\n", 87 | " Returns a DataFrame containing one random iris flower\n", 88 | " \"\"\"\n", 89 | " virginica_df = generate_flower(\"Virginica\", 8, 5.5, 3.8, 2.2, 7, 4.5, 2.5, 1.4)\n", 90 | " versicolor_df = generate_flower(\"Versicolor\", 7.5, 4.5, 3.5, 2.1, 3.1, 5.5, 1.8, 1.0)\n", 91 | " setosa_df = generate_flower(\"Setosa\", 6, 4.5, 4.5, 2.3, 1.2, 2, 0.7, 0.3)\n", 92 | "\n", 93 | " # randomly pick one of these 3 and write it to the featurestore\n", 94 | " pick_random = random.uniform(0,3)\n", 95 | " if pick_random >= 2:\n", 96 | " iris_df = virginica_df\n", 97 | " elif pick_random >= 1:\n", 98 | " iris_df = versicolor_df\n", 99 | " else:\n", 100 | " iris_df = setosa_df\n", 101 | "\n", 102 | " return iris_df" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## Backfill or create new synthetic input data\n", 110 | "\n", 111 | "You can run this pipeline in either *backfill* or *synthetic-data* mode." 
112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "\n", 121 | "if BACKFILL == True:\n", 122 | " iris_df = pd.read_csv(\"https://repo.hops.works/master/hopsworks-tutorials/data/iris.csv\")\n", 123 | "else:\n", 124 | " iris_df = get_random_iris_flower()\n", 125 | " \n", 126 | "iris_df.head()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "## Authenticate with Hopsworks using your API Key\n", 134 | "\n", 135 | "Hopsworks will prompt you to paste in your API key and provide you with a link to find your API key if you have not stored it securely already." 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "project = hopsworks.login()\n", 145 | "fs = project.get_feature_store()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "## Create and write to a feature group - primary keys\n", 153 | "\n", 154 | "To prevent duplicate entries, Hopsworks requires that each DataFrame has a *primary_key*. \n", 155 | "A *primary_key* is one or more columns that uniquely identify the row. Here, we assume\n", 156 | "that each Iris flower has a unique combination of (\"sepal_length\",\"sepal_width\",\"petal_length\",\"petal_width\")\n", 157 | "feature values. If you randomly generate a sample that already exists in the feature group, the insert operation will fail.\n", 158 | "\n", 159 | "The *feature group* will create its online schema using the schema of the Pandas DataFrame." 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "iris_fg = fs.get_or_create_feature_group(name=\"iris\",\n", 169 | " version=1,\n", 170 | " primary_key=[\"sepal_length\",\"sepal_width\",\"petal_length\",\"petal_width\"],\n", 171 | " description=\"Iris flower dataset\"\n", 172 | " )\n", 173 | "iris_fg.insert(iris_df)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [] 182 | } 183 | ], 184 | "metadata": { 185 | "colab": { 186 | "collapsed_sections": [], 187 | "provenance": [] 188 | }, 189 | "kernelspec": { 190 | "display_name": "Python 3 (ipykernel)", 191 | "language": "python", 192 | "name": "python3" 193 | }, 194 | "language_info": { 195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 3 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython3", 204 | "version": "3.9.7" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 1 209 | } 210 | -------------------------------------------------------------------------------- /src/01-module/iris-train-pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "d2kLrOh-bpGy" 7 | }, 8 | "source": [ 9 | "# Iris Flower Train and Publish Model\n", 10 | "\n", 11 | "\n", 12 | "In this notebook we will, \n", 13 | "\n", 14 | "1. Load the Iris Flower dataset into random split (train/test) DataFrames using a Feature View\n", 15 | "2. Train a KNN Model using SkLearn\n", 16 | "3. Evaluate model performance on the test set\n", 17 | "4. 
Register the model with Hopsworks Model Registry" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "!pip install -U hopsworks --quiet" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "xRtpj-psbpG8" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "from sklearn.neighbors import KNeighborsClassifier\n", 38 | "from sklearn.metrics import accuracy_score\n", 39 | "import pandas as pd\n", 40 | "import seaborn as sns\n", 41 | "import hopsworks" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "Let's first get a feature_view for the iris flower dataset, or create one if it does not already exist.\n", 49 | "If you are running this notebook for the first time, it will create the feature view, which contains all of the columns from the **iris feature group**.\n", 50 | "\n", 51 | "There are 5 columns: 4 of them are \"features\", and the **variety** column is the **label** (what we are trying to predict using the 4 feature values in the label's row). The label is often called the **target**." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": { 58 | "colab": { 59 | "base_uri": "https://localhost:8080/", 60 | "height": 206 61 | }, 62 | "id": "nRmFM7vcbpHA", 63 | "outputId": "d920d168-9818-40c5-c292-4cf0afcbbcfd" 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "project = hopsworks.login()\n", 68 | "fs = project.get_feature_store()\n", 69 | "\n", 70 | "try: \n", 71 | " feature_view = fs.get_feature_view(name=\"iris\", version=1)\n", 72 | "except:\n", 73 | " iris_fg = fs.get_feature_group(name=\"iris\", version=1)\n", 74 | " query = iris_fg.select_all()\n", 75 | " feature_view = fs.create_feature_view(name=\"iris\",\n", 76 | " version=1,\n", 77 | " description=\"Read from Iris flower dataset\",\n", 78 | " labels=[\"variety\"],\n", 79 | " query=query)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "We will read our features and labels split into a **train_set** and a **test_set**. You split your data into a train_set and a test_set because you want to train your model on only the train_set, and then evaluate its performance on data that was not seen during training, the test_set. This technique helps evaluate the ability of your model to accurately predict on data it has not seen before.\n", 87 | "\n", 88 | "We can ask the feature_view to return a **train_test_split** and it returns:\n", 89 | "\n", 90 | "* **X_** is a matrix of features, so **X_train** is the matrix of features from the **train_set**. \n", 91 | "* **y_** is a vector of labels, so **y_train** is the vector of labels from the **train_set**. \n", 92 | "\n", 93 | "Note: a matrix is a 2-D array of values and a vector is a 1-D array of values.\n", 94 | "\n", 95 | "Note: the mathematical convention is that a matrix is denoted by an uppercase letter (hence \"X\") and a vector is denoted by a lowercase letter (hence \"y\").\n", 96 | "\n", 97 | "**X_test** is the features and **y_test** is the labels from our holdout **test_set**. The **test_set** is used to evaluate model performance after the model has been trained."
98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "id": "JR8HeEs6bpHB" 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "X_train, X_test, y_train, y_test = feature_view.train_test_split(0.2, )" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "y_train" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "Now, we can fit a model to our features and labels from our training set (**X_train** and **y_train**). \n", 125 | "\n", 126 | "Fitting a model to a dataset is more commonly called \"training a model\"." 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": { 133 | "colab": { 134 | "base_uri": "https://localhost:8080/" 135 | }, 136 | "id": "PNZcUPHJPIu9", 137 | "outputId": "389acb4d-74ff-46f1-dee8-a7c27ee79a09" 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "model = KNeighborsClassifier(n_neighbors=2)\n", 142 | "model.fit(X_train, y_train.values.ravel())" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "Now, we have trained our model. We can evaluate our model on the **test_set** to estimate its performance." 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "id": "uHuAD3ttP8Ep" 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "y_pred = model.predict(X_test)\n", 161 | "y_pred" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "We can report on how accurate these predictions (**y_pred**) are compared to the labels (the actual results - **y_test**). " 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "colab": { 176 | "base_uri": "https://localhost:8080/" 177 | }, 178 | "id": "b8EC4_SvbpHE", 179 | "outputId": "5d73b375-76f0-4518-8e88-4db23e8f2486" 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "from sklearn.metrics import classification_report\n", 184 | "\n", 185 | "metrics = classification_report(y_test, y_pred, output_dict=True)\n", 186 | "print(metrics)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "from sklearn.metrics import confusion_matrix\n", 196 | "\n", 197 | "results = confusion_matrix(y_test, y_pred)\n", 198 | "print(results)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "Notice in the confusion matrix results that we have 1 or 2 incorrect predictions.\n", 206 | "We have only 30 flowers in our test set - **y_test**.\n", 207 | "Our model predicted 1 or 2 flowers were of type \"Virginica\", but the flowers were, in fact, \"Versicolor\"." 
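For a single-number summary of the same evaluation, the `accuracy_score` helper imported at the top of this notebook (but not otherwise used) computes the fraction of correct predictions; it should agree with `metrics['accuracy']` from the classification report above:

```python
# Overall test-set accuracy; should match metrics['accuracy'] above.
acc = accuracy_score(y_test, y_pred)
print(f"Test accuracy: {acc:.3f}")
```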
208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "from matplotlib import pyplot\n", 217 | "\n", 218 | "df_cm = pd.DataFrame(results, ['True Setosa', 'True Versicolor', 'True Virginica'],\n", 219 | " ['Pred Setosa', 'Pred Versicolor', 'Pred Virginica'])\n", 220 | "\n", 221 | "cm = sns.heatmap(df_cm, annot=True)\n", 222 | "\n", 223 | "fig = cm.get_figure()\n", 224 | "fig.savefig(\"assets/confusion_matrix.png\") \n", 225 | "fig.show()" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## Register the Model with Hopsworks Model Registry\n", 233 | "\n" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "from hsml.schema import Schema\n", 243 | "from hsml.model_schema import ModelSchema\n", 244 | "import os\n", 245 | "import joblib\n", 246 | "import hopsworks\n", 247 | "import shutil\n", 248 | "\n", 249 | "project = hopsworks.login()\n", 250 | "mr = project.get_model_registry()\n", 251 | "\n", 252 | "# The 'iris_model' directory will be saved to the model registry\n", 253 | "model_dir=\"iris_model\"\n", 254 | "if os.path.isdir(model_dir) == False:\n", 255 | " os.mkdir(model_dir)\n", 256 | "joblib.dump(model, model_dir + \"/iris_model.pkl\")\n", 257 | "shutil.copyfile(\"assets/confusion_matrix.png\", model_dir + \"/confusion_matrix.png\")\n", 258 | "\n", 259 | "input_example = X_train.sample()\n", 260 | "input_schema = Schema(X_train)\n", 261 | "output_schema = Schema(y_train)\n", 262 | "model_schema = ModelSchema(input_schema, output_schema)\n", 263 | "\n", 264 | "iris_model = mr.python.create_model(\n", 265 | " version=1,\n", 266 | " name=\"iris\", \n", 267 | " metrics={\"accuracy\" : metrics['accuracy']},\n", 268 | " model_schema=model_schema,\n", 269 | " input_example=input_example, \n", 270 | " description=\"Iris Flower Predictor\")\n", 271 | "\n", 272 | "iris_model.save(model_dir)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [] 281 | } 282 | ], 283 | "metadata": { 284 | "colab": { 285 | "collapsed_sections": [], 286 | "provenance": [] 287 | }, 288 | "kernelspec": { 289 | "display_name": "Python 3 (ipykernel)", 290 | "language": "python", 291 | "name": "python3" 292 | }, 293 | "language_info": { 294 | "codemirror_mode": { 295 | "name": "ipython", 296 | "version": 3 297 | }, 298 | "file_extension": ".py", 299 | "mimetype": "text/x-python", 300 | "name": "python", 301 | "nbconvert_exporter": "python", 302 | "pygments_lexer": "ipython3", 303 | "version": "3.9.7" 304 | } 305 | }, 306 | "nbformat": 4, 307 | "nbformat_minor": 1 308 | } 309 | -------------------------------------------------------------------------------- /src/01-module/scripts/run-feature-and-prediction-pipelines.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | cd src/01-module 6 | 7 | jupyter nbconvert --to notebook --execute iris-feature-pipeline.ipynb 8 | jupyter nbconvert --to notebook --execute iris-batch-inference-pipeline.ipynb 9 | 10 | -------------------------------------------------------------------------------- /src/02-module/2_cc_feature_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": 
"fa9aedf3", 6 | "metadata": {}, 7 | "source": [ 8 | "![Screenshot from 2022-06-16 14-24-57.png](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAfgAAABsCAYAAACGqKCeAAAABHNCSVQICAgIfAhkiAAAABl0RVh0U29mdHdhcmUAZ25vbWUtc2NyZWVuc2hvdO8Dvz4AAAAmdEVYdENyZWF0aW9uIFRpbWUAdG9yIDE2IGp1biAyMDIyIDE0OjI1OjAzyRXP1gAAIABJREFUeJzsnXd8HNX1t5+Z2Srtqsu94V5wxXLBxphqehJKINTQCYTQXkIJhF4DIRQDAUI1HWzTm8GAaS6yZRv3brnJltW10u5Oef+YveNZaZtciMNvvp9sLHZnbr+n3XPOlQzDMHDgwIEDBw4c/Kog/7cb4MCBAwcOHDjY+3AYvAMHDhw4cPArhMPgHThw4MCBg18hHAbvwIEDBw4c/ArhMHgHDhw4cODgVwiHwTtw4MCBAwe/QjgM3oEDBw4cOPgVwmHwDhw4cODAwa8QDoN34MCBAwcOfoVwGLwDBw4cOHDwK4TD4B04cODAgYNfIRwG78CBAwcOHPwK4TB4Bw4cOHDg4FcIh8E7cODAgQMHv0K4/tsNSATDMJAkCftNtqnutJXAel6SpH3ePgcOHDhw4GB/h/Tfvg8+GTPXY/+tSFJapm0YBpqhI0syiZ50mL4DBw4cOPi/hv8agxfVSpKEbhjohpGUmYc1lYiu0qypRHTNZP4SuCUZr+Im3+OPe14zdABkJEezd+DAgQMH/yfxi5vo7czWQDB2GTn2XW2kiVX1lSyq3cKyuu1sDtXSoEZo1CI0qhEa1bCl3XsVNwGXhx7Z+YzI78IhxT0ZktcRt6wAMUZvGAi93mH0Dhw4cODg/wp+MQ2+pcZuGAaKbPr4VYYbmVmxmg+3LGVu1UYa1AghNUpYV1EkGUWSkCUJWZJjWrlZptD8w7qKjES2y0O/YDGTOvbjyA59GZHfxXoOQJZA9NZh9A4cOHDg4NeMX4TB28/ZdRtjX1VXyZQNpXy4ZSlrGnYiAT7FHWPoplZvYBD7H/H/j3XeLksShmFq7BFdI6RGKPJlc3i7PlzUazQHF/UAQNU1FEl2zPYOHDhw4OBXj33O4AUj1Q0DA9Mcv725gUdXfstrGxZQFQnhk134FDcAegt/+UwYsf0ZCZAlmWiM0XtkhVO6DOb6AYdzQKAAzdCRkGJCgcPkHThw4MDBrxO/qAYP8MaGBdy39CvWNVYRdHtwSwq6qafvNYYrypGR0DGojzbTwRfkhgGHcX7P0QBouo4iy3FHBw4cOHDgwMGvBfucweu6jiRJbAzVcN/SL3mrfCFuScbvcqPFzuL3FXMVZSuSRFjTaNKinNNjBHcPPo5cjw9V13DFHPIcbd6BAwcOHPyasM8z2QmT+9cVq/nP2jn4FTdZLg+qbnq470umKkmmV52q67hlmVy3jxfXzeU3s55nfvVmXLKCqmsJY/EdOHDgwIGD/2XsMw3ezjQNw0CWZd7cWMb1ZR/QpEbJdnv2uQafqD2KJFEfDVPgyWLyQSdzdMd+aLqOLDkx8w4cOHDg4NeDvc7g7WfahmGgxxzrhDl81va1XLVgOmvqd5Lr8QFmGJuERMI0dHsZRizuPqxFkSSZ+4cex3kHlKDqupVox2HyDhw4cODgfx17lcFbWjugGzqKZJ4AaLG/w5qKV3GxOVTLHT9/ztvlC/EqLnyK65fT5g3ToU+WJFRdp0mLcseBk/hLv0McJu/AgQMHDn412Ctn8IaNORuGga6bDF03dJ5bM5sTvvkPMytW41VcRHWNzlm5PDPqNJ4ceQpFnmyqw00YBpZAsE/PwmMMXDcMXLJMwOXhtp8/4/k1c3DJMpqhO2fyDhw4cODgfx57rMHbNV3T1G6a53+oXM9dS77gp8oNAARcXm4aeASX9zkYgKim4lZcbAnV8fCKr3ltw3zCmka222OFtwlte1962csxZh/RNf4z6jRO7HygkxDHgQMHDhz8z2OPGLz9VZFT3sDgsVWzeGDpTJo1laDbC5KZRa4hGuGM7sO4/cBJdPTnENFUPIqZDv/7Het4cPlMfqrcQETXCLi9yEhWvvp9zeSjMR+BNw8+h4OLeljmenBi5PcWWvpnJIIjVLVGonFzxumXQyqLnjMP/5v4vzKnu83g7QMgksbURZu5qnQ6UzctJuD24BJJbKzEM1AbbeaA7AJuHXQUJ3cdAmCdzQN8tnU5L66bx8ztq2nWVPyKG6+i7NMzepPJyzRpEbpl5fHRhIvp4A/GJcPZ15PdMuogE0gZ+AvsTrkA8h72O1Fdog2GYSDJspWy2MBA1/Q45iU+4r/3NCHR7o5DKtjbsreIQkvCYx8zAEVR0PXWY2X/O9VY7c44iCiYTMvWdT2zcollnkyTcGp35y6T/dGyHvFesvGXJAlFUcwrqjWtVV0t1+yerIe2jmem2JM27m06taf7MlMa2PK/7fW5XLvuXNO0XWHT9vm01/e/hD3W4EVu+W1NdVw69x2+3rGGPLc/jrGLZ0WYWpMaRTV0/tBtONf0m0CvYBGGYaAaOm7Z3Dyl1Zt4c0MZ723+mW3N9eS6/SDtIgp7G4Zh4JJkaqJNnNh5EC+MOgNJ2pX69tci0e1rtIqiMMyjFkVRWj2r6zq6ruNSXEknVTyjKMpeYfT7AqqqIsm7bkQUyLSN9i0Y58eSYMxSQTD+dMx4X6xlO6Fua9mZCKh72q50z8Cusdc0DVmWkeXELkrpylRV1ZoDO/avNashydJur9m9gZbC7J4K74m+F+WKPSVJUqt5tQvMifacpmn/s8x+txi8ETtrF6bzejXM2T++ylfbV1PkzY5d0wq0HABjlwc7QF20mWJvgMt7H8xFvcaY5nzDQDNsF9LU7+CldfN4cd1cs8ES7Kt4OiN2zFAdCXHn4GO4ut8E6zzerHvfTahYpJs2bWLDxo0pibt4duCAAeTk5GREIHVdZ+GiRTQ1NSUlXGAu5pycHAYfeOAeSfctN1NdXR0LFy1iydKlrFq1isrKnYRCIXTDZPB+v5+srCxycnPo1KED48aNo2+fPgSDwbi22ZlXW9qm6zqyLFO+aRMb04xvqv653G5yAgECgQDBYJBAIGD1URAKUXZbNMdEjD0UCrF8xQpmz5nDunXrqKmppaGhHq/Xh9vtpqioiO7dujF06BD69+tHXl6eVZ7ob8uxEn9rmsaCsjLUqJpyO0mSRCQSoX+/fhQVFcVpNy37IFC+aRObyjehuJSURzHRaJROHTvSs2dPa72kKrdy505Wr15tCtwkJ1u6buDzeRk6ZIilbWdi4WopVNXV1bFi5Uq+/+EHtlVUUFdbRygUoqmpCUVRyA5kk52VTU4wSP/+/Rk/bhwdO3bA7Y7dq5FA4Nqd/bT455+pr6tPOZ6ZlFVcVEx+fh65ubmW1trWNorfVVVl/oIFFvNLBk3T6N6tG126dGm1Du1lyrJM2cKFhEKhlPTJfMG0+jQ1N9G7V
y86d+7cqt125i4sIKLcrVu3Ma90HnPnzWPr1m3U1NRYzwdzgrQrLmbAgIGUHHQQPXp0Jzs72xorezn/C0pfmxm8nWEYEqi6zgWz3+TDrUspcPuJGjrYrnRNVYaZQlYlpEYZlNue8w4o4ZSuQyjymgMa1TTcsQ137k+v8eHmpQTc3hRbe+9A13VcsszHh17MoNwOqLqOSzZvrNtXE6qqKi6Xi0cff4J77r2XnJycpKY53TD9Bl6fMoXRo0ehaVpShiUIfTgc5uhjj2P16tX4fL6EhEKWZerq6hgzZjTvT5tmtSkdWm4mO5MrnT+fTz79lBlffsXatWsxDINoNGoxfztxFW1VFAW3280BPXowevQoxowezVFHHUWW32/1qaV5PB2i0Shut5t/PfYY99x7H7m5uW02fQrrkWBEXTp3pscBPRg6eAgTJx7K8GHDgLYJIi1NlLIsU1lZyTtTpzFt+jRWrVpNJBJB0zSLAdkZuMvlQpZlOnXqxIRDDuHYYyYx8dBDrXGCeBO4vb6LL7mUDz/+mEAgkJRxKIpCZWUld9x+G1decUVK7Vasl/MuuIBPPvk05RqWZZnaujomHXkkLzz/H2u9JCpXzN2DDz3EAw/+g/z8/KTlSpJEY2MjBx88lnfefNP6LpXgECdUNTXxxRdf8ONPP/HT7Dms37CBaDSKHht/0XfDMNB03VwPsoxLUQgEAgwdMoRRo0o4fOJEhtnWg1jvbdFWxXie+JvfMnvOnJTjmQ5i3vNycxkwYADDhw/n8MMmMmL4cKuNmaxZMQY1NTWMm3Ao9Q0NuJLQHkFPbvnbzfzlz39OSE/Edx98+CFXXPmXtPRG0JZQKESfPn149eWXLAafiPHaaeM3s2Yxbdo0Znz5FXV1dURVFUPMvXheVUGScLtcuFwu+vTuzeGHH8ZJJ5zIwIEDrDbvzxZFO9JT7xawiJFkhrXdv/wrPtiyhAJPFlFDz6ij4hkzhaxCvtfF6oad3LjoI55a/QOndxvGuT0OoqM/B1XXkCWJgMuLOZYS7EMWbxgGLlmhTm3mnqUzeGX0mciwT5k77CpbkWXcbjculyvpZhYLWCzoVO2yn7e5XC6r7GQMXvyerlx7W8Sz9s30/Y8/8tRTTzN33jyqq6vJzs7GH2PQwoLTsgX2DaPrOuvWr+fnJUuY8trr9O7Vi3PPPosz//AHvF5vK4KU6TjIGYxvJjAMgw0bN7Jq9Wo++eRTnv73M4wdM5rr/991DBo0yGLYydrXUigS8/nm22/z6GOPs3btWjweDz6fqa3bzxrtYyX+u6Kighdffpm33n6bUaNGcdkll3DYRJPRtxQAVVXF7XZz+OGH8f6HH+HxeOLOk+2QZZlAIMC80vlAYhOmaIeiKFRVVbF27Tr8fj+KoiTVxiRJIicYZMWqVVRWVtKxY0e0BPMhmK+mqiz5eSkerzfl3AkG/JsTT0KW5YRMJZHWHmpqYsqrr/Laa6+zZt06ws3NZGdn43G78Xm9cWNu74O9/6qq8v0PP/DVzJk88+yzjBo1ij9ffjmjSkri5sG+NlJB/G7ft3t6Hl9bV8c3337LjC+/5NnnnmPkQQdxzVV/YeTIka3GJlH7WtETlyupciH2Wks6Zd/jLpeLjRs3csedd1vPp9I5ZVkmEolQWFjIE4/+iy5duli0wF62fW63bdvG/Q8+yHvvf0BTUzOBgEmLslrsKdFGMT+GYbBi5UoWLlrEK1NeZdKko/nTpZfSp3dvq/1CaNtftfk2xcGLjmgxU/ZnW5fzyIpvyXX70TA197ZADKaq6/gUFzkuHxXNDdy6+BP+sfxrZElGkWQ03eDn2m24FSWlaW5vQJIkNEMn6PLy2dYVfLptObIsW05+DnbBbq4VxKuquoqbbrmFc879I1/NnImu6xQWFuLxeHatH11HjWlE4qNpmqWlCiLm8XgoLCwky+9n7dq13Py3Wznz7HOYV1oap8kmMvslb/Te6bskSXg8HoLBIAUFBei6xuczZnDyab/n2ef+YzHuZKZn8Z34PRqNcv0NN3D1tdeyZcsW8vLyLIFI9FNVVatM8bcYU5fLRUF+Pm63m++//55zzjuPK678C5s2bWrFVAQxHFVSQudOHQlHIkn7qes6brebn39eTFV1dSvBQvRF9GPpsmVs2bLFmu9kEG3eum0b6zdsMJ9NsMeEZra9spIly5bi9/lSMjlN18nKymLM6NFW21qag1tamn6aPZszzjyLv992O+vWryfL77fWrBiDRGtUVVXrv0W52dnZFBYWoqoaM2Z8yVnnnMsdd91FXV2dKajEBKmW7fqloMQsDQUFBWiaxtfffMNpp5/Bvx591GqbmMt91T572aqqcvMtt7Jl6xZ8aeYWTCFJ03X+8cD9DB48OM7foaXQrCgKC8rKOPnU03j9jTdxu93k5+dZTLnlnmq5rwB8Ph8FBQWEw2Fee+11fnfqaTwxebJVb7L9vb+gTQxeDKIiSdRGmrjj589jOdyFhrsbLZB2JZ7RMfApLrIUL12y8qxNsyPcwKZQDS5JZl9q77uaJCFhtun5tXOIaGqsjw6TF7BL+tFoFEVRmD1nDif99mReeOFFFEUmJyfHYv4txy0Rw0tUh6qqGIaBz+cjNy+X2XPmcObZ5/DE5Cctba1Nm2wv7kPDMCyiL8lmfzVN45bbbuPue++LY6wt144gMKKP113/V1548SVyc3Lxer1xgo7V9DTHEqqqAhAIBMjOzmbqtGn89pRT+fiTT+KEATFmffr0oX///oTD4ZRn1C6Xi+rqGubMmZNQW7FrTStXrqSmpiaplahlf3RNY+7cudb7ycpdvnw5mzdvTik4yLJMY2MjB40YQffu3VqtCztzF9//69FHOfucc1kwfz55eXnW8ZVYd4nanOhvAbEe5Nh6AHjy6X/zu1NPswRTu6f2f4Oe2NsYDAZxeTzcc9/9/O3WW6353VdM3l6+oig8+dRTzPjyK3JjeyedFa6hsZFbb76JYyZNamUqb6lsLFy4kAsvvoTyTZsoLCwEdu0Re5mp/lsIdrIsk5+fT1MoxN333sfZ553H6jVr4vbx/oiMGbxFpDAH4Zk1P/Fz7TayFPdeiVPftQkhamh08udYE7eusYqIrsVo8y8jKWmGQdDt5dsda/lp50Yz+Y60/561/JJoaQZzu918MWMG551/ARs2bCA/P9/aaHsKMd6C4QUCAXRd56577uWqa64h1BSyfs9IANvL+9C+HgRhycvN5fHJk3nhxZessDb7s/bxUxSFx554gjffeovi4uI4xt7WtWY/dzQMg7y8PHburOLSP11O6fz5ccKQGK/x48ejxwhrMoYmyzKhUIj58xck1OBFPyRJomzhIktAyfS4rqxsYasjCPGbINyzZn2XUeimoeuMHj3K0gYTMXcwCf2NN9/MfQ88iCTLZMfW1e6Ovb3NAoJhFeTns3LVKs4651ymv/++ZWq3r4NfEq3aCBQWFvLiSy/z1NNPt+kYoS2wz4HL5eKHH3/k0cefIBgMmP4MKYRMRVGoqa3l/PPO5aILL0zI3AU9kmWZ7du3c/W117F9
+3YCgQDRaLRV3zOB/XlVVXEpCoWFhXz++Rfccuutez2EcW8jcw1e2uXctaGxmmfW/ETA7UXby4tAkuLjTyVJok4N7/V6MmmHhISqa7xdHiNAxi+gxe+fgmAc7OZlRVGY8eWX/OnyKwiHw2RnZ1tS8t5dF/EhTPn5ebz2+hv87ZZb45hOujr35dzZGWcwEOD+Bx9k8eLFCWPXxdiVLVzIU089TW5ubpxmt6ftECbvpuYQvz/tVIYOGWIxHHsdRxx+GIFgMK32pCgKPy9ZQiQSsRhAyzpDoSZKS0vxer0ZjbOwDqxeu5bq6uq488xd82l+t6CsLDVjj62NQCDAIePHx2l0on12IfDmW27lxZdfiRNG9zZDE2Wpqkp2VhbRaJSrr7mWL2bMsNbEf9u8a2eMgUCARx59jLKFC1ut2b1Vl+hzVVUVf7v170Sj0Tjnw5YQa6SmpoYjDz+cv99yS0KHwJaC4YMPPcyy5csJBoNmGOteGmMDaG5upkOH9lx15V8sn4H9VfHLiMEbhoFkSBiS2ZGnVn/P9nA9bmnvnYmb1gGzNJ/sItftswatIRpGz9CBb+/BXDRuWeGnnevZGW5ElpI7n+zFavdrCAIptNUlS5ZwxZV/QdN1y/mtLUw2EYFLttntWp4423/t9Td5fPLkOIKUiihlMnd2Rmz/pGufeMcwDNxuF/X19Tz62ONW1IBdYxPE7rn//IfGUChlOJe9PeJYwh5n3bI9QuOpb2hg1MgS7rrjDrxer/W+vT8H9OjBgQMH0tTUlHRsdF0nOzubhYsWsXnzllZ9Ef+uXr2aysrKhAJAsnLdbjfbtm1j5apVSBJxTM8UgmTWr19P+aZNKR2wJMn0Y+jRozsjhg+3+moXFoRQ9ehjj/Piyy+Tn5cXd36eSoNsWVem60E8r2kaHo8HSZK4+trrWLpsWZy5fk8Z6d5Ysy6Xi1AoxGOPP9F235YM2yiOiG67406WL19OVnZWUmuPWMcNDQ0MGNCfRx/5Jz6feQNpS2uOXXhYtmw5H3z0UVrmLvpl31Op9pWop7m5mZtvvJGxY8dYwsb/hInevtBaEmENHQWZTaEaPtyyDL+8d0zzoi5JknAjUxVpon9OO0YXdjeTpECrZAy/FAzJwK+4WVNfxQ+V60Fi3zvb7ecM3i41Nzc3c+ttt9PY2IjP50vL3MW4Ce9qXdeJRCJWGJj4LZm5uGUbdF0nNzeHfz7yL7759ltcLleclro7sDPfSCRCOBwhHAlbJj57+1IxG1XVCQaDfDlzJstXrIhjNsKMuHbdOmZ+/Q3Z2dkpiZz4vqmpidraWmpr66ivr6epuTmOkYnnRVhku3btePihf5CVldWKEAmm4/P5GDNmTMq5s0ykNTUsWba0hYa9y+xdtrCMUCxGPFPzvNvtpqqqipUrV2IY8V7p4mMKFpuTWgZEn0OhEIdOmGA6xdr8Puxm4Z9+ms0TTz5JXixMMq3JP/a78BTXNI1IJGKGztnCOsWzqfqqaRper5fa2lquv+FGGhsb9zhjpL2NqqoRDoetPRUOhy2hJh0TEu3Lzs7m21mzWLBgQdyRzp5A1Cva8uprr/Pu1Kmmz4qaeN3Z13FBQQFPPv44RUVFcZEILQUY8d20996jrrY2o1wiuq5TX18f21e11Dc0EA5HWjnuCQGorq6Oc885h7PPOivOkvA/ocG3NGkJ2Afv063L2Riqxqu4MaQ9Z3SGYXrkS8DOSBND8zry+IjfUewLoOkm0Q+6vGae+19YSjKN9BIRQ6O0ehMSkqXF77M691NJUMAuhT//4ot8/8MPBAKBtGYwwSQwDOrq6qitrcXn81FUVES7du3Iipkwq6qqrPM18V4i2AUNTdO474EHaYwlydgToiQ2vcfjoUP79nTs2IGOHTqSl5eHpmlUVVdbhD3NSCFJEuFwmC9mzIjbQxbjWriQqqqq1A5psX5EImFKSkq49567ufuuOzjzD3+gb58+hEJNhGIWAPu46LrO3XfcQZ/evS2P35b9FM8eMn4cfr8/oTOkvUxJkvjxx5/iaIRd+11QtpBwJNImgdwwDDweD3PnlQJGnCAk2rxgQVnKs3FJktB0nUB2NoeMH2+Va7cESJJEczjMPffdRyQSSctY7cKoqqpUVVURjUbJysqiXbt25MciFurr66mrqwPImIkGAgHmzZvHc88/n1ZYzASij7m5OXTp0pkO7dvToX17OnXqhM/ns9qezrIiBJn6+no+/2KGNQ570jbhswVmVMySpUu557778Pv9ZrkphFohgN1/773079/fWsfJhDxFUYhEIpZwIvqUrHxN03C73Zx7zjk8+sg/ufuuO/ntSSfSoX076uvrLTpkMneF2tpaRo0q4bZbb0HT9Lg9tL8iLkDULvHaF79gwqqu8fHWZXgkF0aM/e0uRPkuWaZRNbWjC3uWcPOgIyn2Bojqu2Ib8zx+6yrXPamzrZAw/Q78iovZlRvjctPvszr3U0kQdjnVybLMpk2beObZ5yynt0yYeygUwuP1csLxxzNx4qEM6N+fosJCFJeLHTt2UFFRQen8BUydOo1NWzYTDARarUU7BBEIZGezoKyMt995hz+ee66ZhlNKbu5ONXvCMjF48IFMfuyxmFkYautqKS8v5+tvvuXtd94hFGrC50vuTNaSKV579dWtGOLCRYvSMhpFMj2HTzj+OJ547LG4mO5IJMLnn3/B5KefoqxsoRm37fFQVVXF9dddx3HHHWtpPIn2tNA+RgwfzgE9erAmFnufDLIss6CsjHA4bJnLxdzW1dWxfPlyfF6vZXnLBEKYWrhoEdFoNC6cUsSxzy2dlzKESsxZzx49rGRDdg1MtPH1119n7rxS8vJy01osJEkCyYwb79a1K5decjHDhw2nfft2FBcXE24Os6NyB+Xl5Xzy6Wd8NXMmqqpa1pJkZVtrNpDN8y+8yCm/+x2dO3e29tXu7H+Rf+CmG/7KqaecYlpRZBlZUdi0aROzZn3HlNdeY8vmLSlN4gIul8uaZzEfu9MuwzCQbFarxlCIm27+G3V1dQQCgaTjJKxStbW13HnH7Rx7zCSi0SgulyspPRDrpbKyko2byi1Hz0QQ72qaxj8euJ8Tjj/e+u3C889nx44dvPHWWzz/wotUVFSQm5tLc3MzHTt25J8PPWRZK2U5dZbE/QFxYv3mzZuJJIiJFVJYRXMDP1VuQJbicxhnyvDsz7lkGU3XqQ430SdQxH9G/Z5HRvyWYm/ASoAjmHknf05Mg//lfdAMycxRv6ahkrpoc4xB/N9j8HbtU5IkXp4yhW3btmUU7yy0gn59+/LS88/zzNNPceYZZzB82DC6du1Kp44dGTpkCEcfdRQ33fBX3ps2lXPPPpvm5ua4ehNBaG8+r5fXXns9lkZ0lxaY6PlUIyzq8rg9tG/fnuLiYtq1KzYzWh12GHfefhtvv/EGvXr2oLm5Oa0mrygK23fsoKa2tpX2UVtba9WZDLph4Ha7Oe3UU60kH1rM29vj8XDCCcczfepU/n7rLXi9XrZs2cIJxx/P1Vf9JS6HdktCJL4TZvqJhx5qpQlNpiF
<base64 PNG image data omitted>)" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "a03cd759", 14 | "metadata": {}, 15 | "source": [ 16 | "# Feature Pipeline using Synthetic Data\n", 17 | "\n", 18 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/featurestoreorg/serverless-ml-course/blob/main/nbs/01-module/1a_feature_groups.ipynb)\n", 19 | "\n", 20 | "**Note**: you may get an error when installing hopsworks on Colab, and it is safe to ignore it.\n", 21 | "\n", 22 | "## 🗒️ This notebook is divided into 2 sections:\n", 23 | "1. Read the synthetic credit card data and engineer features,\n", 24 | "2. 
Write the Pandas DataFrames to the feature groups in the feature store.\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "6b730276", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "#!pip install -U hopsworks --quiet\n", 35 | "#!pip install -U faker --quiet" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "id": "aeac6fdc", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "import pandas as pd\n", 46 | "import datetime\n", 47 | "import hopsworks\n", 48 | "from sml import synthetic_data\n", 49 | "import random\n", 50 | "pd.options.mode.chained_assignment = None" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "71c2d6dd", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "start_time = (datetime.datetime.now() - datetime.timedelta(hours=24)).strftime(\"%Y-%m-%d %H:%M:%S\")\n", 61 | "print(start_time)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "id": "ca60a7d2", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "#end_time = (datetime.datetime.now() - datetime.timedelta(hours=24)).strftime(\"%Y-%m-%d %H:%M:%S\")\n", 72 | "end_time = datetime.datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n", 73 | "print(end_time)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "id": "e060fdaa", 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "synthetic_data.FRAUD_RATIO = random.uniform(0.001, 0.005)\n", 84 | "synthetic_data.TOTAL_UNIQUE_USERS = 1000\n", 85 | "synthetic_data.TOTAL_UNIQUE_TRANSACTIONS = 54000\n", 86 | "synthetic_data.CASH_WITHRAWAL_CARDS_TOTAL = 2000 \n", 87 | "synthetic_data.TOTAL_UNIQUE_CASH_WITHDRAWALS = 1200\n", 88 | "synthetic_data.START_DATE=start_time\n", 89 | "synthetic_data.END_DATE=end_time\n", 90 | "\n", 91 | "credit_cards = synthetic_data.generate_list_credit_card_numbers()\n", 92 | "credit_cards_df = synthetic_data.create_credit_cards_as_df(credit_cards)\n", 93 | "profiles_df = synthetic_data.create_profiles_as_df(credit_cards)\n", 94 | "trans_df = synthetic_data.create_transactions_as_df(credit_cards)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "id": "0ba601c9", 100 | "metadata": {}, 101 | "source": [ 102 | "## 🛠️ Feature Engineering \n", 103 | "\n", 104 | "Fraudulent transactions can differ from regular ones in many different ways. Typical red flags would for instance be a large transaction volume/frequency in the span of a few hours. It could also be the case that elderly people in particular are targeted by fraudsters. To facilitate model learning you will create additional features based on these patterns. In particular, you will create two types of features:\n", 105 | "1. **Features that aggregate data from different data sources**. This could for instance be the age of a customer at the time of a transaction, which combines the `birthdate` feature from `profiles.csv` with the `datetime` feature from `transactions.csv`.\n", 106 | "2. **Features that aggregate data from multiple time steps**. An example of this could be the transaction frequency of a credit card in the span of a few hours, which is computed using a window function.\n", 107 | "\n", 108 | "Let's start with the first category." 
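To make the first feature type concrete, here is a minimal, self-contained sketch of combining a profile table with a transaction table to derive the age of the card owner at transaction time. The values are toy data only; the course's actual implementation is `card_owner_age` in `src/02-module/sml/cc_features.py`, shown later in this file listing.

```python
import pandas as pd
import numpy as np

# Toy stand-ins for profiles_df and trans_df from the notebook above.
profiles = pd.DataFrame({"cc_num": [1111], "birthdate": pd.to_datetime(["1950-06-01"])})
trans = pd.DataFrame({"cc_num": [1111, 1111],
                      "datetime": pd.to_datetime(["2022-01-01", "2022-02-01"])})

# Join the two sources on the credit card number, then derive the new feature.
merged = trans.merge(profiles, on="cc_num", how="left")
trans["age_at_transaction"] = (merged["datetime"] - merged["birthdate"]) / np.timedelta64(1, "Y")
print(trans["age_at_transaction"])  # ~71.6 and ~71.7 years
```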
109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "id": "741f6f20", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "fraud_labels = trans_df.copy()[[\"tid\", \"cc_num\", \"datetime\", \"fraud_label\"]]\n", 119 | "fraud_labels" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "id": "4fd45117", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "from sml import cc_features\n", 130 | "\n", 131 | "fraud_labels.datetime = fraud_labels.datetime.map(lambda x: cc_features.date_to_timestamp(x))\n", 132 | "fraud_labels" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "46abff7f", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "trans_df" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "fcadef7c", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "trans_df.drop(['fraud_label'], inplace = True, axis=1)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "id": "3d242a03", 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "trans_df = cc_features.card_owner_age(trans_df, profiles_df)\n", 163 | "trans_df = cc_features.expiry_days(trans_df, credit_cards_df)\n", 164 | "trans_df = cc_features.activity_level(trans_df, 1)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "id": "9884c382", 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "window_len = 4\n", 175 | "window_aggs_df = cc_features.aggregate_activity_by_hour(trans_df, window_len)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "id": "3acfeba9", 181 | "metadata": {}, 182 | "source": [ 183 | "Next, you will create features that aggregate data from multiple time steps for each credit card.\n", 184 | "\n", 185 | "You will start by computing the distance between consecutive transactions; let's call it `loc_delta`.\n", 186 | "Here you will use the [Haversine distance](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.haversine_distances.html?highlight=haversine#sklearn.metrics.pairwise.haversine_distances) to quantify the distance between two longitude and latitude coordinates." 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "id": "deda57e8", 192 | "metadata": {}, 193 | "source": [ 194 | "Next, let's compute windowed aggregates. Here you will use 4-hour windows, but feel free to experiment with different window lengths by setting `window_len` below to a value of your choice." 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "id": "cf3ce8b0", 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "project = hopsworks.login()\n", 205 | "fs = project.get_feature_store()" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "id": "9c8c224e", 211 | "metadata": {}, 212 | "source": [ 213 | "To create a feature group you need to give it a name and specify a primary key. It is also good to provide a description of the contents of the feature group and a version number; if the version is not defined, it will automatically be set to `1`."
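As an illustration of that API, creating a feature group with an explicit name, primary key, event-time column, description and version might look like the sketch below. The names and parameter values here are assumptions for illustration; the notebook's actual cells that follow instead fetch pre-created version-2 feature groups with `fs.get_feature_group`.

```python
# Hypothetical sketch; the cells below fetch existing feature groups instead.
trans_fg = fs.get_or_create_feature_group(
    name="cc_trans_fraud",          # unique name within the feature store
    version=1,                      # if omitted, defaults to 1
    primary_key=["cc_num"],         # key used for joins and lookups
    event_time="datetime",          # column marking when the event happened
    description="Credit card transaction features for fraud detection",
)
trans_fg.insert(trans_df, write_options={"wait_for_job": False})
```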
214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "id": "32d5258d", 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "trans_fg = fs.get_feature_group(name=\"cc_trans_fraud\", version=2)\n", 224 | "trans_fg.insert(trans_df, write_options={\"wait_for_job\" : False})" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "id": "d81ed056", 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "window_aggs_fg = fs.get_feature_group(name=f\"cc_trans_fraud_{window_len}h\", version=2)\n", 235 | "window_aggs_fg.insert(window_aggs_df, write_options={\"wait_for_job\" : False})" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "id": "4167a4a7", 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "\n", 246 | "labels_fg = fs.get_feature_group(name=\"transactions_fraud_label\", version=2)\n", 247 | "labels_fg.insert(fraud_labels)" 248 | ] 249 | } 250 | ], 251 | "metadata": { 252 | "interpreter": { 253 | "hash": "e1ddeae6eefc765c17da80d38ea59b893ab18c0c0904077a035ef84cfe367f83" 254 | }, 255 | "kernelspec": { 256 | "display_name": "Python 3 (ipykernel)", 257 | "language": "python", 258 | "name": "python3" 259 | }, 260 | "language_info": { 261 | "codemirror_mode": { 262 | "name": "ipython", 263 | "version": 3 264 | }, 265 | "file_extension": ".py", 266 | "mimetype": "text/x-python", 267 | "name": "python", 268 | "nbconvert_exporter": "python", 269 | "pygments_lexer": "ipython3", 270 | "version": "3.9.7" 271 | } 272 | }, 273 | "nbformat": 4, 274 | "nbformat_minor": 5 275 | } 276 | -------------------------------------------------------------------------------- /src/02-module/scripts/run-fraud-feature-pipelines.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | cd src/02-module 6 | 7 | jupyter nbconvert --to notebook --execute 2_cc_feature_pipeline.ipynb 8 | 9 | -------------------------------------------------------------------------------- /src/02-module/sml/cc_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from datetime import datetime, date 5 | from math import radians 6 | 7 | # + 8 | def card_owner_age(trans_df : pd.DataFrame, profiles_df : pd.DataFrame)-> pd.DataFrame: 9 | """Used only in feature pipelines (not online inference). 10 | Unit test with DataFrames and sample data. 11 | """ 12 | age_df = trans_df.merge(profiles_df, on="cc_num", how="left") 13 | trans_df["age_at_transaction"] = (age_df["datetime"] - age_df["birthdate"]) / np.timedelta64(1, "Y") 14 | return trans_df 15 | 16 | def expiry_days(trans_df : pd.DataFrame, credit_cards_df : pd.DataFrame)-> pd.DataFrame: 17 | """Used only in feature pipelines (not online inference). 18 | Unit test with DataFrames and sample data. 
19 | """ 20 | card_expiry_df = trans_df.merge(credit_cards_df, on="cc_num", how="left") 21 | card_expiry_df["expires"] = pd.to_datetime(card_expiry_df["expires"], format="%m/%y") 22 | trans_df["days_until_card_expires"] = (card_expiry_df["expires"] - card_expiry_df["datetime"]) / np.timedelta64(1, "D") 23 | return trans_df 24 | 25 | 26 | # - 27 | 28 | def haversine_distance(long: float, lat: float, prev_long: float, prev_lat: float)-> float: 29 | """Compute the Haversine distance between consecutive (long, lat) coordinates. Series inputs 30 | are assumed to already be in radians (activity_level converts them); scalars are converted here.""" 31 | if isinstance(long, pd.Series): 32 | pass  # already in radians 33 | else: 34 | long = radians(long) 35 | 36 | if isinstance(lat, pd.Series): 37 | pass  # already in radians 38 | else: 39 | lat = radians(lat) 40 | 41 | if isinstance(prev_long, pd.Series): 42 | pass  # already in radians 43 | else: 44 | prev_long = radians(prev_long) 45 | 46 | if isinstance(prev_lat, pd.Series): 47 | pass  # already in radians 48 | else: 49 | prev_lat = radians(prev_lat) 50 | 51 | long_diff = prev_long - long 52 | lat_diff = prev_lat - lat 53 | 54 | a = np.sin(lat_diff/2.0)**2 55 | b = np.cos(lat) * np.cos(prev_lat) * np.sin(long_diff/2.0)**2 56 | c = 2*np.arcsin(np.sqrt(a + b)) 57 | 58 | return c 59 | 60 | 61 | def time_delta(prev_datetime: int, current_datetime: int)-> int: 62 | """Compute the time difference between consecutive transactions.""" 63 | return prev_datetime - current_datetime 64 | 65 | def time_delta_to_days(time_delta: datetime)-> float: 66 | """Convert a timedelta to fractional days.""" 67 | return time_delta.total_seconds() / 86400 68 | 69 | def date_to_timestamp(date_obj: datetime)-> int: 70 | return int(date_obj.timestamp() * 1000) 71 | 72 | def timestamp_to_date(timestamp: int)-> datetime: 73 | return datetime.fromtimestamp(timestamp // 1000) 74 | 75 | def activity_level(trans_df : pd.DataFrame, lag: int)-> pd.DataFrame: 76 | 77 | # Convert coordinates into radians: 78 | trans_df[["longitude", "latitude"]] = trans_df[["longitude", "latitude"]].applymap(radians) 79 | 80 | trans_df.sort_values(["datetime", "cc_num"], inplace=True) 81 | 82 | # When we call `haversine_distance`, we want to pass as params the long/lat of the current row and the long/lat of its 83 | # neighbouring transaction on the same card. By grouping the DF by cc_num (the boundary row, which has no neighbour, becomes NaN and is filled with 0 at the end), 84 | # we can access that neighbouring row's lat/long using Pandas' `shift` operation, which returns the column shifted by `lag` rows (long/lat).
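# For example, with lag=1 and one card's transactions sorted by time:
#     longitude:           [10.00, 10.12, 10.35]
#     longitude.shift(-1): [10.12, 10.35,   NaN]
# so each row is paired with the coordinates of its neighbouring transaction,
# and the NaN produced at the group boundary is filled with 0 below.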
85 | trans_df[f"loc_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\ 86 | .apply(lambda x: haversine_distance(x["longitude"], x["latitude"], x["longitude"].shift(-lag), x["latitude"].shift(-lag)))\ 87 | .reset_index(level=0, drop=True)\ 88 | .fillna(0) 89 | 90 | # Use the same `shift` operation in Pandas to get the neighbouring row for a given cc_num 91 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df.groupby("cc_num")\ 92 | .apply(lambda x: time_delta(x["datetime"].shift(-lag), x["datetime"]))\ 93 | .reset_index(level=0, drop=True) 94 | # .fillna(0) # handle the boundary datetime, which has no neighbouring row when you call `shift` 95 | 96 | # Convert time_delta from a Timedelta to fractional days 97 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].map(lambda x: time_delta_to_days(x)) 98 | trans_df[f"time_delta_t_minus_{lag}"] = trans_df[f"time_delta_t_minus_{lag}"].fillna(0) 99 | trans_df = trans_df[["tid","datetime","cc_num","category", "amount", "city", "country", "age_at_transaction"\ 100 | ,"days_until_card_expires", f"loc_delta_t_minus_{lag}", f"time_delta_t_minus_{lag}"]] 101 | # Convert datetime to timestamp, because of a problem with UTC. Hopsworks assumes you use UTC, but if you don't use UTC 102 | # in your Python environment, the datetime will be wrong. With timestamps, we don't have the UTC problems when performing PIT Joins. 103 | trans_df.datetime = trans_df.datetime.map(lambda x: date_to_timestamp(x)) 104 | return trans_df 105 | 106 | 107 | def aggregate_activity_by_hour(trans_df : pd.DataFrame, window_len)-> pd.DataFrame: 108 | # `window_len` is a row-count window over the time-sorted transactions of each card. 109 | cc_group = trans_df[["cc_num", "amount", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime") 110 | 111 | # Moving average of transaction volume. 112 | df_mavg = pd.DataFrame(cc_group.mean()) 113 | df_mavg.columns = ["trans_volume_mavg", "datetime"] 114 | df_mavg = df_mavg.reset_index(level=["cc_num"]) 115 | df_mavg = df_mavg.drop(columns=["cc_num", "datetime"]) 116 | df_mavg = df_mavg.sort_index() 117 | 118 | # Moving standard deviation of transaction volume. 119 | df_std = pd.DataFrame(cc_group.std()) 120 | df_std.columns = ["trans_volume_mstd", "datetime"] 121 | df_std = df_std.reset_index(level=["cc_num"]) 122 | df_std = df_std.drop(columns=["cc_num", "datetime"]) 123 | df_std = df_std.fillna(0) 124 | df_std = df_std.sort_index() 125 | window_aggs_df = df_std.merge(df_mavg,left_index=True, right_index=True) 126 | 127 | # Moving count of transactions (frequency). 128 | df_count = pd.DataFrame(cc_group.count()) 129 | df_count.columns = ["trans_freq", "datetime"] 130 | df_count = df_count.reset_index(level=["cc_num"]) 131 | df_count = df_count.drop(columns=["cc_num", "datetime"]) 132 | df_count = df_count.sort_index() 133 | window_aggs_df = window_aggs_df.merge(df_count,left_index=True, right_index=True) 134 | 135 | # Moving average of location difference between consecutive transactions.
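# As a concrete example of these rolling aggregates: for one card with
# amounts [10, 12, 8, 50, 9] and window_len=4, the rolling mean is NaN for
# the first three rows (fewer than 4 observations), (10+12+8+50)/4 = 20.0
# at the fourth row, and (12+8+50+9)/4 = 19.75 at the fifth.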
136 | cc_group = trans_df[["cc_num", "loc_delta_t_minus_1", "datetime"]].groupby("cc_num").rolling(window_len, on="datetime").mean() 137 | df_loc_delta_mavg = pd.DataFrame(cc_group) 138 | df_loc_delta_mavg.columns = ["loc_delta_mavg", "datetime"] 139 | df_loc_delta_mavg = df_loc_delta_mavg.reset_index(level=["cc_num"]) 140 | df_loc_delta_mavg = df_loc_delta_mavg.drop(columns=["cc_num", "datetime"]) 141 | df_loc_delta_mavg = df_loc_delta_mavg.sort_index() 142 | window_aggs_df = window_aggs_df.merge(df_loc_delta_mavg,left_index=True, right_index=True) 143 | 144 | window_aggs_df = window_aggs_df.merge(trans_df[["cc_num", "datetime"]].sort_index(),left_index=True, right_index=True) 145 | 146 | return window_aggs_df 147 | -------------------------------------------------------------------------------- /src/02-module/sml/synthetic_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | # %% 4 | from collections import defaultdict 5 | from faker import Faker 6 | import pandas as pd 7 | import numpy as np 8 | import datetime 9 | import hashlib 10 | import random 11 | import math 12 | import os 13 | import bisect 14 | from typing import Optional, Union, Any, Dict, List, TypeVar, Tuple 15 | 16 | # Seed for Reproducibility 17 | faker = Faker() 18 | faker.seed_locale('en_US', 0) 19 | 20 | 21 | def set_random_seed(seed: int): 22 | random.seed(seed) 23 | np.random.seed(seed) 24 | faker.seed_instance(seed) 25 | 26 | set_random_seed(12345) 27 | 28 | 29 | TOTAL_UNIQUE_USERS = 1000 30 | TOTAL_UNIQUE_TRANSACTIONS = 54000 31 | CASH_WITHRAWAL_CARDS_TOTAL = 2000 32 | TOTAL_UNIQUE_CASH_WITHDRAWALS = 1200 33 | ATM_WITHRAWAL_SEQ_LENGTH = [3, 4, 5, 6, 7, 8, 9, 10] 34 | NORMAL_ATM_RADIUS = 0.01 35 | START_DATE = '2022-01-01 00:00:00' 36 | END_DATE = '2022-03-01 00:00:00' 37 | DATE_FORMAT = '%Y-%m-%d %H:%M:%S' 38 | 39 | AMOUNT_DISTRIBUTION_PERCENTAGES = { 40 | 0.05: (0.01, 1.01), 41 | 0.075: (1, 11.01), 42 | 0.525: (10, 100.01), 43 | 0.25: (100, 1000.01), 44 | 0.099: (1000, 10000.01), 45 | 0.001: (10000, 30000.01) 46 | } 47 | 48 | CATEGORY_PERC_PRICE = { 49 | "Grocery": (0.5, 0.01, 100), 50 | "Restaurant/Cafeteria": (0.2, 1, 100), 51 | "Health/Beauty": (0.1, 10, 500.01), 52 | "Domestic Transport": (0.1, 10, 100.01), 53 | "Clothing": (0.05, 10, 2000.01), 54 | "Electronics": (0.02, 100, 10000.01), 55 | "Sports/Outdoors": (0.015, 10, 100.01), 56 | "Holliday/Travel": (0.014, 10, 100.01), 57 | "Jewelery": (0.001, 10, 100.01) 58 | } 59 | 60 | FRAUD_RATIO = 0.0025 # percentage of transactions that are fraudulent 61 | NUMBER_OF_FRAUDULENT_TRANSACTIONS = int(FRAUD_RATIO * TOTAL_UNIQUE_TRANSACTIONS) 62 | ATTACK_CHAIN_LENGTHS = [3, 4, 5, 6, 7, 8, 9, 10] 63 | 64 | SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE = { 65 | 0.055: (17, 24), 66 | 0.0015: (24, 34), 67 | 0.0015: (34, 44), 68 | 0.02: (44, 54), 69 | 0.022: (54, 64), 70 | 0.1: (64, 74), 71 | 0.40: (74, 84), 72 | 0.40: (84, 100), 73 | } 74 | 75 | 76 | 77 | def generate_unique_credit_card_numbers(n: int) -> pd.Series: 78 | """.""" 79 | cc_ids = set() 80 | for _ in range(n): 81 | cc_id = faker.credit_card_number(card_type='visa') 82 | cc_ids.add(cc_id) 83 | return pd.Series(list(cc_ids)) 84 | 85 | # write a pytest - assert len(credit_card_numbers) == TOTAL_UNIQUE_USERS 86 | # assert len(credit_card_numbers[0]) == 16 # validate if generated number is 16-digit 87 | 88 | def generate_list_credit_card_numbers() -> list: 89 | """.""" 90 | credit_cards = [] 91 | credit_card_numbers = 
generate_unique_credit_card_numbers(TOTAL_UNIQUE_USERS) 92 | delta_time_object = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 93 | delta_time_object = delta_time_object + datetime.timedelta(days=-728)  # start the expiry window ~2 years before START_DATE 94 | for cc_num in credit_card_numbers: 95 | credit_cards.append({'cc_num': cc_num, 'provider': 'visa', 'expires': faker.credit_card_expire(start=delta_time_object, end="+5y", date_format="%m/%y")}) 96 | return credit_cards 97 | 98 | def generate_df_with_profiles(credit_cards : list)-> pd.DataFrame: 99 | """Generate a Faker profile (name, sex, mail, birthdate, location) for each credit card.""" 100 | profiles = [] 101 | for credit_card in credit_cards: 102 | address = faker.local_latlng(country_code = 'US') 103 | age = 0 104 | profile = None 105 | while age < 18 or age > 100: 106 | profile = faker.profile(fields=['name', 'sex', 'mail', 'birthdate']) 107 | dday = profile['birthdate'] 108 | delta = datetime.datetime.now() - datetime.datetime(dday.year, dday.month, dday.day) 109 | age = int(delta.days / 365) 110 | profile['City'] = address[2] 111 | profile['Country'] = address[3] 112 | profile['cc_num'] = credit_card['cc_num'] 113 | credit_card['age'] = age 114 | profiles.append(profile) 115 | 116 | # Cast the columns to the correct Pandas DType 117 | profiles_df = pd.DataFrame.from_records(profiles) 118 | profiles_df['birthdate']= pd.to_datetime(profiles_df['birthdate']) 119 | profiles_df['cc_num']= pd.to_numeric(profiles_df['cc_num']) 120 | 121 | return profiles_df 122 | 123 | # pytest - assert len(timestamps) == TOTAL_UNIQUE_TRANSACTIONS 124 | def generate_timestamps(n: int) -> list: 125 | """Return a list of timestamps of length 'n'.""" 126 | start = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 127 | end = datetime.datetime.strptime(END_DATE, DATE_FORMAT) 128 | timestamps = list() 129 | for _ in range(n): 130 | timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None).strftime(DATE_FORMAT) 131 | timestamps.append(timestamp) 132 | timestamps = sorted(timestamps) 133 | return timestamps 134 | 135 | def get_random_transaction_amount(start: float, end: float) -> float: 136 | """Return a random transaction amount in [start, end), rounded to 2 decimals.""" 137 | amt = round(np.random.uniform(start, end), 2) 138 | return amt 139 | 140 | def generate_amounts() -> list: 141 | """Generate transaction amounts following AMOUNT_DISTRIBUTION_PERCENTAGES.""" 142 | amounts = [] 143 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 144 | n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage) 145 | start, end = span 146 | for _ in range(n): 147 | amounts.append(get_random_transaction_amount(start, end+1)) 148 | return amounts 149 | 150 | def generate_categories(amounts) -> list: 151 | """Assign a spending category to each amount according to CATEGORY_PERC_PRICE.""" 152 | categories = [] 153 | for category, category_perc_price in CATEGORY_PERC_PRICE.items(): 154 | percentage, min_price, max_price = category_perc_price 155 | n = int(TOTAL_UNIQUE_TRANSACTIONS * percentage) 156 | for _ in range(n): 157 | min_price_i = bisect.bisect_left(amounts, min_price) 158 | max_price_i = bisect.bisect_right(amounts, max_price, lo=min_price_i) 159 | categories.append({"category":category, "amount":random.choice(amounts[min_price_i:max_price_i])}) 160 | 161 | random.shuffle(categories) 162 | return categories 163 | 164 | def generate_transaction_id(timestamp: str, credit_card_number: str, transaction_amount: float) -> str: 165 | """Create a deterministic transaction id by hashing timestamp, card number and amount.""" 166 | hashable = f'{timestamp}{credit_card_number}{transaction_amount}' 167 | hexdigest = hashlib.md5(hashable.encode('utf-8')).hexdigest() 168 | return hexdigest 169 | 170 | def generate_transactions(credit_card_numbers: list, timestamps: list, categories: list) -> list: 171 | """Combine timestamps, categories and randomly chosen cards into transaction records.""" 172 | transactions = [] 173 | for timestamp, category in zip(timestamps, 
categories): 174 | credit_card_number = random.choice(credit_card_numbers) 175 | point_of_tr = faker.local_latlng(country_code = 'US') 176 | transaction_id = generate_transaction_id(timestamp, credit_card_number, category['amount']) 177 | transactions.append({ 178 | 'tid': transaction_id, 179 | 'datetime': timestamp, 180 | 'cc_num': credit_card_number, 181 | 'category': category['category'], 182 | 'amount': category['amount'], 183 | 'latitude': point_of_tr[0], 184 | 'longitude': point_of_tr[1], 185 | 'city': point_of_tr[2], 186 | 'country': point_of_tr[3], 187 | 'fraud_label': 0 188 | } 189 | ) 190 | return transactions 191 | 192 | def generate_cash_amounts() -> list: 193 | """.""" 194 | cash_amounts = [] 195 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 196 | n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage) 197 | start, end = span 198 | for _ in range(n): 199 | cash_amounts.append(get_random_transaction_amount(start, end+1)) 200 | return cash_amounts 201 | 202 | def generate_chains(): 203 | """.""" 204 | visited = set() 205 | chains = defaultdict(list) 206 | 207 | def size(chains: dict) -> int: 208 | counts = {key: len(values)+1 for (key, values) in chains.items()} 209 | return sum(counts.values()) 210 | 211 | 212 | def generate_attack_chain(i: int): 213 | chain_length = random.choice(ATTACK_CHAIN_LENGTHS) 214 | for j in range(1, chain_length): 215 | if i+j not in visited: 216 | if size(chains) == NUMBER_OF_FRAUDULENT_TRANSACTIONS: 217 | break 218 | chains[i].append(i+j) 219 | visited.add(i+j) 220 | 221 | while size(chains) < NUMBER_OF_FRAUDULENT_TRANSACTIONS: 222 | i = random.choice(range(TOTAL_UNIQUE_TRANSACTIONS)) 223 | if i not in visited: 224 | generate_attack_chain(i) 225 | visited.add(i) 226 | return chains 227 | 228 | def generate_atm_withdrawal(credit_card_number: str, cash_amounts: list, length: int, \ 229 | delta: int, radius: float = None, country_code = 'US') -> List[Dict]: 230 | """.""" 231 | atms = [] 232 | if length < 0: 233 | raise Exception('Length must be > 0') 234 | 235 | start = datetime.datetime.strptime(START_DATE, DATE_FORMAT) 236 | end = datetime.datetime.strptime(END_DATE, DATE_FORMAT) 237 | timestamp = faker.date_time_between(start_date=start, end_date=end, tzinfo=None) 238 | point_of_tr = faker.local_latlng(country_code = country_code) 239 | latitude = point_of_tr[0] 240 | longitude = point_of_tr[1] 241 | city = point_of_tr[2] 242 | for _ in range(length): 243 | current = timestamp + datetime.timedelta(hours=delta) 244 | if radius is not None: 245 | latitude = faker.coordinate(latitude, radius) 246 | longitude = faker.coordinate(longitude, radius) 247 | amount = random.sample(cash_amounts, 1)[0] 248 | transaction_id = generate_transaction_id(timestamp, credit_card_number, amount) 249 | atms.append({'tid': transaction_id, 250 | 'datetime': current.strftime(DATE_FORMAT), 251 | 'cc_num': credit_card_number, 252 | 'category': 'Cash Withdrawal', 253 | 'amount': amount, 254 | 'latitude': latitude, 255 | 'longitude': longitude, 256 | 'city': city, 257 | 'country': 'US', 258 | 'fraud_label': 0 259 | }) 260 | timestamp = current 261 | return atms 262 | 263 | def generate_susceptible_cards(credit_cards: list) -> list: 264 | """.""" 265 | susceptible_cards = [] 266 | visited_cards = [] 267 | for percentage, span in SUSCEPTIBLE_CARDS_DISTRIBUTION_BY_AGE.items(): 268 | n = int(TOTAL_UNIQUE_CASH_WITHDRAWALS * percentage) ## TODO: here total expected fraud 269 | start, end = span 270 | for _ in range(n): 271 | for card in credit_cards: 272 | if 
card['age'] > start and card['age'] < end: 273 | if card['cc_num'] not in visited_cards: 274 | current = card 275 | visited_cards.append(card['cc_num']) 276 | break 277 | else: 278 | current = None 279 | if current is not None: 280 | susceptible_cards.append(current) 281 | return susceptible_cards 282 | 283 | def generate_normal_atm_withdrawals(cash_amounts: list, susceptible_cards: list) -> list: 284 | """.""" 285 | normal_atm_withdrawals = [] 286 | atm_transactions = len(cash_amounts) 287 | cash_withdrawal_cards = random.sample(susceptible_cards, CASH_WITHRAWAL_CARDS_TOTAL//(CASH_WITHRAWAL_CARDS_TOTAL//len(susceptible_cards)+1)) 288 | atm_count = 0 289 | while atm_count < atm_transactions: 290 | for card in cash_withdrawal_cards: 291 | for ATM_WITHRAWAL_SEQ in ATM_WITHRAWAL_SEQ_LENGTH: 292 | # interval in hours between normal cash withdrawals 293 | delta = random.randint(6, 168) 294 | atm_tr = generate_atm_withdrawal(credit_card_number = card['cc_num'], cash_amounts = cash_amounts, length=ATM_WITHRAWAL_SEQ, delta=delta, radius = NORMAL_ATM_RADIUS) 295 | normal_atm_withdrawals.append(atm_tr) 296 | atm_count += ATM_WITHRAWAL_SEQ 297 | return normal_atm_withdrawals 298 | 299 | 300 | def generate_timestamps_for_fraud_attacks(timestamp: str, chain_length: int) -> list: 301 | """.""" 302 | timestamps = [] 303 | timestamp = datetime.datetime.strptime(timestamp, DATE_FORMAT) 304 | for _ in range(chain_length): 305 | # interval in seconds between fraudulent attacks 306 | delta = random.randint(30, 120) 307 | current = timestamp + datetime.timedelta(seconds=delta) 308 | timestamps.append(current.strftime(DATE_FORMAT)) 309 | timestamp = current 310 | return timestamps 311 | 312 | def generate_amounts_for_fraud_attacks(chain_length: int) -> list: 313 | """.""" 314 | amounts = [] 315 | for percentage, span in AMOUNT_DISTRIBUTION_PERCENTAGES.items(): 316 | n = math.ceil(chain_length * percentage) 317 | start, end = span 318 | for _ in range(n): 319 | amounts.append(get_random_transaction_amount(start, end+1)) 320 | return amounts[:chain_length] 321 | 322 | 323 | def update_transactions(transactions: list, chains: list) -> list: 324 | """.""" 325 | for key, chain in chains.items(): 326 | transaction = transactions[key] 327 | timestamp = transaction['datetime'] 328 | cc_num = transaction['cc_num'] 329 | amount = transaction['amount'] 330 | transaction['fraud_label'] = 1 331 | inject_timestamps = generate_timestamps_for_fraud_attacks(timestamp, len(chain)) 332 | inject_amounts = generate_amounts_for_fraud_attacks(len(chain)) 333 | random.shuffle(inject_amounts) 334 | for i, idx in enumerate(chain): 335 | original_transaction = transactions[idx] 336 | inject_timestamp = inject_timestamps[i] 337 | original_transaction['datetime'] = inject_timestamp 338 | original_transaction['fraud_label'] = 1 339 | original_transaction['cc_num'] = cc_num 340 | original_transaction['amount'] = inject_amounts[i] 341 | original_transaction['category'] = [category for category, category_perc_price in CATEGORY_PERC_PRICE.items() if int(inject_amounts[i]) in range(int(category_perc_price[1]), int(category_perc_price[2]))][0] 342 | original_transaction['tid'] = generate_transaction_id(inject_timestamp, cc_num, amount) 343 | transactions[idx] = original_transaction 344 | 345 | def generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals: list) -> list: 346 | """.""" 347 | return random.sample([i for i in range(0, len(normal_atm_withdrawals))], \ 348 | int(FRAUD_RATIO * len(normal_atm_withdrawals))) 349 | 350 | def 
update_normal_atm_withdrawals(fraudulent_atm_tr_indxs :list, normal_atm_withdrawals :list,\ 351 | cash_amounts: list): 352 | """.""" 353 | for fraudulent_atm_tr_indx in fraudulent_atm_tr_indxs: 354 | # interval in seconds between fraudulent attacks 355 | delta = random.randint(1, 5) 356 | atm_withdrawal = normal_atm_withdrawals[fraudulent_atm_tr_indx] 357 | pre_fraudulent_atm_tr = atm_withdrawal[0] 358 | fraudulent_atm_tr = generate_atm_withdrawal(credit_card_number = 359 | pre_fraudulent_atm_tr['cc_num'], cash_amounts = cash_amounts, length=1, delta=delta, radius = None)[0] 360 | fraudulent_atm_location = faker.location_on_land() 361 | while fraudulent_atm_location[3] == 'US': 362 | fraudulent_atm_location = faker.location_on_land() 363 | fraudulent_atm_tr['datetime'] = (datetime.datetime.strptime(pre_fraudulent_atm_tr['datetime'], 364 | DATE_FORMAT) + datetime.timedelta(hours=delta)).strftime(DATE_FORMAT) 365 | fraudulent_atm_tr['latitude'] = fraudulent_atm_location[0] 366 | fraudulent_atm_tr['longitude'] = fraudulent_atm_location[1] 367 | fraudulent_atm_tr['city'] = fraudulent_atm_location[2] 368 | fraudulent_atm_tr['country'] = fraudulent_atm_location[3] 369 | fraudulent_atm_tr['fraud_label'] = 1 370 | atm_withdrawal.append(fraudulent_atm_tr) 371 | normal_atm_withdrawals[fraudulent_atm_tr_indx] = atm_withdrawal 372 | 373 | 374 | def transactions_as_dataframe(transactions: list, normal_atm_withdrawals: list) -> pd.DataFrame: 375 | """.""" 376 | for atm_withdrawal in normal_atm_withdrawals: 377 | for withdrawal in atm_withdrawal: 378 | transactions.append(withdrawal) 379 | return pd.DataFrame.from_records(transactions) 380 | 381 | 382 | def create_credit_cards_as_df(credit_cards: list) -> pd.DataFrame: 383 | """.""" 384 | df = pd.DataFrame.from_records(credit_cards) 385 | # Cast the columns to the correct Pandas DType 386 | df['cc_num']= pd.to_numeric(df['cc_num']) 387 | return df 388 | 389 | def create_profiles_as_df(credit_cards: list) -> pd.DataFrame: 390 | """.""" 391 | profiles_df = generate_df_with_profiles(credit_cards) 392 | return profiles_df 393 | 394 | def create_transactions_as_df(credit_cards: list) -> pd.DataFrame: 395 | """.""" 396 | timestamps = generate_timestamps(TOTAL_UNIQUE_TRANSACTIONS) 397 | amounts = generate_amounts() 398 | categories = generate_categories(amounts) 399 | cc_df = create_credit_cards_as_df(credit_cards) 400 | transactions = generate_transactions(cc_df['cc_num'], timestamps, categories) 401 | cash_amounts = generate_cash_amounts() 402 | chains = generate_chains() 403 | susceptible_cards = generate_susceptible_cards(credit_cards) 404 | normal_atm_withdrawals = generate_normal_atm_withdrawals(cash_amounts, susceptible_cards) 405 | update_transactions(transactions, chains) 406 | 407 | fraudulent_atm_tr_indxs = generate_fraudulent_atm_tr_indxs(normal_atm_withdrawals) 408 | update_normal_atm_withdrawals(fraudulent_atm_tr_indxs, normal_atm_withdrawals, cash_amounts) 409 | 410 | transactions_df = transactions_as_dataframe(transactions, normal_atm_withdrawals) 411 | 412 | # Cast the columns to the correct Pandas DType 413 | transactions_df['cc_num'] = pd.to_numeric(transactions_df['cc_num']) 414 | transactions_df['longitude'] = pd.to_numeric(transactions_df['longitude']) 415 | transactions_df['latitude'] = pd.to_numeric(transactions_df['latitude']) 416 | transactions_df['datetime']= pd.to_datetime(transactions_df['datetime']) 417 | 418 | return transactions_df 419 | 420 | -------------------------------------------------------------------------------- 
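The docstrings in `cc_features.py` suggest unit testing these transformations with small DataFrames. Before the course's own test module below, here is a minimal sketch of such a test; the toy values are assumptions, not repo fixtures.

```python
# Hypothetical pytest sketch for cc_features.card_owner_age.
import pandas as pd
from sml import cc_features

def test_card_owner_age_adds_age_column():
    trans = pd.DataFrame({"cc_num": [1111], "datetime": pd.to_datetime(["2022-01-01"])})
    profiles = pd.DataFrame({"cc_num": [1111], "birthdate": pd.to_datetime(["2000-01-01"])})
    out = cc_features.card_owner_age(trans, profiles)
    assert "age_at_transaction" in out.columns
    # 2000-01-01 -> 2022-01-01 is ~22 years.
    assert 21.9 < out.loc[0, "age_at_transaction"] < 22.1
```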
/src/02-module/test_sml/test_sml.py: -------------------------------------------------------------------------------- 1 | from sml import synthetic_data 2 | from unittest import TestCase 3 | import pytest 4 | from contextlib import nullcontext as does_not_raise 5 | 6 | @pytest.mark.parametrize( 7 | "credit_card_number, cash_amounts, length, delta, radius, country_code, excp", 8 | [("1111 2222 3333 4444",[112.10, 11.23], 1, 1, 10.0, 'US', does_not_raise()) 9 | ,("1111 2222 3333 44",[-12.00], -1, 1, 1.0, 'IE', pytest.raises(Exception))] 10 | ) 11 | def test_generate_atm_withdrawal(credit_card_number: str, cash_amounts: list, length: int, \ 12 | delta: int, radius: float, country_code, excp): 13 | with excp: 14 | synthetic_data.generate_atm_withdrawal(credit_card_number, cash_amounts, length, delta, radius, country_code) 15 | 16 | -------------------------------------------------------------------------------- /src/03-module/iris_with_sklearn_transformer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "d2kLrOh-bpGy" 7 | }, 8 | "source": [ 9 | "# Iris Flower Classification with Scikit-Learn and Hopsworks\n", 10 | "\n", 11 | "In this notebook we will, \n", 12 | "\n", 13 | "1. Import libraries and connect to Hopsworks Feature Store\n", 14 | "2. Load the iris Flower dataset\n", 15 | "3. Create a feature group and upload to the feature store\n", 16 | "4. Create a feature view from the feature group\n", 17 | "5. Create a training dataset\n", 18 | "6. Train a model using SkLearn\n", 19 | "7. Save the trained model to Hopsworks\n", 20 | "8. Launch a serving instance.\n", 21 | "9. Model deployment in Hopsworks\n", 22 | "10. Send a prediction request to the served model\n", 23 | "11. 
Try out your Model Interactively with a Gradio UI \n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "id": "9vVDAHU_bpG4" 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "!pip install -U hopsworks --quiet" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": { 41 | "id": "xRtpj-psbpG8" 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "from sklearn.neighbors import KNeighborsClassifier\n", 46 | "from sklearn.metrics import accuracy_score\n", 47 | "import joblib\n", 48 | "import numpy as np\n", 49 | "import time\n", 50 | "import json\n", 51 | "import random\n", 52 | "import hopsworks\n", 53 | "import pandas as pd\n", 54 | "from sklearn import preprocessing" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "id": "CVCqQYDhbpG_" 61 | }, 62 | "source": [ 63 | "## 💽 Loading the Data " 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "id": "nRmFM7vcbpHA" 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "iris_df = pd.read_csv(\"https://repo.hops.works/master/hopsworks-tutorials/data/iris.csv\")\n", 75 | "iris_df.head()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "id": "JR8HeEs6bpHB" 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "iris_df.info()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": { 92 | "id": "2H3XTfhMbpHB" 93 | }, 94 | "source": [ 95 | "## 🪄 Creating Feature Groups \n", 96 | "\n", 97 | "We can save two feature groups (hive tables), one called `iris_features` that contains the iris features and the corresponding numeric label, and another feature group called `iris_labels_lookup` for converting the numeric iris label back to categorical.\n", 98 | "\n", 99 | "**Note**: To be able to run the feature store code, you first have to enable the Feature Store Service in your project. To do this, go to the \"Settings\" tab in your project, select the feature store service and click \"Save\". " 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "project = hopsworks.login()\n", 109 | "fs = project.get_feature_store()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "id": "4By1zTHIbpHC" 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "iris_fg = fs.get_or_create_feature_group(name=\"iris\",\n", 121 | " version=1,\n", 122 | " primary_key=[\"sepal_length\",\"sepal_width\",\"petal_length\",\"petal_width\"],\n", 123 | " description=\"Iris flower dataset\")\n", 124 | "iris_fg.insert(iris_df)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "## ⚙️ Feature View Creation \n", 132 | "\n", 133 | "Feature views are used to read features for training and inference.\n", 134 | "If the feature view already exists, get it. If not, an exception is thrown, and we create the feature view." 
157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "## 🏋️ Training Dataset Creation\n", 162 | "\n", 163 | "In Hopsworks, a training dataset is defined by a query whose projection (set of features) comes from the parent feature view, optionally with a snapshot of the query result saved to disk.\n", 164 | "\n", 165 | "**A training dataset may contain splits such as:** \n", 166 | "* Training set - the subset of training data used to train the model.\n", 167 | "* Validation set - the subset of training data used to evaluate hyperparameters when training the model.\n", 168 | "* Test set - the holdout subset of training data used to evaluate the model.\n", 169 | "\n", 170 | "The training dataset is created with the `feature_view.create_train_validation_test_split()` method.\n", 171 | "\n", 172 | "* `X_train` is the train set features\n", 173 | "* `X_val` is the validation set features\n", 174 | "* `X_test` is the test set features\n", 175 | "* `y_train` is the train set labels\n", 176 | "* `y_val` is the validation set labels\n", 177 | "* `y_test` is the test set labels" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "td_version, td_job = feature_view.create_train_validation_test_split(\n", 187 | " description = 'iris tutorial',\n", 188 | " data_format = 'csv',\n", 189 | " validation_size = 0.2,\n", 190 | " test_size = 0.1,\n", 191 | " write_options = {'wait_for_job': True},\n", 192 | " coalesce = True,\n", 193 | ")" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "X_train, X_val, X_test, y_train, y_val, y_test = feature_view.get_train_validation_test_split(td_version)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "## 🧬 Modeling\n", 210 | "\n", 211 | "Train a KNN (k-nearest neighbors) model with Scikit-Learn. Use a label encoder to map the categorical labels to numbers." 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": { 218 | "id": "KJb2bj-_bpHD" 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "from sklearn import preprocessing\n", 223 | "\n", 224 | "le = preprocessing.LabelEncoder()\n", 225 | "y_train_encoded = le.fit_transform(y_train['variety'])\n", 226 | "y_test_encoded = le.transform(y_test['variety'])\n", 227 | "\n", 228 | "model = KNeighborsClassifier(n_neighbors=4)\n", 229 | "model.fit(X_train, y_train_encoded)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "### Evaluate model performance\n", 237 | "\n", 238 | "Compute the MSE of the model on the test set. Note that MSE on encoded class labels is a crude metric for a classifier; an accuracy sketch follows below." 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "id": "b8EC4_SvbpHE" 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "from sklearn.metrics import mean_squared_error\n", 250 | "\n", 251 | "y_pred = model.predict(X_test)\n", 252 | "\n", 253 | "mse = mean_squared_error(y_test_encoded, y_pred)\n", 254 | "\n", 255 | "metrics = {\n", 256 | " \"mse\" : mse\n", 257 | "}\n", 258 | "print(metrics)" 259 | ] 260 | },
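{ "cell_type": "markdown", "metadata": {}, "source": [ "Since this is a classification task, accuracy is a more natural headline metric than MSE. The cell below is a small sketch that reuses the `accuracy_score` import from the top of the notebook and records the score in the same `metrics` dict." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch: classification accuracy on the held-out test set.\n", "# accuracy_score is already imported at the top of this notebook.\n", "accuracy = accuracy_score(y_test_encoded, y_pred)\n", "metrics[\"accuracy\"] = accuracy\n", "print(metrics)" ] },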
261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "## 📝 Register model\n", 266 | "\n", 267 | "One of the features in Hopsworks is the model registry. This is where we can store different versions of models and compare their performance. Models from the registry can then be served as API endpoints.\n", 268 | "\n", 269 | "Save the following pickled objects as .pkl files locally to a directory that will be uploaded later to the model registry:\n", 270 | "\n", 271 | " * the model object, **model**, saved as knn_iris_model.pkl\n", 272 | " * the label encoder object, **le**, saved as knn_iris_encoder.pkl, so that we can reconstruct categorical names \n", 273 | " from the encoded predictions (numbers) \n", 274 | " \n", 275 | "The model input schema is the same set of features as in the *X_train* DataFrame.\n", 276 | "\n", 277 | "The model output schema is the same label as in the *y_train* DataFrame.\n", 278 | "\n", 279 | "Finally, lazily create the model that will be registered, including all files (artifacts) in the directory (the pickled label encoder object and the pickled model object), the model's input/output schema, and a sample input row (**input_example**). The model registry is the **mr** object, and for our Scikit-Learn model, we create a model of type Python with **mr.python.create_model()**. For TensorFlow, there is *mr.tensorflow.create_model()*." 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "import joblib\n", 289 | "import os\n", 290 | "\n", 291 | "# The 'iris_model' directory will be saved to the model registry\n", 292 | "model_dir = \"iris_model\"\n", 293 | "if not os.path.isdir(model_dir):\n", 294 | " os.mkdir(model_dir)\n", 295 | "\n", 296 | "joblib.dump(model, model_dir + '/knn_iris_model.pkl')\n", 297 | "joblib.dump(le, model_dir + '/knn_iris_encoder.pkl')" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "mr = project.get_model_registry()" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": {}, 312 | "source": [ 313 | "### ⚙️ Model Schema\n", 314 | "\n", 315 | "The model needs to be set up with a [Model Schema](https://docs.hopsworks.ai/machine-learning-api/latest/generated/model_schema/), which describes the inputs and outputs for a model.\n", 316 | "\n", 317 | "A Model Schema can be automatically generated from training examples, as shown below." 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": { 324 | "id": "ulH3bX02bpHE" 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "from hsml.schema import Schema\n", 329 | "from hsml.model_schema import ModelSchema\n", 330 | "\n", 331 | "input_schema = Schema(X_train)\n", 332 | "output_schema = Schema(y_train)\n", 333 | "model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)\n", 334 | "\n", 335 | "model_schema.to_dict()\n", 336 | "\n", 337 | "iris_model = mr.python.create_model(\n", 338 | " name=\"knn_iris_model\", \n", 339 | " metrics=metrics,\n", 340 | " model_schema=model_schema,\n", 341 | " input_example=X_train.sample(), \n", 342 | " description=\"Iris Flower Predictor\")\n", 343 | "\n", 344 | "iris_model.save(model_dir)" 345 | ] 346 | },
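{ "cell_type": "markdown", "metadata": {}, "source": [ "Once saved, the model can be fetched back from the registry by name and version, which is how a separate inference pipeline would typically consume it. A minimal sketch, assuming version 1 is the version just saved:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch: fetch the registered model and download its artifact files.\n", "retrieved_model = mr.get_model(\"knn_iris_model\", version=1)\n", "saved_model_dir = retrieved_model.download()\n", "knn_model = joblib.load(saved_model_dir + \"/knn_iris_model.pkl\")" ] },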
347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "### 📎 Predictor script for Python models\n", 352 | "\n", 353 | "\n", 354 | "Scikit-Learn models are deployed as Python models, in which case you need to provide a **Predict** class that implements the **predict** method. The **predict()** method invokes the model on the inputs and returns the prediction as a list.\n", 355 | "\n", 356 | "The **__init__()** method is run when the predictor is loaded into memory, loading the model from the local directory it is materialized to, *ARTIFACT_FILES_PATH*.\n", 357 | "\n", 358 | "The \"%%writefile\" magic writes the contents of the cell out to the given Python file. We will use the **predict_example.py** file to create a deployment for our Scikit-Learn K-NN model. " 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": { 365 | "id": "1k14k_uqbpHF" 366 | }, 367 | "outputs": [], 368 | "source": [ 369 | "%%writefile predict_example.py\n", 370 | "\n", 371 | "import joblib\n", 372 | "import os\n", 373 | "\n", 374 | "class Predict(object):\n", 375 | " \n", 376 | " def __init__(self):\n", 377 | " # NOTE: env var ARTIFACT_FILES_PATH has the local path to the model artifact files \n", 378 | " self.model = joblib.load(os.environ[\"ARTIFACT_FILES_PATH\"] + \"/knn_iris_model.pkl\")\n", 379 | "\n", 380 | "\n", 381 | " def predict(self, inputs):\n", 382 | " \"\"\" Serves a prediction request from a trained model\"\"\"\n", 383 | " return self.model.predict(inputs).tolist()" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "## 🚀 Model Deployment\n", 391 | "\n", 392 | "Because this is a Python (Scikit-Learn) model, we upload the predictor script and pass it to the deployment." 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": { 399 | "id": "zEEHKFzdbpHG" 400 | }, 401 | "outputs": [], 402 | "source": [ 403 | "dataset_api = project.get_dataset_api()\n", 404 | "\n", 405 | "uploaded_file_path = dataset_api.upload(\"predict_example.py\", \"Models\", overwrite=True)\n", 406 | "predictor_script_path = os.path.join(\"/Projects\", project.name, uploaded_file_path)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [ 415 | "ms = project.get_model_serving()\n", 416 | "try:\n", 417 | " deployment = ms.get_deployment(\"irisdeployed\")\n", 418 | "except Exception:\n", 419 | " deployment = iris_model.deploy(name=\"irisdeployed\",\n", 420 | " script_file=predictor_script_path, \n", 421 | " model_server=\"PYTHON\", \n", 422 | " serving_tool=\"KSERVE\")\n", 423 | "\n", 424 | "print(\"Deployment: 
\" + deployment.name)\n", 425 | "deployment.describe()" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "### The deployment has now been registered. However, to start it you need to run:" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "metadata": { 439 | "id": "7h4qsnUlbpHG" 440 | }, 441 | "outputs": [], 442 | "source": [ 443 | "state = deployment.get_state()\n", 444 | "\n", 445 | "if state.status != \"Running\":\n", 446 | " deployment.start()\n", 447 | " deployment.describe()\n", 448 | "else:\n", 449 | " print(\"Deployment already running\")" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": { 455 | "id": "x0iRFs0FbpHH" 456 | }, 457 | "source": [ 458 | "## 🔮 Predicting using deployment" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": { 465 | "id": "ICAE38pzbpHH" 466 | }, 467 | "outputs": [], 468 | "source": [ 469 | "test_data = list(iris_model.input_example)\n", 470 | "\n", 471 | "data = {\"instances\" : [test_data]}\n", 472 | "res = deployment.predict(data)\n", 473 | "print(test_data)\n", 474 | "#print(le.inverse_transform([res[\"predictions\"][0]]))\n", 475 | "print([res[\"predictions\"][0]])" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": { 481 | "id": "WSFCgRWcbpHH" 482 | }, 483 | "source": [ 484 | "## 👾 Try out your Model Interactively \n", 485 | "\n", 486 | "\n", 487 | "We will build a user interface with Gradio to allow you to enter the 4 feature values (sepal length/width and petal length/width), producing a prediction of the type of iris flower.\n", 488 | "\n", 489 | "First, we have to install the gradio library." 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": { 496 | "id": "fdMNftbQbpHI" 497 | }, 498 | "outputs": [], 499 | "source": [ 500 | "!pip install gradio --quiet" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "### Run Gradio\n", 508 | "\n", 509 | "Start the Gradio UI. Users enter the 4 feature values and a prediction is returned. We use the label encoder object to transform the number returned to the categorical value (stringified name of the Iris Flower)." 
552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": { 556 | "id": "6trMW766bpHJ" 557 | }, 558 | "outputs": [], 559 | "source": [] 560 | } 561 | ], 562 | "metadata": { 563 | "colab": { 564 | "collapsed_sections": [], 565 | "name": "August 2022 iris_sklearn.ipynb", 566 | "provenance": [] 567 | }, 568 | "kernelspec": { 569 | "display_name": "Python 3 (ipykernel)", 570 | "language": "python", 571 | "name": "python3" 572 | }, 573 | "language_info": { 574 | "codemirror_mode": { 575 | "name": "ipython", 576 | "version": 3 577 | }, 578 | "file_extension": ".py", 579 | "mimetype": "text/x-python", 580 | "name": "python", 581 | "nbconvert_exporter": "python", 582 | "pygments_lexer": "ipython3", 583 | "version": "3.9.7" 584 | } 585 | }, 586 | "nbformat": 4, 587 | "nbformat_minor": 1 588 | } 589 | -------------------------------------------------------------------------------- /src/03-module/scripts/run-fraud-batch-inference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | cd src/03-module 6 | 7 | jupyter nbconvert --to notebook --execute 4_batch_predictions.ipynb 8 | --------------------------------------------------------------------------------