├── LICENSE ├── notebooks ├── .gitkeep └── dummy.ipynb ├── reports ├── .gitkeep └── .gitignore ├── src ├── __init__.py ├── data │ ├── .gitkeep │ ├── __init__.py │ └── make_dataset.py ├── features │ ├── .gitkeep │ ├── __init__.py │ └── build_features.py ├── models │ ├── .gitkeep │ ├── __init__.py │ ├── push_model.py │ └── train_model.py └── visualization │ ├── .gitkeep │ ├── __init__.py │ └── visualize.py ├── data ├── external │ └── .gitkeep ├── interim │ └── .gitkeep ├── .gitignore └── raw.dvc ├── references └── .gitkeep ├── .dvc ├── .gitignore └── config ├── tox.ini ├── model.joblib ├── params.yaml ├── requirements.txt ├── .dvcignore ├── dvc_plots └── static │ └── workspace_._reports_figures_plots_images_importance.png ├── dev-requirements.txt ├── setup.py ├── docs ├── getting-started.rst ├── commands.rst ├── index.rst ├── make.bat ├── Makefile └── conf.py ├── Dockerfile ├── test_environment.py ├── .gitignore ├── dvc.lock ├── app.py ├── dvc.yaml ├── README.md ├── app_gunicorn.py ├── .github └── workflows │ └── ci.yml ├── app_streamlit.py └── Makefile /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /notebooks/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reports/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/external/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/interim/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /references/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/features/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/features/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/visualization/.gitkeep: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reports/.gitignore: -------------------------------------------------------------------------------- 1 | /figures 2 | -------------------------------------------------------------------------------- /src/features/build_features.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | /raw 2 | /processed 3 | -------------------------------------------------------------------------------- /.dvc/.gitignore: -------------------------------------------------------------------------------- 1 | /config.local 2 | /tmp 3 | /cache 4 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 79 3 | max-complexity = 10 4 | -------------------------------------------------------------------------------- /model.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PranY/creditcard/HEAD/model.joblib -------------------------------------------------------------------------------- /.dvc/config: -------------------------------------------------------------------------------- 1 | [core] 2 | remote = remote 3 | ['remote "remote"'] 4 | url = ../TEMP 5 | -------------------------------------------------------------------------------- /data/raw.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 63310a287c47722acf1521efd0e0d061.dir 3 | size: 150825054 4 | nfiles: 2 5 | hash: md5 6 | path: raw 7 | -------------------------------------------------------------------------------- /params.yaml: -------------------------------------------------------------------------------- 1 | make_dataset: 2 | test_split: 0.2 3 | seed: 2023 4 | train_model: 5 | seed: 21 6 | n_estimators: 50 7 | max_depth: 8 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # external requirements 2 | fastapi 3 | joblib 4 | uvicorn 5 | scikit-learn # This is needed by joblib to read and load the model 6 | -------------------------------------------------------------------------------- /.dvcignore: -------------------------------------------------------------------------------- 1 | # Add patterns of files dvc should ignore, which could improve 2 | # the performance. 
Learn more at 3 | # https://dvc.org/doc/user-guide/dvcignore 4 | -------------------------------------------------------------------------------- /dvc_plots/static/workspace_._reports_figures_plots_images_importance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PranY/creditcard/HEAD/dvc_plots/static/workspace_._reports_figures_plots_images_importance.png -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | # local package 2 | # -e . 3 | 4 | # external requirements 5 | fastapi 6 | scikit-learn 7 | pandas 8 | numpy 9 | joblib 10 | matplotlib 11 | uvicorn 12 | sagemaker 13 | ipykernel 14 | boto3 15 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name='src', 5 | packages=find_packages(), 6 | version='0.1.0', 7 | description='A credit card fraud detection project', 8 | author='Pranjal', 9 | license='', 10 | ) 11 | -------------------------------------------------------------------------------- /docs/getting-started.rst: -------------------------------------------------------------------------------- 1 | Getting started 2 | =============== 3 | 4 | This is where you describe how to get set up on a clean install, including the 5 | commands necessary to get the raw data (using the `sync_data_from_s3` command, 6 | for example), and then how to make the cleaned, final data sets. 7 | -------------------------------------------------------------------------------- /docs/commands.rst: -------------------------------------------------------------------------------- 1 | Commands 2 | ======== 3 | 4 | The Makefile contains the central entry points for common tasks related to this project. 5 | 6 | Syncing data to S3 7 | ^^^^^^^^^^^^^^^^^^ 8 | 9 | * `make sync_data_to_s3` will use `aws s3 sync` to recursively sync files in `data/` up to `s3://[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')/data/`. 10 | * `make sync_data_from_s3` will use `aws s3 sync` to recursively sync files from `s3://[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')/data/` to `data/`. 11 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. creditcard documentation master file, created by 2 | sphinx-quickstart. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | creditcard documentation! 7 | ============================================== 8 | 9 | Contents: 10 | 11 | .. 
toctree:: 12 | :maxdepth: 2 13 | 14 | getting-started 15 | commands 16 | 17 | 18 | 19 | Indices and tables 20 | ================== 21 | 22 | * :ref:`genindex` 23 | * :ref:`modindex` 24 | * :ref:`search` 25 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.8-slim 3 | 4 | # Set the working directory to /app 5 | WORKDIR /app 6 | 7 | # Copy the required files and directory into the container at /app 8 | COPY app.py /app/app.py 9 | COPY model.joblib /app/model.joblib 10 | COPY requirements.txt /app/requirements.txt 11 | 12 | # Install any needed packages specified in requirements.txt 13 | RUN pip install -r requirements.txt 14 | 15 | # Copy files from S3 inside docker 16 | # RUN mkdir /app/models 17 | # RUN aws s3 cp s3://creditcard-project/models/model.joblib /app/models/model.joblib 18 | 19 | 20 | # Run app.py when the container launches 21 | CMD ["python", "app.py"] 22 | -------------------------------------------------------------------------------- /test_environment.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | REQUIRED_PYTHON = "python3" 4 | 5 | 6 | def main(): 7 | system_major = sys.version_info.major 8 | if REQUIRED_PYTHON == "python": 9 | required_major = 2 10 | elif REQUIRED_PYTHON == "python3": 11 | required_major = 3 12 | else: 13 | raise ValueError("Unrecognized python interpreter: {}".format( 14 | REQUIRED_PYTHON)) 15 | 16 | if system_major != required_major: 17 | raise TypeError( 18 | "This project requires Python {}. Found: Python {}".format( 19 | required_major, sys.version)) 20 | else: 21 | print(">>> Development environment passes all tests!") 22 | 23 | 24 | if __name__ == '__main__': 25 | main() 26 | -------------------------------------------------------------------------------- /src/models/push_model.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from botocore.exceptions import NoCredentialsError 3 | 4 | def upload_to_s3(local_file_path, bucket_name, s3_file_path): 5 | # Create an S3 client 6 | s3 = boto3.client('s3') 7 | 8 | try: 9 | # Upload the file 10 | s3.upload_file(local_file_path, bucket_name, s3_file_path) 11 | print(f"File uploaded successfully to {bucket_name}/{s3_file_path}") 12 | except FileNotFoundError: 13 | print(f"The file {local_file_path} was not found.") 14 | except NoCredentialsError: 15 | print("Credentials not available.") 16 | 17 | # Example usage 18 | local_model_path = 'models/model.joblib' 19 | s3_bucket_name = 'creditcard-project' 20 | s3_file_path = 'models/model.joblib' 21 | 22 | upload_to_s3(local_model_path, s3_bucket_name, s3_file_path) -------------------------------------------------------------------------------- /src/data/make_dataset.py: -------------------------------------------------------------------------------- 1 | # make_dataset.py 2 | import pathlib 3 | import yaml 4 | import sys 5 | import pandas as pd 6 | from sklearn.model_selection import train_test_split 7 | 8 | def load_data(data_path): 9 | # Load your dataset from a given path 10 | df = pd.read_csv(data_path) 11 | return df 12 | 13 | def split_data(df, test_split, seed): 14 | # Split the dataset into train and test sets 15 | train, test = train_test_split(df, test_size=test_split, random_state=seed) 16 | return train, test 17 | 18 | def save_data(train, 
test, output_path): 19 | # Save the split datasets to the specified output path 20 | pathlib.Path(output_path).mkdir(parents=True, exist_ok=True) 21 | train.to_csv(output_path + '/train.csv', index=False) 22 | test.to_csv(output_path + '/test.csv', index=False) 23 | 24 | def main(): 25 | 26 | curr_dir = pathlib.Path(__file__) 27 | home_dir = curr_dir.parent.parent.parent 28 | params_file = home_dir.as_posix() + '/params.yaml' 29 | params = yaml.safe_load(open(params_file))["make_dataset"] 30 | 31 | input_file = sys.argv[1] 32 | data_path = home_dir.as_posix() + input_file 33 | output_path = home_dir.as_posix() + '/data/processed' 34 | 35 | data = load_data(data_path) 36 | train_data, test_data = split_data(data, params['test_split'], params['seed']) 37 | save_data(train_data, test_data, output_path) 38 | 39 | if __name__ == "__main__": 40 | main() -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *.cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # DotEnv configuration 60 | .env 61 | 62 | # Database 63 | *.db 64 | *.rdb 65 | 66 | # Pycharm 67 | .idea 68 | 69 | # VS Code 70 | .vscode/ 71 | 72 | # Spyder 73 | .spyproject/ 74 | 75 | # Jupyter NB Checkpoints 76 | .ipynb_checkpoints/ 77 | 78 | # exclude data from source control by default 79 | /TEMP 80 | # Mac OS-specific storage files 81 | .DS_Store 82 | 83 | # vim 84 | *.swp 85 | *.swo 86 | 87 | # Mypy cache 88 | .mypy_cache/ 89 | /models 90 | /dvclive 91 | 92 | # AWS files 93 | .aws/* -------------------------------------------------------------------------------- /src/models/train_model.py: -------------------------------------------------------------------------------- 1 | # train_model.py 2 | import pathlib 3 | import sys 4 | import yaml 5 | import joblib 6 | 7 | import pandas as pd 8 | from sklearn.ensemble import RandomForestClassifier 9 | 10 | 11 | def train_model(train_features, target, n_estimators, max_depth, seed): 12 | # Train your machine learning model 13 | model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=seed) 14 | model.fit(train_features, target) 15 | return model 16 | 17 | def save_model(model, output_path): 18 | # Save the trained model to the specified output path 19 | joblib.dump(model, output_path + '/model.joblib') 20 | 21 | def main(): 22 | 23 | curr_dir = pathlib.Path(__file__) 24 | home_dir = curr_dir.parent.parent.parent 25 | params_file = home_dir.as_posix() + '/params.yaml' 26 | params = yaml.safe_load(open(params_file))["train_model"] 27 | 
28 | input_file = sys.argv[1] 29 | data_path = home_dir.as_posix() + input_file 30 | output_path = home_dir.as_posix() + '/models' 31 | pathlib.Path(output_path).mkdir(parents=True, exist_ok=True) 32 | 33 | TARGET = 'Class' 34 | train_features = pd.read_csv(data_path + '/train.csv') 35 | X = train_features.drop(TARGET, axis=1) 36 | y = train_features[TARGET] 37 | 38 | trained_model = train_model(X, y, params['n_estimators'], params['max_depth'], params['seed']) 39 | save_model(trained_model, output_path) 40 | 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /notebooks/dummy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "ename": "", 10 | "evalue": "", 11 | "output_type": "error", 12 | "traceback": [ 13 | "\u001b[1;31mRunning cells with 'test' requires the ipykernel package.\n", 14 | "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n", 15 | "\u001b[1;31mCommand: 'conda install -n test ipykernel --update-deps --force-reinstall'" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "import pandas as pd" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "ename": "", 30 | "evalue": "", 31 | "output_type": "error", 32 | "traceback": [ 33 | "\u001b[1;31mRunning cells with 'test' requires the ipykernel package.\n", 34 | "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n", 35 | "\u001b[1;31mCommand: 'conda install -n test ipykernel --update-deps --force-reinstall'" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "df = pd.read_csv('../data/raw/creditcard.csv')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [] 49 | } 50 | ], 51 | "metadata": { 52 | "kernelspec": { 53 | "display_name": "test", 54 | "language": "python", 55 | "name": "python3" 56 | }, 57 | "language_info": { 58 | "name": "python", 59 | "version": "3.11.6" 60 | } 61 | }, 62 | "nbformat": 4, 63 | "nbformat_minor": 2 64 | } 65 | -------------------------------------------------------------------------------- /dvc.lock: -------------------------------------------------------------------------------- 1 | schema: '2.0' 2 | stages: 3 | make_dataset: 4 | cmd: python .\src\data\make_dataset.py .\data\raw\creditcard.csv 5 | deps: 6 | - path: .\data\raw\creditcard.csv 7 | hash: md5 8 | md5: e8ffcd27762aaf279e268eb849a060f9 9 | size: 150825054 10 | - path: .\src\data\make_dataset.py 11 | hash: md5 12 | md5: 5d87ea0853101b748b17af0be510c2ac 13 | size: 1314 14 | params: 15 | params.yaml: 16 | make_dataset.seed: 2023 17 | make_dataset.test_split: 0.2 18 | outs: 19 | - path: .\data\processed\ 20 | hash: md5 21 | md5: b21101ae3f5090f5af39e6f9cbdb21c3.dir 22 | size: 151098829 23 | nfiles: 2 24 | train_model: 25 | cmd: python .\src\models\train_model.py .\data\processed\ 26 | deps: 27 | - path: .\data\processed\ 28 | hash: md5 29 | md5: b21101ae3f5090f5af39e6f9cbdb21c3.dir 30 | size: 151098829 31 | nfiles: 2 32 | - path: .\src\models\train_model.py 33 | hash: md5 34 | md5: 5af92edffd22804f29af18c1efe94bbb 35 | size: 1362 36 | outs: 37 | - path: .\models\ 38 | hash: md5 39 | md5: 6191af90def2b514e1128c7d91ae5877.dir 40 | size: 397817 41 | nfiles: 1 42 | visualize: 
43 | cmd: python .\src\visualization\visualize.py .\models\model.joblib .\data\processed\ 44 | deps: 45 | - path: .\data\processed\ 46 | hash: md5 47 | md5: 45207c27b5aec2846e4532f5bee1631f.dir 48 | size: 151098829 49 | nfiles: 2 50 | - path: .\models\model.joblib 51 | hash: md5 52 | md5: e01830bd03da0f4f328c34580d40d5b3 53 | size: 3865 54 | - path: .\src\visualization\visualize.py 55 | hash: md5 56 | md5: db86d2e43c664f095353e9ba6afe3393 57 | size: 3687 58 | outs: 59 | - path: dvclive 60 | hash: md5 61 | md5: e887b02afa678d483c083f4d3632dc94.dir 62 | size: 18530385 63 | nfiles: 8 64 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | # main.py 2 | from fastapi import FastAPI 3 | from joblib import load 4 | from pydantic import BaseModel 5 | 6 | app = FastAPI() 7 | 8 | class PredictionInput(BaseModel): 9 | # Define the input parameters required for making predictions 10 | Time: float 11 | V1: float 12 | V2: float 13 | V3: float 14 | V4: float 15 | V5: float 16 | V6: float 17 | V7: float 18 | V8: float 19 | V9: float 20 | V10: float 21 | V11: float 22 | V12: float 23 | V13: float 24 | V14: float 25 | V15: float 26 | V16: float 27 | V17: float 28 | V18: float 29 | V19: float 30 | V20: float 31 | V21: float 32 | V22: float 33 | V23: float 34 | V24: float 35 | V25: float 36 | V26: float 37 | V27: float 38 | V28: float 39 | Amount: float 40 | 41 | 42 | # Load the pre-trained RandomForest model 43 | model_path = "model.joblib" 44 | model = load(model_path) 45 | 46 | @app.get("/") 47 | def home(): 48 | return "Working fine" 49 | 50 | @app.post("/predict") 51 | def predict(input_data: PredictionInput): 52 | # Extract features from input_data and make predictions using the loaded model 53 | features = [input_data.Time, 54 | input_data.V1, 55 | input_data.V2, 56 | input_data.V3, 57 | input_data.V4, 58 | input_data.V5, 59 | input_data.V6, 60 | input_data.V7, 61 | input_data.V8, 62 | input_data.V9, 63 | input_data.V10, 64 | input_data.V11, 65 | input_data.V12, 66 | input_data.V13, 67 | input_data.V14, 68 | input_data.V15, 69 | input_data.V16, 70 | input_data.V17, 71 | input_data.V18, 72 | input_data.V19, 73 | input_data.V20, 74 | input_data.V21, 75 | input_data.V22, 76 | input_data.V23, 77 | input_data.V24, 78 | input_data.V25, 79 | input_data.V26, 80 | input_data.V27, 81 | input_data.V28, 82 | input_data.Amount 83 | ] 84 | prediction = model.predict([features])[0].item() 85 | # Return the prediction 86 | return {"prediction": prediction} 87 | 88 | if __name__ == "__main__": 89 | import uvicorn 90 | uvicorn.run(app, host="0.0.0.0", port=8080) 91 | -------------------------------------------------------------------------------- /dvc.yaml: -------------------------------------------------------------------------------- 1 | stages: 2 | make_dataset: 3 | cmd: python .\src\data\make_dataset.py .\data\raw\creditcard.csv 4 | deps: 5 | - .\data\raw\creditcard.csv 6 | - .\src\data\make_dataset.py 7 | params: 8 | - make_dataset.test_split 9 | - make_dataset.seed 10 | outs: 11 | - .\data\processed\ 12 | train_model: 13 | cmd: python .\src\models\train_model.py .\data\processed\ 14 | deps: 15 | - .\data\processed\ 16 | - .\src\models\train_model.py 17 | outs: 18 | - .\models\ 19 | visualize: 20 | cmd: python .\src\visualization\visualize.py .\models\model.joblib .\data\processed\ 21 | deps: 22 | - .\data\processed\ 23 | - .\models\model.joblib 24 | - .\src\visualization\visualize.py 25 | 
outs: 26 | - dvclive 27 | 28 | metrics: 29 | - dvclive\metrics.json 30 | 31 | plots: 32 | # - dvclive\plots\images\importance.png 33 | # - dvclive\plots\sklearn\roc\train.json: 34 | # template: simple 35 | # x: fpr 36 | # y: tpr 37 | # title: Receiver operating characteristic (ROC) 38 | # x_label: False Positive Rate 39 | # y_label: True Positive Rate 40 | # - dvclive\plots\sklearn\prc\train.json: 41 | # template: simple 42 | # x: recall 43 | # y: precision 44 | # title: Precision-Recall Curve 45 | # x_label: Recall 46 | # y_label: Precision 47 | # - dvclive\plots\sklearn\cm\train.json: 48 | # template: confusion 49 | # x: actual 50 | # y: predicted 51 | # title: Confusion Matrix 52 | # x_label: True Label 53 | # y_label: Predicted Label 54 | # - dvclive\plots\sklearn\roc\test.json: 55 | # template: simple 56 | # x: fpr 57 | # y: tpr 58 | # title: Receiver operating characteristic (ROC) 59 | # x_label: False Positive Rate 60 | # y_label: True Positive Rate 61 | # - dvclive\plots\sklearn\prc\test.json: 62 | # template: simple 63 | # x: recall 64 | # y: precision 65 | # title: Precision-Recall Curve 66 | # x_label: Recall 67 | # y_label: Precision 68 | # - dvclive\plots\sklearn\cm\test.json: 69 | # template: confusion 70 | # x: actual 71 | # y: predicted 72 | # title: Confusion Matrix 73 | # x_label: True Label 74 | # y_label: Predicted Label 75 | - ROC: 76 | template: simple 77 | x: fpr 78 | y: 79 | dvclive\plots\sklearn\roc\train.json: tpr 80 | dvclive\plots\sklearn\roc\test.json: tpr 81 | - Confusion-Matrix: 82 | template: confusion 83 | x: actual 84 | y: 85 | dvclive\plots\sklearn\cm\train.json: predicted 86 | dvclive\plots\sklearn\cm\test.json: predicted 87 | - Precision-Recall: 88 | template: simple 89 | x: recall 90 | y: 91 | dvclive\plots\sklearn\prc\train.json: precision 92 | dvclive\plots\sklearn\prc\test.json: precision 93 | 94 | 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | array(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 2 | 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 3 | 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 4 | 'V28', 'Amount'], dtype=object) 5 | 6 | creditcard 7 | ============================== 8 | 9 | A credit card fraud detection project 10 | 11 | Project Organization 12 | ------------ 13 | 14 | ├── LICENSE 15 | ├── Makefile <- Makefile with commands like `make data` or `make train` 16 | ├── README.md <- The top-level README for developers using this project. 17 | ├── data 18 | │ ├── external <- Data from third party sources. 19 | │ ├── interim <- Intermediate data that has been transformed. 20 | │ ├── processed <- The final, canonical data sets for modeling. 21 | │ └── raw <- The original, immutable data dump. 22 | │ 23 | ├── docs <- A default Sphinx project; see sphinx-doc.org for details 24 | │ 25 | ├── models <- Trained and serialized models, model predictions, or model summaries 26 | │ 27 | ├── notebooks <- Jupyter notebooks. Naming convention is a number (for ordering), 28 | │ the creator's initials, and a short `-` delimited description, e.g. 29 | │ `1.0-jqp-initial-data-exploration`. 30 | │ 31 | ├── references <- Data dictionaries, manuals, and all other explanatory materials. 32 | │ 33 | ├── reports <- Generated analysis as HTML, PDF, LaTeX, etc. 
34 | │ └── figures <- Generated graphics and figures to be used in reporting 35 | │ 36 | ├── requirements.txt <- The requirements file for reproducing the analysis environment, e.g. 37 | │ generated with `pip freeze > requirements.txt` 38 | │ 39 | ├── setup.py <- makes project pip installable (pip install -e .) so src can be imported 40 | ├── src <- Source code for use in this project. 41 | │ ├── __init__.py <- Makes src a Python module 42 | │ │ 43 | │ ├── data <- Scripts to download or generate data 44 | │ │ └── make_dataset.py 45 | │ │ 46 | │ ├── features <- Scripts to turn raw data into features for modeling 47 | │ │ └── build_features.py 48 | │ │ 49 | │ ├── models <- Scripts to train models and then use trained models to make 50 | │ │ │ predictions 51 | │ │ ├── predict_model.py 52 | │ │ └── train_model.py 53 | │ │ 54 | │ └── visualization <- Scripts to create exploratory and results oriented visualizations 55 | │ └── visualize.py 56 | │ 57 | └── tox.ini <- tox file with settings for running tox; see tox.readthedocs.io 58 | 59 | 60 | -------- 61 | 62 |
Project based on the [cookiecutter data science project template](https://drivendata.github.io/cookiecutter-data-science/). #cookiecutterdatascience
63 | -------------------------------------------------------------------------------- /app_gunicorn.py: -------------------------------------------------------------------------------- 1 | # main.py 2 | from fastapi import FastAPI 3 | from joblib import load 4 | from pydantic import BaseModel 5 | 6 | app = FastAPI() 7 | 8 | class PredictionInput(BaseModel): 9 | # Define the input parameters required for making predictions 10 | Time: float 11 | V1: float 12 | V2: float 13 | V3: float 14 | V4: float 15 | V5: float 16 | V6: float 17 | V7: float 18 | V8: float 19 | V9: float 20 | V10: float 21 | V11: float 22 | V12: float 23 | V13: float 24 | V14: float 25 | V15: float 26 | V16: float 27 | V17: float 28 | V18: float 29 | V19: float 30 | V20: float 31 | V21: float 32 | V22: float 33 | V23: float 34 | V24: float 35 | V25: float 36 | V26: float 37 | V27: float 38 | V28: float 39 | Amount: float 40 | 41 | 42 | # Load the pre-trained RandomForest model 43 | model_path = "models/model.joblib" 44 | model = load(model_path) 45 | 46 | @app.get("/") 47 | def home(): 48 | return "Working fine" 49 | 50 | @app.post("/predict") 51 | def predict(input_data: PredictionInput): 52 | # Extract features from input_data and make predictions using the loaded model 53 | features = [input_data.Time, 54 | input_data.V1, 55 | input_data.V2, 56 | input_data.V3, 57 | input_data.V4, 58 | input_data.V5, 59 | input_data.V6, 60 | input_data.V7, 61 | input_data.V8, 62 | input_data.V9, 63 | input_data.V10, 64 | input_data.V11, 65 | input_data.V12, 66 | input_data.V13, 67 | input_data.V14, 68 | input_data.V15, 69 | input_data.V16, 70 | input_data.V17, 71 | input_data.V18, 72 | input_data.V19, 73 | input_data.V20, 74 | input_data.V21, 75 | input_data.V22, 76 | input_data.V23, 77 | input_data.V24, 78 | input_data.V25, 79 | input_data.V26, 80 | input_data.V27, 81 | input_data.V28, 82 | input_data.Amount 83 | ] 84 | prediction = model.predict([features])[0].item() 85 | # Return the prediction 86 | return {"prediction": prediction} 87 | 88 | if __name__ == "__main__": 89 | import uvicorn 90 | uvicorn.run(app, host="127.0.0.1", port=8000) 91 | 92 | # CMD: gunicorn -w 4 -k uvicorn.workers.UvicornWorker app_gunicorn:app 93 | 94 | # Uvicorn is a lightweight ASGI (Asynchronous Server Gateway Interface) server that specifically serves ASGI applications, such as those built with FastAPI. 95 | # It is responsible for handling the asynchronous aspects of the application, making it efficient for high-concurrency scenarios. 96 | 97 | # Gunicorn is a WSGI (Web Server Gateway Interface) server. While it is not designed for handling asynchronous tasks directly, it can be used to serve synchronous WSGI applications, including FastAPI applications. 98 | # Gunicorn is a pre-fork worker model server, meaning it spawns multiple worker processes to handle incoming requests concurrently. Each worker runs in a separate process and can handle one request at a time. 
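# --- Illustrative client example (a sketch, not part of the served app) ---
# A minimal example of how a client might call the /predict endpoint once the
# server is running with uvicorn or gunicorn as shown above. The host, port,
# and the all-zero placeholder feature values are assumptions; adjust them to
# match your deployment and to real transaction data.
#
# import requests
#
# feature_names = ["Time"] + [f"V{i}" for i in range(1, 29)] + ["Amount"]
# sample = {name: 0.0 for name in feature_names}  # the 30 fields defined in PredictionInput
# response = requests.post("http://127.0.0.1:8000/predict", json=sample)
# print(response.json())  # e.g. {"prediction": 0}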
-------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI Pipeline 2 | 3 | on: 4 | push: 5 | 6 | permissions: 7 | id-token: write 8 | contents: read 9 | 10 | jobs: 11 | integration: 12 | name: Continuous Integration 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Checkout Code 16 | uses: actions/checkout@v3 17 | 18 | - name: Lint code 19 | run: echo "Linting repository" 20 | 21 | - name: Run unit tests 22 | run: echo "Running unit tests" 23 | 24 | build-and-push-ecr-image: 25 | name: Push to ECR 26 | needs: integration 27 | runs-on: ubuntu-latest 28 | steps: 29 | - name: Checkout Code 30 | uses: actions/checkout@v3 31 | 32 | - name: Install Utilities 33 | run: | 34 | sudo apt-get update 35 | sudo apt-get install -y jq unzip 36 | - name: Configure AWS credentials 37 | uses: aws-actions/configure-aws-credentials@v4 38 | with: 39 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 40 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 41 | aws-region: ${{ secrets.AWS_REGION }} 42 | 43 | - name: Login to Amazon ECR 44 | id: login-ecr 45 | uses: aws-actions/amazon-ecr-login@v2 46 | 47 | - name: Build, tag, and push image to Amazon ECR 48 | id: build-image 49 | env: 50 | ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} 51 | ECR_REPOSITORY: ${{ secrets.ECR_REPOSITORY_NAME }} 52 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 53 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 54 | AWS_REGION: ${{ secrets.AWS_REGION }} 55 | IMAGE_TAG: latest 56 | run: | 57 | # Build a docker container and 58 | # push it to ECR so that it can 59 | # be deployed to ECS. 60 | docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG . 
61 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG 62 | echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" 63 | 64 | Continuous-Deployment: 65 | runs-on: self-hosted 66 | needs: build-and-push-ecr-image 67 | steps: 68 | - name: Configure AWS credentials 69 | uses: aws-actions/configure-aws-credentials@v4 70 | with: 71 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 72 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 73 | aws-region: ${{ secrets.AWS_REGION }} 74 | 75 | - name: Login to Amazon ECR 76 | id: login-ecr 77 | uses: aws-actions/amazon-ecr-login@v2 78 | 79 | - name: Pull latest images 80 | env: 81 | ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} 82 | ECR_REPOSITORY: ${{ secrets.ECR_REPOSITORY_NAME }} 83 | IMAGE_TAG: latest 84 | run: | 85 | docker pull $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG 86 | 87 | - name: Delete Previous Container 88 | run: | 89 | docker rm -f creditcard || true 90 | 91 | - name: Run Docker Image to serve users 92 | run: | 93 | docker run -d -p 8080:8080 --ipc="host" --name=creditcard -e 'AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}' -e 'AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}' -e 'AWS_REGION=${{ secrets.AWS_REGION }}' ${{secrets.AWS_ECR_LOGIN_URI}}/${{ secrets.ECR_REPOSITORY_NAME }}:latest 94 | -------------------------------------------------------------------------------- /app_streamlit.py: -------------------------------------------------------------------------------- 1 | # app_streamlit.py 2 | import streamlit as st 3 | from joblib import load 4 | 5 | # Load the pre-trained RandomForest model 6 | model_path = "models/model.joblib" 7 | model = load(model_path) 8 | 9 | def predict(features): 10 | # Make predictions using the loaded model 11 | prediction = model.predict_proba([features]) 12 | return prediction 13 | 14 | def main(): 15 | st.title("Machine Learning Model Prediction") 16 | 17 | # User input form 18 | Time = st.slider("Time", min_value=0.0, max_value=10.0, value=5.0) 19 | V1 = st.slider("V1", min_value=-10.0, max_value=10.0, value=5.0) 20 | V2 = st.slider("V2", min_value=-50.0, max_value=10.0, value=5.0) 21 | V3 = st.slider("V3", min_value=-20.0, max_value=10.0, value=5.0) 22 | V4 = st.slider("V4", min_value=0.0, max_value=10.0, value=5.0) 23 | V5 = st.slider("V5", min_value=0.0, max_value=10.0, value=5.0) 24 | V6 = st.slider("V6", min_value=0.0, max_value=10.0, value=5.0) 25 | V7 = st.slider("V7", min_value=0.0, max_value=10.0, value=5.0) 26 | V8 = st.slider("V8", min_value=0.0, max_value=10.0, value=5.0) 27 | V9 = st.slider("V9", min_value=0.0, max_value=10.0, value=5.0) 28 | V10 = st.slider("V10", min_value=0.0, max_value=10.0, value=5.0) 29 | V11 = st.slider("V11", min_value=0.0, max_value=10.0, value=5.0) 30 | V12 = st.slider("V12", min_value=0.0, max_value=10.0, value=5.0) 31 | V13 = st.slider("V13", min_value=0.0, max_value=10.0, value=5.0) 32 | V14 = st.slider("V14", min_value=0.0, max_value=10.0, value=5.0) 33 | V15 = st.slider("V15", min_value=0.0, max_value=10.0, value=5.0) 34 | V16 = st.slider("V16", min_value=0.0, max_value=10.0, value=5.0) 35 | V17 = st.slider("V17", min_value=0.0, max_value=10.0, value=5.0) 36 | V18 = st.slider("V18", min_value=0.0, max_value=10.0, value=5.0) 37 | V19 = st.slider("V19", min_value=0.0, max_value=10.0, value=5.0) 38 | V20 = st.slider("V20", min_value=0.0, max_value=10.0, value=5.0) 39 | V21 = st.slider("V21", min_value=0.0, max_value=10.0, value=5.0) 40 | V22 = st.slider("V22", min_value=0.0, max_value=10.0, value=5.0) 
41 | V23 = st.slider("V23", min_value=0.0, max_value=10.0, value=5.0) 42 | V24 = st.slider("V24", min_value=0.0, max_value=10.0, value=5.0) 43 | V25 = st.slider("V25", min_value=0.0, max_value=10.0, value=5.0) 44 | V26 = st.slider("V26", min_value=0.0, max_value=10.0, value=5.0) 45 | V27 = st.slider("V27", min_value=0.0, max_value=10.0, value=5.0) 46 | V28 = st.slider("V28", min_value=0.0, max_value=10.0, value=5.0) 47 | Amount = st.slider("Amount", min_value=0.0, max_value=1000.0, value=5.0) 48 | 49 | if st.button("Predict"): 50 | features = [Time, 51 | V1, 52 | V2, 53 | V3, 54 | V4, 55 | V5, 56 | V6, 57 | V7, 58 | V8, 59 | V9, 60 | V10, 61 | V11, 62 | V12, 63 | V13, 64 | V14, 65 | V15, 66 | V16, 67 | V17, 68 | V18, 69 | V19, 70 | V20, 71 | V21, 72 | V22, 73 | V23, 74 | V24, 75 | V25, 76 | V26, 77 | V27, 78 | V28, 79 | Amount 80 | ] 81 | result = predict(features) 82 | st.success(f"The prediction is: {result}") 83 | 84 | if __name__ == "__main__": 85 | main() 86 | 87 | # streamlit run app_streamlit.py 88 | -------------------------------------------------------------------------------- /src/visualization/visualize.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import joblib 3 | import sys 4 | import yaml 5 | import pandas as pd 6 | from sklearn import metrics 7 | from sklearn import tree 8 | from dvclive import Live 9 | from matplotlib import pyplot as plt 10 | 11 | 12 | def evaluate(model, X, y, split, live, save_path): 13 | """ 14 | Dump all evaluation metrics and plots for given datasets. 15 | 16 | Args: 17 | model (sklearn.ensemble.RandomForestClassifier): Trained classifier. 18 | X (pandas.DataFrame): Input DF. 19 | y (pamdas.Series): Target column. 20 | split (str): Dataset name. 21 | live (dvclive.Live): Dvclive instance. 22 | save_path (str): Path to save the metrics. 23 | """ 24 | 25 | predictions_by_class = model.predict_proba(X) 26 | predictions = predictions_by_class[:, 1] 27 | 28 | # Use dvclive to log a few simple metrics... 29 | avg_prec = metrics.average_precision_score(y, predictions) 30 | roc_auc = metrics.roc_auc_score(y, predictions) 31 | if not live.summary: 32 | live.summary = {"avg_prec": {}, "roc_auc": {}} 33 | live.summary["avg_prec"][split] = avg_prec 34 | live.summary["roc_auc"][split] = roc_auc 35 | # ... and plots... 36 | # ... like an roc plot... 37 | live.log_sklearn_plot("roc", y, predictions, name=f"roc/{split}") 38 | # ... and precision recall plot... 39 | # ... which passes `drop_intermediate=True` to the sklearn method... 40 | live.log_sklearn_plot( 41 | "precision_recall", 42 | y, 43 | predictions, 44 | name=f"prc/{split}", 45 | drop_intermediate=True, 46 | ) 47 | # ... and confusion matrix plot 48 | live.log_sklearn_plot( 49 | "confusion_matrix", 50 | y, 51 | predictions_by_class.argmax(-1), 52 | name=f"cm/{split}", 53 | ) 54 | 55 | 56 | def save_importance_plot(live, model, feature_names): 57 | """ 58 | Save feature importance plot. 59 | 60 | Args: 61 | live (dvclive.Live): DVCLive instance. 62 | model (sklearn.ensemble.RandomForestClassifier): Trained classifier. 63 | feature_names (list): List of feature names. 
64 | """ 65 | fig, axes = plt.subplots(dpi=100) 66 | fig.subplots_adjust(bottom=0.2, top=0.95) 67 | axes.set_ylabel("Mean decrease in impurity") 68 | 69 | importances = model.feature_importances_ 70 | forest_importances = pd.Series(importances, index=feature_names).nlargest(n=10) 71 | forest_importances.plot.bar(ax=axes) 72 | 73 | live.log_image("importance.png", fig) 74 | 75 | 76 | def main(): 77 | 78 | curr_dir = pathlib.Path(__file__) 79 | home_dir = curr_dir.parent.parent.parent 80 | # TODO - Optionally add visualization params as well 81 | # params_file = home_dir.as_posix() + '/params.yaml' 82 | # params = yaml.safe_load(open(params_file))["train_model"] 83 | 84 | model_file = sys.argv[1] 85 | # Load the model. 86 | model = joblib.load(model_file) 87 | 88 | # Load the data. 89 | input_file = sys.argv[2] 90 | data_path = home_dir.as_posix() + input_file 91 | output_path = home_dir.as_posix() + '/dvclive' 92 | pathlib.Path(output_path).mkdir(parents=True, exist_ok=True) 93 | 94 | TARGET = 'Class' 95 | train_features = pd.read_csv(data_path + '/train.csv') 96 | X_train = train_features.drop(TARGET, axis=1) 97 | y_train = train_features[TARGET] 98 | feature_names = X_train.columns.to_list() 99 | 100 | test_features = pd.read_csv(data_path + '/test.csv') 101 | X_test = test_features.drop(TARGET, axis=1) 102 | y_test = test_features[TARGET] 103 | 104 | # Evaluate train and test datasets. 105 | with Live(output_path, dvcyaml=False) as live: 106 | evaluate(model, X_train, y_train, "train", live, output_path) 107 | evaluate(model, X_test, y_test, "test", live, output_path) 108 | 109 | # Dump feature importance plot. 110 | save_importance_plot(live, model, feature_names) 111 | 112 | if __name__ == "__main__": 113 | main() 114 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean data lint requirements sync_data_to_s3 sync_data_from_s3 2 | 3 | ################################################################################# 4 | # GLOBALS # 5 | ################################################################################# 6 | 7 | PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) 8 | BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://') 9 | PROFILE = default 10 | PROJECT_NAME = creditcard 11 | PYTHON_INTERPRETER = python3 12 | 13 | ifeq (,$(shell which conda)) 14 | HAS_CONDA=False 15 | else 16 | HAS_CONDA=True 17 | endif 18 | 19 | ################################################################################# 20 | # COMMANDS # 21 | ################################################################################# 22 | 23 | ## Install Python Dependencies 24 | requirements: test_environment 25 | $(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel 26 | $(PYTHON_INTERPRETER) -m pip install -r requirements.txt 27 | 28 | ## Make Dataset 29 | data: requirements 30 | $(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed 31 | 32 | ## Delete all compiled Python files 33 | clean: 34 | find . -type f -name "*.py[co]" -delete 35 | find . 
-type d -name "__pycache__" -delete 36 | 37 | ## Lint using flake8 38 | lint: 39 | flake8 src 40 | 41 | ## Upload Data to S3 42 | sync_data_to_s3: 43 | ifeq (default,$(PROFILE)) 44 | aws s3 sync data/ s3://$(BUCKET)/data/ 45 | else 46 | aws s3 sync data/ s3://$(BUCKET)/data/ --profile $(PROFILE) 47 | endif 48 | 49 | ## Download Data from S3 50 | sync_data_from_s3: 51 | ifeq (default,$(PROFILE)) 52 | aws s3 sync s3://$(BUCKET)/data/ data/ 53 | else 54 | aws s3 sync s3://$(BUCKET)/data/ data/ --profile $(PROFILE) 55 | endif 56 | 57 | ## Set up python interpreter environment 58 | create_environment: 59 | ifeq (True,$(HAS_CONDA)) 60 | @echo ">>> Detected conda, creating conda environment." 61 | ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER))) 62 | conda create --name $(PROJECT_NAME) python=3 63 | else 64 | conda create --name $(PROJECT_NAME) python=2.7 65 | endif 66 | @echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)" 67 | else 68 | $(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper 69 | @echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\ 70 | export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n" 71 | @bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)" 72 | @echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)" 73 | endif 74 | 75 | ## Test python environment is setup correctly 76 | test_environment: 77 | $(PYTHON_INTERPRETER) test_environment.py 78 | 79 | ################################################################################# 80 | # PROJECT RULES # 81 | ################################################################################# 82 | 83 | 84 | 85 | ################################################################################# 86 | # Self Documenting Commands # 87 | ################################################################################# 88 | 89 | .DEFAULT_GOAL := help 90 | 91 | # Inspired by