├── model-training └── xgboost │ ├── src │ ├── requirements.txt │ ├── train.py │ └── sagemaker_ray_helper.py │ ├── xgb_training_job.ipynb │ ├── clean_data.ipynb │ └── clean_large_data.ipynb ├── CODE_OF_CONDUCT.md ├── LICENSE ├── CONTRIBUTING.md ├── inference ├── inference.ipynb └── deploy_xgb_triton_endpoint.ipynb ├── README.md └── Fraud_Detection_Feature_Engineering_v22.ipynb /model-training/xgboost/src/requirements.txt: -------------------------------------------------------------------------------- 1 | ray[data,train] 2 | xgboost -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT No Attribution 2 | 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 13 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 15 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 17 | 18 | -------------------------------------------------------------------------------- /model-training/xgboost/src/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import ray 4 | from sagemaker_ray_helper import RayHelper 5 | from ray.data import Dataset 6 | from ray.train.xgboost import XGBoostTrainer 7 | from ray.train import Result, ScalingConfig 8 | 9 | 10 | def train_xgboost( 11 | num_workers: int, 12 | train_data: Dataset, 13 | test_data: Dataset, 14 | boosting_rounds: int = 100, 15 | use_gpu: bool = False, 16 | ) -> Result: 17 | 18 | params = { 19 | "tree_method": "gpu_hist" if use_gpu else "hist", 20 | "objective": "binary:logistic", 21 | "eval_metric": ["logloss", "error"], 22 | } 23 | 24 | trainer = XGBoostTrainer( 25 | scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu), 26 | label_column="TX_FRAUD_1", 27 | params=params, 28 | datasets={"train": train_data, "valid": test_data}, 29 | # datasets={"train": train_data}, 30 | num_boost_round=boosting_rounds, 31 | ) 32 | result = trainer.fit() 33 | print(result.metrics) 34 | 35 | return result 36 | 37 | 38 | if __name__ == "__main__": 39 | 40 | ray_helper = RayHelper() 41 | ray_helper.start_ray() 42 | 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument("--boost_round", type=int, default=100) 45 | parser.add_argument("--train_data_path", type=str) 46 | parser.add_argument("--test_data_path", type=str) 47 | 48 | args = parser.parse_args() 49 | 50 | use_gpu = os.environ.get("USE_GPU", "false").lower() == "true" 51 | 52 | if use_gpu: 53 | num_workers = int(ray.cluster_resources()["GPU"]) 54 | else: 55 | num_workers = int(ray.cluster_resources()["CPU"]) - 2 56 | 57 | train_data = ray.data.read_parquet(args.train_data_path) 58 | test_data = ray.data.read_parquet(args.test_data_path) 59 | 60 | result = train_xgboost( 61 | num_workers, train_data, test_data, args.boost_round, use_gpu 62 | ) 63 | 64 | model = XGBoostTrainer.get_model(result.checkpoint) 65 | 66 | model.save_model("/opt/ml/model/model.xgb") 67 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /model-training/xgboost/src/sagemaker_ray_helper.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import os 3 | import time 4 | import ray 5 | import socket 6 | import json 7 | import sys 8 | 9 | 10 | class RayHelper: 11 | def __init__(self, ray_port: str = "9339", redis_pass: str = "redis_password"): 12 | 13 | self.ray_port = ray_port 14 | self.redis_pass = redis_pass 15 | self.resource_config = self.get_resource_config() 16 | self.master_host = self.resource_config["hosts"][0] 17 | self.n_hosts = len(self.resource_config["hosts"]) 18 | 19 | @staticmethod 20 | def get_resource_config(): 21 | 22 | return dict( 23 | current_host=os.environ.get("SM_CURRENT_HOST"), 24 | hosts=json.loads(os.environ.get("SM_HOSTS")), 25 | ) 26 | 27 | def _get_ip_from_host(self): 28 | ip_wait_time = 200 29 | counter = 0 30 | ip = "" 31 | 32 | while counter < ip_wait_time and ip == "": 33 | try: 34 | ip = socket.gethostbyname(self.master_host) 35 | break 36 | except: 37 | counter += 1 38 | time.sleep(1) 39 | 40 | if counter == ip_wait_time and ip == "": 41 | raise Exception( 42 | "Exceeded max wait time of {}s for hostname resolution".format( 43 | ip_wait_time 44 | ) 45 | ) 46 | return ip 47 | 48 | def start_ray(self): 49 | 50 | master_ip = self._get_ip_from_host() 51 | 52 | if self.resource_config["current_host"] == self.master_host: 53 | if ray.is_initialized(): 54 | print("There is a Ray cluste already running. Shutting it down.") 55 | ray.shutdown() 56 | time.sleep(5) 57 | output = subprocess.run( 58 | [ 59 | "ray", 60 | "start", 61 | "--head", 62 | "-vvv", 63 | "--port", 64 | self.ray_port, 65 | "--redis-password", 66 | self.redis_pass, 67 | "--include-dashboard", 68 | "false", 69 | ], 70 | stdout=subprocess.PIPE, 71 | ) 72 | print(output.stdout.decode("utf-8")) 73 | ray.init(address="auto", include_dashboard=False) 74 | self._wait_for_workers() 75 | print("All workers present and accounted for") 76 | print(ray.cluster_resources()) 77 | 78 | else: 79 | time.sleep(10) 80 | output = subprocess.run( 81 | [ 82 | "ray", 83 | "start", 84 | f"--address={master_ip}:{self.ray_port}", 85 | "--redis-password", 86 | self.redis_pass, 87 | "--block", 88 | ], 89 | stdout=subprocess.PIPE, 90 | ) 91 | print(output.stdout.decode("utf-8")) 92 | sys.exit(0) 93 | 94 | def _wait_for_workers(self, timeout=60): 95 | 96 | print(f"Waiting {timeout} seconds for {self.n_hosts} nodes to join") 97 | 98 | while len(ray.nodes()) < self.n_hosts: 99 | print(f"{len(ray.nodes())} nodes connected to cluster") 100 | time.sleep(5) 101 | timeout -= 5 102 | if timeout == 0: 103 | raise Exception("Max timeout for nodes to join exceeded") 104 | -------------------------------------------------------------------------------- /model-training/xgboost/xgb_training_job.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%pip install -Uqq sagemaker" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "%pip install modin" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import boto3\n", 28 | "import json\n", 29 | "import sagemaker\n", 30 | "from pathlib import Path\n", 31 | "from sagemaker.pytorch import PyTorch # PyTorch Estimator for running our training job\n", 32 | "\n", 33 | "role = sagemaker.get_execution_role() # execution role for the endpoint\n", 34 | "sess = sagemaker.session.Session() # sagemaker session for interacting with different AWS APIs\n", 35 | "region = sess._region_name # region name of the current SageMaker Studio environment\n", 36 | "bucket = sess.default_bucket() # default bucket name\n", 37 | "account_id = sess.account_id()" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "train_data_path = \"s3://nvidia-aws-fraud-detection-demo-1/output121_clean/train/\"\n", 47 | "test_data_path = \"s3://nvidia-aws-fraud-detection-demo-1/output121_clean/test-small/\"" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "USE_GPU = True\n", 57 | "\n", 58 | "gpu_job = PyTorch(\n", 59 | " source_dir=\"src\",\n", 60 | " entry_point=\"train.py\",\n", 61 | " framework_version=\"2.3\",\n", 62 | " py_version=\"py311\",\n", 63 | " role=role,\n", 64 | " environment={\"USE_GPU\": str(USE_GPU)},\n", 65 | " hyperparameters={ \n", 66 | " \"boost_round\": 100,\n", 67 | " \"train_data_path\": train_data_path,\n", 68 | " \"test_data_path\": test_data_path,\n", 69 | " },\n", 70 | " instance_type=\"ml.g5.12xlarge\",\n", 71 | " instance_count = 1, \n", 72 | " max_run=1000, \n", 73 | " keep_alive_period_in_seconds=300 \n", 74 | ")" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "scrolled": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "gpu_job.fit()" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "USE_GPU = False\n", 95 | "\n", 96 | "cpu_job = PyTorch(\n", 97 | " source_dir=\"src\",\n", 98 | " entry_point=\"train.py\",\n", 99 | " framework_version=\"2.3\",\n", 100 | " py_version=\"py311\",\n", 101 | " role=role,\n", 102 | " environment={\"USE_GPU\": str(USE_GPU)},\n", 103 | " hyperparameters={ \n", 104 | " \"boost_round\": 100,\n", 105 | " \"train_data_path\": train_data_path,\n", 106 | " \"test_data_path\": test_data_path,\n", 107 | " },\n", 108 | " instance_type=\"ml.r5.12xlarge\",\n", 109 | " instance_count = 2, \n", 110 | " max_run=1000, \n", 111 | " keep_alive_period_in_seconds=300 \n", 112 | ")" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "scrolled": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "cpu_job.fit()" 124 | ] 125 | } 126 | ], 127 | "metadata": { 128 | "kernelspec": { 129 | "display_name": "Python 3 (ipykernel)", 130 | "language": "python", 131 | "name": "python3" 132 | }, 133 | "language_info": { 134 | "codemirror_mode": { 135 | "name": "ipython", 136 | "version": 3 137 | }, 138 | "file_extension": ".py", 139 | "mimetype": "text/x-python", 140 | "name": "python", 141 | "nbconvert_exporter": "python", 142 | "pygments_lexer": "ipython3", 143 | "version": "3.10.14" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 4 148 | } 149 | -------------------------------------------------------------------------------- /inference/inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "4d61cf6b-1884-4ea3-9fbf-c36ea9bfa866", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import boto3\n", 11 | "import tarfile\n", 12 | "import xgboost as xgb\n", 13 | "import pandas as pd\n", 14 | "import numpy as np\n", 15 | "import os\n", 16 | "\n", 17 | "def download_model_from_s3(bucket_name, s3_file_path, local_file_path):\n", 18 | " s3 = boto3.client('s3')\n", 19 | " s3.download_file(bucket_name, s3_file_path, local_file_path)\n", 20 | "\n", 21 | "def extract_model(tar_file_path, extract_path):\n", 22 | " with tarfile.open(tar_file_path, 'r:gz') as tar:\n", 23 | " tar.extractall(path=extract_path)\n", 24 | "\n", 25 | "def load_model(model_path):\n", 26 | " return xgb.Booster(model_file=model_path)\n", 27 | "\n", 28 | "def preprocess_data(df):\n", 29 | " # List of features the model expects\n", 30 | " expected_features = ['TX_AMOUNT', 'yyyy', 'mm', 'dd', 'customer_id_nb_txns_15min_window', \n", 31 | " 'customer_id_nb_txns_30min_window', 'customer_id_nb_txns_60min_window', \n", 32 | " 'customer_id_nb_txns_1day_window', 'customer_id_nb_txns_7day_window', \n", 33 | " 'customer_id_nb_txns_15day_window', 'customer_id_nb_txns_30day_window', \n", 34 | " 'customer_id_avg_amt_15min_window', 'customer_id_avg_amt_30min_window', \n", 35 | " 'customer_id_avg_amt_60min_window', 'customer_id_avg_amt_1day_window', \n", 36 | " 'customer_id_avg_amt_7day_window', 'customer_id_avg_amt_15day_window', \n", 37 | " 'customer_id_avg_amt_30day_window', 'terminal_id_nb_txns_15min_window', \n", 38 | " 'terminal_id_nb_txns_30min_window', 'terminal_id_nb_txns_60min_window', \n", 39 | " 'terminal_id_nb_txns_1day_window', 'terminal_id_nb_txns_7day_window', \n", 40 | " 'terminal_id_nb_txns_15day_window', 'terminal_id_nb_txns_30day_window', \n", 41 | " 'terminal_id_avg_amt_15min_window', 'terminal_id_avg_amt_30min_window', \n", 42 | " 'terminal_id_avg_amt_60min_window', 'terminal_id_avg_amt_1day_window', \n", 43 | " 'terminal_id_avg_amt_7day_window', 'terminal_id_avg_amt_15day_window', \n", 44 | " 'terminal_id_avg_amt_30day_window']\n", 45 | "\n", 46 | " # Check if all expected features are in the DataFrame\n", 47 | " missing_features = set(expected_features) - set(df.columns)\n", 48 | " if missing_features:\n", 49 | " raise ValueError(f\"Missing features in input data: {missing_features}\")\n", 50 | "\n", 51 | " # Select only the expected features\n", 52 | " df_selected = df[expected_features]\n", 53 | "\n", 54 | " # Handle missing values if any\n", 55 | " df_selected = df_selected.fillna(0) # or use another appropriate method\n", 56 | "\n", 57 | " return df_selected\n", 58 | "\n", 59 | "\n", 60 | "def predict(model, input_data):\n", 61 | " dmatrix = xgb.DMatrix(input_data)\n", 62 | " return model.predict(dmatrix)\n", 63 | "\n", 64 | "# S3 details\n", 65 | "bucket_name = 'sagemaker-us-east-2-386900942011'\n", 66 | "s3_file_path = 'cpu-job-2024-10-21-18-14-03-937/output/model.tar.gz'\n", 67 | "local_file_path = '/tmp/model.tar.gz'\n", 68 | "extract_path = '/tmp/model'\n", 69 | "\n", 70 | "# Download and extract the model\n", 71 | "download_model_from_s3(bucket_name, s3_file_path, local_file_path)\n", 72 | "extract_model(local_file_path, extract_path)\n", 73 | "\n", 74 | "# Load the model\n", 75 | "model = load_model(os.path.join(extract_path, 'model.xgb'))\n", 76 | "\n", 77 | "# Load your Parquet file\n", 78 | "parquet_file_path = 's3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_0.parquet'\n", 79 | "df = pd.read_parquet(parquet_file_path)\n", 80 | "\n", 81 | "print(\"Columns in the Parquet file:\")\n", 82 | "print(df.columns.tolist())\n", 83 | "\n", 84 | "# Preprocess the data\n", 85 | "try:\n", 86 | " preprocessed_data = preprocess_data(df)\n", 87 | "except ValueError as e:\n", 88 | " print(f\"Error: {e}\")\n", 89 | " # Handle the error appropriately, maybe exit the script\n", 90 | " exit(1)\n", 91 | "\n", 92 | "# Make predictions\n", 93 | "predictions = predict(model, preprocessed_data)\n", 94 | "\n", 95 | "# Add predictions to the original dataframe\n", 96 | "df['predictions'] = predictions\n", 97 | "\n", 98 | "# Display a few rows with predictions\n", 99 | "print(\"\\nSample predictions:\")\n", 100 | "print(df[['TX_FRAUD_1', 'TERMINAL_ID_index', 'merchant_index', 'predictions']])\n", 101 | "\n", 102 | "# Clean up\n", 103 | "os.remove(local_file_path)\n", 104 | "os.remove(os.path.join(extract_path, 'model.xgb'))\n", 105 | "os.rmdir(extract_path)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "id": "183ddb7b-faf1-4e75-a46e-dae3f9addfd4", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 3 (ipykernel)", 120 | "language": "python", 121 | "name": "python3" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.11.9" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 5 138 | } 139 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Accelerating Fraud Detection in Financial Services with NVIDIA RAPIDS on AWS 2 | 3 | This repository demonstrates how to accelerate fraud detection workflows in financial services using **NVIDIA RAPIDS** on **AWS**. The project showcases a GPU-accelerated data pipeline that significantly improves processing speed and cost efficiency compared to traditional CPU-based workflows. 4 | 5 | ## Table of Contents 6 | 1. [What is NVIDIA RAPIDS?](#what-is-nvidia-rapids) 7 | 2. [Why NVIDIA RAPIDS Benefits Financial Services](#why-nvidia-rapids-benefits-financial-services) 8 | 3. [Setting Up EMR Clusters with NVIDIA GPUs](#setting-up-emr-clusters-with-nvidia-gpus) 9 | 4. [Fraud Detection Pipeline](#fraud-detection-pipeline) 10 | 5. [Performance Benchmarks and Cost Efficiency](#performance-benchmarks-and-cost-efficiency) 11 | 6. [Conclusion](#conclusion) 12 | 13 | --- 14 | 15 | ## What is NVIDIA RAPIDS? 16 | NVIDIA RAPIDS is a suite of open-source GPU-accelerated libraries designed to accelerate data science and machine learning workflows. Built on CUDA-X AI, RAPIDS integrates seamlessly with popular frameworks like Apache Spark, enabling massive parallelization of data processing tasks. By leveraging the computational power of GPUs, RAPIDS drastically reduces the time required for operations such as data ingestion, feature engineering, and model training. 17 | 18 | Unlike traditional CPU-based workflows, RAPIDS processes large datasets in-memory, bypassing bottlenecks associated with disk I/O and CPU thread limitations. This makes it ideal for real-time analytics and scalable pipelines, particularly in industries like financial services where speed and accuracy are critical. 19 | 20 | ## Why NVIDIA RAPIDS Benefits Financial Services 21 | Fraud detection in financial services is challenging, requiring real-time insights from complex, large-scale datasets. Traditional CPU-based systems struggle to keep pace with the sheer volume of transactions and the computational demands of feature engineering. NVIDIA RAPIDS, with its GPU-accelerated capabilities, enables faster and more cost-effective data processing, providing an edge in building scalable and efficient data pipelines. 22 | 23 | With RAPIDS, financial institutions can: 24 | - **Detect fraud in near real-time**, minimizing losses. 25 | - **Scale efficiently** to handle growing transaction volumes. 26 | - **Lower infrastructure costs** by reducing reliance on expensive CPU clusters. 27 | 28 | This repository demonstrates a GPU-accelerated fraud detection workflow using NVIDIA RAPIDS and Apache Spark on AWS. The workflow showcases significant improvements in processing speed and cost efficiency compared to CPU-based alternatives. 29 | 30 | --- 31 | 32 | ## Setting Up EMR Clusters with NVIDIA GPUs 33 | To leverage NVIDIA RAPIDS for fraud detection, the EMR cluster must be configured with GPU-enabled instances and specific settings to optimize performance. 34 | You can download the source files for [customers](https://d2908q01vomqb2.cloudfront.net/artifacts/DBSBlogs/FSI-NVIDIA-rapids/customers_parquet.tar.gz), [terminals](https://d2908q01vomqb2.cloudfront.net/artifacts/DBSBlogs/FSI-NVIDIA-rapids/terminals_parquet.tar.gz), [transactions-part1](https://d2908q01vomqb2.cloudfront.net/artifacts/DBSBlogs/FSI-NVIDIA-rapids/transactions_parquet_part1.tar.gz), [transactions-part2](https://d2908q01vomqb2.cloudfront.net/artifacts/DBSBlogs/FSI-NVIDIA-rapids/transactions_parquet_part2.tar.gz) 35 | 36 | ### GPU Cluster Configuration 37 | #### Instances: 38 | - **Primary Node:** `M5.xlarge` 39 | - **Core Nodes:** `12 nodes of G6.4xlarge` (GPU-enabled) 40 | 41 | #### Bootstrap Script: 42 | ```bash 43 | #!/bin/bash 44 | set -ex 45 | sudo mkdir -p /spark-rapids-cgroup/devices 46 | sudo mount -t cgroup -o devices cgroupv1-devices /spark-rapids-cgroup/devices 47 | sudo chmod a+rwx -R /spark-rapids-cgroup 48 | sudo pip3 install numpy 49 | ``` 50 | 51 | #### JSON Configuration: 52 | ```json 53 | [ 54 | { 55 | "Classification": "spark", 56 | "Properties": { 57 | "enableSparkRapids": "true" 58 | } 59 | }, 60 | { 61 | "Classification": "spark-defaults", 62 | "Properties": { 63 | "spark.executor.memory": "30G", 64 | "spark.executor.instances": "12", 65 | "spark.executor.resource.gpu.amount": "1", 66 | "spark.plugins": "com.nvidia.spark.SQLPlugin", 67 | "spark.rapids.sql.enabled": "true" 68 | } 69 | } 70 | ] 71 | ``` 72 | 73 | --- 74 | 75 | ## Fraud Detection Pipeline 76 | ### Step 1: Initialize Spark Session with GPU Optimizations 77 | ```python 78 | spark = SparkSession.builder \ 79 | .config("spark.executor.memory", "80G") \ 80 | .config("spark.sql.shuffle.partitions", "20000") \ 81 | .getOrCreate() 82 | ``` 83 | 84 | ### Step 2: Load and Prepare Data 85 | ```python 86 | customers_df = spark.read.parquet(customers_path).repartition(300) 87 | transactions_df = spark.read.parquet(transactions_path).repartition(1000) 88 | ``` 89 | 90 | ### Step 3: Convert TX_DATETIME to Timestamp 91 | ```python 92 | transactions_df = transactions_df.withColumn("TX_DATETIME", F.col("TX_DATETIME").cast("timestamp")) 93 | ``` 94 | 95 | ### Step 4: Extract Date Components 96 | ```python 97 | transactions_df = transactions_df.withColumn("yyyy", year(F.col("TX_DATETIME"))) \ 98 | .withColumn("mm", month(F.col("TX_DATETIME"))) \ 99 | .withColumn("dd", dayofmonth(F.col("TX_DATETIME"))) 100 | ``` 101 | 102 | ### Step 5: Save Final Dataset 103 | ```python 104 | final_df.write.mode("overwrite").parquet("s3://path/to/output/") 105 | ``` 106 | 107 | --- 108 | 109 | ## Performance Benchmarks and Cost Efficiency 110 | | Instance Type | Core Count | Hourly Cost | Run Time (Minutes) | Total Cost | 111 | |--------------|------------|-------------|--------------------|------------| 112 | | GPU (G6.4xlarge) | 12 | $1.323 | 43 | $11.52 | 113 | | CPU (R7i.4xLarge) | 12 | $1.058 | 450 | $96.66 | 114 | | CPU (R7a.4xlarge) | 12 | $1.217 | 246 | $60.67 | 115 | 116 | ### Key Takeaways: 117 | - **up to 10.5x Speedup**: GPU workflows process data in minutes, enabling real-time fraud alerts. 118 | - **up to 8.4x Cost Reduction**: Lower infrastructure costs due to reduced runtime and optimized resource usage. 119 | 120 | --- 121 | 122 | ## Conclusion 123 | NVIDIA RAPIDS, integrated with AWS, transforms fraud detection pipelines by delivering unmatched performance and cost efficiency. By combining GPU acceleration with AWS’s scalable infrastructure, businesses can achieve real-time insights, minimize losses, and build a future-proof fraud detection system. 124 | 125 | **Ready to supercharge your pipeline?** Explore NVIDIA RAPIDS and AWS GPU instances today to unlock the next level of speed and efficiency. 126 | 127 | -------------------------------------------------------------------------------- /model-training/xgboost/clean_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Collecting duckdb\n", 13 | " Downloading duckdb-1.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (762 bytes)\n", 14 | "Downloading duckdb-1.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.1 MB)\n", 15 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m20.1/20.1 MB\u001b[0m \u001b[31m56.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", 16 | "\u001b[?25hInstalling collected packages: duckdb\n", 17 | "Successfully installed duckdb-1.1.2\n", 18 | "Note: you may need to restart the kernel to use updated packages.\n" 19 | ] 20 | } 21 | ], 22 | "source": [ 23 | "%pip install duckdb" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "import duckdb\n", 33 | "from pathlib import Path" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "application/vnd.jupyter.widget-view+json": { 44 | "model_id": "439b43e18fc549089a3da72ab0117f57", 45 | "version_major": 2, 46 | "version_minor": 0 47 | }, 48 | "text/plain": [ 49 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 50 | ] 51 | }, 52 | "metadata": {}, 53 | "output_type": "display_data" 54 | }, 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "" 59 | ] 60 | }, 61 | "execution_count": 4, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "con = duckdb.connect()\n", 68 | "con.execute(\"CREATE TABLE training_data AS SELECT * FROM 'data/train.parquet'\")\n", 69 | "con.execute(\"CREATE TABLE validation_data AS SELECT * FROM 'data/validation.parquet'\")" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 5, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "columns_to_drop = [\n", 79 | " \"CUSTOMER_ID_index\",\n", 80 | " \"customer_name_index\",\n", 81 | " \"customer_email_index\",\n", 82 | " \"phone_index\",\n", 83 | " \"billing_zip\",\n", 84 | " \"billing_city_index\",\n", 85 | " \"billing_state_index\",\n", 86 | " \"x_customer_id\",\n", 87 | " \"y_customer_id\",\n", 88 | "]" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 6, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "application/vnd.jupyter.widget-view+json": { 99 | "model_id": "53e4420549e246b2b4e3d4527fefeae6", 100 | "version_major": 2, 101 | "version_minor": 0 102 | }, 103 | "text/plain": [ 104 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 105 | ] 106 | }, 107 | "metadata": {}, 108 | "output_type": "display_data" 109 | } 110 | ], 111 | "source": [ 112 | "for table_name in [\"training_data\", \"validation_data\"]:\n", 113 | " # drop bad feature columns\n", 114 | " for column in columns_to_drop:\n", 115 | " con.execute(f\"ALTER TABLE {table_name} DROP COLUMN {column}\")\n", 116 | " \n", 117 | " # convert TX_AMOUNT to double\n", 118 | " con.execute(f\"ALTER TABLE {table_name} ADD COLUMN TX_AMOUNT_TEMP DOUBLE\")\n", 119 | " con.execute(f\"UPDATE {table_name} SET TX_AMOUNT_TEMP = CAST(TX_AMOUNT AS DOUBLE)\")\n", 120 | " con.execute(f\"ALTER TABLE {table_name} DROP COLUMN TX_AMOUNT\")\n", 121 | " con.execute(f\"ALTER TABLE {table_name} RENAME COLUMN TX_AMOUNT_TEMP TO TX_AMOUNT\")" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 7, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "rows_per_file = 1_000_000\n", 131 | "cleaned_data_path = \"data/cleaned_data\"\n", 132 | "\n", 133 | "Path(cleaned_data_path).mkdir(parents=True, exist_ok=True)\n", 134 | "\n", 135 | "for table_name in [\"training_data\", \"validation_data\"]:\n", 136 | " num_rows = con.execute(f\"SELECT COUNT(*) FROM {table_name}\").fetchone()[0]\n", 137 | " \n", 138 | " Path(f\"{cleaned_data_path}/{table_name}\").mkdir(parents=True, exist_ok=True)\n", 139 | " \n", 140 | " for i in range(0, num_rows, rows_per_file):\n", 141 | " query = f\"SELECT * FROM {table_name} LIMIT {rows_per_file} OFFSET {i}\"\n", 142 | " df = con.execute(query).fetchdf()\n", 143 | " \n", 144 | " df.to_parquet(f\"{cleaned_data_path}/{table_name}/{table_name}_{i}.parquet\")" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 8, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "upload: data/cleaned_data/training_data/training_data_0.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_0.parquet\n", 157 | "upload: data/cleaned_data/training_data/training_data_13000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_13000000.parquet\n", 158 | "upload: data/cleaned_data/training_data/training_data_1000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_1000000.parquet\n", 159 | "upload: data/cleaned_data/training_data/training_data_10000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_10000000.parquet\n", 160 | "upload: data/cleaned_data/training_data/training_data_12000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_12000000.parquet\n", 161 | "upload: data/cleaned_data/training_data/training_data_15000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_15000000.parquet\n", 162 | "upload: data/cleaned_data/training_data/training_data_16000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_16000000.parquet\n", 163 | "upload: data/cleaned_data/training_data/training_data_19000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_19000000.parquet\n", 164 | "upload: data/cleaned_data/training_data/training_data_17000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_17000000.parquet\n", 165 | "upload: data/cleaned_data/training_data/training_data_11000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_11000000.parquet\n", 166 | "upload: data/cleaned_data/training_data/training_data_2000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_2000000.parquet\n", 167 | "upload: data/cleaned_data/training_data/training_data_18000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_18000000.parquet\n", 168 | "upload: data/cleaned_data/training_data/training_data_20000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_20000000.parquet\n", 169 | "upload: data/cleaned_data/training_data/training_data_14000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_14000000.parquet\n", 170 | "upload: data/cleaned_data/training_data/training_data_23000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_23000000.parquet\n", 171 | "upload: data/cleaned_data/training_data/training_data_21000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_21000000.parquet\n", 172 | "upload: data/cleaned_data/training_data/training_data_25000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_25000000.parquet\n", 173 | "upload: data/cleaned_data/training_data/training_data_26000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_26000000.parquet\n", 174 | "upload: data/cleaned_data/training_data/training_data_28000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_28000000.parquet\n", 175 | "upload: data/cleaned_data/training_data/training_data_24000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_24000000.parquet\n", 176 | "upload: data/cleaned_data/training_data/training_data_22000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_22000000.parquet\n", 177 | "upload: data/cleaned_data/training_data/training_data_29000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_29000000.parquet\n", 178 | "upload: data/cleaned_data/training_data/training_data_27000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_27000000.parquet\n", 179 | "upload: data/cleaned_data/training_data/training_data_30000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_30000000.parquet\n", 180 | "upload: data/cleaned_data/training_data/training_data_3000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_3000000.parquet\n", 181 | "upload: data/cleaned_data/training_data/training_data_31000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_31000000.parquet\n", 182 | "upload: data/cleaned_data/training_data/training_data_32000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_32000000.parquet\n", 183 | "upload: data/cleaned_data/training_data/training_data_36000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_36000000.parquet\n", 184 | "upload: data/cleaned_data/training_data/training_data_34000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_34000000.parquet\n", 185 | "upload: data/cleaned_data/training_data/training_data_38000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_38000000.parquet\n", 186 | "upload: data/cleaned_data/training_data/training_data_37000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_37000000.parquet\n", 187 | "upload: data/cleaned_data/training_data/training_data_35000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_35000000.parquet\n", 188 | "upload: data/cleaned_data/training_data/training_data_33000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_33000000.parquet\n", 189 | "upload: data/cleaned_data/training_data/training_data_39000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_39000000.parquet\n", 190 | "upload: data/cleaned_data/training_data/training_data_43000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_43000000.parquet\n", 191 | "upload: data/cleaned_data/training_data/training_data_41000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_41000000.parquet\n", 192 | "upload: data/cleaned_data/training_data/training_data_42000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_42000000.parquet\n", 193 | "upload: data/cleaned_data/training_data/training_data_44000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_44000000.parquet\n", 194 | "upload: data/cleaned_data/training_data/training_data_4000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_4000000.parquet\n", 195 | "upload: data/cleaned_data/training_data/training_data_40000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_40000000.parquet\n", 196 | "upload: data/cleaned_data/training_data/training_data_49000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_49000000.parquet\n", 197 | "upload: data/cleaned_data/training_data/training_data_48000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_48000000.parquet\n", 198 | "upload: data/cleaned_data/training_data/training_data_45000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_45000000.parquet\n", 199 | "upload: data/cleaned_data/training_data/training_data_46000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_46000000.parquet\n", 200 | "upload: data/cleaned_data/training_data/training_data_5000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_5000000.parquet\n", 201 | "upload: data/cleaned_data/training_data/training_data_51000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_51000000.parquet\n", 202 | "upload: data/cleaned_data/training_data/training_data_50000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_50000000.parquet\n", 203 | "upload: data/cleaned_data/training_data/training_data_53000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_53000000.parquet\n", 204 | "upload: data/cleaned_data/training_data/training_data_52000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_52000000.parquet\n", 205 | "upload: data/cleaned_data/training_data/training_data_55000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_55000000.parquet\n", 206 | "upload: data/cleaned_data/training_data/training_data_47000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_47000000.parquet\n", 207 | "upload: data/cleaned_data/training_data/training_data_56000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_56000000.parquet\n", 208 | "upload: data/cleaned_data/training_data/training_data_6000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_6000000.parquet\n", 209 | "upload: data/cleaned_data/training_data/training_data_57000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_57000000.parquet\n", 210 | "upload: data/cleaned_data/training_data/training_data_54000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_54000000.parquet\n", 211 | "upload: data/cleaned_data/training_data/training_data_58000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_58000000.parquet\n", 212 | "upload: data/cleaned_data/training_data/training_data_59000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_59000000.parquet\n", 213 | "upload: data/cleaned_data/training_data/training_data_61000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_61000000.parquet\n", 214 | "upload: data/cleaned_data/training_data/training_data_60000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_60000000.parquet\n", 215 | "upload: data/cleaned_data/training_data/training_data_64000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_64000000.parquet\n", 216 | "upload: data/cleaned_data/training_data/training_data_62000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_62000000.parquet\n", 217 | "upload: data/cleaned_data/training_data/training_data_66000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_66000000.parquet\n", 218 | "upload: data/cleaned_data/training_data/training_data_67000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_67000000.parquet\n", 219 | "upload: data/cleaned_data/training_data/training_data_68000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_68000000.parquet\n", 220 | "upload: data/cleaned_data/training_data/training_data_65000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_65000000.parquet\n", 221 | "upload: data/cleaned_data/training_data/training_data_69000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_69000000.parquet\n", 222 | "upload: data/cleaned_data/training_data/training_data_7000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_7000000.parquet\n", 223 | "upload: data/cleaned_data/validation_data/validation_data_0.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_0.parquet\n", 224 | "upload: data/cleaned_data/training_data/training_data_8000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_8000000.parquet\n", 225 | "upload: data/cleaned_data/training_data/training_data_63000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_63000000.parquet\n", 226 | "upload: data/cleaned_data/validation_data/validation_data_1000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_1000000.parquet\n", 227 | "upload: data/cleaned_data/training_data/training_data_9000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_9000000.parquet\n", 228 | "upload: data/cleaned_data/validation_data/validation_data_11000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_11000000.parquet\n", 229 | "upload: data/cleaned_data/validation_data/validation_data_10000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_10000000.parquet\n", 230 | "upload: data/cleaned_data/validation_data/validation_data_14000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_14000000.parquet\n", 231 | "upload: data/cleaned_data/validation_data/validation_data_13000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_13000000.parquet\n", 232 | "upload: data/cleaned_data/validation_data/validation_data_15000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_15000000.parquet\n", 233 | "upload: data/cleaned_data/validation_data/validation_data_12000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_12000000.parquet\n", 234 | "upload: data/cleaned_data/validation_data/validation_data_16000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_16000000.parquet\n", 235 | "upload: data/cleaned_data/validation_data/validation_data_17000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_17000000.parquet\n", 236 | "upload: data/cleaned_data/validation_data/validation_data_18000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_18000000.parquet\n", 237 | "upload: data/cleaned_data/validation_data/validation_data_19000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_19000000.parquet\n", 238 | "upload: data/cleaned_data/validation_data/validation_data_2000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_2000000.parquet\n", 239 | "upload: data/cleaned_data/validation_data/validation_data_3000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_3000000.parquet\n", 240 | "upload: data/cleaned_data/validation_data/validation_data_4000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_4000000.parquet\n", 241 | "upload: data/cleaned_data/validation_data/validation_data_6000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_6000000.parquet\n", 242 | "upload: data/cleaned_data/validation_data/validation_data_5000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_5000000.parquet\n", 243 | "upload: data/cleaned_data/validation_data/validation_data_7000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_7000000.parquet\n", 244 | "upload: data/cleaned_data/validation_data/validation_data_9000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_9000000.parquet\n", 245 | "upload: data/cleaned_data/validation_data/validation_data_8000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/validation_data/validation_data_8000000.parquet\n" 246 | ] 247 | } 248 | ], 249 | "source": [ 250 | "!aws s3 sync data/cleaned_data/ s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [] 259 | } 260 | ], 261 | "metadata": { 262 | "kernelspec": { 263 | "display_name": "Python 3 (ipykernel)", 264 | "language": "python", 265 | "name": "python3" 266 | }, 267 | "language_info": { 268 | "codemirror_mode": { 269 | "name": "ipython", 270 | "version": 3 271 | }, 272 | "file_extension": ".py", 273 | "mimetype": "text/x-python", 274 | "name": "python", 275 | "nbconvert_exporter": "python", 276 | "pygments_lexer": "ipython3", 277 | "version": "3.11.9" 278 | } 279 | }, 280 | "nbformat": 4, 281 | "nbformat_minor": 4 282 | } 283 | -------------------------------------------------------------------------------- /Fraud_Detection_Feature_Engineering_v22.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f12da5e9-44de-4459-86c8-27cefdad4950", 7 | "metadata": { 8 | "execution": { 9 | "iopub.execute_input": "2024-10-20T12:14:23.029461Z", 10 | "iopub.status.busy": "2024-10-20T12:14:23.029121Z", 11 | "iopub.status.idle": "2024-10-20T12:14:56.448746Z", 12 | "shell.execute_reply": "2024-10-20T12:14:56.448056Z", 13 | "shell.execute_reply.started": "2024-10-20T12:14:23.029424Z" 14 | }, 15 | "tags": [] 16 | }, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "application/vnd.jupyter.widget-view+json": { 21 | "model_id": "f63a9cedc0f24de386dbc7a352607f6d", 22 | "version_major": 2, 23 | "version_minor": 0 24 | }, 25 | "text/plain": [ 26 | "VBox()" 27 | ] 28 | }, 29 | "metadata": {}, 30 | "output_type": "display_data" 31 | }, 32 | { 33 | "name": "stdout", 34 | "output_type": "stream", 35 | "text": [ 36 | "Starting Spark application\n" 37 | ] 38 | }, 39 | { 40 | "data": { 41 | "text/html": [ 42 | "\n", 43 | "
IDYARN Application IDKindStateSpark UIDriver logUserCurrent session?
0application_1729426168720_0001pysparkidleLinkLinkNone
" 45 | ], 46 | "text/plain": [ 47 | "" 48 | ] 49 | }, 50 | "metadata": {}, 51 | "output_type": "display_data" 52 | }, 53 | { 54 | "data": { 55 | "application/vnd.jupyter.widget-view+json": { 56 | "model_id": "", 57 | "version_major": 2, 58 | "version_minor": 0 59 | }, 60 | "text/plain": [ 61 | "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" 62 | ] 63 | }, 64 | "metadata": {}, 65 | "output_type": "display_data" 66 | }, 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "SparkSession available as 'spark'.\n" 72 | ] 73 | }, 74 | { 75 | "data": { 76 | "application/vnd.jupyter.widget-view+json": { 77 | "model_id": "", 78 | "version_major": 2, 79 | "version_minor": 0 80 | }, 81 | "text/plain": [ 82 | "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" 83 | ] 84 | }, 85 | "metadata": {}, 86 | "output_type": "display_data" 87 | }, 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "The value of spark.kryoserializer.buffer.max is: 2047m\n", 93 | "Current spark.executor.memory: 80G\n", 94 | "spark.network.timeout: 1600s\n", 95 | "spark.driver.maxResultSize: Not Set\n", 96 | "spark.executor.heartbeatInterval: Not Set" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "#Load Libraries and initialize session\n", 102 | "from pyspark.sql import functions as F\n", 103 | "from pyspark.sql.window import Window\n", 104 | "from pyspark.sql import SparkSession\n", 105 | "from pyspark import SparkConf\n", 106 | "from pyspark.ml.feature import StringIndexer\n", 107 | "from pyspark.sql.functions import year, month, dayofmonth\n", 108 | "from pyspark.sql.functions import broadcast\n", 109 | "\n", 110 | "# Initialize Spark session with updated configurations\n", 111 | "spark = SparkSession.builder \\\n", 112 | " .appName(\"Fraud Detection Feature Engineering\") \\\n", 113 | " .config(\"spark.serializer\", \"org.apache.spark.serializer.KryoSerializer\") \\\n", 114 | " .config(\"spark.kryoserializer.buffer.max\", \"2047m\") \\\n", 115 | " .config(\"spark.executor.memory\", \"80G\") \\\n", 116 | " .config(\"spark.shuffle.compress\", \"true\") \\\n", 117 | " .config(\"spark.shuffle.spill.compress\", \"true\") \\\n", 118 | " .config(\"spark.sql.shuffle.partitions\", \"20000\") \\\n", 119 | " .getOrCreate()\n", 120 | "\n", 121 | "# Retrieve the value of spark.config\n", 122 | "network_timeout = spark.conf.get(\"spark.network.timeout\", \"Not Set\")\n", 123 | "max_result_size = spark.conf.get(\"spark.driver.maxResultSize\", \"Not Set\")\n", 124 | "heartbeat_interval = spark.conf.get(\"spark.executor.heartbeatInterval\", \"Not Set\")\n", 125 | "buffer_max = spark.conf.get(\"spark.kryoserializer.buffer.max\")\n", 126 | "print(f\"The value of spark.kryoserializer.buffer.max is: {buffer_max}\")\n", 127 | "executor_memory = spark.conf.get(\"spark.executor.memory\")\n", 128 | "print(f\"Current spark.executor.memory: {executor_memory}\")\n", 129 | "print(f\"spark.network.timeout: {network_timeout}\")\n", 130 | "print(f\"spark.driver.maxResultSize: {max_result_size}\")\n", 131 | "print(f\"spark.executor.heartbeatInterval: {heartbeat_interval}\")" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 2, 137 | "id": "96c4f4c3-94b8-47ac-b960-3b01e9c0ff8b", 138 | "metadata": { 139 | "execution": { 140 | "iopub.execute_input": "2024-10-20T12:14:56.450018Z", 141 | "iopub.status.busy": "2024-10-20T12:14:56.449802Z", 142 | "iopub.status.idle": "2024-10-20T12:15:05.780397Z", 143 | "shell.execute_reply": "2024-10-20T12:15:05.779648Z", 144 | "shell.execute_reply.started": "2024-10-20T12:14:56.449969Z" 145 | }, 146 | "tags": [] 147 | }, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "application/vnd.jupyter.widget-view+json": { 152 | "model_id": "b0472a922bcd412fb0ce453e0173f5c2", 153 | "version_major": 2, 154 | "version_minor": 0 155 | }, 156 | "text/plain": [ 157 | "VBox()" 158 | ] 159 | }, 160 | "metadata": {}, 161 | "output_type": "display_data" 162 | }, 163 | { 164 | "data": { 165 | "application/vnd.jupyter.widget-view+json": { 166 | "model_id": "", 167 | "version_major": 2, 168 | "version_minor": 0 169 | }, 170 | "text/plain": [ 171 | "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" 172 | ] 173 | }, 174 | "metadata": {}, 175 | "output_type": "display_data" 176 | }, 177 | { 178 | "name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | "Customers Schema:\n", 182 | "root\n", 183 | " |-- CUSTOMER_ID: string (nullable = true)\n", 184 | " |-- customer_name: string (nullable = true)\n", 185 | " |-- billing_street: string (nullable = true)\n", 186 | " |-- billing_city: string (nullable = true)\n", 187 | " |-- billing_state: string (nullable = true)\n", 188 | " |-- billing_zip: string (nullable = true)\n", 189 | " |-- customer_job: string (nullable = true)\n", 190 | " |-- customer_email: string (nullable = true)\n", 191 | " |-- phone: string (nullable = true)\n", 192 | " |-- x_customer_id: double (nullable = true)\n", 193 | " |-- y_customer_id: double (nullable = true)\n", 194 | " |-- mean_amount: double (nullable = true)\n", 195 | " |-- std_amount: double (nullable = true)\n", 196 | " |-- mean_nb_tx_per_day: double (nullable = true)\n", 197 | " |-- std_dev_nb_tx_per_day: double (nullable = true)\n", 198 | " |-- available_terminals: array (nullable = true)\n", 199 | " | |-- element: string (containsNull = true)\n", 200 | "\n", 201 | "Terminals Schema:\n", 202 | "root\n", 203 | " |-- TERMINAL_ID: string (nullable = true)\n", 204 | " |-- x_terminal_id: double (nullable = true)\n", 205 | " |-- y_terminal_id: double (nullable = true)\n", 206 | " |-- merchant: string (nullable = true)\n", 207 | "\n", 208 | "Transactions Schema:\n", 209 | "root\n", 210 | " |-- TX_DATETIME: string (nullable = true)\n", 211 | " |-- CUSTOMER_ID: string (nullable = true)\n", 212 | " |-- TERMINAL_ID: string (nullable = true)\n", 213 | " |-- TX_AMOUNT: double (nullable = true)\n", 214 | " |-- TX_TIME_SECONDS: long (nullable = true)\n", 215 | " |-- TX_TIME_DAYS: integer (nullable = true)\n", 216 | " |-- TX_FRAUD: integer (nullable = true)\n", 217 | " |-- month: string (nullable = true)" 218 | ] 219 | } 220 | ], 221 | "source": [ 222 | "# Load datasets and infer schema\n", 223 | "customers_path = \"s3://nvidia-aws-fraud-detection-demo-training-data/customers_parquet/\"\n", 224 | "terminals_path = \"s3://nvidia-aws-fraud-detection-demo-training-data/terminals_parquet/\"\n", 225 | "transactions_path = \"s3://nvidia-aws-fraud-detection-demo-training-data/transactions_parquet/\"\n", 226 | "\n", 227 | "customers_df = spark.read.parquet(customers_path).repartition(300)\n", 228 | "terminals_df = spark.read.parquet(terminals_path)\n", 229 | "transactions_df = spark.read.parquet(transactions_path).repartition(1000)\n", 230 | "_\n", 231 | "# Show schema of each dataset to understand their structure\n", 232 | "print(\"Customers Schema:\")\n", 233 | "customers_df.printSchema()\n", 234 | "\n", 235 | "print(\"Terminals Schema:\")\n", 236 | "terminals_df.printSchema()\n", 237 | "\n", 238 | "print(\"Transactions Schema:\")\n", 239 | "transactions_df.printSchema()\n", 240 | "\n", 241 | "# Count the rows in each dataset to understand the size\n", 242 | "#print(f\"Number of rows in customers: {customers_df.count()}\")\n", 243 | "#print(f\"Number of rows in terminals: {terminals_df.count()}\")\n", 244 | "#print(f\"Number of rows in transactions: {transactions_df.count()}\")" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 3, 250 | "id": "f5daaefd-fa96-4c24-8a70-03210dd49585", 251 | "metadata": { 252 | "execution": { 253 | "iopub.execute_input": "2024-10-20T12:15:05.781534Z", 254 | "iopub.status.busy": "2024-10-20T12:15:05.781359Z", 255 | "iopub.status.idle": "2024-10-20T12:15:06.039330Z", 256 | "shell.execute_reply": "2024-10-20T12:15:06.038692Z", 257 | "shell.execute_reply.started": "2024-10-20T12:15:05.781510Z" 258 | } 259 | }, 260 | "outputs": [ 261 | { 262 | "data": { 263 | "application/vnd.jupyter.widget-view+json": { 264 | "model_id": "810dc6eee9cf46d480bbd3da5f2b410f", 265 | "version_major": 2, 266 | "version_minor": 0 267 | }, 268 | "text/plain": [ 269 | "VBox()" 270 | ] 271 | }, 272 | "metadata": {}, 273 | "output_type": "display_data" 274 | }, 275 | { 276 | "data": { 277 | "application/vnd.jupyter.widget-view+json": { 278 | "model_id": "", 279 | "version_major": 2, 280 | "version_minor": 0 281 | }, 282 | "text/plain": [ 283 | "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" 284 | ] 285 | }, 286 | "metadata": {}, 287 | "output_type": "display_data" 288 | } 289 | ], 290 | "source": [ 291 | "# Broadcast smaller tables for efficient joins\n", 292 | "terminals_df = broadcast(terminals_df)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 4, 298 | "id": "f5d20ad4-5145-4b58-8747-d3ff6e6868f7", 299 | "metadata": { 300 | "execution": { 301 | "iopub.execute_input": "2024-10-20T12:15:06.040398Z", 302 | "iopub.status.busy": "2024-10-20T12:15:06.040226Z", 303 | "iopub.status.idle": "2024-10-20T12:15:06.300418Z", 304 | "shell.execute_reply": "2024-10-20T12:15:06.299813Z", 305 | "shell.execute_reply.started": "2024-10-20T12:15:06.040376Z" 306 | }, 307 | "tags": [] 308 | }, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "application/vnd.jupyter.widget-view+json": { 313 | "model_id": "c338511da9784351be85624d3f1c2394", 314 | "version_major": 2, 315 | "version_minor": 0 316 | }, 317 | "text/plain": [ 318 | "VBox()" 319 | ] 320 | }, 321 | "metadata": {}, 322 | "output_type": "display_data" 323 | }, 324 | { 325 | "data": { 326 | "application/vnd.jupyter.widget-view+json": { 327 | "model_id": "", 328 | "version_major": 2, 329 | "version_minor": 0 330 | }, 331 | "text/plain": [ 332 | "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" 333 | ] 334 | }, 335 | "metadata": {}, 336 | "output_type": "display_data" 337 | } 338 | ], 339 | "source": [ 340 | "# Convert the TX_DATETIME column to timestamp\n", 341 | "transactions_df = transactions_df.withColumn(\n", 342 | " \"TX_DATETIME\",\n", 343 | " F.col(\"TX_DATETIME\").cast(\"timestamp\"))\n", 344 | "\n", 345 | "# Split TX_DATETIME into yyyy, mm, and dd columns\n", 346 | "transactions_df = transactions_df.withColumn(\"yyyy\", year(F.col(\"TX_DATETIME\"))) \\\n", 347 | " .withColumn(\"mm\", month(F.col(\"TX_DATETIME\"))) \\\n", 348 | " .withColumn(\"dd\", dayofmonth(F.col(\"TX_DATETIME\")))\n", 349 | "\n", 350 | "# Define time windows in seconds for feature extraction\n", 351 | "time_windows = {\n", 352 | " \"15min\": 15 * 60,\n", 353 | " \"30min\": 30 * 60,\n", 354 | " \"60min\": 60 * 60,\n", 355 | " \"1day\": 24 * 60 * 60,\n", 356 | " \"7day\": 7 * 24 * 60 * 60,\n", 357 | " \"15day\": 15 * 24 * 60 * 60,\n", 358 | " \"30day\": 30 * 24 * 60 * 60\n", 359 | "}" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 5, 365 | "id": "80d130e0-70b3-4892-921c-948862b990cf", 366 | "metadata": { 367 | "execution": { 368 | "iopub.execute_input": "2024-10-20T12:15:06.302015Z", 369 | "iopub.status.busy": "2024-10-20T12:15:06.301848Z", 370 | "iopub.status.idle": "2024-10-20T12:23:13.490975Z", 371 | "shell.execute_reply": "2024-10-20T12:23:13.490302Z", 372 | "shell.execute_reply.started": "2024-10-20T12:15:06.301993Z" 373 | }, 374 | "tags": [] 375 | }, 376 | "outputs": [ 377 | { 378 | "data": { 379 | "application/vnd.jupyter.widget-view+json": { 380 | "model_id": "5a7d4191ef0343d2a33efda3b0f743ef", 381 | "version_major": 2, 382 | "version_minor": 0 383 | }, 384 | "text/plain": [ 385 | "VBox()" 386 | ] 387 | }, 388 | "metadata": {}, 389 | "output_type": "display_data" 390 | }, 391 | { 392 | "data": { 393 | "application/vnd.jupyter.widget-view+json": { 394 | "model_id": "", 395 | "version_major": 2, 396 | "version_minor": 0 397 | }, 398 | "text/plain": [ 399 | "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" 400 | ] 401 | }, 402 | "metadata": {}, 403 | "output_type": "display_data" 404 | } 405 | ], 406 | "source": [ 407 | "# Define a function to add window features efficiently\n", 408 | "def add_window_features(transactions_df, time_windows, entity_id_col, prefix):\n", 409 | " for window_name, window_duration in time_windows.items():\n", 410 | " window_spec = Window.partitionBy(entity_id_col).orderBy(\n", 411 | " F.col(\"TX_DATETIME\").cast(\"long\")).rangeBetween(\n", 412 | " -window_duration, 0)\n", 413 | "\n", 414 | " # Number of transactions in the time window\n", 415 | " transactions_df = transactions_df.withColumn(\n", 416 | " f\"{prefix}_nb_txns_{window_name}_window\",\n", 417 | " F.count(\"*\").over(window_spec))\n", 418 | "\n", 419 | " # Average transaction amount in the time window\n", 420 | " transactions_df = transactions_df.withColumn(\n", 421 | " f\"{prefix}_avg_amt_{window_name}_window\",\n", 422 | " F.avg(\"TX_AMOUNT\").over(window_spec))\n", 423 | "\n", 424 | " return transactions_df\n", 425 | "\n", 426 | "\n", 427 | "# Add customer-related features\n", 428 | "transactions_df = add_window_features(transactions_df, time_windows,\n", 429 | " \"CUSTOMER_ID\", \"customer_id\")\n", 430 | "\n", 431 | "# Add terminal-related features\n", 432 | "transactions_df = add_window_features(transactions_df, time_windows,\n", 433 | " \"TERMINAL_ID\", \"terminal_id\")\n", 434 | "\n", 435 | "# Ordinal Encoding using StringIndexer for CUSTOMER_ID and TERMINAL_ID\n", 436 | "customer_indexer = StringIndexer(inputCol=\"CUSTOMER_ID\",\n", 437 | " outputCol=\"CUSTOMER_ID_index\",\n", 438 | " handleInvalid=\"keep\").fit(transactions_df)\n", 439 | "transactions_df = customer_indexer.transform(transactions_df)\n", 440 | "\n", 441 | "# Apply the same StringIndexer to customers_df to create the CUSTOMER_ID_index column\n", 442 | "customers_df = customer_indexer.transform(customers_df)\n", 443 | "\n", 444 | "# Ordinal encoding for other columns in customers_df\n", 445 | "columns_to_encode_customers = ['customer_name', 'customer_email', 'phone']\n", 446 | "for column in columns_to_encode_customers:\n", 447 | " if column in customers_df.columns:\n", 448 | " indexer = StringIndexer(inputCol=column,\n", 449 | " outputCol=f\"{column}_index\",\n", 450 | " handleInvalid=\"keep\").fit(customers_df)\n", 451 | " customers_df = indexer.transform(customers_df)\n", 452 | "\n", 453 | "# Ordinal encoding for TERMINAL_ID in transactions_df\n", 454 | "terminal_indexer = StringIndexer(inputCol=\"TERMINAL_ID\",\n", 455 | " outputCol=\"TERMINAL_ID_index\",\n", 456 | " handleInvalid=\"keep\").fit(transactions_df)\n", 457 | "transactions_df = terminal_indexer.transform(transactions_df)\n", 458 | "\n", 459 | "# Apply the same StringIndexer to terminals_df to create the TERMINAL_ID_index column\n", 460 | "terminals_df = terminal_indexer.transform(terminals_df)\n", 461 | "\n", 462 | "# Ordinal encoding for merchant in both transactions_df and terminals_df\n", 463 | "if 'merchant' in transactions_df.columns:\n", 464 | " merchant_indexer = StringIndexer(inputCol='merchant',\n", 465 | " outputCol='merchant_index',\n", 466 | " handleInvalid=\"keep\").fit(transactions_df)\n", 467 | " transactions_df = merchant_indexer.transform(transactions_df)\n", 468 | "\n", 469 | "if 'merchant' in terminals_df.columns:\n", 470 | " merchant_indexer_terminals = StringIndexer(\n", 471 | " inputCol='merchant', outputCol='merchant_index',\n", 472 | " handleInvalid=\"keep\").fit(terminals_df)\n", 473 | " terminals_df = merchant_indexer_terminals.transform(terminals_df)\n", 474 | "\n", 475 | "# Apply StringIndexer to additional categorical columns in transactions_df\n", 476 | "columns_to_encode_transactions = ['merchant'] # Already handled 'merchant'\n", 477 | "for column in columns_to_encode_transactions:\n", 478 | " if column in transactions_df.columns:\n", 479 | " indexer = StringIndexer(inputCol=column,\n", 480 | " outputCol=f\"{column}_index\",\n", 481 | " handleInvalid=\"keep\").fit(transactions_df)\n", 482 | " transactions_df = indexer.transform(transactions_df)\n", 483 | " transactions_df = transactions_df.drop(column)\n", 484 | "\n", 485 | "# One-hot encoding for TX_FRAUD\n", 486 | "transactions_df = transactions_df.withColumn(\n", 487 | " \"TX_FRAUD_0\", (F.col(\"TX_FRAUD\") == 0).cast(\"int\"))\n", 488 | "transactions_df = transactions_df.withColumn(\n", 489 | " \"TX_FRAUD_1\", (F.col(\"TX_FRAUD\") == 1).cast(\"int\"))\n", 490 | "\n", 491 | "# Drop TX_FRAUD and TX_DATETIME column after encoding\n", 492 | "transactions_df = transactions_df.drop(\"TX_FRAUD\")\n", 493 | "\n", 494 | "transactions_df = transactions_df.drop(\"TX_DATETIME\")\n", 495 | "\n", 496 | "# Apply StringIndexer for billing_city and billing_state in customers_df\n", 497 | "billing_city_indexer = StringIndexer(inputCol=\"billing_city\", outputCol=\"billing_city_index\").fit(customers_df)\n", 498 | "customers_df = billing_city_indexer.transform(customers_df)\n", 499 | "\n", 500 | "billing_state_indexer = StringIndexer(inputCol=\"billing_state\", outputCol=\"billing_state_index\").fit(customers_df)\n", 501 | "customers_df = billing_state_indexer.transform(customers_df)\n", 502 | "\n", 503 | "# Drop the original columns after encoding\n", 504 | "customers_df = customers_df.drop(\"billing_city\", \"billing_state\")\n", 505 | "\n", 506 | "# Join the enriched transactions data with customer and terminal details\n", 507 | "#intermediate_df = transactions_df.join(customers_df,\n", 508 | "# on=\"CUSTOMER_ID_index\",\n", 509 | "# how=\"right\").join(terminals_df,\n", 510 | "# on=\"TERMINAL_ID_index\",\n", 511 | "# how=\"right\")\n", 512 | "#print(f\"Total number of rows for right join: {intermediate_df.count()}\")\n", 513 | "\n", 514 | "final_df = transactions_df.join(customers_df,\n", 515 | " on=\"CUSTOMER_ID_index\",\n", 516 | " how=\"left\").join(terminals_df,\n", 517 | " on=\"TERMINAL_ID_index\",\n", 518 | " how=\"left\")\n", 519 | "#print(f\"Total number of rows for left join: {final_df.count()}\")\n", 520 | "\n", 521 | "\n", 522 | "# Select the final features and customer/terminal details\n", 523 | "final_columns = [\n", 524 | " \"CUSTOMER_ID_index\",\n", 525 | " \"customer_name_index\",\n", 526 | " \"customer_email_index\",\n", 527 | " \"phone_index\",\n", 528 | " \"billing_zip\",\n", 529 | " \"billing_city_index\", # Ordinal encoded billing_city\n", 530 | " \"billing_state_index\", # Ordinal encoded billing_state\n", 531 | " \"x_customer_id\", # Added column\n", 532 | " \"y_customer_id\", # Added column\n", 533 | " \"TX_AMOUNT\",\n", 534 | " \"TX_FRAUD_0\", # One-hot encoded column\n", 535 | " \"TX_FRAUD_1\", # One-hot encoded column \n", 536 | " \"TERMINAL_ID_index\",\n", 537 | " \"merchant_index\", # Ensure 'merchant_index' is present\n", 538 | " \"yyyy\",\n", 539 | " \"mm\",\n", 540 | " \"dd\",\n", 541 | " # Customer-related features\n", 542 | " \"customer_id_nb_txns_15min_window\",\n", 543 | " \"customer_id_nb_txns_30min_window\",\n", 544 | " \"customer_id_nb_txns_60min_window\",\n", 545 | " \"customer_id_nb_txns_1day_window\",\n", 546 | " \"customer_id_nb_txns_7day_window\",\n", 547 | " \"customer_id_nb_txns_15day_window\",\n", 548 | " \"customer_id_nb_txns_30day_window\",\n", 549 | " \"customer_id_avg_amt_15min_window\",\n", 550 | " \"customer_id_avg_amt_30min_window\",\n", 551 | " \"customer_id_avg_amt_60min_window\",\n", 552 | " \"customer_id_avg_amt_1day_window\",\n", 553 | " \"customer_id_avg_amt_7day_window\",\n", 554 | " \"customer_id_avg_amt_15day_window\",\n", 555 | " \"customer_id_avg_amt_30day_window\",\n", 556 | " # Terminal-related features\n", 557 | " \"terminal_id_nb_txns_15min_window\",\n", 558 | " \"terminal_id_nb_txns_30min_window\",\n", 559 | " \"terminal_id_nb_txns_60min_window\",\n", 560 | " \"terminal_id_nb_txns_1day_window\",\n", 561 | " \"terminal_id_nb_txns_7day_window\",\n", 562 | " \"terminal_id_nb_txns_15day_window\",\n", 563 | " \"terminal_id_nb_txns_30day_window\",\n", 564 | " \"terminal_id_avg_amt_15min_window\",\n", 565 | " \"terminal_id_avg_amt_30min_window\",\n", 566 | " \"terminal_id_avg_amt_60min_window\",\n", 567 | " \"terminal_id_avg_amt_1day_window\",\n", 568 | " \"terminal_id_avg_amt_7day_window\",\n", 569 | " \"terminal_id_avg_amt_15day_window\",\n", 570 | " \"terminal_id_avg_amt_30day_window\"\n", 571 | "]\n", 572 | "\n", 573 | "final_df = final_df.select(final_columns).repartition(10000)" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "id": "8c81f498-533b-4ca3-b55c-68e6f1c1521a", 580 | "metadata": { 581 | "execution": { 582 | "iopub.execute_input": "2024-10-20T12:23:13.492345Z", 583 | "iopub.status.busy": "2024-10-20T12:23:13.492176Z" 584 | }, 585 | "tags": [] 586 | }, 587 | "outputs": [ 588 | { 589 | "data": { 590 | "application/vnd.jupyter.widget-view+json": { 591 | "model_id": "723c0f6be46248dfa213015cd339f07e", 592 | "version_major": 2, 593 | "version_minor": 0 594 | }, 595 | "text/plain": [ 596 | "VBox()" 597 | ] 598 | }, 599 | "metadata": {}, 600 | "output_type": "display_data" 601 | }, 602 | { 603 | "data": { 604 | "application/vnd.jupyter.widget-view+json": { 605 | "model_id": "8462cf0f61f644e1aaf0f432d130cd00", 606 | "version_major": 2, 607 | "version_minor": 0 608 | }, 609 | "text/plain": [ 610 | "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" 611 | ] 612 | }, 613 | "metadata": {}, 614 | "output_type": "display_data" 615 | } 616 | ], 617 | "source": [ 618 | "# Save the result to S3 as Parquet\n", 619 | "spark.conf.set(\"spark.sql.files.maxPartitionBytes\", \"128M\")\n", 620 | "spark.conf.set(\"spark.sql.autoBroadcastJoinThreshold\", \"500M\")\n", 621 | "final_output_path = \"s3://nvidia-aws-fraud-detection-demo/output121/\"\n", 622 | "final_df.write.mode(\"overwrite\").parquet(final_output_path)\n", 623 | "\n", 624 | "print(f\"Data successfully written to {final_output_path}\")\n", 625 | "# Stop the Spark session\n", 626 | "spark.stop()" 627 | ] 628 | } 629 | ], 630 | "metadata": { 631 | "kernelspec": { 632 | "display_name": "PySpark", 633 | "language": "python", 634 | "name": "pysparkkernel" 635 | }, 636 | "language_info": { 637 | "codemirror_mode": { 638 | "name": "python", 639 | "version": 3 640 | }, 641 | "file_extension": ".py", 642 | "mimetype": "text/x-python", 643 | "name": "pyspark", 644 | "pygments_lexer": "python3" 645 | } 646 | }, 647 | "nbformat": 4, 648 | "nbformat_minor": 5 649 | } 650 | -------------------------------------------------------------------------------- /inference/deploy_xgb_triton_endpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Requirement already satisfied: sagemaker in /opt/conda/lib/python3.11/site-packages (2.221.1)\n", 13 | "Collecting sagemaker\n", 14 | " Downloading sagemaker-2.235.2-py3-none-any.whl.metadata (16 kB)\n", 15 | "Requirement already satisfied: attrs<24,>=23.1.0 in /opt/conda/lib/python3.11/site-packages (from sagemaker) (23.2.0)\n", 16 | "Collecting boto3<2.0,>=1.34.142 (from sagemaker)\n", 17 | " Downloading boto3-1.35.70-py3-none-any.whl.metadata (6.7 kB)\n", 18 | "Requirement already satisfied: cloudpickle==2.2.1 in /opt/conda/lib/python3.11/site-packages (from sagemaker) (2.2.1)\n", 19 | "Requirement already satisfied: docker in /opt/conda/lib/python3.11/site-packages (from sagemaker) (7.1.0)\n", 20 | "Requirement already satisfied: google-pasta in /opt/conda/lib/python3.11/site-packages (from sagemaker) (0.2.0)\n", 21 | "Requirement already satisfied: importlib-metadata<7.0,>=1.4.0 in /opt/conda/lib/python3.11/site-packages (from sagemaker) (6.11.0)\n", 22 | "Requirement already satisfied: jsonschema in /opt/conda/lib/python3.11/site-packages (from sagemaker) (4.22.0)\n", 23 | "Collecting numpy<2.0,>=1.9.0 (from sagemaker)\n", 24 | " Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)\n", 25 | "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.11/site-packages (from sagemaker) (23.2)\n", 26 | "Requirement already satisfied: pandas in /opt/conda/lib/python3.11/site-packages (from sagemaker) (2.2.3)\n", 27 | "Requirement already satisfied: pathos in /opt/conda/lib/python3.11/site-packages (from sagemaker) (0.3.2)\n", 28 | "Requirement already satisfied: platformdirs in /opt/conda/lib/python3.11/site-packages (from sagemaker) (4.1.0)\n", 29 | "Requirement already satisfied: protobuf<5.0,>=3.12 in /opt/conda/lib/python3.11/site-packages (from sagemaker) (3.20.3)\n", 30 | "Requirement already satisfied: psutil in /opt/conda/lib/python3.11/site-packages (from sagemaker) (5.9.8)\n", 31 | "Requirement already satisfied: pyyaml~=6.0 in /opt/conda/lib/python3.11/site-packages (from sagemaker) (6.0.1)\n", 32 | "Requirement already satisfied: requests in /opt/conda/lib/python3.11/site-packages (from sagemaker) (2.32.2)\n", 33 | "Collecting sagemaker-core<2.0.0,>=1.0.15 (from sagemaker)\n", 34 | " Downloading sagemaker_core-1.0.16-py3-none-any.whl.metadata (4.9 kB)\n", 35 | "Requirement already satisfied: schema in /opt/conda/lib/python3.11/site-packages (from sagemaker) (0.7.7)\n", 36 | "Requirement already satisfied: smdebug-rulesconfig==1.0.1 in /opt/conda/lib/python3.11/site-packages (from sagemaker) (1.0.1)\n", 37 | "Requirement already satisfied: tblib<4,>=1.7.0 in /opt/conda/lib/python3.11/site-packages (from sagemaker) (3.0.0)\n", 38 | "Requirement already satisfied: tqdm in /opt/conda/lib/python3.11/site-packages (from sagemaker) (4.66.4)\n", 39 | "Requirement already satisfied: urllib3<3.0.0,>=1.26.8 in /opt/conda/lib/python3.11/site-packages (from sagemaker) (1.26.20)\n", 40 | "Collecting botocore<1.36.0,>=1.35.70 (from boto3<2.0,>=1.34.142->sagemaker)\n", 41 | " Downloading botocore-1.35.70-py3-none-any.whl.metadata (5.7 kB)\n", 42 | "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /opt/conda/lib/python3.11/site-packages (from boto3<2.0,>=1.34.142->sagemaker) (1.0.1)\n", 43 | "Requirement already satisfied: s3transfer<0.11.0,>=0.10.0 in /opt/conda/lib/python3.11/site-packages (from boto3<2.0,>=1.34.142->sagemaker) (0.10.1)\n", 44 | "Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.11/site-packages (from importlib-metadata<7.0,>=1.4.0->sagemaker) (3.21.0)\n", 45 | "Requirement already satisfied: pydantic<3.0.0,>=2.0.0 in /opt/conda/lib/python3.11/site-packages (from sagemaker-core<2.0.0,>=1.0.15->sagemaker) (2.7.1)\n", 46 | "Requirement already satisfied: rich<14.0.0,>=13.0.0 in /opt/conda/lib/python3.11/site-packages (from sagemaker-core<2.0.0,>=1.0.15->sagemaker) (13.7.1)\n", 47 | "Collecting mock<5.0,>4.0 (from sagemaker-core<2.0.0,>=1.0.15->sagemaker)\n", 48 | " Downloading mock-4.0.3-py3-none-any.whl.metadata (2.8 kB)\n", 49 | "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /opt/conda/lib/python3.11/site-packages (from jsonschema->sagemaker) (2023.12.1)\n", 50 | "Requirement already satisfied: referencing>=0.28.4 in /opt/conda/lib/python3.11/site-packages (from jsonschema->sagemaker) (0.35.1)\n", 51 | "Requirement already satisfied: rpds-py>=0.7.1 in /opt/conda/lib/python3.11/site-packages (from jsonschema->sagemaker) (0.18.1)\n", 52 | "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.11/site-packages (from requests->sagemaker) (3.3.2)\n", 53 | "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.11/site-packages (from requests->sagemaker) (3.7)\n", 54 | "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.11/site-packages (from requests->sagemaker) (2024.8.30)\n", 55 | "Requirement already satisfied: six in /opt/conda/lib/python3.11/site-packages (from google-pasta->sagemaker) (1.16.0)\n", 56 | "Requirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.11/site-packages (from pandas->sagemaker) (2.9.0)\n", 57 | "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.11/site-packages (from pandas->sagemaker) (2024.1)\n", 58 | "Requirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.11/site-packages (from pandas->sagemaker) (2024.1)\n", 59 | "Requirement already satisfied: ppft>=1.7.6.8 in /opt/conda/lib/python3.11/site-packages (from pathos->sagemaker) (1.7.6.8)\n", 60 | "Requirement already satisfied: dill>=0.3.8 in /opt/conda/lib/python3.11/site-packages (from pathos->sagemaker) (0.3.8)\n", 61 | "Requirement already satisfied: pox>=0.3.4 in /opt/conda/lib/python3.11/site-packages (from pathos->sagemaker) (0.3.4)\n", 62 | "Requirement already satisfied: multiprocess>=0.70.16 in /opt/conda/lib/python3.11/site-packages (from pathos->sagemaker) (0.70.16)\n", 63 | "Requirement already satisfied: annotated-types>=0.4.0 in /opt/conda/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->sagemaker-core<2.0.0,>=1.0.15->sagemaker) (0.7.0)\n", 64 | "Requirement already satisfied: pydantic-core==2.18.2 in /opt/conda/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->sagemaker-core<2.0.0,>=1.0.15->sagemaker) (2.18.2)\n", 65 | "Requirement already satisfied: typing-extensions>=4.6.1 in /opt/conda/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->sagemaker-core<2.0.0,>=1.0.15->sagemaker) (4.11.0)\n", 66 | "Requirement already satisfied: markdown-it-py>=2.2.0 in /opt/conda/lib/python3.11/site-packages (from rich<14.0.0,>=13.0.0->sagemaker-core<2.0.0,>=1.0.15->sagemaker) (3.0.0)\n", 67 | "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/conda/lib/python3.11/site-packages (from rich<14.0.0,>=13.0.0->sagemaker-core<2.0.0,>=1.0.15->sagemaker) (2.18.0)\n", 68 | "Requirement already satisfied: mdurl~=0.1 in /opt/conda/lib/python3.11/site-packages (from markdown-it-py>=2.2.0->rich<14.0.0,>=13.0.0->sagemaker-core<2.0.0,>=1.0.15->sagemaker) (0.1.2)\n", 69 | "Downloading sagemaker-2.235.2-py3-none-any.whl (1.6 MB)\n", 70 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m61.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 71 | "\u001b[?25hDownloading boto3-1.35.70-py3-none-any.whl (139 kB)\n", 72 | "Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)\n", 73 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m18.3/18.3 MB\u001b[0m \u001b[31m137.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 74 | "\u001b[?25hDownloading sagemaker_core-1.0.16-py3-none-any.whl (389 kB)\n", 75 | "Downloading botocore-1.35.70-py3-none-any.whl (13.0 MB)\n", 76 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.0/13.0 MB\u001b[0m \u001b[31m137.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 77 | "\u001b[?25hDownloading mock-4.0.3-py3-none-any.whl (28 kB)\n", 78 | "Installing collected packages: numpy, mock, botocore, boto3, sagemaker-core, sagemaker\n", 79 | " Attempting uninstall: numpy\n", 80 | " Found existing installation: numpy 2.1.3\n", 81 | " Uninstalling numpy-2.1.3:\n", 82 | " Successfully uninstalled numpy-2.1.3\n", 83 | " Attempting uninstall: botocore\n", 84 | " Found existing installation: botocore 1.34.112\n", 85 | " Uninstalling botocore-1.34.112:\n", 86 | " Successfully uninstalled botocore-1.34.112\n", 87 | " Attempting uninstall: boto3\n", 88 | " Found existing installation: boto3 1.34.112\n", 89 | " Uninstalling boto3-1.34.112:\n", 90 | " Successfully uninstalled boto3-1.34.112\n", 91 | " Attempting uninstall: sagemaker\n", 92 | " Found existing installation: sagemaker 2.221.1\n", 93 | " Uninstalling sagemaker-2.221.1:\n", 94 | " Successfully uninstalled sagemaker-2.221.1\n", 95 | "Successfully installed boto3-1.35.70 botocore-1.35.70 mock-4.0.3 numpy-1.26.4 sagemaker-2.235.2 sagemaker-core-1.0.16\n", 96 | "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n", 97 | "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n", 98 | "Collecting xgboost\n", 99 | " Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)\n", 100 | "Requirement already satisfied: numpy in /opt/conda/lib/python3.11/site-packages (from xgboost) (1.26.4)\n", 101 | "Collecting nvidia-nccl-cu12 (from xgboost)\n", 102 | " Downloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)\n", 103 | "Requirement already satisfied: scipy in /opt/conda/lib/python3.11/site-packages (from xgboost) (1.13.1)\n", 104 | "Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)\n", 105 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m153.9/153.9 MB\u001b[0m \u001b[31m117.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", 106 | "\u001b[?25hDownloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl (199.0 MB)\n", 107 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.0/199.0 MB\u001b[0m \u001b[31m102.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", 108 | "\u001b[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost\n", 109 | "Successfully installed nvidia-nccl-cu12-2.23.4 xgboost-2.1.3\n", 110 | "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n", 111 | "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n", 112 | "Requirement already satisfied: jinja2 in /opt/conda/lib/python3.11/site-packages (3.1.4)\n", 113 | "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.11/site-packages (from jinja2) (2.1.5)\n", 114 | "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n", 115 | "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "%pip install -U sagemaker\n", 121 | "%pip install -U xgboost\n", 122 | "%pip install -U jinja2" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 22, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "import sagemaker\n", 132 | "from sagemaker.multidatamodel import MultiDataModel\n", 133 | "from sagemaker.model import Model\n", 134 | "\n", 135 | "from pathlib import Path\n", 136 | "import boto3\n", 137 | "import json\n", 138 | "import shutil\n", 139 | "import datetime as dt\n", 140 | "import tarfile\n", 141 | "import xgboost as xgb\n", 142 | "import pandas as pd\n", 143 | "import numpy as np\n", 144 | "import time\n", 145 | "from concurrent.futures import ThreadPoolExecutor\n" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 4, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "role = sagemaker.get_execution_role()\n", 155 | "sess = sagemaker.Session()\n", 156 | "region = sess.boto_region_name\n", 157 | "\n", 158 | "triton_framework = \"sagemaker-tritonserver\"\n", 159 | "version = \"24.09\"\n", 160 | "instance_type = \"ml.g5.2xlarge\"\n", 161 | "\n", 162 | "test_data_path = (\n", 163 | " \"s3://sagemaker-us-east-1-152804913371/nvidia-aws-fraud-detection-demo/test/\"\n", 164 | ")\n", 165 | "trained_model_path = \"s3://sagemaker-us-east-1-152804913371/pytorch-training-2024-11-22-15-54-43-056/output/model.tar.gz\"\n", 166 | "\n", 167 | "mme_s3_uri = f\"s3://{sess.default_bucket()}/xgboost-mme\"\n", 168 | "\n", 169 | "mme_triton_image_uri = sagemaker.image_uris.retrieve(\n", 170 | " framework=triton_framework,\n", 171 | " region=region,\n", 172 | " version=version,\n", 173 | " instance_type=instance_type,\n", 174 | ")" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 5, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "download: s3://sagemaker-us-east-1-152804913371/pytorch-training-2024-11-22-15-54-43-056/output/model.tar.gz to ./model.tar.gz\n", 187 | "Model file name: xgboost.json\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "model_tar_name = Path(trained_model_path).name\n", 193 | "!aws s3 cp {trained_model_path} {model_tar_name}\n", 194 | "with tarfile.open(model_tar_name, \"r:gz\") as tar:\n", 195 | " model_file_name = tar.getnames()[0]\n", 196 | " tar.extractall()\n", 197 | "\n", 198 | "print(f\"Model file name: {model_file_name}\")" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 6, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "bst = xgb.Booster()\n", 208 | "bst.load_model(model_file_name)\n", 209 | "num_features = bst.num_features()" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 7, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "from jinja2 import Template\n", 219 | "\n", 220 | "config_template = \"\"\"backend: \"fil\"\n", 221 | "max_batch_size: 1000\n", 222 | "input [ \n", 223 | " { \n", 224 | " name: \"input__0\"\n", 225 | " data_type: TYPE_FP32\n", 226 | " dims: [ {{ input_size }} ] \n", 227 | " } \n", 228 | "]\n", 229 | "output [\n", 230 | " {\n", 231 | " name: \"output__0\"\n", 232 | " data_type: TYPE_FP32\n", 233 | " dims: [ 2 ]\n", 234 | " }\n", 235 | "]\n", 236 | "instance_group [{ kind: KIND_{{ device }} }]\n", 237 | "parameters [\n", 238 | " {\n", 239 | " key: \"model_type\"\n", 240 | " value: { string_value: \"xgboost_json\" }\n", 241 | " },\n", 242 | " {\n", 243 | " key: \"predict_proba\"\n", 244 | " value: { string_value: \"true\" }\n", 245 | " },\n", 246 | " {\n", 247 | " key: \"output_class\"\n", 248 | " value: { string_value: \"true\" }\n", 249 | " },\n", 250 | " {\n", 251 | " key: \"threshold\"\n", 252 | " value: { string_value: \"0.5\" }\n", 253 | " },\n", 254 | " {\n", 255 | " key: \"storage_type\"\n", 256 | " value: { string_value: \"AUTO\" }\n", 257 | " }\n", 258 | "]\n", 259 | "\n", 260 | "dynamic_batching {\n", 261 | "\n", 262 | "}\n", 263 | "\"\"\"\n", 264 | "\n", 265 | "template = Template(config_template)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 8, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "model_workspace = Path(\"workspace\")\n", 275 | "gpu_model_path = model_workspace / \"xgboost_gpu\"\n", 276 | "cpu_model_path = model_workspace / \"xgboost_cpu\"\n", 277 | "\n", 278 | "for device, model_path in zip([\"GPU\", \"CPU\"], [gpu_model_path, cpu_model_path]):\n", 279 | " model_path.mkdir(parents=True, exist_ok=True)\n", 280 | " with open(model_path / \"config.pbtxt\", \"w\") as f:\n", 281 | " f.write(\n", 282 | " template.render(\n", 283 | " input_size=num_features,\n", 284 | " device=device,\n", 285 | " )\n", 286 | " )\n", 287 | " (model_path / \"1\").mkdir(parents=True, exist_ok=True)\n", 288 | " shutil.copy(model_file_name, model_path / \"1\" / model_file_name) \n", 289 | "\n", 290 | "cpu_tar_name = \"xgboost_cpu.tar.gz\"\n", 291 | "gpu_tar_name = \"xgboost_gpu.tar.gz\"\n", 292 | "\n", 293 | "with tarfile.open(cpu_tar_name, \"w:gz\") as tar:\n", 294 | " tar.add(cpu_model_path, arcname=cpu_model_path.name)\n", 295 | "\n", 296 | "with tarfile.open(gpu_tar_name, \"w:gz\") as tar:\n", 297 | " tar.add(gpu_model_path, arcname=gpu_model_path.name)\n" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 9, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "gpu_s3_uri = sess.upload_data(gpu_tar_name, bucket=sess.default_bucket(), key_prefix=\"xgboost-mme\")\n", 307 | "cpu_s3_uri = sess.upload_data(cpu_tar_name, bucket=sess.default_bucket(), key_prefix=\"xgboost-mme\")" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 10, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "model = Model(\n", 317 | " model_data=cpu_s3_uri,\n", 318 | " image_uri=mme_triton_image_uri,\n", 319 | " role=role,\n", 320 | " sagemaker_session=sess\n", 321 | ")\n", 322 | "\n", 323 | "mme = MultiDataModel(\n", 324 | " name=\"triton-fil-\" + dt.datetime.now().strftime(\"%Y-%m-%d-%H-%M-%S\"),\n", 325 | " model_data_prefix=mme_s3_uri,\n", 326 | " model=model,\n", 327 | " sagemaker_session=sess,\n", 328 | ")" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 11, 334 | "metadata": {}, 335 | "outputs": [ 336 | { 337 | "data": { 338 | "text/html": [ 339 | "
[11/26/24 20:25:27] INFO     Creating model with name: triton-fil-2024-11-26-20-25-24               session.py:4025\n",
340 |        "
\n" 341 | ], 342 | "text/plain": [ 343 | "\u001b[2;36m[11/26/24 20:25:27]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Creating model with name: triton-fil-\u001b[1;36m2024\u001b[0m-\u001b[1;36m11\u001b[0m-\u001b[1;36m26\u001b[0m-\u001b[1;36m20\u001b[0m-\u001b[1;36m25\u001b[0m-\u001b[1;36m24\u001b[0m \u001b]8;id=545124;file:///opt/conda/lib/python3.11/site-packages/sagemaker/session.py\u001b\\\u001b[2msession.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=861209;file:///opt/conda/lib/python3.11/site-packages/sagemaker/session.py#4025\u001b\\\u001b[2m4025\u001b[0m\u001b]8;;\u001b\\\n" 344 | ] 345 | }, 346 | "metadata": {}, 347 | "output_type": "display_data" 348 | }, 349 | { 350 | "data": { 351 | "text/html": [ 352 | "
                    INFO     Creating endpoint-config with name triton-fil-2024-11-26-20-25-24      session.py:5820\n",
353 |        "
\n" 354 | ], 355 | "text/plain": [ 356 | "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Creating endpoint-config with name triton-fil-\u001b[1;36m2024\u001b[0m-\u001b[1;36m11\u001b[0m-\u001b[1;36m26\u001b[0m-\u001b[1;36m20\u001b[0m-\u001b[1;36m25\u001b[0m-\u001b[1;36m24\u001b[0m \u001b]8;id=519670;file:///opt/conda/lib/python3.11/site-packages/sagemaker/session.py\u001b\\\u001b[2msession.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=799083;file:///opt/conda/lib/python3.11/site-packages/sagemaker/session.py#5820\u001b\\\u001b[2m5820\u001b[0m\u001b]8;;\u001b\\\n" 357 | ] 358 | }, 359 | "metadata": {}, 360 | "output_type": "display_data" 361 | }, 362 | { 363 | "data": { 364 | "text/html": [ 365 | "
[11/26/24 20:25:28] INFO     Creating endpoint with name triton-fil-2024-11-26-20-25-24             session.py:4642\n",
366 |        "
\n" 367 | ], 368 | "text/plain": [ 369 | "\u001b[2;36m[11/26/24 20:25:28]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Creating endpoint with name triton-fil-\u001b[1;36m2024\u001b[0m-\u001b[1;36m11\u001b[0m-\u001b[1;36m26\u001b[0m-\u001b[1;36m20\u001b[0m-\u001b[1;36m25\u001b[0m-\u001b[1;36m24\u001b[0m \u001b]8;id=752539;file:///opt/conda/lib/python3.11/site-packages/sagemaker/session.py\u001b\\\u001b[2msession.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=201503;file:///opt/conda/lib/python3.11/site-packages/sagemaker/session.py#4642\u001b\\\u001b[2m4642\u001b[0m\u001b]8;;\u001b\\\n" 370 | ] 371 | }, 372 | "metadata": {}, 373 | "output_type": "display_data" 374 | }, 375 | { 376 | "name": "stdout", 377 | "output_type": "stream", 378 | "text": [ 379 | "-----------!" 380 | ] 381 | } 382 | ], 383 | "source": [ 384 | "mme.deploy(\n", 385 | " initial_instance_count=1,\n", 386 | " instance_type=instance_type,\n", 387 | ")" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 13, 393 | "metadata": {}, 394 | "outputs": [ 395 | { 396 | "data": { 397 | "text/plain": [ 398 | "['/xgboost_cpu.tar.gz', '/xgboost_gpu.tar.gz']" 399 | ] 400 | }, 401 | "execution_count": 13, 402 | "metadata": {}, 403 | "output_type": "execute_result" 404 | } 405 | ], 406 | "source": [ 407 | "list(mme.list_models())" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 14, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "predictor = sagemaker.predictor.Predictor(endpoint_name=mme.endpoint_name, sagemaker_session=sess)" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 15, 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "test_data_bucket = test_data_path.split(\"/\")[2]\n", 426 | "test_data_prefix = \"/\".join(test_data_path.split(\"/\")[3:])\n", 427 | "\n", 428 | "\n", 429 | "test_file = sess.list_s3_files(test_data_bucket, test_data_prefix)[0]\n", 430 | "test_file =f\"s3://{test_data_bucket}/{test_file}\"" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 16, 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "df = pd.read_parquet(test_file)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 17, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "df.drop(columns=[\"TX_FRAUD_1\"], inplace=True)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 18, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "def prepare_payload(input_data: np.ndarray, num_features: int) -> dict:\n", 458 | "\n", 459 | " input_data = input_data.reshape(-1, num_features)\n", 460 | " \n", 461 | " payload = {\n", 462 | " \"inputs\": [\n", 463 | " {\n", 464 | " \"name\": \"input__0\",\n", 465 | " \"shape\": input_data.shape,\n", 466 | " \"datatype\": \"FP32\",\n", 467 | " \"data\": input_data.tolist(),\n", 468 | " }\n", 469 | " ]\n", 470 | " }\n", 471 | " return json.dumps(payload)\n", 472 | "\n", 473 | "def make_batches (data: np.ndarray, batch_size: int) -> list:\n", 474 | " return [data[i:i+batch_size] for i in range(0, len(data), batch_size)]" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 26, 480 | "metadata": {}, 481 | "outputs": [], 482 | "source": [ 483 | "BATCH_SIZE = 500\n", 484 | "payloads = [prepare_payload(batch, num_features=32) for batch in make_batches(df.values, BATCH_SIZE)]" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 27, 490 | "metadata": {}, 491 | "outputs": [ 492 | { 493 | "name": "stdout", 494 | "output_type": "stream", 495 | "text": [ 496 | "CPU throughput: 123673.93 records per second\n", 497 | "GPU throughput: 348148.16 records per second\n" 498 | ] 499 | } 500 | ], 501 | "source": [ 502 | "NUM_CLIENTS = 10\n", 503 | "from functools import partial\n", 504 | "\n", 505 | "start_time = time.perf_counter()\n", 506 | "with ThreadPoolExecutor(max_workers=NUM_CLIENTS) as executor:\n", 507 | " cpu_predict = partial(predictor.predict, target_model=\"/xgboost_cpu.tar.gz\")\n", 508 | " results = list(executor.map(cpu_predict, payloads))\n", 509 | "cpu_throughput = len(df) / (time.perf_counter() - start_time)\n", 510 | "print(f\"CPU throughput: {cpu_throughput:.2f} records per second\")\n", 511 | "\n", 512 | "start_time = time.perf_counter()\n", 513 | "with ThreadPoolExecutor(max_workers=NUM_CLIENTS) as executor:\n", 514 | " cpu_predict = partial(predictor.predict, target_model=\"/xgboost_gpu.tar.gz\")\n", 515 | " results = list(executor.map(cpu_predict, payloads))\n", 516 | "cpu_throughput = len(df) / (time.perf_counter() - start_time)\n", 517 | "print(f\"GPU throughput: {cpu_throughput:.2f} records per second\")" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "mme.delete_endpoint()" 527 | ] 528 | } 529 | ], 530 | "metadata": { 531 | "kernelspec": { 532 | "display_name": "base", 533 | "language": "python", 534 | "name": "python3" 535 | }, 536 | "language_info": { 537 | "codemirror_mode": { 538 | "name": "ipython", 539 | "version": 3 540 | }, 541 | "file_extension": ".py", 542 | "mimetype": "text/x-python", 543 | "name": "python", 544 | "nbconvert_exporter": "python", 545 | "pygments_lexer": "ipython3", 546 | "version": "3.11.9" 547 | } 548 | }, 549 | "nbformat": 4, 550 | "nbformat_minor": 2 551 | } 552 | -------------------------------------------------------------------------------- /model-training/xgboost/clean_large_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Note: you may need to restart the kernel to use updated packages.\n", 13 | "\u001b[1;31merror\u001b[0m: \u001b[1muninstall-no-record-file\u001b[0m\n", 14 | "\n", 15 | "\u001b[31m×\u001b[0m Cannot uninstall fsspec None\n", 16 | "\u001b[31m╰─>\u001b[0m The package's contents are unknown: no RECORD file was found for fsspec.\n", 17 | "\n", 18 | "\u001b[1;36mhint\u001b[0m: You might be able to recover from this via: \u001b[32mpip install --force-reinstall --no-deps fsspec==2023.6.0\u001b[0m\n", 19 | "Note: you may need to restart the kernel to use updated packages.\n", 20 | "\u001b[1;31merror\u001b[0m: \u001b[1muninstall-no-record-file\u001b[0m\n", 21 | "\n", 22 | "\u001b[31m×\u001b[0m Cannot uninstall fsspec None\n", 23 | "\u001b[31m╰─>\u001b[0m The package's contents are unknown: no RECORD file was found for fsspec.\n", 24 | "\n", 25 | "\u001b[1;36mhint\u001b[0m: You might be able to recover from this via: \u001b[32mpip install --force-reinstall --no-deps fsspec==2023.6.0\u001b[0m\n", 26 | "Note: you may need to restart the kernel to use updated packages.\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "%pip install -Uqq duckdb\n", 32 | "%pip install -Uqq --force-reinstall --no-deps fsspec==2023.6.0\n", 33 | "%pip install -Uqq s3fs" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 1, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "import duckdb\n", 50 | "import s3fs\n", 51 | "from pathlib import Path\n", 52 | "import os" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "file_path = \"s3://nvidia-aws-fraud-detection-demo/output121/*.parquet\"\n", 62 | "s3 = s3fs.S3FileSystem()\n", 63 | "source_files = s3.glob(file_path)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "con = duckdb.connect()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "" 84 | ] 85 | }, 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "con.execute(\n", 93 | " \"\"\"CREATE SECRET s3_access (\n", 94 | " TYPE S3,\n", 95 | " PROVIDER CREDENTIAL_CHAIN\n", 96 | " );\"\"\"\n", 97 | ")" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 5, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "columns_to_drop = [\n", 107 | " 'CUSTOMER_ID_index', 'customer_name_index', 'customer_email_index',\n", 108 | " 'phone_index', 'billing_zip', 'billing_city_index',\n", 109 | " 'billing_state_index', 'x_customer_id', 'y_customer_id',\n", 110 | " 'TX_FRAUD_0', 'TERMINAL_ID_index', 'merchant_index'\n", 111 | "]" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 20, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "s3_output_path = \"s3://nvidia-aws-fraud-detection-demo/output121_clean\"" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 23, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "application/vnd.jupyter.widget-view+json": { 131 | "model_id": "074a7f97f4c94b7ba830e3a59aa73264", 132 | "version_major": 2, 133 | "version_minor": 0 134 | }, 135 | "text/plain": [ 136 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 137 | ] 138 | }, 139 | "metadata": {}, 140 | "output_type": "display_data" 141 | }, 142 | { 143 | "data": { 144 | "application/vnd.jupyter.widget-view+json": { 145 | "model_id": "747460732dcc4742b069e3df80ae5858", 146 | "version_major": 2, 147 | "version_minor": 0 148 | }, 149 | "text/plain": [ 150 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 151 | ] 152 | }, 153 | "metadata": {}, 154 | "output_type": "display_data" 155 | }, 156 | { 157 | "data": { 158 | "application/vnd.jupyter.widget-view+json": { 159 | "model_id": "d9edd2b3f38c44dfbeacc2f13f8eb41a", 160 | "version_major": 2, 161 | "version_minor": 0 162 | }, 163 | "text/plain": [ 164 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 165 | ] 166 | }, 167 | "metadata": {}, 168 | "output_type": "display_data" 169 | }, 170 | { 171 | "data": { 172 | "application/vnd.jupyter.widget-view+json": { 173 | "model_id": "5afa73ba027b4b788f9f5ce7b4825227", 174 | "version_major": 2, 175 | "version_minor": 0 176 | }, 177 | "text/plain": [ 178 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 179 | ] 180 | }, 181 | "metadata": {}, 182 | "output_type": "display_data" 183 | }, 184 | { 185 | "data": { 186 | "application/vnd.jupyter.widget-view+json": { 187 | "model_id": "3274e797bbaf4ba6aa4a0b09efda2bce", 188 | "version_major": 2, 189 | "version_minor": 0 190 | }, 191 | "text/plain": [ 192 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 193 | ] 194 | }, 195 | "metadata": {}, 196 | "output_type": "display_data" 197 | }, 198 | { 199 | "data": { 200 | "application/vnd.jupyter.widget-view+json": { 201 | "model_id": "c1c0c770fe4749ed96c50410a3a22f21", 202 | "version_major": 2, 203 | "version_minor": 0 204 | }, 205 | "text/plain": [ 206 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 207 | ] 208 | }, 209 | "metadata": {}, 210 | "output_type": "display_data" 211 | }, 212 | { 213 | "data": { 214 | "application/vnd.jupyter.widget-view+json": { 215 | "model_id": "6b9e8d77d68b45f4aa9bde538f5bd180", 216 | "version_major": 2, 217 | "version_minor": 0 218 | }, 219 | "text/plain": [ 220 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 221 | ] 222 | }, 223 | "metadata": {}, 224 | "output_type": "display_data" 225 | }, 226 | { 227 | "data": { 228 | "application/vnd.jupyter.widget-view+json": { 229 | "model_id": "0657bf2f00e24d929ea94c7d4a33b59a", 230 | "version_major": 2, 231 | "version_minor": 0 232 | }, 233 | "text/plain": [ 234 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 235 | ] 236 | }, 237 | "metadata": {}, 238 | "output_type": "display_data" 239 | }, 240 | { 241 | "data": { 242 | "application/vnd.jupyter.widget-view+json": { 243 | "model_id": "53bb55b7bdbf4ceebdce3211e18d0d6f", 244 | "version_major": 2, 245 | "version_minor": 0 246 | }, 247 | "text/plain": [ 248 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 249 | ] 250 | }, 251 | "metadata": {}, 252 | "output_type": "display_data" 253 | }, 254 | { 255 | "data": { 256 | "application/vnd.jupyter.widget-view+json": { 257 | "model_id": "95899f26518c4ac2bf345a1f689382c8", 258 | "version_major": 2, 259 | "version_minor": 0 260 | }, 261 | "text/plain": [ 262 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 263 | ] 264 | }, 265 | "metadata": {}, 266 | "output_type": "display_data" 267 | }, 268 | { 269 | "data": { 270 | "application/vnd.jupyter.widget-view+json": { 271 | "model_id": "4ff55b9982fc4b7582911d1e6470b013", 272 | "version_major": 2, 273 | "version_minor": 0 274 | }, 275 | "text/plain": [ 276 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 277 | ] 278 | }, 279 | "metadata": {}, 280 | "output_type": "display_data" 281 | }, 282 | { 283 | "data": { 284 | "application/vnd.jupyter.widget-view+json": { 285 | "model_id": "75c0bcfe053d461480a92325de5361c1", 286 | "version_major": 2, 287 | "version_minor": 0 288 | }, 289 | "text/plain": [ 290 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 291 | ] 292 | }, 293 | "metadata": {}, 294 | "output_type": "display_data" 295 | }, 296 | { 297 | "data": { 298 | "application/vnd.jupyter.widget-view+json": { 299 | "model_id": "458b3584d409429d93ba8f4e59732d1a", 300 | "version_major": 2, 301 | "version_minor": 0 302 | }, 303 | "text/plain": [ 304 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 305 | ] 306 | }, 307 | "metadata": {}, 308 | "output_type": "display_data" 309 | }, 310 | { 311 | "data": { 312 | "application/vnd.jupyter.widget-view+json": { 313 | "model_id": "45180d3d62804acf9995384ff3ba0c7d", 314 | "version_major": 2, 315 | "version_minor": 0 316 | }, 317 | "text/plain": [ 318 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 319 | ] 320 | }, 321 | "metadata": {}, 322 | "output_type": "display_data" 323 | }, 324 | { 325 | "data": { 326 | "application/vnd.jupyter.widget-view+json": { 327 | "model_id": "6462b27a4f09407595662106f2bba38a", 328 | "version_major": 2, 329 | "version_minor": 0 330 | }, 331 | "text/plain": [ 332 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 333 | ] 334 | }, 335 | "metadata": {}, 336 | "output_type": "display_data" 337 | }, 338 | { 339 | "data": { 340 | "application/vnd.jupyter.widget-view+json": { 341 | "model_id": "7ea57eb7226643feaad2f9a97ab5cc71", 342 | "version_major": 2, 343 | "version_minor": 0 344 | }, 345 | "text/plain": [ 346 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 347 | ] 348 | }, 349 | "metadata": {}, 350 | "output_type": "display_data" 351 | }, 352 | { 353 | "data": { 354 | "application/vnd.jupyter.widget-view+json": { 355 | "model_id": "206d0a1419404ae1805844d718271a81", 356 | "version_major": 2, 357 | "version_minor": 0 358 | }, 359 | "text/plain": [ 360 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 361 | ] 362 | }, 363 | "metadata": {}, 364 | "output_type": "display_data" 365 | }, 366 | { 367 | "data": { 368 | "application/vnd.jupyter.widget-view+json": { 369 | "model_id": "b1e051a24efc445aa688f04065243f44", 370 | "version_major": 2, 371 | "version_minor": 0 372 | }, 373 | "text/plain": [ 374 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 375 | ] 376 | }, 377 | "metadata": {}, 378 | "output_type": "display_data" 379 | }, 380 | { 381 | "data": { 382 | "application/vnd.jupyter.widget-view+json": { 383 | "model_id": "793f0822d43c44d68a27330d96ea9d26", 384 | "version_major": 2, 385 | "version_minor": 0 386 | }, 387 | "text/plain": [ 388 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 389 | ] 390 | }, 391 | "metadata": {}, 392 | "output_type": "display_data" 393 | }, 394 | { 395 | "data": { 396 | "application/vnd.jupyter.widget-view+json": { 397 | "model_id": "3dbc06cf499448d1811a2c58590b14d1", 398 | "version_major": 2, 399 | "version_minor": 0 400 | }, 401 | "text/plain": [ 402 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 403 | ] 404 | }, 405 | "metadata": {}, 406 | "output_type": "display_data" 407 | }, 408 | { 409 | "data": { 410 | "application/vnd.jupyter.widget-view+json": { 411 | "model_id": "6e438d3c63a9481ab9f6b0f0dd7f8270", 412 | "version_major": 2, 413 | "version_minor": 0 414 | }, 415 | "text/plain": [ 416 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 417 | ] 418 | }, 419 | "metadata": {}, 420 | "output_type": "display_data" 421 | }, 422 | { 423 | "data": { 424 | "application/vnd.jupyter.widget-view+json": { 425 | "model_id": "a1074870a43e496b8361a72340fa940d", 426 | "version_major": 2, 427 | "version_minor": 0 428 | }, 429 | "text/plain": [ 430 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 431 | ] 432 | }, 433 | "metadata": {}, 434 | "output_type": "display_data" 435 | }, 436 | { 437 | "data": { 438 | "application/vnd.jupyter.widget-view+json": { 439 | "model_id": "b1ecea00b2c04f2a83648f64b4cd0261", 440 | "version_major": 2, 441 | "version_minor": 0 442 | }, 443 | "text/plain": [ 444 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 445 | ] 446 | }, 447 | "metadata": {}, 448 | "output_type": "display_data" 449 | }, 450 | { 451 | "data": { 452 | "application/vnd.jupyter.widget-view+json": { 453 | "model_id": "6fca5abd4cfa4697b50c1ebd1f9218d5", 454 | "version_major": 2, 455 | "version_minor": 0 456 | }, 457 | "text/plain": [ 458 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 459 | ] 460 | }, 461 | "metadata": {}, 462 | "output_type": "display_data" 463 | }, 464 | { 465 | "data": { 466 | "application/vnd.jupyter.widget-view+json": { 467 | "model_id": "d865b6d76001424890e4a5f90aaecb30", 468 | "version_major": 2, 469 | "version_minor": 0 470 | }, 471 | "text/plain": [ 472 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 473 | ] 474 | }, 475 | "metadata": {}, 476 | "output_type": "display_data" 477 | }, 478 | { 479 | "data": { 480 | "application/vnd.jupyter.widget-view+json": { 481 | "model_id": "a42df917c685404e9ab6f8a571b50f8e", 482 | "version_major": 2, 483 | "version_minor": 0 484 | }, 485 | "text/plain": [ 486 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 487 | ] 488 | }, 489 | "metadata": {}, 490 | "output_type": "display_data" 491 | }, 492 | { 493 | "data": { 494 | "application/vnd.jupyter.widget-view+json": { 495 | "model_id": "81b07beab93c4e3ca29d7d7631dcaca2", 496 | "version_major": 2, 497 | "version_minor": 0 498 | }, 499 | "text/plain": [ 500 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 501 | ] 502 | }, 503 | "metadata": {}, 504 | "output_type": "display_data" 505 | }, 506 | { 507 | "data": { 508 | "application/vnd.jupyter.widget-view+json": { 509 | "model_id": "8fb38b88a8834de0a5495b3cb6a557df", 510 | "version_major": 2, 511 | "version_minor": 0 512 | }, 513 | "text/plain": [ 514 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 515 | ] 516 | }, 517 | "metadata": {}, 518 | "output_type": "display_data" 519 | }, 520 | { 521 | "data": { 522 | "application/vnd.jupyter.widget-view+json": { 523 | "model_id": "4371a0ef22884d43b365db63641bf002", 524 | "version_major": 2, 525 | "version_minor": 0 526 | }, 527 | "text/plain": [ 528 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 529 | ] 530 | }, 531 | "metadata": {}, 532 | "output_type": "display_data" 533 | }, 534 | { 535 | "data": { 536 | "application/vnd.jupyter.widget-view+json": { 537 | "model_id": "f31564fdd7e84781960208f8dce864f4", 538 | "version_major": 2, 539 | "version_minor": 0 540 | }, 541 | "text/plain": [ 542 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 543 | ] 544 | }, 545 | "metadata": {}, 546 | "output_type": "display_data" 547 | }, 548 | { 549 | "data": { 550 | "application/vnd.jupyter.widget-view+json": { 551 | "model_id": "a10e0163a3e04213868fa978e584cd16", 552 | "version_major": 2, 553 | "version_minor": 0 554 | }, 555 | "text/plain": [ 556 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 557 | ] 558 | }, 559 | "metadata": {}, 560 | "output_type": "display_data" 561 | }, 562 | { 563 | "data": { 564 | "application/vnd.jupyter.widget-view+json": { 565 | "model_id": "f7d734ba82a746f68b5b17cfe97f7fd7", 566 | "version_major": 2, 567 | "version_minor": 0 568 | }, 569 | "text/plain": [ 570 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 571 | ] 572 | }, 573 | "metadata": {}, 574 | "output_type": "display_data" 575 | }, 576 | { 577 | "data": { 578 | "application/vnd.jupyter.widget-view+json": { 579 | "model_id": "b3a7152eeb5b49498c99e26413f9447f", 580 | "version_major": 2, 581 | "version_minor": 0 582 | }, 583 | "text/plain": [ 584 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 585 | ] 586 | }, 587 | "metadata": {}, 588 | "output_type": "display_data" 589 | }, 590 | { 591 | "data": { 592 | "application/vnd.jupyter.widget-view+json": { 593 | "model_id": "b005b9c29e5e4ff3ac83df6195eaaf58", 594 | "version_major": 2, 595 | "version_minor": 0 596 | }, 597 | "text/plain": [ 598 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 599 | ] 600 | }, 601 | "metadata": {}, 602 | "output_type": "display_data" 603 | }, 604 | { 605 | "data": { 606 | "application/vnd.jupyter.widget-view+json": { 607 | "model_id": "3a2ded4cb40544bbb828039603c268ff", 608 | "version_major": 2, 609 | "version_minor": 0 610 | }, 611 | "text/plain": [ 612 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 613 | ] 614 | }, 615 | "metadata": {}, 616 | "output_type": "display_data" 617 | }, 618 | { 619 | "data": { 620 | "application/vnd.jupyter.widget-view+json": { 621 | "model_id": "b97cb2c20682446b82b0e6d28338b736", 622 | "version_major": 2, 623 | "version_minor": 0 624 | }, 625 | "text/plain": [ 626 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 627 | ] 628 | }, 629 | "metadata": {}, 630 | "output_type": "display_data" 631 | }, 632 | { 633 | "data": { 634 | "application/vnd.jupyter.widget-view+json": { 635 | "model_id": "6ae0313e02cb4bde8b0be6680f27ba75", 636 | "version_major": 2, 637 | "version_minor": 0 638 | }, 639 | "text/plain": [ 640 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 641 | ] 642 | }, 643 | "metadata": {}, 644 | "output_type": "display_data" 645 | }, 646 | { 647 | "data": { 648 | "application/vnd.jupyter.widget-view+json": { 649 | "model_id": "3c11b224a5f14a48b2787cf9d5ec40bc", 650 | "version_major": 2, 651 | "version_minor": 0 652 | }, 653 | "text/plain": [ 654 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 655 | ] 656 | }, 657 | "metadata": {}, 658 | "output_type": "display_data" 659 | }, 660 | { 661 | "data": { 662 | "application/vnd.jupyter.widget-view+json": { 663 | "model_id": "49b45f9d7ee04967a654617af658edfe", 664 | "version_major": 2, 665 | "version_minor": 0 666 | }, 667 | "text/plain": [ 668 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 669 | ] 670 | }, 671 | "metadata": {}, 672 | "output_type": "display_data" 673 | }, 674 | { 675 | "data": { 676 | "application/vnd.jupyter.widget-view+json": { 677 | "model_id": "392b2a89daf247179cbd9f8b2a4db112", 678 | "version_major": 2, 679 | "version_minor": 0 680 | }, 681 | "text/plain": [ 682 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 683 | ] 684 | }, 685 | "metadata": {}, 686 | "output_type": "display_data" 687 | }, 688 | { 689 | "data": { 690 | "application/vnd.jupyter.widget-view+json": { 691 | "model_id": "df74ace5a0774f55af0be1dbd1425cd6", 692 | "version_major": 2, 693 | "version_minor": 0 694 | }, 695 | "text/plain": [ 696 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 697 | ] 698 | }, 699 | "metadata": {}, 700 | "output_type": "display_data" 701 | }, 702 | { 703 | "data": { 704 | "application/vnd.jupyter.widget-view+json": { 705 | "model_id": "d6c517d1b6774c6a92ae2981d841ea59", 706 | "version_major": 2, 707 | "version_minor": 0 708 | }, 709 | "text/plain": [ 710 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 711 | ] 712 | }, 713 | "metadata": {}, 714 | "output_type": "display_data" 715 | }, 716 | { 717 | "data": { 718 | "application/vnd.jupyter.widget-view+json": { 719 | "model_id": "750b245b7948481e8356cace8870c06f", 720 | "version_major": 2, 721 | "version_minor": 0 722 | }, 723 | "text/plain": [ 724 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 725 | ] 726 | }, 727 | "metadata": {}, 728 | "output_type": "display_data" 729 | }, 730 | { 731 | "data": { 732 | "application/vnd.jupyter.widget-view+json": { 733 | "model_id": "0f85bc8344f0447698800af1510091f2", 734 | "version_major": 2, 735 | "version_minor": 0 736 | }, 737 | "text/plain": [ 738 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 739 | ] 740 | }, 741 | "metadata": {}, 742 | "output_type": "display_data" 743 | }, 744 | { 745 | "data": { 746 | "application/vnd.jupyter.widget-view+json": { 747 | "model_id": "4ba525c76522494fbd6f9831d9391fe9", 748 | "version_major": 2, 749 | "version_minor": 0 750 | }, 751 | "text/plain": [ 752 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 753 | ] 754 | }, 755 | "metadata": {}, 756 | "output_type": "display_data" 757 | }, 758 | { 759 | "data": { 760 | "application/vnd.jupyter.widget-view+json": { 761 | "model_id": "b1c87194f67d4be28e1a384e921cf029", 762 | "version_major": 2, 763 | "version_minor": 0 764 | }, 765 | "text/plain": [ 766 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 767 | ] 768 | }, 769 | "metadata": {}, 770 | "output_type": "display_data" 771 | }, 772 | { 773 | "data": { 774 | "application/vnd.jupyter.widget-view+json": { 775 | "model_id": "07d31a0accb344ec8943f3b03e0dcaf9", 776 | "version_major": 2, 777 | "version_minor": 0 778 | }, 779 | "text/plain": [ 780 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 781 | ] 782 | }, 783 | "metadata": {}, 784 | "output_type": "display_data" 785 | }, 786 | { 787 | "data": { 788 | "application/vnd.jupyter.widget-view+json": { 789 | "model_id": "d7be1eba1f4943e3b153a9188f672fef", 790 | "version_major": 2, 791 | "version_minor": 0 792 | }, 793 | "text/plain": [ 794 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 795 | ] 796 | }, 797 | "metadata": {}, 798 | "output_type": "display_data" 799 | }, 800 | { 801 | "data": { 802 | "application/vnd.jupyter.widget-view+json": { 803 | "model_id": "ebeba9f18d234adf87850357cf8f63e3", 804 | "version_major": 2, 805 | "version_minor": 0 806 | }, 807 | "text/plain": [ 808 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 809 | ] 810 | }, 811 | "metadata": {}, 812 | "output_type": "display_data" 813 | }, 814 | { 815 | "data": { 816 | "application/vnd.jupyter.widget-view+json": { 817 | "model_id": "882e2c37fa42455daed1c2e48bb1ac6b", 818 | "version_major": 2, 819 | "version_minor": 0 820 | }, 821 | "text/plain": [ 822 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 823 | ] 824 | }, 825 | "metadata": {}, 826 | "output_type": "display_data" 827 | }, 828 | { 829 | "data": { 830 | "application/vnd.jupyter.widget-view+json": { 831 | "model_id": "1f7aa3e459a8400d8dcf242bf9a66098", 832 | "version_major": 2, 833 | "version_minor": 0 834 | }, 835 | "text/plain": [ 836 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 837 | ] 838 | }, 839 | "metadata": {}, 840 | "output_type": "display_data" 841 | }, 842 | { 843 | "data": { 844 | "application/vnd.jupyter.widget-view+json": { 845 | "model_id": "dc64e9ef1383462db09439bbd5d6bdb0", 846 | "version_major": 2, 847 | "version_minor": 0 848 | }, 849 | "text/plain": [ 850 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 851 | ] 852 | }, 853 | "metadata": {}, 854 | "output_type": "display_data" 855 | }, 856 | { 857 | "data": { 858 | "application/vnd.jupyter.widget-view+json": { 859 | "model_id": "813919f70aec4374990266b1ac7b5208", 860 | "version_major": 2, 861 | "version_minor": 0 862 | }, 863 | "text/plain": [ 864 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 865 | ] 866 | }, 867 | "metadata": {}, 868 | "output_type": "display_data" 869 | }, 870 | { 871 | "data": { 872 | "application/vnd.jupyter.widget-view+json": { 873 | "model_id": "d2d8704dc90e4668aa105477782f3266", 874 | "version_major": 2, 875 | "version_minor": 0 876 | }, 877 | "text/plain": [ 878 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 879 | ] 880 | }, 881 | "metadata": {}, 882 | "output_type": "display_data" 883 | }, 884 | { 885 | "data": { 886 | "application/vnd.jupyter.widget-view+json": { 887 | "model_id": "7886b7af03cc497dba376d33992970ae", 888 | "version_major": 2, 889 | "version_minor": 0 890 | }, 891 | "text/plain": [ 892 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 893 | ] 894 | }, 895 | "metadata": {}, 896 | "output_type": "display_data" 897 | }, 898 | { 899 | "data": { 900 | "application/vnd.jupyter.widget-view+json": { 901 | "model_id": "c2051fb0dca6416abc7094a989374a4a", 902 | "version_major": 2, 903 | "version_minor": 0 904 | }, 905 | "text/plain": [ 906 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 907 | ] 908 | }, 909 | "metadata": {}, 910 | "output_type": "display_data" 911 | }, 912 | { 913 | "data": { 914 | "application/vnd.jupyter.widget-view+json": { 915 | "model_id": "0ed8d451c96d4c97952458b31fb73e0f", 916 | "version_major": 2, 917 | "version_minor": 0 918 | }, 919 | "text/plain": [ 920 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 921 | ] 922 | }, 923 | "metadata": {}, 924 | "output_type": "display_data" 925 | }, 926 | { 927 | "data": { 928 | "application/vnd.jupyter.widget-view+json": { 929 | "model_id": "29146a25b4414d70be996db789fa9bee", 930 | "version_major": 2, 931 | "version_minor": 0 932 | }, 933 | "text/plain": [ 934 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 935 | ] 936 | }, 937 | "metadata": {}, 938 | "output_type": "display_data" 939 | }, 940 | { 941 | "data": { 942 | "application/vnd.jupyter.widget-view+json": { 943 | "model_id": "0aadee7c9f0c481793dea88595a04a9a", 944 | "version_major": 2, 945 | "version_minor": 0 946 | }, 947 | "text/plain": [ 948 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 949 | ] 950 | }, 951 | "metadata": {}, 952 | "output_type": "display_data" 953 | }, 954 | { 955 | "data": { 956 | "application/vnd.jupyter.widget-view+json": { 957 | "model_id": "55abdae8a5cd41139bd1749c206f2fe0", 958 | "version_major": 2, 959 | "version_minor": 0 960 | }, 961 | "text/plain": [ 962 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 963 | ] 964 | }, 965 | "metadata": {}, 966 | "output_type": "display_data" 967 | }, 968 | { 969 | "data": { 970 | "application/vnd.jupyter.widget-view+json": { 971 | "model_id": "6c4a0b3ea32b4b8b984b7baed814df73", 972 | "version_major": 2, 973 | "version_minor": 0 974 | }, 975 | "text/plain": [ 976 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 977 | ] 978 | }, 979 | "metadata": {}, 980 | "output_type": "display_data" 981 | }, 982 | { 983 | "data": { 984 | "application/vnd.jupyter.widget-view+json": { 985 | "model_id": "d334a005b5064566b027aaa53d3118a6", 986 | "version_major": 2, 987 | "version_minor": 0 988 | }, 989 | "text/plain": [ 990 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 991 | ] 992 | }, 993 | "metadata": {}, 994 | "output_type": "display_data" 995 | }, 996 | { 997 | "data": { 998 | "application/vnd.jupyter.widget-view+json": { 999 | "model_id": "54998b13238c4b2683100072041f365f", 1000 | "version_major": 2, 1001 | "version_minor": 0 1002 | }, 1003 | "text/plain": [ 1004 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 1005 | ] 1006 | }, 1007 | "metadata": {}, 1008 | "output_type": "display_data" 1009 | }, 1010 | { 1011 | "data": { 1012 | "application/vnd.jupyter.widget-view+json": { 1013 | "model_id": "4215f6c6c6a44e27b825101b267877be", 1014 | "version_major": 2, 1015 | "version_minor": 0 1016 | }, 1017 | "text/plain": [ 1018 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 1019 | ] 1020 | }, 1021 | "metadata": {}, 1022 | "output_type": "display_data" 1023 | }, 1024 | { 1025 | "data": { 1026 | "application/vnd.jupyter.widget-view+json": { 1027 | "model_id": "6ea57d4a1ffb464daee954238692b3a0", 1028 | "version_major": 2, 1029 | "version_minor": 0 1030 | }, 1031 | "text/plain": [ 1032 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 1033 | ] 1034 | }, 1035 | "metadata": {}, 1036 | "output_type": "display_data" 1037 | }, 1038 | { 1039 | "data": { 1040 | "application/vnd.jupyter.widget-view+json": { 1041 | "model_id": "c6b00229c45843a2ac573ff8b6b188cd", 1042 | "version_major": 2, 1043 | "version_minor": 0 1044 | }, 1045 | "text/plain": [ 1046 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 1047 | ] 1048 | }, 1049 | "metadata": {}, 1050 | "output_type": "display_data" 1051 | }, 1052 | { 1053 | "data": { 1054 | "application/vnd.jupyter.widget-view+json": { 1055 | "model_id": "6aadcfc8628a4ced9f8a9a04d757c97f", 1056 | "version_major": 2, 1057 | "version_minor": 0 1058 | }, 1059 | "text/plain": [ 1060 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 1061 | ] 1062 | }, 1063 | "metadata": {}, 1064 | "output_type": "display_data" 1065 | }, 1066 | { 1067 | "data": { 1068 | "application/vnd.jupyter.widget-view+json": { 1069 | "model_id": "b2d7f56ab263440bb8ff94a7c9db07e1", 1070 | "version_major": 2, 1071 | "version_minor": 0 1072 | }, 1073 | "text/plain": [ 1074 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 1075 | ] 1076 | }, 1077 | "metadata": {}, 1078 | "output_type": "display_data" 1079 | }, 1080 | { 1081 | "data": { 1082 | "application/vnd.jupyter.widget-view+json": { 1083 | "model_id": "44dcd286db1942b896081bf44c9e16b0", 1084 | "version_major": 2, 1085 | "version_minor": 0 1086 | }, 1087 | "text/plain": [ 1088 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 1089 | ] 1090 | }, 1091 | "metadata": {}, 1092 | "output_type": "display_data" 1093 | }, 1094 | { 1095 | "data": { 1096 | "application/vnd.jupyter.widget-view+json": { 1097 | "model_id": "6766a8361f4e44a6b5478f8ed5cd24c4", 1098 | "version_major": 2, 1099 | "version_minor": 0 1100 | }, 1101 | "text/plain": [ 1102 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 1103 | ] 1104 | }, 1105 | "metadata": {}, 1106 | "output_type": "display_data" 1107 | }, 1108 | { 1109 | "data": { 1110 | "application/vnd.jupyter.widget-view+json": { 1111 | "model_id": "e9fe8334d82240f2ba486f7bd6849b73", 1112 | "version_major": 2, 1113 | "version_minor": 0 1114 | }, 1115 | "text/plain": [ 1116 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 1117 | ] 1118 | }, 1119 | "metadata": {}, 1120 | "output_type": "display_data" 1121 | }, 1122 | { 1123 | "data": { 1124 | "application/vnd.jupyter.widget-view+json": { 1125 | "model_id": "99ee9d7f364e487b933cc47712329b0c", 1126 | "version_major": 2, 1127 | "version_minor": 0 1128 | }, 1129 | "text/plain": [ 1130 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 1131 | ] 1132 | }, 1133 | "metadata": {}, 1134 | "output_type": "display_data" 1135 | }, 1136 | { 1137 | "data": { 1138 | "application/vnd.jupyter.widget-view+json": { 1139 | "model_id": "e16aa0ba4ce34bdf964dadd9dcd76018", 1140 | "version_major": 2, 1141 | "version_minor": 0 1142 | }, 1143 | "text/plain": [ 1144 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 1145 | ] 1146 | }, 1147 | "metadata": {}, 1148 | "output_type": "display_data" 1149 | }, 1150 | { 1151 | "data": { 1152 | "application/vnd.jupyter.widget-view+json": { 1153 | "model_id": "1dc6be455905443181f29296394f8887", 1154 | "version_major": 2, 1155 | "version_minor": 0 1156 | }, 1157 | "text/plain": [ 1158 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 1159 | ] 1160 | }, 1161 | "metadata": {}, 1162 | "output_type": "display_data" 1163 | }, 1164 | { 1165 | "data": { 1166 | "application/vnd.jupyter.widget-view+json": { 1167 | "model_id": "c00ee8a0096541e59f0034835e44bca4", 1168 | "version_major": 2, 1169 | "version_minor": 0 1170 | }, 1171 | "text/plain": [ 1172 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 1173 | ] 1174 | }, 1175 | "metadata": {}, 1176 | "output_type": "display_data" 1177 | }, 1178 | { 1179 | "data": { 1180 | "application/vnd.jupyter.widget-view+json": { 1181 | "model_id": "74ad4ad8c9d841b8b576e74d332d2374", 1182 | "version_major": 2, 1183 | "version_minor": 0 1184 | }, 1185 | "text/plain": [ 1186 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 1187 | ] 1188 | }, 1189 | "metadata": {}, 1190 | "output_type": "display_data" 1191 | }, 1192 | { 1193 | "data": { 1194 | "application/vnd.jupyter.widget-view+json": { 1195 | "model_id": "8343bbf3f0af4ce2b055114c9879c4ad", 1196 | "version_major": 2, 1197 | "version_minor": 0 1198 | }, 1199 | "text/plain": [ 1200 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 1201 | ] 1202 | }, 1203 | "metadata": {}, 1204 | "output_type": "display_data" 1205 | }, 1206 | { 1207 | "data": { 1208 | "application/vnd.jupyter.widget-view+json": { 1209 | "model_id": "77ba4c39e1df4cc7999031baa3429bd1", 1210 | "version_major": 2, 1211 | "version_minor": 0 1212 | }, 1213 | "text/plain": [ 1214 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 1215 | ] 1216 | }, 1217 | "metadata": {}, 1218 | "output_type": "display_data" 1219 | }, 1220 | { 1221 | "data": { 1222 | "application/vnd.jupyter.widget-view+json": { 1223 | "model_id": "85e0367dd18e444fa5eabf7e6c1dc73b", 1224 | "version_major": 2, 1225 | "version_minor": 0 1226 | }, 1227 | "text/plain": [ 1228 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 1229 | ] 1230 | }, 1231 | "metadata": {}, 1232 | "output_type": "display_data" 1233 | }, 1234 | { 1235 | "data": { 1236 | "application/vnd.jupyter.widget-view+json": { 1237 | "model_id": "d59a9fa4659a413090e130145f7be756", 1238 | "version_major": 2, 1239 | "version_minor": 0 1240 | }, 1241 | "text/plain": [ 1242 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 1243 | ] 1244 | }, 1245 | "metadata": {}, 1246 | "output_type": "display_data" 1247 | } 1248 | ], 1249 | "source": [ 1250 | "rows_per_file = 1_000_000\n", 1251 | "train_files = 50\n", 1252 | "# con.execute(\"DROP TABLE temp_table\")\n", 1253 | "\n", 1254 | "for n,file in enumerate(source_files):\n", 1255 | " \n", 1256 | " file = f\"s3://{file}\"\n", 1257 | " con.execute(f\"CREATE TABLE temp_table AS SELECT * FROM '{file}'\")\n", 1258 | " for column in columns_to_drop:\n", 1259 | " con.execute(f\"ALTER TABLE temp_table DROP COLUMN {column}\")\n", 1260 | " \n", 1261 | " num_rows = con.execute(f\"SELECT COUNT(*) FROM temp_table\").fetchone()[0]\n", 1262 | " \n", 1263 | " for i in range(0, num_rows, rows_per_file):\n", 1264 | " new_file_name = os.path.basename(file).split(\".\")[0] + f\"_{i}.snappy.parquet\"\n", 1265 | " if n < train_files: \n", 1266 | " output_path = f\"{s3_output_path}/train/{new_file_name}\"\n", 1267 | " else:\n", 1268 | " output_path = f\"{s3_output_path}/test/{new_file_name}\"\n", 1269 | " \n", 1270 | " query = f\"\"\"COPY (SELECT * FROM temp_table LIMIT {rows_per_file} OFFSET {i}) \n", 1271 | " to '{output_path}' (FORMAT 'parquet', COMPRESSION 'snappy', OVERWRITE_OR_IGNORE true)\"\"\"\n", 1272 | " con.execute(query)\n", 1273 | " \n", 1274 | " con.execute(\"DROP TABLE temp_table\")" 1275 | ] 1276 | }, 1277 | { 1278 | "cell_type": "code", 1279 | "execution_count": null, 1280 | "metadata": {}, 1281 | "outputs": [], 1282 | "source": [] 1283 | } 1284 | ], 1285 | "metadata": { 1286 | "kernelspec": { 1287 | "display_name": "Python 3 (ipykernel)", 1288 | "language": "python", 1289 | "name": "python3" 1290 | }, 1291 | "language_info": { 1292 | "codemirror_mode": { 1293 | "name": "ipython", 1294 | "version": 3 1295 | }, 1296 | "file_extension": ".py", 1297 | "mimetype": "text/x-python", 1298 | "name": "python", 1299 | "nbconvert_exporter": "python", 1300 | "pygments_lexer": "ipython3", 1301 | "version": "3.11.9" 1302 | } 1303 | }, 1304 | "nbformat": 4, 1305 | "nbformat_minor": 4 1306 | } 1307 | --------------------------------------------------------------------------------