├── .gitattributes ├── .gitconfig ├── .gitignore ├── LICENSE ├── README.md ├── chapter-1 ├── .ipynb_checkpoints │ ├── ch1_linear_sales-checkpoint.png │ ├── decision-tree-classification-checkpoint.ipynb │ ├── decision-tree-regression-checkpoint.ipynb │ ├── decision-tree-visualization-checkpoint.ipynb │ ├── linear-regression-checkpoint.ipynb │ ├── manual-linear-regression-checkpoint.ipynb │ └── overfitting-checkpoint.ipynb ├── decision-tree-classification.ipynb ├── decision-tree-regression.ipynb ├── decision-tree-visualization.ipynb ├── linear-regression.ipynb ├── manual-linear-regression.ipynb └── overfitting.ipynb ├── chapter-10 ├── postgresml.sql └── steel-plates-data.ipynb ├── chapter-11 ├── lightgbm-dask.ipynb ├── lightgbm-gpu.ipynb └── setup-data.ipynb ├── chapter-2 ├── .ipynb_checkpoints │ ├── gradient-descent-checkpoint.ipynb │ └── random-forest-regression-checkpoint.ipynb ├── gradient-descent.ipynb └── random-forest-regression.ipynb ├── chapter-3 ├── lightgbm-python-api.ipynb ├── lightgbm-sklearn-python-api.ipynb └── predicting-academic-success.ipynb ├── chapter-4 ├── lightgbm-census-income.ipynb ├── lightgbm-credit-card.ipynb ├── tabtransformer-census-income.ipynb ├── tabtransformer-credit-card.ipynb ├── xgboost-census-income.ipynb └── xgboost-credit-card.ipynb ├── chapter-5 └── lightgbm-customer-churn.ipynb ├── chapter-6 ├── credit-score-classification.ipynb └── wind-turbine-power-output.ipynb ├── chapter-7 └── lightgbm-automl-flaml.ipynb ├── chapter-8 ├── Dockerfile ├── lightgbm-customer-churn-pipeline.ipynb ├── requirements.txt ├── telco_churn_api.py └── transformers.py ├── chapter-9 ├── ml-model-workflow.ipynb └── src │ ├── evaluate.py │ ├── lambda_deployer.py │ ├── lightgbm_train.py │ ├── preprocessing.py │ └── requirements.txt └── environment.yml /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb merge=nbdev-merge 2 | 
-------------------------------------------------------------------------------- /.gitconfig: -------------------------------------------------------------------------------- 1 | # Generated by nbdev_install_hooks 2 | # 3 | # If you need to disable this instrumentation do: 4 | # git config --local --unset include.path 5 | # 6 | # To restore: 7 | # git config --local include.path ../.gitconfig 8 | # 9 | [merge "nbdev-merge"] 10 | name = resolve conflicts with nbdev_fix 11 | driver = nbdev_merge %O %A %B %P 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | figures/ 3 | tmp/ 4 | 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | *.so 10 | 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | *.manifest 31 | *.spec 32 | 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | htmlcov/ 37 | .tox/ 38 | .nox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *.cover 45 | *.py,cover 46 | .hypothesis/ 47 | .pytest_cache/ 48 | cover/ 49 | 50 | *.mo 51 | *.pot 52 | 53 | *.log 54 | local_settings.py 55 | db.sqlite3 56 | db.sqlite3-journal 57 | 58 | instance/ 59 | .webassets-cache 60 | 61 | 62 | .pdm.toml 63 | 64 | __pypackages__/ 65 | 66 | .env 67 | .venv 68 | env/ 69 | venv/ 70 | ENV/ 71 | env.bak/ 72 | venv.bak/ 73 | 74 | .mypy_cache/ 75 | .dmypy.json 76 | dmypy.json 77 | 78 | telco/ 79 | wind-turbine/ 80 | bank/ 81 | steel-plates/ 82 | covtype/ 83 | creditscore/ 84 | *.pkl 85 | *.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright 
(c) 2022 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning with LightGBM and Python 2 | 3 | Classification and regression 4 | 5 | This is the code repository for [Machine Learning with LightGBM and Python](https://www.packtpub.com/product/machine-learning-with-lightgbm-and-python/9781800564749?utm_source=github&utm_medium=repository&utm_campaign=9781800564749), published by Packt. 6 | 7 | **A practitioner's guide to developing production-ready machine learning systems** 8 | 9 | ## What is this book about? 10 | Machine Learning with LightGBM and Python is a comprehensive guide to learning the basics of machine learning and progressing to building scalable machine learning systems that are ready for release. 
11 | This book will get you acquainted with the high-performance gradient-boosting LightGBM framework and show you how it can be used to solve various machine-learning problems to produce highly accurate, robust, and predictive solutions. Starting with simple machine learning models in scikit-learn, you’ll explore the intricacies of gradient boosting machines and LightGBM. You’ll be guided through various case studies to better understand the data science processes and learn how to practically apply your skills to real-world problems. As you progress, you’ll elevate your software engineering skills by learning how to build and integrate scalable machine-learning pipelines to process data, train models, and deploy them to serve secure APIs using Python tools such as FastAPI. 12 | 13 | This book covers the following exciting features: 14 | * Get an overview of ML and working with data and models in Python using scikit-learn 15 | * Explore decision trees, ensemble learning, gradient boosting, DART, and GOSS 16 | * Master LightGBM and apply it to classification and regression problems 17 | * Tune and train your models using AutoML with FLAML and Optuna 18 | * Build ML pipelines in Python to train and deploy models with secure and performant APIs 19 | * Scale your solutions to production readiness with Amazon SageMaker, PostgresML, and Dask 20 | 21 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1800564740) today! 22 | https://www.packtpub.com/ 24 | 25 | ## Instructions and Navigations 26 | All of the code is organized into folders, one per chapter. For example, chapter-1. 
27 | 28 | The code will look like the following: 29 | ``` 30 | import numpy as np 31 | import pandas as pd 32 | 33 | from matplotlib import pyplot as plt 34 | import seaborn as sns 35 | 36 | from sklearn.linear_model import LinearRegression 37 | from sklearn.metrics import mean_absolute_error 38 | ``` 39 | 40 | **Following is what you need for this book:** 41 | This book is for software engineers aspiring to be better machine learning engineers and data scientists unfamiliar with LightGBM, looking to gain in-depth knowledge of its libraries. Basic to intermediate Python programming knowledge is required to get started with the book. 42 | The book is also an excellent source for ML veterans, with a strong focus on ML engineering with up-to-date and thorough coverage of platforms such as Amazon SageMaker, PostgresML, and Dask. 43 | 44 | With the following software and hardware list you can run all code files present in the book (Chapter 1-11). 45 | ### Software and Hardware List 46 | | Chapter | Software required | OS required | 47 | | -------- | ------------------------------------ | ----------------------------------- | 48 | | 1-11 | Python 3.10 | Windows, macOS, or Linux | 49 | | 1-11 | Anaconda 3 | Windows, macOS, or Linux | 50 | | 1-11 | scikit-learn 1.2.1 | Windows, macOS, or Linux | 51 | | 1-11 | LightGBM 3.3.5 | Windows, macOS, or Linux | 52 | | 1-11 | XGBoost 1.7.4 | Windows, macOS, or Linux | 53 | | 1-11 | Optuna 3.1.1 | Windows, macOS, or Linux | 54 | | 1-11 | FLAML 1.2.3 | Windows, macOS, or Linux | 55 | | 1-11 | FastAPI 0.103.1 | Windows, macOS, or Linux | 56 | | 1-11 | Amazon SageMaker | Windows, macOS, or Linux | 57 | | 1-11 | Docker 23.0.1 | Windows, macOS, or Linux | 58 | | 1-11 | PostgresML 2.7.0 | Windows, macOS, or Linux | 59 | | 1-11 | Dask 2023.7.1 | Windows, macOS, or Linux | 60 | 61 | ### Related products 62 | * A Handbook of Mathematical Models with Python 
[[Packt]](https://www.packtpub.com/product/a-handbook-of-mathematical-models-with-python/9781804616703?utm_source=github&utm_medium=repository&utm_campaign=9781804616703) [[Amazon]](https://www.amazon.com/dp/1804616702) 63 | 64 | * The Regularization Cookbook [[Packt]](https://www.packtpub.com/product/the-regularization-cookbook/9781837634088?utm_source=github&utm_medium=repository&utm_campaign=9781837634088) [[Amazon]](https://www.amazon.com/dp/1837634084) 65 | 66 | 67 | ## Get to Know the Author 68 | **Andrich van Wyk** 69 | has 15 years of experience in machine learning R&D, building AI-driven solutions, and consulting in the AI domain. He also has broad experience as a software engineer and architect with over a decade of industry experience working on enterprise systems. 70 | He graduated cum laude with an M.Sc. in Computer Science from the University of Pretoria, focusing on neural networks and evolutionary algorithms. 71 | Andrich enjoys writing about machine learning engineering and the software industry at large. He currently resides in South Africa with his wife and daughter. 
72 | 73 | -------------------------------------------------------------------------------- /chapter-1/.ipynb_checkpoints/ch1_linear_sales-checkpoint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Machine-Learning-with-LightGBM-and-Python/042afffffc0322b623f03d4a89f17b7432e0b553/chapter-1/.ipynb_checkpoints/ch1_linear_sales-checkpoint.png -------------------------------------------------------------------------------- /chapter-1/.ipynb_checkpoints/decision-tree-classification-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "c1ff44f1-f6f2-49bc-b158-d68f1d9c800f", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "\n", 13 | "from matplotlib import pyplot as plt\n", 14 | "\n", 15 | "from sklearn import datasets\n", 16 | "from sklearn.tree import DecisionTreeClassifier\n", 17 | "from sklearn.model_selection import train_test_split\n", 18 | "from sklearn.metrics import f1_score\n", 19 | "from sklearn import tree" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "id": "b959b796-cc19-4b33-bb93-9345f9c1e485", 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "0.9440993788819876" 32 | ] 33 | }, 34 | "execution_count": null, 35 | "metadata": {}, 36 | "output_type": "execute_result" 37 | } 38 | ], 39 | "source": [ 40 | "dataset = datasets.load_breast_cancer()\n", 41 | "\n", 42 | "X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, random_state=157)\n", 43 | "\n", 44 | "model = DecisionTreeClassifier(random_state=157, max_depth=3, min_samples_split=2)\n", 45 | "model = model.fit(X_train, y_train)\n", 46 | "\n", 47 | "f1_score(y_test, model.predict(X_test))" 48 | ] 49 | }, 50 
| { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "2c4f331a-3755-4258-a1af-9114de351999", 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "ename": "NameError", 58 | "evalue": "name 'clf' is not defined", 59 | "output_type": "error", 60 | "traceback": [ 61 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 62 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 63 | "Cell \u001b[0;32mIn[3], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m fig \u001b[38;5;241m=\u001b[39m plt\u001b[38;5;241m.\u001b[39mfigure()\n\u001b[0;32m----> 2\u001b[0m _ \u001b[38;5;241m=\u001b[39m tree\u001b[38;5;241m.\u001b[39mplot_tree(\u001b[43mclf\u001b[49m,rounded\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 3\u001b[0m feature_names\u001b[38;5;241m=\u001b[39mdataset\u001b[38;5;241m.\u001b[39mfeature_names,\n\u001b[1;32m 4\u001b[0m class_names\u001b[38;5;241m=\u001b[39mdataset\u001b[38;5;241m.\u001b[39mtarget_names,\n\u001b[1;32m 5\u001b[0m filled\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n", 64 | "\u001b[0;31mNameError\u001b[0m: name 'clf' is not defined" 65 | ] 66 | }, 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "
" 71 | ] 72 | }, 73 | "metadata": {}, 74 | "output_type": "display_data" 75 | } 76 | ], 77 | "source": [ 78 | "fig = plt.figure()\n", 79 | "_ = tree.plot_tree(model, rounded=True,\n", 80 | " feature_names=dataset.feature_names,\n", 81 | " class_names=dataset.target_names,\n", 82 | " filled=True)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "id": "d92752f8-8576-439b-9200-0f5c57c20ab0", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "Python 3 (ipykernel)", 97 | "language": "python", 98 | "name": "python3" 99 | } 100 | }, 101 | "nbformat": 4, 102 | "nbformat_minor": 5 103 | } 104 | -------------------------------------------------------------------------------- /chapter-10/postgresml.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE pgml.telco_churn CASCADE; 2 | CREATE TABLE pgml.telco_churn 3 | ( 4 | customerid VARCHAR(100), 5 | gender VARCHAR(100), 6 | seniorcitizen BOOLEAN, 7 | partner VARCHAR(10), 8 | dependents VARCHAR(10), 9 | tenure REAL, 10 | phoneservice VARCHAR(10), 11 | multiplelines VARCHAR(30), 12 | internetservice VARCHAR(30), 13 | onlinesecurity VARCHAR(30), 14 | onlinebackup VARCHAR(30), 15 | deviceprotection VARCHAR(30), 16 | techsupport VARCHAR(30), 17 | streamingtv VARCHAR(30), 18 | streamingmovies VARCHAR(30), 19 | contract VARCHAR(30), 20 | paperlessbilling VARCHAR(30), 21 | paymentmethod VARCHAR(30), 22 | monthlycharges VARCHAR(50), 23 | totalcharges VARCHAR(50), 24 | churn VARCHAR(10) 25 | ); 26 | 27 | COPY pgml.telco_churn (customerid, 28 | gender, 29 | seniorcitizen, 30 | partner, 31 | dependents, 32 | tenure, 33 | phoneservice, 34 | multiplelines, 35 | internetservice, 36 | onlinesecurity, 37 | onlinebackup, 38 | deviceprotection, 39 | techsupport, 40 | streamingtv, 41 | streamingmovies, 42 | contract, 43 | paperlessbilling, 44 | paymentmethod, 45 | monthlycharges, 46 
| totalcharges, 47 | churn 48 | ) FROM '/tmp/telco-churn.csv' 49 | DELIMITER ',' 50 | CSV HEADER; 51 | 52 | SELECT * 53 | FROM pgml.telco_churn; 54 | 55 | /* Blank totalcharges values (a single space) become NULL so the REAL casts below do not fail; pgml.train mean-imputes them via its preprocess argument. */ UPDATE pgml.telco_churn 56 | SET totalcharges = NULL 57 | WHERE totalcharges = ' '; 58 | 59 | /* IF EXISTS is required: the view is missing on a first run, and on re-runs the DROP TABLE ... CASCADE at the top of this script has already removed it, so a bare DROP VIEW always fails. */ DROP VIEW IF EXISTS pgml.telco_churn_data; 60 | /* Training view: Yes/No text columns become BOOLEAN, the charge columns become REAL, and customerid is left out. */ CREATE VIEW pgml.telco_churn_data AS 61 | SELECT gender, 62 | seniorcitizen, 63 | CAST(CASE partner 64 | WHEN 'Yes' THEN true 65 | WHEN 'No' THEN false 66 | END AS BOOLEAN) AS partner, 67 | CAST(CASE dependents 68 | WHEN 'Yes' THEN true 69 | WHEN 'No' THEN false 70 | END AS BOOLEAN) AS dependents, 71 | tenure, 72 | CAST(CASE phoneservice 73 | WHEN 'Yes' THEN true 74 | WHEN 'No' THEN false 75 | END AS BOOLEAN) AS phoneservice, 76 | multiplelines, 77 | internetservice, 78 | onlinesecurity, 79 | onlinebackup, 80 | deviceprotection, 81 | techsupport, 82 | streamingtv, 83 | streamingmovies, 84 | contract, 85 | paperlessbilling, 86 | paymentmethod, 87 | CAST(monthlycharges AS REAL), 88 | CAST(totalcharges AS REAL), 89 | CAST(CASE churn 90 | WHEN 'Yes' THEN true 91 | WHEN 'No' THEN false 92 | END AS BOOLEAN) AS churn 93 | FROM pgml.telco_churn; 94 | 95 | 96 | /* Train a LightGBM classifier on the view with a random hyperparameter search over the lists in search_params. */ SELECT * 97 | FROM pgml.train('Telco Churn', 98 | task => 'classification', 99 | relation_name => 'pgml.telco_churn_data', 100 | y_column_name => 'churn', 101 | algorithm => 'lightgbm', 102 | preprocess => '{"totalcharges": {"impute": "mean"} }', 103 | search => 'random', 104 | search_args => '{"n_iter": 500 }', 105 | search_params => '{ 106 | "num_leaves": [2, 4, 8, 16, 32, 64, 128], 107 | "max_bin": [32, 64, 128, 256, 512], 108 | "learning_rate": [0.0001, 0.001, 0.1, 0.5], 109 | "n_estimators": [20, 40, 80, 100, 200, 400] 110 | }' 111 | ); 112 | 113 | /* Metrics and hyperparameters of the models recorded for the 'Telco Churn' project. */ SELECT metrics, hyperparams 114 | FROM pgml.models m 115 | LEFT OUTER JOIN pgml.projects p on p.id = m.project_id 116 | WHERE p.name = 'Telco Churn'; 117 | 118 | /* Single ad-hoc prediction: the ROW values are positional and follow the view's column order, minus churn. */ SELECT pgml.predict( 119 | 'Telco Churn', 120 | ROW ( 121 | CAST('Male' AS VARCHAR(30)), 122 | 1, 123 | 1, 124 | 0, 125 | 0, 
126 | 1, 127 | CAST('No phone service' AS VARCHAR(30)), 128 | CAST('Fiber optic' AS VARCHAR(30)), 129 | CAST('No' AS VARCHAR(30)), 130 | CAST('Yes' AS VARCHAR(30)), 131 | CAST('No' AS VARCHAR(30)), 132 | CAST('No' AS VARCHAR(30)), 133 | CAST('Yes' AS VARCHAR(30)), 134 | CAST('No' AS VARCHAR(30)), 135 | CAST('Month-to-month' AS VARCHAR(30)), 136 | CAST('Yes' AS VARCHAR(30)), 137 | CAST('Electronic check' AS VARCHAR(30)), 138 | CAST(20.25 AS REAL), 139 | CAST(4107.25 AS REAL) 140 | ) 141 | ) AS prediction; 142 | 143 | /* Batch prediction for every row of the raw table, applying the same casts as the training view. */ SELECT *, 144 | pgml.predict( 145 | 'Telco Churn', 146 | ROW ( 147 | gender, 148 | seniorcitizen, 149 | CAST(CASE partner 150 | WHEN 'Yes' THEN true 151 | WHEN 'No' THEN false 152 | END AS BOOLEAN), 153 | CAST(CASE dependents 154 | WHEN 'Yes' THEN true 155 | WHEN 'No' THEN false 156 | END AS BOOLEAN), 157 | tenure, 158 | CAST(CASE phoneservice 159 | WHEN 'Yes' THEN true 160 | WHEN 'No' THEN false 161 | END AS BOOLEAN), 162 | multiplelines, 163 | internetservice, 164 | onlinesecurity, 165 | onlinebackup, 166 | deviceprotection, 167 | techsupport, 168 | streamingtv, 169 | streamingmovies, 170 | contract, 171 | paperlessbilling, 172 | paymentmethod, 173 | CAST(monthlycharges AS REAL), 174 | CAST(totalcharges AS REAL) 175 | ) 176 | ) AS prediction 177 | FROM pgml.telco_churn; -------------------------------------------------------------------------------- /chapter-10/steel-plates-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true, 8 | "ExecuteTime": { 9 | "end_time": "2023-07-03T20:22:04.253076300Z", 10 | "start_time": "2023-07-03T20:22:02.956548Z" 11 | } 12 | }, 13 | "outputs": [], 14 | "source": [ 15 | "import openml" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "outputs": [], 22 | "source": [ 23 | "dataset = openml.datasets.get_dataset(1504)" 24 | ], 25 | 
"metadata": { 26 | "collapsed": false, 27 | "ExecuteTime": { 28 | "end_time": "2023-07-03T20:22:10.855667100Z", 29 | "start_time": "2023-07-03T20:22:04.254082200Z" 30 | } 31 | } 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "outputs": [], 37 | "source": [ 38 | "X, y, X2, y2 = dataset.get_data(dataset_format=\"dataframe\", target=\"Class\")" 39 | ], 40 | "metadata": { 41 | "collapsed": false, 42 | "ExecuteTime": { 43 | "end_time": "2023-07-03T20:22:11.001698100Z", 44 | "start_time": "2023-07-03T20:22:10.858681900Z" 45 | } 46 | } 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 12, 51 | "outputs": [], 52 | "source": [ 53 | "X[\"Class\"] = y" 54 | ], 55 | "metadata": { 56 | "collapsed": false, 57 | "ExecuteTime": { 58 | "end_time": "2023-07-03T20:23:59.569087700Z", 59 | "start_time": "2023-07-03T20:23:59.564588500Z" 60 | } 61 | } 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 13, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": " V1 V2 V3 V4 V5 V6 V7 V8 \\\n0 42.0 50.0 270900.0 270944.0 267.0 17.0 44.0 24220.0 \n1 645.0 651.0 2538079.0 2538108.0 108.0 10.0 30.0 11397.0 \n2 829.0 835.0 1553913.0 1553931.0 71.0 8.0 19.0 7972.0 \n3 853.0 860.0 369370.0 369415.0 176.0 13.0 45.0 18996.0 \n4 1289.0 1306.0 498078.0 498335.0 2409.0 60.0 260.0 246930.0 \n... ... ... ... ... ... ... ... ... \n1936 249.0 277.0 325780.0 325796.0 273.0 54.0 22.0 35033.0 \n1937 144.0 175.0 340581.0 340598.0 287.0 44.0 24.0 34599.0 \n1938 145.0 174.0 386779.0 386794.0 292.0 40.0 22.0 37572.0 \n1939 137.0 170.0 422497.0 422528.0 419.0 97.0 47.0 52715.0 \n1940 1261.0 1281.0 87951.0 87967.0 103.0 26.0 22.0 11682.0 \n\n V9 V10 ... V25 V26 V27 V28 V29 V30 V31 V32 V33 \\\n0 76 108 ... 0.8182 -0.2913 0.5822 1 0 0 0 0 0 \n1 84 123 ... 0.7931 -0.1756 0.2984 1 0 0 0 0 0 \n2 99 125 ... 0.6667 -0.1228 0.2150 1 0 0 0 0 0 \n3 99 126 ... 0.8444 -0.1568 0.5212 1 0 0 0 0 0 \n4 37 126 ... 0.9338 -0.1992 1.0000 1 0 0 0 0 0 \n... ... ... ... ... ... 
... ... ... ... ... ... ... \n1936 119 141 ... -0.4286 0.0026 0.7254 0 0 0 0 0 0 \n1937 112 133 ... -0.4516 -0.0582 0.8173 0 0 0 0 0 0 \n1938 120 140 ... -0.4828 0.0052 0.7079 0 0 0 0 0 0 \n1939 117 140 ... -0.0606 -0.0171 0.9919 0 0 0 0 0 0 \n1940 101 133 ... -0.2000 -0.1139 0.5296 0 0 0 0 0 0 \n\n Class \n0 1 \n1 1 \n2 1 \n3 1 \n4 1 \n... ... \n1936 2 \n1937 2 \n1938 2 \n1939 2 \n1940 2 \n\n[1941 rows x 34 columns]", 70 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
V1V2V3V4V5V6V7V8V9V10...V25V26V27V28V29V30V31V32V33Class
042.050.0270900.0270944.0267.017.044.024220.076108...0.8182-0.29130.58221000001
1645.0651.02538079.02538108.0108.010.030.011397.084123...0.7931-0.17560.29841000001
2829.0835.01553913.01553931.071.08.019.07972.099125...0.6667-0.12280.21501000001
3853.0860.0369370.0369415.0176.013.045.018996.099126...0.8444-0.15680.52121000001
41289.01306.0498078.0498335.02409.060.0260.0246930.037126...0.9338-0.19921.00001000001
..................................................................
1936249.0277.0325780.0325796.0273.054.022.035033.0119141...-0.42860.00260.72540000002
1937144.0175.0340581.0340598.0287.044.024.034599.0112133...-0.4516-0.05820.81730000002
1938145.0174.0386779.0386794.0292.040.022.037572.0120140...-0.48280.00520.70790000002
1939137.0170.0422497.0422528.0419.097.047.052715.0117140...-0.0606-0.01710.99190000002
19401261.01281.087951.087967.0103.026.022.011682.0101133...-0.2000-0.11390.52960000002
\n

1941 rows × 34 columns

\n
" 71 | }, 72 | "execution_count": 13, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "X" 79 | ], 80 | "metadata": { 81 | "collapsed": false, 82 | "ExecuteTime": { 83 | "end_time": "2023-07-03T20:24:00.854609900Z", 84 | "start_time": "2023-07-03T20:24:00.838317300Z" 85 | } 86 | } 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 23, 91 | "outputs": [], 92 | "source": [ 93 | "X = X.sample(frac=1)" 94 | ], 95 | "metadata": { 96 | "collapsed": false, 97 | "ExecuteTime": { 98 | "end_time": "2023-07-03T21:30:31.821722500Z", 99 | "start_time": "2023-07-03T21:30:31.808645300Z" 100 | } 101 | } 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 24, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": " V1 V2 V3 V4 V5 V6 V7 V8 \\\n209 0.0 37.0 814236.0 814627.0 7292.0 466.0 536.0 608995.0 \n126 1595.0 1611.0 1594430.0 1594517.0 794.0 59.0 87.0 84410.0 \n23 82.0 92.0 149044.0 149083.0 264.0 15.0 39.0 32175.0 \n46 1495.0 1502.0 3509718.0 3509731.0 56.0 12.0 13.0 5587.0 \n1751 54.0 75.0 2344077.0 2344153.0 838.0 67.0 80.0 85442.0 \n... ... ... ... ... ... ... ... ... \n275 30.0 58.0 1772722.0 1772736.0 253.0 50.0 26.0 28728.0 \n1174 1031.0 1045.0 621518.0 621530.0 115.0 15.0 12.0 11562.0 \n1117 315.0 341.0 530471.0 530483.0 116.0 32.0 19.0 14912.0 \n1058 993.0 1004.0 1589161.0 1589168.0 57.0 12.0 7.0 6528.0 \n1821 54.0 62.0 3971234.0 3971359.0 583.0 75.0 125.0 60497.0 \n\n V9 V10 ... V25 V26 V27 V28 V29 V30 V31 V32 V33 \\\n209 43 108 ... 0.9054 -0.3475 1.0000 0 1 0 0 0 0 \n126 62 127 ... 0.8161 -0.1694 0.9993 1 0 0 0 0 0 \n23 92 141 ... 0.7436 -0.0479 0.6422 1 0 0 0 0 0 \n46 85 116 ... 0.4615 -0.2206 0.1965 1 0 0 0 0 0 \n1751 90 119 ... 0.7237 -0.2034 0.9998 0 0 0 0 0 0 \n... ... ... ... ... ... ... ... ... ... ... ... ... \n275 86 135 ... -0.5000 -0.1129 0.6453 0 1 0 0 0 0 \n1174 68 127 ... -0.1429 -0.2145 0.2901 0 0 0 0 0 1 \n1117 121 142 ... 
-0.5385 0.0043 0.5162 0 0 0 0 0 1 \n1058 103 126 ... -0.3636 -0.1053 0.1822 0 0 0 0 0 1 \n1821 89 126 ... 0.9360 -0.1893 0.9905 0 0 0 0 0 0 \n\n Class \n209 1 \n126 1 \n23 1 \n46 1 \n1751 2 \n... ... \n275 1 \n1174 1 \n1117 1 \n1058 1 \n1821 2 \n\n[1941 rows x 34 columns]", 110 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
V1V2V3V4V5V6V7V8V9V10...V25V26V27V28V29V30V31V32V33Class
2090.037.0814236.0814627.07292.0466.0536.0608995.043108...0.9054-0.34751.00000100001
1261595.01611.01594430.01594517.0794.059.087.084410.062127...0.8161-0.16940.99931000001
2382.092.0149044.0149083.0264.015.039.032175.092141...0.7436-0.04790.64221000001
461495.01502.03509718.03509731.056.012.013.05587.085116...0.4615-0.22060.19651000001
175154.075.02344077.02344153.0838.067.080.085442.090119...0.7237-0.20340.99980000002
..................................................................
27530.058.01772722.01772736.0253.050.026.028728.086135...-0.5000-0.11290.64530100001
11741031.01045.0621518.0621530.0115.015.012.011562.068127...-0.1429-0.21450.29010000011
1117315.0341.0530471.0530483.0116.032.019.014912.0121142...-0.53850.00430.51620000011
1058993.01004.01589161.01589168.057.012.07.06528.0103126...-0.3636-0.10530.18220000011
182154.062.03971234.03971359.0583.075.0125.060497.089126...0.9360-0.18930.99050000002
\n

1941 rows × 34 columns

\n
" 111 | }, 112 | "execution_count": 24, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "X" 119 | ], 120 | "metadata": { 121 | "collapsed": false, 122 | "ExecuteTime": { 123 | "end_time": "2023-07-03T21:30:32.316009700Z", 124 | "start_time": "2023-07-03T21:30:32.267516700Z" 125 | } 126 | } 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 26, 131 | "outputs": [], 132 | "source": [ 133 | "X.to_csv(\"steel-plates/steel-plates.csv\", index=False)" 134 | ], 135 | "metadata": { 136 | "collapsed": false, 137 | "ExecuteTime": { 138 | "end_time": "2023-07-03T21:31:01.217655Z", 139 | "start_time": "2023-07-03T21:31:01.168839800Z" 140 | } 141 | } 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "outputs": [], 147 | "source": [], 148 | "metadata": { 149 | "collapsed": false 150 | } 151 | } 152 | ], 153 | "metadata": { 154 | "kernelspec": { 155 | "display_name": "Python 3", 156 | "language": "python", 157 | "name": "python3" 158 | }, 159 | "language_info": { 160 | "codemirror_mode": { 161 | "name": "ipython", 162 | "version": 2 163 | }, 164 | "file_extension": ".py", 165 | "mimetype": "text/x-python", 166 | "name": "python", 167 | "nbconvert_exporter": "python", 168 | "pygments_lexer": "ipython2", 169 | "version": "2.7.6" 170 | } 171 | }, 172 | "nbformat": 4, 173 | "nbformat_minor": 0 174 | } 175 | -------------------------------------------------------------------------------- /chapter-11/lightgbm-dask.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true, 8 | "ExecuteTime": { 9 | "end_time": "2023-08-05T09:03:23.724265800Z", 10 | "start_time": "2023-08-05T09:03:22.794654100Z" 11 | } 12 | }, 13 | "outputs": [], 14 | "source": [ 15 | "import dask.dataframe as dd\n", 16 | "import dask_ml\n", 17 | "import lightgbm as lgb\n", 18 
| "from distributed import Client, LocalCluster" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "outputs": [], 25 | "source": [ 26 | "cluster = LocalCluster(n_workers=4, threads_per_worker=2)\n", 27 | "client = Client(cluster)" 28 | ], 29 | "metadata": { 30 | "collapsed": false, 31 | "ExecuteTime": { 32 | "end_time": "2023-08-05T09:03:24.873777Z", 33 | "start_time": "2023-08-05T09:03:23.728266200Z" 34 | } 35 | } 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "outputs": [], 41 | "source": [ 42 | "df = dd.read_csv(\"covtype/covtype.csv\", blocksize=\"64MB\")" 43 | ], 44 | "metadata": { 45 | "collapsed": false, 46 | "ExecuteTime": { 47 | "end_time": "2023-08-05T09:03:24.918752900Z", 48 | "start_time": "2023-08-05T09:03:24.878297800Z" 49 | } 50 | } 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 4, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": "(Delayed('int-1d52fdac-81a2-470d-9aaa-4823040383e9'), 55)" 59 | }, 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "df.shape" 67 | ], 68 | "metadata": { 69 | "collapsed": false, 70 | "ExecuteTime": { 71 | "end_time": "2023-08-05T09:03:26.084611200Z", 72 | "start_time": "2023-08-05T09:03:26.077638100Z" 73 | } 74 | } 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 5, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": "581012" 83 | }, 84 | "execution_count": 5, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "df.shape[0].compute()" 91 | ], 92 | "metadata": { 93 | "collapsed": false, 94 | "ExecuteTime": { 95 | "end_time": "2023-08-05T09:03:27.585076500Z", 96 | "start_time": "2023-08-05T09:03:26.421915500Z" 97 | } 98 | } 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 6, 103 | "outputs": [], 104 | "source": [ 105 | "X = df.iloc[:, :-1]\n", 106 | "y = df.iloc[:, -1]\n", 107 | 
"X_train, X_test, y_train, y_test = dask_ml.model_selection.train_test_split(X, y)" 108 | ], 109 | "metadata": { 110 | "collapsed": false, 111 | "ExecuteTime": { 112 | "end_time": "2023-08-05T09:03:28.587634Z", 113 | "start_time": "2023-08-05T09:03:28.573573700Z" 114 | } 115 | } 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 19, 120 | "outputs": [ 121 | { 122 | "name": "stderr", 123 | "output_type": "stream", 124 | "text": [ 125 | "/home/avanwyk/.local/share/mambaforge/envs/lgbmenv/lib/python3.10/site-packages/lightgbm/dask.py:526: UserWarning: Parameter n_jobs will be ignored.\n", 126 | " _log_warning(f\"Parameter {param_alias} will be ignored.\")\n" 127 | ] 128 | }, 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "Finding random open ports for workers\n", 134 | "[LightGBM] [Info] Trying to bind port 59293...\n", 135 | "[LightGBM] [Info] Binding port 59293 succeeded\n", 136 | "[LightGBM] [Info] Listening...\n", 137 | "[LightGBM] [Info] Listening...\n", 138 | "[LightGBM] [Info] Connected to rank 0\n", 139 | "[LightGBM] [Info] Local rank: 1, total number of machines: 2\n", 140 | "[LightGBM] [Warning] num_threads is set=2, n_jobs=-1 will be ignored. Current value: num_threads=2\n" 141 | ] 142 | }, 143 | { 144 | "data": { 145 | "text/plain": "DaskLGBMClassifier(client=,\n n_estimators=200, num_threads=2, time_out=120,\n tree_learner='data')", 146 | "text/html": "
DaskLGBMClassifier(client=<Client: 'tcp://127.0.0.1:39171' processes=4 threads=8, memory=15.01 GiB>,\n                   n_estimators=200, num_threads=2, time_out=120,\n                   tree_learner='data')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" 147 | }, 148 | "execution_count": 19, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "dask_model = lgb.DaskLGBMClassifier(n_estimators=200, client=client)\n", 155 | "dask_model.fit(X_train, y_train)" 156 | ], 157 | "metadata": { 158 | "collapsed": false, 159 | "ExecuteTime": { 160 | "end_time": "2023-08-05T09:56:55.048229700Z", 161 | "start_time": "2023-08-05T09:56:37.976424200Z" 162 | } 163 | } 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 22, 168 | "outputs": [], 169 | "source": [ 170 | "predictions = dask_model.predict(X_test)" 171 | ], 172 | "metadata": { 173 | "collapsed": false, 174 | "ExecuteTime": { 175 | "end_time": "2023-08-05T10:02:11.052388500Z", 176 | "start_time": "2023-08-05T10:02:11.042349100Z" 177 | } 178 | } 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 25, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": "array([5., 5., 2., ..., 3., 3., 3.])" 187 | }, 188 | "execution_count": 25, 189 | "metadata": {}, 190 | "output_type": "execute_result" 191 | } 192 | ], 193 | "source": [ 194 | "predictions.compute()" 195 | ], 196 | "metadata": { 197 | "collapsed": false, 198 | "ExecuteTime": { 199 | "end_time": "2023-08-05T10:04:48.093723300Z", 200 | "start_time": "2023-08-05T10:04:45.598200100Z" 201 | } 202 | } 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 16, 207 | "outputs": [], 208 | "source": [ 209 | "dask_model.score(X_test.to_dask_array(lengths=True), y_test.to_dask_array(lengths=True))" 210 | ], 211 | "metadata": { 212 | "collapsed": false, 213 | "ExecuteTime": { 214 | "end_time": "2023-08-05T09:06:30.847647600Z", 215 | "start_time": "2023-08-05T09:06:21.205338400Z" 216 | } 217 | } 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "outputs": [], 223 | "source": [], 224 | "metadata": { 225 | "collapsed": false 226 | } 227 | } 228 | ], 229 | "metadata": { 230 | "kernelspec": { 231 | 
"display_name": "Python 3", 232 | "language": "python", 233 | "name": "python3" 234 | }, 235 | "language_info": { 236 | "codemirror_mode": { 237 | "name": "ipython", 238 | "version": 2 239 | }, 240 | "file_extension": ".py", 241 | "mimetype": "text/x-python", 242 | "name": "python", 243 | "nbconvert_exporter": "python", 244 | "pygments_lexer": "ipython2", 245 | "version": "2.7.6" 246 | } 247 | }, 248 | "nbformat": 4, 249 | "nbformat_minor": 0 250 | } 251 | -------------------------------------------------------------------------------- /chapter-11/lightgbm-gpu.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true, 8 | "ExecuteTime": { 9 | "end_time": "2023-08-06T14:33:52.936775200Z", 10 | "start_time": "2023-08-06T14:33:52.201992800Z" 11 | } 12 | }, 13 | "outputs": [], 14 | "source": [ 15 | "from sklearn import datasets\n", 16 | "from sklearn.model_selection import train_test_split\n", 17 | "from sklearn.model_selection import cross_val_score\n", 18 | "\n", 19 | "import lightgbm as lgb\n", 20 | "import optuna" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "outputs": [], 27 | "source": [ 28 | "dataset = datasets.fetch_covtype()\n", 29 | "\n", 30 | "X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, random_state=179)" 31 | ], 32 | "metadata": { 33 | "collapsed": false, 34 | "ExecuteTime": { 35 | "end_time": "2023-08-06T14:35:16.112273800Z", 36 | "start_time": "2023-08-06T14:33:52.938775200Z" 37 | } 38 | } 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "outputs": [], 44 | "source": [ 45 | "model = lgb.LGBMClassifier(\n", 46 | " n_estimators=150,\n", 47 | " verbose=-1\n", 48 | ")" 49 | ], 50 | "metadata": { 51 | "collapsed": false, 52 | "ExecuteTime": { 53 | "end_time": "2023-08-06T14:35:16.116796300Z", 54 | "start_time": 
"2023-08-06T14:35:16.116796300Z" 55 | } 56 | } 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "CPU times: user 16min 17s, sys: 4.35 s, total: 16min 22s\n", 67 | "Wall time: 2min 51s\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "%%time\n", 73 | "model = model.fit(X_train, y_train)" 74 | ], 75 | "metadata": { 76 | "collapsed": false, 77 | "ExecuteTime": { 78 | "end_time": "2023-08-06T13:58:10.090746900Z", 79 | "start_time": "2023-08-06T13:55:18.768652100Z" 80 | } 81 | } 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 4, 86 | "outputs": [], 87 | "source": [ 88 | "model = lgb.LGBMClassifier(\n", 89 | " n_estimators=150,\n", 90 | " device=\"cuda\",\n", 91 | " verbose=-1\n", 92 | ")" 93 | ], 94 | "metadata": { 95 | "collapsed": false, 96 | "ExecuteTime": { 97 | "end_time": "2023-08-06T14:35:16.124381300Z", 98 | "start_time": "2023-08-06T14:35:16.121337300Z" 99 | } 100 | } 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 5, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "CPU times: user 10.6 s, sys: 1.56 s, total: 12.2 s\n", 111 | "Wall time: 11 s\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "%%time\n", 117 | "model = model.fit(X_train, y_train)" 118 | ], 119 | "metadata": { 120 | "collapsed": false, 121 | "ExecuteTime": { 122 | "end_time": "2023-08-06T14:35:27.117657700Z", 123 | "start_time": "2023-08-06T14:35:16.367076800Z" 124 | } 125 | } 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 6, 130 | "outputs": [], 131 | "source": [ 132 | "def objective(optimize_boosting_type=True):\n", 133 | " def _objective(trial):\n", 134 | " if optimize_boosting_type:\n", 135 | " boosting_type = trial.suggest_categorical(\"boosting_type\", [\"dart\", \"gbdt\"])\n", 136 | " else:\n", 137 | " boosting_type = \"gbdt\"\n", 138 | " lambda_l1 = 
trial.suggest_float(\n", 139 | " 'lambda_l1', 1e-8, 10.0, log=True)\n", 140 | " lambda_l2 = trial.suggest_float(\n", 141 | " 'lambda_l2', 1e-8, 10.0, log=True)\n", 142 | " num_leaves = trial.suggest_int(\n", 143 | " 'num_leaves', 2, 256)\n", 144 | " feature_fraction = trial.suggest_float(\n", 145 | " 'feature_fraction', 0.4, 1.0)\n", 146 | " bagging_fraction = trial.suggest_float(\n", 147 | " 'bagging_fraction', 0.4, 1.0)\n", 148 | " bagging_freq = trial.suggest_int(\n", 149 | " 'bagging_freq', 1, 7)\n", 150 | " min_child_samples = trial.suggest_int(\n", 151 | " 'min_child_samples', 5, 100)\n", 152 | " learning_rate = trial.suggest_float(\n", 153 | " \"learning_rate\", 0.0001, 0.5, log=True)\n", 154 | " max_bin = trial.suggest_int(\n", 155 | " \"max_bin\", 128, 512, step=32)\n", 156 | " n_estimators = trial.suggest_int(\n", 157 | " \"n_estimators\", 40, 400, step=20)\n", 158 | "\n", 159 | " model = lgb.LGBMClassifier(\n", 160 | " force_row_wise=True,\n", 161 | " boosting_type=boosting_type,\n", 162 | " n_estimators=n_estimators,\n", 163 | " lambda_l1=lambda_l1,\n", 164 | " lambda_l2=lambda_l2,\n", 165 | " num_leaves=num_leaves,\n", 166 | " feature_fraction=feature_fraction,\n", 167 | " bagging_fraction=bagging_fraction,\n", 168 | " bagging_freq=bagging_freq,\n", 169 | " min_child_samples=min_child_samples,\n", 170 | " learning_rate=learning_rate,\n", 171 | " max_bin=max_bin,\n", 172 | " device=\"cuda\",\n", 173 | " verbose=-1)\n", 174 | " scores = cross_val_score(model, X_train, y_train, scoring=\"f1_macro\")\n", 175 | " return scores.mean()\n", 176 | "\n", 177 | " return _objective" 178 | ], 179 | "metadata": { 180 | "collapsed": false, 181 | "ExecuteTime": { 182 | "end_time": "2023-08-06T14:36:27.962810Z", 183 | "start_time": "2023-08-06T14:36:27.959950500Z" 184 | } 185 | } 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "outputs": [ 191 | { 192 | "name": "stderr", 193 | "output_type": "stream", 194 | "text": [ 195 | "[I 2023-08-06
14:36:28,978] A new study created in memory with name: no-name-9230bbce-53d5-4404-9c8a-2f87baec52d3\n" 196 | ] 197 | } 198 | ], 199 | "source": [ 200 | "sampler = optuna.samplers.TPESampler()\n", 201 | "pruner = optuna.pruners.HyperbandPruner(\n", 202 | " min_resource=10, max_resource=400, reduction_factor=3)\n", 203 | "\n", 204 | "study = optuna.create_study(\n", 205 | " direction='maximize', sampler=sampler,\n", 206 | " pruner=pruner\n", 207 | ")\n", 208 | "study.optimize(objective(), n_trials=10, gc_after_trial=True, n_jobs=1)" 209 | ], 210 | "metadata": { 211 | "collapsed": false, 212 | "is_executing": true, 213 | "ExecuteTime": { 214 | "start_time": "2023-08-06T14:36:29.811456Z" 215 | } 216 | } 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "outputs": [], 222 | "source": [], 223 | "metadata": { 224 | "collapsed": false 225 | } 226 | } 227 | ], 228 | "metadata": { 229 | "kernelspec": { 230 | "display_name": "Python 3", 231 | "language": "python", 232 | "name": "python3" 233 | }, 234 | "language_info": { 235 | "codemirror_mode": { 236 | "name": "ipython", 237 | "version": 2 238 | }, 239 | "file_extension": ".py", 240 | "mimetype": "text/x-python", 241 | "name": "python", 242 | "nbconvert_exporter": "python", 243 | "pygments_lexer": "ipython2", 244 | "version": "2.7.6" 245 | } 246 | }, 247 | "nbformat": 4, 248 | "nbformat_minor": 0 249 | } 250 | -------------------------------------------------------------------------------- /chapter-11/setup-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "from sklearn import datasets" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "outputs": [], 20 | "source": [ 21 | "dataset = 
datasets.fetch_covtype()\n", 22 | "result = np.column_stack((dataset.data, dataset.target))" 23 | ], 24 | "metadata": { 25 | "collapsed": false 26 | } 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "outputs": [], 32 | "source": [ 33 | "df = pd.DataFrame(result)\n", 34 | "df.to_csv(\"covtype/covtype.csv\", index=False)" 35 | ], 36 | "metadata": { 37 | "collapsed": false 38 | } 39 | } 40 | ], 41 | "metadata": { 42 | "kernelspec": { 43 | "display_name": "Python 3", 44 | "language": "python", 45 | "name": "python3" 46 | }, 47 | "language_info": { 48 | "codemirror_mode": { 49 | "name": "ipython", 50 | "version": 2 51 | }, 52 | "file_extension": ".py", 53 | "mimetype": "text/x-python", 54 | "name": "python", 55 | "nbconvert_exporter": "python", 56 | "pygments_lexer": "ipython2", 57 | "version": "2.7.6" 58 | } 59 | }, 60 | "nbformat": 4, 61 | "nbformat_minor": 0 62 | } 63 | -------------------------------------------------------------------------------- /chapter-2/.ipynb_checkpoints/gradient-descent-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "c28010a9-88d4-408d-a41f-d9a882eb16ed", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "from matplotlib import pyplot as plt\n", 12 | "\n", 13 | "import seaborn as sns\n", 14 | "sns.set_theme(style=\"white\", palette='bright')" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "807e14cb-d251-46d5-92e6-7eedce21c80d", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "f = lambda x: np.sin(x) * x * x + np.cos(x) * np.cos(x)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "8882f5b0-0e3a-4966-aec5-2157ec6ca030", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "X = np.arange(1.9, 6.6, 0.01)" 35 | ] 36 | }, 37 | { 38 | 
"cell_type": "code", 39 | "execution_count": null, 40 | "id": "4c391586-2aec-43b3-9cc7-0258773a00e3", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "Y = f(X)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "id": "16389cf3-3a41-4522-9485-607463d61167", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "GD_X = np.array([3, 3.5, 4.0, 4.5, 5])\n", 55 | "GD_Y = f(GD_X)\n", 56 | "GD_labels = ['x_0', 'x_1', 'x_2', 'x_3', 'x_4']" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "id": "57376ee5-0389-4d79-a288-be80814981c8", 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "image/png": "\n", 68 | "text/plain": [ 69 | "
" 70 | ] 71 | }, 72 | "metadata": {}, 73 | "output_type": "display_data" 74 | } 75 | ], 76 | "source": [ 77 | "sns.lineplot(x=X, y=Y)\n", 78 | "plt.plot(GD_X, GD_Y, 'o:', label='gradient descent steps')\n", 79 | "plt.legend()\n", 80 | "plt.ylim([-28, 10])\n", 81 | "\n", 82 | "for i, label in enumerate(GD_labels):\n", 83 | " plt.annotate(f'${label}$', (GD_X[i] - 0.1, GD_Y[i] - 2.5))\n", 84 | "plt.savefig(\"figures/ch2_gradient_descent_example.png\", dpi=600)" 85 | ] 86 | } 87 | ], 88 | "metadata": { 89 | "kernelspec": { 90 | "display_name": "Python 3 (ipykernel)", 91 | "language": "python", 92 | "name": "python3" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 5 97 | } 98 | -------------------------------------------------------------------------------- /chapter-2/.ipynb_checkpoints/random-forest-regression-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "22f220f8-e59d-4438-b361-67e05936e1c3", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "\n", 13 | "from matplotlib import pyplot as plt\n", 14 | "\n", 15 | "from sklearn import datasets\n", 16 | "from sklearn.tree import DecisionTreeClassifier\n", 17 | "from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\n", 18 | "from sklearn.model_selection import train_test_split\n", 19 | "from sklearn.metrics import f1_score" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "id": "b4b32741-ae12-4244-b38f-df57701adf69", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "dataset = datasets.fetch_covtype()\n", 30 | "\n", 31 | "X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, random_state=179)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": 
"88c0705b-b87d-496e-91b0-8866960d22b2", 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "data": { 42 | "text/plain": [ 43 | "0.8997162461938978" 44 | ] 45 | }, 46 | "execution_count": null, 47 | "metadata": {}, 48 | "output_type": "execute_result" 49 | } 50 | ], 51 | "source": [ 52 | "tree = DecisionTreeClassifier(random_state=179)\n", 53 | "tree = tree.fit(X_train, y_train)\n", 54 | "\n", 55 | "f1_score(y_test, tree.predict(X_test), average=\"macro\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "9ea70ab4-42fc-4ce9-98fa-107eefec28fe", 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "0.9366622376129925" 68 | ] 69 | }, 70 | "execution_count": null, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "0.9366622376129925" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "id": "f7c45eac-5c49-4b7b-b19e-29beb911aa08", 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "0.92423409561065" 89 | ] 90 | }, 91 | "execution_count": null, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "forest = RandomForestClassifier(random_state=179)\n", 98 | "forest = forest.fit(X_train, y_train)\n", 99 | "\n", 100 | "f1_score(y_test, forest.predict(X_test), average=\"macro\")" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "id": "b1aa0fd7-e8e3-4e46-915b-836a690675ef", 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "data": { 111 | "text/plain": [ 112 | "0.9515810344708887" 113 | ] 114 | }, 115 | "execution_count": null, 116 | "metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "extra_tree = ExtraTreesClassifier(random_state=179)\n", 122 | "extra_tree = extra_tree.fit(X_train, y_train)\n", 123 | "\n", 124 | "f1_score(y_test, extra_tree.predict(X_test), 
average=\"micro\")" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "id": "e2b3e26c-ebf2-4b31-a4ee-45d1ced52b83", 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "from lightgbm import LGBMClassifier" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "324ceba3-9b6b-40fa-9c9b-f5f1c2ff3d07", 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "0.9130645600719903" 147 | ] 148 | }, 149 | "execution_count": null, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "lgbm = LGBMClassifier(learning_rate=0.07, num_leaves=120, max_depth=22, n_estimators=120)\n", 156 | "lgbm = lgbm.fit(X_train, y_train)\n", 157 | "\n", 158 | "f1_score(y_test, lgbm.predict(X_test), average=\"macro\")" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "id": "283412af-afa9-405d-944b-d0f5a2b65e9a", 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "from sklearn.ensemble import GradientBoostingClassifier" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "id": "eee94943-3a48-40a0-93f6-db33820142f6", 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "data": { 179 | "text/plain": [ 180 | "0.7727207011214916" 181 | ] 182 | }, 183 | "execution_count": null, 184 | "metadata": {}, 185 | "output_type": "execute_result" 186 | } 187 | ], 188 | "source": [ 189 | "booster = GradientBoostingClassifier(random_state=179)\n", 190 | "booster = booster.fit(X_train, y_train)\n", 191 | "\n", 192 | "f1_score(y_test, booster.predict(X_test), average=\"micro\")" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "id": "704c473f-9774-45d0-8b6b-3d516db1b2e4", 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [] 202 | } 203 | ], 204 | "metadata": { 205 | "kernelspec": { 206 | 
"display_name": "Python 3 (ipykernel)", 207 | "language": "python", 208 | "name": "python3" 209 | } 210 | }, 211 | "nbformat": 4, 212 | "nbformat_minor": 5 213 | } 214 | -------------------------------------------------------------------------------- /chapter-2/random-forest-regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "22f220f8-e59d-4438-b361-67e05936e1c3", 7 | "metadata": { 8 | "ExecuteTime": { 9 | "start_time": "2023-03-06T13:05:13.476966Z", 10 | "end_time": "2023-03-06T13:05:15.060203Z" 11 | } 12 | }, 13 | "outputs": [], 14 | "source": [ 15 | "import numpy as np\n", 16 | "import pandas as pd\n", 17 | "\n", 18 | "from matplotlib import pyplot as plt\n", 19 | "\n", 20 | "from sklearn import datasets\n", 21 | "from sklearn.tree import DecisionTreeClassifier\n", 22 | "from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\n", 23 | "from sklearn.model_selection import train_test_split\n", 24 | "from sklearn.metrics import f1_score" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "id": "b4b32741-ae12-4244-b38f-df57701adf69", 31 | "metadata": { 32 | "ExecuteTime": { 33 | "start_time": "2023-03-06T13:05:15.059693Z", 34 | "end_time": "2023-03-06T13:05:15.905859Z" 35 | } 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "dataset = datasets.fetch_covtype()\n", 40 | "\n", 41 | "X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, random_state=179)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "id": "88c0705b-b87d-496e-91b0-8866960d22b2", 48 | "metadata": { 49 | "ExecuteTime": { 50 | "start_time": "2023-03-06T13:05:15.905859Z", 51 | "end_time": "2023-03-06T13:05:23.646598Z" 52 | } 53 | }, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "0.8916521618124621\n" 60 | ] 61 | } 
62 | ], 63 | "source": [ 64 | "tree = DecisionTreeClassifier(random_state=179, min_samples_leaf=3, min_samples_split=6)\n", 65 | "tree = tree.fit(X_train, y_train)\n", 66 | "\n", 67 | "print(f1_score(y_test, tree.predict(X_test), average=\"macro\"))" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "id": "f7c45eac-5c49-4b7b-b19e-29beb911aa08", 74 | "metadata": { 75 | "ExecuteTime": { 76 | "start_time": "2023-03-06T13:05:23.646598Z", 77 | "end_time": "2023-03-06T13:08:09.308924Z" 78 | } 79 | }, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "0.9209269748875469\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "forest = RandomForestClassifier(random_state=179, min_samples_leaf=1, min_samples_split=2, n_estimators=140)\n", 91 | "forest = forest.fit(X_train, y_train)\n", 92 | "\n", 93 | "print(f1_score(y_test, forest.predict(X_test), average=\"macro\"))" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 5, 99 | "id": "b1aa0fd7-e8e3-4e46-915b-836a690675ef", 100 | "metadata": { 101 | "ExecuteTime": { 102 | "start_time": "2023-03-06T13:08:09.308924Z", 103 | "end_time": "2023-03-06T13:11:36.781035Z" 104 | } 105 | }, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "0.9230624992615903\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "extra_tree = ExtraTreesClassifier(random_state=179, min_samples_leaf=1, min_samples_split=2, n_estimators=180)\n", 117 | "extra_tree = extra_tree.fit(X_train, y_train)\n", 118 | "\n", 119 | "print(f1_score(y_test, extra_tree.predict(X_test), average=\"macro\"))" 120 | ] 121 | } 122 | ], 123 | "metadata": { 124 | "kernelspec": { 125 | "display_name": "lgbmenv", 126 | "language": "python", 127 | "name": "python3" 128 | }, 129 | "language_info": { 130 | "codemirror_mode": { 131 | "name": "ipython", 132 | "version": 3 133 | }, 134 | "file_extension": ".py", 135 | "mimetype": "text/x-python", 
136 | "name": "python", 137 | "nbconvert_exporter": "python", 138 | "pygments_lexer": "ipython3", 139 | "version": "3.10.9" 140 | }, 141 | "vscode": { 142 | "interpreter": { 143 | "hash": "5e690b3d8af96fccddb0bcea98968490c59678da2298e058666d3fe70333b9d4" 144 | } 145 | } 146 | }, 147 | "nbformat": 4, 148 | "nbformat_minor": 5 149 | } 150 | -------------------------------------------------------------------------------- /chapter-4/lightgbm-credit-card.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "start_time": "2023-04-08T22:16:18.375455Z", 9 | "end_time": "2023-04-08T22:16:19.569213Z" 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "import lightgbm as lgb\n", 15 | "import numpy as np\n", 16 | "import pandas as pd\n", 17 | "from sklearn.model_selection import cross_val_score\n", 18 | "import time" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "ExecuteTime": { 26 | "start_time": "2023-04-08T22:16:19.631505Z", 27 | "end_time": "2023-04-08T22:16:22.920250Z" 28 | } 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "df = pd.read_csv(\"cc-fraud/creditcard.csv\")" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": { 39 | "ExecuteTime": { 40 | "start_time": "2023-04-08T22:16:22.919785Z", 41 | "end_time": "2023-04-08T22:16:23.214484Z" 42 | } 43 | }, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/plain": " Time V1 V2 V3 V4 V5 V6 \\\n223278 143322.0 1.743591 -1.276207 -1.703568 -0.551499 0.182804 0.993246 \n145565 87054.0 -1.205241 -0.130198 -1.346397 -2.372208 1.553399 -1.777034 \n36813 38696.0 -2.109662 -1.044721 2.315128 -0.269519 0.918316 -0.860468 \n98424 66657.0 -0.858823 0.928831 1.638651 -0.535640 0.327566 -0.859982 \n224727 143934.0 -5.718998 -5.788360 -0.220737 0.324390 5.280846 -3.567975 \n76439 
56571.0 -0.693223 0.624702 1.745828 -0.819360 0.151565 -0.084684 \n10282 16179.0 1.162990 0.585682 1.081202 2.877693 -0.226887 -0.104174 \n95398 65286.0 -0.047052 -0.246928 1.783663 -1.571497 -1.424880 -0.585279 \n248080 153789.0 0.035700 1.267053 -1.059015 -0.022929 0.270340 -1.339248 \n232733 147286.0 -3.317206 -4.081467 0.490204 4.132857 -0.827638 1.393303 \n\n V7 V8 V9 ... V21 V22 V23 \\\n223278 -0.401833 0.183088 -0.549561 ... 0.135117 0.802800 -0.115902 \n145565 0.637386 0.331383 0.050536 ... 0.358353 0.678112 -0.158165 \n36813 0.287132 -0.181102 0.104975 ... -0.103089 0.119249 0.875925 \n98424 1.212978 -0.123662 -1.162586 ... 0.091992 0.101818 -0.271478 \n224727 -2.146524 0.656593 -0.502391 ... 0.815861 -0.237749 -0.549783 \n76439 0.731768 0.076800 -0.252783 ... 0.032285 0.217722 -0.321889 \n10282 -0.112787 -0.099170 0.954979 ... -0.440766 -0.893027 0.087746 \n95398 -0.609930 -0.156651 -2.252177 ... 0.088517 0.687270 -0.063328 \n248080 0.820101 -0.044629 -0.328963 ... 0.355169 1.078043 -0.055421 \n232733 2.091947 0.360839 -1.259604 ... 0.990699 0.310767 2.987731 \n\n V24 V25 V26 V27 V28 Amount Class \n223278 -1.610128 0.005429 0.076368 0.029846 -0.053789 160.00 0 \n145565 0.760901 -0.391271 0.480153 0.212302 0.036251 1.00 0 \n36813 -0.110959 0.513343 0.351808 -0.120387 -0.247729 48.44 0 \n98424 0.572436 0.558861 0.249717 -0.050398 0.044221 42.81 0 \n224727 0.165590 1.025843 0.767957 -0.111164 -1.051211 59.00 0 \n76439 0.094126 0.317576 0.512195 0.012048 0.071173 29.75 0 \n10282 0.258407 0.372496 -0.216396 -0.015326 0.020287 3.80 0 \n95398 0.399094 -0.631889 -0.129960 0.095469 0.168752 15.80 0 \n248080 0.018688 -0.218777 -0.168232 -0.171916 -0.059910 42.81 0 \n232733 -0.625436 -0.939505 -0.298163 -0.165743 0.371756 1052.90 0 \n\n[10 rows x 31 columns]", 48 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TimeV1V2V3V4V5V6V7V8V9...V21V22V23V24V25V26V27V28AmountClass
223278143322.01.743591-1.276207-1.703568-0.5514990.1828040.993246-0.4018330.183088-0.549561...0.1351170.802800-0.115902-1.6101280.0054290.0763680.029846-0.053789160.000
14556587054.0-1.205241-0.130198-1.346397-2.3722081.553399-1.7770340.6373860.3313830.050536...0.3583530.678112-0.1581650.760901-0.3912710.4801530.2123020.0362511.000
3681338696.0-2.109662-1.0447212.315128-0.2695190.918316-0.8604680.287132-0.1811020.104975...-0.1030890.1192490.875925-0.1109590.5133430.351808-0.120387-0.24772948.440
9842466657.0-0.8588230.9288311.638651-0.5356400.327566-0.8599821.212978-0.123662-1.162586...0.0919920.101818-0.2714780.5724360.5588610.249717-0.0503980.04422142.810
224727143934.0-5.718998-5.788360-0.2207370.3243905.280846-3.567975-2.1465240.656593-0.502391...0.815861-0.237749-0.5497830.1655901.0258430.767957-0.111164-1.05121159.000
7643956571.0-0.6932230.6247021.745828-0.8193600.151565-0.0846840.7317680.076800-0.252783...0.0322850.217722-0.3218890.0941260.3175760.5121950.0120480.07117329.750
1028216179.01.1629900.5856821.0812022.877693-0.226887-0.104174-0.112787-0.0991700.954979...-0.440766-0.8930270.0877460.2584070.372496-0.216396-0.0153260.0202873.800
9539865286.0-0.047052-0.2469281.783663-1.571497-1.424880-0.585279-0.609930-0.156651-2.252177...0.0885170.687270-0.0633280.399094-0.631889-0.1299600.0954690.16875215.800
248080153789.00.0357001.267053-1.059015-0.0229290.270340-1.3392480.820101-0.044629-0.328963...0.3551691.078043-0.0554210.018688-0.218777-0.168232-0.171916-0.05991042.810
232733147286.0-3.317206-4.0814670.4902044.132857-0.8276381.3933032.0919470.360839-1.259604...0.9906990.3107672.987731-0.625436-0.939505-0.298163-0.1657430.3717561052.900
\n

10 rows × 31 columns

\n
" 49 | }, 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "df.sample(10)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": "577.8760162601625" 66 | }, 67 | "execution_count": 4, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "counts = np.bincount(df[\"Class\"])\n", 74 | "class_weight = {\n", 75 | " 0: counts[0] / df.shape[0],\n", 76 | " 1: counts[1] / df.shape[0]\n", 77 | "}\n", 78 | "scale_pos_weight = class_weight[0]/class_weight[1]\n", 79 | "scale_pos_weight" 80 | ], 81 | "metadata": { 82 | "collapsed": false, 83 | "ExecuteTime": { 84 | "start_time": "2023-04-08T22:16:23.190792Z", 85 | "end_time": "2023-04-08T22:16:23.215988Z" 86 | } 87 | } 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 5, 92 | "metadata": { 93 | "ExecuteTime": { 94 | "start_time": "2023-04-08T22:16:23.192785Z", 95 | "end_time": "2023-04-08T22:16:23.216486Z" 96 | } 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "X = df.drop(columns=[\"Class\"], axis=1)\n", 101 | "y = df[\"Class\"]" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 6, 107 | "metadata": { 108 | "ExecuteTime": { 109 | "start_time": "2023-04-08T22:16:23.196787Z", 110 | "end_time": "2023-04-08T22:16:23.448461Z" 111 | } 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "model = lgb.LGBMClassifier(force_row_wise=True, boosting_type=\"dart\", learning_rate=0.0023, max_bin=384, n_estimators=300, scale_pos_weight=scale_pos_weight, verbose=-1)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 7, 121 | "metadata": { 122 | "ExecuteTime": { 123 | "start_time": "2023-04-08T22:16:23.402968Z", 124 | "end_time": "2023-04-08T22:18:16.383278Z" 125 | } 126 | }, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | 
"Mean F1-score: 0.7968539399986524\n", 133 | "Training time: 113.36332678794861s\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "start = time.time()\n", 139 | "scores = cross_val_score(model, X, y, scoring=\"f1_macro\")\n", 140 | "print(f\"Mean F1-score: {scores.mean()}\")\n", 141 | "end = time.time()\n", 142 | "print(f\"Training time: {end - start}s\")" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 8, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "Mean F1-score: 0.9988588729715675\n", 154 | "Training time: 111.68864893913269s\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "start = time.time()\n", 160 | "scores = cross_val_score(model, X, y, scoring=\"accuracy\")\n", 161 | "print(f\"Mean F1-score: {scores.mean()}\")\n", 162 | "end = time.time()\n", 163 | "print(f\"Training time: {end - start}s\")" 164 | ], 165 | "metadata": { 166 | "collapsed": false, 167 | "ExecuteTime": { 168 | "start_time": "2023-04-08T22:18:16.382781Z", 169 | "end_time": "2023-04-08T22:20:08.085813Z" 170 | } 171 | } 172 | } 173 | ], 174 | "metadata": { 175 | "kernelspec": { 176 | "display_name": "Python 3 (ipykernel)", 177 | "language": "python", 178 | "name": "python3" 179 | }, 180 | "orig_nbformat": 4 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 2 184 | } 185 | -------------------------------------------------------------------------------- /chapter-4/tabtransformer-credit-card.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 20, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "start_time": "2023-04-09T12:28:48.799187Z", 9 | "end_time": "2023-04-09T12:28:48.800699Z" 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "import time\n", 15 | "\n", 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import tensorflow as tf\n", 19 | "import tensorflow_addons as 
tfa\n", 20 | "from keras import backend as K\n", 21 | "from sklearn.model_selection import StratifiedKFold\n", 22 | "from tensorflow import keras\n", 23 | "from tensorflow.keras import layers" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 21, 29 | "metadata": { 30 | "ExecuteTime": { 31 | "start_time": "2023-04-09T12:28:48.802210Z", 32 | "end_time": "2023-04-09T12:28:50.632021Z" 33 | } 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "df = pd.read_csv(\"cc-fraud/creditcard.csv\")" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 22, 43 | "metadata": { 44 | "ExecuteTime": { 45 | "start_time": "2023-04-09T12:28:50.633027Z", 46 | "end_time": "2023-04-09T12:28:50.870448Z" 47 | } 48 | }, 49 | "outputs": [ 50 | { 51 | "data": { 52 | "text/plain": " Time V1 V2 V3 V4 V5 V6 \\\n96419 65760.0 -1.234665 0.804234 0.623907 -1.541686 0.517618 -0.205567 \n53817 46159.0 -0.601198 -0.568666 2.048295 0.940042 -0.838668 0.841241 \n220109 142029.0 -1.079029 0.394553 0.983123 0.886915 -0.532981 0.608702 \n162216 114938.0 0.338518 0.864410 0.506710 0.971164 0.489854 -0.512320 \n162222 114940.0 -0.706897 1.438612 -0.273680 2.031884 4.239727 4.180976 \n28559 35053.0 1.045957 0.014432 0.374282 1.249220 0.022520 0.600027 \n279720 169053.0 2.312026 -1.430082 -1.208911 -1.619857 -1.075136 -0.531039 \n242816 151670.0 1.863345 0.203137 0.048925 3.740346 -0.024943 0.766791 \n112720 72776.0 1.120144 -0.043233 0.719266 0.995361 -0.747237 -0.753670 \n21859 31898.0 -0.524524 0.372001 -0.312927 -1.122526 -2.253793 1.222536 \n\n V7 V8 V9 ... V21 V22 V23 \\\n96419 1.044546 -0.332273 0.546563 ... -0.235062 -0.132539 -0.225426 \n53817 0.333479 0.002445 -1.883368 ... -0.019566 0.221398 0.338976 \n220109 0.469422 0.581807 -0.683126 ... 0.482845 0.946737 0.126328 \n162216 0.964169 -0.486845 -0.512023 ... 0.346631 1.357021 -0.207703 \n162222 1.444147 0.314025 -2.493999 ... 0.016436 0.115246 -0.604038 \n28559 -0.131237 0.299514 0.114868 ... 
-0.018569 0.117538 -0.051174 \n279720 -1.106151 -0.138716 -1.059177 ... -0.201595 -0.163631 0.232024 \n242816 -0.578425 0.221620 -0.236275 ... 0.159962 0.504248 0.188202 \n112720 0.006921 -0.197221 0.428150 ... -0.028726 0.120239 -0.098999 \n21859 1.625274 -1.080441 1.068322 ... 0.652621 0.073774 0.270566 \n\n V24 V25 V26 V27 V28 Amount Class \n96419 -0.689264 -0.081429 0.805624 0.036428 0.065066 79.99 0 \n53817 -0.020215 0.178129 -0.035210 -0.159316 -0.224414 221.00 0 \n220109 -0.511093 0.066667 -0.367663 0.007497 0.063784 200.00 0 \n162216 1.226899 -0.486290 0.169571 0.071281 -0.009638 15.62 0 \n162222 0.703100 0.887917 0.317952 -0.592000 -0.378819 25.51 0 \n28559 -0.294268 0.531973 -0.299813 0.045950 0.003886 24.44 0 \n279720 0.425256 -0.149499 -0.175278 -0.005617 -0.048046 25.00 0 \n242816 0.548452 -0.200258 0.004093 0.007896 -0.023884 19.00 0 \n112720 0.804608 0.562700 0.410024 -0.016464 0.023895 49.43 0 \n21859 -0.263758 -1.405662 -0.103877 0.239186 0.113603 403.00 0 \n\n[10 rows x 31 columns]", 53 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TimeV1V2V3V4V5V6V7V8V9...V21V22V23V24V25V26V27V28AmountClass
9641965760.0-1.2346650.8042340.623907-1.5416860.517618-0.2055671.044546-0.3322730.546563...-0.235062-0.132539-0.225426-0.689264-0.0814290.8056240.0364280.06506679.990
5381746159.0-0.601198-0.5686662.0482950.940042-0.8386680.8412410.3334790.002445-1.883368...-0.0195660.2213980.338976-0.0202150.178129-0.035210-0.159316-0.224414221.000
220109142029.0-1.0790290.3945530.9831230.886915-0.5329810.6087020.4694220.581807-0.683126...0.4828450.9467370.126328-0.5110930.066667-0.3676630.0074970.063784200.000
162216114938.00.3385180.8644100.5067100.9711640.489854-0.5123200.964169-0.486845-0.512023...0.3466311.357021-0.2077031.226899-0.4862900.1695710.071281-0.00963815.620
162222114940.0-0.7068971.438612-0.2736802.0318844.2397274.1809761.4441470.314025-2.493999...0.0164360.115246-0.6040380.7031000.8879170.317952-0.592000-0.37881925.510
2855935053.01.0459570.0144320.3742821.2492200.0225200.600027-0.1312370.2995140.114868...-0.0185690.117538-0.051174-0.2942680.531973-0.2998130.0459500.00388624.440
279720169053.02.312026-1.430082-1.208911-1.619857-1.075136-0.531039-1.106151-0.138716-1.059177...-0.201595-0.1636310.2320240.425256-0.149499-0.175278-0.005617-0.04804625.000
242816151670.01.8633450.2031370.0489253.740346-0.0249430.766791-0.5784250.221620-0.236275...0.1599620.5042480.1882020.548452-0.2002580.0040930.007896-0.02388419.000
11272072776.01.120144-0.0432330.7192660.995361-0.747237-0.7536700.006921-0.1972210.428150...-0.0287260.120239-0.0989990.8046080.5627000.410024-0.0164640.02389549.430
2185931898.0-0.5245240.372001-0.312927-1.122526-2.2537931.2225361.625274-1.0804411.068322...0.6526210.0737740.270566-0.263758-1.405662-0.1038770.2391860.113603403.000
\n

10 rows × 31 columns

\n
" 54 | }, 55 | "execution_count": 22, 56 | "metadata": {}, 57 | "output_type": "execute_result" 58 | } 59 | ], 60 | "source": [ 61 | "df.sample(10)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 23, 67 | "metadata": { 68 | "ExecuteTime": { 69 | "start_time": "2023-04-09T12:28:50.869444Z", 70 | "end_time": "2023-04-09T12:28:50.870953Z" 71 | } 72 | }, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "Number of positive samples in training data: 492 (0.17% of total)\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "counts = np.bincount(df[\"Class\"])\n", 84 | "print(\n", 85 | " \"Number of positive samples in training data: {} ({:.2f}% of total)\".format(\n", 86 | " counts[1], 100 * float(counts[1]) / len(df[\"Class\"])\n", 87 | " )\n", 88 | ")\n", 89 | "\n", 90 | "weight_for_0 = 1.0 / counts[0]\n", 91 | "weight_for_1 = 1.0 / counts[1]" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 24, 97 | "metadata": { 98 | "ExecuteTime": { 99 | "start_time": "2023-04-09T12:28:50.870953Z", 100 | "end_time": "2023-04-09T12:28:50.875921Z" 101 | } 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "df[\"Class\"] = df[\"Class\"].map({\n", 106 | " 0: \"Non-fraud\",\n", 107 | " 1: \"Fraud\",\n", 108 | "})" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 25, 114 | "metadata": { 115 | "ExecuteTime": { 116 | "start_time": "2023-04-09T12:28:50.875921Z", 117 | "end_time": "2023-04-09T12:28:50.888579Z" 118 | } 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "class_weight = {0: weight_for_0, 1: weight_for_1}" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 26, 128 | "metadata": { 129 | "ExecuteTime": { 130 | "start_time": "2023-04-09T12:28:50.889079Z", 131 | "end_time": "2023-04-09T12:28:50.890078Z" 132 | } 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "NUMERIC_FEATURE_NAMES = sorted(filter(lambda v: v != 'Class', 
list(df.columns.values)))\n", 137 | "CATEGORICAL_FEATURES_WITH_VOCABULARY = {}\n", 138 | "CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())\n", 139 | "FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES\n", 140 | "COLUMN_DEFAULTS = [\n", 141 | " [0.0] if feature_name in NUMERIC_FEATURE_NAMES else [\"NA\"]\n", 142 | " for feature_name in df.columns.values\n", 143 | "]\n", 144 | "TARGET_FEATURE_NAME = \"Class\"\n", 145 | "TARGET_LABELS = [\"Non-fraud\", \"Fraud\"]" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 27, 151 | "metadata": { 152 | "ExecuteTime": { 153 | "start_time": "2023-04-09T12:28:50.889079Z", 154 | "end_time": "2023-04-09T12:28:50.891587Z" 155 | } 156 | }, 157 | "outputs": [], 158 | "source": [ 159 | "LEARNING_RATE = 0.00007\n", 160 | "WEIGHT_DECAY = 0.0000178\n", 161 | "DROPOUT_RATE = 0.058\n", 162 | "BATCH_SIZE = 265\n", 163 | "NUM_EPOCHS = 15\n", 164 | "\n", 165 | "NUM_TRANSFORMER_BLOCKS = 3\n", 166 | "NUM_HEADS = 4\n", 167 | "EMBEDDING_DIMS = 16\n", 168 | "MLP_HIDDEN_UNITS = [\n", 169 | " 256, 256\n", 170 | "]" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 28, 176 | "metadata": { 177 | "ExecuteTime": { 178 | "start_time": "2023-04-09T12:28:50.891587Z", 179 | "end_time": "2023-04-09T12:28:50.892593Z" 180 | } 181 | }, 182 | "outputs": [ 183 | { 184 | "name": "stderr", 185 | "output_type": "stream", 186 | "text": [ 187 | "/usr/local/lib/python3.9/dist-packages/numpy/core/numeric.py:2463: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n", 188 | " return bool(asarray(a1 == a2).all())\n" 189 | ] 190 | } 191 | ], 192 | "source": [ 193 | "target_label_lookup = layers.StringLookup(\n", 194 | " vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0\n", 195 | ")\n", 196 | "\n", 197 | "\n", 198 | "def prepare_example(features, target):\n", 199 | " target_index = 
target_label_lookup(target)\n", 200 | " return features, target_index\n", 201 | "\n", 202 | "\n", 203 | "def get_dataset_from_csv(csv_file_path, batch_size=128, shuffle=False):\n", 204 | " dataset = tf.data.experimental.make_csv_dataset(\n", 205 | " csv_file_path,\n", 206 | " batch_size=batch_size,\n", 207 | " column_names=list(df.columns.values),\n", 208 | " column_defaults=COLUMN_DEFAULTS,\n", 209 | " label_name=TARGET_FEATURE_NAME,\n", 210 | " num_epochs=1,\n", 211 | " header=False,\n", 212 | " na_value=\"?\",\n", 213 | " shuffle=shuffle,\n", 214 | " ).map(prepare_example, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)\n", 215 | " return dataset.cache()" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 29, 221 | "metadata": { 222 | "ExecuteTime": { 223 | "start_time": "2023-04-09T12:28:50.892593Z", 224 | "end_time": "2023-04-09T12:28:50.893101Z" 225 | } 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "def create_model_inputs():\n", 230 | " inputs = {}\n", 231 | " for feature_name in FEATURE_NAMES:\n", 232 | " if feature_name in NUMERIC_FEATURE_NAMES:\n", 233 | " inputs[feature_name] = layers.Input(\n", 234 | " name=feature_name, shape=(), dtype=tf.float32\n", 235 | " )\n", 236 | " else:\n", 237 | " inputs[feature_name] = layers.Input(\n", 238 | " name=feature_name, shape=(), dtype=tf.string\n", 239 | " )\n", 240 | " return inputs" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 30, 246 | "metadata": { 247 | "ExecuteTime": { 248 | "start_time": "2023-04-09T12:28:50.893101Z", 249 | "end_time": "2023-04-09T12:28:51.104283Z" 250 | } 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "def encode_inputs(inputs, embedding_dims):\n", 255 | " encoded_categorical_feature_list = []\n", 256 | " numerical_feature_list = []\n", 257 | "\n", 258 | " for feature_name in inputs:\n", 259 | " if feature_name in CATEGORICAL_FEATURE_NAMES:\n", 260 | " vocabulary = 
CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]\n", 261 | " lookup = layers.StringLookup(\n", 262 | " vocabulary=vocabulary,\n", 263 | " mask_token=None,\n", 264 | " num_oov_indices=0,\n", 265 | " output_mode=\"int\",\n", 266 | " )\n", 267 | " encoded_feature = lookup(inputs[feature_name])\n", 268 | "\n", 269 | " embedding = layers.Embedding(\n", 270 | " input_dim=len(vocabulary), output_dim=embedding_dims\n", 271 | " )\n", 272 | " encoded_categorical_feature = embedding(encoded_feature)\n", 273 | " encoded_categorical_feature_list.append(encoded_categorical_feature)\n", 274 | " else:\n", 275 | " numerical_feature = tf.expand_dims(inputs[feature_name], -1)\n", 276 | " numerical_feature_list.append(numerical_feature)\n", 277 | "\n", 278 | " return encoded_categorical_feature_list, numerical_feature_list" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 31, 284 | "metadata": { 285 | "ExecuteTime": { 286 | "start_time": "2023-04-09T12:28:51.104283Z", 287 | "end_time": "2023-04-09T12:28:51.106272Z" 288 | } 289 | }, 290 | "outputs": [], 291 | "source": [ 292 | "def create_mlp(hidden_units, dropout_rate, activation, normalization_layer, name=None):\n", 293 | " mlp_layers = []\n", 294 | " for units in hidden_units:\n", 295 | " mlp_layers.append(normalization_layer),\n", 296 | " mlp_layers.append(layers.Dense(units, activation=activation))\n", 297 | " mlp_layers.append(layers.Dropout(dropout_rate))\n", 298 | "\n", 299 | " return keras.Sequential(mlp_layers, name=name)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 32, 305 | "metadata": { 306 | "ExecuteTime": { 307 | "start_time": "2023-04-09T12:28:51.108270Z", 308 | "end_time": "2023-04-09T12:28:51.108270Z" 309 | } 310 | }, 311 | "outputs": [], 312 | "source": [ 313 | "def create_tabtransformer_classifier(\n", 314 | " embedding_dims,\n", 315 | " mlp_hidden_units,\n", 316 | " dropout_rate,\n", 317 | "):\n", 318 | " inputs = create_model_inputs()\n", 319 | " 
encoded_categorical_feature_list, numerical_feature_list = encode_inputs(\n", 320 | " inputs, embedding_dims\n", 321 | " )\n", 322 | " numerical_features = layers.concatenate(numerical_feature_list)\n", 323 | " numerical_features = layers.LayerNormalization(epsilon=1e-6)(numerical_features)\n", 324 | " features = layers.concatenate([numerical_features])\n", 325 | "\n", 326 | " features = create_mlp(\n", 327 | " hidden_units=mlp_hidden_units,\n", 328 | " dropout_rate=dropout_rate,\n", 329 | " activation=keras.activations.selu,\n", 330 | " normalization_layer=layers.BatchNormalization(),\n", 331 | " name=\"MLP\",\n", 332 | " )(features)\n", 333 | " outputs = layers.Dense(units=1, activation=\"sigmoid\", name=\"sigmoid\")(features)\n", 334 | " model = keras.Model(inputs=inputs, outputs=outputs)\n", 335 | " return model" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 33, 341 | "outputs": [], 342 | "source": [ 343 | "def recall_metric(y_true, y_pred):\n", 344 | " true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))\n", 345 | " possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))\n", 346 | " recall = true_positives / (possible_positives + K.epsilon())\n", 347 | " return recall\n", 348 | "\n", 349 | "\n", 350 | "def precision_metric(y_true, y_pred):\n", 351 | " true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))\n", 352 | " predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))\n", 353 | " precision = true_positives / (predicted_positives + K.epsilon())\n", 354 | " return precision\n", 355 | "\n", 356 | "\n", 357 | "def f1_metric(y_true, y_pred):\n", 358 | " precision = precision_metric(y_true, y_pred)\n", 359 | " recall = recall_metric(y_true, y_pred)\n", 360 | " return 2 * ((precision * recall) / (precision + recall + K.epsilon()))" 361 | ], 362 | "metadata": { 363 | "collapsed": false, 364 | "ExecuteTime": { 365 | "start_time": "2023-04-09T12:28:51.108777Z", 366 | "end_time": "2023-04-09T12:28:51.109775Z" 367 
| } 368 | } 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 34, 373 | "outputs": [], 374 | "source": [ 375 | "def fit_model(\n", 376 | " model,\n", 377 | " train_data_file,\n", 378 | " test_data_file,\n", 379 | " num_epochs,\n", 380 | " learning_rate,\n", 381 | " weight_decay,\n", 382 | " batch_size,\n", 383 | "):\n", 384 | " optimizer = tfa.optimizers.AdamW(\n", 385 | " learning_rate=learning_rate,\n", 386 | " weight_decay=weight_decay\n", 387 | " )\n", 388 | "\n", 389 | " model.compile(\n", 390 | " optimizer=optimizer,\n", 391 | " loss=keras.losses.BinaryCrossentropy(),\n", 392 | " metrics=[keras.metrics.BinaryAccuracy(name=\"accuracy\"),\n", 393 | " f1_metric,\n", 394 | " precision_metric,\n", 395 | " recall_metric],\n", 396 | " )\n", 397 | "\n", 398 | " train_dataset = get_dataset_from_csv(\n", 399 | " train_data_file, batch_size, shuffle=True\n", 400 | " )\n", 401 | " validation_dataset = get_dataset_from_csv(\n", 402 | " test_data_file, batch_size\n", 403 | " )\n", 404 | "\n", 405 | " callback = keras.callbacks.EarlyStopping(\n", 406 | " monitor='loss', patience=3\n", 407 | " )\n", 408 | "\n", 409 | " history = model.fit(\n", 410 | " train_dataset, epochs=num_epochs, validation_data=validation_dataset, class_weight=class_weight,\n", 411 | " callbacks=[callback], verbose=0\n", 412 | " )\n", 413 | "\n", 414 | " _, accuracy, f1, precision, recall = model.evaluate(validation_dataset, verbose=0)\n", 415 | "\n", 416 | " print(f\"Validation accuracy: {round(accuracy * 100, 2)}%\")\n", 417 | " print(f\"Validation F1: {f1}\")\n", 418 | "\n", 419 | " return f1, accuracy" 420 | ], 421 | "metadata": { 422 | "collapsed": false, 423 | "ExecuteTime": { 424 | "start_time": "2023-04-09T12:28:51.108777Z", 425 | "end_time": "2023-04-09T12:28:51.110775Z" 426 | } 427 | } 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 35, 432 | "outputs": [ 433 | { 434 | "name": "stdout", 435 | "output_type": "stream", 436 | "text": [ 437 | "Total model 
weights: 74165\n" 438 | ] 439 | } 440 | ], 441 | "source": [ 442 | "tabtransformer_model = create_tabtransformer_classifier(\n", 443 | " embedding_dims=EMBEDDING_DIMS,\n", 444 | " mlp_hidden_units=MLP_HIDDEN_UNITS,\n", 445 | " dropout_rate=DROPOUT_RATE,\n", 446 | ")\n", 447 | "\n", 448 | "print(\"Total model weights:\", tabtransformer_model.count_params())" 449 | ], 450 | "metadata": { 451 | "collapsed": false, 452 | "ExecuteTime": { 453 | "start_time": "2023-04-09T12:28:51.111776Z", 454 | "end_time": "2023-04-09T12:28:51.113282Z" 455 | } 456 | } 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 36, 461 | "outputs": [ 462 | { 463 | "name": "stdout", 464 | "output_type": "stream", 465 | "text": [ 466 | "Validation accuracy: 91.84%\n", 467 | "Validation F1: 0.026406388729810715\n", 468 | "Validation accuracy: 95.47%\n", 469 | "Validation F1: 0.053651679307222366\n", 470 | "Validation accuracy: 89.24%\n", 471 | "Validation F1: 0.033678654581308365\n", 472 | "Validation accuracy: 92.72%\n", 473 | "Validation F1: 0.06240801140666008\n", 474 | "Validation accuracy: 97.58%\n", 475 | "Validation F1: 0.0628075897693634\n" 476 | ] 477 | } 478 | ], 479 | "source": [ 480 | "f1_scores = []\n", 481 | "acc_scores = []\n", 482 | "k_fold = StratifiedKFold(n_splits=5, shuffle=True)\n", 483 | "start = time.time()\n", 484 | "X = df.drop(columns=[\"Class\"], axis=1)\n", 485 | "y = df[\"Class\"]\n", 486 | "\n", 487 | "for fold, (train_data_idx, test_data_idx) in enumerate(k_fold.split(X, y)):\n", 488 | " train_data_file = f\"cc-fraud/train_data_{fold}.csv\"\n", 489 | " test_data_file = f\"cc-fraud/test_data_{fold}.csv\"\n", 490 | "\n", 491 | " train_data = df.iloc[train_data_idx]\n", 492 | " test_data = df.iloc[test_data_idx]\n", 493 | "\n", 494 | " train_data.to_csv(train_data_file, index=False, header=False)\n", 495 | " test_data.to_csv(test_data_file, index=False, header=False)\n", 496 | "\n", 497 | " f1, accuracy = fit_model(\n", 498 | " 
model=tabtransformer_model,\n", 499 | " train_data_file=train_data_file,\n", 500 | " test_data_file=test_data_file,\n", 501 | " num_epochs=NUM_EPOCHS,\n", 502 | " learning_rate=LEARNING_RATE,\n", 503 | " weight_decay=WEIGHT_DECAY,\n", 504 | " batch_size=BATCH_SIZE,\n", 505 | " )\n", 506 | " f1_scores.append(f1)\n", 507 | " acc_scores.append(accuracy)\n", 508 | "end = time.time()" 509 | ], 510 | "metadata": { 511 | "collapsed": false, 512 | "ExecuteTime": { 513 | "start_time": "2023-04-09T12:28:51.112276Z", 514 | "end_time": "2023-04-09T12:34:17.658931Z" 515 | } 516 | } 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 39, 521 | "outputs": [ 522 | { 523 | "name": "stdout", 524 | "output_type": "stream", 525 | "text": [ 526 | "Mean F1-score: 0.04779046475887298\n", 527 | "Mean accuracy: 0.9336989402770997\n", 528 | "Training time: 326.7684164047241s\n" 529 | ] 530 | } 531 | ], 532 | "source": [ 533 | "print(f\"Mean F1-score: {sum(f1_scores) / len(f1_scores)}\")\n", 534 | "print(f\"Mean accuracy: {sum(acc_scores) / len(acc_scores)}\")\n", 535 | "print(f\"Training time: {end - start}s\")" 536 | ], 537 | "metadata": { 538 | "collapsed": false, 539 | "ExecuteTime": { 540 | "start_time": "2023-04-09T12:34:18.134567Z", 541 | "end_time": "2023-04-09T12:34:18.137659Z" 542 | } 543 | } 544 | } 545 | ], 546 | "metadata": { 547 | "kernelspec": { 548 | "display_name": "Python 3 (ipykernel)", 549 | "language": "python", 550 | "name": "python3" 551 | }, 552 | "orig_nbformat": 4 553 | }, 554 | "nbformat": 4, 555 | "nbformat_minor": 2 556 | } 557 | -------------------------------------------------------------------------------- /chapter-4/xgboost-credit-card.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "start_time": "2023-04-08T22:45:08.661035Z", 9 | "end_time": "2023-04-08T22:45:09.804059Z" 10 | } 11 | 
}, 12 | "outputs": [], 13 | "source": [ 14 | "import numpy as np\n", 15 | "import pandas as pd\n", 16 | "import xgboost as xgb\n", 17 | "from sklearn.model_selection import cross_val_score\n", 18 | "import time" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "ExecuteTime": { 26 | "start_time": "2023-04-08T22:45:09.819967Z", 27 | "end_time": "2023-04-08T22:45:12.935430Z" 28 | } 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "df = pd.read_csv(\"cc-fraud/creditcard.csv\")" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": { 39 | "ExecuteTime": { 40 | "start_time": "2023-04-08T22:45:12.937930Z", 41 | "end_time": "2023-04-08T22:45:13.205032Z" 42 | } 43 | }, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/plain": " Time V1 V2 V3 V4 V5 V6 \\\n90016 62835.0 1.186916 -0.628387 0.823801 0.634563 -0.991028 0.378426 \n169324 119638.0 2.091130 -0.100302 -1.463019 0.057325 0.445755 -0.296167 \n157914 110608.0 0.172057 1.157305 -0.347110 -0.391079 1.090960 -0.670067 \n54321 46390.0 -2.446626 -1.392801 2.486090 -1.437006 -0.130777 0.247363 \n224520 143840.0 1.987746 -0.345377 -0.857260 -0.026509 -0.255560 -0.433971 \n134279 80731.0 1.088271 0.515387 0.034373 2.544648 0.127277 -0.815731 \n277531 167698.0 -0.549196 1.667503 1.723408 4.506009 -0.387283 0.909142 \n253777 156415.0 -0.746983 0.806339 -1.143385 -1.829478 -1.254622 -0.462788 \n213054 139095.0 -1.187823 -0.797728 1.826044 -1.726375 0.387285 -0.455587 \n138876 82895.0 -2.001406 1.469920 1.430859 1.484364 -1.855166 0.650517 \n\n V7 V8 V9 ... V21 V22 V23 \\\n90016 -0.733348 0.129817 -0.467202 ... -0.413338 -0.459490 -0.115784 \n169324 0.086775 -0.144889 0.292411 ... -0.299988 -0.742414 0.196875 \n157914 0.875220 -0.178660 1.250112 ... -0.507823 -1.058637 0.074348 \n54321 -0.122512 -0.217604 0.171860 ... -0.193329 0.546879 0.428545 \n224520 -0.268325 0.049544 1.195217 ... 
-0.439221 -1.184703 0.418502 \n134279 0.657932 -0.296933 -1.052196 ... 0.028268 -0.108078 -0.167465 \n277531 -0.217665 0.666612 -1.666344 ... 0.302117 1.030549 -0.120420 \n253777 -0.994538 1.277622 -1.207482 ... -0.022226 -0.519674 0.629858 \n213054 -0.434468 0.174386 -0.675245 ... -0.157146 -0.204616 -0.321785 \n138876 -1.487717 1.651590 0.920541 ... 0.134146 0.430580 0.138413 \n\n V24 V25 V26 V27 V28 Amount Class \n90016 0.013555 0.609051 -0.305222 0.077682 0.025180 45.45 0 \n169324 -1.026305 -0.179246 0.240368 -0.073363 -0.079581 1.98 0 \n157914 0.309518 -0.416293 0.086843 0.196423 0.075552 0.89 0 \n54321 0.025033 0.487434 -0.373896 -0.686355 0.157639 30.00 0 \n224520 -0.578599 -0.421837 -1.072648 0.024619 -0.050843 7.00 0 \n134279 0.398679 0.700764 0.060626 -0.054062 0.022710 76.07 0 \n277531 0.010829 -0.379345 0.538205 0.336028 0.179168 25.03 0 \n253777 0.695904 -1.139688 -0.898166 -0.304241 -0.148031 38.00 0 \n213054 -0.714656 0.633766 0.815002 -0.036699 0.056779 26.99 0 \n138876 0.426804 -0.443392 -0.360828 -0.404383 0.046306 3.43 0 \n\n[10 rows x 31 columns]", 48 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TimeV1V2V3V4V5V6V7V8V9...V21V22V23V24V25V26V27V28AmountClass
9001662835.01.186916-0.6283870.8238010.634563-0.9910280.378426-0.7333480.129817-0.467202...-0.413338-0.459490-0.1157840.0135550.609051-0.3052220.0776820.02518045.450
169324119638.02.091130-0.100302-1.4630190.0573250.445755-0.2961670.086775-0.1448890.292411...-0.299988-0.7424140.196875-1.026305-0.1792460.240368-0.073363-0.0795811.980
157914110608.00.1720571.157305-0.347110-0.3910791.090960-0.6700670.875220-0.1786601.250112...-0.507823-1.0586370.0743480.309518-0.4162930.0868430.1964230.0755520.890
5432146390.0-2.446626-1.3928012.486090-1.437006-0.1307770.247363-0.122512-0.2176040.171860...-0.1933290.5468790.4285450.0250330.487434-0.373896-0.6863550.15763930.000
224520143840.01.987746-0.345377-0.857260-0.026509-0.255560-0.433971-0.2683250.0495441.195217...-0.439221-1.1847030.418502-0.578599-0.421837-1.0726480.024619-0.0508437.000
13427980731.01.0882710.5153870.0343732.5446480.127277-0.8157310.657932-0.296933-1.052196...0.028268-0.108078-0.1674650.3986790.7007640.060626-0.0540620.02271076.070
277531167698.0-0.5491961.6675031.7234084.506009-0.3872830.909142-0.2176650.666612-1.666344...0.3021171.030549-0.1204200.010829-0.3793450.5382050.3360280.17916825.030
253777156415.0-0.7469830.806339-1.143385-1.829478-1.254622-0.462788-0.9945381.277622-1.207482...-0.022226-0.5196740.6298580.695904-1.139688-0.898166-0.304241-0.14803138.000
213054139095.0-1.187823-0.7977281.826044-1.7263750.387285-0.455587-0.4344680.174386-0.675245...-0.157146-0.204616-0.321785-0.7146560.6337660.815002-0.0366990.05677926.990
13887682895.0-2.0014061.4699201.4308591.484364-1.8551660.650517-1.4877171.6515900.920541...0.1341460.4305800.1384130.426804-0.443392-0.360828-0.4043830.0463063.430
\n

10 rows × 31 columns

\n
" 49 | }, 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "df.sample(10)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": "577.8760162601625" 66 | }, 67 | "execution_count": 4, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "counts = np.bincount(df[\"Class\"])\n", 74 | "class_weight = {\n", 75 | " 0: counts[0] / df.shape[0],\n", 76 | " 1: counts[1] / df.shape[0]\n", 77 | "}\n", 78 | "scale_pos_weight = class_weight[0]/class_weight[1]\n", 79 | "scale_pos_weight" 80 | ], 81 | "metadata": { 82 | "collapsed": false, 83 | "ExecuteTime": { 84 | "start_time": "2023-04-08T22:45:13.206025Z", 85 | "end_time": "2023-04-08T22:45:13.208026Z" 86 | } 87 | } 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 5, 92 | "metadata": { 93 | "ExecuteTime": { 94 | "start_time": "2023-04-08T22:45:13.206526Z", 95 | "end_time": "2023-04-08T22:45:13.208533Z" 96 | } 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "X = df.drop(columns=[\"Class\"], axis=1)\n", 101 | "y = df[\"Class\"]" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 6, 107 | "outputs": [], 108 | "source": [ 109 | "model = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight, n_estimators=320, learning_rate=0.423)" 110 | ], 111 | "metadata": { 112 | "collapsed": false, 113 | "ExecuteTime": { 114 | "start_time": "2023-04-08T22:45:13.207027Z", 115 | "end_time": "2023-04-08T22:45:13.218027Z" 116 | } 117 | } 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 7, 122 | "metadata": { 123 | "ExecuteTime": { 124 | "start_time": "2023-04-08T22:45:13.222526Z", 125 | "end_time": "2023-04-08T22:51:05.097714Z" 126 | } 127 | }, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "Mean F1-score: 0.8205245517192994\n", 134 | 
"K-fold training time: 351.9986901283264s\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "start = time.time()\n", 140 | "scores = cross_val_score(model, X, y, scoring=\"f1_macro\")\n", 141 | "print(f\"Mean F1-score: {scores.mean()}\")\n", 142 | "end = time.time()\n", 143 | "print(f\"K-fold training time: {end - start}s\")" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 8, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "Mean F1-score: 0.9841227407322783\n", 155 | "K-fold training time: 314.5866029262543s\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "start = time.time()\n", 161 | "scores = cross_val_score(model, X, y, scoring=\"accuracy\")\n", 162 | "print(f\"Mean F1-score: {scores.mean()}\")\n", 163 | "end = time.time()\n", 164 | "print(f\"K-fold training time: {end - start}s\")" 165 | ], 166 | "metadata": { 167 | "collapsed": false, 168 | "ExecuteTime": { 169 | "start_time": "2023-04-08T22:51:05.096218Z", 170 | "end_time": "2023-04-08T22:56:19.752816Z" 171 | } 172 | } 173 | } 174 | ], 175 | "metadata": { 176 | "kernelspec": { 177 | "display_name": "Python 3 (ipykernel)", 178 | "language": "python", 179 | "name": "python3" 180 | }, 181 | "orig_nbformat": 4 182 | }, 183 | "nbformat": 4, 184 | "nbformat_minor": 2 185 | } 186 | -------------------------------------------------------------------------------- /chapter-6/credit-score-classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import joblib\n", 12 | "import lightgbm as lgb\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "import numpy as np\n", 15 | "import optuna\n", 16 | "import pandas as pd\n", 17 | "import seaborn as sns\n", 18 | "from imblearn.over_sampling import SMOTE\n", 19 | "from 
sklearn.ensemble import RandomForestClassifier\n", 20 | "from sklearn.metrics import f1_score\n", 21 | "from sklearn.model_selection import KFold\n", 22 | "from sklearn.tree import DecisionTreeClassifier" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "outputs": [], 29 | "source": [ 30 | "train_df = pd.read_csv(\"creditscore/train.csv\", low_memory=False)" 31 | ], 32 | "metadata": { 33 | "collapsed": false 34 | } 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "source": [ 39 | "## Data Preparation" 40 | ], 41 | "metadata": { 42 | "collapsed": false 43 | } 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "outputs": [], 49 | "source": [ 50 | "train_df.shape" 51 | ], 52 | "metadata": { 53 | "collapsed": false 54 | } 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "outputs": [], 60 | "source": [ 61 | "train_df.sample(30)" 62 | ], 63 | "metadata": { 64 | "collapsed": false 65 | } 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "outputs": [], 71 | "source": [ 72 | "train_df.info()" 73 | ], 74 | "metadata": { 75 | "collapsed": false 76 | } 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "outputs": [], 82 | "source": [ 83 | "train_df.nunique()" 84 | ], 85 | "metadata": { 86 | "collapsed": false 87 | } 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "source": [ 92 | "## Data Preparation" 93 | ], 94 | "metadata": { 95 | "collapsed": false 96 | } 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "source": [ 101 | "### Missing and duplicated values" 102 | ], 103 | "metadata": { 104 | "collapsed": false 105 | } 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "outputs": [], 111 | "source": [ 112 | "train_df.isnull().sum()" 113 | ], 114 | "metadata": { 115 | "collapsed": false 116 | } 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "outputs": [], 122 | "source": [ 
def clean_type_of_loan(frame):
    """Add one boolean indicator column per loan type found in ``Type_of_Loan``.

    Each indicator is True when the lower-cased ``Type_of_Loan`` text contains
    the corresponding phrase (plain substring match, not a regex).

    Rows whose ``Type_of_Loan`` is missing get False for every indicator.
    Without ``na=False`` the match yields NaN for those rows and
    ``astype(bool)`` silently turns NaN into True, flagging every loan type.

    Args:
        frame: DataFrame with a string ``Type_of_Loan`` column. It is modified
            in place (nine columns are added).

    Returns:
        The same ``frame`` object, for call chaining.
    """
    # Indicator column name -> phrase searched for in the lower-cased text.
    loan_phrases = {
        "auto_loan": "auto loan",
        "credit-builder_loan": "credit-builder loan",
        "debt_consolidation_loan": "debt consolidation loan",
        "home_equity_loan": "home equity loan",
        "mortgage_loan": "mortgage loan",
        "unspecified_loan": "not specified",
        "payday_loan": "payday loan",
        "personal_loan": "personal loan",
        "student_loan": "student loan",
    }
    # Lower-case once instead of once per indicator column.
    loan_text = frame["Type_of_Loan"].str.lower()
    for column, phrase in loan_phrases.items():
        frame[column] = loan_text.str.contains(phrase, regex=False, na=False).astype(bool)
    return frame
def clean_data(frame, is_test=False):
    """Clean a raw credit-score frame and impute its missing values.

    Steps, in order:
      1. Strip every character that is not a digit or '.' from the numeric
         columns that were read as text, then convert them to numbers.
      2. Convert ``Credit_History_Age`` with ``clean_credit_age``.
      3. Add loan-type indicator columns and cap outliers.
      4. Blank out the placeholder strings in ``Occupation`` and
         ``Payment_Behaviour``.
      5. Drop duplicate rows and the identifier columns.
      6. Treat negative ``Num_Bank_Accounts`` values as missing.
      7. Fill missing values with the column median (columns listed in
         ``numerical_columns``) or the column mode (every other column).

    Args:
        frame: Raw DataFrame. Steps 1-4 overwrite its columns in place, so the
            caller's object is partially modified.
        is_test: When True the ``Credit_Score`` label column, if present, is
            not imputed.

    Returns:
        The cleaned DataFrame (a new object produced by the drop steps).
    """
    numeric_object_columns = ['Age', 'Annual_Income', 'Delay_from_due_date', 'Num_of_Loan', 'Num_of_Delayed_Payment',
                              'Changed_Credit_Limit', 'Outstanding_Debt', 'Amount_invested_monthly', 'Monthly_Balance']
    for col in numeric_object_columns:
        # NOTE(review): this pattern also removes a leading '-', so negative
        # values become positive -- confirm that is intended.
        frame[col] = frame[col].astype(str).str.replace(r'[^\d\.]', '', regex=True)
        frame[col] = pd.to_numeric(frame[col], errors="coerce")
    frame["Credit_History_Age"] = frame["Credit_History_Age"].astype(str).apply(clean_credit_age)
    frame["Credit_History_Age"] = pd.to_numeric(frame["Credit_History_Age"], errors="coerce")

    frame = clean_type_of_loan(frame)

    frame = clean_outliers(frame)

    # Placeholder tokens are literal text, so match them as plain substrings.
    frame["Occupation"] = frame["Occupation"].astype(str).str.replace("_______", "", regex=False)
    frame["Occupation"] = frame["Occupation"].astype(str).str.replace("nan", "", regex=False)
    frame["Payment_Behaviour"] = frame["Payment_Behaviour"].astype(str).str.replace("!@9#%8", "", regex=False)
    frame["Payment_Behaviour"] = frame["Payment_Behaviour"].astype(str).str.replace("nan", "", regex=False)
    frame = frame.drop_duplicates()

    frame = frame.drop(columns=['ID', 'Customer_ID', 'Name', 'SSN', "Type_of_Loan"])

    # mask() swaps negative counts for NaN and upcasts an integer column
    # safely, unlike assigning NaN through .loc.
    frame["Num_Bank_Accounts"] = frame["Num_Bank_Accounts"].mask(frame["Num_Bank_Accounts"] < 0)

    for f in frame.columns:
        if is_test and f == "Credit_Score":
            # Skip only the label column. The previous `break` ended the loop
            # here, leaving every later column un-imputed.
            continue
        if not frame[f].isna().any():
            continue
        if f in numerical_columns:
            fill_value = frame[f].median()
        else:
            modes = frame[f].mode()
            if modes.empty:
                # Column is entirely missing: there is no mode to fill with.
                continue
            fill_value = modes.iloc[0]
        # Assign the result back; `frame[f].fillna(..., inplace=True)` is a
        # chained assignment that newer pandas no longer propagates.
        frame[f] = frame[f].fillna(fill_value)

    return frame
"metadata": { 285 | "collapsed": false 286 | } 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "outputs": [], 292 | "source": [ 293 | "train_df = clean_data(train_df)" 294 | ], 295 | "metadata": { 296 | "collapsed": false 297 | } 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "outputs": [], 303 | "source": [ 304 | "train_df.isnull().sum()" 305 | ], 306 | "metadata": { 307 | "collapsed": false 308 | } 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "outputs": [], 314 | "source": [ 315 | "train_df[train_df.duplicated()]" 316 | ], 317 | "metadata": { 318 | "collapsed": false 319 | } 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "outputs": [], 325 | "source": [ 326 | "train_df.info()" 327 | ], 328 | "metadata": { 329 | "collapsed": false 330 | } 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "outputs": [], 336 | "source": [ 337 | "train_df.nunique()" 338 | ], 339 | "metadata": { 340 | "collapsed": false 341 | } 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "outputs": [], 347 | "source": [ 348 | "train_df.sample(20)" 349 | ], 350 | "metadata": { 351 | "collapsed": false 352 | } 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "source": [ 357 | "## Exploratory Data Analysis" 358 | ], 359 | "metadata": { 360 | "collapsed": false 361 | } 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "outputs": [], 367 | "source": [ 368 | "train_df.describe().T.style.bar(subset=['mean'])" 369 | ], 370 | "metadata": { 371 | "collapsed": false 372 | } 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "outputs": [], 378 | "source": [ 379 | "_ = train_df.hist(bins=20, figsize=(20, 15))" 380 | ], 381 | "metadata": { 382 | "collapsed": false 383 | } 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 
| "outputs": [], 389 | "source": [ 390 | "plt.figure(figsize=(12,10))\n", 391 | "sns.histplot(train_df[\"Age\"], bins=20)\n", 392 | "plt.title('Customer Age')\n", 393 | "plt.savefig(\"figures/ch6_credit_age.png\", dpi=600)" 394 | ], 395 | "metadata": { 396 | "collapsed": false 397 | } 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "outputs": [], 403 | "source": [ 404 | "plt.figure(figsize=(12,10))\n", 405 | "sns.histplot(train_df[\"Monthly_Inhand_Salary\"], bins=30)\n", 406 | "plt.title('Monthly Inhand Salary')\n", 407 | "plt.savefig(\"figures/ch6_credit_salary.png\", dpi=600)" 408 | ], 409 | "metadata": { 410 | "collapsed": false 411 | } 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "outputs": [], 417 | "source": [ 418 | "fig, axes = plt.subplots(3, 2, figsize=(20, 15))\n", 419 | "sns.histplot(train_df['Occupation'], ax=axes[0, 0])\n", 420 | "sns.histplot(train_df['Credit_Mix'], ax=axes[0, 1])\n", 421 | "sns.histplot(train_df['Payment_of_Min_Amount'], ax=axes[1, 0])\n", 422 | "sns.histplot(train_df['Payment_Behaviour'], ax=axes[1, 1])\n", 423 | "sns.histplot(train_df['Credit_Score'], ax=axes[2, 0])\n", 424 | "sns.histplot(train_df['Month'], ax=axes[2, 1])" 425 | ], 426 | "metadata": { 427 | "collapsed": false 428 | } 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "outputs": [], 434 | "source": [ 435 | "plt.figure(figsize=(12,10))\n", 436 | "sns.histplot(train_df[\"Credit_Score\"], bins=30)\n", 437 | "plt.title('Credit Score')\n", 438 | "plt.savefig(\"figures/ch6_credit_score.png\", dpi=600)" 439 | ], 440 | "metadata": { 441 | "collapsed": false 442 | } 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "outputs": [], 448 | "source": [ 449 | "plt.figure(figsize=(12,10))\n", 450 | "sns.heatmap(train_df.corr(method=\"spearman\"), cmap='coolwarm')\n", 451 | "plt.title('Correlation Heatmap')\n", 452 | 
"plt.savefig(\"figures/ch6_credit_correlations.png\", dpi=600)" 453 | ], 454 | "metadata": { 455 | "collapsed": false 456 | } 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": null, 461 | "outputs": [], 462 | "source": [ 463 | "train_df.groupby(\"Credit_Score\")[\"Annual_Income\"].mean().plot.bar()" 464 | ], 465 | "metadata": { 466 | "collapsed": false 467 | } 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "outputs": [], 473 | "source": [ 474 | "train_df.groupby(\"Credit_Score\")[\"Age\"].mean().plot.bar()" 475 | ], 476 | "metadata": { 477 | "collapsed": false 478 | } 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "outputs": [], 484 | "source": [ 485 | "sns.scatterplot(x='Age',y='Annual_Income',hue='Credit_Score',data=train_df)" 486 | ], 487 | "metadata": { 488 | "collapsed": false 489 | } 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "source": [], 494 | "metadata": { 495 | "collapsed": false 496 | } 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "outputs": [], 502 | "source": [ 503 | "sns.scatterplot(x='Age',y='Monthly_Inhand_Salary',hue='Credit_Score',data=train_df)" 504 | ], 505 | "metadata": { 506 | "collapsed": false 507 | } 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": null, 512 | "outputs": [], 513 | "source": [ 514 | "sns.scatterplot(x='Num_of_Delayed_Payment',y='Credit_History_Age',hue='Credit_Score',data=train_df)" 515 | ], 516 | "metadata": { 517 | "collapsed": false 518 | } 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "outputs": [], 524 | "source": [ 525 | "sns.scatterplot(x='Monthly_Inhand_Salary',y='Monthly_Balance',hue='Credit_Score',data=train_df)" 526 | ], 527 | "metadata": { 528 | "collapsed": false 529 | } 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | "outputs": [], 535 | "source": [ 536 | 
def prepare_data(frame, test=False, resample=False):
    """Split a cleaned frame into one-hot encoded features and labels.

    Every column named in ``categorical_columns`` is converted to the pandas
    ``category`` dtype (in place, on ``frame``), then the feature columns are
    expanded with ``pd.get_dummies``.

    Labels are taken from ``frame`` itself. Previously they were read from the
    global ``train_df``, so any other frame received labels that did not
    belong to its rows.

    Args:
        frame: Cleaned DataFrame. Must contain ``Credit_Score`` unless
            ``test`` is True.
        test: True for unlabeled/inference data. ``Credit_Score`` is not
            required; if it is present it is kept out of the features and
            returned as the labels.
        resample: When True (and ``test`` is False) the classes are balanced
            with SMOTE before returning.

    Returns:
        ``(X, y)``. ``y`` is None when ``test`` is True and ``frame`` has no
        ``Credit_Score`` column.
    """
    for feature in categorical_columns:
        if test and feature == 'Credit_Score':
            continue
        frame[feature] = frame[feature].astype("category")

    if test:
        # The label may legitimately be absent from inference data.
        X_prep = frame.drop(columns=["Credit_Score"], errors="ignore")
        y = frame["Credit_Score"] if "Credit_Score" in frame.columns else None
    else:
        X_prep = frame.drop(columns=["Credit_Score"])
        y = frame["Credit_Score"]

    X_dummies = pd.get_dummies(X_prep)
    if not test and resample:
        smote = SMOTE(sampling_strategy='auto')
        return smote.fit_resample(X_dummies, y)
    return X_dummies, y
"execution_count": null, 612 | "outputs": [], 613 | "source": [ 614 | "y.value_counts(normalize=True)" 615 | ], 616 | "metadata": { 617 | "collapsed": false 618 | } 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": null, 623 | "outputs": [], 624 | "source": [ 625 | "plt.figure(figsize=(12,10))\n", 626 | "sns.histplot(y, bins=30)\n", 627 | "plt.title('Credit Score')\n", 628 | "plt.savefig(\"figures/ch6_credit_score_normal.png\", dpi=600)" 629 | ], 630 | "metadata": { 631 | "collapsed": false 632 | } 633 | }, 634 | { 635 | "cell_type": "markdown", 636 | "source": [ 637 | "## Modeling" 638 | ], 639 | "metadata": { 640 | "collapsed": false 641 | } 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": null, 646 | "outputs": [], 647 | "source": [ 648 | "def cross_validate_with_smote(val_model, X_cross, y_cross, verbose=True):\n", 649 | " X_cross = X_cross.values\n", 650 | " y_cross = y_cross.values\n", 651 | " kf = KFold(n_splits=5)\n", 652 | " accuracies = []\n", 653 | " f1_scores = []\n", 654 | " for fold, (train_index, test_index) in enumerate(kf.split(X_cross), 1):\n", 655 | " if verbose:\n", 656 | " print(f'Fold {fold}:')\n", 657 | " X_train = X_cross[train_index]\n", 658 | " y_train = y_cross[train_index]\n", 659 | " X_val = X_cross[test_index]\n", 660 | " y_val = y_cross[test_index]\n", 661 | " smote = SMOTE()\n", 662 | " X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)\n", 663 | " if verbose:\n", 664 | " print(\"Fitting model\")\n", 665 | " val_model.fit(X_train_resampled, y_train_resampled) \n", 666 | " y_pred = val_model.predict(X_val)\n", 667 | " \n", 668 | " accuracy = val_model.score(X_val, y_val)\n", 669 | " f1 = f1_score(y_val, y_pred, average=\"macro\")\n", 670 | " accuracies.append(accuracy)\n", 671 | " f1_scores.append(f1)\n", 672 | " \n", 673 | " if verbose:\n", 674 | " print(f'Accuracy: {accuracy}')\n", 675 | " print(f'F1 score: {f1}')\n", 676 | " return np.array(accuracies), 
def objective(trial):
    """Optuna objective: mean macro-F1 of a LightGBM classifier under 5-fold CV.

    Samples one hyperparameter set from ``trial``, builds an
    ``lgb.LGBMClassifier`` with it and scores it with
    ``cross_validate_with_smote`` on the notebook-level ``X`` and ``y``.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        Mean of the per-fold F1 scores (the study maximizes this value).
    """
    # Collect the samples in a dict. The earlier one-assignment-per-line form
    # ended most lines with a trailing comma, which wrapped each value in a
    # 1-tuple before it reached LightGBM.
    params = {
        "boosting_type": trial.suggest_categorical("boosting_type", ["dart", "gbdt"]),
        "lambda_l1": trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int('num_leaves', 2, 256),
        "feature_fraction": trial.suggest_float('feature_fraction', 0.4, 1.0),
        "bagging_fraction": trial.suggest_float('bagging_fraction', 0.4, 1.0),
        "bagging_freq": trial.suggest_int('bagging_freq', 1, 7),
        "min_child_samples": trial.suggest_int('min_child_samples', 5, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.5, log=True),
        # `step` is passed by keyword; passing it positionally is deprecated.
        "max_bin": trial.suggest_int("max_bin", 128, 512, step=32),
        "n_estimators": trial.suggest_int("n_estimators", 40, 400, step=20),
    }

    model = lgb.LGBMClassifier(force_row_wise=True, verbose=-1, **params)
    scores, f1_scores = cross_validate_with_smote(model, X, y, verbose=False)
    return f1_scores.mean()
pruner=pruner\n", 805 | ")\n", 806 | "study.optimize(objective, n_trials=50, n_jobs=-1)" 807 | ], 808 | "metadata": { 809 | "collapsed": false 810 | } 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": null, 815 | "outputs": [], 816 | "source": [ 817 | "print(study.best_trial)" 818 | ], 819 | "metadata": { 820 | "collapsed": false 821 | } 822 | }, 823 | { 824 | "cell_type": "markdown", 825 | "source": [ 826 | "## Train model using best results" 827 | ], 828 | "metadata": { 829 | "collapsed": false 830 | } 831 | }, 832 | { 833 | "cell_type": "code", 834 | "execution_count": null, 835 | "outputs": [], 836 | "source": [ 837 | "model = lgb.LGBMClassifier(\n", 838 | " force_row_wise=True,\n", 839 | " boosting_type='gbdt',\n", 840 | " n_estimators=200,\n", 841 | " lambda_l1=0.0003,\n", 842 | " lambda_l2=1.4418,\n", 843 | " num_leaves=58,\n", 844 | " feature_fraction=0.6628,\n", 845 | " bagging_fraction=0.6651,\n", 846 | " bagging_freq=3,\n", 847 | " min_child_samples=58,\n", 848 | " learning_rate=0.013,\n", 849 | " max_bin=128,\n", 850 | " verbose=-1\n", 851 | ")\n", 852 | "scores, f1_scores = cross_validate_with_smote(model, X, y, verbose=False)\n", 853 | "print(f\"Accuracy: {scores.mean()}\")\n", 854 | "print(f\"F1: {f1_scores.mean()}\")\n", 855 | "X, y = prepare_data(train_df, resample=True)\n", 856 | "model = model.fit(X, y)" 857 | ], 858 | "metadata": { 859 | "collapsed": false 860 | } 861 | }, 862 | { 863 | "cell_type": "code", 864 | "execution_count": null, 865 | "outputs": [], 866 | "source": [ 867 | "lgb.plot_importance(model, figsize=(15, 12))" 868 | ], 869 | "metadata": { 870 | "collapsed": false 871 | } 872 | }, 873 | { 874 | "cell_type": "markdown", 875 | "source": [ 876 | "### Saving and loading the model" 877 | ], 878 | "metadata": { 879 | "collapsed": false 880 | } 881 | }, 882 | { 883 | "cell_type": "code", 884 | "execution_count": null, 885 | "outputs": [], 886 | "source": [ 887 | "X, y = prepare_data(train_df, resample=False)" 888 | 
], 889 | "metadata": { 890 | "collapsed": false 891 | } 892 | }, 893 | { 894 | "cell_type": "code", 895 | "execution_count": null, 896 | "outputs": [], 897 | "source": [ 898 | "joblib.dump(model, \"lgb_credit_score_classification.pkl\")" 899 | ], 900 | "metadata": { 901 | "collapsed": false 902 | } 903 | }, 904 | { 905 | "cell_type": "code", 906 | "execution_count": null, 907 | "outputs": [], 908 | "source": [ 909 | "def make_predictions(data):\n", 910 | " model = joblib.load(\"lgb_credit_score_classification.pkl\")\n", 911 | " return model.predict(data)" 912 | ], 913 | "metadata": { 914 | "collapsed": false 915 | } 916 | }, 917 | { 918 | "cell_type": "code", 919 | "execution_count": null, 920 | "outputs": [], 921 | "source": [ 922 | "predictions = make_predictions(X)" 923 | ], 924 | "metadata": { 925 | "collapsed": false 926 | } 927 | }, 928 | { 929 | "cell_type": "code", 930 | "execution_count": null, 931 | "outputs": [], 932 | "source": [], 933 | "metadata": { 934 | "collapsed": false 935 | } 936 | } 937 | ], 938 | "metadata": { 939 | "kernelspec": { 940 | "display_name": "Python 3", 941 | "language": "python", 942 | "name": "python3" 943 | }, 944 | "language_info": { 945 | "codemirror_mode": { 946 | "name": "ipython", 947 | "version": 2 948 | }, 949 | "file_extension": ".py", 950 | "mimetype": "text/x-python", 951 | "name": "python", 952 | "nbconvert_exporter": "python", 953 | "pygments_lexer": "ipython2", 954 | "version": "2.7.6" 955 | } 956 | }, 957 | "nbformat": 4, 958 | "nbformat_minor": 0 959 | } 960 | -------------------------------------------------------------------------------- /chapter-8/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim 2 | 3 | RUN apt-get update && apt-get install -y --no-install-recommends apt-utils 4 | RUN apt-get -y install curl 5 | RUN apt-get install libgomp1 6 | 7 | WORKDIR /usr/src/app 8 | 9 | COPY requirements.txt ./ 10 | RUN pip install --no-cache-dir -r 
def authenticate(username: bytes, password: bytes):
    """Check HTTP Basic credentials against the configured ``USER``/``PASSWORD``.

    Args:
        username: Supplied user name, UTF-8 encoded.
        password: Supplied password, UTF-8 encoded.

    Returns:
        ``username`` when both values match.

    Raises:
        HTTPException: 401 with a ``WWW-Authenticate: Basic`` header when
            either value does not match.
    """
    # Both comparisons run unconditionally (no short-circuit) and use
    # secrets.compare_digest rather than ==.
    user_matches = secrets.compare_digest(username, USER)
    password_matches = secrets.compare_digest(password, PASSWORD)
    if user_matches and password_matches:
        return username
    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Incorrect username or password",
        headers={"WWW-Authenticate": "Basic"},
    )
def install(package):
    """Install ``package`` with pip into the running interpreter's environment.

    ``-q`` is placed after ``install`` so that it is a pip option and quiets
    pip's output. Placed before ``-m`` it was consumed by the Python
    interpreter instead and had no effect on pip.

    Args:
        package: Requirement specifier passed to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])
tar.extractall(path=".") 45 | 46 | logger.debug("Loading LightGBM model.") 47 | model = joblib.load(open("lightgbm-model", "rb")) 48 | 49 | logger.debug("Reading test data.") 50 | test_local_path = "/opt/ml/processing/test/test.csv" 51 | 52 | test_df = pd.read_csv(test_local_path) 53 | 54 | X_test, y_test = prepare_data(test_df) 55 | test_f1 = f1_score(y_test, model.predict(X_test)) 56 | 57 | logger.debug("Calculating F1 score.") 58 | metric_dict = { 59 | "classification_metrics": {"f1": {"value": test_f1}} 60 | } 61 | 62 | output_dir = "/opt/ml/processing/evaluation" 63 | pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True) 64 | 65 | logger.info("Writing evaluation report with F1: %f", test_f1) 66 | evaluation_path = f"{output_dir}/evaluation.json" 67 | with open(evaluation_path, "w") as f: 68 | f.write(json.dumps(metric_dict)) 69 | -------------------------------------------------------------------------------- /chapter-9/src/lambda_deployer.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Lambda function creates an endpoint configuration and deploys a model to real-time endpoint. 
def lambda_handler(event, context):
    """Create a SageMaker model, endpoint configuration and real-time endpoint.

    All deployment parameters are read from ``event``; a missing key raises
    ``KeyError`` before any SageMaker call is made.

    Args:
        event: Mapping with the keys ``model_name``, ``model_package_arn``,
            ``endpoint_config_name``, ``endpoint_name``, ``role``,
            ``instance_type`` and ``instance_count``.
        context: Lambda context object (unused).

    Returns:
        A dict with ``statusCode`` 200 and a JSON ``body`` holding the ARNs
        returned by the three SageMaker calls. The handler used to discard
        those responses and return None.
    """
    # Read every required parameter up front so that an incomplete event
    # fails before any resource is created.
    model_name = event["model_name"]
    model_package_arn = event["model_package_arn"]
    endpoint_config_name = event["endpoint_config_name"]
    endpoint_name = event["endpoint_name"]
    role = event["role"]
    instance_type = event["instance_type"]
    instance_count = event["instance_count"]

    sm_client = boto3.client("sagemaker")

    create_model_response = sm_client.create_model(
        ModelName=model_name,
        PrimaryContainer={"ModelPackageName": model_package_arn},
        ExecutionRoleArn=role
    )

    create_endpoint_config_response = sm_client.create_endpoint_config(
        EndpointConfigName=endpoint_config_name,
        ProductionVariants=[
            {
                "VariantName": "Alltraffic",
                "ModelName": model_name,
                "InitialInstanceCount": instance_count,
                "InstanceType": instance_type,
                "InitialVariantWeight": 1
            }
        ]
    )

    create_endpoint_response = sm_client.create_endpoint(
        EndpointName=endpoint_name,
        EndpointConfigName=endpoint_config_name
    )

    return {
        "statusCode": 200,
        "body": json.dumps({
            "model_arn": create_model_response.get("ModelArn"),
            "endpoint_config_arn": create_endpoint_config_response.get("EndpointConfigArn"),
            "endpoint_arn": create_endpoint_response.get("EndpointArn"),
        }),
    }
def prepare_data(df):
    """Split a processed dataset into features and target for LightGBM.

    Columns whose dtype is ``object`` are converted to the pandas
    ``category`` dtype. The conversion is performed on a new DataFrame, so
    the frame passed in by the caller is left untouched.

    Args:
        df: DataFrame that contains a ``Class`` column (the target) plus the
            feature columns.

    Returns:
        tuple: ``(X, y)`` where ``X`` is ``df`` without the ``Class`` column
        and ``y`` is the ``Class`` column, both taken after the category
        conversion.

    Raises:
        KeyError: If ``df`` has no ``Class`` column.
    """
    categorical_features = list(df.columns[df.dtypes == "object"])
    # ``astype`` with a mapping returns a new frame instead of assigning the
    # converted columns back into the caller's DataFrame.
    converted = df.astype({name: "category" for name in categorical_features})
    X = converted.drop(columns=["Class"])
    y = converted["Class"]
    return X, y
if __name__ == "__main__":
    # Training entry point: parse hyperparameters and SageMaker locations,
    # fit a LightGBM classifier, print the F1 metrics and persist both the
    # model and a metrics report.
    parser = argparse.ArgumentParser()

    parser.add_argument("--boosting_type", type=str, default="gbdt")
    parser.add_argument("--objective", type=str, default="binary")

    parser.add_argument("--n_estimators", type=int, default=200)
    parser.add_argument("--learning_rate", type=float, default=0.001)
    parser.add_argument("--num_leaves", type=int, default=30)
    parser.add_argument("--max_bin", type=int, default=300)

    # SageMaker supplies these locations through environment variables.
    parser.add_argument("--train_data_dir", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--validation_data_dir", type=str, default=os.environ.get("SM_CHANNEL_VALIDATION"))
    parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--output_data_dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR"))

    args = parser.parse_args()

    train_df = pd.read_csv(os.path.join(args.train_data_dir, "train.csv"))
    val_df = pd.read_csv(os.path.join(args.validation_data_dir, "validation.csv"))

    # Every parsed hyperparameter is forwarded to the estimator; previously
    # --boosting_type and --objective were accepted but silently dropped.
    params = {
        "boosting_type": args.boosting_type,
        "objective": args.objective,
        "n_estimators": args.n_estimators,
        "learning_rate": args.learning_rate,
        "num_leaves": args.num_leaves,
        "max_bin": args.max_bin,
    }

    X, y = prepare_data(train_df)
    model = lgb.LGBMClassifier(**params)

    # NOTE(review): the training score is a cross-validated macro F1 while
    # the validation score below uses f1_score's default averaging, so the
    # two numbers are not computed the same way — confirm this is intended.
    scores = cross_val_score(model, X, y, scoring="f1_macro")
    train_f1 = scores.mean()
    model = model.fit(X, y)

    X_test, y_test = prepare_data(val_df)
    test_f1 = f1_score(y_test, model.predict(X_test))

    # These lines are emitted on stdout in a fixed format; keep them
    # byte-for-byte stable for anything that parses the training log.
    print(f"[0]#011train-f1:{train_f1:.2f}")
    print(f"[0]#011validation-f1:{test_f1:.2f}")

    metrics_data = {
        "hyperparameters": params,
        "binary_classification_metrics": {
            "validation:f1": {"value": test_f1},
            "train:f1": {"value": train_f1},
        },
    }

    # Save the evaluation metrics to the location specified by output_data_dir
    metrics_location = os.path.join(args.output_data_dir, "metrics.json")

    # Save the model to the location specified by model_dir
    model_location = os.path.join(args.model_dir, "lightgbm-model")

    with open(metrics_location, "w") as f:
        json.dump(metrics_data, f)

    with open(model_location, "wb") as f:
        joblib.dump(model, f)
if __name__ == "__main__":
    # Processing entry point: read the raw census-income CSV, impute missing
    # values, one-hot encode the categorical features and write train /
    # validation / test / full datasets for the later pipeline steps.
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-ratio", type=float, default=0.8)
    parser.add_argument("--validation-ratio", type=float, default=0.1)
    parser.add_argument("--test-ratio", type=float, default=0.1)
    args, _ = parser.parse_known_args()
    logger.info("Received arguments {}".format(args))

    # Set local path prefix in the processing container
    local_dir = "/opt/ml/processing"

    input_data_path = os.path.join(local_dir, "census-income", "census-income.csv")

    logger.info("Reading census income data from {}".format(input_data_path))
    df = pd.read_csv(input_data_path)

    # The raw data marks missing values with the literal string "unknown".
    df = df.replace("unknown", np.nan)

    categorical_features = list(df.columns[df.dtypes == "object"])
    for f in categorical_features:
        df[f] = df[f].astype("category")
    # The target is never one-hot encoded, even when it is stored as text.
    categorical_features = [c for c in categorical_features if c != "Class"]

    # Impute by dtype: most frequent value for categorical columns, median
    # otherwise. Deciding on the dtype (instead of on membership in
    # categorical_features) keeps a text-valued "Class" away from median().
    # The result is assigned back because in-place fillna on a column
    # selection does not reliably update the parent frame.
    for f in df.columns:
        if df[f].dtype.name == "category":
            fill_value = df[f].mode()[0]
        else:
            fill_value = df[f].median()
        df[f] = df[f].fillna(fill_value)

    df = pd.get_dummies(df, columns=categorical_features)

    X = df.drop(columns=["Class"])
    y = df["Class"]

    train_ratio = args.train_ratio
    val_ratio = args.validation_ratio
    test_ratio = args.test_ratio

    logger.debug("Splitting data into train, validation, and test sets")

    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=test_ratio)
    # The second split operates on what is left after the test rows were
    # removed, so the validation share has to be rescaled to keep the
    # requested train:validation proportion (0.8:0.1 -> 1/9 of the remainder).
    val_share = val_ratio / (train_ratio + val_ratio)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=val_share)

    # Re-attach the target as the last column; assign() returns new frames
    # and leaves the split results untouched.
    train_set = X_train.assign(Class=y_train)
    val_set = X_val.assign(Class=y_val)
    test_set = X_test.assign(Class=y_test)
    full_set = X.assign(Class=y)

    logger.info("Train data shape after preprocessing: {}".format(train_set.shape))
    logger.info("Validation data shape after preprocessing: {}".format(val_set.shape))
    logger.info("Test data shape after preprocessing: {}".format(test_set.shape))

    # Save processed datasets to the local paths in the processing container.
    # SageMaker will upload the contents of these paths to S3 bucket
    logger.debug("Writing processed datasets to container local path.")
    train_output_path = os.path.join(f"{local_dir}/train", "train.csv")
    validation_output_path = os.path.join(f"{local_dir}/val", "validation.csv")
    test_output_path = os.path.join(f"{local_dir}/test", "test.csv")
    full_processed_output_path = os.path.join(f"{local_dir}/full", "dataset.csv")

    # Do not rely on the output directories having been created beforehand.
    for output_path in (
        train_output_path,
        validation_output_path,
        test_output_path,
        full_processed_output_path,
    ):
        pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    logger.info("Saving train data to {}".format(train_output_path))
    train_set.to_csv(train_output_path, index=False)

    logger.info("Saving validation data to {}".format(validation_output_path))
    val_set.to_csv(validation_output_path, index=False)

    logger.info("Saving test data to {}".format(test_output_path))
    test_set.to_csv(test_output_path, index=False)

    logger.info("Saving full processed data to {}".format(full_processed_output_path))
    full_set.to_csv(full_processed_output_path, index=False)
aws-c-compression=0.2.17=h862ab75_1 21 | - aws-c-event-stream=0.3.1=h9599702_1 22 | - aws-c-http=0.7.11=hbe98c3e_0 23 | - aws-c-io=0.13.28=h3870b5a_0 24 | - aws-c-mqtt=0.8.14=h2e270ba_2 25 | - aws-c-s3=0.3.13=heb0bb06_2 26 | - aws-c-sdkutils=0.1.11=h862ab75_1 27 | - aws-checksums=0.1.16=h862ab75_1 28 | - aws-crt-cpp=0.20.3=he9c0e7f_4 29 | - aws-sdk-cpp=1.10.57=hbc2ea52_17 30 | - babel=2.11.0=pyhd8ed1ab_0 31 | - backcall=0.2.0=pyh9f0ad1d_0 32 | - backports=1.0=pyhd8ed1ab_3 33 | - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 34 | - beautifulsoup4=4.11.2=pyha770c72_0 35 | - bleach=6.0.0=pyhd8ed1ab_0 36 | - bokeh=3.2.1=pyhd8ed1ab_0 37 | - brotli=1.0.9=h166bdaf_8 38 | - brotli-bin=1.0.9=h166bdaf_8 39 | - brotlipy=0.7.0=py310h5764c6d_1005 40 | - bzip2=1.0.8=h7f98852_4 41 | - c-ares=1.19.1=hd590300_0 42 | - ca-certificates=2023.7.22=hbcca054_0 43 | - cairo=1.16.0=ha61ee94_1014 44 | - certifi=2023.7.22=pyhd8ed1ab_0 45 | - cffi=1.15.1=py310h255011f_3 46 | - charset-normalizer=2.1.1=pyhd8ed1ab_0 47 | - cloudpickle=2.2.1=pyhd8ed1ab_0 48 | - comm=0.1.2=pyhd8ed1ab_0 49 | - contourpy=1.0.7=py310hdf3cbec_0 50 | - cryptography=39.0.1=py310h34c0648_0 51 | - cycler=0.11.0=pyhd8ed1ab_0 52 | - cytoolz=0.12.0=py310h5764c6d_1 53 | - dask=2023.7.1=pyhd8ed1ab_0 54 | - dask-core=2023.7.1=pyhd8ed1ab_0 55 | - dask-glm=0.2.0=py_1 56 | - dask-ml=2023.3.24=pyhd8ed1ab_1 57 | - dbus=1.13.6=h5008d03_3 58 | - debugpy=1.6.6=py310heca2aa9_0 59 | - decorator=5.1.1=pyhd8ed1ab_0 60 | - defusedxml=0.7.1=pyhd8ed1ab_0 61 | - distributed=2023.7.1=pyhd8ed1ab_0 62 | - entrypoints=0.4=pyhd8ed1ab_0 63 | - executing=1.2.0=pyhd8ed1ab_0 64 | - expat=2.5.0=h27087fc_0 65 | - fftw=3.3.10=nompi_hf0379b8_106 66 | - flit-core=3.8.0=pyhd8ed1ab_0 67 | - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 68 | - font-ttf-inconsolata=3.000=h77eed37_0 69 | - font-ttf-source-code-pro=2.038=h77eed37_0 70 | - font-ttf-ubuntu=0.83=hab24e00_0 71 | - fontconfig=2.14.2=h14ed4e7_0 72 | - fonts-conda-ecosystem=1=0 73 | - 
fonts-conda-forge=1=0 74 | - fonttools=4.38.0=py310h5764c6d_1 75 | - freetype=2.12.1=hca18f0e_1 76 | - gettext=0.21.1=h27087fc_0 77 | - gflags=2.2.2=he1b5a44_1004 78 | - glib=2.74.1=h6239696_1 79 | - glib-tools=2.74.1=h6239696_1 80 | - glog=0.6.0=h6f12383_0 81 | - graphite2=1.3.13=h58526e2_1001 82 | - gst-plugins-base=1.22.0=h4243ec0_0 83 | - gstreamer=1.22.0=h25f0c4b_0 84 | - gstreamer-orc=0.4.33=h166bdaf_0 85 | - harfbuzz=6.0.0=h8e241bc_0 86 | - icu=70.1=h27087fc_0 87 | - idna=3.4=pyhd8ed1ab_0 88 | - importlib-metadata=6.0.0=pyha770c72_0 89 | - importlib_metadata=6.0.0=hd8ed1ab_0 90 | - importlib_resources=5.10.2=pyhd8ed1ab_0 91 | - ipykernel=6.21.2=pyh210e3f2_0 92 | - ipython=8.10.0=pyh41d4057_0 93 | - ipython_genutils=0.2.0=py_1 94 | - jack=1.9.22=h11f4161_0 95 | - jedi=0.18.2=pyhd8ed1ab_0 96 | - jinja2=3.1.2=pyhd8ed1ab_1 97 | - joblib=1.2.0=pyhd8ed1ab_0 98 | - jpeg=9e=h0b41bf4_3 99 | - json5=0.9.5=pyh9f0ad1d_0 100 | - jsonschema=4.17.3=pyhd8ed1ab_0 101 | - jupyter_client=8.0.3=pyhd8ed1ab_0 102 | - jupyter_core=5.2.0=py310hff52083_0 103 | - jupyter_events=0.6.3=pyhd8ed1ab_0 104 | - jupyter_server=2.3.0=pyhd8ed1ab_0 105 | - jupyter_server_fileid=0.7.0=pyhd8ed1ab_0 106 | - jupyter_server_terminals=0.4.4=pyhd8ed1ab_1 107 | - jupyter_server_ydoc=0.6.1=pyhd8ed1ab_0 108 | - jupyter_ydoc=0.2.2=pyhd8ed1ab_0 109 | - jupyterlab=3.6.1=pyhd8ed1ab_0 110 | - jupyterlab_pygments=0.2.2=pyhd8ed1ab_0 111 | - jupyterlab_server=2.19.0=pyhd8ed1ab_0 112 | - keyutils=1.6.1=h166bdaf_0 113 | - kiwisolver=1.4.4=py310hbf28c38_1 114 | - krb5=1.20.1=h81ceb04_0 115 | - lame=3.100=h166bdaf_1003 116 | - lcms2=2.14=hfd0df8a_1 117 | - ld_impl_linux-64=2.40=h41732ed_0 118 | - lerc=4.0.0=h27087fc_0 119 | - libabseil=20230125.3=cxx17_h59595ed_0 120 | - libarrow=12.0.1=h657c46f_5_cpu 121 | - libblas=3.9.0=16_linux64_openblas 122 | - libbrotlicommon=1.0.9=h166bdaf_8 123 | - libbrotlidec=1.0.9=h166bdaf_8 124 | - libbrotlienc=1.0.9=h166bdaf_8 125 | - libcap=2.66=ha37c62d_0 126 | - 
libcblas=3.9.0=16_linux64_openblas 127 | - libclang=15.0.7=default_had23c3d_1 128 | - libclang13=15.0.7=default_h3e3d535_1 129 | - libcrc32c=1.1.2=h9c3ff4c_0 130 | - libcups=2.3.3=h36d4200_3 131 | - libcurl=8.1.2=h409715c_0 132 | - libdb=6.2.32=h9c3ff4c_0 133 | - libdeflate=1.17=h0b41bf4_0 134 | - libedit=3.1.20191231=he28a2e2_2 135 | - libev=4.33=h516909a_1 136 | - libevent=2.1.10=h28343ad_4 137 | - libffi=3.4.2=h7f98852_5 138 | - libflac=1.4.2=h27087fc_0 139 | - libgcc-ng=12.2.0=h65d4601_19 140 | - libgcrypt=1.10.1=h166bdaf_0 141 | - libgfortran-ng=12.2.0=h69a702a_19 142 | - libgfortran5=12.2.0=h337968e_19 143 | - libglib=2.74.1=h606061b_1 144 | - libgomp=12.2.0=h65d4601_19 145 | - libgoogle-cloud=2.12.0=h840a212_1 146 | - libgpg-error=1.46=h620e276_0 147 | - libgrpc=1.56.2=h3905398_0 148 | - libiconv=1.17=h166bdaf_0 149 | - liblapack=3.9.0=16_linux64_openblas 150 | - libllvm14=14.0.6=hcd5def8_4 151 | - libllvm15=15.0.7=hadd5161_0 152 | - libnghttp2=1.52.0=h61bc06f_0 153 | - libnsl=2.0.0=h7f98852_0 154 | - libnuma=2.0.16=h0b41bf4_1 155 | - libogg=1.3.4=h7f98852_1 156 | - libopenblas=0.3.21=pthreads_h78a6416_3 157 | - libopus=1.3.1=h7f98852_1 158 | - libpng=1.6.39=h753d276_0 159 | - libpq=15.2=hb675445_0 160 | - libprotobuf=4.23.3=hd1fb520_0 161 | - libsndfile=1.2.0=hb75c966_0 162 | - libsodium=1.0.18=h36c2ea0_1 163 | - libsqlite=3.40.0=h753d276_0 164 | - libssh2=1.11.0=h0841786_0 165 | - libstdcxx-ng=12.2.0=h46fd767_19 166 | - libsystemd0=252=h2a991cd_0 167 | - libthrift=0.18.1=h5e4af38_0 168 | - libtiff=4.5.0=h6adf6a1_2 169 | - libtool=2.4.7=h27087fc_0 170 | - libudev1=252=h166bdaf_0 171 | - libutf8proc=2.8.0=h166bdaf_0 172 | - libuuid=2.32.1=h7f98852_1000 173 | - libvorbis=1.3.7=h9c3ff4c_0 174 | - libwebp-base=1.2.4=h166bdaf_0 175 | - libxcb=1.13=h7f98852_1004 176 | - libxkbcommon=1.5.0=h79f4944_0 177 | - libxml2=2.10.3=h7463322_0 178 | - libzlib=1.2.13=h166bdaf_4 179 | - lightgbm=3.3.5=py310heca2aa9_0 180 | - locket=1.0.0=pyhd8ed1ab_0 181 | - 
lz4=4.3.2=py310h0cfdcf0_0 182 | - lz4-c=1.9.4=hcb278e6_0 183 | - markupsafe=2.1.2=py310h1fa729e_0 184 | - matplotlib-inline=0.1.6=pyhd8ed1ab_0 185 | - mistune=2.0.5=pyhd8ed1ab_0 186 | - mpg123=1.31.2=hcb278e6_0 187 | - msgpack-python=1.0.5=py310hdf3cbec_0 188 | - multipledispatch=0.6.0=py_0 189 | - munkres=1.1.4=pyh9f0ad1d_0 190 | - mysql-common=8.0.32=ha901b37_0 191 | - mysql-libs=8.0.32=hd7da12d_0 192 | - nbclassic=0.5.2=pyhd8ed1ab_0 193 | - nbclient=0.7.2=pyhd8ed1ab_0 194 | - nbconvert=7.2.9=pyhd8ed1ab_0 195 | - nbconvert-core=7.2.9=pyhd8ed1ab_0 196 | - nbconvert-pandoc=7.2.9=pyhd8ed1ab_0 197 | - nbformat=5.7.3=pyhd8ed1ab_0 198 | - ncurses=6.3=h27087fc_1 199 | - nest-asyncio=1.5.6=pyhd8ed1ab_0 200 | - notebook=6.5.2=pyha770c72_1 201 | - notebook-shim=0.2.2=pyhd8ed1ab_0 202 | - nspr=4.35=h27087fc_0 203 | - nss=3.88=he45b914_0 204 | - openjpeg=2.5.0=hfec8fc6_2 205 | - openssl=3.1.2=hd590300_0 206 | - orc=1.9.0=h385abfd_1 207 | - packaging=23.0=pyhd8ed1ab_0 208 | - pandas=1.5.3=py310h9b08913_0 209 | - pandoc=2.19.2=h32600fe_1 210 | - pandocfilters=1.5.0=pyhd8ed1ab_0 211 | - parso=0.8.3=pyhd8ed1ab_0 212 | - partd=1.4.0=pyhd8ed1ab_0 213 | - patsy=0.5.3=pyhd8ed1ab_0 214 | - pcre2=10.40=hc3806b6_0 215 | - pexpect=4.8.0=pyh1a96a4e_2 216 | - pickleshare=0.7.5=py_1003 217 | - pillow=9.4.0=py310h023d228_1 218 | - pip=23.0.1=pyhd8ed1ab_0 219 | - pixman=0.40.0=h36c2ea0_0 220 | - pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0 221 | - platformdirs=3.0.0=pyhd8ed1ab_0 222 | - ply=3.11=py_1 223 | - pooch=1.6.0=pyhd8ed1ab_0 224 | - prometheus_client=0.16.0=pyhd8ed1ab_0 225 | - prompt-toolkit=3.0.36=pyha770c72_0 226 | - psutil=5.9.4=py310h5764c6d_0 227 | - pthread-stubs=0.4=h36c2ea0_1001 228 | - ptyprocess=0.7.0=pyhd3deb0d_0 229 | - pulseaudio=16.1=ha8d29e2_1 230 | - pure_eval=0.2.2=pyhd8ed1ab_0 231 | - pycparser=2.21=pyhd8ed1ab_0 232 | - pygments=2.14.0=pyhd8ed1ab_0 233 | - pyopenssl=23.0.0=pyhd8ed1ab_0 234 | - pyparsing=3.0.9=pyhd8ed1ab_0 235 | - pyqt=5.15.7=py310hab646b1_3 236 | - 
pyqt5-sip=12.11.0=py310heca2aa9_3 237 | - pyrsistent=0.19.3=py310h1fa729e_0 238 | - pysocks=1.7.1=pyha2e5f31_6 239 | - python=3.10.9=he550d4f_0_cpython 240 | - python-dateutil=2.8.2=pyhd8ed1ab_0 241 | - python-fastjsonschema=2.16.2=pyhd8ed1ab_0 242 | - python-json-logger=2.0.6=pyhd8ed1ab_0 243 | - python_abi=3.10=3_cp310 244 | - pytz=2022.7.1=pyhd8ed1ab_0 245 | - pyyaml=6.0=py310h5764c6d_5 246 | - pyzmq=25.0.0=py310h059b190_0 247 | - qt-main=5.15.8=h5d23da1_6 248 | - rdma-core=28.9=h59595ed_1 249 | - re2=2023.03.02=h8c504da_0 250 | - readline=8.1.2=h0f457ee_0 251 | - requests=2.28.2=pyhd8ed1ab_0 252 | - rfc3339-validator=0.1.4=pyhd8ed1ab_0 253 | - rfc3986-validator=0.1.1=pyh9f0ad1d_0 254 | - s2n=1.3.46=h06160fa_0 255 | - scikit-learn=1.2.1=py310h209a8ca_0 256 | - seaborn=0.12.2=hd8ed1ab_0 257 | - seaborn-base=0.12.2=pyhd8ed1ab_0 258 | - send2trash=1.8.0=pyhd8ed1ab_0 259 | - setuptools=67.3.2=pyhd8ed1ab_0 260 | - sip=6.7.7=py310heca2aa9_0 261 | - six=1.16.0=pyh6c4a22f_0 262 | - snappy=1.1.10=h9fff704_0 263 | - sniffio=1.3.0=pyhd8ed1ab_0 264 | - sortedcontainers=2.4.0=pyhd8ed1ab_0 265 | - soupsieve=2.3.2.post1=pyhd8ed1ab_0 266 | - stack_data=0.6.2=pyhd8ed1ab_0 267 | - statsmodels=0.13.5=py310hde88566_2 268 | - tblib=1.7.0=pyhd8ed1ab_0 269 | - terminado=0.17.1=pyh41d4057_0 270 | - threadpoolctl=3.1.0=pyh8a188c0_0 271 | - tinycss2=1.2.1=pyhd8ed1ab_0 272 | - tk=8.6.12=h27826a3_0 273 | - toml=0.10.2=pyhd8ed1ab_0 274 | - tomli=2.0.1=pyhd8ed1ab_0 275 | - toolz=0.12.0=pyhd8ed1ab_0 276 | - tornado=6.2=py310h5764c6d_1 277 | - traitlets=5.9.0=pyhd8ed1ab_0 278 | - typing-extensions=4.4.0=hd8ed1ab_0 279 | - typing_extensions=4.4.0=pyha770c72_0 280 | - tzdata=2022g=h191b570_0 281 | - ucx=1.14.1=hf587318_2 282 | - unicodedata2=15.0.0=py310h5764c6d_0 283 | - urllib3=1.26.14=pyhd8ed1ab_0 284 | - wcwidth=0.2.6=pyhd8ed1ab_0 285 | - webencodings=0.5.1=py_1 286 | - websocket-client=1.5.1=pyhd8ed1ab_0 287 | - wheel=0.38.4=pyhd8ed1ab_0 288 | - xcb-util=0.4.0=h166bdaf_0 289 | - 
xcb-util-image=0.4.0=h166bdaf_0 290 | - xcb-util-keysyms=0.4.0=h166bdaf_0 291 | - xcb-util-renderutil=0.3.9=h166bdaf_0 292 | - xcb-util-wm=0.4.1=h166bdaf_0 293 | - xorg-kbproto=1.0.7=h7f98852_1002 294 | - xorg-libice=1.0.10=h7f98852_0 295 | - xorg-libsm=1.2.3=hd9c2040_1000 296 | - xorg-libx11=1.7.2=h7f98852_0 297 | - xorg-libxau=1.0.9=h7f98852_0 298 | - xorg-libxdmcp=1.1.3=h7f98852_0 299 | - xorg-libxext=1.3.4=h7f98852_1 300 | - xorg-libxrender=0.9.10=h7f98852_1003 301 | - xorg-renderproto=0.11.1=h7f98852_1002 302 | - xorg-xextproto=7.3.0=h7f98852_1002 303 | - xorg-xproto=7.0.31=h7f98852_1007 304 | - xyzservices=2023.7.0=pyhd8ed1ab_0 305 | - xz=5.2.6=h166bdaf_0 306 | - y-py=0.5.5=py310h4426083_2 307 | - yaml=0.2.5=h7f98852_2 308 | - ypy-websocket=0.8.2=pyhd8ed1ab_0 309 | - zeromq=4.3.4=h9c3ff4c_1 310 | - zict=3.0.0=pyhd8ed1ab_0 311 | - zipp=3.14.0=pyhd8ed1ab_0 312 | - zlib=1.2.13=h166bdaf_4 313 | - zstd=1.5.2=h3eb15da_6 314 | - pip: 315 | - aiohttp==3.8.4 316 | - aiosignal==1.3.1 317 | - alembic==1.10.4 318 | - altair==5.0.1 319 | - async-timeout==4.0.2 320 | - click==8.1.3 321 | - cmaes==0.9.1 322 | - colorlog==6.7.0 323 | - fastapi==0.95.2 324 | - featuretools==1.26.0 325 | - featuretools-sklearn-transformer==1.0.0 326 | - ffmpy==0.3.0 327 | - filelock==3.12.0 328 | - flaml==1.2.3 329 | - frozenlist==1.3.3 330 | - fsspec==2023.5.0 331 | - gradio==3.32.0 332 | - gradio-client==0.2.5 333 | - greenlet==2.0.2 334 | - h11==0.14.0 335 | - holidays==0.25 336 | - htmlmin==0.1.12 337 | - httpcore==0.17.2 338 | - httpx==0.24.1 339 | - huggingface-hub==0.14.1 340 | - imagehash==4.3.1 341 | - imbalanced-learn==0.10.1 342 | - imblearn==0.0 343 | - ipywidgets==8.0.6 344 | - jupyterlab-widgets==3.0.7 345 | - kaleido==0.2.1 346 | - korean-lunar-calendar==0.3.1 347 | - liac-arff==2.5.0 348 | - linkify-it-py==2.0.2 349 | - llvmlite==0.40.1rc1 350 | - mako==1.2.4 351 | - markdown-it-py==2.2.0 352 | - matplotlib==3.6.3 353 | - mdit-py-plugins==0.3.3 354 | - mdurl==0.1.2 355 | - 
minio==7.1.15 356 | - multidict==6.0.4 357 | - multimethod==1.9.1 358 | - networkx==3.1 359 | - numba==0.57.0 360 | - numpy==1.23.5 361 | - openml==0.13.1 362 | - optuna==3.1.1 363 | - orjson==3.8.14 364 | - pandas-profiling==3.6.6 365 | - phik==0.12.3 366 | - plotly==5.15.0 367 | - pyarrow==12.0.0 368 | - pydantic==1.10.7 369 | - pydub==0.25.1 370 | - python-multipart==0.0.6 371 | - pywavelets==1.4.1 372 | - scipy==1.9.3 373 | - semantic-version==2.10.0 374 | - shap==0.41.0 375 | - slicer==0.0.7 376 | - sqlalchemy==2.0.13 377 | - starlette==0.27.0 378 | - tangled-up-in-unicode==0.2.0 379 | - tenacity==8.2.2 380 | - tqdm==4.64.1 381 | - typeguard==2.13.3 382 | - uc-micro-py==1.0.2 383 | - uvicorn==0.22.0 384 | - visions==0.7.5 385 | - websockets==11.0.3 386 | - widgetsnbextension==4.0.7 387 | - woodwork==0.23.0 388 | - xgboost==1.7.4 389 | - xmltodict==0.13.0 390 | - yarl==1.9.2 391 | - ydata-profiling==4.1.2 392 | --------------------------------------------------------------------------------