├── machine_learning_with_r_4th_ed ├── img │ ├── cat.jpg │ ├── pizza.jpg │ ├── ice_cream.jpg │ └── youcanbeapirate-wb-sparkline.jpg ├── data │ ├── credit.sqlite3 │ ├── titanic_train.xlsx │ ├── pt_data.csv │ ├── challenger.csv │ └── usedcars.csv ├── chapter_15_making_use_of_big_data.Rmd ├── chapter_12_advanced_data_preparation.Rmd ├── chapter_14_building_better_learners.Rmd ├── chapter_10_evaluating_model_performance.Rmd ├── chapter_11_being_succesful_with_machine_learning.Rmd ├── chapter_13_challenging_data_too_much_too_little_too_complex.Rmd ├── Chapter_11.r ├── chapter_08_finding_patterns_market_basket_analysis_using_association_rules.Rmd ├── Chapter_10.r ├── Chapter_12.r ├── chapter_09_finding_groups_of_data_clustering_with_k-means.Rmd ├── chapter_03_lazy_learning_classification_using_nearest_neighbors.Rmd ├── Chapter_14.r ├── Chapter_15.r ├── Chapter_13.r ├── chapter_02_managing_and_understanding_data.Rmd └── chapter_07_black-box_methods_neural_networks_and_support_vector_machines.Rmd ├── machine_learning_with_r_3rd_ed ├── naive_bayes │ ├── images │ │ ├── crosstable.jpeg │ │ ├── wordcloud2_ham.jpeg │ │ └── wordcloud2_spam.jpeg │ ├── readme.MD │ └── filtering-mobile-phone-spam-with-naive-bayes.r ├── regression_methods │ ├── images │ │ ├── fancyRpartPlot_wine.jpg │ │ └── fancyRpartPlot_wine.pdf │ ├── readme.MD │ ├── estimating_wine_quality_with_regression_trees_and_model_trees.r │ └── predicting_medical_expenses_with_linear_regression.r ├── association_rules │ ├── images │ │ ├── association_rules_visualized.jpg │ │ └── association_rules_visualized.pdf │ ├── readme.MD │ └── identifying_frequently_purchased_groceries_with_association_rules.r ├── k_means_clustering │ ├── images │ │ ├── total_within_sum_of_squares.jpg │ │ └── total_within_sum_of_squares.pdf │ ├── readme.MD │ └── finding_teen_market_segments_using_k_means_clustering.r ├── hyperparameter_optimization │ ├── images │ │ ├── hyperparameter_optimization_rpart.jpg │ │ └── hyperparameter_optimization_rpart.pdf │ ├── readme.MD │ ├── hyperparameter_optimization_with_tidymodels.R │ └── hyperparameter_optimization_with_tidymodels_random.R ├── README.md ├── knn │ ├── README.md │ ├── data │ │ └── wdbc-names.txt │ └── classifying-cancer-samples-with-knn.r ├── support_vector_machines │ ├── readme.MD │ └── performing_optical_character_recognition_with_support_vector_machines_.r ├── neural_networks │ ├── readme.MD │ └── modeling_the_strength_of_concrete_with_neural_networks.r ├── decision_trees_and_rules │ ├── readme.MD │ ├── identifying_poisonous_mushrooms_with_rule_learners.r │ └── identifying_risky_bank_loans_using_C5.0_decision_trees.r ├── random_forest │ └── random_forest_with_tidymodels.R └── boosting │ └── boosting_with_tidymodels.R ├── learning_machine_learning.Rproj ├── .gitignore ├── LICENSE └── README.md /machine_learning_with_r_4th_ed/img/cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_4th_ed/img/cat.jpg -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/img/pizza.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_4th_ed/img/pizza.jpg -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/img/ice_cream.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_4th_ed/img/ice_cream.jpg -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/data/credit.sqlite3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_4th_ed/data/credit.sqlite3 -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/data/titanic_train.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_4th_ed/data/titanic_train.xlsx -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/naive_bayes/images/crosstable.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/naive_bayes/images/crosstable.jpeg -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/data/pt_data.csv: -------------------------------------------------------------------------------- 1 | subject_name,temperature,flu_status,gender,blood_type 2 | John Doe,98.1,FALSE,MALE,O 3 | Jane Doe,98.6,FALSE,FEMALE,AB 4 | Steve Graves,101.4,TRUE,MALE,A 5 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/img/youcanbeapirate-wb-sparkline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_4th_ed/img/youcanbeapirate-wb-sparkline.jpg -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/naive_bayes/images/wordcloud2_ham.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/naive_bayes/images/wordcloud2_ham.jpeg -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/naive_bayes/images/wordcloud2_spam.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/naive_bayes/images/wordcloud2_spam.jpeg -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/regression_methods/images/fancyRpartPlot_wine.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/regression_methods/images/fancyRpartPlot_wine.jpg -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/regression_methods/images/fancyRpartPlot_wine.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/regression_methods/images/fancyRpartPlot_wine.pdf -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/association_rules/images/association_rules_visualized.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/association_rules/images/association_rules_visualized.jpg -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/association_rules/images/association_rules_visualized.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/association_rules/images/association_rules_visualized.pdf -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/k_means_clustering/images/total_within_sum_of_squares.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/k_means_clustering/images/total_within_sum_of_squares.jpg -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/k_means_clustering/images/total_within_sum_of_squares.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/k_means_clustering/images/total_within_sum_of_squares.pdf -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/hyperparameter_optimization/images/hyperparameter_optimization_rpart.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/hyperparameter_optimization/images/hyperparameter_optimization_rpart.jpg -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/hyperparameter_optimization/images/hyperparameter_optimization_rpart.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/hyperparameter_optimization/images/hyperparameter_optimization_rpart.pdf -------------------------------------------------------------------------------- /learning_machine_learning.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/data/challenger.csv: -------------------------------------------------------------------------------- 1 | distress_ct,temperature,field_check_pressure,flight_num 0,66,50,1 1,70,50,2 0,69,50,3 0,68,50,4 0,67,50,5 0,72,50,6 0,73,100,7 0,70,100,8 1,57,200,9 
1,63,200,10 1,70,200,11 0,78,200,12 0,67,200,13 2,53,200,14 0,67,200,15 0,75,200,16 0,70,200,17 0,81,200,18 0,76,200,19 0,79,200,20 2,75,200,21 0,76,200,22 1,58,200,23 -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_15_making_use_of_big_data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 15: Making Use of Big Data" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_12_advanced_data_preparation.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 12: Advanced Data Preparation" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_14_building_better_learners.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 14: Building Better Learners" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_10_evaluating_model_performance.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 10: Evaluating Model Performance" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_11_being_succesful_with_machine_learning.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 11: Being Succesful with Machine Learning" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_13_challenging_data_too_much_too_little_too_complex.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 13: Challenging Data - Too Much, Too Little, Too Complex" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | 
knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/README.md: -------------------------------------------------------------------------------- 1 | # Learning Machine Learning 2 | 3 | I'm learning ML with R. Focusing on (but not limiting myself to) tidyverse and tidymodels. 4 | 5 | ## Algorithms tackled (see the named folder(s) for examples): 6 | * K-nearest neighbors (k-NN) 7 | * Naive Bayes __UPDATED__ 8 | * Decision Trees and Rules __UPDATED__ 9 | * Regression Methods 10 | * Neural Networks 11 | * Support Vector Machines 12 | * Association Rules 13 | * Clustering 14 | 15 | ## Coming up next: 16 | * Hyperparameter Optimization 17 | * Bagging 18 | * Boosting 19 | * Random Forest -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # User-specific files 9 | .Ruserdata 10 | 11 | # Example code in package build process 12 | *-Ex.R 13 | 14 | # Output files from R CMD build 15 | /*.tar.gz 16 | 17 | # Output files from R CMD check 18 | /*.Rcheck/ 19 | 20 | # RStudio files 21 | .Rproj.user/ 22 | 23 | # produced vignettes 24 | vignettes/*.html 25 | vignettes/*.pdf 26 | 27 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 28 | .httr-oauth 29 | 30 | # knitr and R markdown default cache directories 31 | *_cache/ 32 | /cache/ 33 | 34 | # Temporary files created by R markdown 35 | *.utf8.md 36 | *.knit.md 37 | 38 | # R Environment Variables 39 | .Renviron 40 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/knn/README.md: -------------------------------------------------------------------------------- 1 | # Classifying Cancer Samples with K-Nearest Neighbors 2 | 3 | Inspired by Brett Lantz's Machine Learning with R, Chapter 3: Lazy Learning - Classification Using Nearest Neighbors. 4 | 5 | The original code is made with a lot of base R, {class} and {gmodels}. I wanted to see how one could recreate it using mainly {tidymodels} and {tidyverse}. 6 | 7 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864 8 | 9 | You can find the original code here: https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter03 10 | 11 | Dataset Citation: 12 | Wolberg, William, Street, W. & Mangasarian, Olvi. (1995). Breast Cancer Wisconsin (Diagnostic). UCI Machine Learning Repository. -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/support_vector_machines/readme.MD: -------------------------------------------------------------------------------- 1 | # Performing Optical Character Recognition with Support Vector Machines 2 | 3 | Inspired by Brett Lantz's Machine Learning with R, Chapter 7: 4 | Black Box Methods - Neural Networks and Support Vector Machines. 5 | 6 | The original code is made with {kernlab}. I wanted to see how one could recreate it using mainly {tidymodels} and {tidyverse}. 
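As a rough sketch of what that looks like (an editorial illustration, not part of the repo's script; `letters_train` below is a hypothetical training split of the book's letter-recognition data):

```r
library(tidymodels)

# declare a kernlab-backed SVM through the parsnip interface
svm_spec <- svm_rbf(mode = "classification", cost = 1) %>%
    set_engine("kernlab")

# fitting would then look something like this (letters_train is hypothetical):
# svm_fit <- svm_spec %>% fit(letter ~ ., data = letters_train)
```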
7 | 8 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864 9 | 10 | You can find the original code and the dataset here: 11 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter07 12 | 13 | Dataset Citation: 14 | P. W. Frey and D. J. Slate. "Letter Recognition Using Holland-style Adaptive Classifiers". (Machine Learning Vol 6 #2 March 91) -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/hyperparameter_optimization/readme.MD: -------------------------------------------------------------------------------- 1 | # Hyperparameter Optimization with tidymodels 2 | 3 | Inspired by Brett Lantz's Machine Learning with R, Chapter 11: 4 | Improving Model Performance. 5 | 6 | The original code is made with {caret}. I wanted to see how one could recreate it using mainly {tidymodels} and {tidyverse}. 7 | 8 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864 9 | 10 | You can find the original code and the dataset here: 11 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter11 12 | 13 | Dataset Citation: 14 | 15 | Professor Dr. Hans Hofmann
16 | Institut für Statistik und Ökonometrie
17 | Universität Hamburg
18 | FB Wirtschaftswissenschaften
19 | Von-Melle-Park 5
20 | 2000 Hamburg 13 -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/neural_networks/readme.MD: -------------------------------------------------------------------------------- 1 | # Modeling the Strength of Concrete with Neural Networks 2 | 3 | Inspired by Brett Lantz's Machine Learning with R, Chapter 7: Black Box Methods - Neural Networks and Support Vector Machines. 4 | 5 | The original code is made with {neuralnet}. I wanted to see how one could recreate it using mainly {tidymodels} and {tidyverse}. 6 | 7 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864 8 | 9 | You can find the original code and the dataset here: 10 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter07 11 | 12 | Dataset Citation: 13 | I-Cheng Yeh, "Modeling of strength of high performance concrete using artificial neural networks," Cement and Concrete Research, Vol. 28, No. 12, pp. 1797-1808 (1998). -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Antti Rask 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/k_means_clustering/readme.MD: -------------------------------------------------------------------------------- 1 | # Finding Teen Market Segments Using k-means Clustering 2 | 3 | Inspired by Brett Lantz's Machine Learning with R, Chapter 9: Finding Groups of Data - Clustering with k-means. 4 | 5 | The original code is made with base R. While {tidymodels} doesn't have a k-means engine, I wanted to see how one could still recreate the code using {tidyverse} as much as possible and even using the {recipes} package from the {tidymodels} to do all the pre-processing needed and {broom} package from the same {tidymodels} family to help look at the model metrics in a more tidy way. 
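As a rough, self-contained sketch of that {recipes} + {broom} flow (an editorial illustration rather than part of the script; it uses the built-in mtcars data as a stand-in for the teen interest columns):

```r
library(tidymodels)

# stand-in data; the actual script works on the teens dataset
cars_tbl <- as_tibble(mtcars)

# pre-processing with {recipes}: normalize every numeric column
recipe_obj <- recipe(~ ., data = cars_tbl) %>%
    step_normalize(all_numeric_predictors())

cars_normalized_tbl <- recipe_obj %>%
    prep() %>%
    bake(new_data = NULL)

# cluster with base R kmeans() and inspect the fit with {broom}
set.seed(123)
kmeans_fit <- kmeans(cars_normalized_tbl, centers = 3)

tidy(kmeans_fit)                          # one row per cluster center
glance(kmeans_fit)                        # model-level stats, e.g. tot.withinss
augment(kmeans_fit, cars_normalized_tbl)  # each row with its assigned cluster
```

The same pattern carries over to the teens data: swap the stand-in tibble for the real one and choose the number of centers with the help of a total within-cluster sum of squares plot.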
6 | 7 | I was also inspired by this blog post from the tidymodels website to continue the exercise a bit further than the original code did: 8 | https://www.tidymodels.org/learn/statistics/k-means/ 9 | 10 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864 11 | 12 | You can find the original code and the dataset here: 13 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter09 14 | 15 | Dataset Citation: 16 | Brett Lantz. "Finding Groups of Data - Clustering with k-means". (Machine Learning with R) -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/regression_methods/readme.MD: -------------------------------------------------------------------------------- 1 | # Predicting Medical Expenses with Linear Regression / 2 | # Estimating Wine Quality with Regression Trees and Model Trees 3 | 4 | Inspired by Brett Lantz's Machine Learning with R, Chapter 6: 5 | Forecasting Numeric Data - Regression Methods. 6 | 7 | The original code is made with {psych}, {rpart}, {rpart.plot} and {Cubist}. I wanted to see how one could recreate it using mainly {tidymodels}, {tidyverse} and {rules}. 8 | 9 | If you haven't already, buy the book:
https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864 10 | 11 | You can find the original code and the datasets here:
12 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter06 13 | 14 | Dataset Citation(s):
15 | 16 | __Predicting Medical Expenses with Linear Regression__
17 | Brett Lantz: "These data were created for this book using demographic statistics from the U.S. Census Bureau" 18 | 19 | __Estimating Wine Quality with Regression Trees and Model Trees__
20 | P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
21 | Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![](machine_learning_with_r_4th_ed/img/youcanbeapirate-wb-sparkline.jpg) 2 | 3 | # Learning Machine Learning 4 | 5 | I'm learning machine learning (ML) by reading the book __Machine Learning with R__ by __Brett Lantz__. I've read the 3rd edition (published in 2019) and am currently reading the 4th edition. I will update the repo along the way. 6 | 7 | While the book is excellent at explaining the different algorithms, it still doesn't use tidyverse or tidymodels, two frameworks that make data science related tasks more user friendly, mainly because of their unified syntax. Obviously I'm not limiting myself to only them, but the whole point of this repo is to offer an alternative to base R and the separate packages used in the book and in the [official GitHub repo](https://github.com/PacktPublishing/Machine-Learning-with-R-Fourth-Edition). 8 | 9 | ## Disclaimer! 10 | This repo is not meant to replace the book in any way. You should definitely read the book. It will help you understand ML in general (and specifically with R) much better than looking at the code alone. 11 | 12 | Also, I would recommend you buy the book. Here's a direct [link](https://www.packtpub.com/product/machine-learning-with-r-fourth-edition/9781801071321) to __Packt__'s (the publisher) website. -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/association_rules/readme.MD: -------------------------------------------------------------------------------- 1 | # Identifying Frequently Purchased Groceries with Association Rules 2 | 3 | Inspired by Brett Lantz's Machine Learning with R, Chapter 8: Finding Patterns - Market Basket Analysis Using Association Rules 4 | 5 | The original code is made with {arules}. Now, this is an algorithm that isn't found in {tidymodels}. I still wanted to recreate it using {tidyverse} as much as possible. Helps with the readability if nothing else! 6 | 7 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864 8 | 9 | You can find the original code and the dataset here: 10 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter08 11 | 12 | Dataset Citation: 13 | 14 | Author:
15 | Michael Hahsler
16 |
17 | Source:
18 | The data set is provided for {arules} by Michael Hahsler, Kurt Hornik and Thomas Reutterer.
19 |
20 | References:
21 | Michael Hahsler, Kurt Hornik, and Thomas Reutterer (2006) Implications of probabilistic data modeling for mining association rules. In M. Spiliopoulou, R. Kruse, C. Borgelt, A. Nuernberger, and W. Gaul, editors, From Data and Information Analysis to Knowledge Engineering, Studies in Classification, Data Analysis, and Knowledge Organization, pages 598–605. Springer-Verlag. -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/decision_trees_and_rules/readme.MD: -------------------------------------------------------------------------------- 1 | # Identifying Risky Bank Loans Using C5.0 Decision Trees / 2 | # Identifying Poisonous Mushrooms with Rule Learners 3 | 4 | Inspired by Brett Lantz's Machine Learning with R,
5 | Chapter 5: Divide and Conquer - Classification Using Decision Trees and Rules and
6 | Chapter 10: Evaluating Model Performance. 7 | 8 | The original code is made with {C50}, {gmodels}, {OneR} and {RWeka}. I 9 | wanted to see how one could recreate it using mainly {tidymodels} and 10 | {tidyverse}. 11 | 12 | If you haven't already, buy the book:
https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864 13 | 14 | You can find the original code and the slightly modified datasets here:
15 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter05
16 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter10 17 | 18 | Dataset Citation(s): 19 | 20 | __Identifying Risky Bank Loans Using C5.0 Decision Trees__
21 | Professor Dr. Hans Hofmann
22 | Institut für Statistik und Ökonometrie
23 | Universität Hamburg
24 | FB Wirtschaftswissenschaften
25 | Von-Melle-Park 5
26 | 2000 Hamburg 13 27 | 28 | __Identifying Poisonous Mushrooms with Rule Learners__
29 | Origin:
30 | Mushroom records drawn from The Audubon Society Field Guide to North American Mushrooms (1981). G. H. Lincoff (Pres.), New York: Alfred A. Knopf
31 |
32 | Donor:
33 | Jeff Schlimmer (Jeffrey.Schlimmer '@' a.gp.cs.cmu.edu) 34 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/naive_bayes/readme.MD: -------------------------------------------------------------------------------- 1 | # Filtering Mobile Phone Spam with Naive Bayes 2 | 3 | Inspired by Brett Lantz's Machine Learning with R,
4 | Chapter 4: Probabilistic Learning - Classification Using Naive Bayes and
5 | Chapter 10: Evaluating Model Performance. 6 | 7 | The original code is made with a lot of base R, {e1071} and {gmodels}. I wanted to see how one could recreate it using mainly {textrecipes}, {tidymodels}, {tidytext} and {tidyverse}. 8 | 9 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864 10 | 11 | You can find the original code and the slightly modified dataset here:
12 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter04
13 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter10 14 | 15 | Dataset Citation: 16 | [1] Gómez Hidalgo, J.M., Cajigas Bringas, G., Puertas Sãnz, E., Carrero García, F. Content Based SMS Spam Filtering. Proceedings of the 2006 ACM Symposium on Document Engineering (ACM DOCENG'06), Amsterdam, The Netherlands, 10-13, 2006. 17 | 18 | [2] Cormack, G. V., Gómez Hidalgo, J. M., and Puertas Sãnz, E. Feature engineering for mobile (SMS) spam filtering. Proceedings of the 30th Annual international ACM Conference on Research and Development in information Retrieval (ACM SIGIR'07), New York, NY, 871-872, 2007. 19 | 20 | [3] Cormack, G. V., Gómez Hidalgo, J. M., and Puertas Sãnz, E. Spam filtering for short messages. Proceedings of the 16th ACM Conference on Information and Knowledge Management (ACM CIKM'07). Lisbon, Portugal, 313-320, 2007. 21 | 22 | Also, there were parts that were inspired by: 23 | 24 | A blog post by Julia Silge: 25 | https://www.tidyverse.org/blog/2020/11/tidymodels-sparse-support/ 26 | 27 | Julia Silge & David Robinson's book Text Mining with R: A Tidy Approach: https://www.tidytextmining.com/ 28 | 29 | A blog post by Céline Van den Rul: 30 | https://towardsdatascience.com/create-a-word-cloud-with-r-bde3e7422e8a -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/Chapter_11.r: -------------------------------------------------------------------------------- 1 | ##### Chapter 11: Being Successful with Machine Learning -------------------- 2 | 3 | ## Example: Using ggplot2 for visual data exploration ---- 4 | 5 | # read the titanic training dataset 6 | titanic_train <- read.csv("titanic_train.csv") 7 | 8 | # examine the features 9 | str(titanic_train) 10 | 11 | # load ggplot2 package and apply to Titanic dataset 12 | library(ggplot2) 13 | p <- ggplot(data = titanic_train) 14 | p # creates an empty gray plot 15 | 16 | # compare built-in boxplot to ggplot2 boxplot 17 | boxplot(titanic_train$Age) # use R's built-in boxplot() 18 | p + geom_boxplot(mapping = aes(y = Age)) # use ggplot2 boxplot 19 | 20 | # boxplot examining relationship between age and survival 21 | p + geom_boxplot(mapping = aes(x = Age, y = as.factor(Survived))) 22 | 23 | # compare built-in histogram to ggplot2 histogram 24 | hist(titanic_train$Age) # use R's built-in hist() 25 | p + geom_histogram(aes(x = Age)) # use ggplot2 histogram 26 | 27 | # overlapping histograms 28 | p + geom_histogram(aes(x = Age, fill = as.factor(Survived))) + 29 | ggtitle("Distribution of Age by Titanic Survival Status") 30 | 31 | # side-by-side histograms 32 | p + geom_histogram(aes(x = Age)) + 33 | facet_grid(cols = vars(Survived)) + 34 | ggtitle("Distribution of Age by Titanic Survival Status") 35 | 36 | # overlapping density plots 37 | p + geom_density(aes(x = Age, 38 | color = as.factor(Survived), 39 | fill = as.factor(Survived)), 40 | alpha = 0.25) + 41 | ggtitle("Density of Age by Titanic Survival Status") 42 | 43 | # bar chart of passenger counts by sex 44 | p + geom_bar(aes(x = Sex)) + 45 | ggtitle("Titanic Passenger Counts by Gender") 46 | 47 | # bar chart of survival probability by sex 48 | p + geom_bar(aes(x = Sex, y = Survived), 49 | stat = "summary", fun = "mean") + 50 | ggtitle("Titanic Survival Rate by Gender") 51 | 52 | # bar chart of survival probability by passenger class 53 | p + geom_bar(aes(x = Pclass, y = Survived), 54 | stat = "summary", fun = "mean") + 55 | ggtitle("Titanic Survival Rate by Passenger Class") 56 
| 57 | # stacked bar chart of survival by passenger class 58 | p + geom_bar(aes(x = Pclass, 59 | fill = factor(Survived, 60 | labels = c("No", "Yes")))) + 61 | labs(fill = "Survived") + 62 | ylab("Number of Passengers") + 63 | ggtitle("Titanic Survival Counts by Passenger Class") 64 | 65 | # survivival status by passenger class 66 | p + geom_bar(aes(x = Pclass, 67 | fill = factor(Survived, 68 | labels = c("No", "Yes"))), 69 | position = "fill") + 70 | labs(fill = "Survived") + 71 | ylab("Proportion of Passengers") + 72 | ggtitle("Titanic Survival by Passenger Class") 73 | 74 | # bar chart of interaction between class and sex 75 | p + geom_bar(aes(x = Pclass, y = Survived, fill = Sex), 76 | position = "dodge", stat = "summary", fun = "mean") + 77 | ylab("Survival Proportion") + 78 | ggtitle("Titanic Survival Rate by Class and Sex") 79 | 80 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/association_rules/identifying_frequently_purchased_groceries_with_association_rules.r: -------------------------------------------------------------------------------- 1 | # Identifying Frequently Purchased Groceries with Association Rules ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 8: 4 | # Finding Patterns - Market Basket Analysis Using Association Rules 5 | # 6 | # The original code is made with {arules}. Now, this is an algorithm that isn't 7 | # found in {tidymodels}. I still wanted to recreate it using {tidyverse} as much 8 | # as possible. Helps with the readability if nothing else! 9 | # 10 | # You can find the original code and the dataset here: 11 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter08 12 | 13 | ## 1. Loading libraries ---- 14 | library(conflicted) 15 | library(tidyverse) 16 | library(arules) 17 | library(arulesViz) 18 | 19 | 20 | ## 2. Exploring and preparing the data ---- 21 | 22 | ### Load the grocery data into a sparse matrix ---- 23 | groceries <- read.transactions( 24 | "machine_learning_with_r_3rd_ed/association_rules/data/groceries.csv", 25 | sep = "," 26 | ) 27 | summary(groceries) 28 | 29 | ### Look at the first five transactions ---- 30 | groceries %>% 31 | head(5) %>% 32 | inspect() 33 | 34 | ### Examine the frequency of items ---- 35 | groceries[, 1:3] %>% 36 | itemFrequency() 37 | 38 | ### Plot the frequency of items ---- 39 | par(mar = c(1, 1, 1, 1)) # Have to adjust the margins or there will be an error 40 | 41 | groceries %>% 42 | itemFrequencyPlot(support = 0.1) 43 | 44 | groceries %>% 45 | itemFrequencyPlot(topN = 10) 46 | 47 | ### A visualization of the sparse matrix for the first five transactions ---- 48 | groceries[1:5] %>% 49 | image() 50 | 51 | ### Visualization of a random sample of 100 transactions ---- 52 | sample(groceries, 100) %>% 53 | image() 54 | 55 | 56 | ## 3. Training a model on the data ---- 57 | 58 | ### Default settings result in zero rules learned ---- 59 | groceries %>% 60 | apriori() 61 | 62 | ### Set better support and confidence levels to learn more rules ---- 63 | groceryrules <- apriori( 64 | groceries, 65 | parameter = list( 66 | support = 0.006, 67 | confidence = 0.25, 68 | minlen = 2 69 | ) 70 | ) 71 | groceryrules 72 | 73 | 74 | ## 4. 
Evaluating model performance ---- 75 | 76 | # Summary of grocery association rules ---- 77 | groceryrules %>% summary() 78 | 79 | # Look at the first three rules ---- 80 | groceryrules[1:3] %>% 81 | inspect() 82 | 83 | # Visualize the rules ---- 84 | groceryrules %>% plot( 85 | method = "graph", 86 | limit = 10 87 | ) 88 | 89 | 90 | ## 5. Improving model performance ---- 91 | 92 | ### Sorting grocery rules by lift ---- 93 | sort(groceryrules, by = "lift")[1:5] %>% 94 | inspect() 95 | 96 | ### Finding subsets of rules containing any berry items ---- 97 | berryrules <- groceryrules %>% 98 | subset(items %in% "berries") 99 | 100 | berryrules %>% 101 | inspect() 102 | 103 | ### Converting the rule set to a tibble ---- 104 | groceryrules_tbl <- groceryrules %>% 105 | as("data.frame") %>% 106 | as_tibble() 107 | 108 | groceryrules_tbl %>% 109 | glimpse() 110 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_08_finding_patterns_market_basket_analysis_using_association_rules.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 8: Finding Patterns - Market Basket Analysis Using Association Rules" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Loading Packages 13 | 14 | ```{r} 15 | library(arules) 16 | library(arulesViz) 17 | library(conflicted) 18 | library(tidyverse) 19 | ``` 20 | 21 | ## Exploring and preparing the data 22 | 23 | ### Load the grocery data into a sparse matrix 24 | 25 | ```{r} 26 | groceries <- read.transactions( 27 | "data/groceries.csv", 28 | sep = "," 29 | ) 30 | ``` 31 | 32 | ```{r} 33 | groceries %>% 34 | summary() 35 | ``` 36 | 37 | ### Look at the first five transactions 38 | 39 | ```{r} 40 | groceries %>% 41 | head(5) %>% 42 | inspect() 43 | ``` 44 | 45 | ### Examine the frequency of items 46 | 47 | ```{r} 48 | groceries[, 1:3] %>% 49 | itemFrequency() 50 | ``` 51 | 52 | ### Plot the frequency of items 53 | 54 | Have to adjust the margins or there will be an error 55 | 56 | ```{r} 57 | par(mar = c(1, 1, 1, 1)) 58 | ``` 59 | 60 | ```{r} 61 | groceries %>% 62 | itemFrequencyPlot(support = 0.1) 63 | ``` 64 | 65 | ```{r} 66 | groceries %>% 67 | itemFrequencyPlot(topN = 20) 68 | ``` 69 | 70 | ### A visualization of the sparse matrix for the first five transactions 71 | 72 | ```{r} 73 | groceries[1:5] %>% 74 | image() 75 | ``` 76 | 77 | ### Visualization of a random sample of 100 transactions 78 | 79 | ```{r} 80 | sample(groceries, 100) %>% 81 | image() 82 | ``` 83 | 84 | ## Training a model on the data 85 | 86 | ### Default settings result in zero rules learned 87 | 88 | ```{r} 89 | groceries %>% 90 | apriori() 91 | ``` 92 | 93 | ### Set better support and confidence levels to learn more rules 94 | 95 | ```{r} 96 | groceryrules <- apriori( 97 | groceries, 98 | parameter = list( 99 | support = 0.006, 100 | confidence = 0.25, 101 | minlen = 2 102 | ) 103 | ) 104 | 105 | groceryrules 106 | ``` 107 | 108 | ## Evaluating model performance 109 | 110 | ### Summary of grocery association rules 111 | 112 | ```{r} 113 | groceryrules %>% 114 | summary() 115 | ``` 116 | 117 | ### Look at the first three rules 118 | 119 | ```{r} 120 | groceryrules[1:3] %>% 121 | inspect() 122 | ``` 123 | 124 | ### Visualize the rules 125 | 126 | ```{r} 
127 | groceryrules %>% plot( 128 | method = "graph", 129 | limit = 10 130 | ) 131 | ``` 132 | 133 | ## Improving model performance 134 | 135 | ### Sorting grocery rules by lift 136 | 137 | ```{r} 138 | sort(groceryrules, by = "lift")[1:5] %>% 139 | inspect() 140 | ``` 141 | 142 | ### Finding subsets of rules containing any berry items 143 | 144 | ```{r} 145 | berryrules <- groceryrules %>% 146 | subset(items %in% "berries") 147 | ``` 148 | 149 | ```{r} 150 | berryrules %>% 151 | inspect() 152 | ``` 153 | 154 | ### Converting the rule set to a tibble 155 | 156 | ```{r} 157 | groceryrules_tbl <- groceryrules %>% 158 | as("data.frame") %>% 159 | as_tibble() 160 | ``` 161 | 162 | ```{r} 163 | groceryrules_tbl 164 | ``` 165 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/random_forest/random_forest_with_tidymodels.R: -------------------------------------------------------------------------------- 1 | # Random Forest with tidymodels ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, 4 | # Chapter 5: Divide and Conquer - Classification Using Decision Trees and Rules and 5 | # Chapter 10: Evaluating Model Performance 6 | # 7 | # The original code is made with {C50}, {gmodels}, {OneR} and {RWeka}. I 8 | # wanted to see how one could recreate it using mainly {tidymodels} and 9 | # {tidyverse}. 10 | # 11 | # You can find the original code and the slightly modified dataset here: 12 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter05 13 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter10 14 | 15 | ## 1. Loading libraries (in the order they get used) ---- 16 | library(conflicted) 17 | library(tidyverse) 18 | library(tidymodels) 19 | 20 | ## 2. Exploring and preparing the data ---- 21 | credit_tbl <- read_csv("decision_trees_and_rules/data/credit.csv") 22 | 23 | ### Examine the structure of the credit data ---- 24 | glimpse(credit_tbl) 25 | 26 | ### look at two characteristics of the applicant ---- 27 | credit_tbl %>% 28 | count(checking_balance) %>% 29 | mutate(pct = (n / sum(n) * 100)) 30 | 31 | credit_tbl %>% 32 | count(savings_balance) %>% 33 | mutate(pct = (n / sum(n) * 100)) 34 | 35 | ### look at two characteristics of the loan ---- 36 | credit_tbl %>% 37 | select(months_loan_duration, amount) %>% 38 | summary() 39 | 40 | ### look at the class variable ---- 41 | credit_tbl %>% 42 | count(default) %>% 43 | mutate(pct = (n / sum(n) * 100)) 44 | 45 | 46 | ## 3. Creating the recipe and splitting the data ---- 47 | 48 | ### Convert strings to factors ---- 49 | recipe_obj <- recipe( 50 | default ~ ., 51 | data = credit_tbl 52 | ) %>% 53 | step_string2factor(all_nominal()) 54 | recipe_obj 55 | 56 | credit_factorized_tbl <- recipe_obj %>% 57 | prep() %>% 58 | bake(new_data = NULL) 59 | credit_factorized_tbl 60 | 61 | ### Create training and test data (randomly) ---- 62 | 63 | # Use set.seed to use the same random number sequence as the original 64 | RNGversion("3.5.2") 65 | set.seed(123) 66 | 67 | credit_split <- initial_split( 68 | credit_factorized_tbl, 69 | prop = 0.9 70 | ) 71 | credit_train <- training(credit_split) 72 | credit_test <- testing(credit_split) 73 | 74 | ### Check the proportion of class variable ---- 75 | credit_train %>% 76 | count(default) %>% 77 | mutate(pct = (n / sum(n) * 100)) 78 | 79 | credit_test %>% 80 | count(default) %>% 81 | mutate(pct = (n / sum(n) * 100)) 82 | 83 | 84 | ## 4. 
Building a random forest ---- 85 | 86 | ### Specify a random forest ---- 87 | spec <- rand_forest(trees = 100) %>% 88 | set_mode("classification") %>% 89 | set_engine("ranger", importance = "impurity") 90 | 91 | ### Train the forest ---- 92 | model <- spec %>% 93 | fit( 94 | default ~ ., 95 | data = credit_train 96 | ) 97 | model 98 | 99 | ### Plot the variable importance ---- 100 | vip::vip(model) 101 | 102 | 103 | ## 5. Predicting ---- 104 | 105 | ### Make the predictions (you could skip this step) ---- 106 | credit_test_pred <- predict( 107 | object = model, 108 | new_data = credit_test, 109 | type = "class" 110 | ) 111 | credit_test_pred 112 | 113 | ### Add the predictions to the test tibble ---- 114 | credit_test_with_pred_tbl <- augment(model, credit_test) 115 | credit_test_with_pred_tbl 116 | 117 | 118 | ## 6. Evaluating model performance ---- 119 | 120 | ### Create a confusion matrix ---- 121 | conf_mat <- conf_mat( 122 | data = credit_test_with_pred_tbl, 123 | truth = default, 124 | estimate = .pred_class 125 | ) 126 | conf_mat 127 | 128 | ### Visualize the confusion matrix ---- 129 | conf_mat %>% autoplot(type = "heatmap") 130 | conf_mat %>% autoplot(type = "mosaic") 131 | 132 | ### Visualize the ROC curve ---- 133 | credit_test_with_pred_tbl %>% 134 | roc_curve( 135 | truth = default, 136 | estimate = .pred_no 137 | ) %>% 138 | autoplot() 139 | 140 | ### Calculate the ROC AUC (area under the curve) ---- 141 | credit_roc_auc <- credit_test_with_pred_tbl %>% 142 | roc_auc( 143 | truth = default, 144 | estimate = .pred_no 145 | ) 146 | credit_roc_auc 147 | 148 | ### Put together other model metrics ---- 149 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 150 | classification_metrics <- conf_mat( 151 | credit_test_with_pred_tbl, 152 | truth = default, 153 | estimate = .pred_class 154 | ) %>% 155 | summary() 156 | classification_metrics 157 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/neural_networks/modeling_the_strength_of_concrete_with_neural_networks.r: -------------------------------------------------------------------------------- 1 | # Modeling the Strength of Concrete with Neural Networks ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 7: 4 | # Black Box Methods - Neural Networks and Support Vector Machines 5 | # 6 | # The original code is made with {neuralnet}. I wanted to see how one could 7 | # recreate it using mainly {tidymodels} and {tidyverse}. 8 | # 9 | # You can find the original code and the dataset here: 10 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter07 11 | 12 | ## 1. Loading libraries (in the order they get used) ---- 13 | library(conflicted) 14 | library(tidyverse) 15 | library(tidymodels) 16 | library(corrr) 17 | 18 | 19 | ## 2. Exploring and preparing the data ---- 20 | 21 | ### Read in data and examine structure ---- 22 | concrete_tbl <- read_csv("machine_learning_with_r_3rd_ed/neural_networks/data/concrete.csv") 23 | 24 | concrete_tbl %>% glimpse() 25 | 26 | ### Check the minimum and maximum strength ---- 27 | concrete_tbl %>% 28 | select(strength) %>% 29 | summary() 30 | 31 | 32 | ## 3. 
Creating the recipe ---- 33 | 34 | ### Apply normalization to the numeric predictors ---- 35 | recipe_obj <- recipe( 36 | strength ~ ., 37 | data = concrete_tbl 38 | ) %>% 39 | step_range( 40 | all_numeric_predictors(), 41 | min = 0, 42 | max = 1 43 | ) 44 | recipe_obj 45 | 46 | concrete_normalized_tbl <- recipe_obj %>% 47 | prep() %>% 48 | bake(new_data = NULL) 49 | concrete_normalized_tbl 50 | 51 | ### Create training and test data ---- 52 | concrete_split <- initial_time_split( 53 | concrete_normalized_tbl, 54 | prop = 773 / 1030 55 | ) 56 | concrete_train <- training(concrete_split) 57 | concrete_test <- testing(concrete_split) 58 | 59 | 60 | ## 3. Training a model on the data ---- 61 | 62 | # nnet is the engine (needs to be installed if not already): 63 | # install.packages("nnet") 64 | 65 | # It is used as the engine for {parsnip}'s mlp() function. 66 | # And since we are predicting strength, we choose regression as the mode. 67 | 68 | ### Create model specification ---- 69 | model_spec <- mlp( 70 | engine = "nnet", 71 | mode = "regression", 72 | hidden_units = 5, 73 | penalty = 0, 74 | epochs = 100 75 | ) %>% 76 | translate() 77 | model_spec 78 | 79 | ### Fit the model ---- 80 | model_fit <- fit( 81 | model_spec, 82 | strength ~ ., 83 | concrete_train 84 | ) 85 | model_fit 86 | 87 | ### Take a closer look at the model ---- 88 | summary(model_fit$fit) 89 | 90 | ### Make the predictions (you could skip this step) ---- 91 | concrete_test_pred <- predict( 92 | model_fit, 93 | new_data = concrete_test, 94 | type = "numeric" 95 | ) 96 | concrete_test_pred 97 | 98 | ### Add the predictions to the test tibble ---- 99 | concrete_test_with_pred <- augment(model_fit, concrete_test) 100 | concrete_test_with_pred 101 | 102 | ### Metrics ---- 103 | concrete_test_with_pred %>% metrics(strength, .pred) 104 | 105 | ### Visualize the network topology ---- 106 | # I have yet to find a working solution to create this visualization. 107 | # If you know how to do it, please let me know! 108 | 109 | 110 | ## 4. Evaluating model performance ---- 111 | 112 | ### Examine the correlation between predicted and actual values ---- 113 | concrete_test_with_pred %>% 114 | select(.pred, strength) %>% 115 | correlate() 116 | 117 | 118 | ## 5. 
Improving model performance with two hidden layers and custom activation function ---- 119 | 120 | ### Create model specification ---- 121 | model_spec_2 <- mlp( 122 | engine = "nnet", 123 | mode = "regression", 124 | hidden_units = 5, 125 | penalty = 0.1, 126 | epochs = 100, 127 | ) %>% 128 | translate() 129 | model_spec_2 130 | 131 | ### Fit the model ---- 132 | model_fit_2 <- fit( 133 | model_spec_2, 134 | strength ~ ., 135 | concrete_train 136 | ) 137 | model_fit_2 138 | 139 | ### Take a closer look at the model ---- 140 | summary(model_fit_2$fit) 141 | 142 | ### Make the predictions (you could skip this step) ---- 143 | concrete_test_pred_2 <- predict( 144 | model_fit_2, 145 | new_data = concrete_test, 146 | type = "numeric" 147 | ) 148 | concrete_test_pred_2 149 | 150 | ### Add the predictions to the test tibble ---- 151 | concrete_test_with_pred_2 <- augment(model_fit_2, concrete_test) 152 | concrete_test_with_pred_2 153 | 154 | ### Metrics ---- 155 | concrete_test_with_pred_2 %>% metrics(strength, .pred) 156 | 157 | ### Examine the correlation between predicted and actual values ---- 158 | concrete_test_with_pred_2 %>% 159 | select(.pred, strength) %>% 160 | correlate() 161 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/decision_trees_and_rules/identifying_poisonous_mushrooms_with_rule_learners.r: -------------------------------------------------------------------------------- 1 | # Identifying Poisonous Mushrooms with Rule Learners ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 5: 4 | # Divide and Conquer - Classification Using Decision Trees and Rules. 5 | # 6 | # The original code is made with {C50}, {gmodels}, {OneR} and {RWeka}. I 7 | # wanted to see how one could recreate it using mainly {tidymodels} and 8 | # {tidyverse}. 9 | # 10 | # You can find the original code and the slightly modified dataset here: 11 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter05 12 | 13 | ## 1. Loading libraries (in the order they get used) ---- 14 | library(conflicted) 15 | library(tidyverse) 16 | library(tidymodels) 17 | library(rules) 18 | 19 | ## 2. Exploring and preparing the data ---- 20 | mushrooms_tbl <- read_csv("machine_learning_with_r_3rd_ed/decision_trees_and_rules/data/mushrooms.csv") 21 | 22 | ### Examine the data ---- 23 | mushrooms_tbl %>% map(unique) 24 | 25 | ### Drop the veil_type feature ---- 26 | mushroom_selected_tbl <- mushrooms_tbl %>% 27 | select(-veil_type) 28 | 29 | ### Examine the class distribution ---- 30 | mushroom_selected_tbl %>% 31 | count(type) %>% 32 | mutate(pct = (n / sum(n) * 100)) 33 | 34 | 35 | ## 3. Creating the recipe ---- 36 | recipe_obj <- recipe( 37 | type ~ ., 38 | data = mushroom_selected_tbl 39 | ) 40 | recipe_obj 41 | 42 | mushroom_baked_tbl <- recipe_obj %>% 43 | prep() %>% 44 | bake(new_data = NULL) 45 | 46 | 47 | ## 4. 
Training a model on the data ---- 48 | 49 | ### Model specification ---- 50 | model_spec <- C5_rules( 51 | mode = "classification", 52 | engine = "C5.0", 53 | trees = NULL, 54 | min_n = NULL 55 | ) %>% 56 | translate() 57 | model_spec 58 | 59 | ### Fit the model ---- 60 | model_fit <- fit( 61 | model_spec, 62 | type ~ ., 63 | mushroom_baked_tbl 64 | ) 65 | model_fit 66 | 67 | model_fit %>% 68 | extract_fit_engine() %>% 69 | summary() 70 | 71 | ### Make the predictions (you could skip this step) ---- 72 | mushroom_pred_tbl <- predict( 73 | object = model_fit, 74 | new_data = mushroom_baked_tbl, 75 | type = "class" 76 | ) 77 | mushroom_pred_tbl 78 | 79 | ### Add the predictions to the test tibble ---- 80 | mushroom_pred_tbl <- augment(model_fit, mushroom_baked_tbl) 81 | mushroom_pred_tbl 82 | 83 | 84 | ## 5. Evaluating model performance ---- 85 | 86 | ### Create a confusion matrix ---- 87 | conf_mat <- conf_mat( 88 | data = mushroom_pred_tbl, 89 | truth = type, 90 | estimate = .pred_class 91 | ) 92 | conf_mat 93 | 94 | ### Visualize the confusion matrix ---- 95 | conf_mat %>% autoplot(type = "heatmap") 96 | conf_mat %>% autoplot(type = "mosaic") 97 | 98 | ### Visualize the ROC curve ---- 99 | mushroom_pred_tbl %>% 100 | roc_curve( 101 | truth = type, 102 | estimate = .pred_edible 103 | ) %>% 104 | autoplot() 105 | 106 | ### Calculate the ROC AUC (area under the curve) ---- 107 | mushroom_roc_auc <- mushroom_pred_tbl %>% 108 | roc_auc( 109 | truth = type, 110 | estimate = .pred_edible 111 | ) 112 | mushroom_roc_auc 113 | 114 | ### Put together other model metrics ---- 115 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 116 | classification_metrics <- conf_mat( 117 | mushroom_pred_tbl, 118 | truth = type, 119 | estimate = .pred_class 120 | ) %>% 121 | summary() 122 | classification_metrics 123 | 124 | 125 | ## 6. Creating a function to help evaluate the model further ---- 126 | 127 | # The assumption here is that you have already gone through steps 1. to 4. 128 | # What we're potentially tuning here are the arguments .trees and .min_n 129 | 130 | classify_with_c5_rules <- function( 131 | .trees = NULL, 132 | .min_n = NULL 133 | ) { 134 | 135 | # Create the recipe 136 | recipe_obj <- recipe( 137 | type ~ ., 138 | data = mushroom_selected_tbl 139 | ) 140 | 141 | mushroom_baked_tbl <- recipe_obj %>% 142 | prep() %>% 143 | bake(new_data = NULL) 144 | 145 | # Model specification 146 | model_spec <- C5_rules( 147 | mode = "classification", 148 | engine = "C5.0", 149 | trees = .trees, 150 | min_n = .min_n 151 | ) %>% 152 | translate() 153 | 154 | # Fit the model 155 | model_fit <- fit( 156 | model_spec, 157 | type ~ ., 158 | mushroom_baked_tbl 159 | ) 160 | 161 | model_fit %>% 162 | extract_fit_engine() %>% 163 | summary() 164 | 165 | # Add the predictions to the test tibble 166 | mushroom_pred_tbl <- augment(model_fit, mushroom_baked_tbl) 167 | mushroom_pred_tbl 168 | 169 | # Create a confusion matrix 170 | conf_mat <- conf_mat( 171 | data = mushroom_pred_tbl, 172 | truth = type, 173 | estimate = .pred_class 174 | ) 175 | 176 | conf_mat %>% autoplot(type = "heatmap") 177 | 178 | } 179 | 180 | ### Test the function ---- 181 | classify_with_c5_rules( 182 | .trees = 3, 183 | .min_n = 1 184 | ) 185 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/knn/data/wdbc-names.txt: -------------------------------------------------------------------------------- 1 | 1. 
Title: Wisconsin Diagnostic Breast Cancer (WDBC) 2 | 3 | 2. Source Information 4 | 5 | a) Creators: 6 | 7 | Dr. William H. Wolberg, General Surgery Dept., University of 8 | Wisconsin, Clinical Sciences Center, Madison, WI 53792 9 | wolberg@eagle.surgery.wisc.edu 10 | 11 | W. Nick Street, Computer Sciences Dept., University of 12 | Wisconsin, 1210 West Dayton St., Madison, WI 53706 13 | street@cs.wisc.edu 608-262-6619 14 | 15 | Olvi L. Mangasarian, Computer Sciences Dept., University of 16 | Wisconsin, 1210 West Dayton St., Madison, WI 53706 17 | olvi@cs.wisc.edu 18 | 19 | b) Donor: Nick Street 20 | 21 | c) Date: November 1995 22 | 23 | 3. Past Usage: 24 | 25 | first usage: 26 | 27 | W.N. Street, W.H. Wolberg and O.L. Mangasarian 28 | Nuclear feature extraction for breast tumor diagnosis. 29 | IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science 30 | and Technology, volume 1905, pages 861-870, San Jose, CA, 1993. 31 | 32 | OR literature: 33 | 34 | O.L. Mangasarian, W.N. Street and W.H. Wolberg. 35 | Breast cancer diagnosis and prognosis via linear programming. 36 | Operations Research, 43(4), pages 570-577, July-August 1995. 37 | 38 | Medical literature: 39 | 40 | W.H. Wolberg, W.N. Street, and O.L. Mangasarian. 41 | Machine learning techniques to diagnose breast cancer from 42 | fine-needle aspirates. 43 | Cancer Letters 77 (1994) 163-171. 44 | 45 | W.H. Wolberg, W.N. Street, and O.L. Mangasarian. 46 | Image analysis and machine learning applied to breast cancer 47 | diagnosis and prognosis. 48 | Analytical and Quantitative Cytology and Histology, Vol. 17 49 | No. 2, pages 77-87, April 1995. 50 | 51 | W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian. 52 | Computerized breast cancer diagnosis and prognosis from fine 53 | needle aspirates. 54 | Archives of Surgery 1995;130:511-516. 55 | 56 | W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian. 57 | Computer-derived nuclear features distinguish malignant from 58 | benign breast cytology. 59 | Human Pathology, 26:792--796, 1995. 60 | 61 | See also: 62 | http://www.cs.wisc.edu/~olvi/uwmp/mpml.html 63 | http://www.cs.wisc.edu/~olvi/uwmp/cancer.html 64 | 65 | Results: 66 | 67 | - predicting field 2, diagnosis: B = benign, M = malignant 68 | - sets are linearly separable using all 30 input features 69 | - best predictive accuracy obtained using one separating plane 70 | in the 3-D space of Worst Area, Worst Smoothness and 71 | Mean Texture. Estimated accuracy 97.5% using repeated 72 | 10-fold crossvalidations. Classifier has correctly 73 | diagnosed 176 consecutive new patients as of November 74 | 1995. 75 | 76 | 4. Relevant information 77 | 78 | Features are computed from a digitized image of a fine needle 79 | aspirate (FNA) of a breast mass. They describe 80 | characteristics of the cell nuclei present in the image. 81 | A few of the images can be found at 82 | http://www.cs.wisc.edu/~street/images/ 83 | 84 | Separating plane described above was obtained using 85 | Multisurface Method-Tree (MSM-T) [K. P. Bennett, "Decision Tree 86 | Construction Via Linear Programming." Proceedings of the 4th 87 | Midwest Artificial Intelligence and Cognitive Science Society, 88 | pp. 97-101, 1992], a classification method which uses linear 89 | programming to construct a decision tree. Relevant features 90 | were selected using an exhaustive search in the space of 1-4 91 | features and 1-3 separating planes. 
92 | 93 | The actual linear program used to obtain the separating plane 94 | in the 3-dimensional space is that described in: 95 | [K. P. Bennett and O. L. Mangasarian: "Robust Linear 96 | Programming Discrimination of Two Linearly Inseparable Sets", 97 | Optimization Methods and Software 1, 1992, 23-34]. 98 | 99 | 100 | This database is also available through the UW CS ftp server: 101 | 102 | ftp ftp.cs.wisc.edu 103 | cd math-prog/cpo-dataset/machine-learn/WDBC/ 104 | 105 | 5. Number of instances: 569 106 | 107 | 6. Number of attributes: 32 (ID, diagnosis, 30 real-valued input features) 108 | 109 | 7. Attribute information 110 | 111 | 1) ID number 112 | 2) Diagnosis (M = malignant, B = benign) 113 | 3-32) 114 | 115 | Ten real-valued features are computed for each cell nucleus: 116 | 117 | a) radius (mean of distances from center to points on the perimeter) 118 | b) texture (standard deviation of gray-scale values) 119 | c) perimeter 120 | d) area 121 | e) smoothness (local variation in radius lengths) 122 | f) compactness (perimeter^2 / area - 1.0) 123 | g) concavity (severity of concave portions of the contour) 124 | h) concave points (number of concave portions of the contour) 125 | i) symmetry 126 | j) fractal dimension ("coastline approximation" - 1) 127 | 128 | Several of the papers listed above contain detailed descriptions of 129 | how these features are computed. 130 | 131 | The mean, standard error, and "worst" or largest (mean of the three 132 | largest values) of these features were computed for each image, 133 | resulting in 30 features. For instance, field 3 is Mean Radius, field 134 | 13 is Radius SE, field 23 is Worst Radius. 135 | 136 | All feature values are recoded with four significant digits. 137 | 138 | 8. Missing attribute values: none 139 | 140 | 9. 
Class distribution: 357 benign, 212 malignant -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/data/usedcars.csv: -------------------------------------------------------------------------------- 1 | year,model,price,mileage,color,transmission 2 | 2011,SEL,21992,7413,Yellow,AUTO 3 | 2011,SEL,20995,10926,Gray,AUTO 4 | 2011,SEL,19995,7351,Silver,AUTO 5 | 2011,SEL,17809,11613,Gray,AUTO 6 | 2012,SE,17500,8367,White,AUTO 7 | 2010,SEL,17495,25125,Silver,AUTO 8 | 2011,SEL,17000,27393,Blue,AUTO 9 | 2010,SEL,16995,21026,Silver,AUTO 10 | 2011,SES,16995,32655,Silver,AUTO 11 | 2010,SES,16995,36116,Silver,AUTO 12 | 2010,SES,16995,40539,Black,AUTO 13 | 2011,SES,16992,9199,Silver,AUTO 14 | 2011,SEL,16950,9388,Green,AUTO 15 | 2010,SES,16950,32058,Red,AUTO 16 | 2011,SE,16000,15367,White,AUTO 17 | 2011,SES,15999,16368,Blue,AUTO 18 | 2010,SEL,15999,19926,Silver,AUTO 19 | 2010,SES,15995,36049,Silver,AUTO 20 | 2011,SEL,15992,11662,Blue,AUTO 21 | 2011,SEL,15992,32069,Silver,AUTO 22 | 2010,SES,15988,16035,Silver,MANUAL 23 | 2010,SEL,15980,39943,White,AUTO 24 | 2011,SE,15899,36685,Silver,AUTO 25 | 2010,SEL,15889,24920,Black,AUTO 26 | 2009,SEL,15688,20019,Blue,AUTO 27 | 2010,SE,15500,29338,Blue,AUTO 28 | 2010,SE,15499,7784,Black,AUTO 29 | 2010,SE,15499,35636,Black,AUTO 30 | 2010,SES,15298,22029,Gray,AUTO 31 | 2009,SEL,14999,33107,Silver,AUTO 32 | 2010,SES,14999,36306,Red,AUTO 33 | 2009,SE,14995,34419,Black,MANUAL 34 | 2011,SE,14992,4867,Black,AUTO 35 | 2011,SEL,14992,18948,Black,AUTO 36 | 2009,SES,14992,24030,Red,AUTO 37 | 2010,SEL,14990,33036,Black,AUTO 38 | 2011,SE,14989,23967,White,AUTO 39 | 2010,SE,14906,37905,Silver,AUTO 40 | 2010,SE,14900,28955,White,AUTO 41 | 2010,SE,14893,11165,White,AUTO 42 | 2010,SES,14761,44813,Black,AUTO 43 | 2010,SES,14699,36469,Gray,AUTO 44 | 2010,SES,14677,22143,Black,MANUAL 45 | 2009,SES,14549,34046,Silver,AUTO 46 | 2010,SE,14499,32703,Red,AUTO 47 | 2010,SES,14495,35894,Silver,AUTO 48 | 2010,SE,14495,38275,Black,AUTO 49 | 2010,SE,14480,24855,Blue,AUTO 50 | 2009,SEL,14477,29501,Gray,MANUAL 51 | 2009,SEL,14355,35394,Red,AUTO 52 | 2010,SE,14299,36447,Black,AUTO 53 | 2010,SES,14275,35318,Black,AUTO 54 | 2010,SES,14000,24929,Silver,AUTO 55 | 2009,SE,13999,23785,Red,AUTO 56 | 2010,SE,13997,15167,Black,MANUAL 57 | 2010,SE,13995,13541,Silver,AUTO 58 | 2010,SE,13995,20278,Black,MANUAL 59 | 2009,SES,13995,46126,Black,AUTO 60 | 2009,SES,13995,53733,Silver,AUTO 61 | 2009,SES,13992,21108,Silver,AUTO 62 | 2010,SE,13992,21721,Green,AUTO 63 | 2010,SES,13992,26716,Gray,MANUAL 64 | 2009,SES,13992,26887,Black,AUTO 65 | 2009,SE,13991,36252,Silver,MANUAL 66 | 2009,SE,13950,9450,Black,AUTO 67 | 2010,SE,13950,31414,Black,AUTO 68 | 2010,SE,13950,37185,Blue,AUTO 69 | 2010,SE,13895,48174,Gray,AUTO 70 | 2009,SE,13888,50533,White,AUTO 71 | 2009,SE,13845,36713,Blue,AUTO 72 | 2009,SES,13799,34888,Black,AUTO 73 | 2009,SES,13742,38380,Black,AUTO 74 | 2010,SEL,13687,35574,Gray,AUTO 75 | 2009,SEL,13663,27528,Silver,AUTO 76 | 2010,SES,13599,33302,Red,AUTO 77 | 2009,SEL,13584,43369,Red,AUTO 78 | 2009,SES,13425,64055,Black,AUTO 79 | 2010,SE,13384,41342,Gray,AUTO 80 | 2010,SE,13383,34503,Black,AUTO 81 | 2010,SE,13350,16573,Blue,AUTO 82 | 2009,SES,12999,32403,Blue,AUTO 83 | 2009,SE,12998,34846,Blue,AUTO 84 | 2007,SE,12997,39665,Red,AUTO 85 | 2010,SE,12995,21325,Black,AUTO 86 | 2010,SE,12995,32743,Black,MANUAL 87 | 2010,SE,12995,40058,White,MANUAL 88 | 2009,SE,12995,42325,Blue,AUTO 89 | 2009,SE,12995,44518,Red,AUTO 90 | 2009,SE,12995,53902,Gray,AUTO 91 | 
2008,SE,12995,127327,Red,AUTO 92 | 2009,SE,12992,27136,Gray,AUTO 93 | 2009,SES,12990,45813,Silver,AUTO 94 | 2009,SE,12988,31538,Gray,AUTO 95 | 2010,SE,12849,29517,Silver,AUTO 96 | 2010,SE,12780,35871,Black,AUTO 97 | 2008,SE,12777,49787,Black,MANUAL 98 | 2008,SES,12704,36323,Blue,AUTO 99 | 2009,SES,12595,39211,Blue,AUTO 100 | 2009,SE,12507,44789,Gray,AUTO 101 | 2008,SE,12500,45996,White,MANUAL 102 | 2009,SE,12500,54988,White,MANUAL 103 | 2009,SE,12280,29288,Red,AUTO 104 | 2009,SE,11999,36124,Blue,AUTO 105 | 2009,SE,11992,32559,Black,MANUAL 106 | 2009,SES,11984,59048,Black,AUTO 107 | 2009,SE,11980,55170,Red,AUTO 108 | 2010,SE,11792,39722,Green,AUTO 109 | 2008,SE,11754,38286,Black,AUTO 110 | 2008,SES,11749,57341,Red,AUTO 111 | 2008,SES,11495,82221,Silver,AUTO 112 | 2008,SE,11450,85229,Red,MANUAL 113 | 2009,SES,10995,42834,Red,AUTO 114 | 2005,SES,10995,69415,Blue,AUTO 115 | 2009,SEL,10995,78264,Gray,AUTO 116 | 2009,SE,10979,60709,Red,AUTO 117 | 2008,SE,10955,39643,Gray,AUTO 118 | 2009,SE,10955,40180,Gold,AUTO 119 | 2008,SE,10836,40330,Green,MANUAL 120 | 2007,SES,10815,77231,Red,AUTO 121 | 2007,SE,10770,72937,Silver,MANUAL 122 | 2010,SE,10717,64199,Black,AUTO 123 | 2007,SES,10000,63926,Red,AUTO 124 | 2007,SES,9999,74427,Silver,AUTO 125 | 2007,SES,9999,78948,Black,MANUAL 126 | 2006,SE,9995,51311,Silver,AUTO 127 | 2008,SE,9995,95364,White,AUTO 128 | 2008,SE,9992,74109,White,AUTO 129 | 2007,SE,9651,63296,Blue,AUTO 130 | 2007,SES,9000,80605,Red,AUTO 131 | 2006,SE,8999,49656,Silver,AUTO 132 | 2007,SE,8996,48652,Silver,MANUAL 133 | 2006,SE,8800,71331,White,AUTO 134 | 2008,SE,8495,106171,Black,AUTO 135 | 2008,SE,8494,68901,Silver,AUTO 136 | 2009,SE,8480,70036,White,MANUAL 137 | 2007,SES,7999,81596,Yellow,MANUAL 138 | 2006,SES,7995,35000,Black,MANUAL 139 | 2006,SES,7995,97987,Red,AUTO 140 | 2003,SES,7900,96000,White,AUTO 141 | 2005,SES,7488,59013,Red,AUTO 142 | 2004,SE,6999,105714,Silver,AUTO 143 | 2007,SE,6995,86862,White,AUTO 144 | 2000,SE,6980,60161,Green,AUTO 145 | 2004,SES,6980,101130,Gray,AUTO 146 | 2004,SES,6950,119720,Black,AUTO 147 | 2006,SES,6200,95000,Silver,AUTO 148 | 2002,SE,5995,87003,Red,AUTO 149 | 2000,SE,5980,96841,Red,AUTO 150 | 2001,SE,4899,151479,Yellow,AUTO 151 | 2000,SE,3800,109259,Red,AUTO 152 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/Chapter_10.r: -------------------------------------------------------------------------------- 1 | ##### Chapter 10: Evaluating Model Performance ------------------- 2 | 3 | ## Confusion matrixes in R ---- 4 | 5 | ## Create the predicted probabilities from the SMS classifier built in Chapter 4. 
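# (Hedged sketch, not run here: the sms_results.csv read in below is assumed to have been
#  written out roughly like this from the Chapter 4 objects -- an e1071::naiveBayes model
#  `sms_classifier`, the test document-term matrix `sms_test`, and the label vector
#  `sms_test_labels`. Those object names come from Chapter 4, not from this script, which
#  is why the sketch stays commented out.)
# sms_test_prob <- predict(sms_classifier, sms_test, type = "raw")
# sms_results <- data.frame(
#   actual_type  = sms_test_labels,
#   predict_type = predict(sms_classifier, sms_test, type = "class"),
#   prob_spam    = round(sms_test_prob[, "spam"], 5),
#   prob_ham     = round(sms_test_prob[, "ham"], 5)
# )
# write.csv(sms_results, "sms_results.csv", row.names = FALSE)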
6 | sms_results <- read.csv("sms_results.csv", stringsAsFactors = TRUE) 7 | 8 | # the first several test cases 9 | head(sms_results) 10 | 11 | # test cases where the model is less confident 12 | head(subset(sms_results, prob_spam > 0.40 & prob_spam < 0.60)) 13 | 14 | # test cases where the model was wrong 15 | head(subset(sms_results, actual_type != predict_type)) 16 | 17 | # specifying vectors 18 | table(sms_results$actual_type, sms_results$predict_type) 19 | 20 | # alternative solution using the formula interface (not shown in book) 21 | xtabs(~ actual_type + predict_type, sms_results) 22 | 23 | # using the CrossTable function 24 | library(gmodels) 25 | CrossTable(sms_results$actual_type, sms_results$predict_type) 26 | 27 | # accuracy and error rate calculation -- 28 | # accuracy 29 | (152 + 1203) / (152 + 1203 + 4 + 31) 30 | # error rate 31 | (4 + 31) / (152 + 1203 + 4 + 31) 32 | # error rate = 1 - accuracy 33 | 1 - 0.9748201 34 | 35 | ## Beyond accuracy: other performance measures ---- 36 | library(caret) 37 | confusionMatrix(sms_results$predict_type, sms_results$actual_type, positive = "spam") 38 | 39 | # Kappa statistic 40 | # example using SMS classifier 41 | pr_a <- 0.865 + 0.109 42 | pr_a 43 | 44 | pr_e <- 0.868 * 0.888 + 0.132 * 0.112 45 | pr_e 46 | 47 | k <- (pr_a - pr_e) / (1 - pr_e) 48 | k 49 | 50 | # calculate kappa via the vcd package 51 | library(vcd) 52 | Kappa(table(sms_results$actual_type, sms_results$predict_type)) 53 | 54 | # calculate kappa via the irr package 55 | library(irr) 56 | kappa2(sms_results[1:2]) 57 | 58 | # Matthews correlation coefficient (MCC) 59 | # for SMS classifier: TN = 1203, FP = 4, FN = 31, TP = 152 60 | 61 | # MCC = (TP * TN - FP * FN) / 62 | # sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) 63 | 64 | # compute MCC by hand for SMS classifier 65 | (152 * 1203 - 4 * 31) / 66 | sqrt((152 + 4) * (152 + 31) * (1203 + 4) * (1203 + 31)) 67 | 68 | # compute MCC using mcc() function in mltools package 69 | library(mltools) 70 | mcc(sms_results$actual_type, sms_results$predict_type) 71 | 72 | # compute MCC using Pearson correlation 73 | cor(ifelse(sms_results$actual_type == "spam", 1, 0), 74 | ifelse(sms_results$predict_type == "spam", 1, 0)) 75 | 76 | # Sensitivity and specificity 77 | # example using SMS classifier 78 | sens <- 152 / (152 + 31) 79 | sens 80 | 81 | spec <- 1203 / (1203 + 4) 82 | spec 83 | 84 | # example using the caret package 85 | library(caret) 86 | sensitivity(sms_results$predict_type, sms_results$actual_type, positive = "spam") 87 | specificity(sms_results$predict_type, sms_results$actual_type, negative = "ham") 88 | 89 | # Precision and recall 90 | prec <- 152 / (152 + 4) 91 | prec 92 | 93 | rec <- 152 / (152 + 31) 94 | rec 95 | 96 | # example using the caret package 97 | library(caret) 98 | posPredValue(sms_results$predict_type, sms_results$actual_type, positive = "spam") 99 | sensitivity(sms_results$predict_type, sms_results$actual_type, positive = "spam") 100 | 101 | # F-measure 102 | f <- (2 * prec * rec) / (prec + rec) 103 | f 104 | 105 | f <- (2 * 152) / (2 * 152 + 4 + 31) 106 | f 107 | 108 | ## Visualizing Performance Tradeoffs ---- 109 | library(pROC) 110 | sms_roc <- roc(sms_results$actual_type, sms_results$prob_spam) 111 | 112 | # ROC curve for Naive Bayes 113 | plot(sms_roc, main = "ROC curve for SMS spam filter", 114 | col = "blue", lwd = 2, grid = TRUE, legacy.axes = TRUE) 115 | 116 | # compare to kNN 117 | sms_results_knn <- read.csv("sms_results_knn.csv") 118 | sms_roc_knn <- roc(sms_results$actual_type, 
sms_results_knn$p_spam) 119 | plot(sms_roc_knn, col = "red", lwd = 2, add = TRUE) 120 | 121 | # calculate AUC for Naive Bayes and kNN 122 | auc(sms_roc) 123 | auc(sms_roc_knn) 124 | 125 | ## Estimating Future Performance ---- 126 | 127 | # partitioning data 128 | library(caret) 129 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE) 130 | 131 | # Holdout method 132 | # using random IDs 133 | random_ids <- order(runif(1000)) 134 | credit_train <- credit[random_ids[1:500],] 135 | credit_validate <- credit[random_ids[501:750], ] 136 | credit_test <- credit[random_ids[751:1000], ] 137 | 138 | # using caret function 139 | in_train <- createDataPartition(credit$default, p = 0.75, list = FALSE) 140 | credit_train <- credit[in_train, ] 141 | credit_test <- credit[-in_train, ] 142 | 143 | # 10-fold CV 144 | set.seed(123) # to ensure results match 145 | folds <- createFolds(credit$default, k = 10) 146 | str(folds) 147 | credit01_test <- credit[folds$Fold01, ] 148 | credit01_train <- credit[-folds$Fold01, ] 149 | 150 | ## Automating 10-fold CV for a C5.0 Decision Tree using lapply() ---- 151 | library(caret) 152 | library(C50) 153 | library(irr) 154 | 155 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE) 156 | 157 | set.seed(123) 158 | folds <- createFolds(credit$default, k = 10) 159 | 160 | cv_results <- lapply(folds, function(x) { 161 | credit_train <- credit[-x, ] 162 | credit_test <- credit[x, ] 163 | credit_model <- C5.0(default ~ ., data = credit_train) 164 | credit_pred <- predict(credit_model, credit_test) 165 | credit_actual <- credit_test$default 166 | kappa <- kappa2(data.frame(credit_actual, credit_pred))$value 167 | return(kappa) 168 | }) 169 | 170 | # examine the results of the 10 trials 171 | str(cv_results) 172 | 173 | # compute the average kappa across the 10 trials 174 | mean(unlist(cv_results)) 175 | 176 | # compute the standard deviation across the 10 trials 177 | sd(unlist(cv_results)) 178 | 179 | ## Bootstrap sampling ---- 180 | 181 | # computing the probability that a record is unselected 182 | (1 - (1/1000))^1000 183 | (1 - (1/100000))^100000 184 | 1 / exp(1) # as n approaches infinity 185 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/hyperparameter_optimization/hyperparameter_optimization_with_tidymodels.R: -------------------------------------------------------------------------------- 1 | # Hyperparameter Optimization with tidymodels ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 11: 4 | # Improving Model Performance 5 | # 6 | # The original code is made with {caret}. I wanted to see how one could recreate 7 | # it using mainly {tidymodels} and {tidyverse}. 8 | # 9 | # You can find the original code and the slightly modified dataset here: 10 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter11 11 | 12 | ## 1. Loading libraries (in the order they get used) ---- 13 | library(conflicted) 14 | library(tidyverse) 15 | library(tidymodels) 16 | 17 | 18 | ## 2. Loading the dataset ---- 19 | credit_tbl <- read_csv("decision_trees_and_rules/data/credit.csv") 20 | 21 | 22 | ## 3. 
Creating the recipe and splitting the data ---- 23 | 24 | ### Convert strings to factors ---- 25 | recipe_obj <- recipe( 26 | default ~ ., 27 | data = credit_tbl 28 | ) %>% 29 | step_string2factor(all_nominal()) 30 | recipe_obj 31 | 32 | credit_factorized_tbl <- recipe_obj %>% 33 | prep() %>% 34 | bake(new_data = NULL) 35 | credit_factorized_tbl 36 | 37 | ### Create training and test data (randomly) ---- 38 | set.seed(123) 39 | credit_split <- initial_split( 40 | credit_factorized_tbl, 41 | prop = 0.9, 42 | strata = default 43 | ) 44 | 45 | credit_train <- training(credit_split) 46 | credit_test <- testing(credit_split) 47 | 48 | ### Check the proportion of class variable ---- 49 | credit_train %>% 50 | count(default) %>% 51 | mutate(pct = (n / sum(n) * 100)) 52 | 53 | credit_test %>% 54 | count(default) %>% 55 | mutate(pct = (n / sum(n) * 100)) 56 | 57 | 58 | ## 4. Creating a simple model ---- 59 | 60 | ### Create specifications with the default tuning parameters for rpart ---- 61 | # tree_depth = 30, min_n = 2 and cost_complexity = 0.01 62 | 63 | set.seed(123) 64 | tune_spec_simple <- decision_tree( 65 | mode = "classification", 66 | engine = "rpart" 67 | ) 68 | tune_spec_simple 69 | 70 | ### Fit the model ---- 71 | model_fit_simple <- fit( 72 | tune_spec_simple, 73 | default ~ ., 74 | credit_train 75 | ) 76 | 77 | model_fit_simple %>% 78 | extract_fit_engine() %>% 79 | summary() 80 | 81 | ### Make the predictions ---- 82 | credit_simple <- augment(model_fit_simple, credit_test) 83 | credit_simple 84 | 85 | ### Create a confusion matrix ---- 86 | conf_mat_simple <- conf_mat( 87 | data = credit_simple, 88 | truth = default, 89 | estimate = .pred_class 90 | ) 91 | conf_mat_simple 92 | 93 | ### Visualize the ROC curve ---- 94 | credit_simple %>% 95 | roc_curve( 96 | truth = default, 97 | estimate = .pred_no 98 | ) %>% 99 | autoplot() 100 | 101 | ### Calculate the ROC AUC (area under the curve) ---- 102 | credit_roc_auc_simple <- credit_simple %>% 103 | roc_auc( 104 | truth = default, 105 | estimate = .pred_no 106 | ) 107 | credit_roc_auc_simple 108 | 109 | ### Put together other model metrics ---- 110 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 111 | 112 | classification_metrics_simple <- conf_mat( 113 | credit_simple, 114 | truth = default, 115 | estimate = .pred_class 116 | ) %>% 117 | summary() 118 | classification_metrics_simple 119 | 120 | 121 | ## 5. Generating a tuning grid ---- 122 | 123 | ### Create a specification with tuning placeholders ---- 124 | set.seed(123) 125 | tune_spec <- decision_tree( 126 | mode = "classification", 127 | engine = "rpart", 128 | tree_depth = tune(), 129 | min_n = tune(), 130 | cost_complexity = tune() 131 | ) 132 | tune_spec 133 | 134 | ### Create a regular grid ---- 135 | tree_grid <- grid_regular( 136 | extract_parameter_set_dials(tune_spec), 137 | levels = 4 138 | ) 139 | tree_grid 140 | 141 | 142 | ## 6. Tuning along the grid ---- 143 | 144 | ### Create CV folds of the credit_train ---- 145 | folds <- vfold_cv( 146 | credit_train, 147 | v = 10 148 | ) 149 | folds 150 | 151 | ### Tune along the grid ---- 152 | tune_results <- tune_grid( 153 | tune_spec, 154 | default ~ ., 155 | resamples = folds, 156 | grid = tree_grid, 157 | metrics = metric_set(accuracy) 158 | ) 159 | 160 | ### Plot the tuning results ---- 161 | autoplot(tune_results) 162 | 163 | 164 | ## 7. 
Picking the winner ---- 165 | 166 | ### Select the parameters that perform best ---- 167 | final_params <- select_best(tune_results) 168 | final_params 169 | 170 | ### Finalize the specification ---- 171 | best_spec <- finalize_model(tune_spec, final_params) 172 | best_spec 173 | 174 | ### Build the final model ---- 175 | final_model <- fit( 176 | best_spec, 177 | default ~ ., 178 | credit_train 179 | ) 180 | final_model 181 | 182 | final_model %>% 183 | extract_fit_engine() %>% 184 | summary() 185 | 186 | ### Add the predictions to the test tibble ---- 187 | credit_with_pred_tbl <- augment(final_model, credit_test) 188 | credit_with_pred_tbl 189 | 190 | 191 | ## 8. Evaluating model performance ---- 192 | 193 | ### Create a confusion matrix ---- 194 | conf_mat <- conf_mat( 195 | data = credit_with_pred_tbl, 196 | truth = default, 197 | estimate = .pred_class 198 | ) 199 | conf_mat 200 | 201 | ### Visualize the ROC curve ---- 202 | credit_with_pred_tbl %>% 203 | roc_curve( 204 | truth = default, 205 | estimate = .pred_no 206 | ) %>% 207 | autoplot() 208 | 209 | ### Calculate the ROC AUC (area under the curve) ---- 210 | credit_roc_auc <- credit_with_pred_tbl %>% 211 | roc_auc( 212 | truth = default, 213 | estimate = .pred_no 214 | ) 215 | credit_roc_auc 216 | 217 | ### Put together other model metrics ---- 218 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 219 | classification_metrics <- conf_mat( 220 | credit_with_pred_tbl, 221 | truth = default, 222 | estimate = .pred_class 223 | ) %>% 224 | summary() 225 | classification_metrics 226 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/hyperparameter_optimization/hyperparameter_optimization_with_tidymodels_random.R: -------------------------------------------------------------------------------- 1 | # Hyperparameter Optimization with tidymodels ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 11: 4 | # Improving Model Performance 5 | # 6 | # The original code is made with {caret}. I wanted to see how one could recreate 7 | # it using mainly {tidymodels} and {tidyverse}. 8 | # 9 | # You can find the original code and the slightly modified dataset here: 10 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter11 11 | 12 | ## 1. Loading libraries (in the order they get used) ---- 13 | library(conflicted) 14 | library(tidyverse) 15 | library(tidymodels) 16 | 17 | 18 | ## 2. Loading the dataset ---- 19 | credit_tbl <- read_csv("decision_trees_and_rules/data/credit.csv") 20 | 21 | 22 | ## 3. Creating the recipe and splitting the data ---- 23 | 24 | ### Convert strings to factors ---- 25 | recipe_obj <- recipe( 26 | default ~ ., 27 | data = credit_tbl 28 | ) %>% 29 | step_string2factor(all_nominal()) 30 | recipe_obj 31 | 32 | credit_factorized_tbl <- recipe_obj %>% 33 | prep() %>% 34 | bake(new_data = NULL) 35 | credit_factorized_tbl 36 | 37 | ### Create training and test data (randomly) ---- 38 | set.seed(123) 39 | credit_split <- initial_split( 40 | credit_factorized_tbl, 41 | prop = 0.9, 42 | strata = default 43 | ) 44 | 45 | credit_train <- training(credit_split) 46 | credit_test <- testing(credit_split) 47 | 48 | ### Check the proportion of class variable ---- 49 | credit_train %>% 50 | count(default) %>% 51 | mutate(pct = (n / sum(n) * 100)) 52 | 53 | credit_test %>% 54 | count(default) %>% 55 | mutate(pct = (n / sum(n) * 100)) 56 | 57 | 58 | ## 4. 
Creating a simple model ---- 59 | 60 | ### Create specifications with the default tuning parameters for rpart ---- 61 | # tree_depth = 30, min_n = 2 and cost_complexity = 0.01 62 | 63 | set.seed(123) 64 | tune_spec_simple <- decision_tree( 65 | mode = "classification", 66 | engine = "rpart" 67 | ) 68 | tune_spec_simple 69 | 70 | ### Fit the model ---- 71 | model_fit_simple <- fit( 72 | tune_spec_simple, 73 | default ~ ., 74 | credit_train 75 | ) 76 | 77 | model_fit_simple %>% 78 | extract_fit_engine() %>% 79 | summary() 80 | 81 | ### Make the predictions ---- 82 | credit_simple <- augment(model_fit_simple, credit_test) 83 | credit_simple 84 | 85 | ### Create a confusion matrix ---- 86 | conf_mat_simple <- conf_mat( 87 | data = credit_simple, 88 | truth = default, 89 | estimate = .pred_class 90 | ) 91 | conf_mat_simple 92 | 93 | ### Visualize the ROC curve ---- 94 | credit_simple %>% 95 | roc_curve( 96 | truth = default, 97 | estimate = .pred_no 98 | ) %>% 99 | autoplot() 100 | 101 | ### Calculate the ROC AUC (area under the curve) ---- 102 | credit_roc_auc_simple <- credit_simple %>% 103 | roc_auc( 104 | truth = default, 105 | estimate = .pred_no 106 | ) 107 | credit_roc_auc_simple 108 | 109 | ### Put together other model metrics ---- 110 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 111 | 112 | classification_metrics_simple <- conf_mat( 113 | credit_simple, 114 | truth = default, 115 | estimate = .pred_class 116 | ) %>% 117 | summary() 118 | classification_metrics_simple 119 | 120 | 121 | ## 5. Generating a tuning grid ---- 122 | 123 | ### Create a specification with tuning placeholders ---- 124 | set.seed(123) 125 | tune_spec <- decision_tree( 126 | mode = "classification", 127 | engine = "rpart", 128 | tree_depth = tune(), 129 | min_n = tune(), 130 | cost_complexity = tune() 131 | ) 132 | tune_spec 133 | 134 | ### Create a random grid ---- 135 | tree_grid <- grid_random( 136 | extract_parameter_set_dials(tune_spec), 137 | size = 64 # grid_random() samples a fixed number of candidates (size), not levels; 64 matches the regular grid's 4^3 combinations 138 | ) 139 | tree_grid 140 | 141 | 142 | ## 6. Tuning along the grid ---- 143 | 144 | ### Create CV folds of the credit_train ---- 145 | folds <- vfold_cv( 146 | credit_train, 147 | v = 10 148 | ) 149 | folds 150 | 151 | ### Tune along the grid ---- 152 | tune_results <- tune_grid( 153 | tune_spec, 154 | default ~ ., 155 | resamples = folds, 156 | grid = tree_grid, 157 | metrics = metric_set(accuracy) 158 | ) 159 | 160 | ### Plot the tuning results ---- 161 | autoplot(tune_results) 162 | 163 | 164 | ## 7. Picking the winner ---- 165 | 166 | ### Select the parameters that perform best ---- 167 | final_params <- select_best(tune_results) 168 | final_params 169 | 170 | ### Finalize the specification ---- 171 | best_spec <- finalize_model(tune_spec, final_params) 172 | best_spec 173 | 174 | ### Build the final model ---- 175 | final_model <- fit( 176 | best_spec, 177 | default ~ ., 178 | credit_train 179 | ) 180 | final_model 181 | 182 | final_model %>% 183 | extract_fit_engine() %>% 184 | summary() 185 | 186 | ### Add the predictions to the test tibble ---- 187 | credit_with_pred_tbl <- augment(final_model, credit_test) 188 | credit_with_pred_tbl 189 | 190 | 191 | ## 8.
Evaluating model performance ---- 192 | 193 | ### Create a confusion matrix ---- 194 | conf_mat <- conf_mat( 195 | data = credit_with_pred_tbl, 196 | truth = default, 197 | estimate = .pred_class 198 | ) 199 | conf_mat 200 | 201 | ### Visualize the ROC curve ---- 202 | credit_with_pred_tbl %>% 203 | roc_curve( 204 | truth = default, 205 | estimate = .pred_no 206 | ) %>% 207 | autoplot() 208 | 209 | ### Calculate the ROC AUC (area under the curve) ---- 210 | credit_roc_auc <- credit_with_pred_tbl %>% 211 | roc_auc( 212 | truth = default, 213 | estimate = .pred_no 214 | ) 215 | credit_roc_auc 216 | 217 | ### Put together other model metrics ---- 218 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 219 | classification_metrics <- conf_mat( 220 | credit_with_pred_tbl, 221 | truth = default, 222 | estimate = .pred_class 223 | ) %>% 224 | summary() 225 | classification_metrics 226 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/Chapter_12.r: -------------------------------------------------------------------------------- 1 | ##### Chapter 12: Advanced Data Preparation -------------------- 2 | 3 | ## Exploring R's tidyverse ---- 4 | 5 | library(tidyverse) # load all tidyverse packages 6 | 7 | # convert the Titanic training dataset into a tibble 8 | library(tibble) # not necessary if tidyverse is already loaded 9 | titanic_csv <- read.csv("titanic_train.csv") 10 | titanic_tbl <- as_tibble(titanic_csv) 11 | titanic_tbl 12 | 13 | # read the titanic training dataset using readr 14 | library(readr) # not necessary if tidyverse is already loaded 15 | titanic_train <- read_csv("titanic_train.csv") 16 | 17 | # read the titanic training dataset using readxl 18 | library(readxl) 19 | titanic_train <- read_excel("titanic_train.xlsx") 20 | 21 | # preparing and piping data with dplyr 22 | library(dplyr) 23 | 24 | # filter for female rows only 25 | titanic_train |> filter(Sex == "female") 26 | 27 | # select only name, sex, and age columns 28 | titanic_train |> select(Name, Sex, Age) 29 | 30 | # combine multiple dplyr verbs and save output to a tibble 31 | titanic_women <- titanic_train |> 32 | filter(Sex == "female") |> 33 | select(Name, Sex, Age) |> 34 | arrange(Name) 35 | 36 | # create a new feature indicating elderly age 37 | titanic_train |> 38 | mutate(elderly = if_else(Age >= 65, 1, 0)) 39 | 40 | # create multiple features within the same mutate command 41 | titanic_train |> 42 | mutate( 43 | elderly = if_else(Age >= 65, 1, 0), 44 | child = if_else(Age < 18, 1, 0) 45 | ) 46 | 47 | # compute survival rate by gender 48 | titanic_train |> 49 | group_by(Sex) |> 50 | summarize(survival_rate = mean(Survived)) 51 | 52 | # compute average survival rate for children vs. 
non-children 53 | titanic_train |> 54 | filter(!is.na(Age)) |> 55 | mutate(child = if_else(Age < 18, 1, 0)) |> 56 | group_by(child) |> 57 | summarize(survival_rate = mean(Survived)) 58 | 59 | # transform the dataset and pipe into a decision tree 60 | library(rpart) 61 | m_titanic <- titanic_train |> 62 | filter(!is.na(Age)) |> 63 | mutate(AgeGroup = if_else(Age < 18, "Child", "Adult")) |> 64 | select(Survived, Pclass, Sex, AgeGroup) |> 65 | rpart(formula = Survived ~ ., data = _) 66 | 67 | library(rpart.plot) 68 | rpart.plot(m_titanic) 69 | 70 | ## Transforming text with stringr ---- 71 | library(readr) 72 | titanic_train <- read_csv("titanic_train.csv") 73 | 74 | library(stringr) 75 | 76 | # examine cabin prefix code 77 | titanic_train <- titanic_train |> 78 | mutate(CabinCode = str_sub(Cabin, start = 1, end = 1)) 79 | 80 | # compare cabin prefix to passenger class 81 | table(titanic_train$Pclass, titanic_train$CabinCode, 82 | useNA = "ifany") 83 | 84 | # plot of survival probability by cabin code 85 | library(ggplot2) 86 | titanic_train |> ggplot() + 87 | geom_bar(aes(x = CabinCode, y = Survived), 88 | stat = "summary", fun = "mean") + 89 | ggtitle("Titanic Survival Rate by Cabin Code") 90 | 91 | # look at the first few passenger names 92 | head(titanic_train$Name) 93 | 94 | # create a title / salutation feature 95 | titanic_train <- titanic_train |> 96 | # use regular expressions to find the characters between the comma and period 97 | mutate(Title = str_extract(Name, ", [A-z]+\\.")) 98 | 99 | # look at the first few examples 100 | head(titanic_train$Title) 101 | 102 | # clean up the title feature 103 | titanic_train <- titanic_train |> 104 | mutate(Title = str_replace_all(Title, "[, \\.]", "")) 105 | 106 | # examine output 107 | table(titanic_train$Title) 108 | 109 | # group titles into related categories 110 | titanic_train <- titanic_train |> 111 | mutate(TitleGroup = recode(Title, 112 | # the first few stay the same 113 | "Mr" = "Mr", "Mrs" = "Mrs", "Master" = "Master", 114 | "Miss" = "Miss", 115 | # combine variants of "Miss" 116 | "Ms" = "Miss", "Mlle" = "Miss", "Mme" = "Miss", 117 | # anything else will be "Other" 118 | .missing = "Other", 119 | .default = "Other" 120 | ) 121 | ) 122 | 123 | # examine output 124 | table(titanic_train$TitleGroup) 125 | 126 | # plot of survival probability by title group 127 | library(ggplot2) 128 | titanic_train |> ggplot() + 129 | geom_bar(aes(x = TitleGroup, y = Survived), 130 | stat = "summary", fun = "mean") + 131 | ggtitle("Titanic Survival Rate by Salutation") 132 | 133 | ## Cleaning dates with lubridate ---- 134 | 135 | library(lubridate) 136 | 137 | # reading in Machine Learning with R publication dates in different formats 138 | mdy(c("October 25, 2013", "10/25/2013")) 139 | dmy(c("25 October 2013", "25.10.13")) 140 | ymd("2013-10-25") 141 | 142 | # construct MLwR publication dates 143 | MLwR_1stEd <- mdy("October 25, 2013") 144 | MLwR_2ndEd <- mdy("July 31, 2015") 145 | MLwR_3rdEd <- mdy("April 15, 2019") 146 | 147 | # compute differences (returns a difftime object) 148 | MLwR_2ndEd - MLwR_1stEd 149 | MLwR_3rdEd - MLwR_2ndEd 150 | 151 | # convert the differences to durations 152 | as.duration(MLwR_2ndEd - MLwR_1stEd) 153 | as.duration(MLwR_3rdEd - MLwR_2ndEd) 154 | 155 | # convert the duration to years 156 | dyears() 157 | as.duration(MLwR_2ndEd - MLwR_1stEd) / dyears() 158 | as.duration(MLwR_3rdEd - MLwR_2ndEd) / dyears() 159 | 160 | # easier-to-remember version of the above: 161 | time_length(MLwR_2ndEd - MLwR_1stEd, unit = "years") 
162 | time_length(MLwR_3rdEd - MLwR_2ndEd, unit = "years") 163 | 164 | # compute age (in duration) 165 | USA_DOB <- mdy("July 4, 1776") # USA's Date of Birth 166 | time_length(mdy("July 3 2023") - USA_DOB, unit = "years") 167 | time_length(mdy("July 5 2023") - USA_DOB, unit = "years") 168 | 169 | # compute age (using intervals) 170 | interval(USA_DOB, mdy("July 3 2023")) / years() 171 | interval(USA_DOB, mdy("July 5 2023")) / years() 172 | 173 | # compute age (using integer divison) 174 | USA_DOB %--% mdy("July 3 2023") %/% years() 175 | USA_DOB %--% mdy("July 5 2023") %/% years() 176 | 177 | # function to compute calendar age 178 | age <- function(birthdate) { 179 | birthdate %--% today() %/% years() 180 | } 181 | 182 | # compute age of celebrities 183 | age(mdy("Jan 12, 1964")) # Jeff Bezos 184 | age(mdy("June 28, 1971")) # Elon Musk 185 | age(mdy("Oct 28, 1955")) # Bill Gates 186 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/boosting/boosting_with_tidymodels.R: -------------------------------------------------------------------------------- 1 | # Boosting with tidymodels ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, 4 | # Chapter 5: Divide and Conquer - Classification Using Decision Trees and Rules and 5 | # Chapter 10: Evaluating Model Performance 6 | # 7 | # The original code is made with {C50}, {gmodels}, {OneR} and {RWeka}. I 8 | # wanted to see how one could recreate it using mainly {tidymodels} and 9 | # {tidyverse}. 10 | # 11 | # You can find the original code and the slightly modified dataset here: 12 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter05 13 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter10 14 | 15 | ## 1. Loading libraries (in the order they get used) ---- 16 | library(conflicted) 17 | library(tidyverse) 18 | library(tidymodels) 19 | 20 | ## 2. Exploring and preparing the data ---- 21 | credit_tbl <- read_csv("decision_trees_and_rules/data/credit.csv") 22 | 23 | ### Examine the structure of the credit data ---- 24 | glimpse(credit_tbl) 25 | 26 | ### look at two characteristics of the applicant ---- 27 | credit_tbl %>% 28 | count(checking_balance) %>% 29 | mutate(pct = (n / sum(n) * 100)) 30 | 31 | credit_tbl %>% 32 | count(savings_balance) %>% 33 | mutate(pct = (n / sum(n) * 100)) 34 | 35 | ### look at two characteristics of the loan ---- 36 | credit_tbl %>% 37 | select(months_loan_duration, amount) %>% 38 | summary() 39 | 40 | ### look at the class variable ---- 41 | credit_tbl %>% 42 | count(default) %>% 43 | mutate(pct = (n / sum(n) * 100)) 44 | 45 | 46 | ## 3. 
Creating the recipe and splitting the data ---- 47 | 48 | ### Convert strings to factors ---- 49 | recipe_obj <- recipe( 50 | default ~ ., 51 | data = credit_tbl 52 | ) %>% 53 | step_string2factor(all_nominal()) 54 | recipe_obj 55 | 56 | credit_factorized_tbl <- recipe_obj %>% 57 | prep() %>% 58 | bake(new_data = NULL) 59 | credit_factorized_tbl 60 | 61 | ### Create training and test data (randomly) ---- 62 | 63 | # Use set.seed to use the same random number sequence as the original 64 | RNGversion("3.5.2") 65 | set.seed(123) 66 | 67 | credit_split <- initial_split( 68 | credit_factorized_tbl, 69 | prop = 0.9 70 | ) 71 | credit_train <- training(credit_split) 72 | credit_test <- testing(credit_split) 73 | 74 | ### Check the proportion of class variable ---- 75 | credit_train %>% 76 | count(default) %>% 77 | mutate(pct = (n / sum(n) * 100)) 78 | 79 | credit_test %>% 80 | count(default) %>% 81 | mutate(pct = (n / sum(n) * 100)) 82 | 83 | 84 | ## 4. Building a boosted ensemble ---- 85 | 86 | ### Specify the model class ---- 87 | boost_spec <- boost_tree() %>% 88 | # Set the mode 89 | set_mode("classification") %>% 90 | # Set the engine 91 | set_engine("xgboost") 92 | boost_spec 93 | 94 | 95 | ## 5. Evaluate the ensemble ---- 96 | set.seed(99) 97 | 98 | ### Create CV folds ---- 99 | folds <- vfold_cv(data = credit_train, v = 10) 100 | folds 101 | 102 | ### Fit and evaluate models for all folds ---- 103 | cv_results <- fit_resamples( 104 | boost_spec, 105 | default ~ ., 106 | resamples = folds, 107 | metrics = metric_set(roc_auc) 108 | ) 109 | 110 | ### Collect cross-validated metrics ---- 111 | collect_metrics(cv_results) 112 | 113 | 114 | ## 6. Tuning preparation ---- 115 | 116 | ### Create the specification with placeholders ---- 117 | boost_spec <- boost_tree( 118 | trees = 500, 119 | learn_rate = tune(), 120 | tree_depth = tune(), 121 | sample_size = tune()) %>% 122 | set_mode("classification") %>% 123 | set_engine("xgboost") 124 | 125 | ### Create the tuning grid ---- 126 | tunegrid_boost <- grid_regular( 127 | extract_parameter_set_dials(boost_spec), 128 | levels = 3 129 | ) 130 | tunegrid_boost 131 | 132 | 133 | ## 7. The actual tuning ---- 134 | 135 | ### Create CV folds of training data ---- 136 | folds <- vfold_cv( 137 | credit_train, 138 | v = 10 139 | ) 140 | 141 | ### Tune along the grid ---- 142 | tune_results <- tune_grid( 143 | boost_spec, 144 | default ~ ., 145 | resamples = folds, 146 | grid = tunegrid_boost, 147 | metrics = metric_set(roc_auc) 148 | ) 149 | 150 | ### Plot the results ---- 151 | autoplot(tune_results) 152 | 153 | 154 | ## 8. Finalizing the model ---- 155 | 156 | ### Select the final hyperparameters ---- 157 | best_params <- select_best(tune_results) 158 | best_params 159 | 160 | ### Finalize the specification ---- 161 | final_spec <- finalize_model(boost_spec, best_params) 162 | final_spec 163 | 164 | ### Train the final model on the full training data ---- 165 | final_model <- final_spec %>% 166 | fit( 167 | formula = default ~ ., 168 | data = credit_train 169 | ) 170 | final_model 171 | 172 | # TODO: continue from here, with next steps to adapt from this course: 173 | # https://app.datacamp.com/learn/courses/machine-learning-with-tree-based-models-in-r 174 | 175 | # ### Train the forest ---- 176 | # model <- spec %>% 177 | # fit( 178 | # default ~ ., 179 | # data = credit_train 180 | # ) 181 | # model 182 | # 183 | # ### Plot the variable importance 184 | # vip::vip(model) 185 | # 186 | # 187 | # ## 5.
Predicting ---- 188 | # 189 | # ### Make the predictions (you could skip this step) ---- 190 | # credit_test_pred <- predict( 191 | # object = model, 192 | # new_data = credit_test, 193 | # type = "class" 194 | # ) 195 | # credit_test_pred 196 | # 197 | # ### Add the predictions to the test tibble ---- 198 | # credit_test_with_pred_tbl <- augment(model, credit_test) 199 | # credit_test_with_pred_tbl 200 | # 201 | # 202 | # ## 6. Evaluating model performance ---- 203 | # 204 | # ### Create a confusion matrix ---- 205 | # conf_mat <- conf_mat( 206 | # data = credit_test_with_pred_tbl, 207 | # truth = default, 208 | # estimate = .pred_class 209 | # ) 210 | # conf_mat 211 | # 212 | # ### Visualize the confusion matrix ---- 213 | # conf_mat %>% autoplot(type = "heatmap") 214 | # conf_mat %>% autoplot(type = "mosaic") 215 | # 216 | # ### Visualize the ROC curve ---- 217 | # credit_test_with_pred_tbl %>% 218 | # roc_curve( 219 | # truth = default, 220 | # estimate = .pred_no 221 | # ) %>% 222 | # autoplot() 223 | # 224 | # ### Calculate the ROC AUC (area under the curve) ---- 225 | # credit_roc_auc <- credit_test_with_pred_tbl %>% 226 | # roc_auc( 227 | # truth = default, 228 | # estimate = .pred_no 229 | # ) 230 | # credit_roc_auc 231 | # 232 | # ### Put together other model metrics ---- 233 | # # Such as accuracy, Matthews correlation coefficient (mcc) and others... 234 | # classification_metrics <- conf_mat( 235 | # credit_test_with_pred_tbl, 236 | # truth = default, 237 | # estimate = .pred_class 238 | # ) %>% 239 | # summary() 240 | # classification_metrics 241 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/support_vector_machines/performing_optical_character_recognition_with_support_vector_machines_.r: -------------------------------------------------------------------------------- 1 | # Performing Optical Character Recognition with Support Vector Machines ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 7: 4 | # Black Box Methods - Neural Networks and Support Vector Machines 5 | # 6 | # The original code is made with {kernlab}. I wanted to see how one could 7 | # recreate it using mainly {tidymodels} and {tidyverse}. 8 | # 9 | # You can find the original code and the dataset here: 10 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter07 11 | 12 | ## 1. Loading libraries (in the order they get used) ---- 13 | library(conflicted) 14 | library(tidyverse) 15 | library(tidymodels) 16 | library(janitor) 17 | 18 | 19 | ## 2. Exploring and preparing the data ---- 20 | 21 | ### Read in data and examine structure ---- 22 | letters_tbl <- read_csv("machine_learning_with_r_3rd_ed/support_vector_machines/data/letterdata.csv") %>% 23 | mutate(across(where(is.character), as.factor)) 24 | 25 | letters_tbl %>% 26 | glimpse() 27 | 28 | 29 | ## 3. Creating the recipe ---- 30 | 31 | ### Apply normalization to entire data frame ---- 32 | recipe_obj <- recipe( 33 | letter ~ ., 34 | data = letters_tbl 35 | ) 36 | recipe_obj 37 | 38 | letters_baked_tbl <- recipe_obj %>% 39 | prep() %>% 40 | bake(new_data = NULL) 41 | letters_baked_tbl 42 | 43 | ### Create training and test data ---- 44 | letters_split <- initial_time_split( 45 | letters_baked_tbl, 46 | prop = 16000 / 20000 47 | ) 48 | letters_train <- training(letters_split) 49 | letters_test <- testing(letters_split) 50 | 51 | 52 | ## 4. 
Training a model on the data ---- 53 | 54 | ### Create model specification ---- 55 | model_spec <- svm_linear( 56 | engine = "kernlab", 57 | mode = "classification", 58 | cost = NULL, 59 | margin = NULL 60 | ) %>% 61 | translate() 62 | model_spec 63 | 64 | ### Fit the model ---- 65 | model_fit <- fit( 66 | model_spec, 67 | letter ~ ., 68 | letters_train 69 | ) 70 | model_fit 71 | 72 | ### Make the predictions (you could skip this step) ---- 73 | letters_test_pred <- predict( 74 | model_fit, 75 | new_data = letters_test 76 | ) 77 | letters_test_pred 78 | 79 | ### Add the predictions to the test tibble ---- 80 | letters_test_with_pred <- augment(model_fit, letters_test) 81 | letters_test_with_pred 82 | 83 | 84 | ## 5. Evaluating model performance ---- 85 | 86 | ### Predictions on testing dataset ---- 87 | letters_test_with_pred %>% 88 | tabyl(letter, .pred_class) 89 | 90 | ### Look only at agreement vs. non-agreement ---- 91 | # Construct a vector of TRUE/FALSE indicating correct/incorrect predictions 92 | letters_test_with_pred %>% 93 | mutate( 94 | agreement = case_when( 95 | letter == .pred_class ~ TRUE, 96 | TRUE ~ FALSE 97 | ) 98 | ) %>% 99 | tabyl(agreement) 100 | 101 | 102 | ## 6. Improving model performance ---- 103 | 104 | ### Change to a RBF kernel ---- 105 | model_spec_rbf <- svm_rbf( 106 | engine = "kernlab", 107 | mode = "classification", 108 | cost = NULL, 109 | margin = NULL, 110 | rbf_sigma = NULL 111 | ) %>% 112 | translate() 113 | model_spec_rbf 114 | 115 | ### Fit the model ---- 116 | model_fit_rbf <- fit( 117 | model_spec_rbf, 118 | letter ~ ., 119 | letters_train 120 | ) 121 | model_fit_rbf 122 | 123 | ### Make the predictions (you could skip this step) ---- 124 | letters_test_pred_rbf <- predict( 125 | model_fit_rbf, 126 | new_data = letters_test 127 | ) 128 | letters_test_pred_rbf 129 | 130 | ### Add the predictions to the test tibble ---- 131 | letters_test_with_pred_rbf <- augment(model_fit_rbf, letters_test) 132 | letters_test_with_pred_rbf 133 | 134 | ### Predictions on testing dataset ---- 135 | letters_test_with_pred_rbf %>% 136 | tabyl(letter, .pred_class) 137 | 138 | ### Look only at agreement vs. 
non-agreement ---- 139 | # Construct a vector of TRUE/FALSE indicating correct/incorrect predictions 140 | letters_test_with_pred_rbf %>% 141 | mutate( 142 | agreement = case_when( 143 | letter == .pred_class ~ TRUE, 144 | TRUE ~ FALSE 145 | ) 146 | ) %>% 147 | tabyl(agreement) 148 | 149 | ### Test various values of the cost parameter ---- 150 | cost_values <- c(1, seq(from = 5, to = 40, by = 5)) 151 | 152 | accuracy_values <- map_dbl(cost_values, function(x) { 153 | 154 | model_spec_rbf <- svm_rbf( 155 | engine = "kernlab", 156 | mode = "classification", 157 | cost = {{ x }}, 158 | margin = NULL, 159 | rbf_sigma = NULL 160 | ) %>% 161 | translate() 162 | 163 | model_fit_rbf <- fit( 164 | model_spec_rbf, 165 | letter ~ ., 166 | letters_train 167 | ) 168 | 169 | letters_test_pred_rbf <- predict( 170 | model_fit_rbf, 171 | new_data = letters_test 172 | ) %>% as_vector() 173 | 174 | agree <- ifelse(letters_test_pred_rbf == letters_test$letter, 1, 0) 175 | 176 | accuracy <- sum(agree) / nrow(letters_test) 177 | 178 | return(accuracy) 179 | 180 | }) 181 | 182 | ### Bind together the cost parameter and accuracy values ---- 183 | cost_vs_accuracy_tbl <- bind_cols( 184 | cost_values, 185 | accuracy_values, 186 | ) %>% 187 | rename( 188 | cost_values = ...1, 189 | accuracy_values = ...2 190 | ) 191 | 192 | ### Visualize to find the optimal cost parameter value ---- 193 | cost_vs_accuracy_tbl %>% 194 | ggplot(aes(cost_values, accuracy_values)) + 195 | geom_line() + 196 | geom_point() + 197 | theme_bw() + 198 | labs( 199 | x = "Cost Parameter Value", 200 | y = "Accuracy" 201 | ) 202 | 203 | ### Make sure you have the right optimal cost value for the best accuracy ---- 204 | cost_vs_accuracy_tbl %>% 205 | slice_max(accuracy_values) 206 | 207 | ### Pull the first cost_value that has the max accuracy_value ---- 208 | .max_accuracy <- cost_vs_accuracy_tbl %>% 209 | slice_max(accuracy_values) %>% 210 | slice(1) %>% 211 | pull(cost_values) 212 | 213 | 214 | ## 7. Fitting the model with the optimal cost value (that was just pulled) ---- 215 | 216 | ### Give model specification ---- 217 | model_spec_best <- svm_rbf( 218 | engine = "kernlab", 219 | mode = "classification", 220 | cost = {{ .max_accuracy }}, 221 | margin = NULL, 222 | rbf_sigma = NULL 223 | ) %>% 224 | translate() 225 | 226 | ### Fit the model ---- 227 | model_fit_best <- fit( 228 | model_spec_best, 229 | letter ~ ., 230 | letters_train 231 | ) 232 | 233 | ### Add the predictions to the test tibble ---- 234 | letters_test_with_pred_best <- augment(model_fit_best, letters_test) 235 | 236 | ### Predictions on testing dataset ---- 237 | letters_test_with_pred_best %>% 238 | tabyl(letter, .pred_class) 239 | 240 | ### Look only at agreement vs. non-agreement ---- 241 | # Construct a vector of TRUE/FALSE indicating correct/incorrect predictions 242 | letters_test_with_pred_best %>% 243 | mutate( 244 | agreement = case_when( 245 | letter == .pred_class ~ TRUE, 246 | TRUE ~ FALSE 247 | ) 248 | ) %>% 249 | tabyl(agreement) 250 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/k_means_clustering/finding_teen_market_segments_using_k_means_clustering.r: -------------------------------------------------------------------------------- 1 | # Finding Teen Market Segments Using k-means Clustering ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 9: 4 | # Finding Groups of Data - Clustering with k-means 5 | # 6 | # The original code is made with base R. 
While {tidymodels} doesn't have a 7 | # k-means engine, I wanted to see how one could still recreate the code using 8 | # {tidyverse} as much as possible and even using the {recipes} package from the 9 | # {tidymodels} to do all the pre-processing needed and {broom} package from the 10 | # same {tidymodels} family to help look at the model metrics in a more tidy way. 11 | # 12 | # I was also inspired by this blog post from the tidymodels website to continue 13 | # the exercise a bit further than the original code did: 14 | # https://www.tidymodels.org/learn/statistics/k-means/ 15 | # 16 | # You can find the original code and the dataset here: 17 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter09 18 | 19 | ## 1. Loading libraries ---- 20 | library(conflicted) 21 | library(tidyverse) 22 | library(janitor) 23 | library(tidymodels) 24 | 25 | 26 | ## 2. Exploring and preparing the data ---- 27 | teens_tbl <- read_csv("machine_learning_with_r_3rd_ed/k_means_clustering/data/snsdata.csv") 28 | 29 | teens_tbl %>% 30 | glimpse() 31 | 32 | ### Look at missing data for age variable ---- 33 | teens_tbl %>% 34 | select(age) %>% 35 | summary() 36 | 37 | ### Eliminate age outliers ---- 38 | teens_only_tbl <- teens_tbl %>% 39 | mutate( 40 | age = case_when( 41 | between(age, 13, 20) ~ age, 42 | TRUE ~ NA_real_ 43 | ) 44 | ) 45 | 46 | teens_only_tbl %>% 47 | select(age) %>% 48 | summary() 49 | 50 | ### Look at missing data for female variable ---- 51 | teens_only_tbl %>% 52 | tabyl(gender) 53 | 54 | ### Reassign missing gender values to "unknown" and change into factors ---- 55 | teens_gendered_tbl <- teens_only_tbl %>% 56 | mutate( 57 | gender = case_when( 58 | gender %in% c("F", "M") ~ gender, 59 | TRUE ~ "gender_unknown" 60 | ) %>% 61 | as.factor() 62 | ) 63 | 64 | ### Check our recoding work ---- 65 | teens_gendered_tbl %>% 66 | tabyl(gender) %>% 67 | arrange(desc(percent)) 68 | 69 | ### Finding the mean age by cohort ---- 70 | # Doesn't work because of the NAs 71 | teens_gendered_tbl %>% 72 | pull(age) %>% 73 | mean() 74 | 75 | # Works thanks to the na.rm = TRUE 76 | teens_gendered_tbl %>% 77 | pull(age) %>% 78 | mean(na.rm = TRUE) 79 | 80 | ### Age by cohort ---- 81 | teens_gendered_tbl %>% 82 | with_groups( 83 | gradyear, 84 | summarize, 85 | age = mean(age, na.rm = TRUE) 86 | ) 87 | 88 | ### Create a vector with the average age for each gradyear, repeated by person ---- 89 | average_age_by_gradyear <- teens_gendered_tbl %>% 90 | with_groups( 91 | gradyear, 92 | mutate, 93 | ave_age = mean(age, na.rm = TRUE) 94 | ) %>% 95 | pull() 96 | 97 | ### Impute the missing age values with the average age by gradyear ---- 98 | teens_imputed_tbl <- teens_gendered_tbl %>% 99 | mutate( 100 | age = case_when( 101 | !is.na(age) ~ age, 102 | TRUE ~ average_age_by_gradyear 103 | ) 104 | ) 105 | 106 | ### Check the summary results to ensure missing values are eliminated ---- 107 | teens_imputed_tbl %>% 108 | select(age) %>% 109 | summary() 110 | 111 | 112 | ## 3. Creating the recipe ---- 113 | 114 | ### Apply normalization to entire data frame ---- 115 | recipe_obj <- recipe(teens_imputed_tbl) %>% 116 | step_normalize( 117 | all_numeric(), 118 | -c(gradyear, age, friends) 119 | ) %>% 120 | step_dummy( 121 | gender, 122 | keep_original_cols = TRUE, 123 | one_hot = TRUE 124 | ) 125 | recipe_obj 126 | 127 | recipe_baked_tbl <- recipe_obj %>% 128 | prep() %>% 129 | bake(new_data = NULL) 130 | recipe_baked_tbl 131 | 132 | 133 | ## 4. 
Training a model on the data ---- 134 | 135 | ### Create a z-score standardized data frame for easier interpretation ---- 136 | interests_tbl <- recipe_baked_tbl %>% 137 | select(5:40) 138 | 139 | ### Compare the data before and after the transformation ---- 140 | 141 | # Before 142 | teens_imputed_tbl %>% 143 | select(basketball) %>% 144 | summary() 145 | 146 | # After 147 | interests_tbl %>% 148 | select(basketball) %>% 149 | summary() 150 | 151 | ### Create the clusters using k-means ---- 152 | RNGversion("3.5.2") 153 | set.seed(2345) 154 | 155 | teens_clusters <- interests_tbl %>% 156 | kmeans(5) 157 | teens_clusters 158 | 159 | 160 | ## 5. Evaluating model performance ---- 161 | 162 | ### Look at the single-row summary ---- 163 | teens_clusters %>% 164 | glance() 165 | 166 | ### Look at the size and the centers of the clusters ---- 167 | teens_clusters %>% 168 | tidy() %>% 169 | select(cluster, size, withinss, everything()) 170 | 171 | 172 | ## 6. Improving model performance ---- 173 | 174 | ### Apply the cluster IDs to the original data frame ---- 175 | teens_and_clusters <- augment(teens_clusters, recipe_baked_tbl) 176 | teens_and_clusters 177 | 178 | ### Look at the first five records ---- 179 | teens_and_clusters %>% 180 | select(.cluster, gender, age, friends) %>% 181 | slice_head(n = 5) 182 | 183 | ### Mean age by cluster ---- 184 | teens_and_clusters %>% 185 | with_groups( 186 | .cluster, 187 | summarize, 188 | age = mean(age) 189 | ) 190 | 191 | ### Proportion of females by cluster ---- 192 | teens_and_clusters %>% 193 | with_groups( 194 | .cluster, 195 | summarize, 196 | gender_F = mean(gender_F) 197 | ) 198 | 199 | ### Mean number of friends by cluster ---- 200 | teens_and_clusters %>% 201 | with_groups( 202 | .cluster, 203 | summarize, 204 | friends = mean(friends) 205 | ) 206 | 207 | 208 | ## 7. 
K-means clustering with tidy data principles ---- 209 | 210 | ### Exploratory clustering ---- 211 | kclusts <- 212 | tibble(k = 1:9) %>% 213 | mutate( 214 | kclust = map(k, ~kmeans(interests_tbl, .x)), 215 | tidied = map(kclust, tidy), 216 | glanced = map(kclust, glance), 217 | augmented = map(kclust, augment, recipe_baked_tbl) 218 | ) 219 | 220 | ### Create three separate datasets ---- 221 | clusters <- kclusts %>% 222 | unnest(cols = c(tidied)) 223 | 224 | assignments <- kclusts %>% 225 | unnest(cols = c(augmented)) 226 | 227 | clusterings <- kclusts %>% 228 | unnest(cols = c(glanced)) 229 | 230 | ### Plot the original points ---- 231 | p1 <- assignments %>% 232 | ggplot(aes(sports, music)) + 233 | geom_point(aes(color = .cluster), alpha = 0.5) + 234 | facet_wrap(vars(k)) + 235 | theme_bw() 236 | p1 237 | 238 | p2 <- p1 + 239 | geom_point(data = clusters, size = 5, shape = "x") 240 | p2 241 | 242 | 243 | ### Plot the total within sum of squares (tot.withinss) ---- 244 | clusterings %>% 245 | ggplot(aes(k, tot.withinss)) + 246 | geom_line() + 247 | geom_point() + 248 | theme_bw() + 249 | scale_x_continuous(breaks = seq(1, 9, by = 1)) + 250 | labs( 251 | x = "k", 252 | y = "Total Within Sum of Squares" 253 | ) 254 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_09_finding_groups_of_data_clustering_with_k-means.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 9: Finding Groups of Data - Clustering with k-means" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Loading Packages 13 | 14 | ```{r} 15 | library(conflicted) 16 | library(janitor) 17 | library(tidyclust) 18 | library(tidymodels) 19 | library(tidyverse) 20 | ``` 21 | 22 | ## Exploring and preparing the data 23 | 24 | ```{r} 25 | teens_tbl <- read_csv("data/snsdata.csv") %>% 26 | mutate(across(where(is.character), as.factor)) 27 | ``` 28 | 29 | ```{r} 30 | teens_tbl %>% 31 | glimpse() 32 | ``` 33 | 34 | ### Look at missing data for age variable 35 | 36 | ```{r} 37 | teens_tbl %>% 38 | pull(age) %>% 39 | summary() 40 | ``` 41 | 42 | ### Eliminate age outliers 43 | 44 | ```{r} 45 | teens_only_tbl <- teens_tbl %>% 46 | mutate( 47 | age = case_when( 48 | between(age, 13, 20) ~ age, 49 | .default = NA_real_ 50 | ) 51 | ) 52 | ``` 53 | 54 | ```{r} 55 | teens_only_tbl %>% 56 | pull(age) %>% 57 | summary() 58 | ``` 59 | 60 | ### Look at missing data for female variable 61 | 62 | ```{r} 63 | teens_only_tbl %>% 64 | tabyl(gender) %>% 65 | adorn_pct_formatting(digits = 1) 66 | ``` 67 | 68 | ### Reassign missing gender values to "unknown" and change into factors 69 | 70 | ```{r} 71 | teens_gendered_tbl <- teens_only_tbl %>% 72 | mutate( 73 | gender = case_when( 74 | gender %in% c("F", "M") ~ gender, 75 | .default = "gender_unknown" 76 | ) %>% 77 | as.factor() 78 | ) 79 | ``` 80 | 81 | ### Check our recoding work 82 | 83 | ```{r} 84 | teens_gendered_tbl %>% 85 | tabyl(gender) %>% 86 | adorn_pct_formatting(digits = 1) %>% 87 | arrange(desc(percent)) 88 | ``` 89 | 90 | ### Finding the mean age by cohort 91 | 92 | Doesn't work because of the NAs 93 | 94 | ```{r} 95 | teens_gendered_tbl %>% 96 | pull(age) %>% 97 | mean() 98 | ``` 99 | 100 | Works thanks to the na.rm = TRUE 101 | 102 | ```{r} 
103 | teens_gendered_tbl %>% 104 | pull(age) %>% 105 | mean(na.rm = TRUE) 106 | ``` 107 | 108 | ### Age by cohort 109 | 110 | ```{r} 111 | teens_gendered_tbl %>% 112 | summarize( 113 | age = mean(age, na.rm = TRUE), 114 | .by = gradyear 115 | ) 116 | ``` 117 | 118 | ### Create a vector with the average age for each gradyear, repeated by person 119 | 120 | ```{r} 121 | average_age_by_gradyear <- teens_gendered_tbl %>% 122 | mutate( 123 | ave_age = mean(age, na.rm = TRUE), 124 | .by = gradyear 125 | ) %>% 126 | pull() 127 | ``` 128 | 129 | ### Impute the missing age values with the average age by gradyear 130 | 131 | ```{r} 132 | teens_imputed_tbl <- teens_gendered_tbl %>% 133 | mutate( 134 | age = case_when( 135 | !is.na(age) ~ age, 136 | .default = average_age_by_gradyear 137 | ) 138 | ) 139 | ``` 140 | 141 | ### Check the summary results to ensure missing values are eliminated 142 | 143 | ```{r} 144 | teens_imputed_tbl %>% 145 | pull(age) %>% 146 | summary() 147 | ``` 148 | 149 | ## Creating the recipe 150 | 151 | ### Apply normalization to entire data frame 152 | 153 | ```{r} 154 | recipe_obj <- recipe(teens_imputed_tbl) %>% 155 | step_normalize( 156 | all_numeric(), 157 | -c(gradyear, age, friends) 158 | ) %>% 159 | step_dummy( 160 | gender, 161 | keep_original_cols = TRUE, 162 | one_hot = TRUE 163 | ) 164 | 165 | recipe_obj 166 | ``` 167 | 168 | ```{r} 169 | recipe_baked_tbl <- recipe_obj %>% 170 | prep() %>% 171 | bake(new_data = NULL) 172 | 173 | recipe_baked_tbl 174 | ``` 175 | 176 | ## Training a model on the data 177 | 178 | ### Create a z-score standardized data frame for easier interpretation 179 | 180 | ```{r} 181 | interests_tbl <- recipe_baked_tbl %>% 182 | select(5:40) 183 | ``` 184 | 185 | ### Compare the data before and after the transformation 186 | 187 | Before 188 | 189 | ```{r} 190 | teens_imputed_tbl %>% 191 | pull(basketball) %>% 192 | summary() 193 | ``` 194 | 195 | After 196 | 197 | ```{r} 198 | interests_tbl %>% 199 | pull(basketball) %>% 200 | summary() 201 | ``` 202 | 203 | #### Create model specification 204 | 205 | ```{r} 206 | model_spec_kmeans <- k_means( 207 | engine = "stats", 208 | num_clusters = 5 209 | ) 210 | 211 | model_spec_kmeans 212 | ``` 213 | 214 | ```{r} 215 | set.seed(2345) 216 | 217 | model_fit_kmeans <- model_spec_kmeans %>% 218 | fit( 219 | ~ ., 220 | interests_tbl 221 | ) 222 | 223 | model_fit_kmeans 224 | ``` 225 | 226 | ## Evaluating model performance 227 | 228 | ### Look at the single-row summary 229 | 230 | ```{r} 231 | model_fit_kmeans %>% 232 | glance() 233 | ``` 234 | 235 | ### Look at the size and the centers of the clusters 236 | 237 | ```{r} 238 | model_fit_kmeans %>% 239 | tidy() %>% 240 | select(cluster, size, withinss, everything()) 241 | ``` 242 | 243 | ## Improving model performance 244 | 245 | ### Apply the cluster IDs to the original data frame 246 | 247 | ```{r} 248 | teens_and_clusters <- augment(model_fit_kmeans, recipe_baked_tbl) 249 | teens_and_clusters 250 | ``` 251 | 252 | ### Look at the first five records 253 | 254 | ```{r} 255 | teens_and_clusters %>% 256 | select(.pred_cluster, gender, age, friends) %>% 257 | slice_head(n = 5) 258 | ``` 259 | 260 | ### Mean age by cluster 261 | 262 | ```{r} 263 | teens_and_clusters %>% 264 | summarize( 265 | age = mean(age), 266 | .by = .pred_cluster 267 | ) 268 | ``` 269 | 270 | ### Proportion of females by cluster 271 | 272 | ```{r} 273 | teens_and_clusters %>% 274 | summarize( 275 | gender_F = mean(gender_F), 276 | .by = .pred_cluster 277 | ) 278 | ``` 279 | 280 | ### Mean 
number of friends by cluster 281 | 282 | ```{r} 283 | teens_and_clusters %>% 284 | summarize( 285 | friends = mean(friends), 286 | .by = .pred_cluster 287 | ) 288 | ``` 289 | 290 | ## K-means clustering with tidy data principles 291 | 292 | ### Exploratory clustering 293 | 294 | ```{r} 295 | kclusts <- 296 | tibble(k = 1:9) %>% 297 | mutate( 298 | kclust = map(k, ~kmeans(interests_tbl, .x)), 299 | tidied = map(kclust, tidy), 300 | glanced = map(kclust, glance), 301 | augmented = map(kclust, augment, recipe_baked_tbl) 302 | ) 303 | ``` 304 | 305 | ### Create three separate datasets 306 | 307 | ```{r} 308 | clusters <- kclusts %>% 309 | unnest(cols = c(tidied)) 310 | 311 | assignments <- kclusts %>% 312 | unnest(cols = c(augmented)) 313 | 314 | clusterings <- kclusts %>% 315 | unnest(cols = c(glanced)) 316 | ``` 317 | 318 | ### Plot the original points 319 | 320 | ```{r} 321 | p1 <- assignments %>% 322 | ggplot(aes(sports, music)) + 323 | geom_point(aes(color = .cluster), alpha = 0.5) + 324 | facet_wrap(vars(k)) + 325 | theme_bw() 326 | 327 | p1 328 | ``` 329 | 330 | ```{r} 331 | p2 <- p1 + 332 | geom_point(data = clusters, size = 5, shape = "x") 333 | 334 | p2 335 | ``` 336 | 337 | ### Plot the total within sum of squares (tot.withinss) 338 | 339 | ```{r} 340 | clusterings %>% 341 | ggplot(aes(k, tot.withinss)) + 342 | geom_line() + 343 | geom_point() + 344 | theme_bw() + 345 | scale_x_continuous(breaks = seq(1, 9, by = 1)) + 346 | labs( 347 | x = "k", 348 | y = "Total Within Sum of Squares" 349 | ) 350 | ``` 351 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/regression_methods/estimating_wine_quality_with_regression_trees_and_model_trees.r: -------------------------------------------------------------------------------- 1 | # Estimating Wine Quality with Regression Trees and Model Trees ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 6: 4 | # Forecasting Numeric Data - Regression Methods. 5 | # 6 | # The original code is made with {rpart}, {rpart.plot} and {Cubist}. I wanted to 7 | # see how one could recreate it using mainly {tidymodels}, {tidyverse} and {rules}. 8 | # 9 | # You can find the original code and the dataset here: 10 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter06 11 | 12 | ## 1. Loading libraries (in the order they get used) ---- 13 | library(conflicted) 14 | conflict_prefer("filter", "dplyr", "stats") 15 | library(tidyverse) 16 | library(tidymodels) 17 | library(rattle) 18 | library(corrr) 19 | library(rules) 20 | 21 | 22 | ## 2. Preparing and exploring the data ---- 23 | wine_tbl <- read_csv("machine_learning_with_r_3rd_ed/regression_methods/data/whitewines.csv") %>% 24 | rename_all( 25 | ~str_replace_all(., "\\s+", "_") %>% 26 | tolower() 27 | ) 28 | wine_tbl 29 | 30 | glimpse(wine_tbl) 31 | 32 | ### The distribution of quality ratings ---- 33 | wine_tbl %>% 34 | ggplot(aes(quality)) + 35 | geom_histogram() 36 | 37 | ### Summary statistics of the wine data ---- 38 | summary(wine_tbl) 39 | 40 | wine_split <- initial_time_split( 41 | wine_tbl, 42 | prop = 3750 / 4898 43 | ) 44 | wine_train <- training(wine_split) 45 | wine_test <- testing(wine_split) 46 | 47 | 48 | ## 3. 
Training a model on the data ---- 49 | # regression tree using rpart 50 | model_spec <- decision_tree( 51 | mode = "regression", 52 | engine = "rpart", 53 | cost_complexity = NULL, 54 | tree_depth = NULL, 55 | min_n = NULL 56 | ) %>% 57 | translate() 58 | model_spec 59 | 60 | ### Fit the model ---- 61 | model_fit <- fit( 62 | model_spec, 63 | quality ~ ., 64 | wine_train 65 | ) 66 | 67 | ### Get basic information about the tree ---- 68 | model_fit 69 | 70 | ### Get more detailed information about the tree ---- 71 | summary(model_fit$fit) 72 | 73 | ### Adjust plot margins to make the visualization work better 74 | par(mar = c(1, 1, 1, 1)) 75 | 76 | ### Use the rattle package to create a visualization ---- 77 | model_fit$fit %>% 78 | fancyRpartPlot(cex = 0.5) 79 | 80 | 81 | ## 4. Evaluate model performance ---- 82 | 83 | ### Generate predictions for the testing dataset (you could skip this step) ---- 84 | wine_test_pred <- predict( 85 | object = model_fit, 86 | new_data = wine_test, 87 | type = "numeric" 88 | ) 89 | wine_test_pred 90 | 91 | ### Add the predictions to the test tibble ---- 92 | wine_test_with_pred_tbl <- augment(model_fit, wine_test) 93 | wine_test_with_pred_tbl 94 | 95 | ### Compare the distribution of actual values vs. predicted values ---- 96 | wine_test_with_pred_tbl %>% 97 | select(quality, .pred) %>% 98 | summary() 99 | 100 | ### Compare the correlation ---- 101 | wine_test_with_pred_tbl %>% 102 | select(quality, .pred) %>% 103 | correlate() 104 | 105 | ### Mean absolute error between actual and predicted values ---- 106 | wine_test_with_pred_tbl %>% 107 | metrics(quality, .pred) %>% 108 | filter(.metric == "mae") 109 | 110 | ### Mean absolute error between actual values and mean value ---- 111 | mean_value <- wine_train$quality %>% 112 | mean() 113 | mean_value 114 | 115 | mae <- function(actual, predicted) { 116 | mean(abs(actual - predicted)) 117 | } 118 | 119 | mae(wine_test$quality, mean_value) 120 | 121 | 122 | ## 5. Improving model performance ---- 123 | # train a Cubist model tree 124 | model_spec_cubist <- cubist_rules( 125 | mode = "regression", 126 | engine = "Cubist", 127 | committees = NULL, 128 | neighbors = NULL, 129 | max_rules = NULL 130 | ) %>% 131 | translate() 132 | model_spec_cubist 133 | 134 | ### Fit the model ---- 135 | model_fit_cubist <- fit( 136 | model_spec_cubist, 137 | quality ~ ., 138 | wine_train 139 | ) 140 | 141 | ### Display basic information about the model tree ---- 142 | model_fit_cubist 143 | 144 | ### Display the tree itself ---- 145 | summary(model_fit_cubist$fit) 146 | 147 | ### Generate predictions for the model ---- 148 | wine_test_pred_cubist <- predict( 149 | object = model_fit_cubist, 150 | new_data = wine_test, 151 | type = "numeric" 152 | ) 153 | wine_test_pred_cubist 154 | 155 | ### Summary statistics about the predictions ---- 156 | wine_test_pred_cubist %>% 157 | summary() 158 | 159 | ### Add the predictions to the test tibble ---- 160 | wine_test_with_pred_cubist <- augment(model_fit_cubist, wine_test) 161 | wine_test_with_pred_cubist 162 | 163 | ### Compare the distribution of actual values vs. 
predicted values ---- 164 | wine_test_with_pred_cubist %>% 165 | select(quality, .pred) %>% 166 | summary() 167 | 168 | ### Correlation between the true and predicted values ---- 169 | wine_test_with_pred_cubist %>% 170 | select(quality, .pred) %>% 171 | correlate() 172 | 173 | ### Mean absolute error of true and predicted values ---- 174 | wine_test_with_pred_cubist %>% 175 | metrics(quality, .pred) %>% 176 | filter(.metric == "mae") 177 | 178 | 179 | ## 6. Creating a function to test the model(s) with another dataset ---- 180 | 181 | # The assumption here is that you have already taken step 1. 182 | 183 | # Preparing and exploring the other wine dataset 184 | red_wine_tbl <- read_csv("machine_learning_with_r_3rd_ed/regression_methods/data/redwines.csv") %>% 185 | rename_all( 186 | ~str_replace_all(., "\\s+", "_") %>% 187 | tolower() 188 | ) 189 | red_wine_tbl 190 | 191 | ### The distribution of quality ratings ---- 192 | red_wine_tbl %>% 193 | ggplot(aes(quality)) + 194 | geom_histogram() 195 | 196 | ### Summary statistics of the wine data ---- 197 | summary(red_wine_tbl) 198 | 199 | ### Create the function 200 | 201 | predict_wine_quality <- function( 202 | .engine = c("rpart", "Cubist"), 203 | .winecolor = c("red", "white") 204 | ) { 205 | 206 | # Check that the wine color is valid 207 | if (!.winecolor %in% c("red", "white")) stop("Choose a wine color: red or white") 208 | 209 | # Write out the path so that you can insert the wine color in there 210 | path <- str_glue("machine_learning_with_r_3rd_ed/regression_methods/data/{.winecolor}wines.csv") 211 | 212 | # Read in the data 213 | wine_tbl <- read_csv(path) %>% 214 | rename_all( 215 | ~str_replace_all(., "\\s+", "_") %>% 216 | tolower() 217 | ) 218 | 219 | # Make the train/test split. It's not randomizing, as the data already is 220 | wine_split <- initial_time_split( 221 | wine_tbl, 222 | prop = 0.75 223 | ) 224 | wine_train <- training(wine_split) 225 | wine_test <- testing(wine_split) 226 | 227 | # Create the model based on the engine chosen 228 | if (.engine == "rpart") { 229 | 230 | model_spec <- decision_tree( 231 | mode = "regression", 232 | engine = "rpart" 233 | ) %>% 234 | translate() 235 | 236 | } else if (.engine == "Cubist") { 237 | 238 | model_spec <- cubist_rules( 239 | mode = "regression", 240 | engine = "Cubist" 241 | ) %>% 242 | translate() 243 | 244 | } else { 245 | 246 | stop("Choose an engine: rpart (decision tree) or Cubist (rules)") 247 | 248 | } 249 | 250 | # Fit the model 251 | model_fit <- fit( 252 | model_spec, 253 | quality ~ ., 254 | wine_train 255 | ) 256 | 257 | # Add the predictions 258 | wine_test_with_pred_tbl <- augment(model_fit, wine_test) 259 | 260 | # Get the metrics 261 | wine_test_with_pred_tbl %>% 262 | metrics(quality, .pred) 263 | 264 | } 265 | 266 | ### Test the function ---- 267 | predict_wine_quality( 268 | .engine = "Cubist", 269 | .winecolor = "white" 270 | ) 271 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_03_lazy_learning_classification_using_nearest_neighbors.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 3: Lazy Learning - Classification Using Nearest Neighbors" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | # 3 Lazy 
Learning - Classification Using Nearest Neighbors 13 | 14 | ## Load Packages 15 | 16 | ```{r} 17 | library(conflicted) # An Alternative Conflict Resolution Strategy 18 | library(janitor) # Simple Tools for Examining and Cleaning Dirty Data 19 | library(tidymodels) # Easily Install and Load the 'Tidymodels' Packages 20 | library(tidyverse) # Easily Install and Load the 'Tidyverse' 21 | ``` 22 | 23 | ## Exploring and preparing the data 24 | 25 | ### Import the CSV file (Breast Cancer Wisconsin (Diagnostic)) 26 | 27 | ```{r} 28 | wbcd_tbl <- read_csv( 29 | "data/wisc_bc_data.csv" 30 | ) 31 | ``` 32 | 33 | ### Take a look at the tibble 34 | 35 | ```{r} 36 | glimpse(wbcd_tbl) 37 | ``` 38 | 39 | ### Drop the unnecessary id column 40 | 41 | ```{r} 42 | wbcd_selected_tbl <- wbcd_tbl %>% 43 | select(-id) 44 | 45 | wbcd_selected_tbl 46 | ``` 47 | 48 | ```{r} 49 | # table of diagnosis 50 | wbcd_tbl %>% 51 | tabyl(diagnosis) %>% 52 | adorn_pct_formatting(digits = 1) 53 | ``` 54 | 55 | ### Transform diagnosis to a factor 56 | 57 | ```{r} 58 | wbcd_factored_tbl <- wbcd_selected_tbl %>% 59 | mutate( 60 | diagnosis = factor( 61 | diagnosis, 62 | levels = c("B", "M"), 63 | labels = c("Benign", "Malignant") 64 | ) 65 | ) 66 | 67 | wbcd_factored_tbl 68 | ``` 69 | 70 | ### Count the number of the two diagnosis (incl. percentage) 71 | 72 | ```{r} 73 | wbcd_factored_tbl %>% 74 | tabyl(diagnosis) %>% 75 | adorn_pct_formatting(digits = 1) 76 | ``` 77 | 78 | ### Summarize three numeric features 79 | 80 | ```{r} 81 | wbcd_factored_tbl %>% 82 | select(radius_mean, area_mean, smoothness_mean) %>% 83 | summary() 84 | ``` 85 | 86 | ## Creating the recipe and splitting the data 87 | 88 | ### Normalize the wbcd data 89 | 90 | ```{r} 91 | recipe_obj <- recipe( 92 | diagnosis ~ ., 93 | data = wbcd_factored_tbl 94 | ) %>% 95 | step_range( 96 | all_numeric_predictors(), 97 | min = 0, 98 | max = 1 99 | ) 100 | 101 | recipe_obj 102 | ``` 103 | 104 | ```{r} 105 | wbcd_normalized_tbl <- recipe_obj %>% 106 | prep() %>% 107 | bake(new_data = NULL) 108 | ``` 109 | 110 | ### Confirm that normalization worked 111 | 112 | ```{r} 113 | wbcd_normalized_tbl %>% 114 | select(area_mean) %>% 115 | summary() 116 | ``` 117 | 118 | ### Create training and test data (randomly) 119 | 120 | ```{r} 121 | wbcd_split <- initial_split( 122 | wbcd_normalized_tbl, 123 | prop = 469 / 569 124 | ) 125 | 126 | wbcd_train <- training(wbcd_split) 127 | wbcd_test <- testing(wbcd_split) 128 | ``` 129 | 130 | ## Training a model on the data 131 | 132 | kknn is the engine (and needs to be installed if it isn't already). It is used as the engine for {parsnip}'s nearest_neighbor() function. And since we are classifying, that is the mode we choose. 
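If {kknn} is not yet installed, a one-time install can be run before fitting. This chunk is an added sketch that mirrors the commented-out install in the 3rd-edition script; it is not part of the original book code.

```{r}
# One-time install of the {kknn} engine, if it isn't available yet
# (commented out so the chunk is safe to knit):
# install.packages("kknn")
```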
133 | 134 | ### Create model specification 135 | 136 | ```{r} 137 | model_spec <- nearest_neighbor( 138 | engine = "kknn", 139 | mode = "classification", 140 | neighbors = 21 141 | ) %>% 142 | translate() 143 | 144 | model_spec 145 | ``` 146 | 147 | ### Fit the model 148 | 149 | ```{r} 150 | model_fit <- model_spec %>% 151 | fit( 152 | diagnosis ~ ., 153 | wbcd_train 154 | ) 155 | 156 | model_fit 157 | ``` 158 | 159 | ### Make the predictions (you could skip this step) 160 | 161 | ```{r} 162 | wbcd_test_pred <- model_fit %>% 163 | predict( 164 | new_data = wbcd_test, 165 | type = "class" 166 | ) 167 | 168 | wbcd_test_pred 169 | ``` 170 | 171 | ### Add the predictions to the test tibble 172 | 173 | ```{r} 174 | wbcd_test_with_pred_tbl <- augment(model_fit, wbcd_test) 175 | wbcd_test_with_pred_tbl 176 | ``` 177 | 178 | ## Evaluating model performance 179 | 180 | ### Create a confusion matrix 181 | 182 | ```{r} 183 | conf_mat <- conf_mat( 184 | data = wbcd_test_with_pred_tbl, 185 | truth = diagnosis, 186 | estimate = .pred_class 187 | ) 188 | 189 | conf_mat 190 | ``` 191 | 192 | ### Visualize the confusion matrix 193 | 194 | ```{r} 195 | conf_mat %>% 196 | autoplot(type = "heatmap") 197 | 198 | conf_mat %>% 199 | autoplot(type = "mosaic") 200 | ``` 201 | 202 | ### Visualize the ROC curve 203 | 204 | ```{r} 205 | wbcd_test_with_pred_tbl %>% 206 | roc_curve( 207 | truth = diagnosis, 208 | .pred_Benign 209 | ) %>% 210 | autoplot() 211 | ``` 212 | 213 | ### Calculate the ROC AUC (area under the curve) 214 | 215 | ```{r} 216 | wbcd_roc_auc <- wbcd_test_with_pred_tbl %>% 217 | roc_auc( 218 | truth = diagnosis, 219 | .pred_Benign 220 | ) 221 | 222 | wbcd_roc_auc 223 | ``` 224 | 225 | ### Put together other model metrics 226 | 227 | ```{r} 228 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 229 | classification_metrics <- conf_mat( 230 | wbcd_test_with_pred_tbl, 231 | truth = diagnosis, 232 | estimate = .pred_class 233 | ) %>% 234 | summary() 235 | 236 | classification_metrics 237 | ``` 238 | 239 | ## Creating a function to help evaluate the model further 240 | 241 | The idea is to be able to choose different values for k and different methods for standardization (range (0 to 1) and normalization). 
242 | 243 | ```{r} 244 | classify_with_knn <- function( 245 | k = 21, 246 | standardization_method = c("range", "normalization") 247 | ) { 248 | 249 | # Create a recipe according to the chosen standardization method 250 | if (standardization_method == "range") { 251 | 252 | recipe_obj <- recipe( 253 | formula = diagnosis ~ ., 254 | data = wbcd_factored_tbl 255 | ) %>% 256 | step_range( 257 | all_numeric_predictors(), 258 | min = 0, 259 | max = 1) 260 | 261 | } else if (standardization_method == "normalization") { 262 | 263 | recipe_obj <- recipe( 264 | formula = diagnosis ~ ., 265 | data = wbcd_factored_tbl 266 | ) %>% 267 | step_normalize(all_numeric_predictors()) 268 | 269 | } else { 270 | 271 | stop('Choose a standardization method that is either "range" or "normalization"!') 272 | 273 | } 274 | 275 | wbcd_normalized_tbl <- recipe_obj %>% 276 | prep() %>% 277 | bake(new_data = wbcd_factored_tbl) 278 | 279 | # Create training and test data 280 | wbcd_split <- initial_split( 281 | wbcd_normalized_tbl, 282 | prop = 469 / 569 283 | ) 284 | wbcd_train <- training(wbcd_split) 285 | wbcd_test <- testing(wbcd_split) 286 | 287 | # Create model specification 288 | model_spec <- nearest_neighbor( 289 | engine = "kknn", 290 | mode = "classification", 291 | neighbors = k 292 | ) %>% 293 | translate() 294 | 295 | # Fit the model 296 | model_fit <- model_spec %>% 297 | fit( 298 | diagnosis ~ ., 299 | wbcd_train 300 | ) 301 | 302 | # Add the predictions to the test tibble 303 | wbcd_test_with_pred_tbl <- augment(model_fit, wbcd_test) 304 | 305 | # Create a confusion matrix 306 | conf_mat <- conf_mat( 307 | data = wbcd_test_with_pred_tbl, 308 | truth = diagnosis, 309 | estimate = .pred_class 310 | ) 311 | 312 | # Print the confusion matrix 313 | conf_mat %>% autoplot(type = "heatmap") 314 | } 315 | ``` 316 | 317 | ### Test the function 318 | 319 | ```{r} 320 | # standardization_method is either "range" or "normalization" 321 | classify_with_knn( 322 | standardization_method = "range", 323 | k = 5 324 | ) 325 | ``` 326 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/Chapter_14.r: -------------------------------------------------------------------------------- 1 | ##### Chapter 14: Building Better Learners ------------------- 2 | 3 | # load the credit dataset 4 | credit <- read.csv("credit.csv") 5 | library(caret) 6 | 7 | ## Creating a simple tuned model ---- 8 | 9 | # look up the tuning parameters for C5.0 10 | modelLookup("C5.0") 11 | 12 | # automated parameter tuning of C5.0 decision tree 13 | set.seed(300) 14 | m <- train(default ~ ., data = credit, method = "C5.0") 15 | 16 | # summary of tuning results 17 | m 18 | 19 | # apply the best C5.0 candidate model to make predictions 20 | p <- predict(m, credit) 21 | table(p, credit$default) 22 | 23 | # obtain predicted classes 24 | head(predict(m, credit)) 25 | 26 | # obtain predicted probabilities 27 | head(predict(m, credit, type = "prob")) 28 | 29 | ## Customizing the tuning process ---- 30 | # use trainControl() to alter resampling strategy 31 | ctrl <- trainControl(method = "cv", number = 10, 32 | selectionFunction = "oneSE") 33 | 34 | # use expand.grid() to create grid of tuning parameters 35 | grid <- expand.grid(model = "tree", 36 | trials = c(1, 5, 10, 15, 20, 25, 30, 35), 37 | winnow = FALSE) 38 | 39 | # look at the result of expand.grid() 40 | grid 41 | 42 | # customize train() with the control list and grid of parameters 43 | set.seed(300) 44 | m <- train(default ~ ., data = 
credit, method = "C5.0", 45 | metric = "Kappa", 46 | trControl = ctrl, 47 | tuneGrid = grid) 48 | 49 | # see the results 50 | m 51 | 52 | ## Bagging ---- 53 | # Using the ipred bagged decision trees 54 | library(ipred) 55 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE) 56 | set.seed(300) 57 | mybag <- bagging(default ~ ., data = credit, nbagg = 25) 58 | credit_pred <- predict(mybag, credit) 59 | table(credit_pred, credit$default) 60 | 61 | # estimate performance of ipred bagged trees 62 | library(caret) 63 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE) 64 | set.seed(300) 65 | ctrl <- trainControl(method = "cv", number = 10) 66 | train(default ~ ., data = credit, method = "treebag", 67 | trControl = ctrl) 68 | 69 | ## Boosting ---- 70 | 71 | ## Using C5.0 Decision Tree (not shown in book) 72 | library(C50) 73 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE) 74 | m_c50_bst <- C5.0(default ~ ., data = credit, trials = 100) 75 | 76 | # create a Adaboost.M1 model 77 | library(adabag) 78 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE) 79 | set.seed(300) 80 | m_adaboost <- boosting(default ~ ., data = credit) 81 | p_adaboost <- predict(m_adaboost, credit) 82 | head(p_adaboost$class) 83 | p_adaboost$confusion 84 | 85 | # create and evaluate an Adaboost.M1 model using 10-fold-CV 86 | set.seed(300) 87 | adaboost_cv <- boosting.cv(default ~ ., data = credit) 88 | adaboost_cv$confusion 89 | 90 | # calculate kappa 91 | library(vcd) 92 | Kappa(adaboost_cv$confusion) 93 | 94 | ## Random Forests ---- 95 | # random forest with default settings 96 | library(randomForest) 97 | set.seed(300) 98 | rf <- randomForest(default ~ ., data = credit) 99 | rf 100 | 101 | # calculate kappa on the out-of-bag estimate 102 | library(vcd) 103 | Kappa(rf$confusion[1:2,1:2]) 104 | 105 | # ranger is a faster implementation of the random forest algorithm 106 | library(ranger) 107 | set.seed(300) 108 | m_ranger <- ranger(default ~ ., data = credit) 109 | m_ranger 110 | 111 | # calculate kappa 112 | Kappa(m_ranger$confusion.matrix) 113 | 114 | ## Gradient Boosting Machines (GBM) ---- 115 | 116 | # prepare the data for gbm() 117 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE) 118 | credit$default <- ifelse(credit$default == "yes", 1, 0) 119 | 120 | # create a random train and test split 121 | set.seed(123) 122 | train_sample <- sample(1000, 900) 123 | credit_train <- credit[train_sample, ] 124 | credit_test <- credit[-train_sample, ] 125 | 126 | # create a GBM model with default parameters 127 | library(gbm) 128 | set.seed(300) 129 | m_gbm <- gbm(default ~ ., data = credit_train) 130 | m_gbm 131 | 132 | # evaluate the simple GBM model 133 | p_gbm <- predict(m_gbm, credit_test, type = "response") 134 | p_gbm_c <- ifelse(p_gbm > 0.50, 1, 0) 135 | table(credit_test$default, p_gbm_c) 136 | 137 | # compute kappa 138 | library(vcd) 139 | Kappa(table(credit_test$default, p_gbm_c)) 140 | 141 | # create a tuned gbm() model using caret 142 | # start by creating the tuning grid 143 | grid_gbm <- expand.grid( 144 | n.trees = c(100, 150, 200), 145 | interaction.depth = c(1, 2, 3), 146 | shrinkage = c(0.01, 0.1, 0.3), 147 | n.minobsinnode = 10 148 | ) 149 | 150 | # define the experiment's parameters 151 | library(caret) 152 | ctrl <- trainControl(method = "cv", number = 10, 153 | selectionFunction = "best") 154 | 155 | # run the caret experiment 156 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE) 157 | set.seed(300) 158 | m_gbm_c <- train(default ~ ., data = credit, method 
= "gbm", 159 | trControl = ctrl, tuneGrid = grid_gbm, 160 | metric = "Kappa", 161 | verbose = FALSE) 162 | 163 | # see the results 164 | m_gbm_c 165 | 166 | ## Extreme Gradient Boosting (XGB) ---- 167 | 168 | # Format the credit data as a sparse matrix 169 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE) 170 | library(Matrix) 171 | credit_matrix <- sparse.model.matrix(~ . -default, data = credit) 172 | 173 | # examine the sparse credit_matrix 174 | dim(credit_matrix) 175 | print(credit_matrix[1:5, 1:15]) 176 | 177 | # remove the intercept 178 | credit_matrix <- credit_matrix[, -1] 179 | 180 | # split the matrix into train and test 181 | set.seed(12345) 182 | train_ids <- sample(1000, 900) 183 | credit_train <- credit_matrix[train_ids, ] 184 | credit_test <- credit_matrix[-train_ids, ] 185 | 186 | # check that the rows are 900 vs. 100 and the cols are 35 vs. 35 187 | dim(credit_train) 188 | dim(credit_test) 189 | 190 | # create 1/0 vectors for train and test data indicating loan default 191 | credit_train_labels <- 192 | ifelse(credit[train_ids, c("default")] == "yes", 1, 0) 193 | credit_test_labels <- 194 | ifelse(credit[-train_ids, c("default")] == "yes", 1, 0) 195 | 196 | # build the xgboost model 197 | library(xgboost) 198 | 199 | # set XGB hyperparameters 200 | params.xgb <- list(objective = "binary:logistic", 201 | max_depth = 6, 202 | eta = 0.3, 203 | gamma = 0, 204 | colsample_bytree = 1, 205 | min_child_weight = 1, 206 | subsample = 1) 207 | 208 | set.seed(555) 209 | xgb_credit <- xgboost(params = params.xgb, 210 | data = credit_train, 211 | label = credit_train_labels, 212 | nrounds = 100, 213 | print_every_n = 10, 214 | verbose = 1) 215 | 216 | # make predictions 217 | prob_default <- predict(xgb_credit, credit_test) 218 | pred_default <- ifelse(prob_default > 0.50, 1, 0) 219 | 220 | # create a confusion matrix 221 | table(pred_default, credit_test_labels) 222 | 223 | # compute kappa 224 | library(vcd) 225 | Kappa(table(pred_default, credit_test_labels)) 226 | 227 | # create a tuned xgboost() model using caret 228 | # start by creating the tuning grid 229 | grid_xgb <- expand.grid( 230 | eta = c(0.3, 0.4), 231 | max_depth = c(1, 2, 3), 232 | colsample_bytree = c(0.6, 0.8), 233 | subsample = c(0.50, 0.75, 1.00), 234 | nrounds = c(50, 100, 150), 235 | gamma = c(0, 1), 236 | min_child_weight = 1 237 | ) 238 | 239 | # define the control object 240 | library(caret) 241 | ctrl <- trainControl(method = "cv", number = 10, 242 | selectionFunction = "best") 243 | 244 | # run the caret experiment 245 | set.seed(300) 246 | m_xgb <- train(default ~ ., data = credit, method = "xgbTree", 247 | trControl = ctrl, tuneGrid = grid_xgb, 248 | metric = "Kappa", verbosity = 0) 249 | 250 | # see the results of all models (not shown in book due to size of output) 251 | m_xgb 252 | 253 | # see the hyperparameters for the best performing model 254 | m_xgb$bestTune 255 | 256 | # get the best kappa out of the 216 models tested 257 | max(m_xgb$results["Kappa"]) 258 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/knn/classifying-cancer-samples-with-knn.r: -------------------------------------------------------------------------------- 1 | # Classifying Cancer Samples with K-Nearest Neighbors ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 3: 4 | # Lazy Learning - Classification Using Nearest Neighbors. 5 | # 6 | # The original code is made with a lot of base R, {class} and {gmodels}. 
I 7 | # wanted to see how one could recreate it using mainly {tidymodels} 8 | # and {tidyverse}. 9 | # 10 | # You can find the original code here: 11 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter03 12 | 13 | ## 1. Loading libraries (in the order they get used)---- 14 | library(conflicted) 15 | library(tidymodels) 16 | library(tidyverse) 17 | 18 | 19 | ## 2. Exploring and preparing the data ---- 20 | 21 | ### Create a vector for the column names ---- 22 | .col_names <- c( 23 | "id", 24 | "diagnosis", 25 | "radius_mean", 26 | "texture_mean", 27 | "perimeter_mean", 28 | "area_mean", 29 | "smoothness_mean", 30 | "compactness_mean", 31 | "concavity_mean", 32 | "points_mean", 33 | "symmetry_mean", 34 | "dimension_mean", 35 | "radius_se", 36 | "texture_se", 37 | "perimeter_se", 38 | "area_se", 39 | "smoothness_se", 40 | "compactness_se", 41 | "concavity_se", 42 | "points_se", 43 | "symmetry_se", 44 | "dimension_se", 45 | "radius_worst", 46 | "texture_worst", 47 | "perimeter_worst", 48 | "area_worst", 49 | "smoothness_worst", 50 | "compactness_worst", 51 | "concavity_worst", 52 | "points_worst", 53 | "symmetry_worst", 54 | "dimension_worst" 55 | ) 56 | 57 | ### Import the CSV file (Breast Cancer Wisconsin (Diagnostic)) ---- 58 | wbcd_tbl <- read_csv("machine_learning_with_r_3rd_ed/knn/data/wdbc-data.csv", col_names = .col_names) 59 | 60 | ### Take a look at the tibble ---- 61 | glimpse(wbcd_tbl) 62 | 63 | ### Drop the unnecessary id column ---- 64 | wbcd_selected_tbl <- wbcd_tbl %>% select(-id) 65 | wbcd_selected_tbl 66 | 67 | ### Transform diagnosis to a factor ---- 68 | wbcd_factored_tbl <- wbcd_selected_tbl %>% 69 | mutate( 70 | diagnosis = factor( 71 | diagnosis, 72 | levels = c("B", "M"), 73 | labels = c("Benign", "Malignant") 74 | ) 75 | ) 76 | wbcd_factored_tbl 77 | 78 | ### Count the number of the two diagnosis (incl. percentage) ---- 79 | wbcd_factored_tbl %>% 80 | count(diagnosis) %>% 81 | mutate(pct = (n / sum(n) * 100)) 82 | 83 | ### Summarize three numeric features ---- 84 | wbcd_factored_tbl %>% 85 | select(radius_mean, area_mean, smoothness_mean) %>% 86 | summary() 87 | 88 | 89 | ## 3. Creating the recipe and splitting the data ---- 90 | 91 | ### Normalize the wbcd data ---- 92 | recipe_obj <- recipe( 93 | diagnosis ~ ., 94 | data = wbcd_factored_tbl 95 | ) %>% 96 | step_range( 97 | all_numeric_predictors(), 98 | min = 0, 99 | max = 1 100 | ) 101 | recipe_obj 102 | 103 | wbcd_normalized_tbl <- recipe_obj %>% 104 | prep() %>% 105 | bake(new_data = NULL) 106 | 107 | ### Confirm that normalization worked ---- 108 | wbcd_normalized_tbl %>% 109 | select(area_mean) %>% 110 | summary() 111 | 112 | ### Create training and test data (randomly) ---- 113 | wbcd_split <- initial_split( 114 | wbcd_normalized_tbl, 115 | prop = 469 / 569 116 | ) 117 | wbcd_train <- training(wbcd_split) 118 | wbcd_test <- testing(wbcd_split) 119 | 120 | 121 | ## 4. Training a model on the data ---- 122 | 123 | # kknn is the engine (needs to be installed if not already): 124 | # install.packages("kknn") 125 | 126 | # It is used as the engine for {parsnip}'s nearest_neighbor() function. 127 | # And since we are classifying, that is the mode we choose. 
128 | 129 | ### Create model specification ---- 130 | model_spec <- nearest_neighbor( 131 | engine = "kknn", 132 | mode = "classification", 133 | neighbors = 21 134 | ) %>% 135 | translate() 136 | model_spec 137 | 138 | ### Fit the model ---- 139 | model_fit <- fit( 140 | model_spec, 141 | diagnosis ~ ., 142 | wbcd_train 143 | ) 144 | model_fit 145 | 146 | ### Make the predictions (you could skip this step) ---- 147 | wbcd_test_pred <- predict( 148 | model_fit, 149 | new_data = wbcd_test, 150 | type = "class" 151 | ) 152 | wbcd_test_pred 153 | 154 | ### Add the predictions to the test tibble ---- 155 | wbcd_test_with_pred_tbl <- augment(model_fit, wbcd_test) 156 | wbcd_test_with_pred_tbl 157 | 158 | 159 | ## 5. Evaluating model performance ---- 160 | 161 | ### Create a confusion matrix ---- 162 | conf_mat <- conf_mat( 163 | data = wbcd_test_with_pred_tbl, 164 | truth = diagnosis, 165 | estimate = .pred_class 166 | ) 167 | conf_mat 168 | 169 | ### Visualize the confusion matrix ---- 170 | conf_mat %>% autoplot(type = "heatmap") 171 | conf_mat %>% autoplot(type = "mosaic") 172 | 173 | ### Visualize the ROC curve ---- 174 | wbcd_test_with_pred_tbl %>% 175 | roc_curve( 176 | truth = diagnosis, 177 | estimate = .pred_Benign 178 | ) %>% 179 | autoplot() 180 | 181 | ### Calculate the ROC AUC (area under the curve) ---- 182 | wbcd_roc_auc <- wbcd_test_with_pred_tbl %>% 183 | roc_auc( 184 | truth = diagnosis, 185 | estimate = .pred_Benign 186 | ) 187 | wbcd_roc_auc 188 | 189 | 190 | ### Put together other model metrics ---- 191 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 192 | classification_metrics <- conf_mat( 193 | wbcd_test_with_pred_tbl, 194 | truth = diagnosis, 195 | estimate = .pred_class 196 | ) %>% 197 | summary() 198 | 199 | 200 | ## 6. Creating a function to help evaluate the model further ---- 201 | 202 | # The assumption here is that you have already gone through steps 1. to 2. 
203 | # The idea is to be able to choose different values for k and different 204 | # methods for standardization (range (0 to 1) and normalization) 205 | 206 | classify_with_knn <- function( 207 | k = 21, 208 | standardization_method = c("range", "normalization") 209 | ) { 210 | 211 | # Create a recipe according to the chosen standardization method 212 | if (standardization_method == "range") { 213 | 214 | recipe_obj <- recipe( 215 | formula = diagnosis ~ ., 216 | data = wbcd_factored_tbl 217 | ) %>% 218 | step_range( 219 | all_numeric_predictors(), 220 | min = 0, 221 | max = 1) 222 | 223 | } else if (standardization_method == "normalization") { 224 | 225 | recipe_obj <- recipe( 226 | formula = diagnosis ~ ., 227 | data = wbcd_factored_tbl 228 | ) %>% 229 | step_normalize(all_numeric_predictors()) 230 | 231 | } else { 232 | 233 | stop('Choose a standardization method that is either "range" or "normalization"!') 234 | 235 | } 236 | 237 | wbcd_normalized_tbl <- recipe_obj %>% 238 | prep() %>% 239 | bake(new_data = wbcd_factored_tbl) 240 | 241 | # Create training and test data 242 | wbcd_split <- initial_split( 243 | wbcd_normalized_tbl, 244 | prop = 469 / 569 245 | ) 246 | wbcd_train <- training(wbcd_split) 247 | wbcd_test <- testing(wbcd_split) 248 | 249 | # Create model specification 250 | model_spec <- nearest_neighbor( 251 | engine = "kknn", 252 | mode = "classification", 253 | neighbors = k 254 | ) %>% 255 | translate() 256 | 257 | # Fit the model 258 | model_fit <- fit( 259 | model_spec, 260 | diagnosis ~ ., 261 | wbcd_train 262 | ) 263 | 264 | # Add the predictions to the test tibble 265 | wbcd_test_with_pred_tbl <- augment(model_fit, wbcd_test) 266 | 267 | # Create a confusion matrix 268 | conf_mat <- conf_mat( 269 | data = wbcd_test_with_pred_tbl, 270 | truth = diagnosis, 271 | estimate = .pred_class 272 | ) 273 | 274 | # Print the confusion matrix 275 | conf_mat %>% autoplot(type = "heatmap") 276 | } 277 | 278 | ### Test the function ---- 279 | classify_with_knn( 280 | standardization_method = "range", 281 | k = 5 282 | ) 283 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/regression_methods/predicting_medical_expenses_with_linear_regression.r: -------------------------------------------------------------------------------- 1 | # Predicting Medical Expenses with Linear Regression ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 6: 4 | # Forecasting Numeric Data - Regression Methods. 5 | # 6 | # The original code is made with base R and {psych}. I wanted to see how one 7 | # could recreate it using mainly {tidymodels} and {tidyverse}. 8 | # 9 | # You can find the original code and the dataset here: 10 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter06 11 | 12 | library(conflicted) 13 | library(tidyverse) 14 | library(corrr) 15 | library(tidymodels) 16 | library(GGally) 17 | 18 | 19 | ## 2. 
Exploring and preparing the data ---- 20 | insurance_tbl <- read_csv("machine_learning_with_r_3rd_ed/regression_methods/data/insurance.csv") 21 | glimpse(insurance_tbl) 22 | 23 | ### Summarize the data ---- 24 | insurance_tbl %>% 25 | mutate(across(where(is.character), as_factor)) %>% 26 | summary() 27 | 28 | ### Histogram of insurance charges ---- 29 | insurance_tbl %>% 30 | ggplot(aes(expenses)) + 31 | geom_histogram(binwidth = 5000) 32 | 33 | ### Distribution between regions ---- 34 | insurance_tbl %>% 35 | count(region) %>% 36 | mutate(pct = (n / sum(n) * 100)) 37 | 38 | ### Exploring relationships among features: correlation matrix ---- 39 | insurance_tbl %>% 40 | select(c("age", "bmi", "children", "expenses")) %>% 41 | as.matrix() %>% 42 | correlate() 43 | 44 | ### Visualizing relationships among features: scatterplot matrix ---- 45 | insurance_tbl %>% 46 | select(c("age", "bmi", "children", "expenses")) %>% 47 | ggpairs() 48 | 49 | 50 | ## 3. Training a model on the data ---- 51 | 52 | ### Create a recipe ---- 53 | recipe_obj <- recipe( 54 | expenses ~ ., 55 | data = insurance_tbl 56 | ) 57 | recipe_obj 58 | 59 | insurance_baked_tbl <- recipe_obj %>% 60 | prep() %>% 61 | bake(new_data = NULL) 62 | insurance_baked_tbl 63 | 64 | ### Model specification ---- 65 | model_spec <- linear_reg( 66 | mode = "regression", 67 | engine = "lm" 68 | ) %>% 69 | translate() 70 | model_spec 71 | 72 | ### Fit the model ---- 73 | model_fit <- fit( 74 | model_spec, 75 | expenses ~ ., 76 | insurance_baked_tbl 77 | ) 78 | 79 | ### See the estimated beta coefficients ---- 80 | model_fit 81 | 82 | 83 | ## 4. Evaluating model performance ---- 84 | summary(model_fit$fit) 85 | 86 | 87 | ## 5. Improving model performance ---- 88 | 89 | ### Add a higher-order "age" term ---- 90 | insurance_augmented_tbl <- insurance_baked_tbl %>% 91 | mutate(age2 = age ^ 2, 92 | 93 | ### Add an indicator for BMI >= 30 ---- 94 | bmi30 = 95 | case_when( 96 | bmi >= 30 ~ 1, 97 | TRUE ~ 0 98 | ) 99 | ) 100 | 101 | 102 | ### Create the final model ---- 103 | recipe_augmented_obj <- recipe( 104 | expenses ~ ., 105 | data = insurance_augmented_tbl 106 | ) %>% 107 | # This step is needed for the next step, 108 | # adding the interaction between bmi30 and smoker 109 | step_dummy(smoker) %>% 110 | # Dummy variables need to be specified with starts_with() 111 | step_interact(terms = ~ bmi30:starts_with("smoker")) 112 | recipe_augmented_obj 113 | 114 | insurance_baked_augmented_tbl <- recipe_augmented_obj %>% 115 | prep() %>% 116 | bake(new_data = NULL) 117 | insurance_baked_augmented_tbl 118 | 119 | ### Model specification ---- 120 | model_spec_augmented <- linear_reg( 121 | mode = "regression", 122 | engine = "lm" 123 | ) %>% 124 | translate() 125 | model_spec_augmented 126 | 127 | ### Fit the model ---- 128 | model_fit_augmented <- fit( 129 | model_spec_augmented, 130 | expenses ~ ., 131 | insurance_baked_augmented_tbl 132 | ) 133 | 134 | ### See the estimated beta coefficients ---- 135 | model_fit_augmented 136 | 137 | ### Evaluate model performance ---- 138 | summary(model_fit_augmented$fit) 139 | 140 | ### Make predictions with the regression model (you could skip this step) ---- 141 | insurance_pred_tbl <- predict( 142 | object = model_fit_augmented, 143 | new_data = insurance_baked_augmented_tbl, 144 | type = "numeric" 145 | ) 146 | insurance_pred_tbl 147 | 148 | ### Add the predictions to the tibble ---- 149 | insurance_pred_tbl <- augment(model_fit_augmented, insurance_baked_augmented_tbl) 150 | insurance_pred_tbl 151 | 152 | ### 
See the correlation between the actual and predicted expenses ---- 153 | insurance_pred_tbl %>% 154 | select(.pred, expenses) %>% 155 | correlate() 156 | 157 | # Or with the more simple vectorized version: 158 | cor(insurance_pred_tbl$.pred, insurance_pred_tbl$expenses) 159 | 160 | ### Visualize the correlation ---- 161 | insurance_pred_tbl %>% 162 | ggplot(aes(.pred, expenses)) + 163 | geom_point() + 164 | geom_abline( 165 | intercept = 0, 166 | slope = 1, 167 | color = "red", 168 | size = 0.5, 169 | linetype = "dashed" 170 | ) + 171 | labs( 172 | x = "Predicted Expenses", 173 | y = "Actual Expenses" 174 | ) 175 | 176 | ### See the model metrics ---- 177 | insurance_model_metrics <- insurance_pred_tbl %>% 178 | metrics( 179 | truth = expenses, 180 | estimate = .pred 181 | ) 182 | insurance_model_metrics 183 | 184 | 185 | ### Make predictions with new data ---- 186 | predict( 187 | model_fit_augmented, 188 | tibble( 189 | age = 30, 190 | age2 = 30^2, 191 | bmi = 30, 192 | bmi30 = 1, 193 | bmi30_x_smoker_yes = 0, 194 | children = 2, 195 | region = "northeast", 196 | sex = "male", 197 | smoker_yes = 0 198 | ) 199 | ) 200 | 201 | predict( 202 | model_fit_augmented, 203 | tibble( 204 | age = 30, 205 | age2 = 30^2, 206 | bmi = 30, 207 | bmi30 = 1, 208 | bmi30_x_smoker_yes = 0, 209 | children = 2, 210 | region = "northeast", 211 | sex = "female", 212 | smoker_yes = 0 213 | ) 214 | ) 215 | 216 | predict( 217 | model_fit_augmented, 218 | tibble( 219 | age = 30, 220 | age2 = 30^2, 221 | bmi = 30, 222 | bmi30 = 1, 223 | bmi30_x_smoker_yes = 0, 224 | children = 0, 225 | region = "northeast", 226 | sex = "female", 227 | smoker_yes = 0 228 | ) 229 | ) 230 | 231 | 232 | ## 6. Creating a function to help evaluate the model further ---- 233 | 234 | # The assumption here is that you have already gone through steps 1. to 5. 235 | # You will see the predicted medical expenses, if you enter the following 236 | # parameters: age, BMI, number of children, region, sex and whether you smoke 237 | # or not. 
238 | 239 | predict_medical_expenses <- function( 240 | .age, 241 | .bmi, 242 | .children, 243 | .region = c("northeast", "northwest", "southeast", "southwest"), 244 | .sex = c("female", "male"), 245 | .smoker = c("no", "yes") 246 | ) { 247 | 248 | .bmi30 <- if_else( 249 | condition = .bmi >= 30, 250 | true = 1, 251 | false = 0, 252 | missing = NULL 253 | ) 254 | 255 | .smoker_yes <- if_else( 256 | condition = .smoker == "yes", 257 | true = 1, 258 | false = 0, 259 | missing = NULL 260 | ) 261 | 262 | .bmi30_x_smoker_yes <- if_else( 263 | condition = .bmi30 == TRUE && .smoker_yes == TRUE, 264 | true = 1, 265 | false = 0, 266 | missing = NULL 267 | ) 268 | 269 | .prediction <- predict( 270 | model_fit_augmented, 271 | tibble( 272 | age = .age, 273 | age2 = .age ^ 2, 274 | bmi = .bmi, 275 | bmi30 = .bmi30, 276 | bmi30_x_smoker_yes = .bmi30_x_smoker_yes, 277 | children = .children, 278 | region = .region, 279 | sex = .sex, 280 | smoker_yes = .smoker_yes 281 | ) 282 | ) 283 | 284 | str_glue("The predicted medical expenses according to the given parameters are: ${.prediction %>% round(0)}") 285 | 286 | } 287 | 288 | ### Test the function ---- 289 | predict_medical_expenses( 290 | .age = 30, 291 | .bmi = 30, 292 | .children = 0, 293 | .region = "northeast", 294 | .sex = "female", 295 | .smoker = "no" 296 | ) 297 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/Chapter_15.r: -------------------------------------------------------------------------------- 1 | ##### Chapter 15: Making Use of Big Data ------------------- 2 | 3 | ## Classifying images using a pre-trained CNN in R ---- 4 | ## IMPORTANT: some of the following steps are only necessary one time. 5 | 6 | # this package allows you to install packages from GitHub 7 | library(devtools) 8 | 9 | # install the latest tensorflow package 10 | # devtools::install_github("rstudio/tensorflow") # only necessary the first time! 11 | 12 | # install all of the dependencies for tensorflow (e.g., Python) 13 | library(tensorflow) 14 | # install_tensorflow() # only necessary the first time! 15 | 16 | # install the latest keras package 17 | # devtools::install_github("rstudio/keras") # only necessary the first time! 
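# Added note (an assumption, not part of the book's code): if the Python
# backend is still missing after the installs above, the keras helper below
# can set it up once as well.
# library(keras)
# install_keras() # only necessary the first time!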
18 | 19 | # Begin the Example 20 | # load the ResNet50 model with weights trained on the ImageNet 21 | library(keras) 22 | m_resnet50 <- application_resnet50(weights = 'imagenet') 23 | 24 | # load the image and convert to array 25 | img <- image_load("ice_cream.jpg", target_size = c(224,224)) 26 | x <- image_to_array(img) 27 | 28 | # x is now a 3d tensor: y, x, and color channel (r, g, b) 29 | # note: (y, x) begins counting with (1, 1) in top-left and (1, 224) in top-right 30 | dim(x) 31 | str(x) 32 | 33 | # example pixels 34 | x[1, 224, 1:3] # a white pixel in the upper-right 35 | x[40, 145, 1:3] # a red pixel in the ice cream 36 | 37 | # create a 4-dimension tensor with a constant value of '1' in the first dimension 38 | # (the first dimension is the 'batch' and is used for multiple images) 39 | x <- array_reshape(x, c(1, dim(x))) 40 | dim(x) # shows the new dimensions 41 | 42 | # do some color conversions and zero-centering as pre-processing for ResNet-50 43 | x <- imagenet_preprocess_input(x) 44 | x[1, 40, 145, 1:3] # look at the red ice cream pixel again 45 | 46 | # use the ResNet-50 model to make a prediction for the image data 47 | p_resnet50 <- predict(m_resnet50, x) 48 | c_resnet50 <- imagenet_decode_predictions(p_resnet50, top = 10) 49 | 50 | # see the predictions 51 | c_resnet50 52 | 53 | # lapply the image processing steps on the other two images 54 | img_list <- list("cat.jpg", "pizza.jpg") 55 | img_data <- lapply(img_list, image_load, target_size = c(224,224)) 56 | img_arrs <- lapply(img_data, image_to_array) 57 | img_resh <- lapply(img_arrs, array_reshape, c(1, 224, 224, 3)) 58 | img_prep <- lapply(img_resh, imagenet_preprocess_input) 59 | img_prob <- lapply(img_prep, predict, object = m_resnet50) 60 | 61 | # sapply the decode function to get the final predictions 62 | img_classes <- sapply(img_prob, imagenet_decode_predictions, top = 3) 63 | img_classes 64 | 65 | ## Using word2vec for understanding text in R ---- 66 | 67 | # note: the code in this section requires the word embedding that was trained 68 | # on the Google News archive. Download the GoogleNews-vectors-negative300.bin.gz 69 | # file from https://code.google.com/archive/p/word2vec/ and unzip it to your R 70 | # project folder before proceeding. 71 | 72 | library(word2vec) 73 | 74 | # load the Google-trained 300-dimension word2vec embedding 75 | m_w2v <- read.word2vec(file = "GoogleNews-vectors-negative300.bin", 76 | normalize = TRUE) 77 | 78 | # examine the structure of the model 79 | str(m_w2v) 80 | 81 | # obtain the vector for a few terms 82 | foods <- predict(m_w2v, c("cereal", "bacon", "eggs", "sandwich", "salad", "steak", "spaghetti"), type = "embedding") 83 | meals <- predict(m_w2v, c("breakfast", "lunch", "dinner"), type = "embedding") 84 | 85 | # examine a single word vector 86 | head(foods["cereal", ]) 87 | 88 | # examine the first few columns 89 | foods[, 1:5] 90 | 91 | # compute the similarity between the foods and meals 92 | word2vec_similarity(foods, meals) 93 | 94 | # can also use cosine similarity (not shown in book) 95 | word2vec_similarity(foods, meals, type = "cosine") 96 | 97 | # create vector of hypothetical social media posts 98 | user_posts = c( 99 | "I eat bacon and eggs in the morning for the most important meal of the day!", 100 | "I am going to grab a quick sandwich this afternoon before hitting the gym.", 101 | "Can anyone provide restaurant recommendations for my date tonight?" 
102 | ) 103 | 104 | library(tm) # use tm package for removeWords() and stopwords() functions 105 | 106 | user_posts_clean <- removeWords(user_posts, stopwords()) 107 | user_posts_clean <- txt_clean_word2vec(user_posts_clean) 108 | user_posts_clean[1] # look at the first cleaned user post 109 | 110 | # get the doc2vec vectors for the user posts 111 | post_vectors <- doc2vec(m_w2v, user_posts_clean) 112 | str(post_vectors) 113 | 114 | # get the word2vec vectors for the meal terms 115 | meals <- predict(m_w2v, c("breakfast", "lunch", "dinner"), type = "embedding") 116 | 117 | # compare the similarity of the posts and terms 118 | word2vec_similarity(post_vectors, meals) 119 | 120 | ## Visualizing highly dimensional data ---- 121 | 122 | library(tidyverse) 123 | sns_terms <- read_csv("snsdata.csv") |> 124 | select(basketball:drugs) 125 | 126 | # find the first two principal components 127 | library(irlba) 128 | set.seed(123456) 129 | sns_pca <- sns_terms |> 130 | prcomp_irlba(n = 2, center = TRUE, scale = TRUE) 131 | 132 | # create a scatterplot 133 | library(ggplot2) 134 | as.data.frame(sns_pca$x) |> 135 | ggplot(aes(PC1, PC2)) + geom_point(size = 1, shape = 1) 136 | 137 | ## Visualizing data's natural clusters with t-SNE ---- 138 | 139 | # take a random sample of 5,000 users 140 | library(tidyverse) 141 | set.seed(123) 142 | sns_sample <- read_csv("snsdata.csv") |> 143 | slice_sample(n = 5000) 144 | 145 | # run t-SNE with default parameters 146 | library(Rtsne) 147 | set.seed(123) 148 | sns_tsne <- sns_sample |> 149 | select(basketball:drugs) |> 150 | Rtsne(check_duplicates = FALSE) 151 | 152 | # visualize the t-SNE result 153 | library(ggplot2) 154 | data.frame(sns_tsne$Y) |> 155 | ggplot(aes(X1, X2)) + geom_point(size = 2, shape = 1) 156 | 157 | # create a categorical feature for the number of terms used 158 | sns_sample_tsne <- sns_sample |> 159 | bind_cols(data.frame(sns_tsne$Y)) |> # add the t-SNE data 160 | rowwise() |> # work across rows rather than columns 161 | mutate(n_terms = sum(c_across(basketball:drugs))) |> 162 | ungroup() |> # remove rowwise behavior 163 | mutate(`Terms Used` = if_else(n_terms > 0, "1+", "0")) 164 | 165 | # visualize the t-SNE result by number of terms used 166 | sns_sample_tsne |> 167 | ggplot(aes(X1, X2, shape = `Terms Used`, color = `Terms Used`)) + 168 | geom_point(size = 2) + 169 | scale_shape(solid = FALSE) 170 | 171 | ## working with SQL databases ---- 172 | 173 | # a tidy approach to database connections 174 | library(DBI) 175 | library(RSQLite) 176 | 177 | # creates a connection to the credit SQLite database 178 | con <- dbConnect(RSQLite::SQLite(), "credit.sqlite3") 179 | dbListTables(con) 180 | 181 | # create a data frame from the result 182 | res <- dbSendQuery(con, "SELECT * FROM credit WHERE age >= 45") 183 | credit_age45 <- dbFetch(res) 184 | summary(credit_age45$age) 185 | 186 | # disconnect from the database 187 | dbClearResult(res) 188 | dbDisconnect(con) 189 | 190 | # connecting to databases with the odbc package 191 | # (note: this is an example for illustration only and will need to be modified for your DB) 192 | library(DBI) 193 | con <- dbConnect(odbc::odbc(), "my_data_source_name") 194 | 195 | library(DBI) 196 | con <- dbConnect(odbc::odbc(), 197 | database = "my_database", 198 | uid = "my_username", 199 | pwd = "my_password", 200 | host = "my.server.address", 201 | port = 1234) 202 | 203 | # using a database backend with dplyr 204 | library(DBI) 205 | library(dplyr) 206 | con <- dbConnect(RSQLite::SQLite(), "credit.sqlite3") 207 | 
credit_tbl <- con |> tbl("credit") 208 | 209 | # compute summary statistics on age values filtered from the database 210 | credit_tbl |> 211 | filter(age >= 45) |> 212 | select(age) |> 213 | collect() |> 214 | summary() 215 | 216 | # show the average loan amount by loan default status, age 45+ 217 | credit_tbl |> 218 | filter(age >= 45) |> 219 | group_by(default) |> 220 | summarize(mean_amount = avg(amount)) 221 | 222 | # show the SQL used for the prior analysis 223 | credit_tbl |> 224 | filter(age >= 45) |> 225 | group_by(default) |> 226 | summarize(mean_amount = avg(amount)) |> 227 | show_query() 228 | 229 | ## Measuring execution time ---- 230 | 231 | system.time(rnorm(1000000)) 232 | 233 | ## Working in parallel ---- 234 | 235 | library(parallel) 236 | detectCores() 237 | 238 | # note: the following will only work on non-Windows systems (i.e., MacOSX or Unix/Linux) 239 | # you will also need enough cores to complete each task! 240 | 241 | # random number generation using multicore 242 | # one core 243 | system.time(l1 <- unlist(mclapply(1:10, function(x) { 244 | rnorm(10000000)}, mc.cores = 1))) 245 | 246 | # two cores 247 | system.time(l2 <- unlist(mclapply(1:10, function(x) { 248 | rnorm(10000000)}, mc.cores = 2))) 249 | 250 | # four cores 251 | system.time(l4 <- unlist(mclapply(1:10, function(x) { 252 | rnorm(10000000) }, mc.cores = 4))) 253 | 254 | # eight cores 255 | system.time(l8 <- unlist(mclapply(1:10, function(x) { 256 | rnorm(10000000) }, mc.cores = 8))) 257 | 258 | # creating a 4-node cluster with snow 259 | cl1 <- makeCluster(4) 260 | 261 | # confirm that the cluster is functioning 262 | clusterCall(cl1, function() { Sys.info()["nodename"] }) 263 | 264 | # running the same function on each node (not shown in book) 265 | clusterCall(cl1, function() { print("ready!") }) 266 | 267 | # running a different operation on each node 268 | clusterApply(cl1, c('A', 'B', 'C', 'D'), 269 | function(x) { paste("Cluster", x, "ready!") }) 270 | 271 | # close the cluster (IMPORTANT STEP!) 
272 | stopCluster(cl1) 273 | 274 | ## Parallel loops with foreach ---- 275 | 276 | # generate 100 million random numbers 277 | system.time(l1 <- rnorm(100000000)) 278 | 279 | # combine four sets of 25 million random numbers 280 | library(foreach) 281 | system.time(l4 <- foreach(i = 1:4, .combine = 'c') 282 | %do% rnorm(25000000)) 283 | 284 | # confirm the number of cores 285 | detectCores() 286 | 287 | # parallel the above foreach loop 288 | library(doParallel) 289 | registerDoParallel(cores = 4) 290 | system.time(l4p <- foreach(i = 1:4, .combine = 'c') 291 | %dopar% rnorm(25000000)) 292 | 293 | stopImplicitCluster() 294 | 295 | ## Parallel processing with caret ---- 296 | 297 | # training a random forest without allowing parallel computing 298 | library(caret) 299 | credit <- read.csv("credit.csv") 300 | system.time(train(default ~ ., data = credit, method = "rf", 301 | trControl = trainControl(allowParallel = FALSE))) 302 | 303 | # training the same random forest in parallel (8 cores) 304 | library(doParallel) 305 | registerDoParallel(cores = 8) 306 | system.time(train(default ~ ., data = credit, method = "rf")) 307 | stopImplicitCluster() 308 | 309 | ## Parallel cloud computing with Apache Spark 310 | 311 | library(sparklyr) 312 | spark_install() # only need to run this the first time using Spark 313 | spark_cluster <- spark_connect(master = "local") 314 | 315 | credit_spark <- spark_read_csv(spark_cluster, "credit.csv") 316 | 317 | # split the credit_spark dataset into training and testing 318 | splits <- sdf_random_split(credit_spark, 319 | train = 0.75, test = 0.25, 320 | seed = 123) 321 | 322 | # build a random forest model using Spark 323 | credit_rf <- splits$train |> 324 | ml_random_forest(default ~ .) 325 | 326 | # make predictions on the test set 327 | pred <- ml_predict(credit_rf, splits$test) 328 | 329 | ml_binary_classification_evaluator(pred, metric_name = "areaUnderROC") 330 | 331 | spark_disconnect(spark_cluster) 332 | 333 | ## Faster modeling with h2o 334 | 335 | library(h2o) 336 | h2o_instance <- h2o.init() 337 | credit.hex <- h2o.uploadFile("credit.csv") 338 | 339 | h2o.randomForest(y = "default", 340 | training_frame = credit.hex, 341 | ntrees = 500, 342 | seed = 123) 343 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/Chapter_13.r: -------------------------------------------------------------------------------- 1 | ##### Chapter 13: Challenging Data -------------------- 2 | 3 | ## Stepwise regression ---- 4 | 5 | # read the data and do some simple data preparation 6 | library(tidyverse) 7 | titanic_train <- read_csv("titanic_train.csv") |> 8 | mutate( 9 | Age_MVI = if_else(is.na(Age), 1, 0), 10 | Age = if_else(is.na(Age), mean(Age, na.rm = TRUE), Age), 11 | Cabin = if_else(is.na(Cabin), "X", Cabin), 12 | Embarked = factor(if_else(is.na(Embarked), "X", Embarked)), 13 | Sex = factor(Sex) 14 | ) 15 | 16 | # specify the simplest logistic regression model 17 | simple_model <- glm(Survived ~ 1, family = binomial, data = titanic_train) 18 | 19 | # specify the full logistic regression model 20 | full_model <- glm(Survived ~ Age + Age_MVI + Embarked + Sex + Pclass + SibSp + Fare, 21 | family = binomial, data = titanic_train) 22 | 23 | # forward stepwise regression 24 | sw_forward <- stats::step(simple_model, scope = formula(full_model), 25 | direction = "forward") 26 | 27 | # obtain the formula for the final model 28 | formula(sw_forward) 29 | 30 | # the final model's regression coefficients 31 | 
sw_forward$coefficients 32 | 33 | # backward stepwise 34 | sw_backward <- stats::step(full_model, direction = "backward") 35 | 36 | ## Feature selection with Boruta ---- 37 | 38 | set.seed(12345) # set the random seed to ensure results match 39 | # create a feature with random values to demonstrate a useless feature 40 | titanic_train$rand_vals <- runif(n = 891, min = 1, max = 100) 41 | 42 | # run Boruta on the Titanic dataset (this can take a long time for larger datasets) 43 | library(Boruta) 44 | titanic_boruta <- Boruta(Survived ~ PassengerId + Age + 45 | Sex + Pclass + SibSp + rand_vals, 46 | data = titanic_train, doTrace = 1) 47 | # check the result 48 | titanic_boruta 49 | 50 | # plot the feature importance 51 | plot(titanic_boruta) 52 | 53 | ## Principal Component Analysis (PCA) ---- 54 | 55 | library(tidyverse) # load the tidyverse suite of packages 56 | 57 | sns_data <- read_csv("snsdata.csv") # read the teenage social media data 58 | 59 | # select only the 36 columns from the column named 'basketball' through the one named 'drugs' 60 | # each column contains the count of times each social media profile used the respective term 61 | sns_terms <- sns_data |> select(basketball:drugs) 62 | 63 | # the irlba library provides a more efficient PCA function than R's built-in prcomp() 64 | library(irlba) 65 | 66 | # run the PCA - note that we center and re-scale the data here 67 | set.seed(2023) # to ensure the results match the book 68 | sns_pca <- sns_terms |> 69 | prcomp_irlba(n = 10, center = TRUE, scale = TRUE) # find first 10 principal components of the SNS data 70 | 71 | # create scree plot of the SNS data PCA 72 | screeplot(sns_pca, npcs = 10, type = "lines", 73 | main = "Scree Plot of SNS Data Principal Components") 74 | 75 | # use summary to see the components and the proportion of variance explained 76 | summary(sns_pca) 77 | 78 | # examine the PCA object -- we care most about the $x and $rotation components 79 | str(sns_pca) 80 | 81 | # the $x component is the transformed version of the original data 82 | str(sns_pca$x) 83 | 84 | # the $x is our original data transformed to have new "features" -- the principal components 85 | nrow(sns_pca$x) # should have 30,000 rows 86 | head(sns_pca$x) # should have ten columns 87 | 88 | # create a "long" version of the PCA dataset for visualization 89 | sns_pca_long <- tibble(SNS_Term = colnames(sns_terms), as_tibble(sns_pca$rotation)) |> # add the row labels 90 | pivot_longer(PC1:PC10, names_to = "PC", values_to = "Contribution") # go from a wide to long dataset 91 | 92 | # use ggplot to visualize the terms that are important for PC4 93 | library(ggplot2) 94 | 95 | sns_pca_long |> 96 | filter(PC == "PC3") |> 97 | top_n(15, abs(Contribution)) |> 98 | mutate(SNS_Term = reorder(SNS_Term, Contribution)) |> 99 | ggplot(aes(SNS_Term, Contribution, fill = SNS_Term)) + 100 | geom_col(show.legend = FALSE, alpha = 0.8) + 101 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), 102 | axis.ticks.x = element_blank()) + 103 | labs(x = "Social Media Term", 104 | y = "Relative Importance to Principal Component", 105 | title = "Top 15 Contributors to PC3") 106 | 107 | # create a function to visualize the four other components 108 | plot_pca <- function(component) { 109 | sns_pca_long |> 110 | filter(PC == component) |> 111 | top_n(15, abs(Contribution)) |> 112 | mutate(SNS_Term = reorder(SNS_Term, Contribution)) |> 113 | ggplot(aes(SNS_Term, Contribution, fill = SNS_Term)) + 114 | geom_col(show.legend = FALSE, alpha = 0.8) + 115 | 
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), 116 | axis.ticks.x = element_blank()) + 117 | labs(x = "Social Media Term", 118 | y = "Relative Importance to Principal Component", 119 | title = paste("Top 15 Contributors to", component)) 120 | } 121 | 122 | # use the function 123 | plot_pca("PC1") 124 | plot_pca("PC2") 125 | plot_pca("PC4") 126 | plot_pca("PC5") 127 | 128 | # we can use the principal components to predict number of friends 129 | sns_data_pca <- cbind(sns_data[1:4], sns_pca$x) # join the principal components to the original data 130 | 131 | # create a linear regression model predicting friends from the principal components 132 | m <- lm(friends ~ PC1 + PC2 + PC3 + PC4 + PC5, data = sns_data_pca) 133 | 134 | m # show the model coefficients 135 | 136 | ## Remapping sparse categorical data ---- 137 | 138 | library(tidyverse) 139 | 140 | # read the Titanic dataset and create Title feature (from Chapter 12) 141 | titanic_train <- read_csv("titanic_train.csv") |> 142 | mutate(Title = str_extract(Name, ", [A-z]+\\.")) |> 143 | mutate(Title = str_replace_all(Title, "[, \\.]", "")) 144 | 145 | # the Title feature has a large number of categories 146 | table(titanic_train$Title, useNA = "ifany") 147 | 148 | # group categories with similar real-world meaning 149 | titanic_train <- titanic_train |> 150 | mutate(TitleGroup = fct_collapse(Title, 151 | Mr = "Mr", 152 | Mrs = "Mrs", 153 | Master = "Master", 154 | Miss = c("Miss", "Mlle", "Mme", "Ms"), 155 | Noble = c("Don", "Sir", "Jonkheer", "Lady"), 156 | Military = c("Capt", "Col", "Major"), 157 | Doctor = "Dr", 158 | Clergy = "Rev", 159 | other_level = "Other") 160 | ) |> 161 | mutate(TitleGroup = fct_na_value_to_level(TitleGroup, 162 | level = "Unknown")) 163 | 164 | # examine the recoding 165 | table(titanic_train$TitleGroup) 166 | 167 | # look at the counts and proportions of all levels, sorted largest to smallest 168 | fct_count(titanic_train$Title, sort = TRUE, prop = TRUE) 169 | 170 | # lump together everything outside of the top three levels 171 | table(fct_lump_n(titanic_train$Title, n = 3)) 172 | 173 | # lump together everything with less than 1% 174 | table(fct_lump_prop(titanic_train$Title, prop = 0.01)) 175 | 176 | # lump together everything with fewer than 5 observations 177 | table(fct_lump_min(titanic_train$Title, min = 5)) 178 | 179 | ## Binning sparse numeric data ---- 180 | 181 | # examine the Titanic fare data 182 | head(titanic_train$Fare) 183 | summary(titanic_train$Fare) 184 | 185 | # create a binary variable for first/second class 186 | titanic_train <- titanic_train |> mutate( 187 | fare_firstclass = if_else(Fare >= 31, 1, 0, missing = 0) 188 | ) 189 | 190 | # tabulate the binary values 191 | table(titanic_train$fare_firstclass) 192 | 193 | # create a three-level feature using case_when() 194 | titanic_train <- titanic_train |> 195 | mutate( 196 | fare_class = case_when( 197 | Fare >= 31 ~ "1st Class", 198 | Fare >= 15 ~ "2nd Class", 199 | TRUE ~ "3rd Class" 200 | ) 201 | ) 202 | 203 | # examine the result 204 | table(titanic_train$fare_class) 205 | 206 | # the cut() function can accomplish the same as the above case_when() 207 | table(cut(titanic_train$Fare, breaks = c(-Inf, 15, 31, Inf), 208 | right = FALSE)) 209 | 210 | # use cut() with seq() to generate evenly-sized break points 211 | table(cut(titanic_train$Fare, right = FALSE, 212 | breaks = seq(from = 0, to = 550, by = 50))) 213 | 214 | # use cut() with quantiles() and seq() to create bins with equal numbers of examples 215 | 
table(cut(titanic_train$Fare, right = FALSE, 216 | breaks = quantile(titanic_train$Fare, 217 | probs = seq(0, 1, 0.20)))) 218 | 219 | # use the tidyverse ntile() function to create five bins 220 | table(ntile(titanic_train$Fare, n = 5)) 221 | 222 | # convert the ntile() groups to a factor 223 | titanic_train <- titanic_train |> 224 | mutate(fare_level = factor(ntile(Fare, n = 11))) 225 | 226 | table(titanic_train$fare_level) 227 | 228 | ## Performing missing value imputation ---- 229 | 230 | library(readr) 231 | titanic_train <- read_csv("titanic_train.csv") 232 | 233 | # impute arbitrary text strings for missing categorical data 234 | titanic_train <- titanic_train |> 235 | mutate( 236 | Cabin = if_else(is.na(Cabin), "X", Cabin), 237 | Embarked = if_else(is.na(Embarked), "Unknown", Embarked) 238 | ) 239 | 240 | # impute mean value and create missing value indicator for age 241 | titanic_train <- titanic_train |> 242 | mutate( 243 | Age_MVI = if_else(is.na(Age), 1, 0), 244 | Age = if_else(is.na(Age), mean(Age, na.rm = TRUE), Age) 245 | ) 246 | 247 | ## Simple strategies for rebalancing data ---- 248 | 249 | # load and prepare the teenage social media data 250 | library(tidyverse) 251 | 252 | snsdata <- read_csv("snsdata.csv") |> 253 | mutate( 254 | gender = fct_recode(gender, Female = "F", Male = "M"), 255 | gender = fct_na_value_to_level(gender, level = "Unknown"), 256 | age = ifelse(age < 13 | age > 20, NA, age) # replace age outliers 257 | ) |> 258 | group_by(gradyear) |> 259 | mutate(age_imp = if_else(is.na(age), median(age, na.rm = TRUE), age)) |> 260 | ungroup() |> 261 | select(gender, friends, gradyear, age_imp, basketball:drugs) 262 | 263 | # examine the initial class imbalance 264 | fct_count(snsdata$gender, prop = TRUE) 265 | 266 | # undersample the majority classes 267 | library(caret) 268 | sns_undersample <- downSample(x = snsdata[2:40], y = snsdata$gender, yname = "gender") 269 | fct_count(sns_undersample$gender, prop = TRUE) 270 | 271 | # oversample the minority classes 272 | library(caret) 273 | sns_oversample <- upSample(x = snsdata[2:40], y = snsdata$gender, yname = "gender") 274 | fct_count(sns_oversample$gender, prop = TRUE) 275 | 276 | ## Generating a synthetic balanced dataset with SMOTE ---- 277 | 278 | # create a gender balanced dataset using SMOTE 279 | library(themis) 280 | sns_balanced <- snsdata |> smote("gender") # simple syntax (without normalization) 281 | 282 | # check that the dataset is now gender balanced 283 | table(sns_balanced$gender) 284 | 285 | # for a better SMOTE, create a normalize function (introduced in Chapter 3) 286 | normalize <- function(x) { 287 | return ((x - min(x)) / (max(x) - min(x))) 288 | } 289 | 290 | # an unnormalize function returns data to original scale (introduced in Chapter 7) 291 | unnormalize <- function(norm_vals, col_name) { 292 | old_vals <- snsdata[col_name] 293 | unnormalized_vals <- norm_vals * (max(old_vals) - min(old_vals)) + min(old_vals) 294 | 295 | # round all columns to integers except age_imp 296 | rounded_vals <- if(col_name != "age_imp") { round(unnormalized_vals) } 297 | else {unnormalized_vals} 298 | 299 | return (rounded_vals) 300 | } 301 | 302 | # more advanced smote() process with normalized data 303 | snsdata_balanced <- snsdata |> 304 | mutate(across(where(is.numeric), normalize)) |> # normalize the numeric data 305 | smote("gender") |> 306 | mutate(across(where(is.numeric), ~unnormalize(.x, cur_column()))) # unnormalize the data 307 | 308 | # confirm that the rebalanced dataset worked correctly 309 | 
table(snsdata$gender) 310 | table(snsdata_balanced$gender) 311 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_02_managing_and_understanding_data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 2: Managing and Understanding Data" 3 | author: "Original Code: Brett Lantz | Modifications: Antti Rask" 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | # 2 Managing and Understanding Data 13 | 14 | ## Load Packages 15 | 16 | ```{r} 17 | library(conflicted) # An Alternative Conflict Resolution Strategy 18 | conflicts_prefer(dplyr::filter) 19 | conflicts_prefer(janitor::chisq.test) 20 | library(crosstable) # Crosstables for Descriptive Analyses 21 | library(janitor) # Simple Tools for Examining and Cleaning Dirty Data 22 | library(tidyverse) # Easily Install and Load the 'Tidyverse' 23 | ``` 24 | 25 | ## Data 26 | 27 | ```{r} 28 | # create tibble of data for three medical patients 29 | patients_tbl <- tibble( 30 | subject_name = c("John Doe", "Jane Doe", "Steve Graves"), 31 | temperature = c(98.1, 98.6, 101.4), 32 | flu_status = c(FALSE, FALSE, TRUE), 33 | gender = factor(c("MALE", "FEMALE", "MALE")), 34 | blood = factor( 35 | c("O", "AB", "A"), 36 | levels = c("A", "B", "AB", "O") 37 | ), 38 | symptoms = factor( 39 | c("SEVERE", "MILD", "MODERATE"), 40 | levels = c("MILD", "MODERATE", "SEVERE"), 41 | ordered = TRUE 42 | ) 43 | ) 44 | 45 | patients_tbl 46 | ``` 47 | 48 | ## R data structures 49 | 50 | ### Vectors -> Tibble 51 | 52 | ```{r} 53 | # access the second body temperature 54 | patients_tbl %>% 55 | slice(2) %>% 56 | pull(temperature) 57 | ``` 58 | 59 | ```{r} 60 | # access body temperature for the second and third patient 61 | patients_tbl %>% 62 | slice(2:3) %>% 63 | pull(temperature) 64 | ``` 65 | 66 | ```{r} 67 | # access body temperature for all but the second patient 68 | patients_tbl %>% 69 | slice(-2) %>% 70 | pull(temperature) 71 | ``` 72 | 73 | ```{r} 74 | # Use a vector to indicate whether to include item 75 | patients_tbl %>% 76 | filter(c(TRUE, TRUE, FALSE)) %>% 77 | pull(temperature) 78 | ``` 79 | 80 | ```{r} 81 | # filter rows by conditions 82 | patients_tbl %>% 83 | filter(temperature > 100) %>% 84 | pull(subject_name) 85 | ``` 86 | 87 | ### Factors 88 | 89 | ```{r} 90 | # check gender factors 91 | patients_tbl %>% 92 | pull(gender) 93 | ``` 94 | 95 | ```{r} 96 | # check blood type factors 97 | patients_tbl %>% 98 | pull(blood) 99 | ``` 100 | 101 | ```{r} 102 | # check symptom factors 103 | patients_tbl %>% 104 | pull(symptoms) 105 | ``` 106 | 107 | ```{r} 108 | # check for symptoms greater than moderate 109 | patients_tbl %>% 110 | mutate(symptoms_severe = symptoms > "MODERATE") %>% 111 | pull(symptoms_severe) 112 | ``` 113 | 114 | ### Lists/Tibble 115 | 116 | ```{r} 117 | # display information for a patient 118 | patients_tbl %>% 119 | slice(1) 120 | ``` 121 | 122 | ```{r} 123 | # convert the tibble to a list 124 | patient_1_list <- patients_tbl %>% 125 | slice(1) %>% 126 | as.list() 127 | 128 | patient_1_list 129 | ``` 130 | 131 | #### Methods for Accessing a List 132 | 133 | ```{r} 134 | # get a single list value by position (returns a sub-list) 135 | patient_1_list[2] 136 | ``` 137 | 138 | ```{r} 139 | # get a single list value by position (returns a numeric vector) 140 | 
patient_1_list %>% 141 | pluck(2) 142 | ``` 143 | 144 | ```{r} 145 | # get a single list value by name 146 | patient_1_list %>% 147 | pluck("temperature") 148 | ``` 149 | 150 | ```{r} 151 | # get several list items by specifying a vector of names 152 | patient_1_list[c("temperature", "flu_status")] 153 | ``` 154 | #### Access a List Like a Vector 155 | 156 | ```{r} 157 | # get values 2 and 3 158 | patient_1_list[2:3] 159 | ``` 160 | 161 | ### Tibble 162 | 163 | #### Accessing a tibble 164 | 165 | ```{r} 166 | # display the data tibble 167 | patients_tbl 168 | ``` 169 | 170 | ```{r} 171 | # get a single column 172 | patients_tbl %>% 173 | pull(subject_name) 174 | ``` 175 | 176 | ```{r} 177 | # get several columns by specifying a vector of names 178 | patients_tbl %>% 179 | select(c("temperature", "flu_status")) 180 | ``` 181 | 182 | ```{r} 183 | # this is the same as above, extracting temperature and flu_status 184 | patients_tbl %>% 185 | select(2:3) 186 | ``` 187 | 188 | ```{r} 189 | # accessing by row and column 190 | patients_tbl %>% 191 | slice(c(1, 3)) %>% 192 | select(c(2, 4)) 193 | ``` 194 | 195 | #### Extract All Rows or Columns 196 | 197 | ```{r} 198 | # column 1, all rows 199 | patients_tbl %>% 200 | pull(1) 201 | ``` 202 | 203 | ```{r} 204 | # row 1, all columns 205 | patients_tbl %>% 206 | slice(1) 207 | ``` 208 | 209 | ```{r} 210 | # all rows and all columns 211 | patients_tbl 212 | ``` 213 | 214 | ```{r} 215 | # the following are equivalent 216 | patients_tbl %>% 217 | slice(c(1, 3)) %>% 218 | select(c(temperature, gender)) 219 | 220 | patients_tbl %>% 221 | slice(-2) %>% 222 | select(c(-1, -3, -5, -6)) 223 | ``` 224 | 225 | ```{r} 226 | # creating a Celsius temperature column, then comparing before and after 227 | patients_tbl %>% 228 | mutate(temp_c = (temperature - 32) * (5 / 9)) %>% 229 | select(temperature, temp_c) 230 | ``` 231 | 232 | ### Matrixes 233 | 234 | ```{r} 235 | # create a 2x2 matrix 236 | matrix(c(1, 2, 3, 4), nrow = 2) 237 | ``` 238 | 239 | ```{r} 240 | # equivalent to the above 241 | matrix(c(1, 2, 3, 4), ncol = 2) 242 | ``` 243 | 244 | ```{r} 245 | # create a 2x3 matrix 246 | matrix(c(1, 2, 3, 4, 5, 6), nrow = 2) 247 | ``` 248 | 249 | ```{r} 250 | # create a 3x2 matrix 251 | m <- matrix(c(1, 2, 3, 4, 5, 6), ncol = 2) 252 | m 253 | ``` 254 | 255 | ```{r} 256 | # extract values from matrixes 257 | m[1, 1] 258 | m[3, 2] 259 | ``` 260 | 261 | ```{r} 262 | # extract rows 263 | m[1, ] 264 | ``` 265 | 266 | ```{r} 267 | # extract columns 268 | m[, 1] 269 | ``` 270 | 271 | ## Managing data with R 272 | 273 | #### Saving, Loading, and Removing R Data Structures 274 | 275 | ```{r} 276 | # show all data structures in memory 277 | ls() 278 | ``` 279 | 280 | ```{r} 281 | # remove the m and patient_1_list objects 282 | rm(m, patient_1_list) 283 | ls() 284 | ``` 285 | 286 | ```{r} 287 | # remove all (remaining) objects 288 | rm(list = ls()) 289 | ls() 290 | ``` 291 | 292 | ## Importing and saving datasets from CSV files 293 | 294 | ```{r} 295 | # reading a CSV file 296 | patients_tbl <- read_csv("data/pt_data.csv") 297 | ``` 298 | 299 | ```{r} 300 | # reading a CSV file and converting all character columns to factors 301 | patients_factors_tbl <- read_csv( 302 | "data/pt_data.csv", 303 | col_types = "cdlff", 304 | show_col_types = TRUE 305 | ) 306 | ``` 307 | 308 | ```{r} 309 | # alternatively reading a CSV file and converting all character columns to factors 310 | patients_factors_tbl <- read_csv("data/pt_data.csv") %>% 311 | mutate(across(where(is.character), 
as.factor)) 312 | 313 | patients_factors_tbl 314 | ``` 315 | 316 | ## Exploring and understanding data 317 | 318 | ### Data Exploration Example Using Used Car Data 319 | 320 | ```{r} 321 | usedcars <- read_csv("data/usedcars.csv") 322 | ``` 323 | 324 | ```{r} 325 | # get structure of used car data 326 | usedcars %>% 327 | str() 328 | ``` 329 | 330 | ### Exploring numeric variables 331 | 332 | ```{r} 333 | # summarize numeric variables 334 | usedcars %>% 335 | select(year, price, mileage) %>% 336 | summary() 337 | ``` 338 | 339 | ```{r} 340 | # calculate the mean income 341 | (36000 + 44000 + 56000) / 3 342 | 343 | c(36000, 44000, 56000) %>% 344 | mean() 345 | ``` 346 | 347 | ```{r} 348 | # the median income 349 | c(36000, 44000, 56000) %>% 350 | median() 351 | ``` 352 | 353 | ```{r} 354 | # the min/max of used car prices 355 | usedcars %>% 356 | pull(price) %>% 357 | range() 358 | ``` 359 | 360 | ```{r} 361 | # the difference of the range 362 | usedcars %>% 363 | pull(price) %>% 364 | range() %>% 365 | diff() 366 | ``` 367 | 368 | ```{r} 369 | # IQR for used car prices 370 | usedcars %>% 371 | pull(price) %>% 372 | IQR() 373 | ``` 374 | 375 | ```{r} 376 | # use quantile to calculate five-number summary 377 | usedcars %>% 378 | pull(price) %>% 379 | quantile() 380 | ``` 381 | 382 | ```{r} 383 | # the 99th percentile 384 | usedcars %>% 385 | pull(price) %>% 386 | quantile( 387 | probs = c(0.01, 0.99) 388 | ) 389 | ``` 390 | 391 | ```{r} 392 | # quintiles 393 | usedcars %>% 394 | pull(price) %>% 395 | quantile( 396 | seq(from = 0, to = 1, by = 0.20) 397 | ) 398 | ``` 399 | 400 | ```{r} 401 | # boxplot of used car prices and mileage 402 | usedcars %>% 403 | ggplot(aes(y = price)) + 404 | geom_boxplot() + 405 | labs( 406 | title = "Boxplot of Used Car Prices", 407 | y = "Price ($)" 408 | ) + 409 | theme_classic() 410 | 411 | usedcars %>% 412 | ggplot(aes(y = mileage)) + 413 | geom_boxplot() + 414 | labs( 415 | title = "Boxplot of Used Car Mileage", 416 | y = "Odometer (mi.)" 417 | ) + 418 | theme_classic() 419 | ``` 420 | 421 | ```{r} 422 | # histograms of used car prices and mileage 423 | usedcars %>% 424 | ggplot(aes(price)) + 425 | geom_histogram( 426 | bins = 10, 427 | color = "black", 428 | fill = "white" 429 | ) + 430 | labs( 431 | title = "Histogram of Used Car Prices", 432 | x = "Price ($)" 433 | ) + 434 | theme_classic() 435 | 436 | usedcars %>% 437 | ggplot(aes(mileage)) + 438 | geom_histogram( 439 | bins = 8, 440 | color = "black", 441 | fill = "white" 442 | ) + 443 | labs( 444 | title = "Histogram of Used Car Mileage", 445 | x = "Odometer (mi.)" 446 | ) + 447 | theme_classic() 448 | ``` 449 | 450 | ```{r} 451 | # variance and standard deviation of the used car data 452 | usedcars %>% 453 | pull(price) %>% 454 | var() 455 | ``` 456 | 457 | ```{r} 458 | usedcars %>% 459 | pull(price) %>% 460 | sd() 461 | ``` 462 | 463 | ```{r} 464 | usedcars %>% 465 | pull(mileage) %>% 466 | var() 467 | ``` 468 | 469 | ```{r} 470 | usedcars %>% 471 | pull(mileage) %>% 472 | sd() 473 | ``` 474 | 475 | ### Exploring categorical features 476 | 477 | ```{r} 478 | # one-way tables for the used car data 479 | usedcars %>% 480 | tabyl(year) %>% 481 | adorn_pct_formatting(digits = 1) 482 | ``` 483 | 484 | ```{r} 485 | usedcars %>% 486 | tabyl(model) %>% 487 | adorn_pct_formatting(digits = 1) 488 | ``` 489 | 490 | ```{r} 491 | usedcars %>% 492 | tabyl(color) %>% 493 | adorn_pct_formatting(digits = 1) 494 | ``` 495 | 496 | ```{r} 497 | # you can also have a three-way table of the same 498 | usedcars %>% 499 | 
tabyl(year, model, color) 500 | ``` 501 | 502 | ### Exploring relationships between variables 503 | 504 | ```{r} 505 | # scatterplot of price vs. mileage 506 | usedcars %>% 507 | ggplot(aes(mileage, price)) + 508 | geom_point() + 509 | labs( 510 | title = "Scatterplot of Price vs. Mileage", 511 | x = "Used Car Odometer (mi.)", 512 | y = "Used Car Price ($)" 513 | ) + 514 | theme_classic() 515 | ``` 516 | 517 | ```{r} 518 | # new variable indicating conservative colors 519 | usedcars_conservative <- usedcars %>% 520 | mutate(conservative = color %in% c("Black", "Gray", "Silver", "White")) 521 | ``` 522 | 523 | ```{r} 524 | # checking our variable 525 | usedcars_conservative %>% 526 | tabyl(conservative) %>% 527 | adorn_pct_formatting(digits = 0) 528 | ``` 529 | 530 | ```{r} 531 | # crosstab of model by conservative 532 | usedcars_conservative %>% 533 | crosstable( 534 | cols = model, 535 | by = conservative, 536 | label = FALSE, 537 | total = "both", 538 | percent_pattern = "{n} ({p_row}/{p_col}/{p_tot})", 539 | percent_digits = 1 540 | ) %>% 541 | as_flextable(compact = TRUE) 542 | ``` 543 | 544 | ```{r} 545 | # compute the chi-square test 546 | usedcars_conservative %>% 547 | tabyl(model, conservative) %>% 548 | chisq.test() 549 | ``` 550 | 551 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/decision_trees_and_rules/identifying_risky_bank_loans_using_C5.0_decision_trees.r: -------------------------------------------------------------------------------- 1 | # Identifying Risky Bank Loans Using C5.0 Decision Trees ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, 4 | # Chapter 5: Divide and Conquer - Classification Using Decision Trees and Rules and 5 | # Chapter 10: Evaluating Model Performance 6 | # 7 | # The original code is made with {C50}, {gmodels}, {OneR} and {RWeka}. I 8 | # wanted to see how one could recreate it using mainly {tidymodels} and 9 | # {tidyverse}. 10 | # 11 | # You can find the original code and the slightly modified dataset here: 12 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter05 13 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter10 14 | 15 | ## 1. Loading libraries (in the order they get used) ---- 16 | library(conflicted) 17 | library(tidyverse) 18 | library(tidymodels) 19 | 20 | ## 2. Exploring and preparing the data ---- 21 | credit_tbl <- read_csv("machine_learning_with_r_3rd_ed/decision_trees_and_rules/data/credit.csv") 22 | 23 | ### Examine the structure of the credit data ---- 24 | glimpse(credit_tbl) 25 | 26 | ### look at two characteristics of the applicant ---- 27 | credit_tbl %>% 28 | count(checking_balance) %>% 29 | mutate(pct = (n / sum(n) * 100)) 30 | 31 | credit_tbl %>% 32 | count(savings_balance) %>% 33 | mutate(pct = (n / sum(n) * 100)) 34 | 35 | ### look at two characteristics of the loan ---- 36 | credit_tbl %>% 37 | select(months_loan_duration, amount) %>% 38 | summary() 39 | 40 | ### look at the class variable ---- 41 | credit_tbl %>% 42 | count(default) %>% 43 | mutate(pct = (n / sum(n) * 100)) 44 | 45 | 46 | ## 3. 
Creating the recipe and splitting the data ---- 47 | 48 | ### Convert strings to factors ---- 49 | recipe_obj <- recipe( 50 | default ~ ., 51 | data = credit_tbl 52 | ) %>% 53 | step_string2factor(all_nominal()) 54 | recipe_obj 55 | 56 | credit_factorized_tbl <- recipe_obj %>% 57 | prep() %>% 58 | bake(new_data = NULL) 59 | credit_factorized_tbl 60 | 61 | ### Create training and test data (randomly) ---- 62 | 63 | # Use set.seed to use the same random number sequence as the original 64 | RNGversion("3.5.2") 65 | set.seed(123) 66 | 67 | credit_split <- initial_split( 68 | credit_factorized_tbl, 69 | prop = 0.9 70 | ) 71 | credit_train <- training(credit_split) 72 | credit_test <- testing(credit_split) 73 | 74 | ### Check the proportion of class variable ---- 75 | credit_train %>% 76 | count(default) %>% 77 | mutate(pct = (n / sum(n) * 100)) 78 | 79 | credit_test %>% 80 | count(default) %>% 81 | mutate(pct = (n / sum(n) * 100)) 82 | 83 | 84 | ## 4. Training a model on the data ---- 85 | 86 | ### Model specification ---- 87 | model_spec <- decision_tree( 88 | mode = "classification", 89 | engine = "C5.0", 90 | cost_complexity = NULL, 91 | tree_depth = NULL, 92 | min_n = NULL 93 | ) %>% 94 | translate() 95 | model_spec 96 | 97 | ### Fit the model ---- 98 | model_fit <- fit( 99 | model_spec, 100 | default ~ ., 101 | credit_train 102 | ) 103 | model_fit 104 | 105 | model_fit %>% 106 | extract_fit_engine() %>% 107 | summary() 108 | 109 | ### Make the predictions (you could skip this step) ---- 110 | credit_test_pred <- predict( 111 | object = model_fit, 112 | new_data = credit_test, 113 | type = "class" 114 | ) 115 | credit_test_pred 116 | 117 | ### Add the predictions to the test tibble ---- 118 | credit_test_with_pred_tbl <- augment(model_fit, credit_test) 119 | credit_test_with_pred_tbl 120 | 121 | 122 | ## 5. Evaluating model performance ---- 123 | 124 | ### Create a confusion matrix ---- 125 | conf_mat <- conf_mat( 126 | data = credit_test_with_pred_tbl, 127 | truth = default, 128 | estimate = .pred_class 129 | ) 130 | conf_mat 131 | 132 | ### Visualize the confusion matrix ---- 133 | conf_mat %>% autoplot(type = "heatmap") 134 | conf_mat %>% autoplot(type = "mosaic") 135 | 136 | ### Visualize the ROC curve ---- 137 | credit_test_with_pred_tbl %>% 138 | roc_curve( 139 | truth = default, 140 | estimate = .pred_no 141 | ) %>% 142 | autoplot() 143 | 144 | ### Calculate the ROC AUC (area under the curve) ---- 145 | credit_roc_auc <- credit_test_with_pred_tbl %>% 146 | roc_auc( 147 | truth = default, 148 | estimate = .pred_no 149 | ) 150 | credit_roc_auc 151 | 152 | ### Put together other model metrics ---- 153 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 154 | classification_metrics <- conf_mat( 155 | credit_test_with_pred_tbl, 156 | truth = default, 157 | estimate = .pred_class 158 | ) %>% 159 | summary() 160 | classification_metrics 161 | 162 | 163 | ## 6. 
Improving model performance ---- 164 | 165 | ### Boost the decision tree with 10 trials ---- 166 | model_spec_boost_tree <- boost_tree( 167 | mode = "classification", 168 | engine = "C5.0", 169 | trees = 10, 170 | min_n = NULL, 171 | sample_size = NULL 172 | ) %>% 173 | translate() 174 | model_spec_boost_tree 175 | 176 | ### Fit the model ---- 177 | model_fit_boost_tree <- fit( 178 | model_spec_boost_tree, 179 | default ~ ., 180 | credit_train 181 | ) 182 | model_fit_boost_tree 183 | 184 | model_fit_boost_tree %>% 185 | extract_fit_engine() %>% 186 | summary() 187 | 188 | ### Make the predictions (you could skip this step) ---- 189 | credit_test_pred_boost_tree <- predict( 190 | object = model_fit_boost_tree, 191 | new_data = credit_test, 192 | type = "class" 193 | ) 194 | credit_test_pred_boost_tree 195 | 196 | ### Add the predictions to the test tibble ---- 197 | credit_test_with_pred_boost_tree <- augment(model_fit_boost_tree, credit_test) 198 | credit_test_with_pred_boost_tree 199 | 200 | ### Create a confusion matrix ---- 201 | conf_mat_boost_tree <- conf_mat( 202 | data = credit_test_with_pred_boost_tree, 203 | truth = default, 204 | estimate = .pred_class 205 | ) 206 | conf_mat_boost_tree 207 | 208 | ### Visualize the confusion matrix ---- 209 | conf_mat_boost_tree %>% autoplot(type = "heatmap") 210 | conf_mat_boost_tree %>% autoplot(type = "mosaic") 211 | 212 | ### Visualize the ROC curve ---- 213 | credit_test_with_pred_boost_tree %>% 214 | roc_curve( 215 | truth = default, 216 | estimate = .pred_no 217 | ) %>% 218 | autoplot() 219 | 220 | ### Calculate the ROC AUC (area under the curve) ---- 221 | credit_roc_auc_boost_tree <- credit_test_with_pred_boost_tree %>% 222 | roc_auc( 223 | truth = default, 224 | estimate = .pred_no 225 | ) 226 | credit_roc_auc_boost_tree 227 | 228 | ### Put together other model metrics ---- 229 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 230 | classification_metrics_boost_tree <- conf_mat( 231 | credit_test_with_pred_boost_tree, 232 | truth = default, 233 | estimate = .pred_class 234 | ) %>% 235 | summary() 236 | classification_metrics_boost_tree 237 | 238 | 239 | ## 7. Creating a function to help evaluate the model further ---- 240 | 241 | # The assumption here is that you have already gone through steps 1. to 2. 242 | # What we're potentially tuning here are the arguments .tree_depth and .min_n 243 | # for decision_tree, and .trees and .min_n for boost_tree. 
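# As a quick illustration (a hypothetical call using the helper defined just
# below, with argument values of my own choosing that mirror the 10-trial boost
# from step 6), a boosted variant could be evaluated with something like:
# classify_with_c5_trees(.model = "boost_tree", .trees = 10, .min_n = 1)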
244 | 245 | classify_with_c5_trees <- function( 246 | .model = c("decision_tree", "boost_tree"), 247 | .mode = "classification", 248 | .engine = "C5.0", 249 | .tree_depth = NULL, # for decision_tree 250 | .trees = NULL, # for boost_tree 251 | .min_n = 1 # for both 252 | ) { 253 | 254 | # Create the recipe 255 | recipe_obj <- recipe( 256 | default ~ ., 257 | data = credit_tbl 258 | ) %>% 259 | step_string2factor(all_nominal()) 260 | 261 | credit_factorized_tbl <- recipe_obj %>% 262 | prep() %>% 263 | bake(new_data = NULL) 264 | 265 | # Create training and test data (randomly) 266 | RNGversion("3.5.2") 267 | set.seed(123) 268 | 269 | credit_split <- initial_split( 270 | credit_factorized_tbl, 271 | prop = 0.9 272 | ) 273 | credit_train <- training(credit_split) 274 | credit_test <- testing(credit_split) 275 | 276 | # Model specification 277 | model <- .model 278 | 279 | if (model == "decision_tree") { 280 | 281 | model_spec <- decision_tree( 282 | mode = .mode, 283 | engine = .engine, 284 | tree_depth = .tree_depth, 285 | min_n = .min_n 286 | ) %>% 287 | translate() 288 | 289 | } else if (model == "boost_tree") { 290 | 291 | model_spec <- boost_tree( 292 | mode = .mode, 293 | engine = .engine, 294 | trees = .trees, 295 | min_n = .min_n 296 | ) %>% 297 | translate() 298 | 299 | } else { 300 | 301 | stop("The model needs to be either decision_tree or boost_tree!") 302 | 303 | } 304 | 305 | # Fit the model 306 | model_fit <- fit( 307 | model_spec, 308 | default ~ ., 309 | credit_train 310 | ) 311 | 312 | # Add the predictions to the test tibble 313 | credit_test_with_pred_tbl <- augment(model_fit, credit_test) 314 | credit_test_with_pred_tbl 315 | 316 | # Create a confusion matrix 317 | conf_mat <- conf_mat( 318 | data = credit_test_with_pred_tbl, 319 | truth = default, 320 | estimate = .pred_class 321 | ) 322 | 323 | conf_mat %>% autoplot(type = "heatmap") 324 | 325 | } 326 | 327 | ### Test the function ---- 328 | classify_with_c5_trees( 329 | .model = "decision_tree", 330 | .mode = "classification", 331 | .engine = "C5.0", 332 | .tree_depth = NULL, # for decision_tree 333 | .trees = NULL, # for boost_tree 334 | .min_n = 1 # for both, NULL produces error, so > 1 is adviced 335 | ) 336 | 337 | 338 | ## 8. 
Cross-validation ---- 339 | # You might want to restart R (Ctrl + Shift + F10) at this point so you have a 340 | # clean slate 341 | 342 | ### Load libraries ---- 343 | library(conflicted) 344 | library(tidyverse) 345 | library(tidymodels) 346 | 347 | ### Load the data ---- 348 | credit_tbl <- read_csv("machine_learning_with_r_3rd_ed/decision_trees_and_rules/data/credit.csv") 349 | 350 | ### Create the train-test split ---- 351 | RNGversion("3.5.2") 352 | set.seed(123) 353 | 354 | credit_split <- initial_split( 355 | credit_tbl, 356 | prop = 0.9 357 | ) 358 | credit_train <- training(credit_split) 359 | credit_test <- testing(credit_split) 360 | 361 | ### Create the cross-validation folds ---- 362 | set.seed(345) 363 | folds_train <- vfold_cv(credit_train, v = 10) 364 | folds_train 365 | 366 | ### Create recipe, model specification and control ---- 367 | recipe_train <- recipe( 368 | default ~ ., 369 | data = credit_train 370 | ) %>% 371 | step_string2factor(all_nominal()) 372 | 373 | model_spec <- decision_tree( 374 | mode = "classification", 375 | engine = "C5.0", 376 | cost_complexity = NULL, 377 | tree_depth = NULL, 378 | min_n = NULL 379 | ) %>% 380 | translate() 381 | 382 | control <- control_resamples(save_pred = TRUE) 383 | 384 | ### Fit the samples ---- 385 | spline_res_train <- fit_resamples( 386 | object = model_spec, 387 | preprocessor = recipe_train, 388 | resamples = folds_train, 389 | control = control 390 | ) 391 | 392 | ### Look at the summarized model metrics ---- 393 | spline_res_train %>% 394 | collect_metrics() 395 | 396 | ### Look at the individual model metrics ---- 397 | spline_res_train %>% 398 | collect_metrics(summarize = FALSE) %>% 399 | ggplot( 400 | aes( 401 | x = .estimate, 402 | fill = .metric 403 | ) 404 | ) + 405 | geom_histogram() 406 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_07_black-box_methods_neural_networks_and_support_vector_machines.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 7: Black-Box Methods - Neural Networks and Support Vector Machines" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Loading Packages 13 | 14 | ```{r} 15 | library(conflicted) # An Alternative Conflict Resolution Strategy 16 | library(corrr) # Correlations in R 17 | library(janitor) # Simple Tools for Examining and Cleaning Dirty Data 18 | library(tidymodels) # Easily Install and Load the 'Tidymodels' Packages 19 | library(tidyverse) # Easily Install and Load the 'Tidyverse' 20 | ``` 21 | 22 | ## Part 1: Neural Networks 23 | 24 | ### Exploring and preparing the data 25 | 26 | #### Read in data and examine structure 27 | 28 | ```{r} 29 | concrete_tbl <- read_csv("data/concrete.csv") 30 | ``` 31 | 32 | ```{r} 33 | concrete_tbl %>% 34 | glimpse() 35 | ``` 36 | 37 | #### Check the minimum and maximum strength 38 | 39 | ```{r} 40 | concrete_tbl %>% 41 | pull(strength) %>% 42 | summary() 43 | ``` 44 | 45 | ### Creating the recipe 46 | 47 | #### Apply normalization to the numeric predictors 48 | 49 | ```{r} 50 | recipe_obj <- recipe( 51 | strength ~ ., 52 | data = concrete_tbl 53 | ) %>% 54 | step_range( 55 | all_numeric_predictors(), 56 | min = 0, 57 | max = 1 58 | ) 59 | 60 | recipe_obj 61 | ``` 62 | 63 | ```{r} 
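# note: prep() estimates the recipe's parameters from the data (for step_range,
# each predictor's observed min and max) and bake() then applies them,
# returning the normalized tibble used below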
64 | concrete_normalized_tbl <- recipe_obj %>% 65 | prep() %>% 66 | bake(new_data = NULL) 67 | 68 | concrete_normalized_tbl 69 | ``` 70 | 71 | #### Create training and test data 72 | 73 | ```{r} 74 | concrete_split <- initial_time_split( 75 | concrete_normalized_tbl, 76 | prop = 773 / 1030 77 | ) 78 | 79 | concrete_train <- training(concrete_split) 80 | concrete_test <- testing(concrete_split) 81 | ``` 82 | 83 | ### Training a model on the data 84 | 85 | nnet is the engine (needs to be installed if not already): 86 | install.packages("nnet") 87 | 88 | It is used as the engine for {parsnip}'s mlp() function. And since we are predicting strength, we choose regression as the mode. 89 | 90 | #### Create model specification 91 | 92 | ```{r} 93 | set.seed(12345) 94 | 95 | model_spec_nnet <- mlp( 96 | engine = "nnet", 97 | mode = "regression", 98 | hidden_units = 1, 99 | penalty = 0, 100 | dropout = 0, 101 | epochs = 100, 102 | activation = NULL, 103 | learn_rate = NULL 104 | ) %>% 105 | translate() 106 | 107 | model_spec_nnet 108 | ``` 109 | 110 | #### Fit the model 111 | 112 | ```{r} 113 | model_fit_nnet <- model_spec_nnet %>% 114 | fit( 115 | strength ~ ., 116 | concrete_train 117 | ) 118 | 119 | model_fit_nnet 120 | ``` 121 | 122 | #### Take a closer look at the model 123 | 124 | ```{r} 125 | model_fit_nnet$fit %>% 126 | summary() 127 | ``` 128 | 129 | #### Make the predictions (you could skip this step) 130 | 131 | ```{r} 132 | concrete_test_pred <- model_fit_nnet %>% 133 | predict( 134 | new_data = concrete_test, 135 | type = "numeric" 136 | ) 137 | 138 | concrete_test_pred 139 | ``` 140 | 141 | #### Add the predictions to the test tibble 142 | 143 | ```{r} 144 | concrete_test_with_pred <- augment(model_fit_nnet, concrete_test) 145 | concrete_test_with_pred 146 | ``` 147 | 148 | #### Metrics 149 | 150 | ```{r} 151 | concrete_test_with_pred %>% 152 | metrics(strength, .pred) 153 | ``` 154 | 155 | #### Visualize the network topology 156 | 157 | Currently looking for a method to do that with tidymodels. 
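One possible workaround in the meantime (a suggestion of mine, not something from the book or tidymodels itself) might be to plot the underlying **nnet** object with the NeuralNetTools package:

```{r}
# a minimal sketch, assuming the NeuralNetTools package has been installed:
# install.packages("NeuralNetTools")
library(NeuralNetTools)

# plotnet() draws the topology of the underlying nnet fit
model_fit_nnet$fit %>%
    plotnet()
```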
158 | 159 | ### Evaluating model performance 160 | 161 | #### Examine the correlation between predicted and actual values 162 | 163 | ```{r} 164 | concrete_test_with_pred %>% 165 | select(.pred, strength) %>% 166 | correlate() 167 | ``` 168 | 169 | A simpler alternative: 170 | 171 | ```{r} 172 | cor( 173 | concrete_test_with_pred$.pred, 174 | concrete_test_with_pred$strength 175 | ) 176 | ``` 177 | 178 | ### Improving model performance with two hidden layers and custom activation function 179 | 180 | #### Create model specification 181 | 182 | ```{r} 183 | set.seed(12345) 184 | model_spec_nnet_2 <- mlp( 185 | engine = "nnet", 186 | mode = "regression", 187 | hidden_units = 5, 188 | penalty = 0.1, 189 | epochs = 100, 190 | ) %>% 191 | translate() 192 | 193 | model_spec_nnet_2 194 | ``` 195 | 196 | ### Fit the model ---- 197 | 198 | ```{r} 199 | model_fit_nnet_2 <- model_spec_nnet_2 %>% 200 | fit( 201 | strength ~ ., 202 | concrete_train 203 | ) 204 | 205 | model_fit_nnet_2 206 | ``` 207 | 208 | #### Take a closer look at the model 209 | 210 | ```{r} 211 | model_fit_nnet_2$fit %>% 212 | summary() 213 | ``` 214 | 215 | #### Make the predictions (you could skip this step) 216 | 217 | ```{r} 218 | concrete_test_pred_2 <- model_fit_nnet_2 %>% 219 | predict( 220 | new_data = concrete_test, 221 | type = "numeric" 222 | ) 223 | 224 | concrete_test_pred_2 225 | ``` 226 | 227 | #### Add the predictions to the test tibble 228 | 229 | ```{r} 230 | concrete_test_with_pred_2 <- augment(model_fit_nnet_2, concrete_test) 231 | concrete_test_with_pred_2 232 | ``` 233 | 234 | #### Metrics 235 | 236 | ```{r} 237 | concrete_test_with_pred_2 %>% 238 | metrics(strength, .pred) 239 | ``` 240 | 241 | #### Examine the correlation between predicted and actual values 242 | 243 | ```{r} 244 | concrete_test_with_pred_2 %>% 245 | select(.pred, strength) %>% 246 | correlate() 247 | ``` 248 | 249 | A simpler alternative: 250 | 251 | ```{r} 252 | cor( 253 | concrete_test_with_pred_2$.pred, 254 | concrete_test_with_pred_2$strength 255 | ) 256 | ``` 257 | 258 | ## Part 2: Support Vector Machines 259 | 260 | ### Exploring and preparing the data 261 | 262 | #### Read in data and examine structure 263 | 264 | ```{r} 265 | letters_tbl <- read_csv("data/letterdata.csv") %>% 266 | mutate(across(where(is.character), as.factor)) 267 | ``` 268 | 269 | ```{r} 270 | letters_tbl %>% 271 | glimpse() 272 | ``` 273 | 274 | ### Creating the recipe 275 | 276 | #### Apply normalization to entire data frame 277 | 278 | ```{r} 279 | recipe_obj <- recipe( 280 | letter ~ ., 281 | data = letters_tbl 282 | ) 283 | 284 | recipe_obj 285 | ``` 286 | 287 | ```{r} 288 | letters_baked_tbl <- recipe_obj %>% 289 | prep() %>% 290 | bake(new_data = NULL) 291 | 292 | letters_baked_tbl 293 | ``` 294 | 295 | #### Create training and test data 296 | 297 | ```{r} 298 | letters_split <- initial_time_split( 299 | letters_baked_tbl, 300 | prop = 16000 / 20000 301 | ) 302 | letters_train <- training(letters_split) 303 | letters_test <- testing(letters_split) 304 | ``` 305 | 306 | ### Training a model on the data 307 | 308 | #### Create model specification 309 | 310 | ```{r} 311 | model_spec_kernlab <- svm_linear( 312 | engine = "kernlab", 313 | mode = "classification", 314 | cost = NULL, 315 | margin = NULL 316 | ) %>% 317 | translate() 318 | 319 | model_spec_kernlab 320 | ``` 321 | 322 | #### Fit the model 323 | 324 | ```{r} 325 | model_fit_kernlab <- model_spec_kernlab %>% 326 | fit( 327 | letter ~ ., 328 | letters_train 329 | ) 330 | 331 | model_fit_kernlab 332 | 
``` 333 | 334 | #### Make the predictions (you could skip this step) 335 | 336 | ```{r} 337 | letters_test_pred <- model_fit_kernlab %>% 338 | predict(new_data = letters_test) 339 | 340 | letters_test_pred 341 | ``` 342 | 343 | #### Add the predictions to the test tibble 344 | 345 | ```{r} 346 | letters_test_with_pred <- augment(model_fit_kernlab, letters_test) 347 | letters_test_with_pred 348 | ``` 349 | 350 | ### Evaluating model performance 351 | 352 | #### Predictions on testing dataset 353 | 354 | ```{r} 355 | letters_test_with_pred %>% 356 | tabyl(letter, .pred_class) 357 | ``` 358 | 359 | #### Look only at agreement vs. non-agreement 360 | 361 | Construct a vector of TRUE/FALSE indicating correct/incorrect predictions 362 | 363 | ```{r} 364 | letters_test_with_pred %>% 365 | mutate( 366 | agreement = case_when( 367 | letter == .pred_class ~ TRUE, 368 | .default = FALSE 369 | ) 370 | ) %>% 371 | tabyl(agreement) %>% 372 | adorn_pct_formatting(digits = 1) 373 | ``` 374 | 375 | ### Improving model performance 376 | 377 | #### Change to a RBF kernel 378 | 379 | ```{r} 380 | model_spec_rbf <- svm_rbf( 381 | engine = "kernlab", 382 | mode = "classification", 383 | cost = NULL, 384 | margin = NULL, 385 | rbf_sigma = NULL 386 | ) %>% 387 | translate() 388 | 389 | model_spec_rbf 390 | ``` 391 | 392 | #### Fit the model 393 | 394 | ```{r} 395 | model_fit_rbf <- fit( 396 | model_spec_rbf, 397 | letter ~ ., 398 | letters_train 399 | ) 400 | 401 | model_fit_rbf 402 | ``` 403 | 404 | #### Make the predictions (you could skip this step) 405 | 406 | ```{r} 407 | letters_test_pred_rbf <- model_fit_rbf %>% 408 | predict(new_data = letters_test) 409 | 410 | letters_test_pred_rbf 411 | ``` 412 | 413 | #### Add the predictions to the test tibble 414 | 415 | ```{r} 416 | letters_test_with_pred_rbf <- augment(model_fit_rbf, letters_test) 417 | letters_test_with_pred_rbf 418 | ``` 419 | 420 | #### Predictions on testing dataset 421 | 422 | ```{r} 423 | letters_test_with_pred_rbf %>% 424 | tabyl(letter, .pred_class) 425 | ``` 426 | 427 | #### Look only at agreement vs. 
non-agreement 428 | 429 | Construct a vector of TRUE/FALSE indicating correct/incorrect predictions 430 | 431 | ```{r} 432 | letters_test_with_pred_rbf %>% 433 | mutate( 434 | agreement = case_when( 435 | letter == .pred_class ~ TRUE, 436 | .default = FALSE 437 | ) 438 | ) %>% 439 | tabyl(agreement) %>% 440 | adorn_pct_formatting(digits = 1) 441 | ``` 442 | 443 | #### Test various values of the cost parameter 444 | 445 | ```{r} 446 | cost_values <- c(1, seq(from = 5, to = 40, by = 5)) 447 | ``` 448 | 449 | ```{r} 450 | accuracy_values <- map_dbl(cost_values, function(x) { 451 | 452 | model_spec_rbf <- svm_rbf( 453 | engine = "kernlab", 454 | mode = "classification", 455 | cost = {{ x }}, 456 | margin = NULL, 457 | rbf_sigma = NULL 458 | ) %>% 459 | translate() 460 | 461 | model_fit_rbf <- fit( 462 | model_spec_rbf, 463 | letter ~ ., 464 | letters_train 465 | ) 466 | 467 | letters_test_pred_rbf <- model_fit_rbf %>% 468 | predict(new_data = letters_test) %>% 469 | as_vector() 470 | 471 | agree <- if_else(letters_test_pred_rbf == letters_test %>% pull(letter), 1, 0) 472 | 473 | accuracy <- sum(agree) / nrow(letters_test) 474 | 475 | return(accuracy) 476 | 477 | }) 478 | ``` 479 | 480 | #### Bind together the cost parameter and accuracy values 481 | 482 | ```{r} 483 | cost_vs_accuracy_tbl <- bind_cols( 484 | cost_values, 485 | accuracy_values, 486 | ) %>% 487 | rename( 488 | cost_values = ...1, 489 | accuracy_values = ...2 490 | ) 491 | 492 | cost_vs_accuracy_tbl 493 | ``` 494 | 495 | #### Visualize to find the optimal cost parameter value 496 | 497 | ```{r} 498 | cost_vs_accuracy_tbl %>% 499 | ggplot(aes(cost_values, accuracy_values)) + 500 | geom_line() + 501 | geom_point() + 502 | labs( 503 | x = "Cost Parameter Value", 504 | y = "Accuracy" 505 | ) + 506 | theme_classic() 507 | ``` 508 | 509 | #### Make sure you have the right optimal cost value for the best accuracy 510 | 511 | ```{r} 512 | cost_vs_accuracy_tbl %>% 513 | slice_max(accuracy_values) 514 | ``` 515 | 516 | #### Pull the first cost_value that has the max accuracy_value 517 | 518 | ```{r} 519 | .max_accuracy <- cost_vs_accuracy_tbl %>% 520 | slice_max(accuracy_values) %>% 521 | slice(1) %>% 522 | pull(cost_values) 523 | ``` 524 | 525 | ### Fitting the model with the optimal cost value (that was just pulled) 526 | 527 | #### Give model specification 528 | 529 | ```{r} 530 | model_spec_best <- svm_rbf( 531 | engine = "kernlab", 532 | mode = "classification", 533 | cost = {{ .max_accuracy }}, 534 | margin = NULL, 535 | rbf_sigma = NULL 536 | ) %>% 537 | translate() 538 | ``` 539 | 540 | #### Fit the model 541 | 542 | ```{r} 543 | model_fit_best <- fit( 544 | model_spec_best, 545 | letter ~ ., 546 | letters_train 547 | ) 548 | ``` 549 | 550 | #### Add the predictions to the test tibble 551 | 552 | ```{r} 553 | letters_test_with_pred_best <- augment(model_fit_best, letters_test) 554 | ``` 555 | 556 | #### Predictions on testing dataset 557 | 558 | ```{r} 559 | letters_test_with_pred_best %>% 560 | tabyl(letter, .pred_class) 561 | ``` 562 | 563 | #### Look only at agreement vs. 
non-agreement 564 | 565 | Construct a vector of TRUE/FALSE indicating correct/incorrect predictions 566 | 567 | ```{r} 568 | letters_test_with_pred_best %>% 569 | mutate( 570 | agreement = case_when( 571 | letter == .pred_class ~ TRUE, 572 | .default = FALSE 573 | ) 574 | ) %>% 575 | tabyl(agreement) %>% 576 | adorn_pct_formatting(digits = 1) 577 | ``` 578 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/naive_bayes/filtering-mobile-phone-spam-with-naive-bayes.r: -------------------------------------------------------------------------------- 1 | # Filtering Mobile Phone Spam with the Naive Bayes Algorithm ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, 4 | # Chapter 4: Probabilistic Learning - Classification Using Naive Bayes and 5 | # Chapter 10: Evaluating Model Performance 6 | # 7 | # The original code is made with a lot of base R, {e1071} and {gmodels}. I 8 | # wanted to see how one could recreate it using mainly {textrecipes}, 9 | # {tidymodels}, {tidytext} and {tidyverse}. 10 | # 11 | # You can find the original code and the slightly modified dataset(s) here: 12 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter04 13 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter10 14 | 15 | ## 1. Loading libraries (in the order they get used) ---- 16 | library(conflicted) 17 | library(tidyverse) 18 | conflict_prefer("filter", "dplyr", "stats") 19 | library(tidytext) 20 | library(SnowballC) # for stemming 21 | library(wordcloud2) 22 | library(textrecipes) 23 | library(tidymodels) 24 | library(discrim) # for naive_Bayes() 25 | library(janitor) 26 | library(crosstable) 27 | 28 | 29 | ## 2. Exploring and preparing the data ---- 30 | 31 | ### Read the sms data into the sms tibble, convert spam/ham to factor ---- 32 | sms_tbl <- read_csv( 33 | "machine_learning_with_r_3rd_ed/naive_bayes/data/sms_spam.csv", 34 | col_types = "fc" 35 | ) %>% 36 | select(.type = type, everything()) 37 | 38 | ### Examine the structure of the sms data ---- 39 | glimpse(sms_tbl) 40 | 41 | ### Examine the distribution of spam/ham ---- 42 | sms_tbl %>% 43 | count(.type) %>% 44 | mutate(pct = (n / sum(n) * 100)) 45 | 46 | ### Build a corpus using the {tidytext} package instead of {tm} ---- 47 | 48 | # This part inspired by a blog post by Julia Silge: 49 | # https://www.tidyverse.org/blog/2020/11/tidymodels-sparse-support/ 50 | 51 | ### Add a row number ---- 52 | sms_row_numbers_tbl <- sms_tbl %>% 53 | mutate(line = row_number()) 54 | sms_row_numbers_tbl 55 | 56 | ### Manual preprocessing (just to see what it's all about) ---- 57 | tidy_sms_tbl <- sms_row_numbers_tbl %>% 58 | unnest_tokens(word, text) %>% 59 | count(line, word) %>% 60 | bind_tf_idf(word, line, n) 61 | tidy_sms_tbl 62 | 63 | wide_sms_tbl <- tidy_sms_tbl %>% 64 | dplyr::select(line, word, tf_idf) %>% 65 | pivot_wider( 66 | names_from = word, 67 | names_prefix = "word_", 68 | values_from = tf_idf, 69 | values_fill = 0 70 | ) 71 | wide_sms_tbl 72 | 73 | 74 | ## 3. 
Visualizing text data - word clouds ---- 75 | 76 | # This part inspired by Julia Silge & David Robinson's book Text Mining with R: 77 | # A Tidy Approach: https://www.tidytextmining.com/ 78 | 79 | ### Count word frequencies ---- 80 | frequency_tbl <- sms_row_numbers_tbl %>% 81 | 82 | # One word per one row 83 | unnest_tokens(word, text) %>% 84 | 85 | # Stemming 86 | mutate(word = wordStem(word)) %>% 87 | 88 | # Count the words 89 | count(.type, word) %>% 90 | 91 | # Count the proportion of words 92 | with_groups( 93 | .type, 94 | mutate, 95 | proportion = n / sum(n) 96 | ) %>% 97 | 98 | # Reorder the columns 99 | dplyr::select(-n) %>% 100 | pivot_wider(names_from = .type, values_from = proportion) %>% 101 | pivot_longer( 102 | cols = c("ham", "spam"), 103 | names_to = ".type", 104 | values_to = "freq" 105 | ) 106 | 107 | ### Subset the frequency data into two groups, spam and ham ---- 108 | spam_tbl <- frequency_tbl %>% 109 | filter(.type == "spam") %>% 110 | dplyr::select(-.type) %>% 111 | drop_na() 112 | 113 | ham_tbl <- frequency_tbl %>% 114 | filter(.type == "ham") %>% 115 | dplyr::select(-.type) %>% 116 | drop_na() 117 | 118 | ### Word cloud ---- 119 | 120 | # This part inspired by a blog post by Céline Van den Rul: 121 | # https://towardsdatascience.com/create-a-word-cloud-with-r-bde3e7422e8a 122 | 123 | # One for ham... 124 | wordcloud2( 125 | data = ham_tbl, 126 | size = 2, 127 | color = "random-dark" 128 | ) 129 | 130 | # ...and another for spam 131 | wordcloud2( 132 | data = spam_tbl, 133 | size = 2, 134 | color = "random-dark" 135 | ) 136 | 137 | 138 | ## 4. Creating the recipe and splitting the data ---- 139 | 140 | ### Create the recipe ---- 141 | text_recipe_obj <- recipe( 142 | .type ~ text, 143 | data = sms_row_numbers_tbl 144 | ) %>% 145 | step_tokenize(text) %>% 146 | step_stopwords(text) %>% 147 | step_tokenfilter(text, max_tokens = 1e3) %>% 148 | step_tfidf(text) 149 | text_recipe_obj 150 | 151 | # Bake it 152 | sms_baked_tbl <- text_recipe_obj %>% 153 | prep() %>% 154 | bake(new_data = NULL) 155 | 156 | # Simplify the tf-idf to yes/no 157 | sms_baked_longer_tbl <- sms_baked_tbl %>% 158 | mutate(across(where(is.numeric), 159 | ~ case_when( 160 | . > 0 ~ "Yes", 161 | TRUE ~ "No" 162 | ))) %>% 163 | # Rename the columns back to words 164 | rename_with( 165 | ~ tolower(gsub("tfidf_text_", "", .x)), 166 | .cols = starts_with("tfidf_text_") 167 | ) 168 | sms_baked_longer_tbl 169 | 170 | ### Create training and test data ---- 171 | # Not randomly, because the messages weren't in any particular order 172 | sms_split <- initial_time_split( 173 | sms_baked_longer_tbl, 174 | prop = 0.75 175 | ) 176 | sms_train <- training(sms_split) 177 | sms_test <- testing(sms_split) 178 | 179 | 180 | ## 5. Training a model on the data ---- 181 | 182 | # naivebayes is the engine (needs to be installed if not already): 183 | # install.packages("discrim") AND 184 | # install.packages("naivebayes") 185 | 186 | # It is used as the engine for {parsnip}'s naive_Bayes() function. And since 187 | # we are classifying, that is the mode we choose. 188 | 189 | # The simple reason is I couldn't get klaR (the other engine) to work. If you 190 | # know how, please comment on GitHub. It would be great to get to test what the 191 | # difference between the two engines are. 
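# For reference, an untested sketch (not verified to run here, hence left
# commented out) of what the klaR specification might look like, mirroring the
# naivebayes specification below:
#
# model_spec_klar <- naive_Bayes(
#     engine = "klaR",
#     mode = "classification",
#     smoothness = NULL,
#     Laplace = NULL
# ) %>%
#     translate()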
192 | 193 | ### Model specification ---- 194 | model_spec <- naive_Bayes( 195 | engine = "naivebayes", 196 | mode = "classification", 197 | smoothness = NULL, 198 | Laplace = NULL 199 | ) %>% 200 | translate() 201 | model_spec 202 | 203 | ### Fit the model ---- 204 | model_fit <- fit( 205 | model_spec, 206 | .type ~ ., 207 | sms_train 208 | ) 209 | model_fit 210 | 211 | ### Make the predictions (you could skip this step) ---- 212 | sms_test_pred <- predict( 213 | object = model_fit, 214 | new_data = sms_test, 215 | type = "class" 216 | ) 217 | sms_test_pred 218 | 219 | ### Add the predictions to the test tibble ---- 220 | sms_test_with_pred_tbl <- augment(model_fit, sms_test) 221 | sms_test_with_pred_tbl 222 | 223 | 224 | ## 6. Evaluating model performance ---- 225 | 226 | ### Create a confusion matrix ---- 227 | conf_mat <- conf_mat( 228 | data = sms_test_with_pred_tbl, 229 | truth = .type, 230 | estimate = .pred_class 231 | ) 232 | conf_mat 233 | 234 | ### Visualize the confusion matrix ---- 235 | conf_mat %>% autoplot(type = "heatmap") 236 | conf_mat %>% autoplot(type = "mosaic") 237 | 238 | ### Visualize the ROC curve ---- 239 | sms_test_with_pred_tbl %>% 240 | roc_curve( 241 | truth = .type, 242 | estimate = .pred_ham 243 | ) %>% 244 | autoplot() 245 | 246 | ### Calculate the ROC AUC (area under the curve) ---- 247 | sms_roc_auc <- sms_test_with_pred_tbl %>% 248 | roc_auc( 249 | truth = .type, 250 | estimate = .pred_ham 251 | ) 252 | sms_roc_auc 253 | 254 | 255 | ## 7. Compare to k-NN ---- 256 | 257 | ### Read the results ---- 258 | sms_results_knn <- read_csv("machine_learning_with_r_3rd_ed/naive_bayes/data/sms_results_knn.csv") %>% 259 | mutate(p_ham = 1 - p_spam) 260 | 261 | ### Naive Bayes vs k-NN - ROC curve ---- 262 | 263 | # Naive Bayes 264 | sms_test_with_pred_tbl %>% 265 | roc_curve( 266 | truth = .type, 267 | estimate = .pred_ham 268 | ) %>% 269 | autoplot() 270 | 271 | # k-NN 272 | sms_test_with_pred_tbl %>% 273 | mutate(.knn_p_ham = sms_results_knn$p_ham) %>% 274 | roc_curve( 275 | truth = .type, 276 | estimate = .knn_p_ham 277 | ) %>% 278 | autoplot() 279 | 280 | ### Naive Bayes vs k-NN - AUC ---- 281 | 282 | # Naive Bayes 283 | sms_test_with_pred_tbl %>% 284 | roc_auc( 285 | truth = .type, 286 | estimate = .pred_ham 287 | ) 288 | 289 | # k-NN 290 | sms_test_with_pred_tbl %>% 291 | mutate(.knn_p_ham = sms_results_knn$p_ham) %>% 292 | roc_auc( 293 | truth = .type, 294 | estimate = .knn_p_ham 295 | ) 296 | 297 | ### Put together other model metrics ---- 298 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 299 | classification_metrics <- conf_mat( 300 | sms_test_with_pred_tbl, 301 | truth = .type, 302 | estimate = .pred_class 303 | ) %>% 304 | summary() 305 | classification_metrics 306 | 307 | 308 | ## 8. 
Improving model performance ---- 309 | 310 | # Basically, the same as before, but with Laplace = 1 311 | 312 | ### Model specification ---- 313 | model_spec <- naive_Bayes( 314 | engine = "naivebayes", 315 | mode = "classification", 316 | smoothness = NULL, 317 | Laplace = 1 318 | ) %>% 319 | translate() 320 | model_spec 321 | 322 | ### Fit the model ---- 323 | model_fit <- fit( 324 | model_spec, 325 | .type ~ ., 326 | sms_train 327 | ) 328 | model_fit 329 | 330 | ### Make the predictions (you could skip this step) ---- 331 | sms_test_pred <- predict( 332 | object = model_fit, 333 | new_data = sms_test, 334 | type = "class" 335 | ) 336 | sms_test_pred 337 | 338 | ### Add the predictions to the test tibble ---- 339 | sms_test_with_pred_tbl <- augment(model_fit, sms_test) 340 | sms_test_with_pred_tbl 341 | 342 | 343 | ## 9. Evaluating model performance ---- 344 | 345 | ### Create a confusion matrix ---- 346 | conf_mat <- conf_mat( 347 | data = sms_test_with_pred_tbl, 348 | truth = .type, 349 | estimate = .pred_class 350 | ) 351 | conf_mat 352 | 353 | ### Visualize the confusion matrix ---- 354 | conf_mat %>% autoplot(type = "heatmap") 355 | conf_mat %>% autoplot(type = "mosaic") 356 | 357 | ### Visualize the ROC curve ---- 358 | sms_test_with_pred_tbl %>% 359 | roc_curve( 360 | truth = .type, 361 | estimate = .pred_ham 362 | ) %>% 363 | autoplot() 364 | 365 | ### Calculate the ROC AUC (area under the curve) ---- 366 | sms_roc_auc <- sms_test_with_pred_tbl %>% 367 | roc_auc( 368 | truth = .type, 369 | estimate = .pred_ham 370 | ) 371 | sms_roc_auc 372 | 373 | ### Put together other model metrics ---- 374 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 375 | classification_metrics <- conf_mat( 376 | sms_test_with_pred_tbl, 377 | truth = .type, 378 | estimate = .pred_class 379 | ) %>% 380 | summary() 381 | classification_metrics 382 | 383 | 384 | ## 10. Creating a function to help evaluate the model further ---- 385 | 386 | # The assumption here is that you have already gone through steps 1. to 4. 387 | # What we're potentially tuning here are the arguments .smoothness and .Laplace 388 | # Check out the book and/or the documentation for further info about them! 389 | 390 | classify_with_naive_bayes <- function( 391 | .smoothness = NULL, 392 | .laplace = NULL 393 | ) { 394 | 395 | # Model specification 396 | model_spec <- naive_Bayes( 397 | engine = "naivebayes", 398 | mode = "classification", 399 | smoothness = .smoothness, 400 | Laplace = .laplace 401 | ) %>% 402 | translate() 403 | model_spec 404 | 405 | # Fit the model 406 | model_fit <- fit( 407 | model_spec, 408 | .type ~ ., 409 | sms_train 410 | ) 411 | model_fit 412 | 413 | # Add the predictions to the test tibble 414 | sms_test_with_pred_tbl <- augment(model_fit, sms_test) 415 | sms_test_with_pred_tbl 416 | 417 | # Create a confusion matrix 418 | conf_mat <- conf_mat( 419 | data = sms_test_with_pred_tbl, 420 | truth = .type, 421 | estimate = .pred_class 422 | ) 423 | 424 | # Print the confusion matrix 425 | conf_mat %>% autoplot(type = "heatmap") 426 | 427 | } 428 | 429 | ### Test the function ---- 430 | classify_with_naive_bayes( 431 | .smoothness = 1, 432 | .laplace = 1 433 | ) 434 | 435 | 436 | ## 11. 
Understanding the classifier's predictions ---- 437 | 438 | ### Obtain the predicted probabilities (you could skip this step) ---- 439 | sms_test_prob <- predict( 440 | object = model_fit, 441 | new_data = sms_test, 442 | type = "prob" 443 | ) 444 | sms_test_prob 445 | 446 | ### Look at the predicted probabilities ---- 447 | sms_results <- sms_test_with_pred_tbl %>% 448 | select( 449 | actual_type = .type, 450 | predicted_type = .pred_class, 451 | prob_spam = .pred_spam, 452 | prob_ham = .pred_ham 453 | ) %>% 454 | mutate( 455 | prob_spam = prob_spam %>% round(2), 456 | prob_ham = prob_ham %>% round(2) 457 | ) 458 | sms_results 459 | 460 | ### Test cases where the model is less confident ---- 461 | sms_results %>% 462 | filter(between(prob_spam, 0.40, 0.60)) 463 | 464 | ### Test cases where the model was wrong ---- 465 | sms_results %>% 466 | filter(actual_type != predicted_type) 467 | 468 | ### Specifying vectors ---- 469 | sms_results %>% 470 | tabyl(actual_type, predicted_type) 471 | 472 | ### Using {crosstable} ---- 473 | sms_results %>% 474 | crosstable( 475 | cols = actual_type, 476 | by = predicted_type, 477 | label = FALSE, 478 | total = "both", 479 | percent_pattern = "{n} ({p_row}/{p_col})", 480 | percent_digits = 1 481 | ) %>% 482 | as_flextable(compact = TRUE) 483 | --------------------------------------------------------------------------------