├── machine_learning_with_r_4th_ed ├── img │ ├── cat.jpg │ ├── pizza.jpg │ ├── ice_cream.jpg │ └── youcanbeapirate-wb-sparkline.jpg ├── data │ ├── credit.sqlite3 │ ├── titanic_train.xlsx │ ├── pt_data.csv │ ├── challenger.csv │ └── usedcars.csv ├── chapter_15_making_use_of_big_data.Rmd ├── chapter_12_advanced_data_preparation.Rmd ├── chapter_14_building_better_learners.Rmd ├── chapter_10_evaluating_model_performance.Rmd ├── chapter_11_being_succesful_with_machine_learning.Rmd ├── chapter_13_challenging_data_too_much_too_little_too_complex.Rmd ├── Chapter_11.r ├── chapter_08_finding_patterns_market_basket_analysis_using_association_rules.Rmd ├── Chapter_10.r ├── Chapter_12.r ├── chapter_09_finding_groups_of_data_clustering_with_k-means.Rmd ├── chapter_03_lazy_learning_classification_using_nearest_neighbors.Rmd ├── Chapter_14.r ├── Chapter_15.r ├── Chapter_13.r ├── chapter_02_managing_and_understanding_data.Rmd └── chapter_07_black-box_methods_neural_networks_and_support_vector_machines.Rmd ├── machine_learning_with_r_3rd_ed ├── naive_bayes │ ├── images │ │ ├── crosstable.jpeg │ │ ├── wordcloud2_ham.jpeg │ │ └── wordcloud2_spam.jpeg │ ├── readme.MD │ └── filtering-mobile-phone-spam-with-naive-bayes.r ├── regression_methods │ ├── images │ │ ├── fancyRpartPlot_wine.jpg │ │ └── fancyRpartPlot_wine.pdf │ ├── readme.MD │ ├── estimating_wine_quality_with_regression_trees_and_model_trees.r │ └── predicting_medical_expenses_with_linear_regression.r ├── association_rules │ ├── images │ │ ├── association_rules_visualized.jpg │ │ └── association_rules_visualized.pdf │ ├── readme.MD │ └── identifying_frequently_purchased_groceries_with_association_rules.r ├── k_means_clustering │ ├── images │ │ ├── total_within_sum_of_squares.jpg │ │ └── total_within_sum_of_squares.pdf │ ├── readme.MD │ └── finding_teen_market_segments_using_k_means_clustering.r ├── hyperparameter_optimization │ ├── images │ │ ├── hyperparameter_optimization_rpart.jpg │ │ └── hyperparameter_optimization_rpart.pdf │ ├── readme.MD │ ├── hyperparameter_optimization_with_tidymodels.R │ └── hyperparameter_optimization_with_tidymodels_random.R ├── README.md ├── knn │ ├── README.md │ ├── data │ │ └── wdbc-names.txt │ └── classifying-cancer-samples-with-knn.r ├── support_vector_machines │ ├── readme.MD │ └── performing_optical_character_recognition_with_support_vector_machines_.r ├── neural_networks │ ├── readme.MD │ └── modeling_the_strength_of_concrete_with_neural_networks.r ├── decision_trees_and_rules │ ├── readme.MD │ ├── identifying_poisonous_mushrooms_with_rule_learners.r │ └── identifying_risky_bank_loans_using_C5.0_decision_trees.r ├── random_forest │ └── random_forest_with_tidymodels.R └── boosting │ └── boosting_with_tidymodels.R ├── learning_machine_learning.Rproj ├── .gitignore ├── LICENSE └── README.md /machine_learning_with_r_4th_ed/img/cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_4th_ed/img/cat.jpg -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/img/pizza.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_4th_ed/img/pizza.jpg -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/img/ice_cream.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_4th_ed/img/ice_cream.jpg -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/data/credit.sqlite3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_4th_ed/data/credit.sqlite3 -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/data/titanic_train.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_4th_ed/data/titanic_train.xlsx -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/naive_bayes/images/crosstable.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/naive_bayes/images/crosstable.jpeg -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/data/pt_data.csv: -------------------------------------------------------------------------------- 1 | subject_name,temperature,flu_status,gender,blood_type 2 | John Doe,98.1,FALSE,MALE,O 3 | Jane Doe,98.6,FALSE,FEMALE,AB 4 | Steve Graves,101.4,TRUE,MALE,A 5 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/img/youcanbeapirate-wb-sparkline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_4th_ed/img/youcanbeapirate-wb-sparkline.jpg -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/naive_bayes/images/wordcloud2_ham.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/naive_bayes/images/wordcloud2_ham.jpeg -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/naive_bayes/images/wordcloud2_spam.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/naive_bayes/images/wordcloud2_spam.jpeg -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/regression_methods/images/fancyRpartPlot_wine.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/regression_methods/images/fancyRpartPlot_wine.jpg -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/regression_methods/images/fancyRpartPlot_wine.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/regression_methods/images/fancyRpartPlot_wine.pdf -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/association_rules/images/association_rules_visualized.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/association_rules/images/association_rules_visualized.jpg -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/association_rules/images/association_rules_visualized.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/association_rules/images/association_rules_visualized.pdf -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/k_means_clustering/images/total_within_sum_of_squares.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/k_means_clustering/images/total_within_sum_of_squares.jpg -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/k_means_clustering/images/total_within_sum_of_squares.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/k_means_clustering/images/total_within_sum_of_squares.pdf -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/hyperparameter_optimization/images/hyperparameter_optimization_rpart.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/hyperparameter_optimization/images/hyperparameter_optimization_rpart.jpg -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/hyperparameter_optimization/images/hyperparameter_optimization_rpart.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/hyperparameter_optimization/images/hyperparameter_optimization_rpart.pdf -------------------------------------------------------------------------------- /learning_machine_learning.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/data/challenger.csv: -------------------------------------------------------------------------------- 1 | distress_ct,temperature,field_check_pressure,flight_num 0,66,50,1 1,70,50,2 0,69,50,3 0,68,50,4 0,67,50,5 0,72,50,6 0,73,100,7 0,70,100,8 1,57,200,9 
1,63,200,10 1,70,200,11 0,78,200,12 0,67,200,13 2,53,200,14 0,67,200,15 0,75,200,16 0,70,200,17 0,81,200,18 0,76,200,19 0,79,200,20 2,75,200,21 0,76,200,22 1,58,200,23 -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_15_making_use_of_big_data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 15: Making Use of Big Data" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_12_advanced_data_preparation.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 12: Advanced Data Preparation" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_14_building_better_learners.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 14: Building Better Learners" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_10_evaluating_model_performance.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 10: Evaluating Model Performance" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_11_being_succesful_with_machine_learning.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 11: Being Succesful with Machine Learning" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_13_challenging_data_too_much_too_little_too_complex.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 13: Challenging Data - Too Much, Too Little, Too Complex" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | 
knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/README.md: -------------------------------------------------------------------------------- 1 | # Learning Machine Learning 2 | 3 | I'm learning ML with R. Focusing on (but not limiting myself to) tidyverse and tidymodels. 4 | 5 | ## Algorithms tackled (see the named folder(s) for examples): 6 | * K-nearest neighbors (k-NN) 7 | * Naive Bayes __UPDATED__ 8 | * Decision Trees and Rules __UPDATED__ 9 | * Regression Methods 10 | * Neural Networks 11 | * Support Vector Machines 12 | * Association Rules 13 | * Clustering 14 | 15 | ## Coming up next: 16 | * Hyperparameter Optimization 17 | * Bagging 18 | * Boosting 19 | * Random Forest -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # User-specific files 9 | .Ruserdata 10 | 11 | # Example code in package build process 12 | *-Ex.R 13 | 14 | # Output files from R CMD build 15 | /*.tar.gz 16 | 17 | # Output files from R CMD check 18 | /*.Rcheck/ 19 | 20 | # RStudio files 21 | .Rproj.user/ 22 | 23 | # produced vignettes 24 | vignettes/*.html 25 | vignettes/*.pdf 26 | 27 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 28 | .httr-oauth 29 | 30 | # knitr and R markdown default cache directories 31 | *_cache/ 32 | /cache/ 33 | 34 | # Temporary files created by R markdown 35 | *.utf8.md 36 | *.knit.md 37 | 38 | # R Environment Variables 39 | .Renviron 40 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/knn/README.md: -------------------------------------------------------------------------------- 1 | # Classifying Cancer Samples with K-Nearest Neighbors 2 | 3 | Inspired by Brett Lantz's Machine Learning with R, Chapter 3: Lazy Learning - Classification Using Nearest Neighbors. 4 | 5 | The original code is made with a lot of base R, {class} and {gmodels}. I wanted to see how one could recreate it using mainly {tidymodels} and {tidyverse}. 6 | 7 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864 8 | 9 | You can find the original code here: https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter03 10 | 11 | Dataset Citation: 12 | Wolberg, William, Street, W. & Mangasarian, Olvi. (1995). Breast Cancer Wisconsin (Diagnostic). UCI Machine Learning Repository. -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/support_vector_machines/readme.MD: -------------------------------------------------------------------------------- 1 | # Performing Optical Character Recognition with Support Vector Machines 2 | 3 | Inspired by Brett Lantz's Machine Learning with R, Chapter 7: 4 | Black Box Methods - Neural Networks and Support Vector Machines. 5 | 6 | The original code is made with {kernlab}. I wanted to see how one could recreate it using mainly {tidymodels} and {tidyverse}. 
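As a rough sketch of what that looks like (an editorial illustration, not part of the repo's script; `letters_train` below is a hypothetical training split of the book's letter-recognition data):

```r
library(tidymodels)

# declare a kernlab-backed SVM through the parsnip interface
svm_spec <- svm_rbf(mode = "classification", cost = 1) %>%
    set_engine("kernlab")

# fitting would then look something like this (letters_train is hypothetical):
# svm_fit <- svm_spec %>% fit(letter ~ ., data = letters_train)
```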
7 | 8 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864 9 | 10 | You can find the original code and the dataset here: 11 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter07 12 | 13 | Dataset Citation: 14 | P. W. Frey and D. J. Slate. "Letter Recognition Using Holland-style Adaptive Classifiers". (Machine Learning Vol 6 #2 March 91) -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/hyperparameter_optimization/readme.MD: -------------------------------------------------------------------------------- 1 | # Hyperparameter Optimization with tidymodels 2 | 3 | Inspired by Brett Lantz's Machine Learning with R, Chapter 11: 4 | Improving Model Performance. 5 | 6 | The original code is made with {caret}. I wanted to see how one could recreate it using mainly {tidymodels} and {tidyverse}. 7 | 8 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864 9 | 10 | You can find the original code and the dataset here: 11 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter11 12 | 13 | Dataset Citation: 14 | 15 | Professor Dr. Hans Hofmann
16 | Institut für Statistik und Ökonometrie
17 | Universität Hamburg
18 | FB Wirtschaftswissenschaften
19 | Von-Melle-Park 5
20 | 2000 Hamburg 13 -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/neural_networks/readme.MD: -------------------------------------------------------------------------------- 1 | # Modeling the Strength of Concrete with Neural Networks 2 | 3 | Inspired by Brett Lantz's Machine Learning with R, Chapter 7: Black Box Methods - Neural Networks and Support Vector Machines. 4 | 5 | The original code is made with {neuralnet}. I wanted to see how one could recreate it using mainly {tidymodels} and {tidyverse}. 6 | 7 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864 8 | 9 | You can find the original code and the dataset here: 10 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter07 11 | 12 | Dataset Citation: 13 | I-Cheng Yeh, "Modeling of strength of high performance concrete using artificial neural networks," Cement and Concrete Research, Vol. 28, No. 12, pp. 1797-1808 (1998). -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Antti Rask 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/k_means_clustering/readme.MD: -------------------------------------------------------------------------------- 1 | # Finding Teen Market Segments Using k-means Clustering 2 | 3 | Inspired by Brett Lantz's Machine Learning with R, Chapter 9: Finding Groups of Data - Clustering with k-means. 4 | 5 | The original code is made with base R. While {tidymodels} doesn't have a k-means engine, I wanted to see how one could still recreate the code using {tidyverse} as much as possible and even using the {recipes} package from the {tidymodels} to do all the pre-processing needed and {broom} package from the same {tidymodels} family to help look at the model metrics in a more tidy way. 
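As a rough, self-contained sketch of that {recipes} + {broom} flow (an editorial illustration rather than part of the script; it uses the built-in mtcars data as a stand-in for the teen interest columns):

```r
library(tidymodels)

# stand-in data; the actual script works on the teens dataset
cars_tbl <- as_tibble(mtcars)

# pre-processing with {recipes}: normalize every numeric column
recipe_obj <- recipe(~ ., data = cars_tbl) %>%
    step_normalize(all_numeric_predictors())

cars_normalized_tbl <- recipe_obj %>%
    prep() %>%
    bake(new_data = NULL)

# cluster with base R kmeans() and inspect the fit with {broom}
set.seed(123)
kmeans_fit <- kmeans(cars_normalized_tbl, centers = 3)

tidy(kmeans_fit)                          # one row per cluster center
glance(kmeans_fit)                        # model-level stats, e.g. tot.withinss
augment(kmeans_fit, cars_normalized_tbl)  # each row with its assigned cluster
```

The same pattern carries over to the teens data: swap the stand-in tibble for the real one and choose the number of centers with the help of a total within-cluster sum of squares plot.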
6 | 7 | I was also inspired by this blog post from the tidymodels website to continue the exercise a bit further than the original code did: 8 | https://www.tidymodels.org/learn/statistics/k-means/ 9 | 10 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864 11 | 12 | You can find the original code and the dataset here: 13 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter09 14 | 15 | Dataset Citation: 16 | Brett Lantz. "Finding Groups of Data - Clustering with k-means". (Machine Learning with R) -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/regression_methods/readme.MD: -------------------------------------------------------------------------------- 1 | # Predicting Medical Expenses with Linear Regression / 2 | # Estimating Wine Quality with Regression Trees and Model Trees 3 | 4 | Inspired by Brett Lantz's Machine Learning with R, Chapter 6: 5 | Forecasting Numeric Data - Regression Methods. 6 | 7 | The original code is made with {psych}, {rpart}, {rpart.plot} and {Cubist}. I wanted to see how one could recreate it using mainly {tidymodels}, {tidyverse} and {rules}. 8 | 9 | If you haven't already, buy the book:
https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864 10 | 11 | You can find the original code and the datasets here:
12 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter06 13 | 14 | Dataset Citation(s):
15 | 16 | __Predicting Medical Expenses with Linear Regression__
17 | Brett Lantz: "These data were created for this book using demographic statistics from the U.S. Census Bureau" 18 | 19 | __Estimating Wine Quality with Regression Trees and Model Trees__
20 | P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
21 | Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![](machine_learning_with_r_4th_ed/img/youcanbeapirate-wb-sparkline.jpg) 2 | 3 | # Learning Machine Learning 4 | 5 | I'm learning machine learning (ML) by reading the book __Machine Learning with R__ by __Brett Lantz__. I've read the 3rd edition (published in 2019) and am currently reading the 4th edition. I will update the repo along the way. 6 | 7 | While the book is excellent at explaining the different algorithms, it still doesn't use tidyverse or tidymodels, two frameworks that make data science related tasks more user friendly, mainly because of their unified syntax. Obviously I'm not limiting myself to only them, but the whole point of this repo is to offer an alternative to base R and the separate packages used in the book and in the [official GitHub repo](https://github.com/PacktPublishing/Machine-Learning-with-R-Fourth-Edition). 8 | 9 | ## Disclaimer! 10 | This repo is not meant to replace the book in any way. You should definitely read the book. It will help you understand ML in general (and specifically with R) much better than looking at the code alone. 11 | 12 | Also, I would recommend you buy the book. Here's a direct [link](https://www.packtpub.com/product/machine-learning-with-r-fourth-edition/9781801071321) to __Packt__'s (the publisher) website. -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/association_rules/readme.MD: -------------------------------------------------------------------------------- 1 | # Identifying Frequently Purchased Groceries with Association Rules 2 | 3 | Inspired by Brett Lantz's Machine Learning with R, Chapter 8: Finding Patterns - Market Basket Analysis Using Association Rules 4 | 5 | The original code is made with {arules}. Now, this is an algorithm that isn't found in {tidymodels}. I still wanted to recreate it using {tidyverse} as much as possible. Helps with the readability if nothing else! 6 | 7 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864 8 | 9 | You can find the original code and the dataset here: 10 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter08 11 | 12 | Dataset Citation: 13 | 14 | Author:
15 | Michael Hahsler
16 |
17 | Source:
18 | The data set is provided for {arules} by Michael Hahsler, Kurt Hornik and Thomas Reutterer.
19 |
20 | References:
21 | Michael Hahsler, Kurt Hornik, and Thomas Reutterer (2006) Implications of probabilistic data modeling for mining association rules. In M. Spiliopoulou, R. Kruse, C. Borgelt, A. Nuernberger, and W. Gaul, editors, From Data and Information Analysis to Knowledge Engineering, Studies in Classification, Data Analysis, and Knowledge Organization, pages 598–605. Springer-Verlag. -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/decision_trees_and_rules/readme.MD: -------------------------------------------------------------------------------- 1 | # Identifying Risky Bank Loans Using C5.0 Decision Trees / 2 | # Identifying Poisonous Mushrooms with Rule Learners 3 | 4 | Inspired by Brett Lantz's Machine Learning with R,
5 | Chapter 5: Divide and Conquer - Classification Using Decision Trees and Rules and
6 | Chapter 10: Evaluating Model Performance. 7 | 8 | The original code is made with {C50}, {gmodels}, {OneR} and {RWeka}. I 9 | wanted to see how one could recreate it using mainly {tidymodels} and 10 | {tidyverse}. 11 | 12 | If you haven't already, buy the book:
https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864 13 | 14 | You can find the original code and the slightly modified datasets here:
15 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter05
16 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter10 17 | 18 | Dataset Citation(s): 19 | 20 | __Identifying Risky Bank Loans Using C5.0 Decision Trees__
21 | Professor Dr. Hans Hofmann
22 | Institut für Statistik und Ökonometrie
23 | Universität Hamburg
24 | FB Wirtschaftswissenschaften
25 | Von-Melle-Park 5
26 | 2000 Hamburg 13 27 | 28 | __Identifying Poisonous Mushrooms with Rule Learners__
29 | Origin:
30 | Mushroom records drawn from The Audubon Society Field Guide to North American Mushrooms (1981). G. H. Lincoff (Pres.), New York: Alfred A. Knopf
31 |
32 | Donor:
33 | Jeff Schlimmer (Jeffrey.Schlimmer '@' a.gp.cs.cmu.edu) 34 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/naive_bayes/readme.MD: -------------------------------------------------------------------------------- 1 | # Filtering Mobile Phone Spam with Naive Bayes 2 | 3 | Inspired by Brett Lantz's Machine Learning with R,
4 | Chapter 4: Probabilistic Learning - Classification Using Naive Bayes and
5 | Chapter 10: Evaluating Model Performance. 6 | 7 | The original code is made with a lot of base R, {e1071} and {gmodels}. I wanted to see how one could recreate it using mainly {textrecipes}, {tidymodels}, {tidytext} and {tidyverse}. 8 | 9 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864 10 | 11 | You can find the original code and the slightly modified dataset here:
12 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter04
13 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter10 14 | 15 | Dataset Citation: 16 | [1] Gómez Hidalgo, J.M., Cajigas Bringas, G., Puertas Sãnz, E., Carrero García, F. Content Based SMS Spam Filtering. Proceedings of the 2006 ACM Symposium on Document Engineering (ACM DOCENG'06), Amsterdam, The Netherlands, 10-13, 2006. 17 | 18 | [2] Cormack, G. V., Gómez Hidalgo, J. M., and Puertas Sãnz, E. Feature engineering for mobile (SMS) spam filtering. Proceedings of the 30th Annual international ACM Conference on Research and Development in information Retrieval (ACM SIGIR'07), New York, NY, 871-872, 2007. 19 | 20 | [3] Cormack, G. V., Gómez Hidalgo, J. M., and Puertas Sãnz, E. Spam filtering for short messages. Proceedings of the 16th ACM Conference on Information and Knowledge Management (ACM CIKM'07). Lisbon, Portugal, 313-320, 2007. 21 | 22 | Also, there were parts that were inspired by: 23 | 24 | A blog post by Julia Silge: 25 | https://www.tidyverse.org/blog/2020/11/tidymodels-sparse-support/ 26 | 27 | Julia Silge & David Robinson's book Text Mining with R: A Tidy Approach: https://www.tidytextmining.com/ 28 | 29 | A blog post by Céline Van den Rul: 30 | https://towardsdatascience.com/create-a-word-cloud-with-r-bde3e7422e8a -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/Chapter_11.r: -------------------------------------------------------------------------------- 1 | ##### Chapter 11: Being Successful with Machine Learning -------------------- 2 | 3 | ## Example: Using ggplot2 for visual data exploration ---- 4 | 5 | # read the titanic training dataset 6 | titanic_train <- read.csv("titanic_train.csv") 7 | 8 | # examine the features 9 | str(titanic_train) 10 | 11 | # load ggplot2 package and apply to Titanic dataset 12 | library(ggplot2) 13 | p <- ggplot(data = titanic_train) 14 | p # creates an empty gray plot 15 | 16 | # compare built-in boxplot to ggplot2 boxplot 17 | boxplot(titanic_train$Age) # use R's built-in boxplot() 18 | p + geom_boxplot(mapping = aes(y = Age)) # use ggplot2 boxplot 19 | 20 | # boxplot examining relationship between age and survival 21 | p + geom_boxplot(mapping = aes(x = Age, y = as.factor(Survived))) 22 | 23 | # compare built-in histogram to ggplot2 histogram 24 | hist(titanic_train$Age) # use R's built-in hist() 25 | p + geom_histogram(aes(x = Age)) # use ggplot2 histogram 26 | 27 | # overlapping histograms 28 | p + geom_histogram(aes(x = Age, fill = as.factor(Survived))) + 29 | ggtitle("Distribution of Age by Titanic Survival Status") 30 | 31 | # side-by-side histograms 32 | p + geom_histogram(aes(x = Age)) + 33 | facet_grid(cols = vars(Survived)) + 34 | ggtitle("Distribution of Age by Titanic Survival Status") 35 | 36 | # overlapping density plots 37 | p + geom_density(aes(x = Age, 38 | color = as.factor(Survived), 39 | fill = as.factor(Survived)), 40 | alpha = 0.25) + 41 | ggtitle("Density of Age by Titanic Survival Status") 42 | 43 | # bar chart of passenger counts by sex 44 | p + geom_bar(aes(x = Sex)) + 45 | ggtitle("Titanic Passenger Counts by Gender") 46 | 47 | # bar chart of survival probability by sex 48 | p + geom_bar(aes(x = Sex, y = Survived), 49 | stat = "summary", fun = "mean") + 50 | ggtitle("Titanic Survival Rate by Gender") 51 | 52 | # bar chart of survival probability by passenger class 53 | p + geom_bar(aes(x = Pclass, y = Survived), 54 | stat = "summary", fun = "mean") + 55 | ggtitle("Titanic Survival Rate by Passenger Class") 56 
| 57 | # stacked bar chart of survival by passenger class 58 | p + geom_bar(aes(x = Pclass, 59 | fill = factor(Survived, 60 | labels = c("No", "Yes")))) + 61 | labs(fill = "Survived") + 62 | ylab("Number of Passengers") + 63 | ggtitle("Titanic Survival Counts by Passenger Class") 64 | 65 | # survivival status by passenger class 66 | p + geom_bar(aes(x = Pclass, 67 | fill = factor(Survived, 68 | labels = c("No", "Yes"))), 69 | position = "fill") + 70 | labs(fill = "Survived") + 71 | ylab("Proportion of Passengers") + 72 | ggtitle("Titanic Survival by Passenger Class") 73 | 74 | # bar chart of interaction between class and sex 75 | p + geom_bar(aes(x = Pclass, y = Survived, fill = Sex), 76 | position = "dodge", stat = "summary", fun = "mean") + 77 | ylab("Survival Proportion") + 78 | ggtitle("Titanic Survival Rate by Class and Sex") 79 | 80 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/association_rules/identifying_frequently_purchased_groceries_with_association_rules.r: -------------------------------------------------------------------------------- 1 | # Identifying Frequently Purchased Groceries with Association Rules ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 8: 4 | # Finding Patterns - Market Basket Analysis Using Association Rules 5 | # 6 | # The original code is made with {arules}. Now, this is an algorithm that isn't 7 | # found in {tidymodels}. I still wanted to recreate it using {tidyverse} as much 8 | # as possible. Helps with the readability if nothing else! 9 | # 10 | # You can find the original code and the dataset here: 11 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter08 12 | 13 | ## 1. Loading libraries ---- 14 | library(conflicted) 15 | library(tidyverse) 16 | library(arules) 17 | library(arulesViz) 18 | 19 | 20 | ## 2. Exploring and preparing the data ---- 21 | 22 | ### Load the grocery data into a sparse matrix ---- 23 | groceries <- read.transactions( 24 | "machine_learning_with_r_3rd_ed/association_rules/data/groceries.csv", 25 | sep = "," 26 | ) 27 | summary(groceries) 28 | 29 | ### Look at the first five transactions ---- 30 | groceries %>% 31 | head(5) %>% 32 | inspect() 33 | 34 | ### Examine the frequency of items ---- 35 | groceries[, 1:3] %>% 36 | itemFrequency() 37 | 38 | ### Plot the frequency of items ---- 39 | par(mar = c(1, 1, 1, 1)) # Have to adjust the margins or there will be an error 40 | 41 | groceries %>% 42 | itemFrequencyPlot(support = 0.1) 43 | 44 | groceries %>% 45 | itemFrequencyPlot(topN = 10) 46 | 47 | ### A visualization of the sparse matrix for the first five transactions ---- 48 | groceries[1:5] %>% 49 | image() 50 | 51 | ### Visualization of a random sample of 100 transactions ---- 52 | sample(groceries, 100) %>% 53 | image() 54 | 55 | 56 | ## 3. Training a model on the data ---- 57 | 58 | ### Default settings result in zero rules learned ---- 59 | groceries %>% 60 | apriori() 61 | 62 | ### Set better support and confidence levels to learn more rules ---- 63 | groceryrules <- apriori( 64 | groceries, 65 | parameter = list( 66 | support = 0.006, 67 | confidence = 0.25, 68 | minlen = 2 69 | ) 70 | ) 71 | groceryrules 72 | 73 | 74 | ## 4. 
Evaluating model performance ---- 75 | 76 | # Summary of grocery association rules ---- 77 | groceryrules %>% summary() 78 | 79 | # Look at the first three rules ---- 80 | groceryrules[1:3] %>% 81 | inspect() 82 | 83 | # Visualize the rules ---- 84 | groceryrules %>% plot( 85 | method = "graph", 86 | limit = 10 87 | ) 88 | 89 | 90 | ## 5. Improving model performance ---- 91 | 92 | ### Sorting grocery rules by lift ---- 93 | sort(groceryrules, by = "lift")[1:5] %>% 94 | inspect() 95 | 96 | ### Finding subsets of rules containing any berry items ---- 97 | berryrules <- groceryrules %>% 98 | subset(items %in% "berries") 99 | 100 | berryrules %>% 101 | inspect() 102 | 103 | ### Converting the rule set to a tibble ---- 104 | groceryrules_tbl <- groceryrules %>% 105 | as("data.frame") %>% 106 | as_tibble() 107 | 108 | groceryrules_tbl %>% 109 | glimpse() 110 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_08_finding_patterns_market_basket_analysis_using_association_rules.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 8: Finding Patterns - Market Basket Analysis Using Association Rules" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Loading Packages 13 | 14 | ```{r} 15 | library(arules) 16 | library(arulesViz) 17 | library(conflicted) 18 | library(tidyverse) 19 | ``` 20 | 21 | ## Exploring and preparing the data 22 | 23 | ### Load the grocery data into a sparse matrix 24 | 25 | ```{r} 26 | groceries <- read.transactions( 27 | "data/groceries.csv", 28 | sep = "," 29 | ) 30 | ``` 31 | 32 | ```{r} 33 | groceries %>% 34 | summary() 35 | ``` 36 | 37 | ### Look at the first five transactions 38 | 39 | ```{r} 40 | groceries %>% 41 | head(5) %>% 42 | inspect() 43 | ``` 44 | 45 | ### Examine the frequency of items 46 | 47 | ```{r} 48 | groceries[, 1:3] %>% 49 | itemFrequency() 50 | ``` 51 | 52 | ### Plot the frequency of items 53 | 54 | Have to adjust the margins or there will be an error 55 | 56 | ```{r} 57 | par(mar = c(1, 1, 1, 1)) 58 | ``` 59 | 60 | ```{r} 61 | groceries %>% 62 | itemFrequencyPlot(support = 0.1) 63 | ``` 64 | 65 | ```{r} 66 | groceries %>% 67 | itemFrequencyPlot(topN = 20) 68 | ``` 69 | 70 | ### A visualization of the sparse matrix for the first five transactions 71 | 72 | ```{r} 73 | groceries[1:5] %>% 74 | image() 75 | ``` 76 | 77 | ### Visualization of a random sample of 100 transactions 78 | 79 | ```{r} 80 | sample(groceries, 100) %>% 81 | image() 82 | ``` 83 | 84 | ## Training a model on the data 85 | 86 | ### Default settings result in zero rules learned 87 | 88 | ```{r} 89 | groceries %>% 90 | apriori() 91 | ``` 92 | 93 | ### Set better support and confidence levels to learn more rules 94 | 95 | ```{r} 96 | groceryrules <- apriori( 97 | groceries, 98 | parameter = list( 99 | support = 0.006, 100 | confidence = 0.25, 101 | minlen = 2 102 | ) 103 | ) 104 | 105 | groceryrules 106 | ``` 107 | 108 | ## Evaluating model performance 109 | 110 | ### Summary of grocery association rules 111 | 112 | ```{r} 113 | groceryrules %>% 114 | summary() 115 | ``` 116 | 117 | ### Look at the first three rules 118 | 119 | ```{r} 120 | groceryrules[1:3] %>% 121 | inspect() 122 | ``` 123 | 124 | ### Visualize the rules 125 | 126 | ```{r} 
127 | groceryrules %>% plot( 128 | method = "graph", 129 | limit = 10 130 | ) 131 | ``` 132 | 133 | ## Improving model performance 134 | 135 | ### Sorting grocery rules by lift 136 | 137 | ```{r} 138 | sort(groceryrules, by = "lift")[1:5] %>% 139 | inspect() 140 | ``` 141 | 142 | ### Finding subsets of rules containing any berry items 143 | 144 | ```{r} 145 | berryrules <- groceryrules %>% 146 | subset(items %in% "berries") 147 | ``` 148 | 149 | ```{r} 150 | berryrules %>% 151 | inspect() 152 | ``` 153 | 154 | ### Converting the rule set to a tibble 155 | 156 | ```{r} 157 | groceryrules_tbl <- groceryrules %>% 158 | as("data.frame") %>% 159 | as_tibble() 160 | ``` 161 | 162 | ```{r} 163 | groceryrules_tbl 164 | ``` 165 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/random_forest/random_forest_with_tidymodels.R: -------------------------------------------------------------------------------- 1 | # Random Forest with tidymodels ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, 4 | # Chapter 5: Divide and Conquer - Classification Using Decision Trees and Rules and 5 | # Chapter 10: Evaluating Model Performance 6 | # 7 | # The original code is made with {C50}, {gmodels}, {OneR} and {RWeka}. I 8 | # wanted to see how one could recreate it using mainly {tidymodels} and 9 | # {tidyverse}. 10 | # 11 | # You can find the original code and the slightly modified dataset here: 12 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter05 13 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter10 14 | 15 | ## 1. Loading libraries (in the order they get used) ---- 16 | library(conflicted) 17 | library(tidyverse) 18 | library(tidymodels) 19 | 20 | ## 2. Exploring and preparing the data ---- 21 | credit_tbl <- read_csv("decision_trees_and_rules/data/credit.csv") 22 | 23 | ### Examine the structure of the credit data ---- 24 | glimpse(credit_tbl) 25 | 26 | ### look at two characteristics of the applicant ---- 27 | credit_tbl %>% 28 | count(checking_balance) %>% 29 | mutate(pct = (n / sum(n) * 100)) 30 | 31 | credit_tbl %>% 32 | count(savings_balance) %>% 33 | mutate(pct = (n / sum(n) * 100)) 34 | 35 | ### look at two characteristics of the loan ---- 36 | credit_tbl %>% 37 | select(months_loan_duration, amount) %>% 38 | summary() 39 | 40 | ### look at the class variable ---- 41 | credit_tbl %>% 42 | count(default) %>% 43 | mutate(pct = (n / sum(n) * 100)) 44 | 45 | 46 | ## 3. Creating the recipe and splitting the data ---- 47 | 48 | ### Convert strings to factors ---- 49 | recipe_obj <- recipe( 50 | default ~ ., 51 | data = credit_tbl 52 | ) %>% 53 | step_string2factor(all_nominal()) 54 | recipe_obj 55 | 56 | credit_factorized_tbl <- recipe_obj %>% 57 | prep() %>% 58 | bake(new_data = NULL) 59 | credit_factorized_tbl 60 | 61 | ### Create training and test data (randomly) ---- 62 | 63 | # Use set.seed to use the same random number sequence as the original 64 | RNGversion("3.5.2") 65 | set.seed(123) 66 | 67 | credit_split <- initial_split( 68 | credit_factorized_tbl, 69 | prop = 0.9 70 | ) 71 | credit_train <- training(credit_split) 72 | credit_test <- testing(credit_split) 73 | 74 | ### Check the proportion of class variable ---- 75 | credit_train %>% 76 | count(default) %>% 77 | mutate(pct = (n / sum(n) * 100)) 78 | 79 | credit_test %>% 80 | count(default) %>% 81 | mutate(pct = (n / sum(n) * 100)) 82 | 83 | 84 | ## 4. 
Building a random forest ---- 85 | 86 | ### Specify a random forest ---- 87 | spec <- rand_forest(trees = 100) %>% 88 | set_mode("classification") %>% 89 | set_engine("ranger", importance = "impurity") 90 | 91 | ### Train the forest ---- 92 | model <- spec %>% 93 | fit( 94 | default ~ ., 95 | data = credit_train 96 | ) 97 | model 98 | 99 | ### Plot the variable importance ---- 100 | vip::vip(model) 101 | 102 | 103 | ## 5. Predicting ---- 104 | 105 | ### Make the predictions (you could skip this step) ---- 106 | credit_test_pred <- predict( 107 | object = model, 108 | new_data = credit_test, 109 | type = "class" 110 | ) 111 | credit_test_pred 112 | 113 | ### Add the predictions to the test tibble ---- 114 | credit_test_with_pred_tbl <- augment(model, credit_test) 115 | credit_test_with_pred_tbl 116 | 117 | 118 | ## 6. Evaluating model performance ---- 119 | 120 | ### Create a confusion matrix ---- 121 | conf_mat <- conf_mat( 122 | data = credit_test_with_pred_tbl, 123 | truth = default, 124 | estimate = .pred_class 125 | ) 126 | conf_mat 127 | 128 | ### Visualize the confusion matrix ---- 129 | conf_mat %>% autoplot(type = "heatmap") 130 | conf_mat %>% autoplot(type = "mosaic") 131 | 132 | ### Visualize the ROC curve ---- 133 | credit_test_with_pred_tbl %>% 134 | roc_curve( 135 | truth = default, 136 | estimate = .pred_no 137 | ) %>% 138 | autoplot() 139 | 140 | ### Calculate the ROC AUC (area under the curve) ---- 141 | credit_roc_auc <- credit_test_with_pred_tbl %>% 142 | roc_auc( 143 | truth = default, 144 | estimate = .pred_no 145 | ) 146 | credit_roc_auc 147 | 148 | ### Put together other model metrics ---- 149 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 150 | classification_metrics <- conf_mat( 151 | credit_test_with_pred_tbl, 152 | truth = default, 153 | estimate = .pred_class 154 | ) %>% 155 | summary() 156 | classification_metrics 157 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/neural_networks/modeling_the_strength_of_concrete_with_neural_networks.r: -------------------------------------------------------------------------------- 1 | # Modeling the Strength of Concrete with Neural Networks ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 7: 4 | # Black Box Methods - Neural Networks and Support Vector Machines 5 | # 6 | # The original code is made with {neuralnet}. I wanted to see how one could 7 | # recreate it using mainly {tidymodels} and {tidyverse}. 8 | # 9 | # You can find the original code and the dataset here: 10 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter07 11 | 12 | ## 1. Loading libraries (in the order they get used) ---- 13 | library(conflicted) 14 | library(tidyverse) 15 | library(tidymodels) 16 | library(corrr) 17 | 18 | 19 | ## 2. Exploring and preparing the data ---- 20 | 21 | ### Read in data and examine structure ---- 22 | concrete_tbl <- read_csv("machine_learning_with_r_3rd_ed/neural_networks/data/concrete.csv") 23 | 24 | concrete_tbl %>% glimpse() 25 | 26 | ### Check the minimum and maximum strength ---- 27 | concrete_tbl %>% 28 | select(strength) %>% 29 | summary() 30 | 31 | 32 | ## 3. 
Creating the recipe ---- 33 | 34 | ### Apply normalization to the numeric predictors ---- 35 | recipe_obj <- recipe( 36 | strength ~ ., 37 | data = concrete_tbl 38 | ) %>% 39 | step_range( 40 | all_numeric_predictors(), 41 | min = 0, 42 | max = 1 43 | ) 44 | recipe_obj 45 | 46 | concrete_normalized_tbl <- recipe_obj %>% 47 | prep() %>% 48 | bake(new_data = NULL) 49 | concrete_normalized_tbl 50 | 51 | ### Create training and test data ---- 52 | concrete_split <- initial_time_split( 53 | concrete_normalized_tbl, 54 | prop = 773 / 1030 55 | ) 56 | concrete_train <- training(concrete_split) 57 | concrete_test <- testing(concrete_split) 58 | 59 | 60 | ## 3. Training a model on the data ---- 61 | 62 | # nnet is the engine (needs to be installed if not already): 63 | # install.packages("nnet") 64 | 65 | # It is used as the engine for {parsnip}'s mlp() function. 66 | # And since we are predicting strength, we choose regression as the mode. 67 | 68 | ### Create model specification ---- 69 | model_spec <- mlp( 70 | engine = "nnet", 71 | mode = "regression", 72 | hidden_units = 5, 73 | penalty = 0, 74 | epochs = 100 75 | ) %>% 76 | translate() 77 | model_spec 78 | 79 | ### Fit the model ---- 80 | model_fit <- fit( 81 | model_spec, 82 | strength ~ ., 83 | concrete_train 84 | ) 85 | model_fit 86 | 87 | ### Take a closer look at the model ---- 88 | summary(model_fit$fit) 89 | 90 | ### Make the predictions (you could skip this step) ---- 91 | concrete_test_pred <- predict( 92 | model_fit, 93 | new_data = concrete_test, 94 | type = "numeric" 95 | ) 96 | concrete_test_pred 97 | 98 | ### Add the predictions to the test tibble ---- 99 | concrete_test_with_pred <- augment(model_fit, concrete_test) 100 | concrete_test_with_pred 101 | 102 | ### Metrics ---- 103 | concrete_test_with_pred %>% metrics(strength, .pred) 104 | 105 | ### Visualize the network topology ---- 106 | # I have yet to find a working solution to create this visualization. 107 | # If you know how to do it, please let me know! 108 | 109 | 110 | ## 4. Evaluating model performance ---- 111 | 112 | ### Examine the correlation between predicted and actual values ---- 113 | concrete_test_with_pred %>% 114 | select(.pred, strength) %>% 115 | correlate() 116 | 117 | 118 | ## 5. 
Improving model performance with two hidden layers and custom activation function ---- 119 | 120 | ### Create model specification ---- 121 | model_spec_2 <- mlp( 122 | engine = "nnet", 123 | mode = "regression", 124 | hidden_units = 5, 125 | penalty = 0.1, 126 | epochs = 100, 127 | ) %>% 128 | translate() 129 | model_spec_2 130 | 131 | ### Fit the model ---- 132 | model_fit_2 <- fit( 133 | model_spec_2, 134 | strength ~ ., 135 | concrete_train 136 | ) 137 | model_fit_2 138 | 139 | ### Take a closer look at the model ---- 140 | summary(model_fit_2$fit) 141 | 142 | ### Make the predictions (you could skip this step) ---- 143 | concrete_test_pred_2 <- predict( 144 | model_fit_2, 145 | new_data = concrete_test, 146 | type = "numeric" 147 | ) 148 | concrete_test_pred_2 149 | 150 | ### Add the predictions to the test tibble ---- 151 | concrete_test_with_pred_2 <- augment(model_fit_2, concrete_test) 152 | concrete_test_with_pred_2 153 | 154 | ### Metrics ---- 155 | concrete_test_with_pred_2 %>% metrics(strength, .pred) 156 | 157 | ### Examine the correlation between predicted and actual values ---- 158 | concrete_test_with_pred_2 %>% 159 | select(.pred, strength) %>% 160 | correlate() 161 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/decision_trees_and_rules/identifying_poisonous_mushrooms_with_rule_learners.r: -------------------------------------------------------------------------------- 1 | # Identifying Poisonous Mushrooms with Rule Learners ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 5: 4 | # Divide and Conquer - Classification Using Decision Trees and Rules. 5 | # 6 | # The original code is made with {C50}, {gmodels}, {OneR} and {RWeka}. I 7 | # wanted to see how one could recreate it using mainly {tidymodels} and 8 | # {tidyverse}. 9 | # 10 | # You can find the original code and the slightly modified dataset here: 11 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter05 12 | 13 | ## 1. Loading libraries (in the order they get used) ---- 14 | library(conflicted) 15 | library(tidyverse) 16 | library(tidymodels) 17 | library(rules) 18 | 19 | ## 2. Exploring and preparing the data ---- 20 | mushrooms_tbl <- read_csv("machine_learning_with_r_3rd_ed/decision_trees_and_rules/data/mushrooms.csv") 21 | 22 | ### Examine the data ---- 23 | mushrooms_tbl %>% map(unique) 24 | 25 | ### Drop the veil_type feature ---- 26 | mushroom_selected_tbl <- mushrooms_tbl %>% 27 | select(-veil_type) 28 | 29 | ### Examine the class distribution ---- 30 | mushroom_selected_tbl %>% 31 | count(type) %>% 32 | mutate(pct = (n / sum(n) * 100)) 33 | 34 | 35 | ## 3. Creating the recipe ---- 36 | recipe_obj <- recipe( 37 | type ~ ., 38 | data = mushroom_selected_tbl 39 | ) 40 | recipe_obj 41 | 42 | mushroom_baked_tbl <- recipe_obj %>% 43 | prep() %>% 44 | bake(new_data = NULL) 45 | 46 | 47 | ## 4. 
Training a model on the data ---- 48 | 49 | ### Model specification ---- 50 | model_spec <- C5_rules( 51 | mode = "classification", 52 | engine = "C5.0", 53 | trees = NULL, 54 | min_n = NULL 55 | ) %>% 56 | translate() 57 | model_spec 58 | 59 | ### Fit the model ---- 60 | model_fit <- fit( 61 | model_spec, 62 | type ~ ., 63 | mushroom_baked_tbl 64 | ) 65 | model_fit 66 | 67 | model_fit %>% 68 | extract_fit_engine() %>% 69 | summary() 70 | 71 | ### Make the predictions (you could skip this step) ---- 72 | mushroom_pred_tbl <- predict( 73 | object = model_fit, 74 | new_data = mushroom_baked_tbl, 75 | type = "class" 76 | ) 77 | mushroom_pred_tbl 78 | 79 | ### Add the predictions to the test tibble ---- 80 | mushroom_pred_tbl <- augment(model_fit, mushroom_baked_tbl) 81 | mushroom_pred_tbl 82 | 83 | 84 | ## 5. Evaluating model performance ---- 85 | 86 | ### Create a confusion matrix ---- 87 | conf_mat <- conf_mat( 88 | data = mushroom_pred_tbl, 89 | truth = type, 90 | estimate = .pred_class 91 | ) 92 | conf_mat 93 | 94 | ### Visualize the confusion matrix ---- 95 | conf_mat %>% autoplot(type = "heatmap") 96 | conf_mat %>% autoplot(type = "mosaic") 97 | 98 | ### Visualize the ROC curve ---- 99 | mushroom_pred_tbl %>% 100 | roc_curve( 101 | truth = type, 102 | estimate = .pred_edible 103 | ) %>% 104 | autoplot() 105 | 106 | ### Calculate the ROC AUC (area under the curve) ---- 107 | mushroom_roc_auc <- mushroom_pred_tbl %>% 108 | roc_auc( 109 | truth = type, 110 | estimate = .pred_edible 111 | ) 112 | mushroom_roc_auc 113 | 114 | ### Put together other model metrics ---- 115 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 116 | classification_metrics <- conf_mat( 117 | mushroom_pred_tbl, 118 | truth = type, 119 | estimate = .pred_class 120 | ) %>% 121 | summary() 122 | classification_metrics 123 | 124 | 125 | ## 6. Creating a function to help evaluate the model further ---- 126 | 127 | # The assumption here is that you have already gone through steps 1. to 4. 128 | # What we're potentially tuning here are the arguments .trees and .min_n 129 | 130 | classify_with_c5_rules <- function( 131 | .trees = NULL, 132 | .min_n = NULL 133 | ) { 134 | 135 | # Create the recipe 136 | recipe_obj <- recipe( 137 | type ~ ., 138 | data = mushroom_selected_tbl 139 | ) 140 | 141 | mushroom_baked_tbl <- recipe_obj %>% 142 | prep() %>% 143 | bake(new_data = NULL) 144 | 145 | # Model specification 146 | model_spec <- C5_rules( 147 | mode = "classification", 148 | engine = "C5.0", 149 | trees = .trees, 150 | min_n = .min_n 151 | ) %>% 152 | translate() 153 | 154 | # Fit the model 155 | model_fit <- fit( 156 | model_spec, 157 | type ~ ., 158 | mushroom_baked_tbl 159 | ) 160 | 161 | model_fit %>% 162 | extract_fit_engine() %>% 163 | summary() 164 | 165 | # Add the predictions to the test tibble 166 | mushroom_pred_tbl <- augment(model_fit, mushroom_baked_tbl) 167 | mushroom_pred_tbl 168 | 169 | # Create a confusion matrix 170 | conf_mat <- conf_mat( 171 | data = mushroom_pred_tbl, 172 | truth = type, 173 | estimate = .pred_class 174 | ) 175 | 176 | conf_mat %>% autoplot(type = "heatmap") 177 | 178 | } 179 | 180 | ### Test the function ---- 181 | classify_with_c5_rules( 182 | .trees = 3, 183 | .min_n = 1 184 | ) 185 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/knn/data/wdbc-names.txt: -------------------------------------------------------------------------------- 1 | 1. 
Title: Wisconsin Diagnostic Breast Cancer (WDBC) 2 | 3 | 2. Source Information 4 | 5 | a) Creators: 6 | 7 | Dr. William H. Wolberg, General Surgery Dept., University of 8 | Wisconsin, Clinical Sciences Center, Madison, WI 53792 9 | wolberg@eagle.surgery.wisc.edu 10 | 11 | W. Nick Street, Computer Sciences Dept., University of 12 | Wisconsin, 1210 West Dayton St., Madison, WI 53706 13 | street@cs.wisc.edu 608-262-6619 14 | 15 | Olvi L. Mangasarian, Computer Sciences Dept., University of 16 | Wisconsin, 1210 West Dayton St., Madison, WI 53706 17 | olvi@cs.wisc.edu 18 | 19 | b) Donor: Nick Street 20 | 21 | c) Date: November 1995 22 | 23 | 3. Past Usage: 24 | 25 | first usage: 26 | 27 | W.N. Street, W.H. Wolberg and O.L. Mangasarian 28 | Nuclear feature extraction for breast tumor diagnosis. 29 | IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science 30 | and Technology, volume 1905, pages 861-870, San Jose, CA, 1993. 31 | 32 | OR literature: 33 | 34 | O.L. Mangasarian, W.N. Street and W.H. Wolberg. 35 | Breast cancer diagnosis and prognosis via linear programming. 36 | Operations Research, 43(4), pages 570-577, July-August 1995. 37 | 38 | Medical literature: 39 | 40 | W.H. Wolberg, W.N. Street, and O.L. Mangasarian. 41 | Machine learning techniques to diagnose breast cancer from 42 | fine-needle aspirates. 43 | Cancer Letters 77 (1994) 163-171. 44 | 45 | W.H. Wolberg, W.N. Street, and O.L. Mangasarian. 46 | Image analysis and machine learning applied to breast cancer 47 | diagnosis and prognosis. 48 | Analytical and Quantitative Cytology and Histology, Vol. 17 49 | No. 2, pages 77-87, April 1995. 50 | 51 | W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian. 52 | Computerized breast cancer diagnosis and prognosis from fine 53 | needle aspirates. 54 | Archives of Surgery 1995;130:511-516. 55 | 56 | W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian. 57 | Computer-derived nuclear features distinguish malignant from 58 | benign breast cytology. 59 | Human Pathology, 26:792--796, 1995. 60 | 61 | See also: 62 | http://www.cs.wisc.edu/~olvi/uwmp/mpml.html 63 | http://www.cs.wisc.edu/~olvi/uwmp/cancer.html 64 | 65 | Results: 66 | 67 | - predicting field 2, diagnosis: B = benign, M = malignant 68 | - sets are linearly separable using all 30 input features 69 | - best predictive accuracy obtained using one separating plane 70 | in the 3-D space of Worst Area, Worst Smoothness and 71 | Mean Texture. Estimated accuracy 97.5% using repeated 72 | 10-fold crossvalidations. Classifier has correctly 73 | diagnosed 176 consecutive new patients as of November 74 | 1995. 75 | 76 | 4. Relevant information 77 | 78 | Features are computed from a digitized image of a fine needle 79 | aspirate (FNA) of a breast mass. They describe 80 | characteristics of the cell nuclei present in the image. 81 | A few of the images can be found at 82 | http://www.cs.wisc.edu/~street/images/ 83 | 84 | Separating plane described above was obtained using 85 | Multisurface Method-Tree (MSM-T) [K. P. Bennett, "Decision Tree 86 | Construction Via Linear Programming." Proceedings of the 4th 87 | Midwest Artificial Intelligence and Cognitive Science Society, 88 | pp. 97-101, 1992], a classification method which uses linear 89 | programming to construct a decision tree. Relevant features 90 | were selected using an exhaustive search in the space of 1-4 91 | features and 1-3 separating planes. 
92 | 93 | The actual linear program used to obtain the separating plane 94 | in the 3-dimensional space is that described in: 95 | [K. P. Bennett and O. L. Mangasarian: "Robust Linear 96 | Programming Discrimination of Two Linearly Inseparable Sets", 97 | Optimization Methods and Software 1, 1992, 23-34]. 98 | 99 | 100 | This database is also available through the UW CS ftp server: 101 | 102 | ftp ftp.cs.wisc.edu 103 | cd math-prog/cpo-dataset/machine-learn/WDBC/ 104 | 105 | 5. Number of instances: 569 106 | 107 | 6. Number of attributes: 32 (ID, diagnosis, 30 real-valued input features) 108 | 109 | 7. Attribute information 110 | 111 | 1) ID number 112 | 2) Diagnosis (M = malignant, B = benign) 113 | 3-32) 114 | 115 | Ten real-valued features are computed for each cell nucleus: 116 | 117 | a) radius (mean of distances from center to points on the perimeter) 118 | b) texture (standard deviation of gray-scale values) 119 | c) perimeter 120 | d) area 121 | e) smoothness (local variation in radius lengths) 122 | f) compactness (perimeter^2 / area - 1.0) 123 | g) concavity (severity of concave portions of the contour) 124 | h) concave points (number of concave portions of the contour) 125 | i) symmetry 126 | j) fractal dimension ("coastline approximation" - 1) 127 | 128 | Several of the papers listed above contain detailed descriptions of 129 | how these features are computed. 130 | 131 | The mean, standard error, and "worst" or largest (mean of the three 132 | largest values) of these features were computed for each image, 133 | resulting in 30 features. For instance, field 3 is Mean Radius, field 134 | 13 is Radius SE, field 23 is Worst Radius. 135 | 136 | All feature values are recoded with four significant digits. 137 | 138 | 8. Missing attribute values: none 139 | 140 | 9. 
Class distribution: 357 benign, 212 malignant -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/data/usedcars.csv: -------------------------------------------------------------------------------- 1 | year,model,price,mileage,color,transmission 2 | 2011,SEL,21992,7413,Yellow,AUTO 3 | 2011,SEL,20995,10926,Gray,AUTO 4 | 2011,SEL,19995,7351,Silver,AUTO 5 | 2011,SEL,17809,11613,Gray,AUTO 6 | 2012,SE,17500,8367,White,AUTO 7 | 2010,SEL,17495,25125,Silver,AUTO 8 | 2011,SEL,17000,27393,Blue,AUTO 9 | 2010,SEL,16995,21026,Silver,AUTO 10 | 2011,SES,16995,32655,Silver,AUTO 11 | 2010,SES,16995,36116,Silver,AUTO 12 | 2010,SES,16995,40539,Black,AUTO 13 | 2011,SES,16992,9199,Silver,AUTO 14 | 2011,SEL,16950,9388,Green,AUTO 15 | 2010,SES,16950,32058,Red,AUTO 16 | 2011,SE,16000,15367,White,AUTO 17 | 2011,SES,15999,16368,Blue,AUTO 18 | 2010,SEL,15999,19926,Silver,AUTO 19 | 2010,SES,15995,36049,Silver,AUTO 20 | 2011,SEL,15992,11662,Blue,AUTO 21 | 2011,SEL,15992,32069,Silver,AUTO 22 | 2010,SES,15988,16035,Silver,MANUAL 23 | 2010,SEL,15980,39943,White,AUTO 24 | 2011,SE,15899,36685,Silver,AUTO 25 | 2010,SEL,15889,24920,Black,AUTO 26 | 2009,SEL,15688,20019,Blue,AUTO 27 | 2010,SE,15500,29338,Blue,AUTO 28 | 2010,SE,15499,7784,Black,AUTO 29 | 2010,SE,15499,35636,Black,AUTO 30 | 2010,SES,15298,22029,Gray,AUTO 31 | 2009,SEL,14999,33107,Silver,AUTO 32 | 2010,SES,14999,36306,Red,AUTO 33 | 2009,SE,14995,34419,Black,MANUAL 34 | 2011,SE,14992,4867,Black,AUTO 35 | 2011,SEL,14992,18948,Black,AUTO 36 | 2009,SES,14992,24030,Red,AUTO 37 | 2010,SEL,14990,33036,Black,AUTO 38 | 2011,SE,14989,23967,White,AUTO 39 | 2010,SE,14906,37905,Silver,AUTO 40 | 2010,SE,14900,28955,White,AUTO 41 | 2010,SE,14893,11165,White,AUTO 42 | 2010,SES,14761,44813,Black,AUTO 43 | 2010,SES,14699,36469,Gray,AUTO 44 | 2010,SES,14677,22143,Black,MANUAL 45 | 2009,SES,14549,34046,Silver,AUTO 46 | 2010,SE,14499,32703,Red,AUTO 47 | 2010,SES,14495,35894,Silver,AUTO 48 | 2010,SE,14495,38275,Black,AUTO 49 | 2010,SE,14480,24855,Blue,AUTO 50 | 2009,SEL,14477,29501,Gray,MANUAL 51 | 2009,SEL,14355,35394,Red,AUTO 52 | 2010,SE,14299,36447,Black,AUTO 53 | 2010,SES,14275,35318,Black,AUTO 54 | 2010,SES,14000,24929,Silver,AUTO 55 | 2009,SE,13999,23785,Red,AUTO 56 | 2010,SE,13997,15167,Black,MANUAL 57 | 2010,SE,13995,13541,Silver,AUTO 58 | 2010,SE,13995,20278,Black,MANUAL 59 | 2009,SES,13995,46126,Black,AUTO 60 | 2009,SES,13995,53733,Silver,AUTO 61 | 2009,SES,13992,21108,Silver,AUTO 62 | 2010,SE,13992,21721,Green,AUTO 63 | 2010,SES,13992,26716,Gray,MANUAL 64 | 2009,SES,13992,26887,Black,AUTO 65 | 2009,SE,13991,36252,Silver,MANUAL 66 | 2009,SE,13950,9450,Black,AUTO 67 | 2010,SE,13950,31414,Black,AUTO 68 | 2010,SE,13950,37185,Blue,AUTO 69 | 2010,SE,13895,48174,Gray,AUTO 70 | 2009,SE,13888,50533,White,AUTO 71 | 2009,SE,13845,36713,Blue,AUTO 72 | 2009,SES,13799,34888,Black,AUTO 73 | 2009,SES,13742,38380,Black,AUTO 74 | 2010,SEL,13687,35574,Gray,AUTO 75 | 2009,SEL,13663,27528,Silver,AUTO 76 | 2010,SES,13599,33302,Red,AUTO 77 | 2009,SEL,13584,43369,Red,AUTO 78 | 2009,SES,13425,64055,Black,AUTO 79 | 2010,SE,13384,41342,Gray,AUTO 80 | 2010,SE,13383,34503,Black,AUTO 81 | 2010,SE,13350,16573,Blue,AUTO 82 | 2009,SES,12999,32403,Blue,AUTO 83 | 2009,SE,12998,34846,Blue,AUTO 84 | 2007,SE,12997,39665,Red,AUTO 85 | 2010,SE,12995,21325,Black,AUTO 86 | 2010,SE,12995,32743,Black,MANUAL 87 | 2010,SE,12995,40058,White,MANUAL 88 | 2009,SE,12995,42325,Blue,AUTO 89 | 2009,SE,12995,44518,Red,AUTO 90 | 2009,SE,12995,53902,Gray,AUTO 91 | 
2008,SE,12995,127327,Red,AUTO 92 | 2009,SE,12992,27136,Gray,AUTO 93 | 2009,SES,12990,45813,Silver,AUTO 94 | 2009,SE,12988,31538,Gray,AUTO 95 | 2010,SE,12849,29517,Silver,AUTO 96 | 2010,SE,12780,35871,Black,AUTO 97 | 2008,SE,12777,49787,Black,MANUAL 98 | 2008,SES,12704,36323,Blue,AUTO 99 | 2009,SES,12595,39211,Blue,AUTO 100 | 2009,SE,12507,44789,Gray,AUTO 101 | 2008,SE,12500,45996,White,MANUAL 102 | 2009,SE,12500,54988,White,MANUAL 103 | 2009,SE,12280,29288,Red,AUTO 104 | 2009,SE,11999,36124,Blue,AUTO 105 | 2009,SE,11992,32559,Black,MANUAL 106 | 2009,SES,11984,59048,Black,AUTO 107 | 2009,SE,11980,55170,Red,AUTO 108 | 2010,SE,11792,39722,Green,AUTO 109 | 2008,SE,11754,38286,Black,AUTO 110 | 2008,SES,11749,57341,Red,AUTO 111 | 2008,SES,11495,82221,Silver,AUTO 112 | 2008,SE,11450,85229,Red,MANUAL 113 | 2009,SES,10995,42834,Red,AUTO 114 | 2005,SES,10995,69415,Blue,AUTO 115 | 2009,SEL,10995,78264,Gray,AUTO 116 | 2009,SE,10979,60709,Red,AUTO 117 | 2008,SE,10955,39643,Gray,AUTO 118 | 2009,SE,10955,40180,Gold,AUTO 119 | 2008,SE,10836,40330,Green,MANUAL 120 | 2007,SES,10815,77231,Red,AUTO 121 | 2007,SE,10770,72937,Silver,MANUAL 122 | 2010,SE,10717,64199,Black,AUTO 123 | 2007,SES,10000,63926,Red,AUTO 124 | 2007,SES,9999,74427,Silver,AUTO 125 | 2007,SES,9999,78948,Black,MANUAL 126 | 2006,SE,9995,51311,Silver,AUTO 127 | 2008,SE,9995,95364,White,AUTO 128 | 2008,SE,9992,74109,White,AUTO 129 | 2007,SE,9651,63296,Blue,AUTO 130 | 2007,SES,9000,80605,Red,AUTO 131 | 2006,SE,8999,49656,Silver,AUTO 132 | 2007,SE,8996,48652,Silver,MANUAL 133 | 2006,SE,8800,71331,White,AUTO 134 | 2008,SE,8495,106171,Black,AUTO 135 | 2008,SE,8494,68901,Silver,AUTO 136 | 2009,SE,8480,70036,White,MANUAL 137 | 2007,SES,7999,81596,Yellow,MANUAL 138 | 2006,SES,7995,35000,Black,MANUAL 139 | 2006,SES,7995,97987,Red,AUTO 140 | 2003,SES,7900,96000,White,AUTO 141 | 2005,SES,7488,59013,Red,AUTO 142 | 2004,SE,6999,105714,Silver,AUTO 143 | 2007,SE,6995,86862,White,AUTO 144 | 2000,SE,6980,60161,Green,AUTO 145 | 2004,SES,6980,101130,Gray,AUTO 146 | 2004,SES,6950,119720,Black,AUTO 147 | 2006,SES,6200,95000,Silver,AUTO 148 | 2002,SE,5995,87003,Red,AUTO 149 | 2000,SE,5980,96841,Red,AUTO 150 | 2001,SE,4899,151479,Yellow,AUTO 151 | 2000,SE,3800,109259,Red,AUTO 152 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/Chapter_10.r: -------------------------------------------------------------------------------- 1 | ##### Chapter 10: Evaluating Model Performance ------------------- 2 | 3 | ## Confusion matrixes in R ---- 4 | 5 | ## Create the predicted probabilities from the SMS classifier built in Chapter 4. 
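# (Hedged sketch, not run here: the sms_results.csv read in below is assumed to have been
#  written out roughly like this from the Chapter 4 objects -- an e1071::naiveBayes model
#  `sms_classifier`, the test document-term matrix `sms_test`, and the label vector
#  `sms_test_labels`. Those object names come from Chapter 4, not from this script, which
#  is why the sketch stays commented out.)
# sms_test_prob <- predict(sms_classifier, sms_test, type = "raw")
# sms_results <- data.frame(
#   actual_type  = sms_test_labels,
#   predict_type = predict(sms_classifier, sms_test, type = "class"),
#   prob_spam    = round(sms_test_prob[, "spam"], 5),
#   prob_ham     = round(sms_test_prob[, "ham"], 5)
# )
# write.csv(sms_results, "sms_results.csv", row.names = FALSE)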
6 | sms_results <- read.csv("sms_results.csv", stringsAsFactors = TRUE) 7 | 8 | # the first several test cases 9 | head(sms_results) 10 | 11 | # test cases where the model is less confident 12 | head(subset(sms_results, prob_spam > 0.40 & prob_spam < 0.60)) 13 | 14 | # test cases where the model was wrong 15 | head(subset(sms_results, actual_type != predict_type)) 16 | 17 | # specifying vectors 18 | table(sms_results$actual_type, sms_results$predict_type) 19 | 20 | # alternative solution using the formula interface (not shown in book) 21 | xtabs(~ actual_type + predict_type, sms_results) 22 | 23 | # using the CrossTable function 24 | library(gmodels) 25 | CrossTable(sms_results$actual_type, sms_results$predict_type) 26 | 27 | # accuracy and error rate calculation -- 28 | # accuracy 29 | (152 + 1203) / (152 + 1203 + 4 + 31) 30 | # error rate 31 | (4 + 31) / (152 + 1203 + 4 + 31) 32 | # error rate = 1 - accuracy 33 | 1 - 0.9748201 34 | 35 | ## Beyond accuracy: other performance measures ---- 36 | library(caret) 37 | confusionMatrix(sms_results$predict_type, sms_results$actual_type, positive = "spam") 38 | 39 | # Kappa statistic 40 | # example using SMS classifier 41 | pr_a <- 0.865 + 0.109 42 | pr_a 43 | 44 | pr_e <- 0.868 * 0.888 + 0.132 * 0.112 45 | pr_e 46 | 47 | k <- (pr_a - pr_e) / (1 - pr_e) 48 | k 49 | 50 | # calculate kappa via the vcd package 51 | library(vcd) 52 | Kappa(table(sms_results$actual_type, sms_results$predict_type)) 53 | 54 | # calculate kappa via the irr package 55 | library(irr) 56 | kappa2(sms_results[1:2]) 57 | 58 | # Matthews correlation coefficient (MCC) 59 | # for SMS classifier: TN = 1203, FP = 4, FN = 31, TP = 152 60 | 61 | # MCC = (TP * TN - FP * FN) / 62 | # sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) 63 | 64 | # compute MCC by hand for SMS classifier 65 | (152 * 1203 - 4 * 31) / 66 | sqrt((152 + 4) * (152 + 31) * (1203 + 4) * (1203 + 31)) 67 | 68 | # compute MCC using mcc() function in mltools package 69 | library(mltools) 70 | mcc(sms_results$actual_type, sms_results$predict_type) 71 | 72 | # compute MCC using Pearson correlation 73 | cor(ifelse(sms_results$actual_type == "spam", 1, 0), 74 | ifelse(sms_results$predict_type == "spam", 1, 0)) 75 | 76 | # Sensitivity and specificity 77 | # example using SMS classifier 78 | sens <- 152 / (152 + 31) 79 | sens 80 | 81 | spec <- 1203 / (1203 + 4) 82 | spec 83 | 84 | # example using the caret package 85 | library(caret) 86 | sensitivity(sms_results$predict_type, sms_results$actual_type, positive = "spam") 87 | specificity(sms_results$predict_type, sms_results$actual_type, negative = "ham") 88 | 89 | # Precision and recall 90 | prec <- 152 / (152 + 4) 91 | prec 92 | 93 | rec <- 152 / (152 + 31) 94 | rec 95 | 96 | # example using the caret package 97 | library(caret) 98 | posPredValue(sms_results$predict_type, sms_results$actual_type, positive = "spam") 99 | sensitivity(sms_results$predict_type, sms_results$actual_type, positive = "spam") 100 | 101 | # F-measure 102 | f <- (2 * prec * rec) / (prec + rec) 103 | f 104 | 105 | f <- (2 * 152) / (2 * 152 + 4 + 31) 106 | f 107 | 108 | ## Visualizing Performance Tradeoffs ---- 109 | library(pROC) 110 | sms_roc <- roc(sms_results$actual_type, sms_results$prob_spam) 111 | 112 | # ROC curve for Naive Bayes 113 | plot(sms_roc, main = "ROC curve for SMS spam filter", 114 | col = "blue", lwd = 2, grid = TRUE, legacy.axes = TRUE) 115 | 116 | # compare to kNN 117 | sms_results_knn <- read.csv("sms_results_knn.csv") 118 | sms_roc_knn <- roc(sms_results$actual_type, 
sms_results_knn$p_spam) 119 | plot(sms_roc_knn, col = "red", lwd = 2, add = TRUE) 120 | 121 | # calculate AUC for Naive Bayes and kNN 122 | auc(sms_roc) 123 | auc(sms_roc_knn) 124 | 125 | ## Estimating Future Performance ---- 126 | 127 | # partitioning data 128 | library(caret) 129 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE) 130 | 131 | # Holdout method 132 | # using random IDs 133 | random_ids <- order(runif(1000)) 134 | credit_train <- credit[random_ids[1:500],] 135 | credit_validate <- credit[random_ids[501:750], ] 136 | credit_test <- credit[random_ids[751:1000], ] 137 | 138 | # using caret function 139 | in_train <- createDataPartition(credit$default, p = 0.75, list = FALSE) 140 | credit_train <- credit[in_train, ] 141 | credit_test <- credit[-in_train, ] 142 | 143 | # 10-fold CV 144 | set.seed(123) # to ensure results match 145 | folds <- createFolds(credit$default, k = 10) 146 | str(folds) 147 | credit01_test <- credit[folds$Fold01, ] 148 | credit01_train <- credit[-folds$Fold01, ] 149 | 150 | ## Automating 10-fold CV for a C5.0 Decision Tree using lapply() ---- 151 | library(caret) 152 | library(C50) 153 | library(irr) 154 | 155 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE) 156 | 157 | set.seed(123) 158 | folds <- createFolds(credit$default, k = 10) 159 | 160 | cv_results <- lapply(folds, function(x) { 161 | credit_train <- credit[-x, ] 162 | credit_test <- credit[x, ] 163 | credit_model <- C5.0(default ~ ., data = credit_train) 164 | credit_pred <- predict(credit_model, credit_test) 165 | credit_actual <- credit_test$default 166 | kappa <- kappa2(data.frame(credit_actual, credit_pred))$value 167 | return(kappa) 168 | }) 169 | 170 | # examine the results of the 10 trials 171 | str(cv_results) 172 | 173 | # compute the average kappa across the 10 trials 174 | mean(unlist(cv_results)) 175 | 176 | # compute the standard deviation across the 10 trials 177 | sd(unlist(cv_results)) 178 | 179 | ## Bootstrap sampling ---- 180 | 181 | # computing the probability that a record is unselected 182 | (1 - (1/1000))^1000 183 | (1 - (1/100000))^100000 184 | 1 / exp(1) # as n approaches infinity 185 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/hyperparameter_optimization/hyperparameter_optimization_with_tidymodels.R: -------------------------------------------------------------------------------- 1 | # Hyperparameter Optimization with tidymodels ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 11: 4 | # Improving Model Performance 5 | # 6 | # The original code is made with {caret}. I wanted to see how one could recreate 7 | # it using mainly {tidymodels} and {tidyverse}. 8 | # 9 | # You can find the original code and the slightly modified dataset here: 10 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter11 11 | 12 | ## 1. Loading libraries (in the order they get used) ---- 13 | library(conflicted) 14 | library(tidyverse) 15 | library(tidymodels) 16 | 17 | 18 | ## 2. Loading the dataset ---- 19 | credit_tbl <- read_csv("decision_trees_and_rules/data/credit.csv") 20 | 21 | 22 | ## 3. 
Creating the recipe and splitting the data ---- 23 | 24 | ### Convert strings to factors ---- 25 | recipe_obj <- recipe( 26 | default ~ ., 27 | data = credit_tbl 28 | ) %>% 29 | step_string2factor(all_nominal()) 30 | recipe_obj 31 | 32 | credit_factorized_tbl <- recipe_obj %>% 33 | prep() %>% 34 | bake(new_data = NULL) 35 | credit_factorized_tbl 36 | 37 | ### Create training and test data (randomly) ---- 38 | set.seed(123) 39 | credit_split <- initial_split( 40 | credit_factorized_tbl, 41 | prop = 0.9, 42 | strata = default 43 | ) 44 | 45 | credit_train <- training(credit_split) 46 | credit_test <- testing(credit_split) 47 | 48 | ### Check the proportion of class variable ---- 49 | credit_train %>% 50 | count(default) %>% 51 | mutate(pct = (n / sum(n) * 100)) 52 | 53 | credit_test %>% 54 | count(default) %>% 55 | mutate(pct = (n / sum(n) * 100)) 56 | 57 | 58 | ## 4. Creating a simple model ---- 59 | 60 | ### Create specifications with the default tuning parameters for rpart ---- 61 | # tree_depth = 30, min_n = 2 and cost_complexity = 0.01 62 | 63 | set.seed(123) 64 | tune_spec_simple <- decision_tree( 65 | mode = "classification", 66 | engine = "rpart" 67 | ) 68 | tune_spec_simple 69 | 70 | ### Fit the model ---- 71 | model_fit_simple <- fit( 72 | tune_spec_simple, 73 | default ~ ., 74 | credit_train 75 | ) 76 | 77 | model_fit_simple %>% 78 | extract_fit_engine() %>% 79 | summary() 80 | 81 | ### Make the predictions ---- 82 | credit_simple <- augment(model_fit_simple, credit_test) 83 | credit_simple 84 | 85 | ### Create a confusion matrix ---- 86 | conf_mat_simple <- conf_mat( 87 | data = credit_simple, 88 | truth = default, 89 | estimate = .pred_class 90 | ) 91 | conf_mat_simple 92 | 93 | ### Visualize the ROC curve ---- 94 | credit_simple %>% 95 | roc_curve( 96 | truth = default, 97 | estimate = .pred_no 98 | ) %>% 99 | autoplot() 100 | 101 | ### Calculate the ROC AUC (area under the curve) ---- 102 | credit_roc_auc_simple <- credit_simple %>% 103 | roc_auc( 104 | truth = default, 105 | estimate = .pred_no 106 | ) 107 | credit_roc_auc_simple 108 | 109 | ### Put together other model metrics ---- 110 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 111 | 112 | classification_metrics_simple <- conf_mat( 113 | credit_simple, 114 | truth = default, 115 | estimate = .pred_class 116 | ) %>% 117 | summary() 118 | classification_metrics_simple 119 | 120 | 121 | ## 5. Generating a tuning grid ---- 122 | 123 | ### Create a specification with tuning placeholders ---- 124 | set.seed(123) 125 | tune_spec <- decision_tree( 126 | mode = "classification", 127 | engine = "rpart", 128 | tree_depth = tune(), 129 | min_n = tune(), 130 | cost_complexity = tune() 131 | ) 132 | tune_spec 133 | 134 | ### Create a regular grid ---- 135 | tree_grid <- grid_regular( 136 | extract_parameter_set_dials(tune_spec), 137 | levels = 4 138 | ) 139 | tree_grid 140 | 141 | 142 | ## 6. Tuning along the grid ---- 143 | 144 | ### Create CV folds of the credit_train ---- 145 | folds <- vfold_cv( 146 | credit_train, 147 | v = 10 148 | ) 149 | folds 150 | 151 | ### Tune along the grid ---- 152 | tune_results <- tune_grid( 153 | tune_spec, 154 | default ~ ., 155 | resamples = folds, 156 | grid = tree_grid, 157 | metrics = metric_set(accuracy) 158 | ) 159 | 160 | ### Plot the tuning results ---- 161 | autoplot(tune_results) 162 | 163 | 164 | ## 7. 
Picking the winner ---- 165 | 166 | ### Select the parameters that perform best ---- 167 | final_params <- select_best(tune_results) 168 | final_params 169 | 170 | ### Finalize the specification ---- 171 | best_spec <- finalize_model(tune_spec, final_params) 172 | best_spec 173 | 174 | ### Build the final model ---- 175 | final_model <- fit( 176 | best_spec, 177 | default ~ ., 178 | credit_train 179 | ) 180 | final_model 181 | 182 | final_model %>% 183 | extract_fit_engine() %>% 184 | summary() 185 | 186 | ### Add the predictions to the test tibble ---- 187 | credit_with_pred_tbl <- augment(final_model, credit_test) 188 | credit_with_pred_tbl 189 | 190 | 191 | ## 8. Evaluating model performance ---- 192 | 193 | ### Create a confusion matrix ---- 194 | conf_mat <- conf_mat( 195 | data = credit_with_pred_tbl, 196 | truth = default, 197 | estimate = .pred_class 198 | ) 199 | conf_mat 200 | 201 | ### Visualize the ROC curve ---- 202 | credit_with_pred_tbl %>% 203 | roc_curve( 204 | truth = default, 205 | estimate = .pred_no 206 | ) %>% 207 | autoplot() 208 | 209 | ### Calculate the ROC AUC (area under the curve) ---- 210 | credit_roc_auc <- credit_with_pred_tbl %>% 211 | roc_auc( 212 | truth = default, 213 | estimate = .pred_no 214 | ) 215 | credit_roc_auc 216 | 217 | ### Put together other model metrics ---- 218 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 219 | classification_metrics <- conf_mat( 220 | credit_with_pred_tbl, 221 | truth = default, 222 | estimate = .pred_class 223 | ) %>% 224 | summary() 225 | classification_metrics 226 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/hyperparameter_optimization/hyperparameter_optimization_with_tidymodels_random.R: -------------------------------------------------------------------------------- 1 | # Hyperparameter Optimization with tidymodels ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 11: 4 | # Improving Model Performance 5 | # 6 | # The original code is made with {caret}. I wanted to see how one could recreate 7 | # it using mainly {tidymodels} and {tidyverse}. 8 | # 9 | # You can find the original code and the slightly modified dataset here: 10 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter11 11 | 12 | ## 1. Loading libraries (in the order they get used) ---- 13 | library(conflicted) 14 | library(tidyverse) 15 | library(tidymodels) 16 | 17 | 18 | ## 2. Loading the dataset ---- 19 | credit_tbl <- read_csv("decision_trees_and_rules/data/credit.csv") 20 | 21 | 22 | ## 3. Creating the recipe and splitting the data ---- 23 | 24 | ### Convert strings to factors ---- 25 | recipe_obj <- recipe( 26 | default ~ ., 27 | data = credit_tbl 28 | ) %>% 29 | step_string2factor(all_nominal()) 30 | recipe_obj 31 | 32 | credit_factorized_tbl <- recipe_obj %>% 33 | prep() %>% 34 | bake(new_data = NULL) 35 | credit_factorized_tbl 36 | 37 | ### Create training and test data (randomly) ---- 38 | set.seed(123) 39 | credit_split <- initial_split( 40 | credit_factorized_tbl, 41 | prop = 0.9, 42 | strata = default 43 | ) 44 | 45 | credit_train <- training(credit_split) 46 | credit_test <- testing(credit_split) 47 | 48 | ### Check the proportion of class variable ---- 49 | credit_train %>% 50 | count(default) %>% 51 | mutate(pct = (n / sum(n) * 100)) 52 | 53 | credit_test %>% 54 | count(default) %>% 55 | mutate(pct = (n / sum(n) * 100)) 56 | 57 | 58 | ## 4. 
Creating a simple model ---- 59 | 60 | ### Create specifications with the default tuning parameters for rpart ---- 61 | # tree_depth = 30, min_n = 2 and cost_complexity = 0.01 62 | 63 | set.seed(123) 64 | tune_spec_simple <- decision_tree( 65 | mode = "classification", 66 | engine = "rpart" 67 | ) 68 | tune_spec_simple 69 | 70 | ### Fit the model ---- 71 | model_fit_simple <- fit( 72 | tune_spec_simple, 73 | default ~ ., 74 | credit_train 75 | ) 76 | 77 | model_fit_simple %>% 78 | extract_fit_engine() %>% 79 | summary() 80 | 81 | ### Make the predictions ---- 82 | credit_simple <- augment(model_fit_simple, credit_test) 83 | credit_simple 84 | 85 | ### Create a confusion matrix ---- 86 | conf_mat_simple <- conf_mat( 87 | data = credit_simple, 88 | truth = default, 89 | estimate = .pred_class 90 | ) 91 | conf_mat_simple 92 | 93 | ### Visualize the ROC curve ---- 94 | credit_simple %>% 95 | roc_curve( 96 | truth = default, 97 | estimate = .pred_no 98 | ) %>% 99 | autoplot() 100 | 101 | ### Calculate the ROC AUC (area under the curve) ---- 102 | credit_roc_auc_simple <- credit_simple %>% 103 | roc_auc( 104 | truth = default, 105 | estimate = .pred_no 106 | ) 107 | credit_roc_auc_simple 108 | 109 | ### Put together other model metrics ---- 110 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 111 | 112 | classification_metrics_simple <- conf_mat( 113 | credit_simple, 114 | truth = default, 115 | estimate = .pred_class 116 | ) %>% 117 | summary() 118 | classification_metrics_simple 119 | 120 | 121 | ## 5. Generating a tuning grid ---- 122 | 123 | ### Create a specification with tuning placeholders ---- 124 | set.seed(123) 125 | tune_spec <- decision_tree( 126 | mode = "classification", 127 | engine = "rpart", 128 | tree_depth = tune(), 129 | min_n = tune(), 130 | cost_complexity = tune() 131 | ) 132 | tune_spec 133 | 134 | ### Create a random grid ---- 135 | tree_grid <- grid_random( 136 | extract_parameter_set_dials(tune_spec), 137 | size = 64 # grid_random() samples a fixed number of candidates (size), not levels; 64 matches the regular grid's 4^3 combinations 138 | ) 139 | tree_grid 140 | 141 | 142 | ## 6. Tuning along the grid ---- 143 | 144 | ### Create CV folds of the credit_train ---- 145 | folds <- vfold_cv( 146 | credit_train, 147 | v = 10 148 | ) 149 | folds 150 | 151 | ### Tune along the grid ---- 152 | tune_results <- tune_grid( 153 | tune_spec, 154 | default ~ ., 155 | resamples = folds, 156 | grid = tree_grid, 157 | metrics = metric_set(accuracy) 158 | ) 159 | 160 | ### Plot the tuning results ---- 161 | autoplot(tune_results) 162 | 163 | 164 | ## 7. Picking the winner ---- 165 | 166 | ### Select the parameters that perform best ---- 167 | final_params <- select_best(tune_results) 168 | final_params 169 | 170 | ### Finalize the specification ---- 171 | best_spec <- finalize_model(tune_spec, final_params) 172 | best_spec 173 | 174 | ### Build the final model ---- 175 | final_model <- fit( 176 | best_spec, 177 | default ~ ., 178 | credit_train 179 | ) 180 | final_model 181 | 182 | final_model %>% 183 | extract_fit_engine() %>% 184 | summary() 185 | 186 | ### Add the predictions to the test tibble ---- 187 | credit_with_pred_tbl <- augment(final_model, credit_test) 188 | credit_with_pred_tbl 189 | 190 | 191 | ## 8.
Evaluating model performance ---- 192 | 193 | ### Create a confusion matrix ---- 194 | conf_mat <- conf_mat( 195 | data = credit_with_pred_tbl, 196 | truth = default, 197 | estimate = .pred_class 198 | ) 199 | conf_mat 200 | 201 | ### Visualize the ROC curve ---- 202 | credit_with_pred_tbl %>% 203 | roc_curve( 204 | truth = default, 205 | estimate = .pred_no 206 | ) %>% 207 | autoplot() 208 | 209 | ### Calculate the ROC AUC (area under the curve) ---- 210 | credit_roc_auc <- credit_with_pred_tbl %>% 211 | roc_auc( 212 | truth = default, 213 | estimate = .pred_no 214 | ) 215 | credit_roc_auc 216 | 217 | ### Put together other model metrics ---- 218 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 219 | classification_metrics <- conf_mat( 220 | credit_with_pred_tbl, 221 | truth = default, 222 | estimate = .pred_class 223 | ) %>% 224 | summary() 225 | classification_metrics 226 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/Chapter_12.r: -------------------------------------------------------------------------------- 1 | ##### Chapter 12: Advanced Data Preparation -------------------- 2 | 3 | ## Exploring R's tidyverse ---- 4 | 5 | library(tidyverse) # load all tidyverse packages 6 | 7 | # convert the Titanic training dataset into a tibble 8 | library(tibble) # not necessary if tidyverse is already loaded 9 | titanic_csv <- read.csv("titanic_train.csv") 10 | titanic_tbl <- as_tibble(titanic_csv) 11 | titanic_tbl 12 | 13 | # read the titanic training dataset using readr 14 | library(readr) # not necessary if tidyverse is already loaded 15 | titanic_train <- read_csv("titanic_train.csv") 16 | 17 | # read the titanic training dataset using readxl 18 | library(readxl) 19 | titanic_train <- read_excel("titanic_train.xlsx") 20 | 21 | # preparing and piping data with dplyr 22 | library(dplyr) 23 | 24 | # filter for female rows only 25 | titanic_train |> filter(Sex == "female") 26 | 27 | # select only name, sex, and age columns 28 | titanic_train |> select(Name, Sex, Age) 29 | 30 | # combine multiple dplyr verbs and save output to a tibble 31 | titanic_women <- titanic_train |> 32 | filter(Sex == "female") |> 33 | select(Name, Sex, Age) |> 34 | arrange(Name) 35 | 36 | # create a new feature indicating elderly age 37 | titanic_train |> 38 | mutate(elderly = if_else(Age >= 65, 1, 0)) 39 | 40 | # create multiple features within the same mutate command 41 | titanic_train |> 42 | mutate( 43 | elderly = if_else(Age >= 65, 1, 0), 44 | child = if_else(Age < 18, 1, 0) 45 | ) 46 | 47 | # compute survival rate by gender 48 | titanic_train |> 49 | group_by(Sex) |> 50 | summarize(survival_rate = mean(Survived)) 51 | 52 | # compute average survival rate for children vs. 
non-children 53 | titanic_train |> 54 | filter(!is.na(Age)) |> 55 | mutate(child = if_else(Age < 18, 1, 0)) |> 56 | group_by(child) |> 57 | summarize(survival_rate = mean(Survived)) 58 | 59 | # transform the dataset and pipe into a decision tree 60 | library(rpart) 61 | m_titanic <- titanic_train |> 62 | filter(!is.na(Age)) |> 63 | mutate(AgeGroup = if_else(Age < 18, "Child", "Adult")) |> 64 | select(Survived, Pclass, Sex, AgeGroup) |> 65 | rpart(formula = Survived ~ ., data = _) 66 | 67 | library(rpart.plot) 68 | rpart.plot(m_titanic) 69 | 70 | ## Transforming text with stringr ---- 71 | library(readr) 72 | titanic_train <- read_csv("titanic_train.csv") 73 | 74 | library(stringr) 75 | 76 | # examine cabin prefix code 77 | titanic_train <- titanic_train |> 78 | mutate(CabinCode = str_sub(Cabin, start = 1, end = 1)) 79 | 80 | # compare cabin prefix to passenger class 81 | table(titanic_train$Pclass, titanic_train$CabinCode, 82 | useNA = "ifany") 83 | 84 | # plot of survival probability by cabin code 85 | library(ggplot2) 86 | titanic_train |> ggplot() + 87 | geom_bar(aes(x = CabinCode, y = Survived), 88 | stat = "summary", fun = "mean") + 89 | ggtitle("Titanic Survival Rate by Cabin Code") 90 | 91 | # look at the first few passenger names 92 | head(titanic_train$Name) 93 | 94 | # create a title / salutation feature 95 | titanic_train <- titanic_train |> 96 | # use regular expressions to find the characters between the comma and period 97 | mutate(Title = str_extract(Name, ", [A-z]+\\.")) 98 | 99 | # look at the first few examples 100 | head(titanic_train$Title) 101 | 102 | # clean up the title feature 103 | titanic_train <- titanic_train |> 104 | mutate(Title = str_replace_all(Title, "[, \\.]", "")) 105 | 106 | # examine output 107 | table(titanic_train$Title) 108 | 109 | # group titles into related categories 110 | titanic_train <- titanic_train |> 111 | mutate(TitleGroup = recode(Title, 112 | # the first few stay the same 113 | "Mr" = "Mr", "Mrs" = "Mrs", "Master" = "Master", 114 | "Miss" = "Miss", 115 | # combine variants of "Miss" 116 | "Ms" = "Miss", "Mlle" = "Miss", "Mme" = "Miss", 117 | # anything else will be "Other" 118 | .missing = "Other", 119 | .default = "Other" 120 | ) 121 | ) 122 | 123 | # examine output 124 | table(titanic_train$TitleGroup) 125 | 126 | # plot of survival probability by title group 127 | library(ggplot2) 128 | titanic_train |> ggplot() + 129 | geom_bar(aes(x = TitleGroup, y = Survived), 130 | stat = "summary", fun = "mean") + 131 | ggtitle("Titanic Survival Rate by Salutation") 132 | 133 | ## Cleaning dates with lubridate ---- 134 | 135 | library(lubridate) 136 | 137 | # reading in Machine Learning with R publication dates in different formats 138 | mdy(c("October 25, 2013", "10/25/2013")) 139 | dmy(c("25 October 2013", "25.10.13")) 140 | ymd("2013-10-25") 141 | 142 | # construct MLwR publication dates 143 | MLwR_1stEd <- mdy("October 25, 2013") 144 | MLwR_2ndEd <- mdy("July 31, 2015") 145 | MLwR_3rdEd <- mdy("April 15, 2019") 146 | 147 | # compute differences (returns a difftime object) 148 | MLwR_2ndEd - MLwR_1stEd 149 | MLwR_3rdEd - MLwR_2ndEd 150 | 151 | # convert the differences to durations 152 | as.duration(MLwR_2ndEd - MLwR_1stEd) 153 | as.duration(MLwR_3rdEd - MLwR_2ndEd) 154 | 155 | # convert the duration to years 156 | dyears() 157 | as.duration(MLwR_2ndEd - MLwR_1stEd) / dyears() 158 | as.duration(MLwR_3rdEd - MLwR_2ndEd) / dyears() 159 | 160 | # easier-to-remember version of the above: 161 | time_length(MLwR_2ndEd - MLwR_1stEd, unit = "years") 
162 | time_length(MLwR_3rdEd - MLwR_2ndEd, unit = "years") 163 | 164 | # compute age (in duration) 165 | USA_DOB <- mdy("July 4, 1776") # USA's Date of Birth 166 | time_length(mdy("July 3 2023") - USA_DOB, unit = "years") 167 | time_length(mdy("July 5 2023") - USA_DOB, unit = "years") 168 | 169 | # compute age (using intervals) 170 | interval(USA_DOB, mdy("July 3 2023")) / years() 171 | interval(USA_DOB, mdy("July 5 2023")) / years() 172 | 173 | # compute age (using integer divison) 174 | USA_DOB %--% mdy("July 3 2023") %/% years() 175 | USA_DOB %--% mdy("July 5 2023") %/% years() 176 | 177 | # function to compute calendar age 178 | age <- function(birthdate) { 179 | birthdate %--% today() %/% years() 180 | } 181 | 182 | # compute age of celebrities 183 | age(mdy("Jan 12, 1964")) # Jeff Bezos 184 | age(mdy("June 28, 1971")) # Elon Musk 185 | age(mdy("Oct 28, 1955")) # Bill Gates 186 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/boosting/boosting_with_tidymodels.R: -------------------------------------------------------------------------------- 1 | # Boosting with tidymodels ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, 4 | # Chapter 5: Divide and Conquer - Classification Using Decision Trees and Rules and 5 | # Chapter 10: Evaluating Model Performance 6 | # 7 | # The original code is made with {C50}, {gmodels}, {OneR} and {RWeka}. I 8 | # wanted to see how one could recreate it using mainly {tidymodels} and 9 | # {tidyverse}. 10 | # 11 | # You can find the original code and the slightly modified dataset here: 12 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter05 13 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter10 14 | 15 | ## 1. Loading libraries (in the order they get used) ---- 16 | library(conflicted) 17 | library(tidyverse) 18 | library(tidymodels) 19 | 20 | ## 2. Exploring and preparing the data ---- 21 | credit_tbl <- read_csv("decision_trees_and_rules/data/credit.csv") 22 | 23 | ### Examine the structure of the credit data ---- 24 | glimpse(credit_tbl) 25 | 26 | ### look at two characteristics of the applicant ---- 27 | credit_tbl %>% 28 | count(checking_balance) %>% 29 | mutate(pct = (n / sum(n) * 100)) 30 | 31 | credit_tbl %>% 32 | count(savings_balance) %>% 33 | mutate(pct = (n / sum(n) * 100)) 34 | 35 | ### look at two characteristics of the loan ---- 36 | credit_tbl %>% 37 | select(months_loan_duration, amount) %>% 38 | summary() 39 | 40 | ### look at the class variable ---- 41 | credit_tbl %>% 42 | count(default) %>% 43 | mutate(pct = (n / sum(n) * 100)) 44 | 45 | 46 | ## 3. 
Creating the recipe and splitting the data ---- 47 | 48 | ### Convert strings to factors ---- 49 | recipe_obj <- recipe( 50 | default ~ ., 51 | data = credit_tbl 52 | ) %>% 53 | step_string2factor(all_nominal()) 54 | recipe_obj 55 | 56 | credit_factorized_tbl <- recipe_obj %>% 57 | prep() %>% 58 | bake(new_data = NULL) 59 | credit_factorized_tbl 60 | 61 | ### Create training and test data (randomly) ---- 62 | 63 | # Use set.seed to use the same random number sequence as the original 64 | RNGversion("3.5.2") 65 | set.seed(123) 66 | 67 | credit_split <- initial_split( 68 | credit_factorized_tbl, 69 | prop = 0.9 70 | ) 71 | credit_train <- training(credit_split) 72 | credit_test <- testing(credit_split) 73 | 74 | ### Check the proportion of class variable ---- 75 | credit_train %>% 76 | count(default) %>% 77 | mutate(pct = (n / sum(n) * 100)) 78 | 79 | credit_test %>% 80 | count(default) %>% 81 | mutate(pct = (n / sum(n) * 100)) 82 | 83 | 84 | ## 4. Building a boosted ensemble ---- 85 | 86 | ### Specify the model class ---- 87 | boost_spec <- boost_tree() %>% 88 | # Set the mode 89 | set_mode("classification") %>% 90 | # Set the engine 91 | set_engine("xgboost") 92 | boost_spec 93 | 94 | 95 | ## 5. Evaluate the ensemble ---- 96 | set.seed(99) 97 | 98 | ### Create CV folds ---- 99 | folds <- vfold_cv(data = credit_train, v = 10) 100 | folds 101 | 102 | ### Fit and evaluate models for all folds ---- 103 | cv_results <- fit_resamples( 104 | boost_spec, 105 | default ~ ., 106 | resamples = folds, 107 | metrics = metric_set(roc_auc) 108 | ) 109 | 110 | ### Collect cross-validated metrics ---- 111 | collect_metrics(cv_results) 112 | 113 | 114 | ## 6. Tuning preparation ---- 115 | 116 | ### Create the specification with placeholders ---- 117 | boost_spec <- boost_tree( 118 | trees = 500, 119 | learn_rate = tune(), 120 | tree_depth = tune(), 121 | sample_size = tune()) %>% 122 | set_mode("classification") %>% 123 | set_engine("xgboost") 124 | 125 | ### Create the tuning grid ---- 126 | tunegrid_boost <- grid_regular( 127 | extract_parameter_set_dials(boost_spec), 128 | levels = 3 129 | ) 130 | tunegrid_boost 131 | 132 | 133 | ## 7. The actual tuning ---- 134 | 135 | ### Create CV folds of training data ---- 136 | folds <- vfold_cv( 137 | credit_train, 138 | v = 10 139 | ) 140 | 141 | ### Tune along the grid ---- 142 | tune_results <- tune_grid( 143 | boost_spec, 144 | default ~ ., 145 | resamples = folds, 146 | grid = tunegrid_boost, 147 | metrics = metric_set(roc_auc) 148 | ) 149 | 150 | ### Plot the results ---- 151 | autoplot(tune_results) 152 | 153 | 154 | ## 8. Finalizing the model ---- 155 | 156 | ### Select the final hyperparameters ---- 157 | best_params <- select_best(tune_results) 158 | best_params 159 | 160 | ### Finalize the specification ---- 161 | final_spec <- finalize_model(boost_spec, best_params) 162 | final_spec 163 | 164 | ### Train the final model on the full training data ---- 165 | final_model <- final_spec %>% 166 | fit( 167 | formula = default ~ ., 168 | data = credit_train 169 | ) 170 | final_model 171 | 172 | # TODO: continue from here, with next steps to adapt from this course: 173 | # https://app.datacamp.com/learn/courses/machine-learning-with-tree-based-models-in-r 174 | 175 | # ### Train the forest ---- 176 | # model <- spec %>% 177 | # fit( 178 | # default ~ ., 179 | # data = credit_train 180 | # ) 181 | # model 182 | # 183 | # ### Plot the variable importance 184 | # vip::vip(model) 185 | # 186 | # 187 | # ## 5.
Predicting ---- 188 | # 189 | # ### Make the predictions (you could skip this step) ---- 190 | # credit_test_pred <- predict( 191 | # object = model, 192 | # new_data = credit_test, 193 | # type = "class" 194 | # ) 195 | # credit_test_pred 196 | # 197 | # ### Add the predictions to the test tibble ---- 198 | # credit_test_with_pred_tbl <- augment(model, credit_test) 199 | # credit_test_with_pred_tbl 200 | # 201 | # 202 | # ## 6. Evaluating model performance ---- 203 | # 204 | # ### Create a confusion matrix ---- 205 | # conf_mat <- conf_mat( 206 | # data = credit_test_with_pred_tbl, 207 | # truth = default, 208 | # estimate = .pred_class 209 | # ) 210 | # conf_mat 211 | # 212 | # ### Visualize the confusion matrix ---- 213 | # conf_mat %>% autoplot(type = "heatmap") 214 | # conf_mat %>% autoplot(type = "mosaic") 215 | # 216 | # ### Visualize the ROC curve ---- 217 | # credit_test_with_pred_tbl %>% 218 | # roc_curve( 219 | # truth = default, 220 | # estimate = .pred_no 221 | # ) %>% 222 | # autoplot() 223 | # 224 | # ### Calculate the ROC AUC (area under the curve) ---- 225 | # credit_roc_auc <- credit_test_with_pred_tbl %>% 226 | # roc_auc( 227 | # truth = default, 228 | # estimate = .pred_no 229 | # ) 230 | # credit_roc_auc 231 | # 232 | # ### Put together other model metrics ---- 233 | # # Such as accuracy, Matthews correlation coefficient (mcc) and others... 234 | # classification_metrics <- conf_mat( 235 | # credit_test_with_pred_tbl, 236 | # truth = default, 237 | # estimate = .pred_class 238 | # ) %>% 239 | # summary() 240 | # classification_metrics 241 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/support_vector_machines/performing_optical_character_recognition_with_support_vector_machines_.r: -------------------------------------------------------------------------------- 1 | # Performing Optical Character Recognition with Support Vector Machines ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 7: 4 | # Black Box Methods - Neural Networks and Support Vector Machines 5 | # 6 | # The original code is made with {kernlab}. I wanted to see how one could 7 | # recreate it using mainly {tidymodels} and {tidyverse}. 8 | # 9 | # You can find the original code and the dataset here: 10 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter07 11 | 12 | ## 1. Loading libraries (in the order they get used) ---- 13 | library(conflicted) 14 | library(tidyverse) 15 | library(tidymodels) 16 | library(janitor) 17 | 18 | 19 | ## 2. Exploring and preparing the data ---- 20 | 21 | ### Read in data and examine structure ---- 22 | letters_tbl <- read_csv("machine_learning_with_r_3rd_ed/support_vector_machines/data/letterdata.csv") %>% 23 | mutate(across(where(is.character), as.factor)) 24 | 25 | letters_tbl %>% 26 | glimpse() 27 | 28 | 29 | ## 3. Creating the recipe ---- 30 | 31 | ### Apply normalization to entire data frame ---- 32 | recipe_obj <- recipe( 33 | letter ~ ., 34 | data = letters_tbl 35 | ) 36 | recipe_obj 37 | 38 | letters_baked_tbl <- recipe_obj %>% 39 | prep() %>% 40 | bake(new_data = NULL) 41 | letters_baked_tbl 42 | 43 | ### Create training and test data ---- 44 | letters_split <- initial_time_split( 45 | letters_baked_tbl, 46 | prop = 16000 / 20000 47 | ) 48 | letters_train <- training(letters_split) 49 | letters_test <- testing(letters_split) 50 | 51 | 52 | ## 4. 
Training a model on the data ---- 53 | 54 | ### Create model specification ---- 55 | model_spec <- svm_linear( 56 | engine = "kernlab", 57 | mode = "classification", 58 | cost = NULL, 59 | margin = NULL 60 | ) %>% 61 | translate() 62 | model_spec 63 | 64 | ### Fit the model ---- 65 | model_fit <- fit( 66 | model_spec, 67 | letter ~ ., 68 | letters_train 69 | ) 70 | model_fit 71 | 72 | ### Make the predictions (you could skip this step) ---- 73 | letters_test_pred <- predict( 74 | model_fit, 75 | new_data = letters_test 76 | ) 77 | letters_test_pred 78 | 79 | ### Add the predictions to the test tibble ---- 80 | letters_test_with_pred <- augment(model_fit, letters_test) 81 | letters_test_with_pred 82 | 83 | 84 | ## 5. Evaluating model performance ---- 85 | 86 | ### Predictions on testing dataset ---- 87 | letters_test_with_pred %>% 88 | tabyl(letter, .pred_class) 89 | 90 | ### Look only at agreement vs. non-agreement ---- 91 | # Construct a vector of TRUE/FALSE indicating correct/incorrect predictions 92 | letters_test_with_pred %>% 93 | mutate( 94 | agreement = case_when( 95 | letter == .pred_class ~ TRUE, 96 | TRUE ~ FALSE 97 | ) 98 | ) %>% 99 | tabyl(agreement) 100 | 101 | 102 | ## 6. Improving model performance ---- 103 | 104 | ### Change to a RBF kernel ---- 105 | model_spec_rbf <- svm_rbf( 106 | engine = "kernlab", 107 | mode = "classification", 108 | cost = NULL, 109 | margin = NULL, 110 | rbf_sigma = NULL 111 | ) %>% 112 | translate() 113 | model_spec_rbf 114 | 115 | ### Fit the model ---- 116 | model_fit_rbf <- fit( 117 | model_spec_rbf, 118 | letter ~ ., 119 | letters_train 120 | ) 121 | model_fit_rbf 122 | 123 | ### Make the predictions (you could skip this step) ---- 124 | letters_test_pred_rbf <- predict( 125 | model_fit_rbf, 126 | new_data = letters_test 127 | ) 128 | letters_test_pred_rbf 129 | 130 | ### Add the predictions to the test tibble ---- 131 | letters_test_with_pred_rbf <- augment(model_fit_rbf, letters_test) 132 | letters_test_with_pred_rbf 133 | 134 | ### Predictions on testing dataset ---- 135 | letters_test_with_pred_rbf %>% 136 | tabyl(letter, .pred_class) 137 | 138 | ### Look only at agreement vs. 
non-agreement ---- 139 | # Construct a vector of TRUE/FALSE indicating correct/incorrect predictions 140 | letters_test_with_pred_rbf %>% 141 | mutate( 142 | agreement = case_when( 143 | letter == .pred_class ~ TRUE, 144 | TRUE ~ FALSE 145 | ) 146 | ) %>% 147 | tabyl(agreement) 148 | 149 | ### Test various values of the cost parameter ---- 150 | cost_values <- c(1, seq(from = 5, to = 40, by = 5)) 151 | 152 | accuracy_values <- map_dbl(cost_values, function(x) { 153 | 154 | model_spec_rbf <- svm_rbf( 155 | engine = "kernlab", 156 | mode = "classification", 157 | cost = {{ x }}, 158 | margin = NULL, 159 | rbf_sigma = NULL 160 | ) %>% 161 | translate() 162 | 163 | model_fit_rbf <- fit( 164 | model_spec_rbf, 165 | letter ~ ., 166 | letters_train 167 | ) 168 | 169 | letters_test_pred_rbf <- predict( 170 | model_fit_rbf, 171 | new_data = letters_test 172 | ) %>% as_vector() 173 | 174 | agree <- ifelse(letters_test_pred_rbf == letters_test$letter, 1, 0) 175 | 176 | accuracy <- sum(agree) / nrow(letters_test) 177 | 178 | return(accuracy) 179 | 180 | }) 181 | 182 | ### Bind together the cost parameter and accuracy values ---- 183 | cost_vs_accuracy_tbl <- bind_cols( 184 | cost_values, 185 | accuracy_values, 186 | ) %>% 187 | rename( 188 | cost_values = ...1, 189 | accuracy_values = ...2 190 | ) 191 | 192 | ### Visualize to find the optimal cost parameter value ---- 193 | cost_vs_accuracy_tbl %>% 194 | ggplot(aes(cost_values, accuracy_values)) + 195 | geom_line() + 196 | geom_point() + 197 | theme_bw() + 198 | labs( 199 | x = "Cost Parameter Value", 200 | y = "Accuracy" 201 | ) 202 | 203 | ### Make sure you have the right optimal cost value for the best accuracy ---- 204 | cost_vs_accuracy_tbl %>% 205 | slice_max(accuracy_values) 206 | 207 | ### Pull the first cost_value that has the max accuracy_value ---- 208 | .max_accuracy <- cost_vs_accuracy_tbl %>% 209 | slice_max(accuracy_values) %>% 210 | slice(1) %>% 211 | pull(cost_values) 212 | 213 | 214 | ## 7. Fitting the model with the optimal cost value (that was just pulled) ---- 215 | 216 | ### Give model specification ---- 217 | model_spec_best <- svm_rbf( 218 | engine = "kernlab", 219 | mode = "classification", 220 | cost = {{ .max_accuracy }}, 221 | margin = NULL, 222 | rbf_sigma = NULL 223 | ) %>% 224 | translate() 225 | 226 | ### Fit the model ---- 227 | model_fit_best <- fit( 228 | model_spec_best, 229 | letter ~ ., 230 | letters_train 231 | ) 232 | 233 | ### Add the predictions to the test tibble ---- 234 | letters_test_with_pred_best <- augment(model_fit_best, letters_test) 235 | 236 | ### Predictions on testing dataset ---- 237 | letters_test_with_pred_best %>% 238 | tabyl(letter, .pred_class) 239 | 240 | ### Look only at agreement vs. non-agreement ---- 241 | # Construct a vector of TRUE/FALSE indicating correct/incorrect predictions 242 | letters_test_with_pred_best %>% 243 | mutate( 244 | agreement = case_when( 245 | letter == .pred_class ~ TRUE, 246 | TRUE ~ FALSE 247 | ) 248 | ) %>% 249 | tabyl(agreement) 250 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/k_means_clustering/finding_teen_market_segments_using_k_means_clustering.r: -------------------------------------------------------------------------------- 1 | # Finding Teen Market Segments Using k-means Clustering ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 9: 4 | # Finding Groups of Data - Clustering with k-means 5 | # 6 | # The original code is made with base R. 
While {tidymodels} doesn't have a 7 | # k-means engine, I wanted to see how one could still recreate the code using 8 | # {tidyverse} as much as possible and even using the {recipes} package from the 9 | # {tidymodels} to do all the pre-processing needed and {broom} package from the 10 | # same {tidymodels} family to help look at the model metrics in a more tidy way. 11 | # 12 | # I was also inspired by this blog post from the tidymodels website to continue 13 | # the exercise a bit further than the original code did: 14 | # https://www.tidymodels.org/learn/statistics/k-means/ 15 | # 16 | # You can find the original code and the dataset here: 17 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter09 18 | 19 | ## 1. Loading libraries ---- 20 | library(conflicted) 21 | library(tidyverse) 22 | library(janitor) 23 | library(tidymodels) 24 | 25 | 26 | ## 2. Exploring and preparing the data ---- 27 | teens_tbl <- read_csv("machine_learning_with_r_3rd_ed/k_means_clustering/data/snsdata.csv") 28 | 29 | teens_tbl %>% 30 | glimpse() 31 | 32 | ### Look at missing data for age variable ---- 33 | teens_tbl %>% 34 | select(age) %>% 35 | summary() 36 | 37 | ### Eliminate age outliers ---- 38 | teens_only_tbl <- teens_tbl %>% 39 | mutate( 40 | age = case_when( 41 | between(age, 13, 20) ~ age, 42 | TRUE ~ NA_real_ 43 | ) 44 | ) 45 | 46 | teens_only_tbl %>% 47 | select(age) %>% 48 | summary() 49 | 50 | ### Look at missing data for female variable ---- 51 | teens_only_tbl %>% 52 | tabyl(gender) 53 | 54 | ### Reassign missing gender values to "unknown" and change into factors ---- 55 | teens_gendered_tbl <- teens_only_tbl %>% 56 | mutate( 57 | gender = case_when( 58 | gender %in% c("F", "M") ~ gender, 59 | TRUE ~ "gender_unknown" 60 | ) %>% 61 | as.factor() 62 | ) 63 | 64 | ### Check our recoding work ---- 65 | teens_gendered_tbl %>% 66 | tabyl(gender) %>% 67 | arrange(desc(percent)) 68 | 69 | ### Finding the mean age by cohort ---- 70 | # Doesn't work because of the NAs 71 | teens_gendered_tbl %>% 72 | pull(age) %>% 73 | mean() 74 | 75 | # Works thanks to the na.rm = TRUE 76 | teens_gendered_tbl %>% 77 | pull(age) %>% 78 | mean(na.rm = TRUE) 79 | 80 | ### Age by cohort ---- 81 | teens_gendered_tbl %>% 82 | with_groups( 83 | gradyear, 84 | summarize, 85 | age = mean(age, na.rm = TRUE) 86 | ) 87 | 88 | ### Create a vector with the average age for each gradyear, repeated by person ---- 89 | average_age_by_gradyear <- teens_gendered_tbl %>% 90 | with_groups( 91 | gradyear, 92 | mutate, 93 | ave_age = mean(age, na.rm = TRUE) 94 | ) %>% 95 | pull() 96 | 97 | ### Impute the missing age values with the average age by gradyear ---- 98 | teens_imputed_tbl <- teens_gendered_tbl %>% 99 | mutate( 100 | age = case_when( 101 | !is.na(age) ~ age, 102 | TRUE ~ average_age_by_gradyear 103 | ) 104 | ) 105 | 106 | ### Check the summary results to ensure missing values are eliminated ---- 107 | teens_imputed_tbl %>% 108 | select(age) %>% 109 | summary() 110 | 111 | 112 | ## 3. Creating the recipe ---- 113 | 114 | ### Apply normalization to entire data frame ---- 115 | recipe_obj <- recipe(teens_imputed_tbl) %>% 116 | step_normalize( 117 | all_numeric(), 118 | -c(gradyear, age, friends) 119 | ) %>% 120 | step_dummy( 121 | gender, 122 | keep_original_cols = TRUE, 123 | one_hot = TRUE 124 | ) 125 | recipe_obj 126 | 127 | recipe_baked_tbl <- recipe_obj %>% 128 | prep() %>% 129 | bake(new_data = NULL) 130 | recipe_baked_tbl 131 | 132 | 133 | ## 4. 
Training a model on the data ---- 134 | 135 | ### Create a z-score standardized data frame for easier interpretation ---- 136 | interests_tbl <- recipe_baked_tbl %>% 137 | select(5:40) 138 | 139 | ### Compare the data before and after the transformation ---- 140 | 141 | # Before 142 | teens_imputed_tbl %>% 143 | select(basketball) %>% 144 | summary() 145 | 146 | # After 147 | interests_tbl %>% 148 | select(basketball) %>% 149 | summary() 150 | 151 | ### Create the clusters using k-means ---- 152 | RNGversion("3.5.2") 153 | set.seed(2345) 154 | 155 | teens_clusters <- interests_tbl %>% 156 | kmeans(5) 157 | teens_clusters 158 | 159 | 160 | ## 5. Evaluating model performance ---- 161 | 162 | ### Look at the single-row summary ---- 163 | teens_clusters %>% 164 | glance() 165 | 166 | ### Look at the size and the centers of the clusters ---- 167 | teens_clusters %>% 168 | tidy() %>% 169 | select(cluster, size, withinss, everything()) 170 | 171 | 172 | ## 6. Improving model performance ---- 173 | 174 | ### Apply the cluster IDs to the original data frame ---- 175 | teens_and_clusters <- augment(teens_clusters, recipe_baked_tbl) 176 | teens_and_clusters 177 | 178 | ### Look at the first five records ---- 179 | teens_and_clusters %>% 180 | select(.cluster, gender, age, friends) %>% 181 | slice_head(n = 5) 182 | 183 | ### Mean age by cluster ---- 184 | teens_and_clusters %>% 185 | with_groups( 186 | .cluster, 187 | summarize, 188 | age = mean(age) 189 | ) 190 | 191 | ### Proportion of females by cluster ---- 192 | teens_and_clusters %>% 193 | with_groups( 194 | .cluster, 195 | summarize, 196 | gender_F = mean(gender_F) 197 | ) 198 | 199 | ### Mean number of friends by cluster ---- 200 | teens_and_clusters %>% 201 | with_groups( 202 | .cluster, 203 | summarize, 204 | friends = mean(friends) 205 | ) 206 | 207 | 208 | ## 7. 
K-means clustering with tidy data principles ---- 209 | 210 | ### Exploratory clustering ---- 211 | kclusts <- 212 | tibble(k = 1:9) %>% 213 | mutate( 214 | kclust = map(k, ~kmeans(interests_tbl, .x)), 215 | tidied = map(kclust, tidy), 216 | glanced = map(kclust, glance), 217 | augmented = map(kclust, augment, recipe_baked_tbl) 218 | ) 219 | 220 | ### Create three separate datasets ---- 221 | clusters <- kclusts %>% 222 | unnest(cols = c(tidied)) 223 | 224 | assignments <- kclusts %>% 225 | unnest(cols = c(augmented)) 226 | 227 | clusterings <- kclusts %>% 228 | unnest(cols = c(glanced)) 229 | 230 | ### Plot the original points ---- 231 | p1 <- assignments %>% 232 | ggplot(aes(sports, music)) + 233 | geom_point(aes(color = .cluster), alpha = 0.5) + 234 | facet_wrap(vars(k)) + 235 | theme_bw() 236 | p1 237 | 238 | p2 <- p1 + 239 | geom_point(data = clusters, size = 5, shape = "x") 240 | p2 241 | 242 | 243 | ### Plot the total within sum of squares (tot.withinss) ---- 244 | clusterings %>% 245 | ggplot(aes(k, tot.withinss)) + 246 | geom_line() + 247 | geom_point() + 248 | theme_bw() + 249 | scale_x_continuous(breaks = seq(1, 9, by = 1)) + 250 | labs( 251 | x = "k", 252 | y = "Total Within Sum of Squares" 253 | ) 254 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_09_finding_groups_of_data_clustering_with_k-means.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 9: Finding Groups of Data - Clustering with k-means" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Loading Packages 13 | 14 | ```{r} 15 | library(conflicted) 16 | library(janitor) 17 | library(tidyclust) 18 | library(tidymodels) 19 | library(tidyverse) 20 | ``` 21 | 22 | ## Exploring and preparing the data 23 | 24 | ```{r} 25 | teens_tbl <- read_csv("data/snsdata.csv") %>% 26 | mutate(across(where(is.character), as.factor)) 27 | ``` 28 | 29 | ```{r} 30 | teens_tbl %>% 31 | glimpse() 32 | ``` 33 | 34 | ### Look at missing data for age variable 35 | 36 | ```{r} 37 | teens_tbl %>% 38 | pull(age) %>% 39 | summary() 40 | ``` 41 | 42 | ### Eliminate age outliers 43 | 44 | ```{r} 45 | teens_only_tbl <- teens_tbl %>% 46 | mutate( 47 | age = case_when( 48 | between(age, 13, 20) ~ age, 49 | .default = NA_real_ 50 | ) 51 | ) 52 | ``` 53 | 54 | ```{r} 55 | teens_only_tbl %>% 56 | pull(age) %>% 57 | summary() 58 | ``` 59 | 60 | ### Look at missing data for female variable 61 | 62 | ```{r} 63 | teens_only_tbl %>% 64 | tabyl(gender) %>% 65 | adorn_pct_formatting(digits = 1) 66 | ``` 67 | 68 | ### Reassign missing gender values to "unknown" and change into factors 69 | 70 | ```{r} 71 | teens_gendered_tbl <- teens_only_tbl %>% 72 | mutate( 73 | gender = case_when( 74 | gender %in% c("F", "M") ~ gender, 75 | .default = "gender_unknown" 76 | ) %>% 77 | as.factor() 78 | ) 79 | ``` 80 | 81 | ### Check our recoding work 82 | 83 | ```{r} 84 | teens_gendered_tbl %>% 85 | tabyl(gender) %>% 86 | adorn_pct_formatting(digits = 1) %>% 87 | arrange(desc(percent)) 88 | ``` 89 | 90 | ### Finding the mean age by cohort 91 | 92 | Doesn't work because of the NAs 93 | 94 | ```{r} 95 | teens_gendered_tbl %>% 96 | pull(age) %>% 97 | mean() 98 | ``` 99 | 100 | Works thanks to the na.rm = TRUE 101 | 102 | ```{r} 
103 | teens_gendered_tbl %>% 104 | pull(age) %>% 105 | mean(na.rm = TRUE) 106 | ``` 107 | 108 | ### Age by cohort 109 | 110 | ```{r} 111 | teens_gendered_tbl %>% 112 | summarize( 113 | age = mean(age, na.rm = TRUE), 114 | .by = gradyear 115 | ) 116 | ``` 117 | 118 | ### Create a vector with the average age for each gradyear, repeated by person 119 | 120 | ```{r} 121 | average_age_by_gradyear <- teens_gendered_tbl %>% 122 | mutate( 123 | ave_age = mean(age, na.rm = TRUE), 124 | .by = gradyear 125 | ) %>% 126 | pull() 127 | ``` 128 | 129 | ### Impute the missing age values with the average age by gradyear 130 | 131 | ```{r} 132 | teens_imputed_tbl <- teens_gendered_tbl %>% 133 | mutate( 134 | age = case_when( 135 | !is.na(age) ~ age, 136 | .default = average_age_by_gradyear 137 | ) 138 | ) 139 | ``` 140 | 141 | ### Check the summary results to ensure missing values are eliminated 142 | 143 | ```{r} 144 | teens_imputed_tbl %>% 145 | pull(age) %>% 146 | summary() 147 | ``` 148 | 149 | ## Creating the recipe 150 | 151 | ### Apply normalization to entire data frame 152 | 153 | ```{r} 154 | recipe_obj <- recipe(teens_imputed_tbl) %>% 155 | step_normalize( 156 | all_numeric(), 157 | -c(gradyear, age, friends) 158 | ) %>% 159 | step_dummy( 160 | gender, 161 | keep_original_cols = TRUE, 162 | one_hot = TRUE 163 | ) 164 | 165 | recipe_obj 166 | ``` 167 | 168 | ```{r} 169 | recipe_baked_tbl <- recipe_obj %>% 170 | prep() %>% 171 | bake(new_data = NULL) 172 | 173 | recipe_baked_tbl 174 | ``` 175 | 176 | ## Training a model on the data 177 | 178 | ### Create a z-score standardized data frame for easier interpretation 179 | 180 | ```{r} 181 | interests_tbl <- recipe_baked_tbl %>% 182 | select(5:40) 183 | ``` 184 | 185 | ### Compare the data before and after the transformation 186 | 187 | Before 188 | 189 | ```{r} 190 | teens_imputed_tbl %>% 191 | pull(basketball) %>% 192 | summary() 193 | ``` 194 | 195 | After 196 | 197 | ```{r} 198 | interests_tbl %>% 199 | pull(basketball) %>% 200 | summary() 201 | ``` 202 | 203 | #### Create model specification 204 | 205 | ```{r} 206 | model_spec_kmeans <- k_means( 207 | engine = "stats", 208 | num_clusters = 5 209 | ) 210 | 211 | model_spec_kmeans 212 | ``` 213 | 214 | ```{r} 215 | set.seed(2345) 216 | 217 | model_fit_kmeans <- model_spec_kmeans %>% 218 | fit( 219 | ~ ., 220 | interests_tbl 221 | ) 222 | 223 | model_fit_kmeans 224 | ``` 225 | 226 | ## Evaluating model performance 227 | 228 | ### Look at the single-row summary 229 | 230 | ```{r} 231 | model_fit_kmeans %>% 232 | glance() 233 | ``` 234 | 235 | ### Look at the size and the centers of the clusters 236 | 237 | ```{r} 238 | model_fit_kmeans %>% 239 | tidy() %>% 240 | select(cluster, size, withinss, everything()) 241 | ``` 242 | 243 | ## Improving model performance 244 | 245 | ### Apply the cluster IDs to the original data frame 246 | 247 | ```{r} 248 | teens_and_clusters <- augment(model_fit_kmeans, recipe_baked_tbl) 249 | teens_and_clusters 250 | ``` 251 | 252 | ### Look at the first five records 253 | 254 | ```{r} 255 | teens_and_clusters %>% 256 | select(.pred_cluster, gender, age, friends) %>% 257 | slice_head(n = 5) 258 | ``` 259 | 260 | ### Mean age by cluster 261 | 262 | ```{r} 263 | teens_and_clusters %>% 264 | summarize( 265 | age = mean(age), 266 | .by = .pred_cluster 267 | ) 268 | ``` 269 | 270 | ### Proportion of females by cluster 271 | 272 | ```{r} 273 | teens_and_clusters %>% 274 | summarize( 275 | gender_F = mean(gender_F), 276 | .by = .pred_cluster 277 | ) 278 | ``` 279 | 280 | ### Mean 
number of friends by cluster 281 | 282 | ```{r} 283 | teens_and_clusters %>% 284 | summarize( 285 | friends = mean(friends), 286 | .by = .pred_cluster 287 | ) 288 | ``` 289 | 290 | ## K-means clustering with tidy data principles 291 | 292 | ### Exploratory clustering 293 | 294 | ```{r} 295 | kclusts <- 296 | tibble(k = 1:9) %>% 297 | mutate( 298 | kclust = map(k, ~kmeans(interests_tbl, .x)), 299 | tidied = map(kclust, tidy), 300 | glanced = map(kclust, glance), 301 | augmented = map(kclust, augment, recipe_baked_tbl) 302 | ) 303 | ``` 304 | 305 | ### Create three separate datasets 306 | 307 | ```{r} 308 | clusters <- kclusts %>% 309 | unnest(cols = c(tidied)) 310 | 311 | assignments <- kclusts %>% 312 | unnest(cols = c(augmented)) 313 | 314 | clusterings <- kclusts %>% 315 | unnest(cols = c(glanced)) 316 | ``` 317 | 318 | ### Plot the original points 319 | 320 | ```{r} 321 | p1 <- assignments %>% 322 | ggplot(aes(sports, music)) + 323 | geom_point(aes(color = .cluster), alpha = 0.5) + 324 | facet_wrap(vars(k)) + 325 | theme_bw() 326 | 327 | p1 328 | ``` 329 | 330 | ```{r} 331 | p2 <- p1 + 332 | geom_point(data = clusters, size = 5, shape = "x") 333 | 334 | p2 335 | ``` 336 | 337 | ### Plot the total within sum of squares (tot.withinss) 338 | 339 | ```{r} 340 | clusterings %>% 341 | ggplot(aes(k, tot.withinss)) + 342 | geom_line() + 343 | geom_point() + 344 | theme_bw() + 345 | scale_x_continuous(breaks = seq(1, 9, by = 1)) + 346 | labs( 347 | x = "k", 348 | y = "Total Within Sum of Squares" 349 | ) 350 | ``` 351 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/regression_methods/estimating_wine_quality_with_regression_trees_and_model_trees.r: -------------------------------------------------------------------------------- 1 | # Estimating Wine Quality with Regression Trees and Model Trees ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 6: 4 | # Forecasting Numeric Data - Regression Methods. 5 | # 6 | # The original code is made with {rpart}, {rpart.plot} and {Cubist}. I wanted to 7 | # see how one could recreate it using mainly {tidymodels}, {tidyverse} and {rules}. 8 | # 9 | # You can find the original code and the dataset here: 10 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter06 11 | 12 | ## 1. Loading libraries (in the order they get used) ---- 13 | library(conflicted) 14 | conflict_prefer("filter", "dplyr", "stats") 15 | library(tidyverse) 16 | library(tidymodels) 17 | library(rattle) 18 | library(corrr) 19 | library(rules) 20 | 21 | 22 | ## 2. Preparing and exploring the data ---- 23 | wine_tbl <- read_csv("machine_learning_with_r_3rd_ed/regression_methods/data/whitewines.csv") %>% 24 | rename_all( 25 | ~str_replace_all(., "\\s+", "_") %>% 26 | tolower() 27 | ) 28 | wine_tbl 29 | 30 | glimpse(wine_tbl) 31 | 32 | ### The distribution of quality ratings ---- 33 | wine_tbl %>% 34 | ggplot(aes(quality)) + 35 | geom_histogram() 36 | 37 | ### Summary statistics of the wine data ---- 38 | summary(wine_tbl) 39 | 40 | wine_split <- initial_time_split( 41 | wine_tbl, 42 | prop = 3750 / 4898 43 | ) 44 | wine_train <- training(wine_split) 45 | wine_test <- testing(wine_split) 46 | 47 | 48 | ## 3. 
Training a model on the data ---- 49 | # regression tree using rpart 50 | model_spec <- decision_tree( 51 | mode = "regression", 52 | engine = "rpart", 53 | cost_complexity = NULL, 54 | tree_depth = NULL, 55 | min_n = NULL 56 | ) %>% 57 | translate() 58 | model_spec 59 | 60 | ### Fit the model ---- 61 | model_fit <- fit( 62 | model_spec, 63 | quality ~ ., 64 | wine_train 65 | ) 66 | 67 | ### Get basic information about the tree ---- 68 | model_fit 69 | 70 | ### Get more detailed information about the tree ---- 71 | summary(model_fit$fit) 72 | 73 | ### Adjust plot margins to make the visualization work better 74 | par(mar = c(1, 1, 1, 1)) 75 | 76 | ### Use the rattle package to create a visualization ---- 77 | model_fit$fit %>% 78 | fancyRpartPlot(cex = 0.5) 79 | 80 | 81 | ## 4. Evaluate model performance ---- 82 | 83 | ### Generate predictions for the testing dataset (you could skip this step) ---- 84 | wine_test_pred <- predict( 85 | object = model_fit, 86 | new_data = wine_test, 87 | type = "numeric" 88 | ) 89 | wine_test_pred 90 | 91 | ### Add the predictions to the test tibble ---- 92 | wine_test_with_pred_tbl <- augment(model_fit, wine_test) 93 | wine_test_with_pred_tbl 94 | 95 | ### Compare the distribution of actual values vs. predicted values ---- 96 | wine_test_with_pred_tbl %>% 97 | select(quality, .pred) %>% 98 | summary() 99 | 100 | ### Compare the correlation ---- 101 | wine_test_with_pred_tbl %>% 102 | select(quality, .pred) %>% 103 | correlate() 104 | 105 | ### Mean absolute error between actual and predicted values ---- 106 | wine_test_with_pred_tbl %>% 107 | metrics(quality, .pred) %>% 108 | filter(.metric == "mae") 109 | 110 | ### Mean absolute error between actual values and mean value ---- 111 | mean_value <- wine_train$quality %>% 112 | mean() 113 | mean_value 114 | 115 | mae <- function(actual, predicted) { 116 | mean(abs(actual - predicted)) 117 | } 118 | 119 | mae(wine_test$quality, mean_value) 120 | 121 | 122 | ## 5. Improving model performance ---- 123 | # train a Cubist model tree 124 | model_spec_cubist <- cubist_rules( 125 | mode = "regression", 126 | engine = "Cubist", 127 | committees = NULL, 128 | neighbors = NULL, 129 | max_rules = NULL 130 | ) %>% 131 | translate() 132 | model_spec_cubist 133 | 134 | ### Fit the model ---- 135 | model_fit_cubist <- fit( 136 | model_spec_cubist, 137 | quality ~ ., 138 | wine_train 139 | ) 140 | 141 | ### Display basic information about the model tree ---- 142 | model_fit_cubist 143 | 144 | ### Display the tree itself ---- 145 | summary(model_fit_cubist$fit) 146 | 147 | ### Generate predictions for the model ---- 148 | wine_test_pred_cubist <- predict( 149 | object = model_fit_cubist, 150 | new_data = wine_test, 151 | type = "numeric" 152 | ) 153 | wine_test_pred_cubist 154 | 155 | ### Summary statistics about the predictions ---- 156 | wine_test_pred_cubist %>% 157 | summary() 158 | 159 | ### Add the predictions to the test tibble ---- 160 | wine_test_with_pred_cubist <- augment(model_fit_cubist, wine_test) 161 | wine_test_with_pred_cubist 162 | 163 | ### Compare the distribution of actual values vs. 
predicted values ---- 164 | wine_test_with_pred_cubist %>% 165 | select(quality, .pred) %>% 166 | summary() 167 | 168 | ### Correlation between the true and predicted values ---- 169 | wine_test_with_pred_cubist %>% 170 | select(quality, .pred) %>% 171 | correlate() 172 | 173 | ### Mean absolute error of true and predicted values ---- 174 | wine_test_with_pred_cubist %>% 175 | metrics(quality, .pred) %>% 176 | filter(.metric == "mae") 177 | 178 | 179 | ## 6. Creating a function to test the model(s) with another dataset ---- 180 | 181 | # The assumption here is that you have already taken step 1. 182 | 183 | # Preparing and exploring the other wine dataset 184 | red_wine_tbl <- read_csv("machine_learning_with_r_3rd_ed/regression_methods/data/redwines.csv") %>% 185 | rename_all( 186 | ~str_replace_all(., "\\s+", "_") %>% 187 | tolower() 188 | ) 189 | red_wine_tbl 190 | 191 | ### The distribution of quality ratings ---- 192 | red_wine_tbl %>% 193 | ggplot(aes(quality)) + 194 | geom_histogram() 195 | 196 | ### Summary statistics of the wine data ---- 197 | summary(red_wine_tbl) 198 | 199 | ### Create the function 200 | 201 | predict_wine_quality <- function( 202 | .engine = c("rpart", "Cubist"), 203 | .winecolor = c("red", "white") 204 | ) { 205 | 206 | # Check that the wine color is valid 207 | if (!.winecolor %in% c("red", "white")) stop("Choose a wine color: red or white") 208 | 209 | # Write out the path so that you can insert the wine color in there 210 | path <- str_glue("machine_learning_with_r_3rd_ed/regression_methods/data/{.winecolor}wines.csv") 211 | 212 | # Read in the data 213 | wine_tbl <- read_csv(path) %>% 214 | rename_all( 215 | ~str_replace_all(., "\\s+", "_") %>% 216 | tolower() 217 | ) 218 | 219 | # Make the train/test split. It's not randomizing, as the data already is 220 | wine_split <- initial_time_split( 221 | wine_tbl, 222 | prop = 0.75 223 | ) 224 | wine_train <- training(wine_split) 225 | wine_test <- testing(wine_split) 226 | 227 | # Create the model based on the engine chosen 228 | if (.engine == "rpart") { 229 | 230 | model_spec <- decision_tree( 231 | mode = "regression", 232 | engine = "rpart" 233 | ) %>% 234 | translate() 235 | 236 | } else if (.engine == "Cubist") { 237 | 238 | model_spec <- cubist_rules( 239 | mode = "regression", 240 | engine = "Cubist" 241 | ) %>% 242 | translate() 243 | 244 | } else { 245 | 246 | stop("Choose an engine: rpart (decision tree) or Cubist (rules)") 247 | 248 | } 249 | 250 | # Fit the model 251 | model_fit <- fit( 252 | model_spec, 253 | quality ~ ., 254 | wine_train 255 | ) 256 | 257 | # Add the predictions 258 | wine_test_with_pred_tbl <- augment(model_fit, wine_test) 259 | 260 | # Get the metrics 261 | wine_test_with_pred_tbl %>% 262 | metrics(quality, .pred) 263 | 264 | } 265 | 266 | ### Test the function ---- 267 | predict_wine_quality( 268 | .engine = "Cubist", 269 | .winecolor = "white" 270 | ) 271 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_03_lazy_learning_classification_using_nearest_neighbors.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 3: Lazy Learning - Classification Using Nearest Neighbors" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | # 3 Lazy 
Learning - Classification Using Nearest Neighbors 13 | 14 | ## Load Packages 15 | 16 | ```{r} 17 | library(conflicted) # An Alternative Conflict Resolution Strategy 18 | library(janitor) # Simple Tools for Examining and Cleaning Dirty Data 19 | library(tidymodels) # Easily Install and Load the 'Tidymodels' Packages 20 | library(tidyverse) # Easily Install and Load the 'Tidyverse' 21 | ``` 22 | 23 | ## Exploring and preparing the data 24 | 25 | ### Import the CSV file (Breast Cancer Wisconsin (Diagnostic)) 26 | 27 | ```{r} 28 | wbcd_tbl <- read_csv( 29 | "data/wisc_bc_data.csv" 30 | ) 31 | ``` 32 | 33 | ### Take a look at the tibble 34 | 35 | ```{r} 36 | glimpse(wbcd_tbl) 37 | ``` 38 | 39 | ### Drop the unnecessary id column 40 | 41 | ```{r} 42 | wbcd_selected_tbl <- wbcd_tbl %>% 43 | select(-id) 44 | 45 | wbcd_selected_tbl 46 | ``` 47 | 48 | ```{r} 49 | # table of diagnosis 50 | wbcd_tbl %>% 51 | tabyl(diagnosis) %>% 52 | adorn_pct_formatting(digits = 1) 53 | ``` 54 | 55 | ### Transform diagnosis to a factor 56 | 57 | ```{r} 58 | wbcd_factored_tbl <- wbcd_selected_tbl %>% 59 | mutate( 60 | diagnosis = factor( 61 | diagnosis, 62 | levels = c("B", "M"), 63 | labels = c("Benign", "Malignant") 64 | ) 65 | ) 66 | 67 | wbcd_factored_tbl 68 | ``` 69 | 70 | ### Count the number of the two diagnosis (incl. percentage) 71 | 72 | ```{r} 73 | wbcd_factored_tbl %>% 74 | tabyl(diagnosis) %>% 75 | adorn_pct_formatting(digits = 1) 76 | ``` 77 | 78 | ### Summarize three numeric features 79 | 80 | ```{r} 81 | wbcd_factored_tbl %>% 82 | select(radius_mean, area_mean, smoothness_mean) %>% 83 | summary() 84 | ``` 85 | 86 | ## Creating the recipe and splitting the data 87 | 88 | ### Normalize the wbcd data 89 | 90 | ```{r} 91 | recipe_obj <- recipe( 92 | diagnosis ~ ., 93 | data = wbcd_factored_tbl 94 | ) %>% 95 | step_range( 96 | all_numeric_predictors(), 97 | min = 0, 98 | max = 1 99 | ) 100 | 101 | recipe_obj 102 | ``` 103 | 104 | ```{r} 105 | wbcd_normalized_tbl <- recipe_obj %>% 106 | prep() %>% 107 | bake(new_data = NULL) 108 | ``` 109 | 110 | ### Confirm that normalization worked 111 | 112 | ```{r} 113 | wbcd_normalized_tbl %>% 114 | select(area_mean) %>% 115 | summary() 116 | ``` 117 | 118 | ### Create training and test data (randomly) 119 | 120 | ```{r} 121 | wbcd_split <- initial_split( 122 | wbcd_normalized_tbl, 123 | prop = 469 / 569 124 | ) 125 | 126 | wbcd_train <- training(wbcd_split) 127 | wbcd_test <- testing(wbcd_split) 128 | ``` 129 | 130 | ## Training a model on the data 131 | 132 | kknn is the engine (and needs to be installed if it isn't already). It is used as the engine for {parsnip}'s nearest_neighbor() function. And since we are classifying, that is the mode we choose. 
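If {kknn} is not yet installed, a one-time install can be run before fitting. This chunk is an added sketch that mirrors the commented-out install in the 3rd-edition script; it is not part of the original book code.

```{r}
# One-time install of the {kknn} engine, if it isn't available yet
# (commented out so the chunk is safe to knit):
# install.packages("kknn")
```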
133 | 134 | ### Create model specification 135 | 136 | ```{r} 137 | model_spec <- nearest_neighbor( 138 | engine = "kknn", 139 | mode = "classification", 140 | neighbors = 21 141 | ) %>% 142 | translate() 143 | 144 | model_spec 145 | ``` 146 | 147 | ### Fit the model 148 | 149 | ```{r} 150 | model_fit <- model_spec %>% 151 | fit( 152 | diagnosis ~ ., 153 | wbcd_train 154 | ) 155 | 156 | model_fit 157 | ``` 158 | 159 | ### Make the predictions (you could skip this step) 160 | 161 | ```{r} 162 | wbcd_test_pred <- model_fit %>% 163 | predict( 164 | new_data = wbcd_test, 165 | type = "class" 166 | ) 167 | 168 | wbcd_test_pred 169 | ``` 170 | 171 | ### Add the predictions to the test tibble 172 | 173 | ```{r} 174 | wbcd_test_with_pred_tbl <- augment(model_fit, wbcd_test) 175 | wbcd_test_with_pred_tbl 176 | ``` 177 | 178 | ## Evaluating model performance 179 | 180 | ### Create a confusion matrix 181 | 182 | ```{r} 183 | conf_mat <- conf_mat( 184 | data = wbcd_test_with_pred_tbl, 185 | truth = diagnosis, 186 | estimate = .pred_class 187 | ) 188 | 189 | conf_mat 190 | ``` 191 | 192 | ### Visualize the confusion matrix 193 | 194 | ```{r} 195 | conf_mat %>% 196 | autoplot(type = "heatmap") 197 | 198 | conf_mat %>% 199 | autoplot(type = "mosaic") 200 | ``` 201 | 202 | ### Visualize the ROC curve 203 | 204 | ```{r} 205 | wbcd_test_with_pred_tbl %>% 206 | roc_curve( 207 | truth = diagnosis, 208 | .pred_Benign 209 | ) %>% 210 | autoplot() 211 | ``` 212 | 213 | ### Calculate the ROC AUC (area under the curve) 214 | 215 | ```{r} 216 | wbcd_roc_auc <- wbcd_test_with_pred_tbl %>% 217 | roc_auc( 218 | truth = diagnosis, 219 | .pred_Benign 220 | ) 221 | 222 | wbcd_roc_auc 223 | ``` 224 | 225 | ### Put together other model metrics 226 | 227 | ```{r} 228 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 229 | classification_metrics <- conf_mat( 230 | wbcd_test_with_pred_tbl, 231 | truth = diagnosis, 232 | estimate = .pred_class 233 | ) %>% 234 | summary() 235 | 236 | classification_metrics 237 | ``` 238 | 239 | ## Creating a function to help evaluate the model further 240 | 241 | The idea is to be able to choose different values for k and different methods for standardization (range (0 to 1) and normalization). 
242 | 243 | ```{r} 244 | classify_with_knn <- function( 245 | k = 21, 246 | standardization_method = c("range", "normalization") 247 | ) { 248 | 249 | # Create a recipe according to the chosen standardization method 250 | if (standardization_method == "range") { 251 | 252 | recipe_obj <- recipe( 253 | formula = diagnosis ~ ., 254 | data = wbcd_factored_tbl 255 | ) %>% 256 | step_range( 257 | all_numeric_predictors(), 258 | min = 0, 259 | max = 1) 260 | 261 | } else if (standardization_method == "normalization") { 262 | 263 | recipe_obj <- recipe( 264 | formula = diagnosis ~ ., 265 | data = wbcd_factored_tbl 266 | ) %>% 267 | step_normalize(all_numeric_predictors()) 268 | 269 | } else { 270 | 271 | stop('Choose a standardization method that is either "range" or "normalization"!') 272 | 273 | } 274 | 275 | wbcd_normalized_tbl <- recipe_obj %>% 276 | prep() %>% 277 | bake(new_data = wbcd_factored_tbl) 278 | 279 | # Create training and test data 280 | wbcd_split <- initial_split( 281 | wbcd_normalized_tbl, 282 | prop = 469 / 569 283 | ) 284 | wbcd_train <- training(wbcd_split) 285 | wbcd_test <- testing(wbcd_split) 286 | 287 | # Create model specification 288 | model_spec <- nearest_neighbor( 289 | engine = "kknn", 290 | mode = "classification", 291 | neighbors = k 292 | ) %>% 293 | translate() 294 | 295 | # Fit the model 296 | model_fit <- model_spec %>% 297 | fit( 298 | diagnosis ~ ., 299 | wbcd_train 300 | ) 301 | 302 | # Add the predictions to the test tibble 303 | wbcd_test_with_pred_tbl <- augment(model_fit, wbcd_test) 304 | 305 | # Create a confusion matrix 306 | conf_mat <- conf_mat( 307 | data = wbcd_test_with_pred_tbl, 308 | truth = diagnosis, 309 | estimate = .pred_class 310 | ) 311 | 312 | # Print the confusion matrix 313 | conf_mat %>% autoplot(type = "heatmap") 314 | } 315 | ``` 316 | 317 | ### Test the function 318 | 319 | ```{r} 320 | # standardization_method is either "range" or "normalization" 321 | classify_with_knn( 322 | standardization_method = "range", 323 | k = 5 324 | ) 325 | ``` 326 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/Chapter_14.r: -------------------------------------------------------------------------------- 1 | ##### Chapter 14: Building Better Learners ------------------- 2 | 3 | # load the credit dataset 4 | credit <- read.csv("credit.csv") 5 | library(caret) 6 | 7 | ## Creating a simple tuned model ---- 8 | 9 | # look up the tuning parameters for C5.0 10 | modelLookup("C5.0") 11 | 12 | # automated parameter tuning of C5.0 decision tree 13 | set.seed(300) 14 | m <- train(default ~ ., data = credit, method = "C5.0") 15 | 16 | # summary of tuning results 17 | m 18 | 19 | # apply the best C5.0 candidate model to make predictions 20 | p <- predict(m, credit) 21 | table(p, credit$default) 22 | 23 | # obtain predicted classes 24 | head(predict(m, credit)) 25 | 26 | # obtain predicted probabilities 27 | head(predict(m, credit, type = "prob")) 28 | 29 | ## Customizing the tuning process ---- 30 | # use trainControl() to alter resampling strategy 31 | ctrl <- trainControl(method = "cv", number = 10, 32 | selectionFunction = "oneSE") 33 | 34 | # use expand.grid() to create grid of tuning parameters 35 | grid <- expand.grid(model = "tree", 36 | trials = c(1, 5, 10, 15, 20, 25, 30, 35), 37 | winnow = FALSE) 38 | 39 | # look at the result of expand.grid() 40 | grid 41 | 42 | # customize train() with the control list and grid of parameters 43 | set.seed(300) 44 | m <- train(default ~ ., data = 
credit, method = "C5.0", 45 | metric = "Kappa", 46 | trControl = ctrl, 47 | tuneGrid = grid) 48 | 49 | # see the results 50 | m 51 | 52 | ## Bagging ---- 53 | # Using the ipred bagged decision trees 54 | library(ipred) 55 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE) 56 | set.seed(300) 57 | mybag <- bagging(default ~ ., data = credit, nbagg = 25) 58 | credit_pred <- predict(mybag, credit) 59 | table(credit_pred, credit$default) 60 | 61 | # estimate performance of ipred bagged trees 62 | library(caret) 63 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE) 64 | set.seed(300) 65 | ctrl <- trainControl(method = "cv", number = 10) 66 | train(default ~ ., data = credit, method = "treebag", 67 | trControl = ctrl) 68 | 69 | ## Boosting ---- 70 | 71 | ## Using C5.0 Decision Tree (not shown in book) 72 | library(C50) 73 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE) 74 | m_c50_bst <- C5.0(default ~ ., data = credit, trials = 100) 75 | 76 | # create a Adaboost.M1 model 77 | library(adabag) 78 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE) 79 | set.seed(300) 80 | m_adaboost <- boosting(default ~ ., data = credit) 81 | p_adaboost <- predict(m_adaboost, credit) 82 | head(p_adaboost$class) 83 | p_adaboost$confusion 84 | 85 | # create and evaluate an Adaboost.M1 model using 10-fold-CV 86 | set.seed(300) 87 | adaboost_cv <- boosting.cv(default ~ ., data = credit) 88 | adaboost_cv$confusion 89 | 90 | # calculate kappa 91 | library(vcd) 92 | Kappa(adaboost_cv$confusion) 93 | 94 | ## Random Forests ---- 95 | # random forest with default settings 96 | library(randomForest) 97 | set.seed(300) 98 | rf <- randomForest(default ~ ., data = credit) 99 | rf 100 | 101 | # calculate kappa on the out-of-bag estimate 102 | library(vcd) 103 | Kappa(rf$confusion[1:2,1:2]) 104 | 105 | # ranger is a faster implementation of the random forest algorithm 106 | library(ranger) 107 | set.seed(300) 108 | m_ranger <- ranger(default ~ ., data = credit) 109 | m_ranger 110 | 111 | # calculate kappa 112 | Kappa(m_ranger$confusion.matrix) 113 | 114 | ## Gradient Boosting Machines (GBM) ---- 115 | 116 | # prepare the data for gbm() 117 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE) 118 | credit$default <- ifelse(credit$default == "yes", 1, 0) 119 | 120 | # create a random train and test split 121 | set.seed(123) 122 | train_sample <- sample(1000, 900) 123 | credit_train <- credit[train_sample, ] 124 | credit_test <- credit[-train_sample, ] 125 | 126 | # create a GBM model with default parameters 127 | library(gbm) 128 | set.seed(300) 129 | m_gbm <- gbm(default ~ ., data = credit_train) 130 | m_gbm 131 | 132 | # evaluate the simple GBM model 133 | p_gbm <- predict(m_gbm, credit_test, type = "response") 134 | p_gbm_c <- ifelse(p_gbm > 0.50, 1, 0) 135 | table(credit_test$default, p_gbm_c) 136 | 137 | # compute kappa 138 | library(vcd) 139 | Kappa(table(credit_test$default, p_gbm_c)) 140 | 141 | # create a tuned gbm() model using caret 142 | # start by creating the tuning grid 143 | grid_gbm <- expand.grid( 144 | n.trees = c(100, 150, 200), 145 | interaction.depth = c(1, 2, 3), 146 | shrinkage = c(0.01, 0.1, 0.3), 147 | n.minobsinnode = 10 148 | ) 149 | 150 | # define the experiment's parameters 151 | library(caret) 152 | ctrl <- trainControl(method = "cv", number = 10, 153 | selectionFunction = "best") 154 | 155 | # run the caret experiment 156 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE) 157 | set.seed(300) 158 | m_gbm_c <- train(default ~ ., data = credit, method 
= "gbm", 159 | trControl = ctrl, tuneGrid = grid_gbm, 160 | metric = "Kappa", 161 | verbose = FALSE) 162 | 163 | # see the results 164 | m_gbm_c 165 | 166 | ## Extreme Gradient Boosting (XGB) ---- 167 | 168 | # Format the credit data as a sparse matrix 169 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE) 170 | library(Matrix) 171 | credit_matrix <- sparse.model.matrix(~ . -default, data = credit) 172 | 173 | # examine the sparse credit_matrix 174 | dim(credit_matrix) 175 | print(credit_matrix[1:5, 1:15]) 176 | 177 | # remove the intercept 178 | credit_matrix <- credit_matrix[, -1] 179 | 180 | # split the matrix into train and test 181 | set.seed(12345) 182 | train_ids <- sample(1000, 900) 183 | credit_train <- credit_matrix[train_ids, ] 184 | credit_test <- credit_matrix[-train_ids, ] 185 | 186 | # check that the rows are 900 vs. 100 and the cols are 35 vs. 35 187 | dim(credit_train) 188 | dim(credit_test) 189 | 190 | # create 1/0 vectors for train and test data indicating loan default 191 | credit_train_labels <- 192 | ifelse(credit[train_ids, c("default")] == "yes", 1, 0) 193 | credit_test_labels <- 194 | ifelse(credit[-train_ids, c("default")] == "yes", 1, 0) 195 | 196 | # build the xgboost model 197 | library(xgboost) 198 | 199 | # set XGB hyperparameters 200 | params.xgb <- list(objective = "binary:logistic", 201 | max_depth = 6, 202 | eta = 0.3, 203 | gamma = 0, 204 | colsample_bytree = 1, 205 | min_child_weight = 1, 206 | subsample = 1) 207 | 208 | set.seed(555) 209 | xgb_credit <- xgboost(params = params.xgb, 210 | data = credit_train, 211 | label = credit_train_labels, 212 | nrounds = 100, 213 | print_every_n = 10, 214 | verbose = 1) 215 | 216 | # make predictions 217 | prob_default <- predict(xgb_credit, credit_test) 218 | pred_default <- ifelse(prob_default > 0.50, 1, 0) 219 | 220 | # create a confusion matrix 221 | table(pred_default, credit_test_labels) 222 | 223 | # compute kappa 224 | library(vcd) 225 | Kappa(table(pred_default, credit_test_labels)) 226 | 227 | # create a tuned xgboost() model using caret 228 | # start by creating the tuning grid 229 | grid_xgb <- expand.grid( 230 | eta = c(0.3, 0.4), 231 | max_depth = c(1, 2, 3), 232 | colsample_bytree = c(0.6, 0.8), 233 | subsample = c(0.50, 0.75, 1.00), 234 | nrounds = c(50, 100, 150), 235 | gamma = c(0, 1), 236 | min_child_weight = 1 237 | ) 238 | 239 | # define the control object 240 | library(caret) 241 | ctrl <- trainControl(method = "cv", number = 10, 242 | selectionFunction = "best") 243 | 244 | # run the caret experiment 245 | set.seed(300) 246 | m_xgb <- train(default ~ ., data = credit, method = "xgbTree", 247 | trControl = ctrl, tuneGrid = grid_xgb, 248 | metric = "Kappa", verbosity = 0) 249 | 250 | # see the results of all models (not shown in book due to size of output) 251 | m_xgb 252 | 253 | # see the hyperparameters for the best performing model 254 | m_xgb$bestTune 255 | 256 | # get the best kappa out of the 216 models tested 257 | max(m_xgb$results["Kappa"]) 258 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/knn/classifying-cancer-samples-with-knn.r: -------------------------------------------------------------------------------- 1 | # Classifying Cancer Samples with K-Nearest Neighbors ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 3: 4 | # Lazy Learning - Classification Using Nearest Neighbors. 5 | # 6 | # The original code is made with a lot of base R, {class} and {gmodels}. 
I 7 | # wanted to see how one could recreate it using mainly {tidymodels} 8 | # and {tidyverse}. 9 | # 10 | # You can find the original code here: 11 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter03 12 | 13 | ## 1. Loading libraries (in the order they get used)---- 14 | library(conflicted) 15 | library(tidymodels) 16 | library(tidyverse) 17 | 18 | 19 | ## 2. Exploring and preparing the data ---- 20 | 21 | ### Create a vector for the column names ---- 22 | .col_names <- c( 23 | "id", 24 | "diagnosis", 25 | "radius_mean", 26 | "texture_mean", 27 | "perimeter_mean", 28 | "area_mean", 29 | "smoothness_mean", 30 | "compactness_mean", 31 | "concavity_mean", 32 | "points_mean", 33 | "symmetry_mean", 34 | "dimension_mean", 35 | "radius_se", 36 | "texture_se", 37 | "perimeter_se", 38 | "area_se", 39 | "smoothness_se", 40 | "compactness_se", 41 | "concavity_se", 42 | "points_se", 43 | "symmetry_se", 44 | "dimension_se", 45 | "radius_worst", 46 | "texture_worst", 47 | "perimeter_worst", 48 | "area_worst", 49 | "smoothness_worst", 50 | "compactness_worst", 51 | "concavity_worst", 52 | "points_worst", 53 | "symmetry_worst", 54 | "dimension_worst" 55 | ) 56 | 57 | ### Import the CSV file (Breast Cancer Wisconsin (Diagnostic)) ---- 58 | wbcd_tbl <- read_csv("machine_learning_with_r_3rd_ed/knn/data/wdbc-data.csv", col_names = .col_names) 59 | 60 | ### Take a look at the tibble ---- 61 | glimpse(wbcd_tbl) 62 | 63 | ### Drop the unnecessary id column ---- 64 | wbcd_selected_tbl <- wbcd_tbl %>% select(-id) 65 | wbcd_selected_tbl 66 | 67 | ### Transform diagnosis to a factor ---- 68 | wbcd_factored_tbl <- wbcd_selected_tbl %>% 69 | mutate( 70 | diagnosis = factor( 71 | diagnosis, 72 | levels = c("B", "M"), 73 | labels = c("Benign", "Malignant") 74 | ) 75 | ) 76 | wbcd_factored_tbl 77 | 78 | ### Count the number of the two diagnosis (incl. percentage) ---- 79 | wbcd_factored_tbl %>% 80 | count(diagnosis) %>% 81 | mutate(pct = (n / sum(n) * 100)) 82 | 83 | ### Summarize three numeric features ---- 84 | wbcd_factored_tbl %>% 85 | select(radius_mean, area_mean, smoothness_mean) %>% 86 | summary() 87 | 88 | 89 | ## 3. Creating the recipe and splitting the data ---- 90 | 91 | ### Normalize the wbcd data ---- 92 | recipe_obj <- recipe( 93 | diagnosis ~ ., 94 | data = wbcd_factored_tbl 95 | ) %>% 96 | step_range( 97 | all_numeric_predictors(), 98 | min = 0, 99 | max = 1 100 | ) 101 | recipe_obj 102 | 103 | wbcd_normalized_tbl <- recipe_obj %>% 104 | prep() %>% 105 | bake(new_data = NULL) 106 | 107 | ### Confirm that normalization worked ---- 108 | wbcd_normalized_tbl %>% 109 | select(area_mean) %>% 110 | summary() 111 | 112 | ### Create training and test data (randomly) ---- 113 | wbcd_split <- initial_split( 114 | wbcd_normalized_tbl, 115 | prop = 469 / 569 116 | ) 117 | wbcd_train <- training(wbcd_split) 118 | wbcd_test <- testing(wbcd_split) 119 | 120 | 121 | ## 4. Training a model on the data ---- 122 | 123 | # kknn is the engine (needs to be installed if not already): 124 | # install.packages("kknn") 125 | 126 | # It is used as the engine for {parsnip}'s nearest_neighbor() function. 127 | # And since we are classifying, that is the mode we choose. 
128 | 129 | ### Create model specification ---- 130 | model_spec <- nearest_neighbor( 131 | engine = "kknn", 132 | mode = "classification", 133 | neighbors = 21 134 | ) %>% 135 | translate() 136 | model_spec 137 | 138 | ### Fit the model ---- 139 | model_fit <- fit( 140 | model_spec, 141 | diagnosis ~ ., 142 | wbcd_train 143 | ) 144 | model_fit 145 | 146 | ### Make the predictions (you could skip this step) ---- 147 | wbcd_test_pred <- predict( 148 | model_fit, 149 | new_data = wbcd_test, 150 | type = "class" 151 | ) 152 | wbcd_test_pred 153 | 154 | ### Add the predictions to the test tibble ---- 155 | wbcd_test_with_pred_tbl <- augment(model_fit, wbcd_test) 156 | wbcd_test_with_pred_tbl 157 | 158 | 159 | ## 5. Evaluating model performance ---- 160 | 161 | ### Create a confusion matrix ---- 162 | conf_mat <- conf_mat( 163 | data = wbcd_test_with_pred_tbl, 164 | truth = diagnosis, 165 | estimate = .pred_class 166 | ) 167 | conf_mat 168 | 169 | ### Visualize the confusion matrix ---- 170 | conf_mat %>% autoplot(type = "heatmap") 171 | conf_mat %>% autoplot(type = "mosaic") 172 | 173 | ### Visualize the ROC curve ---- 174 | wbcd_test_with_pred_tbl %>% 175 | roc_curve( 176 | truth = diagnosis, 177 | estimate = .pred_Benign 178 | ) %>% 179 | autoplot() 180 | 181 | ### Calculate the ROC AUC (area under the curve) ---- 182 | wbcd_roc_auc <- wbcd_test_with_pred_tbl %>% 183 | roc_auc( 184 | truth = diagnosis, 185 | estimate = .pred_Benign 186 | ) 187 | wbcd_roc_auc 188 | 189 | 190 | ### Put together other model metrics ---- 191 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 192 | classification_metrics <- conf_mat( 193 | wbcd_test_with_pred_tbl, 194 | truth = diagnosis, 195 | estimate = .pred_class 196 | ) %>% 197 | summary() 198 | 199 | 200 | ## 6. Creating a function to help evaluate the model further ---- 201 | 202 | # The assumption here is that you have already gone through steps 1. to 2. 
203 | # The idea is to be able to choose different values for k and different 204 | # methods for standardization (range (0 to 1) and normalization) 205 | 206 | classify_with_knn <- function( 207 | k = 21, 208 | standardization_method = c("range", "normalization") 209 | ) { 210 | 211 | # Create a recipe according to the chosen standardization method 212 | if (standardization_method == "range") { 213 | 214 | recipe_obj <- recipe( 215 | formula = diagnosis ~ ., 216 | data = wbcd_factored_tbl 217 | ) %>% 218 | step_range( 219 | all_numeric_predictors(), 220 | min = 0, 221 | max = 1) 222 | 223 | } else if (standardization_method == "normalization") { 224 | 225 | recipe_obj <- recipe( 226 | formula = diagnosis ~ ., 227 | data = wbcd_factored_tbl 228 | ) %>% 229 | step_normalize(all_numeric_predictors()) 230 | 231 | } else { 232 | 233 | stop('Choose a standardization method that is either "range" or "normalization"!') 234 | 235 | } 236 | 237 | wbcd_normalized_tbl <- recipe_obj %>% 238 | prep() %>% 239 | bake(new_data = wbcd_factored_tbl) 240 | 241 | # Create training and test data 242 | wbcd_split <- initial_split( 243 | wbcd_normalized_tbl, 244 | prop = 469 / 569 245 | ) 246 | wbcd_train <- training(wbcd_split) 247 | wbcd_test <- testing(wbcd_split) 248 | 249 | # Create model specification 250 | model_spec <- nearest_neighbor( 251 | engine = "kknn", 252 | mode = "classification", 253 | neighbors = k 254 | ) %>% 255 | translate() 256 | 257 | # Fit the model 258 | model_fit <- fit( 259 | model_spec, 260 | diagnosis ~ ., 261 | wbcd_train 262 | ) 263 | 264 | # Add the predictions to the test tibble 265 | wbcd_test_with_pred_tbl <- augment(model_fit, wbcd_test) 266 | 267 | # Create a confusion matrix 268 | conf_mat <- conf_mat( 269 | data = wbcd_test_with_pred_tbl, 270 | truth = diagnosis, 271 | estimate = .pred_class 272 | ) 273 | 274 | # Print the confusion matrix 275 | conf_mat %>% autoplot(type = "heatmap") 276 | } 277 | 278 | ### Test the function ---- 279 | classify_with_knn( 280 | standardization_method = "range", 281 | k = 5 282 | ) 283 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/regression_methods/predicting_medical_expenses_with_linear_regression.r: -------------------------------------------------------------------------------- 1 | # Predicting Medical Expenses with Linear Regression ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 6: 4 | # Forecasting Numeric Data - Regression Methods. 5 | # 6 | # The original code is made with base R and {psych}. I wanted to see how one 7 | # could recreate it using mainly {tidymodels} and {tidyverse}. 8 | # 9 | # You can find the original code and the dataset here: 10 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter06 11 | 12 | library(conflicted) 13 | library(tidyverse) 14 | library(corrr) 15 | library(tidymodels) 16 | library(GGally) 17 | 18 | 19 | ## 2. 
Exploring and preparing the data ---- 20 | insurance_tbl <- read_csv("machine_learning_with_r_3rd_ed/regression_methods/data/insurance.csv") 21 | glimpse(insurance_tbl) 22 | 23 | ### Summarize the data ---- 24 | insurance_tbl %>% 25 | mutate(across(where(is.character), as_factor)) %>% 26 | summary() 27 | 28 | ### Histogram of insurance charges ---- 29 | insurance_tbl %>% 30 | ggplot(aes(expenses)) + 31 | geom_histogram(binwidth = 5000) 32 | 33 | ### Distribution between regions ---- 34 | insurance_tbl %>% 35 | count(region) %>% 36 | mutate(pct = (n / sum(n) * 100)) 37 | 38 | ### Exploring relationships among features: correlation matrix ---- 39 | insurance_tbl %>% 40 | select(c("age", "bmi", "children", "expenses")) %>% 41 | as.matrix() %>% 42 | correlate() 43 | 44 | ### Visualizing relationships among features: scatterplot matrix ---- 45 | insurance_tbl %>% 46 | select(c("age", "bmi", "children", "expenses")) %>% 47 | ggpairs() 48 | 49 | 50 | ## 3. Training a model on the data ---- 51 | 52 | ### Create a recipe ---- 53 | recipe_obj <- recipe( 54 | expenses ~ ., 55 | data = insurance_tbl 56 | ) 57 | recipe_obj 58 | 59 | insurance_baked_tbl <- recipe_obj %>% 60 | prep() %>% 61 | bake(new_data = NULL) 62 | insurance_baked_tbl 63 | 64 | ### Model specification ---- 65 | model_spec <- linear_reg( 66 | mode = "regression", 67 | engine = "lm" 68 | ) %>% 69 | translate() 70 | model_spec 71 | 72 | ### Fit the model ---- 73 | model_fit <- fit( 74 | model_spec, 75 | expenses ~ ., 76 | insurance_baked_tbl 77 | ) 78 | 79 | ### See the estimated beta coefficients ---- 80 | model_fit 81 | 82 | 83 | ## 4. Evaluating model performance ---- 84 | summary(model_fit$fit) 85 | 86 | 87 | ## 5. Improving model performance ---- 88 | 89 | ### Add a higher-order "age" term ---- 90 | insurance_augmented_tbl <- insurance_baked_tbl %>% 91 | mutate(age2 = age ^ 2, 92 | 93 | ### Add an indicator for BMI >= 30 ---- 94 | bmi30 = 95 | case_when( 96 | bmi >= 30 ~ 1, 97 | TRUE ~ 0 98 | ) 99 | ) 100 | 101 | 102 | ### Create the final model ---- 103 | recipe_augmented_obj <- recipe( 104 | expenses ~ ., 105 | data = insurance_augmented_tbl 106 | ) %>% 107 | # This step is needed for the next step, 108 | # adding the interaction between bmi30 and smoker 109 | step_dummy(smoker) %>% 110 | # Dummy variables need to be specified with starts_with() 111 | step_interact(terms = ~ bmi30:starts_with("smoker")) 112 | recipe_augmented_obj 113 | 114 | insurance_baked_augmented_tbl <- recipe_augmented_obj %>% 115 | prep() %>% 116 | bake(new_data = NULL) 117 | insurance_baked_augmented_tbl 118 | 119 | ### Model specification ---- 120 | model_spec_augmented <- linear_reg( 121 | mode = "regression", 122 | engine = "lm" 123 | ) %>% 124 | translate() 125 | model_spec_augmented 126 | 127 | ### Fit the model ---- 128 | model_fit_augmented <- fit( 129 | model_spec_augmented, 130 | expenses ~ ., 131 | insurance_baked_augmented_tbl 132 | ) 133 | 134 | ### See the estimated beta coefficients ---- 135 | model_fit_augmented 136 | 137 | ### Evaluate model performance ---- 138 | summary(model_fit_augmented$fit) 139 | 140 | ### Make predictions with the regression model (you could skip this step) ---- 141 | insurance_pred_tbl <- predict( 142 | object = model_fit_augmented, 143 | new_data = insurance_baked_augmented_tbl, 144 | type = "numeric" 145 | ) 146 | insurance_pred_tbl 147 | 148 | ### Add the predictions to the tibble ---- 149 | insurance_pred_tbl <- augment(model_fit_augmented, insurance_baked_augmented_tbl) 150 | insurance_pred_tbl 151 | 152 | ### 
See the correlation between the actual and predicted expenses ---- 153 | insurance_pred_tbl %>% 154 | select(.pred, expenses) %>% 155 | correlate() 156 | 157 | # Or with the more simple vectorized version: 158 | cor(insurance_pred_tbl$.pred, insurance_pred_tbl$expenses) 159 | 160 | ### Visualize the correlation ---- 161 | insurance_pred_tbl %>% 162 | ggplot(aes(.pred, expenses)) + 163 | geom_point() + 164 | geom_abline( 165 | intercept = 0, 166 | slope = 1, 167 | color = "red", 168 | size = 0.5, 169 | linetype = "dashed" 170 | ) + 171 | labs( 172 | x = "Predicted Expenses", 173 | y = "Actual Expenses" 174 | ) 175 | 176 | ### See the model metrics ---- 177 | insurance_model_metrics <- insurance_pred_tbl %>% 178 | metrics( 179 | truth = expenses, 180 | estimate = .pred 181 | ) 182 | insurance_model_metrics 183 | 184 | 185 | ### Make predictions with new data ---- 186 | predict( 187 | model_fit_augmented, 188 | tibble( 189 | age = 30, 190 | age2 = 30^2, 191 | bmi = 30, 192 | bmi30 = 1, 193 | bmi30_x_smoker_yes = 0, 194 | children = 2, 195 | region = "northeast", 196 | sex = "male", 197 | smoker_yes = 0 198 | ) 199 | ) 200 | 201 | predict( 202 | model_fit_augmented, 203 | tibble( 204 | age = 30, 205 | age2 = 30^2, 206 | bmi = 30, 207 | bmi30 = 1, 208 | bmi30_x_smoker_yes = 0, 209 | children = 2, 210 | region = "northeast", 211 | sex = "female", 212 | smoker_yes = 0 213 | ) 214 | ) 215 | 216 | predict( 217 | model_fit_augmented, 218 | tibble( 219 | age = 30, 220 | age2 = 30^2, 221 | bmi = 30, 222 | bmi30 = 1, 223 | bmi30_x_smoker_yes = 0, 224 | children = 0, 225 | region = "northeast", 226 | sex = "female", 227 | smoker_yes = 0 228 | ) 229 | ) 230 | 231 | 232 | ## 6. Creating a function to help evaluate the model further ---- 233 | 234 | # The assumption here is that you have already gone through steps 1. to 5. 235 | # You will see the predicted medical expenses, if you enter the following 236 | # parameters: age, BMI, number of children, region, sex and whether you smoke 237 | # or not. 
238 | 239 | predict_medical_expenses <- function( 240 | .age, 241 | .bmi, 242 | .children, 243 | .region = c("northeast", "northwest", "southeast", "southwest"), 244 | .sex = c("female", "male"), 245 | .smoker = c("no", "yes") 246 | ) { 247 | 248 | .bmi30 <- if_else( 249 | condition = .bmi >= 30, 250 | true = 1, 251 | false = 0, 252 | missing = NULL 253 | ) 254 | 255 | .smoker_yes <- if_else( 256 | condition = .smoker == "yes", 257 | true = 1, 258 | false = 0, 259 | missing = NULL 260 | ) 261 | 262 | .bmi30_x_smoker_yes <- if_else( 263 | condition = .bmi30 == TRUE && .smoker_yes == TRUE, 264 | true = 1, 265 | false = 0, 266 | missing = NULL 267 | ) 268 | 269 | .prediction <- predict( 270 | model_fit_augmented, 271 | tibble( 272 | age = .age, 273 | age2 = .age ^ 2, 274 | bmi = .bmi, 275 | bmi30 = .bmi30, 276 | bmi30_x_smoker_yes = .bmi30_x_smoker_yes, 277 | children = .children, 278 | region = .region, 279 | sex = .sex, 280 | smoker_yes = .smoker_yes 281 | ) 282 | ) 283 | 284 | str_glue("The predicted medical expenses according to the given parameters are: ${.prediction %>% round(0)}") 285 | 286 | } 287 | 288 | ### Test the function ---- 289 | predict_medical_expenses( 290 | .age = 30, 291 | .bmi = 30, 292 | .children = 0, 293 | .region = "northeast", 294 | .sex = "female", 295 | .smoker = "no" 296 | ) 297 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/Chapter_15.r: -------------------------------------------------------------------------------- 1 | ##### Chapter 15: Making Use of Big Data ------------------- 2 | 3 | ## Classifying images using a pre-trained CNN in R ---- 4 | ## IMPORTANT: some of the following steps are only necessary one time. 5 | 6 | # this package allows you to install packages from GitHub 7 | library(devtools) 8 | 9 | # install the latest tensorflow package 10 | # devtools::install_github("rstudio/tensorflow") # only necessary the first time! 11 | 12 | # install all of the dependencies for tensorflow (e.g., Python) 13 | library(tensorflow) 14 | # install_tensorflow() # only necessary the first time! 15 | 16 | # install the latest keras package 17 | # devtools::install_github("rstudio/keras") # only necessary the first time! 
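# Added note (an assumption, not part of the book's code): if the Python
# backend is still missing after the installs above, the keras helper below
# can set it up once as well.
# library(keras)
# install_keras() # only necessary the first time!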
18 | 19 | # Begin the Example 20 | # load the ResNet50 model with weights trained on the ImageNet 21 | library(keras) 22 | m_resnet50 <- application_resnet50(weights = 'imagenet') 23 | 24 | # load the image and convert to array 25 | img <- image_load("ice_cream.jpg", target_size = c(224,224)) 26 | x <- image_to_array(img) 27 | 28 | # x is now a 3d tensor: y, x, and color channel (r, g, b) 29 | # note: (y, x) begins counting with (1, 1) in top-left and (1, 224) in top-right 30 | dim(x) 31 | str(x) 32 | 33 | # example pixels 34 | x[1, 224, 1:3] # a white pixel in the upper-right 35 | x[40, 145, 1:3] # a red pixel in the ice cream 36 | 37 | # create a 4-dimension tensor with a constant value of '1' in the first dimension 38 | # (the first dimension is the 'batch' and is used for multiple images) 39 | x <- array_reshape(x, c(1, dim(x))) 40 | dim(x) # shows the new dimensions 41 | 42 | # do some color conversions and zero-centering as pre-processing for ResNet-50 43 | x <- imagenet_preprocess_input(x) 44 | x[1, 40, 145, 1:3] # look at the red ice cream pixel again 45 | 46 | # use the ResNet-50 model to make a prediction for the image data 47 | p_resnet50 <- predict(m_resnet50, x) 48 | c_resnet50 <- imagenet_decode_predictions(p_resnet50, top = 10) 49 | 50 | # see the predictions 51 | c_resnet50 52 | 53 | # lapply the image processing steps on the other two images 54 | img_list <- list("cat.jpg", "pizza.jpg") 55 | img_data <- lapply(img_list, image_load, target_size = c(224,224)) 56 | img_arrs <- lapply(img_data, image_to_array) 57 | img_resh <- lapply(img_arrs, array_reshape, c(1, 224, 224, 3)) 58 | img_prep <- lapply(img_resh, imagenet_preprocess_input) 59 | img_prob <- lapply(img_prep, predict, object = m_resnet50) 60 | 61 | # sapply the decode function to get the final predictions 62 | img_classes <- sapply(img_prob, imagenet_decode_predictions, top = 3) 63 | img_classes 64 | 65 | ## Using word2vec for understanding text in R ---- 66 | 67 | # note: the code in this section requires the word embedding that was trained 68 | # on the Google News archive. Download the GoogleNews-vectors-negative300.bin.gz 69 | # file from https://code.google.com/archive/p/word2vec/ and unzip it to your R 70 | # project folder before proceeding. 71 | 72 | library(word2vec) 73 | 74 | # load the Google-trained 300-dimension word2vec embedding 75 | m_w2v <- read.word2vec(file = "GoogleNews-vectors-negative300.bin", 76 | normalize = TRUE) 77 | 78 | # examine the structure of the model 79 | str(m_w2v) 80 | 81 | # obtain the vector for a few terms 82 | foods <- predict(m_w2v, c("cereal", "bacon", "eggs", "sandwich", "salad", "steak", "spaghetti"), type = "embedding") 83 | meals <- predict(m_w2v, c("breakfast", "lunch", "dinner"), type = "embedding") 84 | 85 | # examine a single word vector 86 | head(foods["cereal", ]) 87 | 88 | # examine the first few columns 89 | foods[, 1:5] 90 | 91 | # compute the similarity between the foods and meals 92 | word2vec_similarity(foods, meals) 93 | 94 | # can also use cosine similarity (not shown in book) 95 | word2vec_similarity(foods, meals, type = "cosine") 96 | 97 | # create vector of hypothetical social media posts 98 | user_posts = c( 99 | "I eat bacon and eggs in the morning for the most important meal of the day!", 100 | "I am going to grab a quick sandwich this afternoon before hitting the gym.", 101 | "Can anyone provide restaurant recommendations for my date tonight?" 
102 | ) 103 | 104 | library(tm) # use tm package for removeWords() and stopwords() functions 105 | 106 | user_posts_clean <- removeWords(user_posts, stopwords()) 107 | user_posts_clean <- txt_clean_word2vec(user_posts_clean) 108 | user_posts_clean[1] # look at the first cleaned user post 109 | 110 | # get the doc2vec vectors for the user posts 111 | post_vectors <- doc2vec(m_w2v, user_posts_clean) 112 | str(post_vectors) 113 | 114 | # get the word2vec vectors for the meal terms 115 | meals <- predict(m_w2v, c("breakfast", "lunch", "dinner"), type = "embedding") 116 | 117 | # compare the similarity of the posts and terms 118 | word2vec_similarity(post_vectors, meals) 119 | 120 | ## Visualizing highly dimensional data ---- 121 | 122 | library(tidyverse) 123 | sns_terms <- read_csv("snsdata.csv") |> 124 | select(basketball:drugs) 125 | 126 | # find the first two principal components 127 | library(irlba) 128 | set.seed(123456) 129 | sns_pca <- sns_terms |> 130 | prcomp_irlba(n = 2, center = TRUE, scale = TRUE) 131 | 132 | # create a scatterplot 133 | library(ggplot2) 134 | as.data.frame(sns_pca$x) |> 135 | ggplot(aes(PC1, PC2)) + geom_point(size = 1, shape = 1) 136 | 137 | ## Visualizing data's natural clusters with t-SNE ---- 138 | 139 | # take a random sample of 5,000 users 140 | library(tidyverse) 141 | set.seed(123) 142 | sns_sample <- read_csv("snsdata.csv") |> 143 | slice_sample(n = 5000) 144 | 145 | # run t-SNE with default parameters 146 | library(Rtsne) 147 | set.seed(123) 148 | sns_tsne <- sns_sample |> 149 | select(basketball:drugs) |> 150 | Rtsne(check_duplicates = FALSE) 151 | 152 | # visualize the t-SNE result 153 | library(ggplot2) 154 | data.frame(sns_tsne$Y) |> 155 | ggplot(aes(X1, X2)) + geom_point(size = 2, shape = 1) 156 | 157 | # create a categorical feature for the number of terms used 158 | sns_sample_tsne <- sns_sample |> 159 | bind_cols(data.frame(sns_tsne$Y)) |> # add the t-SNE data 160 | rowwise() |> # work across rows rather than columns 161 | mutate(n_terms = sum(c_across(basketball:drugs))) |> 162 | ungroup() |> # remove rowwise behavior 163 | mutate(`Terms Used` = if_else(n_terms > 0, "1+", "0")) 164 | 165 | # visualize the t-SNE result by number of terms used 166 | sns_sample_tsne |> 167 | ggplot(aes(X1, X2, shape = `Terms Used`, color = `Terms Used`)) + 168 | geom_point(size = 2) + 169 | scale_shape(solid = FALSE) 170 | 171 | ## working with SQL databases ---- 172 | 173 | # a tidy approach to database connections 174 | library(DBI) 175 | library(RSQLite) 176 | 177 | # creates a connection to the credit SQLite database 178 | con <- dbConnect(RSQLite::SQLite(), "credit.sqlite3") 179 | dbListTables(con) 180 | 181 | # create a data frame from the result 182 | res <- dbSendQuery(con, "SELECT * FROM credit WHERE age >= 45") 183 | credit_age45 <- dbFetch(res) 184 | summary(credit_age45$age) 185 | 186 | # disconnect from the database 187 | dbClearResult(res) 188 | dbDisconnect(con) 189 | 190 | # connecting to databases with the odbc package 191 | # (note: this is an example for illustration only and will need to be modified for your DB) 192 | library(DBI) 193 | con <- dbConnect(odbc::odbc(), "my_data_source_name") 194 | 195 | library(DBI) 196 | con <- dbConnect(odbc::odbc(), 197 | database = "my_database", 198 | uid = "my_username", 199 | pwd = "my_password", 200 | host = "my.server.address", 201 | port = 1234) 202 | 203 | # using a database backend with dplyr 204 | library(DBI) 205 | library(dplyr) 206 | con <- dbConnect(RSQLite::SQLite(), "credit.sqlite3") 207 | 
credit_tbl <- con |> tbl("credit") 208 | 209 | # compute summary statistics on age values filtered from the database 210 | credit_tbl |> 211 | filter(age >= 45) |> 212 | select(age) |> 213 | collect() |> 214 | summary() 215 | 216 | # show the average loan amount by loan default status, age 45+ 217 | credit_tbl |> 218 | filter(age >= 45) |> 219 | group_by(default) |> 220 | summarize(mean_amount = avg(amount)) 221 | 222 | # show the SQL used for the prior analysis 223 | credit_tbl |> 224 | filter(age >= 45) |> 225 | group_by(default) |> 226 | summarize(mean_amount = avg(amount)) |> 227 | show_query() 228 | 229 | ## Measuring execution time ---- 230 | 231 | system.time(rnorm(1000000)) 232 | 233 | ## Working in parallel ---- 234 | 235 | library(parallel) 236 | detectCores() 237 | 238 | # note: the following will only work on non-Windows systems (i.e., MacOSX or Unix/Linux) 239 | # you will also need enough cores to complete each task! 240 | 241 | # random number generation using multicore 242 | # one core 243 | system.time(l1 <- unlist(mclapply(1:10, function(x) { 244 | rnorm(10000000)}, mc.cores = 1))) 245 | 246 | # two cores 247 | system.time(l2 <- unlist(mclapply(1:10, function(x) { 248 | rnorm(10000000)}, mc.cores = 2))) 249 | 250 | # four cores 251 | system.time(l4 <- unlist(mclapply(1:10, function(x) { 252 | rnorm(10000000) }, mc.cores = 4))) 253 | 254 | # eight cores 255 | system.time(l8 <- unlist(mclapply(1:10, function(x) { 256 | rnorm(10000000) }, mc.cores = 8))) 257 | 258 | # creating a 4-node cluster with snow 259 | cl1 <- makeCluster(4) 260 | 261 | # confirm that the cluster is functioning 262 | clusterCall(cl1, function() { Sys.info()["nodename"] }) 263 | 264 | # running the same function on each node (not shown in book) 265 | clusterCall(cl1, function() { print("ready!") }) 266 | 267 | # running a different operation on each node 268 | clusterApply(cl1, c('A', 'B', 'C', 'D'), 269 | function(x) { paste("Cluster", x, "ready!") }) 270 | 271 | # close the cluster (IMPORTANT STEP!) 
272 | stopCluster(cl1) 273 | 274 | ## Parallel loops with foreach ---- 275 | 276 | # generate 100 million random numbers 277 | system.time(l1 <- rnorm(100000000)) 278 | 279 | # combine four sets of 25 million random numbers 280 | library(foreach) 281 | system.time(l4 <- foreach(i = 1:4, .combine = 'c') 282 | %do% rnorm(25000000)) 283 | 284 | # confirm the number of cores 285 | detectCores() 286 | 287 | # parallel the above foreach loop 288 | library(doParallel) 289 | registerDoParallel(cores = 4) 290 | system.time(l4p <- foreach(i = 1:4, .combine = 'c') 291 | %dopar% rnorm(25000000)) 292 | 293 | stopImplicitCluster() 294 | 295 | ## Parallel processing with caret ---- 296 | 297 | # training a random forest without allowing parallel computing 298 | library(caret) 299 | credit <- read.csv("credit.csv") 300 | system.time(train(default ~ ., data = credit, method = "rf", 301 | trControl = trainControl(allowParallel = FALSE))) 302 | 303 | # training the same random forest in parallel (8 cores) 304 | library(doParallel) 305 | registerDoParallel(cores = 8) 306 | system.time(train(default ~ ., data = credit, method = "rf")) 307 | stopImplicitCluster() 308 | 309 | ## Parallel cloud computing with Apache Spark 310 | 311 | library(sparklyr) 312 | spark_install() # only need to run this the first time using Spark 313 | spark_cluster <- spark_connect(master = "local") 314 | 315 | credit_spark <- spark_read_csv(spark_cluster, "credit.csv") 316 | 317 | # split the credit_spark dataset into training and testing 318 | splits <- sdf_random_split(credit_spark, 319 | train = 0.75, test = 0.25, 320 | seed = 123) 321 | 322 | # build a random forest model using Spark 323 | credit_rf <- splits$train |> 324 | ml_random_forest(default ~ .) 325 | 326 | # make predictions on the test set 327 | pred <- ml_predict(credit_rf, splits$test) 328 | 329 | ml_binary_classification_evaluator(pred, metric_name = "areaUnderROC") 330 | 331 | spark_disconnect(spark_cluster) 332 | 333 | ## Faster modeling with h2o 334 | 335 | library(h2o) 336 | h2o_instance <- h2o.init() 337 | credit.hex <- h2o.uploadFile("credit.csv") 338 | 339 | h2o.randomForest(y = "default", 340 | training_frame = credit.hex, 341 | ntrees = 500, 342 | seed = 123) 343 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/Chapter_13.r: -------------------------------------------------------------------------------- 1 | ##### Chapter 13: Challenging Data -------------------- 2 | 3 | ## Stepwise regression ---- 4 | 5 | # read the data and do some simple data preparation 6 | library(tidyverse) 7 | titanic_train <- read_csv("titanic_train.csv") |> 8 | mutate( 9 | Age_MVI = if_else(is.na(Age), 1, 0), 10 | Age = if_else(is.na(Age), mean(Age, na.rm = TRUE), Age), 11 | Cabin = if_else(is.na(Cabin), "X", Cabin), 12 | Embarked = factor(if_else(is.na(Embarked), "X", Embarked)), 13 | Sex = factor(Sex) 14 | ) 15 | 16 | # specify the simplest logistic regression model 17 | simple_model <- glm(Survived ~ 1, family = binomial, data = titanic_train) 18 | 19 | # specify the full logistic regression model 20 | full_model <- glm(Survived ~ Age + Age_MVI + Embarked + Sex + Pclass + SibSp + Fare, 21 | family = binomial, data = titanic_train) 22 | 23 | # forward stepwise regression 24 | sw_forward <- stats::step(simple_model, scope = formula(full_model), 25 | direction = "forward") 26 | 27 | # obtain the formula for the final model 28 | formula(sw_forward) 29 | 30 | # the final model's regression coefficients 31 | 
sw_forward$coefficients 32 | 33 | # backward stepwise 34 | sw_backward <- stats::step(full_model, direction = "backward") 35 | 36 | ## Feature selection with Boruta ---- 37 | 38 | set.seed(12345) # set the random seed to ensure results match 39 | # create a feature with random values to demonstrate a useless feature 40 | titanic_train$rand_vals <- runif(n = 891, min = 1, max = 100) 41 | 42 | # run Boruta on the Titanic dataset (this can take a long time for larger datasets) 43 | library(Boruta) 44 | titanic_boruta <- Boruta(Survived ~ PassengerId + Age + 45 | Sex + Pclass + SibSp + rand_vals, 46 | data = titanic_train, doTrace = 1) 47 | # check the result 48 | titanic_boruta 49 | 50 | # plot the feature importance 51 | plot(titanic_boruta) 52 | 53 | ## Principal Component Analysis (PCA) ---- 54 | 55 | library(tidyverse) # load the tidyverse suite of packages 56 | 57 | sns_data <- read_csv("snsdata.csv") # read the teenage social media data 58 | 59 | # select only the 36 columns from the column named 'basketball' through the one named 'drugs' 60 | # each column contains the count of times each social media profile used the respective term 61 | sns_terms <- sns_data |> select(basketball:drugs) 62 | 63 | # the irlba library provides a more efficient PCA function than R's built-in prcomp() 64 | library(irlba) 65 | 66 | # run the PCA - note that we center and re-scale the data here 67 | set.seed(2023) # to ensure the results match the book 68 | sns_pca <- sns_terms |> 69 | prcomp_irlba(n = 10, center = TRUE, scale = TRUE) # find first 10 principal components of the SNS data 70 | 71 | # create scree plot of the SNS data PCA 72 | screeplot(sns_pca, npcs = 10, type = "lines", 73 | main = "Scree Plot of SNS Data Principal Components") 74 | 75 | # use summary to see the components and the proportion of variance explained 76 | summary(sns_pca) 77 | 78 | # examine the PCA object -- we care most about the $x and $rotation components 79 | str(sns_pca) 80 | 81 | # the $x component is the transformed version of the original data 82 | str(sns_pca$x) 83 | 84 | # the $x is our original data transformed to have new "features" -- the principal components 85 | nrow(sns_pca$x) # should have 30,000 rows 86 | head(sns_pca$x) # should have ten columns 87 | 88 | # create a "long" version of the PCA dataset for visualization 89 | sns_pca_long <- tibble(SNS_Term = colnames(sns_terms), as_tibble(sns_pca$rotation)) |> # add the row labels 90 | pivot_longer(PC1:PC10, names_to = "PC", values_to = "Contribution") # go from a wide to long dataset 91 | 92 | # use ggplot to visualize the terms that are important for PC4 93 | library(ggplot2) 94 | 95 | sns_pca_long |> 96 | filter(PC == "PC3") |> 97 | top_n(15, abs(Contribution)) |> 98 | mutate(SNS_Term = reorder(SNS_Term, Contribution)) |> 99 | ggplot(aes(SNS_Term, Contribution, fill = SNS_Term)) + 100 | geom_col(show.legend = FALSE, alpha = 0.8) + 101 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), 102 | axis.ticks.x = element_blank()) + 103 | labs(x = "Social Media Term", 104 | y = "Relative Importance to Principal Component", 105 | title = "Top 15 Contributors to PC3") 106 | 107 | # create a function to visualize the four other components 108 | plot_pca <- function(component) { 109 | sns_pca_long |> 110 | filter(PC == component) |> 111 | top_n(15, abs(Contribution)) |> 112 | mutate(SNS_Term = reorder(SNS_Term, Contribution)) |> 113 | ggplot(aes(SNS_Term, Contribution, fill = SNS_Term)) + 114 | geom_col(show.legend = FALSE, alpha = 0.8) + 115 | 
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), 116 | axis.ticks.x = element_blank()) + 117 | labs(x = "Social Media Term", 118 | y = "Relative Importance to Principal Component", 119 | title = paste("Top 15 Contributors to", component)) 120 | } 121 | 122 | # use the function 123 | plot_pca("PC1") 124 | plot_pca("PC2") 125 | plot_pca("PC4") 126 | plot_pca("PC5") 127 | 128 | # we can use the principal components to predict number of friends 129 | sns_data_pca <- cbind(sns_data[1:4], sns_pca$x) # join the principal components to the original data 130 | 131 | # create a linear regression model predicting friends from the principal components 132 | m <- lm(friends ~ PC1 + PC2 + PC3 + PC4 + PC5, data = sns_data_pca) 133 | 134 | m # show the model coefficients 135 | 136 | ## Remapping sparse categorical data ---- 137 | 138 | library(tidyverse) 139 | 140 | # read the Titanic dataset and create Title feature (from Chapter 12) 141 | titanic_train <- read_csv("titanic_train.csv") |> 142 | mutate(Title = str_extract(Name, ", [A-z]+\\.")) |> 143 | mutate(Title = str_replace_all(Title, "[, \\.]", "")) 144 | 145 | # the Title feature has a large number of categories 146 | table(titanic_train$Title, useNA = "ifany") 147 | 148 | # group categories with similar real-world meaning 149 | titanic_train <- titanic_train |> 150 | mutate(TitleGroup = fct_collapse(Title, 151 | Mr = "Mr", 152 | Mrs = "Mrs", 153 | Master = "Master", 154 | Miss = c("Miss", "Mlle", "Mme", "Ms"), 155 | Noble = c("Don", "Sir", "Jonkheer", "Lady"), 156 | Military = c("Capt", "Col", "Major"), 157 | Doctor = "Dr", 158 | Clergy = "Rev", 159 | other_level = "Other") 160 | ) |> 161 | mutate(TitleGroup = fct_na_value_to_level(TitleGroup, 162 | level = "Unknown")) 163 | 164 | # examine the recoding 165 | table(titanic_train$TitleGroup) 166 | 167 | # look at the counts and proportions of all levels, sorted largest to smallest 168 | fct_count(titanic_train$Title, sort = TRUE, prop = TRUE) 169 | 170 | # lump together everything outside of the top three levels 171 | table(fct_lump_n(titanic_train$Title, n = 3)) 172 | 173 | # lump together everything with less than 1% 174 | table(fct_lump_prop(titanic_train$Title, prop = 0.01)) 175 | 176 | # lump together everything with fewer than 5 observations 177 | table(fct_lump_min(titanic_train$Title, min = 5)) 178 | 179 | ## Binning sparse numeric data ---- 180 | 181 | # examine the Titanic fare data 182 | head(titanic_train$Fare) 183 | summary(titanic_train$Fare) 184 | 185 | # create a binary variable for first/second class 186 | titanic_train <- titanic_train |> mutate( 187 | fare_firstclass = if_else(Fare >= 31, 1, 0, missing = 0) 188 | ) 189 | 190 | # tabulate the binary values 191 | table(titanic_train$fare_firstclass) 192 | 193 | # create a three-level feature using case_when() 194 | titanic_train <- titanic_train |> 195 | mutate( 196 | fare_class = case_when( 197 | Fare >= 31 ~ "1st Class", 198 | Fare >= 15 ~ "2nd Class", 199 | TRUE ~ "3rd Class" 200 | ) 201 | ) 202 | 203 | # examine the result 204 | table(titanic_train$fare_class) 205 | 206 | # the cut() function can accomplish the same as the above case_when() 207 | table(cut(titanic_train$Fare, breaks = c(-Inf, 15, 31, Inf), 208 | right = FALSE)) 209 | 210 | # use cut() with seq() to generate evenly-sized break points 211 | table(cut(titanic_train$Fare, right = FALSE, 212 | breaks = seq(from = 0, to = 550, by = 50))) 213 | 214 | # use cut() with quantiles() and seq() to create bins with equal numbers of examples 215 | 
table(cut(titanic_train$Fare, right = FALSE, 216 | breaks = quantile(titanic_train$Fare, 217 | probs = seq(0, 1, 0.20)))) 218 | 219 | # use the tidyverse ntile() function to create five bins 220 | table(ntile(titanic_train$Fare, n = 5)) 221 | 222 | # convert the ntile() groups to a factor 223 | titanic_train <- titanic_train |> 224 | mutate(fare_level = factor(ntile(Fare, n = 11))) 225 | 226 | table(titanic_train$fare_level) 227 | 228 | ## Performing missing value imputation ---- 229 | 230 | library(readr) 231 | titanic_train <- read_csv("titanic_train.csv") 232 | 233 | # impute arbitrary text strings for missing categorical data 234 | titanic_train <- titanic_train |> 235 | mutate( 236 | Cabin = if_else(is.na(Cabin), "X", Cabin), 237 | Embarked = if_else(is.na(Embarked), "Unknown", Embarked) 238 | ) 239 | 240 | # impute mean value and create missing value indicator for age 241 | titanic_train <- titanic_train |> 242 | mutate( 243 | Age_MVI = if_else(is.na(Age), 1, 0), 244 | Age = if_else(is.na(Age), mean(Age, na.rm = TRUE), Age) 245 | ) 246 | 247 | ## Simple strategies for rebalancing data ---- 248 | 249 | # load and prepare the teenage social media data 250 | library(tidyverse) 251 | 252 | snsdata <- read_csv("snsdata.csv") |> 253 | mutate( 254 | gender = fct_recode(gender, Female = "F", Male = "M"), 255 | gender = fct_na_value_to_level(gender, level = "Unknown"), 256 | age = ifelse(age < 13 | age > 20, NA, age) # replace age outliers 257 | ) |> 258 | group_by(gradyear) |> 259 | mutate(age_imp = if_else(is.na(age), median(age, na.rm = TRUE), age)) |> 260 | ungroup() |> 261 | select(gender, friends, gradyear, age_imp, basketball:drugs) 262 | 263 | # examine the initial class imbalance 264 | fct_count(snsdata$gender, prop = TRUE) 265 | 266 | # undersample the majority classes 267 | library(caret) 268 | sns_undersample <- downSample(x = snsdata[2:40], y = snsdata$gender, yname = "gender") 269 | fct_count(sns_undersample$gender, prop = TRUE) 270 | 271 | # oversample the minority classes 272 | library(caret) 273 | sns_oversample <- upSample(x = snsdata[2:40], y = snsdata$gender, yname = "gender") 274 | fct_count(sns_oversample$gender, prop = TRUE) 275 | 276 | ## Generating a synthetic balanced dataset with SMOTE ---- 277 | 278 | # create a gender balanced dataset using SMOTE 279 | library(themis) 280 | sns_balanced <- snsdata |> smote("gender") # simple syntax (without normalization) 281 | 282 | # check that the dataset is now gender balanced 283 | table(sns_balanced$gender) 284 | 285 | # for a better SMOTE, create a normalize function (introduced in Chapter 3) 286 | normalize <- function(x) { 287 | return ((x - min(x)) / (max(x) - min(x))) 288 | } 289 | 290 | # an unnormalize function returns data to original scale (introduced in Chapter 7) 291 | unnormalize <- function(norm_vals, col_name) { 292 | old_vals <- snsdata[col_name] 293 | unnormalized_vals <- norm_vals * (max(old_vals) - min(old_vals)) + min(old_vals) 294 | 295 | # round all columns to integers except age_imp 296 | rounded_vals <- if(col_name != "age_imp") { round(unnormalized_vals) } 297 | else {unnormalized_vals} 298 | 299 | return (rounded_vals) 300 | } 301 | 302 | # more advanced smote() process with normalized data 303 | snsdata_balanced <- snsdata |> 304 | mutate(across(where(is.numeric), normalize)) |> # normalize the numeric data 305 | smote("gender") |> 306 | mutate(across(where(is.numeric), ~unnormalize(.x, cur_column()))) # unnormalize the data 307 | 308 | # confirm that the rebalanced dataset worked correctly 309 | 
table(snsdata$gender) 310 | table(snsdata_balanced$gender) 311 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_02_managing_and_understanding_data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 2: Managing and Understanding Data" 3 | author: "Original Code: Brett Lantz | Modifications: Antti Rask" 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | # 2 Managing and Understanding Data 13 | 14 | ## Load Packages 15 | 16 | ```{r} 17 | library(conflicted) # An Alternative Conflict Resolution Strategy 18 | conflicts_prefer(dplyr::filter) 19 | conflicts_prefer(janitor::chisq.test) 20 | library(crosstable) # Crosstables for Descriptive Analyses 21 | library(janitor) # Simple Tools for Examining and Cleaning Dirty Data 22 | library(tidyverse) # Easily Install and Load the 'Tidyverse' 23 | ``` 24 | 25 | ## Data 26 | 27 | ```{r} 28 | # create tibble of data for three medical patients 29 | patients_tbl <- tibble( 30 | subject_name = c("John Doe", "Jane Doe", "Steve Graves"), 31 | temperature = c(98.1, 98.6, 101.4), 32 | flu_status = c(FALSE, FALSE, TRUE), 33 | gender = factor(c("MALE", "FEMALE", "MALE")), 34 | blood = factor( 35 | c("O", "AB", "A"), 36 | levels = c("A", "B", "AB", "O") 37 | ), 38 | symptoms = factor( 39 | c("SEVERE", "MILD", "MODERATE"), 40 | levels = c("MILD", "MODERATE", "SEVERE"), 41 | ordered = TRUE 42 | ) 43 | ) 44 | 45 | patients_tbl 46 | ``` 47 | 48 | ## R data structures 49 | 50 | ### Vectors -> Tibble 51 | 52 | ```{r} 53 | # access the second body temperature 54 | patients_tbl %>% 55 | slice(2) %>% 56 | pull(temperature) 57 | ``` 58 | 59 | ```{r} 60 | # access body temperature for the second and third patient 61 | patients_tbl %>% 62 | slice(2:3) %>% 63 | pull(temperature) 64 | ``` 65 | 66 | ```{r} 67 | # access body temperature for all but the second patient 68 | patients_tbl %>% 69 | slice(-2) %>% 70 | pull(temperature) 71 | ``` 72 | 73 | ```{r} 74 | # Use a vector to indicate whether to include item 75 | patients_tbl %>% 76 | filter(c(TRUE, TRUE, FALSE)) %>% 77 | pull(temperature) 78 | ``` 79 | 80 | ```{r} 81 | # filter rows by conditions 82 | patients_tbl %>% 83 | filter(temperature > 100) %>% 84 | pull(subject_name) 85 | ``` 86 | 87 | ### Factors 88 | 89 | ```{r} 90 | # check gender factors 91 | patients_tbl %>% 92 | pull(gender) 93 | ``` 94 | 95 | ```{r} 96 | # check blood type factors 97 | patients_tbl %>% 98 | pull(blood) 99 | ``` 100 | 101 | ```{r} 102 | # check symptom factors 103 | patients_tbl %>% 104 | pull(symptoms) 105 | ``` 106 | 107 | ```{r} 108 | # check for symptoms greater than moderate 109 | patients_tbl %>% 110 | mutate(symptoms_severe = symptoms > "MODERATE") %>% 111 | pull(symptoms_severe) 112 | ``` 113 | 114 | ### Lists/Tibble 115 | 116 | ```{r} 117 | # display information for a patient 118 | patients_tbl %>% 119 | slice(1) 120 | ``` 121 | 122 | ```{r} 123 | # convert the tibble to a list 124 | patient_1_list <- patients_tbl %>% 125 | slice(1) %>% 126 | as.list() 127 | 128 | patient_1_list 129 | ``` 130 | 131 | #### Methods for Accessing a List 132 | 133 | ```{r} 134 | # get a single list value by position (returns a sub-list) 135 | patient_1_list[2] 136 | ``` 137 | 138 | ```{r} 139 | # get a single list value by position (returns a numeric vector) 140 | 
patient_1_list %>% 141 | pluck(2) 142 | ``` 143 | 144 | ```{r} 145 | # get a single list value by name 146 | patient_1_list %>% 147 | pluck("temperature") 148 | ``` 149 | 150 | ```{r} 151 | # get several list items by specifying a vector of names 152 | patient_1_list[c("temperature", "flu_status")] 153 | ``` 154 | #### Access a List Like a Vector 155 | 156 | ```{r} 157 | # get values 2 and 3 158 | patient_1_list[2:3] 159 | ``` 160 | 161 | ### Tibble 162 | 163 | #### Accessing a tibble 164 | 165 | ```{r} 166 | # display the data tibble 167 | patients_tbl 168 | ``` 169 | 170 | ```{r} 171 | # get a single column 172 | patients_tbl %>% 173 | pull(subject_name) 174 | ``` 175 | 176 | ```{r} 177 | # get several columns by specifying a vector of names 178 | patients_tbl %>% 179 | select(c("temperature", "flu_status")) 180 | ``` 181 | 182 | ```{r} 183 | # this is the same as above, extracting temperature and flu_status 184 | patients_tbl %>% 185 | select(2:3) 186 | ``` 187 | 188 | ```{r} 189 | # accessing by row and column 190 | patients_tbl %>% 191 | slice(c(1, 3)) %>% 192 | select(c(2, 4)) 193 | ``` 194 | 195 | #### Extract All Rows or Columns 196 | 197 | ```{r} 198 | # column 1, all rows 199 | patients_tbl %>% 200 | pull(1) 201 | ``` 202 | 203 | ```{r} 204 | # row 1, all columns 205 | patients_tbl %>% 206 | slice(1) 207 | ``` 208 | 209 | ```{r} 210 | # all rows and all columns 211 | patients_tbl 212 | ``` 213 | 214 | ```{r} 215 | # the following are equivalent 216 | patients_tbl %>% 217 | slice(c(1, 3)) %>% 218 | select(c(temperature, gender)) 219 | 220 | patients_tbl %>% 221 | slice(-2) %>% 222 | select(c(-1, -3, -5, -6)) 223 | ``` 224 | 225 | ```{r} 226 | # creating a Celsius temperature column, then comparing before and after 227 | patients_tbl %>% 228 | mutate(temp_c = (temperature - 32) * (5 / 9)) %>% 229 | select(temperature, temp_c) 230 | ``` 231 | 232 | ### Matrixes 233 | 234 | ```{r} 235 | # create a 2x2 matrix 236 | matrix(c(1, 2, 3, 4), nrow = 2) 237 | ``` 238 | 239 | ```{r} 240 | # equivalent to the above 241 | matrix(c(1, 2, 3, 4), ncol = 2) 242 | ``` 243 | 244 | ```{r} 245 | # create a 2x3 matrix 246 | matrix(c(1, 2, 3, 4, 5, 6), nrow = 2) 247 | ``` 248 | 249 | ```{r} 250 | # create a 3x2 matrix 251 | m <- matrix(c(1, 2, 3, 4, 5, 6), ncol = 2) 252 | m 253 | ``` 254 | 255 | ```{r} 256 | # extract values from matrixes 257 | m[1, 1] 258 | m[3, 2] 259 | ``` 260 | 261 | ```{r} 262 | # extract rows 263 | m[1, ] 264 | ``` 265 | 266 | ```{r} 267 | # extract columns 268 | m[, 1] 269 | ``` 270 | 271 | ## Managing data with R 272 | 273 | #### Saving, Loading, and Removing R Data Structures 274 | 275 | ```{r} 276 | # show all data structures in memory 277 | ls() 278 | ``` 279 | 280 | ```{r} 281 | # remove the m and patient_1_list objects 282 | rm(m, patient_1_list) 283 | ls() 284 | ``` 285 | 286 | ```{r} 287 | # remove all (remaining) objects 288 | rm(list = ls()) 289 | ls() 290 | ``` 291 | 292 | ## Importing and saving datasets from CSV files 293 | 294 | ```{r} 295 | # reading a CSV file 296 | patients_tbl <- read_csv("data/pt_data.csv") 297 | ``` 298 | 299 | ```{r} 300 | # reading a CSV file and converting all character columns to factors 301 | patients_factors_tbl <- read_csv( 302 | "data/pt_data.csv", 303 | col_types = "cdlff", 304 | show_col_types = TRUE 305 | ) 306 | ``` 307 | 308 | ```{r} 309 | # alternatively reading a CSV file and converting all character columns to factors 310 | patients_factors_tbl <- read_csv("data/pt_data.csv") %>% 311 | mutate(across(where(is.character), 
as.factor)) 312 | 313 | patients_factors_tbl 314 | ``` 315 | 316 | ## Exploring and understanding data 317 | 318 | ### Data Exploration Example Using Used Car Data 319 | 320 | ```{r} 321 | usedcars <- read_csv("data/usedcars.csv") 322 | ``` 323 | 324 | ```{r} 325 | # get structure of used car data 326 | usedcars %>% 327 | str() 328 | ``` 329 | 330 | ### Exploring numeric variables 331 | 332 | ```{r} 333 | # summarize numeric variables 334 | usedcars %>% 335 | select(year, price, mileage) %>% 336 | summary() 337 | ``` 338 | 339 | ```{r} 340 | # calculate the mean income 341 | (36000 + 44000 + 56000) / 3 342 | 343 | c(36000, 44000, 56000) %>% 344 | mean() 345 | ``` 346 | 347 | ```{r} 348 | # the median income 349 | c(36000, 44000, 56000) %>% 350 | median() 351 | ``` 352 | 353 | ```{r} 354 | # the min/max of used car prices 355 | usedcars %>% 356 | pull(price) %>% 357 | range() 358 | ``` 359 | 360 | ```{r} 361 | # the difference of the range 362 | usedcars %>% 363 | pull(price) %>% 364 | range() %>% 365 | diff() 366 | ``` 367 | 368 | ```{r} 369 | # IQR for used car prices 370 | usedcars %>% 371 | pull(price) %>% 372 | IQR() 373 | ``` 374 | 375 | ```{r} 376 | # use quantile to calculate five-number summary 377 | usedcars %>% 378 | pull(price) %>% 379 | quantile() 380 | ``` 381 | 382 | ```{r} 383 | # the 99th percentile 384 | usedcars %>% 385 | pull(price) %>% 386 | quantile( 387 | probs = c(0.01, 0.99) 388 | ) 389 | ``` 390 | 391 | ```{r} 392 | # quintiles 393 | usedcars %>% 394 | pull(price) %>% 395 | quantile( 396 | seq(from = 0, to = 1, by = 0.20) 397 | ) 398 | ``` 399 | 400 | ```{r} 401 | # boxplot of used car prices and mileage 402 | usedcars %>% 403 | ggplot(aes(y = price)) + 404 | geom_boxplot() + 405 | labs( 406 | title = "Boxplot of Used Car Prices", 407 | y = "Price ($)" 408 | ) + 409 | theme_classic() 410 | 411 | usedcars %>% 412 | ggplot(aes(y = mileage)) + 413 | geom_boxplot() + 414 | labs( 415 | title = "Boxplot of Used Car Mileage", 416 | y = "Odometer (mi.)" 417 | ) + 418 | theme_classic() 419 | ``` 420 | 421 | ```{r} 422 | # histograms of used car prices and mileage 423 | usedcars %>% 424 | ggplot(aes(price)) + 425 | geom_histogram( 426 | bins = 10, 427 | color = "black", 428 | fill = "white" 429 | ) + 430 | labs( 431 | title = "Histogram of Used Car Prices", 432 | x = "Price ($)" 433 | ) + 434 | theme_classic() 435 | 436 | usedcars %>% 437 | ggplot(aes(mileage)) + 438 | geom_histogram( 439 | bins = 8, 440 | color = "black", 441 | fill = "white" 442 | ) + 443 | labs( 444 | title = "Histogram of Used Car Mileage", 445 | x = "Odometer (mi.)" 446 | ) + 447 | theme_classic() 448 | ``` 449 | 450 | ```{r} 451 | # variance and standard deviation of the used car data 452 | usedcars %>% 453 | pull(price) %>% 454 | var() 455 | ``` 456 | 457 | ```{r} 458 | usedcars %>% 459 | pull(price) %>% 460 | sd() 461 | ``` 462 | 463 | ```{r} 464 | usedcars %>% 465 | pull(mileage) %>% 466 | var() 467 | ``` 468 | 469 | ```{r} 470 | usedcars %>% 471 | pull(mileage) %>% 472 | sd() 473 | ``` 474 | 475 | ### Exploring categorical features 476 | 477 | ```{r} 478 | # one-way tables for the used car data 479 | usedcars %>% 480 | tabyl(year) %>% 481 | adorn_pct_formatting(digits = 1) 482 | ``` 483 | 484 | ```{r} 485 | usedcars %>% 486 | tabyl(model) %>% 487 | adorn_pct_formatting(digits = 1) 488 | ``` 489 | 490 | ```{r} 491 | usedcars %>% 492 | tabyl(color) %>% 493 | adorn_pct_formatting(digits = 1) 494 | ``` 495 | 496 | ```{r} 497 | # you can also have a three-way table of the same 498 | usedcars %>% 499 | 
tabyl(year, model, color) 500 | ``` 501 | 502 | ### Exploring relationships between variables 503 | 504 | ```{r} 505 | # scatterplot of price vs. mileage 506 | usedcars %>% 507 | ggplot(aes(mileage, price)) + 508 | geom_point() + 509 | labs( 510 | title = "Scatterplot of Price vs. Mileage", 511 | x = "Used Car Odometer (mi.)", 512 | y = "Used Car Price ($)" 513 | ) + 514 | theme_classic() 515 | ``` 516 | 517 | ```{r} 518 | # new variable indicating conservative colors 519 | usedcars_conservative <- usedcars %>% 520 | mutate(conservative = color %in% c("Black", "Gray", "Silver", "White")) 521 | ``` 522 | 523 | ```{r} 524 | # checking our variable 525 | usedcars_conservative %>% 526 | tabyl(conservative) %>% 527 | adorn_pct_formatting(digits = 0) 528 | ``` 529 | 530 | ```{r} 531 | # crosstab of model by conservative 532 | usedcars_conservative %>% 533 | crosstable( 534 | cols = model, 535 | by = conservative, 536 | label = FALSE, 537 | total = "both", 538 | percent_pattern = "{n} ({p_row}/{p_col}/{p_tot})", 539 | percent_digits = 1 540 | ) %>% 541 | as_flextable(compact = TRUE) 542 | ``` 543 | 544 | ```{r} 545 | # compute the chi-square test 546 | usedcars_conservative %>% 547 | tabyl(model, conservative) %>% 548 | chisq.test() 549 | ``` 550 | 551 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/decision_trees_and_rules/identifying_risky_bank_loans_using_C5.0_decision_trees.r: -------------------------------------------------------------------------------- 1 | # Identifying Risky Bank Loans Using C5.0 Decision Trees ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, 4 | # Chapter 5: Divide and Conquer - Classification Using Decision Trees and Rules and 5 | # Chapter 10: Evaluating Model Performance 6 | # 7 | # The original code is made with {C50}, {gmodels}, {OneR} and {RWeka}. I 8 | # wanted to see how one could recreate it using mainly {tidymodels} and 9 | # {tidyverse}. 10 | # 11 | # You can find the original code and the slightly modified dataset here: 12 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter05 13 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter10 14 | 15 | ## 1. Loading libraries (in the order they get used) ---- 16 | library(conflicted) 17 | library(tidyverse) 18 | library(tidymodels) 19 | 20 | ## 2. Exploring and preparing the data ---- 21 | credit_tbl <- read_csv("machine_learning_with_r_3rd_ed/decision_trees_and_rules/data/credit.csv") 22 | 23 | ### Examine the structure of the credit data ---- 24 | glimpse(credit_tbl) 25 | 26 | ### look at two characteristics of the applicant ---- 27 | credit_tbl %>% 28 | count(checking_balance) %>% 29 | mutate(pct = (n / sum(n) * 100)) 30 | 31 | credit_tbl %>% 32 | count(savings_balance) %>% 33 | mutate(pct = (n / sum(n) * 100)) 34 | 35 | ### look at two characteristics of the loan ---- 36 | credit_tbl %>% 37 | select(months_loan_duration, amount) %>% 38 | summary() 39 | 40 | ### look at the class variable ---- 41 | credit_tbl %>% 42 | count(default) %>% 43 | mutate(pct = (n / sum(n) * 100)) 44 | 45 | 46 | ## 3. 
Creating the recipe and splitting the data ---- 47 | 48 | ### Convert strings to factors ---- 49 | recipe_obj <- recipe( 50 | default ~ ., 51 | data = credit_tbl 52 | ) %>% 53 | step_string2factor(all_nominal()) 54 | recipe_obj 55 | 56 | credit_factorized_tbl <- recipe_obj %>% 57 | prep() %>% 58 | bake(new_data = NULL) 59 | credit_factorized_tbl 60 | 61 | ### Create training and test data (randomly) ---- 62 | 63 | # Use set.seed to use the same random number sequence as the original 64 | RNGversion("3.5.2") 65 | set.seed(123) 66 | 67 | credit_split <- initial_split( 68 | credit_factorized_tbl, 69 | prop = 0.9 70 | ) 71 | credit_train <- training(credit_split) 72 | credit_test <- testing(credit_split) 73 | 74 | ### Check the proportion of class variable ---- 75 | credit_train %>% 76 | count(default) %>% 77 | mutate(pct = (n / sum(n) * 100)) 78 | 79 | credit_test %>% 80 | count(default) %>% 81 | mutate(pct = (n / sum(n) * 100)) 82 | 83 | 84 | ## 4. Training a model on the data ---- 85 | 86 | ### Model specification ---- 87 | model_spec <- decision_tree( 88 | mode = "classification", 89 | engine = "C5.0", 90 | cost_complexity = NULL, 91 | tree_depth = NULL, 92 | min_n = NULL 93 | ) %>% 94 | translate() 95 | model_spec 96 | 97 | ### Fit the model ---- 98 | model_fit <- fit( 99 | model_spec, 100 | default ~ ., 101 | credit_train 102 | ) 103 | model_fit 104 | 105 | model_fit %>% 106 | extract_fit_engine() %>% 107 | summary() 108 | 109 | ### Make the predictions (you could skip this step) ---- 110 | credit_test_pred <- predict( 111 | object = model_fit, 112 | new_data = credit_test, 113 | type = "class" 114 | ) 115 | credit_test_pred 116 | 117 | ### Add the predictions to the test tibble ---- 118 | credit_test_with_pred_tbl <- augment(model_fit, credit_test) 119 | credit_test_with_pred_tbl 120 | 121 | 122 | ## 5. Evaluating model performance ---- 123 | 124 | ### Create a confusion matrix ---- 125 | conf_mat <- conf_mat( 126 | data = credit_test_with_pred_tbl, 127 | truth = default, 128 | estimate = .pred_class 129 | ) 130 | conf_mat 131 | 132 | ### Visualize the confusion matrix ---- 133 | conf_mat %>% autoplot(type = "heatmap") 134 | conf_mat %>% autoplot(type = "mosaic") 135 | 136 | ### Visualize the ROC curve ---- 137 | credit_test_with_pred_tbl %>% 138 | roc_curve( 139 | truth = default, 140 | estimate = .pred_no 141 | ) %>% 142 | autoplot() 143 | 144 | ### Calculate the ROC AUC (area under the curve) ---- 145 | credit_roc_auc <- credit_test_with_pred_tbl %>% 146 | roc_auc( 147 | truth = default, 148 | estimate = .pred_no 149 | ) 150 | credit_roc_auc 151 | 152 | ### Put together other model metrics ---- 153 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 154 | classification_metrics <- conf_mat( 155 | credit_test_with_pred_tbl, 156 | truth = default, 157 | estimate = .pred_class 158 | ) %>% 159 | summary() 160 | classification_metrics 161 | 162 | 163 | ## 6. 
Improving model performance ---- 164 | 165 | ### Boost the decision tree with 10 trials ---- 166 | model_spec_boost_tree <- boost_tree( 167 | mode = "classification", 168 | engine = "C5.0", 169 | trees = 10, 170 | min_n = NULL, 171 | sample_size = NULL 172 | ) %>% 173 | translate() 174 | model_spec_boost_tree 175 | 176 | ### Fit the model ---- 177 | model_fit_boost_tree <- fit( 178 | model_spec_boost_tree, 179 | default ~ ., 180 | credit_train 181 | ) 182 | model_fit_boost_tree 183 | 184 | model_fit_boost_tree %>% 185 | extract_fit_engine() %>% 186 | summary() 187 | 188 | ### Make the predictions (you could skip this step) ---- 189 | credit_test_pred_boost_tree <- predict( 190 | object = model_fit_boost_tree, 191 | new_data = credit_test, 192 | type = "class" 193 | ) 194 | credit_test_pred_boost_tree 195 | 196 | ### Add the predictions to the test tibble ---- 197 | credit_test_with_pred_boost_tree <- augment(model_fit_boost_tree, credit_test) 198 | credit_test_with_pred_boost_tree 199 | 200 | ### Create a confusion matrix ---- 201 | conf_mat_boost_tree <- conf_mat( 202 | data = credit_test_with_pred_boost_tree, 203 | truth = default, 204 | estimate = .pred_class 205 | ) 206 | conf_mat_boost_tree 207 | 208 | ### Visualize the confusion matrix ---- 209 | conf_mat_boost_tree %>% autoplot(type = "heatmap") 210 | conf_mat_boost_tree %>% autoplot(type = "mosaic") 211 | 212 | ### Visualize the ROC curve ---- 213 | credit_test_with_pred_boost_tree %>% 214 | roc_curve( 215 | truth = default, 216 | estimate = .pred_no 217 | ) %>% 218 | autoplot() 219 | 220 | ### Calculate the ROC AUC (area under the curve) ---- 221 | credit_roc_auc_boost_tree <- credit_test_with_pred_boost_tree %>% 222 | roc_auc( 223 | truth = default, 224 | estimate = .pred_no 225 | ) 226 | credit_roc_auc_boost_tree 227 | 228 | ### Put together other model metrics ---- 229 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 230 | classification_metrics_boost_tree <- conf_mat( 231 | credit_test_with_pred_boost_tree, 232 | truth = default, 233 | estimate = .pred_class 234 | ) %>% 235 | summary() 236 | classification_metrics_boost_tree 237 | 238 | 239 | ## 7. Creating a function to help evaluate the model further ---- 240 | 241 | # The assumption here is that you have already gone through steps 1. to 2. 242 | # What we're potentially tuning here are the arguments .tree_depth and .min_n 243 | # for decision_tree, and .trees and .min_n for boost_tree. 
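# As a quick illustration (a hypothetical call using the helper defined just
# below, with argument values of my own choosing that mirror the 10-trial boost
# from step 6), a boosted variant could be evaluated with something like:
# classify_with_c5_trees(.model = "boost_tree", .trees = 10, .min_n = 1)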
244 | 245 | classify_with_c5_trees <- function( 246 | .model = c("decision_tree", "boost_tree"), 247 | .mode = "classification", 248 | .engine = "C5.0", 249 | .tree_depth = NULL, # for decision_tree 250 | .trees = NULL, # for boost_tree 251 | .min_n = 1 # for both 252 | ) { 253 | 254 | # Create the recipe 255 | recipe_obj <- recipe( 256 | default ~ ., 257 | data = credit_tbl 258 | ) %>% 259 | step_string2factor(all_nominal()) 260 | 261 | credit_factorized_tbl <- recipe_obj %>% 262 | prep() %>% 263 | bake(new_data = NULL) 264 | 265 | # Create training and test data (randomly) 266 | RNGversion("3.5.2") 267 | set.seed(123) 268 | 269 | credit_split <- initial_split( 270 | credit_factorized_tbl, 271 | prop = 0.9 272 | ) 273 | credit_train <- training(credit_split) 274 | credit_test <- testing(credit_split) 275 | 276 | # Model specification 277 | model <- .model 278 | 279 | if (model == "decision_tree") { 280 | 281 | model_spec <- decision_tree( 282 | mode = .mode, 283 | engine = .engine, 284 | tree_depth = .tree_depth, 285 | min_n = .min_n 286 | ) %>% 287 | translate() 288 | 289 | } else if (model == "boost_tree") { 290 | 291 | model_spec <- boost_tree( 292 | mode = .mode, 293 | engine = .engine, 294 | trees = .trees, 295 | min_n = .min_n 296 | ) %>% 297 | translate() 298 | 299 | } else { 300 | 301 | stop("The model needs to be either decision_tree or boost_tree!") 302 | 303 | } 304 | 305 | # Fit the model 306 | model_fit <- fit( 307 | model_spec, 308 | default ~ ., 309 | credit_train 310 | ) 311 | 312 | # Add the predictions to the test tibble 313 | credit_test_with_pred_tbl <- augment(model_fit, credit_test) 314 | credit_test_with_pred_tbl 315 | 316 | # Create a confusion matrix 317 | conf_mat <- conf_mat( 318 | data = credit_test_with_pred_tbl, 319 | truth = default, 320 | estimate = .pred_class 321 | ) 322 | 323 | conf_mat %>% autoplot(type = "heatmap") 324 | 325 | } 326 | 327 | ### Test the function ---- 328 | classify_with_c5_trees( 329 | .model = "decision_tree", 330 | .mode = "classification", 331 | .engine = "C5.0", 332 | .tree_depth = NULL, # for decision_tree 333 | .trees = NULL, # for boost_tree 334 | .min_n = 1 # for both, NULL produces error, so > 1 is adviced 335 | ) 336 | 337 | 338 | ## 8. 
Cross-validation ---- 339 | # You might want to restart R (Ctrl + Shift + F10) at this point so you have a 340 | # clean slate 341 | 342 | ### Load libraries ---- 343 | library(conflicted) 344 | library(tidyverse) 345 | library(tidymodels) 346 | 347 | ### Load the data ---- 348 | credit_tbl <- read_csv("machine_learning_with_r_3rd_ed/decision_trees_and_rules/data/credit.csv") 349 | 350 | ### Create the train-test split ---- 351 | RNGversion("3.5.2") 352 | set.seed(123) 353 | 354 | credit_split <- initial_split( 355 | credit_tbl, 356 | prop = 0.9 357 | ) 358 | credit_train <- training(credit_split) 359 | credit_test <- testing(credit_split) 360 | 361 | ### Create the cross-validation folds ---- 362 | set.seed(345) 363 | folds_train <- vfold_cv(credit_train, v = 10) 364 | folds_train 365 | 366 | ### Create recipe, model specification and control ---- 367 | recipe_train <- recipe( 368 | default ~ ., 369 | data = credit_train 370 | ) %>% 371 | step_string2factor(all_nominal()) 372 | 373 | model_spec <- decision_tree( 374 | mode = "classification", 375 | engine = "C5.0", 376 | cost_complexity = NULL, 377 | tree_depth = NULL, 378 | min_n = NULL 379 | ) %>% 380 | translate() 381 | 382 | control <- control_resamples(save_pred = TRUE) 383 | 384 | ### Fit the samples ---- 385 | spline_res_train <- fit_resamples( 386 | object = model_spec, 387 | preprocessor = recipe_train, 388 | resamples = folds_train, 389 | control = control 390 | ) 391 | 392 | ### Look at the summarized model metrics ---- 393 | spline_res_train %>% 394 | collect_metrics() 395 | 396 | ### Look at the individual model metrics ---- 397 | spline_res_train %>% 398 | collect_metrics(summarize = FALSE) %>% 399 | ggplot( 400 | aes( 401 | x = .estimate, 402 | fill = .metric 403 | ) 404 | ) + 405 | geom_histogram() 406 | -------------------------------------------------------------------------------- /machine_learning_with_r_4th_ed/chapter_07_black-box_methods_neural_networks_and_support_vector_machines.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 7: Black-Box Methods - Neural Networks and Support Vector Machines" 3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask' 4 | date: "2023-07-23" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Loading Packages 13 | 14 | ```{r} 15 | library(conflicted) # An Alternative Conflict Resolution Strategy 16 | library(corrr) # Correlations in R 17 | library(janitor) # Simple Tools for Examining and Cleaning Dirty Data 18 | library(tidymodels) # Easily Install and Load the 'Tidymodels' Packages 19 | library(tidyverse) # Easily Install and Load the 'Tidyverse' 20 | ``` 21 | 22 | ## Part 1: Neural Networks 23 | 24 | ### Exploring and preparing the data 25 | 26 | #### Read in data and examine structure 27 | 28 | ```{r} 29 | concrete_tbl <- read_csv("data/concrete.csv") 30 | ``` 31 | 32 | ```{r} 33 | concrete_tbl %>% 34 | glimpse() 35 | ``` 36 | 37 | #### Check the minimum and maximum strength 38 | 39 | ```{r} 40 | concrete_tbl %>% 41 | pull(strength) %>% 42 | summary() 43 | ``` 44 | 45 | ### Creating the recipe 46 | 47 | #### Apply normalization to the numeric predictors 48 | 49 | ```{r} 50 | recipe_obj <- recipe( 51 | strength ~ ., 52 | data = concrete_tbl 53 | ) %>% 54 | step_range( 55 | all_numeric_predictors(), 56 | min = 0, 57 | max = 1 58 | ) 59 | 60 | recipe_obj 61 | ``` 62 | 63 | ```{r} 
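# note: prep() estimates the recipe's parameters from the data (for step_range,
# each predictor's observed min and max) and bake() then applies them,
# returning the normalized tibble used below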
64 | concrete_normalized_tbl <- recipe_obj %>% 65 | prep() %>% 66 | bake(new_data = NULL) 67 | 68 | concrete_normalized_tbl 69 | ``` 70 | 71 | #### Create training and test data 72 | 73 | ```{r} 74 | concrete_split <- initial_time_split( 75 | concrete_normalized_tbl, 76 | prop = 773 / 1030 77 | ) 78 | 79 | concrete_train <- training(concrete_split) 80 | concrete_test <- testing(concrete_split) 81 | ``` 82 | 83 | ### Training a model on the data 84 | 85 | nnet is the engine (needs to be installed if not already): 86 | install.packages("nnet") 87 | 88 | It is used as the engine for {parsnip}'s mlp() function. And since we are predicting strength, we choose regression as the mode. 89 | 90 | #### Create model specification 91 | 92 | ```{r} 93 | set.seed(12345) 94 | 95 | model_spec_nnet <- mlp( 96 | engine = "nnet", 97 | mode = "regression", 98 | hidden_units = 1, 99 | penalty = 0, 100 | dropout = 0, 101 | epochs = 100, 102 | activation = NULL, 103 | learn_rate = NULL 104 | ) %>% 105 | translate() 106 | 107 | model_spec_nnet 108 | ``` 109 | 110 | #### Fit the model 111 | 112 | ```{r} 113 | model_fit_nnet <- model_spec_nnet %>% 114 | fit( 115 | strength ~ ., 116 | concrete_train 117 | ) 118 | 119 | model_fit_nnet 120 | ``` 121 | 122 | #### Take a closer look at the model 123 | 124 | ```{r} 125 | model_fit_nnet$fit %>% 126 | summary() 127 | ``` 128 | 129 | #### Make the predictions (you could skip this step) 130 | 131 | ```{r} 132 | concrete_test_pred <- model_fit_nnet %>% 133 | predict( 134 | new_data = concrete_test, 135 | type = "numeric" 136 | ) 137 | 138 | concrete_test_pred 139 | ``` 140 | 141 | #### Add the predictions to the test tibble 142 | 143 | ```{r} 144 | concrete_test_with_pred <- augment(model_fit_nnet, concrete_test) 145 | concrete_test_with_pred 146 | ``` 147 | 148 | #### Metrics 149 | 150 | ```{r} 151 | concrete_test_with_pred %>% 152 | metrics(strength, .pred) 153 | ``` 154 | 155 | #### Visualize the network topology 156 | 157 | Currently looking for a method to do that with tidymodels. 
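One possible workaround in the meantime (a suggestion of mine, not something from the book or tidymodels itself) might be to plot the underlying **nnet** object with the NeuralNetTools package:

```{r}
# a minimal sketch, assuming the NeuralNetTools package has been installed:
# install.packages("NeuralNetTools")
library(NeuralNetTools)

# plotnet() draws the topology of the underlying nnet fit
model_fit_nnet$fit %>%
    plotnet()
```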
158 | 159 | ### Evaluating model performance 160 | 161 | #### Examine the correlation between predicted and actual values 162 | 163 | ```{r} 164 | concrete_test_with_pred %>% 165 | select(.pred, strength) %>% 166 | correlate() 167 | ``` 168 | 169 | A simpler alternative: 170 | 171 | ```{r} 172 | cor( 173 | concrete_test_with_pred$.pred, 174 | concrete_test_with_pred$strength 175 | ) 176 | ``` 177 | 178 | ### Improving model performance with two hidden layers and custom activation function 179 | 180 | #### Create model specification 181 | 182 | ```{r} 183 | set.seed(12345) 184 | model_spec_nnet_2 <- mlp( 185 | engine = "nnet", 186 | mode = "regression", 187 | hidden_units = 5, 188 | penalty = 0.1, 189 | epochs = 100, 190 | ) %>% 191 | translate() 192 | 193 | model_spec_nnet_2 194 | ``` 195 | 196 | ### Fit the model ---- 197 | 198 | ```{r} 199 | model_fit_nnet_2 <- model_spec_nnet_2 %>% 200 | fit( 201 | strength ~ ., 202 | concrete_train 203 | ) 204 | 205 | model_fit_nnet_2 206 | ``` 207 | 208 | #### Take a closer look at the model 209 | 210 | ```{r} 211 | model_fit_nnet_2$fit %>% 212 | summary() 213 | ``` 214 | 215 | #### Make the predictions (you could skip this step) 216 | 217 | ```{r} 218 | concrete_test_pred_2 <- model_fit_nnet_2 %>% 219 | predict( 220 | new_data = concrete_test, 221 | type = "numeric" 222 | ) 223 | 224 | concrete_test_pred_2 225 | ``` 226 | 227 | #### Add the predictions to the test tibble 228 | 229 | ```{r} 230 | concrete_test_with_pred_2 <- augment(model_fit_nnet_2, concrete_test) 231 | concrete_test_with_pred_2 232 | ``` 233 | 234 | #### Metrics 235 | 236 | ```{r} 237 | concrete_test_with_pred_2 %>% 238 | metrics(strength, .pred) 239 | ``` 240 | 241 | #### Examine the correlation between predicted and actual values 242 | 243 | ```{r} 244 | concrete_test_with_pred_2 %>% 245 | select(.pred, strength) %>% 246 | correlate() 247 | ``` 248 | 249 | A simpler alternative: 250 | 251 | ```{r} 252 | cor( 253 | concrete_test_with_pred_2$.pred, 254 | concrete_test_with_pred_2$strength 255 | ) 256 | ``` 257 | 258 | ## Part 2: Support Vector Machines 259 | 260 | ### Exploring and preparing the data 261 | 262 | #### Read in data and examine structure 263 | 264 | ```{r} 265 | letters_tbl <- read_csv("data/letterdata.csv") %>% 266 | mutate(across(where(is.character), as.factor)) 267 | ``` 268 | 269 | ```{r} 270 | letters_tbl %>% 271 | glimpse() 272 | ``` 273 | 274 | ### Creating the recipe 275 | 276 | #### Apply normalization to entire data frame 277 | 278 | ```{r} 279 | recipe_obj <- recipe( 280 | letter ~ ., 281 | data = letters_tbl 282 | ) 283 | 284 | recipe_obj 285 | ``` 286 | 287 | ```{r} 288 | letters_baked_tbl <- recipe_obj %>% 289 | prep() %>% 290 | bake(new_data = NULL) 291 | 292 | letters_baked_tbl 293 | ``` 294 | 295 | #### Create training and test data 296 | 297 | ```{r} 298 | letters_split <- initial_time_split( 299 | letters_baked_tbl, 300 | prop = 16000 / 20000 301 | ) 302 | letters_train <- training(letters_split) 303 | letters_test <- testing(letters_split) 304 | ``` 305 | 306 | ### Training a model on the data 307 | 308 | #### Create model specification 309 | 310 | ```{r} 311 | model_spec_kernlab <- svm_linear( 312 | engine = "kernlab", 313 | mode = "classification", 314 | cost = NULL, 315 | margin = NULL 316 | ) %>% 317 | translate() 318 | 319 | model_spec_kernlab 320 | ``` 321 | 322 | #### Fit the model 323 | 324 | ```{r} 325 | model_fit_kernlab <- model_spec_kernlab %>% 326 | fit( 327 | letter ~ ., 328 | letters_train 329 | ) 330 | 331 | model_fit_kernlab 332 | 
``` 333 | 334 | #### Make the predictions (you could skip this step) 335 | 336 | ```{r} 337 | letters_test_pred <- model_fit_kernlab %>% 338 | predict(new_data = letters_test) 339 | 340 | letters_test_pred 341 | ``` 342 | 343 | #### Add the predictions to the test tibble 344 | 345 | ```{r} 346 | letters_test_with_pred <- augment(model_fit_kernlab, letters_test) 347 | letters_test_with_pred 348 | ``` 349 | 350 | ### Evaluating model performance 351 | 352 | #### Predictions on testing dataset 353 | 354 | ```{r} 355 | letters_test_with_pred %>% 356 | tabyl(letter, .pred_class) 357 | ``` 358 | 359 | #### Look only at agreement vs. non-agreement 360 | 361 | Construct a vector of TRUE/FALSE indicating correct/incorrect predictions 362 | 363 | ```{r} 364 | letters_test_with_pred %>% 365 | mutate( 366 | agreement = case_when( 367 | letter == .pred_class ~ TRUE, 368 | .default = FALSE 369 | ) 370 | ) %>% 371 | tabyl(agreement) %>% 372 | adorn_pct_formatting(digits = 1) 373 | ``` 374 | 375 | ### Improving model performance 376 | 377 | #### Change to a RBF kernel 378 | 379 | ```{r} 380 | model_spec_rbf <- svm_rbf( 381 | engine = "kernlab", 382 | mode = "classification", 383 | cost = NULL, 384 | margin = NULL, 385 | rbf_sigma = NULL 386 | ) %>% 387 | translate() 388 | 389 | model_spec_rbf 390 | ``` 391 | 392 | #### Fit the model 393 | 394 | ```{r} 395 | model_fit_rbf <- fit( 396 | model_spec_rbf, 397 | letter ~ ., 398 | letters_train 399 | ) 400 | 401 | model_fit_rbf 402 | ``` 403 | 404 | #### Make the predictions (you could skip this step) 405 | 406 | ```{r} 407 | letters_test_pred_rbf <- model_fit_rbf %>% 408 | predict(new_data = letters_test) 409 | 410 | letters_test_pred_rbf 411 | ``` 412 | 413 | #### Add the predictions to the test tibble 414 | 415 | ```{r} 416 | letters_test_with_pred_rbf <- augment(model_fit_rbf, letters_test) 417 | letters_test_with_pred_rbf 418 | ``` 419 | 420 | #### Predictions on testing dataset 421 | 422 | ```{r} 423 | letters_test_with_pred_rbf %>% 424 | tabyl(letter, .pred_class) 425 | ``` 426 | 427 | #### Look only at agreement vs. 
non-agreement 428 | 429 | Construct a vector of TRUE/FALSE indicating correct/incorrect predictions 430 | 431 | ```{r} 432 | letters_test_with_pred_rbf %>% 433 | mutate( 434 | agreement = case_when( 435 | letter == .pred_class ~ TRUE, 436 | .default = FALSE 437 | ) 438 | ) %>% 439 | tabyl(agreement) %>% 440 | adorn_pct_formatting(digits = 1) 441 | ``` 442 | 443 | #### Test various values of the cost parameter 444 | 445 | ```{r} 446 | cost_values <- c(1, seq(from = 5, to = 40, by = 5)) 447 | ``` 448 | 449 | ```{r} 450 | accuracy_values <- map_dbl(cost_values, function(x) { 451 | 452 | model_spec_rbf <- svm_rbf( 453 | engine = "kernlab", 454 | mode = "classification", 455 | cost = {{ x }}, 456 | margin = NULL, 457 | rbf_sigma = NULL 458 | ) %>% 459 | translate() 460 | 461 | model_fit_rbf <- fit( 462 | model_spec_rbf, 463 | letter ~ ., 464 | letters_train 465 | ) 466 | 467 | letters_test_pred_rbf <- model_fit_rbf %>% 468 | predict(new_data = letters_test) %>% 469 | as_vector() 470 | 471 | agree <- if_else(letters_test_pred_rbf == letters_test %>% pull(letter), 1, 0) 472 | 473 | accuracy <- sum(agree) / nrow(letters_test) 474 | 475 | return(accuracy) 476 | 477 | }) 478 | ``` 479 | 480 | #### Bind together the cost parameter and accuracy values 481 | 482 | ```{r} 483 | cost_vs_accuracy_tbl <- bind_cols( 484 | cost_values, 485 | accuracy_values, 486 | ) %>% 487 | rename( 488 | cost_values = ...1, 489 | accuracy_values = ...2 490 | ) 491 | 492 | cost_vs_accuracy_tbl 493 | ``` 494 | 495 | #### Visualize to find the optimal cost parameter value 496 | 497 | ```{r} 498 | cost_vs_accuracy_tbl %>% 499 | ggplot(aes(cost_values, accuracy_values)) + 500 | geom_line() + 501 | geom_point() + 502 | labs( 503 | x = "Cost Parameter Value", 504 | y = "Accuracy" 505 | ) + 506 | theme_classic() 507 | ``` 508 | 509 | #### Make sure you have the right optimal cost value for the best accuracy 510 | 511 | ```{r} 512 | cost_vs_accuracy_tbl %>% 513 | slice_max(accuracy_values) 514 | ``` 515 | 516 | #### Pull the first cost_value that has the max accuracy_value 517 | 518 | ```{r} 519 | .max_accuracy <- cost_vs_accuracy_tbl %>% 520 | slice_max(accuracy_values) %>% 521 | slice(1) %>% 522 | pull(cost_values) 523 | ``` 524 | 525 | ### Fitting the model with the optimal cost value (that was just pulled) 526 | 527 | #### Give model specification 528 | 529 | ```{r} 530 | model_spec_best <- svm_rbf( 531 | engine = "kernlab", 532 | mode = "classification", 533 | cost = {{ .max_accuracy }}, 534 | margin = NULL, 535 | rbf_sigma = NULL 536 | ) %>% 537 | translate() 538 | ``` 539 | 540 | #### Fit the model 541 | 542 | ```{r} 543 | model_fit_best <- fit( 544 | model_spec_best, 545 | letter ~ ., 546 | letters_train 547 | ) 548 | ``` 549 | 550 | #### Add the predictions to the test tibble 551 | 552 | ```{r} 553 | letters_test_with_pred_best <- augment(model_fit_best, letters_test) 554 | ``` 555 | 556 | #### Predictions on testing dataset 557 | 558 | ```{r} 559 | letters_test_with_pred_best %>% 560 | tabyl(letter, .pred_class) 561 | ``` 562 | 563 | #### Look only at agreement vs. 
non-agreement 564 | 565 | Construct a vector of TRUE/FALSE indicating correct/incorrect predictions 566 | 567 | ```{r} 568 | letters_test_with_pred_best %>% 569 | mutate( 570 | agreement = case_when( 571 | letter == .pred_class ~ TRUE, 572 | .default = FALSE 573 | ) 574 | ) %>% 575 | tabyl(agreement) %>% 576 | adorn_pct_formatting(digits = 1) 577 | ``` 578 | -------------------------------------------------------------------------------- /machine_learning_with_r_3rd_ed/naive_bayes/filtering-mobile-phone-spam-with-naive-bayes.r: -------------------------------------------------------------------------------- 1 | # Filtering Mobile Phone Spam with the Naive Bayes Algorithm ---- 2 | 3 | # Inspired by Brett Lantz's Machine Learning with R, 4 | # Chapter 4: Probabilistic Learning - Classification Using Naive Bayes and 5 | # Chapter 10: Evaluating Model Performance 6 | # 7 | # The original code is made with a lot of base R, {e1071} and {gmodels}. I 8 | # wanted to see how one could recreate it using mainly {textrecipes}, 9 | # {tidymodels}, {tidytext} and {tidyverse}. 10 | # 11 | # You can find the original code and the slightly modified dataset(s) here: 12 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter04 13 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter10 14 | 15 | ## 1. Loading libraries (in the order they get used) ---- 16 | library(conflicted) 17 | library(tidyverse) 18 | conflict_prefer("filter", "dplyr", "stats") 19 | library(tidytext) 20 | library(SnowballC) # for stemming 21 | library(wordcloud2) 22 | library(textrecipes) 23 | library(tidymodels) 24 | library(discrim) # for naive_Bayes() 25 | library(janitor) 26 | library(crosstable) 27 | 28 | 29 | ## 2. Exploring and preparing the data ---- 30 | 31 | ### Read the sms data into the sms tibble, convert spam/ham to factor ---- 32 | sms_tbl <- read_csv( 33 | "machine_learning_with_r_3rd_ed/naive_bayes/data/sms_spam.csv", 34 | col_types = "fc" 35 | ) %>% 36 | select(.type = type, everything()) 37 | 38 | ### Examine the structure of the sms data ---- 39 | glimpse(sms_tbl) 40 | 41 | ### Examine the distribution of spam/ham ---- 42 | sms_tbl %>% 43 | count(.type) %>% 44 | mutate(pct = (n / sum(n) * 100)) 45 | 46 | ### Build a corpus using the {tidytext} package instead of {tm} ---- 47 | 48 | # This part inspired by a blog post by Julia Silge: 49 | # https://www.tidyverse.org/blog/2020/11/tidymodels-sparse-support/ 50 | 51 | ### Add a row number ---- 52 | sms_row_numbers_tbl <- sms_tbl %>% 53 | mutate(line = row_number()) 54 | sms_row_numbers_tbl 55 | 56 | ### Manual preprocessing (just to see what it's all about) ---- 57 | tidy_sms_tbl <- sms_row_numbers_tbl %>% 58 | unnest_tokens(word, text) %>% 59 | count(line, word) %>% 60 | bind_tf_idf(word, line, n) 61 | tidy_sms_tbl 62 | 63 | wide_sms_tbl <- tidy_sms_tbl %>% 64 | dplyr::select(line, word, tf_idf) %>% 65 | pivot_wider( 66 | names_from = word, 67 | names_prefix = "word_", 68 | values_from = tf_idf, 69 | values_fill = 0 70 | ) 71 | wide_sms_tbl 72 | 73 | 74 | ## 3. 
Visualizing text data - word clouds ---- 75 | 76 | # This part inspired by Julia Silge & David Robinson's book Text Mining with R: 77 | # A Tidy Approach: https://www.tidytextmining.com/ 78 | 79 | ### Count word frequencies ---- 80 | frequency_tbl <- sms_row_numbers_tbl %>% 81 | 82 | # One word per one row 83 | unnest_tokens(word, text) %>% 84 | 85 | # Stemming 86 | mutate(word = wordStem(word)) %>% 87 | 88 | # Count the words 89 | count(.type, word) %>% 90 | 91 | # Count the proportion of words 92 | with_groups( 93 | .type, 94 | mutate, 95 | proportion = n / sum(n) 96 | ) %>% 97 | 98 | # Reorder the columns 99 | dplyr::select(-n) %>% 100 | pivot_wider(names_from = .type, values_from = proportion) %>% 101 | pivot_longer( 102 | cols = c("ham", "spam"), 103 | names_to = ".type", 104 | values_to = "freq" 105 | ) 106 | 107 | ### Subset the frequency data into two groups, spam and ham ---- 108 | spam_tbl <- frequency_tbl %>% 109 | filter(.type == "spam") %>% 110 | dplyr::select(-.type) %>% 111 | drop_na() 112 | 113 | ham_tbl <- frequency_tbl %>% 114 | filter(.type == "ham") %>% 115 | dplyr::select(-.type) %>% 116 | drop_na() 117 | 118 | ### Word cloud ---- 119 | 120 | # This part inspired by a blog post by Céline Van den Rul: 121 | # https://towardsdatascience.com/create-a-word-cloud-with-r-bde3e7422e8a 122 | 123 | # One for ham... 124 | wordcloud2( 125 | data = ham_tbl, 126 | size = 2, 127 | color = "random-dark" 128 | ) 129 | 130 | # ...and another for spam 131 | wordcloud2( 132 | data = spam_tbl, 133 | size = 2, 134 | color = "random-dark" 135 | ) 136 | 137 | 138 | ## 4. Creating the recipe and splitting the data ---- 139 | 140 | ### Create the recipe ---- 141 | text_recipe_obj <- recipe( 142 | .type ~ text, 143 | data = sms_row_numbers_tbl 144 | ) %>% 145 | step_tokenize(text) %>% 146 | step_stopwords(text) %>% 147 | step_tokenfilter(text, max_tokens = 1e3) %>% 148 | step_tfidf(text) 149 | text_recipe_obj 150 | 151 | # Bake it 152 | sms_baked_tbl <- text_recipe_obj %>% 153 | prep() %>% 154 | bake(new_data = NULL) 155 | 156 | # Simplify the tf-idf to yes/no 157 | sms_baked_longer_tbl <- sms_baked_tbl %>% 158 | mutate(across(where(is.numeric), 159 | ~ case_when( 160 | . > 0 ~ "Yes", 161 | TRUE ~ "No" 162 | ))) %>% 163 | # Rename the columns back to words 164 | rename_with( 165 | ~ tolower(gsub("tfidf_text_", "", .x)), 166 | .cols = starts_with("tfidf_text_") 167 | ) 168 | sms_baked_longer_tbl 169 | 170 | ### Create training and test data ---- 171 | # Not randomly, because the messages weren't in any particular order 172 | sms_split <- initial_time_split( 173 | sms_baked_longer_tbl, 174 | prop = 0.75 175 | ) 176 | sms_train <- training(sms_split) 177 | sms_test <- testing(sms_split) 178 | 179 | 180 | ## 5. Training a model on the data ---- 181 | 182 | # naivebayes is the engine (needs to be installed if not already): 183 | # install.packages("discrim") AND 184 | # install.packages("naivebayes") 185 | 186 | # It is used as the engine for {parsnip}'s naive_Bayes() function. And since 187 | # we are classifying, that is the mode we choose. 188 | 189 | # The simple reason is I couldn't get klaR (the other engine) to work. If you 190 | # know how, please comment on GitHub. It would be great to get to test what the 191 | # difference between the two engines are. 
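# For reference, an untested sketch (not verified to run here, hence left
# commented out) of what the klaR specification might look like, mirroring the
# naivebayes specification below:
#
# model_spec_klar <- naive_Bayes(
#     engine = "klaR",
#     mode = "classification",
#     smoothness = NULL,
#     Laplace = NULL
# ) %>%
#     translate()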
192 | 193 | ### Model specification ---- 194 | model_spec <- naive_Bayes( 195 | engine = "naivebayes", 196 | mode = "classification", 197 | smoothness = NULL, 198 | Laplace = NULL 199 | ) %>% 200 | translate() 201 | model_spec 202 | 203 | ### Fit the model ---- 204 | model_fit <- fit( 205 | model_spec, 206 | .type ~ ., 207 | sms_train 208 | ) 209 | model_fit 210 | 211 | ### Make the predictions (you could skip this step) ---- 212 | sms_test_pred <- predict( 213 | object = model_fit, 214 | new_data = sms_test, 215 | type = "class" 216 | ) 217 | sms_test_pred 218 | 219 | ### Add the predictions to the test tibble ---- 220 | sms_test_with_pred_tbl <- augment(model_fit, sms_test) 221 | sms_test_with_pred_tbl 222 | 223 | 224 | ## 6. Evaluating model performance ---- 225 | 226 | ### Create a confusion matrix ---- 227 | conf_mat <- conf_mat( 228 | data = sms_test_with_pred_tbl, 229 | truth = .type, 230 | estimate = .pred_class 231 | ) 232 | conf_mat 233 | 234 | ### Visualize the confusion matrix ---- 235 | conf_mat %>% autoplot(type = "heatmap") 236 | conf_mat %>% autoplot(type = "mosaic") 237 | 238 | ### Visualize the ROC curve ---- 239 | sms_test_with_pred_tbl %>% 240 | roc_curve( 241 | truth = .type, 242 | estimate = .pred_ham 243 | ) %>% 244 | autoplot() 245 | 246 | ### Calculate the ROC AUC (area under the curve) ---- 247 | sms_roc_auc <- sms_test_with_pred_tbl %>% 248 | roc_auc( 249 | truth = .type, 250 | estimate = .pred_ham 251 | ) 252 | sms_roc_auc 253 | 254 | 255 | ## 7. Compare to k-NN ---- 256 | 257 | ### Read the results ---- 258 | sms_results_knn <- read_csv("machine_learning_with_r_3rd_ed/naive_bayes/data/sms_results_knn.csv") %>% 259 | mutate(p_ham = 1 - p_spam) 260 | 261 | ### Naive Bayes vs k-NN - ROC curve ---- 262 | 263 | # Naive Bayes 264 | sms_test_with_pred_tbl %>% 265 | roc_curve( 266 | truth = .type, 267 | estimate = .pred_ham 268 | ) %>% 269 | autoplot() 270 | 271 | # k-NN 272 | sms_test_with_pred_tbl %>% 273 | mutate(.knn_p_ham = sms_results_knn$p_ham) %>% 274 | roc_curve( 275 | truth = .type, 276 | estimate = .knn_p_ham 277 | ) %>% 278 | autoplot() 279 | 280 | ### Naive Bayes vs k-NN - AUC ---- 281 | 282 | # Naive Bayes 283 | sms_test_with_pred_tbl %>% 284 | roc_auc( 285 | truth = .type, 286 | estimate = .pred_ham 287 | ) 288 | 289 | # k-NN 290 | sms_test_with_pred_tbl %>% 291 | mutate(.knn_p_ham = sms_results_knn$p_ham) %>% 292 | roc_auc( 293 | truth = .type, 294 | estimate = .knn_p_ham 295 | ) 296 | 297 | ### Put together other model metrics ---- 298 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 299 | classification_metrics <- conf_mat( 300 | sms_test_with_pred_tbl, 301 | truth = .type, 302 | estimate = .pred_class 303 | ) %>% 304 | summary() 305 | classification_metrics 306 | 307 | 308 | ## 8. 
Improving model performance ---- 309 | 310 | # Basically, the same as before, but with Laplace = 1 311 | 312 | ### Model specification ---- 313 | model_spec <- naive_Bayes( 314 | engine = "naivebayes", 315 | mode = "classification", 316 | smoothness = NULL, 317 | Laplace = 1 318 | ) %>% 319 | translate() 320 | model_spec 321 | 322 | ### Fit the model ---- 323 | model_fit <- fit( 324 | model_spec, 325 | .type ~ ., 326 | sms_train 327 | ) 328 | model_fit 329 | 330 | ### Make the predictions (you could skip this step) ---- 331 | sms_test_pred <- predict( 332 | object = model_fit, 333 | new_data = sms_test, 334 | type = "class" 335 | ) 336 | sms_test_pred 337 | 338 | ### Add the predictions to the test tibble ---- 339 | sms_test_with_pred_tbl <- augment(model_fit, sms_test) 340 | sms_test_with_pred_tbl 341 | 342 | 343 | ## 9. Evaluating model performance ---- 344 | 345 | ### Create a confusion matrix ---- 346 | conf_mat <- conf_mat( 347 | data = sms_test_with_pred_tbl, 348 | truth = .type, 349 | estimate = .pred_class 350 | ) 351 | conf_mat 352 | 353 | ### Visualize the confusion matrix ---- 354 | conf_mat %>% autoplot(type = "heatmap") 355 | conf_mat %>% autoplot(type = "mosaic") 356 | 357 | ### Visualize the ROC curve ---- 358 | sms_test_with_pred_tbl %>% 359 | roc_curve( 360 | truth = .type, 361 | estimate = .pred_ham 362 | ) %>% 363 | autoplot() 364 | 365 | ### Calculate the ROC AUC (area under the curve) ---- 366 | sms_roc_auc <- sms_test_with_pred_tbl %>% 367 | roc_auc( 368 | truth = .type, 369 | estimate = .pred_ham 370 | ) 371 | sms_roc_auc 372 | 373 | ### Put together other model metrics ---- 374 | # Such as accuracy, Matthews correlation coefficient (mcc) and others... 375 | classification_metrics <- conf_mat( 376 | sms_test_with_pred_tbl, 377 | truth = .type, 378 | estimate = .pred_class 379 | ) %>% 380 | summary() 381 | classification_metrics 382 | 383 | 384 | ## 10. Creating a function to help evaluate the model further ---- 385 | 386 | # The assumption here is that you have already gone through steps 1. to 4. 387 | # What we're potentially tuning here are the arguments .smoothness and .Laplace 388 | # Check out the book and/or the documentation for further info about them! 389 | 390 | classify_with_naive_bayes <- function( 391 | .smoothness = NULL, 392 | .laplace = NULL 393 | ) { 394 | 395 | # Model specification 396 | model_spec <- naive_Bayes( 397 | engine = "naivebayes", 398 | mode = "classification", 399 | smoothness = .smoothness, 400 | Laplace = .laplace 401 | ) %>% 402 | translate() 403 | model_spec 404 | 405 | # Fit the model 406 | model_fit <- fit( 407 | model_spec, 408 | .type ~ ., 409 | sms_train 410 | ) 411 | model_fit 412 | 413 | # Add the predictions to the test tibble 414 | sms_test_with_pred_tbl <- augment(model_fit, sms_test) 415 | sms_test_with_pred_tbl 416 | 417 | # Create a confusion matrix 418 | conf_mat <- conf_mat( 419 | data = sms_test_with_pred_tbl, 420 | truth = .type, 421 | estimate = .pred_class 422 | ) 423 | 424 | # Print the confusion matrix 425 | conf_mat %>% autoplot(type = "heatmap") 426 | 427 | } 428 | 429 | ### Test the function ---- 430 | classify_with_naive_bayes( 431 | .smoothness = 1, 432 | .laplace = 1 433 | ) 434 | 435 | 436 | ## 11. 
Understanding the classifier's predictions ---- 437 | 438 | ### Obtain the predicted probabilities (you could skip this step) ---- 439 | sms_test_prob <- predict( 440 | object = model_fit, 441 | new_data = sms_test, 442 | type = "prob" 443 | ) 444 | sms_test_prob 445 | 446 | ### Look at the predicted probabilities ---- 447 | sms_results <- sms_test_with_pred_tbl %>% 448 | select( 449 | actual_type = .type, 450 | predicted_type = .pred_class, 451 | prob_spam = .pred_spam, 452 | prob_ham = .pred_ham 453 | ) %>% 454 | mutate( 455 | prob_spam = prob_spam %>% round(2), 456 | prob_ham = prob_ham %>% round(2) 457 | ) 458 | sms_results 459 | 460 | ### Test cases where the model is less confident ---- 461 | sms_results %>% 462 | filter(between(prob_spam, 0.40, 0.60)) 463 | 464 | ### Test cases where the model was wrong ---- 465 | sms_results %>% 466 | filter(actual_type != predicted_type) 467 | 468 | ### Specifying vectors ---- 469 | sms_results %>% 470 | tabyl(actual_type, predicted_type) 471 | 472 | ### Using {crosstable} ---- 473 | sms_results %>% 474 | crosstable( 475 | cols = actual_type, 476 | by = predicted_type, 477 | label = FALSE, 478 | total = "both", 479 | percent_pattern = "{n} ({p_row}/{p_col})", 480 | percent_digits = 1 481 | ) %>% 482 | as_flextable(compact = TRUE) 483 | --------------------------------------------------------------------------------