├── machine_learning_with_r_4th_ed
│   ├── img
│   │   ├── cat.jpg
│   │   ├── pizza.jpg
│   │   ├── ice_cream.jpg
│   │   └── youcanbeapirate-wb-sparkline.jpg
│   ├── data
│   │   ├── credit.sqlite3
│   │   ├── titanic_train.xlsx
│   │   ├── pt_data.csv
│   │   ├── challenger.csv
│   │   └── usedcars.csv
│   ├── chapter_15_making_use_of_big_data.Rmd
│   ├── chapter_12_advanced_data_preparation.Rmd
│   ├── chapter_14_building_better_learners.Rmd
│   ├── chapter_10_evaluating_model_performance.Rmd
│   ├── chapter_11_being_succesful_with_machine_learning.Rmd
│   ├── chapter_13_challenging_data_too_much_too_little_too_complex.Rmd
│   ├── Chapter_11.r
│   ├── chapter_08_finding_patterns_market_basket_analysis_using_association_rules.Rmd
│   ├── Chapter_10.r
│   ├── Chapter_12.r
│   ├── chapter_09_finding_groups_of_data_clustering_with_k-means.Rmd
│   ├── chapter_03_lazy_learning_classification_using_nearest_neighbors.Rmd
│   ├── Chapter_14.r
│   ├── Chapter_15.r
│   ├── Chapter_13.r
│   ├── chapter_02_managing_and_understanding_data.Rmd
│   └── chapter_07_black-box_methods_neural_networks_and_support_vector_machines.Rmd
├── machine_learning_with_r_3rd_ed
│   ├── naive_bayes
│   │   ├── images
│   │   │   ├── crosstable.jpeg
│   │   │   ├── wordcloud2_ham.jpeg
│   │   │   └── wordcloud2_spam.jpeg
│   │   ├── readme.MD
│   │   └── filtering-mobile-phone-spam-with-naive-bayes.r
│   ├── regression_methods
│   │   ├── images
│   │   │   ├── fancyRpartPlot_wine.jpg
│   │   │   └── fancyRpartPlot_wine.pdf
│   │   ├── readme.MD
│   │   ├── estimating_wine_quality_with_regression_trees_and_model_trees.r
│   │   └── predicting_medical_expenses_with_linear_regression.r
│   ├── association_rules
│   │   ├── images
│   │   │   ├── association_rules_visualized.jpg
│   │   │   └── association_rules_visualized.pdf
│   │   ├── readme.MD
│   │   └── identifying_frequently_purchased_groceries_with_association_rules.r
│   ├── k_means_clustering
│   │   ├── images
│   │   │   ├── total_within_sum_of_squares.jpg
│   │   │   └── total_within_sum_of_squares.pdf
│   │   ├── readme.MD
│   │   └── finding_teen_market_segments_using_k_means_clustering.r
│   ├── hyperparameter_optimization
│   │   ├── images
│   │   │   ├── hyperparameter_optimization_rpart.jpg
│   │   │   └── hyperparameter_optimization_rpart.pdf
│   │   ├── readme.MD
│   │   ├── hyperparameter_optimization_with_tidymodels.R
│   │   └── hyperparameter_optimization_with_tidymodels_random.R
│   ├── README.md
│   ├── knn
│   │   ├── README.md
│   │   ├── data
│   │   │   └── wdbc-names.txt
│   │   └── classifying-cancer-samples-with-knn.r
│   ├── support_vector_machines
│   │   ├── readme.MD
│   │   └── performing_optical_character_recognition_with_support_vector_machines_.r
│   ├── neural_networks
│   │   ├── readme.MD
│   │   └── modeling_the_strength_of_concrete_with_neural_networks.r
│   ├── decision_trees_and_rules
│   │   ├── readme.MD
│   │   ├── identifying_poisonous_mushrooms_with_rule_learners.r
│   │   └── identifying_risky_bank_loans_using_C5.0_decision_trees.r
│   ├── random_forest
│   │   └── random_forest_with_tidymodels.R
│   └── boosting
│       └── boosting_with_tidymodels.R
├── learning_machine_learning.Rproj
├── .gitignore
├── LICENSE
└── README.md
/machine_learning_with_r_4th_ed/img/cat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_4th_ed/img/cat.jpg
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/img/pizza.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_4th_ed/img/pizza.jpg
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/img/ice_cream.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_4th_ed/img/ice_cream.jpg
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/data/credit.sqlite3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_4th_ed/data/credit.sqlite3
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/data/titanic_train.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_4th_ed/data/titanic_train.xlsx
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/naive_bayes/images/crosstable.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/naive_bayes/images/crosstable.jpeg
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/data/pt_data.csv:
--------------------------------------------------------------------------------
1 | subject_name,temperature,flu_status,gender,blood_type
2 | John Doe,98.1,FALSE,MALE,O
3 | Jane Doe,98.6,FALSE,FEMALE,AB
4 | Steve Graves,101.4,TRUE,MALE,A
5 |
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/img/youcanbeapirate-wb-sparkline.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_4th_ed/img/youcanbeapirate-wb-sparkline.jpg
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/naive_bayes/images/wordcloud2_ham.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/naive_bayes/images/wordcloud2_ham.jpeg
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/naive_bayes/images/wordcloud2_spam.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/naive_bayes/images/wordcloud2_spam.jpeg
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/regression_methods/images/fancyRpartPlot_wine.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/regression_methods/images/fancyRpartPlot_wine.jpg
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/regression_methods/images/fancyRpartPlot_wine.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/regression_methods/images/fancyRpartPlot_wine.pdf
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/association_rules/images/association_rules_visualized.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/association_rules/images/association_rules_visualized.jpg
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/association_rules/images/association_rules_visualized.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/association_rules/images/association_rules_visualized.pdf
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/k_means_clustering/images/total_within_sum_of_squares.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/k_means_clustering/images/total_within_sum_of_squares.jpg
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/k_means_clustering/images/total_within_sum_of_squares.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/k_means_clustering/images/total_within_sum_of_squares.pdf
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/hyperparameter_optimization/images/hyperparameter_optimization_rpart.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/hyperparameter_optimization/images/hyperparameter_optimization_rpart.jpg
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/hyperparameter_optimization/images/hyperparameter_optimization_rpart.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AnttiRask/learning_machine_learning/HEAD/machine_learning_with_r_3rd_ed/hyperparameter_optimization/images/hyperparameter_optimization_rpart.pdf
--------------------------------------------------------------------------------
/learning_machine_learning.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 4
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/data/challenger.csv:
--------------------------------------------------------------------------------
1 | distress_ct,temperature,field_check_pressure,flight_num
0,66,50,1
1,70,50,2
0,69,50,3
0,68,50,4
0,67,50,5
0,72,50,6
0,73,100,7
0,70,100,8
1,57,200,9
1,63,200,10
1,70,200,11
0,78,200,12
0,67,200,13
2,53,200,14
0,67,200,15
0,75,200,16
0,70,200,17
0,81,200,18
0,76,200,19
0,79,200,20
2,75,200,21
0,76,200,22
1,58,200,23
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/chapter_15_making_use_of_big_data.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 15: Making Use of Big Data"
3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask'
4 | date: "2023-07-23"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 |
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/chapter_12_advanced_data_preparation.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 12: Advanced Data Preparation"
3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask'
4 | date: "2023-07-23"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 |
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/chapter_14_building_better_learners.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 14: Building Better Learners"
3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask'
4 | date: "2023-07-23"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 |
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/chapter_10_evaluating_model_performance.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 10: Evaluating Model Performance"
3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask'
4 | date: "2023-07-23"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 |
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/chapter_11_being_succesful_with_machine_learning.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 11: Being Successful with Machine Learning"
3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask'
4 | date: "2023-07-23"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 |
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/chapter_13_challenging_data_too_much_too_little_too_complex.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 13: Challenging Data - Too Much, Too Little, Too Complex"
3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask'
4 | date: "2023-07-23"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 |
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/README.md:
--------------------------------------------------------------------------------
1 | # Learning Machine Learning
2 |
3 | I'm learning ML with R. Focusing on (but not limiting myself to) tidyverse and tidymodels.
4 |
5 | ## Algorithms tackled (see the named folder(s) for examples):
6 | * K-nearest neighbors (k-NN)
7 | * Naive Bayes __UPDATED__
8 | * Decision Trees and Rules __UPDATED__
9 | * Regression Methods
10 | * Neural Networks
11 | * Support Vector Machines
12 | * Association Rules
13 | * Clustering
14 |
15 | ## Coming up next:
16 | * Hyperparameter Optimization
17 | * Bagging
18 | * Boosting
19 | * Random Forest
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # History files
2 | .Rhistory
3 | .Rapp.history
4 |
5 | # Session Data files
6 | .RData
7 |
8 | # User-specific files
9 | .Ruserdata
10 |
11 | # Example code in package build process
12 | *-Ex.R
13 |
14 | # Output files from R CMD build
15 | /*.tar.gz
16 |
17 | # Output files from R CMD check
18 | /*.Rcheck/
19 |
20 | # RStudio files
21 | .Rproj.user/
22 |
23 | # produced vignettes
24 | vignettes/*.html
25 | vignettes/*.pdf
26 |
27 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
28 | .httr-oauth
29 |
30 | # knitr and R markdown default cache directories
31 | *_cache/
32 | /cache/
33 |
34 | # Temporary files created by R markdown
35 | *.utf8.md
36 | *.knit.md
37 |
38 | # R Environment Variables
39 | .Renviron
40 |
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/knn/README.md:
--------------------------------------------------------------------------------
1 | # Classifying Cancer Samples with K-Nearest Neighbors
2 |
3 | Inspired by Brett Lantz's Machine Learning with R, Chapter 3: Lazy Learning - Classification Using Nearest Neighbors.
4 |
5 | The original code is made with a lot of base R, {class} and {gmodels}. I wanted to see how one could recreate it using mainly {tidymodels} and {tidyverse}.
6 |
7 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864
8 |
9 | You can find the original code here: https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter03
10 |
11 | Dataset Citation:
12 | Wolberg, William, Street, W. & Mangasarian, Olvi. (1995). Breast Cancer Wisconsin (Diagnostic). UCI Machine Learning Repository.
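
For a quick taste of the {tidymodels} approach, here is a minimal sketch (not the actual script in this folder; the file name, column names, and the chosen number of neighbors are assumptions for illustration):

```r
# Minimal sketch: k-NN classification with {tidymodels} (assumed file/column names)
library(tidyverse)
library(tidymodels)

wdbc_tbl <- read_csv("data/wisc_bc_data.csv") %>%   # assumed file name
    mutate(diagnosis = factor(diagnosis))

set.seed(123)
wdbc_split <- initial_split(wdbc_tbl, prop = 0.8, strata = diagnosis)

# Rescale the predictors to [0, 1], like the book does with its normalize() helper
knn_recipe <- recipe(diagnosis ~ ., data = training(wdbc_split)) %>%
    step_rm(id) %>%
    step_range(all_numeric_predictors())

# {kknn} needs to be installed for this engine
knn_spec <- nearest_neighbor(neighbors = 21) %>%
    set_mode("classification") %>%
    set_engine("kknn")

knn_fit <- workflow() %>%
    add_recipe(knn_recipe) %>%
    add_model(knn_spec) %>%
    fit(training(wdbc_split))

augment(knn_fit, testing(wdbc_split)) %>%
    conf_mat(truth = diagnosis, estimate = .pred_class)
```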
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/support_vector_machines/readme.MD:
--------------------------------------------------------------------------------
1 | # Performing Optical Character Recognition with Support Vector Machines
2 |
3 | Inspired by Brett Lantz's Machine Learning with R, Chapter 7:
4 | Black Box Methods - Neural Networks and Support Vector Machines.
5 |
6 | The original code is made with {kernlab}. I wanted to see how one could recreate it using mainly {tidymodels} and {tidyverse}.
7 |
8 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864
9 |
10 | You can find the original code and the dataset here:
11 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter07
12 |
13 | Dataset Citation:
14 | P. W. Frey and D. J. Slate. "Letter Recognition Using Holland-style Adaptive Classifiers". (Machine Learning Vol 6 #2 March 91)
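
For a quick taste of the {tidymodels} version, here is a minimal sketch (not the actual script in this folder; the file name, the `letter` column, and the `cost` value are assumptions for illustration):

```r
# Minimal sketch: SVM classification with {tidymodels} (assumed file/column names)
library(tidyverse)
library(tidymodels)

letters_tbl <- read_csv("data/letterdata.csv") %>%   # assumed file name
    mutate(letter = factor(letter))

set.seed(123)
letters_split <- initial_split(letters_tbl, prop = 0.8, strata = letter)

# {kernlab} needs to be installed for this engine
svm_spec <- svm_rbf(cost = 1) %>%
    set_mode("classification") %>%
    set_engine("kernlab")

svm_fit <- svm_spec %>%
    fit(letter ~ ., data = training(letters_split))

augment(svm_fit, testing(letters_split)) %>%
    accuracy(truth = letter, estimate = .pred_class)
```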
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/hyperparameter_optimization/readme.MD:
--------------------------------------------------------------------------------
1 | # Hyperparameter Optimization with tidymodels
2 |
3 | Inspired by Brett Lantz's Machine Learning with R, Chapter 11:
4 | Improving Model Performance.
5 |
6 | The original code is made with {caret}. I wanted to see how one could recreate it using mainly {tidymodels} and {tidyverse}.
7 |
8 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864
9 |
10 | You can find the original code and the dataset here:
11 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter11
12 |
13 | Dataset Citation:
14 |
15 | Professor Dr. Hans Hofmann
16 | Institut für Statistik und Ökonometrie
17 | Universität Hamburg
18 | FB Wirtschaftswissenschaften
19 | Von-Melle-Park 5
20 | 2000 Hamburg 13
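
For a quick taste of the {tidymodels} version, here is a minimal sketch of grid search over an {rpart} decision tree (not the actual script in this folder; the file name, the `default` column, and the grid settings are assumptions for illustration):

```r
# Minimal sketch: hyperparameter tuning with {tune} (assumed file/column names)
library(tidyverse)
library(tidymodels)

credit_tbl <- read_csv("data/credit.csv") %>%   # assumed file name
    mutate(default = factor(default))

set.seed(123)
credit_folds <- vfold_cv(credit_tbl, v = 10, strata = default)

tree_spec <- decision_tree(
    cost_complexity = tune(),
    tree_depth      = tune()
) %>%
    set_mode("classification") %>%
    set_engine("rpart")

tree_grid <- grid_regular(cost_complexity(), tree_depth(), levels = 5)

tree_res <- tune_grid(
    tree_spec,
    default ~ .,
    resamples = credit_folds,
    grid      = tree_grid,
    metrics   = metric_set(roc_auc, accuracy)
)

show_best(tree_res, metric = "roc_auc")
```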
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/neural_networks/readme.MD:
--------------------------------------------------------------------------------
1 | # Modeling the Strength of Concrete with Neural Networks
2 |
3 | Inspired by Brett Lantz's Machine Learning with R, Chapter 7: Black Box Methods - Neural Networks and Support Vector Machines.
4 |
5 | The original code is made with {neuralnet}. I wanted to see how one could recreate it using mainly {tidymodels} and {tidyverse}.
6 |
7 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864
8 |
9 | You can find the original code and the dataset here:
10 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter07
11 |
12 | Dataset Citation:
13 | I-Cheng Yeh, "Modeling of strength of high performance concrete using artificial neural networks," Cement and Concrete Research, Vol. 28, No. 12, pp. 1797-1808 (1998).
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Antti Rask
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/k_means_clustering/readme.MD:
--------------------------------------------------------------------------------
1 | # Finding Teen Market Segments Using k-means Clustering
2 |
3 | Inspired by Brett Lantz's Machine Learning with R, Chapter 9: Finding Groups of Data - Clustering with k-means.
4 |
5 | The original code is made with base R. While {tidymodels} doesn't have a k-means engine, I wanted to see how one could still recreate the code using {tidyverse} as much as possible, use the {recipes} package from {tidymodels} to do all the pre-processing needed, and use the {broom} package from the same {tidymodels} family to look at the model metrics in a tidier way.
6 |
7 | I was also inspired by this blog post from the tidymodels website to continue the exercise a bit further than the original code did:
8 | https://www.tidymodels.org/learn/statistics/k-means/
9 |
10 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864
11 |
12 | You can find the original code and the dataset here:
13 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter09
14 |
15 | Dataset Citation:
16 | Brett Lantz. "Finding Groups of Data - Clustering with k-means". (Machine Learning with R)
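
For a quick taste of the workflow described above, here is a minimal sketch (not the actual script in this folder; the file name, the dropped columns, and the number of clusters are assumptions for illustration):

```r
# Minimal sketch: {recipes} for pre-processing, kmeans() for clustering,
# {broom} for tidy model output (assumed file/column names)
library(tidyverse)
library(tidymodels)

teens_tbl <- read_csv("data/snsdata.csv")   # assumed file name

interests_tbl <- recipe(~ ., data = teens_tbl) %>%
    step_rm(gradyear, gender, age, friends) %>%   # keep only the interest terms
    step_normalize(all_numeric_predictors()) %>%
    prep() %>%
    bake(new_data = NULL)

set.seed(2345)
teen_clusters <- kmeans(interests_tbl, centers = 5)

tidy(teen_clusters)     # one row per cluster: centers and sizes
glance(teen_clusters)   # one-row summary: total within-cluster sum of squares etc.

augment(teen_clusters, interests_tbl) %>%   # adds a .cluster column
    count(.cluster)
```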
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/regression_methods/readme.MD:
--------------------------------------------------------------------------------
1 | # Predicting Medical Expenses with Linear Regression /
2 | # Estimating Wine Quality with Regression Trees and Model Trees
3 |
4 | Inspired by Brett Lantz's Machine Learning with R, Chapter 6:
5 | Forecasting Numeric Data - Regression Methods.
6 |
7 | The original code is made with {psych}, {rpart}, {rpart.plot} and {Cubist}. I wanted to see how one could recreate it using mainly {tidymodels}, {tidyverse} and {rules}.
8 |
9 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864
10 |
11 | You can find the original code and the datasets here:
12 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter06
13 |
14 | Dataset Citation(s):
15 |
16 | __Predicting Medical Expenses with Linear Regression__
17 | Brett Lantz: "These data were created for this book using demographic statistics from the U.S. Census Bureau"
18 |
19 | __Estimating Wine Quality with Regression Trees and Model Trees__
20 | P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
21 | Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.
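
For a quick taste of the {tidymodels} version of the linear regression part, here is a minimal sketch (not the actual script in this folder; the file name and the `expenses` column are assumptions for illustration):

```r
# Minimal sketch: linear regression with {parsnip} (assumed file/column names)
library(tidyverse)
library(tidymodels)

insurance_tbl <- read_csv("data/insurance.csv")   # assumed file name

lm_fit <- linear_reg() %>%
    set_engine("lm") %>%
    fit(expenses ~ ., data = insurance_tbl)

tidy(lm_fit)            # coefficient table

lm_fit %>%
    extract_fit_engine() %>%
    glance()            # R squared, etc.

augment(lm_fit, insurance_tbl) %>%
    metrics(truth = expenses, estimate = .pred)   # RMSE, R squared, MAE
```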
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | # Learning Machine Learning
4 |
5 | I'm learning machine learning (ML) by reading the book __Machine Learning with R__ by __Brett Lantz__. I've read the 3rd edition (published in 2019) and am currently reading the 4th edition. I will update the repo along the way.
6 |
7 | While the book is excellent at explaining the different algorithms used, it still doesn't use tidyverse and tidymodels, two frameworks that make data science-related tasks more user-friendly, mainly because of their unified syntax. Obviously, I'm not limiting myself to only them, but the whole point of this repo is to offer an alternative to base R and the separate packages used in the book and in the [official GitHub repo](https://github.com/PacktPublishing/Machine-Learning-with-R-Fourth-Edition).
8 |
9 | ## Disclaimer!
10 | This repo is not meant to replace the book in any way. You should definitely read the book. It will help you understand ML in general (and specifically with R) much better than looking at the code alone.
11 |
12 | Also, I would recommend you buy the book. Here's a direct [link](https://www.packtpub.com/product/machine-learning-with-r-fourth-edition/9781801071321) to __Packt__'s (the publisher) website.
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/association_rules/readme.MD:
--------------------------------------------------------------------------------
1 | # Identifying Frequently Purchased Groceries with Association Rules
2 |
3 | Inspired by Brett Lantz's Machine Learning with R, Chapter 8: Finding Patterns - Market Basket Analysis Using Association Rules
4 |
5 | The original code is made with {arules}. Now, this is an algorithm that isn't found in {tidymodels}. I still wanted to recreate it using {tidyverse} as much as possible. Helps with the readability if nothing else!
6 |
7 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864
8 |
9 | You can find the original code and the dataset here:
10 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter08
11 |
12 | Dataset Citation:
13 |
14 | Author:
15 | Michael Hahsler
16 |
17 | Source:
18 | The data set is provided for {arules} by Michael Hahsler, Kurt Hornik and Thomas Reutterer.
19 |
20 | References:
21 | Michael Hahsler, Kurt Hornik, and Thomas Reutterer (2006) Implications of probabilistic data modeling for mining association rules. In M. Spiliopoulou, R. Kruse, C. Borgelt, A. Nuernberger, and W. Gaul, editors, From Data and Information Analysis to Knowledge Engineering, Studies in Classification, Data Analysis, and Knowledge Organization, pages 598–605. Springer-Verlag.
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/decision_trees_and_rules/readme.MD:
--------------------------------------------------------------------------------
1 | # Identifying Risky Bank Loans Using C5.0 Decision Trees /
2 | # Identifying Poisonous Mushrooms with Rule Learners
3 |
4 | Inspired by Brett Lantz's Machine Learning with R,
5 | Chapter 5: Divide and Conquer - Classification Using Decision Trees and Rules and
6 | Chapter 10: Evaluating Model Performance.
7 |
8 | The original code is made with {C50}, {gmodels}, {OneR} and {RWeka}. I
9 | wanted to see how one could recreate it using mainly {tidymodels} and
10 | {tidyverse}.
11 |
12 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864
13 |
14 | You can find the original code and the slightly modified datasets here:
15 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter05
16 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter10
17 |
18 | Dataset Citation(s):
19 |
20 | __Identifying Risky Bank Loans Using C5.0 Decision Trees__
21 | Professor Dr. Hans Hofmann
22 | Institut für Statistik und Ökonometrie
23 | Universität Hamburg
24 | FB Wirtschaftswissenschaften
25 | Von-Melle-Park 5
26 | 2000 Hamburg 13
27 |
28 | __Identifying Poisonous Mushrooms with Rule Learners__
29 | Origin:
30 | Mushroom records drawn from The Audubon Society Field Guide to North American Mushrooms (1981). G. H. Lincoff (Pres.), New York: Alfred A. Knopf
31 |
32 | Donor:
33 | Jeff Schlimmer (Jeffrey.Schlimmer '@' a.gp.cs.cmu.edu)
34 |
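
For a quick taste of the {tidymodels} version of the C5.0 part, here is a minimal sketch (not the actual script in this folder; the file name, the `default` column, and the split proportion are assumptions for illustration):

```r
# Minimal sketch: C5.0 decision tree with {parsnip} (assumed file/column names)
library(tidyverse)
library(tidymodels)

credit_tbl <- read_csv("data/credit.csv") %>%   # assumed file name
    mutate(across(where(is.character), as.factor))

set.seed(123)
credit_split <- initial_split(credit_tbl, prop = 0.9, strata = default)

# {C50} needs to be installed for this engine
c5_spec <- decision_tree() %>%
    set_mode("classification") %>%
    set_engine("C5.0")

c5_fit <- c5_spec %>%
    fit(default ~ ., data = training(credit_split))

augment(c5_fit, testing(credit_split)) %>%
    conf_mat(truth = default, estimate = .pred_class)
```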
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/naive_bayes/readme.MD:
--------------------------------------------------------------------------------
1 | # Filtering Mobile Phone Spam with Naive Bayes
2 |
3 | Inspired by Brett Lantz's Machine Learning with R,
4 | Chapter 4: Probabilistic Learning - Classification Using Naive Bayes and
5 | Chapter 10: Evaluating Model Performance.
6 |
7 | The original code is made with a lot of base R, {e1071} and {gmodels}. I wanted to see how one could recreate it using mainly {textrecipes}, {tidymodels}, {tidytext} and {tidyverse}.
8 |
9 | If you haven't already, buy the book: https://www.packtpub.com/product/machine-learning-with-r-third-edition/9781788295864
10 |
11 | You can find the original code and the slightly modified dataset here:
12 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter04
13 | https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter10
14 |
15 | Dataset Citation:
16 | [1] Gómez Hidalgo, J.M., Cajigas Bringas, G., Puertas Sanz, E., Carrero García, F. Content Based SMS Spam Filtering. Proceedings of the 2006 ACM Symposium on Document Engineering (ACM DOCENG'06), Amsterdam, The Netherlands, 10-13, 2006.
17 |
18 | [2] Cormack, G. V., Gómez Hidalgo, J. M., and Puertas Sanz, E. Feature engineering for mobile (SMS) spam filtering. Proceedings of the 30th Annual international ACM Conference on Research and Development in information Retrieval (ACM SIGIR'07), New York, NY, 871-872, 2007.
19 |
20 | [3] Cormack, G. V., Gómez Hidalgo, J. M., and Puertas Sanz, E. Spam filtering for short messages. Proceedings of the 16th ACM Conference on Information and Knowledge Management (ACM CIKM'07). Lisbon, Portugal, 313-320, 2007.
21 |
22 | Also, there were parts that were inspired by:
23 |
24 | A blog post by Julia Silge:
25 | https://www.tidyverse.org/blog/2020/11/tidymodels-sparse-support/
26 |
27 | Julia Silge & David Robinson's book Text Mining with R: A Tidy Approach: https://www.tidytextmining.com/
28 |
29 | A blog post by Céline Van den Rul:
30 | https://towardsdatascience.com/create-a-word-cloud-with-r-bde3e7422e8a
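
For a quick taste of the text pre-processing + model combination described above, here is a minimal sketch (not the actual script in this folder; the file name, the `type`/`text` columns, and the token filter settings are assumptions for illustration):

```r
# Minimal sketch: naive Bayes on tokenized text with {textrecipes} and {discrim}
# (assumed file/column names; {naivebayes} and {stopwords} need to be installed)
library(tidyverse)
library(tidymodels)
library(textrecipes)
library(discrim)

sms_tbl <- read_csv("data/sms_spam.csv") %>%   # assumed file name
    mutate(type = factor(type))

set.seed(123)
sms_split <- initial_split(sms_tbl, prop = 0.75, strata = type)

sms_recipe <- recipe(type ~ text, data = training(sms_split)) %>%
    step_tokenize(text) %>%
    step_stopwords(text) %>%
    step_tokenfilter(text, max_tokens = 1000) %>%
    step_tf(text, weight_scheme = "binary")

nb_spec <- naive_Bayes() %>%
    set_mode("classification") %>%
    set_engine("naivebayes")

nb_fit <- workflow() %>%
    add_recipe(sms_recipe) %>%
    add_model(nb_spec) %>%
    fit(training(sms_split))

augment(nb_fit, testing(sms_split)) %>%
    conf_mat(truth = type, estimate = .pred_class)
```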
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/Chapter_11.r:
--------------------------------------------------------------------------------
1 | ##### Chapter 11: Being Successful with Machine Learning --------------------
2 |
3 | ## Example: Using ggplot2 for visual data exploration ----
4 |
5 | # read the titanic training dataset
6 | titanic_train <- read.csv("titanic_train.csv")
7 |
8 | # examine the features
9 | str(titanic_train)
10 |
11 | # load ggplot2 package and apply to Titanic dataset
12 | library(ggplot2)
13 | p <- ggplot(data = titanic_train)
14 | p # creates an empty gray plot
15 |
16 | # compare built-in boxplot to ggplot2 boxplot
17 | boxplot(titanic_train$Age) # use R's built-in boxplot()
18 | p + geom_boxplot(mapping = aes(y = Age)) # use ggplot2 boxplot
19 |
20 | # boxplot examining relationship between age and survival
21 | p + geom_boxplot(mapping = aes(x = Age, y = as.factor(Survived)))
22 |
23 | # compare built-in histogram to ggplot2 histogram
24 | hist(titanic_train$Age) # use R's built-in hist()
25 | p + geom_histogram(aes(x = Age)) # use ggplot2 histogram
26 |
27 | # overlapping histograms
28 | p + geom_histogram(aes(x = Age, fill = as.factor(Survived))) +
29 | ggtitle("Distribution of Age by Titanic Survival Status")
30 |
31 | # side-by-side histograms
32 | p + geom_histogram(aes(x = Age)) +
33 | facet_grid(cols = vars(Survived)) +
34 | ggtitle("Distribution of Age by Titanic Survival Status")
35 |
36 | # overlapping density plots
37 | p + geom_density(aes(x = Age,
38 | color = as.factor(Survived),
39 | fill = as.factor(Survived)),
40 | alpha = 0.25) +
41 | ggtitle("Density of Age by Titanic Survival Status")
42 |
43 | # bar chart of passenger counts by sex
44 | p + geom_bar(aes(x = Sex)) +
45 | ggtitle("Titanic Passenger Counts by Gender")
46 |
47 | # bar chart of survival probability by sex
48 | p + geom_bar(aes(x = Sex, y = Survived),
49 | stat = "summary", fun = "mean") +
50 | ggtitle("Titanic Survival Rate by Gender")
51 |
52 | # bar chart of survival probability by passenger class
53 | p + geom_bar(aes(x = Pclass, y = Survived),
54 | stat = "summary", fun = "mean") +
55 | ggtitle("Titanic Survival Rate by Passenger Class")
56 |
57 | # stacked bar chart of survival by passenger class
58 | p + geom_bar(aes(x = Pclass,
59 | fill = factor(Survived,
60 | labels = c("No", "Yes")))) +
61 | labs(fill = "Survived") +
62 | ylab("Number of Passengers") +
63 | ggtitle("Titanic Survival Counts by Passenger Class")
64 |
65 | # survival status by passenger class
66 | p + geom_bar(aes(x = Pclass,
67 | fill = factor(Survived,
68 | labels = c("No", "Yes"))),
69 | position = "fill") +
70 | labs(fill = "Survived") +
71 | ylab("Proportion of Passengers") +
72 | ggtitle("Titanic Survival by Passenger Class")
73 |
74 | # bar chart of interaction between class and sex
75 | p + geom_bar(aes(x = Pclass, y = Survived, fill = Sex),
76 | position = "dodge", stat = "summary", fun = "mean") +
77 | ylab("Survival Proportion") +
78 | ggtitle("Titanic Survival Rate by Class and Sex")
79 |
80 |
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/association_rules/identifying_frequently_purchased_groceries_with_association_rules.r:
--------------------------------------------------------------------------------
1 | # Identifying Frequently Purchased Groceries with Association Rules ----
2 |
3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 8:
4 | # Finding Patterns - Market Basket Analysis Using Association Rules
5 | #
6 | # The original code is made with {arules}. Now, this is an algorithm that isn't
7 | # found in {tidymodels}. I still wanted to recreate it using {tidyverse} as much
8 | # as possible. Helps with the readability if nothing else!
9 | #
10 | # You can find the original code and the dataset here:
11 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter08
12 |
13 | ## 1. Loading libraries ----
14 | library(conflicted)
15 | library(tidyverse)
16 | library(arules)
17 | library(arulesViz)
18 |
19 |
20 | ## 2. Exploring and preparing the data ----
21 |
22 | ### Load the grocery data into a sparse matrix ----
23 | groceries <- read.transactions(
24 | "machine_learning_with_r_3rd_ed/association_rules/data/groceries.csv",
25 | sep = ","
26 | )
27 | summary(groceries)
28 |
29 | ### Look at the first five transactions ----
30 | groceries %>%
31 | head(5) %>%
32 | inspect()
33 |
34 | ### Examine the frequency of items ----
35 | groceries[, 1:3] %>%
36 | itemFrequency()
37 |
38 | ### Plot the frequency of items ----
39 | par(mar = c(1, 1, 1, 1)) # Have to adjust the margins or there will be an error
40 |
41 | groceries %>%
42 | itemFrequencyPlot(support = 0.1)
43 |
44 | groceries %>%
45 | itemFrequencyPlot(topN = 10)
46 |
47 | ### A visualization of the sparse matrix for the first five transactions ----
48 | groceries[1:5] %>%
49 | image()
50 |
51 | ### Visualization of a random sample of 100 transactions ----
52 | sample(groceries, 100) %>%
53 | image()
54 |
55 |
56 | ## 3. Training a model on the data ----
57 |
58 | ### Default settings result in zero rules learned ----
59 | groceries %>%
60 | apriori()
61 |
62 | ### Set better support and confidence levels to learn more rules ----
63 | groceryrules <- apriori(
64 | groceries,
65 | parameter = list(
66 | support = 0.006,
67 | confidence = 0.25,
68 | minlen = 2
69 | )
70 | )
71 | groceryrules
72 |
73 |
74 | ## 4. Evaluating model performance ----
75 |
76 | ### Summary of grocery association rules ----
77 | groceryrules %>% summary()
78 |
79 | ### Look at the first three rules ----
80 | groceryrules[1:3] %>%
81 | inspect()
82 |
83 | ### Visualize the rules ----
84 | groceryrules %>% plot(
85 | method = "graph",
86 | limit = 10
87 | )
88 |
89 |
90 | ## 5. Improving model performance ----
91 |
92 | ### Sorting grocery rules by lift ----
93 | sort(groceryrules, by = "lift")[1:5] %>%
94 | inspect()
95 |
96 | ### Finding subsets of rules containing any berry items ----
97 | berryrules <- groceryrules %>%
98 | subset(items %in% "berries")
99 |
100 | berryrules %>%
101 | inspect()
102 |
103 | ### Converting the rule set to a tibble ----
104 | groceryrules_tbl <- groceryrules %>%
105 | as("data.frame") %>%
106 | as_tibble()
107 |
108 | groceryrules_tbl %>%
109 | glimpse()
110 |
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/chapter_08_finding_patterns_market_basket_analysis_using_association_rules.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 8: Finding Patterns - Market Basket Analysis Using Association Rules"
3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask'
4 | date: "2023-07-23"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 |
12 | ## Loading Packages
13 |
14 | ```{r}
15 | library(arules)
16 | library(arulesViz)
17 | library(conflicted)
18 | library(tidyverse)
19 | ```
20 |
21 | ## Exploring and preparing the data
22 |
23 | ### Load the grocery data into a sparse matrix
24 |
25 | ```{r}
26 | groceries <- read.transactions(
27 | "data/groceries.csv",
28 | sep = ","
29 | )
30 | ```
31 |
32 | ```{r}
33 | groceries %>%
34 | summary()
35 | ```
36 |
37 | ### Look at the first five transactions
38 |
39 | ```{r}
40 | groceries %>%
41 | head(5) %>%
42 | inspect()
43 | ```
44 |
45 | ### Examine the frequency of items
46 |
47 | ```{r}
48 | groceries[, 1:3] %>%
49 | itemFrequency()
50 | ```
51 |
52 | ### Plot the frequency of items
53 |
54 | Have to adjust the margins or there will be an error
55 |
56 | ```{r}
57 | par(mar = c(1, 1, 1, 1))
58 | ```
59 |
60 | ```{r}
61 | groceries %>%
62 | itemFrequencyPlot(support = 0.1)
63 | ```
64 |
65 | ```{r}
66 | groceries %>%
67 | itemFrequencyPlot(topN = 20)
68 | ```
69 |
70 | ### A visualization of the sparse matrix for the first five transactions
71 |
72 | ```{r}
73 | groceries[1:5] %>%
74 | image()
75 | ```
76 |
77 | ### Visualization of a random sample of 100 transactions
78 |
79 | ```{r}
80 | sample(groceries, 100) %>%
81 | image()
82 | ```
83 |
84 | ## Training a model on the data
85 |
86 | ### Default settings result in zero rules learned
87 |
88 | ```{r}
89 | groceries %>%
90 | apriori()
91 | ```
92 |
93 | ### Set better support and confidence levels to learn more rules
94 |
95 | ```{r}
96 | groceryrules <- apriori(
97 | groceries,
98 | parameter = list(
99 | support = 0.006,
100 | confidence = 0.25,
101 | minlen = 2
102 | )
103 | )
104 |
105 | groceryrules
106 | ```
107 |
108 | ## Evaluating model performance
109 |
110 | ### Summary of grocery association rules
111 |
112 | ```{r}
113 | groceryrules %>%
114 | summary()
115 | ```
116 |
117 | ### Look at the first three rules
118 |
119 | ```{r}
120 | groceryrules[1:3] %>%
121 | inspect()
122 | ```
123 |
124 | ### Visualize the rules
125 |
126 | ```{r}
127 | groceryrules %>% plot(
128 | method = "graph",
129 | limit = 10
130 | )
131 | ```
132 |
133 | ## Improving model performance
134 |
135 | ### Sorting grocery rules by lift
136 |
137 | ```{r}
138 | sort(groceryrules, by = "lift")[1:5] %>%
139 | inspect()
140 | ```
141 |
142 | ### Finding subsets of rules containing any berry items
143 |
144 | ```{r}
145 | berryrules <- groceryrules %>%
146 | subset(items %in% "berries")
147 | ```
148 |
149 | ```{r}
150 | berryrules %>%
151 | inspect()
152 | ```
153 |
154 | ### Converting the rule set to a tibble
155 |
156 | ```{r}
157 | groceryrules_tbl <- groceryrules %>%
158 | as("data.frame") %>%
159 | as_tibble()
160 | ```
161 |
162 | ```{r}
163 | groceryrules_tbl
164 | ```
165 |
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/random_forest/random_forest_with_tidymodels.R:
--------------------------------------------------------------------------------
1 | # Random Forest with tidymodels ----
2 |
3 | # Inspired by Brett Lantz's Machine Learning with R,
4 | # Chapter 5: Divide and Conquer - Classification Using Decision Trees and Rules and
5 | # Chapter 10: Evaluating Model Performance
6 | #
7 | # The original code is made with {C50}, {gmodels}, {OneR} and {RWeka}. I
8 | # wanted to see how one could recreate it using mainly {tidymodels} and
9 | # {tidyverse}.
10 | #
11 | # You can find the original code and the slightly modified dataset here:
12 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter05
13 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter10
14 |
15 | ## 1. Loading libraries (in the order they get used) ----
16 | library(conflicted)
17 | library(tidyverse)
18 | library(tidymodels)
19 |
20 | ## 2. Exploring and preparing the data ----
21 | credit_tbl <- read_csv("machine_learning_with_r_3rd_ed/decision_trees_and_rules/data/credit.csv")
22 |
23 | ### Examine the structure of the credit data ----
24 | glimpse(credit_tbl)
25 |
26 | ### look at two characteristics of the applicant ----
27 | credit_tbl %>%
28 | count(checking_balance) %>%
29 | mutate(pct = (n / sum(n) * 100))
30 |
31 | credit_tbl %>%
32 | count(savings_balance) %>%
33 | mutate(pct = (n / sum(n) * 100))
34 |
35 | ### look at two characteristics of the loan ----
36 | credit_tbl %>%
37 | select(months_loan_duration, amount) %>%
38 | summary()
39 |
40 | ### look at the class variable ----
41 | credit_tbl %>%
42 | count(default) %>%
43 | mutate(pct = (n / sum(n) * 100))
44 |
45 |
46 | ## 3. Creating the recipe and splitting the data ----
47 |
48 | ### Convert strings to factors ----
49 | recipe_obj <- recipe(
50 | default ~ .,
51 | data = credit_tbl
52 | ) %>%
53 | step_string2factor(all_nominal())
54 | recipe_obj
55 |
56 | credit_factorized_tbl <- recipe_obj %>%
57 | prep() %>%
58 | bake(new_data = NULL)
59 | credit_factorized_tbl
60 |
61 | ### Create training and test data (randomly) ----
62 |
63 | # Use set.seed to use the same random number sequence as the original
64 | RNGversion("3.5.2")
65 | set.seed(123)
66 |
67 | credit_split <- initial_split(
68 | credit_factorized_tbl,
69 | prop = 0.9
70 | )
71 | credit_train <- training(credit_split)
72 | credit_test <- testing(credit_split)
73 |
74 | ### Check the proportion of class variable ----
75 | credit_train %>%
76 | count(default) %>%
77 | mutate(pct = (n / sum(n) * 100))
78 |
79 | credit_test %>%
80 | count(default) %>%
81 | mutate(pct = (n / sum(n) * 100))
82 |
83 |
84 | ## 4. Building a random forest ----
85 |
86 | ### Specify a random forest ----
87 | spec <- rand_forest(trees = 100) %>%
88 | set_mode("classification") %>%
89 | set_engine("ranger", importance = "impurity")
90 |
91 | ### Train the forest ----
92 | model <- spec %>%
93 | fit(
94 | default ~ .,
95 | data = credit_train
96 | )
97 | model
98 |
99 | ### Plot the variable importance ----
100 | vip::vip(model)
101 |
102 |
103 | ## 5. Predicting ----
104 |
105 | ### Make the predictions (you could skip this step) ----
106 | credit_test_pred <- predict(
107 | object = model,
108 | new_data = credit_test,
109 | type = "class"
110 | )
111 | credit_test_pred
112 |
113 | ### Add the predictions to the test tibble ----
114 | credit_test_with_pred_tbl <- augment(model, credit_test)
115 | credit_test_with_pred_tbl
116 |
117 |
118 | ## 6. Evaluating model performance ----
119 |
120 | ### Create a confusion matrix ----
121 | conf_mat <- conf_mat(
122 | data = credit_test_with_pred_tbl,
123 | truth = default,
124 | estimate = .pred_class
125 | )
126 | conf_mat
127 |
128 | ### Visualize the confusion matrix ----
129 | conf_mat %>% autoplot(type = "heatmap")
130 | conf_mat %>% autoplot(type = "mosaic")
131 |
132 | ### Visualize the ROC curve ----
133 | credit_test_with_pred_tbl %>%
134 | roc_curve(
135 | truth = default,
136 | estimate = .pred_no
137 | ) %>%
138 | autoplot()
139 |
140 | ### Calculate the ROC AUC (area under the curve) ----
141 | credit_roc_auc <- credit_test_with_pred_tbl %>%
142 | roc_auc(
143 | truth = default,
144 | estimate = .pred_no
145 | )
146 | credit_roc_auc
147 |
148 | ### Put together other model metrics ----
149 | # Such as accuracy, Matthews correlation coefficient (mcc) and others...
150 | classification_metrics <- conf_mat(
151 | credit_test_with_pred_tbl,
152 | truth = default,
153 | estimate = .pred_class
154 | ) %>%
155 | summary()
156 | classification_metrics
157 |
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/neural_networks/modeling_the_strength_of_concrete_with_neural_networks.r:
--------------------------------------------------------------------------------
1 | # Modeling the Strength of Concrete with Neural Networks ----
2 |
3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 7:
4 | # Black Box Methods - Neural Networks and Support Vector Machines
5 | #
6 | # The original code is made with {neuralnet}. I wanted to see how one could
7 | # recreate it using mainly {tidymodels} and {tidyverse}.
8 | #
9 | # You can find the original code and the dataset here:
10 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter07
11 |
12 | ## 1. Loading libraries (in the order they get used) ----
13 | library(conflicted)
14 | library(tidyverse)
15 | library(tidymodels)
16 | library(corrr)
17 |
18 |
19 | ## 2. Exploring and preparing the data ----
20 |
21 | ### Read in data and examine structure ----
22 | concrete_tbl <- read_csv("machine_learning_with_r_3rd_ed/neural_networks/data/concrete.csv")
23 |
24 | concrete_tbl %>% glimpse()
25 |
26 | ### Check the minimum and maximum strength ----
27 | concrete_tbl %>%
28 | select(strength) %>%
29 | summary()
30 |
31 |
32 | ## 3. Creating the recipe ----
33 |
34 | ### Apply normalization to the numeric predictors ----
35 | recipe_obj <- recipe(
36 | strength ~ .,
37 | data = concrete_tbl
38 | ) %>%
39 | step_range(
40 | all_numeric_predictors(),
41 | min = 0,
42 | max = 1
43 | )
44 | recipe_obj
45 |
46 | concrete_normalized_tbl <- recipe_obj %>%
47 | prep() %>%
48 | bake(new_data = NULL)
49 | concrete_normalized_tbl
50 |
51 | ### Create training and test data ----
52 | concrete_split <- initial_time_split(
53 | concrete_normalized_tbl,
54 | prop = 773 / 1030
55 | )
56 | concrete_train <- training(concrete_split)
57 | concrete_test <- testing(concrete_split)
58 |
59 |
60 | ## 4. Training a model on the data ----
61 |
62 | # nnet is the engine (needs to be installed if not already):
63 | # install.packages("nnet")
64 |
65 | # It is used as the engine for {parsnip}'s mlp() function.
66 | # And since we are predicting strength, we choose regression as the mode.
67 |
68 | ### Create model specification ----
69 | model_spec <- mlp(
70 | engine = "nnet",
71 | mode = "regression",
72 | hidden_units = 5,
73 | penalty = 0,
74 | epochs = 100
75 | ) %>%
76 | translate()
77 | model_spec
78 |
79 | ### Fit the model ----
80 | model_fit <- fit(
81 | model_spec,
82 | strength ~ .,
83 | concrete_train
84 | )
85 | model_fit
86 |
87 | ### Take a closer look at the model ----
88 | summary(model_fit$fit)
89 |
90 | ### Make the predictions (you could skip this step) ----
91 | concrete_test_pred <- predict(
92 | model_fit,
93 | new_data = concrete_test,
94 | type = "numeric"
95 | )
96 | concrete_test_pred
97 |
98 | ### Add the predictions to the test tibble ----
99 | concrete_test_with_pred <- augment(model_fit, concrete_test)
100 | concrete_test_with_pred
101 |
102 | ### Metrics ----
103 | concrete_test_with_pred %>% metrics(strength, .pred)
104 |
105 | ### Visualize the network topology ----
106 | # I have yet to find a working solution to create this visualization.
107 | # If you know how to do it, please let me know!
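# An untested suggestion (not part of the original script): the {NeuralNetTools}
# package has a plotnet() function with a method for nnet objects, so plotting
# the underlying engine fit might work:
# NeuralNetTools::plotnet(model_fit$fit)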
108 |
109 |
110 | ## 5. Evaluating model performance ----
111 |
112 | ### Examine the correlation between predicted and actual values ----
113 | concrete_test_with_pred %>%
114 | select(.pred, strength) %>%
115 | correlate()
116 |
117 |
118 | ## 6. Improving model performance with a regularization penalty ----
119 |
120 | ### Create model specification ----
121 | model_spec_2 <- mlp(
122 | engine = "nnet",
123 | mode = "regression",
124 | hidden_units = 5,
125 | penalty = 0.1,
126 | epochs = 100,
127 | ) %>%
128 | translate()
129 | model_spec_2
130 |
131 | ### Fit the model ----
132 | model_fit_2 <- fit(
133 | model_spec_2,
134 | strength ~ .,
135 | concrete_train
136 | )
137 | model_fit_2
138 |
139 | ### Take a closer look at the model ----
140 | summary(model_fit_2$fit)
141 |
142 | ### Make the predictions (you could skip this step) ----
143 | concrete_test_pred_2 <- predict(
144 | model_fit_2,
145 | new_data = concrete_test,
146 | type = "numeric"
147 | )
148 | concrete_test_pred_2
149 |
150 | ### Add the predictions to the test tibble ----
151 | concrete_test_with_pred_2 <- augment(model_fit_2, concrete_test)
152 | concrete_test_with_pred_2
153 |
154 | ### Metrics ----
155 | concrete_test_with_pred_2 %>% metrics(strength, .pred)
156 |
157 | ### Examine the correlation between predicted and actual values ----
158 | concrete_test_with_pred_2 %>%
159 | select(.pred, strength) %>%
160 | correlate()
161 |
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/decision_trees_and_rules/identifying_poisonous_mushrooms_with_rule_learners.r:
--------------------------------------------------------------------------------
1 | # Identifying Poisonous Mushrooms with Rule Learners ----
2 |
3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 5:
4 | # Divide and Conquer - Classification Using Decision Trees and Rules.
5 | #
6 | # The original code is made with {C50}, {gmodels}, {OneR} and {RWeka}. I
7 | # wanted to see how one could recreate it using mainly {tidymodels} and
8 | # {tidyverse}.
9 | #
10 | # You can find the original code and the slightly modified dataset here:
11 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter05
12 |
13 | ## 1. Loading libraries (in the order they get used) ----
14 | library(conflicted)
15 | library(tidyverse)
16 | library(tidymodels)
17 | library(rules)
18 |
19 | ## 2. Exploring and preparing the data ----
20 | mushrooms_tbl <- read_csv("machine_learning_with_r_3rd_ed/decision_trees_and_rules/data/mushrooms.csv")
21 |
22 | ### Examine the data ----
23 | mushrooms_tbl %>% map(unique)
24 |
25 | ### Drop the veil_type feature ----
26 | mushroom_selected_tbl <- mushrooms_tbl %>%
27 | select(-veil_type)
28 |
29 | ### Examine the class distribution ----
30 | mushroom_selected_tbl %>%
31 | count(type) %>%
32 | mutate(pct = (n / sum(n) * 100))
33 |
34 |
35 | ## 3. Creating the recipe ----
36 | recipe_obj <- recipe(
37 | type ~ .,
38 | data = mushroom_selected_tbl
39 | )
40 | recipe_obj
41 |
42 | mushroom_baked_tbl <- recipe_obj %>%
43 | prep() %>%
44 | bake(new_data = NULL)
45 |
46 |
47 | ## 4. Training a model on the data ----
48 |
49 | ### Model specification ----
50 | model_spec <- C5_rules(
51 | mode = "classification",
52 | engine = "C5.0",
53 | trees = NULL,
54 | min_n = NULL
55 | ) %>%
56 | translate()
57 | model_spec
58 |
59 | ### Fit the model ----
60 | model_fit <- fit(
61 | model_spec,
62 | type ~ .,
63 | mushroom_baked_tbl
64 | )
65 | model_fit
66 |
67 | model_fit %>%
68 | extract_fit_engine() %>%
69 | summary()
70 |
71 | ### Make the predictions (you could skip this step) ----
72 | mushroom_pred_tbl <- predict(
73 | object = model_fit,
74 | new_data = mushroom_baked_tbl,
75 | type = "class"
76 | )
77 | mushroom_pred_tbl
78 |
79 | ### Add the predictions to the test tibble ----
80 | mushroom_pred_tbl <- augment(model_fit, mushroom_baked_tbl)
81 | mushroom_pred_tbl
82 |
83 |
84 | ## 5. Evaluating model performance ----
85 |
86 | ### Create a confusion matrix ----
87 | conf_mat <- conf_mat(
88 | data = mushroom_pred_tbl,
89 | truth = type,
90 | estimate = .pred_class
91 | )
92 | conf_mat
93 |
94 | ### Visualize the confusion matrix ----
95 | conf_mat %>% autoplot(type = "heatmap")
96 | conf_mat %>% autoplot(type = "mosaic")
97 |
98 | ### Visualize the ROC curve ----
99 | mushroom_pred_tbl %>%
100 | roc_curve(
101 | truth = type,
102 | estimate = .pred_edible
103 | ) %>%
104 | autoplot()
105 |
106 | ### Calculate the ROC AUC (area under the curve) ----
107 | mushroom_roc_auc <- mushroom_pred_tbl %>%
108 | roc_auc(
109 | truth = type,
110 | estimate = .pred_edible
111 | )
112 | mushroom_roc_auc
113 |
114 | ### Put together other model metrics ----
115 | # Such as accuracy, Matthews correlation coefficient (mcc) and others...
116 | classification_metrics <- conf_mat(
117 | mushroom_pred_tbl,
118 | truth = type,
119 | estimate = .pred_class
120 | ) %>%
121 | summary()
122 | classification_metrics
123 |
124 |
125 | ## 6. Creating a function to help evaluate the model further ----
126 |
127 | # The assumption here is that you have already gone through steps 1. to 4.
128 | # What we're potentially tuning here are the arguments .trees and .min_n
129 |
130 | classify_with_c5_rules <- function(
131 | .trees = NULL,
132 | .min_n = NULL
133 | ) {
134 |
135 | # Create the recipe
136 | recipe_obj <- recipe(
137 | type ~ .,
138 | data = mushroom_selected_tbl
139 | )
140 |
141 | mushroom_baked_tbl <- recipe_obj %>%
142 | prep() %>%
143 | bake(new_data = NULL)
144 |
145 | # Model specification
146 | model_spec <- C5_rules(
147 | mode = "classification",
148 | engine = "C5.0",
149 | trees = .trees,
150 | min_n = .min_n
151 | ) %>%
152 | translate()
153 |
154 | # Fit the model
155 | model_fit <- fit(
156 | model_spec,
157 | type ~ .,
158 | mushroom_baked_tbl
159 | )
160 |
161 | model_fit %>%
162 | extract_fit_engine() %>%
163 | summary()
164 |
165 | # Add the predictions to the test tibble
166 | mushroom_pred_tbl <- augment(model_fit, mushroom_baked_tbl)
167 | mushroom_pred_tbl
168 |
169 | # Create a confusion matrix
170 | conf_mat <- conf_mat(
171 | data = mushroom_pred_tbl,
172 | truth = type,
173 | estimate = .pred_class
174 | )
175 |
176 | conf_mat %>% autoplot(type = "heatmap")
177 |
178 | }
179 |
180 | ### Test the function ----
181 | classify_with_c5_rules(
182 | .trees = 3,
183 | .min_n = 1
184 | )
185 |
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/knn/data/wdbc-names.txt:
--------------------------------------------------------------------------------
1 | 1. Title: Wisconsin Diagnostic Breast Cancer (WDBC)
2 |
3 | 2. Source Information
4 |
5 | a) Creators:
6 |
7 | Dr. William H. Wolberg, General Surgery Dept., University of
8 | Wisconsin, Clinical Sciences Center, Madison, WI 53792
9 | wolberg@eagle.surgery.wisc.edu
10 |
11 | W. Nick Street, Computer Sciences Dept., University of
12 | Wisconsin, 1210 West Dayton St., Madison, WI 53706
13 | street@cs.wisc.edu 608-262-6619
14 |
15 | Olvi L. Mangasarian, Computer Sciences Dept., University of
16 | Wisconsin, 1210 West Dayton St., Madison, WI 53706
17 | olvi@cs.wisc.edu
18 |
19 | b) Donor: Nick Street
20 |
21 | c) Date: November 1995
22 |
23 | 3. Past Usage:
24 |
25 | first usage:
26 |
27 | W.N. Street, W.H. Wolberg and O.L. Mangasarian
28 | Nuclear feature extraction for breast tumor diagnosis.
29 | IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science
30 | and Technology, volume 1905, pages 861-870, San Jose, CA, 1993.
31 |
32 | OR literature:
33 |
34 | O.L. Mangasarian, W.N. Street and W.H. Wolberg.
35 | Breast cancer diagnosis and prognosis via linear programming.
36 | Operations Research, 43(4), pages 570-577, July-August 1995.
37 |
38 | Medical literature:
39 |
40 | W.H. Wolberg, W.N. Street, and O.L. Mangasarian.
41 | Machine learning techniques to diagnose breast cancer from
42 | fine-needle aspirates.
43 | Cancer Letters 77 (1994) 163-171.
44 |
45 | W.H. Wolberg, W.N. Street, and O.L. Mangasarian.
46 | Image analysis and machine learning applied to breast cancer
47 | diagnosis and prognosis.
48 | Analytical and Quantitative Cytology and Histology, Vol. 17
49 | No. 2, pages 77-87, April 1995.
50 |
51 | W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian.
52 | Computerized breast cancer diagnosis and prognosis from fine
53 | needle aspirates.
54 | Archives of Surgery 1995;130:511-516.
55 |
56 | W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian.
57 | Computer-derived nuclear features distinguish malignant from
58 | benign breast cytology.
59 | Human Pathology, 26:792--796, 1995.
60 |
61 | See also:
62 | http://www.cs.wisc.edu/~olvi/uwmp/mpml.html
63 | http://www.cs.wisc.edu/~olvi/uwmp/cancer.html
64 |
65 | Results:
66 |
67 | - predicting field 2, diagnosis: B = benign, M = malignant
68 | - sets are linearly separable using all 30 input features
69 | - best predictive accuracy obtained using one separating plane
70 | in the 3-D space of Worst Area, Worst Smoothness and
71 | Mean Texture. Estimated accuracy 97.5% using repeated
72 | 10-fold crossvalidations. Classifier has correctly
73 | diagnosed 176 consecutive new patients as of November
74 | 1995.
75 |
76 | 4. Relevant information
77 |
78 | Features are computed from a digitized image of a fine needle
79 | aspirate (FNA) of a breast mass. They describe
80 | characteristics of the cell nuclei present in the image.
81 | A few of the images can be found at
82 | http://www.cs.wisc.edu/~street/images/
83 |
84 | Separating plane described above was obtained using
85 | Multisurface Method-Tree (MSM-T) [K. P. Bennett, "Decision Tree
86 | Construction Via Linear Programming." Proceedings of the 4th
87 | Midwest Artificial Intelligence and Cognitive Science Society,
88 | pp. 97-101, 1992], a classification method which uses linear
89 | programming to construct a decision tree. Relevant features
90 | were selected using an exhaustive search in the space of 1-4
91 | features and 1-3 separating planes.
92 |
93 | The actual linear program used to obtain the separating plane
94 | in the 3-dimensional space is that described in:
95 | [K. P. Bennett and O. L. Mangasarian: "Robust Linear
96 | Programming Discrimination of Two Linearly Inseparable Sets",
97 | Optimization Methods and Software 1, 1992, 23-34].
98 |
99 |
100 | This database is also available through the UW CS ftp server:
101 |
102 | ftp ftp.cs.wisc.edu
103 | cd math-prog/cpo-dataset/machine-learn/WDBC/
104 |
105 | 5. Number of instances: 569
106 |
107 | 6. Number of attributes: 32 (ID, diagnosis, 30 real-valued input features)
108 |
109 | 7. Attribute information
110 |
111 | 1) ID number
112 | 2) Diagnosis (M = malignant, B = benign)
113 | 3-32)
114 |
115 | Ten real-valued features are computed for each cell nucleus:
116 |
117 | a) radius (mean of distances from center to points on the perimeter)
118 | b) texture (standard deviation of gray-scale values)
119 | c) perimeter
120 | d) area
121 | e) smoothness (local variation in radius lengths)
122 | f) compactness (perimeter^2 / area - 1.0)
123 | g) concavity (severity of concave portions of the contour)
124 | h) concave points (number of concave portions of the contour)
125 | i) symmetry
126 | j) fractal dimension ("coastline approximation" - 1)
127 |
128 | Several of the papers listed above contain detailed descriptions of
129 | how these features are computed.
130 |
131 | The mean, standard error, and "worst" or largest (mean of the three
132 | largest values) of these features were computed for each image,
133 | resulting in 30 features. For instance, field 3 is Mean Radius, field
134 | 13 is Radius SE, field 23 is Worst Radius.
135 |
136 | All feature values are recorded with four significant digits.
137 |
138 | 8. Missing attribute values: none
139 |
140 | 9. Class distribution: 357 benign, 212 malignant
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/data/usedcars.csv:
--------------------------------------------------------------------------------
1 | year,model,price,mileage,color,transmission
2 | 2011,SEL,21992,7413,Yellow,AUTO
3 | 2011,SEL,20995,10926,Gray,AUTO
4 | 2011,SEL,19995,7351,Silver,AUTO
5 | 2011,SEL,17809,11613,Gray,AUTO
6 | 2012,SE,17500,8367,White,AUTO
7 | 2010,SEL,17495,25125,Silver,AUTO
8 | 2011,SEL,17000,27393,Blue,AUTO
9 | 2010,SEL,16995,21026,Silver,AUTO
10 | 2011,SES,16995,32655,Silver,AUTO
11 | 2010,SES,16995,36116,Silver,AUTO
12 | 2010,SES,16995,40539,Black,AUTO
13 | 2011,SES,16992,9199,Silver,AUTO
14 | 2011,SEL,16950,9388,Green,AUTO
15 | 2010,SES,16950,32058,Red,AUTO
16 | 2011,SE,16000,15367,White,AUTO
17 | 2011,SES,15999,16368,Blue,AUTO
18 | 2010,SEL,15999,19926,Silver,AUTO
19 | 2010,SES,15995,36049,Silver,AUTO
20 | 2011,SEL,15992,11662,Blue,AUTO
21 | 2011,SEL,15992,32069,Silver,AUTO
22 | 2010,SES,15988,16035,Silver,MANUAL
23 | 2010,SEL,15980,39943,White,AUTO
24 | 2011,SE,15899,36685,Silver,AUTO
25 | 2010,SEL,15889,24920,Black,AUTO
26 | 2009,SEL,15688,20019,Blue,AUTO
27 | 2010,SE,15500,29338,Blue,AUTO
28 | 2010,SE,15499,7784,Black,AUTO
29 | 2010,SE,15499,35636,Black,AUTO
30 | 2010,SES,15298,22029,Gray,AUTO
31 | 2009,SEL,14999,33107,Silver,AUTO
32 | 2010,SES,14999,36306,Red,AUTO
33 | 2009,SE,14995,34419,Black,MANUAL
34 | 2011,SE,14992,4867,Black,AUTO
35 | 2011,SEL,14992,18948,Black,AUTO
36 | 2009,SES,14992,24030,Red,AUTO
37 | 2010,SEL,14990,33036,Black,AUTO
38 | 2011,SE,14989,23967,White,AUTO
39 | 2010,SE,14906,37905,Silver,AUTO
40 | 2010,SE,14900,28955,White,AUTO
41 | 2010,SE,14893,11165,White,AUTO
42 | 2010,SES,14761,44813,Black,AUTO
43 | 2010,SES,14699,36469,Gray,AUTO
44 | 2010,SES,14677,22143,Black,MANUAL
45 | 2009,SES,14549,34046,Silver,AUTO
46 | 2010,SE,14499,32703,Red,AUTO
47 | 2010,SES,14495,35894,Silver,AUTO
48 | 2010,SE,14495,38275,Black,AUTO
49 | 2010,SE,14480,24855,Blue,AUTO
50 | 2009,SEL,14477,29501,Gray,MANUAL
51 | 2009,SEL,14355,35394,Red,AUTO
52 | 2010,SE,14299,36447,Black,AUTO
53 | 2010,SES,14275,35318,Black,AUTO
54 | 2010,SES,14000,24929,Silver,AUTO
55 | 2009,SE,13999,23785,Red,AUTO
56 | 2010,SE,13997,15167,Black,MANUAL
57 | 2010,SE,13995,13541,Silver,AUTO
58 | 2010,SE,13995,20278,Black,MANUAL
59 | 2009,SES,13995,46126,Black,AUTO
60 | 2009,SES,13995,53733,Silver,AUTO
61 | 2009,SES,13992,21108,Silver,AUTO
62 | 2010,SE,13992,21721,Green,AUTO
63 | 2010,SES,13992,26716,Gray,MANUAL
64 | 2009,SES,13992,26887,Black,AUTO
65 | 2009,SE,13991,36252,Silver,MANUAL
66 | 2009,SE,13950,9450,Black,AUTO
67 | 2010,SE,13950,31414,Black,AUTO
68 | 2010,SE,13950,37185,Blue,AUTO
69 | 2010,SE,13895,48174,Gray,AUTO
70 | 2009,SE,13888,50533,White,AUTO
71 | 2009,SE,13845,36713,Blue,AUTO
72 | 2009,SES,13799,34888,Black,AUTO
73 | 2009,SES,13742,38380,Black,AUTO
74 | 2010,SEL,13687,35574,Gray,AUTO
75 | 2009,SEL,13663,27528,Silver,AUTO
76 | 2010,SES,13599,33302,Red,AUTO
77 | 2009,SEL,13584,43369,Red,AUTO
78 | 2009,SES,13425,64055,Black,AUTO
79 | 2010,SE,13384,41342,Gray,AUTO
80 | 2010,SE,13383,34503,Black,AUTO
81 | 2010,SE,13350,16573,Blue,AUTO
82 | 2009,SES,12999,32403,Blue,AUTO
83 | 2009,SE,12998,34846,Blue,AUTO
84 | 2007,SE,12997,39665,Red,AUTO
85 | 2010,SE,12995,21325,Black,AUTO
86 | 2010,SE,12995,32743,Black,MANUAL
87 | 2010,SE,12995,40058,White,MANUAL
88 | 2009,SE,12995,42325,Blue,AUTO
89 | 2009,SE,12995,44518,Red,AUTO
90 | 2009,SE,12995,53902,Gray,AUTO
91 | 2008,SE,12995,127327,Red,AUTO
92 | 2009,SE,12992,27136,Gray,AUTO
93 | 2009,SES,12990,45813,Silver,AUTO
94 | 2009,SE,12988,31538,Gray,AUTO
95 | 2010,SE,12849,29517,Silver,AUTO
96 | 2010,SE,12780,35871,Black,AUTO
97 | 2008,SE,12777,49787,Black,MANUAL
98 | 2008,SES,12704,36323,Blue,AUTO
99 | 2009,SES,12595,39211,Blue,AUTO
100 | 2009,SE,12507,44789,Gray,AUTO
101 | 2008,SE,12500,45996,White,MANUAL
102 | 2009,SE,12500,54988,White,MANUAL
103 | 2009,SE,12280,29288,Red,AUTO
104 | 2009,SE,11999,36124,Blue,AUTO
105 | 2009,SE,11992,32559,Black,MANUAL
106 | 2009,SES,11984,59048,Black,AUTO
107 | 2009,SE,11980,55170,Red,AUTO
108 | 2010,SE,11792,39722,Green,AUTO
109 | 2008,SE,11754,38286,Black,AUTO
110 | 2008,SES,11749,57341,Red,AUTO
111 | 2008,SES,11495,82221,Silver,AUTO
112 | 2008,SE,11450,85229,Red,MANUAL
113 | 2009,SES,10995,42834,Red,AUTO
114 | 2005,SES,10995,69415,Blue,AUTO
115 | 2009,SEL,10995,78264,Gray,AUTO
116 | 2009,SE,10979,60709,Red,AUTO
117 | 2008,SE,10955,39643,Gray,AUTO
118 | 2009,SE,10955,40180,Gold,AUTO
119 | 2008,SE,10836,40330,Green,MANUAL
120 | 2007,SES,10815,77231,Red,AUTO
121 | 2007,SE,10770,72937,Silver,MANUAL
122 | 2010,SE,10717,64199,Black,AUTO
123 | 2007,SES,10000,63926,Red,AUTO
124 | 2007,SES,9999,74427,Silver,AUTO
125 | 2007,SES,9999,78948,Black,MANUAL
126 | 2006,SE,9995,51311,Silver,AUTO
127 | 2008,SE,9995,95364,White,AUTO
128 | 2008,SE,9992,74109,White,AUTO
129 | 2007,SE,9651,63296,Blue,AUTO
130 | 2007,SES,9000,80605,Red,AUTO
131 | 2006,SE,8999,49656,Silver,AUTO
132 | 2007,SE,8996,48652,Silver,MANUAL
133 | 2006,SE,8800,71331,White,AUTO
134 | 2008,SE,8495,106171,Black,AUTO
135 | 2008,SE,8494,68901,Silver,AUTO
136 | 2009,SE,8480,70036,White,MANUAL
137 | 2007,SES,7999,81596,Yellow,MANUAL
138 | 2006,SES,7995,35000,Black,MANUAL
139 | 2006,SES,7995,97987,Red,AUTO
140 | 2003,SES,7900,96000,White,AUTO
141 | 2005,SES,7488,59013,Red,AUTO
142 | 2004,SE,6999,105714,Silver,AUTO
143 | 2007,SE,6995,86862,White,AUTO
144 | 2000,SE,6980,60161,Green,AUTO
145 | 2004,SES,6980,101130,Gray,AUTO
146 | 2004,SES,6950,119720,Black,AUTO
147 | 2006,SES,6200,95000,Silver,AUTO
148 | 2002,SE,5995,87003,Red,AUTO
149 | 2000,SE,5980,96841,Red,AUTO
150 | 2001,SE,4899,151479,Yellow,AUTO
151 | 2000,SE,3800,109259,Red,AUTO
152 |
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/Chapter_10.r:
--------------------------------------------------------------------------------
1 | ##### Chapter 10: Evaluating Model Performance -------------------
2 |
3 | ## Confusion matrixes in R ----
4 |
5 | ## Read in the predicted probabilities from the SMS classifier built in Chapter 4.
6 | sms_results <- read.csv("sms_results.csv", stringsAsFactors = TRUE)
7 |
8 | # the first several test cases
9 | head(sms_results)
10 |
11 | # test cases where the model is less confident
12 | head(subset(sms_results, prob_spam > 0.40 & prob_spam < 0.60))
13 |
14 | # test cases where the model was wrong
15 | head(subset(sms_results, actual_type != predict_type))
16 |
17 | # specifying vectors
18 | table(sms_results$actual_type, sms_results$predict_type)
19 |
20 | # alternative solution using the formula interface (not shown in book)
21 | xtabs(~ actual_type + predict_type, sms_results)
22 |
23 | # using the CrossTable function
24 | library(gmodels)
25 | CrossTable(sms_results$actual_type, sms_results$predict_type)
26 |
27 | # accuracy and error rate calculation --
28 | # accuracy
29 | (152 + 1203) / (152 + 1203 + 4 + 31)
30 | # error rate
31 | (4 + 31) / (152 + 1203 + 4 + 31)
32 | # error rate = 1 - accuracy
33 | 1 - 0.9748201
34 |
35 | ## Beyond accuracy: other performance measures ----
36 | library(caret)
37 | confusionMatrix(sms_results$predict_type, sms_results$actual_type, positive = "spam")
38 |
39 | # Kappa statistic
40 | # example using SMS classifier
41 | pr_a <- 0.865 + 0.109
42 | pr_a
43 |
44 | pr_e <- 0.868 * 0.888 + 0.132 * 0.112
45 | pr_e
46 |
47 | k <- (pr_a - pr_e) / (1 - pr_e)
48 | k
49 |
50 | # calculate kappa via the vcd package
51 | library(vcd)
52 | Kappa(table(sms_results$actual_type, sms_results$predict_type))
53 |
54 | # calculate kappa via the irr package
55 | library(irr)
56 | kappa2(sms_results[1:2])
57 |
58 | # Matthews correlation coefficient (MCC)
59 | # for SMS classifier: TN = 1203, FP = 4, FN = 31, TP = 152
60 |
61 | # MCC = (TP * TN - FP * FN) /
62 | # sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
63 |
64 | # compute MCC by hand for SMS classifier
65 | (152 * 1203 - 4 * 31) /
66 | sqrt((152 + 4) * (152 + 31) * (1203 + 4) * (1203 + 31))
67 |
68 | # compute MCC using mcc() function in mltools package
69 | library(mltools)
70 | mcc(sms_results$actual_type, sms_results$predict_type)
71 |
72 | # compute MCC using Pearson correlation
73 | cor(ifelse(sms_results$actual_type == "spam", 1, 0),
74 | ifelse(sms_results$predict_type == "spam", 1, 0))
75 |
76 | # Sensitivity and specificity
77 | # example using SMS classifier
78 | sens <- 152 / (152 + 31)
79 | sens
80 |
81 | spec <- 1203 / (1203 + 4)
82 | spec
83 |
84 | # example using the caret package
85 | library(caret)
86 | sensitivity(sms_results$predict_type, sms_results$actual_type, positive = "spam")
87 | specificity(sms_results$predict_type, sms_results$actual_type, negative = "ham")
88 |
89 | # Precision and recall
90 | prec <- 152 / (152 + 4)
91 | prec
92 |
93 | rec <- 152 / (152 + 31)
94 | rec
95 |
96 | # example using the caret package
97 | library(caret)
98 | posPredValue(sms_results$predict_type, sms_results$actual_type, positive = "spam")
99 | sensitivity(sms_results$predict_type, sms_results$actual_type, positive = "spam")
100 |
101 | # F-measure
102 | f <- (2 * prec * rec) / (prec + rec)
103 | f
104 |
105 | f <- (2 * 152) / (2 * 152 + 4 + 31)
106 | f
107 |
108 | ## Visualizing Performance Tradeoffs ----
109 | library(pROC)
110 | sms_roc <- roc(sms_results$actual_type, sms_results$prob_spam)
111 |
112 | # ROC curve for Naive Bayes
113 | plot(sms_roc, main = "ROC curve for SMS spam filter",
114 | col = "blue", lwd = 2, grid = TRUE, legacy.axes = TRUE)
115 |
116 | # compare to kNN
117 | sms_results_knn <- read.csv("sms_results_knn.csv")
118 | sms_roc_knn <- roc(sms_results$actual_type, sms_results_knn$p_spam)
119 | plot(sms_roc_knn, col = "red", lwd = 2, add = TRUE)
120 |
121 | # calculate AUC for Naive Bayes and kNN
122 | auc(sms_roc)
123 | auc(sms_roc_knn)
124 |
125 | ## Estimating Future Performance ----
126 |
127 | # partitioning data
128 | library(caret)
129 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE)
130 |
131 | # Holdout method
132 | # using random IDs
133 | random_ids <- order(runif(1000))
134 | credit_train <- credit[random_ids[1:500],]
135 | credit_validate <- credit[random_ids[501:750], ]
136 | credit_test <- credit[random_ids[751:1000], ]
137 |
138 | # using caret function
139 | in_train <- createDataPartition(credit$default, p = 0.75, list = FALSE)
140 | credit_train <- credit[in_train, ]
141 | credit_test <- credit[-in_train, ]
142 |
143 | # 10-fold CV
144 | set.seed(123) # to ensure results match
145 | folds <- createFolds(credit$default, k = 10)
146 | str(folds)
147 | credit01_test <- credit[folds$Fold01, ]
148 | credit01_train <- credit[-folds$Fold01, ]
149 |
150 | ## Automating 10-fold CV for a C5.0 Decision Tree using lapply() ----
151 | library(caret)
152 | library(C50)
153 | library(irr)
154 |
155 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE)
156 |
157 | set.seed(123)
158 | folds <- createFolds(credit$default, k = 10)
159 |
160 | cv_results <- lapply(folds, function(x) {
161 | credit_train <- credit[-x, ]
162 | credit_test <- credit[x, ]
163 | credit_model <- C5.0(default ~ ., data = credit_train)
164 | credit_pred <- predict(credit_model, credit_test)
165 | credit_actual <- credit_test$default
166 | kappa <- kappa2(data.frame(credit_actual, credit_pred))$value
167 | return(kappa)
168 | })
169 |
170 | # examine the results of the 10 trials
171 | str(cv_results)
172 |
173 | # compute the average kappa across the 10 trials
174 | mean(unlist(cv_results))
175 |
176 | # compute the standard deviation across the 10 trials
177 | sd(unlist(cv_results))
178 |
179 | ## Bootstrap sampling ----
180 |
181 | # computing the probability that a record is unselected
182 | (1 - (1/1000))^1000
183 | (1 - (1/100000))^100000
184 | 1 / exp(1) # as n approaches infinity
185 |
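186 | # probability that a record IS selected at least once: about 63.2 percent
187 | 1 - 1 / exp(1)
188 | 
189 | # A small illustrative sketch (not from the book's code): the 0.632 bootstrap
190 | # blends the optimistic resubstitution error with the pessimistic test error as
191 | # error = 0.632 * error_test + 0.368 * error_train. With hypothetical error
192 | # rates of 0.25 (test) and 0.10 (train), the blended estimate would be:
193 | 0.632 * 0.25 + 0.368 * 0.10
194 | 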
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/hyperparameter_optimization/hyperparameter_optimization_with_tidymodels.R:
--------------------------------------------------------------------------------
1 | # Hyperparameter Optimization with tidymodels ----
2 |
3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 11:
4 | # Improving Model Performance
5 | #
6 | # The original code is made with {caret}. I wanted to see how one could recreate
7 | # it using mainly {tidymodels} and {tidyverse}.
8 | #
9 | # You can find the original code and the slightly modified dataset here:
10 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter11
11 |
12 | ## 1. Loading libraries (in the order they get used) ----
13 | library(conflicted)
14 | library(tidyverse)
15 | library(tidymodels)
16 |
17 |
18 | ## 2. Loading the dataset ----
19 | credit_tbl <- read_csv("decision_trees_and_rules/data/credit.csv")
20 |
21 |
22 | ## 3. Creating the recipe and splitting the data ----
23 |
24 | ### Convert strings to factors ----
25 | recipe_obj <- recipe(
26 | default ~ .,
27 | data = credit_tbl
28 | ) %>%
29 | step_string2factor(all_nominal())
30 | recipe_obj
31 |
32 | credit_factorized_tbl <- recipe_obj %>%
33 | prep() %>%
34 | bake(new_data = NULL)
35 | credit_factorized_tbl
36 |
37 | ### Create training and test data (randomly) ----
38 | set.seed(123)
39 | credit_split <- initial_split(
40 | credit_factorized_tbl,
41 | prop = 0.9,
42 | strata = default
43 | )
44 |
45 | credit_train <- training(credit_split)
46 | credit_test <- testing(credit_split)
47 |
48 | ### Check the proportion of class variable ----
49 | credit_train %>%
50 | count(default) %>%
51 | mutate(pct = (n / sum(n) * 100))
52 |
53 | credit_test %>%
54 | count(default) %>%
55 | mutate(pct = (n / sum(n) * 100))
56 |
57 |
58 | ## 4. Creating a simple model ----
59 |
60 | ### Create specifications with the default tuning parameters for rpart ----
61 | # tree_depth = 30, min_n = 2 and cost_complexity = 0.01
62 |
63 | set.seed(123)
64 | tune_spec_simple <- decision_tree(
65 | mode = "classification",
66 | engine = "rpart"
67 | )
68 | tune_spec_simple
69 |
70 | ### Fit the model ----
71 | model_fit_simple <- fit(
72 | tune_spec_simple,
73 | default ~ .,
74 | credit_train
75 | )
76 |
77 | model_fit_simple %>%
78 | extract_fit_engine() %>%
79 | summary()
80 |
81 | ### Make the predictions ----
82 | credit_simple <- augment(model_fit_simple, credit_test)
83 | credit_simple
84 |
85 | ### Create a confusion matrix ----
86 | conf_mat_simple <- conf_mat(
87 | data = credit_simple,
88 | truth = default,
89 | estimate = .pred_class
90 | )
91 | conf_mat_simple
92 |
93 | ### Visualize the ROC curve ----
94 | credit_simple %>%
95 | roc_curve(
96 | truth = default,
97 | estimate = .pred_no
98 | ) %>%
99 | autoplot()
100 |
101 | ### Calculate the ROC AUC (area under the curve) ----
102 | credit_roc_auc_simple <- credit_simple %>%
103 | roc_auc(
104 | truth = default,
105 | estimate = .pred_no
106 | )
107 | credit_roc_auc_simple
108 |
109 | ### Put together other model metrics ----
110 | # Such as accuracy, Matthews correlation coefficient (mcc) and others...
111 |
112 | classification_metrics_simple <- conf_mat(
113 | credit_simple,
114 | truth = default,
115 | estimate = .pred_class
116 | ) %>%
117 | summary()
118 | classification_metrics_simple
119 |
120 |
121 | ## 5. Generating a tuning grid ----
122 |
123 | ### Create a specification with tuning placeholders ----
124 | set.seed(123)
125 | tune_spec <- decision_tree(
126 | mode = "classification",
127 | engine = "rpart",
128 | tree_depth = tune(),
129 | min_n = tune(),
130 | cost_complexity = tune()
131 | )
132 | tune_spec
133 |
134 | ### Create a regular grid ----
135 | tree_grid <- grid_regular(
136 | extract_parameter_set_dials(tune_spec),
137 | levels = 4
138 | )
139 | tree_grid
140 |
141 |
142 | ## 6. Tuning along the grid ----
143 |
144 | ### Create CV folds of the credit_train ----
145 | folds <- vfold_cv(
146 | credit_train,
147 | v = 10
148 | )
149 | folds
150 |
151 | ### Tune along the grid ----
152 | tune_results <- tune_grid(
153 | tune_spec,
154 | default ~ .,
155 | resamples = folds,
156 | grid = tree_grid,
157 | metrics = metric_set(accuracy)
158 | )
159 |
160 | ### Plot the tuning results ----
161 | autoplot(tune_results)
162 |
163 |
164 | ## 7. Picking the winner ----
165 |
166 | ### Select the parameters that perform best ----
167 | final_params <- select_best(tune_results)
168 | final_params
169 |
170 | ### Finalize the specification ----
171 | best_spec <- finalize_model(tune_spec, final_params)
172 | best_spec
173 |
174 | ### Build the final model ----
175 | final_model <- fit(
176 | best_spec,
177 | default ~ .,
178 | credit_train
179 | )
180 | final_model
181 |
182 | final_model %>%
183 | extract_fit_engine() %>%
184 | summary()
185 |
186 | ### Add the predictions to the test tibble ----
187 | credit_with_pred_tbl <- augment(final_model, credit_test)
188 | credit_with_pred_tbl
189 |
190 |
191 | ## 8. Evaluating model performance ----
192 |
193 | ### Create a confusion matrix ----
194 | conf_mat <- conf_mat(
195 | data = credit_with_pred_tbl,
196 | truth = default,
197 | estimate = .pred_class
198 | )
199 | conf_mat
200 |
201 | ### Visualize the ROC curve ----
202 | credit_with_pred_tbl %>%
203 | roc_curve(
204 | truth = default,
205 | estimate = .pred_no
206 | ) %>%
207 | autoplot()
208 |
209 | ### Calculate the ROC AUC (area under the curve) ----
210 | credit_roc_auc <- credit_with_pred_tbl %>%
211 | roc_auc(
212 | truth = default,
213 | estimate = .pred_no
214 | )
215 | credit_roc_auc
216 |
217 | ### Put together other model metrics ----
218 | # Such as accuracy, Matthews correlation coefficient (mcc) and others...
219 | classification_metrics <- conf_mat(
220 | credit_with_pred_tbl,
221 | truth = default,
222 | estimate = .pred_class
223 | ) %>%
224 | summary()
225 | classification_metrics
226 |
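227 | 
228 | ## 9. Comparing the simple and tuned models (optional sketch) ----
229 | # Not part of the original script: a quick side-by-side of the ROC AUC of the
230 | # default-parameter model and the tuned model, using the objects created above.
231 | bind_rows(
232 |     credit_roc_auc_simple %>% mutate(model = "default rpart"),
233 |     credit_roc_auc %>% mutate(model = "tuned rpart")
234 | )
235 | 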
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/hyperparameter_optimization/hyperparameter_optimization_with_tidymodels_random.R:
--------------------------------------------------------------------------------
1 | # Hyperparameter Optimization with tidymodels (Random Grid) ----
2 |
3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 11:
4 | # Improving Model Performance
5 | #
6 | # The original code is made with {caret}. I wanted to see how one could recreate
7 | # it using mainly {tidymodels} and {tidyverse}.
8 | #
9 | # You can find the original code and the slightly modified dataset here:
10 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter11
11 |
12 | ## 1. Loading libraries (in the order they get used) ----
13 | library(conflicted)
14 | library(tidyverse)
15 | library(tidymodels)
16 |
17 |
18 | ## 2. Loading the dataset ----
19 | credit_tbl <- read_csv("decision_trees_and_rules/data/credit.csv")
20 |
21 |
22 | ## 3. Creating the recipe and splitting the data ----
23 |
24 | ### Convert strings to factors ----
25 | recipe_obj <- recipe(
26 | default ~ .,
27 | data = credit_tbl
28 | ) %>%
29 | step_string2factor(all_nominal())
30 | recipe_obj
31 |
32 | credit_factorized_tbl <- recipe_obj %>%
33 | prep() %>%
34 | bake(new_data = NULL)
35 | credit_factorized_tbl
36 |
37 | ### Create training and test data (randomly) ----
38 | set.seed(123)
39 | credit_split <- initial_split(
40 | credit_factorized_tbl,
41 | prop = 0.9,
42 | strata = default
43 | )
44 |
45 | credit_train <- training(credit_split)
46 | credit_test <- testing(credit_split)
47 |
48 | ### Check the proportion of class variable ----
49 | credit_train %>%
50 | count(default) %>%
51 | mutate(pct = (n / sum(n) * 100))
52 |
53 | credit_test %>%
54 | count(default) %>%
55 | mutate(pct = (n / sum(n) * 100))
56 |
57 |
58 | ## 4. Creating a simple model ----
59 |
60 | ### Create specifications with the default tuning parameters for rpart ----
61 | # tree_depth = 30, min_n = 2 and cost_complexity = 0.01
62 |
63 | set.seed(123)
64 | tune_spec_simple <- decision_tree(
65 | mode = "classification",
66 | engine = "rpart"
67 | )
68 | tune_spec_simple
69 |
70 | ### Fit the model ----
71 | model_fit_simple <- fit(
72 | tune_spec_simple,
73 | default ~ .,
74 | credit_train
75 | )
76 |
77 | model_fit_simple %>%
78 | extract_fit_engine() %>%
79 | summary()
80 |
81 | ### Make the predictions ----
82 | credit_simple <- augment(model_fit_simple, credit_test)
83 | credit_simple
84 |
85 | ### Create a confusion matrix ----
86 | conf_mat_simple <- conf_mat(
87 | data = credit_simple,
88 | truth = default,
89 | estimate = .pred_class
90 | )
91 | conf_mat_simple
92 |
93 | ### Visualize the ROC curve ----
94 | credit_simple %>%
95 | roc_curve(
96 | truth = default,
97 | estimate = .pred_no
98 | ) %>%
99 | autoplot()
100 |
101 | ### Calculate the ROC AUC (area under the curve) ----
102 | credit_roc_auc_simple <- credit_simple %>%
103 | roc_auc(
104 | truth = default,
105 | estimate = .pred_no
106 | )
107 | credit_roc_auc_simple
108 |
109 | ### Put together other model metrics ----
110 | # Such as accuracy, Matthews correlation coefficient (mcc) and others...
111 |
112 | classification_metrics_simple <- conf_mat(
113 | credit_simple,
114 | truth = default,
115 | estimate = .pred_class
116 | ) %>%
117 | summary()
118 | classification_metrics_simple
119 |
120 |
121 | ## 5. Generating a tuning grid ----
122 |
123 | ### Create a specification with tuning placeholders ----
124 | set.seed(123)
125 | tune_spec <- decision_tree(
126 | mode = "classification",
127 | engine = "rpart",
128 | tree_depth = tune(),
129 | min_n = tune(),
130 | cost_complexity = tune()
131 | )
132 | tune_spec
133 |
134 | ### Create a random grid (grid_random() takes size instead of levels) ----
135 | tree_grid <- grid_random(
136 |     extract_parameter_set_dials(tune_spec),
137 |     size = 64 # roughly matches the 4^3 = 64 combinations of the regular grid
138 | )
139 | tree_grid
140 |
141 |
142 | ## 6. Tuning along the grid ----
143 |
144 | ### Create CV folds of the credit_train ----
145 | folds <- vfold_cv(
146 | credit_train,
147 | v = 10
148 | )
149 | folds
150 |
151 | ### Tune along the grid ----
152 | tune_results <- tune_grid(
153 | tune_spec,
154 | default ~ .,
155 | resamples = folds,
156 | grid = tree_grid,
157 | metrics = metric_set(accuracy)
158 | )
159 |
160 | ### Plot the tuning results ----
161 | autoplot(tune_results)
162 |
163 |
164 | ## 7. Picking the winner ----
165 |
166 | ### Select the parameters that perform best ----
167 | final_params <- select_best(tune_results)
168 | final_params
169 |
170 | ### Finalize the specification ----
171 | best_spec <- finalize_model(tune_spec, final_params)
172 | best_spec
173 |
174 | ### Build the final model ----
175 | final_model <- fit(
176 | best_spec,
177 | default ~ .,
178 | credit_train
179 | )
180 | final_model
181 |
182 | final_model %>%
183 | extract_fit_engine() %>%
184 | summary()
185 |
186 | ### Add the predictions to the test tibble ----
187 | credit_with_pred_tbl <- augment(final_model, credit_test)
188 | credit_with_pred_tbl
189 |
190 |
191 | ## 8. Evaluating model performance ----
192 |
193 | ### Create a confusion matrix ----
194 | conf_mat <- conf_mat(
195 | data = credit_with_pred_tbl,
196 | truth = default,
197 | estimate = .pred_class
198 | )
199 | conf_mat
200 |
201 | ### Visualize the ROC curve ----
202 | credit_with_pred_tbl %>%
203 | roc_curve(
204 | truth = default,
205 | estimate = .pred_no
206 | ) %>%
207 | autoplot()
208 |
209 | ### Calculate the ROC AUC (area under the curve) ----
210 | credit_roc_auc <- credit_with_pred_tbl %>%
211 | roc_auc(
212 | truth = default,
213 | estimate = .pred_no
214 | )
215 | credit_roc_auc
216 |
217 | ### Put together other model metrics ----
218 | # Such as accuracy, Matthews correlation coefficient (mcc) and others...
219 | classification_metrics <- conf_mat(
220 | credit_with_pred_tbl,
221 | truth = default,
222 | estimate = .pred_class
223 | ) %>%
224 | summary()
225 | classification_metrics
226 |
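227 | 
228 | ## 9. Inspecting the top candidates (optional sketch) ----
229 | # Not part of the original script: show_best() from {tune} lists the best few
230 | # parameter combinations found during the random-grid search, which helps judge
231 | # how sensitive accuracy is to the sampled values.
232 | show_best(tune_results, metric = "accuracy", n = 5)
233 | 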
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/Chapter_12.r:
--------------------------------------------------------------------------------
1 | ##### Chapter 12: Advanced Data Preparation --------------------
2 |
3 | ## Exploring R's tidyverse ----
4 |
5 | library(tidyverse) # load all tidyverse packages
6 |
7 | # convert the Titanic training dataset into a tibble
8 | library(tibble) # not necessary if tidyverse is already loaded
9 | titanic_csv <- read.csv("titanic_train.csv")
10 | titanic_tbl <- as_tibble(titanic_csv)
11 | titanic_tbl
12 |
13 | # read the titanic training dataset using readr
14 | library(readr) # not necessary if tidyverse is already loaded
15 | titanic_train <- read_csv("titanic_train.csv")
16 |
17 | # read the titanic training dataset using readxl
18 | library(readxl)
19 | titanic_train <- read_excel("titanic_train.xlsx")
20 |
21 | # preparing and piping data with dplyr
22 | library(dplyr)
23 |
24 | # filter for female rows only
25 | titanic_train |> filter(Sex == "female")
26 |
27 | # select only name, sex, and age columns
28 | titanic_train |> select(Name, Sex, Age)
29 |
30 | # combine multiple dplyr verbs and save output to a tibble
31 | titanic_women <- titanic_train |>
32 | filter(Sex == "female") |>
33 | select(Name, Sex, Age) |>
34 | arrange(Name)
35 |
36 | # create a new feature indicating elderly age
37 | titanic_train |>
38 | mutate(elderly = if_else(Age >= 65, 1, 0))
39 |
40 | # create multiple features within the same mutate command
41 | titanic_train |>
42 | mutate(
43 | elderly = if_else(Age >= 65, 1, 0),
44 | child = if_else(Age < 18, 1, 0)
45 | )
46 |
47 | # compute survival rate by gender
48 | titanic_train |>
49 | group_by(Sex) |>
50 | summarize(survival_rate = mean(Survived))
51 |
52 | # compute average survival rate for children vs. non-children
53 | titanic_train |>
54 | filter(!is.na(Age)) |>
55 | mutate(child = if_else(Age < 18, 1, 0)) |>
56 | group_by(child) |>
57 | summarize(survival_rate = mean(Survived))
58 |
59 | # transform the dataset and pipe into a decision tree
60 | library(rpart)
61 | m_titanic <- titanic_train |>
62 | filter(!is.na(Age)) |>
63 | mutate(AgeGroup = if_else(Age < 18, "Child", "Adult")) |>
64 | select(Survived, Pclass, Sex, AgeGroup) |>
65 | rpart(formula = Survived ~ ., data = _)
66 |
67 | library(rpart.plot)
68 | rpart.plot(m_titanic)
69 |
70 | ## Transforming text with stringr ----
71 | library(readr)
72 | titanic_train <- read_csv("titanic_train.csv")
73 |
74 | library(stringr)
75 |
76 | # examine cabin prefix code
77 | titanic_train <- titanic_train |>
78 | mutate(CabinCode = str_sub(Cabin, start = 1, end = 1))
79 |
80 | # compare cabin prefix to passenger class
81 | table(titanic_train$Pclass, titanic_train$CabinCode,
82 | useNA = "ifany")
83 |
84 | # plot of survival probability by cabin code
85 | library(ggplot2)
86 | titanic_train |> ggplot() +
87 | geom_bar(aes(x = CabinCode, y = Survived),
88 | stat = "summary", fun = "mean") +
89 | ggtitle("Titanic Survival Rate by Cabin Code")
90 |
91 | # look at the first few passenger names
92 | head(titanic_train$Name)
93 |
94 | # create a title / salutation feature
95 | titanic_train <- titanic_train |>
96 | # use regular expressions to find the characters between the comma and period
97 | mutate(Title = str_extract(Name, ", [A-z]+\\."))
98 |
99 | # look at the first few examples
100 | head(titanic_train$Title)
101 |
102 | # clean up the title feature
103 | titanic_train <- titanic_train |>
104 | mutate(Title = str_replace_all(Title, "[, \\.]", ""))
105 |
106 | # examine output
107 | table(titanic_train$Title)
108 |
109 | # group titles into related categories
110 | titanic_train <- titanic_train |>
111 | mutate(TitleGroup = recode(Title,
112 | # the first few stay the same
113 | "Mr" = "Mr", "Mrs" = "Mrs", "Master" = "Master",
114 | "Miss" = "Miss",
115 | # combine variants of "Miss"
116 | "Ms" = "Miss", "Mlle" = "Miss", "Mme" = "Miss",
117 | # anything else will be "Other"
118 | .missing = "Other",
119 | .default = "Other"
120 | )
121 | )
122 |
123 | # examine output
124 | table(titanic_train$TitleGroup)
125 |
126 | # plot of survival probability by title group
127 | library(ggplot2)
128 | titanic_train |> ggplot() +
129 | geom_bar(aes(x = TitleGroup, y = Survived),
130 | stat = "summary", fun = "mean") +
131 | ggtitle("Titanic Survival Rate by Salutation")
132 |
133 | ## Cleaning dates with lubridate ----
134 |
135 | library(lubridate)
136 |
137 | # reading in Machine Learning with R publication dates in different formats
138 | mdy(c("October 25, 2013", "10/25/2013"))
139 | dmy(c("25 October 2013", "25.10.13"))
140 | ymd("2013-10-25")
141 |
142 | # construct MLwR publication dates
143 | MLwR_1stEd <- mdy("October 25, 2013")
144 | MLwR_2ndEd <- mdy("July 31, 2015")
145 | MLwR_3rdEd <- mdy("April 15, 2019")
146 |
147 | # compute differences (returns a difftime object)
148 | MLwR_2ndEd - MLwR_1stEd
149 | MLwR_3rdEd - MLwR_2ndEd
150 |
151 | # convert the differences to durations
152 | as.duration(MLwR_2ndEd - MLwR_1stEd)
153 | as.duration(MLwR_3rdEd - MLwR_2ndEd)
154 |
155 | # convert the duration to years
156 | dyears()
157 | as.duration(MLwR_2ndEd - MLwR_1stEd) / dyears()
158 | as.duration(MLwR_3rdEd - MLwR_2ndEd) / dyears()
159 |
160 | # easier-to-remember version of the above:
161 | time_length(MLwR_2ndEd - MLwR_1stEd, unit = "years")
162 | time_length(MLwR_3rdEd - MLwR_2ndEd, unit = "years")
163 |
164 | # compute age (in duration)
165 | USA_DOB <- mdy("July 4, 1776") # USA's Date of Birth
166 | time_length(mdy("July 3 2023") - USA_DOB, unit = "years")
167 | time_length(mdy("July 5 2023") - USA_DOB, unit = "years")
168 |
169 | # compute age (using intervals)
170 | interval(USA_DOB, mdy("July 3 2023")) / years()
171 | interval(USA_DOB, mdy("July 5 2023")) / years()
172 |
173 | # compute age (using integer division)
174 | USA_DOB %--% mdy("July 3 2023") %/% years()
175 | USA_DOB %--% mdy("July 5 2023") %/% years()
176 |
177 | # function to compute calendar age
178 | age <- function(birthdate) {
179 | birthdate %--% today() %/% years()
180 | }
181 |
182 | # compute age of celebrities
183 | age(mdy("Jan 12, 1964")) # Jeff Bezos
184 | age(mdy("June 28, 1971")) # Elon Musk
185 | age(mdy("Oct 28, 1955")) # Bill Gates
186 |
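187 | # The same interval pattern works for any reference date. As a small
188 | # illustrative sketch (not part of the book's code), the number of whole years
189 | # since the first edition was published:
190 | MLwR_1stEd %--% today() %/% years()
191 | 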
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/boosting/boosting_with_tidymodels.R:
--------------------------------------------------------------------------------
1 | # Boosting with tidymodels ----
2 |
3 | # Inspired by Brett Lantz's Machine Learning with R,
4 | # Chapter 5: Divide and Conquer - Classification Using Decision Trees and Rules and
5 | # Chapter 10: Evaluating Model Performance
6 | #
7 | # The original code is made with {C50}, {gmodels}, {OneR} and {RWeka}. I
8 | # wanted to see how one could recreate it using mainly {tidymodels} and
9 | # {tidyverse}.
10 | #
11 | # You can find the original code and the slightly modified dataset here:
12 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter05
13 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter10
14 |
15 | ## 1. Loading libraries (in the order they get used) ----
16 | library(conflicted)
17 | library(tidyverse)
18 | library(tidymodels)
19 |
20 | ## 2. Exploring and preparing the data ----
21 | credit_tbl <- read_csv("decision_trees_and_rules/data/credit.csv")
22 |
23 | ### Examine the structure of the credit data ----
24 | glimpse(credit_tbl)
25 |
26 | ### look at two characteristics of the applicant ----
27 | credit_tbl %>%
28 | count(checking_balance) %>%
29 | mutate(pct = (n / sum(n) * 100))
30 |
31 | credit_tbl %>%
32 | count(savings_balance) %>%
33 | mutate(pct = (n / sum(n) * 100))
34 |
35 | ### look at two characteristics of the loan ----
36 | credit_tbl %>%
37 | select(months_loan_duration, amount) %>%
38 | summary()
39 |
40 | ### look at the class variable ----
41 | credit_tbl %>%
42 | count(default) %>%
43 | mutate(pct = (n / sum(n) * 100))
44 |
45 |
46 | ## 3. Creating the recipe and splitting the data ----
47 |
48 | ### Convert strings to factors ----
49 | recipe_obj <- recipe(
50 | default ~ .,
51 | data = credit_tbl
52 | ) %>%
53 | step_string2factor(all_nominal())
54 | recipe_obj
55 |
56 | credit_factorized_tbl <- recipe_obj %>%
57 | prep() %>%
58 | bake(new_data = NULL)
59 | credit_factorized_tbl
60 |
61 | ### Create training and test data (randomly) ----
62 |
63 | # Use set.seed to use the same random number sequence as the original
64 | RNGversion("3.5.2")
65 | set.seed(123)
66 |
67 | credit_split <- initial_split(
68 | credit_factorized_tbl,
69 | prop = 0.9
70 | )
71 | credit_train <- training(credit_split)
72 | credit_test <- testing(credit_split)
73 |
74 | ### Check the proportion of class variable ----
75 | credit_train %>%
76 | count(default) %>%
77 | mutate(pct = (n / sum(n) * 100))
78 |
79 | credit_test %>%
80 | count(default) %>%
81 | mutate(pct = (n / sum(n) * 100))
82 |
83 |
84 | ## 4. Building a boosted ensemble ----
85 |
86 | ### Specify the model class ----
87 | boost_spec <- boost_tree() %>%
88 | # Set the mode
89 | set_mode("classification") %>%
90 | # Set the engine
91 | set_engine("xgboost")
92 | boost_spec
93 |
94 |
95 | ## 5. Evaluate the ensemble ----
96 | set.seed(99)
97 |
98 | ### Create CV folds ----
99 | folds <- vfold_cv(data = credit_train, v = 10)
100 | folds
101 |
102 | ### Fit and evaluate models for all folds ----
103 | cv_results <- fit_resamples(
104 | boost_spec,
105 | default ~ .,
106 | resamples = folds,
107 | metrics = metric_set(roc_auc)
108 | )
109 |
110 | ### Collect cross-validated metrics ----
111 | collect_metrics(cv_results)
112 |
113 |
114 | ## 6. Tuning preparation ----
115 |
116 | ### Create the specification with placeholders ----
117 | boost_spec <- boost_tree(
118 | trees = 500,
119 | learn_rate = tune(),
120 | tree_depth = tune(),
121 | sample_size = tune()) %>%
122 | set_mode("classification") %>%
123 | set_engine("xgboost")
124 |
125 | ### Create the tuning grid ----
126 | tunegrid_boost <- grid_regular(
127 | extract_parameter_set_dials(boost_spec),
128 | levels = 3
129 | )
130 | tunegrid_boost
131 |
132 |
133 | ## 7. The actual tuning ----
134 |
135 | ### Create CV folds of training data ----
136 | folds <- vfold_cv(
137 | credit_train,
138 | v = 10
139 | )
140 |
141 | ### Tune along the grid ----
142 | tune_results <- tune_grid(
143 | boost_spec,
144 | default ~ .,
145 | resamples = folds,
146 | grid = tunegrid_boost,
147 | metrics = metric_set(roc_auc)
148 | )
149 |
150 | ### Plot the results ----
151 | autoplot(tune_results)
152 |
153 |
154 | ## 8. Finalizing the model ----
155 |
156 | ### Select the final hyperparameters ----
157 | best_params <- select_best(tune_results)
158 | best_params
159 |
160 | ### Finalize the specification ----
161 | final_spec <- finalize_model(boost_spec, best_params)
162 | final_spec
163 |
164 | ### Train the final model on the full training data ----
165 | final_model <- final_spec %>%
166 | fit(
167 | formula = default ~ .,
168 | data = credit_train
169 | )
170 | final_model
171 |
172 | # CONTINUE FROM HERE! (a sketch of the remaining evaluation steps is at the end of this file)
173 | # https://app.datacamp.com/learn/courses/machine-learning-with-tree-based-models-in-r
174 |
175 | # ### Train the forest ----
176 | # model <- spec %>%
177 | # fit(
178 | # default ~ .,
179 | # data = credit_train
180 | # )
181 | # model
182 | #
183 | # ### Plot the variable importance
184 | # vip::vip(model)
185 | #
186 | #
187 | # ## 5. Predicting ----
188 | #
189 | # ### Make the predictions (you could skip this step) ----
190 | # credit_test_pred <- predict(
191 | # object = model,
192 | # new_data = credit_test,
193 | # type = "class"
194 | # )
195 | # credit_test_pred
196 | #
197 | # ### Add the predictions to the test tibble ----
198 | # credit_test_with_pred_tbl <- augment(model, credit_test)
199 | # credit_test_with_pred_tbl
200 | #
201 | #
202 | # ## 6. Evaluating model performance ----
203 | #
204 | # ### Create a confusion matrix ----
205 | # conf_mat <- conf_mat(
206 | # data = credit_test_with_pred_tbl,
207 | # truth = default,
208 | # estimate = .pred_class
209 | # )
210 | # conf_mat
211 | #
212 | # ### Visualize the confusion matrix ----
213 | # conf_mat %>% autoplot(type = "heatmap")
214 | # conf_mat %>% autoplot(type = "mosaic")
215 | #
216 | # ### Visualize the ROC curve ----
217 | # credit_test_with_pred_tbl %>%
218 | # roc_curve(
219 | # truth = default,
220 | # estimate = .pred_no
221 | # ) %>%
222 | # autoplot()
223 | #
224 | # ### Calculate the ROC AUC (area under the curve) ----
225 | # credit_roc_auc <- credit_test_with_pred_tbl %>%
226 | # roc_auc(
227 | # truth = default,
228 | # estimate = .pred_no
229 | # )
230 | # credit_roc_auc
231 | #
232 | # ### Put together other model metrics ----
233 | # # Such as accuracy, Matthews correlation coefficient (mcc) and others...
234 | # classification_metrics <- conf_mat(
235 | # credit_test_with_pred_tbl,
236 | # truth = default,
237 | # estimate = .pred_class
238 | # ) %>%
239 | # summary()
240 | # classification_metrics
241 |
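242 | ## 9. Evaluating the final model (sketch) ----
243 | # A minimal sketch of the evaluation step that the note above says is still to
244 | # be done. It follows the same pattern as the other scripts in this repository
245 | # and only uses objects created above (final_model, credit_test).
246 | 
247 | ### Add the predictions to the test tibble ----
248 | credit_test_with_pred_tbl <- augment(final_model, credit_test)
249 | credit_test_with_pred_tbl
250 | 
251 | ### Create a confusion matrix and summarize the usual metrics ----
252 | conf_mat(
253 |     data = credit_test_with_pred_tbl,
254 |     truth = default,
255 |     estimate = .pred_class
256 | ) %>%
257 |     summary()
258 | 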
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/support_vector_machines/performing_optical_character_recognition_with_support_vector_machines_.r:
--------------------------------------------------------------------------------
1 | # Performing Optical Character Recognition with Support Vector Machines ----
2 |
3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 7:
4 | # Black Box Methods - Neural Networks and Support Vector Machines
5 | #
6 | # The original code is made with {kernlab}. I wanted to see how one could
7 | # recreate it using mainly {tidymodels} and {tidyverse}.
8 | #
9 | # You can find the original code and the dataset here:
10 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter07
11 |
12 | ## 1. Loading libraries (in the order they get used) ----
13 | library(conflicted)
14 | library(tidyverse)
15 | library(tidymodels)
16 | library(janitor)
17 |
18 |
19 | ## 2. Exploring and preparing the data ----
20 |
21 | ### Read in data and examine structure ----
22 | letters_tbl <- read_csv("machine_learning_with_r_3rd_ed/support_vector_machines/data/letterdata.csv") %>%
23 | mutate(across(where(is.character), as.factor))
24 |
25 | letters_tbl %>%
26 | glimpse()
27 |
28 |
29 | ## 3. Creating the recipe ----
30 |
31 | ### Apply normalization to entire data frame ----
32 | recipe_obj <- recipe(
33 | letter ~ .,
34 | data = letters_tbl
35 | )
36 | recipe_obj
37 |
38 | letters_baked_tbl <- recipe_obj %>%
39 | prep() %>%
40 | bake(new_data = NULL)
41 | letters_baked_tbl
42 |
43 | ### Create training and test data ----
44 | letters_split <- initial_time_split(
45 | letters_baked_tbl,
46 | prop = 16000 / 20000
47 | )
48 | letters_train <- training(letters_split)
49 | letters_test <- testing(letters_split)
50 |
51 |
52 | ## 4. Training a model on the data ----
53 |
54 | ### Create model specification ----
55 | model_spec <- svm_linear(
56 | engine = "kernlab",
57 | mode = "classification",
58 | cost = NULL,
59 | margin = NULL
60 | ) %>%
61 | translate()
62 | model_spec
63 |
64 | ### Fit the model ----
65 | model_fit <- fit(
66 | model_spec,
67 | letter ~ .,
68 | letters_train
69 | )
70 | model_fit
71 |
72 | ### Make the predictions (you could skip this step) ----
73 | letters_test_pred <- predict(
74 | model_fit,
75 | new_data = letters_test
76 | )
77 | letters_test_pred
78 |
79 | ### Add the predictions to the test tibble ----
80 | letters_test_with_pred <- augment(model_fit, letters_test)
81 | letters_test_with_pred
82 |
83 |
84 | ## 5. Evaluating model performance ----
85 |
86 | ### Predictions on testing dataset ----
87 | letters_test_with_pred %>%
88 | tabyl(letter, .pred_class)
89 |
90 | ### Look only at agreement vs. non-agreement ----
91 | # Construct a vector of TRUE/FALSE indicating correct/incorrect predictions
92 | letters_test_with_pred %>%
93 | mutate(
94 | agreement = case_when(
95 | letter == .pred_class ~ TRUE,
96 | TRUE ~ FALSE
97 | )
98 | ) %>%
99 | tabyl(agreement)
100 |
101 |
102 | ## 6. Improving model performance ----
103 |
104 | ### Change to a RBF kernel ----
105 | model_spec_rbf <- svm_rbf(
106 | engine = "kernlab",
107 | mode = "classification",
108 | cost = NULL,
109 | margin = NULL,
110 | rbf_sigma = NULL
111 | ) %>%
112 | translate()
113 | model_spec_rbf
114 |
115 | ### Fit the model ----
116 | model_fit_rbf <- fit(
117 | model_spec_rbf,
118 | letter ~ .,
119 | letters_train
120 | )
121 | model_fit_rbf
122 |
123 | ### Make the predictions (you could skip this step) ----
124 | letters_test_pred_rbf <- predict(
125 | model_fit_rbf,
126 | new_data = letters_test
127 | )
128 | letters_test_pred_rbf
129 |
130 | ### Add the predictions to the test tibble ----
131 | letters_test_with_pred_rbf <- augment(model_fit_rbf, letters_test)
132 | letters_test_with_pred_rbf
133 |
134 | ### Predictions on testing dataset ----
135 | letters_test_with_pred_rbf %>%
136 | tabyl(letter, .pred_class)
137 |
138 | ### Look only at agreement vs. non-agreement ----
139 | # Construct a vector of TRUE/FALSE indicating correct/incorrect predictions
140 | letters_test_with_pred_rbf %>%
141 | mutate(
142 | agreement = case_when(
143 | letter == .pred_class ~ TRUE,
144 | TRUE ~ FALSE
145 | )
146 | ) %>%
147 | tabyl(agreement)
148 |
149 | ### Test various values of the cost parameter ----
150 | cost_values <- c(1, seq(from = 5, to = 40, by = 5))
151 |
152 | accuracy_values <- map_dbl(cost_values, function(x) {
153 |
154 | model_spec_rbf <- svm_rbf(
155 | engine = "kernlab",
156 | mode = "classification",
157 | cost = {{ x }},
158 | margin = NULL,
159 | rbf_sigma = NULL
160 | ) %>%
161 | translate()
162 |
163 | model_fit_rbf <- fit(
164 | model_spec_rbf,
165 | letter ~ .,
166 | letters_train
167 | )
168 |
169 | letters_test_pred_rbf <- predict(
170 | model_fit_rbf,
171 | new_data = letters_test
172 | ) %>% as_vector()
173 |
174 | agree <- ifelse(letters_test_pred_rbf == letters_test$letter, 1, 0)
175 |
176 | accuracy <- sum(agree) / nrow(letters_test)
177 |
178 | return(accuracy)
179 |
180 | })
181 |
182 | ### Bind together the cost parameter and accuracy values ----
183 | cost_vs_accuracy_tbl <- bind_cols(
184 | cost_values,
185 | accuracy_values,
186 | ) %>%
187 | rename(
188 | cost_values = ...1,
189 | accuracy_values = ...2
190 | )
191 |
192 | ### Visualize to find the optimal cost parameter value ----
193 | cost_vs_accuracy_tbl %>%
194 | ggplot(aes(cost_values, accuracy_values)) +
195 | geom_line() +
196 | geom_point() +
197 | theme_bw() +
198 | labs(
199 | x = "Cost Parameter Value",
200 | y = "Accuracy"
201 | )
202 |
203 | ### Make sure you have the right optimal cost value for the best accuracy ----
204 | cost_vs_accuracy_tbl %>%
205 | slice_max(accuracy_values)
206 |
207 | ### Pull the first cost_value that has the max accuracy_value ----
208 | .max_accuracy <- cost_vs_accuracy_tbl %>%
209 | slice_max(accuracy_values) %>%
210 | slice(1) %>%
211 | pull(cost_values)
212 |
213 |
214 | ## 7. Fitting the model with the optimal cost value (that was just pulled) ----
215 |
216 | ### Give model specification ----
217 | model_spec_best <- svm_rbf(
218 | engine = "kernlab",
219 | mode = "classification",
220 | cost = {{ .max_accuracy }},
221 | margin = NULL,
222 | rbf_sigma = NULL
223 | ) %>%
224 | translate()
225 |
226 | ### Fit the model ----
227 | model_fit_best <- fit(
228 | model_spec_best,
229 | letter ~ .,
230 | letters_train
231 | )
232 |
233 | ### Add the predictions to the test tibble ----
234 | letters_test_with_pred_best <- augment(model_fit_best, letters_test)
235 |
236 | ### Predictions on testing dataset ----
237 | letters_test_with_pred_best %>%
238 | tabyl(letter, .pred_class)
239 |
240 | ### Look only at agreement vs. non-agreement ----
241 | # Construct a vector of TRUE/FALSE indicating correct/incorrect predictions
242 | letters_test_with_pred_best %>%
243 | mutate(
244 | agreement = case_when(
245 | letter == .pred_class ~ TRUE,
246 | TRUE ~ FALSE
247 | )
248 | ) %>%
249 | tabyl(agreement)
250 |
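251 | ### Overall accuracy with yardstick (optional sketch) ----
252 | # Not in the original script: the agreement table above can be condensed into a
253 | # single accuracy number with yardstick, using the columns created by augment().
254 | letters_test_with_pred_best %>%
255 |     accuracy(truth = letter, estimate = .pred_class)
256 | 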
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/k_means_clustering/finding_teen_market_segments_using_k_means_clustering.r:
--------------------------------------------------------------------------------
1 | # Finding Teen Market Segments Using k-means Clustering ----
2 |
3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 9:
4 | # Finding Groups of Data - Clustering with k-means
5 | #
6 | # The original code is made with base R. While {tidymodels} doesn't have a
7 | # k-means engine, I wanted to see how one could still recreate the code using
8 | # {tidyverse} as much as possible, using the {recipes} package from {tidymodels}
9 | # for all the pre-processing and the {broom} package from the same family to
10 | # look at the model metrics in a tidier way.
11 | #
12 | # I was also inspired by this blog post from the tidymodels website to continue
13 | # the exercise a bit further than the original code did:
14 | # https://www.tidymodels.org/learn/statistics/k-means/
15 | #
16 | # You can find the original code and the dataset here:
17 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter09
18 |
19 | ## 1. Loading libraries ----
20 | library(conflicted)
21 | library(tidyverse)
22 | library(janitor)
23 | library(tidymodels)
24 |
25 |
26 | ## 2. Exploring and preparing the data ----
27 | teens_tbl <- read_csv("machine_learning_with_r_3rd_ed/k_means_clustering/data/snsdata.csv")
28 |
29 | teens_tbl %>%
30 | glimpse()
31 |
32 | ### Look at missing data for age variable ----
33 | teens_tbl %>%
34 | select(age) %>%
35 | summary()
36 |
37 | ### Eliminate age outliers ----
38 | teens_only_tbl <- teens_tbl %>%
39 | mutate(
40 | age = case_when(
41 | between(age, 13, 20) ~ age,
42 | TRUE ~ NA_real_
43 | )
44 | )
45 |
46 | teens_only_tbl %>%
47 | select(age) %>%
48 | summary()
49 |
50 | ### Look at missing data for gender variable ----
51 | teens_only_tbl %>%
52 | tabyl(gender)
53 |
54 | ### Reassign missing gender values to "unknown" and change into factors ----
55 | teens_gendered_tbl <- teens_only_tbl %>%
56 | mutate(
57 | gender = case_when(
58 | gender %in% c("F", "M") ~ gender,
59 | TRUE ~ "gender_unknown"
60 | ) %>%
61 | as.factor()
62 | )
63 |
64 | ### Check our recoding work ----
65 | teens_gendered_tbl %>%
66 | tabyl(gender) %>%
67 | arrange(desc(percent))
68 |
69 | ### Finding the mean age by cohort ----
70 | # Doesn't work because of the NAs
71 | teens_gendered_tbl %>%
72 | pull(age) %>%
73 | mean()
74 |
75 | # Works thanks to the na.rm = TRUE
76 | teens_gendered_tbl %>%
77 | pull(age) %>%
78 | mean(na.rm = TRUE)
79 |
80 | ### Age by cohort ----
81 | teens_gendered_tbl %>%
82 | with_groups(
83 | gradyear,
84 | summarize,
85 | age = mean(age, na.rm = TRUE)
86 | )
87 |
88 | ### Create a vector with the average age for each gradyear, repeated by person ----
89 | average_age_by_gradyear <- teens_gendered_tbl %>%
90 | with_groups(
91 | gradyear,
92 | mutate,
93 | ave_age = mean(age, na.rm = TRUE)
94 | ) %>%
95 | pull()
96 |
97 | ### Impute the missing age values with the average age by gradyear ----
98 | teens_imputed_tbl <- teens_gendered_tbl %>%
99 | mutate(
100 | age = case_when(
101 | !is.na(age) ~ age,
102 | TRUE ~ average_age_by_gradyear
103 | )
104 | )
105 |
106 | ### Check the summary results to ensure missing values are eliminated ----
107 | teens_imputed_tbl %>%
108 | select(age) %>%
109 | summary()
110 |
111 |
112 | ## 3. Creating the recipe ----
113 |
114 | ### Apply normalization to entire data frame ----
115 | recipe_obj <- recipe(teens_imputed_tbl) %>%
116 | step_normalize(
117 | all_numeric(),
118 | -c(gradyear, age, friends)
119 | ) %>%
120 | step_dummy(
121 | gender,
122 | keep_original_cols = TRUE,
123 | one_hot = TRUE
124 | )
125 | recipe_obj
126 |
127 | recipe_baked_tbl <- recipe_obj %>%
128 | prep() %>%
129 | bake(new_data = NULL)
130 | recipe_baked_tbl
131 |
132 |
133 | ## 4. Training a model on the data ----
134 |
135 | ### Create a z-score standardized data frame for easier interpretation ----
136 | interests_tbl <- recipe_baked_tbl %>%
137 | select(5:40)
138 |
139 | ### Compare the data before and after the transformation ----
140 |
141 | # Before
142 | teens_imputed_tbl %>%
143 | select(basketball) %>%
144 | summary()
145 |
146 | # After
147 | interests_tbl %>%
148 | select(basketball) %>%
149 | summary()
150 |
151 | ### Create the clusters using k-means ----
152 | RNGversion("3.5.2")
153 | set.seed(2345)
154 |
155 | teens_clusters <- interests_tbl %>%
156 | kmeans(5)
157 | teens_clusters
158 |
159 |
160 | ## 5. Evaluating model performance ----
161 |
162 | ### Look at the single-row summary ----
163 | teens_clusters %>%
164 | glance()
165 |
166 | ### Look at the size and the centers of the clusters ----
167 | teens_clusters %>%
168 | tidy() %>%
169 | select(cluster, size, withinss, everything())
170 |
171 |
172 | ## 6. Improving model performance ----
173 |
174 | ### Apply the cluster IDs to the original data frame ----
175 | teens_and_clusters <- augment(teens_clusters, recipe_baked_tbl)
176 | teens_and_clusters
177 |
178 | ### Look at the first five records ----
179 | teens_and_clusters %>%
180 | select(.cluster, gender, age, friends) %>%
181 | slice_head(n = 5)
182 |
183 | ### Mean age by cluster ----
184 | teens_and_clusters %>%
185 | with_groups(
186 | .cluster,
187 | summarize,
188 | age = mean(age)
189 | )
190 |
191 | ### Proportion of females by cluster ----
192 | teens_and_clusters %>%
193 | with_groups(
194 | .cluster,
195 | summarize,
196 | gender_F = mean(gender_F)
197 | )
198 |
199 | ### Mean number of friends by cluster ----
200 | teens_and_clusters %>%
201 | with_groups(
202 | .cluster,
203 | summarize,
204 | friends = mean(friends)
205 | )
206 |
207 |
208 | ## 7. K-means clustering with tidy data principles ----
209 |
210 | ### Exploratory clustering ----
211 | kclusts <-
212 | tibble(k = 1:9) %>%
213 | mutate(
214 | kclust = map(k, ~kmeans(interests_tbl, .x)),
215 | tidied = map(kclust, tidy),
216 | glanced = map(kclust, glance),
217 | augmented = map(kclust, augment, recipe_baked_tbl)
218 | )
219 |
220 | ### Create three separate datasets ----
221 | clusters <- kclusts %>%
222 | unnest(cols = c(tidied))
223 |
224 | assignments <- kclusts %>%
225 | unnest(cols = c(augmented))
226 |
227 | clusterings <- kclusts %>%
228 | unnest(cols = c(glanced))
229 |
230 | ### Plot the original points ----
231 | p1 <- assignments %>%
232 | ggplot(aes(sports, music)) +
233 | geom_point(aes(color = .cluster), alpha = 0.5) +
234 | facet_wrap(vars(k)) +
235 | theme_bw()
236 | p1
237 |
238 | p2 <- p1 +
239 | geom_point(data = clusters, size = 5, shape = "x")
240 | p2
241 |
242 |
243 | ### Plot the total within sum of squares (tot.withinss) ----
244 | clusterings %>%
245 | ggplot(aes(k, tot.withinss)) +
246 | geom_line() +
247 | geom_point() +
248 | theme_bw() +
249 | scale_x_continuous(breaks = seq(1, 9, by = 1)) +
250 | labs(
251 | x = "k",
252 | y = "Total Within Sum of Squares"
253 | )
254 |
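255 | ### A rough look at the "elbow" (optional sketch) ----
256 | # Not part of the original analysis: the plot above is the usual elbow
257 | # heuristic, and the relative drop in tot.withinss per added cluster can also
258 | # be inspected numerically.
259 | clusterings %>%
260 |     select(k, tot.withinss) %>%
261 |     mutate(improvement = 1 - tot.withinss / lag(tot.withinss))
262 | 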
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/chapter_09_finding_groups_of_data_clustering_with_k-means.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 9: Finding Groups of Data - Clustering with k-means"
3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask'
4 | date: "2023-07-23"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 |
12 | ## Loading Packages
13 |
14 | ```{r}
15 | library(conflicted)
16 | library(janitor)
17 | library(tidyclust)
18 | library(tidymodels)
19 | library(tidyverse)
20 | ```
21 |
22 | ## Exploring and preparing the data
23 |
24 | ```{r}
25 | teens_tbl <- read_csv("data/snsdata.csv") %>%
26 | mutate(across(where(is.character), as.factor))
27 | ```
28 |
29 | ```{r}
30 | teens_tbl %>%
31 | glimpse()
32 | ```
33 |
34 | ### Look at missing data for age variable
35 |
36 | ```{r}
37 | teens_tbl %>%
38 | pull(age) %>%
39 | summary()
40 | ```
41 |
42 | ### Eliminate age outliers
43 |
44 | ```{r}
45 | teens_only_tbl <- teens_tbl %>%
46 | mutate(
47 | age = case_when(
48 | between(age, 13, 20) ~ age,
49 | .default = NA_real_
50 | )
51 | )
52 | ```
53 |
54 | ```{r}
55 | teens_only_tbl %>%
56 | pull(age) %>%
57 | summary()
58 | ```
59 |
60 | ### Look at missing data for gender variable
61 |
62 | ```{r}
63 | teens_only_tbl %>%
64 | tabyl(gender) %>%
65 | adorn_pct_formatting(digits = 1)
66 | ```
67 |
68 | ### Reassign missing gender values to "unknown" and change into factors
69 |
70 | ```{r}
71 | teens_gendered_tbl <- teens_only_tbl %>%
72 | mutate(
73 | gender = case_when(
74 | gender %in% c("F", "M") ~ gender,
75 | .default = "gender_unknown"
76 | ) %>%
77 | as.factor()
78 | )
79 | ```
80 |
81 | ### Check our recoding work
82 |
83 | ```{r}
84 | teens_gendered_tbl %>%
85 | tabyl(gender) %>%
86 | adorn_pct_formatting(digits = 1) %>%
87 | arrange(desc(percent))
88 | ```
89 |
90 | ### Finding the mean age by cohort
91 |
92 | This doesn't work because of the NAs
93 |
94 | ```{r}
95 | teens_gendered_tbl %>%
96 | pull(age) %>%
97 | mean()
98 | ```
99 |
100 | This works thanks to na.rm = TRUE
101 |
102 | ```{r}
103 | teens_gendered_tbl %>%
104 | pull(age) %>%
105 | mean(na.rm = TRUE)
106 | ```
107 |
108 | ### Age by cohort
109 |
110 | ```{r}
111 | teens_gendered_tbl %>%
112 | summarize(
113 | age = mean(age, na.rm = TRUE),
114 | .by = gradyear
115 | )
116 | ```
117 |
118 | ### Create a vector with the average age for each gradyear, repeated by person
119 |
120 | ```{r}
121 | average_age_by_gradyear <- teens_gendered_tbl %>%
122 | mutate(
123 | ave_age = mean(age, na.rm = TRUE),
124 | .by = gradyear
125 | ) %>%
126 | pull()
127 | ```
128 |
129 | ### Impute the missing age values with the average age by gradyear
130 |
131 | ```{r}
132 | teens_imputed_tbl <- teens_gendered_tbl %>%
133 | mutate(
134 | age = case_when(
135 | !is.na(age) ~ age,
136 | .default = average_age_by_gradyear
137 | )
138 | )
139 | ```
140 |
141 | ### Check the summary results to ensure missing values are eliminated
142 |
143 | ```{r}
144 | teens_imputed_tbl %>%
145 | pull(age) %>%
146 | summary()
147 | ```
148 |
149 | ## Creating the recipe
150 |
151 | ### Apply normalization to entire data frame
152 |
153 | ```{r}
154 | recipe_obj <- recipe(teens_imputed_tbl) %>%
155 | step_normalize(
156 | all_numeric(),
157 | -c(gradyear, age, friends)
158 | ) %>%
159 | step_dummy(
160 | gender,
161 | keep_original_cols = TRUE,
162 | one_hot = TRUE
163 | )
164 |
165 | recipe_obj
166 | ```
167 |
168 | ```{r}
169 | recipe_baked_tbl <- recipe_obj %>%
170 | prep() %>%
171 | bake(new_data = NULL)
172 |
173 | recipe_baked_tbl
174 | ```
175 |
176 | ## Training a model on the data
177 |
178 | ### Create a z-score standardized data frame for easier interpretation
179 |
180 | ```{r}
181 | interests_tbl <- recipe_baked_tbl %>%
182 | select(5:40)
183 | ```
184 |
185 | ### Compare the data before and after the transformation
186 |
187 | Before
188 |
189 | ```{r}
190 | teens_imputed_tbl %>%
191 | pull(basketball) %>%
192 | summary()
193 | ```
194 |
195 | After
196 |
197 | ```{r}
198 | interests_tbl %>%
199 | pull(basketball) %>%
200 | summary()
201 | ```
202 |
203 | ### Create model specification
204 |
205 | ```{r}
206 | model_spec_kmeans <- k_means(
207 | engine = "stats",
208 | num_clusters = 5
209 | )
210 |
211 | model_spec_kmeans
212 | ```
213 |
214 | ```{r}
215 | set.seed(2345)
216 |
217 | model_fit_kmeans <- model_spec_kmeans %>%
218 | fit(
219 | ~ .,
220 | interests_tbl
221 | )
222 |
223 | model_fit_kmeans
224 | ```
225 |
226 | ## Evaluating model performance
227 |
228 | ### Look at the single-row summary
229 |
230 | ```{r}
231 | model_fit_kmeans %>%
232 | glance()
233 | ```
234 |
235 | ### Look at the size and the centers of the clusters
236 |
237 | ```{r}
238 | model_fit_kmeans %>%
239 | tidy() %>%
240 | select(cluster, size, withinss, everything())
241 | ```
242 |
243 | ## Improving model performance
244 |
245 | ### Apply the cluster IDs to the original data frame
246 |
247 | ```{r}
248 | teens_and_clusters <- augment(model_fit_kmeans, recipe_baked_tbl)
249 | teens_and_clusters
250 | ```
251 |
252 | ### Look at the first five records
253 |
254 | ```{r}
255 | teens_and_clusters %>%
256 | select(.pred_cluster, gender, age, friends) %>%
257 | slice_head(n = 5)
258 | ```
259 |
260 | ### Mean age by cluster
261 |
262 | ```{r}
263 | teens_and_clusters %>%
264 | summarize(
265 | age = mean(age),
266 | .by = .pred_cluster
267 | )
268 | ```
269 |
270 | ### Proportion of females by cluster
271 |
272 | ```{r}
273 | teens_and_clusters %>%
274 | summarize(
275 | gender_F = mean(gender_F),
276 | .by = .pred_cluster
277 | )
278 | ```
279 |
280 | ### Mean number of friends by cluster
281 |
282 | ```{r}
283 | teens_and_clusters %>%
284 | summarize(
285 | friends = mean(friends),
286 | .by = .pred_cluster
287 | )
288 | ```
289 |
290 | ## K-means clustering with tidy data principles
291 |
292 | ### Exploratory clustering
293 |
294 | ```{r}
295 | kclusts <-
296 | tibble(k = 1:9) %>%
297 | mutate(
298 | kclust = map(k, ~kmeans(interests_tbl, .x)),
299 | tidied = map(kclust, tidy),
300 | glanced = map(kclust, glance),
301 | augmented = map(kclust, augment, recipe_baked_tbl)
302 | )
303 | ```
304 |
305 | ### Create three separate datasets
306 |
307 | ```{r}
308 | clusters <- kclusts %>%
309 | unnest(cols = c(tidied))
310 |
311 | assignments <- kclusts %>%
312 | unnest(cols = c(augmented))
313 |
314 | clusterings <- kclusts %>%
315 | unnest(cols = c(glanced))
316 | ```
317 |
318 | ### Plot the original points
319 |
320 | ```{r}
321 | p1 <- assignments %>%
322 | ggplot(aes(sports, music)) +
323 | geom_point(aes(color = .cluster), alpha = 0.5) +
324 | facet_wrap(vars(k)) +
325 | theme_bw()
326 |
327 | p1
328 | ```
329 |
330 | ```{r}
331 | p2 <- p1 +
332 | geom_point(data = clusters, size = 5, shape = "x")
333 |
334 | p2
335 | ```
336 |
337 | ### Plot the total within sum of squares (tot.withinss)
338 |
339 | ```{r}
340 | clusterings %>%
341 | ggplot(aes(k, tot.withinss)) +
342 | geom_line() +
343 | geom_point() +
344 | theme_bw() +
345 | scale_x_continuous(breaks = seq(1, 9, by = 1)) +
346 | labs(
347 | x = "k",
348 | y = "Total Within Sum of Squares"
349 | )
350 | ```
351 |
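352 | ### Count the cluster sizes for a chosen k
353 | 
354 | A quick sanity check, not part of the original chapter: pull the exploratory assignments for k = 5 out of `assignments` and count how many teens land in each cluster. `dplyr::filter()` is written out explicitly because {conflicted} is loaded.
355 | 
356 | ```{r}
357 | assignments %>%
358 |   dplyr::filter(k == 5) %>%
359 |   count(.cluster)
360 | ```
361 | 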
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/regression_methods/estimating_wine_quality_with_regression_trees_and_model_trees.r:
--------------------------------------------------------------------------------
1 | # Estimating Wine Quality with Regression Trees and Model Trees ----
2 |
3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 6:
4 | # Forecasting Numeric Data - Regression Methods.
5 | #
6 | # The original code is made with {rpart}, {rpart.plot} and {Cubist}. I wanted to
7 | # see how one could recreate it using mainly {tidymodels}, {tidyverse} and {rules}.
8 | #
9 | # You can find the original code and the dataset here:
10 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter06
11 |
12 | ## 1. Loading libraries (in the order they get used) ----
13 | library(conflicted)
14 | conflict_prefer("filter", "dplyr", "stats")
15 | library(tidyverse)
16 | library(tidymodels)
17 | library(rattle)
18 | library(corrr)
19 | library(rules)
20 |
21 |
22 | ## 2. Preparing and exploring the data ----
23 | wine_tbl <- read_csv("machine_learning_with_r_3rd_ed/regression_methods/data/whitewines.csv") %>%
24 | rename_all(
25 | ~str_replace_all(., "\\s+", "_") %>%
26 | tolower()
27 | )
28 | wine_tbl
29 |
30 | glimpse(wine_tbl)
31 |
32 | ### The distribution of quality ratings ----
33 | wine_tbl %>%
34 | ggplot(aes(quality)) +
35 | geom_histogram()
36 |
37 | ### Summary statistics of the wine data ----
38 | summary(wine_tbl)
39 |
40 | wine_split <- initial_time_split(
41 | wine_tbl,
42 | prop = 3750 / 4898
43 | )
44 | wine_train <- training(wine_split)
45 | wine_test <- testing(wine_split)
46 |
47 |
48 | ## 3. Training a model on the data ----
49 | # regression tree using rpart
50 | model_spec <- decision_tree(
51 | mode = "regression",
52 | engine = "rpart",
53 | cost_complexity = NULL,
54 | tree_depth = NULL,
55 | min_n = NULL
56 | ) %>%
57 | translate()
58 | model_spec
59 |
60 | ### Fit the model ----
61 | model_fit <- fit(
62 | model_spec,
63 | quality ~ .,
64 | wine_train
65 | )
66 |
67 | ### Get basic information about the tree ----
68 | model_fit
69 |
70 | ### Get more detailed information about the tree ----
71 | summary(model_fit$fit)
72 |
73 | ### Adjust plot margins to make the visualization work better ----
74 | par(mar = c(1, 1, 1, 1))
75 |
76 | ### Use the rattle package to create a visualization ----
77 | model_fit$fit %>%
78 | fancyRpartPlot(cex = 0.5)
79 |
80 |
81 | ## 4. Evaluate model performance ----
82 |
83 | ### Generate predictions for the testing dataset (you could skip this step) ----
84 | wine_test_pred <- predict(
85 | object = model_fit,
86 | new_data = wine_test,
87 | type = "numeric"
88 | )
89 | wine_test_pred
90 |
91 | ### Add the predictions to the test tibble ----
92 | wine_test_with_pred_tbl <- augment(model_fit, wine_test)
93 | wine_test_with_pred_tbl
94 |
95 | ### Compare the distribution of actual values vs. predicted values ----
96 | wine_test_with_pred_tbl %>%
97 | select(quality, .pred) %>%
98 | summary()
99 |
100 | ### Compare the correlation ----
101 | wine_test_with_pred_tbl %>%
102 | select(quality, .pred) %>%
103 | correlate()
104 |
105 | ### Mean absolute error between actual and predicted values ----
106 | wine_test_with_pred_tbl %>%
107 | metrics(quality, .pred) %>%
108 | filter(.metric == "mae")
109 |
110 | ### Mean absolute error between actual values and mean value ----
111 | mean_value <- wine_train$quality %>%
112 | mean()
113 | mean_value
114 |
115 | mae <- function(actual, predicted) {
116 | mean(abs(actual - predicted))
117 | }
118 |
119 | mae(wine_test$quality, mean_value)
120 |
121 |
122 | ## 5. Improving model performance ----
123 | # train a Cubist model tree
124 | model_spec_cubist <- cubist_rules(
125 | mode = "regression",
126 | engine = "Cubist",
127 | committees = NULL,
128 | neighbors = NULL,
129 | max_rules = NULL
130 | ) %>%
131 | translate()
132 | model_spec_cubist
133 |
134 | ### Fit the model ----
135 | model_fit_cubist <- fit(
136 | model_spec_cubist,
137 | quality ~ .,
138 | wine_train
139 | )
140 |
141 | ### Display basic information about the model tree ----
142 | model_fit_cubist
143 |
144 | ### Display the tree itself ----
145 | summary(model_fit_cubist$fit)
146 |
147 | ### Generate predictions for the model ----
148 | wine_test_pred_cubist <- predict(
149 | object = model_fit_cubist,
150 | new_data = wine_test,
151 | type = "numeric"
152 | )
153 | wine_test_pred_cubist
154 |
155 | ### Summary statistics about the predictions ----
156 | wine_test_pred_cubist %>%
157 | summary()
158 |
159 | ### Add the predictions to the test tibble ----
160 | wine_test_with_pred_cubist <- augment(model_fit_cubist, wine_test)
161 | wine_test_with_pred_cubist
162 |
163 | ### Compare the distribution of actual values vs. predicted values ----
164 | wine_test_with_pred_cubist %>%
165 | select(quality, .pred) %>%
166 | summary()
167 |
168 | ### Correlation between the true and predicted values ----
169 | wine_test_with_pred_cubist %>%
170 | select(quality, .pred) %>%
171 | correlate()
172 |
173 | ### Mean absolute error of true and predicted values ----
174 | wine_test_with_pred_cubist %>%
175 | metrics(quality, .pred) %>%
176 | filter(.metric == "mae")
177 |
178 |
179 | ## 6. Creating a function to test the model(s) with another dataset ----
180 |
181 | # The assumption here is that you have already completed step 1.
182 |
183 | # Preparing and exploring the other wine dataset
184 | red_wine_tbl <- read_csv("machine_learning_with_r_3rd_ed/regression_methods/data/redwines.csv") %>%
185 | rename_all(
186 | ~str_replace_all(., "\\s+", "_") %>%
187 | tolower()
188 | )
189 | red_wine_tbl
190 |
191 | ### The distribution of quality ratings ----
192 | red_wine_tbl %>%
193 | ggplot(aes(quality)) +
194 | geom_histogram()
195 |
196 | ### Summary statistics of the wine data ----
197 | summary(red_wine_tbl)
198 |
199 | ### Create the function ----
200 |
201 | predict_wine_quality <- function(
202 | .engine = c("rpart", "Cubist"),
203 | .winecolor = c("red", "white")
204 | ) {
205 |
206 | # Check that the wine color is valid
207 | if (!.winecolor %in% c("red", "white")) stop("Choose a wine color: red or white")
208 |
209 | # Write out the path so that you can insert the wine color in there
210 | path <- str_glue("machine_learning_with_r_3rd_ed/regression_methods/data/{.winecolor}wines.csv")
211 |
212 | # Read in the data
213 | wine_tbl <- read_csv(path) %>%
214 | rename_all(
215 | ~str_replace_all(., "\\s+", "_") %>%
216 | tolower()
217 | )
218 |
219 |   # Make the train/test split. initial_time_split() doesn't randomize, because the rows are already in random order
220 | wine_split <- initial_time_split(
221 | wine_tbl,
222 | prop = 0.75
223 | )
224 | wine_train <- training(wine_split)
225 | wine_test <- testing(wine_split)
226 |
227 | # Create the model based on the engine chosen
228 | if (.engine == "rpart") {
229 |
230 | model_spec <- decision_tree(
231 | mode = "regression",
232 | engine = "rpart"
233 | ) %>%
234 | translate()
235 |
236 | } else if (.engine == "Cubist") {
237 |
238 | model_spec <- cubist_rules(
239 | mode = "regression",
240 | engine = "Cubist"
241 | ) %>%
242 | translate()
243 |
244 | } else {
245 |
246 | stop("Choose an engine: rpart (decision tree) or Cubist (rules)")
247 |
248 | }
249 |
250 | # Fit the model
251 | model_fit <- fit(
252 | model_spec,
253 | quality ~ .,
254 | wine_train
255 | )
256 |
257 | # Add the predictions
258 | wine_test_with_pred_tbl <- augment(model_fit, wine_test)
259 |
260 | # Get the metrics
261 | wine_test_with_pred_tbl %>%
262 | metrics(quality, .pred)
263 |
264 | }
265 |
266 | ### Test the function ----
267 | predict_wine_quality(
268 | .engine = "Cubist",
269 | .winecolor = "white"
270 | )
271 |
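272 | ### OPTIONAL: run every engine / wine color combination ----
273 | # A minimal sketch, not part of the original script: expand_grid() comes from
274 | # {tidyr} and map2() from {purrr}, both loaded via {tidyverse} in step 1.
275 | expand_grid(
276 |   engine    = c("rpart", "Cubist"),
277 |   winecolor = c("red", "white")
278 | ) %>%
279 |   mutate(
280 |     metrics = map2(engine, winecolor, ~predict_wine_quality(.x, .y))
281 |   ) %>%
282 |   unnest(metrics)
283 | 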
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/chapter_03_lazy_learning_classification_using_nearest_neighbors.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 3: Lazy Learning - Classification Using Nearest Neighbors"
3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask'
4 | date: "2023-07-23"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 |
12 | # 3 Lazy Learning - Classification Using Nearest Neighbors
13 |
14 | ## Load Packages
15 |
16 | ```{r}
17 | library(conflicted) # An Alternative Conflict Resolution Strategy
18 | library(janitor) # Simple Tools for Examining and Cleaning Dirty Data
19 | library(tidymodels) # Easily Install and Load the 'Tidymodels' Packages
20 | library(tidyverse) # Easily Install and Load the 'Tidyverse'
21 | ```
22 |
23 | ## Exploring and preparing the data
24 |
25 | ### Import the CSV file (Breast Cancer Wisconsin (Diagnostic))
26 |
27 | ```{r}
28 | wbcd_tbl <- read_csv(
29 | "data/wisc_bc_data.csv"
30 | )
31 | ```
32 |
33 | ### Take a look at the tibble
34 |
35 | ```{r}
36 | glimpse(wbcd_tbl)
37 | ```
38 |
39 | ### Drop the unnecessary id column
40 |
41 | ```{r}
42 | wbcd_selected_tbl <- wbcd_tbl %>%
43 | select(-id)
44 |
45 | wbcd_selected_tbl
46 | ```
47 |
48 | ```{r}
49 | # table of diagnosis
50 | wbcd_tbl %>%
51 | tabyl(diagnosis) %>%
52 | adorn_pct_formatting(digits = 1)
53 | ```
54 |
55 | ### Transform diagnosis to a factor
56 |
57 | ```{r}
58 | wbcd_factored_tbl <- wbcd_selected_tbl %>%
59 | mutate(
60 | diagnosis = factor(
61 | diagnosis,
62 | levels = c("B", "M"),
63 | labels = c("Benign", "Malignant")
64 | )
65 | )
66 |
67 | wbcd_factored_tbl
68 | ```
69 |
70 | ### Count the number of the two diagnoses (incl. percentage)
71 |
72 | ```{r}
73 | wbcd_factored_tbl %>%
74 | tabyl(diagnosis) %>%
75 | adorn_pct_formatting(digits = 1)
76 | ```
77 |
78 | ### Summarize three numeric features
79 |
80 | ```{r}
81 | wbcd_factored_tbl %>%
82 | select(radius_mean, area_mean, smoothness_mean) %>%
83 | summary()
84 | ```
85 |
86 | ## Creating the recipe and splitting the data
87 |
88 | ### Normalize the wbcd data
89 |
90 | ```{r}
91 | recipe_obj <- recipe(
92 | diagnosis ~ .,
93 | data = wbcd_factored_tbl
94 | ) %>%
95 | step_range(
96 | all_numeric_predictors(),
97 | min = 0,
98 | max = 1
99 | )
100 |
101 | recipe_obj
102 | ```
103 |
104 | ```{r}
105 | wbcd_normalized_tbl <- recipe_obj %>%
106 | prep() %>%
107 | bake(new_data = NULL)
108 | ```
109 |
110 | ### Confirm that normalization worked
111 |
112 | ```{r}
113 | wbcd_normalized_tbl %>%
114 | select(area_mean) %>%
115 | summary()
116 | ```
117 |
118 | ### Create training and test data (randomly)
119 |
120 | ```{r}
121 | wbcd_split <- initial_split(
122 | wbcd_normalized_tbl,
123 | prop = 469 / 569
124 | )
125 |
126 | wbcd_train <- training(wbcd_split)
127 | wbcd_test <- testing(wbcd_split)
128 | ```
129 |
130 | ## Training a model on the data
131 |
132 | kknn is the engine (and needs to be installed if it isn't already). It is used as the engine for {parsnip}'s nearest_neighbor() function. And since we are classifying, that is the mode we choose.
133 |
134 | ### Create model specification
135 |
136 | ```{r}
137 | model_spec <- nearest_neighbor(
138 | engine = "kknn",
139 | mode = "classification",
140 | neighbors = 21
141 | ) %>%
142 | translate()
143 |
144 | model_spec
145 | ```
146 |
147 | ### Fit the model
148 |
149 | ```{r}
150 | model_fit <- model_spec %>%
151 | fit(
152 | diagnosis ~ .,
153 | wbcd_train
154 | )
155 |
156 | model_fit
157 | ```
158 |
159 | ### Make the predictions (you could skip this step)
160 |
161 | ```{r}
162 | wbcd_test_pred <- model_fit %>%
163 | predict(
164 | new_data = wbcd_test,
165 | type = "class"
166 | )
167 |
168 | wbcd_test_pred
169 | ```
170 |
171 | ### Add the predictions to the test tibble
172 |
173 | ```{r}
174 | wbcd_test_with_pred_tbl <- augment(model_fit, wbcd_test)
175 | wbcd_test_with_pred_tbl
176 | ```
177 |
178 | ## Evaluating model performance
179 |
180 | ### Create a confusion matrix
181 |
182 | ```{r}
183 | conf_mat <- conf_mat(
184 | data = wbcd_test_with_pred_tbl,
185 | truth = diagnosis,
186 | estimate = .pred_class
187 | )
188 |
189 | conf_mat
190 | ```
191 |
192 | ### Visualize the confusion matrix
193 |
194 | ```{r}
195 | conf_mat %>%
196 | autoplot(type = "heatmap")
197 |
198 | conf_mat %>%
199 | autoplot(type = "mosaic")
200 | ```
201 |
202 | ### Visualize the ROC curve
203 |
204 | ```{r}
205 | wbcd_test_with_pred_tbl %>%
206 | roc_curve(
207 | truth = diagnosis,
208 | .pred_Benign
209 | ) %>%
210 | autoplot()
211 | ```
212 |
213 | ### Calculate the ROC AUC (area under the curve)
214 |
215 | ```{r}
216 | wbcd_roc_auc <- wbcd_test_with_pred_tbl %>%
217 | roc_auc(
218 | truth = diagnosis,
219 | .pred_Benign
220 | )
221 |
222 | wbcd_roc_auc
223 | ```
224 |
225 | ### Put together other model metrics
226 |
227 | ```{r}
228 | # Such as accuracy, Matthews correlation coefficient (mcc) and others...
229 | classification_metrics <- conf_mat(
230 | wbcd_test_with_pred_tbl,
231 | truth = diagnosis,
232 | estimate = .pred_class
233 | ) %>%
234 | summary()
235 |
236 | classification_metrics
237 | ```
238 |
239 | ## Creating a function to help evaluate the model further
240 |
241 | The idea is to be able to choose different values for k and different methods for standardization (range (0 to 1) and normalization).
242 |
243 | ```{r}
244 | classify_with_knn <- function(
245 | k = 21,
246 | standardization_method = c("range", "normalization")
247 | ) {
248 |
249 | # Create a recipe according to the chosen standardization method
250 | if (standardization_method == "range") {
251 |
252 | recipe_obj <- recipe(
253 | formula = diagnosis ~ .,
254 | data = wbcd_factored_tbl
255 | ) %>%
256 | step_range(
257 | all_numeric_predictors(),
258 | min = 0,
259 | max = 1)
260 |
261 | } else if (standardization_method == "normalization") {
262 |
263 | recipe_obj <- recipe(
264 | formula = diagnosis ~ .,
265 | data = wbcd_factored_tbl
266 | ) %>%
267 | step_normalize(all_numeric_predictors())
268 |
269 | } else {
270 |
271 |     stop('Choose a standardization method that is either "range" or "normalization"!')
272 |
273 | }
274 |
275 | wbcd_normalized_tbl <- recipe_obj %>%
276 | prep() %>%
277 | bake(new_data = wbcd_factored_tbl)
278 |
279 | # Create training and test data
280 | wbcd_split <- initial_split(
281 | wbcd_normalized_tbl,
282 | prop = 469 / 569
283 | )
284 | wbcd_train <- training(wbcd_split)
285 | wbcd_test <- testing(wbcd_split)
286 |
287 | # Create model specification
288 | model_spec <- nearest_neighbor(
289 | engine = "kknn",
290 | mode = "classification",
291 | neighbors = k
292 | ) %>%
293 | translate()
294 |
295 | # Fit the model
296 | model_fit <- model_spec %>%
297 | fit(
298 | diagnosis ~ .,
299 | wbcd_train
300 | )
301 |
302 | # Add the predictions to the test tibble
303 | wbcd_test_with_pred_tbl <- augment(model_fit, wbcd_test)
304 |
305 | # Create a confusion matrix
306 | conf_mat <- conf_mat(
307 | data = wbcd_test_with_pred_tbl,
308 | truth = diagnosis,
309 | estimate = .pred_class
310 | )
311 |
312 | # Print the confusion matrix
313 | conf_mat %>% autoplot(type = "heatmap")
314 | }
315 | ```
316 |
317 | ### Test the function
318 |
319 | ```{r}
320 | # standardization_method is either "range" or "normalization"
321 | classify_with_knn(
322 | standardization_method = "range",
323 | k = 5
324 | )
325 | ```
326 |
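327 | ### Test the function with a fixed seed
328 | 
329 | Because `initial_split()` inside `classify_with_knn()` draws a new random split on every call, the confusion matrix changes from run to run. As a small addition to the original chapter, setting a seed first makes a run reproducible; this call also exercises the "normalization" branch of the function.
330 | 
331 | ```{r}
332 | set.seed(123)
333 | 
334 | classify_with_knn(
335 |   standardization_method = "normalization",
336 |   k = 21
337 | )
338 | ```
339 | 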
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/Chapter_14.r:
--------------------------------------------------------------------------------
1 | ##### Chapter 14: Building Better Learners -------------------
2 |
3 | # load the credit dataset
4 | credit <- read.csv("credit.csv")
5 | library(caret)
6 |
7 | ## Creating a simple tuned model ----
8 |
9 | # look up the tuning parameters for C5.0
10 | modelLookup("C5.0")
11 |
12 | # automated parameter tuning of C5.0 decision tree
13 | set.seed(300)
14 | m <- train(default ~ ., data = credit, method = "C5.0")
15 |
16 | # summary of tuning results
17 | m
18 |
19 | # apply the best C5.0 candidate model to make predictions
20 | p <- predict(m, credit)
21 | table(p, credit$default)
22 |
23 | # obtain predicted classes
24 | head(predict(m, credit))
25 |
26 | # obtain predicted probabilities
27 | head(predict(m, credit, type = "prob"))
28 |
29 | ## Customizing the tuning process ----
30 | # use trainControl() to alter resampling strategy
31 | ctrl <- trainControl(method = "cv", number = 10,
32 | selectionFunction = "oneSE")
33 |
34 | # use expand.grid() to create grid of tuning parameters
35 | grid <- expand.grid(model = "tree",
36 | trials = c(1, 5, 10, 15, 20, 25, 30, 35),
37 | winnow = FALSE)
38 |
39 | # look at the result of expand.grid()
40 | grid
41 |
42 | # customize train() with the control list and grid of parameters
43 | set.seed(300)
44 | m <- train(default ~ ., data = credit, method = "C5.0",
45 | metric = "Kappa",
46 | trControl = ctrl,
47 | tuneGrid = grid)
48 |
49 | # see the results
50 | m
51 |
52 | ## Bagging ----
53 | # Using the ipred bagged decision trees
54 | library(ipred)
55 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE)
56 | set.seed(300)
57 | mybag <- bagging(default ~ ., data = credit, nbagg = 25)
58 | credit_pred <- predict(mybag, credit)
59 | table(credit_pred, credit$default)
60 |
61 | # estimate performance of ipred bagged trees
62 | library(caret)
63 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE)
64 | set.seed(300)
65 | ctrl <- trainControl(method = "cv", number = 10)
66 | train(default ~ ., data = credit, method = "treebag",
67 | trControl = ctrl)
68 |
69 | ## Boosting ----
70 |
71 | ## Using C5.0 Decision Tree (not shown in book)
72 | library(C50)
73 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE)
74 | m_c50_bst <- C5.0(default ~ ., data = credit, trials = 100)
75 |
76 | # create an Adaboost.M1 model
77 | library(adabag)
78 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE)
79 | set.seed(300)
80 | m_adaboost <- boosting(default ~ ., data = credit)
81 | p_adaboost <- predict(m_adaboost, credit)
82 | head(p_adaboost$class)
83 | p_adaboost$confusion
84 |
85 | # create and evaluate an Adaboost.M1 model using 10-fold-CV
86 | set.seed(300)
87 | adaboost_cv <- boosting.cv(default ~ ., data = credit)
88 | adaboost_cv$confusion
89 |
90 | # calculate kappa
91 | library(vcd)
92 | Kappa(adaboost_cv$confusion)
93 |
94 | ## Random Forests ----
95 | # random forest with default settings
96 | library(randomForest)
97 | set.seed(300)
98 | rf <- randomForest(default ~ ., data = credit)
99 | rf
100 |
101 | # calculate kappa on the out-of-bag estimate
102 | library(vcd)
103 | Kappa(rf$confusion[1:2,1:2])
104 |
105 | # ranger is a faster implementation of the random forest algorithm
106 | library(ranger)
107 | set.seed(300)
108 | m_ranger <- ranger(default ~ ., data = credit)
109 | m_ranger
110 |
111 | # calculate kappa
112 | Kappa(m_ranger$confusion.matrix)
113 |
114 | ## Gradient Boosting Machines (GBM) ----
115 |
116 | # prepare the data for gbm()
117 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE)
118 | credit$default <- ifelse(credit$default == "yes", 1, 0)
119 |
120 | # create a random train and test split
121 | set.seed(123)
122 | train_sample <- sample(1000, 900)
123 | credit_train <- credit[train_sample, ]
124 | credit_test <- credit[-train_sample, ]
125 |
126 | # create a GBM model with default parameters
127 | library(gbm)
128 | set.seed(300)
129 | m_gbm <- gbm(default ~ ., data = credit_train)
130 | m_gbm
131 |
132 | # evaluate the simple GBM model
133 | p_gbm <- predict(m_gbm, credit_test, type = "response")
134 | p_gbm_c <- ifelse(p_gbm > 0.50, 1, 0)
135 | table(credit_test$default, p_gbm_c)
136 |
137 | # compute kappa
138 | library(vcd)
139 | Kappa(table(credit_test$default, p_gbm_c))
140 |
141 | # create a tuned gbm() model using caret
142 | # start by creating the tuning grid
143 | grid_gbm <- expand.grid(
144 | n.trees = c(100, 150, 200),
145 | interaction.depth = c(1, 2, 3),
146 | shrinkage = c(0.01, 0.1, 0.3),
147 | n.minobsinnode = 10
148 | )
149 |
150 | # define the experiment's parameters
151 | library(caret)
152 | ctrl <- trainControl(method = "cv", number = 10,
153 | selectionFunction = "best")
154 |
155 | # run the caret experiment
156 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE)
157 | set.seed(300)
158 | m_gbm_c <- train(default ~ ., data = credit, method = "gbm",
159 | trControl = ctrl, tuneGrid = grid_gbm,
160 | metric = "Kappa",
161 | verbose = FALSE)
162 |
163 | # see the results
164 | m_gbm_c
165 |
166 | ## Extreme Gradient Boosting (XGB) ----
167 |
168 | # Format the credit data as a sparse matrix
169 | credit <- read.csv("credit.csv", stringsAsFactors = TRUE)
170 | library(Matrix)
171 | credit_matrix <- sparse.model.matrix(~ . -default, data = credit)
172 |
173 | # examine the sparse credit_matrix
174 | dim(credit_matrix)
175 | print(credit_matrix[1:5, 1:15])
176 |
177 | # remove the intercept
178 | credit_matrix <- credit_matrix[, -1]
179 |
180 | # split the matrix into train and test
181 | set.seed(12345)
182 | train_ids <- sample(1000, 900)
183 | credit_train <- credit_matrix[train_ids, ]
184 | credit_test <- credit_matrix[-train_ids, ]
185 |
186 | # check that the rows are 900 vs. 100 and the cols are 35 vs. 35
187 | dim(credit_train)
188 | dim(credit_test)
189 |
190 | # create 1/0 vectors for train and test data indicating loan default
191 | credit_train_labels <-
192 | ifelse(credit[train_ids, c("default")] == "yes", 1, 0)
193 | credit_test_labels <-
194 | ifelse(credit[-train_ids, c("default")] == "yes", 1, 0)
195 |
196 | # build the xgboost model
197 | library(xgboost)
198 |
199 | # set XGB hyperparameters
200 | params.xgb <- list(objective = "binary:logistic",
201 | max_depth = 6,
202 | eta = 0.3,
203 | gamma = 0,
204 | colsample_bytree = 1,
205 | min_child_weight = 1,
206 | subsample = 1)
207 |
208 | set.seed(555)
209 | xgb_credit <- xgboost(params = params.xgb,
210 | data = credit_train,
211 | label = credit_train_labels,
212 | nrounds = 100,
213 | print_every_n = 10,
214 | verbose = 1)
215 |
216 | # make predictions
217 | prob_default <- predict(xgb_credit, credit_test)
218 | pred_default <- ifelse(prob_default > 0.50, 1, 0)
219 |
220 | # create a confusion matrix
221 | table(pred_default, credit_test_labels)
222 |
223 | # compute kappa
224 | library(vcd)
225 | Kappa(table(pred_default, credit_test_labels))
226 |
227 | # create a tuned xgboost() model using caret
228 | # start by creating the tuning grid
229 | grid_xgb <- expand.grid(
230 | eta = c(0.3, 0.4),
231 | max_depth = c(1, 2, 3),
232 | colsample_bytree = c(0.6, 0.8),
233 | subsample = c(0.50, 0.75, 1.00),
234 | nrounds = c(50, 100, 150),
235 | gamma = c(0, 1),
236 | min_child_weight = 1
237 | )
238 |
239 | # define the control object
240 | library(caret)
241 | ctrl <- trainControl(method = "cv", number = 10,
242 | selectionFunction = "best")
243 |
244 | # run the caret experiment
245 | set.seed(300)
246 | m_xgb <- train(default ~ ., data = credit, method = "xgbTree",
247 | trControl = ctrl, tuneGrid = grid_xgb,
248 | metric = "Kappa", verbosity = 0)
249 |
250 | # see the results of all models (not shown in book due to size of output)
251 | m_xgb
252 |
253 | # see the hyperparameters for the best performing model
254 | m_xgb$bestTune
255 |
256 | # get the best kappa out of the 216 models tested
257 | max(m_xgb$results["Kappa"])
258 |
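259 | # OPTIONAL (not in the book): put the best kappa of the tuned GBM and XGB
260 | # models side by side for a quick comparison
261 | max(m_gbm_c$results["Kappa"])
262 | max(m_xgb$results["Kappa"])
263 | 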
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/knn/classifying-cancer-samples-with-knn.r:
--------------------------------------------------------------------------------
1 | # Classifying Cancer Samples with K-Nearest Neighbors ----
2 |
3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 3:
4 | # Lazy Learning - Classification Using Nearest Neighbors.
5 | #
6 | # The original code is made with a lot of base R, {class} and {gmodels}. I
7 | # wanted to see how one could recreate it using mainly {tidymodels}
8 | # and {tidyverse}.
9 | #
10 | # You can find the original code here:
11 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter03
12 |
13 | ## 1. Loading libraries (in the order they get used) ----
14 | library(conflicted)
15 | library(tidymodels)
16 | library(tidyverse)
17 |
18 |
19 | ## 2. Exploring and preparing the data ----
20 |
21 | ### Create a vector for the column names ----
22 | .col_names <- c(
23 | "id",
24 | "diagnosis",
25 | "radius_mean",
26 | "texture_mean",
27 | "perimeter_mean",
28 | "area_mean",
29 | "smoothness_mean",
30 | "compactness_mean",
31 | "concavity_mean",
32 | "points_mean",
33 | "symmetry_mean",
34 | "dimension_mean",
35 | "radius_se",
36 | "texture_se",
37 | "perimeter_se",
38 | "area_se",
39 | "smoothness_se",
40 | "compactness_se",
41 | "concavity_se",
42 | "points_se",
43 | "symmetry_se",
44 | "dimension_se",
45 | "radius_worst",
46 | "texture_worst",
47 | "perimeter_worst",
48 | "area_worst",
49 | "smoothness_worst",
50 | "compactness_worst",
51 | "concavity_worst",
52 | "points_worst",
53 | "symmetry_worst",
54 | "dimension_worst"
55 | )
56 |
57 | ### Import the CSV file (Breast Cancer Wisconsin (Diagnostic)) ----
58 | wbcd_tbl <- read_csv("machine_learning_with_r_3rd_ed/knn/data/wdbc-data.csv", col_names = .col_names)
59 |
60 | ### Take a look at the tibble ----
61 | glimpse(wbcd_tbl)
62 |
63 | ### Drop the unnecessary id column ----
64 | wbcd_selected_tbl <- wbcd_tbl %>% select(-id)
65 | wbcd_selected_tbl
66 |
67 | ### Transform diagnosis to a factor ----
68 | wbcd_factored_tbl <- wbcd_selected_tbl %>%
69 | mutate(
70 | diagnosis = factor(
71 | diagnosis,
72 | levels = c("B", "M"),
73 | labels = c("Benign", "Malignant")
74 | )
75 | )
76 | wbcd_factored_tbl
77 |
78 | ### Count the number of the two diagnoses (incl. percentage) ----
79 | wbcd_factored_tbl %>%
80 | count(diagnosis) %>%
81 | mutate(pct = (n / sum(n) * 100))
82 |
83 | ### Summarize three numeric features ----
84 | wbcd_factored_tbl %>%
85 | select(radius_mean, area_mean, smoothness_mean) %>%
86 | summary()
87 |
88 |
89 | ## 3. Creating the recipe and splitting the data ----
90 |
91 | ### Normalize the wbcd data ----
92 | recipe_obj <- recipe(
93 | diagnosis ~ .,
94 | data = wbcd_factored_tbl
95 | ) %>%
96 | step_range(
97 | all_numeric_predictors(),
98 | min = 0,
99 | max = 1
100 | )
101 | recipe_obj
102 |
103 | wbcd_normalized_tbl <- recipe_obj %>%
104 | prep() %>%
105 | bake(new_data = NULL)
106 |
107 | ### Confirm that normalization worked ----
108 | wbcd_normalized_tbl %>%
109 | select(area_mean) %>%
110 | summary()
111 |
112 | ### Create training and test data (randomly) ----
113 | wbcd_split <- initial_split(
114 | wbcd_normalized_tbl,
115 | prop = 469 / 569
116 | )
117 | wbcd_train <- training(wbcd_split)
118 | wbcd_test <- testing(wbcd_split)
119 |
120 |
121 | ## 4. Training a model on the data ----
122 |
123 | # kknn is the engine (needs to be installed if not already):
124 | # install.packages("kknn")
125 |
126 | # It is used as the engine for {parsnip}'s nearest_neighbor() function.
127 | # And since we are classifying, that is the mode we choose.
128 |
129 | ### Create model specification ----
130 | model_spec <- nearest_neighbor(
131 | engine = "kknn",
132 | mode = "classification",
133 | neighbors = 21
134 | ) %>%
135 | translate()
136 | model_spec
137 |
138 | ### Fit the model ----
139 | model_fit <- fit(
140 | model_spec,
141 | diagnosis ~ .,
142 | wbcd_train
143 | )
144 | model_fit
145 |
146 | ### Make the predictions (you could skip this step) ----
147 | wbcd_test_pred <- predict(
148 | model_fit,
149 | new_data = wbcd_test,
150 | type = "class"
151 | )
152 | wbcd_test_pred
153 |
154 | ### Add the predictions to the test tibble ----
155 | wbcd_test_with_pred_tbl <- augment(model_fit, wbcd_test)
156 | wbcd_test_with_pred_tbl
157 |
158 |
159 | ## 5. Evaluating model performance ----
160 |
161 | ### Create a confusion matrix ----
162 | conf_mat <- conf_mat(
163 | data = wbcd_test_with_pred_tbl,
164 | truth = diagnosis,
165 | estimate = .pred_class
166 | )
167 | conf_mat
168 |
169 | ### Visualize the confusion matrix ----
170 | conf_mat %>% autoplot(type = "heatmap")
171 | conf_mat %>% autoplot(type = "mosaic")
172 |
173 | ### Visualize the ROC curve ----
174 | wbcd_test_with_pred_tbl %>%
175 | roc_curve(
176 | truth = diagnosis,
177 | estimate = .pred_Benign
178 | ) %>%
179 | autoplot()
180 |
181 | ### Calculate the ROC AUC (area under the curve) ----
182 | wbcd_roc_auc <- wbcd_test_with_pred_tbl %>%
183 | roc_auc(
184 | truth = diagnosis,
185 | estimate = .pred_Benign
186 | )
187 | wbcd_roc_auc
188 |
189 |
190 | ### Put together other model metrics ----
191 | # Such as accuracy, Matthews correlation coefficient (mcc) and others...
192 | classification_metrics <- conf_mat(
193 | wbcd_test_with_pred_tbl,
194 | truth = diagnosis,
195 | estimate = .pred_class
196 | ) %>%
197 | summary()
198 |
199 |
200 | ## 6. Creating a function to help evaluate the model further ----
201 |
202 | # The assumption here is that you have already gone through steps 1 and 2.
203 | # The idea is to be able to choose different values for k and different
204 | # methods for standardization (range (0 to 1) and normalization)
205 |
206 | classify_with_knn <- function(
207 | k = 21,
208 | standardization_method = c("range", "normalization")
209 | ) {
210 |
211 | # Create a recipe according to the chosen standardization method
212 | if (standardization_method == "range") {
213 |
214 | recipe_obj <- recipe(
215 | formula = diagnosis ~ .,
216 | data = wbcd_factored_tbl
217 | ) %>%
218 | step_range(
219 | all_numeric_predictors(),
220 | min = 0,
221 | max = 1)
222 |
223 | } else if (standardization_method == "normalization") {
224 |
225 | recipe_obj <- recipe(
226 | formula = diagnosis ~ .,
227 | data = wbcd_factored_tbl
228 | ) %>%
229 | step_normalize(all_numeric_predictors())
230 |
231 | } else {
232 |
233 |     stop('Choose a standardization method that is either "range" or "normalization"!')
234 |
235 | }
236 |
237 | wbcd_normalized_tbl <- recipe_obj %>%
238 | prep() %>%
239 | bake(new_data = wbcd_factored_tbl)
240 |
241 | # Create training and test data
242 | wbcd_split <- initial_split(
243 | wbcd_normalized_tbl,
244 | prop = 469 / 569
245 | )
246 | wbcd_train <- training(wbcd_split)
247 | wbcd_test <- testing(wbcd_split)
248 |
249 | # Create model specification
250 | model_spec <- nearest_neighbor(
251 | engine = "kknn",
252 | mode = "classification",
253 | neighbors = k
254 | ) %>%
255 | translate()
256 |
257 | # Fit the model
258 | model_fit <- fit(
259 | model_spec,
260 | diagnosis ~ .,
261 | wbcd_train
262 | )
263 |
264 | # Add the predictions to the test tibble
265 | wbcd_test_with_pred_tbl <- augment(model_fit, wbcd_test)
266 |
267 | # Create a confusion matrix
268 | conf_mat <- conf_mat(
269 | data = wbcd_test_with_pred_tbl,
270 | truth = diagnosis,
271 | estimate = .pred_class
272 | )
273 |
274 | # Print the confusion matrix
275 | conf_mat %>% autoplot(type = "heatmap")
276 | }
277 |
278 | ### Test the function ----
279 | classify_with_knn(
280 | standardization_method = "range",
281 | k = 5
282 | )
283 |
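284 | ### OPTIONAL: quick accuracy sweep over several values of k ----
285 | # A minimal sketch, not part of the original script: refit the kknn model for a
286 | # few neighborhood sizes and compare test-set accuracy with {yardstick}. It
287 | # reuses wbcd_train and wbcd_test from step 3.
288 | map_dfr(
289 |   c(1, 5, 11, 21, 27),
290 |   function(k) {
291 |     fit_k <- nearest_neighbor(
292 |       engine    = "kknn",
293 |       mode      = "classification",
294 |       neighbors = k
295 |     ) %>%
296 |       fit(diagnosis ~ ., wbcd_train)
297 | 
298 |     augment(fit_k, wbcd_test) %>%
299 |       accuracy(diagnosis, .pred_class) %>%
300 |       mutate(neighbors = k)
301 |   }
302 | )
303 | 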
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/regression_methods/predicting_medical_expenses_with_linear_regression.r:
--------------------------------------------------------------------------------
1 | # Predicting Medical Expenses with Linear Regression ----
2 |
3 | # Inspired by Brett Lantz's Machine Learning with R, Chapter 6:
4 | # Forecasting Numeric Data - Regression Methods.
5 | #
6 | # The original code is made with base R and {psych}. I wanted to see how one
7 | # could recreate it using mainly {tidymodels} and {tidyverse}.
8 | #
9 | # You can find the original code and the dataset here:
10 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter06
11 |
12 | library(conflicted)
13 | library(tidyverse)
14 | library(corrr)
15 | library(tidymodels)
16 | library(GGally)
17 |
18 |
19 | ## 2. Exploring and preparing the data ----
20 | insurance_tbl <- read_csv("machine_learning_with_r_3rd_ed/regression_methods/data/insurance.csv")
21 | glimpse(insurance_tbl)
22 |
23 | ### Summarize the data ----
24 | insurance_tbl %>%
25 | mutate(across(where(is.character), as_factor)) %>%
26 | summary()
27 |
28 | ### Histogram of insurance charges ----
29 | insurance_tbl %>%
30 | ggplot(aes(expenses)) +
31 | geom_histogram(binwidth = 5000)
32 |
33 | ### Distribution between regions ----
34 | insurance_tbl %>%
35 | count(region) %>%
36 | mutate(pct = (n / sum(n) * 100))
37 |
38 | ### Exploring relationships among features: correlation matrix ----
39 | insurance_tbl %>%
40 | select(c("age", "bmi", "children", "expenses")) %>%
41 | as.matrix() %>%
42 | correlate()
43 |
44 | ### Visualizing relationships among features: scatterplot matrix ----
45 | insurance_tbl %>%
46 | select(c("age", "bmi", "children", "expenses")) %>%
47 | ggpairs()
48 |
49 |
50 | ## 3. Training a model on the data ----
51 |
52 | ### Create a recipe ----
53 | recipe_obj <- recipe(
54 | expenses ~ .,
55 | data = insurance_tbl
56 | )
57 | recipe_obj
58 |
59 | insurance_baked_tbl <- recipe_obj %>%
60 | prep() %>%
61 | bake(new_data = NULL)
62 | insurance_baked_tbl
63 |
64 | ### Model specification ----
65 | model_spec <- linear_reg(
66 | mode = "regression",
67 | engine = "lm"
68 | ) %>%
69 | translate()
70 | model_spec
71 |
72 | ### Fit the model ----
73 | model_fit <- fit(
74 | model_spec,
75 | expenses ~ .,
76 | insurance_baked_tbl
77 | )
78 |
79 | ### See the estimated beta coefficients ----
80 | model_fit
81 |
82 |
83 | ## 4. Evaluating model performance ----
84 | summary(model_fit$fit)
85 |
86 |
87 | ## 5. Improving model performance ----
88 |
89 | ### Add a higher-order "age" term ----
90 | insurance_augmented_tbl <- insurance_baked_tbl %>%
91 | mutate(age2 = age ^ 2,
92 |
93 | ### Add an indicator for BMI >= 30 ----
94 | bmi30 =
95 | case_when(
96 | bmi >= 30 ~ 1,
97 | TRUE ~ 0
98 | )
99 | )
100 |
101 |
102 | ### Create the final model ----
103 | recipe_augmented_obj <- recipe(
104 | expenses ~ .,
105 | data = insurance_augmented_tbl
106 | ) %>%
107 | # This step is needed for the next step,
108 | # adding the interaction between bmi30 and smoker
109 | step_dummy(smoker) %>%
110 | # Dummy variables need to be specified with starts_with()
111 | step_interact(terms = ~ bmi30:starts_with("smoker"))
112 | recipe_augmented_obj
113 |
114 | insurance_baked_augmented_tbl <- recipe_augmented_obj %>%
115 | prep() %>%
116 | bake(new_data = NULL)
117 | insurance_baked_augmented_tbl
118 |
119 | ### Model specification ----
120 | model_spec_augmented <- linear_reg(
121 | mode = "regression",
122 | engine = "lm"
123 | ) %>%
124 | translate()
125 | model_spec_augmented
126 |
127 | ### Fit the model ----
128 | model_fit_augmented <- fit(
129 | model_spec_augmented,
130 | expenses ~ .,
131 | insurance_baked_augmented_tbl
132 | )
133 |
134 | ### See the estimated beta coefficients ----
135 | model_fit_augmented
136 |
137 | ### Evaluate model performance ----
138 | summary(model_fit_augmented$fit)
139 |
140 | ### Make predictions with the regression model (you could skip this step) ----
141 | insurance_pred_tbl <- predict(
142 | object = model_fit_augmented,
143 | new_data = insurance_baked_augmented_tbl,
144 | type = "numeric"
145 | )
146 | insurance_pred_tbl
147 |
148 | ### Add the predictions to the tibble ----
149 | insurance_pred_tbl <- augment(model_fit_augmented, insurance_baked_augmented_tbl)
150 | insurance_pred_tbl
151 |
152 | ### See the correlation between the actual and predicted expenses ----
153 | insurance_pred_tbl %>%
154 | select(.pred, expenses) %>%
155 | correlate()
156 |
157 | # Or with the simpler vectorized version:
158 | cor(insurance_pred_tbl$.pred, insurance_pred_tbl$expenses)
159 |
160 | ### Visualize the correlation ----
161 | insurance_pred_tbl %>%
162 | ggplot(aes(.pred, expenses)) +
163 | geom_point() +
164 | geom_abline(
165 | intercept = 0,
166 | slope = 1,
167 | color = "red",
168 | size = 0.5,
169 | linetype = "dashed"
170 | ) +
171 | labs(
172 | x = "Predicted Expenses",
173 | y = "Actual Expenses"
174 | )
175 |
176 | ### See the model metrics ----
177 | insurance_model_metrics <- insurance_pred_tbl %>%
178 | metrics(
179 | truth = expenses,
180 | estimate = .pred
181 | )
182 | insurance_model_metrics
183 |
184 |
185 | ### Make predictions with new data ----
186 | predict(
187 | model_fit_augmented,
188 | tibble(
189 | age = 30,
190 | age2 = 30^2,
191 | bmi = 30,
192 | bmi30 = 1,
193 | bmi30_x_smoker_yes = 0,
194 | children = 2,
195 | region = "northeast",
196 | sex = "male",
197 | smoker_yes = 0
198 | )
199 | )
200 |
201 | predict(
202 | model_fit_augmented,
203 | tibble(
204 | age = 30,
205 | age2 = 30^2,
206 | bmi = 30,
207 | bmi30 = 1,
208 | bmi30_x_smoker_yes = 0,
209 | children = 2,
210 | region = "northeast",
211 | sex = "female",
212 | smoker_yes = 0
213 | )
214 | )
215 |
216 | predict(
217 | model_fit_augmented,
218 | tibble(
219 | age = 30,
220 | age2 = 30^2,
221 | bmi = 30,
222 | bmi30 = 1,
223 | bmi30_x_smoker_yes = 0,
224 | children = 0,
225 | region = "northeast",
226 | sex = "female",
227 | smoker_yes = 0
228 | )
229 | )
230 |
231 |
232 | ## 6. Creating a function to help evaluate the model further ----
233 |
234 | # The assumption here is that you have already gone through steps 1 to 5.
235 | # You will see the predicted medical expenses if you enter the following
236 | # parameters: age, BMI, number of children, region, sex and whether you smoke
237 | # or not.
238 |
239 | predict_medical_expenses <- function(
240 | .age,
241 | .bmi,
242 | .children,
243 | .region = c("northeast", "northwest", "southeast", "southwest"),
244 |     .sex = c("female", "male"),
245 | .smoker = c("no", "yes")
246 | ) {
247 |
248 | .bmi30 <- if_else(
249 | condition = .bmi >= 30,
250 | true = 1,
251 | false = 0,
252 | missing = NULL
253 | )
254 |
255 | .smoker_yes <- if_else(
256 | condition = .smoker == "yes",
257 | true = 1,
258 | false = 0,
259 | missing = NULL
260 | )
261 |
262 | .bmi30_x_smoker_yes <- if_else(
263 | condition = .bmi30 == TRUE && .smoker_yes == TRUE,
264 | true = 1,
265 | false = 0,
266 | missing = NULL
267 | )
268 |
269 | .prediction <- predict(
270 | model_fit_augmented,
271 | tibble(
272 | age = .age,
273 | age2 = .age ^ 2,
274 | bmi = .bmi,
275 | bmi30 = .bmi30,
276 | bmi30_x_smoker_yes = .bmi30_x_smoker_yes,
277 | children = .children,
278 | region = .region,
279 | sex = .sex,
280 | smoker_yes = .smoker_yes
281 | )
282 | )
283 |
284 | str_glue("The predicted medical expenses according to the given parameters are: ${.prediction %>% round(0)}")
285 |
286 | }
287 |
288 | ### Test the function ----
289 | predict_medical_expenses(
290 | .age = 30,
291 | .bmi = 30,
292 | .children = 0,
293 | .region = "northeast",
294 | .sex = "female",
295 | .smoker = "no"
296 | )
297 |
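298 | ### OPTIONAL: compare a smoker and a non-smoker with otherwise identical inputs ----
299 | # A minimal sketch, not part of the original script, to show how the
300 | # bmi30 x smoker interaction term changes the prediction. map() comes from
301 | # {purrr}, loaded with {tidyverse} at the top of the script.
302 | c("no", "yes") %>%
303 |   map(
304 |     ~predict_medical_expenses(
305 |       .age      = 45,
306 |       .bmi      = 32,
307 |       .children = 1,
308 |       .region   = "southeast",
309 |       .sex      = "female",
310 |       .smoker   = .x
311 |     )
312 |   )
313 | 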
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/Chapter_15.r:
--------------------------------------------------------------------------------
1 | ##### Chapter 15: Making Use of Big Data -------------------
2 |
3 | ## Classifying images using a pre-trained CNN in R ----
4 | ## IMPORTANT: some of the following steps are only necessary one time.
5 |
6 | # this package allows you to install packages from GitHub
7 | library(devtools)
8 |
9 | # install the latest tensorflow package
10 | # devtools::install_github("rstudio/tensorflow") # only necessary the first time!
11 |
12 | # install all of the dependencies for tensorflow (e.g., Python)
13 | library(tensorflow)
14 | # install_tensorflow() # only necessary the first time!
15 |
16 | # install the latest keras package
17 | # devtools::install_github("rstudio/keras") # only necessary the first time!
18 |
19 | # Begin the Example
20 | # load the ResNet50 model with weights trained on the ImageNet
21 | library(keras)
22 | m_resnet50 <- application_resnet50(weights = 'imagenet')
23 |
24 | # load the image and convert to array
25 | img <- image_load("ice_cream.jpg", target_size = c(224,224))
26 | x <- image_to_array(img)
27 |
28 | # x is now a 3d tensor: y, x, and color channel (r, g, b)
29 | # note: (y, x) begins counting with (1, 1) in top-left and (1, 224) in top-right
30 | dim(x)
31 | str(x)
32 |
33 | # example pixels
34 | x[1, 224, 1:3] # a white pixel in the upper-right
35 | x[40, 145, 1:3] # a red pixel in the ice cream
36 |
37 | # reshape into a 4-dimensional tensor whose first dimension has size 1
38 | # (the first dimension is the 'batch' and is used for multiple images)
39 | x <- array_reshape(x, c(1, dim(x)))
40 | dim(x) # shows the new dimensions
41 |
42 | # do some color conversions and zero-centering as pre-processing for ResNet-50
43 | x <- imagenet_preprocess_input(x)
44 | x[1, 40, 145, 1:3] # look at the red ice cream pixel again
45 |
46 | # use the ResNet-50 model to make a prediction for the image data
47 | p_resnet50 <- predict(m_resnet50, x)
48 | c_resnet50 <- imagenet_decode_predictions(p_resnet50, top = 10)
49 |
50 | # see the predictions
51 | c_resnet50
52 |
53 | # lapply the image processing steps on the other two images
54 | img_list <- list("cat.jpg", "pizza.jpg")
55 | img_data <- lapply(img_list, image_load, target_size = c(224,224))
56 | img_arrs <- lapply(img_data, image_to_array)
57 | img_resh <- lapply(img_arrs, array_reshape, c(1, 224, 224, 3))
58 | img_prep <- lapply(img_resh, imagenet_preprocess_input)
59 | img_prob <- lapply(img_prep, predict, object = m_resnet50)
60 |
61 | # sapply the decode function to get the final predictions
62 | img_classes <- sapply(img_prob, imagenet_decode_predictions, top = 3)
63 | img_classes
64 |
65 | ## Using word2vec for understanding text in R ----
66 |
67 | # note: the code in this section requires the word embedding that was trained
68 | # on the Google News archive. Download the GoogleNews-vectors-negative300.bin.gz
69 | # file from https://code.google.com/archive/p/word2vec/ and unzip it to your R
70 | # project folder before proceeding.
71 |
72 | library(word2vec)
73 |
74 | # load the Google-trained 300-dimension word2vec embedding
75 | m_w2v <- read.word2vec(file = "GoogleNews-vectors-negative300.bin",
76 | normalize = TRUE)
77 |
78 | # examine the structure of the model
79 | str(m_w2v)
80 |
81 | # obtain the vector for a few terms
82 | foods <- predict(m_w2v, c("cereal", "bacon", "eggs", "sandwich", "salad", "steak", "spaghetti"), type = "embedding")
83 | meals <- predict(m_w2v, c("breakfast", "lunch", "dinner"), type = "embedding")
84 |
85 | # examine a single word vector
86 | head(foods["cereal", ])
87 |
88 | # examine the first few columns
89 | foods[, 1:5]
90 |
91 | # compute the similarity between the foods and meals
92 | word2vec_similarity(foods, meals)
93 |
94 | # can also use cosine similarity (not shown in book)
95 | word2vec_similarity(foods, meals, type = "cosine")
96 |
97 | # create vector of hypothetical social media posts
98 | user_posts <- c(
99 | "I eat bacon and eggs in the morning for the most important meal of the day!",
100 | "I am going to grab a quick sandwich this afternoon before hitting the gym.",
101 | "Can anyone provide restaurant recommendations for my date tonight?"
102 | )
103 |
104 | library(tm) # use tm package for removeWords() and stopwords() functions
105 |
106 | user_posts_clean <- removeWords(user_posts, stopwords())
107 | user_posts_clean <- txt_clean_word2vec(user_posts_clean)
108 | user_posts_clean[1] # look at the first cleaned user post
109 |
110 | # get the doc2vec vectors for the user posts
111 | post_vectors <- doc2vec(m_w2v, user_posts_clean)
112 | str(post_vectors)
113 |
114 | # get the word2vec vectors for the meal terms
115 | meals <- predict(m_w2v, c("breakfast", "lunch", "dinner"), type = "embedding")
116 |
117 | # compare the similarity of the posts and terms
118 | word2vec_similarity(post_vectors, meals)
119 |
120 | ## Visualizing highly dimensional data ----
121 |
122 | library(tidyverse)
123 | sns_terms <- read_csv("snsdata.csv") |>
124 | select(basketball:drugs)
125 |
126 | # find the first two principal components
127 | library(irlba)
128 | set.seed(123456)
129 | sns_pca <- sns_terms |>
130 | prcomp_irlba(n = 2, center = TRUE, scale = TRUE)
131 |
132 | # create a scatterplot
133 | library(ggplot2)
134 | as.data.frame(sns_pca$x) |>
135 | ggplot(aes(PC1, PC2)) + geom_point(size = 1, shape = 1)
136 |
137 | ## Visualizing data's natural clusters with t-SNE ----
138 |
139 | # take a random sample of 5,000 users
140 | library(tidyverse)
141 | set.seed(123)
142 | sns_sample <- read_csv("snsdata.csv") |>
143 | slice_sample(n = 5000)
144 |
145 | # run t-SNE with default parameters
146 | library(Rtsne)
147 | set.seed(123)
148 | sns_tsne <- sns_sample |>
149 | select(basketball:drugs) |>
150 | Rtsne(check_duplicates = FALSE)
151 |
152 | # visualize the t-SNE result
153 | library(ggplot2)
154 | data.frame(sns_tsne$Y) |>
155 | ggplot(aes(X1, X2)) + geom_point(size = 2, shape = 1)
156 |
157 | # create a categorical feature for the number of terms used
158 | sns_sample_tsne <- sns_sample |>
159 | bind_cols(data.frame(sns_tsne$Y)) |> # add the t-SNE data
160 | rowwise() |> # work across rows rather than columns
161 | mutate(n_terms = sum(c_across(basketball:drugs))) |>
162 | ungroup() |> # remove rowwise behavior
163 | mutate(`Terms Used` = if_else(n_terms > 0, "1+", "0"))
164 |
165 | # visualize the t-SNE result by number of terms used
166 | sns_sample_tsne |>
167 | ggplot(aes(X1, X2, shape = `Terms Used`, color = `Terms Used`)) +
168 | geom_point(size = 2) +
169 | scale_shape(solid = FALSE)
170 |
171 | ## Working with SQL databases ----
172 |
173 | # a tidy approach to database connections
174 | library(DBI)
175 | library(RSQLite)
176 |
177 | # creates a connection to the credit SQLite database
178 | con <- dbConnect(RSQLite::SQLite(), "credit.sqlite3")
179 | dbListTables(con)
180 |
181 | # create a data frame from the result
182 | res <- dbSendQuery(con, "SELECT * FROM credit WHERE age >= 45")
183 | credit_age45 <- dbFetch(res)
184 | summary(credit_age45$age)
185 |
186 | # disconnect from the database
187 | dbClearResult(res)
188 | dbDisconnect(con)
189 |
190 | # connecting to databases with the odbc package
191 | # (note: this is an example for illustration only and will need to be modified for your DB)
192 | library(DBI)
193 | con <- dbConnect(odbc::odbc(), "my_data_source_name")
194 |
195 | library(DBI)
196 | con <- dbConnect(odbc::odbc(),
197 | database = "my_database",
198 | uid = "my_username",
199 | pwd = "my_password",
200 | host = "my.server.address",
201 | port = 1234)
202 |
203 | # using a database backend with dplyr
204 | library(DBI)
205 | library(dplyr)
206 | con <- dbConnect(RSQLite::SQLite(), "credit.sqlite3")
207 | credit_tbl <- con |> tbl("credit")
208 |
209 | # compute summary statistics on age values filtered from the database
210 | credit_tbl |>
211 | filter(age >= 45) |>
212 | select(age) |>
213 | collect() |>
214 | summary()
215 |
216 | # show the average loan amount by loan default status, age 45+
217 | credit_tbl |>
218 | filter(age >= 45) |>
219 | group_by(default) |>
220 | summarize(mean_amount = avg(amount))
221 |
222 | # show the SQL used for the prior analysis
223 | credit_tbl |>
224 | filter(age >= 45) |>
225 | group_by(default) |>
226 | summarize(mean_amount = avg(amount)) |>
227 | show_query()
228 |
229 | ## Measuring execution time ----
230 |
231 | system.time(rnorm(1000000))
232 |
233 | ## Working in parallel ----
234 |
235 | library(parallel)
236 | detectCores()
237 |
238 | # note: the following will only work on non-Windows systems (i.e., MacOSX or Unix/Linux)
239 | # you will also need enough cores to complete each task!
240 |
241 | # random number generation using multicore
242 | # one core
243 | system.time(l1 <- unlist(mclapply(1:10, function(x) {
244 | rnorm(10000000)}, mc.cores = 1)))
245 |
246 | # two cores
247 | system.time(l2 <- unlist(mclapply(1:10, function(x) {
248 | rnorm(10000000)}, mc.cores = 2)))
249 |
250 | # four cores
251 | system.time(l4 <- unlist(mclapply(1:10, function(x) {
252 | rnorm(10000000) }, mc.cores = 4)))
253 |
254 | # eight cores
255 | system.time(l8 <- unlist(mclapply(1:10, function(x) {
256 | rnorm(10000000) }, mc.cores = 8)))
257 |
258 | # creating a 4-node cluster with snow
259 | cl1 <- makeCluster(4)
260 |
261 | # confirm that the cluster is functioning
262 | clusterCall(cl1, function() { Sys.info()["nodename"] })
263 |
264 | # running the same function on each node (not shown in book)
265 | clusterCall(cl1, function() { print("ready!") })
266 |
267 | # running a different operation on each node
268 | clusterApply(cl1, c('A', 'B', 'C', 'D'),
269 | function(x) { paste("Cluster", x, "ready!") })
270 |
271 | # close the cluster (IMPORTANT STEP!)
272 | stopCluster(cl1)
273 |
274 | ## Parallel loops with foreach ----
275 |
276 | # generate 100 million random numbers
277 | system.time(l1 <- rnorm(100000000))
278 |
279 | # combine four sets of 25 million random numbers
280 | library(foreach)
281 | system.time(l4 <- foreach(i = 1:4, .combine = 'c')
282 | %do% rnorm(25000000))
283 |
284 | # confirm the number of cores
285 | detectCores()
286 |
287 | # parallel the above foreach loop
288 | library(doParallel)
289 | registerDoParallel(cores = 4)
290 | system.time(l4p <- foreach(i = 1:4, .combine = 'c')
291 | %dopar% rnorm(25000000))
292 |
293 | stopImplicitCluster()
294 |
295 | ## Parallel processing with caret ----
296 |
297 | # training a random forest without allowing parallel computing
298 | library(caret)
299 | credit <- read.csv("credit.csv")
300 | system.time(train(default ~ ., data = credit, method = "rf",
301 | trControl = trainControl(allowParallel = FALSE)))
302 |
303 | # training the same random forest in parallel (8 cores)
304 | library(doParallel)
305 | registerDoParallel(cores = 8)
306 | system.time(train(default ~ ., data = credit, method = "rf"))
307 | stopImplicitCluster()
308 |
309 | ## Parallel cloud computing with Apache Spark ----
310 |
311 | library(sparklyr)
312 | spark_install() # only need to run this the first time using Spark
313 | spark_cluster <- spark_connect(master = "local")
314 |
315 | credit_spark <- spark_read_csv(spark_cluster, "credit.csv")
316 |
317 | # split the credit_spark dataset into training and testing
318 | splits <- sdf_random_split(credit_spark,
319 | train = 0.75, test = 0.25,
320 | seed = 123)
321 |
322 | # build a random forest model using Spark
323 | credit_rf <- splits$train |>
324 | ml_random_forest(default ~ .)
325 |
326 | # make predictions on the test set
327 | pred <- ml_predict(credit_rf, splits$test)
328 |
329 | ml_binary_classification_evaluator(pred, metric_name = "areaUnderROC")
330 |
331 | spark_disconnect(spark_cluster)
332 |
333 | ## Faster modeling with h2o ----
334 |
335 | library(h2o)
336 | h2o_instance <- h2o.init()
337 | credit.hex <- h2o.uploadFile("credit.csv")
338 |
339 | h2o.randomForest(y = "default",
340 | training_frame = credit.hex,
341 | ntrees = 500,
342 | seed = 123)
343 |
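# added sketch (not in the original script): store the h2o model so the fit can
# be inspected with h2o.performance(), then shut the local instance down
credit_rf_h2o <- h2o.randomForest(y = "default",
                                  training_frame = credit.hex,
                                  ntrees = 500,
                                  seed = 123)
h2o.performance(credit_rf_h2o)
h2o.shutdown(prompt = FALSE)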
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/Chapter_13.r:
--------------------------------------------------------------------------------
1 | ##### Chapter 13: Challenging Data --------------------
2 |
3 | ## Stepwise regression ----
4 |
5 | # read the data and do some simple data preparation
6 | library(tidyverse)
7 | titanic_train <- read_csv("titanic_train.csv") |>
8 | mutate(
9 | Age_MVI = if_else(is.na(Age), 1, 0),
10 | Age = if_else(is.na(Age), mean(Age, na.rm = TRUE), Age),
11 | Cabin = if_else(is.na(Cabin), "X", Cabin),
12 | Embarked = factor(if_else(is.na(Embarked), "X", Embarked)),
13 | Sex = factor(Sex)
14 | )
15 |
16 | # specify the simplest logistic regression model
17 | simple_model <- glm(Survived ~ 1, family = binomial, data = titanic_train)
18 |
19 | # specify the full logistic regression model
20 | full_model <- glm(Survived ~ Age + Age_MVI + Embarked + Sex + Pclass + SibSp + Fare,
21 | family = binomial, data = titanic_train)
22 |
23 | # forward stepwise regression
24 | sw_forward <- stats::step(simple_model, scope = formula(full_model),
25 | direction = "forward")
26 |
27 | # obtain the formula for the final model
28 | formula(sw_forward)
29 |
30 | # the final model's regression coefficients
31 | sw_forward$coefficients
32 |
33 | # backward stepwise
34 | sw_backward <- stats::step(full_model, direction = "backward")
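
# a "both"-direction search (added sketch, not in the book's script) can add or
# drop a term at each step, using the same stats::step() interface
sw_both <- stats::step(full_model, scope = formula(full_model),
                       direction = "both")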
35 |
36 | ## Feature selection with Boruta ----
37 |
38 | set.seed(12345) # set the random seed to ensure results match
39 | # create a feature with random values to demonstrate a useless feature
40 | titanic_train$rand_vals <- runif(n = 891, min = 1, max = 100)
41 |
42 | # run Boruta on the Titanic dataset (this can take a long time for larger datasets)
43 | library(Boruta)
44 | titanic_boruta <- Boruta(Survived ~ PassengerId + Age +
45 | Sex + Pclass + SibSp + rand_vals,
46 | data = titanic_train, doTrace = 1)
47 | # check the result
48 | titanic_boruta
49 |
50 | # plot the feature importance
51 | plot(titanic_boruta)
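
# added sketch: Boruta's getSelectedAttributes() returns the names of the
# confirmed features (set withTentative = TRUE to also keep tentative ones)
getSelectedAttributes(titanic_boruta, withTentative = FALSE)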
52 |
53 | ## Principal Component Analysis (PCA) ----
54 |
55 | library(tidyverse) # load the tidyverse suite of packages
56 |
57 | sns_data <- read_csv("snsdata.csv") # read the teenage social media data
58 |
59 | # select only the 36 columns from the column named 'basketball' through the one named 'drugs'
60 | # each column contains the count of times each social media profile used the respective term
61 | sns_terms <- sns_data |> select(basketball:drugs)
62 |
63 | # the irlba library provides a more efficient PCA function than R's built-in prcomp()
64 | library(irlba)
65 |
66 | # run the PCA - note that we center and re-scale the data here
67 | set.seed(2023) # to ensure the results match the book
68 | sns_pca <- sns_terms |>
69 | prcomp_irlba(n = 10, center = TRUE, scale = TRUE) # find first 10 principal components of the SNS data
70 |
71 | # create scree plot of the SNS data PCA
72 | screeplot(sns_pca, npcs = 10, type = "lines",
73 | main = "Scree Plot of SNS Data Principal Components")
74 |
75 | # use summary to see the components and the proportion of variance explained
76 | summary(sns_pca)
77 |
78 | # examine the PCA object -- we care most about the $x and $rotation components
79 | str(sns_pca)
80 |
81 | # the $x component is the transformed version of the original data
82 | str(sns_pca$x)
83 |
84 | # the $x is our original data transformed to have new "features" -- the principal components
85 | nrow(sns_pca$x) # should have 30,000 rows
86 | head(sns_pca$x) # should have ten columns
87 |
88 | # create a "long" version of the PCA dataset for visualization
89 | sns_pca_long <- tibble(SNS_Term = colnames(sns_terms), as_tibble(sns_pca$rotation)) |> # add the row labels
90 | pivot_longer(PC1:PC10, names_to = "PC", values_to = "Contribution") # go from a wide to long dataset
91 |
92 | # use ggplot to visualize the terms that are important for PC3
93 | library(ggplot2)
94 |
95 | sns_pca_long |>
96 | filter(PC == "PC3") |>
97 | top_n(15, abs(Contribution)) |>
98 | mutate(SNS_Term = reorder(SNS_Term, Contribution)) |>
99 | ggplot(aes(SNS_Term, Contribution, fill = SNS_Term)) +
100 | geom_col(show.legend = FALSE, alpha = 0.8) +
101 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
102 | axis.ticks.x = element_blank()) +
103 | labs(x = "Social Media Term",
104 | y = "Relative Importance to Principal Component",
105 | title = "Top 15 Contributors to PC3")
106 |
107 | # create a function to visualize the four other components
108 | plot_pca <- function(component) {
109 | sns_pca_long |>
110 | filter(PC == component) |>
111 | top_n(15, abs(Contribution)) |>
112 | mutate(SNS_Term = reorder(SNS_Term, Contribution)) |>
113 | ggplot(aes(SNS_Term, Contribution, fill = SNS_Term)) +
114 | geom_col(show.legend = FALSE, alpha = 0.8) +
115 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
116 | axis.ticks.x = element_blank()) +
117 | labs(x = "Social Media Term",
118 | y = "Relative Importance to Principal Component",
119 | title = paste("Top 15 Contributors to", component))
120 | }
121 |
122 | # use the function
123 | plot_pca("PC1")
124 | plot_pca("PC2")
125 | plot_pca("PC4")
126 | plot_pca("PC5")
127 |
128 | # we can use the principal components to predict number of friends
129 | sns_data_pca <- cbind(sns_data[1:4], sns_pca$x) # join the principal components to the original data
130 |
131 | # create a linear regression model predicting friends from the principal components
132 | m <- lm(friends ~ PC1 + PC2 + PC3 + PC4 + PC5, data = sns_data_pca)
133 |
134 | m # show the model coefficients
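
# added sketch: summary() reports the usual coefficient tests and R-squared
summary(m)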
135 |
136 | ## Remapping sparse categorical data ----
137 |
138 | library(tidyverse)
139 |
140 | # read the Titanic dataset and create Title feature (from Chapter 12)
141 | titanic_train <- read_csv("titanic_train.csv") |>
142 | mutate(Title = str_extract(Name, ", [A-z]+\\.")) |>
143 | mutate(Title = str_replace_all(Title, "[, \\.]", ""))
144 |
145 | # the Title feature has a large number of categories
146 | table(titanic_train$Title, useNA = "ifany")
147 |
148 | # group categories with similar real-world meaning
149 | titanic_train <- titanic_train |>
150 | mutate(TitleGroup = fct_collapse(Title,
151 | Mr = "Mr",
152 | Mrs = "Mrs",
153 | Master = "Master",
154 | Miss = c("Miss", "Mlle", "Mme", "Ms"),
155 | Noble = c("Don", "Sir", "Jonkheer", "Lady"),
156 | Military = c("Capt", "Col", "Major"),
157 | Doctor = "Dr",
158 | Clergy = "Rev",
159 | other_level = "Other")
160 | ) |>
161 | mutate(TitleGroup = fct_na_value_to_level(TitleGroup,
162 | level = "Unknown"))
163 |
164 | # examine the recoding
165 | table(titanic_train$TitleGroup)
166 |
167 | # look at the counts and proportions of all levels, sorted largest to smallest
168 | fct_count(titanic_train$Title, sort = TRUE, prop = TRUE)
169 |
170 | # lump together everything outside of the top three levels
171 | table(fct_lump_n(titanic_train$Title, n = 3))
172 |
173 | # lump together everything with less than 1%
174 | table(fct_lump_prop(titanic_train$Title, prop = 0.01))
175 |
176 | # lump together everything with fewer than 5 observations
177 | table(fct_lump_min(titanic_train$Title, min = 5))
178 |
179 | ## Binning sparse numeric data ----
180 |
181 | # examine the Titanic fare data
182 | head(titanic_train$Fare)
183 | summary(titanic_train$Fare)
184 |
185 | # create a binary variable for first/second class
186 | titanic_train <- titanic_train |> mutate(
187 | fare_firstclass = if_else(Fare >= 31, 1, 0, missing = 0)
188 | )
189 |
190 | # tabulate the binary values
191 | table(titanic_train$fare_firstclass)
192 |
193 | # create a three-level feature using case_when()
194 | titanic_train <- titanic_train |>
195 | mutate(
196 | fare_class = case_when(
197 | Fare >= 31 ~ "1st Class",
198 | Fare >= 15 ~ "2nd Class",
199 | TRUE ~ "3rd Class"
200 | )
201 | )
202 |
203 | # examine the result
204 | table(titanic_train$fare_class)
205 |
206 | # the cut() function can accomplish the same as the above case_when()
207 | table(cut(titanic_train$Fare, breaks = c(-Inf, 15, 31, Inf),
208 | right = FALSE))
209 |
210 | # use cut() with seq() to generate evenly-sized break points
211 | table(cut(titanic_train$Fare, right = FALSE,
212 | breaks = seq(from = 0, to = 550, by = 50)))
213 |
214 | # use cut() with quantile() and seq() to create bins with equal numbers of examples
215 | table(cut(titanic_train$Fare, right = FALSE,
216 | breaks = quantile(titanic_train$Fare,
217 | probs = seq(0, 1, 0.20))))
218 |
219 | # use the tidyverse ntile() function to create five bins
220 | table(ntile(titanic_train$Fare, n = 5))
221 |
222 | # convert the ntile() groups to a factor
223 | titanic_train <- titanic_train |>
224 | mutate(fare_level = factor(ntile(Fare, n = 11)))
225 |
226 | table(titanic_train$fare_level)
227 |
228 | ## Performing missing value imputation ----
229 |
230 | library(readr)
231 | titanic_train <- read_csv("titanic_train.csv")
232 |
233 | # impute arbitrary text strings for missing categorical data
234 | titanic_train <- titanic_train |>
235 | mutate(
236 | Cabin = if_else(is.na(Cabin), "X", Cabin),
237 | Embarked = if_else(is.na(Embarked), "Unknown", Embarked)
238 | )
239 |
240 | # impute mean value and create missing value indicator for age
241 | titanic_train <- titanic_train |>
242 | mutate(
243 | Age_MVI = if_else(is.na(Age), 1, 0),
244 | Age = if_else(is.na(Age), mean(Age, na.rm = TRUE), Age)
245 | )
246 |
247 | ## Simple strategies for rebalancing data ----
248 |
249 | # load and prepare the teenage social media data
250 | library(tidyverse)
251 |
252 | snsdata <- read_csv("snsdata.csv") |>
253 | mutate(
254 | gender = fct_recode(gender, Female = "F", Male = "M"),
255 | gender = fct_na_value_to_level(gender, level = "Unknown"),
256 | age = ifelse(age < 13 | age > 20, NA, age) # replace age outliers
257 | ) |>
258 | group_by(gradyear) |>
259 | mutate(age_imp = if_else(is.na(age), median(age, na.rm = TRUE), age)) |>
260 | ungroup() |>
261 | select(gender, friends, gradyear, age_imp, basketball:drugs)
262 |
263 | # examine the initial class imbalance
264 | fct_count(snsdata$gender, prop = TRUE)
265 |
266 | # undersample the majority classes
267 | library(caret)
268 | sns_undersample <- downSample(x = snsdata[2:40], y = snsdata$gender, yname = "gender")
269 | fct_count(sns_undersample$gender, prop = TRUE)
270 |
271 | # oversample the minority classes
272 | library(caret)
273 | sns_oversample <- upSample(x = snsdata[2:40], y = snsdata$gender, yname = "gender")
274 | fct_count(sns_oversample$gender, prop = TRUE)
275 |
276 | ## Generating a synthetic balanced dataset with SMOTE ----
277 |
278 | # create a gender balanced dataset using SMOTE
279 | library(themis)
280 | sns_balanced <- snsdata |> smote("gender") # simple syntax (without normalization)
281 |
282 | # check that the dataset is now gender balanced
283 | table(sns_balanced$gender)
284 |
285 | # for a better SMOTE, create a normalize function (introduced in Chapter 3)
286 | normalize <- function(x) {
287 | return ((x - min(x)) / (max(x) - min(x)))
288 | }
289 |
290 | # an unnormalize function returns data to original scale (introduced in Chapter 7)
291 | unnormalize <- function(norm_vals, col_name) {
292 | old_vals <- snsdata[col_name]
293 | unnormalized_vals <- norm_vals * (max(old_vals) - min(old_vals)) + min(old_vals)
294 |
295 | # round all columns to integers except age_imp
296 | rounded_vals <- if(col_name != "age_imp") { round(unnormalized_vals) }
297 | else {unnormalized_vals}
298 |
299 | return (rounded_vals)
300 | }
301 |
302 | # more advanced smote() process with normalized data
303 | snsdata_balanced <- snsdata |>
304 | mutate(across(where(is.numeric), normalize)) |> # normalize the numeric data
305 | smote("gender") |>
306 | mutate(across(where(is.numeric), ~unnormalize(.x, cur_column()))) # unnormalize the data
307 |
308 | # confirm that the rebalanced dataset worked correctly
309 | table(snsdata$gender)
310 | table(snsdata_balanced$gender)
311 |
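# optional sanity check (added sketch): after unnormalizing, the synthetic data
# should sit on roughly the same scale as the original columns
summary(snsdata$friends)
summary(snsdata_balanced$friends)
summary(snsdata$age_imp)
summary(snsdata_balanced$age_imp)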
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/chapter_02_managing_and_understanding_data.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 2: Managing and Understanding Data"
3 | author: "Original Code: Brett Lantz | Modifications: Antti Rask"
4 | date: "2023-07-23"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 |
12 | # 2 Managing and Understanding Data
13 |
14 | ## Load Packages
15 |
16 | ```{r}
17 | library(conflicted) # An Alternative Conflict Resolution Strategy
18 | conflicts_prefer(dplyr::filter)
19 | conflicts_prefer(janitor::chisq.test)
20 | library(crosstable) # Crosstables for Descriptive Analyses
21 | library(janitor) # Simple Tools for Examining and Cleaning Dirty Data
22 | library(tidyverse) # Easily Install and Load the 'Tidyverse'
23 | ```
24 |
25 | ## Data
26 |
27 | ```{r}
28 | # create tibble of data for three medical patients
29 | patients_tbl <- tibble(
30 | subject_name = c("John Doe", "Jane Doe", "Steve Graves"),
31 | temperature = c(98.1, 98.6, 101.4),
32 | flu_status = c(FALSE, FALSE, TRUE),
33 | gender = factor(c("MALE", "FEMALE", "MALE")),
34 | blood = factor(
35 | c("O", "AB", "A"),
36 | levels = c("A", "B", "AB", "O")
37 | ),
38 | symptoms = factor(
39 | c("SEVERE", "MILD", "MODERATE"),
40 | levels = c("MILD", "MODERATE", "SEVERE"),
41 | ordered = TRUE
42 | )
43 | )
44 |
45 | patients_tbl
46 | ```
47 |
48 | ## R data structures
49 |
50 | ### Vectors -> Tibble
51 |
52 | ```{r}
53 | # access the second body temperature
54 | patients_tbl %>%
55 | slice(2) %>%
56 | pull(temperature)
57 | ```
58 |
59 | ```{r}
60 | # access body temperature for the second and third patient
61 | patients_tbl %>%
62 | slice(2:3) %>%
63 | pull(temperature)
64 | ```
65 |
66 | ```{r}
67 | # access body temperature for all but the second patient
68 | patients_tbl %>%
69 | slice(-2) %>%
70 | pull(temperature)
71 | ```
72 |
73 | ```{r}
74 | # Use a vector to indicate whether to include item
75 | patients_tbl %>%
76 | filter(c(TRUE, TRUE, FALSE)) %>%
77 | pull(temperature)
78 | ```
79 |
80 | ```{r}
81 | # filter rows by conditions
82 | patients_tbl %>%
83 | filter(temperature > 100) %>%
84 | pull(subject_name)
85 | ```
86 |
87 | ### Factors
88 |
89 | ```{r}
90 | # check gender factors
91 | patients_tbl %>%
92 | pull(gender)
93 | ```
94 |
95 | ```{r}
96 | # check blood type factors
97 | patients_tbl %>%
98 | pull(blood)
99 | ```
100 |
101 | ```{r}
102 | # check symptom factors
103 | patients_tbl %>%
104 | pull(symptoms)
105 | ```
106 |
107 | ```{r}
108 | # check for symptoms greater than moderate
109 | patients_tbl %>%
110 | mutate(symptoms_severe = symptoms > "MODERATE") %>%
111 | pull(symptoms_severe)
112 | ```
113 |
114 | ### Lists/Tibble
115 |
116 | ```{r}
117 | # display information for a patient
118 | patients_tbl %>%
119 | slice(1)
120 | ```
121 |
122 | ```{r}
123 | # convert the tibble to a list
124 | patient_1_list <- patients_tbl %>%
125 | slice(1) %>%
126 | as.list()
127 |
128 | patient_1_list
129 | ```
130 |
131 | #### Methods for Accessing a List
132 |
133 | ```{r}
134 | # get a single list value by position (returns a sub-list)
135 | patient_1_list[2]
136 | ```
137 |
138 | ```{r}
139 | # get a single list value by position (returns a numeric vector)
140 | patient_1_list %>%
141 | pluck(2)
142 | ```
143 |
144 | ```{r}
145 | # get a single list value by name
146 | patient_1_list %>%
147 | pluck("temperature")
148 | ```
149 |
150 | ```{r}
151 | # get several list items by specifying a vector of names
152 | patient_1_list[c("temperature", "flu_status")]
153 | ```
154 | #### Access a List Like a Vector
155 |
156 | ```{r}
157 | # get values 2 and 3
158 | patient_1_list[2:3]
159 | ```
160 |
161 | ### Tibble
162 |
163 | #### Accessing a tibble
164 |
165 | ```{r}
166 | # display the data tibble
167 | patients_tbl
168 | ```
169 |
170 | ```{r}
171 | # get a single column
172 | patients_tbl %>%
173 | pull(subject_name)
174 | ```
175 |
176 | ```{r}
177 | # get several columns by specifying a vector of names
178 | patients_tbl %>%
179 | select(c("temperature", "flu_status"))
180 | ```
181 |
182 | ```{r}
183 | # this is the same as above, extracting temperature and flu_status
184 | patients_tbl %>%
185 | select(2:3)
186 | ```
187 |
188 | ```{r}
189 | # accessing by row and column
190 | patients_tbl %>%
191 | slice(c(1, 3)) %>%
192 | select(c(2, 4))
193 | ```
194 |
195 | #### Extract All Rows or Columns
196 |
197 | ```{r}
198 | # column 1, all rows
199 | patients_tbl %>%
200 | pull(1)
201 | ```
202 |
203 | ```{r}
204 | # row 1, all columns
205 | patients_tbl %>%
206 | slice(1)
207 | ```
208 |
209 | ```{r}
210 | # all rows and all columns
211 | patients_tbl
212 | ```
213 |
214 | ```{r}
215 | # the following are equivalent
216 | patients_tbl %>%
217 | slice(c(1, 3)) %>%
218 | select(c(temperature, gender))
219 |
220 | patients_tbl %>%
221 | slice(-2) %>%
222 | select(c(-1, -3, -5, -6))
223 | ```
224 |
225 | ```{r}
226 | # creating a Celsius temperature column, then comparing before and after
227 | patients_tbl %>%
228 | mutate(temp_c = (temperature - 32) * (5 / 9)) %>%
229 | select(temperature, temp_c)
230 | ```
231 |
232 | ### Matrixes
233 |
234 | ```{r}
235 | # create a 2x2 matrix
236 | matrix(c(1, 2, 3, 4), nrow = 2)
237 | ```
238 |
239 | ```{r}
240 | # equivalent to the above
241 | matrix(c(1, 2, 3, 4), ncol = 2)
242 | ```
243 |
244 | ```{r}
245 | # create a 2x3 matrix
246 | matrix(c(1, 2, 3, 4, 5, 6), nrow = 2)
247 | ```
248 |
249 | ```{r}
250 | # create a 3x2 matrix
251 | m <- matrix(c(1, 2, 3, 4, 5, 6), ncol = 2)
252 | m
253 | ```
254 |
255 | ```{r}
256 | # extract values from matrixes
257 | m[1, 1]
258 | m[3, 2]
259 | ```
260 |
261 | ```{r}
262 | # extract rows
263 | m[1, ]
264 | ```
265 |
266 | ```{r}
267 | # extract columns
268 | m[, 1]
269 | ```
270 |
271 | ## Managing data with R
272 |
273 | ### Saving, Loading, and Removing R Data Structures
274 |
275 | ```{r}
276 | # show all data structures in memory
277 | ls()
278 | ```
279 |
280 | ```{r}
281 | # remove the m and patient_1_list objects
282 | rm(m, patient_1_list)
283 | ls()
284 | ```
285 |
286 | ```{r}
287 | # remove all (remaining) objects
288 | rm(list = ls())
289 | ls()
290 | ```
291 |
292 | ## Importing and saving datasets from CSV files
293 |
294 | ```{r}
295 | # reading a CSV file
296 | patients_tbl <- read_csv("data/pt_data.csv")
297 | ```
298 |
299 | ```{r}
300 | # reading a CSV file and converting all character columns to factors
301 | patients_factors_tbl <- read_csv(
302 | "data/pt_data.csv",
303 | col_types = "cdlff",
304 | show_col_types = TRUE
305 | )
306 | ```
307 |
308 | ```{r}
309 | # alternatively reading a CSV file and converting all character columns to factors
310 | patients_factors_tbl <- read_csv("data/pt_data.csv") %>%
311 | mutate(across(where(is.character), as.factor))
312 |
313 | patients_factors_tbl
314 | ```
315 |
316 | ## Exploring and understanding data
317 |
318 | ### Data Exploration Example Using Used Car Data
319 |
320 | ```{r}
321 | usedcars <- read_csv("data/usedcars.csv")
322 | ```
323 |
324 | ```{r}
325 | # get structure of used car data
326 | usedcars %>%
327 | str()
328 | ```
329 |
330 | ### Exploring numeric variables
331 |
332 | ```{r}
333 | # summarize numeric variables
334 | usedcars %>%
335 | select(year, price, mileage) %>%
336 | summary()
337 | ```
338 |
339 | ```{r}
340 | # calculate the mean income
341 | (36000 + 44000 + 56000) / 3
342 |
343 | c(36000, 44000, 56000) %>%
344 | mean()
345 | ```
346 |
347 | ```{r}
348 | # the median income
349 | c(36000, 44000, 56000) %>%
350 | median()
351 | ```
352 |
353 | ```{r}
354 | # the min/max of used car prices
355 | usedcars %>%
356 | pull(price) %>%
357 | range()
358 | ```
359 |
360 | ```{r}
361 | # the difference of the range
362 | usedcars %>%
363 | pull(price) %>%
364 | range() %>%
365 | diff()
366 | ```
367 |
368 | ```{r}
369 | # IQR for used car prices
370 | usedcars %>%
371 | pull(price) %>%
372 | IQR()
373 | ```
374 |
375 | ```{r}
376 | # use quantile to calculate five-number summary
377 | usedcars %>%
378 | pull(price) %>%
379 | quantile()
380 | ```
381 |
382 | ```{r}
383 | # the 1st and 99th percentiles
384 | usedcars %>%
385 | pull(price) %>%
386 | quantile(
387 | probs = c(0.01, 0.99)
388 | )
389 | ```
390 |
391 | ```{r}
392 | # quintiles
393 | usedcars %>%
394 | pull(price) %>%
395 | quantile(
396 | seq(from = 0, to = 1, by = 0.20)
397 | )
398 | ```
399 |
400 | ```{r}
401 | # boxplot of used car prices and mileage
402 | usedcars %>%
403 | ggplot(aes(y = price)) +
404 | geom_boxplot() +
405 | labs(
406 | title = "Boxplot of Used Car Prices",
407 | y = "Price ($)"
408 | ) +
409 | theme_classic()
410 |
411 | usedcars %>%
412 | ggplot(aes(y = mileage)) +
413 | geom_boxplot() +
414 | labs(
415 | title = "Boxplot of Used Car Mileage",
416 | y = "Odometer (mi.)"
417 | ) +
418 | theme_classic()
419 | ```
420 |
421 | ```{r}
422 | # histograms of used car prices and mileage
423 | usedcars %>%
424 | ggplot(aes(price)) +
425 | geom_histogram(
426 | bins = 10,
427 | color = "black",
428 | fill = "white"
429 | ) +
430 | labs(
431 | title = "Histogram of Used Car Prices",
432 | x = "Price ($)"
433 | ) +
434 | theme_classic()
435 |
436 | usedcars %>%
437 | ggplot(aes(mileage)) +
438 | geom_histogram(
439 | bins = 8,
440 | color = "black",
441 | fill = "white"
442 | ) +
443 | labs(
444 | title = "Histogram of Used Car Mileage",
445 | x = "Odometer (mi.)"
446 | ) +
447 | theme_classic()
448 | ```
449 |
450 | ```{r}
451 | # variance and standard deviation of the used car data
452 | usedcars %>%
453 | pull(price) %>%
454 | var()
455 | ```
456 |
457 | ```{r}
458 | usedcars %>%
459 | pull(price) %>%
460 | sd()
461 | ```
462 |
463 | ```{r}
464 | usedcars %>%
465 | pull(mileage) %>%
466 | var()
467 | ```
468 |
469 | ```{r}
470 | usedcars %>%
471 | pull(mileage) %>%
472 | sd()
473 | ```
474 |
475 | ### Exploring categorical features
476 |
477 | ```{r}
478 | # one-way tables for the used car data
479 | usedcars %>%
480 | tabyl(year) %>%
481 | adorn_pct_formatting(digits = 1)
482 | ```
483 |
484 | ```{r}
485 | usedcars %>%
486 | tabyl(model) %>%
487 | adorn_pct_formatting(digits = 1)
488 | ```
489 |
490 | ```{r}
491 | usedcars %>%
492 | tabyl(color) %>%
493 | adorn_pct_formatting(digits = 1)
494 | ```
495 |
496 | ```{r}
497 | # you can also have a three-way table of the same
498 | usedcars %>%
499 | tabyl(year, model, color)
500 | ```
501 |
502 | ### Exploring relationships between variables
503 |
504 | ```{r}
505 | # scatterplot of price vs. mileage
506 | usedcars %>%
507 | ggplot(aes(mileage, price)) +
508 | geom_point() +
509 | labs(
510 | title = "Scatterplot of Price vs. Mileage",
511 | x = "Used Car Odometer (mi.)",
512 | y = "Used Car Price ($)"
513 | ) +
514 | theme_classic()
515 | ```
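
A quick numeric follow-up to the scatterplot (an added sketch using the same `usedcars` tibble): the correlation coefficient quantifies how strongly price falls as mileage rises.

```{r}
usedcars %>%
  summarize(correlation = cor(mileage, price))
```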
516 |
517 | ```{r}
518 | # new variable indicating conservative colors
519 | usedcars_conservative <- usedcars %>%
520 | mutate(conservative = color %in% c("Black", "Gray", "Silver", "White"))
521 | ```
522 |
523 | ```{r}
524 | # checking our variable
525 | usedcars_conservative %>%
526 | tabyl(conservative) %>%
527 | adorn_pct_formatting(digits = 0)
528 | ```
529 |
530 | ```{r}
531 | # crosstab of model by conservative
532 | usedcars_conservative %>%
533 | crosstable(
534 | cols = model,
535 | by = conservative,
536 | label = FALSE,
537 | total = "both",
538 | percent_pattern = "{n} ({p_row}/{p_col}/{p_tot})",
539 | percent_digits = 1
540 | ) %>%
541 | as_flextable(compact = TRUE)
542 | ```
543 |
544 | ```{r}
545 | # compute the chi-square test
546 | usedcars_conservative %>%
547 | tabyl(model, conservative) %>%
548 | chisq.test()
549 | ```
550 |
551 |
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/decision_trees_and_rules/identifying_risky_bank_loans_using_C5.0_decision_trees.r:
--------------------------------------------------------------------------------
1 | # Identifying Risky Bank Loans Using C5.0 Decision Trees ----
2 |
3 | # Inspired by Brett Lantz's Machine Learning with R,
4 | # Chapter 5: Divide and Conquer - Classification Using Decision Trees and Rules and
5 | # Chapter 10: Evaluating Model Performance
6 | #
7 | # The original code is made with {C50}, {gmodels}, {OneR} and {RWeka}. I
8 | # wanted to see how one could recreate it using mainly {tidymodels} and
9 | # {tidyverse}.
10 | #
11 | # You can find the original code and the slightly modified dataset here:
12 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter05
13 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter10
14 |
15 | ## 1. Loading libraries (in the order they get used) ----
16 | library(conflicted)
17 | library(tidyverse)
18 | library(tidymodels)
19 |
20 | ## 2. Exploring and preparing the data ----
21 | credit_tbl <- read_csv("machine_learning_with_r_3rd_ed/decision_trees_and_rules/data/credit.csv")
22 |
23 | ### Examine the structure of the credit data ----
24 | glimpse(credit_tbl)
25 |
26 | ### look at two characteristics of the applicant ----
27 | credit_tbl %>%
28 | count(checking_balance) %>%
29 | mutate(pct = (n / sum(n) * 100))
30 |
31 | credit_tbl %>%
32 | count(savings_balance) %>%
33 | mutate(pct = (n / sum(n) * 100))
34 |
35 | ### look at two characteristics of the loan ----
36 | credit_tbl %>%
37 | select(months_loan_duration, amount) %>%
38 | summary()
39 |
40 | ### look at the class variable ----
41 | credit_tbl %>%
42 | count(default) %>%
43 | mutate(pct = (n / sum(n) * 100))
44 |
45 |
46 | ## 3. Creating the recipe and splitting the data ----
47 |
48 | ### Convert strings to factors ----
49 | recipe_obj <- recipe(
50 | default ~ .,
51 | data = credit_tbl
52 | ) %>%
53 | step_string2factor(all_nominal())
54 | recipe_obj
55 |
56 | credit_factorized_tbl <- recipe_obj %>%
57 | prep() %>%
58 | bake(new_data = NULL)
59 | credit_factorized_tbl
60 |
61 | ### Create training and test data (randomly) ----
62 |
63 | # Use set.seed to use the same random number sequence as the original
64 | RNGversion("3.5.2")
65 | set.seed(123)
66 |
67 | credit_split <- initial_split(
68 | credit_factorized_tbl,
69 | prop = 0.9
70 | )
71 | credit_train <- training(credit_split)
72 | credit_test <- testing(credit_split)
73 |
74 | ### Check the proportion of class variable ----
75 | credit_train %>%
76 | count(default) %>%
77 | mutate(pct = (n / sum(n) * 100))
78 |
79 | credit_test %>%
80 | count(default) %>%
81 | mutate(pct = (n / sum(n) * 100))
82 |
83 |
84 | ## 4. Training a model on the data ----
85 |
86 | ### Model specification ----
87 | model_spec <- decision_tree(
88 | mode = "classification",
89 | engine = "C5.0",
90 | cost_complexity = NULL,
91 | tree_depth = NULL,
92 | min_n = NULL
93 | ) %>%
94 | translate()
95 | model_spec
96 |
97 | ### Fit the model ----
98 | model_fit <- fit(
99 | model_spec,
100 | default ~ .,
101 | credit_train
102 | )
103 | model_fit
104 |
105 | model_fit %>%
106 | extract_fit_engine() %>%
107 | summary()
108 |
109 | ### Make the predictions (you could skip this step) ----
110 | credit_test_pred <- predict(
111 | object = model_fit,
112 | new_data = credit_test,
113 | type = "class"
114 | )
115 | credit_test_pred
116 |
117 | ### Add the predictions to the test tibble ----
118 | credit_test_with_pred_tbl <- augment(model_fit, credit_test)
119 | credit_test_with_pred_tbl
120 |
121 |
122 | ## 5. Evaluating model performance ----
123 |
124 | ### Create a confusion matrix ----
125 | conf_mat <- conf_mat(
126 | data = credit_test_with_pred_tbl,
127 | truth = default,
128 | estimate = .pred_class
129 | )
130 | conf_mat
131 |
132 | ### Visualize the confusion matrix ----
133 | conf_mat %>% autoplot(type = "heatmap")
134 | conf_mat %>% autoplot(type = "mosaic")
135 |
136 | ### Visualize the ROC curve ----
137 | credit_test_with_pred_tbl %>%
138 | roc_curve(
139 | truth = default,
140 | estimate = .pred_no
141 | ) %>%
142 | autoplot()
143 |
144 | ### Calculate the ROC AUC (area under the curve) ----
145 | credit_roc_auc <- credit_test_with_pred_tbl %>%
146 | roc_auc(
147 | truth = default,
148 | estimate = .pred_no
149 | )
150 | credit_roc_auc
151 |
152 | ### Put together other model metrics ----
153 | # Such as accuracy, Matthews correlation coefficient (mcc) and others...
154 | classification_metrics <- conf_mat(
155 | credit_test_with_pred_tbl,
156 | truth = default,
157 | estimate = .pred_class
158 | ) %>%
159 | summary()
160 | classification_metrics
161 |
162 |
163 | ## 6. Improving model performance ----
164 |
165 | ### Boost the decision tree with 10 trials ----
166 | model_spec_boost_tree <- boost_tree(
167 | mode = "classification",
168 | engine = "C5.0",
169 | trees = 10,
170 | min_n = NULL,
171 | sample_size = NULL
172 | ) %>%
173 | translate()
174 | model_spec_boost_tree
175 |
176 | ### Fit the model ----
177 | model_fit_boost_tree <- fit(
178 | model_spec_boost_tree,
179 | default ~ .,
180 | credit_train
181 | )
182 | model_fit_boost_tree
183 |
184 | model_fit_boost_tree %>%
185 | extract_fit_engine() %>%
186 | summary()
187 |
188 | ### Make the predictions (you could skip this step) ----
189 | credit_test_pred_boost_tree <- predict(
190 | object = model_fit_boost_tree,
191 | new_data = credit_test,
192 | type = "class"
193 | )
194 | credit_test_pred_boost_tree
195 |
196 | ### Add the predictions to the test tibble ----
197 | credit_test_with_pred_boost_tree <- augment(model_fit_boost_tree, credit_test)
198 | credit_test_with_pred_boost_tree
199 |
200 | ### Create a confusion matrix ----
201 | conf_mat_boost_tree <- conf_mat(
202 | data = credit_test_with_pred_boost_tree,
203 | truth = default,
204 | estimate = .pred_class
205 | )
206 | conf_mat_boost_tree
207 |
208 | ### Visualize the confusion matrix ----
209 | conf_mat_boost_tree %>% autoplot(type = "heatmap")
210 | conf_mat_boost_tree %>% autoplot(type = "mosaic")
211 |
212 | ### Visualize the ROC curve ----
213 | credit_test_with_pred_boost_tree %>%
214 | roc_curve(
215 | truth = default,
216 | estimate = .pred_no
217 | ) %>%
218 | autoplot()
219 |
220 | ### Calculate the ROC AUC (area under the curve) ----
221 | credit_roc_auc_boost_tree <- credit_test_with_pred_boost_tree %>%
222 | roc_auc(
223 | truth = default,
224 | estimate = .pred_no
225 | )
226 | credit_roc_auc_boost_tree
227 |
228 | ### Put together other model metrics ----
229 | # Such as accuracy, Matthews correlation coefficient (mcc) and others...
230 | classification_metrics_boost_tree <- conf_mat(
231 | credit_test_with_pred_boost_tree,
232 | truth = default,
233 | estimate = .pred_class
234 | ) %>%
235 | summary()
236 | classification_metrics_boost_tree
237 |
238 |
239 | ## 7. Creating a function to help evaluate the model further ----
240 |
241 | # The assumption here is that you have already gone through steps 1. to 2.
242 | # What we're potentially tuning here are the arguments .tree_depth and .min_n
243 | # for decision_tree, and .trees and .min_n for boost_tree.
244 |
245 | classify_with_c5_trees <- function(
246 | .model = c("decision_tree", "boost_tree"),
247 | .mode = "classification",
248 | .engine = "C5.0",
249 | .tree_depth = NULL, # for decision_tree
250 | .trees = NULL, # for boost_tree
251 | .min_n = 1 # for both
252 | ) {
253 |
254 | # Create the recipe
255 | recipe_obj <- recipe(
256 | default ~ .,
257 | data = credit_tbl
258 | ) %>%
259 | step_string2factor(all_nominal())
260 |
261 | credit_factorized_tbl <- recipe_obj %>%
262 | prep() %>%
263 | bake(new_data = NULL)
264 |
265 | # Create training and test data (randomly)
266 | RNGversion("3.5.2")
267 | set.seed(123)
268 |
269 | credit_split <- initial_split(
270 | credit_factorized_tbl,
271 | prop = 0.9
272 | )
273 | credit_train <- training(credit_split)
274 | credit_test <- testing(credit_split)
275 |
276 | # Model specification
277 | model <- .model
278 |
279 | if (model == "decision_tree") {
280 |
281 | model_spec <- decision_tree(
282 | mode = .mode,
283 | engine = .engine,
284 | tree_depth = .tree_depth,
285 | min_n = .min_n
286 | ) %>%
287 | translate()
288 |
289 | } else if (model == "boost_tree") {
290 |
291 | model_spec <- boost_tree(
292 | mode = .mode,
293 | engine = .engine,
294 | trees = .trees,
295 | min_n = .min_n
296 | ) %>%
297 | translate()
298 |
299 | } else {
300 |
301 | stop("The model needs to be either decision_tree or boost_tree!")
302 |
303 | }
304 |
305 | # Fit the model
306 | model_fit <- fit(
307 | model_spec,
308 | default ~ .,
309 | credit_train
310 | )
311 |
312 | # Add the predictions to the test tibble
313 | credit_test_with_pred_tbl <- augment(model_fit, credit_test)
314 | credit_test_with_pred_tbl
315 |
316 | # Create a confusion matrix
317 | conf_mat <- conf_mat(
318 | data = credit_test_with_pred_tbl,
319 | truth = default,
320 | estimate = .pred_class
321 | )
322 |
323 | conf_mat %>% autoplot(type = "heatmap")
324 |
325 | }
326 |
327 | ### Test the function ----
328 | classify_with_c5_trees(
329 | .model = "decision_tree",
330 | .mode = "classification",
331 | .engine = "C5.0",
332 | .tree_depth = NULL, # for decision_tree
333 | .trees = NULL, # for boost_tree
334 |   .min_n = 1          # for both; NULL produces an error, so a value of 1 or more is advised
335 | )
336 |
337 |
338 | ## 8. Cross-validation ----
339 | # You might want to restart R (Ctrl + Shift + F10) at this point so you have a
340 | # clean slate
341 |
342 | ### Load libraries ----
343 | library(conflicted)
344 | library(tidyverse)
345 | library(tidymodels)
346 |
347 | ### Load the data ----
348 | credit_tbl <- read_csv("machine_learning_with_r_3rd_ed/decision_trees_and_rules/data/credit.csv")
349 |
350 | ### Create the train-test split ----
351 | RNGversion("3.5.2")
352 | set.seed(123)
353 |
354 | credit_split <- initial_split(
355 | credit_tbl,
356 | prop = 0.9
357 | )
358 | credit_train <- training(credit_split)
359 | credit_test <- testing(credit_split)
360 |
361 | ### Create the cross-validation folds ----
362 | set.seed(345)
363 | folds_train <- vfold_cv(credit_train, v = 10)
364 | folds_train
365 |
366 | ### Create recipe, model specification and control ----
367 | recipe_train <- recipe(
368 | default ~ .,
369 | data = credit_train
370 | ) %>%
371 | step_string2factor(all_nominal())
372 |
373 | model_spec <- decision_tree(
374 | mode = "classification",
375 | engine = "C5.0",
376 | cost_complexity = NULL,
377 | tree_depth = NULL,
378 | min_n = NULL
379 | ) %>%
380 | translate()
381 |
382 | control <- control_resamples(save_pred = TRUE)
383 |
384 | ### Fit the samples ----
385 | spline_res_train <- fit_resamples(
386 | object = model_spec,
387 | preprocessor = recipe_train,
388 | resamples = folds_train,
389 | control = control
390 | )
391 |
392 | ### Look at the summarized model metrics ----
393 | spline_res_train %>%
394 | collect_metrics()
395 |
396 | ### Look at the individual model metrics ----
397 | spline_res_train %>%
398 | collect_metrics(summarize = FALSE) %>%
399 | ggplot(
400 | aes(
401 | x = .estimate,
402 | fill = .metric
403 | )
404 | ) +
405 | geom_histogram()
406 |
--------------------------------------------------------------------------------
/machine_learning_with_r_4th_ed/chapter_07_black-box_methods_neural_networks_and_support_vector_machines.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Lantz, Brett - Machine Learning with R (4th ed.), Chapter 7: Black-Box Methods - Neural Networks and Support Vector Machines"
3 | author: 'Original Code: Brett Lantz | Modifications: Antti Rask'
4 | date: "2023-07-23"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 |
12 | ## Loading Packages
13 |
14 | ```{r}
15 | library(conflicted) # An Alternative Conflict Resolution Strategy
16 | library(corrr) # Correlations in R
17 | library(janitor) # Simple Tools for Examining and Cleaning Dirty Data
18 | library(tidymodels) # Easily Install and Load the 'Tidymodels' Packages
19 | library(tidyverse) # Easily Install and Load the 'Tidyverse'
20 | ```
21 |
22 | ## Part 1: Neural Networks
23 |
24 | ### Exploring and preparing the data
25 |
26 | #### Read in data and examine structure
27 |
28 | ```{r}
29 | concrete_tbl <- read_csv("data/concrete.csv")
30 | ```
31 |
32 | ```{r}
33 | concrete_tbl %>%
34 | glimpse()
35 | ```
36 |
37 | #### Check the minimum and maximum strength
38 |
39 | ```{r}
40 | concrete_tbl %>%
41 | pull(strength) %>%
42 | summary()
43 | ```
44 |
45 | ### Creating the recipe
46 |
47 | #### Apply normalization to the numeric predictors
48 |
49 | ```{r}
50 | recipe_obj <- recipe(
51 | strength ~ .,
52 | data = concrete_tbl
53 | ) %>%
54 | step_range(
55 | all_numeric_predictors(),
56 | min = 0,
57 | max = 1
58 | )
59 |
60 | recipe_obj
61 | ```
62 |
63 | ```{r}
64 | concrete_normalized_tbl <- recipe_obj %>%
65 | prep() %>%
66 | bake(new_data = NULL)
67 |
68 | concrete_normalized_tbl
69 | ```
70 |
71 | #### Create training and test data
72 |
73 | ```{r}
74 | concrete_split <- initial_time_split(
75 | concrete_normalized_tbl,
76 | prop = 773 / 1030
77 | )
78 |
79 | concrete_train <- training(concrete_split)
80 | concrete_test <- testing(concrete_split)
81 | ```
82 |
83 | ### Training a model on the data
84 |
85 | nnet is the engine (needs to be installed if not already):
86 | install.packages("nnet")
87 |
88 | It is used as the engine for {parsnip}'s mlp() function. And since we are predicting strength, we choose regression as the mode.
89 |
90 | #### Create model specification
91 |
92 | ```{r}
93 | set.seed(12345)
94 |
95 | model_spec_nnet <- mlp(
96 | engine = "nnet",
97 | mode = "regression",
98 | hidden_units = 1,
99 | penalty = 0,
100 | dropout = 0,
101 | epochs = 100,
102 | activation = NULL,
103 | learn_rate = NULL
104 | ) %>%
105 | translate()
106 |
107 | model_spec_nnet
108 | ```
109 |
110 | #### Fit the model
111 |
112 | ```{r}
113 | model_fit_nnet <- model_spec_nnet %>%
114 | fit(
115 | strength ~ .,
116 | concrete_train
117 | )
118 |
119 | model_fit_nnet
120 | ```
121 |
122 | #### Take a closer look at the model
123 |
124 | ```{r}
125 | model_fit_nnet$fit %>%
126 | summary()
127 | ```
128 |
129 | #### Make the predictions (you could skip this step)
130 |
131 | ```{r}
132 | concrete_test_pred <- model_fit_nnet %>%
133 | predict(
134 | new_data = concrete_test,
135 | type = "numeric"
136 | )
137 |
138 | concrete_test_pred
139 | ```
140 |
141 | #### Add the predictions to the test tibble
142 |
143 | ```{r}
144 | concrete_test_with_pred <- augment(model_fit_nnet, concrete_test)
145 | concrete_test_with_pred
146 | ```
147 |
148 | #### Metrics
149 |
150 | ```{r}
151 | concrete_test_with_pred %>%
152 | metrics(strength, .pred)
153 | ```
154 |
155 | #### Visualize the network topology
156 |
157 | Currently looking for a method to do that with tidymodels.
158 |
159 | ### Evaluating model performance
160 |
161 | #### Examine the correlation between predicted and actual values
162 |
163 | ```{r}
164 | concrete_test_with_pred %>%
165 | select(.pred, strength) %>%
166 | correlate()
167 | ```
168 |
169 | A simpler alternative:
170 |
171 | ```{r}
172 | cor(
173 | concrete_test_with_pred$.pred,
174 | concrete_test_with_pred$strength
175 | )
176 | ```
177 |
178 | ### Improving model performance with more hidden units and weight decay
179 |
180 | #### Create model specification
181 |
182 | ```{r}
183 | set.seed(12345)
184 | model_spec_nnet_2 <- mlp(
185 | engine = "nnet",
186 | mode = "regression",
187 | hidden_units = 5,
188 | penalty = 0.1,
189 | epochs = 100,
190 | ) %>%
191 | translate()
192 |
193 | model_spec_nnet_2
194 | ```
195 |
196 | ### Fit the model ----
197 |
198 | ```{r}
199 | model_fit_nnet_2 <- model_spec_nnet_2 %>%
200 | fit(
201 | strength ~ .,
202 | concrete_train
203 | )
204 |
205 | model_fit_nnet_2
206 | ```
207 |
208 | #### Take a closer look at the model
209 |
210 | ```{r}
211 | model_fit_nnet_2$fit %>%
212 | summary()
213 | ```
214 |
215 | #### Make the predictions (you could skip this step)
216 |
217 | ```{r}
218 | concrete_test_pred_2 <- model_fit_nnet_2 %>%
219 | predict(
220 | new_data = concrete_test,
221 | type = "numeric"
222 | )
223 |
224 | concrete_test_pred_2
225 | ```
226 |
227 | #### Add the predictions to the test tibble
228 |
229 | ```{r}
230 | concrete_test_with_pred_2 <- augment(model_fit_nnet_2, concrete_test)
231 | concrete_test_with_pred_2
232 | ```
233 |
234 | #### Metrics
235 |
236 | ```{r}
237 | concrete_test_with_pred_2 %>%
238 | metrics(strength, .pred)
239 | ```
240 |
241 | #### Examine the correlation between predicted and actual values
242 |
243 | ```{r}
244 | concrete_test_with_pred_2 %>%
245 | select(.pred, strength) %>%
246 | correlate()
247 | ```
248 |
249 | A simpler alternative:
250 |
251 | ```{r}
252 | cor(
253 | concrete_test_with_pred_2$.pred,
254 | concrete_test_with_pred_2$strength
255 | )
256 | ```
257 |
258 | ## Part 2: Support Vector Machines
259 |
260 | ### Exploring and preparing the data
261 |
262 | #### Read in data and examine structure
263 |
264 | ```{r}
265 | letters_tbl <- read_csv("data/letterdata.csv") %>%
266 | mutate(across(where(is.character), as.factor))
267 | ```
268 |
269 | ```{r}
270 | letters_tbl %>%
271 | glimpse()
272 | ```
273 |
274 | ### Creating the recipe
275 |
276 | #### Apply normalization to entire data frame
277 |
278 | ```{r}
279 | recipe_obj <- recipe(
280 | letter ~ .,
281 | data = letters_tbl
282 | )
283 |
284 | recipe_obj
285 | ```
286 |
287 | ```{r}
288 | letters_baked_tbl <- recipe_obj %>%
289 | prep() %>%
290 | bake(new_data = NULL)
291 |
292 | letters_baked_tbl
293 | ```
294 |
295 | #### Create training and test data
296 |
297 | ```{r}
298 | letters_split <- initial_time_split(
299 | letters_baked_tbl,
300 | prop = 16000 / 20000
301 | )
302 | letters_train <- training(letters_split)
303 | letters_test <- testing(letters_split)
304 | ```
305 |
306 | ### Training a model on the data
307 |
308 | #### Create model specification
309 |
310 | ```{r}
311 | model_spec_kernlab <- svm_linear(
312 | engine = "kernlab",
313 | mode = "classification",
314 | cost = NULL,
315 | margin = NULL
316 | ) %>%
317 | translate()
318 |
319 | model_spec_kernlab
320 | ```
321 |
322 | #### Fit the model
323 |
324 | ```{r}
325 | model_fit_kernlab <- model_spec_kernlab %>%
326 | fit(
327 | letter ~ .,
328 | letters_train
329 | )
330 |
331 | model_fit_kernlab
332 | ```
333 |
334 | #### Make the predictions (you could skip this step)
335 |
336 | ```{r}
337 | letters_test_pred <- model_fit_kernlab %>%
338 | predict(new_data = letters_test)
339 |
340 | letters_test_pred
341 | ```
342 |
343 | #### Add the predictions to the test tibble
344 |
345 | ```{r}
346 | letters_test_with_pred <- augment(model_fit_kernlab, letters_test)
347 | letters_test_with_pred
348 | ```
349 |
350 | ### Evaluating model performance
351 |
352 | #### Predictions on testing dataset
353 |
354 | ```{r}
355 | letters_test_with_pred %>%
356 | tabyl(letter, .pred_class)
357 | ```
358 |
359 | #### Look only at agreement vs. non-agreement
360 |
361 | Construct a vector of TRUE/FALSE indicating correct/incorrect predictions
362 |
363 | ```{r}
364 | letters_test_with_pred %>%
365 | mutate(
366 | agreement = case_when(
367 | letter == .pred_class ~ TRUE,
368 | .default = FALSE
369 | )
370 | ) %>%
371 | tabyl(agreement) %>%
372 | adorn_pct_formatting(digits = 1)
373 | ```
374 |
375 | ### Improving model performance
376 |
377 | #### Change to a RBF kernel
378 |
379 | ```{r}
380 | model_spec_rbf <- svm_rbf(
381 | engine = "kernlab",
382 | mode = "classification",
383 | cost = NULL,
384 | margin = NULL,
385 | rbf_sigma = NULL
386 | ) %>%
387 | translate()
388 |
389 | model_spec_rbf
390 | ```
391 |
392 | #### Fit the model
393 |
394 | ```{r}
395 | model_fit_rbf <- fit(
396 | model_spec_rbf,
397 | letter ~ .,
398 | letters_train
399 | )
400 |
401 | model_fit_rbf
402 | ```
403 |
404 | #### Make the predictions (you could skip this step)
405 |
406 | ```{r}
407 | letters_test_pred_rbf <- model_fit_rbf %>%
408 | predict(new_data = letters_test)
409 |
410 | letters_test_pred_rbf
411 | ```
412 |
413 | #### Add the predictions to the test tibble
414 |
415 | ```{r}
416 | letters_test_with_pred_rbf <- augment(model_fit_rbf, letters_test)
417 | letters_test_with_pred_rbf
418 | ```
419 |
420 | #### Predictions on testing dataset
421 |
422 | ```{r}
423 | letters_test_with_pred_rbf %>%
424 | tabyl(letter, .pred_class)
425 | ```
426 |
427 | #### Look only at agreement vs. non-agreement
428 |
429 | Construct a vector of TRUE/FALSE indicating correct/incorrect predictions
430 |
431 | ```{r}
432 | letters_test_with_pred_rbf %>%
433 | mutate(
434 | agreement = case_when(
435 | letter == .pred_class ~ TRUE,
436 | .default = FALSE
437 | )
438 | ) %>%
439 | tabyl(agreement) %>%
440 | adorn_pct_formatting(digits = 1)
441 | ```
442 |
443 | #### Test various values of the cost parameter
444 |
445 | ```{r}
446 | cost_values <- c(1, seq(from = 5, to = 40, by = 5))
447 | ```
448 |
449 | ```{r}
450 | accuracy_values <- map_dbl(cost_values, function(x) {
451 |
452 | model_spec_rbf <- svm_rbf(
453 | engine = "kernlab",
454 | mode = "classification",
455 | cost = {{ x }},
456 | margin = NULL,
457 | rbf_sigma = NULL
458 | ) %>%
459 | translate()
460 |
461 | model_fit_rbf <- fit(
462 | model_spec_rbf,
463 | letter ~ .,
464 | letters_train
465 | )
466 |
467 | letters_test_pred_rbf <- model_fit_rbf %>%
468 | predict(new_data = letters_test) %>%
469 | as_vector()
470 |
471 | agree <- if_else(letters_test_pred_rbf == letters_test %>% pull(letter), 1, 0)
472 |
473 | accuracy <- sum(agree) / nrow(letters_test)
474 |
475 | return(accuracy)
476 |
477 | })
478 | ```
479 |
480 | #### Bind together the cost parameter and accuracy values
481 |
482 | ```{r}
483 | cost_vs_accuracy_tbl <- bind_cols(
484 | cost_values,
485 | accuracy_values,
486 | ) %>%
487 | rename(
488 | cost_values = ...1,
489 | accuracy_values = ...2
490 | )
491 |
492 | cost_vs_accuracy_tbl
493 | ```
494 |
495 | #### Visualize to find the optimal cost parameter value
496 |
497 | ```{r}
498 | cost_vs_accuracy_tbl %>%
499 | ggplot(aes(cost_values, accuracy_values)) +
500 | geom_line() +
501 | geom_point() +
502 | labs(
503 | x = "Cost Parameter Value",
504 | y = "Accuracy"
505 | ) +
506 | theme_classic()
507 | ```
508 |
509 | #### Make sure you have the right optimal cost value for the best accuracy
510 |
511 | ```{r}
512 | cost_vs_accuracy_tbl %>%
513 | slice_max(accuracy_values)
514 | ```
515 |
516 | #### Pull the first cost_value that has the max accuracy_value
517 |
518 | ```{r}
519 | .max_accuracy <- cost_vs_accuracy_tbl %>%
520 | slice_max(accuracy_values) %>%
521 | slice(1) %>%
522 | pull(cost_values)
523 | ```
524 |
525 | ### Fitting the model with the optimal cost value (that was just pulled)
526 |
527 | #### Give model specification
528 |
529 | ```{r}
530 | model_spec_best <- svm_rbf(
531 | engine = "kernlab",
532 | mode = "classification",
533 | cost = {{ .max_accuracy }},
534 | margin = NULL,
535 | rbf_sigma = NULL
536 | ) %>%
537 | translate()
538 | ```
539 |
540 | #### Fit the model
541 |
542 | ```{r}
543 | model_fit_best <- fit(
544 | model_spec_best,
545 | letter ~ .,
546 | letters_train
547 | )
548 | ```
549 |
550 | #### Add the predictions to the test tibble
551 |
552 | ```{r}
553 | letters_test_with_pred_best <- augment(model_fit_best, letters_test)
554 | ```
555 |
556 | #### Predictions on testing dataset
557 |
558 | ```{r}
559 | letters_test_with_pred_best %>%
560 | tabyl(letter, .pred_class)
561 | ```
562 |
563 | #### Look only at agreement vs. non-agreement
564 |
565 | Construct a vector of TRUE/FALSE indicating correct/incorrect predictions
566 |
567 | ```{r}
568 | letters_test_with_pred_best %>%
569 | mutate(
570 | agreement = case_when(
571 | letter == .pred_class ~ TRUE,
572 | .default = FALSE
573 | )
574 | ) %>%
575 | tabyl(agreement) %>%
576 | adorn_pct_formatting(digits = 1)
577 | ```
578 |
--------------------------------------------------------------------------------
/machine_learning_with_r_3rd_ed/naive_bayes/filtering-mobile-phone-spam-with-naive-bayes.r:
--------------------------------------------------------------------------------
1 | # Filtering Mobile Phone Spam with the Naive Bayes Algorithm ----
2 |
3 | # Inspired by Brett Lantz's Machine Learning with R,
4 | # Chapter 4: Probabilistic Learning - Classification Using Naive Bayes and
5 | # Chapter 10: Evaluating Model Performance
6 | #
7 | # The original code is made with a lot of base R, {e1071} and {gmodels}. I
8 | # wanted to see how one could recreate it using mainly {textrecipes},
9 | # {tidymodels}, {tidytext} and {tidyverse}.
10 | #
11 | # You can find the original code and the slightly modified dataset(s) here:
12 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter04
13 | # https://github.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/tree/master/Chapter10
14 |
15 | ## 1. Loading libraries (in the order they get used) ----
16 | library(conflicted)
17 | library(tidyverse)
18 | conflict_prefer("filter", "dplyr", "stats")
19 | library(tidytext)
20 | library(SnowballC) # for stemming
21 | library(wordcloud2)
22 | library(textrecipes)
23 | library(tidymodels)
24 | library(discrim) # for naive_Bayes()
25 | library(janitor)
26 | library(crosstable)
27 |
28 |
29 | ## 2. Exploring and preparing the data ----
30 |
31 | ### Read the sms data into the sms tibble, convert spam/ham to factor ----
32 | sms_tbl <- read_csv(
33 | "machine_learning_with_r_3rd_ed/naive_bayes/data/sms_spam.csv",
34 | col_types = "fc"
35 | ) %>%
36 | select(.type = type, everything())
37 |
38 | ### Examine the structure of the sms data ----
39 | glimpse(sms_tbl)
40 |
41 | ### Examine the distribution of spam/ham ----
42 | sms_tbl %>%
43 | count(.type) %>%
44 | mutate(pct = (n / sum(n) * 100))
45 |
46 | ### Build a corpus using the {tidytext} package instead of {tm} ----
47 |
48 | # This part inspired by a blog post by Julia Silge:
49 | # https://www.tidyverse.org/blog/2020/11/tidymodels-sparse-support/
50 |
51 | ### Add a row number ----
52 | sms_row_numbers_tbl <- sms_tbl %>%
53 | mutate(line = row_number())
54 | sms_row_numbers_tbl
55 |
56 | ### Manual preprocessing (just to see what it's all about) ----
57 | tidy_sms_tbl <- sms_row_numbers_tbl %>%
58 | unnest_tokens(word, text) %>%
59 | count(line, word) %>%
60 | bind_tf_idf(word, line, n)
61 | tidy_sms_tbl
62 |
63 | wide_sms_tbl <- tidy_sms_tbl %>%
64 | dplyr::select(line, word, tf_idf) %>%
65 | pivot_wider(
66 | names_from = word,
67 | names_prefix = "word_",
68 | values_from = tf_idf,
69 | values_fill = 0
70 | )
71 | wide_sms_tbl
72 |
73 |
74 | ## 3. Visualizing text data - word clouds ----
75 |
76 | # This part inspired by Julia Silge & David Robinson's book Text Mining with R:
77 | # A Tidy Approach: https://www.tidytextmining.com/
78 |
79 | ### Count word frequencies ----
80 | frequency_tbl <- sms_row_numbers_tbl %>%
81 |
82 | # One word per one row
83 | unnest_tokens(word, text) %>%
84 |
85 | # Stemming
86 | mutate(word = wordStem(word)) %>%
87 |
88 | # Count the words
89 | count(.type, word) %>%
90 |
91 | # Count the proportion of words
92 | with_groups(
93 | .type,
94 | mutate,
95 | proportion = n / sum(n)
96 | ) %>%
97 |
98 | # Reorder the columns
99 | dplyr::select(-n) %>%
100 | pivot_wider(names_from = .type, values_from = proportion) %>%
101 | pivot_longer(
102 | cols = c("ham", "spam"),
103 | names_to = ".type",
104 | values_to = "freq"
105 | )
106 |
107 | ### Subset the frequency data into two groups, spam and ham ----
108 | spam_tbl <- frequency_tbl %>%
109 | filter(.type == "spam") %>%
110 | dplyr::select(-.type) %>%
111 | drop_na()
112 |
113 | ham_tbl <- frequency_tbl %>%
114 | filter(.type == "ham") %>%
115 | dplyr::select(-.type) %>%
116 | drop_na()
117 |
118 | ### Word cloud ----
119 |
120 | # This part inspired by a blog post by Céline Van den Rul:
121 | # https://towardsdatascience.com/create-a-word-cloud-with-r-bde3e7422e8a
122 |
123 | # One for ham...
124 | wordcloud2(
125 | data = ham_tbl,
126 | size = 2,
127 | color = "random-dark"
128 | )
129 |
130 | # ...and another for spam
131 | wordcloud2(
132 | data = spam_tbl,
133 | size = 2,
134 | color = "random-dark"
135 | )
136 |
137 |
138 | ## 4. Creating the recipe and splitting the data ----
139 |
140 | ### Create the recipe ----
141 | text_recipe_obj <- recipe(
142 | .type ~ text,
143 | data = sms_row_numbers_tbl
144 | ) %>%
145 | step_tokenize(text) %>%
146 | step_stopwords(text) %>%
147 | step_tokenfilter(text, max_tokens = 1e3) %>%
148 | step_tfidf(text)
149 | text_recipe_obj
150 |
151 | # Bake it
152 | sms_baked_tbl <- text_recipe_obj %>%
153 | prep() %>%
154 | bake(new_data = NULL)
155 |
156 | # Simplify the tf-idf to yes/no
157 | sms_baked_longer_tbl <- sms_baked_tbl %>%
158 | mutate(across(where(is.numeric),
159 | ~ case_when(
160 | . > 0 ~ "Yes",
161 | TRUE ~ "No"
162 | ))) %>%
163 | # Rename the columns back to words
164 | rename_with(
165 | ~ tolower(gsub("tfidf_text_", "", .x)),
166 | .cols = starts_with("tfidf_text_")
167 | )
168 | sms_baked_longer_tbl
169 |
170 | ### Create training and test data ----
171 | # Not randomly, because the messages weren't in any particular order
172 | sms_split <- initial_time_split(
173 | sms_baked_longer_tbl,
174 | prop = 0.75
175 | )
176 | sms_train <- training(sms_split)
177 | sms_test <- testing(sms_split)
178 |
179 |
180 | ## 5. Training a model on the data ----
181 |
182 | # naivebayes is the engine (needs to be installed if not already):
183 | # install.packages("discrim") AND
184 | # install.packages("naivebayes")
185 |
186 | # It is used as the engine for {parsnip}'s naive_Bayes() function. And since
187 | # we are classifying, that is the mode we choose.
188 |
189 | # The simple reason is I couldn't get klaR (the other engine) to work. If you
190 | # know how, please comment on GitHub. It would be great to test what the
191 | # differences between the two engines are.
192 |
193 | ### Model specification ----
194 | model_spec <- naive_Bayes(
195 | engine = "naivebayes",
196 | mode = "classification",
197 | smoothness = NULL,
198 | Laplace = NULL
199 | ) %>%
200 | translate()
201 | model_spec
202 |
203 | ### Fit the model ----
204 | model_fit <- fit(
205 | model_spec,
206 | .type ~ .,
207 | sms_train
208 | )
209 | model_fit
210 |
211 | ### Make the predictions (you could skip this step) ----
212 | sms_test_pred <- predict(
213 | object = model_fit,
214 | new_data = sms_test,
215 | type = "class"
216 | )
217 | sms_test_pred
218 |
219 | ### Add the predictions to the test tibble ----
220 | sms_test_with_pred_tbl <- augment(model_fit, sms_test)
221 | sms_test_with_pred_tbl
222 |
223 |
224 | ## 6. Evaluating model performance ----
225 |
226 | ### Create a confusion matrix ----
227 | conf_mat <- conf_mat(
228 | data = sms_test_with_pred_tbl,
229 | truth = .type,
230 | estimate = .pred_class
231 | )
232 | conf_mat
233 |
234 | ### Visualize the confusion matrix ----
235 | conf_mat %>% autoplot(type = "heatmap")
236 | conf_mat %>% autoplot(type = "mosaic")
237 |
238 | ### Visualize the ROC curve ----
239 | sms_test_with_pred_tbl %>%
240 | roc_curve(
241 | truth = .type,
242 | estimate = .pred_ham
243 | ) %>%
244 | autoplot()
245 |
246 | ### Calculate the ROC AUC (area under the curve) ----
247 | sms_roc_auc <- sms_test_with_pred_tbl %>%
248 | roc_auc(
249 | truth = .type,
250 | estimate = .pred_ham
251 | )
252 | sms_roc_auc
253 |
254 |
255 | ## 7. Compare to k-NN ----
256 |
257 | ### Read the results ----
258 | sms_results_knn <- read_csv("machine_learning_with_r_3rd_ed/naive_bayes/data/sms_results_knn.csv") %>%
259 | mutate(p_ham = 1 - p_spam)
260 |
261 | ### Naive Bayes vs k-NN - ROC curve ----
262 |
263 | # Naive Bayes
264 | sms_test_with_pred_tbl %>%
265 | roc_curve(
266 | truth = .type,
267 | estimate = .pred_ham
268 | ) %>%
269 | autoplot()
270 |
271 | # k-NN
272 | sms_test_with_pred_tbl %>%
273 | mutate(.knn_p_ham = sms_results_knn$p_ham) %>%
274 | roc_curve(
275 | truth = .type,
276 | estimate = .knn_p_ham
277 | ) %>%
278 | autoplot()
279 |
280 | ### Naive Bayes vs k-NN - AUC ----
281 |
282 | # Naive Bayes
283 | sms_test_with_pred_tbl %>%
284 | roc_auc(
285 | truth = .type,
286 | estimate = .pred_ham
287 | )
288 |
289 | # k-NN
290 | sms_test_with_pred_tbl %>%
291 | mutate(.knn_p_ham = sms_results_knn$p_ham) %>%
292 | roc_auc(
293 | truth = .type,
294 | estimate = .knn_p_ham
295 | )
296 |
297 | ### Put together other model metrics ----
298 | # Such as accuracy, Matthews correlation coefficient (mcc) and others...
299 | classification_metrics <- conf_mat(
300 | sms_test_with_pred_tbl,
301 | truth = .type,
302 | estimate = .pred_class
303 | ) %>%
304 | summary()
305 | classification_metrics
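
# An alternative way to get just the metrics you care about: build a metric set
# with yardstick's metric_set() and apply it to the prediction tibble directly.
chosen_metrics <- metric_set(accuracy, kap, mcc, f_meas)
sms_test_with_pred_tbl %>%
    chosen_metrics(truth = .type, estimate = .pred_class)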
306 |
307 |
308 | ## 8. Improving model performance ----
309 |
310 | # Basically, the same as before, but with Laplace = 1
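
# As a quick illustration of what the Laplace estimator does (my own example,
# not from the book's code): a word that never appeared in spam during training
# no longer forces the spam probability of a new message containing it to zero.
# With 20 spam messages and a two-level (Yes/No) word indicator:
(0 + 1) / (20 + 1 * 2)    # Laplace = 1: ~0.045 instead of exactly 0
(0 + 0) / (20 + 0 * 2)    # Laplace = 0: the zero count wipes out the estimate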
311 |
312 | ### Model specification ----
313 | model_spec <- naive_Bayes(
314 | engine = "naivebayes",
315 | mode = "classification",
316 | smoothness = NULL,
317 | Laplace = 1
318 | ) %>%
319 | translate()
320 | model_spec
321 |
322 | ### Fit the model ----
323 | model_fit <- fit(
324 | model_spec,
325 | .type ~ .,
326 | sms_train
327 | )
328 | model_fit
329 |
330 | ### Make the predictions (optional, since augment() below does this as well) ----
331 | sms_test_pred <- predict(
332 | object = model_fit,
333 | new_data = sms_test,
334 | type = "class"
335 | )
336 | sms_test_pred
337 |
338 | ### Add the predictions to the test tibble ----
339 | sms_test_with_pred_tbl <- augment(model_fit, sms_test)
340 | sms_test_with_pred_tbl
341 |
342 |
343 | ## 9. Evaluating model performance ----
344 |
345 | ### Create a confusion matrix ----
346 | conf_mat <- conf_mat(
347 | data = sms_test_with_pred_tbl,
348 | truth = .type,
349 | estimate = .pred_class
350 | )
351 | conf_mat
352 |
353 | ### Visualize the confusion matrix ----
354 | conf_mat %>% autoplot(type = "heatmap")
355 | conf_mat %>% autoplot(type = "mosaic")
356 |
357 | ### Visualize the ROC curve ----
358 | sms_test_with_pred_tbl %>%
359 | roc_curve(
360 | truth = .type,
361 | estimate = .pred_ham
362 | ) %>%
363 | autoplot()
364 |
365 | ### Calculate the ROC AUC (area under the curve) ----
366 | sms_roc_auc <- sms_test_with_pred_tbl %>%
367 | roc_auc(
368 | truth = .type,
369 | estimate = .pred_ham
370 | )
371 | sms_roc_auc
372 |
373 | ### Put together other model metrics ----
374 | # summary() on the confusion matrix returns accuracy, kappa, sensitivity, specificity, MCC, F1, and more
375 | classification_metrics <- conf_mat(
376 | sms_test_with_pred_tbl,
377 | truth = .type,
378 | estimate = .pred_class
379 | ) %>%
380 | summary()
381 | classification_metrics
382 |
383 |
384 | ## 10. Creating a function to help evaluate the model further ----
385 |
386 | # The assumption here is that you have already gone through steps 1. to 4.
387 | # What we're potentially tuning here are the arguments .smoothness and .laplace.
388 | # Check out the book and/or the documentation for further info about them!
389 |
390 | classify_with_naive_bayes <- function(
391 | .smoothness = NULL,
392 | .laplace = NULL
393 | ) {
394 |
395 | # Model specification
396 | model_spec <- naive_Bayes(
397 | engine = "naivebayes",
398 | mode = "classification",
399 | smoothness = .smoothness,
400 | Laplace = .laplace
401 | ) %>%
402 | translate()
403 | model_spec
404 |
405 | # Fit the model
406 | model_fit <- fit(
407 | model_spec,
408 | .type ~ .,
409 | sms_train
410 | )
411 | model_fit
412 |
413 | # Add the predictions to the test tibble
414 | sms_test_with_pred_tbl <- augment(model_fit, sms_test)
415 | sms_test_with_pred_tbl
416 |
417 | # Create a confusion matrix
418 | conf_mat <- conf_mat(
419 | data = sms_test_with_pred_tbl,
420 | truth = .type,
421 | estimate = .pred_class
422 | )
423 |
424 | # Return the confusion matrix heatmap (it gets printed when the function is called at the top level)
425 | conf_mat %>% autoplot(type = "heatmap")
426 |
427 | }
428 |
429 | ### Test the function ----
430 | classify_with_naive_bayes(
431 | .smoothness = 1,
432 | .laplace = 1
433 | )
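
# A usage sketch for trying a few hyperparameter combinations by hand (a proper
# grid search would use {tune} with resamples instead). Assumes {tidyr} and
# {purrr} are available.
purrr::pwalk(
    tidyr::crossing(smoothness = c(0.5, 1, 1.5), laplace = c(0, 1)),
    function(smoothness, laplace) {
        print(classify_with_naive_bayes(.smoothness = smoothness, .laplace = laplace))
    }
)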
434 |
435 |
436 | ## 11. Understanding the classifier's predictions ----
437 |
438 | ### Obtain the predicted probabilities (optional, augment() already added them above) ----
439 | sms_test_prob <- predict(
440 | object = model_fit,
441 | new_data = sms_test,
442 | type = "prob"
443 | )
444 | sms_test_prob
445 |
446 | ### Look at the predicted probabilities ----
447 | sms_results <- sms_test_with_pred_tbl %>%
448 | select(
449 | actual_type = .type,
450 | predicted_type = .pred_class,
451 | prob_spam = .pred_spam,
452 | prob_ham = .pred_ham
453 | ) %>%
454 | mutate(
455 | prob_spam = prob_spam %>% round(2),
456 | prob_ham = prob_ham %>% round(2)
457 | )
458 | sms_results
459 |
460 | ### Test cases where the model is less confident ----
461 | sms_results %>%
462 | filter(between(prob_spam, 0.40, 0.60))
463 |
464 | ### Test cases where the model was wrong ----
465 | sms_results %>%
466 | filter(actual_type != predicted_type)
467 |
468 | ### Cross-tabulate with {janitor}'s tabyl() ----
469 | sms_results %>%
470 | tabyl(actual_type, predicted_type)
471 |
472 | ### Using {crosstable} ----
473 | sms_results %>%
474 | crosstable(
475 | cols = actual_type,
476 | by = predicted_type,
477 | label = FALSE,
478 | total = "both",
479 | percent_pattern = "{n} ({p_row}/{p_col})",
480 | percent_digits = 1
481 | ) %>%
482 | as_flextable(compact = TRUE)
483 |
--------------------------------------------------------------------------------