├── .gitattributes ├── 1-select-file.jpg ├── 2-download.jpg ├── 9781484223338.jpg ├── Code ├── Ch02_Ramasubramanian.R ├── Ch03_Singh.R ├── Ch04_Ramasubramanian.R ├── Ch05_Singh.R ├── Ch06_Clustering.R ├── Ch06_DecisionTree.R ├── Ch06_Naive Bayes.R ├── Ch06_NeuralNet.R ├── Ch06_OnlineML.R ├── Ch06_Regression.R ├── Ch06_RuleMining.R ├── Ch06_SVM.R ├── Ch06_TextMining.R ├── Ch07_Singh.R ├── Ch08_Singh.R ├── Ch09_Ramasubramanian.R ├── actual_pred_plot.R └── concordance.R ├── Color Artwork ├── Chapter 1 │ └── Image │ │ ├── Image 1.1 Architecture of IBM's DeepQA.png │ │ ├── Image 1.2 Venn diagram definition of Data Science.png │ │ ├── Image 1.3 Sample space of three-coin tossing experiment.png │ │ ├── Image 1.5 Probabiity distribution with discrete and continuous Random Variable.png │ │ ├── Image 1.6 Confidence Interval.png │ │ ├── Image 1.7 z score and confidence level.png │ │ ├── Image 1.8 Machine Leaning Process Flow.png │ │ ├── Table 1.1 Permutation and Combinatio.png │ │ ├── Table 1.2 Facebook Nearby example - Two-way contingency table.png │ │ └── Table 1.3 Data structures in R.png ├── Chapter 2 │ └── Images │ │ ├── Image 2.1 Number of atheletes in each type.png │ │ ├── Image 2.2 Boxplot showing variation of finish times for each type of runner.png │ │ ├── Image 2.3 Distribution showing Symmetrical Vs Negative and Positive skewness.png │ │ ├── Image 2.4 Distribution of finish time of atheletes in marathon data.png │ │ ├── Image 2.5 Showing kurtosis plots with simulated data.png │ │ ├── Image 2.6 Showing kurtosis plot of finish time in marathon data.png │ │ └── Image 2.7 Number of domestic transactions by Male and Female.png ├── Chapter 3 │ └── Images │ │ ├── Image 3.1 Objectives of sampling.jpg │ │ ├── Image 3.10 Popultion vs. Sample(WithReplacement) Distribution.png │ │ ├── Image 3.11 Homogenuous Population and Systematic Sample Distribution.png │ │ ├── Image 3.12 Population and Stratified Sample Distribution.png │ │ ├── Image 3.13 Input Data segmented by set of 5 classes by Number of International Transactions.png │ │ ├── Image 3.14 Clusters formed by K-means ( star sign present centeroid of cluster).png │ │ ├── Image 3.15 Cluster Population and Cluster Random Sample Distribution.png │ │ ├── Image 3.16 Histogram and QQ Plot of estimated coefficient.png │ │ ├── Image 3.17 Histogram of Parameter Estimate from Bootstrap.png │ │ ├── Image 3.18 Histrogram with normal density function.png │ │ ├── Image 3.19 Beta Distrbution Plot.png │ │ ├── Image 3.2 Probability vs Non-Probability Sampling.png │ │ ├── Image 3.20 Sampling by rejection.png │ │ ├── Image 3.3 Simulation of Coin toss experiment.png │ │ ├── Image 3.4 Sampling Distribution Plots .png │ │ ├── Image 3.5 Distribution of sample means with Normal Density Lines.png │ │ ├── Image 3.6 Histogram of Outstanding Balance.png │ │ ├── Image 3.7 Histogram of Number of International Transactions.png │ │ ├── Image 3.8 Histogram of Number of domestic transactions.png │ │ └── Image 3.9 Popultion vs. 
Sample(Without Replacement) Distribution.png ├── Chapter 4 │ └── Images │ │ ├── Image 4.1 Line chart showing top 10 Countries based on their GDP.png │ │ ├── Image 4.10 Boxplot showing the GDP (in Trillion US $) for top 10 countries.png │ │ ├── Image 4.11 Boxplot showing the Population (in Billion) for top 10 countries.png │ │ ├── Image 4.12 Histogram showing GDP and Population for developed country.png │ │ ├── Image 4.13 Density Plot showing GDP and Population for developed country.png │ │ ├── Image 4.14 Histogram showing GDP and Population for developing country.png │ │ ├── Image 4.15 Density Plot showing GDP and Population for developing country.png │ │ ├── Image 4.16 Pie chart showing percentage share of each sector by consumption segment in India.png │ │ ├── Image 4.17 Pie chart showing percentage share of each sector by consumption segment in China.png │ │ ├── Image 4.18 Plot showing correlation between various world development indicators.png │ │ ├── Image 4.19 Heatmap between region and there various world development indicators.png │ │ ├── Image 4.2 Line chart showing top 10 countries based on % contribution to GDP from Agriculture.png │ │ ├── Image 4.20 Bubble chart showing GDP Per Captita Vs Life Expectency.png │ │ ├── Image 4.21 Bubble chart showing GDP Per Captita Vs Life Expectency for four Countries.png │ │ ├── Image 4.22 Bubble chart showing Fertility rate Vs Life Expectency.png │ │ ├── Image 4.23 Waterfall plot of Footfall at border.png │ │ ├── Image 4.24 Waterfall chart with Net effect.png │ │ ├── Image 4.25 Footfall end count as percentage of total end count.png │ │ ├── Image 4.26 Dendogram with distanceheight upto h=1.png │ │ ├── Image 4.27 Clusters by actual classification of species in Iris data.png │ │ ├── Image 4.28 Word cloud on job descriptions.png │ │ ├── Image 4.29 The Sankey chart for House Sale Data.png │ │ ├── Image 4.3 Line chart showing top 10 countries based on % contribution to GDP from service sector.png │ │ ├── Image 4.30 GDP Growth for eight countries.png │ │ ├── Image 4.31 GDP growth during recession.png │ │ ├── Image 4.32 The cohort plot for Credit Card Active by Year of Issue.png │ │ ├── Image 4.33 A example map pulled using ggplot() – New Delhi India.png │ │ ├── Image 4.34 India map with Robbery Counts in 2010.png │ │ ├── Image 4.4 Line chart showing top 10 countries based on % contribution to GDP from industry.png │ │ ├── Image 4.5 Stacked column chart showing contribution of various sector to the World GDP.png │ │ ├── Image 4.6 Line chart showing top 10 Countries based on their Working Age Ratio.png │ │ ├── Image 4.7 Stacked bar chart showing constituent of different age group as a % of total population.png │ │ ├── Image 4.8 Line chart showing top 10 countries and their annual % population growth.png │ │ └── Image 4.9 Scatterplot showing the relationship Population and GDP for top 10 countries.png ├── Chapter 5 │ └── Images │ │ ├── Image 5.1 Distribution of Loss (including No Default).png │ │ ├── Image 5.2 Distribution of Loss (excluding No Default).png │ │ ├── Image 5.3 Coefficient and Fraction of Deviance Explained by each featurevariable.png │ │ ├── Image 5.4 Missclasification error and log of penalization factor(Lambda).png │ │ ├── Image 5.5 Variance explained by Principles Components.png │ │ └── Image 5.6 Orgthoginality of Principle Component 1 and 2.png ├── Chapter 6 │ ├── Image 6.1 Machine Learning Types.png │ ├── Image 6.10 Deep Learning Algorithms.png │ ├── Image 6.11 Dimensionality Reduction Algorithms.png │ ├── Image 6.12 Ensemble 
Learning.png │ ├── Image 6.13 Text Mining Algorithms.png │ ├── Image 6.14 Scatter Plot between HousePrice and StoreArea.png │ ├── Image 6.15 Scatter Plot of Actual vs. Predicted.png │ ├── Image 6.16 Actual vs Predicted Plot.png │ ├── Image 6.17 Cooks distance for each observation.png │ ├── Image 6.18 Influence Plot.png │ ├── Image 6.19 Distribution of Studentized Residuals.png │ ├── Image 6.2 Regression Algorithms.png │ ├── Image 6.20 Durbin Watson Statistics Bounds.png │ ├── Image 6.21 Auto Correlation Funtion (ACF) plot .png │ ├── Image 6.22 Residuals vs Fitted Plot.png │ ├── Image 6.23 Actual vs Predicted Plot Linear Model.png │ ├── Image 6.24 Actual vs Predicted Plot Quadratic Polynomial Model.png │ ├── Image 6.25 Logit Function.png │ ├── Image 6.26 Purchase Rate and income Classs.png │ ├── Image 6.27 ROC Curve for Train Data.png │ ├── Image 6.28 Actual vs Predicted Plot against MembershipPoints.png │ ├── Image 6.29 Actual vs Predicted Plot against IncomeClass.png │ ├── Image 6.3 Distance Based Algorithms.png │ ├── Image 6.30 Actual vs Predicted Plot against CustomerPropensity.png │ ├── Image 6.31 Gains Charts with AUC.png │ ├── Image 6.32 Lift Chart.png │ ├── Image 6.33 Classification using Support Vector Machine.png │ ├── Image 6.34 Decision Tree with two attributes and class.png │ ├── Image 6.35 Recursive Binary split and its corresponding tree for 2-dimensional feature space.png │ ├── Image 6.36 Gini-Index Function.png │ ├── Image 6.37 Entropy Function.png │ ├── Image 6.38 C5.0 Decision Tree on Purchase Prediction Data set.png │ ├── Image 6.39 CART Model.png │ ├── Image 6.4 Regularization Algorithms.png │ ├── Image 6.40 CHAID Decision Tree.png │ ├── Image 6.41 Scatterplot between StoreArea and LawnArea for each HouseNetWorth group.png │ ├── Image 6.42 Cluster Dendogram.jpg │ ├── Image 6.43 Cluster Plot with LawnArea and Store Area.png │ ├── Image 6.44 Elbow Curve for varying values of k ( number of cluster) on x-axis..png │ ├── Image 6.45 Cluster plot using k-means.png │ ├── Image 6.46 Clustering plot based on EM Algorithm.png │ ├── Image 6.47 Cluster Plot for EM Algorithm.png │ ├── Image 6.48 Plot for Conver Cluster Hulls for EM Algorithm.png │ ├── Image 6.49 Silhouette Plot.jpg │ ├── Image 6.5 Decision Tree Algorithms.png │ ├── Image 6.50 Item frequency plot in Market Basket Data.png │ ├── Image 6.51 Sparcity Visualization in Transactions of Market basket Data.png │ ├── Image 6.52 Illustration of UBCF.png │ ├── Image 6.53 Dsitribution of Ratings.png │ ├── Image 6.54. 
Raw Ratings by Users.png │ ├── Image 6.55 True Positive Ration vs False Positive Ratio.png │ ├── Image 6.56 Neuron Anatomy.png │ ├── Image 6.57 Working of a Perceptron (Mathematically).png │ ├── Image 6.58 NAND gate operator.gif │ ├── Image 6.59 NAND gate perceptron.png │ ├── Image 6.6 Bayesian Algorithms.png │ ├── Image 6.60 Sigmoid Function.png │ ├── Image 6.61 Linear Seperability.png │ ├── Image 6.62 Artificial Network Architecture.png │ ├── Image 6.63 Supervised vs Unsupervised Learning.png │ ├── Image 6.64 Working of Backpropogation method.png │ ├── Image 6.65 One hidden Layer Neural Network.png │ ├── Image 6.66 Attribute importance by garsen method.jpg │ ├── Image 6.66 Attribute importance by olden method.jpg │ ├── Image 6.67 Attribute importance by garson method.png │ ├── Image 6.68 A Multi Layer Deep Neural Network.png │ ├── Image 6.69 A sample Volcanao picture for Image Recognition exercise.png │ ├── Image 6.7 Clustering Algorithms.png │ ├── Image 6.70 Normalized Image.png │ ├── Image 6.71 Part-of-Speech mapping.png │ ├── Image 6.72 Part of Speech frequency.png │ ├── Image 6.73 Word Cloud using Amazon Food Review Dataset.png │ ├── Image 6.74 Langauge detection input.png │ ├── Image 6.75 Language detection output.png │ ├── Image 6.76 Online Machine learning Algorithms.jpg │ ├── Image 6.8 Association Rule Mining.png │ ├── Image 6.9 Artificial Neural Networks.png │ ├── Table 6.1 House Sale Prices.png │ ├── Table 6.2 Purchase Preference.png │ ├── Table 6.3 Breast Cancer Wisconsin.png │ ├── Table 6.4 Market basket Data.png │ └── Table 6.5 Amazon Food Review.png ├── Chapter 7 │ └── Images │ │ ├── Image 7.1 Distribution of House Sale Price.png │ │ ├── Image 7.2 Distribution of Product Choice Options.png │ │ ├── Image 7.3 ECDF Plots for Set_1 and Set_2 .png │ │ ├── Image 7.4 ECDF Plots for Set_1 and Set_2(Manipulated).png │ │ ├── Image 7.5 Actual Vs Predicted Plot.png │ │ ├── Image 7.6 Image Explaining Squared Errors.png │ │ ├── Image 7.7 Two Class Classification Matrix.png │ │ ├── Image 7.8 ROC Curve .png │ │ ├── ch07_classi_06-1.png │ │ ├── ch07_mae_01-1.png │ │ ├── ch07_population_stability_03-1.png │ │ ├── ch07_population_stability_04-1.png │ │ ├── chap7_Data_load_01-1.png │ │ └── chap7_Data_load_02-1.png ├── Chapter 8 │ └── Images │ │ ├── Image 8.1 train() function algorithm in caret package.png │ │ ├── Image 8.10 Voting Ensemple Learning for Classification Problem.gif │ │ ├── Image 8.11 Bagging Ensemble Flow.png │ │ ├── Image 8.12 Boosting Ensemble Flow.png │ │ ├── Image 8.13 Accuracy and kappa of bagged tree.png │ │ ├── Image 8.14 Accuracy across boosting iterations – C5.0.png │ │ ├── Image 8.15 Accuracy across Boosting iterations – GBM.png │ │ ├── Image 8.16 Accuracy across Boosting Ensemble.png │ │ ├── Image 8.17Accuracy and kappa of individual models.png │ │ ├── Image 8.18 Scatter plot to high list correlation among results from stacked models.png │ │ ├── Image 8.19 RMSE in Cost and Sigma space .png │ │ ├── Image 8.2 Performance plot accuracy metrics.png │ │ ├── Image 8.3 Accuracy across cross validated samples.png │ │ ├── Image 8.4 Accuracy across cross validated samples – Automatic Grid Search.png │ │ ├── Image 8.5 Accuracy across cross validated samples and complexity parameters.png │ │ ├── Image 8.6 Accuracy across cross validated sets and randomly selected predictors.png │ │ ├── Image 8.7 Accuracy across cross validated samples and parameter mtry.png │ │ ├── Image 8.8 Bias and Variance Illustration- Bulls Eye Plot.png │ │ └── Image 8.9 Bias vs Variance trade off plot.png 
└── Chapter 9 │ └── Images │ ├── Image 9.1 Google File System.png │ ├── Image 9.11 Data in the HDFS file.png │ ├── Image 9.12 Connecting to Pig using local filesystem.png │ ├── Image 9.13 Load data into A1.png │ ├── Image 9.14 Tokenize each line.png │ ├── Image 9.15 Flatten tokens.png │ ├── Image 9.16 Group Words.png │ ├── Image 9.17 Count and Sort.png │ ├── Image 9.18 Starting Hbase.png │ ├── Image 9.19 Create and put data.png │ ├── Image 9.2 MapReduce Execution Flow.png │ ├── Image 9.20 Scan the data.png │ ├── Image 9.21 Feature definition of Prostatic Cancer data set.png │ ├── Image 9.3 Hadoop Components and tools..png │ ├── Image 9.4 Word Count example using MapReduce.png │ ├── Image 9.5 Hive Create table command.png │ ├── Image 9.6 Hive table in HDFS.png │ ├── Image 9.7 Describe table command.png │ ├── Image 9.8 Generate data and store in local file.png │ └── Image 9.9 Load data into hive table.png ├── Dataset ├── Chapter 2.zip ├── Chapter 3.zip ├── Chapter 4.zip ├── Chapter 5.zip ├── Chapter 6.zip ├── Chapter 7.zip ├── Chapter 8.zip └── Chapter 9.zip ├── LICENSE.txt ├── README.md ├── contributing.md └── errata.md /.gitattributes: -------------------------------------------------------------------------------- 1 | *.zip filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /1-select-file.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/machine-learning-using-r/51627d232360f7199939056aacab3de5699117c7/1-select-file.jpg -------------------------------------------------------------------------------- /2-download.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/machine-learning-using-r/51627d232360f7199939056aacab3de5699117c7/2-download.jpg -------------------------------------------------------------------------------- /9781484223338.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/machine-learning-using-r/51627d232360f7199939056aacab3de5699117c7/9781484223338.jpg -------------------------------------------------------------------------------- /Code/Ch02_Ramasubramanian.R: -------------------------------------------------------------------------------- 1 | ## ---- echo = FALSE------------------------------------------------------- 2 | 3 | ## ------------------------------------------------------------------------ 4 | read.csv("employees.csv", header = TRUE, sep = ",") 5 | 6 | 7 | ## ---- message=FALSE, warning=FALSE--------------------------------------- 8 | library(xlsx) 9 | read.xlsx("employees.xlsx",sheetName = "Sheet1") 10 | 11 | ## ---- eval=FALSE--------------------------------------------------------- 12 | ## 13 | ## 14 | ## 15 | ## Mike 16 | ## 25 17 | ## 18 | ## Two times world champion. Currently, worlds No. 3 19 | ## 20 | ## 6 21 | ## 22 | ## 23 | ## Usain 24 | ## 29 25 | ## 26 | ## Five times world champion. Currently, worlds No. 
1 27 | ## 28 | ## 17 29 | ## 30 | ## 31 | ## 32 |

## ---- message=FALSE, warning=FALSE---------------------------------------

library(XML)
library(plyr)

xml_data <- xmlToList("marathon.xml")

#Excluding "description" from print
ldply(xml_data, function(x) { data.frame(x[!names(x)=="description"]) } )


## ----header, eval=FALSE--------------------------------------------------
##
## Machine Learning with R
##

## ----headings, eval=FALSE------------------------------------------------
## <h1> Header </h1>
## <h2> Headings </h2>
## <h3> Tables </h3>
## <h4> Anchors </h4>
## <h5> Links </h5>

## ----para, eval=FALSE----------------------------------------------------
##
## <p> Paragraph 1 </p>
## <p> Paragraph 2 </p>
##

## ----table, eval=FALSE---------------------------------------------------
## <table>
##  <tr>
##   <td>
##    Define a row
##   </td>
##  </tr>
## </table>
##

## ----anchor, eval=FALSE--------------------------------------------------
##
## <a href="..."> Welcome to Machine Learning with R! </a>
##

## ---- eval=FALSE---------------------------------------------------------
## <!DOCTYPE html>
## <html>
## <body>
##
## <h1> Machine Learning with R </h1>
## <p> Hope you having fun time reading this book !! </p>
## <h2> Chapter 2 </h2>
## <h3> Data Exploration and Preparation </h3>
88 | ## Apress Website Link 89 | ## 90 | ## 91 | ## 92 | 93 | ## ------------------------------------------------------------------------ 94 | library(XML) 95 | url <- "html_example.html" 96 | doc <- htmlParse(url) 97 | xpathSApply(doc, "//a/@href") 98 | 99 | 100 | ## ----json, eval=FALSE---------------------------------------------------- 101 | ## { 102 | ## "data": [ 103 | ## { 104 | ## "id": "A1_B1", 105 | ## "from": { 106 | ## "name": "Jerry", "id": "G1" 107 | ## }, 108 | ## "message": "Hey! Hope you like the book so far", 109 | ## "actions": [ 110 | ## { 111 | ## "name": "Comment", 112 | ## "link": "http://www.facebook.com/A1/posts/B1" 113 | ## }, 114 | ## { 115 | ## "name": "Like", 116 | ## "link": "http://www.facebook.com/A1/posts/B1" 117 | ## } 118 | ## ], 119 | ## "type": "status", 120 | ## "created_time": "2016-08-02T00:24:41+0000", 121 | ## "updated_time": "2016-08-02T00:24:41+0000" 122 | ## }, 123 | ## { 124 | ## "id": "A2_B2", 125 | ## "from": { 126 | ## "name": "Tom", "id": "G2" 127 | ## }, 128 | ## "message": "Yes. Easy to understand book", 129 | ## "actions": [ 130 | ## { 131 | ## "name": "Comment", 132 | ## "link": "http://www.facebook.com/A2/posts/B2" 133 | ## }, 134 | ## { 135 | ## "name": "Like", 136 | ## "link": "http://www.facebook.com/A2/posts/B2" 137 | ## } 138 | ## ], 139 | ## "type": "status", 140 | ## "created_time": "2016-08-03T21:27:44+0000", 141 | ## "updated_time": "2016-08-03T21:27:44+0000" 142 | ## } 143 | ## ] 144 | ## } 145 | 146 | ## ------------------------------------------------------------------------ 147 | library(rjson) 148 | url <- "json_fb.json" 149 | document <- fromJSON(file=url, method='C') 150 | as.data.frame(document)[,1:3] 151 | 152 | ## ------------------------------------------------------------------------ 153 | emp <- read.csv("employees.csv", header = TRUE, sep = ",") 154 | str(emp) 155 | 156 | 157 | ## ------------------------------------------------------------------------ 158 | 159 | #Manually overriding the naming convention 160 | names(emp) <- c('Code','First Name','Last Name', 'Salary(US Dollar)') 161 | 162 | # Look at the data 163 | emp 164 | 165 | # Now lets clean it up using make.names 166 | names(emp) <- make.names(names(emp)) 167 | 168 | # Look at the data 169 | emp 170 | 171 | 172 | ## ------------------------------------------------------------------------ 173 | #Find duplicates 174 | table(emp$Code) 175 | 176 | #Find common names 177 | table(emp$First.Name) 178 | 179 | 180 | ## ---- echo=FALSE--------------------------------------------------------- 181 | emp <- read.csv("employees.csv") 182 | emp_qual <- read.csv("employees_qual.csv") 183 | 184 | ## ------------------------------------------------------------------------ 185 | merge(emp, emp_qual, by = "Code") 186 | 187 | ## ------------------------------------------------------------------------ 188 | merge(emp, emp_qual, by = "Code", all.x = TRUE) 189 | 190 | ## ------------------------------------------------------------------------ 191 | merge(emp, emp_qual, by = "Code", all.y = TRUE) 192 | 193 | ## ------------------------------------------------------------------------ 194 | merge(emp, emp_qual, by = "Code", all = TRUE) 195 | 196 | ## ---- warning=FALSE, message=FALSE--------------------------------------- 197 | library(dplyr) 198 | 199 | ## ------------------------------------------------------------------------ 200 | inner_join(emp, emp_qual, by = "Code") 201 | 202 | ## ------------------------------------------------------------------------ 203 | left_join(emp, 
emp_qual, by = "Code") 204 | 205 | ## ------------------------------------------------------------------------ 206 | right_join(emp, emp_qual, by = "Code") 207 | 208 | ## ------------------------------------------------------------------------ 209 | full_join(emp, emp_qual, by = "Code") 210 | 211 | ## ------------------------------------------------------------------------ 212 | 213 | employees_qual <- read.csv("employees_qual.csv") 214 | 215 | #Inconsistent 216 | employees_qual 217 | 218 | employees_qual$Qual = as.character(employees_qual$Qual) 219 | employees_qual$Qual <- ifelse(employees_qual$Qual %in% c("Phd","phd","PHd"), "PhD", employees_qual$Qual) 220 | 221 | #Corrected 222 | employees_qual 223 | 224 | 225 | ## ------------------------------------------------------------------------ 226 | 227 | emp <- read.csv("employees.csv") 228 | employees_qual <- read.csv("employees_qual.csv") 229 | 230 | #Correcting the inconsistency 231 | employees_qual$Qual = as.character(employees_qual$Qual) 232 | employees_qual$Qual <- ifelse(employees_qual$Qual %in% c("Phd","phd","PHd"), "PhD", employees_qual$Qual) 233 | 234 | #Store the output from right_join in the variables impute_salary 235 | impute_salary <- right_join(emp, employees_qual, by = "Code") 236 | 237 | #Calculate the average salary for each Qualification 238 | ave_age <- ave(impute_salary$Salary.US.Dollar., impute_salary$Qual, 239 | FUN = function(x) mean(x, na.rm = TRUE)) 240 | 241 | #Fill the NAs with the average values 242 | impute_salary$Salary.US.Dollar. <- ifelse(is.na(impute_salary$Salary.US.Dollar.), ave_age, impute_salary$Salary.US.Dollar.) 243 | 244 | impute_salary 245 | 246 | 247 | ## ---- warning=FALSE,message=FALSE---------------------------------------- 248 | 249 | library("lubridate") 250 | date <- as.POSIXct("2016-03-13 09:51:48") 251 | date 252 | with_tz(date, "UTC") 253 | 254 | 255 | ## ------------------------------------------------------------------------ 256 | 257 | dst_time <- ymd_hms("2010-03-14 01:59:59") 258 | dst_time <- force_tz(dst_time, "America/Chicago") 259 | dst_time 260 | 261 | 262 | ## ------------------------------------------------------------------------ 263 | dst_time + dseconds(1) 264 | 265 | 266 | ## ----warning=FALSE, message=FALSE---------------------------------------- 267 | 268 | library(data.table) 269 | WDI_Data <- fread("WDI_Data.csv", header = TRUE, skip = 333555, select = c(3,40,43)) 270 | setnames(WDI_Data, c("Dev_Indicators", "1995","1998")) 271 | WDI_Data <- WDI_Data[c(1,3),] 272 | 273 | 274 | ## ------------------------------------------------------------------------ 275 | WDI_Data[,"Dev_Indicators", with = FALSE] 276 | 277 | 278 | ## ------------------------------------------------------------------------ 279 | WDI_Data[,2:3, with = FALSE] 280 | 281 | 282 | ## ---- warning=FALSE, message=FALSE--------------------------------------- 283 | library(tidyr) 284 | gather(WDI_Data,Year,Value, 2:3) 285 | 286 | 287 | ## ------------------------------------------------------------------------ 288 | marathon <- read.csv("marathon.csv") 289 | summary(marathon) 290 | quantile(marathon$Finish_Time, 0.25) 291 | 292 | 293 | ## ------------------------------------------------------------------------ 294 | 295 | quantile(marathon$Finish_Time, 0.25) 296 | 297 | 298 | ## ------------------------------------------------------------------------ 299 | 300 | quantile(marathon$Finish_Time, 0.5) 301 | 302 | #Another function to calculate median 303 | 304 | median(marathon$Finish_Time) 305 | 306 | 307 | ## 
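# The group-wise imputation pattern used earlier in this file (ave() to compute a
# per-group mean, then ifelse(is.na(...)) to fill the gaps) generalizes to any
# grouped fill. A minimal, self-contained sketch on a toy data frame -- the `toy`
# object below is illustrative only and is not part of the chapter's datasets:
toy <- data.frame(Qual   = c("PhD", "PhD", "MSc", "MSc", "MSc"),
                  Salary = c(100, NA, 60, 64, NA))

#Per-group mean salary, replicated across the rows of each group
group_mean <- ave(toy$Salary, toy$Qual, FUN = function(x) mean(x, na.rm = TRUE))

#Fill only the missing entries with their group's mean
toy$Salary <- ifelse(is.na(toy$Salary), group_mean, toy$Salary)
toy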
------------------------------------------------------------------------ 308 | 309 | quantile(marathon$Finish_Time, 0.75) 310 | 311 | 312 | ## ------------------------------------------------------------------------ 313 | 314 | quantile(marathon$Finish_Time, 0.75, names = FALSE) - quantile(marathon$Finish_Time, 0.25, names = FALSE) 315 | 316 | 317 | ## ------------------------------------------------------------------------ 318 | 319 | mean(marathon$Finish_Time) 320 | 321 | 322 | ## ----fig.width=10, fig.height=5, fig.cap='Frequency plot for Marathon Finish Time'---- 323 | plot(marathon$Type, xlab = "Marathoners Type", ylab = "Number of Marathoners") 324 | 325 | ## ----fig.width=10, fig.height=5, fig.cap='Boxplot of Marathon Finish Time'---- 326 | boxplot(Finish_Time ~ Type,data=marathon, main="Marathon Data", xlab="Type of Marathoner", ylab="Finish Time") 327 | 328 | ## ------------------------------------------------------------------------ 329 | 330 | mean(marathon$Finish_Time) 331 | var(marathon$Finish_Time) 332 | sd(marathon$Finish_Time) 333 | 334 | 335 | ## ------------------------------------------------------------------------ 336 | tapply(marathon$Finish_Time,marathon$Type, mean) 337 | tapply(marathon$Finish_Time,marathon$Type, sd) 338 | 339 | 340 | ## ---- fig.width=10, fig.height=5, fig.cap='Skewness Plots'--------------- 341 | 342 | library("moments") 343 | 344 | par(mfrow=c(1,3), mar=c(5.1,4.1,4.1,1)) 345 | 346 | # Negative skew 347 | hist(rbeta(10000,2,6), main = "Negative Skew" ) 348 | skewness(rbeta(10000,2,6)) 349 | 350 | # Positive skew 351 | hist(rbeta(10000,6,2), main = "Positive Skew") 352 | skewness(rbeta(10000,6,2)) 353 | 354 | # Symmetrical 355 | hist(rbeta(10000,6,6), main = "Symmetrical") 356 | skewness(rbeta(10000,6,6)) 357 | 358 | 359 | ## ---- fig.width=10, fig.height=5, fig.cap='Marathon Finish Time'--------- 360 | 361 | hist(marathon$Finish_Time, main = "Marathon Finish Time") 362 | skewness(marathon$Finish_Time) 363 | 364 | 365 | ## ----fig.width=10, fig.height=5, fig.cap='Kurtosis Plots'---------------- 366 | 367 | #leptokurtic 368 | set.seed(2) 369 | random_numbers <- rnorm(20000,0,0.5) 370 | plot(density(random_numbers), col = "blue", main = "Kurtosis Plots", lwd=2.5, asp = 4) 371 | kurtosis(random_numbers) 372 | 373 | 374 | #platykurtic 375 | set.seed(900) 376 | random_numbers <- rnorm(20000,0,0.6) 377 | lines(density(random_numbers), col = "red", lwd=2.5) 378 | kurtosis(random_numbers) 379 | 380 | 381 | #mesokurtic 382 | set.seed(3000) 383 | random_numbers <- rnorm(20000,0,1) 384 | lines(density(random_numbers), col = "green", lwd=2.5) 385 | kurtosis(random_numbers) 386 | 387 | legend(1,0.7, c("leptokurtic", "platykurtic","mesokurtic" ), 388 | lty=c(1,1), 389 | lwd=c(2.5,2.5),col=c("blue","red","green")) 390 | 391 | 392 | 393 | ## ----fig.width=10, fig.height=5, fig.cap='Marathon Finish Time'---------- 394 | 395 | plot(density(as.numeric(marathon$Finish_Time)), col = "blue", main = "Kurtosis Plots", lwd=2.5, asp = 4) 396 | kurtosis(marathon$Finish_Time) 397 | 398 | 399 | ## ---- warning=FALSE, message=FALSE, cache=TRUE--------------------------- 400 | 401 | library(data.table) 402 | data <- fread("ccFraud.csv",header=T, verbose = FALSE, showProgress = FALSE) 403 | str(data) 404 | 405 | 406 | ## ---- warning=FALSE, cache = TRUE---------------------------------------- 407 | 408 | library(data.table) 409 | US_state <- fread("US_State_Code_Mapping.csv",header=T, showProgress = FALSE) 410 | data<-merge(data, US_state, by = 'state') 411 | 412 | 413 | ## 
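# The skewness() and kurtosis() values reported above come from the 'moments'
# package, which uses the plain central-moment definitions: skewness = m3/m2^1.5
# and (Pearson) kurtosis = m4/m2^2. A short sketch verifying that on simulated
# data -- the vector sim_x below is illustrative only:
library(moments)
set.seed(1)
sim_x <- rnorm(1000)
m2 <- mean((sim_x - mean(sim_x))^2)   #second central moment
m3 <- mean((sim_x - mean(sim_x))^3)   #third central moment
m4 <- mean((sim_x - mean(sim_x))^4)   #fourth central moment
c(manual_skewness = m3 / m2^1.5, moments_skewness = skewness(sim_x))
c(manual_kurtosis = m4 / m2^2,   moments_kurtosis = kurtosis(sim_x))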
------------------------------------------------------------------------ 414 | library(data.table) 415 | Gender_map<-fread("Gender Map.csv",header=T) 416 | data<-merge(data, Gender_map, by = 'gender') 417 | 418 | 419 | ## ------------------------------------------------------------------------ 420 | 421 | library(data.table) 422 | Credit_line<-fread("credit line map.csv",header=T) 423 | data<-merge(data, Credit_line, by = 'creditLine') 424 | 425 | 426 | ## ---- echo=FALSE--------------------------------------------------------- 427 | data$gender<-NULL 428 | data$state<-NULL 429 | data$PostalCode<- NULL 430 | 431 | ## ------------------------------------------------------------------------ 432 | setnames(data,"custID","CustomerID") 433 | setnames(data,"code","Gender") 434 | setnames(data,"numTrans","DomesTransc") 435 | setnames(data,"numIntlTrans","IntTransc") 436 | setnames(data,"fraudRisk","FraudFlag") 437 | setnames(data,"cardholder","NumOfCards") 438 | setnames(data,"balance","OutsBal") 439 | setnames(data,"StateName","State") 440 | 441 | str(data) 442 | 443 | 444 | ## ------------------------------------------------------------------------ 445 | 446 | summary(data[,c("NumOfCards","OutsBal","DomesTransc", 447 | "IntTransc"),with = FALSE]) 448 | 449 | 450 | ## ------------------------------------------------------------------------ 451 | boxplot(I(DomesTransc + IntTransc ) ~ Gender, data = data) 452 | title("Number of Domestic Transaction") 453 | 454 | tapply(I(data$DomesTransc + data$IntTransc),data$Gender, median) 455 | 456 | tapply(I(data$DomesTransc + data$IntTransc),data$Gender, mean) 457 | 458 | 459 | 460 | ## ------------------------------------------------------------------------ 461 | 462 | table(data$CardType,data$FraudFlag) 463 | 464 | 465 | ## ------------------------------------------------------------------------ 466 | 467 | table(data$Gender,data$FraudFlag) 468 | 469 | 470 | -------------------------------------------------------------------------------- /Code/Ch05_Singh.R: -------------------------------------------------------------------------------- 1 | ## ------------------------------------------------------------------------ 2 | library(knitr) 3 | opts_chunk$set(dev="png", 4 | dev.args=list(type="cairo"), 5 | dpi=96) 6 | knitr::opts_chunk$set( 7 | fig.path = "Images/" 8 | ) 9 | 10 | ## ----ch_05_Loading the data, warning=FALSE------------------------------- 11 | setwd("C:/Personal/Machine Learning/Final Artwork and Code/Chapter 5") 12 | 13 | ##Input the data and store in data table 14 | 15 | library(data.table) 16 | 17 | data <-fread ("Dataset/Loan Default Prediction.csv",header=T, verbose = FALSE, showProgress = TRUE) 18 | 19 | dim(data) 20 | 21 | 22 | ## ----ch_05_1_Type_of_Data_01--------------------------------------------- 23 | 24 | #Summary of the data 25 | summary(data$loss) 26 | 27 | 28 | 29 | ## ----ch_05_1_Type_of_Data_02--------------------------------------------- 30 | hist(data$loss, 31 | main="Histogram for Loss Distribution ", 32 | xlab="Loss", 33 | border="blue", 34 | col="red", 35 | las=0, 36 | breaks=100, 37 | prob = TRUE) 38 | 39 | 40 | ## ----ch_05_1_Type_of_Data_03--------------------------------------------- 41 | 42 | #Sub-set the data into NON Loss and Loss ( e.g., loss > 0) 43 | 44 | subset_loss <- subset(data,loss !=0) 45 | 46 | #Distrbution of cases where there is some loss regestered 47 | 48 | hist(subset_loss$loss, 49 | main="Histogram for Loss Distribution (Default cases) ", 50 | xlab="Loss", 51 | border="blue", 52 | col="red", 53 | 
las=0, 54 | breaks=100, 55 | prob = TRUE) 56 | 57 | ## ----ch_05_1_Type_of_Data_04--------------------------------------------- 58 | 59 | #Create the default variable 60 | 61 | data[,default := ifelse(data$loss == 0, 0,1)] 62 | 63 | #Distribution of defaults 64 | table(data$default) 65 | 66 | #Event rate is defined as ratio of default cases in total population 67 | 68 | print(table(data$default)*100/nrow(data)) 69 | 70 | 71 | ## ----ch_05_2_Feature_type_01--------------------------------------------- 72 | 73 | continuos <- character() 74 | categorical <- character() 75 | #Write a loop to go over all features and find unique values 76 | p<-1 77 | q<-1 78 | for (i in names(data)) 79 | { 80 | unique_levels = length(unique(data[,get(i)])) 81 | 82 | if(i %in% c("id","loss","default")) 83 | { 84 | next; 85 | } 86 | 87 | else 88 | { 89 | if (unique_levels <=30 | is.character(data[,get(i)])) 90 | { 91 | # cat("The feature ", i, " is a categorical variable") 92 | categorical[p] <- i 93 | p=p+1 94 | # Makingh the 95 | data[[i]] <- factor(data[[i]]) 96 | } 97 | else 98 | { 99 | # cat("The feature ", i, " is a continuos variable") 100 | continuos[q] <- i 101 | q=q+1 102 | 103 | } 104 | } 105 | } 106 | 107 | # subtract 1 as one is dependent variable = default 108 | cat("\nTotal number of continuos variables in feature set ", length(continuos) - 1) 109 | 110 | # subtract 2 as one is loss and one is id 111 | cat("\nTotal number of categorical variable in feature set ", length(categorical) - 2) 112 | 113 | 114 | ## ----ch05_feature_ranking_01, warning=FALSE------------------------------ 115 | 116 | library(MLmetrics) 117 | 118 | performance_metric_gini <- data.frame(feature = character(), Gini_value = numeric()) 119 | 120 | #Write a loop to go over all features and find unique values 121 | for (feature in names(data)) 122 | { 123 | if(feature %in% c("id","loss","default")) 124 | { 125 | next; 126 | } 127 | else 128 | { 129 | tryCatch({glm_model <- glm(default ~ get(feature),data=data,family=binomial(link="logit")); 130 | 131 | predicted_values <- predict.glm(glm_model,newdata=data,type="response"); 132 | 133 | Gini_value <- Gini(predicted_values,data$default); 134 | 135 | performance_metric_gini <- rbind(performance_metric_gini,cbind(feature,Gini_value));},error=function(e){}) 136 | 137 | } 138 | } 139 | 140 | performance_metric_gini$Gini_value <- as.numeric(as.character(performance_metric_gini$Gini_value)) 141 | #Rank the features by value of Gini Coefficient 142 | 143 | Ranked_Features <- performance_metric_gini[order(-performance_metric_gini$Gini_value),] 144 | 145 | print("Top 5 Features by Gini Coefficients\n") 146 | 147 | head(Ranked_Features) 148 | 149 | 150 | ## ----ch05_feature_ranking_02, message=FALSE,warning=FALSE---------------- 151 | #Create a logistic model with top 6 features (f766,f404,f629,f630,f281 and f322) 152 | 153 | glm_model <- glm(default ~ f766 + f404 + f629 + f630 + f281 + f322,data=data,family=binomial(link="logit")); 154 | 155 | predicted_values <- predict.glm(glm_model,newdata=data,type="response"); 156 | 157 | Gini_value <- Gini(predicted_values,data$default); 158 | 159 | summary(glm_model) 160 | 161 | Gini_value 162 | 163 | ## ----ch05_feature_ranking_03--------------------------------------------- 164 | #Create the correlation matrix for 6 features (f766,f404,f629,f630,f281 and f322) 165 | 166 | top_6_feature <- data.frame(data$f766,data$f404,data$f629,data$f630,data$f281,data$f322) 167 | 168 | cor(top_6_feature, use="complete") 169 | 170 | ## ----ch05_filter_method_01, 
warning=FALSE-------------------------------- 171 | #Calculate the variance of each individual variable and standardize the variance by dividing with mean() 172 | 173 | coefficient_of_variance <- data.frame(feature = character(), cov = numeric()) 174 | 175 | #Write a loop to go over all features and calculate variance 176 | for (feature in names(data)) 177 | { 178 | if(feature %in% c("id","loss","default")) 179 | { 180 | next; 181 | } 182 | else if(feature %in% continuos) 183 | { 184 | tryCatch( 185 | {cov <- abs(sd(data[[feature]], na.rm = TRUE)/mean(data[[feature]],na.rm = TRUE)); 186 | if(cov != Inf){ 187 | coefficient_of_variance <- rbind(coefficient_of_variance,cbind(feature,cov));} else {next;}},error=function(e){}) 188 | 189 | } 190 | else 191 | { 192 | next; 193 | } 194 | } 195 | 196 | coefficient_of_variance$cov <- as.numeric(as.character(coefficient_of_variance$cov)) 197 | 198 | #Order the list by highest to lowest coefficient of variation 199 | 200 | Ranked_Features_cov <- coefficient_of_variance[order(-coefficient_of_variance$cov),] 201 | 202 | print("Top 5 Features by Coefficient of Variance\n") 203 | 204 | head(Ranked_Features_cov) 205 | 206 | 207 | ## ----ch05_filter_method_02----------------------------------------------- 208 | #Create a logistic model with top 6 features (f338,f422,f724,f636,f775 and f723) 209 | 210 | glm_model <- glm(default ~ f338 + f422 + f724 + f636 + f775 + f723,data=data,family=binomial(link="logit")); 211 | 212 | predicted_values <- predict.glm(glm_model,newdata=data,type="response"); 213 | 214 | Gini_value <- Gini(predicted_values,data$default); 215 | 216 | summary(glm_model) 217 | 218 | cat("The Gini Coefficient for the fitted model is ",Gini_value); 219 | 220 | ## ----ch05_filter_method_03----------------------------------------------- 221 | #Create the correlation matrix for 6 features (f338,f422,f724,f636,f775 and f723) 222 | 223 | top_6_feature <- data.frame(as.double(data$f338),as.double(data$f422),as.double(data$f724),as.double(data$f636),as.double(data$f775),as.double(data$f723)) 224 | 225 | cor(top_6_feature, use="complete") 226 | 227 | ## ----ch05_wrapper_method_01---------------------------------------------- 228 | #Pull 5 variables we had from highest coefficient of variation (from filter method)(f338,f422,f724,f636 and f775) 229 | 230 | predictor_set <- c("f338","f422","f724","f636","f775") 231 | 232 | #Randomly Pull 5 variables from categorical variable set ( Reader can apply filter method to categorical variable and can choose these 5 variables systemetically as well) 233 | set.seed(101); 234 | ind <- sample(1:length(categorical), 5, replace=FALSE) 235 | p<- 1 236 | for (i in ind) 237 | { 238 | predictor_set [5+p] <- categorical[i] 239 | p=p+1 240 | } 241 | 242 | #Print the set of 10 variables we will be working with 243 | 244 | print(predictor_set) 245 | 246 | #Replaced f33 by f93 as f33 does not have levels 247 | 248 | predictor_set[7] <- "f93" 249 | 250 | #Print final list of variables 251 | 252 | print(predictor_set) 253 | 254 | ## ----ch05_wrapper_method_02---------------------------------------------- 255 | # Create a samll modeling datset with only predictors and dependent variable 256 | library(data.table) 257 | data_model <- data[,.(id,f338,f422,f724,f636,f775,f222,f93,f309,f303,f113,default),] 258 | #make sure to remove the missing cases to resolve erros regarding null values 259 | 260 | data_model<-na.omit(data_model) 261 | 262 | #Full model uses all the 10 variables 263 | full_model <- glm(default ~ f338 + f422 + f724 + 
f636 + f775 + f222 + f93 + f309 + f303 + f113,data=data_model,family=binomial(link="logit")) 264 | 265 | #Summary of the full model 266 | summary(full_model) 267 | 268 | ## ----ch05_wrapper_method_03---------------------------------------------- 269 | #Null model uses no variables 270 | null_model <- glm(default ~ 1 ,data=data_model,family=binomial(link="logit")) 271 | 272 | #Summary of the full model 273 | summary(null_model) 274 | 275 | ## ----ch05_wrapper_method_04---------------------------------------------- 276 | #Summary of backward step selection 277 | backwards <- step(full_model) 278 | 279 | 280 | ## ----ch05_wrapper_method_05---------------------------------------------- 281 | #summary of forward selection method 282 | forwards <- step(null_model,scope=list(lower=formula(null_model),upper=formula(full_model)), direction="forward") 283 | 284 | 285 | ## ----ch05_wrapper_method_06---------------------------------------------- 286 | #Summary of final model with backward selection process 287 | formula(backwards) 288 | 289 | #Summary of final model with forward selection process 290 | formula(forwards) 291 | 292 | ## ----ch05_embeded_method_01, warning=FALSE,message=FALSE----------------- 293 | 294 | #Create data farme with dependent and independent variables (Remove NA) 295 | 296 | data_model <- na.omit(data) 297 | 298 | y <- as.matrix(data_model$default) 299 | 300 | x <- as.matrix(subset(data_model, select=continuos[250:260])) 301 | 302 | library("glmnet") 303 | 304 | ## ----ch05_embeded_method_02, warning=FALSE,message=FALSE----------------- 305 | 306 | #Fit a model with dependent variable of binomial family 307 | fit = glmnet(x,y, family= "binomial") 308 | 309 | #Summary of fit model 310 | summary(fit) 311 | 312 | #Plot the output of glmnet fit model 313 | plot (fit, xvar= "dev", label=TRUE) 314 | 315 | ## ----ch05_embeded_method_03, warning=FALSE,message=FALSE----------------- 316 | 317 | #Fit a cross validated binomial model 318 | fit_logistic = cv.glmnet(x,y, family= "binomial", type.measure = "class") 319 | 320 | #Summary of fitted Cross Validated Linear Model 321 | 322 | summary(fit_logistic) 323 | 324 | #Plot the results 325 | plot (fit_logistic) 326 | 327 | ## ----ch05_embeded_method_04---------------------------------------------- 328 | #Print the minimum lambda - regularization factor 329 | print(fit_logistic$lambda.min) 330 | 331 | print(fit_logistic$lambda.1se) 332 | 333 | #Against the lamda minimum value we can get the coefficients 334 | param <- coef(fit_logistic, s="lambda.min") 335 | 336 | param <- as.data.frame(as.matrix(param)) 337 | 338 | param$feature<-rownames(param) 339 | 340 | #The list of variables suggested by the embeded method 341 | 342 | param_embeded <- param[param$`1` > 0,] 343 | 344 | print(param_embeded) 345 | 346 | ## ----ch05_principle_component_01----------------------------------------- 347 | #Take a subset of 10 features 348 | pca_data <- data[,.(f381,f408,f495,f529,f549,f539,f579,f634,f706,f743)] 349 | 350 | pca_data <- na.omit(pca_data) 351 | 352 | head(pca_data) 353 | 354 | #Normalise the data before applying PCA analysis mean=0, and sd=1 355 | scaled_pca_data <- scale(pca_data) 356 | 357 | head(scaled_pca_data) 358 | 359 | #Do the decomposition on the scaled series 360 | pca_results <- prcomp(scaled_pca_data) 361 | 362 | print(pca_results) 363 | 364 | summary(pca_results) 365 | 366 | plot(pca_results) 367 | 368 | #Create the biplot with principle components 369 | biplot(pca_results, col = c("red", "blue")) 370 | 371 | 372 | 
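# The "Proportion of Variance" printed by summary(pca_results) above is derived
# from the component standard deviations that prcomp() stores in $sdev. A short
# sketch of recovering it directly and picking how many components to keep; it
# assumes pca_results from the chunk above is in memory, and the 90% cut-off is
# an arbitrary illustrative choice:
var_explained <- pca_results$sdev^2 / sum(pca_results$sdev^2)
cum_var <- cumsum(var_explained)
round(rbind(var_explained, cum_var), 3)

#Smallest number of principal components explaining at least 90% of the variance
num_components <- which(cum_var >= 0.90)[1]
num_components

#The component scores (the rotated data) are stored in pca_results$x
head(pca_results$x[, 1:num_components])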
-------------------------------------------------------------------------------- /Code/Ch06_Clustering.R: -------------------------------------------------------------------------------- 1 | ## ----opts, echo = FALSE-------------------------------------------------- 2 | library(knitr) 3 | opts_chunk$set(dev="png", 4 | dev.args=list(type="cairo"), 5 | dpi=96) 6 | knitr::opts_chunk$set( 7 | fig.path = "~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Images/004/" 8 | ) 9 | 10 | ## ----ch06_hierachal_clust_01--------------------------------------------- 11 | # Read the house Worth Data 12 | Data_House_Worth <- read.csv("~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Dataset/House Worth Data.csv",header=TRUE); 13 | 14 | str(Data_House_Worth) 15 | 16 | #remove teh extra column as well not be using this 17 | Data_House_Worth$BasementArea <- NULL 18 | 19 | ## ----ch06_hierachal_clust_02--------------------------------------------- 20 | library(ggplot2) 21 | ggplot(Data_House_Worth, aes(StoreArea, LawnArea, color = HouseNetWorth)) + geom_point() 22 | 23 | ## ----ch06_hierachal_clust_03--------------------------------------------- 24 | # apply the hirarichal clustering algorith 25 | clusters <- hclust(dist(Data_House_Worth[,2:3])) 26 | 27 | #Plot the dendogram 28 | plot(clusters) 29 | 30 | ## ----ch06_hierachal_clust_04--------------------------------------------- 31 | # Create different number of clusters 32 | clusterCut_2 <- cutree(clusters, 2) 33 | #table the clustering distribution with actual networth 34 | table(clusterCut_2,Data_House_Worth$HouseNetWorth) 35 | 36 | clusterCut_3 <- cutree(clusters, 3) 37 | #table the clustering distribution with actual networth 38 | table(clusterCut_3,Data_House_Worth$HouseNetWorth) 39 | 40 | 41 | clusterCut_4 <- cutree(clusters, 4) 42 | #table the clustering distribution with actual networth 43 | table(clusterCut_4,Data_House_Worth$HouseNetWorth) 44 | 45 | ## ----ch06_hierachal_clust_05--------------------------------------------- 46 | ggplot(Data_House_Worth, aes(StoreArea, LawnArea, color = HouseNetWorth)) + 47 | geom_point(alpha = 0.4, size = 3.5) + geom_point(col = clusterCut_3) + 48 | scale_color_manual(values = c('black', 'red', 'green')) 49 | 50 | ## ----ch06_kmeans_clust_06------------------------------------------------ 51 | 52 | # Elbow Curve 53 | 54 | wss <- (nrow(Data_House_Worth)-1)*sum(apply(Data_House_Worth[,2:3],2,var)) 55 | for (i in 2:15) { 56 | wss[i] <- sum(kmeans(Data_House_Worth[,2:3],centers=i)$withinss) 57 | } 58 | plot(1:15, wss, type="b", xlab="Number of Clusters",ylab="Within groups sum of squares") 59 | 60 | ## ----ch06_kmeans_clust_07------------------------------------------------ 61 | set.seed(917) 62 | #Run k-means cluster of the datase 63 | Cluster_kmean <- kmeans(Data_House_Worth[,2:3], 3, nstart = 20) 64 | 65 | #Tabulate teh cross distribution 66 | table(Cluster_kmean$cluster,Data_House_Worth$HouseNetWorth) 67 | 68 | ## ----ch06_kmeans_clust_08------------------------------------------------ 69 | Cluster_kmean$cluster <- factor(Cluster_kmean$cluster) 70 | ggplot(Data_House_Worth, aes(StoreArea, LawnArea, color = HouseNetWorth)) + 71 | geom_point(alpha = 0.4, size = 3.5) + geom_point(col = Cluster_kmean$cluster) + 72 | scale_color_manual(values = c('black', 'red', 'green')) 73 | 74 | ## ----ch06_dist_clust_01-------------------------------------------------- 75 | library(EMCluster, quietly = TRUE) 76 | 77 | ret <- init.EM(Data_House_Worth[,2:3], nclass = 3) 78 | ret 79 | 80 | ret.new <- 
assign.class(Data_House_Worth[,2:3], ret, return.all = FALSE) 81 | 82 | #This have assigned a class to each case 83 | str(ret.new) 84 | 85 | # Plot results 86 | plotem(ret,Data_House_Worth[,2:3]) 87 | 88 | ## ----ch06_disty_clust_02------------------------------------------------- 89 | ggplot(Data_House_Worth, aes(StoreArea, LawnArea, color = HouseNetWorth)) + 90 | geom_point(alpha = 0.4, size = 3.5) + geom_point(col = ret.new$class) + 91 | scale_color_manual(values = c('black', 'red', 'green')) 92 | 93 | ## ----ch06_dbscan_01------------------------------------------------------ 94 | library(dbscan) 95 | cluster_dbscan <- dbscan(Data_House_Worth[,2:3],eps=0.8,minPts = 10) 96 | cluster_dbscan 97 | 98 | #Display the hull plot 99 | hullplot(Data_House_Worth[,2:3],cluster_dbscan$cluster) 100 | 101 | ## ----ch06_dunnIndex------------------------------------------------------ 102 | library(clValid) 103 | #Showing for hieracical cluster with clusters = 3 104 | dunn(dist(Data_House_Worth[,2:3]), clusterCut_3) 105 | 106 | ## ----ch06_Silhoutte------------------------------------------------------ 107 | library(cluster) 108 | 109 | #Showing for k-means cluster with clusters = 3 110 | sk <- silhouette(clusterCut_3,dist(Data_House_Worth[,2:3])) 111 | 112 | plot(sk) 113 | 114 | ## ----ch06_rand----------------------------------------------------------- 115 | #Unign result from EM Algo 116 | library(EMCluster) 117 | clust <- ret.new$class 118 | orig <- ifelse(Data_House_Worth$HouseNetWorth == "High",2, 119 | ifelse(Data_House_Worth$HouseNetWorth == "Low",1,2)) 120 | RRand(orig, clust) 121 | 122 | ## ----ch06_jaccard-------------------------------------------------------- 123 | #Unign result from EM Algo 124 | library(EMCluster) 125 | clust <- ret.new$class 126 | orig <- ifelse(Data_House_Worth$HouseNetWorth == "High",2, 127 | ifelse(Data_House_Worth$HouseNetWorth == "Low",1,2)) 128 | Jaccard.Index(orig, clust) 129 | 130 | -------------------------------------------------------------------------------- /Code/Ch06_DecisionTree.R: -------------------------------------------------------------------------------- 1 | ## ----opts, echo = FALSE-------------------------------------------------- 2 | library(knitr) 3 | opts_chunk$set(dev="png", 4 | dev.args=list(type="cairo"), 5 | dpi=96) 6 | knitr::opts_chunk$set( 7 | fig.path = "~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Images/003/" 8 | ) 9 | 10 | ## ---- fig.cap="Image 6.x: Gini-Index Function"--------------------------- 11 | curve(x * (1- x) + (1 - x) * x, xlab = "P", ylab = "Gini-Index", lwd = 5) 12 | 13 | ## ---- fig.cap="Image 6.x: Entropy Function"------------------------------ 14 | curve(-x * log2(x) - (1 - x) * log2(1 - x), xlab = "x", ylab = "Entropy", lwd = 5) 15 | 16 | ## ---- message=FALSE, warning=FALSE--------------------------------------- 17 | 18 | library(C50) 19 | library(splitstackshape) 20 | library(rattle) 21 | library(rpart.plot) 22 | library(data.table) 23 | 24 | Data_Purchase <- fread("~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Dataset/Purchase Prediction Dataset.csv",header=T, verbose = FALSE, showProgress = FALSE) 25 | str(Data_Purchase) 26 | 27 | #Check the distribution of data before grouping 28 | table(Data_Purchase$ProductChoice) 29 | 30 | ## ------------------------------------------------------------------------ 31 | 32 | #Pulling out only the relevant data to this chapter 33 | Data_Purchase <- 
Data_Purchase[,.(CUSTOMER_ID,ProductChoice,MembershipPoints,IncomeClass,CustomerPropensity,LastPurchaseDuration)] 34 | 35 | #Delete NA from subset 36 | Data_Purchase <- na.omit(Data_Purchase) 37 | Data_Purchase$CUSTOMER_ID <- as.character(Data_Purchase$CUSTOMER_ID) 38 | 39 | #Stratified Sampling 40 | Data_Purchase_Model<-stratified(Data_Purchase, group=c("ProductChoice"),size=10000,replace=FALSE) 41 | 42 | print("The Distribution of equal classes is as below") 43 | table(Data_Purchase_Model$ProductChoice) 44 | 45 | Data_Purchase_Model$ProductChoice <- as.factor(Data_Purchase_Model$ProductChoice) 46 | Data_Purchase_Model$IncomeClass <- as.factor(Data_Purchase_Model$IncomeClass) 47 | Data_Purchase_Model$CustomerPropensity <- as.factor(Data_Purchase_Model$CustomerPropensity) 48 | 49 | #Build the decision tree on Train Data (Set_1) and then test data (Set_2) will be used for performance testing 50 | 51 | set.seed(917); 52 | train <- Data_Purchase_Model[sample(nrow(Data_Purchase_Model),size=nrow(Data_Purchase_Model)*(0.7), replace = TRUE, prob = NULL),] 53 | train <- as.data.frame(train) 54 | 55 | test <- Data_Purchase_Model[!(Data_Purchase_Model$CUSTOMER_ID %in% train$CUSTOMER_ID),] 56 | 57 | 58 | ## ---- message=FALSE,warning=FALSE,eval=FALSE----------------------------- 59 | ## 60 | ## library(RWeka) 61 | ## 62 | ## WPM("refresh-cache") 63 | ## WPM("install-package", "simpleEducationalLearningSchemes") 64 | ## 65 | ## 66 | ## ## make classifier 67 | ## ID3 <- make_Weka_classifier("weka/classifiers/trees/Id3") 68 | ## 69 | ## ID3Model <- ID3(ProductChoice ~ CustomerPropensity + IncomeClass , data = train) 70 | ## 71 | ## summary(ID3Model) 72 | ## 73 | 74 | ## ----eval=FALSE---------------------------------------------------------- 75 | ## 76 | ## library(gmodels) 77 | ## purchase_pred_test <- predict(ID3Model, test) 78 | ## CrossTable(test$ProductChoice, purchase_pred_test, 79 | ## prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, 80 | ## dnn = c('actual default', 'predicted default')) 81 | ## 82 | 83 | ## ------------------------------------------------------------------------ 84 | 85 | model_c50 <- C5.0(train[,c("CustomerPropensity","LastPurchaseDuration", "MembershipPoints")], 86 | train[,"ProductChoice"], 87 | control = C5.0Control(CF = 0.001, minCases = 2)) 88 | 89 | 90 | ## ---- warning=FALSE,message=FALSE---------------------------------------- 91 | 92 | summary(model_c50) 93 | #plot(model_c50) 94 | 95 | 96 | ## ------------------------------------------------------------------------ 97 | 98 | library(gmodels) 99 | 100 | purchase_pred_train <- predict(model_c50, train,type = "class") 101 | CrossTable(train$ProductChoice, purchase_pred_train, 102 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, 103 | dnn = c('actual default', 'predicted default')) 104 | 105 | 106 | ## ------------------------------------------------------------------------ 107 | purchase_pred_test <- predict(model_c50, test) 108 | CrossTable(test$ProductChoice, purchase_pred_test,prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, 109 | dnn = c('actual default', 'predicted default')) 110 | 111 | 112 | ## ------------------------------------------------------------------------ 113 | 114 | CARTModel <- rpart(ProductChoice ~ IncomeClass + CustomerPropensity + LastPurchaseDuration + MembershipPoints, data=train) 115 | 116 | summary(CARTModel) 117 | 118 | library(rpart.plot) 119 | library(rattle) 120 | 121 | #fancyRpartPlot(CARTModel) 122 | 123 | 124 | ## 
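# The CrossTable() outputs in this file print full confusion matrices; when only
# a single headline number is needed, the same predicted-vs-actual pairs can be
# reduced to an overall classification accuracy. A small helper sketch -- the
# function name class_accuracy is ours, not from the chapter:
class_accuracy <- function(actual, predicted) {
  mean(as.character(actual) == as.character(predicted))
}

#Example usage, assuming the C5.0 model and the train/test splits built above:
#class_accuracy(train$ProductChoice, predict(model_c50, train, type = "class"))
#class_accuracy(test$ProductChoice,  predict(model_c50, test,  type = "class"))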
------------------------------------------------------------------------ 125 | 126 | library(gmodels) 127 | 128 | purchase_pred_train <- predict(CARTModel, train,type = "class") 129 | CrossTable(train$ProductChoice, purchase_pred_train, 130 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, 131 | dnn = c('actual default', 'predicted default')) 132 | 133 | 134 | ## ------------------------------------------------------------------------ 135 | 136 | #CHAID installation from source 137 | ##install.packages("CHAID", repos="http://R-Forge.R-project.org") 138 | library(CHAID) 139 | 140 | ctrl <- chaid_control(minsplit = 200, minprob = 0.1) 141 | CHAIDModel <- chaid(ProductChoice ~ CustomerPropensity + IncomeClass, data = train, control = ctrl) 142 | print(CHAIDModel) 143 | #plot(CHAIDModel) 144 | 145 | 146 | ## ------------------------------------------------------------------------ 147 | 148 | library(gmodels) 149 | 150 | purchase_pred_train <- predict(CHAIDModel, train) 151 | CrossTable(train$ProductChoice, purchase_pred_train, 152 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, 153 | dnn = c('actual default', 'predicted default')) 154 | 155 | ## ------------------------------------------------------------------------ 156 | purchase_pred_test <- predict(CHAIDModel, test) 157 | CrossTable(test$ProductChoice, purchase_pred_test, 158 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, 159 | dnn = c('actual default', 'predicted default')) 160 | 161 | 162 | ## ------------------------------------------------------------------------ 163 | library(caret) 164 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 165 | 166 | ModelC50_boostcv10 <- C5.0(train[,c("CustomerPropensity","LastPurchaseDuration", "MembershipPoints")], train[,"ProductChoice"], trials = 10) 167 | 168 | ## ------------------------------------------------------------------------ 169 | 170 | library(gmodels) 171 | 172 | purchase_pred_train <- predict(ModelC50_boostcv10, train) 173 | CrossTable(train$ProductChoice, purchase_pred_train, 174 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, 175 | dnn = c('actual default', 'predicted default')) 176 | 177 | ## ------------------------------------------------------------------------ 178 | purchase_pred_test <- predict(ModelC50_boostcv10, test) 179 | CrossTable(test$ProductChoice, purchase_pred_test, 180 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, 181 | dnn = c('actual default', 'predicted default')) 182 | 183 | 184 | ## ------------------------------------------------------------------------ 185 | 186 | control <- trainControl(method="repeatedcv", number=5, repeats=2) 187 | 188 | # Bagged CART 189 | set.seed(100) 190 | CARTBagModel <- train(ProductChoice ~ CustomerPropensity + LastPurchaseDuration + MembershipPoints, data=train, method="treebag", trControl=control) 191 | 192 | ## ------------------------------------------------------------------------ 193 | 194 | library(gmodels) 195 | 196 | purchase_pred_train <- predict(CARTBagModel, train) 197 | CrossTable(train$ProductChoice, purchase_pred_train, 198 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, 199 | dnn = c('actual default', 'predicted default')) 200 | 201 | ## ------------------------------------------------------------------------ 202 | purchase_pred_test <- predict(CARTBagModel, test) 203 | CrossTable(test$ProductChoice, purchase_pred_test, 204 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, 205 | dnn = c('actual default', 'predicted default')) 206 | 207 | 208 | ## 
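# method = "treebag" above bags rpart trees: each tree is grown on a bootstrap
# resample of the training data and the per-tree predictions are combined by
# majority vote. A minimal manual sketch of that idea, assuming the train/test
# objects built earlier in this file; 25 trees is an arbitrary illustrative choice:
library(rpart)
set.seed(100)
n_trees <- 25
votes <- sapply(seq_len(n_trees), function(i) {
  boot_idx <- sample(nrow(train), replace = TRUE)
  tree_fit <- rpart(ProductChoice ~ CustomerPropensity + LastPurchaseDuration +
                      MembershipPoints, data = train[boot_idx, ], method = "class")
  as.character(predict(tree_fit, test, type = "class"))
})

#Majority vote across the trees for each test case, then overall accuracy
bagged_pred <- apply(votes, 1, function(v) names(which.max(table(v))))
mean(bagged_pred == as.character(test$ProductChoice))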
------------------------------------------------------------------------ 209 | # Random Forest 210 | set.seed(100) 211 | 212 | rfModel <- train(ProductChoice ~ CustomerPropensity + LastPurchaseDuration + MembershipPoints, data=train, method="rf", trControl=control) 213 | 214 | 215 | ## ------------------------------------------------------------------------ 216 | 217 | library(gmodels) 218 | 219 | purchase_pred_train <- predict(rfModel, train) 220 | CrossTable(train$ProductChoice, purchase_pred_train, 221 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, 222 | dnn = c('actual default', 'predicted default')) 223 | 224 | ## ------------------------------------------------------------------------ 225 | purchase_pred_test <- predict(rfModel, test) 226 | CrossTable(test$ProductChoice, purchase_pred_test, 227 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, 228 | dnn = c('actual default', 'predicted default')) 229 | 230 | 231 | -------------------------------------------------------------------------------- /Code/Ch06_Naive Bayes.R: -------------------------------------------------------------------------------- 1 | ## ------------------------------------------------------------------------ 2 | library(data.table) 3 | library(splitstackshape) 4 | library(e1071) 5 | 6 | Data_Purchase <- fread("C:\\Users\\Karthik\\Dropbox\\Book Writing - Drafts\\Chapter Drafts\\Chap 6 29 Sep\\Dataset\\Purchase Prediction Dataset.csv",header=T, verbose = FALSE, showProgress = FALSE) 7 | str(Data_Purchase) 8 | 9 | #Check the distribution of data before grouping 10 | table(Data_Purchase$ProductChoice) 11 | 12 | 13 | #Pulling out only the relevant data to this chapter 14 | Data_Purchase <- Data_Purchase[,.(CUSTOMER_ID,ProductChoice,MembershipPoints,IncomeClass,CustomerPropensity,LastPurchaseDuration)] 15 | 16 | #Delete NA from subset 17 | Data_Purchase <- na.omit(Data_Purchase) 18 | Data_Purchase$CUSTOMER_ID <- as.character(Data_Purchase$CUSTOMER_ID) 19 | 20 | #Stratified Sampling 21 | Data_Purchase_Model<-stratified(Data_Purchase, group=c("ProductChoice"),size=10000,replace=FALSE) 22 | 23 | print("The Distribution of equal classes is as below") 24 | table(Data_Purchase_Model$ProductChoice) 25 | 26 | Data_Purchase_Model$ProductChoice <- as.factor(Data_Purchase_Model$ProductChoice) 27 | Data_Purchase_Model$IncomeClass <- as.factor(Data_Purchase_Model$IncomeClass) 28 | Data_Purchase_Model$CustomerPropensity <- as.factor(Data_Purchase_Model$CustomerPropensity) 29 | 30 | set.seed(917); 31 | train <- Data_Purchase_Model[sample(nrow(Data_Purchase_Model),size=nrow(Data_Purchase_Model)*(0.7), replace = TRUE, prob = NULL),] 32 | train <- as.data.frame(train) 33 | 34 | test <- as.data.frame(Data_Purchase_Model[!(Data_Purchase_Model$CUSTOMER_ID %in% train$CUSTOMER_ID),]) 35 | 36 | ## ------------------------------------------------------------------------ 37 | model_naiveBayes <- naiveBayes(train[,c(3,4,5)], train[,2]) 38 | model_naiveBayes 39 | 40 | 41 | ## ------------------------------------------------------------------------ 42 | model_naiveBayes_pred <- predict(model_naiveBayes, train) 43 | 44 | library(gmodels) 45 | 46 | CrossTable(model_naiveBayes_pred, train[,2], 47 | prop.chisq = FALSE, prop.t = FALSE, 48 | dnn = c('predicted', 'actual')) 49 | 50 | ## ------------------------------------------------------------------------ 51 | 52 | model_naiveBayes_pred <- predict(model_naiveBayes, test) 53 | 54 | library(gmodels) 55 | 56 | CrossTable(model_naiveBayes_pred, test[,2], 57 | prop.chisq = FALSE, prop.t = FALSE, 
58 | dnn = c('predicted', 'actual')) 59 | 60 | 61 | -------------------------------------------------------------------------------- /Code/Ch06_NeuralNet.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## ----ch06_nnet_01-------------------------------------------------------- 4 | 5 | #Load the data and prepare a dataset for logistic regression 6 | Data_Purchase_Prediction <- read.csv("~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Dataset/Purchase Prediction Dataset.csv",header=TRUE); 7 | 8 | Data_Purchase_Prediction$choice <- ifelse(Data_Purchase_Prediction$ProductChoice == 1,1, 9 | ifelse(Data_Purchase_Prediction$ProductChoice == 3,0,999)); 10 | 11 | Data_Neural_Net <- Data_Purchase_Prediction[Data_Purchase_Prediction$choice %in% c("0","1"),] 12 | 13 | #Remove Missing Values 14 | Data_Neural_Net <- na.omit(Data_Neural_Net) 15 | rownames(Data_Neural_Net) <- NULL 16 | 17 | 18 | ## ----ch06_nnet_02-------------------------------------------------------- 19 | #Transforming the continuous variables 20 | cont <- Data_Neural_Net[,c("PurchaseTenure","CustomerAge","MembershipPoints","IncomeClass")] 21 | 22 | maxs <- apply(cont, 2, max) 23 | mins <- apply(cont, 2, min) 24 | 25 | scaled_cont <- as.data.frame(scale(cont, center = mins, scale = maxs - mins)) 26 | 27 | #The dependent variable 28 | dep <- factor(Data_Neural_Net$choice) 29 | 30 | Data_Neural_Net$ModeOfPayment <- factor(Data_Neural_Net$ModeOfPayment); 31 | 32 | flags_ModeOfPayment = data.frame(Reduce(cbind, 33 | lapply(levels(Data_Neural_Net$ModeOfPayment), function(x){(Data_Neural_Net$ModeOfPayment == x)*1}) 34 | )) 35 | 36 | names(flags_ModeOfPayment) = levels(Data_Neural_Net$ModeOfPayment) 37 | 38 | Data_Neural_Net$CustomerPropensity <- factor(Data_Neural_Net$CustomerPropensity); 39 | 40 | flags_CustomerPropensity = data.frame(Reduce(cbind, 41 | lapply(levels(Data_Neural_Net$CustomerPropensity), function(x){(Data_Neural_Net$CustomerPropensity == x)*1}) 42 | )) 43 | names(flags_CustomerPropensity) = levels(Data_Neural_Net$CustomerPropensity) 44 | 45 | cate <- cbind(flags_ModeOfPayment,flags_CustomerPropensity) 46 | 47 | #Combine all data into single modeling data 48 | Dataset <- cbind(dep,scaled_cont,cate); 49 | 50 | #Divide the data into train and test 51 | set.seed(917); 52 | index <- sample(1:nrow(Dataset),round(0.7*nrow(Dataset))) 53 | train <- Dataset[index,] 54 | test <- Dataset[-index,] 55 | 56 | ## ----ch06_nnet_03-------------------------------------------------------- 57 | library(nnet) 58 | i <- names(train) 59 | form <- as.formula(paste("dep ~", paste(i[!i %in% "dep"], collapse = " + "))) 60 | nn <- nnet.formula(form,size=10,data=train) 61 | 62 | predict_class <- predict(nn, newdata=test, type="class") 63 | 64 | #Classiifcation table 65 | table(test$dep,predict_class) 66 | 67 | #Clasisifcation rate 68 | sum(diag(table(test$dep,predict_class))/nrow(test)) 69 | 70 | ## ----ch06_nnet_04-------------------------------------------------------- 71 | library(NeuralNetTools) 72 | # Plot the neural network 73 | plotnet(nn) 74 | 75 | #get the neural weights 76 | neuralweights(nn) 77 | 78 | # Plot the imporatance 79 | olden(nn) 80 | 81 | #variable importance by garson algorith 82 | garson(nn) 83 | 84 | 85 | ## ----ch06_data_mat------------------------------------------------------- 86 | #Pre-transfromation 87 | head(Data_Purchase_Prediction[,c("choice","PurchaseTenure","CustomerAge","MembershipPoints","IncomeClass","ModeOfPayment","CustomerPropensity")]) 88 | 89 | 
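# Between the pre- and post-transformation views in this chunk, the continuous
# inputs were min-max scaled to [0, 1] via scale(cont, center = mins,
# scale = maxs - mins) earlier in the script. A tiny illustration of that
# rescaling with made-up numbers (not values from the dataset):
illus_x <- 45; illus_min <- 18; illus_max <- 80   # hypothetical CustomerAge and its observed range
(illus_x - illus_min) / (illus_max - illus_min)   # about 0.435, the kind of value head(train) shows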
#Post-transoformation 90 | head(train) 91 | 92 | ## ----ch06_DeepNeuralNet_Darch,error=FALSE-------------------------------- 93 | #We will us the same data as of previous example in neural network 94 | devtools::install_github("maddin79/darch") 95 | library(darch) 96 | library(mlbench) 97 | library(RANN) 98 | 99 | #Print the model formula 100 | form 101 | 102 | #Apply the model using deep neural net with 103 | # deep_net <- darch(form, train, 104 | # preProc.params = list("method" = c("knnImpute")), 105 | # layers = c(0,10,30,10,0), 106 | # darch.batchSize = 1, 107 | # darch.returnBestModel.validationErrorFactor = 1, 108 | # darch.fineTuneFunction = "rpropagation", 109 | # darch.unitFunction = c("tanhUnit", "tanhUnit","tanhUnit","softmaxUnit"), 110 | # darch.numEpochs = 15, 111 | # bootstrap = T, 112 | # bootstrap.num = 500) 113 | 114 | deep_net <- darch(form,train, 115 | preProc.params = list(method = c("center", "scale")), 116 | layers = c(0,10,30,10,0), 117 | darch.unitFunction = c("sigmoidUnit", "tanhUnit","tanhUnit","softmaxUnit"), 118 | darch.fineTuneFunction = "minimizeClassifier", 119 | darch.numEpochs = 15, 120 | cg.length = 3, cg.switchLayers = 5) 121 | 122 | 123 | #Plot the deep net 124 | library(NeuralNetTools) 125 | plot(deep_net,"net") 126 | 127 | result <- darchTest(deep_net, newdata = test) 128 | result 129 | 130 | 131 | ## ----install,eval=FALSE-------------------------------------------------- 132 | ## install.packages("drat", repos="https://cran.rstudio.com") 133 | ## drat:::addRepo("dmlc") 134 | ## install.packages("mxnet") 135 | ## 136 | ## #Please refer https://github.com/dahtah/imager 137 | ## install.packages("devtools") 138 | ## devtools::install_github("dahtah/imager") 139 | 140 | ## ----ch06_deep_learning_01----------------------------------------------- 141 | library(mxnet) 142 | 143 | #install imager for loading images 144 | 145 | library(imager) 146 | 147 | #load the pre-trained model 148 | model <- mx.model.load("Inception/Inception_BN", iteration=39) 149 | 150 | #We also need to load in the mean image, which is used for preprocessing using mx.nd.load. 151 | 152 | mean.img = as.array(mx.nd.load("Inception/mean_224.nd")[["mean_img"]]) 153 | 154 | #Load and plot the image: (Defualt parot image) 155 | 156 | #im <- load.image(system.file("extdata/parrots.png", package="imager")) 157 | im <- load.image("Pictures/russia-volcano.jpg") 158 | plot(im) 159 | 160 | ## ----ch06_deep_learning_02----------------------------------------------- 161 | preproc.image <- function(im, mean.image) { 162 | # crop the image 163 | shape <- dim(im) 164 | short.edge <- min(shape[1:2]) 165 | xx <- floor((shape[1] - short.edge) / 2) 166 | yy <- floor((shape[2] - short.edge) / 2) 167 | croped <- crop.borders(im, xx, yy) 168 | # resize to 224 x 224, needed by input of the model. 169 | resized <- resize(croped, 224, 224) 170 | # convert to array (x, y, channel) 171 | arr <- as.array(resized) * 255 172 | dim(arr) <- c(224, 224, 3) 173 | # substract the mean 174 | normed <- arr - mean.img 175 | # Reshape to format needed by mxnet (width, height, channel, num) 176 | dim(normed) <- c(224, 224, 3, 1) 177 | return(normed) 178 | } 179 | 180 | #Now pass our image to pre-process 181 | normed <- preproc.image(im, mean.img) 182 | plot(normed) 183 | 184 | ## ----ch06_deep_learning_03,error= FALSE---------------------------------- 185 | #prob <- predict(model, X=normed) 186 | 187 | #We can extract the top-5 class index. 
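# The order() call commented just below ranks the class probabilities returned
# by predict() and keeps the five largest. A self-contained illustration of
# that ranking step with a made-up probability vector:
p_demo <- c(0.01, 0.62, 0.05, 0.28, 0.04)
order(p_demo, decreasing = TRUE)[1:3]   # indices of the three most likely classes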
188 | 189 | #max.idx <- order(prob[,1], decreasing = TRUE)[1:5] 190 | max.idx <- c("981", "980", "971", "673", "985") 191 | max.idx 192 | 193 | synsets <- readLines("Inception/synset.txt") 194 | 195 | #And let us print the corresponding lines: 196 | 197 | print(paste0("Predicted Top-classes: ", synsets[as.numeric(max.idx)])) 198 | 199 | -------------------------------------------------------------------------------- /Code/Ch06_OnlineML.R: -------------------------------------------------------------------------------- 1 | ## ------------------------------------------------------------------------ 2 | library(ggplot2) 3 | library(e1071) 4 | 5 | Data_House_Worth <-read.csv("C:\\Users\\Karthik\\Dropbox\\Book Writing - Drafts\\Chapter Drafts\\Chap 6 29 Sep\\Dataset\\House Worth Data.csv",header=TRUE); 6 | 7 | str(Data_House_Worth) 8 | 9 | #remove the extra column that are not required for the model 10 | Data_House_Worth$BasementArea <-NULL 11 | 12 | ## ------------------------------------------------------------------------ 13 | 14 | online_cmean <-cmeans(Data_House_Worth[,2:3],3,20,verbose=TRUE,method="ufcl",m=2) 15 | print(online_cmean) 16 | 17 | 18 | ## ------------------------------------------------------------------------ 19 | ggplot(Data_House_Worth, aes(StoreArea, LawnArea, color = HouseNetWorth)) + 20 | geom_point(alpha =0.4, size =3.5) +geom_point(col = online_cmean$cluster) + 21 | scale_color_manual(values =c('black', 'red', 'green')) 22 | 23 | 24 | -------------------------------------------------------------------------------- /Code/Ch06_RuleMining.R: -------------------------------------------------------------------------------- 1 | ## ----opts, echo = FALSE-------------------------------------------------- 2 | library(knitr) 3 | opts_chunk$set(dev="png", 4 | dev.args=list(type="cairo"), 5 | dpi=96); 6 | knitr::opts_chunk$set( 7 | fig.path = "~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Images/006/" 8 | ) 9 | 10 | ## ---- warning=FALSE, message=FALSE--------------------------------------- 11 | 12 | library(arules) 13 | MarketBasket <- read.transactions("~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Dataset/MarketBasketProcessed.csv", sep = ",") 14 | summary(MarketBasket) 15 | 16 | #Transactions - First two 17 | inspect(MarketBasket[1:2]) 18 | 19 | # Top 20 frequently bought product 20 | itemFrequencyPlot(MarketBasket, topN = 20) 21 | 22 | # Sparsity in the data - More white space means, more sparsity 23 | image(sample(MarketBasket, 100)) 24 | 25 | 26 | ## ------------------------------------------------------------------------ 27 | library(arules) 28 | 29 | 30 | apriori(MarketBasket) 31 | groceryrules <- apriori(MarketBasket, parameter = list(support = 0.2, confidence = 0.8, minlen = 2)) 32 | 33 | itemsets <- eclat(MarketBasket, 34 | parameter = list(supp = 0.1, maxlen = 15)) 35 | 36 | 37 | ## ------------------------------------------------------------------------ 38 | 39 | summary(groceryrules) 40 | 41 | # look at the first three rules 42 | inspect(groceryrules[1:3]) 43 | 44 | ## Step 5: Improving model performance ---- 45 | 46 | # sorting grocery rules by lift 47 | inspect(sort(groceryrules, by = "lift")[1:5]) 48 | 49 | # store as data frame 50 | groceryrules_df <- as(groceryrules, "data.frame") 51 | str(groceryrules_df) 52 | 53 | 54 | ## ------------------------------------------------------------------------ 55 | library(arules) 56 | 57 | eclat(MarketBasket) 58 | groceryrules <- eclat(MarketBasket, parameter = list(support = 0.2, 
minlen = 2)) 59 | 60 | itemsets <- eclat(MarketBasket, parameter = list(supp = 0.1, maxlen = 15)) 61 | 62 | 63 | ## ------------------------------------------------------------------------ 64 | 65 | summary(groceryrules) 66 | 67 | # look at the first three rules 68 | inspect(groceryrules[1:3]) 69 | 70 | ## Step 5: Improving model performance ---- 71 | 72 | # sorting grocery rules by lift 73 | inspect(sort(groceryrules, by = "support")[1:5]) 74 | 75 | # store as data frame 76 | groceryrules_df <- as(groceryrules, "data.frame") 77 | str(groceryrules_df) 78 | 79 | 80 | ## ------------------------------------------------------------------------ 81 | 82 | 83 | library(data.table) 84 | 85 | fine_food_data <- read.csv("~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Dataset/Food_Reviews.csv",stringsAsFactors = FALSE) 86 | fine_food_data$Score <- as.factor(fine_food_data$Score) 87 | 88 | str(fine_food_data[-10]) 89 | 90 | 91 | 92 | ## ---- message=FALSE,warning=FALSE---------------------------------------- 93 | 94 | library(caTools) 95 | 96 | # Randomly split data and use only 10% of the dataset 97 | set.seed(90) 98 | split = sample.split(fine_food_data$Score, SplitRatio = 0.05) 99 | 100 | fine_food_data = subset(fine_food_data, split == TRUE) 101 | 102 | select_col <- c("UserId","ProductId","Score") 103 | 104 | fine_food_data_selected <- fine_food_data[,select_col] 105 | rownames(fine_food_data_selected) <- NULL 106 | fine_food_data_selected$Score = as.numeric(fine_food_data_selected$Score) 107 | 108 | #Remove Duplicates 109 | 110 | fine_food_data_selected <- unique(fine_food_data_selected) 111 | 112 | 113 | ## ---- warning=FALSE,message=FALSE---------------------------------------- 114 | 115 | library(recommenderlab) 116 | 117 | #RatingsMatrix 118 | 119 | RatingMat <- dcast(fine_food_data_selected,UserId ~ ProductId, value.var = "Score") 120 | User=RatingMat[,1] 121 | Product= colnames(RatingMat)[2:ncol(RatingMat)] 122 | RatingMat[,1] <- NULL 123 | RatingMat <- as.matrix(RatingMat) 124 | dimnames(RatingMat) = list(user = User , product = Product) 125 | 126 | realM <- as(RatingMat, "realRatingMatrix") 127 | 128 | ## ------------------------------------------------------------------------ 129 | 130 | #distribution of ratings 131 | hist(getRatings(realM), breaks=15, main = "Distribution of Ratings", xlab = "Ratings", col = "grey") 132 | 133 | #Sparse Matrix Representation 134 | head(as(realM, "data.frame")) 135 | 136 | #The realRatingMatrix can be coerced back into a matrix which is identical to the original matrix 137 | identical(as(realM, "matrix"),RatingMat) 138 | 139 | #Sparcity in Rating Matrix 140 | image(realM, main = "Raw Ratings") 141 | 142 | ## ------------------------------------------------------------------------ 143 | 144 | #UBCF Model 145 | r_UBCF <- Recommender(realM[1:1700], method = "UBCF") 146 | r_UBCF 147 | 148 | #List of objects in the model output 149 | names(getModel(r_UBCF)) 150 | 151 | #Recommend product for the rest of 29 left out observations 152 | recom_UBCF <- predict(r_UBCF, realM[1700:1729], n=5) 153 | recom_UBCF 154 | 155 | #Display the recommendation 156 | reco <- as(recom_UBCF, "list") 157 | reco[lapply(reco,length)>0] 158 | 159 | 160 | ## ------------------------------------------------------------------------ 161 | 162 | set.seed(2016) 163 | scheme <- evaluationScheme(realM[1:1700], method="split", train = .9, 164 | k=1, given=1, goodRating=3) 165 | 166 | scheme 167 | 168 | algorithms <- list( 169 | "random items" = list(name="RANDOM", 
param=NULL), 170 | "popular items" = list(name="POPULAR", param=NULL), 171 | "user-based CF" = list(name="UBCF", param=list(nn=50)), 172 | "item-based CF" = list(name="IBCF", param=list(k=50)) 173 | ) 174 | 175 | results <- evaluate(scheme, algorithms, type = "topNList", 176 | n=c(1, 3, 5, 10, 15, 20)) 177 | 178 | plot(results, annotate=c(1,3), legend="bottomright") 179 | 180 | 181 | -------------------------------------------------------------------------------- /Code/Ch06_SVM.R: -------------------------------------------------------------------------------- 1 | ## ------------------------------------------------------------------------ 2 | 3 | library(e1071) 4 | library(rpart) 5 | 6 | breast_cancer_data <- read.table("~/Dropbox/Book Writing - Drafts/Chapter Drafts/Final Artwork and Code/Chapter 6/Dataset/breast-cancer-wisconsin.data.txt",sep=",") 7 | breast_cancer_data$V11 = as.factor(breast_cancer_data$V11) 8 | 9 | summary(breast_cancer_data) 10 | 11 | 12 | ## ------------------------------------------------------------------------ 13 | 14 | ## split data into a train and test set 15 | index <- 1:nrow(breast_cancer_data) 16 | test_data_index <- sample(index, trunc(length(index)/3)) 17 | test_data <- breast_cancer_data[test_data_index,] 18 | train_data <- breast_cancer_data[-test_data_index,] 19 | 20 | 21 | ## ------------------------------------------------------------------------ 22 | 23 | svm.model <- svm(V11 ~ ., data = train_data, cost = 100, gamma = 1) 24 | 25 | 26 | ## ------------------------------------------------------------------------ 27 | 28 | library(gmodels) 29 | 30 | svm_pred_train <- predict(svm.model, train_data[,-11]) 31 | CrossTable(train_data$V11, svm_pred_train, 32 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, 33 | dnn = c('actual default', 'predicted default')) 34 | 35 | 36 | ## ------------------------------------------------------------------------ 37 | 38 | svm_pred_test <- predict(svm.model, test_data[,-11]) 39 | CrossTable(test_data$V11, svm_pred_test, 40 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, 41 | dnn = c('actual default', 'predicted default')) 42 | 43 | 44 | ## ----ch06_multiclassSVM-------------------------------------------------- 45 | # Read the house Worth Data 46 | Data_House_Worth <- read.csv("~/Dropbox/Book Writing - Drafts/Chapter Drafts/Final Artwork and Code/Chapter 6/Dataset/House Worth Data.csv",header=TRUE); 47 | 48 | library( 'e1071' ) 49 | #Fit a multiclass SVM 50 | svm_multi_model <- svm( HouseNetWorth ~ StoreArea + LawnArea, Data_House_Worth ) 51 | 52 | #Display the model 53 | svm_multi_model 54 | 55 | #get the predicted vaule for all the set 56 | res <- predict( svm_multi_model, newdata=Data_House_Worth ) 57 | 58 | #Classification Matrix 59 | table(Data_House_Worth$HouseNetWorth,res) 60 | 61 | #Classification Rate 62 | 63 | sum(diag(table(Data_House_Worth$HouseNetWorth,res)))/nrow(Data_House_Worth) 64 | 65 | -------------------------------------------------------------------------------- /Code/Ch06_TextMining.R: -------------------------------------------------------------------------------- 1 | 2 | ## ------------------------------------------------------------------------ 3 | library(data.table) 4 | 5 | fine_food_data <- read.csv("Dataset/Food_Reviews.csv",stringsAsFactors = FALSE) 6 | fine_food_data$Score <- as.factor(fine_food_data$Score) 7 | 8 | str(fine_food_data[-10]) 9 | 10 | # Last column - Custmer review in free text 11 | 12 | head(fine_food_data[,10],2) 13 | 14 | 15 | ## ---- 
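# Before the stratified sample.split() in the next chunk, it helps to look at
# how the review scores are distributed, since the split preserves these class
# proportions in the retained subset. A minimal sketch on the data loaded above:
table(fine_food_data$Score)                # review counts per star rating
prop.table(table(fine_food_data$Score))    # the proportions sample.split will keep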
message=FALSE,warning=FALSE---------------------------------------- 16 | 17 | library(caTools) 18 | 19 | # Randomly split data and use only 10% of the dataset 20 | set.seed(90) 21 | split = sample.split(fine_food_data$Score, SplitRatio = 0.10) 22 | 23 | fine_food_data = subset(fine_food_data, split == TRUE) 24 | 25 | select_col <- c("Id","HelpfulnessNumerator","HelpfulnessDenominator","Score","Summary","Text") 26 | 27 | fine_food_data_selected <- fine_food_data[,select_col] 28 | 29 | 30 | ## ------------------------------------------------------------------------ 31 | fine_food_data_selected[2,6] 32 | 33 | ## ---- warning=FALSE,message=FALSE---------------------------------------- 34 | library(LSAfun) 35 | genericSummary(fine_food_data_selected[2,6],k=1) 36 | 37 | ## ---- warning=FALSE,message=FALSE---------------------------------------- 38 | library(LSAfun) 39 | genericSummary(fine_food_data_selected[2,6],k=2) 40 | 41 | ## ------------------------------------------------------------------------ 42 | fine_food_data_selected[2,5] 43 | 44 | ## ----warning=FALSE,message=FALSE,eval=FALSE------------------------------ 45 | ## 46 | ## library(proxy) 47 | ## library(tm) 48 | ## 49 | ## fine_food_data_corpus <- VCorpus(VectorSource(fine_food_data_selected$Text)) 50 | ## 51 | ## fine_food_data_text_tdm <- TermDocumentMatrix(fine_food_data_corpus, control = list( 52 | ## tolower = TRUE, 53 | ## removeNumbers = TRUE, 54 | ## stopwords = TRUE, 55 | ## removePunctuation = TRUE, 56 | ## stemming = TRUE 57 | ## )) 58 | ## 59 | ## matrix_c <- as.matrix(fine_food_data_text_tdm) 60 | ## 61 | ## 62 | ## wc_freq <- sort(rowSums(matrix_c)) 63 | ## wc_tmdata <- data.frame(words=names(wc_freq), wc_freq) 64 | ## 65 | ## wc_tmdata <- na.omit(wc_tmdata) 66 | ## 67 | ## tail(wc_tmdata$words,100) 68 | ## 69 | ## 70 | ## 71 | ## review_diss <- proxy::dist(matrix_c, method = "cosine") 72 | ## 73 | ## docsdissim2 <- as.matrix(docsdissim) 74 | ## rownames(docsdissim2) <- titles 75 | ## colnames(docsdissim2) <- titles 76 | ## docsdissim2 77 | ## h <- hclust(docsdissim, method = "ward") 78 | ## plot(h, labels = titles, sub = "") 79 | ## 80 | 81 | ## ------------------------------------------------------------------------ 82 | library(tm) 83 | fine_food_data_corpus <- VCorpus(VectorSource(fine_food_data_selected$Text)) 84 | 85 | #Standardize the text - Pre-Processing 86 | 87 | fine_food_data_text_dtm <- DocumentTermMatrix(fine_food_data_corpus, control = list( 88 | tolower = TRUE, 89 | removeNumbers = TRUE, 90 | stopwords = TRUE, 91 | removePunctuation = TRUE, 92 | stemming = TRUE 93 | )) 94 | 95 | # save frequently-appearing terms( more than 500 times) to a character vector 96 | 97 | fine_food_data_text_freq <- findFreqTerms(fine_food_data_text_dtm, 500) 98 | 99 | # create DTMs with only the frequent terms 100 | fine_food_data_text_dtm <- fine_food_data_text_dtm[ , fine_food_data_text_freq] 101 | 102 | tm::inspect(fine_food_data_text_dtm[1:5,1:10]) 103 | 104 | #Create a tf-idf matrix 105 | fine_food_data_tfidf <- weightTfIdf(fine_food_data_text_dtm, normalize = FALSE) 106 | 107 | tm::inspect(fine_food_data_tfidf[1:5,1:10]) 108 | 109 | 110 | ## ----message=FALSE,warning=FALSE----------------------------------------- 111 | 112 | library(NLP) 113 | library(tm) 114 | 115 | 116 | fine_food_data_corpus <- Corpus(VectorSource(fine_food_data_selected$Text[1:3])) 117 | fine_food_data_cleaned <- tm_map(fine_food_data_corpus, PlainTextDocument) 118 | 119 | #tolwer 120 | fine_food_data_cleaned <- tm_map(fine_food_data_cleaned, 
tolower) 121 | fine_food_data_cleaned[[1]] 122 | 123 | fine_food_data_cleaned <-tm_map(fine_food_data_cleaned, removeWords, stopwords("english")) 124 | fine_food_data_cleaned[[1]] 125 | 126 | fine_food_data_cleaned <- tm_map(fine_food_data_cleaned, removePunctuation) 127 | fine_food_data_cleaned[[1]] 128 | 129 | fine_food_data_cleaned <- tm_map(fine_food_data_cleaned, removeNumbers) 130 | fine_food_data_cleaned[[1]] 131 | 132 | fine_food_data_cleaned <-tm_map(fine_food_data_cleaned, stripWhitespace) 133 | fine_food_data_cleaned[[1]] 134 | 135 | ## ------------------------------------------------------------------------ 136 | library(openNLP) 137 | library(NLP) 138 | 139 | fine_food_data_string <- NLP::as.String(fine_food_data_cleaned[[1]]) 140 | 141 | sent_token_annotator <- Maxent_Sent_Token_Annotator() 142 | word_token_annotator <- Maxent_Word_Token_Annotator() 143 | fine_food_data_string_an <- annotate(fine_food_data_string, list(sent_token_annotator, word_token_annotator)) 144 | 145 | pos_tag_annotator <- Maxent_POS_Tag_Annotator() 146 | fine_food_data_string_an2 <- annotate(fine_food_data_string, pos_tag_annotator, fine_food_data_string_an) 147 | 148 | ## Variant with POS tag probabilities as (additional) features. 149 | head(annotate(fine_food_data_string, Maxent_POS_Tag_Annotator(probs = TRUE), fine_food_data_string_an2)) 150 | 151 | ## Determine the distribution of POS tags for word tokens. 152 | fine_food_data_string_an2w <- subset(fine_food_data_string_an2, type == "word") 153 | tags <- sapply(fine_food_data_string_an2w$features, `[[`, "POS") 154 | table(tags) 155 | 156 | plot(table(tags), type = "h", xlab="Part-Of_Speech", ylab = "Frequency") 157 | 158 | ## Extract token/POS pairs (all of them) 159 | head(sprintf("%s/%s", fine_food_data_string[fine_food_data_string_an2w], tags),15) 160 | 161 | 162 | ## ---- warning=FALSE, message=FALSE--------------------------------------- 163 | 164 | library(SnowballC) 165 | library(wordcloud) 166 | library(tm) 167 | library(slam) 168 | 169 | fine_food_data_corpus <- VCorpus(VectorSource(fine_food_data_selected$Text)) 170 | 171 | fine_food_data_text_tdm <- TermDocumentMatrix(fine_food_data_corpus, control = list( 172 | tolower = TRUE, 173 | removeNumbers = TRUE, 174 | stopwords = TRUE, 175 | removePunctuation = TRUE, 176 | stemming = TRUE 177 | )) 178 | 179 | #reducing sparsity 180 | wc_tdm <- rollup(fine_food_data_text_tdm, 2, na.rm=TRUE, FUN = sum) 181 | matrix_c <- as.matrix(wc_tdm) 182 | wc_freq <- sort(rowSums(matrix_c)) 183 | wc_tmdata <- data.frame(words=names(wc_freq), wc_freq) 184 | 185 | wc_tmdata <- na.omit(wc_tmdata) 186 | 187 | wordcloud (tail(wc_tmdata$words,100), tail(wc_tmdata$wc_freq,100), random.order=FALSE, colors=brewer.pal(8, "Dark2")) 188 | 189 | 190 | ## ----ch06_setupTwitter_api----------------------------------------------- 191 | library("stringr") 192 | library("dplyr") 193 | #install.packages("twitteR") 194 | library("twitteR") 195 | #getTwitterOAuth(consumer_key, consumer_secret) 196 | consumerKey <- "YOUR KEY" 197 | consumerSecret <- " YOUR KEY" 198 | 199 | #Below two tokens need to be used when you want to pull tweets from your own account 200 | accessToken <- "YOUR KEY" 201 | accessTokenSecret <- "YOUR KEY" 202 | 203 | setup_twitter_oauth(consumerKey, consumerSecret,accessToken,accessTokenSecret) 204 | 205 | kIgnoreTweet <- "update:|nobot:" 206 | 207 | GetTweets <- function(handle, n = 1000) { 208 | 209 | timeline <- userTimeline(handle, n = n) 210 | tweets <- sapply(timeline, function(x) { 211 | c(x$getText(), 
x$getCreated()) 212 | }) 213 | tweets <- data.frame(t(tweets)) 214 | names(tweets) <- c("text.orig", "created.orig") 215 | 216 | tweets$text <- tolower(tweets$text.orig) 217 | tweets$created <- as.POSIXct(as.numeric(as.vector(tweets$created.orig)), origin="1970-01-01") 218 | 219 | arrange(tweets, created) 220 | } 221 | 222 | handle <- "@TimesNow" 223 | tweets <- GetTweets(handle, 100) 224 | 225 | #Store the tweets as used in the book for future reproducibility 226 | #write.csv(tweets,"Dataset/Twitter Feed From TimesNow.csv",row.names = FALSE) 227 | tweets <- read.csv("~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Dataset/Twitter Feed From TimesNow.csv") 228 | tweets[1:5,] 229 | 230 | ## ----ch06_microsoft_services--------------------------------------------- 231 | #install.packages("mscstexta4r") 232 | library(mscstexta4r) 233 | 234 | #Put the authentication APi keys you got from microsoft 235 | 236 | Sys.setenv(MSCS_TEXTANALYTICS_URL = "https://westus.api.cognitive.microsoft.com/text/analytics/v2.0/") 237 | Sys.setenv(MSCS_TEXTANALYTICS_KEY = "YOUR KEY") 238 | 239 | #Intiliatize the service 240 | textaInit() 241 | 242 | 243 | ## ----ch06_loadADocument-------------------------------------------------- 244 | # Load Packages 245 | require(tm) 246 | require(NLP) 247 | require(openNLP) 248 | 249 | #Read the Forbes article into R environment 250 | y <- paste(scan("~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Dataset/india_after_independence.txt", what="character", sep=" "),collapse=" ") 251 | 252 | convert_text_to_sentences <- function(text, lang = "en") { 253 | # Function to compute sentence annotations using the Apache OpenNLP Maxent sentence detector employing the default model for language 'en'. 254 | sentence_token_annotator <- Maxent_Sent_Token_Annotator(language = lang) 255 | 256 | # Convert text to class String from package NLP 257 | text <- as.String(text) 258 | 259 | # Sentence boundaries in text 260 | sentence.boundaries <- annotate(text, sentence_token_annotator) 261 | 262 | # Extract sentences 263 | sentences <- text[sentence.boundaries] 264 | 265 | # return sentences 266 | return(sentences) 267 | } 268 | 269 | # Convert the text into sentances 270 | article_text = convert_text_to_sentences(y, lang = "en") 271 | 272 | 273 | ## ----ch06_text_sentiment------------------------------------------------- 274 | 275 | document_lang <- rep("en", length(tweets$text)) 276 | 277 | tryCatch({ 278 | 279 | # Perform sentiment analysis 280 | output_1 <- textaSentiment( 281 | documents = tweets$text, # Input sentences or documents 282 | languages = document_lang 283 | # "en"(English, default)|"es"(Spanish)|"fr"(French)|"pt"(Portuguese) 284 | ) 285 | 286 | }, error = function(err) { 287 | 288 | # Print error 289 | geterrmessage() 290 | 291 | }) 292 | 293 | merged <- output_1$results 294 | 295 | #Order the tweets with sentment score 296 | ordered_tweets <- merged[order(merged$score),] 297 | 298 | #Top 5 negative tweets 299 | ordered_tweets[1:5,] 300 | 301 | #Top 5 Positive 302 | ordered_tweets[95:100,] 303 | 304 | ## ----ch06_text_topic_detect---------------------------------------------- 305 | 306 | handle <- "@CNN" 307 | topic_text <- GetTweets(handle, 150) 308 | #write.csv(topic_text,"Dataset/Twitter Feed from CNN.csv",row.names=FALSE) 309 | topic_text <- read.csv("~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Dataset/Twitter Feed from CNN.csv") 310 | 311 | tryCatch({ 312 | 313 | # Detect top topics in group of documents 314 | output_2 <- 
textaDetectTopics( 315 | topic_text$text, # At least 100 documents (English only) 316 | stopWords = NULL, # Stop word list (optional) 317 | topicsToExclude = NULL, # Topics to exclude (optional) 318 | minDocumentsPerWord = NULL, # Threshold to exclude rare topics (optional) 319 | maxDocumentsPerWord = NULL, # Threshold to exclude ubiquitous topics (optional) 320 | resultsPollInterval = 30L, # Poll interval (in s, default: 30s, use 0L for async) 321 | resultsTimeout = 1200L, # Give up timeout (in s, default: 1200s = 20mn) 322 | verbose = FALSE # If set to TRUE, print every poll status to stdout 323 | ) 324 | 325 | }, error = function(err) { 326 | 327 | # Print error 328 | geterrmessage() 329 | 330 | }) 331 | output_2 332 | 333 | ## ----ch06_lang_detect---------------------------------------------------- 334 | #Below i am creating 5 messgaes in 5 different langauge using google translater 335 | #1-ARABIC, 2-POTUGESE, 3- ENGLISH , 4- CHINESE AND 5 - HINDI 336 | 337 | lang_detect<- c("أنا عالم البيانات","Eu sou um cientista de dados","I am a data scientist","我是一个科学家的数据"," 338 | मैं एक डेटा वैज्ञानिक हूँ") 339 | 340 | 341 | tryCatch({ 342 | 343 | # Detect top topics in group of documents 344 | # Detect languages used in documents 345 | output_3 <- textaDetectLanguages( 346 | lang_detect, # Input sentences or documents 347 | numberOfLanguagesToDetect = 1L # Default: 1L 348 | ) 349 | 350 | }, error = function(err) { 351 | 352 | # Print error 353 | geterrmessage() 354 | 355 | }) 356 | output_3 357 | 358 | ## ----ch06_text_summarization--------------------------------------------- 359 | 360 | article_lang <- rep("en", length(article_text)) 361 | 362 | tryCatch({ 363 | 364 | # Get key talking points in documents 365 | output_4 <- textaKeyPhrases( 366 | documents = article_text, # Input sentences or documents 367 | languages = article_lang 368 | # "en"(English, default)|"de"(German)|"es"(Spanish)|"fr"(French)|"ja"(Japanese) 369 | ) 370 | 371 | }, error = function(err) { 372 | 373 | # Print error 374 | geterrmessage() 375 | 376 | }) 377 | 378 | #Print the top 5 summary 379 | output_4$results[1:5,1] 380 | 381 | 382 | -------------------------------------------------------------------------------- /Code/Ch07_Singh.R: -------------------------------------------------------------------------------- 1 | ## ------------------------------------------------------------------------ 2 | library(knitr) 3 | opts_chunk$set(dev="png", 4 | dev.args=list(type="cairo"), 5 | dpi=96) 6 | knitr::opts_chunk$set( 7 | fig.path = "Images/" 8 | ) 9 | 10 | ## ----chap7_Data_load, warnings=FALSE, message=FALSE---------------------- 11 | setwd("C:/Personal/Machine Learning/Final Artwork and Code/Chapter 7"); 12 | 13 | library(data.table) 14 | 15 | Data_House_Price <- fread("Dataset/House Sale Price Dataset.csv",header=T, verbose = FALSE, showProgress = FALSE) 16 | 17 | str(Data_House_Price) 18 | 19 | 20 | ## ----chap7_Data_load_01, warning=FALSE,message=FALSE--------------------- 21 | 22 | dim(Data_House_Price) 23 | 24 | #Check the distribution of dependent variable 25 | hist(Data_House_Price$HousePrice/1000000, breaks=20, col="blue", xlab="House Sale Price(Million)", 26 | main="Distribution of House Sale Price") 27 | 28 | #Also look at the summary of the Dependent Variable 29 | summary(Data_House_Price$HousePrice) 30 | 31 | #Pulling out relevant columns and assigning required fields in the dataset 32 | Data_House_Price <- Data_House_Price[,.(HOUSE_ID,HousePrice,StoreArea,StreetHouseFront,BasementArea,LawnArea,Rating,SaleType)] 
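# Before the na.omit() on the next lines, it is worth checking how much data the
# listwise deletion will cost. A minimal sketch on the columns just selected:
colSums(is.na(Data_House_Price))           # NA count per retained column
mean(!complete.cases(Data_House_Price))    # fraction of rows na.omit() would drop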
33 | 34 | #Omit Anu missing value 35 | Data_House_Price <- na.omit(Data_House_Price) 36 | 37 | Data_House_Price$HOUSE_ID <- as.character(Data_House_Price$HOUSE_ID) 38 | 39 | 40 | ## ----chap7_Data_load_03, warning=FALSE,message=FALSE--------------------- 41 | 42 | Data_Purchase <- fread("Dataset/Purchase Prediction Dataset.csv",header=T, verbose = FALSE, showProgress = FALSE) 43 | str(Data_Purchase) 44 | 45 | ## ----chap7_Data_load_02, warning=FALSE,message=FALSE--------------------- 46 | 47 | dim(Data_Purchase); 48 | 49 | #Check the distribution of data before grouping 50 | table(Data_Purchase$ProductChoice) 51 | barplot(table(Data_Purchase$ProductChoice),main="Distribution of ProductChoice", xlab= "ProductChoice Options", col="Blue") 52 | 53 | 54 | #Pulling out only the relevant data to this chapter 55 | 56 | Data_Purchase <- Data_Purchase[,.(CUSTOMER_ID,ProductChoice,MembershipPoints,IncomeClass,CustomerPropensity,LastPurchaseDuration)] 57 | 58 | #Delete NA from subset 59 | 60 | Data_Purchase <- na.omit(Data_Purchase) 61 | 62 | Data_Purchase$CUSTOMER_ID <- as.character(Data_Purchase$CUSTOMER_ID) 63 | 64 | 65 | ## ----ch07_population_stability_01, warning=FALSE,message=FALSE----------- 66 | #Create set 1 and set 2 : First 2/3 as set 1 and remaining 1/3 as set 2 67 | summary(Data_House_Price$HousePrice) 68 | 69 | set_1 <- Data_House_Price[1:floor(nrow(Data_House_Price)*(2/3)),]$HousePrice 70 | summary(set_1) 71 | 72 | set_2 <- Data_House_Price[floor(nrow(Data_House_Price)*(2/3) + 1):nrow(Data_House_Price),]$HousePrice 73 | summary(set_2) 74 | 75 | ## ----ch07_population_stability_02, warning=FALSE,message=FALSE----------- 76 | #Defining a function to give ks test result and ECDF plots on log scale 77 | library(rgr) 78 | ks_test <- function (xx1, xx2, xlab = "House Price", x1lab = deparse(substitute(xx1)),x2lab = deparse(substitute(xx2)), ylab = "Empirical Cumulative Distribution Function",log = TRUE, main = "Empirical EDF Plots - K-S Test", pch1 = 3, col1 = 2, pch2 = 4, col2 = 4, cex = 0.8, cexp = 0.9, ...) 79 | { 80 | temp.x <- remove.na(xx1) 81 | x1 <- sort(temp.x$x[1:temp.x$n]) 82 | nx1 <- temp.x$n 83 | y1 <- ((1:nx1) - 0.5)/nx1 84 | temp.x <- remove.na(xx2) 85 | x2 <- sort(temp.x$x[1:temp.x$n]) 86 | nx2 <- temp.x$n 87 | y2 <- ((1:nx2) - 0.5)/nx2 88 | xlim <- range(c(x1, x2)) 89 | if (log) { 90 | logx <- "x" 91 | if (xlim[1] <= 0) 92 | stop("\n Values cannot be .le. zero for a log plot\n") 93 | } 94 | else logx <- "" 95 | plot(x1, y1, log = logx, xlim = xlim, xlab = xlab, ylab = ylab, 96 | main = main, type = "n", ...) 
97 | points(x1, y1, pch = pch1, col = col1, cex = cexp) 98 | points(x2, y2, pch = pch2, col = col2, cex = cexp) 99 | temp <- ks.test(x1, x2) 100 | print(temp) 101 | } 102 | 103 | ## ----ch07_population_stability_03, warning=FALSE,message=FALSE----------- 104 | #Perform K-S test on set_1 and set_2 and also display Empirical Cumulative Distribution Plots 105 | ks_test(set_1,set_2) 106 | 107 | ## ----ch07_population_stability_04, warning=FALSE,message=FALSE----------- 108 | #Manipulate set 2 109 | set_2_new <- set_2*exp(set_2/100000) 110 | 111 | # Now do the K-S test again 112 | ks_test(set_1,set_2_new) 113 | 114 | ## ----ch07_population_stability_05, warning=FALSE,message=FALSE----------- 115 | #Let's create set 1 and set 2 from our Purchase Prediction Data 116 | print("Distribution of ProductChoice values before partition") 117 | table(Data_Purchase$ProductChoice) 118 | 119 | set_1 <- Data_Purchase[1:floor(nrow(Data_Purchase)*(2/3)),]$ProductChoice 120 | table(set_1) 121 | 122 | set_2 <- Data_Purchase[floor(nrow(Data_Purchase)*(2/3) + 1):nrow(Data_Purchase),]$ProductChoice 123 | table(set_2) 124 | 125 | ## ----ch07_population_stability_06, warning=FALSE,message=FALSE----------- 126 | #PSI = sum over i of ((n1i/N1) - (n2i/N2)) * ln((n1i/N1)/(n2i/N2)) 127 | 128 | temp1 <- (table(set_1)/length(set_1) - table(set_2)/length(set_2)) 129 | 130 | temp2 <- log((table(set_1)/length(set_1))/(table(set_2)/length(set_2))) 131 | 132 | psi <- abs(sum(temp1*temp2)) 133 | 134 | if(psi < 0.1 ){ 135 | cat("The population is stable with a PSI of " ,psi) 136 | } else if (psi >=0.1 & psi <= 0.2) { 137 | cat("The population needs further investigation with a PSI of " ,psi) 138 | } else { 139 | cat("The population has gone through significant changes with a PSI of " ,psi) 140 | } 141 | 142 | 143 | ## ----ch07_Linear_reg_model_01, warning=FALSE,message=FALSE--------------- 144 | # Create a model on Set 1 = Train data 145 | 146 | 147 | linear_reg_model <- lm(HousePrice ~ StoreArea + StreetHouseFront + BasementArea + LawnArea + Rating + SaleType ,data=Data_House_Price[1:floor(nrow(Data_House_Price)*(2/3)),]) 148 | 149 | summary(linear_reg_model) 150 | 151 | ## ----ch07_mae_01, warning=FALSE,message=FALSE---------------------------- 152 | #Create the test data which is set 2 153 | test <- Data_House_Price[floor(nrow(Data_House_Price)*(2/3) + 1):nrow(Data_House_Price),] 154 | 155 | #Fit the linear regression model on this and get predicted values 156 | 157 | predicted_lm <- predict(linear_reg_model,test, type= "response") 158 | 159 | actual_predicted <- as.data.frame(cbind(as.numeric(test$HOUSE_ID),as.numeric(test$HousePrice),as.numeric(predicted_lm))) 160 | 161 | names(actual_predicted) <- c("HOUSE_ID","Actual","Predicted") 162 | 163 | #Find the absolute residuals and then take their mean 164 | library(ggplot2) 165 | 166 | #Plot Actual vs Predicted values for Test Cases 167 | ggplot(actual_predicted,aes(x = actual_predicted$HOUSE_ID,color=Series)) + 168 | geom_line(data = actual_predicted, aes(x = actual_predicted$HOUSE_ID, y = Actual, color = "Actual")) + 169 | geom_line(data = actual_predicted, aes(x = actual_predicted$HOUSE_ID, y = Predicted, color = "Predicted")) + xlab('HOUSE_ID') + ylab('House Sale Price') 170 | 171 | 172 | ## ----ch07_mae_02, warning=FALSE,message=FALSE---------------------------- 173 | #Remove NA from test, as we have not done any treatment for NA 174 | actual_predicted <- na.omit(actual_predicted) 175 | 176 | #First take Actual - Predicted, then take the mean of the absolute errors (residuals) 177 | 178 | mae 
<- sum(abs(actual_predicted$Actual - actual_predicted$Predicted))/nrow(actual_predicted) 179 | 180 | cat("Mean Absolute Error for the test case is ", mae) 181 | 182 | ## ----ch07_rmse_01, warning=FALSE,message=FALSE--------------------------- 183 | #As we have already have actual and predicted value we can directly calculate the RSME value 184 | 185 | rmse <- sqrt(sum((actual_predicted$Actual-actual_predicted$Predicted)^2)/nrow(actual_predicted)) 186 | 187 | cat("Root Mean Square Error for the test case is ", rmse) 188 | 189 | ## ----ch07_r_sqr_01, warning=FALSE,message=FALSE-------------------------- 190 | #Model training data ( we will show our analysis on this dataset) 191 | 192 | train <- Data_House_Price[1:floor(nrow(Data_House_Price)*(2/3)),.(HousePrice,StoreArea,StreetHouseFront,BasementArea,LawnArea,StreetHouseFront,LawnArea,Rating,SaleType)]; 193 | 194 | #Omiting the NA from dataset 195 | 196 | train <- na.omit(train) 197 | 198 | # Get a linear regression model 199 | linear_reg_model <- lm(HousePrice ~ StoreArea + StreetHouseFront + BasementArea + LawnArea + StreetHouseFront + LawnArea + Rating + SaleType ,data=train) 200 | 201 | # Show the function call to identify what model we will be working on 202 | 203 | print(linear_reg_model$call) 204 | 205 | #System generated Square value 206 | cat("The system generated R square value is " , summary(linear_reg_model)$r.squared) 207 | 208 | ## ----ch07_r_sqr_02, warning=FALSE,message=FALSE-------------------------- 209 | #calculate Total Sum of Squares 210 | 211 | SST <- sum((train$HousePrice - mean(train$HousePrice))^2); 212 | 213 | #Calculate Regression Sum of Squares 214 | 215 | SSR <- sum((linear_reg_model$fitted.values - mean(train$HousePrice))^2); 216 | 217 | #Calulate residual(Error) Sum of Squares 218 | 219 | SSE <- sum((train$HousePrice - linear_reg_model$fitted.values)^2); 220 | 221 | 222 | ## ----ch07_r_sqr_03, warning=FALSE,message=FALSE-------------------------- 223 | #calulate R-squared 224 | 225 | R_Sqr <- 1- (SSE/SST) 226 | 227 | #Dipslay the calulated R-Sqr 228 | 229 | cat("The calculated R Square is ", R_Sqr) 230 | 231 | ## ----ch07_classi_01, warning=FALSE,message=FALSE------------------------- 232 | #Remove the data having NA. 
NA is ignored in modeling algorithms 233 | Data_Purchase<-na.omit(Data_Purchase) 234 | 235 | #Sample equal sizes from Data_Purchase to reduce class imbalance issue 236 | library(splitstackshape) 237 | Data_Purchase_Model<-stratified(Data_Purchase, group=c("ProductChoice"),size=10000,replace=FALSE) 238 | 239 | print("The Distribution of equal classes is as below") 240 | table(Data_Purchase_Model$ProductChoice) 241 | 242 | #Build the multinomial model on Train Data (Set_1) and then test data (Set_2) will be used for performance testing 243 | set.seed(917); 244 | train <- Data_Purchase_Model[sample(nrow(Data_Purchase_Model),size=nrow(Data_Purchase_Model)*(0.7), replace = TRUE, prob = NULL),] 245 | dim(train) 246 | 247 | test <- Data_Purchase_Model[!(Data_Purchase_Model$CUSTOMER_ID %in% train$CUSTOMER_ID),] 248 | dim(test) 249 | 250 | #fit a multinomial logictic model 251 | library(nnet) 252 | mnl_model <- multinom (ProductChoice ~ MembershipPoints + IncomeClass + CustomerPropensity + LastPurchaseDuration, data = train) 253 | 254 | #Display the summary of model statistics 255 | mnl_model 256 | 257 | #Predict the probabilitilies 258 | predicted_test <- as.data.frame(predict(mnl_model, newdata = test, type= "probs")) 259 | 260 | #Display the predcited probabilities 261 | head(predicted_test) 262 | 263 | #Do the prediction based in highest probability 264 | test_result <- apply(predicted_test,1,which.max) 265 | 266 | table(test_result) 267 | #Combine to get predicted and actuals at one place 268 | 269 | result <- as.data.frame(cbind(test$ProductChoice,test_result)) 270 | 271 | colnames(result) <- c("Actual Class", "Predicted Class") 272 | 273 | head(result) 274 | 275 | ## ----ch07_classi_02, warning=FALSE,message=FALSE------------------------- 276 | #Create the classification matrix 277 | cmat <- as.matrix(table(Actual = result$`Actual Class`, Predicted = result$`Predicted Class`)) 278 | 279 | #Calculated above mentioned measures in order 280 | n <- sum(cmat) ; 281 | cat("Number of Cases ", n); 282 | 283 | nclass <- nrow(cmat); 284 | cat("Number of classes ", nclass); 285 | 286 | correct_class <- diag(cmat); 287 | cat("Number of Correct Classification ", correct_class); 288 | 289 | rowsums <- apply(cmat, 1, sum); 290 | cat("Number of Instances per class ", rowsums); 291 | 292 | colsums <- apply(cmat, 2, sum); 293 | cat("Number of Instances per predicted class ", colsums); 294 | 295 | actual_dist <- rowsums / n; 296 | cat("Distribution of actuals ", actual_dist); 297 | 298 | predict_dist <- colsums / n; 299 | cat("Distribution of predicted ", predict_dist); 300 | 301 | ## ----ch07_classi_03, warning=FALSE,message=FALSE------------------------- 302 | #print the classification matrix - on test data 303 | 304 | print(cmat) 305 | 306 | #Print Classification Rate 307 | 308 | classification_rate <- sum(correct_class)/n; 309 | print(classification_rate) 310 | 311 | ## ----ch07_classi_05, warning=FALSE,message=FALSE------------------------- 312 | #The analysis is shown for ProductChoice == 1 313 | Actual_Class <- ifelse(result$`Actual Class` == 1,"One","Rest"); 314 | Predicted_Class <- ifelse(result$`Predicted Class` == 1, "One", "Rest"); 315 | 316 | ss_analysis <- as.data.frame(cbind(Actual_Class,Predicted_Class)); 317 | 318 | #Create classification matrix for ProductChoice == 1 319 | 320 | cmat_ProductChoice1 <- as.matrix(table(Actual = ss_analysis$Actual_Class, Predicted = ss_analysis$Predicted_Class)); 321 | 322 | print(cmat_ProductChoice1) 323 | 324 | classification_rate_ProductChoice1 <- 
sum(diag(cmat_ProductChoice1))/n; 325 | 326 | cat("Classification rate for ProductChoice 1 is ", classification_rate_ProductChoice1) 327 | 328 | # Calculate TPR and TNR 329 | 330 | TPR <- cmat_ProductChoice1[1,1]/(cmat_ProductChoice1[1,1] + cmat_ProductChoice1[1,2]); 331 | 332 | cat(" Sensitivity or True Positive Rate is ", TPR); 333 | 334 | TNR <- cmat_ProductChoice1[2,2]/(cmat_ProductChoice1[2,1] + cmat_ProductChoice1[2,2]) 335 | 336 | cat(" Specificity or True Negative Rate is ", TNR); 337 | 338 | 339 | ## ----ch07_classi_06, warning=FALSE,message=FALSE------------------------- 340 | # create a new model on train with "One" =1 and "Rest" = 0 341 | 342 | train$ProductChoice_binom <- ifelse(train$ProductChoice == 1,1,0); 343 | test$ProductChoice_binom <- ifelse(test$ProductChoice == 1,1,0); 344 | 345 | glm_ProductChoice_binom <- glm( ProductChoice_binom ~ MembershipPoints + IncomeClass + CustomerPropensity + LastPurchaseDuration, data=train, family = binomial(link="logit")) 346 | 347 | #Print the summary of binomial logistic model 348 | summary(glm_ProductChoice_binom) 349 | 350 | #Now create the performance data set to create AUC curve 351 | library(ROCR) 352 | test_binom <- predict(glm_ProductChoice_binom,newdata=test, type = "response") 353 | pred <- prediction(test_binom, test$ProductChoice_binom) 354 | perf <- performance(pred,"tpr","fpr") 355 | 356 | # calculating AUC 357 | auc <- unlist(slot(performance(pred,"auc"),"y.values")); 358 | 359 | cat("The Area Under ROC curve for this model is ",auc); 360 | 361 | #Plotting the AUC curve 362 | library(ggplot2) 363 | library(plotROC) 364 | debug <- as.data.frame(cbind(test_binom,test$ProductChoice_binom)) 365 | ggplot(debug, aes(d = V2, m = test_binom)) + geom_roc() 366 | 367 | ## ----ch07_prob_01, warning=FALSE,message=FALSE--------------------------- 368 | library(caret) 369 | library(randomForest) 370 | set.seed(917); 371 | 372 | #Model training data ( we will show our analysis on this dataset) 373 | 374 | train <- Data_House_Price[1:floor(nrow(Data_House_Price)*(2/3)),.(HousePrice,StoreArea,StreetHouseFront,BasementArea,LawnArea,StreetHouseFront,LawnArea,Rating,SaleType)]; 375 | 376 | #Create the test data which is set 2 377 | test <- Data_House_Price[floor(nrow(Data_House_Price)*(2/3) + 1):nrow(Data_House_Price),.(HousePrice,StoreArea,StreetHouseFront,BasementArea,LawnArea,StreetHouseFront,LawnArea,Rating,SaleType)] 378 | 379 | #Omiting the NA from dataset 380 | train <- na.omit(train) 381 | test <- na.omit(test) 382 | 383 | #Create the k subsets, let's take k as 10 (i.e., 10-fold cross validation) 384 | k_10_fold <- trainControl(method = "repeatedcv", number = 10, savePredictions = TRUE) 385 | 386 | #Fit the model on folds and use rmse as metric to fit the model 387 | model_fitted <- train(HousePrice ~ StoreArea + StreetHouseFront + BasementArea + LawnArea + StreetHouseFront + LawnArea + Rating + SaleType, data=train, family = identity,trControl = k_10_fold, tuneLength = 5) 388 | 389 | #Display the summary of the cross validation 390 | model_fitted 391 | 392 | ## ----ch07_prob_02, warning=FALSE,message=FALSE--------------------------- 393 | 394 | #Create the the boot experiment, let's take samples as as 10 (i.e., 10-sample bootstarped) 395 | boot_10s <- trainControl(method = "boot", number = 10, savePredictions = TRUE) 396 | 397 | #Fit the model on bootstraps and use rmse as metric to fit the model 398 | model_fitted <- train(HousePrice ~ StoreArea + StreetHouseFront + BasementArea + LawnArea + StreetHouseFront + LawnArea + Rating + 
SaleType, data=train, family = identity,trControl = boot_10s, tuneLength = 5) 399 | 400 | #Display the summary of the boostraped model 401 | model_fitted 402 | 403 | ## ----ch07_kappa, warning=FALSE,message=FALSE----------------------------- 404 | library(caret) 405 | library(mlbench) 406 | 407 | #We will use the Purchase Prediction Data with a very simple model to illustarte the kappa and accuracy measure 408 | 409 | set.seed(917); 410 | train_kappa <- Data_Purchase_Model[sample(nrow(Data_Purchase_Model),size=5000, replace = TRUE, prob = NULL),] 411 | 412 | #train() function confuses between numeric levels, hence convert the dependent into text 413 | train_kappa$ProductChoice_multi <- ifelse(train_kappa$ProductChoice == 1,"A", 414 | ifelse(train_kappa$ProductChoice == 2, "B", 415 | ifelse(train_kappa$ProductChoice == 3,"C","D"))); 416 | 417 | train_kappa <- na.omit(train_kappa) 418 | #Set the experiment 419 | cntrl <- trainControl(method="cv", number=5, classProbs = TRUE) 420 | 421 | #Distribution of ProductChoices 422 | table(train_kappa$ProductChoice_multi) 423 | 424 | #Making the column names as legitemate names 425 | colnames(train_kappa) <- make.names(names(train_kappa), unique = TRUE, allow_ = TRUE) 426 | 427 | #Convert all the factors into factors in R 428 | train_kappa$ProductChoice_multi <- as.factor(train_kappa$ProductChoice_multi) 429 | train_kappa$CustomerPropensity <- as.factor(train_kappa$CustomerPropensity) 430 | train_kappa$LastPurchaseDuration <- as.factor(train_kappa$LastPurchaseDuration) 431 | 432 | #Fit the model with method as RandomForest. 433 | model_fitted <- train(ProductChoice_multi ~ CustomerPropensity + LastPurchaseDuration, data=train_kappa, method="rf", metric="Accuracy",trControl=cntrl) 434 | 435 | # The result displayed the kappa metrics 436 | print(model_fitted) 437 | 438 | #Display the predicted values 439 | pred <- predict(model_fitted, newdata=train_kappa) 440 | confusionMatrix(data=pred, train_kappa$ProductChoice_multi) 441 | 442 | -------------------------------------------------------------------------------- /Code/Ch08_Singh.R: -------------------------------------------------------------------------------- 1 | ## ------------------------------------------------------------------------ 2 | library(knitr) 3 | opts_chunk$set(dev="png", 4 | dev.args=list(type="cairo"), 5 | dpi=96) 6 | knitr::opts_chunk$set( 7 | fig.path = "Images/" 8 | ) 9 | 10 | ## ----caret_install, eval=FALSE------------------------------------------- 11 | ## install.packages("caret", dependencies = c("Depends", "Suggests")) 12 | 13 | ## ----caret_01, eval= FALSE----------------------------------------------- 14 | ## rfControl <- trainControl(# Example, 10-fold Cross Validation 15 | ## method = "repeatedcv", # Others are available, such as repeated K-fold cross-validation, leave-one-out etc 16 | ## number = 10, # Number of folds 17 | ## repeats = 10 # repeated ten times 18 | ## ) 19 | 20 | ## ----caret_02, eval= FALSE----------------------------------------------- 21 | ## set.seed(917) 22 | ## randomForectFit1 <- train(Class ~ ., # Define the model equation 23 | ## data = training, # Define the modeling data 24 | ## method = "rf", # List the model you want to use, caret provide list of options in train Model listor 25 | ## trControl = rfControl, # This defines the conditions on how to control the training 26 | ## ... 
) # Other options specific to the modeling technique 27 | ## randomForectFit1 28 | 29 | ## ----ch08_hyperparameter_01, warning=FALSE------------------------------- 30 | setwd("C:/Personal/Machine Learning/Final Artwork and Code/Chapter 8"); 31 | library(caret) 32 | library(randomForest) 33 | set.seed(917); 34 | # Load Dataset 35 | Purchase_Data <- read.csv("Dataset/Purchase Prediction Dataset.csv",header=TRUE) 36 | 37 | #Remove the missing values 38 | data <- na.omit(Purchase_Data) 39 | 40 | #Pick a sample of records 41 | Data <- data[sample(nrow(data),size=10000),] 42 | 43 | #Model 1 with tree size = 20 44 | fit_20 <- randomForest(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, 45 | data=Data, 46 | importance=TRUE, 47 | ntree=20) 48 | #Print the result for ntree=20 49 | print(fit_20) 50 | 51 | 52 | #Model 1 with tree size = 50 53 | fit_50 <- randomForest(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, 54 | data=Data, 55 | importance=TRUE, 56 | ntree=50) 57 | #Print the result for ntree=50 58 | print(fit_50) 59 | 60 | 61 | ## ----ch08_hyper_01, warning=FALSE---------------------------------------- 62 | # Manually search parametres 63 | library(data.table) 64 | # load the packages 65 | library(randomForest) 66 | library(mlbench) 67 | library(caret) 68 | # Load Dataset 69 | 70 | dataset <- Data 71 | metric <- "Accuracy" 72 | # Manual Search 73 | trainControl <- trainControl(method="repeatedcv", number=10, repeats=3, search="grid") 74 | tunegrid <- expand.grid(.mtry=c(sqrt(ncol(dataset)-2))) 75 | modellist <- list() 76 | for (ntree in c(100, 150, 200, 250)) { 77 | set.seed(917); 78 | fit <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method="rf", metric=metric, tuneGrid=tunegrid, trControl=trainControl, ntree=ntree) 79 | key <- toString(ntree) 80 | modellist[[key]] <- fit 81 | } 82 | # compare results by resampling 83 | results <- resamples(modellist) 84 | #Summary of Results 85 | summary(results) 86 | #Dot Plot of results 87 | dotplot(results) 88 | 89 | ## ----ch08_hyper_02,warning=FALSE----------------------------------------- 90 | # Tune algorithm parameters using a manual grid search. 91 | seed <- 917; 92 | dataset <- Data 93 | # prepare training scheme 94 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 95 | # design the parameter tuning grid 96 | grid <- expand.grid(size=c(5,10,20,50), k=c(1,2,3,4,5)) 97 | # train the model 98 | model <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method="lvq", trControl=control, tuneGrid=grid) 99 | # summarize the model 100 | print(model) 101 | # plot the effect of parameters on accuracy 102 | plot(model) 103 | 104 | ## ----ch08_hyper_03,warning=FALSE----------------------------------------- 105 | # Tune algorithm parameters using an automatic grid search. 
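# For reference, the manual grid defined above expands into every size/k
# combination, whereas the chunk that follows lets caret build its own grid via
# tuneLength. A small sketch inspecting such a grid (same parameter values as
# above, shown purely for illustration):
grid_demo <- expand.grid(size = c(5, 10, 20, 50), k = 1:5)
dim(grid_demo)    # 20 candidate (size, k) pairs
head(grid_demo)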
106 | set.seed(917); 107 | dataset <- Data 108 | 109 | # prepare training scheme 110 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 111 | # train the model 112 | model <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method="lvq", trControl=control, tuneLength=5) 113 | # summarize the model 114 | print(model) 115 | # plot the effect of parameters on accuracy 116 | plot(model) 117 | 118 | ## ----ch08_hyper_04,warning=FALSE----------------------------------------- 119 | # Select the best tuning configuration 120 | dataset <- Data 121 | 122 | # prepare training scheme 123 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 124 | # CART 125 | set.seed(917); 126 | tunegrid <- expand.grid(.cp=seq(0,0.1,by=0.01)) 127 | fit.cart <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method="rpart", metric="Accuracy", tuneGrid=tunegrid, trControl=control) 128 | fit.cart 129 | # display the best configuration 130 | print(fit.cart$bestTune) 131 | 132 | plot(fit.cart) 133 | 134 | ## ----ch08_hyper_05,warning=FALSE----------------------------------------- 135 | 136 | # Randomly search algorithm parameters 137 | 138 | # Select the best tuning configuration 139 | dataset <- Data 140 | 141 | # prepare training scheme 142 | control <- trainControl(method="repeatedcv", number=10, repeats=3, search="random") 143 | # train the model 144 | model <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method="rf", trControl=control) 145 | # summarize the model 146 | print(model) 147 | # plot the effect of parameters on accuracy 148 | plot(model) 149 | 150 | ## ----ch08_hyper_06,warning=FALSE----------------------------------------- 151 | library(caret) 152 | library(randomForest) 153 | library(class) 154 | # Load Dataset 155 | Purchase_Data <- read.csv("Dataset/Purchase Prediction Dataset.csv",header=TRUE) 156 | 157 | data <- na.omit(Purchase_Data) 158 | 159 | #Create a sample of 10K records 160 | set.seed(917); 161 | Data <- data[sample(nrow(data),size=10000),] 162 | # Select the best tuning configuration 163 | dataset <- Data 164 | 165 | # Customer Parameter Search 166 | 167 | # load the packages 168 | library(randomForest) 169 | library(mlbench) 170 | library(caret) 171 | 172 | # define the custom caret algorithm (wrapper for Random Forest) 173 | customRF <- list(type="Classification", library="randomForest", loop=NULL) 174 | customRF$parameters <- data.frame(parameter=c("mtry", "ntree"), class=rep("numeric", 2), label=c("mtry", "ntree")) 175 | customRF$grid <- function(x, y, len=NULL, search="grid") {} 176 | customRF$fit <- function(x, y, wts, param, lev, last, weights, classProbs, ...) { 177 | randomForest(x, y, mtry=param$mtry, ntree=param$ntree, ...) 
178 | } 179 | customRF$predict <- function(modelFit, newdata, preProc=NULL, submodels=NULL) { predict(modelFit, newdata)} 180 | customRF$prob <- function(modelFit, newdata, preProc=NULL, submodels=NULL) { predict(modelFit, newdata, type = "prob")} 181 | customRF$sort <- function(x){ x[order(x[,1]),]} 182 | customRF$levels <- function(x) {x$classes} 183 | 184 | # Load Dataset 185 | 186 | dataset <- Data 187 | 188 | metric <- "Accuracy" 189 | 190 | # train model 191 | trainControl <- trainControl(method="repeatedcv", number=10, repeats=3) 192 | tunegrid <- expand.grid(.mtry=c(1:4), .ntree=c(100, 150, 200, 250)) 193 | set.seed(917) 194 | custom <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method=customRF, metric=metric, tuneGrid=tunegrid, trControl=trainControl) 195 | print(custom) 196 | plot(custom) 197 | 198 | 199 | ## ----ch08_bias_vs_variance,warning=FALSE--------------------------------- 200 | mu <- 2 201 | Z <- rnorm(20000, mu) 202 | 203 | MSE <- function(estimate, mu) { 204 | return(sum((estimate - mu)^2) / length(estimate)) 205 | } 206 | 207 | n <- 100 208 | shrink <- seq(0,0.5, length=n) 209 | mse <- numeric(n) 210 | bias <- numeric(n) 211 | variance <- numeric(n) 212 | 213 | for (i in 1:n) { 214 | mse[i] <- MSE((1 - shrink[i]) * Z, mu) 215 | bias[i] <- mu * shrink[i] 216 | variance[i] <- (1 - shrink[i])^2 217 | } 218 | 219 | # Bias-Variance tradeoff plot 220 | 221 | plot(shrink, mse, xlab='Shrinkage', ylab='MSE', type='l', col='pink', lwd=3, lty=1, ylim=c(0,1.2)) 222 | lines(shrink, bias^2, col='green', lwd=3, lty=2) 223 | lines(shrink, variance, col='red', lwd=3, lty=2) 224 | legend(0.02,0.6, c('Bias^2', 'Variance', 'MSE'), col=c('green', 'red', 'pink'), lwd=rep(3,3), lty=c(2,2,1)) 225 | 226 | ## ----ch08_bagging, warning= FALSE---------------------------------------- 227 | library(caret) 228 | library(randomForest) 229 | library(class) 230 | library(ipred) 231 | # Load Dataset 232 | Purchase_Data <- read.csv("Dataset/Purchase Prediction Dataset.csv",header=TRUE) 233 | 234 | data <- na.omit(Purchase_Data) 235 | 236 | #Create a sample of 10K records 237 | set.seed(917); 238 | Data <- data[sample(nrow(data),size=10000),] 239 | # Select the best tuning configuration 240 | dataset <- Data 241 | # Example of Bagging algorithms 242 | control <- trainControl(method="repeatedcv", number=10, repeats=3) 243 | metric <- "Accuracy" 244 | # Bagged CART 245 | set.seed(917) 246 | fit.treebag <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method="treebag", metric=metric, trControl=control) 247 | # Random Forest 248 | set.seed(917) 249 | fit.rf <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method="rf", metric=metric, trControl=control) 250 | # summarize results 251 | bagging_results <- resamples(list(treebag=fit.treebag, rf=fit.rf)) 252 | summary(bagging_results) 253 | dotplot(bagging_results) 254 | 255 | 256 | ## ----ch08_boosting, warning=FALSE---------------------------------------- 257 | # Load Dataset 258 | Purchase_Data <- read.csv("Dataset/Purchase Prediction Dataset.csv",header=TRUE) 259 | 260 | data <- na.omit(Purchase_Data) 261 | 262 | #Create a sample of 10K records 263 | set.seed(917); 264 | Data <- data[sample(nrow(data),size=10000),] 265 | # Select the best tuning configuration 266 | dataset <- Data 267 | 
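# A resamples() comparison such as bagging_results above (and boosting_results
# later in this chunk) can also be summarised numerically instead of with
# dotplot(). A minimal sketch, assuming the bagging comparison object is still
# in memory:
summary(bagging_results)$statistics$Accuracy   # accuracy quartiles per bagged model
summary(diff(bagging_results))                 # paired differences between the two fits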
library(caret)
library(C50)
library(gbm)
dataset <- Data;
# Example of Boosting Algorithms
control <- trainControl(method="repeatedcv", number=10, repeats=3)
metric <- "Accuracy"
# C5.0
set.seed(917)
fit.c50 <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method="C5.0", metric=metric, trControl=control)
fit.c50
plot(fit.c50)
# Stochastic Gradient Boosting
set.seed(917)
fit.gbm <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method="gbm", metric=metric, trControl=control, verbose=FALSE)
fit.gbm
plot(fit.gbm)

# summarize results
boosting_results <- resamples(list(c5.0=fit.c50, gbm=fit.gbm))
summary(boosting_results)
dotplot(boosting_results)

## ----ch08_blending, warning=FALSE----------------------------------------
# Blending (linear combination of models)

# load libraries
library(caret)
library(caretEnsemble)
library(MASS)

#dataset$choice <- ifelse(dataset$ProductChoice == 1,"A",ifelse(dataset$ProductChoice == 2,"B",ifelse(dataset$ProductChoice == 3, "C","D")))
set.seed(917);
Data <- data[sample(nrow(data),size=10000),];

dataset <- Data;

dataset$choice <- ifelse(dataset$ProductChoice == 1 | dataset$ProductChoice == 2, "A", "B")

dataset$choice <- as.factor(dataset$choice)
# define training control
train_control <- trainControl(method="cv", number=4, savePredictions=TRUE, classProbs=TRUE)
# train a list of models
methodList <- c('knn','rpart')
models <- caretList(choice ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, trControl=train_control, methodList=methodList)
# create ensemble of trained models
ensemble <- caretEnsemble(models)
# summarize ensemble
summary(ensemble)

## ----ch08_stacking_01,warning=FALSE--------------------------------------
# Example of Stacking algorithms
library(kernlab);
# create submodels
control <- trainControl(method="repeatedcv", number=10, repeats=3, savePredictions=TRUE, classProbs=TRUE)
algorithmList <- c('lda', 'rpart', 'glm', 'knn', 'svmRadial')
set.seed(917)
models <- caretList(choice ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, trControl=control, methodList=algorithmList)
results <- resamples(models)
summary(results)
dotplot(results)


## ----ch08_stacking_02,warning=FALSE--------------------------------------
# correlation between results
modelCor(results)
splom(results)

## ----ch08_stacking_03,warning=FALSE--------------------------------------
# stack using glm
stackControl <- trainControl(method="repeatedcv", number=10, repeats=3, savePredictions=TRUE, classProbs=TRUE)
set.seed(917)
stack.glm <- caretStack(models, method="glm", metric="Accuracy", trControl=stackControl)
print(stack.glm)


## ----ch08_stacking_04,warning=FALSE--------------------------------------

# stack using random forest
set.seed(917)
stack.rf <- caretStack(models, method="rf", metric="Accuracy",
                       trControl=stackControl)
print(stack.rf)
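
# --- Added sketch (not part of the original Ch08 script) ---------------------
# A hedged follow-up: once both stacks are trained, one quick check is how often
# their class predictions agree. predict() on a caretStack object returns the
# meta-model's predictions; the exact return type can vary across caretEnsemble
# versions, so treat this as illustrative rather than canonical.
stack_pred_glm <- predict(stack.glm, newdata = dataset)
stack_pred_rf  <- predict(stack.rf,  newdata = dataset)
table(glm_stack = stack_pred_glm, rf_stack = stack_pred_rf)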
## ----ch08_bayesian_optimization_01,warning=FALSE-------------------------

library(caret)
library(randomForest)
library(class)
library(ipred)
library(GPfit)
# Load Dataset
House_price <- read.csv("Dataset/House Sale Price Dataset.csv",header=TRUE)

dataset <- na.omit(House_price)

# Set seed for reproducibility (the full housing dataset is used here)
set.seed(917);


rand_ctrl <- trainControl(method = "repeatedcv", repeats = 5, search = "random")

rand_search <- train(HousePrice ~ StoreArea + BasementArea + SellingYear + SaleType + ConstructionYear + Rating, data = dataset, method = "svmRadial",
                     ## Create 20 random parameter values
                     tuneLength = 20,
                     metric = "RMSE",
                     preProc = c("center", "scale"),
                     trControl = rand_ctrl)
rand_search

ggplot(rand_search) + scale_x_log10() + scale_y_log10()

getTrainPerf(rand_search)

## ----ch08_bayesian_optimization_02,warning=FALSE-------------------------
# Define the resampling method
ctrl <- trainControl(method = "repeatedcv", repeats = 5)

## Use this function to optimize the model. The two parameters are
## evaluated on the log scale given their range and scope.
svm_fit_bayes <- function(logC, logSigma) {
  ## Use the same model code but for a single (C, sigma) pair.
  txt <- capture.output(
    mod <- train(HousePrice ~ StoreArea + BasementArea + SellingYear + SaleType + ConstructionYear + Rating, data = dataset,
                 method = "svmRadial",
                 preProc = c("center", "scale"),
                 metric = "RMSE",
                 trControl = ctrl,
                 tuneGrid = data.frame(C = exp(logC), sigma = exp(logSigma)))
  )
  ## The function wants to _maximize_ the outcome, so we return
  ## the negative of the resampled RMSE value. `Pred` can be used
  ## to return predicted values, but we'll avoid that and use zero.
  list(Score = -getTrainPerf(mod)[, "TrainRMSE"], Pred = 0)
}
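
# --- Added illustration (not part of the original Ch08 script) ---------------
# The optimiser below works on (logC, logSigma), so a candidate point such as
# logC = 2, logSigma = -4 corresponds to the caret grid values
# C = exp(2) ~= 7.39 and sigma = exp(-4) ~= 0.018.
exp(c(logC = 2, logSigma = -4))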
## Define the bounds of the search.
lower_bounds <- c(logC = -5, logSigma = -9)
upper_bounds <- c(logC = 20, logSigma = -0.75)
bounds <- list(logC = c(lower_bounds[1], upper_bounds[1]),
               logSigma = c(lower_bounds[2], upper_bounds[2]))

## Create a grid of values as the input into the BO code
initial_grid <- rand_search$results[, c("C", "sigma", "RMSE")]
initial_grid$C <- log(initial_grid$C)
initial_grid$sigma <- log(initial_grid$sigma)
initial_grid$RMSE <- -initial_grid$RMSE
names(initial_grid) <- c("logC", "logSigma", "Value")

## Run the optimization with the initial grid and 30 additional iterations

library(rBayesianOptimization)

set.seed(917)
ba_search <- BayesianOptimization(svm_fit_bayes,
                                  bounds = bounds,
                                  init_grid_dt = initial_grid,
                                  init_points = 0,
                                  n_iter = 30,
                                  acq = "ucb",
                                  kappa = 1,
                                  eps = 0.0,
                                  verbose = TRUE)
ba_search

## ----ch08_bayesian_optimization_03,warning=FALSE-------------------------
final_search <- train(HousePrice ~ StoreArea + BasementArea + SellingYear + SaleType + ConstructionYear + Rating, data = dataset,
                      method = "svmRadial",
                      tuneGrid = data.frame(C = exp(ba_search$Best_Par["logC"]),
                                            sigma = exp(ba_search$Best_Par["logSigma"])),
                      metric = "RMSE",
                      preProc = c("center", "scale"),
                      trControl = ctrl)

final_search
compare_models(final_search, rand_search)

-------------------------------------------------------------------------------- /Code/Ch09_Ramasubramanian.R: --------------------------------------------------------------------------------

## ---- warning=FALSE, message=FALSE---------------------------------------

library(doParallel)

# Find out how many cores are available (if you don't already know)
c = detectCores()
c

# Find out how many cores are currently being used
getDoParWorkers()

# Create cluster with c-2 cores
cl <- makeCluster(c-2)

# Register cluster
registerDoParallel(cl)

# Find out how many cores are being used
getDoParWorkers()

## ---- warning=FALSE, message=FALSE---------------------------------------


## Problem : Identifying Risky Bank Loans

credit <- read.csv("credit.csv")
str(credit)

# create a random sample for training and test data
# use set.seed to use the same random number sequence as the tutorial
set.seed(123)
train_sample <- sample(1000, 900)

str(train_sample)

# split the data frames
credit_train <- credit[train_sample, ]
credit_test <- credit[-train_sample, ]

## ---- warning=FALSE, message=FALSE---------------------------------------

## Training a model on the data

library(randomForest)

# Sequential Execution
system.time(rf_credit_model <- randomForest(credit_train[-17],
                                            credit_train$default,
                                            ntree = 1000))

## ---- warning=FALSE, message=FALSE---------------------------------------

# Parallel Execution
system.time(
  rf_credit_model_parallel <- foreach(nt = rep(250, 4),
                                      .combine = combine,
                                      .packages = 'randomForest') %dopar%
    randomForest(credit_train[-17],
                 credit_train$default,
                 ntree = nt))
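
# --- Added check (not part of the original Ch09 script) ----------------------
# The four 250-tree forests above are merged by randomForest::combine() via
# .combine; printing the result is a quick way to confirm the combined ensemble
# before comparing it with the sequential 1000-tree fit.
print(rf_credit_model_parallel)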
## ---- warning=FALSE, message=FALSE---------------------------------------

# Shutting down cluster - when you're done, be sure to close
# the parallel backend using
stopCluster(cl)


## ---- eval=FALSE---------------------------------------------------------
##
Sys.setenv(HADOOP_CMD="/usr/lib/hadoop-2.2.0/bin/hadoop")
Sys.setenv(HADOOP_STREAMING="/usr/lib/hadoop-2.2.0/share/hadoop/tools/lib/hadoop-streaming-2.2.0.jar")

## ---- eval=FALSE---------------------------------------------------------
##
library(rmr2)
library(rhdfs)
##
## # Hadoop File Operations
##
## #initialize File HDFS
hdfs.init()

## ---- eval=FALSE---------------------------------------------------------
##
## #Put File into HDFS
hdfs.put("/home/sample.txt","/hadoop_practice")
## ## [1] TRUE
##

## ---- eval=FALSE---------------------------------------------------------
##
## # Reads a bunch of lines at a time
## #Map Phase
map <- function(k, lines) {
  words.list <- strsplit(lines, '\\s+')
  words <- unlist(words.list)
  return( keyval(words, 1) )
}
##
## #Reduce Phase
reduce <- function(word, counts) {
  keyval(word, sum(counts))
}
##
## #MapReduce Function
wordcount <- function(input, output=NULL) {
  mapreduce(input=input, output=output, input.format="text", map=map, reduce=reduce)
}

## ---- eval=FALSE---------------------------------------------------------
##
##
## ## read text files from folder input on HDFS
## ## save result in folder output on HDFS
## ## Submit job
##
basedir <- '/hadoop_practice'
infile <- file.path(basedir, 'sample.txt')
outfile <- file.path(basedir, 'output')
ret <- wordcount(infile, outfile)

## ---- eval=FALSE---------------------------------------------------------
##
##
## ## Fetch results from HDFS
result <- from.dfs(outfile)
results.df <- as.data.frame(result, stringsAsFactors=F)
colnames(results.df) <- c('word', 'count')
tail(results.df, 100)
##
## ##          word count
## ## 1           R     1
## ## 2          Hi     1
## ## 3          to     1
## ## 4         All     1
## ## 5        with     1
## ## 6       class     1
## ## 7      hadoop     3
## ## 8     Welcome     1
## ## 9 integrating     1
##
##
head(results.df[order(results.df$count, decreasing = TRUE),])
##
## ##          word count
## ## 7      hadoop     3
## ## 1           R     1
## ## 2          Hi     1
## ## 3          to     1
## ## 4         All     1
## ## 5        with     1
##

## ---- warning=FALSE,message=FALSE----------------------------------------

# Set environment variables
Sys.setenv(SPARK_HOME='C:/Spark/spark-2.0.0-bin-hadoop2.7', HADOOP_HOME='C:/Hadoop-2.3.0')
.libPaths(c(file.path(Sys.getenv('SPARK_HOME'), 'R', 'lib'), .libPaths()))
Sys.setenv('SPARKR_SUBMIT_ARGS'='"sparkr-shell"')

## ---- warning=FALSE,message=FALSE----------------------------------------


library(SparkR)
library(rJava)

# The entry point into SparkR is the SparkSession, which connects your R program to a Spark cluster
sparkR.session(enableHiveSupport = FALSE, appName = "SparkR-ML", master = "local[*]", sparkConfig = list(spark.driver.memory = "1g", spark.sql.warehouse.dir="C:/Hadoop-2.3.0"))


## ---- warning=FALSE,message=FALSE----------------------------------------
library(data.table)

# Read the housing data
Data_House_Price <-
fread("/Users/karthik/Dropbox/Book Writing - Drafts/Chapter Drafts/Chapter 7 - Machine Learning Model Evaluation/tosend/House Sale Price Dataset.csv",header=T, verbose = FALSE, showProgress = FALSE) 178 | 179 | str(Data_House_Price) 180 | 181 | 182 | #Pulling out relevant columns and assigning required fields in the dataset 183 | Data_House_Price <- Data_House_Price[,.(HOUSE_ID,HousePrice,StoreArea,StreetHouseFront,BasementArea,LawnArea,Rating,SaleType)] 184 | 185 | #Omit any missing value 186 | Data_House_Price <- na.omit(Data_House_Price) 187 | 188 | Data_House_Price$HOUSE_ID <- as.character(Data_House_Price$HOUSE_ID) 189 | 190 | 191 | ## ---- warning=FALSE,message=FALSE---------------------------------------- 192 | #Spark Data Frame - Train 193 | gaussianDF_train <- createDataFrame(Data_House_Price[1:floor(nrow(Data_House_Price)*(2/3)),]) 194 | 195 | #Spark Data Frame - Test 196 | gaussianDF_test <- createDataFrame(Data_House_Price[floor(nrow(Data_House_Price)*(2/3) + 1):nrow(Data_House_Price),]) 197 | 198 | class(gaussianDF_train) 199 | 200 | class(gaussianDF_test) 201 | 202 | 203 | ## ---- warning=FALSE,message=FALSE---------------------------------------- 204 | 205 | # Fit a generalized linear model of family "gaussian" with spark.glm 206 | gaussianGLM <- spark.glm(gaussianDF_train, HousePrice ~ StoreArea + StreetHouseFront + BasementArea + LawnArea + Rating + SaleType, family = "gaussian") 207 | 208 | # Model summary 209 | summary(gaussianGLM) 210 | 211 | 212 | ## ---- warning=FALSE,message=FALSE---------------------------------------- 213 | 214 | #Prediction on the gaussianModel 215 | gaussianPredictions <- predict(gaussianGLM, gaussianDF_test) 216 | names(gaussianPredictions) <- c('HOUSE_ID','HousePrice','StoreArea','StreetHouseFront','BasementArea','LawnArea','Rating','SaleType','ActualPrice','PredictedPrice') 217 | gaussianPredictions$PredictedPrice <- round(gaussianPredictions$PredictedPrice,2.0) 218 | showDF(gaussianPredictions[,9:10]) 219 | 220 | ## ---- warning=FALSE,message=FALSE---------------------------------------- 221 | 222 | sparkR.stop() 223 | 224 | 225 | ## ----ch09_h2o_1,eval=FALSE----------------------------------------------- 226 | ## # The following two commands remove any previously installed H2O packages for R. 227 | ## if ("package:h2o" %in% search()) { detach("package:h2o", unload=TRUE) } 228 | ## if ("h2o" %in% rownames(installed.packages())) { remove.packages("h2o") } 229 | ## 230 | ## # Next, we download, install and initialize the H2O package for R. 
## install.packages("h2o", repos=(c("http://s3.amazonaws.com/h2o-release/h2o/rel-kahan/5/R", getOption("repos"))))
##
## # Alternatively you can install the package h2o from CRAN as below
## install.packages("h2o")

## ----ch09_h2o_2----------------------------------------------------------
# Load the h2o library in R
library(h2o);

# Initiate a cluster on your machine
localH2O = h2o.init()

## ----ch09_h2o_3----------------------------------------------------------
# Run the Deep Learning demo
demo(h2o.deeplearning)

prostate.hex

# Other demos can be called as below
#demo(h2o.glm)
#demo(h2o.gbm)

-------------------------------------------------------------------------------- /Code/actual_pred_plot.R: --------------------------------------------------------------------------------

library(data.table) # for everything
library(ggplot2)    # for plotting
library(reshape2)   # for melting


actual_pred_plot <- function(var.by=var.by,
                             var.response=var.response,
                             data=data,
                             var.predict.current=NULL,
                             var.predict.reference=NULL,
                             var.split=NULL,
                             var.weight=NULL,
                             var.by.buckets=NULL,
                             remove.na=FALSE,
                             sort.factor=FALSE,
                             errorbars=FALSE,
                             subset.to=NA,
                             count.line=NULL,
                             barline.ratio=1,
                             title="",
                             make.plot=TRUE
                             ){
  #############################################################################
  # FUNCTION: actual_pred_plot
  #
  # Procedure for making univariate plots. This will produce one graph from a
  # data frame/data table, containing overlaid exposure histograms, actual
  # line(s) and optionally one or many predicted lines. Additionally, two-way
  # interactions are possible and well-handled. Returns a ggplot object.
  #
  # ARGUMENTS:
  # var.by: The name of the variable whose levels to group by in the plot.
  #   Determines the bins for histogram and line plot. (string) [required]
  # var.response: The name of the var containing the actual response. (string)
  #   [required]
  # data: The data.frame/.table object to generate the plots from. [required]
  # var.predict.current: The name of the var containing the first predicted
  #   response. (string) [optional]
  # var.predict.reference: The name of the var containing the second predicted
  #   response. If you have more than two predicted cols to plot, you can
  #   pass a vector of predicted colnames here (or to predict.current) and
  #   actual_pred_plot will sort it out (string) [optional]
  # var.split: The name of the var to interact with var.by. (string) [optional]
  # var.weight: The name of the var to be used to weight each observation.
  #   If NULL (default), all obs are given equal weight. (string) [optional]
  # var.by.buckets: How many buckets to group factor vars into (num) [optional]
  # remove.na: If response/predictions have NAs, should we remove them?
  # sort.factor: Do you want to sort x-axis so predicted line always increases?
  # errorbars: Boolean, should the actual line include error bars of +/- 2 SE?
  #   Defaults to FALSE. [optional]
  # subset.to: Vector of length 2 specifying where to cap a numeric var.by,
  #   e.g. c(min.val, max.val). To only specify one boundary use NA; e.g.
  #   c(NA, max.val) (numeric) [optional]
  # barline.ratio: What should the ratio of maximum bar height to maximum line
  #   value be? (num) [optional]
  # title: Do you want a title? (string) [optional]
  # make.plot: Boolean, if FALSE the grid used to make the plot will be
  #   returned instead of the plot itself. [optional]
  #
  #############################################################################

  # pull out only the cols you need (including the weight column, if any) and
  # convert to DT if necessary
  needed.cols <- c(var.by, var.split, var.response,
                   var.predict.current, var.predict.reference, var.weight)

  if("data.table" %in% class(data)){
    dat.frm <- data[, (needed.cols), with=FALSE]
  } else {
    dat.frm <- data.table(data[, needed.cols])
  }

  # cap var.by at specified limits
  if(!is.na(subset.to[1])){
    dat.frm[[var.by]] <- pmax(dat.frm[[var.by]], subset.to[1])
  }
  if(!is.na(subset.to[2])){
    dat.frm[[var.by]] <- pmin(dat.frm[[var.by]], subset.to[2])
  }

  # upgroup poorly exposed buckets if specified
  if(!is.null(var.by.buckets)){
    if(nlevels(dat.frm[[var.by]]) > var.by.buckets)
      dat.frm[[var.by]] <- keep_n_top(dat.frm[[var.by]], var.by.buckets)
  }

  # make sure there are fewer than 100 levels for numerics
  if(length(unique(dat.frm[[var.by]])) > 100){
    dat.frm[[var.by]] <- round_down_to_100_levels(dat.frm[[var.by]])
  }

  # set key (if var.split = NULL this does same as setkeyv(dat.frm, var.by))
  setkeyv(dat.frm, c(var.by, var.split))

  # get means using .SD and .SDcols, which keeps the original column names
  meanable.cols <- c(var.response, var.predict.current, var.predict.reference)

  # if response/predicted cols contain NAs throw an error by default, or...
  if(remove.na){
    meanme <- function(vector.to.mean){
      mean(vector.to.mean, na.rm=TRUE)
    }
  } else {
    meanme <- function(vector.to.mean){
      mean(vector.to.mean)
    }
  }

  meantable <- dat.frm[, lapply(.SD, function(x) meanme(x)),
                       by = key(dat.frm),
                       .SDcols = (meanable.cols)]

  if(!is.null(var.weight)){
    counttable <- dat.frm[, list(count=sum(get(var.weight))),
                          by=key(dat.frm)]
  } else {
    counttable <- dat.frm[, list(count=.N), by=key(dat.frm)]
  }

  # don't necessarily need to merge, but makes the ggplot call simpler
  plotme <- meantable[counttable]

  # scale the count, make it fit on response axis
  rescale_factor <- max(plotme[["count"]]) / max(plotme[[var.response]])
  plotme$count_scaled <- plotme$count / (barline.ratio * rescale_factor)



  # which variables to use as melt id?
  melt.id <- c(var.by, var.split, "count", "count_scaled")

  # add in error bars if requested
  if(errorbars){

    sterrtable <- dat.frm[, list(sterr = sd(get(var.response))/sqrt(.N)),
                          by=key(dat.frm)]

    plotme <- plotme[sterrtable]

    plotme$ymax <- plotme[[var.response]] + 2 * plotme$sterr
    plotme$ymin <- plotme[[var.response]] - 2 * plotme$sterr
    plotme$sterr <- NULL

    melt.id <- c(melt.id, "ymax", "ymin")
    gg.errors <- geom_errorbar(aes_string(x=var.by,
                                          ymax= "ymax",
                                          ymin= "ymin",
                                          color="variable"),
                               stat="identity",
                               width=0.25,
                               alpha=0.75)

  }else{ gg.errors <- NULL}

  # melt into a form that ggplot likes
  plotme <- melt(plotme, id=melt.id)
  plotme$grouping <- plotme$variable

  # get rid of errors on the non-response rows (to make error bar colours work)
  if(errorbars) plotme[!variable == var.response, ':=' (ymax = NA, ymin = NA)]


  if(!is.null(var.split)) {

    plotme[[var.split]] <- as.factor(plotme[[var.split]])
    plotme$grouping <- paste0(plotme[[var.split]], " - ", plotme$variable)
    var.split2 <- var.split
    delete_linetype_legend <- NULL

  }else{

    var.split2 <- NULL
    var.split <- "variable"
    delete_linetype_legend <- scale_linetype(guide='none')
  }

  if(is.null(var.predict.current) & is.null(var.predict.reference)) {
    delete_linetype_legend <- scale_linetype(guide='none')
  }

  # do you want to sort plot so predicted line is always increasing?
  if(sort.factor){
    levels <- plotme[variable == var.predict.current][order(value)][[var.by]]
    plotme[[var.by]] <- factor(plotme[[var.by]], levels=levels)
  } else {
    plotme[[var.by]] <- as.factor(plotme[[var.by]])
  }

  # add a dashed line to give some sort of reference for height of count bars
  if(!is.null(count.line)){
    countline_rescaled <- count.line/(rescale_factor*barline.ratio)
    gg.countline <- geom_hline(yintercept=countline_rescaled,
                               colour="grey",
                               linetype="longdash",
                               size=0.1,
                               alpha=0.05)
    gg.annotate <- annotate("text",
                            x=plotme[[var.by]][1],
                            y=countline_rescaled,
                            label = paste0("N = ", count.line),
                            hjust=0,
                            vjust=1)

  } else {
    gg.countline <- NULL
    gg.annotate <- NULL
  }

  # do you want the plot or just the grid used to make it?
  if(!make.plot){
    return(plotme)
  }

  plt <- ggplot(plotme) +
    geom_line(aes_string(x=var.by,
                         y="value",
                         group="grouping",
                         color=var.split,
                         linetype="variable"),
              stat="identity") +
    gg.errors +
    geom_point(aes_string(x=var.by,
                          y="value",
                          group="grouping",
                          color=var.split,
                          shape=var.split),
               stat="identity",
               size=2) +
    geom_bar(aes_string(x=var.by,
                        y="count_scaled",
                        group="grouping",
                        fill=var.split2),
             stat="identity",
             alpha=0.25,
             position="dodge") +
    geom_hline(yintercept=count.line/(rescale_factor*barline.ratio),
               colour="black",
               linetype="longdash",
               size=0.25) +
    gg.countline + gg.annotate +
    theme(legend.position=c(1,0.25), legend.justification=c(1,1)) +
    ylab(var.response) + delete_linetype_legend +
    ggtitle(title)

  return(plt)

}


round_down_to_100_levels <- function(vectortoround){
  #############################################################################
  # FUNCTION: round_down_to_100_levels
  #
  # actual_pred_plot requires a function for binning continuous numerical variables
  # into discrete buckets. Because of the way data.table (specifically setkey)
  # works, we need to upgroup numeric variables sensibly before summarizing.
  # Here we simply try rounding less and less finely until the result has fewer
  # than 100 groups. Maintains the distribution/shape of var.by (unlike decile
  # binning). Pulled this out of actual_pred_plot into its own function for easier
  # maintenance. In the future I will modularize more of actual_pred_plot. Plan to
  # add a check to determine numerical vs. categorical vars and handle them
  # appropriately.
  #
  #############################################################################

  # try rounding to these numbers in sequence
  toTheNearest <- c(0.01, 0.02, 0.05, 0.1, 1, 2,
                    5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000)
  # initialise i to step through above attempts
  i <- 1

  # only round if you need to
  if(length(unique(vectortoround)) > 100){

    # initialise new rounded version of variable
    rounded <- vectortoround

    # keep trying til one sticks, if a var won't round to 2000 you're done for
    while(length(unique(rounded)) > 100){
      rounded <- round(vectortoround / toTheNearest[i]) * toTheNearest[i]
      i <- i + 1
    }
    print(paste0("Rounded to nearest ", as.character(toTheNearest[i-1])))
    return(rounded)
  }
  return(vectortoround)
}
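
# --- Added usage sketch (not part of the original file) -----------------------
# Illustrates the rounding helper on simulated data: a numeric vector with
# thousands of distinct values is coarsened until it has at most 100 levels.
# The exact rounding step chosen depends on the data.
demo_x <- rnorm(5000, mean = 50, sd = 10)
length(unique(demo_x))                            # typically ~5000 distinct values
length(unique(round_down_to_100_levels(demo_x)))  # should now be <= 100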
keep_n_top <- function(factor, n){
  ############################################################################
  # FUNCTION: keep_n_top
  #
  # Treat a factor so that it retains only the n-1 most exposed levels.
  #
  # ARGUMENTS:
  # factor: factor variable to modify
  # n: number of levels to retain
  # RETURNS:
  # Factor as input with the n - 1 most exposed levels and an 'Other' level
  #
  ############################################################################

  top <- table(factor)
  top <- top[order(top, decreasing=TRUE)][1:(n - 1)]
  factor <- as.character(factor)
  factor[!factor %in% names(top)] <- "Other"
  as.factor(factor)

}

-------------------------------------------------------------------------------- /Code/concordance.R: --------------------------------------------------------------------------------

###########################################################
# Function concordance : for concordance, discordance, ties
# The function returns Concordance, discordance, and ties
# by taking a glm binomial model result as input.
# It uses optimisation through subsetting
###########################################################

concordance <- function(model)
{
  # Get all actual observations and their fitted values into a frame
  fitted <- data.frame(cbind(model$y, model$fitted.values))
  colnames(fitted) <- c('respvar','score')
  # Subset only ones
  ones <- fitted[fitted[,1]==1,]
  # Subset only zeros
  zeros <- fitted[fitted[,1]==0,]

  # Initialise all the values
  pairs_tested <- as.double(nrow(ones))*as.double(nrow(zeros))
  conc <- 0
  disc <- 0

  # Get the values in a for-loop
  for(i in 1:nrow(ones))
  {
    conc <- conc + sum(ones[i,"score"] > zeros[,"score"])
    disc <- disc + sum(ones[i,"score"] < zeros[,"score"])
  }

  # Ties are the pairs that are neither concordant nor discordant.
  # (The source file is cut off above; the lines below reconstruct the ending
  # implied by the header comment, reporting proportions of pairs tested.)
  ties <- pairs_tested - conc - disc

  return(list(Concordance = conc/pairs_tested,
              Discordance = disc/pairs_tested,
              Tied        = ties/pairs_tested,
              Pairs       = pairs_tested))
}
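
# --- Added usage sketch (not part of the original file) -----------------------
# concordance() expects a fitted binomial glm (it reads model$y and
# model$fitted.values). A minimal simulated example:
set.seed(101)
sim <- data.frame(x = rnorm(500))
sim$y <- rbinom(500, 1, plogis(0.8 * sim$x))
logit_fit <- glm(y ~ x, data = sim, family = binomial)
concordance(logit_fit)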