├── .gitattributes
├── 1-select-file.jpg
├── 2-download.jpg
├── 9781484223338.jpg
├── Code
│   ├── Ch02_Ramasubramanian.R
│   ├── Ch03_Singh.R
│   ├── Ch04_Ramasubramanian.R
│   ├── Ch05_Singh.R
│   ├── Ch06_Clustering.R
│   ├── Ch06_DecisionTree.R
│   ├── Ch06_Naive Bayes.R
│   ├── Ch06_NeuralNet.R
│   ├── Ch06_OnlineML.R
│   ├── Ch06_Regression.R
│   ├── Ch06_RuleMining.R
│   ├── Ch06_SVM.R
│   ├── Ch06_TextMining.R
│   ├── Ch07_Singh.R
│   ├── Ch08_Singh.R
│   ├── Ch09_Ramasubramanian.R
│   ├── actual_pred_plot.R
│   └── concordance.R
├── Color Artwork
├── Chapter 1
│ └── Image
│ │ ├── Image 1.1 Architecture of IBM's DeepQA.png
│ │ ├── Image 1.2 Venn diagram definition of Data Science.png
│ │ ├── Image 1.3 Sample space of three-coin tossing experiment.png
│ │ ├── Image 1.5 Probabiity distribution with discrete and continuous Random Variable.png
│ │ ├── Image 1.6 Confidence Interval.png
│ │ ├── Image 1.7 z score and confidence level.png
│ │ ├── Image 1.8 Machine Leaning Process Flow.png
│ │ ├── Table 1.1 Permutation and Combinatio.png
│ │ ├── Table 1.2 Facebook Nearby example - Two-way contingency table.png
│ │ └── Table 1.3 Data structures in R.png
├── Chapter 2
│ └── Images
│ │ ├── Image 2.1 Number of atheletes in each type.png
│ │ ├── Image 2.2 Boxplot showing variation of finish times for each type of runner.png
│ │ ├── Image 2.3 Distribution showing Symmetrical Vs Negative and Positive skewness.png
│ │ ├── Image 2.4 Distribution of finish time of atheletes in marathon data.png
│ │ ├── Image 2.5 Showing kurtosis plots with simulated data.png
│ │ ├── Image 2.6 Showing kurtosis plot of finish time in marathon data.png
│ │ └── Image 2.7 Number of domestic transactions by Male and Female.png
├── Chapter 3
│ └── Images
│ │ ├── Image 3.1 Objectives of sampling.jpg
│ │ ├── Image 3.10 Popultion vs. Sample(WithReplacement) Distribution.png
│ │ ├── Image 3.11 Homogenuous Population and Systematic Sample Distribution.png
│ │ ├── Image 3.12 Population and Stratified Sample Distribution.png
│ │ ├── Image 3.13 Input Data segmented by set of 5 classes by Number of International Transactions.png
│ │ ├── Image 3.14 Clusters formed by K-means ( star sign present centeroid of cluster).png
│ │ ├── Image 3.15 Cluster Population and Cluster Random Sample Distribution.png
│ │ ├── Image 3.16 Histogram and QQ Plot of estimated coefficient.png
│ │ ├── Image 3.17 Histogram of Parameter Estimate from Bootstrap.png
│ │ ├── Image 3.18 Histrogram with normal density function.png
│ │ ├── Image 3.19 Beta Distrbution Plot.png
│ │ ├── Image 3.2 Probability vs Non-Probability Sampling.png
│ │ ├── Image 3.20 Sampling by rejection.png
│ │ ├── Image 3.3 Simulation of Coin toss experiment.png
│ │ ├── Image 3.4 Sampling Distribution Plots .png
│ │ ├── Image 3.5 Distribution of sample means with Normal Density Lines.png
│ │ ├── Image 3.6 Histogram of Outstanding Balance.png
│ │ ├── Image 3.7 Histogram of Number of International Transactions.png
│ │ ├── Image 3.8 Histogram of Number of domestic transactions.png
│ │ └── Image 3.9 Popultion vs. Sample(Without Replacement) Distribution.png
├── Chapter 4
│ └── Images
│ │ ├── Image 4.1 Line chart showing top 10 Countries based on their GDP.png
│ │ ├── Image 4.10 Boxplot showing the GDP (in Trillion US $) for top 10 countries.png
│ │ ├── Image 4.11 Boxplot showing the Population (in Billion) for top 10 countries.png
│ │ ├── Image 4.12 Histogram showing GDP and Population for developed country.png
│ │ ├── Image 4.13 Density Plot showing GDP and Population for developed country.png
│ │ ├── Image 4.14 Histogram showing GDP and Population for developing country.png
│ │ ├── Image 4.15 Density Plot showing GDP and Population for developing country.png
│ │ ├── Image 4.16 Pie chart showing percentage share of each sector by consumption segment in India.png
│ │ ├── Image 4.17 Pie chart showing percentage share of each sector by consumption segment in China.png
│ │ ├── Image 4.18 Plot showing correlation between various world development indicators.png
│ │ ├── Image 4.19 Heatmap between region and there various world development indicators.png
│ │ ├── Image 4.2 Line chart showing top 10 countries based on % contribution to GDP from Agriculture.png
│ │ ├── Image 4.20 Bubble chart showing GDP Per Captita Vs Life Expectency.png
│ │ ├── Image 4.21 Bubble chart showing GDP Per Captita Vs Life Expectency for four Countries.png
│ │ ├── Image 4.22 Bubble chart showing Fertility rate Vs Life Expectency.png
│ │ ├── Image 4.23 Waterfall plot of Footfall at border.png
│ │ ├── Image 4.24 Waterfall chart with Net effect.png
│ │ ├── Image 4.25 Footfall end count as percentage of total end count.png
│ │ ├── Image 4.26 Dendogram with distanceheight upto h=1.png
│ │ ├── Image 4.27 Clusters by actual classification of species in Iris data.png
│ │ ├── Image 4.28 Word cloud on job descriptions.png
│ │ ├── Image 4.29 The Sankey chart for House Sale Data.png
│ │ ├── Image 4.3 Line chart showing top 10 countries based on % contribution to GDP from service sector.png
│ │ ├── Image 4.30 GDP Growth for eight countries.png
│ │ ├── Image 4.31 GDP growth during recession.png
│ │ ├── Image 4.32 The cohort plot for Credit Card Active by Year of Issue.png
│ │ ├── Image 4.33 A example map pulled using ggplot() – New Delhi India.png
│ │ ├── Image 4.34 India map with Robbery Counts in 2010.png
│ │ ├── Image 4.4 Line chart showing top 10 countries based on % contribution to GDP from industry.png
│ │ ├── Image 4.5 Stacked column chart showing contribution of various sector to the World GDP.png
│ │ ├── Image 4.6 Line chart showing top 10 Countries based on their Working Age Ratio.png
│ │ ├── Image 4.7 Stacked bar chart showing constituent of different age group as a % of total population.png
│ │ ├── Image 4.8 Line chart showing top 10 countries and their annual % population growth.png
│ │ └── Image 4.9 Scatterplot showing the relationship Population and GDP for top 10 countries.png
├── Chapter 5
│ └── Images
│ │ ├── Image 5.1 Distribution of Loss (including No Default).png
│ │ ├── Image 5.2 Distribution of Loss (excluding No Default).png
│ │ ├── Image 5.3 Coefficient and Fraction of Deviance Explained by each featurevariable.png
│ │ ├── Image 5.4 Missclasification error and log of penalization factor(Lambda).png
│ │ ├── Image 5.5 Variance explained by Principles Components.png
│ │ └── Image 5.6 Orgthoginality of Principle Component 1 and 2.png
├── Chapter 6
│ ├── Image 6.1 Machine Learning Types.png
│ ├── Image 6.10 Deep Learning Algorithms.png
│ ├── Image 6.11 Dimensionality Reduction Algorithms.png
│ ├── Image 6.12 Ensemble Learning.png
│ ├── Image 6.13 Text Mining Algorithms.png
│ ├── Image 6.14 Scatter Plot between HousePrice and StoreArea.png
│ ├── Image 6.15 Scatter Plot of Actual vs. Predicted.png
│ ├── Image 6.16 Actual vs Predicted Plot.png
│ ├── Image 6.17 Cooks distance for each observation.png
│ ├── Image 6.18 Influence Plot.png
│ ├── Image 6.19 Distribution of Studentized Residuals.png
│ ├── Image 6.2 Regression Algorithms.png
│ ├── Image 6.20 Durbin Watson Statistics Bounds.png
│ ├── Image 6.21 Auto Correlation Funtion (ACF) plot .png
│ ├── Image 6.22 Residuals vs Fitted Plot.png
│ ├── Image 6.23 Actual vs Predicted Plot Linear Model.png
│ ├── Image 6.24 Actual vs Predicted Plot Quadratic Polynomial Model.png
│ ├── Image 6.25 Logit Function.png
│ ├── Image 6.26 Purchase Rate and income Classs.png
│ ├── Image 6.27 ROC Curve for Train Data.png
│ ├── Image 6.28 Actual vs Predicted Plot against MembershipPoints.png
│ ├── Image 6.29 Actual vs Predicted Plot against IncomeClass.png
│ ├── Image 6.3 Distance Based Algorithms.png
│ ├── Image 6.30 Actual vs Predicted Plot against CustomerPropensity.png
│ ├── Image 6.31 Gains Charts with AUC.png
│ ├── Image 6.32 Lift Chart.png
│ ├── Image 6.33 Classification using Support Vector Machine.png
│ ├── Image 6.34 Decision Tree with two attributes and class.png
│ ├── Image 6.35 Recursive Binary split and its corresponding tree for 2-dimensional feature space.png
│ ├── Image 6.36 Gini-Index Function.png
│ ├── Image 6.37 Entropy Function.png
│ ├── Image 6.38 C5.0 Decision Tree on Purchase Prediction Data set.png
│ ├── Image 6.39 CART Model.png
│ ├── Image 6.4 Regularization Algorithms.png
│ ├── Image 6.40 CHAID Decision Tree.png
│ ├── Image 6.41 Scatterplot between StoreArea and LawnArea for each HouseNetWorth group.png
│ ├── Image 6.42 Cluster Dendogram.jpg
│ ├── Image 6.43 Cluster Plot with LawnArea and Store Area.png
│ ├── Image 6.44 Elbow Curve for varying values of k ( number of cluster) on x-axis..png
│ ├── Image 6.45 Cluster plot using k-means.png
│ ├── Image 6.46 Clustering plot based on EM Algorithm.png
│ ├── Image 6.47 Cluster Plot for EM Algorithm.png
│ ├── Image 6.48 Plot for Conver Cluster Hulls for EM Algorithm.png
│ ├── Image 6.49 Silhouette Plot.jpg
│ ├── Image 6.5 Decision Tree Algorithms.png
│ ├── Image 6.50 Item frequency plot in Market Basket Data.png
│ ├── Image 6.51 Sparcity Visualization in Transactions of Market basket Data.png
│ ├── Image 6.52 Illustration of UBCF.png
│ ├── Image 6.53 Dsitribution of Ratings.png
│ ├── Image 6.54. Raw Ratings by Users.png
│ ├── Image 6.55 True Positive Ration vs False Positive Ratio.png
│ ├── Image 6.56 Neuron Anatomy.png
│ ├── Image 6.57 Working of a Perceptron (Mathematically).png
│ ├── Image 6.58 NAND gate operator.gif
│ ├── Image 6.59 NAND gate perceptron.png
│ ├── Image 6.6 Bayesian Algorithms.png
│ ├── Image 6.60 Sigmoid Function.png
│ ├── Image 6.61 Linear Seperability.png
│ ├── Image 6.62 Artificial Network Architecture.png
│ ├── Image 6.63 Supervised vs Unsupervised Learning.png
│ ├── Image 6.64 Working of Backpropogation method.png
│ ├── Image 6.65 One hidden Layer Neural Network.png
│ ├── Image 6.66 Attribute importance by garsen method.jpg
│ ├── Image 6.66 Attribute importance by olden method.jpg
│ ├── Image 6.67 Attribute importance by garson method.png
│ ├── Image 6.68 A Multi Layer Deep Neural Network.png
│ ├── Image 6.69 A sample Volcanao picture for Image Recognition exercise.png
│ ├── Image 6.7 Clustering Algorithms.png
│ ├── Image 6.70 Normalized Image.png
│ ├── Image 6.71 Part-of-Speech mapping.png
│ ├── Image 6.72 Part of Speech frequency.png
│ ├── Image 6.73 Word Cloud using Amazon Food Review Dataset.png
│ ├── Image 6.74 Langauge detection input.png
│ ├── Image 6.75 Language detection output.png
│ ├── Image 6.76 Online Machine learning Algorithms.jpg
│ ├── Image 6.8 Association Rule Mining.png
│ ├── Image 6.9 Artificial Neural Networks.png
│ ├── Table 6.1 House Sale Prices.png
│ ├── Table 6.2 Purchase Preference.png
│ ├── Table 6.3 Breast Cancer Wisconsin.png
│ ├── Table 6.4 Market basket Data.png
│ └── Table 6.5 Amazon Food Review.png
├── Chapter 7
│ └── Images
│ │ ├── Image 7.1 Distribution of House Sale Price.png
│ │ ├── Image 7.2 Distribution of Product Choice Options.png
│ │ ├── Image 7.3 ECDF Plots for Set_1 and Set_2 .png
│ │ ├── Image 7.4 ECDF Plots for Set_1 and Set_2(Manipulated).png
│ │ ├── Image 7.5 Actual Vs Predicted Plot.png
│ │ ├── Image 7.6 Image Explaining Squared Errors.png
│ │ ├── Image 7.7 Two Class Classification Matrix.png
│ │ ├── Image 7.8 ROC Curve .png
│ │ ├── ch07_classi_06-1.png
│ │ ├── ch07_mae_01-1.png
│ │ ├── ch07_population_stability_03-1.png
│ │ ├── ch07_population_stability_04-1.png
│ │ ├── chap7_Data_load_01-1.png
│ │ └── chap7_Data_load_02-1.png
├── Chapter 8
│ └── Images
│ │ ├── Image 8.1 train() function algorithm in caret package.png
│ │ ├── Image 8.10 Voting Ensemple Learning for Classification Problem.gif
│ │ ├── Image 8.11 Bagging Ensemble Flow.png
│ │ ├── Image 8.12 Boosting Ensemble Flow.png
│ │ ├── Image 8.13 Accuracy and kappa of bagged tree.png
│ │ ├── Image 8.14 Accuracy across boosting iterations – C5.0.png
│ │ ├── Image 8.15 Accuracy across Boosting iterations – GBM.png
│ │ ├── Image 8.16 Accuracy across Boosting Ensemble.png
│ │ ├── Image 8.17Accuracy and kappa of individual models.png
│ │ ├── Image 8.18 Scatter plot to high list correlation among results from stacked models.png
│ │ ├── Image 8.19 RMSE in Cost and Sigma space .png
│ │ ├── Image 8.2 Performance plot accuracy metrics.png
│ │ ├── Image 8.3 Accuracy across cross validated samples.png
│ │ ├── Image 8.4 Accuracy across cross validated samples – Automatic Grid Search.png
│ │ ├── Image 8.5 Accuracy across cross validated samples and complexity parameters.png
│ │ ├── Image 8.6 Accuracy across cross validated sets and randomly selected predictors.png
│ │ ├── Image 8.7 Accuracy across cross validated samples and parameter mtry.png
│ │ ├── Image 8.8 Bias and Variance Illustration- Bulls Eye Plot.png
│ │ └── Image 8.9 Bias vs Variance trade off plot.png
└── Chapter 9
│ └── Images
│ ├── Image 9.1 Google File System.png
│ ├── Image 9.11 Data in the HDFS file.png
│ ├── Image 9.12 Connecting to Pig using local filesystem.png
│ ├── Image 9.13 Load data into A1.png
│ ├── Image 9.14 Tokenize each line.png
│ ├── Image 9.15 Flatten tokens.png
│ ├── Image 9.16 Group Words.png
│ ├── Image 9.17 Count and Sort.png
│ ├── Image 9.18 Starting Hbase.png
│ ├── Image 9.19 Create and put data.png
│ ├── Image 9.2 MapReduce Execution Flow.png
│ ├── Image 9.20 Scan the data.png
│ ├── Image 9.21 Feature definition of Prostatic Cancer data set.png
│ ├── Image 9.3 Hadoop Components and tools..png
│ ├── Image 9.4 Word Count example using MapReduce.png
│ ├── Image 9.5 Hive Create table command.png
│ ├── Image 9.6 Hive table in HDFS.png
│ ├── Image 9.7 Describe table command.png
│ ├── Image 9.8 Generate data and store in local file.png
│ └── Image 9.9 Load data into hive table.png
├── Dataset
│   ├── Chapter 2.zip
│   ├── Chapter 3.zip
│   ├── Chapter 4.zip
│   ├── Chapter 5.zip
│   ├── Chapter 6.zip
│   ├── Chapter 7.zip
│   ├── Chapter 8.zip
│   └── Chapter 9.zip
├── LICENSE.txt
├── README.md
├── contributing.md
└── errata.md
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.zip filter=lfs diff=lfs merge=lfs -text
2 |
--------------------------------------------------------------------------------
/1-select-file.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/machine-learning-using-r/51627d232360f7199939056aacab3de5699117c7/1-select-file.jpg
--------------------------------------------------------------------------------
/2-download.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/machine-learning-using-r/51627d232360f7199939056aacab3de5699117c7/2-download.jpg
--------------------------------------------------------------------------------
/9781484223338.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/machine-learning-using-r/51627d232360f7199939056aacab3de5699117c7/9781484223338.jpg
--------------------------------------------------------------------------------
/Code/Ch02_Ramasubramanian.R:
--------------------------------------------------------------------------------
1 | ## ---- echo = FALSE-------------------------------------------------------
2 |
3 | ## ------------------------------------------------------------------------
4 | read.csv("employees.csv", header = TRUE, sep = ",")
5 |
6 |
7 | ## ---- message=FALSE, warning=FALSE---------------------------------------
8 | library(xlsx)
9 | read.xlsx("employees.xlsx",sheetName = "Sheet1")
10 |
11 | ## ---- eval=FALSE---------------------------------------------------------
12 | ##
13 | ##
14 | ##
15 | ## Mike
16 | ## 25
17 | ##
18 | ## Two times world champion. Currently, worlds No. 3
19 | ##
20 | ## 6
21 | ##
22 | ##
23 | ## Usain
24 | ## 29
25 | ##
26 | ## Five times world champion. Currently, worlds No. 1
27 | ##
28 | ## 17
29 | ##
30 | ##
31 | ##
32 |
33 | ## ---- message=FALSE, warning=FALSE---------------------------------------
34 |
35 | library(XML)
36 | library(plyr)
37 |
38 | xml_data <- xmlToList("marathon.xml")
39 |
40 | #Excluding "description" from print
41 | ldply(xml_data, function(x) { data.frame(x[!names(x)=="description"]) } )
42 |
43 |
44 | ## ----header, eval=FALSE--------------------------------------------------
45 | ##
46 | ## Machine Learning with R
47 | ##
48 |
49 | ## ----headings, eval=FALSE------------------------------------------------
50 | ## Header
51 | ## Headings
52 | ## Tables
53 | ## Anchors
54 | ## Links
55 |
56 | ## ----para, eval=FALSE----------------------------------------------------
57 | ##
58 | ## Paragraph 1
59 | ## Paragraph 2
60 | ##
61 |
62 | ## ----table, eval=FALSE---------------------------------------------------
63 | ##
64 | ##
65 | ##
66 | ## Define a row
67 | ##
68 | ##
69 | ##
70 | ##
71 | ##
72 |
73 | ## ----anchor, eval=FALSE--------------------------------------------------
74 | ##
75 | ##
76 | ## __ Welcome to Machine Learning with R!
77 | ##
78 |
79 | ## ---- eval=FALSE---------------------------------------------------------
80 | ##
81 | ##
82 | ##
83 | ##
84 | ## Machine Learning with R
85 | ## Hope you having fun time reading this book !!
86 | ## Chapter 2
87 | ## Data Exploration and Preparation
88 | ## Apress Website Link
89 | ##
90 | ##
91 | ##
92 |
93 | ## ------------------------------------------------------------------------
94 | library(XML)
95 | url <- "html_example.html"
96 | doc <- htmlParse(url)
97 | xpathSApply(doc, "//a/@href")
98 |
99 |
100 | ## ----json, eval=FALSE----------------------------------------------------
101 | ## {
102 | ## "data": [
103 | ## {
104 | ## "id": "A1_B1",
105 | ## "from": {
106 | ## "name": "Jerry", "id": "G1"
107 | ## },
108 | ## "message": "Hey! Hope you like the book so far",
109 | ## "actions": [
110 | ## {
111 | ## "name": "Comment",
112 | ## "link": "http://www.facebook.com/A1/posts/B1"
113 | ## },
114 | ## {
115 | ## "name": "Like",
116 | ## "link": "http://www.facebook.com/A1/posts/B1"
117 | ## }
118 | ## ],
119 | ## "type": "status",
120 | ## "created_time": "2016-08-02T00:24:41+0000",
121 | ## "updated_time": "2016-08-02T00:24:41+0000"
122 | ## },
123 | ## {
124 | ## "id": "A2_B2",
125 | ## "from": {
126 | ## "name": "Tom", "id": "G2"
127 | ## },
128 | ## "message": "Yes. Easy to understand book",
129 | ## "actions": [
130 | ## {
131 | ## "name": "Comment",
132 | ## "link": "http://www.facebook.com/A2/posts/B2"
133 | ## },
134 | ## {
135 | ## "name": "Like",
136 | ## "link": "http://www.facebook.com/A2/posts/B2"
137 | ## }
138 | ## ],
139 | ## "type": "status",
140 | ## "created_time": "2016-08-03T21:27:44+0000",
141 | ## "updated_time": "2016-08-03T21:27:44+0000"
142 | ## }
143 | ## ]
144 | ## }
145 |
146 | ## ------------------------------------------------------------------------
147 | library(rjson)
148 | url <- "json_fb.json"
149 | document <- fromJSON(file=url, method='C')
150 | as.data.frame(document)[,1:3]
151 |
152 | ## ------------------------------------------------------------------------
153 | emp <- read.csv("employees.csv", header = TRUE, sep = ",")
154 | str(emp)
155 |
156 |
157 | ## ------------------------------------------------------------------------
158 |
159 | #Manually overriding the naming convention
160 | names(emp) <- c('Code','First Name','Last Name', 'Salary(US Dollar)')
161 |
162 | # Look at the data
163 | emp
164 |
165 | # Now let's clean it up using make.names
166 | names(emp) <- make.names(names(emp))
167 |
168 | # Look at the data
169 | emp
170 |
171 |
172 | ## ------------------------------------------------------------------------
173 | #Find duplicates
174 | table(emp$Code)
175 |
176 | #Find common names
177 | table(emp$First.Name)
178 |
179 |
180 | ## ---- echo=FALSE---------------------------------------------------------
181 | emp <- read.csv("employees.csv")
182 | emp_qual <- read.csv("employees_qual.csv")
183 |
184 | ## ------------------------------------------------------------------------
185 | merge(emp, emp_qual, by = "Code")
186 |
187 | ## ------------------------------------------------------------------------
188 | merge(emp, emp_qual, by = "Code", all.x = TRUE)
189 |
190 | ## ------------------------------------------------------------------------
191 | merge(emp, emp_qual, by = "Code", all.y = TRUE)
192 |
193 | ## ------------------------------------------------------------------------
194 | merge(emp, emp_qual, by = "Code", all = TRUE)
195 |
196 | ## ---- warning=FALSE, message=FALSE---------------------------------------
197 | library(dplyr)
198 |
199 | ## ------------------------------------------------------------------------
200 | inner_join(emp, emp_qual, by = "Code")
201 |
202 | ## ------------------------------------------------------------------------
203 | left_join(emp, emp_qual, by = "Code")
204 |
205 | ## ------------------------------------------------------------------------
206 | right_join(emp, emp_qual, by = "Code")
207 |
208 | ## ------------------------------------------------------------------------
209 | full_join(emp, emp_qual, by = "Code")
210 |
211 | ## ------------------------------------------------------------------------
212 |
213 | employees_qual <- read.csv("employees_qual.csv")
214 |
215 | #Inconsistent
216 | employees_qual
217 |
218 | employees_qual$Qual = as.character(employees_qual$Qual)
219 | employees_qual$Qual <- ifelse(employees_qual$Qual %in% c("Phd","phd","PHd"), "PhD", employees_qual$Qual)
220 |
221 | #Corrected
222 | employees_qual
223 |
224 |
225 | ## ------------------------------------------------------------------------
226 |
227 | emp <- read.csv("employees.csv")
228 | employees_qual <- read.csv("employees_qual.csv")
229 |
230 | #Correcting the inconsistency
231 | employees_qual$Qual = as.character(employees_qual$Qual)
232 | employees_qual$Qual <- ifelse(employees_qual$Qual %in% c("Phd","phd","PHd"), "PhD", employees_qual$Qual)
233 |
234 | #Store the output from right_join in the variable impute_salary
235 | impute_salary <- right_join(emp, employees_qual, by = "Code")
236 |
237 | #Calculate the average salary for each Qualification
238 | ave_age <- ave(impute_salary$Salary.US.Dollar., impute_salary$Qual,
239 | FUN = function(x) mean(x, na.rm = TRUE))
240 |
241 | #Fill the NAs with the average values
242 | impute_salary$Salary.US.Dollar. <- ifelse(is.na(impute_salary$Salary.US.Dollar.), ave_age, impute_salary$Salary.US.Dollar.)
243 |
244 | impute_salary
245 |
246 |
247 | ## ---- warning=FALSE,message=FALSE----------------------------------------
248 |
249 | library("lubridate")
250 | date <- as.POSIXct("2016-03-13 09:51:48")
251 | date
252 | with_tz(date, "UTC")
253 |
254 |
255 | ## ------------------------------------------------------------------------
256 |
257 | dst_time <- ymd_hms("2010-03-14 01:59:59")
258 | dst_time <- force_tz(dst_time, "America/Chicago")
259 | dst_time
260 |
261 |
262 | ## ------------------------------------------------------------------------
263 | dst_time + dseconds(1)
264 |
265 |
266 | ## ----warning=FALSE, message=FALSE----------------------------------------
267 |
268 | library(data.table)
269 | WDI_Data <- fread("WDI_Data.csv", header = TRUE, skip = 333555, select = c(3,40,43))
270 | setnames(WDI_Data, c("Dev_Indicators", "1995","1998"))
271 | WDI_Data <- WDI_Data[c(1,3),]
272 |
273 |
274 | ## ------------------------------------------------------------------------
275 | WDI_Data[,"Dev_Indicators", with = FALSE]
276 |
277 |
278 | ## ------------------------------------------------------------------------
279 | WDI_Data[,2:3, with = FALSE]
280 |
281 |
282 | ## ---- warning=FALSE, message=FALSE---------------------------------------
283 | library(tidyr)
284 | gather(WDI_Data,Year,Value, 2:3)
285 |
286 |
287 | ## ------------------------------------------------------------------------
288 | marathon <- read.csv("marathon.csv")
289 | summary(marathon)
290 | quantile(marathon$Finish_Time, 0.25)
291 |
292 |
293 | ## ------------------------------------------------------------------------
294 |
295 | quantile(marathon$Finish_Time, 0.25)
296 |
297 |
298 | ## ------------------------------------------------------------------------
299 |
300 | quantile(marathon$Finish_Time, 0.5)
301 |
302 | #Another function to calculate median
303 |
304 | median(marathon$Finish_Time)
305 |
306 |
307 | ## ------------------------------------------------------------------------
308 |
309 | quantile(marathon$Finish_Time, 0.75)
310 |
311 |
312 | ## ------------------------------------------------------------------------
313 |
314 | quantile(marathon$Finish_Time, 0.75, names = FALSE) - quantile(marathon$Finish_Time, 0.25, names = FALSE)
315 |
316 |
317 | ## ------------------------------------------------------------------------
318 |
319 | mean(marathon$Finish_Time)
320 |
321 |
322 | ## ----fig.width=10, fig.height=5, fig.cap='Frequency plot for Marathon Finish Time'----
323 | plot(marathon$Type, xlab = "Marathoners Type", ylab = "Number of Marathoners")
324 |
325 | ## ----fig.width=10, fig.height=5, fig.cap='Boxplot of Marathon Finish Time'----
326 | boxplot(Finish_Time ~ Type,data=marathon, main="Marathon Data", xlab="Type of Marathoner", ylab="Finish Time")
327 |
328 | ## ------------------------------------------------------------------------
329 |
330 | mean(marathon$Finish_Time)
331 | var(marathon$Finish_Time)
332 | sd(marathon$Finish_Time)
333 |
334 |
335 | ## ------------------------------------------------------------------------
336 | tapply(marathon$Finish_Time,marathon$Type, mean)
337 | tapply(marathon$Finish_Time,marathon$Type, sd)
338 |
339 |
340 | ## ---- fig.width=10, fig.height=5, fig.cap='Skewness Plots'---------------
341 |
342 | library("moments")
343 |
344 | par(mfrow=c(1,3), mar=c(5.1,4.1,4.1,1))
345 |
346 | # Negative skew
347 | hist(rbeta(10000,2,6), main = "Negative Skew" )
348 | skewness(rbeta(10000,2,6))
349 |
350 | # Positive skew
351 | hist(rbeta(10000,6,2), main = "Positive Skew")
352 | skewness(rbeta(10000,6,2))
353 |
354 | # Symmetrical
355 | hist(rbeta(10000,6,6), main = "Symmetrical")
356 | skewness(rbeta(10000,6,6))
357 |
358 |
359 | ## ---- fig.width=10, fig.height=5, fig.cap='Marathon Finish Time'---------
360 |
361 | hist(marathon$Finish_Time, main = "Marathon Finish Time")
362 | skewness(marathon$Finish_Time)
363 |
364 |
365 | ## ----fig.width=10, fig.height=5, fig.cap='Kurtosis Plots'----------------
366 |
367 | #leptokurtic
368 | set.seed(2)
369 | random_numbers <- rnorm(20000,0,0.5)
370 | plot(density(random_numbers), col = "blue", main = "Kurtosis Plots", lwd=2.5, asp = 4)
371 | kurtosis(random_numbers)
372 |
373 |
374 | #platykurtic
375 | set.seed(900)
376 | random_numbers <- rnorm(20000,0,0.6)
377 | lines(density(random_numbers), col = "red", lwd=2.5)
378 | kurtosis(random_numbers)
379 |
380 |
381 | #mesokurtic
382 | set.seed(3000)
383 | random_numbers <- rnorm(20000,0,1)
384 | lines(density(random_numbers), col = "green", lwd=2.5)
385 | kurtosis(random_numbers)
386 |
387 | legend(1,0.7, c("leptokurtic", "platykurtic","mesokurtic" ),
388 | lty=c(1,1),
389 | lwd=c(2.5,2.5),col=c("blue","red","green"))
390 |
391 |
392 |
393 | ## ----fig.width=10, fig.height=5, fig.cap='Marathon Finish Time'----------
394 |
395 | plot(density(as.numeric(marathon$Finish_Time)), col = "blue", main = "Kurtosis Plots", lwd=2.5, asp = 4)
396 | kurtosis(marathon$Finish_Time)
397 |
398 |
399 | ## ---- warning=FALSE, message=FALSE, cache=TRUE---------------------------
400 |
401 | library(data.table)
402 | data <- fread("ccFraud.csv",header=T, verbose = FALSE, showProgress = FALSE)
403 | str(data)
404 |
405 |
406 | ## ---- warning=FALSE, cache = TRUE----------------------------------------
407 |
408 | library(data.table)
409 | US_state <- fread("US_State_Code_Mapping.csv",header=T, showProgress = FALSE)
410 | data<-merge(data, US_state, by = 'state')
411 |
412 |
413 | ## ------------------------------------------------------------------------
414 | library(data.table)
415 | Gender_map<-fread("Gender Map.csv",header=T)
416 | data<-merge(data, Gender_map, by = 'gender')
417 |
418 |
419 | ## ------------------------------------------------------------------------
420 |
421 | library(data.table)
422 | Credit_line<-fread("credit line map.csv",header=T)
423 | data<-merge(data, Credit_line, by = 'creditLine')
424 |
425 |
426 | ## ---- echo=FALSE---------------------------------------------------------
427 | data$gender<-NULL
428 | data$state<-NULL
429 | data$PostalCode<- NULL
430 |
431 | ## ------------------------------------------------------------------------
432 | setnames(data,"custID","CustomerID")
433 | setnames(data,"code","Gender")
434 | setnames(data,"numTrans","DomesTransc")
435 | setnames(data,"numIntlTrans","IntTransc")
436 | setnames(data,"fraudRisk","FraudFlag")
437 | setnames(data,"cardholder","NumOfCards")
438 | setnames(data,"balance","OutsBal")
439 | setnames(data,"StateName","State")
440 |
441 | str(data)
442 |
443 |
444 | ## ------------------------------------------------------------------------
445 |
446 | summary(data[,c("NumOfCards","OutsBal","DomesTransc",
447 | "IntTransc"),with = FALSE])
448 |
449 |
450 | ## ------------------------------------------------------------------------
451 | boxplot(I(DomesTransc + IntTransc ) ~ Gender, data = data)
452 | title("Number of Domestic Transaction")
453 |
454 | tapply(I(data$DomesTransc + data$IntTransc),data$Gender, median)
455 |
456 | tapply(I(data$DomesTransc + data$IntTransc),data$Gender, mean)
457 |
458 |
459 |
460 | ## ------------------------------------------------------------------------
461 |
462 | table(data$CardType,data$FraudFlag)
463 |
464 |
465 | ## ------------------------------------------------------------------------
466 |
467 | table(data$Gender,data$FraudFlag)
468 |
469 |
470 |
--------------------------------------------------------------------------------
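A minimal standalone sketch of the ave() group-mean imputation idiom used in the script above; the toy data frame and column names here are hypothetical, not the book's employees data:

# Toy data: salaries with missing values, to be filled with the group mean
df <- data.frame(Qual   = c("PhD", "PhD", "MSc", "MSc", "MSc"),
                 Salary = c(100, NA, 60, 70, NA))

# Group-wise mean salary, aligned row by row with df
grp_mean <- ave(df$Salary, df$Qual, FUN = function(x) mean(x, na.rm = TRUE))

# Replace NAs with the corresponding group mean
df$Salary <- ifelse(is.na(df$Salary), grp_mean, df$Salary)
df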
/Code/Ch05_Singh.R:
--------------------------------------------------------------------------------
1 | ## ------------------------------------------------------------------------
2 | library(knitr)
3 | opts_chunk$set(dev="png",
4 | dev.args=list(type="cairo"),
5 | dpi=96)
6 | knitr::opts_chunk$set(
7 | fig.path = "Images/"
8 | )
9 |
10 | ## ----ch_05_Loading the data, warning=FALSE-------------------------------
11 | setwd("C:/Personal/Machine Learning/Final Artwork and Code/Chapter 5")
12 |
13 | ##Input the data and store in data table
14 |
15 | library(data.table)
16 |
17 | data <-fread ("Dataset/Loan Default Prediction.csv",header=T, verbose = FALSE, showProgress = TRUE)
18 |
19 | dim(data)
20 |
21 |
22 | ## ----ch_05_1_Type_of_Data_01---------------------------------------------
23 |
24 | #Summary of the data
25 | summary(data$loss)
26 |
27 |
28 |
29 | ## ----ch_05_1_Type_of_Data_02---------------------------------------------
30 | hist(data$loss,
31 | main="Histogram for Loss Distribution ",
32 | xlab="Loss",
33 | border="blue",
34 | col="red",
35 | las=0,
36 | breaks=100,
37 | prob = TRUE)
38 |
39 |
40 | ## ----ch_05_1_Type_of_Data_03---------------------------------------------
41 |
42 | #Subset the data into non-loss and loss (i.e., loss > 0) cases
43 |
44 | subset_loss <- subset(data,loss !=0)
45 |
46 | #Distribution of cases where some loss is registered
47 |
48 | hist(subset_loss$loss,
49 | main="Histogram for Loss Distribution (Default cases) ",
50 | xlab="Loss",
51 | border="blue",
52 | col="red",
53 | las=0,
54 | breaks=100,
55 | prob = TRUE)
56 |
57 | ## ----ch_05_1_Type_of_Data_04---------------------------------------------
58 |
59 | #Create the default variable
60 |
61 | data[,default := ifelse(data$loss == 0, 0,1)]
62 |
63 | #Distribution of defaults
64 | table(data$default)
65 |
66 | #Event rate is defined as the ratio of default cases to the total population
67 |
68 | print(table(data$default)*100/nrow(data))
69 |
70 |
71 | ## ----ch_05_2_Feature_type_01---------------------------------------------
72 |
73 | continuos <- character()
74 | categorical <- character()
75 | #Write a loop to go over all features and find unique values
76 | p<-1
77 | q<-1
78 | for (i in names(data))
79 | {
80 | unique_levels = length(unique(data[,get(i)]))
81 |
82 | if(i %in% c("id","loss","default"))
83 | {
84 | next;
85 | }
86 |
87 | else
88 | {
89 | if (unique_levels <=30 | is.character(data[,get(i)]))
90 | {
91 | # cat("The feature ", i, " is a categorical variable")
92 | categorical[p] <- i
93 | p=p+1
94 | # Making the feature a factor
95 | data[[i]] <- factor(data[[i]])
96 | }
97 | else
98 | {
99 | # cat("The feature ", i, " is a continuous variable")
100 | continuos[q] <- i
101 | q=q+1
102 |
103 | }
104 | }
105 | }
106 |
107 | # subtract 1 as one is dependent variable = default
108 | cat("\nTotal number of continuous variables in the feature set ", length(continuos) - 1)
109 |
110 | # subtract 2 as one is loss and one is id
111 | cat("\nTotal number of categorical variables in the feature set ", length(categorical) - 2)
112 |
113 |
114 | ## ----ch05_feature_ranking_01, warning=FALSE------------------------------
115 |
116 | library(MLmetrics)
117 |
118 | performance_metric_gini <- data.frame(feature = character(), Gini_value = numeric())
119 |
120 | #Write a loop to go over all features and find unique values
121 | for (feature in names(data))
122 | {
123 | if(feature %in% c("id","loss","default"))
124 | {
125 | next;
126 | }
127 | else
128 | {
129 | tryCatch({glm_model <- glm(default ~ get(feature),data=data,family=binomial(link="logit"));
130 |
131 | predicted_values <- predict.glm(glm_model,newdata=data,type="response");
132 |
133 | Gini_value <- Gini(predicted_values,data$default);
134 |
135 | performance_metric_gini <- rbind(performance_metric_gini,cbind(feature,Gini_value));},error=function(e){})
136 |
137 | }
138 | }
139 |
140 | performance_metric_gini$Gini_value <- as.numeric(as.character(performance_metric_gini$Gini_value))
141 | #Rank the features by value of Gini Coefficient
142 |
143 | Ranked_Features <- performance_metric_gini[order(-performance_metric_gini$Gini_value),]
144 |
145 | print("Top 5 Features by Gini Coefficients\n")
146 |
147 | head(Ranked_Features)
148 |
149 |
150 | ## ----ch05_feature_ranking_02, message=FALSE,warning=FALSE----------------
151 | #Create a logistic model with top 6 features (f766,f404,f629,f630,f281 and f322)
152 |
153 | glm_model <- glm(default ~ f766 + f404 + f629 + f630 + f281 + f322,data=data,family=binomial(link="logit"));
154 |
155 | predicted_values <- predict.glm(glm_model,newdata=data,type="response");
156 |
157 | Gini_value <- Gini(predicted_values,data$default);
158 |
159 | summary(glm_model)
160 |
161 | Gini_value
162 |
163 | ## ----ch05_feature_ranking_03---------------------------------------------
164 | #Create the correlation matrix for 6 features (f766,f404,f629,f630,f281 and f322)
165 |
166 | top_6_feature <- data.frame(data$f766,data$f404,data$f629,data$f630,data$f281,data$f322)
167 |
168 | cor(top_6_feature, use="complete")
169 |
170 | ## ----ch05_filter_method_01, warning=FALSE--------------------------------
171 | #Calculate the variance of each individual variable and standardize it by dividing by the mean
172 |
173 | coefficient_of_variance <- data.frame(feature = character(), cov = numeric())
174 |
175 | #Write a loop to go over all features and calculate variance
176 | for (feature in names(data))
177 | {
178 | if(feature %in% c("id","loss","default"))
179 | {
180 | next;
181 | }
182 | else if(feature %in% continuos)
183 | {
184 | tryCatch(
185 | {cov <- abs(sd(data[[feature]], na.rm = TRUE)/mean(data[[feature]],na.rm = TRUE));
186 | if(cov != Inf){
187 | coefficient_of_variance <- rbind(coefficient_of_variance,cbind(feature,cov));} else {next;}},error=function(e){})
188 |
189 | }
190 | else
191 | {
192 | next;
193 | }
194 | }
195 |
196 | coefficient_of_variance$cov <- as.numeric(as.character(coefficient_of_variance$cov))
197 |
198 | #Order the list by highest to lowest coefficient of variation
199 |
200 | Ranked_Features_cov <- coefficient_of_variance[order(-coefficient_of_variance$cov),]
201 |
202 | print("Top 5 Features by Coefficient of Variance\n")
203 |
204 | head(Ranked_Features_cov)
205 |
206 |
207 | ## ----ch05_filter_method_02-----------------------------------------------
208 | #Create a logistic model with top 6 features (f338,f422,f724,f636,f775 and f723)
209 |
210 | glm_model <- glm(default ~ f338 + f422 + f724 + f636 + f775 + f723,data=data,family=binomial(link="logit"));
211 |
212 | predicted_values <- predict.glm(glm_model,newdata=data,type="response");
213 |
214 | Gini_value <- Gini(predicted_values,data$default);
215 |
216 | summary(glm_model)
217 |
218 | cat("The Gini Coefficient for the fitted model is ",Gini_value);
219 |
220 | ## ----ch05_filter_method_03-----------------------------------------------
221 | #Create the correlation matrix for 6 features (f338,f422,f724,f636,f775 and f723)
222 |
223 | top_6_feature <- data.frame(as.double(data$f338),as.double(data$f422),as.double(data$f724),as.double(data$f636),as.double(data$f775),as.double(data$f723))
224 |
225 | cor(top_6_feature, use="complete")
226 |
227 | ## ----ch05_wrapper_method_01----------------------------------------------
228 | #Pull the 5 variables with the highest coefficient of variation from the filter method (f338, f422, f724, f636 and f775)
229 |
230 | predictor_set <- c("f338","f422","f724","f636","f775")
231 |
232 | #Randomly pull 5 variables from the categorical variable set (the reader can apply the filter method to categorical variables and choose these 5 variables systematically as well)
233 | set.seed(101);
234 | ind <- sample(1:length(categorical), 5, replace=FALSE)
235 | p<- 1
236 | for (i in ind)
237 | {
238 | predictor_set [5+p] <- categorical[i]
239 | p=p+1
240 | }
241 |
242 | #Print the set of 10 variables we will be working with
243 |
244 | print(predictor_set)
245 |
246 | #Replaced f33 by f93 as f33 does not have levels
247 |
248 | predictor_set[7] <- "f93"
249 |
250 | #Print final list of variables
251 |
252 | print(predictor_set)
253 |
254 | ## ----ch05_wrapper_method_02----------------------------------------------
255 | # Create a small modeling dataset with only the predictors and the dependent variable
256 | library(data.table)
257 | data_model <- data[,.(id,f338,f422,f724,f636,f775,f222,f93,f309,f303,f113,default),]
258 | #Make sure to remove the missing cases to avoid errors from null values
259 |
260 | data_model<-na.omit(data_model)
261 |
262 | #Full model uses all the 10 variables
263 | full_model <- glm(default ~ f338 + f422 + f724 + f636 + f775 + f222 + f93 + f309 + f303 + f113,data=data_model,family=binomial(link="logit"))
264 |
265 | #Summary of the full model
266 | summary(full_model)
267 |
268 | ## ----ch05_wrapper_method_03----------------------------------------------
269 | #Null model uses no variables
270 | null_model <- glm(default ~ 1 ,data=data_model,family=binomial(link="logit"))
271 |
272 | #Summary of the null model
273 | summary(null_model)
274 |
275 | ## ----ch05_wrapper_method_04----------------------------------------------
276 | #Summary of backward step selection
277 | backwards <- step(full_model)
278 |
279 |
280 | ## ----ch05_wrapper_method_05----------------------------------------------
281 | #summary of forward selection method
282 | forwards <- step(null_model,scope=list(lower=formula(null_model),upper=formula(full_model)), direction="forward")
283 |
284 |
285 | ## ----ch05_wrapper_method_06----------------------------------------------
286 | #Summary of final model with backward selection process
287 | formula(backwards)
288 |
289 | #Summary of final model with forward selection process
290 | formula(forwards)
291 |
292 | ## ----ch05_embeded_method_01, warning=FALSE,message=FALSE-----------------
293 |
294 | #Create a data frame with dependent and independent variables (remove NAs)
295 |
296 | data_model <- na.omit(data)
297 |
298 | y <- as.matrix(data_model$default)
299 |
300 | x <- as.matrix(subset(data_model, select=continuos[250:260]))
301 |
302 | library("glmnet")
303 |
304 | ## ----ch05_embeded_method_02, warning=FALSE,message=FALSE-----------------
305 |
306 | #Fit a model with dependent variable of binomial family
307 | fit = glmnet(x,y, family= "binomial")
308 |
309 | #Summary of fit model
310 | summary(fit)
311 |
312 | #Plot the output of glmnet fit model
313 | plot (fit, xvar= "dev", label=TRUE)
314 |
315 | ## ----ch05_embeded_method_03, warning=FALSE,message=FALSE-----------------
316 |
317 | #Fit a cross validated binomial model
318 | fit_logistic = cv.glmnet(x,y, family= "binomial", type.measure = "class")
319 |
320 | #Summary of fitted Cross Validated Linear Model
321 |
322 | summary(fit_logistic)
323 |
324 | #Plot the results
325 | plot (fit_logistic)
326 |
327 | ## ----ch05_embeded_method_04----------------------------------------------
328 | #Print the minimum lambda - regularization factor
329 | print(fit_logistic$lambda.min)
330 |
331 | print(fit_logistic$lambda.1se)
332 |
333 | #Against the lambda minimum value we can get the coefficients
334 | param <- coef(fit_logistic, s="lambda.min")
335 |
336 | param <- as.data.frame(as.matrix(param))
337 |
338 | param$feature<-rownames(param)
339 |
340 | #The list of variables suggested by the embedded method
341 |
342 | param_embeded <- param[param$`1` > 0,]
343 |
344 | print(param_embeded)
345 |
346 | ## ----ch05_principle_component_01-----------------------------------------
347 | #Take a subset of 10 features
348 | pca_data <- data[,.(f381,f408,f495,f529,f549,f539,f579,f634,f706,f743)]
349 |
350 | pca_data <- na.omit(pca_data)
351 |
352 | head(pca_data)
353 |
354 | #Normalise the data before applying PCA: mean = 0 and sd = 1
355 | scaled_pca_data <- scale(pca_data)
356 |
357 | head(scaled_pca_data)
358 |
359 | #Do the decomposition on the scaled series
360 | pca_results <- prcomp(scaled_pca_data)
361 |
362 | print(pca_results)
363 |
364 | summary(pca_results)
365 |
366 | plot(pca_results)
367 |
368 | #Create the biplot with principal components
369 | biplot(pca_results, col = c("red", "blue"))
370 |
371 |
372 |
--------------------------------------------------------------------------------
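A small self-contained sketch of the cv.glmnet workflow applied above, run on simulated data (purely illustrative; the loan-default features are not reproduced here):

library(glmnet)

set.seed(101)
x <- matrix(rnorm(500 * 10), ncol = 10)              # 10 simulated predictors
y <- rbinom(500, 1, plogis(x[, 1] - 0.5 * x[, 2]))   # binary outcome driven by x1 and x2

# Cross-validated logistic LASSO, misclassification error as the CV measure
cv_fit <- cv.glmnet(x, y, family = "binomial", type.measure = "class")

plot(cv_fit)                      # error vs. log(lambda)
coef(cv_fit, s = "lambda.min")    # non-zero rows are the retained features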
/Code/Ch06_Clustering.R:
--------------------------------------------------------------------------------
1 | ## ----opts, echo = FALSE--------------------------------------------------
2 | library(knitr)
3 | opts_chunk$set(dev="png",
4 | dev.args=list(type="cairo"),
5 | dpi=96)
6 | knitr::opts_chunk$set(
7 | fig.path = "~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Images/004/"
8 | )
9 |
10 | ## ----ch06_hierachal_clust_01---------------------------------------------
11 | # Read the house Worth Data
12 | Data_House_Worth <- read.csv("~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Dataset/House Worth Data.csv",header=TRUE);
13 |
14 | str(Data_House_Worth)
15 |
16 | #Remove the extra column as we will not be using it
17 | Data_House_Worth$BasementArea <- NULL
18 |
19 | ## ----ch06_hierachal_clust_02---------------------------------------------
20 | library(ggplot2)
21 | ggplot(Data_House_Worth, aes(StoreArea, LawnArea, color = HouseNetWorth)) + geom_point()
22 |
23 | ## ----ch06_hierachal_clust_03---------------------------------------------
24 | # Apply the hierarchical clustering algorithm
25 | clusters <- hclust(dist(Data_House_Worth[,2:3]))
26 |
27 | #Plot the dendrogram
28 | plot(clusters)
29 |
30 | ## ----ch06_hierachal_clust_04---------------------------------------------
31 | # Create different number of clusters
32 | clusterCut_2 <- cutree(clusters, 2)
33 | #Tabulate the cluster assignments against the actual net worth
34 | table(clusterCut_2,Data_House_Worth$HouseNetWorth)
35 |
36 | clusterCut_3 <- cutree(clusters, 3)
37 | #Tabulate the cluster assignments against the actual net worth
38 | table(clusterCut_3,Data_House_Worth$HouseNetWorth)
39 |
40 |
41 | clusterCut_4 <- cutree(clusters, 4)
42 | #Tabulate the cluster assignments against the actual net worth
43 | table(clusterCut_4,Data_House_Worth$HouseNetWorth)
44 |
45 | ## ----ch06_hierachal_clust_05---------------------------------------------
46 | ggplot(Data_House_Worth, aes(StoreArea, LawnArea, color = HouseNetWorth)) +
47 | geom_point(alpha = 0.4, size = 3.5) + geom_point(col = clusterCut_3) +
48 | scale_color_manual(values = c('black', 'red', 'green'))
49 |
50 | ## ----ch06_kmeans_clust_06------------------------------------------------
51 |
52 | # Elbow Curve
53 |
54 | wss <- (nrow(Data_House_Worth)-1)*sum(apply(Data_House_Worth[,2:3],2,var))
55 | for (i in 2:15) {
56 | wss[i] <- sum(kmeans(Data_House_Worth[,2:3],centers=i)$withinss)
57 | }
58 | plot(1:15, wss, type="b", xlab="Number of Clusters",ylab="Within groups sum of squares")
59 |
60 | ## ----ch06_kmeans_clust_07------------------------------------------------
61 | set.seed(917)
62 | #Run k-means clustering on the dataset
63 | Cluster_kmean <- kmeans(Data_House_Worth[,2:3], 3, nstart = 20)
64 |
65 | #Tabulate the cross distribution
66 | table(Cluster_kmean$cluster,Data_House_Worth$HouseNetWorth)
67 |
68 | ## ----ch06_kmeans_clust_08------------------------------------------------
69 | Cluster_kmean$cluster <- factor(Cluster_kmean$cluster)
70 | ggplot(Data_House_Worth, aes(StoreArea, LawnArea, color = HouseNetWorth)) +
71 | geom_point(alpha = 0.4, size = 3.5) + geom_point(col = Cluster_kmean$cluster) +
72 | scale_color_manual(values = c('black', 'red', 'green'))
73 |
74 | ## ----ch06_dist_clust_01--------------------------------------------------
75 | library(EMCluster, quietly = TRUE)
76 |
77 | ret <- init.EM(Data_House_Worth[,2:3], nclass = 3)
78 | ret
79 |
80 | ret.new <- assign.class(Data_House_Worth[,2:3], ret, return.all = FALSE)
81 |
82 | #This has assigned a class to each case
83 | str(ret.new)
84 |
85 | # Plot results
86 | plotem(ret,Data_House_Worth[,2:3])
87 |
88 | ## ----ch06_disty_clust_02-------------------------------------------------
89 | ggplot(Data_House_Worth, aes(StoreArea, LawnArea, color = HouseNetWorth)) +
90 | geom_point(alpha = 0.4, size = 3.5) + geom_point(col = ret.new$class) +
91 | scale_color_manual(values = c('black', 'red', 'green'))
92 |
93 | ## ----ch06_dbscan_01------------------------------------------------------
94 | library(dbscan)
95 | cluster_dbscan <- dbscan(Data_House_Worth[,2:3],eps=0.8,minPts = 10)
96 | cluster_dbscan
97 |
98 | #Display the hull plot
99 | hullplot(Data_House_Worth[,2:3],cluster_dbscan$cluster)
100 |
101 | ## ----ch06_dunnIndex------------------------------------------------------
102 | library(clValid)
103 | #Shown for the hierarchical clustering with 3 clusters
104 | dunn(dist(Data_House_Worth[,2:3]), clusterCut_3)
105 |
106 | ## ----ch06_Silhoutte------------------------------------------------------
107 | library(cluster)
108 |
109 | #Shown for the 3-cluster hierarchical cut (clusterCut_3)
110 | sk <- silhouette(clusterCut_3,dist(Data_House_Worth[,2:3]))
111 |
112 | plot(sk)
113 |
114 | ## ----ch06_rand-----------------------------------------------------------
115 | #Using the result from the EM algorithm
116 | library(EMCluster)
117 | clust <- ret.new$class
118 | orig <- ifelse(Data_House_Worth$HouseNetWorth == "High",2,
119 | ifelse(Data_House_Worth$HouseNetWorth == "Low",1,2))
120 | RRand(orig, clust)
121 |
122 | ## ----ch06_jaccard--------------------------------------------------------
123 | #Using the result from the EM algorithm
124 | library(EMCluster)
125 | clust <- ret.new$class
126 | orig <- ifelse(Data_House_Worth$HouseNetWorth == "High",2,
127 | ifelse(Data_House_Worth$HouseNetWorth == "Low",1,2))
128 | Jaccard.Index(orig, clust)
129 |
130 |
--------------------------------------------------------------------------------
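The same elbow-curve idiom, sketched on the built-in iris data so it runs without the House Worth file (illustrative only):

# Two numeric columns from iris stand in for StoreArea/LawnArea
X <- iris[, c("Sepal.Length", "Petal.Length")]

# Total within-group sum of squares for k = 1..10 clusters
wss <- (nrow(X) - 1) * sum(apply(X, 2, var))
set.seed(917)
for (k in 2:10) wss[k] <- sum(kmeans(X, centers = k)$withinss)

plot(1:10, wss, type = "b",
     xlab = "Number of clusters k", ylab = "Within-groups sum of squares")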
/Code/Ch06_DecisionTree.R:
--------------------------------------------------------------------------------
1 | ## ----opts, echo = FALSE--------------------------------------------------
2 | library(knitr)
3 | opts_chunk$set(dev="png",
4 | dev.args=list(type="cairo"),
5 | dpi=96)
6 | knitr::opts_chunk$set(
7 | fig.path = "~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Images/003/"
8 | )
9 |
10 | ## ---- fig.cap="Image 6.x: Gini-Index Function"---------------------------
11 | curve(x * (1- x) + (1 - x) * x, xlab = "P", ylab = "Gini-Index", lwd = 5)
12 |
13 | ## ---- fig.cap="Image 6.x: Entropy Function"------------------------------
14 | curve(-x * log2(x) - (1 - x) * log2(1 - x), xlab = "x", ylab = "Entropy", lwd = 5)
15 |
16 | ## ---- message=FALSE, warning=FALSE---------------------------------------
17 |
18 | library(C50)
19 | library(splitstackshape)
20 | library(rattle)
21 | library(rpart.plot)
22 | library(data.table)
23 |
24 | Data_Purchase <- fread("~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Dataset/Purchase Prediction Dataset.csv",header=T, verbose = FALSE, showProgress = FALSE)
25 | str(Data_Purchase)
26 |
27 | #Check the distribution of data before grouping
28 | table(Data_Purchase$ProductChoice)
29 |
30 | ## ------------------------------------------------------------------------
31 |
32 | #Pulling out only the data relevant to this chapter
33 | Data_Purchase <- Data_Purchase[,.(CUSTOMER_ID,ProductChoice,MembershipPoints,IncomeClass,CustomerPropensity,LastPurchaseDuration)]
34 |
35 | #Delete NA from subset
36 | Data_Purchase <- na.omit(Data_Purchase)
37 | Data_Purchase$CUSTOMER_ID <- as.character(Data_Purchase$CUSTOMER_ID)
38 |
39 | #Stratified Sampling
40 | Data_Purchase_Model<-stratified(Data_Purchase, group=c("ProductChoice"),size=10000,replace=FALSE)
41 |
42 | print("The Distribution of equal classes is as below")
43 | table(Data_Purchase_Model$ProductChoice)
44 |
45 | Data_Purchase_Model$ProductChoice <- as.factor(Data_Purchase_Model$ProductChoice)
46 | Data_Purchase_Model$IncomeClass <- as.factor(Data_Purchase_Model$IncomeClass)
47 | Data_Purchase_Model$CustomerPropensity <- as.factor(Data_Purchase_Model$CustomerPropensity)
48 |
49 | #Build the decision tree on the train data (Set_1); the test data (Set_2) will be used for performance testing
50 |
51 | set.seed(917);
52 | train <- Data_Purchase_Model[sample(nrow(Data_Purchase_Model),size=nrow(Data_Purchase_Model)*(0.7), replace = TRUE, prob = NULL),]
53 | train <- as.data.frame(train)
54 |
55 | test <- Data_Purchase_Model[!(Data_Purchase_Model$CUSTOMER_ID %in% train$CUSTOMER_ID),]
56 |
57 |
58 | ## ---- message=FALSE,warning=FALSE,eval=FALSE-----------------------------
59 | ##
60 | ## library(RWeka)
61 | ##
62 | ## WPM("refresh-cache")
63 | ## WPM("install-package", "simpleEducationalLearningSchemes")
64 | ##
65 | ##
66 | ## ## make classifier
67 | ## ID3 <- make_Weka_classifier("weka/classifiers/trees/Id3")
68 | ##
69 | ## ID3Model <- ID3(ProductChoice ~ CustomerPropensity + IncomeClass , data = train)
70 | ##
71 | ## summary(ID3Model)
72 | ##
73 |
74 | ## ----eval=FALSE----------------------------------------------------------
75 | ##
76 | ## library(gmodels)
77 | ## purchase_pred_test <- predict(ID3Model, test)
78 | ## CrossTable(test$ProductChoice, purchase_pred_test,
79 | ## prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
80 | ## dnn = c('actual default', 'predicted default'))
81 | ##
82 |
83 | ## ------------------------------------------------------------------------
84 |
85 | model_c50 <- C5.0(train[,c("CustomerPropensity","LastPurchaseDuration", "MembershipPoints")],
86 | train[,"ProductChoice"],
87 | control = C5.0Control(CF = 0.001, minCases = 2))
88 |
89 |
90 | ## ---- warning=FALSE,message=FALSE----------------------------------------
91 |
92 | summary(model_c50)
93 | #plot(model_c50)
94 |
95 |
96 | ## ------------------------------------------------------------------------
97 |
98 | library(gmodels)
99 |
100 | purchase_pred_train <- predict(model_c50, train,type = "class")
101 | CrossTable(train$ProductChoice, purchase_pred_train,
102 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
103 | dnn = c('actual default', 'predicted default'))
104 |
105 |
106 | ## ------------------------------------------------------------------------
107 | purchase_pred_test <- predict(model_c50, test)
108 | CrossTable(test$ProductChoice, purchase_pred_test,prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
109 | dnn = c('actual default', 'predicted default'))
110 |
111 |
112 | ## ------------------------------------------------------------------------
113 |
114 | CARTModel <- rpart(ProductChoice ~ IncomeClass + CustomerPropensity + LastPurchaseDuration + MembershipPoints, data=train)
115 |
116 | summary(CARTModel)
117 |
118 | library(rpart.plot)
119 | library(rattle)
120 |
121 | #fancyRpartPlot(CARTModel)
122 |
123 |
124 | ## ------------------------------------------------------------------------
125 |
126 | library(gmodels)
127 |
128 | purchase_pred_train <- predict(CARTModel, train,type = "class")
129 | CrossTable(train$ProductChoice, purchase_pred_train,
130 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
131 | dnn = c('actual default', 'predicted default'))
132 |
133 |
134 | ## ------------------------------------------------------------------------
135 |
136 | #CHAID installation from source
137 | ##install.packages("CHAID", repos="http://R-Forge.R-project.org")
138 | library(CHAID)
139 |
140 | ctrl <- chaid_control(minsplit = 200, minprob = 0.1)
141 | CHAIDModel <- chaid(ProductChoice ~ CustomerPropensity + IncomeClass, data = train, control = ctrl)
142 | print(CHAIDModel)
143 | #plot(CHAIDModel)
144 |
145 |
146 | ## ------------------------------------------------------------------------
147 |
148 | library(gmodels)
149 |
150 | purchase_pred_train <- predict(CHAIDModel, train)
151 | CrossTable(train$ProductChoice, purchase_pred_train,
152 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
153 | dnn = c('actual default', 'predicted default'))
154 |
155 | ## ------------------------------------------------------------------------
156 | purchase_pred_test <- predict(CHAIDModel, test)
157 | CrossTable(test$ProductChoice, purchase_pred_test,
158 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
159 | dnn = c('actual default', 'predicted default'))
160 |
161 |
162 | ## ------------------------------------------------------------------------
163 | library(caret)
164 | control <- trainControl(method="repeatedcv", number=10, repeats=3)
165 |
166 | ModelC50_boostcv10 <- C5.0(train[,c("CustomerPropensity","LastPurchaseDuration", "MembershipPoints")], train[,"ProductChoice"], trials = 10)
167 |
168 | ## ------------------------------------------------------------------------
169 |
170 | library(gmodels)
171 |
172 | purchase_pred_train <- predict(ModelC50_boostcv10, train)
173 | CrossTable(train$ProductChoice, purchase_pred_train,
174 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
175 | dnn = c('actual default', 'predicted default'))
176 |
177 | ## ------------------------------------------------------------------------
178 | purchase_pred_test <- predict(ModelC50_boostcv10, test)
179 | CrossTable(test$ProductChoice, purchase_pred_test,
180 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
181 | dnn = c('actual default', 'predicted default'))
182 |
183 |
184 | ## ------------------------------------------------------------------------
185 |
186 | control <- trainControl(method="repeatedcv", number=5, repeats=2)
187 |
188 | # Bagged CART
189 | set.seed(100)
190 | CARTBagModel <- train(ProductChoice ~ CustomerPropensity + LastPurchaseDuration + MembershipPoints, data=train, method="treebag", trControl=control)
191 |
192 | ## ------------------------------------------------------------------------
193 |
194 | library(gmodels)
195 |
196 | purchase_pred_train <- predict(CARTBagModel, train)
197 | CrossTable(train$ProductChoice, purchase_pred_train,
198 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
199 | dnn = c('actual default', 'predicted default'))
200 |
201 | ## ------------------------------------------------------------------------
202 | purchase_pred_test <- predict(CARTBagModel, test)
203 | CrossTable(test$ProductChoice, purchase_pred_test,
204 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
205 | dnn = c('actual default', 'predicted default'))
206 |
207 |
208 | ## ------------------------------------------------------------------------
209 | # Random Forest
210 | set.seed(100)
211 |
212 | rfModel <- train(ProductChoice ~ CustomerPropensity + LastPurchaseDuration + MembershipPoints, data=train, method="rf", trControl=control)
213 |
214 |
215 | ## ------------------------------------------------------------------------
216 |
217 | library(gmodels)
218 |
219 | purchase_pred_train <- predict(rfModel, train)
220 | CrossTable(train$ProductChoice, purchase_pred_train,
221 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
222 | dnn = c('actual default', 'predicted default'))
223 |
224 | ## ------------------------------------------------------------------------
225 | purchase_pred_test <- predict(rfModel, test)
226 | CrossTable(test$ProductChoice, purchase_pred_test,
227 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
228 | dnn = c('actual default', 'predicted default'))
229 |
230 |
231 |
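## Optional addition (editor's sketch, not part of the original script):
## compare the boosted C5.0, bagged CART, and random forest models above on
## held-out accuracy. Assumes ModelC50_boostcv10, CARTBagModel, rfModel, and
## test are still in the session.
accuracy <- function(model, data) {
  mean(predict(model, data) == data$ProductChoice)
}
cat("C5.0 boosted test accuracy :", accuracy(ModelC50_boostcv10, test), "\n")
cat("Bagged CART test accuracy  :", accuracy(CARTBagModel, test), "\n")
cat("Random forest test accuracy:", accuracy(rfModel, test), "\n")
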
--------------------------------------------------------------------------------
/Code/Ch06_Naive Bayes.R:
--------------------------------------------------------------------------------
1 | ## ------------------------------------------------------------------------
2 | library(data.table)
3 | library(splitstackshape)
4 | library(e1071)
5 |
6 | Data_Purchase <- fread("C:\\Users\\Karthik\\Dropbox\\Book Writing - Drafts\\Chapter Drafts\\Chap 6 29 Sep\\Dataset\\Purchase Prediction Dataset.csv",header=T, verbose = FALSE, showProgress = FALSE)
7 | str(Data_Purchase)
8 |
9 | #Check the distribution of data before grouping
10 | table(Data_Purchase$ProductChoice)
11 |
12 |
13 | #Pulling out only the relevant data to this chapter
14 | Data_Purchase <- Data_Purchase[,.(CUSTOMER_ID,ProductChoice,MembershipPoints,IncomeClass,CustomerPropensity,LastPurchaseDuration)]
15 |
16 | #Delete NA from subset
17 | Data_Purchase <- na.omit(Data_Purchase)
18 | Data_Purchase$CUSTOMER_ID <- as.character(Data_Purchase$CUSTOMER_ID)
19 |
20 | #Stratified Sampling
21 | Data_Purchase_Model<-stratified(Data_Purchase, group=c("ProductChoice"),size=10000,replace=FALSE)
22 |
23 | print("The Distribution of equal classes is as below")
24 | table(Data_Purchase_Model$ProductChoice)
25 |
26 | Data_Purchase_Model$ProductChoice <- as.factor(Data_Purchase_Model$ProductChoice)
27 | Data_Purchase_Model$IncomeClass <- as.factor(Data_Purchase_Model$IncomeClass)
28 | Data_Purchase_Model$CustomerPropensity <- as.factor(Data_Purchase_Model$CustomerPropensity)
29 |
30 | set.seed(917);
31 | train <- Data_Purchase_Model[sample(nrow(Data_Purchase_Model),size=nrow(Data_Purchase_Model)*(0.7), replace = TRUE, prob = NULL),]
32 | train <- as.data.frame(train)
33 |
34 | test <- as.data.frame(Data_Purchase_Model[!(Data_Purchase_Model$CUSTOMER_ID %in% train$CUSTOMER_ID),])
35 |
36 | ## ------------------------------------------------------------------------
37 | model_naiveBayes <- naiveBayes(train[,c(3,4,5)], train[,2])
38 | model_naiveBayes
39 |
40 |
41 | ## ------------------------------------------------------------------------
42 | model_naiveBayes_pred <- predict(model_naiveBayes, train)
43 |
44 | library(gmodels)
45 |
46 | CrossTable(model_naiveBayes_pred, train[,2],
47 | prop.chisq = FALSE, prop.t = FALSE,
48 | dnn = c('predicted', 'actual'))
49 |
50 | ## ------------------------------------------------------------------------
51 |
52 | model_naiveBayes_pred <- predict(model_naiveBayes, test)
53 |
54 | library(gmodels)
55 |
56 | CrossTable(model_naiveBayes_pred, test[,2],
57 | prop.chisq = FALSE, prop.t = FALSE,
58 | dnn = c('predicted', 'actual'))
59 |
60 |
61 |
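## Optional addition (editor's sketch, not part of the original script):
## e1071::naiveBayes() supports Laplace smoothing via the `laplace` argument;
## a minimal comparison of test accuracy with and without smoothing, reusing
## the train/test objects built above.
model_naiveBayes_laplace <- naiveBayes(train[,c(3,4,5)], train[,2], laplace = 1)
acc_no_smooth <- mean(predict(model_naiveBayes, test) == test[,2])
acc_laplace   <- mean(predict(model_naiveBayes_laplace, test) == test[,2])
cat("Test accuracy without smoothing:", acc_no_smooth, "\n")
cat("Test accuracy with laplace = 1 :", acc_laplace, "\n")
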
--------------------------------------------------------------------------------
/Code/Ch06_NeuralNet.R:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## ----ch06_nnet_01--------------------------------------------------------
4 |
5 | #Load the data and prepare a dataset for the neural network model
6 | Data_Purchase_Prediction <- read.csv("~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Dataset/Purchase Prediction Dataset.csv",header=TRUE);
7 |
8 | Data_Purchase_Prediction$choice <- ifelse(Data_Purchase_Prediction$ProductChoice == 1,1,
9 | ifelse(Data_Purchase_Prediction$ProductChoice == 3,0,999));
10 |
11 | Data_Neural_Net <- Data_Purchase_Prediction[Data_Purchase_Prediction$choice %in% c("0","1"),]
12 |
13 | #Remove Missing Values
14 | Data_Neural_Net <- na.omit(Data_Neural_Net)
15 | rownames(Data_Neural_Net) <- NULL
16 |
17 |
18 | ## ----ch06_nnet_02--------------------------------------------------------
19 | #Transforming the continuous variables
20 | cont <- Data_Neural_Net[,c("PurchaseTenure","CustomerAge","MembershipPoints","IncomeClass")]
21 |
22 | maxs <- apply(cont, 2, max)
23 | mins <- apply(cont, 2, min)
24 |
25 | scaled_cont <- as.data.frame(scale(cont, center = mins, scale = maxs - mins))
26 |
27 | #The dependent variable
28 | dep <- factor(Data_Neural_Net$choice)
29 |
30 | Data_Neural_Net$ModeOfPayment <- factor(Data_Neural_Net$ModeOfPayment);
31 |
32 | flags_ModeOfPayment = data.frame(Reduce(cbind,
33 | lapply(levels(Data_Neural_Net$ModeOfPayment), function(x){(Data_Neural_Net$ModeOfPayment == x)*1})
34 | ))
35 |
36 | names(flags_ModeOfPayment) = levels(Data_Neural_Net$ModeOfPayment)
37 |
38 | Data_Neural_Net$CustomerPropensity <- factor(Data_Neural_Net$CustomerPropensity);
39 |
40 | flags_CustomerPropensity = data.frame(Reduce(cbind,
41 | lapply(levels(Data_Neural_Net$CustomerPropensity), function(x){(Data_Neural_Net$CustomerPropensity == x)*1})
42 | ))
43 | names(flags_CustomerPropensity) = levels(Data_Neural_Net$CustomerPropensity)
44 |
45 | cate <- cbind(flags_ModeOfPayment,flags_CustomerPropensity)
46 |
47 | #Combine all data into single modeling data
48 | Dataset <- cbind(dep,scaled_cont,cate);
49 |
50 | #Divide the data into train and test
51 | set.seed(917);
52 | index <- sample(1:nrow(Dataset),round(0.7*nrow(Dataset)))
53 | train <- Dataset[index,]
54 | test <- Dataset[-index,]
55 |
56 | ## ----ch06_nnet_03--------------------------------------------------------
57 | library(nnet)
58 | i <- names(train)
59 | form <- as.formula(paste("dep ~", paste(i[!i %in% "dep"], collapse = " + ")))
60 | nn <- nnet(form, size = 10, data = train)
61 |
62 | predict_class <- predict(nn, newdata=test, type="class")
63 |
64 | #Classification table
65 | table(test$dep,predict_class)
66 |
67 | #Classification rate
68 | sum(diag(table(test$dep,predict_class))/nrow(test))
69 |
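## Optional addition (editor's sketch, not part of the original script):
## a quick way to pick the hidden-layer size is to try a few values and compare
## the hold-out classification rate (reuses form, train, and test from above).
for (h in c(2, 5, 10, 15)) {
  set.seed(917)
  fit_h <- nnet(form, size = h, data = train, trace = FALSE)
  acc_h <- mean(predict(fit_h, newdata = test, type = "class") == test$dep)
  cat("Hidden units:", h, " Test classification rate:", acc_h, "\n")
}
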
70 | ## ----ch06_nnet_04--------------------------------------------------------
71 | library(NeuralNetTools)
72 | # Plot the neural network
73 | plotnet(nn)
74 |
75 | #get the neural weights
76 | neuralweights(nn)
77 |
78 | # Plot the variable importance (Olden's method)
79 | olden(nn)
80 |
81 | #Variable importance by Garson's algorithm
82 | garson(nn)
83 |
84 |
85 | ## ----ch06_data_mat-------------------------------------------------------
86 | #Pre-transformation
87 | head(Data_Purchase_Prediction[,c("choice","PurchaseTenure","CustomerAge","MembershipPoints","IncomeClass","ModeOfPayment","CustomerPropensity")])
88 |
89 | #Post-transformation
90 | head(train)
91 |
92 | ## ----ch06_DeepNeuralNet_Darch,error=FALSE--------------------------------
93 | #We will use the same data as in the previous neural network example
94 | devtools::install_github("maddin79/darch")
95 | library(darch)
96 | library(mlbench)
97 | library(RANN)
98 |
99 | #Print the model formula
100 | form
101 |
102 | #Apply the model using a deep neural net (the first configuration below is kept commented out for reference)
103 | # deep_net <- darch(form, train,
104 | # preProc.params = list("method" = c("knnImpute")),
105 | # layers = c(0,10,30,10,0),
106 | # darch.batchSize = 1,
107 | # darch.returnBestModel.validationErrorFactor = 1,
108 | # darch.fineTuneFunction = "rpropagation",
109 | # darch.unitFunction = c("tanhUnit", "tanhUnit","tanhUnit","softmaxUnit"),
110 | # darch.numEpochs = 15,
111 | # bootstrap = T,
112 | # bootstrap.num = 500)
113 |
114 | deep_net <- darch(form,train,
115 | preProc.params = list(method = c("center", "scale")),
116 | layers = c(0,10,30,10,0),
117 | darch.unitFunction = c("sigmoidUnit", "tanhUnit","tanhUnit","softmaxUnit"),
118 | darch.fineTuneFunction = "minimizeClassifier",
119 | darch.numEpochs = 15,
120 | cg.length = 3, cg.switchLayers = 5)
121 |
122 |
123 | #Plot the deep net
124 | library(NeuralNetTools)
125 | plot(deep_net,"net")
126 |
127 | result <- darchTest(deep_net, newdata = test)
128 | result
129 |
130 |
131 | ## ----install,eval=FALSE--------------------------------------------------
132 | ## install.packages("drat", repos="https://cran.rstudio.com")
133 | ## drat:::addRepo("dmlc")
134 | ## install.packages("mxnet")
135 | ##
136 | ## #Please refer https://github.com/dahtah/imager
137 | ## install.packages("devtools")
138 | ## devtools::install_github("dahtah/imager")
139 |
140 | ## ----ch06_deep_learning_01-----------------------------------------------
141 | library(mxnet)
142 |
143 | #install imager for loading images
144 |
145 | library(imager)
146 |
147 | #load the pre-trained model
148 | model <- mx.model.load("Inception/Inception_BN", iteration=39)
149 |
150 | #We also need to load in the mean image, which is used for preprocessing using mx.nd.load.
151 |
152 | mean.img = as.array(mx.nd.load("Inception/mean_224.nd")[["mean_img"]])
153 |
154 | #Load and plot the image (the package's default parrot image is kept as a commented alternative)
155 |
156 | #im <- load.image(system.file("extdata/parrots.png", package="imager"))
157 | im <- load.image("Pictures/russia-volcano.jpg")
158 | plot(im)
159 |
160 | ## ----ch06_deep_learning_02-----------------------------------------------
161 | preproc.image <- function(im, mean.image) {
162 | # crop the image
163 | shape <- dim(im)
164 | short.edge <- min(shape[1:2])
165 | xx <- floor((shape[1] - short.edge) / 2)
166 | yy <- floor((shape[2] - short.edge) / 2)
167 | cropped <- crop.borders(im, xx, yy)
168 | # resize to 224 x 224, as required by the model input
169 | resized <- resize(cropped, 224, 224)
170 | # convert to array (x, y, channel)
171 | arr <- as.array(resized) * 255
172 | dim(arr) <- c(224, 224, 3)
173 | # subtract the mean image (use the function argument, not the global)
174 | normed <- arr - mean.image
175 | # Reshape to format needed by mxnet (width, height, channel, num)
176 | dim(normed) <- c(224, 224, 3, 1)
177 | return(normed)
178 | }
179 |
180 | #Now pass our image to pre-process
181 | normed <- preproc.image(im, mean.img)
182 | plot(normed)
183 |
184 | ## ----ch06_deep_learning_03,error= FALSE----------------------------------
185 | #prob <- predict(model, X=normed)
186 |
187 | #We can extract the top-5 class index.
188 |
189 | #max.idx <- order(prob[,1], decreasing = TRUE)[1:5]
190 | max.idx <- c("981", "980", "971", "673", "985")
191 | max.idx
192 |
193 | synsets <- readLines("Inception/synset.txt")
194 |
195 | #And let us print the corresponding lines:
196 |
197 | print(paste0("Predicted Top-classes: ", synsets[as.numeric(max.idx)]))
198 |
199 |
--------------------------------------------------------------------------------
/Code/Ch06_OnlineML.R:
--------------------------------------------------------------------------------
1 | ## ------------------------------------------------------------------------
2 | library(ggplot2)
3 | library(e1071)
4 |
5 | Data_House_Worth <-read.csv("C:\\Users\\Karthik\\Dropbox\\Book Writing - Drafts\\Chapter Drafts\\Chap 6 29 Sep\\Dataset\\House Worth Data.csv",header=TRUE);
6 |
7 | str(Data_House_Worth)
8 |
9 | #Remove the extra column that is not required for the model
10 | Data_House_Worth$BasementArea <-NULL
11 |
12 | ## ------------------------------------------------------------------------
13 |
14 | online_cmean <-cmeans(Data_House_Worth[,2:3],3,20,verbose=TRUE,method="ufcl",m=2)
15 | print(online_cmean)
16 |
17 |
18 | ## ------------------------------------------------------------------------
19 | ggplot(Data_House_Worth, aes(StoreArea, LawnArea, color = HouseNetWorth)) +
20 | geom_point(alpha =0.4, size =3.5) +geom_point(col = online_cmean$cluster) +
21 | scale_color_manual(values =c('black', 'red', 'green'))
22 |
23 |
24 |
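## Optional addition (editor's sketch, not part of the original script):
## inspect the fuzzy membership degrees returned by cmeans() and cross-tabulate
## the hardened cluster assignment against the labelled worth classes.
head(round(online_cmean$membership, 3))
table(Cluster = online_cmean$cluster, Worth = Data_House_Worth$HouseNetWorth)
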
--------------------------------------------------------------------------------
/Code/Ch06_RuleMining.R:
--------------------------------------------------------------------------------
1 | ## ----opts, echo = FALSE--------------------------------------------------
2 | library(knitr)
3 | opts_chunk$set(dev="png",
4 | dev.args=list(type="cairo"),
5 | dpi=96);
6 | knitr::opts_chunk$set(
7 | fig.path = "~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Images/006/"
8 | )
9 |
10 | ## ---- warning=FALSE, message=FALSE---------------------------------------
11 |
12 | library(arules)
13 | MarketBasket <- read.transactions("~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Dataset/MarketBasketProcessed.csv", sep = ",")
14 | summary(MarketBasket)
15 |
16 | #Transactions - First two
17 | inspect(MarketBasket[1:2])
18 |
19 | # Top 20 most frequently bought products
20 | itemFrequencyPlot(MarketBasket, topN = 20)
21 |
22 | # Sparsity in the data - more white space means more sparsity
23 | image(sample(MarketBasket, 100))
24 |
25 |
26 | ## ------------------------------------------------------------------------
27 | library(arules)
28 |
29 |
30 | apriori(MarketBasket)
31 | groceryrules <- apriori(MarketBasket, parameter = list(support = 0.2, confidence = 0.8, minlen = 2))
32 |
33 | itemsets <- eclat(MarketBasket,
34 | parameter = list(supp = 0.1, maxlen = 15))
35 |
36 |
37 | ## ------------------------------------------------------------------------
38 |
39 | summary(groceryrules)
40 |
41 | # look at the first three rules
42 | inspect(groceryrules[1:3])
43 |
44 | ## Step 5: Improving model performance ----
45 |
46 | # sorting grocery rules by lift
47 | inspect(sort(groceryrules, by = "lift")[1:5])
48 |
49 | # store as data frame
50 | groceryrules_df <- as(groceryrules, "data.frame")
51 | str(groceryrules_df)
52 |
53 |
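## Optional addition (editor's sketch, not part of the original script):
## if the arulesViz package is installed, the mined rules can also be explored
## visually; a minimal sketch.
# install.packages("arulesViz")
library(arulesViz)
plot(groceryrules, method = "scatterplot")                         # support vs. confidence, shaded by lift
plot(head(sort(groceryrules, by = "lift"), 10), method = "graph")  # graph view of the top 10 rules by lift
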
54 | ## ------------------------------------------------------------------------
55 | library(arules)
56 |
57 | eclat(MarketBasket)
58 | groceryrules <- eclat(MarketBasket, parameter = list(support = 0.2, minlen = 2))
59 |
60 | itemsets <- eclat(MarketBasket, parameter = list(supp = 0.1, maxlen = 15))
61 |
62 |
63 | ## ------------------------------------------------------------------------
64 |
65 | summary(groceryrules)
66 |
67 | # look at the first three rules
68 | inspect(groceryrules[1:3])
69 |
70 | ## Step 5: Improving model performance ----
71 |
72 | # sorting grocery rules by lift
73 | inspect(sort(groceryrules, by = "support")[1:5])
74 |
75 | # store as data frame
76 | groceryrules_df <- as(groceryrules, "data.frame")
77 | str(groceryrules_df)
78 |
79 |
80 | ## ------------------------------------------------------------------------
81 |
82 |
83 | library(data.table)
84 |
85 | fine_food_data <- read.csv("~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Dataset/Food_Reviews.csv",stringsAsFactors = FALSE)
86 | fine_food_data$Score <- as.factor(fine_food_data$Score)
87 |
88 | str(fine_food_data[-10])
89 |
90 |
91 |
92 | ## ---- message=FALSE,warning=FALSE----------------------------------------
93 |
94 | library(caTools)
95 |
96 | # Randomly split the data and use only 5% of the dataset
97 | set.seed(90)
98 | split = sample.split(fine_food_data$Score, SplitRatio = 0.05)
99 |
100 | fine_food_data = subset(fine_food_data, split == TRUE)
101 |
102 | select_col <- c("UserId","ProductId","Score")
103 |
104 | fine_food_data_selected <- fine_food_data[,select_col]
105 | rownames(fine_food_data_selected) <- NULL
106 | fine_food_data_selected$Score = as.numeric(fine_food_data_selected$Score)
107 |
108 | #Remove Duplicates
109 |
110 | fine_food_data_selected <- unique(fine_food_data_selected)
111 |
112 |
113 | ## ---- warning=FALSE,message=FALSE----------------------------------------
114 |
115 | library(recommenderlab)
116 |
117 | #RatingsMatrix
118 |
119 | RatingMat <- dcast(fine_food_data_selected,UserId ~ ProductId, value.var = "Score")
120 | User=RatingMat[,1]
121 | Product= colnames(RatingMat)[2:ncol(RatingMat)]
122 | RatingMat[,1] <- NULL
123 | RatingMat <- as.matrix(RatingMat)
124 | dimnames(RatingMat) = list(user = User , product = Product)
125 |
126 | realM <- as(RatingMat, "realRatingMatrix")
127 |
128 | ## ------------------------------------------------------------------------
129 |
130 | #distribution of ratings
131 | hist(getRatings(realM), breaks=15, main = "Distribution of Ratings", xlab = "Ratings", col = "grey")
132 |
133 | #Sparse Matrix Representation
134 | head(as(realM, "data.frame"))
135 |
136 | #The realRatingMatrix can be coerced back into a matrix which is identical to the original matrix
137 | identical(as(realM, "matrix"),RatingMat)
138 |
139 | #Sparsity in the Rating Matrix
140 | image(realM, main = "Raw Ratings")
141 |
142 | ## ------------------------------------------------------------------------
143 |
144 | #UBCF Model
145 | r_UBCF <- Recommender(realM[1:1700], method = "UBCF")
146 | r_UBCF
147 |
148 | #List of objects in the model output
149 | names(getModel(r_UBCF))
150 |
151 | #Recommend products for the held-out observations (rows 1700 to 1729)
152 | recom_UBCF <- predict(r_UBCF, realM[1700:1729], n=5)
153 | recom_UBCF
154 |
155 | #Display the recommendation
156 | reco <- as(recom_UBCF, "list")
157 | reco[lapply(reco,length)>0]
158 |
159 |
160 | ## ------------------------------------------------------------------------
161 |
162 | set.seed(2016)
163 | scheme <- evaluationScheme(realM[1:1700], method="split", train = .9,
164 | k=1, given=1, goodRating=3)
165 |
166 | scheme
167 |
168 | algorithms <- list(
169 | "random items" = list(name="RANDOM", param=NULL),
170 | "popular items" = list(name="POPULAR", param=NULL),
171 | "user-based CF" = list(name="UBCF", param=list(nn=50)),
172 | "item-based CF" = list(name="IBCF", param=list(k=50))
173 | )
174 |
175 | results <- evaluate(scheme, algorithms, type = "topNList",
176 | n=c(1, 3, 5, 10, 15, 20))
177 |
178 | plot(results, annotate=c(1,3), legend="bottomright")
179 |
180 |
181 |
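## Optional addition (editor's sketch, not part of the original script):
## pull the averaged confusion-matrix counts (TP/FP/FN/TN, precision, recall)
## for one of the evaluated algorithms from the evaluate() results above.
avg(results[["user-based CF"]])
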
--------------------------------------------------------------------------------
/Code/Ch06_SVM.R:
--------------------------------------------------------------------------------
1 | ## ------------------------------------------------------------------------
2 |
3 | library(e1071)
4 | library(rpart)
5 |
6 | breast_cancer_data <- read.table("~/Dropbox/Book Writing - Drafts/Chapter Drafts/Final Artwork and Code/Chapter 6/Dataset/breast-cancer-wisconsin.data.txt",sep=",")
7 | breast_cancer_data$V11 = as.factor(breast_cancer_data$V11)
8 |
9 | summary(breast_cancer_data)
10 |
11 |
12 | ## ------------------------------------------------------------------------
13 |
14 | ## split data into a train and test set
15 | index <- 1:nrow(breast_cancer_data)
16 | test_data_index <- sample(index, trunc(length(index)/3))
17 | test_data <- breast_cancer_data[test_data_index,]
18 | train_data <- breast_cancer_data[-test_data_index,]
19 |
20 |
21 | ## ------------------------------------------------------------------------
22 |
23 | svm.model <- svm(V11 ~ ., data = train_data, cost = 100, gamma = 1)
24 |
25 |
26 | ## ------------------------------------------------------------------------
27 |
28 | library(gmodels)
29 |
30 | svm_pred_train <- predict(svm.model, train_data[,-11])
31 | CrossTable(train_data$V11, svm_pred_train,
32 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
33 | dnn = c('actual class', 'predicted class'))
34 |
35 |
36 | ## ------------------------------------------------------------------------
37 |
38 | svm_pred_test <- predict(svm.model, test_data[,-11])
39 | CrossTable(test_data$V11, svm_pred_test,
40 | prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
41 | dnn = c('actual class', 'predicted class'))
42 |
43 |
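## Optional addition (editor's sketch, not part of the original script):
## cost = 100 and gamma = 1 above are fixed guesses; e1071::tune.svm() can pick
## them by grid search with cross-validation on the training data instead.
set.seed(917)
svm_tuned <- tune.svm(V11 ~ ., data = train_data,
                      gamma = 10^(-3:0), cost = 10^(0:2))
summary(svm_tuned)
svm_tuned$best.parameters
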
44 | ## ----ch06_multiclassSVM--------------------------------------------------
45 | # Read the house Worth Data
46 | Data_House_Worth <- read.csv("~/Dropbox/Book Writing - Drafts/Chapter Drafts/Final Artwork and Code/Chapter 6/Dataset/House Worth Data.csv",header=TRUE);
47 |
48 | library( 'e1071' )
49 | #Fit a multiclass SVM
50 | svm_multi_model <- svm( HouseNetWorth ~ StoreArea + LawnArea, Data_House_Worth )
51 |
52 | #Display the model
53 | svm_multi_model
54 |
55 | #Get the predicted values for the whole dataset
56 | res <- predict( svm_multi_model, newdata=Data_House_Worth )
57 |
58 | #Classification Matrix
59 | table(Data_House_Worth$HouseNetWorth,res)
60 |
61 | #Classification Rate
62 |
63 | sum(diag(table(Data_House_Worth$HouseNetWorth,res)))/nrow(Data_House_Worth)
64 |
65 |
--------------------------------------------------------------------------------
/Code/Ch06_TextMining.R:
--------------------------------------------------------------------------------
1 |
2 | ## ------------------------------------------------------------------------
3 | library(data.table)
4 |
5 | fine_food_data <- read.csv("Dataset/Food_Reviews.csv",stringsAsFactors = FALSE)
6 | fine_food_data$Score <- as.factor(fine_food_data$Score)
7 |
8 | str(fine_food_data[-10])
9 |
10 | # Last column - customer review in free text
11 |
12 | head(fine_food_data[,10],2)
13 |
14 |
15 | ## ---- message=FALSE,warning=FALSE----------------------------------------
16 |
17 | library(caTools)
18 |
19 | # Randomly split data and use only 10% of the dataset
20 | set.seed(90)
21 | split = sample.split(fine_food_data$Score, SplitRatio = 0.10)
22 |
23 | fine_food_data = subset(fine_food_data, split == TRUE)
24 |
25 | select_col <- c("Id","HelpfulnessNumerator","HelpfulnessDenominator","Score","Summary","Text")
26 |
27 | fine_food_data_selected <- fine_food_data[,select_col]
28 |
29 |
30 | ## ------------------------------------------------------------------------
31 | fine_food_data_selected[2,6]
32 |
33 | ## ---- warning=FALSE,message=FALSE----------------------------------------
34 | library(LSAfun)
35 | genericSummary(fine_food_data_selected[2,6],k=1)
36 |
37 | ## ---- warning=FALSE,message=FALSE----------------------------------------
38 | library(LSAfun)
39 | genericSummary(fine_food_data_selected[2,6],k=2)
40 |
41 | ## ------------------------------------------------------------------------
42 | fine_food_data_selected[2,5]
43 |
44 | ## ----warning=FALSE,message=FALSE,eval=FALSE------------------------------
45 | ##
46 | ## library(proxy)
47 | ## library(tm)
48 | ##
49 | ## fine_food_data_corpus <- VCorpus(VectorSource(fine_food_data_selected$Text))
50 | ##
51 | ## fine_food_data_text_tdm <- TermDocumentMatrix(fine_food_data_corpus, control = list(
52 | ## tolower = TRUE,
53 | ## removeNumbers = TRUE,
54 | ## stopwords = TRUE,
55 | ## removePunctuation = TRUE,
56 | ## stemming = TRUE
57 | ## ))
58 | ##
59 | ## matrix_c <- as.matrix(fine_food_data_text_tdm)
60 | ##
61 | ##
62 | ## wc_freq <- sort(rowSums(matrix_c))
63 | ## wc_tmdata <- data.frame(words=names(wc_freq), wc_freq)
64 | ##
65 | ## wc_tmdata <- na.omit(wc_tmdata)
66 | ##
67 | ## tail(wc_tmdata$words,100)
68 | ##
69 | ##
70 | ##
71 | ## review_diss <- proxy::dist(matrix_c, method = "cosine")
72 | ##
73 | ## docsdissim2 <- as.matrix(review_diss)
74 | ## rownames(docsdissim2) <- titles
75 | ## colnames(docsdissim2) <- titles
76 | ## docsdissim2
77 | ## h <- hclust(review_diss, method = "ward")
78 | ## plot(h, labels = titles, sub = "")
79 | ##
80 |
81 | ## ------------------------------------------------------------------------
82 | library(tm)
83 | fine_food_data_corpus <- VCorpus(VectorSource(fine_food_data_selected$Text))
84 |
85 | #Standardize the text - Pre-Processing
86 |
87 | fine_food_data_text_dtm <- DocumentTermMatrix(fine_food_data_corpus, control = list(
88 | tolower = TRUE,
89 | removeNumbers = TRUE,
90 | stopwords = TRUE,
91 | removePunctuation = TRUE,
92 | stemming = TRUE
93 | ))
94 |
95 | # save frequently appearing terms (more than 500 times) to a character vector
96 |
97 | fine_food_data_text_freq <- findFreqTerms(fine_food_data_text_dtm, 500)
98 |
99 | # create DTMs with only the frequent terms
100 | fine_food_data_text_dtm <- fine_food_data_text_dtm[ , fine_food_data_text_freq]
101 |
102 | tm::inspect(fine_food_data_text_dtm[1:5,1:10])
103 |
104 | #Create a tf-idf matrix
105 | fine_food_data_tfidf <- weightTfIdf(fine_food_data_text_dtm, normalize = FALSE)
106 |
107 | tm::inspect(fine_food_data_tfidf[1:5,1:10])
108 |
109 |
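## Optional addition (editor's sketch, not part of the original script):
## use the tf-idf weighted DTM to compute cosine distances between the first few
## reviews (proxy registers a "cosine" measure; reviews containing none of the
## retained frequent terms would produce NaN).
library(proxy)
tfidf_mat <- as.matrix(fine_food_data_tfidf[1:5, ])
round(as.matrix(proxy::dist(tfidf_mat, method = "cosine")), 3)
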
110 | ## ----message=FALSE,warning=FALSE-----------------------------------------
111 |
112 | library(NLP)
113 | library(tm)
114 |
115 |
116 | fine_food_data_corpus <- Corpus(VectorSource(fine_food_data_selected$Text[1:3]))
117 | fine_food_data_cleaned <- tm_map(fine_food_data_corpus, PlainTextDocument)
118 |
119 | #tolower
120 | fine_food_data_cleaned <- tm_map(fine_food_data_cleaned, tolower)
121 | fine_food_data_cleaned[[1]]
122 |
123 | fine_food_data_cleaned <-tm_map(fine_food_data_cleaned, removeWords, stopwords("english"))
124 | fine_food_data_cleaned[[1]]
125 |
126 | fine_food_data_cleaned <- tm_map(fine_food_data_cleaned, removePunctuation)
127 | fine_food_data_cleaned[[1]]
128 |
129 | fine_food_data_cleaned <- tm_map(fine_food_data_cleaned, removeNumbers)
130 | fine_food_data_cleaned[[1]]
131 |
132 | fine_food_data_cleaned <-tm_map(fine_food_data_cleaned, stripWhitespace)
133 | fine_food_data_cleaned[[1]]
134 |
135 | ## ------------------------------------------------------------------------
136 | library(openNLP)
137 | library(NLP)
138 |
139 | fine_food_data_string <- NLP::as.String(fine_food_data_cleaned[[1]])
140 |
141 | sent_token_annotator <- Maxent_Sent_Token_Annotator()
142 | word_token_annotator <- Maxent_Word_Token_Annotator()
143 | fine_food_data_string_an <- annotate(fine_food_data_string, list(sent_token_annotator, word_token_annotator))
144 |
145 | pos_tag_annotator <- Maxent_POS_Tag_Annotator()
146 | fine_food_data_string_an2 <- annotate(fine_food_data_string, pos_tag_annotator, fine_food_data_string_an)
147 |
148 | ## Variant with POS tag probabilities as (additional) features.
149 | head(annotate(fine_food_data_string, Maxent_POS_Tag_Annotator(probs = TRUE), fine_food_data_string_an2))
150 |
151 | ## Determine the distribution of POS tags for word tokens.
152 | fine_food_data_string_an2w <- subset(fine_food_data_string_an2, type == "word")
153 | tags <- sapply(fine_food_data_string_an2w$features, `[[`, "POS")
154 | table(tags)
155 |
156 | plot(table(tags), type = "h", xlab="Part-Of-Speech", ylab = "Frequency")
157 |
158 | ## Extract token/POS pairs (all of them)
159 | head(sprintf("%s/%s", fine_food_data_string[fine_food_data_string_an2w], tags),15)
160 |
161 |
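## Optional addition (editor's sketch, not part of the original script):
## keep only the noun tokens (Penn Treebank tags starting with "NN") from the
## POS-tagged text above.
nouns <- fine_food_data_string[fine_food_data_string_an2w][grepl("^NN", tags)]
head(nouns, 15)
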
162 | ## ---- warning=FALSE, message=FALSE---------------------------------------
163 |
164 | library(SnowballC)
165 | library(wordcloud)
166 | library(tm)
167 | library(slam)
168 |
169 | fine_food_data_corpus <- VCorpus(VectorSource(fine_food_data_selected$Text))
170 |
171 | fine_food_data_text_tdm <- TermDocumentMatrix(fine_food_data_corpus, control = list(
172 | tolower = TRUE,
173 | removeNumbers = TRUE,
174 | stopwords = TRUE,
175 | removePunctuation = TRUE,
176 | stemming = TRUE
177 | ))
178 |
179 | #reducing sparsity
180 | wc_tdm <- rollup(fine_food_data_text_tdm, 2, na.rm=TRUE, FUN = sum)
181 | matrix_c <- as.matrix(wc_tdm)
182 | wc_freq <- sort(rowSums(matrix_c))
183 | wc_tmdata <- data.frame(words=names(wc_freq), wc_freq)
184 |
185 | wc_tmdata <- na.omit(wc_tmdata)
186 |
187 | wordcloud (tail(wc_tmdata$words,100), tail(wc_tmdata$wc_freq,100), random.order=FALSE, colors=brewer.pal(8, "Dark2"))
188 |
189 |
190 | ## ----ch06_setupTwitter_api-----------------------------------------------
191 | library("stringr")
192 | library("dplyr")
193 | #install.packages("twitteR")
194 | library("twitteR")
195 | #getTwitterOAuth(consumer_key, consumer_secret)
196 | consumerKey <- "YOUR KEY"
197 | consumerSecret <- " YOUR KEY"
198 |
199 | #Below two tokens need to be used when you want to pull tweets from your own account
200 | accessToken <- "YOUR KEY"
201 | accessTokenSecret <- "YOUR KEY"
202 |
203 | setup_twitter_oauth(consumerKey, consumerSecret,accessToken,accessTokenSecret)
204 |
205 | kIgnoreTweet <- "update:|nobot:"
206 |
207 | GetTweets <- function(handle, n = 1000) {
208 |
209 | timeline <- userTimeline(handle, n = n)
210 | tweets <- sapply(timeline, function(x) {
211 | c(x$getText(), x$getCreated())
212 | })
213 | tweets <- data.frame(t(tweets))
214 | names(tweets) <- c("text.orig", "created.orig")
215 |
216 | tweets$text <- tolower(tweets$text.orig)
217 | tweets$created <- as.POSIXct(as.numeric(as.vector(tweets$created.orig)), origin="1970-01-01")
218 |
219 | arrange(tweets, created)
220 | }
221 |
222 | handle <- "@TimesNow"
223 | tweets <- GetTweets(handle, 100)
224 |
225 | #Store the tweets as used in the book for future reproducibility
226 | #write.csv(tweets,"Dataset/Twitter Feed From TimesNow.csv",row.names = FALSE)
227 | tweets <- read.csv("~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Dataset/Twitter Feed From TimesNow.csv")
228 | tweets[1:5,]
229 |
230 | ## ----ch06_microsoft_services---------------------------------------------
231 | #install.packages("mscstexta4r")
232 | library(mscstexta4r)
233 |
234 | #Put in the authentication API keys you got from Microsoft
235 |
236 | Sys.setenv(MSCS_TEXTANALYTICS_URL = "https://westus.api.cognitive.microsoft.com/text/analytics/v2.0/")
237 | Sys.setenv(MSCS_TEXTANALYTICS_KEY = "YOUR KEY")
238 |
239 | #Initialize the service
240 | textaInit()
241 |
242 |
243 | ## ----ch06_loadADocument--------------------------------------------------
244 | # Load Packages
245 | require(tm)
246 | require(NLP)
247 | require(openNLP)
248 |
249 | #Read the Forbes article into R environment
250 | y <- paste(scan("~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Dataset/india_after_independence.txt", what="character", sep=" "),collapse=" ")
251 |
252 | convert_text_to_sentences <- function(text, lang = "en") {
253 | # Function to compute sentence annotations using the Apache OpenNLP Maxent sentence detector employing the default model for language 'en'.
254 | sentence_token_annotator <- Maxent_Sent_Token_Annotator(language = lang)
255 |
256 | # Convert text to class String from package NLP
257 | text <- as.String(text)
258 |
259 | # Sentence boundaries in text
260 | sentence.boundaries <- annotate(text, sentence_token_annotator)
261 |
262 | # Extract sentences
263 | sentences <- text[sentence.boundaries]
264 |
265 | # return sentences
266 | return(sentences)
267 | }
268 |
269 | # Convert the text into sentences
270 | article_text = convert_text_to_sentences(y, lang = "en")
271 |
272 |
273 | ## ----ch06_text_sentiment-------------------------------------------------
274 |
275 | document_lang <- rep("en", length(tweets$text))
276 |
277 | tryCatch({
278 |
279 | # Perform sentiment analysis
280 | output_1 <- textaSentiment(
281 | documents = tweets$text, # Input sentences or documents
282 | languages = document_lang
283 | # "en"(English, default)|"es"(Spanish)|"fr"(French)|"pt"(Portuguese)
284 | )
285 |
286 | }, error = function(err) {
287 |
288 | # Print error
289 | geterrmessage()
290 |
291 | })
292 |
293 | merged <- output_1$results
294 |
295 | #Order the tweets by sentiment score
296 | ordered_tweets <- merged[order(merged$score),]
297 |
298 | #Top 5 negative tweets
299 | ordered_tweets[1:5,]
300 |
301 | #Top 5 Positive
302 | ordered_tweets[95:100,]
303 |
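## Optional addition (editor's sketch, not part of the original script):
## a quick look at the overall shape of the sentiment scores returned by the API
## (assumes the textaSentiment() call above succeeded and `merged` exists).
hist(merged$score, breaks = 20, col = "grey",
     main = "Distribution of Tweet Sentiment Scores", xlab = "Sentiment score")
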
304 | ## ----ch06_text_topic_detect----------------------------------------------
305 |
306 | handle <- "@CNN"
307 | topic_text <- GetTweets(handle, 150)
308 | #write.csv(topic_text,"Dataset/Twitter Feed from CNN.csv",row.names=FALSE)
309 | topic_text <- read.csv("~/Dropbox/Book Writing - Drafts/Final Artwork and Code/Chapter 6/Dataset/Twitter Feed from CNN.csv")
310 |
311 | tryCatch({
312 |
313 | # Detect top topics in group of documents
314 | output_2 <- textaDetectTopics(
315 | topic_text$text, # At least 100 documents (English only)
316 | stopWords = NULL, # Stop word list (optional)
317 | topicsToExclude = NULL, # Topics to exclude (optional)
318 | minDocumentsPerWord = NULL, # Threshold to exclude rare topics (optional)
319 | maxDocumentsPerWord = NULL, # Threshold to exclude ubiquitous topics (optional)
320 | resultsPollInterval = 30L, # Poll interval (in s, default: 30s, use 0L for async)
321 | resultsTimeout = 1200L, # Give up timeout (in s, default: 1200s = 20mn)
322 | verbose = FALSE # If set to TRUE, print every poll status to stdout
323 | )
324 |
325 | }, error = function(err) {
326 |
327 | # Print error
328 | geterrmessage()
329 |
330 | })
331 | output_2
332 |
333 | ## ----ch06_lang_detect----------------------------------------------------
334 | #Below I am creating 5 messages in 5 different languages using Google Translate
335 | #1 - ARABIC, 2 - PORTUGUESE, 3 - ENGLISH, 4 - CHINESE, 5 - HINDI
336 |
337 | lang_detect<- c("أنا عالم البيانات","Eu sou um cientista de dados","I am a data scientist","我是一个科学家的数据","
338 | मैं एक डेटा वैज्ञानिक हूँ")
339 |
340 |
341 | tryCatch({
342 |
343 | # Detect the languages used in the documents
344 | # (one language per document is requested below)
345 | output_3 <- textaDetectLanguages(
346 | lang_detect, # Input sentences or documents
347 | numberOfLanguagesToDetect = 1L # Default: 1L
348 | )
349 |
350 | }, error = function(err) {
351 |
352 | # Print error
353 | geterrmessage()
354 |
355 | })
356 | output_3
357 |
358 | ## ----ch06_text_summarization---------------------------------------------
359 |
360 | article_lang <- rep("en", length(article_text))
361 |
362 | tryCatch({
363 |
364 | # Get key talking points in documents
365 | output_4 <- textaKeyPhrases(
366 | documents = article_text, # Input sentences or documents
367 | languages = article_lang
368 | # "en"(English, default)|"de"(German)|"es"(Spanish)|"fr"(French)|"ja"(Japanese)
369 | )
370 |
371 | }, error = function(err) {
372 |
373 | # Print error
374 | geterrmessage()
375 |
376 | })
377 |
378 | #Print the top 5 key phrases
379 | output_4$results[1:5,1]
380 |
381 |
382 |
--------------------------------------------------------------------------------
/Code/Ch07_Singh.R:
--------------------------------------------------------------------------------
1 | ## ------------------------------------------------------------------------
2 | library(knitr)
3 | opts_chunk$set(dev="png",
4 | dev.args=list(type="cairo"),
5 | dpi=96)
6 | knitr::opts_chunk$set(
7 | fig.path = "Images/"
8 | )
9 |
10 | ## ----chap7_Data_load, warnings=FALSE, message=FALSE----------------------
11 | setwd("C:/Personal/Machine Learning/Final Artwork and Code/Chapter 7");
12 |
13 | library(data.table)
14 |
15 | Data_House_Price <- fread("Dataset/House Sale Price Dataset.csv",header=T, verbose = FALSE, showProgress = FALSE)
16 |
17 | str(Data_House_Price)
18 |
19 |
20 | ## ----chap7_Data_load_01, warning=FALSE,message=FALSE---------------------
21 |
22 | dim(Data_House_Price)
23 |
24 | #Check the distribution of dependent variable
25 | hist(Data_House_Price$HousePrice/1000000, breaks=20, col="blue", xlab="House Sale Price(Million)",
26 | main="Distribution of House Sale Price")
27 |
28 | #Also look at the summary of the Dependent Variable
29 | summary(Data_House_Price$HousePrice)
30 |
31 | #Pulling out relevant columns and assigning required fields in the dataset
32 | Data_House_Price <- Data_House_Price[,.(HOUSE_ID,HousePrice,StoreArea,StreetHouseFront,BasementArea,LawnArea,Rating,SaleType)]
33 |
34 | #Omit any missing values
35 | Data_House_Price <- na.omit(Data_House_Price)
36 |
37 | Data_House_Price$HOUSE_ID <- as.character(Data_House_Price$HOUSE_ID)
38 |
39 |
40 | ## ----chap7_Data_load_03, warning=FALSE,message=FALSE---------------------
41 |
42 | Data_Purchase <- fread("Dataset/Purchase Prediction Dataset.csv",header=T, verbose = FALSE, showProgress = FALSE)
43 | str(Data_Purchase)
44 |
45 | ## ----chap7_Data_load_02, warning=FALSE,message=FALSE---------------------
46 |
47 | dim(Data_Purchase);
48 |
49 | #Check the distribution of data before grouping
50 | table(Data_Purchase$ProductChoice)
51 | barplot(table(Data_Purchase$ProductChoice),main="Distribution of ProductChoice", xlab= "ProductChoice Options", col="Blue")
52 |
53 |
54 | #Pulling out only the relevant data to this chapter
55 |
56 | Data_Purchase <- Data_Purchase[,.(CUSTOMER_ID,ProductChoice,MembershipPoints,IncomeClass,CustomerPropensity,LastPurchaseDuration)]
57 |
58 | #Delete NA from subset
59 |
60 | Data_Purchase <- na.omit(Data_Purchase)
61 |
62 | Data_Purchase$CUSTOMER_ID <- as.character(Data_Purchase$CUSTOMER_ID)
63 |
64 |
65 | ## ----ch07_population_stability_01, warning=FALSE,message=FALSE-----------
66 | #Create set 1 and set 2 : First 2/3 as set 1 and remaining 1/3 as set 2
67 | summary(Data_House_Price$HousePrice)
68 |
69 | set_1 <- Data_House_Price[1:floor(nrow(Data_House_Price)*(2/3)),]$HousePrice
70 | summary(set_1)
71 |
72 | set_2 <- Data_House_Price[floor(nrow(Data_House_Price)*(2/3) + 1):nrow(Data_House_Price),]$HousePrice
73 | summary(set_2)
74 |
75 | ## ----ch07_population_stability_02, warning=FALSE,message=FALSE-----------
76 | #Defining a function to give ks test result and ECDF plots on log scale
77 | library(rgr)
78 | ks_test <- function (xx1, xx2, xlab = "House Price", x1lab = deparse(substitute(xx1)),x2lab = deparse(substitute(xx2)), ylab = "Empirical Cumulative Distribution Function",log = TRUE, main = "Empirical EDF Plots - K-S Test", pch1 = 3, col1 = 2, pch2 = 4, col2 = 4, cex = 0.8, cexp = 0.9, ...)
79 | {
80 | temp.x <- remove.na(xx1)
81 | x1 <- sort(temp.x$x[1:temp.x$n])
82 | nx1 <- temp.x$n
83 | y1 <- ((1:nx1) - 0.5)/nx1
84 | temp.x <- remove.na(xx2)
85 | x2 <- sort(temp.x$x[1:temp.x$n])
86 | nx2 <- temp.x$n
87 | y2 <- ((1:nx2) - 0.5)/nx2
88 | xlim <- range(c(x1, x2))
89 | if (log) {
90 | logx <- "x"
91 | if (xlim[1] <= 0)
92 | stop("\n Values cannot be .le. zero for a log plot\n")
93 | }
94 | else logx <- ""
95 | plot(x1, y1, log = logx, xlim = xlim, xlab = xlab, ylab = ylab,
96 | main = main, type = "n", ...)
97 | points(x1, y1, pch = pch1, col = col1, cex = cexp)
98 | points(x2, y2, pch = pch2, col = col2, cex = cexp)
99 | temp <- ks.test(x1, x2)
100 | print(temp)
101 | }
102 |
103 | ## ----ch07_population_stability_03, warning=FALSE,message=FALSE-----------
104 | #Perform K-S test on set_1 and set_2 and also display Empirical Cumulative Distribution Plots
105 | ks_test(set_1,set_2)
106 |
107 | ## ----ch07_population_stability_04, warning=FALSE,message=FALSE-----------
108 | #Manipulate set 2
109 | set_2_new <- set_2*exp(set_2/100000)
110 |
111 | # Now do the k-s test again
112 | ks_test(set_1,set_2_new)
113 |
114 | ## ----ch07_population_stability_05, warning=FALSE,message=FALSE-----------
115 | #Let's create set 1 and set 2 from our Purchase Prediction Data
116 | print("Distribution of ProductChoice values before partition")
117 | table(Data_Purchase$ProductChoice)
118 |
119 | set_1 <- Data_Purchase[1:floor(nrow(Data_Purchase)*(2/3)),]$ProductChoice
120 | table(set_1)
121 |
122 | set_2 <- Data_Purchase[floor(nrow(Data_Purchase)*(2/3) + 1):nrow(Data_Purchase),]$ProductChoice
123 | table(set_2)
124 |
125 | ## ----ch07_population_stability_06, warning=FALSE,message=FALSE-----------
126 | #PSI = sum((n1i/N1 - n2i/N2) * ln((n1i/N1) / (n2i/N2)))
127 |
128 | temp1 <- (table(set_1)/length(set_1) - table(set_2)/length(set_2))
129 |
130 | temp2 <- log((table(set_1)/length(set_1))/(table(set_2)/length(set_2)))
131 |
132 | psi <- sum(temp1*temp2)
133 |
134 | if(psi < 0.1 ){
135 | cat("The population is stable with a PSI of " ,psi)
136 | } else if (psi >=0.1 & psi <= 0.2) {
137 | cat("The population need further investigation with a PSI of " ,psi)
138 | } else {
139 | cat("The population has gone thorugh significant changes with a PSi of " ,psi)
140 | }
141 |
142 |
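## Optional addition (editor's sketch, not part of the original script):
## the PSI computation above wrapped into a small reusable function; assumes both
## samples contain the same set of categories.
psi_score <- function(expected, actual) {
  p1 <- table(expected)/length(expected)
  p2 <- table(actual)/length(actual)
  sum((p1 - p2) * log(p1/p2))
}
psi_score(set_1, set_2)
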
143 | ## ----ch07_Linear_reg_model_01, warning=FALSE,message=FALSE---------------
144 | # Create a model on Set 1 = Train data
145 |
146 |
147 | linear_reg_model <- lm(HousePrice ~ StoreArea + StreetHouseFront + BasementArea + LawnArea + Rating + SaleType ,data=Data_House_Price[1:floor(nrow(Data_House_Price)*(2/3)),])
148 |
149 | summary(linear_reg_model)
150 |
151 | ## ----ch07_mae_01, warning=FALSE,message=FALSE----------------------------
152 | #Create the test data which is set 2
153 | test <- Data_House_Price[floor(nrow(Data_House_Price)*(2/3) + 1):nrow(Data_House_Price),]
154 |
155 | #Fit the linear regression model on this and get predicted values
156 |
157 | predicted_lm <- predict(linear_reg_model,test, type= "response")
158 |
159 | actual_predicted <- as.data.frame(cbind(as.numeric(test$HOUSE_ID),as.numeric(test$HousePrice),as.numeric(predicted_lm)))
160 |
161 | names(actual_predicted) <- c("HOUSE_ID","Actual","Predicted")
162 |
163 | #Find the absolute residual and then take mean of that
164 | library(ggplot2)
165 |
166 | #Plot Actual vs Predicted values for Test Cases
167 | ggplot(actual_predicted, aes(x = HOUSE_ID)) +
168 | geom_line(aes(y = Actual, color = "Actual")) +
169 | geom_line(aes(y = Predicted, color = "Predicted")) + xlab('HOUSE_ID') + ylab('House Sale Price')
170 |
171 |
172 | ## ----ch07_mae_02, warning=FALSE,message=FALSE----------------------------
173 | #Remove NA from test, as we have not done any treatment for NA
174 | actual_predicted <- na.omit(actual_predicted)
175 |
176 | #First take Actual - Predicted, then take mean of absolute errors(residual)
177 |
178 | mae <- sum(abs(actual_predicted$Actual - actual_predicted$Predicted))/nrow(actual_predicted)
179 |
180 | cat("Mean Absolute Error for the test case is ", mae)
181 |
182 | ## ----ch07_rmse_01, warning=FALSE,message=FALSE---------------------------
183 | #As we already have the actual and predicted values, we can directly calculate the RMSE
184 |
185 | rmse <- sqrt(sum((actual_predicted$Actual-actual_predicted$Predicted)^2)/nrow(actual_predicted))
186 |
187 | cat("Root Mean Square Error for the test case is ", rmse)
188 |
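## Optional addition (editor's sketch, not part of the original script):
## mean absolute percentage error (MAPE) is another common, scale-free summary
## of the same residuals.
mape <- mean(abs((actual_predicted$Actual - actual_predicted$Predicted)/actual_predicted$Actual))*100
cat("Mean Absolute Percentage Error for the test case is ", mape, "%")
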
189 | ## ----ch07_r_sqr_01, warning=FALSE,message=FALSE--------------------------
190 | #Model training data ( we will show our analysis on this dataset)
191 |
192 | train <- Data_House_Price[1:floor(nrow(Data_House_Price)*(2/3)),.(HousePrice,StoreArea,StreetHouseFront,BasementArea,LawnArea,Rating,SaleType)];
193 |
194 | #Omitting the NAs from the dataset
195 |
196 | train <- na.omit(train)
197 |
198 | # Get a linear regression model
199 | linear_reg_model <- lm(HousePrice ~ StoreArea + StreetHouseFront + BasementArea + LawnArea + Rating + SaleType ,data=train)
200 |
201 | # Show the function call to identify what model we will be working on
202 |
203 | print(linear_reg_model$call)
204 |
205 | #System-generated R-squared value
206 | cat("The system generated R square value is " , summary(linear_reg_model)$r.squared)
207 |
208 | ## ----ch07_r_sqr_02, warning=FALSE,message=FALSE--------------------------
209 | #calculate Total Sum of Squares
210 |
211 | SST <- sum((train$HousePrice - mean(train$HousePrice))^2);
212 |
213 | #Calculate Regression Sum of Squares
214 |
215 | SSR <- sum((linear_reg_model$fitted.values - mean(train$HousePrice))^2);
216 |
217 | #Calculate residual (error) sum of squares
218 |
219 | SSE <- sum((train$HousePrice - linear_reg_model$fitted.values)^2);
220 |
221 |
222 | ## ----ch07_r_sqr_03, warning=FALSE,message=FALSE--------------------------
223 | #Calculate R-squared
224 |
225 | R_Sqr <- 1- (SSE/SST)
226 |
227 | #Display the calculated R-squared
228 |
229 | cat("The calculated R Square is ", R_Sqr)
230 |
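## Optional addition (editor's sketch, not part of the original script):
## adjusted R-squared penalizes the number of predictors; a minimal calculation
## from the quantities above, checked against the lm() summary.
n_obs <- nrow(train)
p_pred <- length(coef(linear_reg_model)) - 1   # estimated coefficients excluding the intercept
Adj_R_Sqr <- 1 - (1 - R_Sqr)*(n_obs - 1)/(n_obs - p_pred - 1)
cat("The calculated Adjusted R Square is ", Adj_R_Sqr)
cat("The system generated Adjusted R Square is ", summary(linear_reg_model)$adj.r.squared)
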
231 | ## ----ch07_classi_01, warning=FALSE,message=FALSE-------------------------
232 | #Remove the rows having NA values; NAs are ignored by the modeling algorithms
233 | Data_Purchase<-na.omit(Data_Purchase)
234 |
235 | #Sample equal sizes from Data_Purchase to reduce class imbalance issue
236 | library(splitstackshape)
237 | Data_Purchase_Model<-stratified(Data_Purchase, group=c("ProductChoice"),size=10000,replace=FALSE)
238 |
239 | print("The Distribution of equal classes is as below")
240 | table(Data_Purchase_Model$ProductChoice)
241 |
242 | #Build the multinomial model on the train data (set_1); the test data (set_2) will be used for performance testing
243 | set.seed(917);
244 | train <- Data_Purchase_Model[sample(nrow(Data_Purchase_Model),size=nrow(Data_Purchase_Model)*(0.7), replace = TRUE, prob = NULL),]
245 | dim(train)
246 |
247 | test <- Data_Purchase_Model[!(Data_Purchase_Model$CUSTOMER_ID %in% train$CUSTOMER_ID),]
248 | dim(test)
249 |
250 | #Fit a multinomial logistic model
251 | library(nnet)
252 | mnl_model <- multinom (ProductChoice ~ MembershipPoints + IncomeClass + CustomerPropensity + LastPurchaseDuration, data = train)
253 |
254 | #Display the summary of model statistics
255 | mnl_model
256 |
257 | #Predict the probabilities
258 | predicted_test <- as.data.frame(predict(mnl_model, newdata = test, type= "probs"))
259 |
260 | #Display the predicted probabilities
261 | head(predicted_test)
262 |
263 | #Assign the predicted class based on the highest probability
264 | test_result <- apply(predicted_test,1,which.max)
265 |
266 | table(test_result)
267 | #Combine to get predicted and actuals at one place
268 |
269 | result <- as.data.frame(cbind(test$ProductChoice,test_result))
270 |
271 | colnames(result) <- c("Actual Class", "Predicted Class")
272 |
273 | head(result)
274 |
275 | ## ----ch07_classi_02, warning=FALSE,message=FALSE-------------------------
276 | #Create the classification matrix
277 | cmat <- as.matrix(table(Actual = result$`Actual Class`, Predicted = result$`Predicted Class`))
278 |
279 | #Calculate the above-mentioned measures in order
280 | n <- sum(cmat) ;
281 | cat("Number of Cases ", n);
282 |
283 | nclass <- nrow(cmat);
284 | cat("Number of classes ", nclass);
285 |
286 | correct_class <- diag(cmat);
287 | cat("Number of Correct Classification ", correct_class);
288 |
289 | rowsums <- apply(cmat, 1, sum);
290 | cat("Number of Instances per class ", rowsums);
291 |
292 | colsums <- apply(cmat, 2, sum);
293 | cat("Number of Instances per predicted class ", colsums);
294 |
295 | actual_dist <- rowsums / n;
296 | cat("Distribution of actuals ", actual_dist);
297 |
298 | predict_dist <- colsums / n;
299 | cat("Distribution of predicted ", predict_dist);
300 |
301 | ## ----ch07_classi_03, warning=FALSE,message=FALSE-------------------------
302 | #print the classification matrix - on test data
303 |
304 | print(cmat)
305 |
306 | #Print Classification Rate
307 |
308 | classification_rate <- sum(correct_class)/n;
309 | print(classification_rate)
310 |
311 | ## ----ch07_classi_05, warning=FALSE,message=FALSE-------------------------
312 | #The analysis is shown for ProductChoice == 1
313 | Actual_Class <- ifelse(result$`Actual Class` == 1,"One","Rest");
314 | Predicted_Class <- ifelse(result$`Predicted Class` == 1, "One", "Rest");
315 |
316 | ss_analysis <- as.data.frame(cbind(Actual_Class,Predicted_Class));
317 |
318 | #Create classification matrix for ProductChoice == 1
319 |
320 | cmat_ProductChoice1 <- as.matrix(table(Actual = ss_analysis$Actual_Class, Predicted = ss_analysis$Predicted_Class));
321 |
322 | print(cmat_ProductChoice1)
323 |
324 | classification_rate_ProductChoice1 <- sum(diag(cmat_ProductChoice1))/n;
325 |
326 | cat("Classification rate for ProductChoice 1 is ", classification_rate_ProductChoice1)
327 |
328 | # Calculate TPR and TNR
329 |
330 | TPR <- cmat_ProductChoice1[1,1]/(cmat_ProductChoice1[1,1] + cmat_ProductChoice1[1,2]);
331 |
332 | cat(" Sensitivity or True Positive Rate is ", TPR);
333 |
334 | TNR <- cmat_ProductChoice1[2,2]/(cmat_ProductChoice1[2,1] + cmat_ProductChoice1[2,2])
335 |
336 | cat(" Specificity or True Negative Rate is ", TNR);
337 |
338 |
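## Optional addition (editor's sketch, not part of the original script):
## precision and the F1 score for ProductChoice == 1 can be read off the same
## 2x2 matrix (TPR above is the recall).
Precision_ProductChoice1 <- cmat_ProductChoice1[1,1]/(cmat_ProductChoice1[1,1] + cmat_ProductChoice1[2,1]);
cat(" Precision for class 'One' is ", Precision_ProductChoice1);
F1_ProductChoice1 <- 2*Precision_ProductChoice1*TPR/(Precision_ProductChoice1 + TPR);
cat(" F1 score for class 'One' is ", F1_ProductChoice1);
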
339 | ## ----ch07_classi_06, warning=FALSE,message=FALSE-------------------------
340 | # create a new model on train with "One" =1 and "Rest" = 0
341 |
342 | train$ProductChoice_binom <- ifelse(train$ProductChoice == 1,1,0);
343 | test$ProductChoice_binom <- ifelse(test$ProductChoice == 1,1,0);
344 |
345 | glm_ProductChoice_binom <- glm( ProductChoice_binom ~ MembershipPoints + IncomeClass + CustomerPropensity + LastPurchaseDuration, data=train, family = binomial(link="logit"))
346 |
347 | #Print the summary of binomial logistic model
348 | summary(glm_ProductChoice_binom)
349 |
350 | #Now create the prediction and performance objects for the ROC/AUC analysis
351 | library(ROCR)
352 | test_binom <- predict(glm_ProductChoice_binom,newdata=test, type = "response")
353 | pred <- prediction(test_binom, test$ProductChoice_binom)
354 | perf <- performance(pred,"tpr","fpr")
355 |
356 | # calculating AUC
357 | auc <- unlist(slot(performance(pred,"auc"),"y.values"));
358 |
359 | cat("The Area Under ROC curve for this model is ",auc);
360 |
361 | #Plotting the ROC curve
362 | library(ggplot2)
363 | library(plotROC)
364 | debug <- as.data.frame(cbind(test_binom,test$ProductChoice_binom))
365 | ggplot(debug, aes(d = V2, m = test_binom)) + geom_roc()
366 |
367 | ## ----ch07_prob_01, warning=FALSE,message=FALSE---------------------------
368 | library(caret)
369 | library(randomForest)
370 | set.seed(917);
371 |
372 | #Model training data ( we will show our analysis on this dataset)
373 |
374 | train <- Data_House_Price[1:floor(nrow(Data_House_Price)*(2/3)),.(HousePrice,StoreArea,StreetHouseFront,BasementArea,LawnArea,Rating,SaleType)];
375 |
376 | #Create the test data which is set 2
377 | test <- Data_House_Price[floor(nrow(Data_House_Price)*(2/3) + 1):nrow(Data_House_Price),.(HousePrice,StoreArea,StreetHouseFront,BasementArea,LawnArea,Rating,SaleType)]
378 |
379 | #Omiting the NA from dataset
380 | train <- na.omit(train)
381 | test <- na.omit(test)
382 |
383 | #Create the k subsets, let's take k as 10 (i.e., 10-fold cross validation)
384 | k_10_fold <- trainControl(method = "repeatedcv", number = 10, savePredictions = TRUE)
385 |
386 | #Fit the model on the folds and use RMSE as the metric
387 | model_fitted <- train(HousePrice ~ StoreArea + StreetHouseFront + BasementArea + LawnArea + Rating + SaleType, data=train, family = identity,trControl = k_10_fold, tuneLength = 5)
388 |
389 | #Display the summary of the cross validation
390 | model_fitted
391 |
392 | ## ----ch07_prob_02, warning=FALSE,message=FALSE---------------------------
393 |
394 | #Create the bootstrap experiment; let's take the number of resamples as 10 (i.e., a 10-sample bootstrap)
395 | boot_10s <- trainControl(method = "boot", number = 10, savePredictions = TRUE)
396 |
397 | #Fit the model on the bootstrap resamples and use RMSE as the metric
398 | model_fitted <- train(HousePrice ~ StoreArea + StreetHouseFront + BasementArea + LawnArea + Rating + SaleType, data=train, family = identity,trControl = boot_10s, tuneLength = 5)
399 |
400 | #Display the summary of the bootstrapped model
401 | model_fitted
402 |
403 | ## ----ch07_kappa, warning=FALSE,message=FALSE-----------------------------
404 | library(caret)
405 | library(mlbench)
406 |
407 | #We will use the Purchase Prediction Data with a very simple model to illustrate the kappa and accuracy measures
408 |
409 | set.seed(917);
410 | train_kappa <- Data_Purchase_Model[sample(nrow(Data_Purchase_Model),size=5000, replace = TRUE, prob = NULL),]
411 |
412 | #train() can get confused by numeric class levels, so convert the dependent variable into text labels
413 | train_kappa$ProductChoice_multi <- ifelse(train_kappa$ProductChoice == 1,"A",
414 | ifelse(train_kappa$ProductChoice == 2, "B",
415 | ifelse(train_kappa$ProductChoice == 3,"C","D")));
416 |
417 | train_kappa <- na.omit(train_kappa)
418 | #Set the experiment
419 | cntrl <- trainControl(method="cv", number=5, classProbs = TRUE)
420 |
421 | #Distribution of ProductChoices
422 | table(train_kappa$ProductChoice_multi)
423 |
424 | #Make the column names syntactically valid
425 | colnames(train_kappa) <- make.names(names(train_kappa), unique = TRUE, allow_ = TRUE)
426 |
427 | #Convert the categorical columns into R factors
428 | train_kappa$ProductChoice_multi <- as.factor(train_kappa$ProductChoice_multi)
429 | train_kappa$CustomerPropensity <- as.factor(train_kappa$CustomerPropensity)
430 | train_kappa$LastPurchaseDuration <- as.factor(train_kappa$LastPurchaseDuration)
431 |
432 | #Fit the model with method as RandomForest.
433 | model_fitted <- train(ProductChoice_multi ~ CustomerPropensity + LastPurchaseDuration, data=train_kappa, method="rf", metric="Accuracy",trControl=cntrl)
434 |
435 | # The result displayed the kappa metrics
436 | print(model_fitted)
437 |
438 | #Display the predicted values
439 | pred <- predict(model_fitted, newdata=train_kappa)
440 | confusionMatrix(data=pred, train_kappa$ProductChoice_multi)
441 |
442 |
--------------------------------------------------------------------------------
/Code/Ch08_Singh.R:
--------------------------------------------------------------------------------
1 | ## ------------------------------------------------------------------------
2 | library(knitr)
3 | opts_chunk$set(dev="png",
4 | dev.args=list(type="cairo"),
5 | dpi=96)
6 | knitr::opts_chunk$set(
7 | fig.path = "Images/"
8 | )
9 |
10 | ## ----caret_install, eval=FALSE-------------------------------------------
11 | ## install.packages("caret", dependencies = c("Depends", "Suggests"))
12 |
13 | ## ----caret_01, eval= FALSE-----------------------------------------------
14 | ## rfControl <- trainControl(# Example, 10-fold Cross Validation
15 | ## method = "repeatedcv", # Repeated K-fold cross-validation; others such as "cv", "boot", and "LOOCV" are available
16 | ## number = 10, # Number of folds
17 | ## repeats = 10 # repeated ten times
18 | ## )
19 |
20 | ## ----caret_02, eval= FALSE-----------------------------------------------
21 | ## set.seed(917)
22 | ## randomForestFit1 <- train(Class ~ ., # Define the model equation
23 | ## data = training, # Define the modeling data
24 | ## method = "rf", # The model you want to use; caret's documentation lists the available options
25 | ## trControl = rfControl, # This defines the conditions on how to control the training
26 | ## ... ) # Other options specific to the modeling technique
27 | ## randomForestFit1
28 |
29 | ## ----ch08_hyperparameter_01, warning=FALSE-------------------------------
30 | setwd("C:/Personal/Machine Learning/Final Artwork and Code/Chapter 8");
31 | library(caret)
32 | library(randomForest)
33 | set.seed(917);
34 | # Load Dataset
35 | Purchase_Data <- read.csv("Dataset/Purchase Prediction Dataset.csv",header=TRUE)
36 |
37 | #Remove the missing values
38 | data <- na.omit(Purchase_Data)
39 |
40 | #Pick a sample of records
41 | Data <- data[sample(nrow(data),size=10000),]
42 |
43 | #Model 1 with tree size = 20
44 | fit_20 <- randomForest(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration,
45 | data=Data,
46 | importance=TRUE,
47 | ntree=20)
48 | #Print the result for ntree=20
49 | print(fit_20)
50 |
51 |
52 | #Model 1 with tree size = 50
53 | fit_50 <- randomForest(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration,
54 | data=Data,
55 | importance=TRUE,
56 | ntree=50)
57 | #Print the result for ntree=50
58 | print(fit_50)
59 |
60 |
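## Optional addition (editor's sketch, not part of the original script):
## randomForest objects track the out-of-bag error as trees are added; plotting
## it is a quick check on whether 20 or 50 trees is enough before the formal
## grid searches below.
plot(fit_50, main = "OOB error vs. number of trees (ntree = 50)")
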
61 | ## ----ch08_hyper_01, warning=FALSE----------------------------------------
62 | # Manually search parameters
63 | library(data.table)
64 | # load the packages
65 | library(randomForest)
66 | library(mlbench)
67 | library(caret)
68 | # Load Dataset
69 |
70 | dataset <- Data
71 | metric <- "Accuracy"
72 | # Manual Search
73 | trainControl <- trainControl(method="repeatedcv", number=10, repeats=3, search="grid")
74 | tunegrid <- expand.grid(.mtry=c(sqrt(ncol(dataset)-2)))
75 | modellist <- list()
76 | for (ntree in c(100, 150, 200, 250)) {
77 | set.seed(917);
78 | fit <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method="rf", metric=metric, tuneGrid=tunegrid, trControl=trainControl, ntree=ntree)
79 | key <- toString(ntree)
80 | modellist[[key]] <- fit
81 | }
82 | # compare results by resampling
83 | results <- resamples(modellist)
84 | #Summary of Results
85 | summary(results)
86 | #Dot Plot of results
87 | dotplot(results)
88 |
89 | ## ----ch08_hyper_02,warning=FALSE-----------------------------------------
90 | # Tune algorithm parameters using a manual grid search.
91 | seed <- 917;
92 | dataset <- Data
93 | # prepare training scheme
94 | control <- trainControl(method="repeatedcv", number=10, repeats=3)
95 | # design the parameter tuning grid
96 | grid <- expand.grid(size=c(5,10,20,50), k=c(1,2,3,4,5))
97 | # train the model
98 | model <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method="lvq", trControl=control, tuneGrid=grid)
99 | # summarize the model
100 | print(model)
101 | # plot the effect of parameters on accuracy
102 | plot(model)
103 |
104 | ## ----ch08_hyper_03,warning=FALSE-----------------------------------------
105 | # Tune algorithm parameters using an automatic grid search.
106 | set.seed(917);
107 | dataset <- Data
108 |
109 | # prepare training scheme
110 | control <- trainControl(method="repeatedcv", number=10, repeats=3)
111 | # train the model
112 | model <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method="lvq", trControl=control, tuneLength=5)
113 | # summarize the model
114 | print(model)
115 | # plot the effect of parameters on accuracy
116 | plot(model)
117 |
118 | ## ----ch08_hyper_04,warning=FALSE-----------------------------------------
119 | # Select the best tuning configuration
120 | dataset <- Data
121 |
122 | # prepare training scheme
123 | control <- trainControl(method="repeatedcv", number=10, repeats=3)
124 | # CART
125 | set.seed(917);
126 | tunegrid <- expand.grid(.cp=seq(0,0.1,by=0.01))
127 | fit.cart <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method="rpart", metric="Accuracy", tuneGrid=tunegrid, trControl=control)
128 | fit.cart
129 | # display the best configuration
130 | print(fit.cart$bestTune)
131 |
132 | plot(fit.cart)
133 |
134 | ## ----ch08_hyper_05,warning=FALSE-----------------------------------------
135 |
136 | # Randomly search algorithm parameters
137 |
138 | # Select the best tuning configuration
139 | dataset <- Data
140 |
141 | # prepare training scheme
142 | control <- trainControl(method="repeatedcv", number=10, repeats=3, search="random")
143 | # train the model
144 | model <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method="rf", trControl=control)
145 | # summarize the model
146 | print(model)
147 | # plot the effect of parameters on accuracy
148 | plot(model)
149 |
150 | ## ----ch08_hyper_06,warning=FALSE-----------------------------------------
151 | library(caret)
152 | library(randomForest)
153 | library(class)
154 | # Load Dataset
155 | Purchase_Data <- read.csv("Dataset/Purchase Prediction Dataset.csv",header=TRUE)
156 |
157 | data <- na.omit(Purchase_Data)
158 |
159 | #Create a sample of 10K records
160 | set.seed(917);
161 | Data <- data[sample(nrow(data),size=10000),]
162 | # Select the best tuning configuration
163 | dataset <- Data
164 |
165 | # Custom Parameter Search
166 |
167 | # load the packages
168 | library(randomForest)
169 | library(mlbench)
170 | library(caret)
171 |
172 | # define the custom caret algorithm (wrapper for Random Forest)
173 | customRF <- list(type="Classification", library="randomForest", loop=NULL)
174 | customRF$parameters <- data.frame(parameter=c("mtry", "ntree"), class=rep("numeric", 2), label=c("mtry", "ntree"))
175 | customRF$grid <- function(x, y, len=NULL, search="grid") {}
176 | customRF$fit <- function(x, y, wts, param, lev, last, weights, classProbs, ...) {
177 | randomForest(x, y, mtry=param$mtry, ntree=param$ntree, ...)
178 | }
179 | customRF$predict <- function(modelFit, newdata, preProc=NULL, submodels=NULL) { predict(modelFit, newdata)}
180 | customRF$prob <- function(modelFit, newdata, preProc=NULL, submodels=NULL) { predict(modelFit, newdata, type = "prob")}
181 | customRF$sort <- function(x){ x[order(x[,1]),]}
182 | customRF$levels <- function(x) {x$classes}
183 |
184 | # Load Dataset
185 |
186 | dataset <- Data
187 |
188 | metric <- "Accuracy"
189 |
190 | # train model
191 | trainControl <- trainControl(method="repeatedcv", number=10, repeats=3)
192 | tunegrid <- expand.grid(.mtry=c(1:4), .ntree=c(100, 150, 200, 250))
193 | set.seed(917)
194 | custom <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method=customRF, metric=metric, tuneGrid=tunegrid, trControl=trainControl)
195 | print(custom)
196 | plot(custom)
197 |
198 |
199 | ## ----ch08_bias_vs_variance,warning=FALSE---------------------------------
200 | mu <- 2
201 | Z <- rnorm(20000, mu)
202 |
203 | MSE <- function(estimate, mu) {
204 | return(sum((estimate - mu)^2) / length(estimate))
205 | }
206 |
207 | n <- 100
208 | shrink <- seq(0,0.5, length=n)
209 | mse <- numeric(n)
210 | bias <- numeric(n)
211 | variance <- numeric(n)
212 |
213 | for (i in 1:n) {
214 | mse[i] <- MSE((1 - shrink[i]) * Z, mu)
215 | bias[i] <- mu * shrink[i]
216 | variance[i] <- (1 - shrink[i])^2
217 | }
218 |
219 | # Bias-Variance tradeoff plot
220 |
221 | plot(shrink, mse, xlab='Shrinkage', ylab='MSE', type='l', col='pink', lwd=3, lty=1, ylim=c(0,1.2))
222 | lines(shrink, bias^2, col='green', lwd=3, lty=2)
223 | lines(shrink, variance, col='red', lwd=3, lty=2)
224 | legend(0.02,0.6, c('Bias^2', 'Variance', 'MSE'), col=c('green', 'red', 'pink'), lwd=rep(3,3), lty=c(2,2,1))
225 |
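# Optional sanity check (an addition, not in the original chunk): for the
# shrinkage estimator (1 - shrink) * Z with Var(Z) = 1, the decomposition
# MSE = Bias^2 + Variance should hold up to simulation noise, so the gap
# below should be close to zero.
max(abs(mse - (bias^2 + variance)))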
226 | ## ----ch08_bagging, warning= FALSE----------------------------------------
227 | library(caret)
228 | library(randomForest)
229 | library(class)
230 | library(ipred)
231 | # Load Dataset
232 | Purchase_Data <- read.csv("Dataset/Purchase Prediction Dataset.csv",header=TRUE)
233 |
234 | data <- na.omit(Purchase_Data)
235 |
236 | #Create a sample of 10K records
237 | set.seed(917);
238 | Data <- data[sample(nrow(data),size=10000),]
239 | # Select the best tuning configuration
240 | dataset <- Data
241 | # Example of Bagging algorithms
242 | control <- trainControl(method="repeatedcv", number=10, repeats=3)
243 | metric <- "Accuracy"
244 | # Bagged CART
245 | set.seed(917)
246 | fit.treebag <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method="treebag", metric=metric, trControl=control)
247 | # Random Forest
248 | set.seed(917)
249 | fit.rf <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method="rf", metric=metric, trControl=control)
250 | # summarize results
251 | bagging_results <- resamples(list(treebag=fit.treebag, rf=fit.rf))
252 | summary(bagging_results)
253 | dotplot(bagging_results)
254 |
255 |
256 | ## ----ch08_boosting, warning=FALSE----------------------------------------
257 | # Load Dataset
258 | Purchase_Data <- read.csv("Dataset/Purchase Prediction Dataset.csv",header=TRUE)
259 |
260 | data <- na.omit(Purchase_Data)
261 |
262 | #Create a sample of 10K records
263 | set.seed(917);
264 | Data <- data[sample(nrow(data),size=10000),]
265 | # Select the best tuning configuration
266 | dataset <- Data
267 | library(caret)
268 | library(C50)
269 | library(gbm)
270 | dataset <- Data;
271 | # Example of Boosting Algorithms
272 | control <- trainControl(method="repeatedcv", number=10, repeats=3)
273 | metric <- "Accuracy"
274 | # C5.0
275 | set.seed(917)
276 | fit.c50 <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method="C5.0", metric=metric, trControl=control)
277 | fit.c50
278 | plot(fit.c50)
279 | # Stochastic Gradient Boosting
280 | set.seed(917)
281 | fit.gbm <- train(factor(ProductChoice) ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, method="gbm", metric=metric, trControl=control, verbose=FALSE)
282 | fit.gbm
283 | plot(fit.gbm)
284 |
285 | # summarize results
286 | boosting_results <- resamples(list(c5.0=fit.c50, gbm=fit.gbm))
287 | summary(boosting_results)
288 | dotplot(boosting_results)
289 |
290 | ## ----ch08_blending, warning=FALSE----------------------------------------
291 | # Blending (linear combination of models)
292 |
293 | # load libraries
294 | library(caret)
295 | library(caretEnsemble)
296 | library(MASS)
297 |
298 | #dataset$choice <- ifelse(dataset$ProductChoice == 1,"A",ifelse(dataset$ProductChoice == 2,"B",ifelse(dataset$ProductChoice == 3, "C","D")))
299 | set.seed(917);
300 | Data <- data[sample(nrow(data),size=10000),];
301 |
302 | dataset <- Data;
303 |
304 | dataset$choice <- ifelse(dataset$ProductChoice == 1 | dataset$ProductChoice == 2 ,"A","B")
305 |
306 | dataset$choice <-as.factor(dataset$choice)
307 | # define training control
308 | train_control <- trainControl(method="cv", number=4, savePredictions=TRUE, classProbs=TRUE)
309 | # train a list of models
310 | methodList <- c('knn','rpart')
311 | models <- caretList(choice ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, trControl=train_control, methodList=methodList)
312 | # create ensemble of trained models
313 | ensemble <- caretEnsemble(models)
314 | # summarize ensemble
315 | summary(ensemble)
316 |
317 | ## ----ch08_stacking_01,warning=FALSE--------------------------------------
318 | # Example of Stacking algorithms
319 | library(kernlab);
320 | # create submodels
321 | control <- trainControl(method="repeatedcv", number=10, repeats=3, savePredictions=TRUE, classProbs=TRUE)
322 | algorithmList <- c('lda', 'rpart', 'glm', 'knn', 'svmRadial')
323 | set.seed(917)
324 | models <- caretList(choice ~ MembershipPoints + CustomerAge + PurchaseTenure + CustomerPropensity + LastPurchaseDuration, data=dataset, trControl=control, methodList=algorithmList)
325 | results <- resamples(models)
326 | summary(results)
327 | dotplot(results)
328 |
329 |
330 | ## ----ch08_stacking_02,warning=FALSE--------------------------------------
331 | # correlation between results
332 | modelCor(results)
333 | splom(results)
334 |
335 | ## ----ch08_stacking_03,warning=FALSE--------------------------------------
336 | # stack using glm
337 | stackControl <- trainControl(method="repeatedcv", number=10, repeats=3, savePredictions=TRUE, classProbs=TRUE)
338 | set.seed(917)
339 | stack.glm <- caretStack(models, method="glm", metric="Accuracy", trControl=stackControl)
340 | print(stack.glm)
341 |
342 |
343 | ## ----ch08_stacking_04,warning=FALSE--------------------------------------
344 |
345 | # stack using random forest
346 | set.seed(917)
347 | stack.rf <- caretStack(models, method="rf", metric="Accuracy", trControl=stackControl)
348 | print(stack.rf)
349 |
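# An illustrative follow-up (not in the original script): a stacked model can be
# used for prediction like any other caret model, for example:
# head(predict(stack.rf, newdata = dataset))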
350 | ## ----ch08_bayesian_optimization_01,warning=FALSE-------------------------
351 |
352 | library(caret)
353 | library(randomForest)
354 | library(class)
355 | library(ipred)
356 | library(GPfit)
357 | # Load Dataset
358 | House_price <- read.csv("Dataset/House Sale Price Dataset.csv",header=TRUE)
359 |
360 | dataset <- na.omit(House_price)
361 |
362 | #Create a sample of 10K records
363 | set.seed(917);
364 |
365 |
366 | rand_ctrl <- trainControl(method = "repeatedcv", repeats = 5, search = "random")
367 |
368 | rand_search <- train(HousePrice ~ StoreArea + BasementArea + SellingYear + SaleType + ConstructionYear + Rating, data = dataset, method = "svmRadial",
369 | ## Create 20 random parameter values
370 | tuneLength = 20,
371 | metric = "RMSE",
372 | preProc = c("center", "scale"),
373 | trControl = rand_ctrl)
374 | rand_search
375 |
376 | ggplot(rand_search) + scale_x_log10() + scale_y_log10()
377 |
378 | getTrainPerf(rand_search)
379 |
380 | ## ----ch08_bayesian_optimization_02,warning=FALSE-------------------------
381 | # Define the resampling method
382 | ctrl <- trainControl(method = "repeatedcv", repeats = 5)
383 |
384 | ## Use this function to optimize the model. The two parameters are
385 | ## evaluated on the log scale given their range and scope.
386 | svm_fit_bayes <- function(logC, logSigma) {
387 | ## Use the same model code but for a single (C, sigma) pair.
388 | txt <- capture.output(
389 | mod <- train(HousePrice ~ StoreArea + BasementArea + SellingYear + SaleType + ConstructionYear + Rating , data = dataset,
390 | method = "svmRadial",
391 | preProc = c("center", "scale"),
392 | metric = "RMSE",
393 | trControl = ctrl,
394 | tuneGrid = data.frame(C = exp(logC), sigma = exp(logSigma)))
395 | )
396 | ## The function wants to _maximize_ the outcome so we return
397 | ## the negative of the resampled RMSE value. `Pred` can be used
398 | ## to return predicted values but we'll avoid that and use zero
399 | list(Score = -getTrainPerf(mod)[, "TrainRMSE"], Pred = 0)
400 | }
401 |
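## A quick, illustrative sanity check (the values below are arbitrary): the
## objective can be evaluated once at a single (logC, logSigma) point before
## handing it to the Bayesian optimizer.
# svm_fit_bayes(logC = 0, logSigma = -4)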
402 | ## Define the bounds of the search.
403 | lower_bounds <- c(logC = -5, logSigma = -9)
404 | upper_bounds <- c(logC = 20, logSigma = -0.75)
405 | bounds <- list(logC = c(lower_bounds[1], upper_bounds[1]),
406 | logSigma = c(lower_bounds[2], upper_bounds[2]))
407 |
408 | ## Create a grid of values as the input into the BO code
409 | initial_grid <- rand_search$results[, c("C", "sigma", "RMSE")]
410 | initial_grid$C <- log(initial_grid$C)
411 | initial_grid$sigma <- log(initial_grid$sigma)
412 | initial_grid$RMSE <- -initial_grid$RMSE
413 | names(initial_grid) <- c("logC", "logSigma", "Value")
414 |
415 | ## Run the optimization with the initial grid and 30 further iterations of the search
416 |
417 |
418 | library(rBayesianOptimization)
419 |
420 | set.seed(917)
421 | ba_search <- BayesianOptimization(svm_fit_bayes,
422 | bounds = bounds,
423 | init_grid_dt = initial_grid,
424 | init_points = 0,
425 | n_iter = 30,
426 | acq = "ucb",
427 | kappa = 1,
428 | eps = 0.0,
429 | verbose = TRUE)
430 | ba_search
431 |
432 | ## ----ch08_bayesian_optimization_03,warning=FALSE-------------------------
433 | final_search <- train(HousePrice ~ StoreArea + BasementArea + SellingYear + SaleType + ConstructionYear + Rating, data = dataset,
434 | method = "svmRadial",
435 | tuneGrid = data.frame(C = exp(ba_search$Best_Par["logC"]),
436 | sigma = exp(ba_search$Best_Par["logSigma"])),
437 | metric = "RMSE",
438 | preProc = c("center", "scale"),
439 | trControl = ctrl)
440 |
441 | final_search
442 | compare_models(final_search, rand_search)
443 |
444 |
--------------------------------------------------------------------------------
/Code/Ch09_Ramasubramanian.R:
--------------------------------------------------------------------------------
1 | ## ---- warning=FALSE, message=FALSE---------------------------------------
2 |
3 | library(doParallel)
4 |
5 | # Find out how many cores are available (if you don't already know)
6 | c = detectCores()
7 | c
8 |
9 | # Find out how many cores are currently being used
10 | getDoParWorkers()
11 |
12 | # Create cluster with c-2 cores
13 | cl <- makeCluster(c-2)
14 |
15 | # Register cluster
16 | registerDoParallel(cl)
17 |
18 | # Find out how many cores are being used
19 | getDoParWorkers()
20 |
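# A minimal smoke test of the registered backend (an illustrative addition,
# not part of the chapter code): run a trivial loop in parallel.
foreach(i = 1:4, .combine = c) %dopar% sqrt(i)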
21 | ## ---- warning=FALSE, message=FALSE---------------------------------------
22 |
23 |
24 | ## Problem : Identifying Risky Bank Loans
25 |
26 | credit <- read.csv("credit.csv")
27 | str(credit)
28 |
29 | # create a random sample for training and test data
30 | # use set.seed to use the same random number sequence as the tutorial
31 | set.seed(123)
32 | train_sample <- sample(1000, 900)
33 |
34 | str(train_sample)
35 |
36 | # split the data frames
37 | credit_train <- credit[train_sample, ]
38 | credit_test <- credit[-train_sample, ]
39 |
40 | ## ---- warning=FALSE, message=FALSE---------------------------------------
41 |
42 | ## Training a model on the data
43 |
44 | library(randomForest)
45 |
46 | #Sequential Execution
47 | system.time(rf_credit_model <- randomForest(credit_train[-17],
48 | credit_train$default,
49 | ntree = 1000))
50 |
51 | ## ---- warning=FALSE, message=FALSE---------------------------------------
52 |
53 | #Parallel Execution
54 | system.time(
55 | rf_credit_model_parallel <- foreach(nt = rep(250,4),
56 | .combine = combine ,
57 | .packages = 'randomForest')
58 | %dopar%
59 | randomForest(
60 | credit_train[-17],
61 | credit_train$default,
62 | ntree = nt))
63 |
64 | ## ---- warning=FALSE, message=FALSE---------------------------------------
65 |
66 | # Shutting down the cluster - when you're done, be sure to close the parallel backend using stopCluster()
67 | stopCluster(cl)
68 |
69 |
70 | ## ---- eval=FALSE---------------------------------------------------------
71 | ##
72 | Sys.setenv(HADOOP_CMD="/usr/lib/hadoop-2.2.0/bin/hadoop")
73 | Sys.setenv(HADOOP_STREAMING="/usr/lib/hadoop-2.2.0/share/hadoop/tools/lib/hadoop-streaming-2.2.0.jar")
74 |
75 | ## ---- eval=FALSE---------------------------------------------------------
76 | ##
77 | library(rmr2)
78 | library(rhdfs)
79 | ##
80 | ## # Hadoop File Operations
81 | ##
82 | ## #initialize File HDFS
83 | hdfs.init()
84 |
85 | ## ---- eval=FALSE---------------------------------------------------------
86 | ##
87 | ## #Put File into HDFS
88 | hdfs.put("/home/sample.txt","/hadoop_practice")
89 | ## ## [1] TRUE
90 | ##
91 |
92 | ## ---- eval=FALSE---------------------------------------------------------
93 | ##
94 | ## # Reads a bunch of lines at a time
95 | ## #Map Phase
96 | map <- function(k,lines) {
97 | words.list <- strsplit(lines, '\\s+')
98 | words <- unlist(words.list)
99 | return( keyval(words, 1) )
100 | }
101 | ##
102 | ## #Reduce Phase
103 | reduce <- function(word, counts) {
104 | keyval(word, sum(counts))
105 | }
106 | ##
107 | ## #MapReduce Function
108 | wordcount <- function (input, output=NULL) {
109 | mapreduce(input=input, output=output, input.format="text", map=map, reduce=reduce)
110 | }
111 |
112 | ## ---- eval=FALSE---------------------------------------------------------
113 | ##
114 | ##
115 | ## ## read text files from folder input on HDFS
116 | ## ## save result in folder output on HDFS
117 | ## ## Submit job
118 | ##
119 | basedir <- '/hadoop_practice'
120 | infile <- file.path(basedir, 'sample.txt')
121 | outfile <- file.path(basedir, 'output')
122 | ret <- wordcount(infile, outfile)
123 |
124 | ## ---- eval=FALSE---------------------------------------------------------
125 | ##
126 | ##
127 | ## ## Fetch results from HDFS
128 | result <- from.dfs(outfile)
129 | results.df <- as.data.frame(result, stringsAsFactors=F)
130 | colnames(results.df) <- c('word', 'count')
131 | tail(results.df,100)
132 | ##
133 | ## ## word count
134 | ## ## 1 R 1
135 | ## ## 2 Hi 1
136 | ## ## 3 to 1
137 | ## ## 4 All 1
138 | ## ## 5 with 1
139 | ## ## 6 class 1
140 | ## ## 7 hadoop 3
141 | ## ## 8 Welcome 1
142 | ## ## 9 integrating 1
143 | ##
144 | ##
145 | head(results.df[order(results.df$count, decreasing = TRUE),])
146 | ##
147 | ## ## word count
148 | ## ## 7 hadoop 3
149 | ## ## 1 R 1
150 | ## ## 2 Hi 1
151 | ## ## 3 to 1
152 | ## ## 4 All 1
153 | ## ## 5 with 1
154 | ##
155 |
156 | ## ---- warning=FALSE,message=FALSE----------------------------------------
157 |
158 | #Set environment variable
159 | Sys.setenv(SPARK_HOME='C:/Spark/spark-2.0.0-bin-hadoop2.7',HADOOP_HOME='C:/Hadoop-2.3.0')
160 | .libPaths(c(file.path(Sys.getenv('SPARK_HOME'), 'R', 'lib'),.libPaths()))
161 | Sys.setenv('SPARKR_SUBMIT_ARGS'='"sparkr-shell"')
162 |
163 | ## ---- warning=FALSE,message=FALSE----------------------------------------
164 |
165 |
166 | library(SparkR)
167 | library(rJava)
168 |
169 | #The entry point into SparkR is the SparkSession which connects your R program to a Spark cluster
170 | sparkR.session(enableHiveSupport = FALSE, appName = "SparkR-ML",master = "local[*]", sparkConfig = list(spark.driver.memory = "1g",spark.sql.warehouse.dir="C:/Hadoop-2.3.0"))
171 |
172 |
173 | ## ---- warning=FALSE,message=FALSE----------------------------------------
174 | library(data.table)
175 |
176 | #Read the housing data
177 | Data_House_Price <- fread("/Users/karthik/Dropbox/Book Writing - Drafts/Chapter Drafts/Chapter 7 - Machine Learning Model Evaluation/tosend/House Sale Price Dataset.csv",header=T, verbose = FALSE, showProgress = FALSE)
178 |
179 | str(Data_House_Price)
180 |
181 |
182 | #Pulling out relevant columns and assigning required fields in the dataset
183 | Data_House_Price <- Data_House_Price[,.(HOUSE_ID,HousePrice,StoreArea,StreetHouseFront,BasementArea,LawnArea,Rating,SaleType)]
184 |
185 | #Omit any missing value
186 | Data_House_Price <- na.omit(Data_House_Price)
187 |
188 | Data_House_Price$HOUSE_ID <- as.character(Data_House_Price$HOUSE_ID)
189 |
190 |
191 | ## ---- warning=FALSE,message=FALSE----------------------------------------
192 | #Spark Data Frame - Train
193 | gaussianDF_train <- createDataFrame(Data_House_Price[1:floor(nrow(Data_House_Price)*(2/3)),])
194 |
195 | #Spark Data Frame - Test
196 | gaussianDF_test <- createDataFrame(Data_House_Price[floor(nrow(Data_House_Price)*(2/3) + 1):nrow(Data_House_Price),])
197 |
198 | class(gaussianDF_train)
199 |
200 | class(gaussianDF_test)
201 |
202 |
203 | ## ---- warning=FALSE,message=FALSE----------------------------------------
204 |
205 | # Fit a generalized linear model of family "gaussian" with spark.glm
206 | gaussianGLM <- spark.glm(gaussianDF_train, HousePrice ~ StoreArea + StreetHouseFront + BasementArea + LawnArea + Rating + SaleType, family = "gaussian")
207 |
208 | # Model summary
209 | summary(gaussianGLM)
210 |
211 |
212 | ## ---- warning=FALSE,message=FALSE----------------------------------------
213 |
214 | #Prediction on the gaussianModel
215 | gaussianPredictions <- predict(gaussianGLM, gaussianDF_test)
216 | names(gaussianPredictions) <- c('HOUSE_ID','HousePrice','StoreArea','StreetHouseFront','BasementArea','LawnArea','Rating','SaleType','ActualPrice','PredictedPrice')
217 | gaussianPredictions$PredictedPrice <- round(gaussianPredictions$PredictedPrice,2.0)
218 | showDF(gaussianPredictions[,9:10])
219 |
220 | ## ---- warning=FALSE,message=FALSE----------------------------------------
221 |
222 | sparkR.stop()
223 |
224 |
225 | ## ----ch09_h2o_1,eval=FALSE-----------------------------------------------
226 | ## # The following two commands remove any previously installed H2O packages for R.
227 | ## if ("package:h2o" %in% search()) { detach("package:h2o", unload=TRUE) }
228 | ## if ("h2o" %in% rownames(installed.packages())) { remove.packages("h2o") }
229 | ##
230 | ## # Next, we download, install and initialize the H2O package for R.
231 | ## install.packages("h2o", repos=(c("http://s3.amazonaws.com/h2o-release/h2o/rel-kahan/5/R", getOption("repos"))))
232 | ##
233 | ## #Alternatively you can install the package h2o from CRAN as below
234 | ## install.packages("h2o")
235 |
236 | ## ----ch09_h2o_2----------------------------------------------------------
237 | # Load the h2o library in R
238 | library(h2o);
239 |
240 | #Initiate a cluster in your machine
241 | localH2O = h2o.init()
242 |
243 | ## ----ch09_h2o_3----------------------------------------------------------
244 | # Run Deep learning demo
245 | demo(h2o.deeplearning)
246 |
247 | prostate.hex
248 |
249 | # Other demos can be called as below
250 | #demo(h2o.glm)
251 | #demo(h2o.gbm)
252 |
253 |
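## An illustrative extra step (not in the original script): a local R data frame
## can be pushed to the running H2O cluster with as.h2o(), and the cluster shut
## down once you are finished.
# iris_hex <- as.h2o(iris)
# summary(iris_hex)
# h2o.shutdown(prompt = FALSE)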
--------------------------------------------------------------------------------
/Code/actual_pred_plot.R:
--------------------------------------------------------------------------------
1 | library(data.table) # for everything
2 | library(ggplot2) # for plotting
3 | library(reshape2) # for melting
4 |
5 |
6 | actual_pred_plot <- function(var.by=var.by,
7 | var.response=var.response,
8 | data=data,
9 | var.predict.current=NULL,
10 | var.predict.reference=NULL,
11 | var.split=NULL,
12 | var.weight=NULL,
13 | var.by.buckets=NULL,
14 | remove.na=FALSE,
15 | sort.factor=FALSE,
16 | errorbars=FALSE,
17 | subset.to=NA,
18 | count.line=NULL,
19 | barline.ratio=1,
20 | title="",
21 | make.plot=TRUE
22 | ){
23 | #############################################################################
24 | # FUNCTION: actual_pred_plot
25 | #
26 | # Procedure for making univariate plots. This will produce one graph from a
27 | # data frame/data table, containing overlaid exposure histograms, actual
28 | # line(s) and optionally one or many predicted lines. Additionally, two-way
29 | # interactions are possible and well-handled. Returns a ggplot object.
30 | #
31 | # ARGUMENTS:
32 | # var.by: The name of the variable whose levels to group by in the plot.
33 | # Determines the bins for histogram and line plot. (string) [required]
34 | # var.response: The name of the var containing the actual response. (string)
35 | # [required]
36 | # data: The data.frame/.table object to generate the plots from. [required]
37 | # var.predict.current: The name of the var containing the first predicted
38 | # response. (string) [optional]
39 | # var.predict.reference: The name of the var containing the second predicted
40 | # response. If you have more than two predicted cols to plot, you can
41 | # pass a vector of predicted colnames here (or to predict.current) and
42 | # actual_pred_plot will sort it out (string) [optional]
43 | # var.split: The name of the var to interact with var.by. (string) [optional]
44 | # var.weight: The name of the var to be used to weight each observation.
45 | # If NULL (default), all obs are given equal weight. (string) [optional]
46 | # var.by.buckets: How many buckets to group factor vars into (num) [optional]
47 | # remove.na: If response/predictions have NAs, should we remove them?
48 | # sort.factor: Do you want to sort x-axis so predicted line always increases?
49 | # errorbars: Boolean, should the actual line include error bars of +-2SE?
50 | # Defaults to FALSE. [optional]
51 | # subset.to: Vector of length 2 specifying where to cap a numeric var.by.
52 | # e.g. c(min.val, max.val). To only specify one boundary use NA; e.g.
53 | # c(NA, max.val) (numeric) [optional]
54 | # barline.ratio: What should the ratio of maximum bar height to maximum line
55 | # value be? (num) [optional]
56 | # title: Do you want a title? (string) [optional]
57 | # make.plot: Boolean, if FALSE the grid used to make the plot will be
58 | # returned instead of the plot itself. [optional]
59 | #
60 | #############################################################################
61 |
62 | # pull out only the cols you need and convert to DT if necessary
63 | needed.cols <- c(var.by, var.split, var.response,
64 | var.predict.current, var.predict.reference)
65 |
66 | if("data.table" %in% class(data)){
67 | dat.frm <- data[, (needed.cols), with=FALSE]
68 | } else {
69 | dat.frm <- data.table(data[, needed.cols])
70 | }
71 |
72 | # cap var.by at specified limits
73 | if(!is.na(subset.to[1])){
74 | dat.frm[[var.by]] <- pmax(dat.frm[[var.by]], subset.to[1])
75 | }
76 | if(!is.na(subset.to[2])){
77 | dat.frm[[var.by]] <- pmin(dat.frm[[var.by]], subset.to[2])
78 | }
79 |
80 | # upgroup poorly exposed buckets if specified
81 | if(!is.null(var.by.buckets)){
82 | if(nlevels(dat.frm[[var.by]]) > var.by.buckets)
83 | dat.frm[[var.by]] <- keep_n_top(dat.frm[[var.by]], var.by.buckets)
84 | }
85 |
86 | # make sure there are fewer than 100 levels for numerics
87 | if(length(unique(dat.frm[[var.by]])) > 100){
88 | dat.frm[[var.by]] <- round_down_to_100_levels(dat.frm[[var.by]])
89 | }
90 |
91 | # set key (if var.split = NULL this does same as setkeyv(dat.frm, var.by))
92 | setkeyv(dat.frm, c(var.by, var.split))
93 |
94 | # get means using .SD and .SDcols, nice advantage of assigning nice colnames
95 | meanable.cols <- c(var.response, var.predict.current, var.predict.reference)
96 |
97 | # if response/predicted cols contain NAs throw an error by default, or..
98 | if(remove.na){
99 | meanme <- function(vector.to.mean){
100 | mean(vector.to.mean, na.rm=TRUE)
101 | }
102 | } else {
103 | meanme <- function(vector.to.mean){
104 | mean(vector.to.mean)
105 | }
106 | }
107 |
108 | meantable <- dat.frm[, lapply(.SD, function(x) meanme(x)),
109 | by = key(dat.frm),
110 | .SDcols = (meanable.cols)]
111 |
112 | if(!is.null(var.weight)){
113 | counttable <- dat.frm[, list(count=sum(get(var.weight))),
114 | by=key(dat.frm)]
115 | } else {
116 | counttable <- dat.frm[, list(count=.N), by=key(dat.frm)]
117 | }
118 |
119 | # don't necessarily need to merge, but makes the ggplot call simpler
120 | plotme <- meantable[counttable]
121 |
122 | # scale the count, make it fit on response axis
123 | rescale_factor <- max(plotme[["count"]]) / max(plotme[[var.response]])
124 | plotme$count_scaled <- plotme$count / (barline.ratio * rescale_factor)
125 |
126 |
127 |
128 | # which variables to use as melt id?
129 | melt.id <- c(var.by, var.split, "count", "count_scaled")
130 |
131 | # add in error bars if requested
132 | if(errorbars){
133 |
134 | sterrtable <- dat.frm[, list(sterr = sd(get(var.response))/sqrt(.N)),
135 | by=key(dat.frm)]
136 |
137 | plotme <- plotme[sterrtable]
138 |
139 | plotme$ymax <- plotme[[var.response]] + 2 * plotme$sterr
140 | plotme$ymin <- plotme[[var.response]] - 2 * plotme$sterr
141 | plotme$sterr <- NULL
142 |
143 | melt.id <- c(melt.id, "ymax", "ymin")
144 | gg.errors <- geom_errorbar(aes_string(x=var.by,
145 | ymax= "ymax",
146 | ymin= "ymin",
147 | color="variable"),
148 | stat="identity",
149 | width=0.25,
150 | alpha=0.75)
151 |
152 | }else{ gg.errors <- NULL}
153 |
154 | # melt into a form that ggplot likes
155 | plotme <- melt(plotme, id=melt.id)
156 | plotme$grouping <- plotme$variable
157 |
158 | # get rid of errors on the non-response rows (to make error bar colours work)
159 | if(errorbars) plotme[!variable == var.response, ':=' (ymax = NA, ymin = NA)]
160 |
161 |
162 | if(!is.null(var.split)) {
163 |
164 | plotme[[var.split]] <- as.factor(plotme[[var.split]])
165 | plotme$grouping <- paste0(plotme[[var.split]], " - ", plotme$variable)
166 | var.split2 <- var.split
167 | delete_linetype_legend <- NULL
168 |
169 | }else{
170 |
171 | var.split2 <- NULL
172 | var.split <- "variable"
173 | delete_linetype_legend <- scale_linetype(guide='none')
174 | }
175 |
176 | if(is.null(var.predict.current) & is.null(var.predict.reference)) {
177 | delete_linetype_legend <- scale_linetype(guide='none')
178 | }
179 |
180 | # do you want to sort plot so predicted line is always increasing?
181 | if(sort.factor){
182 | levels <- plotme[variable == var.predict.current][order(value)][[var.by]]
183 | plotme[[var.by]] <- factor(plotme[[var.by]], levels=levels)
184 | } else {
185 | plotme[[var.by]] <- as.factor(plotme[[var.by]])
186 | }
187 |
188 | # add a dashed line to give some sort of reference for height of count bars
189 | if(!is.null(count.line)){
190 | countline_rescaled <- count.line/(rescale_factor*barline.ratio)
191 | gg.countline <- geom_hline(yintercept=countline_rescaled,
192 | colour="grey",
193 | linetype="longdash",
194 | size=0.1,
195 | alpha=0.05)
196 | gg.annotate <- annotate("text",
197 | x=plotme[[var.by]][1],
198 | y=countline_rescaled,
199 | label = paste0("N = ", count.line),
200 | hjust=0,
201 | vjust=1)
202 |
203 | } else {
204 | gg.countline <- NULL
205 | gg.annotate <- NULL
206 | }
207 |
208 | # do you want the plot or just the grid used to make it?
209 | if(!make.plot){
210 | return(plotme)
211 | }
212 |
213 | plt <- ggplot(plotme) +
214 | geom_line(aes_string(x=var.by,
215 | y="value",
216 | group="grouping",
217 | color=var.split,
218 | linetype="variable"),
219 | stat="identity") +
220 | gg.errors +
221 | geom_point(aes_string(x=var.by,
222 | y="value",
223 | group="grouping",
224 | color=var.split,
225 | shape=var.split),
226 | stat="identity",
227 | size=2) +
228 | geom_bar(aes_string(x=var.by,
229 | y="count_scaled",
230 | group="grouping",
231 | fill=var.split2),
232 | stat="identity",
233 | alpha=0.25,
234 | position="dodge") +
235 | geom_hline(yintercept=count.line/(rescale_factor*barline.ratio),
236 | colour="black",
237 | linetype="longdash",
238 | size=0.25) +
239 | gg.countline + gg.annotate +
240 | theme(legend.position=c(1,0.25),legend.justification=c(1,1)) +
241 | ylab(var.response) + delete_linetype_legend +
242 | ggtitle(title)
243 |
244 | return(plt)
245 |
246 | }
247 |
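## A hypothetical usage sketch (all object and column names below are made up,
## not taken from the book): plot an actual response and a predicted column
## against a numeric grouping variable.
# example_dt <- data.table(age = sample(20:60, 1000, replace = TRUE))
# example_dt[, actual := rbinom(.N, 1, plogis((age - 40) / 10))]
# example_dt[, pred := plogis((age - 40) / 10)]
# actual_pred_plot(var.by = "age", var.response = "actual",
#                  var.predict.current = "pred", data = example_dt)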
248 |
249 | round_down_to_100_levels <- function(vectortoround){
250 | #############################################################################
251 | # FUNCTION: round_down_to_100_levels
252 | #
253 | # actual_pred_plot requires a function for binning continuous numerical variables
254 | # into discrete buckets. Because of the way data.table (specifically setkey)
255 | # works, we need to upgroup numeric variables sensibly before summarizing.
256 | # Here we simply try rounding less and less finely until the result has less
257 | # than 100 groups. Maintains the distribution/shape of var.by (unlike decile
258 | # binning). Pulled this out of actual_pred_plot into its own function for easier
259 | # maintenance. In the future I will modularize more of actual_pred_plot. Plan to add
261 | # check to determine numerical vs. categorical vars and handle appropriately.
261 | #
262 | #############################################################################
263 |
264 | # try rounding to these numbers in sequence
265 | toTheNearest <- c(0.01, 0.02, 0.05, 0.1, 1, 2,
266 | 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000)
267 | # initialise i to step through above attempts
268 | i <- 1
269 |
270 | # only round if you need to
271 | if(length(unique(vectortoround)) > 100){
272 |
273 | # initialise new rounded version of variable
274 | rounded <- vectortoround
275 |
276 | # keep trying until one sticks; if a var won't round even to the nearest 10000, you're out of luck
277 | while(length(unique(rounded)) > 100){
278 | rounded <- round(vectortoround / toTheNearest[i]) * toTheNearest[i]
279 | i <- i + 1
280 | }
281 | print(paste0("Rounded to nearest ", as.character(toTheNearest[i-1])))
282 | return(rounded)
283 | }
284 | return(vectortoround)
285 | }
286 |
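## Illustrative example (not part of the original file): a variable with many
## distinct values is rounded until it has at most 100 levels.
# length(unique(round_down_to_100_levels(runif(1000) * 3)))   # should be <= 100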
287 |
288 | keep_n_top <- function(factor, n){
289 | ############################################################################
290 | # FUNCTION: keep_n_top
291 | #
292 | # Treat a factor so that it retains only the n-1 most exposed levels.
293 | #
294 | # ARGUMENTS:
295 | # factor: factor variable to modify
296 | # n: number of levels to retain
297 | # RETURNS:
298 | # Factor as input with the n - 1 most exposed levels and an 'Other' level
299 | #
300 | ############################################################################
301 |
302 | top <- table(factor)
303 | top <- top[order(top, decreasing=TRUE)][1:(n - 1)]
304 | factor <- as.character(factor)
305 | factor[!factor %in% names(top)] <- "Other"
306 | as.factor(factor)
307 |
308 | }
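
## Illustrative example (not part of the original file): keep the two most
## frequent levels of a factor and collapse everything else into "Other".
# keep_n_top(factor(c("a", "a", "a", "b", "b", "c", "d")), n = 3)
# # expected result: "a" "a" "a" "b" "b" "Other" "Other"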
--------------------------------------------------------------------------------
/Code/concordance.R:
--------------------------------------------------------------------------------
1 | ###########################################################
2 | # Function concordance : for concordance, discordance, ties
3 | # The function returns Concordance, discordance, and ties
4 | # by taking a glm binomial model result as input.
5 | # It uses optimisation through subsetting
6 | ###########################################################
7 |
8 | concordance<-function(model)
9 | {
10 | # Get all actual observations and their fitted values into a frame
11 | fitted<-data.frame(cbind(model$y,model$fitted.values))
12 | colnames(fitted)<-c('respvar','score')
13 | # Subset only ones
14 | ones<-fitted[fitted[,1]==1,]
15 | # Subset only zeros
16 | zeros<-fitted[fitted[,1]==0,]
17 |
18 | # Initialise all the values
19 | pairs_tested<-as.double(nrow(ones))*as.double(nrow(zeros))
20 | conc<-0
21 | disc<-0
22 |
23 | # Get the values in a for-loop
24 | for(i in 1:nrow(ones))
25 | {
26 | conc<-conc + sum(ones[i,"score"]>zeros[,"score"])
27 | disc<-disc + sum(ones[i,"score"]<zeros[,"score"])
28 | }
29 |
30 | # Convert the pair counts into proportions; the remaining pairs are ties
31 | conc<-conc/pairs_tested
32 | disc<-disc/pairs_tested
33 | ties<-1-(conc+disc)
34 |
35 | return(list("Concordance"=conc,"Discordance"=disc,"Ties"=ties,"Pairs"=pairs_tested))
36 | }
--------------------------------------------------------------------------------
1 | Note on dataset paths:
2 |
3 | The data files used in the code are organized by chapter. Prefix each dataset
4 | file name with the folder it is stored in, i.e.
5 | `<path to extracted code>\Dataset\Chapter #`
6 |
7 | For example, on page 34,
8 |
9 | Change `read.csv("employees.csv", header =TRUE, sep =",")`
10 | To `read.csv("\Dataset\Chapter 2\employees.csv", header =TRUE, sep =",")`
11 |
12 | Similar changes apply to the following pages:
13 |
14 | Page 42, 47, 52, 132, 134, 135, 137, 138, 140, 141, 143, 144, 146, 147, 148, 153, 155, 157, 160, 161, 163, 169, 171, 173, 174, 236, 269, 296, 339, 367, 384, 397, 419, 472, 485, 499, 511, 524.
--------------------------------------------------------------------------------