├── .Rhistory ├── IntroToTextAnalyticsWithR_Part1.pdf ├── IntroToTextAnalyticsWithR_Part10.pdf ├── IntroToTextAnalyticsWithR_Part12.pdf ├── IntroToTextAnalyticsWithR_Part2.pdf ├── IntroToTextAnalyticsWithR_Part5.pdf ├── IntroToTextAnalyticsWithR_Part6.pdf ├── IntroToTextAnalyticsWithR_Part7.pdf ├── IntroToTextAnalyticsWithR_Part9.pdf ├── IntroToTextAnalytics_Part1.R ├── IntroToTextAnalytics_Part10.R ├── IntroToTextAnalytics_Part11.R ├── IntroToTextAnalytics_Part12.R ├── IntroToTextAnalytics_Part2.R ├── IntroToTextAnalytics_Part3.R ├── IntroToTextAnalytics_Part4.R ├── IntroToTextAnalytics_Part5.R ├── IntroToTextAnalytics_Part6.R ├── IntroToTextAnalytics_Part7.R ├── IntroToTextAnalytics_Part8.R ├── IntroToTextAnalytics_Part9.R ├── README.md ├── rf.cv.1.RData ├── rf.cv.2.RData ├── rf.cv.3.RData ├── rf.cv.4.RData └── spam.csv /.Rhistory: -------------------------------------------------------------------------------- 1 | names(new.theta) <- names(features) 2 | alpha <- 0.01 3 | # Utility functon that calculates the prediction for an observation 4 | # given the current state of the hypothesis function. 5 | #h.theta <- function(theta, observation) { 6 | # return(sum(theta * observation)) 7 | # prediction <- 0.0 8 | # for(i in 1:length(theta)) { 9 | # prediction <- prediction + (theta[i] * observation[i]) 10 | # } 11 | # return(prediction) 12 | #} 13 | # for(k in 1:2){ 14 | # m <- ncol(features) 15 | # 16 | # for(j in 1:m) { 17 | # n <- nrow(features) 18 | # summation <- 0.0 19 | # 20 | # for(i in 1:n) { 21 | # # prediction <- h.theta(theta, features[i,]) 22 | # prediction <- sum(theta * features[i,]) 23 | # residual <- prediction - y[i] 24 | # update.value <- residual * features[i, j] 25 | # summation <- summation + update.value 26 | # } 27 | # 28 | # new.theta[j] <- theta[j] + (alpha * summation) 29 | # } 30 | # 31 | # theta <- new.theta 32 | # 33 | # # print(theta) 34 | # } 35 | # 36 | # print(theta) 37 | iterations <- 250 38 | X <- features 39 | for(k in 1:iterations) { 40 | for(j in 1:ncol(X)) { 41 | summation <- 0 42 | for(i in 1:nrow(X)) { 43 | residual <- sum(X[i,] * theta[j]) - y[i] 44 | summation <- summation + (residual * X[i, j]) 45 | } 46 | new.theta[j] <- theta[j] - (alpha / nrow(X) * summation) 47 | } 48 | theta <- new.theta 49 | } 50 | lm.model 51 | set.seed(1234) 52 | x <- runif(1000, -5, 5) 53 | y <- x + rnorm(1000) + 3 54 | intercept <- rep(1, length(x)) 55 | lm.model <- lm(y ~ x) 56 | summary(lm.model) 57 | features <- data.frame(intercept = intercept, x = x) 58 | theta <- rep(0, ncol(features)) 59 | names(theta) <- names(features) 60 | theta 61 | new.theta <- rep(0, ncol(features)) 62 | names(new.theta) <- names(features) 63 | alpha <- 0.025 64 | # Utility functon that calculates the prediction for an observation 65 | # given the current state of the hypothesis function. 
66 | #h.theta <- function(theta, observation) { 67 | # return(sum(theta * observation)) 68 | # prediction <- 0.0 69 | # for(i in 1:length(theta)) { 70 | # prediction <- prediction + (theta[i] * observation[i]) 71 | # } 72 | # return(prediction) 73 | #} 74 | # for(k in 1:2){ 75 | # m <- ncol(features) 76 | # 77 | # for(j in 1:m) { 78 | # n <- nrow(features) 79 | # summation <- 0.0 80 | # 81 | # for(i in 1:n) { 82 | # # prediction <- h.theta(theta, features[i,]) 83 | # prediction <- sum(theta * features[i,]) 84 | # residual <- prediction - y[i] 85 | # update.value <- residual * features[i, j] 86 | # summation <- summation + update.value 87 | # } 88 | # 89 | # new.theta[j] <- theta[j] + (alpha * summation) 90 | # } 91 | # 92 | # theta <- new.theta 93 | # 94 | # # print(theta) 95 | # } 96 | # 97 | # print(theta) 98 | iterations <- 250 99 | X <- features 100 | for(k in 1:iterations) { 101 | for(j in 1:ncol(X)) { 102 | summation <- 0 103 | for(i in 1:nrow(X)) { 104 | residual <- sum(X[i,] * theta[j]) - y[i] 105 | summation <- summation + (residual * X[i, j]) 106 | } 107 | new.theta[j] <- theta[j] - (alpha / nrow(X) * summation) 108 | } 109 | theta <- new.theta 110 | } 111 | set.seed(1234) 112 | x <- runif(1000, -5, 5) 113 | y <- x + rnorm(1000) + 3 114 | intercept <- rep(1, length(x)) 115 | lm.model <- lm(y ~ x) 116 | summary(lm.model) 117 | features <- data.frame(intercept = intercept, x = x) 118 | theta <- rep(0, ncol(features)) 119 | names(theta) <- names(features) 120 | theta 121 | new.theta <- rep(0, ncol(features)) 122 | names(new.theta) <- names(features) 123 | alpha <- 0.05 124 | # Utility functon that calculates the prediction for an observation 125 | # given the current state of the hypothesis function. 126 | #h.theta <- function(theta, observation) { 127 | # return(sum(theta * observation)) 128 | # prediction <- 0.0 129 | # for(i in 1:length(theta)) { 130 | # prediction <- prediction + (theta[i] * observation[i]) 131 | # } 132 | # return(prediction) 133 | #} 134 | # for(k in 1:2){ 135 | # m <- ncol(features) 136 | # 137 | # for(j in 1:m) { 138 | # n <- nrow(features) 139 | # summation <- 0.0 140 | # 141 | # for(i in 1:n) { 142 | # # prediction <- h.theta(theta, features[i,]) 143 | # prediction <- sum(theta * features[i,]) 144 | # residual <- prediction - y[i] 145 | # update.value <- residual * features[i, j] 146 | # summation <- summation + update.value 147 | # } 148 | # 149 | # new.theta[j] <- theta[j] + (alpha * summation) 150 | # } 151 | # 152 | # theta <- new.theta 153 | # 154 | # # print(theta) 155 | # } 156 | # 157 | # print(theta) 158 | iterations <- 250 159 | X <- features 160 | for(k in 1:iterations) { 161 | for(j in 1:ncol(X)) { 162 | summation <- 0 163 | for(i in 1:nrow(X)) { 164 | residual <- sum(X[i,] * theta[j]) - y[i] 165 | summation <- summation + (residual * X[i, j]) 166 | } 167 | new.theta[j] <- theta[j] - (alpha / nrow(X) * summation) 168 | } 169 | theta <- new.theta 170 | } 171 | set.seed(1234) 172 | x <- runif(1000, -5, 5) 173 | y <- x + rnorm(1000) + 3 174 | intercept <- rep(1, length(x)) 175 | lm.model <- lm(y ~ x) 176 | summary(lm.model) 177 | features <- data.frame(intercept = intercept, x = x) 178 | theta <- rep(0, ncol(features)) 179 | names(theta) <- names(features) 180 | theta 181 | new.theta <- rep(0, ncol(features)) 182 | names(new.theta) <- names(features) 183 | alpha <- 0.05 184 | # Utility functon that calculates the prediction for an observation 185 | # given the current state of the hypothesis function. 
186 | #h.theta <- function(theta, observation) { 187 | # return(sum(theta * observation)) 188 | # prediction <- 0.0 189 | # for(i in 1:length(theta)) { 190 | # prediction <- prediction + (theta[i] * observation[i]) 191 | # } 192 | # return(prediction) 193 | #} 194 | # for(k in 1:2){ 195 | # m <- ncol(features) 196 | # 197 | # for(j in 1:m) { 198 | # n <- nrow(features) 199 | # summation <- 0.0 200 | # 201 | # for(i in 1:n) { 202 | # # prediction <- h.theta(theta, features[i,]) 203 | # prediction <- sum(theta * features[i,]) 204 | # residual <- prediction - y[i] 205 | # update.value <- residual * features[i, j] 206 | # summation <- summation + update.value 207 | # } 208 | # 209 | # new.theta[j] <- theta[j] + (alpha * summation) 210 | # } 211 | # 212 | # theta <- new.theta 213 | # 214 | # # print(theta) 215 | # } 216 | # 217 | # print(theta) 218 | iterations <- 300 219 | X <- features 220 | for(k in 1:iterations) { 221 | for(j in 1:ncol(X)) { 222 | summation <- 0 223 | for(i in 1:nrow(X)) { 224 | residual <- sum(X[i,] * theta[j]) - y[i] 225 | summation <- summation + (residual * X[i, j]) 226 | } 227 | new.theta[j] <- theta[j] - (alpha / nrow(X) * summation) 228 | } 229 | theta <- new.theta 230 | } 231 | set.seed(1234) 232 | x <- runif(1000, -5, 5) 233 | y <- x + rnorm(1000) + 3 234 | intercept <- rep(1, length(x)) 235 | lm.model <- lm(y ~ x) 236 | summary(lm.model) 237 | features <- data.frame(intercept = intercept, x = x) 238 | theta <- rep(0, ncol(features)) 239 | names(theta) <- names(features) 240 | theta 241 | new.theta <- rep(0, ncol(features)) 242 | names(new.theta) <- names(features) 243 | alpha <- 0.05 244 | # Utility functon that calculates the prediction for an observation 245 | # given the current state of the hypothesis function. 246 | #h.theta <- function(theta, observation) { 247 | # return(sum(theta * observation)) 248 | # prediction <- 0.0 249 | # for(i in 1:length(theta)) { 250 | # prediction <- prediction + (theta[i] * observation[i]) 251 | # } 252 | # return(prediction) 253 | #} 254 | # for(k in 1:2){ 255 | # m <- ncol(features) 256 | # 257 | # for(j in 1:m) { 258 | # n <- nrow(features) 259 | # summation <- 0.0 260 | # 261 | # for(i in 1:n) { 262 | # # prediction <- h.theta(theta, features[i,]) 263 | # prediction <- sum(theta * features[i,]) 264 | # residual <- prediction - y[i] 265 | # update.value <- residual * features[i, j] 266 | # summation <- summation + update.value 267 | # } 268 | # 269 | # new.theta[j] <- theta[j] + (alpha * summation) 270 | # } 271 | # 272 | # theta <- new.theta 273 | # 274 | # # print(theta) 275 | # } 276 | # 277 | # print(theta) 278 | iterations <- 500 279 | X <- features 280 | for(k in 1:iterations) { 281 | for(j in 1:ncol(X)) { 282 | summation <- 0 283 | for(i in 1:nrow(X)) { 284 | residual <- sum(X[i,] * theta[j]) - y[i] 285 | summation <- summation + (residual * X[i, j]) 286 | } 287 | new.theta[j] <- theta[j] - (alpha / nrow(X) * summation) 288 | } 289 | theta <- new.theta 290 | } 291 | data(iria) 292 | data(iris) 293 | install.packages(c("lmtest", "mgcv", "nlme")) 294 | data("iris") 295 | library(GGally) 296 | ggpairs(iris) 297 | remove.packages("tibble") 298 | library(GGally) 299 | remove.packages("GGally") 300 | remove.packages("plotly") 301 | install.pacakges("GGally") 302 | install.packages("GGally") 303 | library(GGally) 304 | install.packages(tibble) 305 | install.packages("tibble") 306 | library(GGally) 307 | data("iris") 308 | ggpairs(iris) 309 | debugSource('~/Dropbox/AmsterdamBootcamp/GradientDescentExample.R', echo=TRUE) 310 | 
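# --- Editor's sketch (not part of the original .Rhistory) --------------------
# The gradient descent attempts recorded above compute the prediction as
# sum(X[i,] * theta[j]), which uses only the j-th coefficient. The commented-out
# h.theta() helper earlier in this history shows the intended hypothesis,
# sum(theta * observation), i.e. the full dot product. A minimal corrected
# batch gradient descent, assuming the features, y and lm.model objects already
# created in this session, would look like this:
X <- as.matrix(features)          # numeric matrix so X[i, ] is a plain vector
theta <- rep(0, ncol(X))
new.theta <- theta
alpha <- 0.05
iterations <- 500
for (k in 1:iterations) {
  for (j in 1:ncol(X)) {
    summation <- 0
    for (i in 1:nrow(X)) {
      residual <- sum(X[i, ] * theta) - y[i]   # full dot product, not theta[j] alone
      summation <- summation + residual * X[i, j]
    }
    new.theta[j] <- theta[j] - (alpha / nrow(X)) * summation
  }
  theta <- new.theta                           # simultaneous update of all coefficients
}
theta                                          # should be close to coef(lm.model)
# ------------------------------------------------------------------------------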
debugSource('~/Dropbox/AmsterdamBootcamp/GradientDescentExample.R', echo=TRUE) 311 | # 312 | # Copyright 2017 Dave Langer 313 | # 314 | # Licensed under the Apache License, Version 2.0 (the "License"); 315 | # you may not use this file except in compliance with the License. 316 | # You may obtain a copy of the License at 317 | # 318 | # http://www.apache.org/licenses/LICENSE-2.0 319 | # 320 | # Unless required by applicable law or agreed to in writing, software 321 | # distributed under the License is distributed on an "AS IS" BASIS, 322 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 323 | # See the License for the specific language governing permissions and 324 | # limitations under the License. 325 | # 326 | # 327 | # This R source code file corresponds to video 10 of the YouTube series 328 | # "R Programming for Excel Users" located at the following URL: 329 | # https://youtu.be/gYt05xI2Fm8 330 | # 331 | #=========================================================================== 332 | # Numeric Vectors 333 | # 334 | # Create a vector of integer values 335 | my_vector <- 1:10 336 | my_vector 337 | # Inspect the vector more closely 338 | class(my_vector) 339 | str(my_vector) 340 | summary(my_vector) 341 | # Add 1 to each value of the vector 342 | my_vector_plus1 <- my_vector + 1 343 | my_vector_plus1 344 | # Divide each value of the vector by 2 345 | half_my_vector <- my_vector / 2 346 | half_my_vector 347 | # Make the vector whole again 348 | whole_my_vector <- half_my_vector + half_my_vector 349 | whole_my_vector 350 | # Square the value of each vector 351 | my_vector_squared1 <- my_vector * my_vector 352 | my_vector_squared1 353 | # Square the value of each vector 354 | my_vector_squared2 <- my_vector ^ 2 355 | my_vector_squared2 356 | # Take the square root of each value 357 | sqrt_my_vector <- sqrt(my_vector) 358 | sqrt_my_vector 359 | # More vectorized functions 360 | sum(my_vector) 361 | mean(my_vector) 362 | sd(my_vector) 363 | #=========================================================================== 364 | # Logical Vectors 365 | # 366 | # Which values are greater than 3.5? 367 | larger_than_3.5 <- my_vector > 3.5 368 | larger_than_3.5 369 | # Inspect vector more closely 370 | class(larger_than_3.5) 371 | str(larger_than_3.5) 372 | summary(larger_than_3.5) 373 | # Grab only the values larger than 3.5 374 | my_vector2 <- my_vector[larger_than_3.5] 375 | my_vector2 376 | # Grab only the values larger than 3.5 377 | my_vector3 <- my_vector[my_vector > 3.5] 378 | my_vector3 379 | # Grow the vector 380 | my_bigger_vector <- c(my_vector, 11:15, 16, 17, 18, 19, 20) 381 | my_bigger_vector 382 | # How big is it now? 383 | length(my_bigger_vector) 384 | dim(my_bigger_vector) 385 | #=========================================================================== 386 | # String Vectors 387 | # 388 | # Create a vector of strings 389 | force_users <- c("Yoda", "Darth Vader", "Obi Wan", "Mace Windu", 390 | "Darth Maul", "Luke Skywalker", "Darth Sidious") 391 | # Inspect vector more closely 392 | class(force_users) 393 | str(force_users) 394 | summary(force_users) 395 | # Add 1 to string vector 396 | force_users + 1 397 | # Add another force user 398 | force_users <- force_users + "Kylo Ren" 399 | # Add more force users 400 | more_force_users <- c(force_users, "Qui-Gon Jinn", "Darth Tyranus") 401 | more_force_users 402 | # How big is the vector? 403 | length(more_force_users) 404 | # How long is each string in the vector? 
405 | name_lengths <- nchar(more_force_users) 406 | name_lengths 407 | #=========================================================================== 408 | # Missing Values 409 | # 410 | # Build a vector with missing values 411 | birthplaces <- c(NA, "Tatooine", "Stewjon", "Haruun Kal", "Dathomir", 412 | "Polis Massa", "Naboo", "Coruscant", "Serenno") 413 | birthplaces 414 | # Inspect closer 415 | class(birthplaces) 416 | str(birthplaces) 417 | summary(birthplaces) 418 | # Vectorized operation 419 | is.na(birthplaces) 420 | nchar(birthplaces) 421 | nchar("") 422 | # Logical operations 423 | birthplaces[!is.na(birthplaces)] 424 | #=========================================================================== 425 | # Factor Vectors 426 | # 427 | # Create factor (categorical) vector 428 | affiliation <- as.factor(c("Jedi", "Sith", "Rogue")) 429 | affiliation 430 | # Inspect 431 | class(affiliation) 432 | str(affiliation) 433 | summary(affiliation) 434 | levels(affiliation) 435 | # Explore representations 436 | as.numeric(affiliation) 437 | as.character(affiliation) 438 | #=========================================================================== 439 | # Data Frames 440 | # 441 | star_wars <- data.frame(id = 1:length(more_force_users), 442 | more_force_users, 443 | birthplaces = as.factor(birthplaces), 444 | affiliation = c("Jedi", "Sith", 445 | "Jedi", "Jedi", 446 | "Sith", "Jedi", 447 | "Sith", "Jedi", 448 | "Sith"), 449 | stringsAsFactors = FALSE) 450 | # Inspect 451 | View(star_wars) 452 | head(star_wars) 453 | summary(star_wars) 454 | str(star_wars) 455 | # Set up factors 456 | star_wars$affiliation <- as.factor(star_wars$affiliation) 457 | # Reinspect 458 | str(star_wars) 459 | # Additional slicing syntax 460 | star_wars$more_force_users[3] 461 | star_wars$more_force_users[star_wars$affiliation == "Sith"] 462 | # Load-up some built in data 463 | data(iris) 464 | data(mtcars) 465 | # Get help on built-in data 466 | ?mtcars 467 | # Understand the shape of a data frame 468 | nrow(mtcars) 469 | ncol(mtcars) 470 | dim(mtcars) 471 | # Understand the metadata of a data frame 472 | names(mtcars) 473 | names(mtcars)[3] 474 | colnames(mtcars) 475 | colnames(mtcars)[3:5] 476 | rownames(mtcars) 477 | rownames(mtcars)[c(3, 4, 5)] 478 | # Cool RStudio feature - spreadsheet view of a data frame 479 | View(mtcars) 480 | # See a few rows at the top and bottom of a data frame 481 | head(mtcars) 482 | tail(mtcars) 483 | # All-up view of a data frame 484 | summary(mtcars) 485 | # Understand the data type of a data frame 486 | class(mtcars) 487 | str(mtcars) 488 | setwd("~/Dropbox/DataScienceDojo/IntroToTextAnalyticsWithR") 489 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 490 | View(spam.raw) 491 | # Clean up the data frame and view our handiwork. 492 | spam.raw <- spam.raw[, 1:2] 493 | names(spam.raw) <- c("Label", "Text") 494 | View(spam.raw) 495 | # Check data to see if there are missing values. 496 | length(which(!complete.cases(spam.raw))) 497 | # Convert our class label into a factor. 498 | spam.raw$Label <- as.factor(spam.raw$Label) 499 | # The first step, as always, is to explore the data. 500 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 501 | prop.table(table(spam.raw$Label)) 502 | # Next up, let's get a feel for the distribution of text lengths of the SMS 503 | # messages by adding a new feature for the length of each message. 
504 | spam.raw$TextLength <- nchar(spam.raw$Text) 505 | summary(spam.raw$TextLength) 506 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 507 | library(ggplot2) 508 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 509 | theme_bw() + 510 | geom_histogram(binwidth = 5) + 511 | labs(y = "Text Count", x = "Length of Text", 512 | title = "Distribution of Text Lengths with Class Labels") 513 | -------------------------------------------------------------------------------- /IntroToTextAnalyticsWithR_Part1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/IntroToTextAnalyticsWithR_Part1.pdf -------------------------------------------------------------------------------- /IntroToTextAnalyticsWithR_Part10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/IntroToTextAnalyticsWithR_Part10.pdf -------------------------------------------------------------------------------- /IntroToTextAnalyticsWithR_Part12.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/IntroToTextAnalyticsWithR_Part12.pdf -------------------------------------------------------------------------------- /IntroToTextAnalyticsWithR_Part2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/IntroToTextAnalyticsWithR_Part2.pdf -------------------------------------------------------------------------------- /IntroToTextAnalyticsWithR_Part5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/IntroToTextAnalyticsWithR_Part5.pdf -------------------------------------------------------------------------------- /IntroToTextAnalyticsWithR_Part6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/IntroToTextAnalyticsWithR_Part6.pdf -------------------------------------------------------------------------------- /IntroToTextAnalyticsWithR_Part7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/IntroToTextAnalyticsWithR_Part7.pdf -------------------------------------------------------------------------------- /IntroToTextAnalyticsWithR_Part9.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/IntroToTextAnalyticsWithR_Part9.pdf -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part1.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the 
"License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 1 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | 39 | # Clean up the data frame and view our handiwork. 40 | spam.raw <- spam.raw[, 1:2] 41 | names(spam.raw) <- c("Label", "Text") 42 | View(spam.raw) 43 | 44 | 45 | 46 | # Check data to see if there are missing values. 47 | length(which(!complete.cases(spam.raw))) 48 | 49 | 50 | 51 | # Convert our class label into a factor. 52 | spam.raw$Label <- as.factor(spam.raw$Label) 53 | 54 | 55 | 56 | # The first step, as always, is to explore the data. 57 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 58 | prop.table(table(spam.raw$Label)) 59 | 60 | 61 | 62 | # Next up, let's get a feel for the distribution of text lengths of the SMS 63 | # messages by adding a new feature for the length of each message. 64 | spam.raw$TextLength <- nchar(spam.raw$Text) 65 | summary(spam.raw$TextLength) 66 | 67 | 68 | 69 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 70 | library(ggplot2) 71 | 72 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 73 | theme_bw() + 74 | geom_histogram(binwidth = 5) + 75 | labs(y = "Text Count", x = "Length of Text", 76 | title = "Distribution of Text Lengths with Class Labels") 77 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part10.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 10 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=7cwBhWYHgsA 23 | # 24 | 25 | 26 | # Install all required packages. 
27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | # Clean up the data frame and view our handiwork. 39 | spam.raw <- spam.raw[, 1:2] 40 | names(spam.raw) <- c("Label", "Text") 41 | View(spam.raw) 42 | 43 | 44 | 45 | # Check data to see if there are missing values. 46 | length(which(!complete.cases(spam.raw))) 47 | 48 | 49 | 50 | # Convert our class label into a factor. 51 | spam.raw$Label <- as.factor(spam.raw$Label) 52 | 53 | 54 | 55 | # The first step, as always, is to explore the data. 56 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 57 | prop.table(table(spam.raw$Label)) 58 | 59 | 60 | 61 | # Next up, let's get a feel for the distribution of text lengths of the SMS 62 | # messages by adding a new feature for the length of each message. 63 | spam.raw$TextLength <- nchar(spam.raw$Text) 64 | summary(spam.raw$TextLength) 65 | 66 | 67 | 68 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 69 | library(ggplot2) 70 | 71 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 72 | theme_bw() + 73 | geom_histogram(binwidth = 5) + 74 | labs(y = "Text Count", x = "Length of Text", 75 | title = "Distribution of Text Lengths with Class Labels") 76 | 77 | 78 | 79 | # At a minimum we need to split our data into a training set and a 80 | # test set. In a true project we would want to use a three-way split 81 | # of training, validation, and test. 82 | # 83 | # As we know that our data has non-trivial class imbalance, we'll 84 | # use the mighty caret package to create a randomg train/test split 85 | # that ensures the correct ham/spam class label proportions (i.e., 86 | # we'll use caret for a random stratified split). 87 | library(caret) 88 | help(package = "caret") 89 | 90 | 91 | # Use caret to create a 70%/30% stratified split. Set the random 92 | # seed for reproducibility. 93 | set.seed(32984) 94 | indexes <- createDataPartition(spam.raw$Label, times = 1, 95 | p = 0.7, list = FALSE) 96 | 97 | train <- spam.raw[indexes,] 98 | test <- spam.raw[-indexes,] 99 | 100 | 101 | # Verify proportions. 102 | prop.table(table(train$Label)) 103 | prop.table(table(test$Label)) 104 | 105 | 106 | 107 | # Text analytics requires a lot of data exploration, data pre-processing 108 | # and data wrangling. Let's explore some examples. 109 | 110 | # HTML-escaped ampersand character. 111 | train$Text[21] 112 | 113 | 114 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 115 | # is an actual person, but we will ignore the implications of this for 116 | # this introductory tutorial. 117 | train$Text[38] 118 | 119 | 120 | # A URL. 121 | train$Text[357] 122 | 123 | 124 | 125 | # There are many packages in the R ecosystem for performing text 126 | # analytics. One of the newer packages in quanteda. The quanteda 127 | # package has many useful functions for quickly and easily working 128 | # with text data. 129 | library(quanteda) 130 | help(package = "quanteda") 131 | 132 | 133 | # Tokenize SMS text messages. 134 | train.tokens <- tokens(train$Text, what = "word", 135 | remove_numbers = TRUE, remove_punct = TRUE, 136 | remove_symbols = TRUE, remove_hyphens = TRUE) 137 | 138 | # Take a look at a specific SMS message and see how it transforms. 
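# Editor's compatibility note (not part of the original script): the tokens()
# call above uses the 2017-era quanteda argument names. On newer quanteda
# releases (2.x and later) remove_hyphens was renamed, and an assumed-equivalent
# call would be something like:
#
#   train.tokens <- tokens(train$Text, what = "word",
#                          remove_numbers = TRUE, remove_punct = TRUE,
#                          remove_symbols = TRUE, split_hyphens = TRUE)
#
# Check ?tokens for the exact arguments of the version you have installed; the
# rest of the pipeline below is unchanged.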
139 | train.tokens[[357]] 140 | 141 | 142 | # Lower case the tokens. 143 | train.tokens <- tokens_tolower(train.tokens) 144 | train.tokens[[357]] 145 | 146 | 147 | # Use quanteda's built-in stopword list for English. 148 | # NOTE - You should always inspect stopword lists for applicability to 149 | # your problem/domain. 150 | train.tokens <- tokens_select(train.tokens, stopwords(), 151 | selection = "remove") 152 | train.tokens[[357]] 153 | 154 | 155 | # Perform stemming on the tokens. 156 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 157 | train.tokens[[357]] 158 | 159 | 160 | # Create our first bag-of-words model. 161 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 162 | 163 | 164 | # Transform to a matrix and inspect. 165 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 166 | View(train.tokens.matrix[1:20, 1:100]) 167 | dim(train.tokens.matrix) 168 | 169 | 170 | # Investigate the effects of stemming. 171 | colnames(train.tokens.matrix)[1:50] 172 | 173 | 174 | # Per best practices, we will leverage cross validation (CV) as 175 | # the basis of our modeling process. Using CV we can create 176 | # estimates of how well our model will do in Production on new, 177 | # unseen data. CV is powerful, but the downside is that it 178 | # requires more processing and therefore more time. 179 | # 180 | # If you are not familiar with CV, consult the following 181 | # Wikipedia article: 182 | # 183 | # https://en.wikipedia.org/wiki/Cross-validation_(statistics) 184 | # 185 | 186 | # Setup a the feature data frame with labels. 187 | train.tokens.df <- cbind(Label = train$Label, data.frame(train.tokens.dfm)) 188 | 189 | 190 | # Often, tokenization requires some additional pre-processing 191 | names(train.tokens.df)[c(146, 148, 235, 238)] 192 | 193 | 194 | # Cleanup column names. 195 | names(train.tokens.df) <- make.names(names(train.tokens.df)) 196 | 197 | 198 | # Use caret to create stratified folds for 10-fold cross validation repeated 199 | # 3 times (i.e., create 30 random stratified samples) 200 | set.seed(48743) 201 | cv.folds <- createMultiFolds(train$Label, k = 10, times = 3) 202 | 203 | cv.cntrl <- trainControl(method = "repeatedcv", number = 10, 204 | repeats = 3, index = cv.folds) 205 | 206 | 207 | # Our data frame is non-trivial in size. As such, CV runs will take 208 | # quite a long time to run. To cut down on total execution time, use 209 | # the doSNOW package to allow for multi-core training in parallel. 210 | # 211 | # WARNING - The following code is configured to run on a workstation- 212 | # or server-class machine (i.e., 12 logical cores). Alter 213 | # code to suit your HW environment. 214 | # 215 | #install.packages("doSNOW") 216 | library(doSNOW) 217 | 218 | 219 | # Time the code execution 220 | start.time <- Sys.time() 221 | 222 | 223 | # Create a cluster to work on 10 logical cores. 224 | cl <- makeCluster(10, type = "SOCK") 225 | registerDoSNOW(cl) 226 | 227 | 228 | # As our data is non-trivial in size at this point, use a single decision 229 | # tree alogrithm as our first model. We will graduate to using more 230 | # powerful algorithms later when we perform feature extraction to shrink 231 | # the size of our data. 232 | rpart.cv.1 <- train(Label ~ ., data = train.tokens.df, method = "rpart", 233 | trControl = cv.cntrl, tuneLength = 7) 234 | 235 | 236 | # Processing is done, stop cluster. 237 | stopCluster(cl) 238 | 239 | 240 | # Total time of execution on workstation was approximately 4 minutes. 
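# Editor's note (not part of the original script): the cluster above is
# hard-coded to 10 logical cores per the workstation warning earlier. When
# adapting this to your own machine, one option is to size the cluster from
# the hardware rather than a fixed number, e.g.:
#
#   n.cores <- max(1, parallel::detectCores() - 1)
#   cl <- makeCluster(n.cores, type = "SOCK")
#   registerDoSNOW(cl)
#
# and keep the matching stopCluster(cl) call once training finishes.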
241 | total.time <- Sys.time() - start.time 242 | total.time 243 | 244 | 245 | # Check out our results. 246 | rpart.cv.1 247 | 248 | 249 | 250 | # The use of Term Frequency-Inverse Document Frequency (TF-IDF) is a 251 | # powerful technique for enhancing the information/signal contained 252 | # within our document-frequency matrix. Specifically, the mathematics 253 | # behind TF-IDF accomplish the following goals: 254 | # 1 - The TF calculation accounts for the fact that longer 255 | # documents will have higher individual term counts. Applying 256 | # TF normalizes all documents in the corpus to be length 257 | # independent. 258 | # 2 - The IDF calculation accounts for the frequency of term 259 | # appearance in all documents in the corpus. The intuition 260 | # being that a term that appears in every document has no 261 | # predictive power. 262 | # 3 - The multiplication of TF by IDF for each cell in the matrix 263 | # allows for weighting of #1 and #2 for each cell in the matrix. 264 | 265 | 266 | # Our function for calculating relative term frequency (TF) 267 | term.frequency <- function(row) { 268 | row / sum(row) 269 | } 270 | 271 | # Our function for calculating inverse document frequency (IDF) 272 | inverse.doc.freq <- function(col) { 273 | corpus.size <- length(col) 274 | doc.count <- length(which(col > 0)) 275 | 276 | log10(corpus.size / doc.count) 277 | } 278 | 279 | # Our function for calculating TF-IDF. 280 | tf.idf <- function(x, idf) { 281 | x * idf 282 | } 283 | 284 | 285 | # First step, normalize all documents via TF. 286 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 287 | dim(train.tokens.df) 288 | View(train.tokens.df[1:20, 1:100]) 289 | 290 | 291 | # Second step, calculate the IDF vector that we will use - both 292 | # for training data and for test data! 293 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 294 | str(train.tokens.idf) 295 | 296 | 297 | # Lastly, calculate TF-IDF for our training corpus. 298 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, idf = train.tokens.idf) 299 | dim(train.tokens.tfidf) 300 | View(train.tokens.tfidf[1:25, 1:25]) 301 | 302 | 303 | # Transpose the matrix 304 | train.tokens.tfidf <- t(train.tokens.tfidf) 305 | dim(train.tokens.tfidf) 306 | View(train.tokens.tfidf[1:25, 1:25]) 307 | 308 | 309 | # Check for incopmlete cases. 310 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 311 | train$Text[incomplete.cases] 312 | 313 | 314 | # Fix incomplete cases 315 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 316 | dim(train.tokens.tfidf) 317 | sum(which(!complete.cases(train.tokens.tfidf))) 318 | 319 | 320 | # Make a clean data frame using the same process as before. 321 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 322 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 323 | 324 | 325 | # Time the code execution 326 | start.time <- Sys.time() 327 | 328 | # Create a cluster to work on 10 logical cores. 329 | cl <- makeCluster(3, type = "SOCK") 330 | registerDoSNOW(cl) 331 | 332 | # As our data is non-trivial in size at this point, use a single decision 333 | # tree alogrithm as our first model. We will graduate to using more 334 | # powerful algorithms later when we perform feature extraction to shrink 335 | # the size of our data. 
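# Editor's sketch (not part of the original script): before the next CV run
# below, a tiny sanity check of the term.frequency / inverse.doc.freq / tf.idf
# helpers defined above, using a made-up 3-document count matrix.
toy <- matrix(c(2, 1, 0,
                0, 1, 1,
                1, 3, 0),
              nrow = 3, byrow = TRUE,
              dimnames = list(paste0("doc", 1:3), c("free", "call", "now")))
toy.tf    <- apply(toy, 1, term.frequency)               # note: result is terms x docs
toy.idf   <- apply(toy, 2, inverse.doc.freq)             # one IDF value per term
toy.tfidf <- t(apply(toy.tf, 2, tf.idf, idf = toy.idf))  # back to docs x terms
toy.tfidf
# "call" appears in every toy document, so its IDF is log10(3/3) = 0 and its
# TF-IDF column is all zeros -- the "no predictive power" case described in the
# comment block above. The final t() mirrors the transpose applied to the real
# training matrix, since apply() returns its results column-wise.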
336 | rpart.cv.2 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 337 | trControl = cv.cntrl, tuneLength = 7) 338 | 339 | # Processing is done, stop cluster. 340 | stopCluster(cl) 341 | 342 | # Total time of execution on workstation was 343 | total.time <- Sys.time() - start.time 344 | total.time 345 | 346 | # Check out our results. 347 | rpart.cv.2 348 | 349 | 350 | 351 | # N-grams allow us to augment our document-term frequency matrices with 352 | # word ordering. This often leads to increased performance (e.g., accuracy) 353 | # for machine learning models trained with more than just unigrams (i.e., 354 | # single terms). Let's add bigrams to our training data and the TF-IDF 355 | # transform the expanded featre matrix to see if accuracy improves. 356 | 357 | # Add bigrams to our feature matrix. 358 | train.tokens <- tokens_ngrams(train.tokens, n = 1:2) 359 | train.tokens[[357]] 360 | 361 | 362 | # Transform to dfm and then a matrix. 363 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 364 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 365 | train.tokens.dfm 366 | 367 | 368 | # Normalize all documents via TF. 369 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 370 | 371 | 372 | # Calculate the IDF vector that we will use for training and test data! 373 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 374 | 375 | 376 | # Calculate TF-IDF for our training corpus 377 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, 378 | idf = train.tokens.idf) 379 | 380 | 381 | # Transpose the matrix 382 | train.tokens.tfidf <- t(train.tokens.tfidf) 383 | 384 | 385 | # Fix incomplete cases 386 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 387 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 388 | 389 | 390 | # Make a clean data frame. 391 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 392 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 393 | 394 | 395 | # Clean up unused objects in memory. 396 | gc() 397 | 398 | 399 | 400 | 401 | # 402 | # NOTE - The following code requires the use of command-line R to execute 403 | # due to the large number of features (i.e., columns) in the matrix. 404 | # Please consult the following link for more details if you wish 405 | # to run the code yourself: 406 | # 407 | # https://stackoverflow.com/questions/28728774/how-to-set-max-ppsize-in-r 408 | # 409 | # Also note that running the following code required approximately 410 | # 38GB of RAM and more than 4.5 hours to execute on a 10-core 411 | # workstation! 412 | # 413 | 414 | 415 | # Time the code execution 416 | # start.time <- Sys.time() 417 | 418 | # Leverage single decision trees to evaluate if adding bigrams improves the 419 | # the effectiveness of the model. 420 | # rpart.cv.3 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 421 | # trControl = cv.cntrl, tuneLength = 7) 422 | 423 | # Total time of execution on workstation was 424 | # total.time <- Sys.time() - start.time 425 | # total.time 426 | 427 | # Check out our results. 428 | # rpart.cv.3 429 | 430 | # 431 | # The results of the above processing show a slight decline in rpart 432 | # effectiveness with a 10-fold CV repeated 3 times accuracy of 0.9457. 433 | # As we will discuss later, while the addition of bigrams appears to 434 | # negatively impact a single decision tree, it helps with the mighty 435 | # random forest! 
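# Editor's note (not part of the original script): the memory and run-time
# figures quoted above come from the very wide bigram feature space -- the dfm
# itself is stored sparsely by quanteda, but it is converted to a dense matrix
# with as.matrix() and then carried through the TF-IDF apply() calls and caret.
# If you want to experiment with the bigram model on a smaller machine, one
# option (an assumption on my part, not something this tutorial does) is to
# drop very rare n-grams before going dense, e.g.:
#
#   train.tokens.dfm <- dfm_trim(train.tokens.dfm, min_docfreq = 2)
#
# Argument names differ a bit across quanteda versions, so check ?dfm_trim.
# Trimming changes the feature set, so accuracy figures will not match the
# numbers quoted in these comments exactly.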
436 | # 437 | 438 | 439 | 440 | 441 | # We'll leverage the irlba package for our singular value 442 | # decomposition (SVD). The irlba package allows us to specify 443 | # the number of the most important singular vectors we wish to 444 | # calculate and retain for features. 445 | library(irlba) 446 | 447 | 448 | # Time the code execution 449 | start.time <- Sys.time() 450 | 451 | # Perform SVD. Specifically, reduce dimensionality down to 300 columns 452 | # for our latent semantic analysis (LSA). 453 | train.irlba <- irlba(t(train.tokens.tfidf), nv = 300, maxit = 600) 454 | 455 | # Total time of execution on workstation was 456 | total.time <- Sys.time() - start.time 457 | total.time 458 | 459 | 460 | # Take a look at the new feature data up close. 461 | View(train.irlba$v) 462 | 463 | 464 | # As with TF-IDF, we will need to project new data (e.g., the test data) 465 | # into the SVD semantic space. The following code illustrates how to do 466 | # this using a row of the training data that has already been transformed 467 | # by TF-IDF, per the mathematics illustrated in the slides. 468 | # 469 | # 470 | sigma.inverse <- 1 / train.irlba$d 471 | u.transpose <- t(train.irlba$u) 472 | document <- train.tokens.tfidf[1,] 473 | document.hat <- sigma.inverse * u.transpose %*% document 474 | 475 | # Look at the first 10 components of projected document and the corresponding 476 | # row in our document semantic space (i.e., the V matrix) 477 | document.hat[1:10] 478 | train.irlba$v[1, 1:10] 479 | 480 | 481 | 482 | # 483 | # Create new feature data frame using our document semantic space of 300 484 | # features (i.e., the V matrix from our SVD). 485 | # 486 | train.svd <- data.frame(Label = train$Label, train.irlba$v) 487 | 488 | 489 | # Create a cluster to work on 10 logical cores. 490 | cl <- makeCluster(10, type = "SOCK") 491 | registerDoSNOW(cl) 492 | 493 | # Time the code execution 494 | start.time <- Sys.time() 495 | 496 | # This will be the last run using single decision trees. With a much smaller 497 | # feature matrix we can now use more powerful methods like the mighty Random 498 | # Forest from now on! 499 | rpart.cv.4 <- train(Label ~ ., data = train.svd, method = "rpart", 500 | trControl = cv.cntrl, tuneLength = 7) 501 | 502 | # Processing is done, stop cluster. 503 | stopCluster(cl) 504 | 505 | # Total time of execution on workstation was 506 | total.time <- Sys.time() - start.time 507 | total.time 508 | 509 | # Check out our results. 510 | rpart.cv.4 511 | 512 | 513 | 514 | 515 | # 516 | # NOTE - The following code takes a long time to run. Here's the math. 517 | # We are performing 10-fold CV repeated 3 times. That means we 518 | # need to build 30 models. We are also asking caret to try 7 519 | # different values of the mtry parameter. Next up by default 520 | # a mighty random forest leverages 500 trees. Lastly, caret will 521 | # build 1 final model at the end of the process with the best 522 | # mtry value over all the training data. Here's the number of 523 | # tree we're building: 524 | # 525 | # (10 * 3 * 7 * 500) + 500 = 105,500 trees! 526 | # 527 | # On a workstation using 10 cores the following code took 28 minutes 528 | # to execute. 529 | # 530 | 531 | 532 | # Create a cluster to work on 10 logical cores. 533 | # cl <- makeCluster(10, type = "SOCK") 534 | # registerDoSNOW(cl) 535 | 536 | # Time the code execution 537 | # start.time <- Sys.time() 538 | 539 | # We have reduced the dimensionality of our data using SVD. 
Also, the 540 | # application of SVD allows us to use LSA to simultaneously increase the 541 | # information density of each feature. To prove this out, leverage a 542 | # mighty Random Forest with the default of 500 trees. We'll also ask 543 | # caret to try 7 different values of mtry to find the mtry value that 544 | # gives the best result! 545 | # rf.cv.1 <- train(Label ~ ., data = train.svd, method = "rf", 546 | # trControl = cv.cntrl, tuneLength = 7) 547 | 548 | # Processing is done, stop cluster. 549 | # stopCluster(cl) 550 | 551 | # Total time of execution on workstation was 552 | # total.time <- Sys.time() - start.time 553 | # total.time 554 | 555 | 556 | # Load processing results from disk! 557 | load("rf.cv.1.RData") 558 | 559 | # Check out our results. 560 | rf.cv.1 561 | 562 | # Let's drill-down on the results. 563 | confusionMatrix(train.svd$Label, rf.cv.1$finalModel$predicted) 564 | 565 | 566 | 567 | 568 | 569 | # OK, now let's add in the feature we engineered previously for SMS 570 | # text length to see if it improves things. 571 | train.svd$TextLength <- train$TextLength 572 | 573 | 574 | # Create a cluster to work on 10 logical cores. 575 | # cl <- makeCluster(10, type = "SOCK") 576 | # registerDoSNOW(cl) 577 | 578 | # Time the code execution 579 | # start.time <- Sys.time() 580 | 581 | # Re-run the training process with the additional feature. 582 | # rf.cv.2 <- train(Label ~ ., data = train.svd, method = "rf", 583 | # trControl = cv.cntrl, tuneLength = 7, 584 | # importance = TRUE) 585 | 586 | # Processing is done, stop cluster. 587 | # stopCluster(cl) 588 | 589 | # Total time of execution on workstation was 590 | # total.time <- Sys.time() - start.time 591 | # total.time 592 | 593 | # Load results from disk. 594 | load("rf.cv.2.RData") 595 | 596 | # Check the results. 597 | rf.cv.2 598 | 599 | # Drill-down on the results. 600 | confusionMatrix(train.svd$Label, rf.cv.2$finalModel$predicted) 601 | 602 | # How important was the new feature? 603 | library(randomForest) 604 | varImpPlot(rf.cv.1$finalModel) 605 | varImpPlot(rf.cv.2$finalModel) 606 | 607 | 608 | 609 | 610 | # Turns out that our TextLength feature is very predictive and pushed our 611 | # overall accuracy over the training data to 97.1%. We can also use the 612 | # power of cosine similarity to engineer a feature for calculating, on 613 | # average, how alike each SMS text message is to all of the spam messages. 614 | # The hypothesis here is that our use of bigrams, tf-idf, and LSA have 615 | # produced a representation where ham SMS messages should have low cosine 616 | # similarities with spam SMS messages and vice versa. 617 | 618 | # Use the lsa package's cosine function for our calculations. 619 | #install.packages("lsa") 620 | library(lsa) 621 | 622 | train.similarities <- cosine(t(as.matrix(train.svd[, -c(1, ncol(train.svd))]))) 623 | 624 | 625 | # Next up - take each SMS text message and find what the mean cosine 626 | # similarity is for each SMS text mean with each of the spam SMS messages. 627 | # Per our hypothesis, ham SMS text messages should have relatively low 628 | # cosine similarities with spam messages and vice versa! 
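# Editor's note (not part of the original script): the loop just below fills in
# SpamSimilarity one message at a time. An equivalent vectorized form over the
# same train.similarities matrix is:
#
#   spam.indexes <- which(train$Label == "spam")
#   train.svd$SpamSimilarity <- rowMeans(train.similarities[, spam.indexes])
#
# which gives, for every message, its mean cosine similarity to all spam
# messages in a single call.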
629 | spam.indexes <- which(train$Label == "spam") 630 | 631 | train.svd$SpamSimilarity <- rep(0.0, nrow(train.svd)) 632 | for(i in 1:nrow(train.svd)) { 633 | train.svd$SpamSimilarity[i] <- mean(train.similarities[i, spam.indexes]) 634 | } 635 | 636 | 637 | # As always, let's visualize our results using the mighty ggplot2 638 | ggplot(train.svd, aes(x = SpamSimilarity, fill = Label)) + 639 | theme_bw() + 640 | geom_histogram(binwidth = 0.05) + 641 | labs(y = "Message Count", 642 | x = "Mean Spam Message Cosine Similarity", 643 | title = "Distribution of Ham vs. Spam Using Spam Cosine Similarity") 644 | 645 | 646 | # Per our analysis of mighty random forest results, we are interested in 647 | # in features that can raise model performance with respect to sensitivity. 648 | # Perform another CV process using the new spam cosine similarity feature. 649 | 650 | # Create a cluster to work on 10 logical cores. 651 | # cl <- makeCluster(10, type = "SOCK") 652 | # registerDoSNOW(cl) 653 | 654 | # Time the code execution 655 | # start.time <- Sys.time() 656 | 657 | # Re-run the training process with the additional feature. 658 | # rf.cv.3 <- train(Label ~ ., data = train.svd, method = "rf", 659 | # trControl = cv.cntrl, tuneLength = 7, 660 | # importance = TRUE) 661 | 662 | # Processing is done, stop cluster. 663 | # stopCluster(cl) 664 | 665 | # Total time of execution on workstation was 666 | # total.time <- Sys.time() - start.time 667 | # total.time 668 | 669 | 670 | # Load results from disk. 671 | load("rf.cv.3.RData") 672 | 673 | # Check the results. 674 | rf.cv.3 675 | 676 | # Drill-down on the results. 677 | confusionMatrix(train.svd$Label, rf.cv.3$finalModel$predicted) 678 | 679 | # How important was this feature? 680 | library(randomForest) 681 | varImpPlot(rf.cv.3$finalModel) 682 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part11.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 11 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=XWUi7RivDJY 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | # Clean up the data frame and view our handiwork. 39 | spam.raw <- spam.raw[, 1:2] 40 | names(spam.raw) <- c("Label", "Text") 41 | View(spam.raw) 42 | 43 | 44 | 45 | # Check data to see if there are missing values. 
46 | length(which(!complete.cases(spam.raw))) 47 | 48 | 49 | 50 | # Convert our class label into a factor. 51 | spam.raw$Label <- as.factor(spam.raw$Label) 52 | 53 | 54 | 55 | # The first step, as always, is to explore the data. 56 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 57 | prop.table(table(spam.raw$Label)) 58 | 59 | 60 | 61 | # Next up, let's get a feel for the distribution of text lengths of the SMS 62 | # messages by adding a new feature for the length of each message. 63 | spam.raw$TextLength <- nchar(spam.raw$Text) 64 | summary(spam.raw$TextLength) 65 | 66 | 67 | 68 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 69 | library(ggplot2) 70 | 71 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 72 | theme_bw() + 73 | geom_histogram(binwidth = 5) + 74 | labs(y = "Text Count", x = "Length of Text", 75 | title = "Distribution of Text Lengths with Class Labels") 76 | 77 | 78 | 79 | # At a minimum we need to split our data into a training set and a 80 | # test set. In a true project we would want to use a three-way split 81 | # of training, validation, and test. 82 | # 83 | # As we know that our data has non-trivial class imbalance, we'll 84 | # use the mighty caret package to create a randomg train/test split 85 | # that ensures the correct ham/spam class label proportions (i.e., 86 | # we'll use caret for a random stratified split). 87 | library(caret) 88 | help(package = "caret") 89 | 90 | 91 | # Use caret to create a 70%/30% stratified split. Set the random 92 | # seed for reproducibility. 93 | set.seed(32984) 94 | indexes <- createDataPartition(spam.raw$Label, times = 1, 95 | p = 0.7, list = FALSE) 96 | 97 | train <- spam.raw[indexes,] 98 | test <- spam.raw[-indexes,] 99 | 100 | 101 | # Verify proportions. 102 | prop.table(table(train$Label)) 103 | prop.table(table(test$Label)) 104 | 105 | 106 | 107 | # Text analytics requires a lot of data exploration, data pre-processing 108 | # and data wrangling. Let's explore some examples. 109 | 110 | # HTML-escaped ampersand character. 111 | train$Text[21] 112 | 113 | 114 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 115 | # is an actual person, but we will ignore the implications of this for 116 | # this introductory tutorial. 117 | train$Text[38] 118 | 119 | 120 | # A URL. 121 | train$Text[357] 122 | 123 | 124 | 125 | # There are many packages in the R ecosystem for performing text 126 | # analytics. One of the newer packages in quanteda. The quanteda 127 | # package has many useful functions for quickly and easily working 128 | # with text data. 129 | library(quanteda) 130 | help(package = "quanteda") 131 | 132 | 133 | # Tokenize SMS text messages. 134 | train.tokens <- tokens(train$Text, what = "word", 135 | remove_numbers = TRUE, remove_punct = TRUE, 136 | remove_symbols = TRUE, remove_hyphens = TRUE) 137 | 138 | # Take a look at a specific SMS message and see how it transforms. 139 | train.tokens[[357]] 140 | 141 | 142 | # Lower case the tokens. 143 | train.tokens <- tokens_tolower(train.tokens) 144 | train.tokens[[357]] 145 | 146 | 147 | # Use quanteda's built-in stopword list for English. 148 | # NOTE - You should always inspect stopword lists for applicability to 149 | # your problem/domain. 150 | train.tokens <- tokens_select(train.tokens, stopwords(), 151 | selection = "remove") 152 | train.tokens[[357]] 153 | 154 | 155 | # Perform stemming on the tokens. 
156 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 157 | train.tokens[[357]] 158 | 159 | 160 | # Create our first bag-of-words model. 161 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 162 | 163 | 164 | # Transform to a matrix and inspect. 165 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 166 | View(train.tokens.matrix[1:20, 1:100]) 167 | dim(train.tokens.matrix) 168 | 169 | 170 | # Investigate the effects of stemming. 171 | colnames(train.tokens.matrix)[1:50] 172 | 173 | 174 | # Per best practices, we will leverage cross validation (CV) as 175 | # the basis of our modeling process. Using CV we can create 176 | # estimates of how well our model will do in Production on new, 177 | # unseen data. CV is powerful, but the downside is that it 178 | # requires more processing and therefore more time. 179 | # 180 | # If you are not familiar with CV, consult the following 181 | # Wikipedia article: 182 | # 183 | # https://en.wikipedia.org/wiki/Cross-validation_(statistics) 184 | # 185 | 186 | # Setup a the feature data frame with labels. 187 | train.tokens.df <- cbind(Label = train$Label, data.frame(train.tokens.dfm)) 188 | 189 | 190 | # Often, tokenization requires some additional pre-processing 191 | names(train.tokens.df)[c(146, 148, 235, 238)] 192 | 193 | 194 | # Cleanup column names. 195 | names(train.tokens.df) <- make.names(names(train.tokens.df)) 196 | 197 | 198 | # Use caret to create stratified folds for 10-fold cross validation repeated 199 | # 3 times (i.e., create 30 random stratified samples) 200 | set.seed(48743) 201 | cv.folds <- createMultiFolds(train$Label, k = 10, times = 3) 202 | 203 | cv.cntrl <- trainControl(method = "repeatedcv", number = 10, 204 | repeats = 3, index = cv.folds) 205 | 206 | 207 | # Our data frame is non-trivial in size. As such, CV runs will take 208 | # quite a long time to run. To cut down on total execution time, use 209 | # the doSNOW package to allow for multi-core training in parallel. 210 | # 211 | # WARNING - The following code is configured to run on a workstation- 212 | # or server-class machine (i.e., 12 logical cores). Alter 213 | # code to suit your HW environment. 214 | # 215 | #install.packages("doSNOW") 216 | library(doSNOW) 217 | 218 | 219 | # Time the code execution 220 | start.time <- Sys.time() 221 | 222 | 223 | # Create a cluster to work on 10 logical cores. 224 | cl <- makeCluster(10, type = "SOCK") 225 | registerDoSNOW(cl) 226 | 227 | 228 | # As our data is non-trivial in size at this point, use a single decision 229 | # tree alogrithm as our first model. We will graduate to using more 230 | # powerful algorithms later when we perform feature extraction to shrink 231 | # the size of our data. 232 | rpart.cv.1 <- train(Label ~ ., data = train.tokens.df, method = "rpart", 233 | trControl = cv.cntrl, tuneLength = 7) 234 | 235 | 236 | # Processing is done, stop cluster. 237 | stopCluster(cl) 238 | 239 | 240 | # Total time of execution on workstation was approximately 4 minutes. 241 | total.time <- Sys.time() - start.time 242 | total.time 243 | 244 | 245 | # Check out our results. 246 | rpart.cv.1 247 | 248 | 249 | 250 | # The use of Term Frequency-Inverse Document Frequency (TF-IDF) is a 251 | # powerful technique for enhancing the information/signal contained 252 | # within our document-frequency matrix. 
Specifically, the mathematics 253 | # behind TF-IDF accomplish the following goals: 254 | # 1 - The TF calculation accounts for the fact that longer 255 | # documents will have higher individual term counts. Applying 256 | # TF normalizes all documents in the corpus to be length 257 | # independent. 258 | # 2 - The IDF calculation accounts for the frequency of term 259 | # appearance in all documents in the corpus. The intuition 260 | # being that a term that appears in every document has no 261 | # predictive power. 262 | # 3 - The multiplication of TF by IDF for each cell in the matrix 263 | # allows for weighting of #1 and #2 for each cell in the matrix. 264 | 265 | 266 | # Our function for calculating relative term frequency (TF) 267 | term.frequency <- function(row) { 268 | row / sum(row) 269 | } 270 | 271 | # Our function for calculating inverse document frequency (IDF) 272 | inverse.doc.freq <- function(col) { 273 | corpus.size <- length(col) 274 | doc.count <- length(which(col > 0)) 275 | 276 | log10(corpus.size / doc.count) 277 | } 278 | 279 | # Our function for calculating TF-IDF. 280 | tf.idf <- function(x, idf) { 281 | x * idf 282 | } 283 | 284 | 285 | # First step, normalize all documents via TF. 286 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 287 | dim(train.tokens.df) 288 | View(train.tokens.df[1:20, 1:100]) 289 | 290 | 291 | # Second step, calculate the IDF vector that we will use - both 292 | # for training data and for test data! 293 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 294 | str(train.tokens.idf) 295 | 296 | 297 | # Lastly, calculate TF-IDF for our training corpus. 298 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, idf = train.tokens.idf) 299 | dim(train.tokens.tfidf) 300 | View(train.tokens.tfidf[1:25, 1:25]) 301 | 302 | 303 | # Transpose the matrix 304 | train.tokens.tfidf <- t(train.tokens.tfidf) 305 | dim(train.tokens.tfidf) 306 | View(train.tokens.tfidf[1:25, 1:25]) 307 | 308 | 309 | # Check for incopmlete cases. 310 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 311 | train$Text[incomplete.cases] 312 | 313 | 314 | # Fix incomplete cases 315 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 316 | dim(train.tokens.tfidf) 317 | sum(which(!complete.cases(train.tokens.tfidf))) 318 | 319 | 320 | # Make a clean data frame using the same process as before. 321 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 322 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 323 | 324 | 325 | # Time the code execution 326 | start.time <- Sys.time() 327 | 328 | # Create a cluster to work on 10 logical cores. 329 | cl <- makeCluster(10, type = "SOCK") 330 | registerDoSNOW(cl) 331 | 332 | # As our data is non-trivial in size at this point, use a single decision 333 | # tree alogrithm as our first model. We will graduate to using more 334 | # powerful algorithms later when we perform feature extraction to shrink 335 | # the size of our data. 336 | rpart.cv.2 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 337 | trControl = cv.cntrl, tuneLength = 7) 338 | 339 | # Processing is done, stop cluster. 340 | stopCluster(cl) 341 | 342 | # Total time of execution on workstation was 343 | total.time <- Sys.time() - start.time 344 | total.time 345 | 346 | # Check out our results. 
347 | rpart.cv.2 348 | 349 | 350 | 351 | # N-grams allow us to augment our document-term frequency matrices with 352 | # word ordering. This often leads to increased performance (e.g., accuracy) 353 | # for machine learning models trained with more than just unigrams (i.e., 354 | # single terms). Let's add bigrams to our training data and the TF-IDF 355 | # transform the expanded featre matrix to see if accuracy improves. 356 | 357 | # Add bigrams to our feature matrix. 358 | train.tokens <- tokens_ngrams(train.tokens, n = 1:2) 359 | train.tokens[[357]] 360 | 361 | 362 | # Transform to dfm and then a matrix. 363 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 364 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 365 | train.tokens.dfm 366 | 367 | 368 | # Normalize all documents via TF. 369 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 370 | 371 | 372 | # Calculate the IDF vector that we will use for training and test data! 373 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 374 | 375 | 376 | # Calculate TF-IDF for our training corpus 377 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, 378 | idf = train.tokens.idf) 379 | 380 | 381 | # Transpose the matrix 382 | train.tokens.tfidf <- t(train.tokens.tfidf) 383 | 384 | 385 | # Fix incomplete cases 386 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 387 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 388 | 389 | 390 | # Make a clean data frame. 391 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 392 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 393 | 394 | 395 | # Clean up unused objects in memory. 396 | gc() 397 | 398 | 399 | 400 | 401 | # 402 | # NOTE - The following code requires the use of command-line R to execute 403 | # due to the large number of features (i.e., columns) in the matrix. 404 | # Please consult the following link for more details if you wish 405 | # to run the code yourself: 406 | # 407 | # https://stackoverflow.com/questions/28728774/how-to-set-max-ppsize-in-r 408 | # 409 | # Also note that running the following code required approximately 410 | # 38GB of RAM and more than 4.5 hours to execute on a 10-core 411 | # workstation! 412 | # 413 | 414 | 415 | # Time the code execution 416 | # start.time <- Sys.time() 417 | 418 | # Leverage single decision trees to evaluate if adding bigrams improves the 419 | # the effectiveness of the model. 420 | # rpart.cv.3 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 421 | # trControl = cv.cntrl, tuneLength = 7) 422 | 423 | # Total time of execution on workstation was 424 | # total.time <- Sys.time() - start.time 425 | # total.time 426 | 427 | # Check out our results. 428 | # rpart.cv.3 429 | 430 | # 431 | # The results of the above processing show a slight decline in rpart 432 | # effectiveness with a 10-fold CV repeated 3 times accuracy of 0.9457. 433 | # As we will discuss later, while the addition of bigrams appears to 434 | # negatively impact a single decision tree, it helps with the mighty 435 | # random forest! 436 | # 437 | 438 | 439 | 440 | 441 | # We'll leverage the irlba package for our singular value 442 | # decomposition (SVD). The irlba package allows us to specify 443 | # the number of the most important singular vectors we wish to 444 | # calculate and retain for features. 
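#
# OPTIONAL - A tiny, self-contained illustration (not in the original video)
#            of what a truncated SVD returns. On a small random matrix the
#            singular values from irlba closely match the leading singular
#            values from base svd(). Assumes the irlba package is installed.
#
library(irlba)

set.seed(42)
toy <- matrix(rnorm(200), nrow = 20)   # toy 20 x 10 "document-term" matrix

toy.irlba <- irlba(toy, nv = 3)        # keep only the top 3 singular vectors
toy.svd <- svd(toy)

# Singular values should agree to within numerical precision.
round(toy.irlba$d, 4)
round(toy.svd$d[1:3], 4)

# u is 20 x 3 and v is 10 x 3 - the reduced-dimension feature spaces.
dim(toy.irlba$u)
dim(toy.irlba$v)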
445 | library(irlba) 446 | 447 | 448 | # Time the code execution 449 | start.time <- Sys.time() 450 | 451 | # Perform SVD. Specifically, reduce dimensionality down to 300 columns 452 | # for our latent semantic analysis (LSA). 453 | train.irlba <- irlba(t(train.tokens.tfidf), nv = 300, maxit = 600) 454 | 455 | # Total time of execution on workstation was 456 | total.time <- Sys.time() - start.time 457 | total.time 458 | 459 | 460 | # Take a look at the new feature data up close. 461 | View(train.irlba$v) 462 | 463 | 464 | # As with TF-IDF, we will need to project new data (e.g., the test data) 465 | # into the SVD semantic space. The following code illustrates how to do 466 | # this using a row of the training data that has already been transformed 467 | # by TF-IDF, per the mathematics illustrated in the slides. 468 | # 469 | # 470 | sigma.inverse <- 1 / train.irlba$d 471 | u.transpose <- t(train.irlba$u) 472 | document <- train.tokens.tfidf[1,] 473 | document.hat <- sigma.inverse * u.transpose %*% document 474 | 475 | # Look at the first 10 components of projected document and the corresponding 476 | # row in our document semantic space (i.e., the V matrix) 477 | document.hat[1:10] 478 | train.irlba$v[1, 1:10] 479 | 480 | 481 | 482 | # 483 | # Create new feature data frame using our document semantic space of 300 484 | # features (i.e., the V matrix from our SVD). 485 | # 486 | train.svd <- data.frame(Label = train$Label, train.irlba$v) 487 | 488 | 489 | # Create a cluster to work on 10 logical cores. 490 | cl <- makeCluster(10, type = "SOCK") 491 | registerDoSNOW(cl) 492 | 493 | # Time the code execution 494 | start.time <- Sys.time() 495 | 496 | # This will be the last run using single decision trees. With a much smaller 497 | # feature matrix we can now use more powerful methods like the mighty Random 498 | # Forest from now on! 499 | rpart.cv.4 <- train(Label ~ ., data = train.svd, method = "rpart", 500 | trControl = cv.cntrl, tuneLength = 7) 501 | 502 | # Processing is done, stop cluster. 503 | stopCluster(cl) 504 | 505 | # Total time of execution on workstation was 506 | total.time <- Sys.time() - start.time 507 | total.time 508 | 509 | # Check out our results. 510 | rpart.cv.4 511 | 512 | 513 | 514 | 515 | # 516 | # NOTE - The following code takes a long time to run. Here's the math. 517 | # We are performing 10-fold CV repeated 3 times. That means we 518 | # need to build 30 models. We are also asking caret to try 7 519 | # different values of the mtry parameter. Next up by default 520 | # a mighty random forest leverages 500 trees. Lastly, caret will 521 | # build 1 final model at the end of the process with the best 522 | # mtry value over all the training data. Here's the number of 523 | # tree we're building: 524 | # 525 | # (10 * 3 * 7 * 500) + 500 = 105,500 trees! 526 | # 527 | # On a workstation using 10 cores the following code took 28 minutes 528 | # to execute. 529 | # 530 | 531 | 532 | # Create a cluster to work on 10 logical cores. 533 | # cl <- makeCluster(10, type = "SOCK") 534 | # registerDoSNOW(cl) 535 | 536 | # Time the code execution 537 | # start.time <- Sys.time() 538 | 539 | # We have reduced the dimensionality of our data using SVD. Also, the 540 | # application of SVD allows us to use LSA to simultaneously increase the 541 | # information density of each feature. To prove this out, leverage a 542 | # mighty Random Forest with the default of 500 trees. 
We'll also ask 543 | # caret to try 7 different values of mtry to find the mtry value that 544 | # gives the best result! 545 | # rf.cv.1 <- train(Label ~ ., data = train.svd, method = "rf", 546 | # trControl = cv.cntrl, tuneLength = 7) 547 | 548 | # Processing is done, stop cluster. 549 | # stopCluster(cl) 550 | 551 | # Total time of execution on workstation was 552 | # total.time <- Sys.time() - start.time 553 | # total.time 554 | 555 | 556 | # Load processing results from disk! 557 | load("rf.cv.1.RData") 558 | 559 | # Check out our results. 560 | rf.cv.1 561 | 562 | # Let's drill-down on the results. 563 | confusionMatrix(train.svd$Label, rf.cv.1$finalModel$predicted) 564 | 565 | 566 | 567 | 568 | 569 | # OK, now let's add in the feature we engineered previously for SMS 570 | # text length to see if it improves things. 571 | train.svd$TextLength <- train$TextLength 572 | 573 | 574 | # Create a cluster to work on 10 logical cores. 575 | # cl <- makeCluster(10, type = "SOCK") 576 | # registerDoSNOW(cl) 577 | 578 | # Time the code execution 579 | # start.time <- Sys.time() 580 | 581 | # Re-run the training process with the additional feature. 582 | # rf.cv.2 <- train(Label ~ ., data = train.svd, method = "rf", 583 | # trControl = cv.cntrl, tuneLength = 7, 584 | # importance = TRUE) 585 | 586 | # Processing is done, stop cluster. 587 | # stopCluster(cl) 588 | 589 | # Total time of execution on workstation was 590 | # total.time <- Sys.time() - start.time 591 | # total.time 592 | 593 | # Load results from disk. 594 | load("rf.cv.2.RData") 595 | 596 | # Check the results. 597 | rf.cv.2 598 | 599 | # Drill-down on the results. 600 | confusionMatrix(train.svd$Label, rf.cv.2$finalModel$predicted) 601 | 602 | # How important was the new feature? 603 | library(randomForest) 604 | varImpPlot(rf.cv.1$finalModel) 605 | varImpPlot(rf.cv.2$finalModel) 606 | 607 | 608 | 609 | 610 | # Turns out that our TextLength feature is very predictive and pushed our 611 | # overall accuracy over the training data to 97.1%. We can also use the 612 | # power of cosine similarity to engineer a feature for calculating, on 613 | # average, how alike each SMS text message is to all of the spam messages. 614 | # The hypothesis here is that our use of bigrams, tf-idf, and LSA have 615 | # produced a representation where ham SMS messages should have low cosine 616 | # similarities with spam SMS messages and vice versa. 617 | 618 | # Use the lsa package's cosine function for our calculations. 619 | #install.packages("lsa") 620 | library(lsa) 621 | 622 | train.similarities <- cosine(t(as.matrix(train.svd[, -c(1, ncol(train.svd))]))) 623 | 624 | 625 | # Next up - take each SMS text message and find what the mean cosine 626 | # similarity is for each SMS text mean with each of the spam SMS messages. 627 | # Per our hypothesis, ham SMS text messages should have relatively low 628 | # cosine similarities with spam messages and vice versa! 629 | spam.indexes <- which(train$Label == "spam") 630 | 631 | train.svd$SpamSimilarity <- rep(0.0, nrow(train.svd)) 632 | for(i in 1:nrow(train.svd)) { 633 | train.svd$SpamSimilarity[i] <- mean(train.similarities[i, spam.indexes]) 634 | } 635 | 636 | 637 | # As always, let's visualize our results using the mighty ggplot2 638 | ggplot(train.svd, aes(x = SpamSimilarity, fill = Label)) + 639 | theme_bw() + 640 | geom_histogram(binwidth = 0.05) + 641 | labs(y = "Message Count", 642 | x = "Mean Spam Message Cosine Similarity", 643 | title = "Distribution of Ham vs. 
Spam Using Spam Cosine Similarity") 644 | 645 | 646 | # Per our analysis of mighty random forest results, we are interested in 647 | # in features that can raise model performance with respect to sensitivity. 648 | # Perform another CV process using the new spam cosine similarity feature. 649 | 650 | # Create a cluster to work on 10 logical cores. 651 | # cl <- makeCluster(10, type = "SOCK") 652 | # registerDoSNOW(cl) 653 | 654 | # Time the code execution 655 | # start.time <- Sys.time() 656 | 657 | # Re-run the training process with the additional feature. 658 | # set.seed(932847) 659 | # rf.cv.3 <- train(Label ~ ., data = train.svd, method = "rf", 660 | # trControl = cv.cntrl, tuneLength = 7, 661 | # importance = TRUE) 662 | 663 | # Processing is done, stop cluster. 664 | # stopCluster(cl) 665 | 666 | # Total time of execution on workstation was 667 | # total.time <- Sys.time() - start.time 668 | # total.time 669 | 670 | 671 | # Load results from disk. 672 | load("rf.cv.3.RData") 673 | 674 | # Check the results. 675 | rf.cv.3 676 | 677 | # Drill-down on the results. 678 | confusionMatrix(train.svd$Label, rf.cv.3$finalModel$predicted) 679 | 680 | # How important was this feature? 681 | library(randomForest) 682 | varImpPlot(rf.cv.3$finalModel) 683 | 684 | 685 | 686 | 687 | # We've built what appears to be an effective predictive model. Time to verify 688 | # using the test holdout data we set aside at the beginning of the project. 689 | # First stage of this verification is running the test data through our pre- 690 | # processing pipeline of: 691 | # 1 - Tokenization 692 | # 2 - Lower casing 693 | # 3 - Stopword removal 694 | # 4 - Stemming 695 | # 5 - Adding bigrams 696 | # 6 - Transform to dfm 697 | # 7 - Ensure test dfm has same features as train dfm 698 | 699 | # Tokenization. 700 | test.tokens <- tokens(test$Text, what = "word", 701 | remove_numbers = TRUE, remove_punct = TRUE, 702 | remove_symbols = TRUE, remove_hyphens = TRUE) 703 | 704 | # Lower case the tokens. 705 | test.tokens <- tokens_tolower(test.tokens) 706 | 707 | # Stopword removal. 708 | test.tokens <- tokens_select(test.tokens, stopwords(), 709 | selection = "remove") 710 | 711 | # Stemming. 712 | test.tokens <- tokens_wordstem(test.tokens, language = "english") 713 | 714 | # Add bigrams. 715 | test.tokens <- tokens_ngrams(test.tokens, n = 1:2) 716 | 717 | # Convert n-grams to quanteda document-term frequency matrix. 718 | test.tokens.dfm <- dfm(test.tokens, tolower = FALSE) 719 | 720 | # Explore the train and test quanteda dfm objects. 721 | train.tokens.dfm 722 | test.tokens.dfm 723 | 724 | # Ensure the test dfm has the same n-grams as the training dfm. 725 | # 726 | # NOTE - In production we should expect that new text messages will 727 | # contain n-grams that did not exist in the original training 728 | # data. As such, we need to strip those n-grams out. 729 | # 730 | test.tokens.dfm <- dfm_select(test.tokens.dfm, pattern = train.tokens.dfm, 731 | selection = "keep") 732 | test.tokens.matrix <- as.matrix(test.tokens.dfm) 733 | test.tokens.dfm 734 | 735 | 736 | 737 | 738 | # With the raw test features in place next up is the projecting the term 739 | # counts for the unigrams into the same TF-IDF vector space as our training 740 | # data. The high level process is as follows: 741 | # 1 - Normalize each document (i.e, each row) 742 | # 2 - Perform IDF multiplication using training IDF values 743 | 744 | # Normalize all documents via TF. 
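#
# OPTIONAL - Why a "fix incomplete cases" step is needed below: if a test
#            message loses every token (e.g., it contained only stopwords or
#            terms never seen in training), its count row is all zeros and
#            the TF normalization divides by zero. A minimal illustration
#            with a made-up row:
#
empty.row <- c(free = 0, win = 0, call = 0)
term.frequency(empty.row)   # 0 / 0 produces NaN for every term
# Those NaN entries are exactly what the is.na() replacement below cleans up.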
745 | test.tokens.df <- apply(test.tokens.matrix, 1, term.frequency) 746 | str(test.tokens.df) 747 | 748 | # Lastly, calculate TF-IDF for our training corpus. 749 | test.tokens.tfidf <- apply(test.tokens.df, 2, tf.idf, idf = train.tokens.idf) 750 | dim(test.tokens.tfidf) 751 | View(test.tokens.tfidf[1:25, 1:25]) 752 | 753 | # Transpose the matrix 754 | test.tokens.tfidf <- t(test.tokens.tfidf) 755 | 756 | # Fix incomplete cases 757 | summary(test.tokens.tfidf[1,]) 758 | test.tokens.tfidf[is.na(test.tokens.tfidf)] <- 0.0 759 | summary(test.tokens.tfidf[1,]) 760 | 761 | 762 | 763 | 764 | # With the test data projected into the TF-IDF vector space of the training 765 | # data we can now to the final projection into the training LSA semantic 766 | # space (i.e. the SVD matrix factorization). 767 | test.svd.raw <- t(sigma.inverse * u.transpose %*% t(test.tokens.tfidf)) 768 | 769 | 770 | # Lastly, we can now build the test data frame to feed into our trained 771 | # machine learning model for predictions. First up, add Label and TextLength. 772 | test.svd <- data.frame(Label = test$Label, test.svd.raw, 773 | TextLength = test$TextLength) 774 | 775 | 776 | # Next step, calculate SpamSimilarity for all the test documents. First up, 777 | # create a spam similarity matrix. 778 | test.similarities <- rbind(test.svd.raw, train.irlba$v[spam.indexes,]) 779 | test.similarities <- cosine(t(test.similarities)) 780 | 781 | 782 | # 783 | # NOTE - The following code was updated post-video recoding due to a bug. 784 | # 785 | test.svd$SpamSimilarity <- rep(0.0, nrow(test.svd)) 786 | spam.cols <- (nrow(test.svd) + 1):ncol(test.similarities) 787 | for(i in 1:nrow(test.svd)) { 788 | # The following line has the bug fix. 789 | test.svd$SpamSimilarity[i] <- mean(test.similarities[i, spam.cols]) 790 | } 791 | 792 | 793 | # Some SMS text messages become empty as a result of stopword and special 794 | # character removal. This results in spam similarity measures of 0. Correct. 795 | # This code as added post-video as part of the bug fix. 796 | test.svd$SpamSimilarity[!is.finite(test.svd$SpamSimilarity)] <- 0 797 | 798 | 799 | # Now we can make predictions on the test data set using our trained mighty 800 | # random forest. 801 | preds <- predict(rf.cv.3, test.svd) 802 | 803 | 804 | # Drill-in on results 805 | confusionMatrix(preds, test.svd$Label) 806 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part12.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 12 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=-wCrClheObk 23 | # 24 | 25 | 26 | # Install all required packages. 
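#
# OPTIONAL - A convenience, not part of the original video: install.packages()
#            below re-installs everything on every run. To install only what
#            is missing (including doSNOW and lsa, which are used later in
#            this script), something like the following works:
#
pkgs <- c("ggplot2", "e1071", "caret", "quanteda", "irlba", "randomForest",
          "doSNOW", "lsa")
missing.pkgs <- setdiff(pkgs, rownames(installed.packages()))
if (length(missing.pkgs) > 0) install.packages(missing.pkgs)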
27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | # Clean up the data frame and view our handiwork. 39 | spam.raw <- spam.raw[, 1:2] 40 | names(spam.raw) <- c("Label", "Text") 41 | View(spam.raw) 42 | 43 | 44 | 45 | # Check data to see if there are missing values. 46 | length(which(!complete.cases(spam.raw))) 47 | 48 | 49 | 50 | # Convert our class label into a factor. 51 | spam.raw$Label <- as.factor(spam.raw$Label) 52 | 53 | 54 | 55 | # The first step, as always, is to explore the data. 56 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 57 | prop.table(table(spam.raw$Label)) 58 | 59 | 60 | 61 | # Next up, let's get a feel for the distribution of text lengths of the SMS 62 | # messages by adding a new feature for the length of each message. 63 | spam.raw$TextLength <- nchar(spam.raw$Text) 64 | summary(spam.raw$TextLength) 65 | 66 | 67 | 68 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 69 | library(ggplot2) 70 | 71 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 72 | theme_bw() + 73 | geom_histogram(binwidth = 5) + 74 | labs(y = "Text Count", x = "Length of Text", 75 | title = "Distribution of Text Lengths with Class Labels") 76 | 77 | 78 | 79 | # At a minimum we need to split our data into a training set and a 80 | # test set. In a true project we would want to use a three-way split 81 | # of training, validation, and test. 82 | # 83 | # As we know that our data has non-trivial class imbalance, we'll 84 | # use the mighty caret package to create a randomg train/test split 85 | # that ensures the correct ham/spam class label proportions (i.e., 86 | # we'll use caret for a random stratified split). 87 | library(caret) 88 | help(package = "caret") 89 | 90 | 91 | # Use caret to create a 70%/30% stratified split. Set the random 92 | # seed for reproducibility. 93 | set.seed(32984) 94 | indexes <- createDataPartition(spam.raw$Label, times = 1, 95 | p = 0.7, list = FALSE) 96 | 97 | train <- spam.raw[indexes,] 98 | test <- spam.raw[-indexes,] 99 | 100 | 101 | # Verify proportions. 102 | prop.table(table(train$Label)) 103 | prop.table(table(test$Label)) 104 | 105 | 106 | 107 | # Text analytics requires a lot of data exploration, data pre-processing 108 | # and data wrangling. Let's explore some examples. 109 | 110 | # HTML-escaped ampersand character. 111 | train$Text[21] 112 | 113 | 114 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 115 | # is an actual person, but we will ignore the implications of this for 116 | # this introductory tutorial. 117 | train$Text[38] 118 | 119 | 120 | # A URL. 121 | train$Text[357] 122 | 123 | 124 | 125 | # There are many packages in the R ecosystem for performing text 126 | # analytics. One of the newer packages in quanteda. The quanteda 127 | # package has many useful functions for quickly and easily working 128 | # with text data. 129 | library(quanteda) 130 | help(package = "quanteda") 131 | 132 | 133 | # Tokenize SMS text messages. 134 | train.tokens <- tokens(train$Text, what = "word", 135 | remove_numbers = TRUE, remove_punct = TRUE, 136 | remove_symbols = TRUE, remove_hyphens = TRUE) 137 | 138 | # Take a look at a specific SMS message and see how it transforms. 
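#
# OPTIONAL - A toy sentence (not from the data set) showing what the tokens()
#            options above do before any further processing. The exact output
#            can vary slightly by quanteda version.
#
tokens("Win a $1000 prize!!! Txt WIN to 80082 now",
       what = "word", remove_numbers = TRUE, remove_punct = TRUE,
       remove_symbols = TRUE, remove_hyphens = TRUE)
# Roughly: "Win" "a" "prize" "Txt" "WIN" "to" "now" - the numbers, currency
# symbol, and punctuation are gone; casing is handled next by tokens_tolower().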
139 | train.tokens[[357]] 140 | 141 | 142 | # Lower case the tokens. 143 | train.tokens <- tokens_tolower(train.tokens) 144 | train.tokens[[357]] 145 | 146 | 147 | # Use quanteda's built-in stopword list for English. 148 | # NOTE - You should always inspect stopword lists for applicability to 149 | # your problem/domain. 150 | train.tokens <- tokens_select(train.tokens, stopwords(), 151 | selection = "remove") 152 | train.tokens[[357]] 153 | 154 | 155 | # Perform stemming on the tokens. 156 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 157 | train.tokens[[357]] 158 | 159 | 160 | # Create our first bag-of-words model. 161 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 162 | 163 | 164 | # Transform to a matrix and inspect. 165 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 166 | View(train.tokens.matrix[1:20, 1:100]) 167 | dim(train.tokens.matrix) 168 | 169 | 170 | # Investigate the effects of stemming. 171 | colnames(train.tokens.matrix)[1:50] 172 | 173 | 174 | # Per best practices, we will leverage cross validation (CV) as 175 | # the basis of our modeling process. Using CV we can create 176 | # estimates of how well our model will do in Production on new, 177 | # unseen data. CV is powerful, but the downside is that it 178 | # requires more processing and therefore more time. 179 | # 180 | # If you are not familiar with CV, consult the following 181 | # Wikipedia article: 182 | # 183 | # https://en.wikipedia.org/wiki/Cross-validation_(statistics) 184 | # 185 | 186 | # Setup a the feature data frame with labels. 187 | train.tokens.df <- cbind(Label = train$Label, data.frame(train.tokens.dfm)) 188 | 189 | 190 | # Often, tokenization requires some additional pre-processing 191 | names(train.tokens.df)[c(146, 148, 235, 238)] 192 | 193 | 194 | # Cleanup column names. 195 | names(train.tokens.df) <- make.names(names(train.tokens.df)) 196 | 197 | 198 | # Use caret to create stratified folds for 10-fold cross validation repeated 199 | # 3 times (i.e., create 30 random stratified samples) 200 | set.seed(48743) 201 | cv.folds <- createMultiFolds(train$Label, k = 10, times = 3) 202 | 203 | cv.cntrl <- trainControl(method = "repeatedcv", number = 10, 204 | repeats = 3, index = cv.folds) 205 | 206 | 207 | # Our data frame is non-trivial in size. As such, CV runs will take 208 | # quite a long time to run. To cut down on total execution time, use 209 | # the doSNOW package to allow for multi-core training in parallel. 210 | # 211 | # WARNING - The following code is configured to run on a workstation- 212 | # or server-class machine (i.e., 12 logical cores). Alter 213 | # code to suit your HW environment. 214 | # 215 | #install.packages("doSNOW") 216 | library(doSNOW) 217 | 218 | 219 | # Time the code execution 220 | start.time <- Sys.time() 221 | 222 | 223 | # Create a cluster to work on 10 logical cores. 224 | cl <- makeCluster(10, type = "SOCK") 225 | registerDoSNOW(cl) 226 | 227 | 228 | # As our data is non-trivial in size at this point, use a single decision 229 | # tree alogrithm as our first model. We will graduate to using more 230 | # powerful algorithms later when we perform feature extraction to shrink 231 | # the size of our data. 232 | rpart.cv.1 <- train(Label ~ ., data = train.tokens.df, method = "rpart", 233 | trControl = cv.cntrl, tuneLength = 7) 234 | 235 | 236 | # Processing is done, stop cluster. 237 | stopCluster(cl) 238 | 239 | 240 | # Total time of execution on workstation was approximately 4 minutes. 
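#
# OPTIONAL - The ~4 minute figure above assumes a 10-core cluster. If you are
#            following along on a laptop, a safer way to size the later
#            makeCluster() calls (instead of hard-coding 10) is to ask R how
#            many logical cores exist and leave a couple free for the OS.
#            This is a suggested tweak, not part of the original tutorial.
#
library(parallel)
n.cores <- max(1, detectCores(logical = TRUE) - 2)
# cl <- makeCluster(n.cores, type = "SOCK")
# registerDoSNOW(cl)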
241 | total.time <- Sys.time() - start.time 242 | total.time 243 | 244 | 245 | # Check out our results. 246 | rpart.cv.1 247 | 248 | 249 | 250 | # The use of Term Frequency-Inverse Document Frequency (TF-IDF) is a 251 | # powerful technique for enhancing the information/signal contained 252 | # within our document-frequency matrix. Specifically, the mathematics 253 | # behind TF-IDF accomplish the following goals: 254 | # 1 - The TF calculation accounts for the fact that longer 255 | # documents will have higher individual term counts. Applying 256 | # TF normalizes all documents in the corpus to be length 257 | # independent. 258 | # 2 - The IDF calculation accounts for the frequency of term 259 | # appearance in all documents in the corpus. The intuition 260 | # being that a term that appears in every document has no 261 | # predictive power. 262 | # 3 - The multiplication of TF by IDF for each cell in the matrix 263 | # allows for weighting of #1 and #2 for each cell in the matrix. 264 | 265 | 266 | # Our function for calculating relative term frequency (TF) 267 | term.frequency <- function(row) { 268 | row / sum(row) 269 | } 270 | 271 | # Our function for calculating inverse document frequency (IDF) 272 | inverse.doc.freq <- function(col) { 273 | corpus.size <- length(col) 274 | doc.count <- length(which(col > 0)) 275 | 276 | log10(corpus.size / doc.count) 277 | } 278 | 279 | # Our function for calculating TF-IDF. 280 | tf.idf <- function(x, idf) { 281 | x * idf 282 | } 283 | 284 | 285 | # First step, normalize all documents via TF. 286 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 287 | dim(train.tokens.df) 288 | View(train.tokens.df[1:20, 1:100]) 289 | 290 | 291 | # Second step, calculate the IDF vector that we will use - both 292 | # for training data and for test data! 293 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 294 | str(train.tokens.idf) 295 | 296 | 297 | # Lastly, calculate TF-IDF for our training corpus. 298 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, idf = train.tokens.idf) 299 | dim(train.tokens.tfidf) 300 | View(train.tokens.tfidf[1:25, 1:25]) 301 | 302 | 303 | # Transpose the matrix 304 | train.tokens.tfidf <- t(train.tokens.tfidf) 305 | dim(train.tokens.tfidf) 306 | View(train.tokens.tfidf[1:25, 1:25]) 307 | 308 | 309 | # Check for incopmlete cases. 310 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 311 | train$Text[incomplete.cases] 312 | 313 | 314 | # Fix incomplete cases 315 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 316 | dim(train.tokens.tfidf) 317 | sum(which(!complete.cases(train.tokens.tfidf))) 318 | 319 | 320 | # Make a clean data frame using the same process as before. 321 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 322 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 323 | 324 | 325 | # Time the code execution 326 | start.time <- Sys.time() 327 | 328 | # Create a cluster to work on 10 logical cores. 329 | cl <- makeCluster(10, type = "SOCK") 330 | registerDoSNOW(cl) 331 | 332 | # As our data is non-trivial in size at this point, use a single decision 333 | # tree alogrithm as our first model. We will graduate to using more 334 | # powerful algorithms later when we perform feature extraction to shrink 335 | # the size of our data. 
336 | rpart.cv.2 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 337 | trControl = cv.cntrl, tuneLength = 7) 338 | 339 | # Processing is done, stop cluster. 340 | stopCluster(cl) 341 | 342 | # Total time of execution on workstation was 343 | total.time <- Sys.time() - start.time 344 | total.time 345 | 346 | # Check out our results. 347 | rpart.cv.2 348 | 349 | 350 | 351 | # N-grams allow us to augment our document-term frequency matrices with 352 | # word ordering. This often leads to increased performance (e.g., accuracy) 353 | # for machine learning models trained with more than just unigrams (i.e., 354 | # single terms). Let's add bigrams to our training data and the TF-IDF 355 | # transform the expanded featre matrix to see if accuracy improves. 356 | 357 | # Add bigrams to our feature matrix. 358 | train.tokens <- tokens_ngrams(train.tokens, n = 1:2) 359 | train.tokens[[357]] 360 | 361 | 362 | # Transform to dfm and then a matrix. 363 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 364 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 365 | train.tokens.dfm 366 | 367 | 368 | # Normalize all documents via TF. 369 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 370 | 371 | 372 | # Calculate the IDF vector that we will use for training and test data! 373 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 374 | 375 | 376 | # Calculate TF-IDF for our training corpus 377 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, 378 | idf = train.tokens.idf) 379 | 380 | 381 | # Transpose the matrix 382 | train.tokens.tfidf <- t(train.tokens.tfidf) 383 | 384 | 385 | # Fix incomplete cases 386 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 387 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 388 | 389 | 390 | # Make a clean data frame. 391 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 392 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 393 | 394 | 395 | # Clean up unused objects in memory. 396 | gc() 397 | 398 | 399 | 400 | 401 | # 402 | # NOTE - The following code requires the use of command-line R to execute 403 | # due to the large number of features (i.e., columns) in the matrix. 404 | # Please consult the following link for more details if you wish 405 | # to run the code yourself: 406 | # 407 | # https://stackoverflow.com/questions/28728774/how-to-set-max-ppsize-in-r 408 | # 409 | # Also note that running the following code required approximately 410 | # 38GB of RAM and more than 4.5 hours to execute on a 10-core 411 | # workstation! 412 | # 413 | 414 | 415 | # Time the code execution 416 | # start.time <- Sys.time() 417 | 418 | # Leverage single decision trees to evaluate if adding bigrams improves the 419 | # the effectiveness of the model. 420 | # rpart.cv.3 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 421 | # trControl = cv.cntrl, tuneLength = 7) 422 | 423 | # Total time of execution on workstation was 424 | # total.time <- Sys.time() - start.time 425 | # total.time 426 | 427 | # Check out our results. 428 | # rpart.cv.3 429 | 430 | # 431 | # The results of the above processing show a slight decline in rpart 432 | # effectiveness with a 10-fold CV repeated 3 times accuracy of 0.9457. 433 | # As we will discuss later, while the addition of bigrams appears to 434 | # negatively impact a single decision tree, it helps with the mighty 435 | # random forest! 
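#
# OPTIONAL - A tiny, standalone look (not in the original video) at what the
#            tokens_ngrams() call earlier in this section produces. With
#            n = 1:2 each document keeps its unigrams and gains bigram
#            features concatenated with an underscore.
#
toy.tokens <- tokens("free entry in a weekly competition")
tokens_ngrams(toy.tokens, n = 1:2)
# text1 contains the unigrams ("free", "entry", ...) plus bigrams such as
# "free_entry", "entry_in", "in_a", and so on.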
436 | # 437 | 438 | 439 | 440 | 441 | # We'll leverage the irlba package for our singular value 442 | # decomposition (SVD). The irlba package allows us to specify 443 | # the number of the most important singular vectors we wish to 444 | # calculate and retain for features. 445 | library(irlba) 446 | 447 | 448 | # Time the code execution 449 | start.time <- Sys.time() 450 | 451 | # Perform SVD. Specifically, reduce dimensionality down to 300 columns 452 | # for our latent semantic analysis (LSA). 453 | train.irlba <- irlba(t(train.tokens.tfidf), nv = 300, maxit = 600) 454 | 455 | # Total time of execution on workstation was 456 | total.time <- Sys.time() - start.time 457 | total.time 458 | 459 | 460 | # Take a look at the new feature data up close. 461 | View(train.irlba$v) 462 | 463 | 464 | # As with TF-IDF, we will need to project new data (e.g., the test data) 465 | # into the SVD semantic space. The following code illustrates how to do 466 | # this using a row of the training data that has already been transformed 467 | # by TF-IDF, per the mathematics illustrated in the slides. 468 | # 469 | # 470 | sigma.inverse <- 1 / train.irlba$d 471 | u.transpose <- t(train.irlba$u) 472 | document <- train.tokens.tfidf[1,] 473 | document.hat <- sigma.inverse * u.transpose %*% document 474 | 475 | # Look at the first 10 components of projected document and the corresponding 476 | # row in our document semantic space (i.e., the V matrix) 477 | document.hat[1:10] 478 | train.irlba$v[1, 1:10] 479 | 480 | 481 | 482 | # 483 | # Create new feature data frame using our document semantic space of 300 484 | # features (i.e., the V matrix from our SVD). 485 | # 486 | train.svd <- data.frame(Label = train$Label, train.irlba$v) 487 | 488 | 489 | # Create a cluster to work on 10 logical cores. 490 | cl <- makeCluster(10, type = "SOCK") 491 | registerDoSNOW(cl) 492 | 493 | # Time the code execution 494 | start.time <- Sys.time() 495 | 496 | # This will be the last run using single decision trees. With a much smaller 497 | # feature matrix we can now use more powerful methods like the mighty Random 498 | # Forest from now on! 499 | rpart.cv.4 <- train(Label ~ ., data = train.svd, method = "rpart", 500 | trControl = cv.cntrl, tuneLength = 7) 501 | 502 | # Processing is done, stop cluster. 503 | stopCluster(cl) 504 | 505 | # Total time of execution on workstation was 506 | total.time <- Sys.time() - start.time 507 | total.time 508 | 509 | # Check out our results. 510 | rpart.cv.4 511 | 512 | 513 | 514 | 515 | # 516 | # NOTE - The following code takes a long time to run. Here's the math. 517 | # We are performing 10-fold CV repeated 3 times. That means we 518 | # need to build 30 models. We are also asking caret to try 7 519 | # different values of the mtry parameter. Next up by default 520 | # a mighty random forest leverages 500 trees. Lastly, caret will 521 | # build 1 final model at the end of the process with the best 522 | # mtry value over all the training data. Here's the number of 523 | # tree we're building: 524 | # 525 | # (10 * 3 * 7 * 500) + 500 = 105,500 trees! 526 | # 527 | # On a workstation using 10 cores the following code took 28 minutes 528 | # to execute. 529 | # 530 | 531 | 532 | # Create a cluster to work on 10 logical cores. 533 | # cl <- makeCluster(10, type = "SOCK") 534 | # registerDoSNOW(cl) 535 | 536 | # Time the code execution 537 | # start.time <- Sys.time() 538 | 539 | # We have reduced the dimensionality of our data using SVD. 
Also, the 540 | # application of SVD allows us to use LSA to simultaneously increase the 541 | # information density of each feature. To prove this out, leverage a 542 | # mighty Random Forest with the default of 500 trees. We'll also ask 543 | # caret to try 7 different values of mtry to find the mtry value that 544 | # gives the best result! 545 | # rf.cv.1 <- train(Label ~ ., data = train.svd, method = "rf", 546 | # trControl = cv.cntrl, tuneLength = 7) 547 | 548 | # Processing is done, stop cluster. 549 | # stopCluster(cl) 550 | 551 | # Total time of execution on workstation was 552 | # total.time <- Sys.time() - start.time 553 | # total.time 554 | 555 | 556 | # Load processing results from disk! 557 | load("rf.cv.1.RData") 558 | 559 | # Check out our results. 560 | rf.cv.1 561 | 562 | # Let's drill-down on the results. 563 | confusionMatrix(train.svd$Label, rf.cv.1$finalModel$predicted) 564 | 565 | 566 | 567 | 568 | 569 | # OK, now let's add in the feature we engineered previously for SMS 570 | # text length to see if it improves things. 571 | train.svd$TextLength <- train$TextLength 572 | 573 | 574 | # Create a cluster to work on 10 logical cores. 575 | # cl <- makeCluster(10, type = "SOCK") 576 | # registerDoSNOW(cl) 577 | 578 | # Time the code execution 579 | # start.time <- Sys.time() 580 | 581 | # Re-run the training process with the additional feature. 582 | # rf.cv.2 <- train(Label ~ ., data = train.svd, method = "rf", 583 | # trControl = cv.cntrl, tuneLength = 7, 584 | # importance = TRUE) 585 | 586 | # Processing is done, stop cluster. 587 | # stopCluster(cl) 588 | 589 | # Total time of execution on workstation was 590 | # total.time <- Sys.time() - start.time 591 | # total.time 592 | 593 | # Load results from disk. 594 | load("rf.cv.2.RData") 595 | 596 | # Check the results. 597 | rf.cv.2 598 | 599 | # Drill-down on the results. 600 | confusionMatrix(train.svd$Label, rf.cv.2$finalModel$predicted) 601 | 602 | # How important was the new feature? 603 | library(randomForest) 604 | varImpPlot(rf.cv.1$finalModel) 605 | varImpPlot(rf.cv.2$finalModel) 606 | 607 | 608 | 609 | 610 | # Turns out that our TextLength feature is very predictive and pushed our 611 | # overall accuracy over the training data to 97.1%. We can also use the 612 | # power of cosine similarity to engineer a feature for calculating, on 613 | # average, how alike each SMS text message is to all of the spam messages. 614 | # The hypothesis here is that our use of bigrams, tf-idf, and LSA have 615 | # produced a representation where ham SMS messages should have low cosine 616 | # similarities with spam SMS messages and vice versa. 617 | 618 | # Use the lsa package's cosine function for our calculations. 619 | #install.packages("lsa") 620 | library(lsa) 621 | 622 | train.similarities <- cosine(t(as.matrix(train.svd[, -c(1, ncol(train.svd))]))) 623 | 624 | 625 | # Next up - take each SMS text message and find what the mean cosine 626 | # similarity is for each SMS text mean with each of the spam SMS messages. 627 | # Per our hypothesis, ham SMS text messages should have relatively low 628 | # cosine similarities with spam messages and vice versa! 
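#
# OPTIONAL - A minimal sketch of the mean-cosine-similarity idea using
#            made-up numbers rather than the real SVD features. lsa's
#            cosine() expects variables in columns, which is why the real
#            code above and below transposes with t().
#
toy.docs <- rbind(ham1  = c(0.9, 0.1, 0.0),
                  ham2  = c(0.8, 0.2, 0.1),
                  spam1 = c(0.1, 0.9, 0.8),
                  spam2 = c(0.0, 0.8, 0.9))
toy.sims <- cosine(t(toy.docs))    # 4 x 4 document-by-document similarities
toy.spam.idx <- 3:4                # rows we pretend are spam
rowMeans(toy.sims[, toy.spam.idx]) # "ham" rows score low, "spam" rows high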
629 | spam.indexes <- which(train$Label == "spam") 630 | 631 | train.svd$SpamSimilarity <- rep(0.0, nrow(train.svd)) 632 | for(i in 1:nrow(train.svd)) { 633 | train.svd$SpamSimilarity[i] <- mean(train.similarities[i, spam.indexes]) 634 | } 635 | 636 | 637 | # As always, let's visualize our results using the mighty ggplot2 638 | ggplot(train.svd, aes(x = SpamSimilarity, fill = Label)) + 639 | theme_bw() + 640 | geom_histogram(binwidth = 0.05) + 641 | labs(y = "Message Count", 642 | x = "Mean Spam Message Cosine Similarity", 643 | title = "Distribution of Ham vs. Spam Using Spam Cosine Similarity") 644 | 645 | 646 | # Per our analysis of mighty random forest results, we are interested in 647 | # in features that can raise model performance with respect to sensitivity. 648 | # Perform another CV process using the new spam cosine similarity feature. 649 | 650 | # Create a cluster to work on 10 logical cores. 651 | # cl <- makeCluster(10, type = "SOCK") 652 | # registerDoSNOW(cl) 653 | 654 | # Time the code execution 655 | # start.time <- Sys.time() 656 | 657 | # Re-run the training process with the additional feature. 658 | # set.seed(932847) 659 | # rf.cv.3 <- train(Label ~ ., data = train.svd, method = "rf", 660 | # trControl = cv.cntrl, tuneLength = 7, 661 | # importance = TRUE) 662 | 663 | # Processing is done, stop cluster. 664 | # stopCluster(cl) 665 | 666 | # Total time of execution on workstation was 667 | # total.time <- Sys.time() - start.time 668 | # total.time 669 | 670 | 671 | # Load results from disk. 672 | load("rf.cv.3.RData") 673 | 674 | # Check the results. 675 | rf.cv.3 676 | 677 | # Drill-down on the results. 678 | confusionMatrix(train.svd$Label, rf.cv.3$finalModel$predicted) 679 | 680 | # How important was this feature? 681 | library(randomForest) 682 | varImpPlot(rf.cv.3$finalModel) 683 | 684 | 685 | 686 | 687 | # We've built what appears to be an effective predictive model. Time to verify 688 | # using the test holdout data we set aside at the beginning of the project. 689 | # First stage of this verification is running the test data through our pre- 690 | # processing pipeline of: 691 | # 1 - Tokenization 692 | # 2 - Lower casing 693 | # 3 - Stopword removal 694 | # 4 - Stemming 695 | # 5 - Adding bigrams 696 | # 6 - Transform to dfm 697 | # 7 - Ensure test dfm has same features as train dfm 698 | 699 | # Tokenization. 700 | test.tokens <- tokens(test$Text, what = "word", 701 | remove_numbers = TRUE, remove_punct = TRUE, 702 | remove_symbols = TRUE, remove_hyphens = TRUE) 703 | 704 | # Lower case the tokens. 705 | test.tokens <- tokens_tolower(test.tokens) 706 | 707 | # Stopword removal. 708 | test.tokens <- tokens_select(test.tokens, stopwords(), 709 | selection = "remove") 710 | 711 | # Stemming. 712 | test.tokens <- tokens_wordstem(test.tokens, language = "english") 713 | 714 | # Add bigrams. 715 | test.tokens <- tokens_ngrams(test.tokens, n = 1:2) 716 | 717 | # Convert n-grams to quanteda document-term frequency matrix. 718 | test.tokens.dfm <- dfm(test.tokens, tolower = FALSE) 719 | 720 | # Explore the train and test quanteda dfm objects. 721 | train.tokens.dfm 722 | test.tokens.dfm 723 | 724 | # Ensure the test dfm has the same n-grams as the training dfm. 725 | # 726 | # NOTE - In production we should expect that new text messages will 727 | # contain n-grams that did not exist in the original training 728 | # data. As such, we need to strip those n-grams out. 
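#
# OPTIONAL - A toy illustration (not in the original video) of the feature
#            alignment performed below: when the pattern argument of
#            dfm_select() is itself a dfm, only features present in that
#            reference dfm are kept. The example sentences are made up.
#
toy.train.dfm <- dfm(tokens(c("win a free prize", "see you at lunch")))
toy.test.dfm <- dfm(tokens("free prize draw tomorrow"))
dfm_select(toy.test.dfm, pattern = toy.train.dfm, selection = "keep")
# Only "free" and "prize" survive; the unseen terms "draw" and "tomorrow"
# are dropped, which is exactly what we want for scoring new messages.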
729 | # 730 | test.tokens.dfm <- dfm_select(test.tokens.dfm, pattern = train.tokens.dfm, 731 | selection = "keep") 732 | test.tokens.matrix <- as.matrix(test.tokens.dfm) 733 | test.tokens.dfm 734 | 735 | 736 | 737 | 738 | # With the raw test features in place, next up is projecting the term 739 | # counts for the unigrams into the same TF-IDF vector space as our training 740 | # data. The high-level process is as follows: 741 | # 1 - Normalize each document (i.e., each row) 742 | # 2 - Perform IDF multiplication using training IDF values 743 | 744 | # Normalize all documents via TF. 745 | test.tokens.df <- apply(test.tokens.matrix, 1, term.frequency) 746 | str(test.tokens.df) 747 | 748 | # Next, calculate TF-IDF for our test corpus using the training IDF values. 749 | test.tokens.tfidf <- apply(test.tokens.df, 2, tf.idf, idf = train.tokens.idf) 750 | dim(test.tokens.tfidf) 751 | View(test.tokens.tfidf[1:25, 1:25]) 752 | 753 | # Transpose the matrix 754 | test.tokens.tfidf <- t(test.tokens.tfidf) 755 | 756 | # Fix incomplete cases 757 | summary(test.tokens.tfidf[1,]) 758 | test.tokens.tfidf[is.na(test.tokens.tfidf)] <- 0.0 759 | summary(test.tokens.tfidf[1,]) 760 | 761 | 762 | 763 | 764 | # With the test data projected into the TF-IDF vector space of the training 765 | # data, we can now do the final projection into the training LSA semantic 766 | # space (i.e., the SVD matrix factorization). 767 | test.svd.raw <- t(sigma.inverse * u.transpose %*% t(test.tokens.tfidf)) 768 | 769 | 770 | # Lastly, we can now build the test data frame to feed into our trained 771 | # machine learning model for predictions. First up, add Label and TextLength. 772 | test.svd <- data.frame(Label = test$Label, test.svd.raw, 773 | TextLength = test$TextLength) 774 | 775 | 776 | # Next step, calculate SpamSimilarity for all the test documents. First up, 777 | # create a spam similarity matrix. 778 | test.similarities <- rbind(test.svd.raw, train.irlba$v[spam.indexes,]) 779 | test.similarities <- cosine(t(test.similarities)) 780 | 781 | 782 | # 783 | # NOTE - The following code was updated post-video recording due to a bug. 784 | # 785 | test.svd$SpamSimilarity <- rep(0.0, nrow(test.svd)) 786 | spam.cols <- (nrow(test.svd) + 1):ncol(test.similarities) 787 | for(i in 1:nrow(test.svd)) { 788 | # The following line has the bug fix. 789 | test.svd$SpamSimilarity[i] <- mean(test.similarities[i, spam.cols]) 790 | } 791 | 792 | 793 | # Some SMS text messages become empty as a result of stopword and special 794 | # character removal. This results in spam similarity measures that are not 795 | # finite, so correct them here. This code was added post-video as part of the bug fix. 796 | test.svd$SpamSimilarity[!is.finite(test.svd$SpamSimilarity)] <- 0 797 | 798 | 799 | # Now we can make predictions on the test data set using our trained mighty 800 | # random forest. 801 | preds <- predict(rf.cv.3, test.svd) 802 | 803 | 804 | # Drill-in on results 805 | confusionMatrix(preds, test.svd$Label) 806 | 807 | 808 | 809 | 810 | # Overfitting means doing far better on the training data (as evidenced 811 | # by CV) than on a hold-out dataset (i.e., our test dataset). 812 | # One potential explanation of this overfitting is the use of the spam similarity 813 | # feature. The hypothesis here is that spam features (i.e., text content) vary 814 | # highly, especially over time. As such, our average spam cosine similarity 815 | # is likely to overfit to the training data. To combat this, let's rebuild a 816 | # mighty random forest without the spam similarity feature.
817 | train.svd$SpamSimilarity <- NULL 818 | test.svd$SpamSimilarity <- NULL 819 | 820 | 821 | # Create a cluster to work on 10 logical cores. 822 | # cl <- makeCluster(10, type = "SOCK") 823 | # registerDoSNOW(cl) 824 | 825 | # Time the code execution 826 | # start.time <- Sys.time() 827 | 828 | # Re-run the training process with the additional feature. 829 | # set.seed(254812) 830 | # rf.cv.4 <- train(Label ~ ., data = train.svd, method = "rf", 831 | # trControl = cv.cntrl, tuneLength = 7, 832 | # importance = TRUE) 833 | 834 | # Processing is done, stop cluster. 835 | # stopCluster(cl) 836 | 837 | # Total time of execution on workstation was 838 | # total.time <- Sys.time() - start.time 839 | # total.time 840 | 841 | 842 | # Load results from disk. 843 | load("rf.cv.4.RData") 844 | 845 | 846 | # Make predictions and drill-in on the results 847 | preds <- predict(rf.cv.4, test.svd) 848 | confusionMatrix(preds, test.svd$Label) 849 | 850 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part2.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 2 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=Y7385dGRNLM 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | 39 | # Clean up the data frame and view our handiwork. 40 | spam.raw <- spam.raw[, 1:2] 41 | names(spam.raw) <- c("Label", "Text") 42 | View(spam.raw) 43 | 44 | 45 | 46 | # Check data to see if there are missing values. 47 | length(which(!complete.cases(spam.raw))) 48 | 49 | 50 | 51 | # Convert our class label into a factor. 52 | spam.raw$Label <- as.factor(spam.raw$Label) 53 | 54 | 55 | 56 | # The first step, as always, is to explore the data. 57 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 58 | prop.table(table(spam.raw$Label)) 59 | 60 | 61 | 62 | # Next up, let's get a feel for the distribution of text lengths of the SMS 63 | # messages by adding a new feature for the length of each message. 64 | spam.raw$TextLength <- nchar(spam.raw$Text) 65 | summary(spam.raw$TextLength) 66 | 67 | 68 | 69 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 
70 | library(ggplot2) 71 | 72 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 73 | theme_bw() + 74 | geom_histogram(binwidth = 5) + 75 | labs(y = "Text Count", x = "Length of Text", 76 | title = "Distribution of Text Lengths with Class Labels") 77 | 78 | 79 | 80 | # At a minimum we need to split our data into a training set and a 81 | # test set. In a true project we would want to use a three-way split 82 | # of training, validation, and test. 83 | # 84 | # As we know that our data has non-trivial class imbalance, we'll 85 | # use the mighty caret package to create a randomg train/test split 86 | # that ensures the correct ham/spam class label proportions (i.e., 87 | # we'll use caret for a random stratified split). 88 | library(caret) 89 | help(package = "caret") 90 | 91 | 92 | # Use caret to create a 70%/30% stratified split. Set the random 93 | # seed for reproducibility. 94 | set.seed(32984) 95 | indexes <- createDataPartition(spam.raw$Label, times = 1, 96 | p = 0.7, list = FALSE) 97 | 98 | train <- spam.raw[indexes,] 99 | test <- spam.raw[-indexes,] 100 | 101 | 102 | # Verify proportions. 103 | prop.table(table(train$Label)) 104 | prop.table(table(test$Label)) 105 | 106 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part3.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 3 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=CQsyVDxK7_g 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | 39 | # Clean up the data frame and view our handiwork. 40 | spam.raw <- spam.raw[, 1:2] 41 | names(spam.raw) <- c("Label", "Text") 42 | View(spam.raw) 43 | 44 | 45 | 46 | # Check data to see if there are missing values. 47 | length(which(!complete.cases(spam.raw))) 48 | 49 | 50 | 51 | # Convert our class label into a factor. 52 | spam.raw$Label <- as.factor(spam.raw$Label) 53 | 54 | 55 | 56 | # The first step, as always, is to explore the data. 57 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 58 | prop.table(table(spam.raw$Label)) 59 | 60 | 61 | 62 | # Next up, let's get a feel for the distribution of text lengths of the SMS 63 | # messages by adding a new feature for the length of each message. 
64 | spam.raw$TextLength <- nchar(spam.raw$Text) 65 | summary(spam.raw$TextLength) 66 | 67 | 68 | 69 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 70 | library(ggplot2) 71 | 72 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 73 | theme_bw() + 74 | geom_histogram(binwidth = 5) + 75 | labs(y = "Text Count", x = "Length of Text", 76 | title = "Distribution of Text Lengths with Class Labels") 77 | 78 | 79 | 80 | # At a minimum we need to split our data into a training set and a 81 | # test set. In a true project we would want to use a three-way split 82 | # of training, validation, and test. 83 | # 84 | # As we know that our data has non-trivial class imbalance, we'll 85 | # use the mighty caret package to create a randomg train/test split 86 | # that ensures the correct ham/spam class label proportions (i.e., 87 | # we'll use caret for a random stratified split). 88 | library(caret) 89 | help(package = "caret") 90 | 91 | 92 | # Use caret to create a 70%/30% stratified split. Set the random 93 | # seed for reproducibility. 94 | set.seed(32984) 95 | indexes <- createDataPartition(spam.raw$Label, times = 1, 96 | p = 0.7, list = FALSE) 97 | 98 | train <- spam.raw[indexes,] 99 | test <- spam.raw[-indexes,] 100 | 101 | 102 | # Verify proportions. 103 | prop.table(table(train$Label)) 104 | prop.table(table(test$Label)) 105 | 106 | 107 | 108 | # Text analytics requires a lot of data exploration, data pre-processing 109 | # and data wrangling. Let's explore some examples. 110 | 111 | # HTML-escaped ampersand character. 112 | train$Text[21] 113 | 114 | 115 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 116 | # is an actual person, but we will ignore the implications of this for 117 | # this introductory tutorial. 118 | train$Text[38] 119 | 120 | 121 | # A URL. 122 | train$Text[357] 123 | 124 | 125 | 126 | # There are many packages in the R ecosystem for performing text 127 | # analytics. One of the newer packages in quanteda. The quanteda 128 | # package has many useful functions for quickly and easily working 129 | # with text data. 130 | library(quanteda) 131 | help(package = "quanteda") 132 | 133 | 134 | # Tokenize SMS text messages. 135 | train.tokens <- tokens(train$Text, what = "word", 136 | remove_numbers = TRUE, remove_punct = TRUE, 137 | remove_symbols = TRUE, remove_hyphens = TRUE) 138 | 139 | # Take a look at a specific SMS message and see how it transforms. 140 | train.tokens[[357]] 141 | 142 | 143 | # Lower case the tokens. 144 | train.tokens <- tokens_tolower(train.tokens) 145 | train.tokens[[357]] 146 | 147 | 148 | # Use quanteda's built-in stopword list for English. 149 | # NOTE - You should always inspect stopword lists for applicability to 150 | # your problem/domain. 151 | train.tokens <- tokens_select(train.tokens, stopwords(), 152 | selection = "remove") 153 | train.tokens[[357]] 154 | 155 | 156 | # Perform stemming on the tokens. 157 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 158 | train.tokens[[357]] 159 | 160 | 161 | # Create our first bag-of-words model. 162 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 163 | 164 | 165 | # Transform to a matrix and inspect. 166 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 167 | View(train.tokens.matrix[1:20, 1:100]) 168 | dim(train.tokens.matrix) 169 | 170 | 171 | # Investigate the effects of stemming. 
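#
# OPTIONAL - A small illustration (not in the original video) of what the
#            stemmer does: related word forms collapse onto a single column
#            of the document-frequency matrix.
#
tokens_wordstem(tokens("connect connected connecting connection"),
                language = "english")
# All four forms reduce to the single stem "connect".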
172 | colnames(train.tokens.matrix)[1:50] 173 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part4.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 4 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=IFhDlHKRHno 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | 39 | # Clean up the data frame and view our handiwork. 40 | spam.raw <- spam.raw[, 1:2] 41 | names(spam.raw) <- c("Label", "Text") 42 | View(spam.raw) 43 | 44 | 45 | 46 | # Check data to see if there are missing values. 47 | length(which(!complete.cases(spam.raw))) 48 | 49 | 50 | 51 | # Convert our class label into a factor. 52 | spam.raw$Label <- as.factor(spam.raw$Label) 53 | 54 | 55 | 56 | # The first step, as always, is to explore the data. 57 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 58 | prop.table(table(spam.raw$Label)) 59 | 60 | 61 | 62 | # Next up, let's get a feel for the distribution of text lengths of the SMS 63 | # messages by adding a new feature for the length of each message. 64 | spam.raw$TextLength <- nchar(spam.raw$Text) 65 | summary(spam.raw$TextLength) 66 | 67 | 68 | 69 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 70 | library(ggplot2) 71 | 72 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 73 | theme_bw() + 74 | geom_histogram(binwidth = 5) + 75 | labs(y = "Text Count", x = "Length of Text", 76 | title = "Distribution of Text Lengths with Class Labels") 77 | 78 | 79 | 80 | # At a minimum we need to split our data into a training set and a 81 | # test set. In a true project we would want to use a three-way split 82 | # of training, validation, and test. 83 | # 84 | # As we know that our data has non-trivial class imbalance, we'll 85 | # use the mighty caret package to create a randomg train/test split 86 | # that ensures the correct ham/spam class label proportions (i.e., 87 | # we'll use caret for a random stratified split). 88 | library(caret) 89 | help(package = "caret") 90 | 91 | 92 | # Use caret to create a 70%/30% stratified split. Set the random 93 | # seed for reproducibility. 94 | set.seed(32984) 95 | indexes <- createDataPartition(spam.raw$Label, times = 1, 96 | p = 0.7, list = FALSE) 97 | 98 | train <- spam.raw[indexes,] 99 | test <- spam.raw[-indexes,] 100 | 101 | 102 | # Verify proportions. 
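#
# OPTIONAL - Why stratify? A quick toy demonstration (not in the original
#            video): with a 90/10 imbalanced label, createDataPartition()
#            keeps roughly the same class mix in the 70% sample.
#
toy.labels <- factor(rep(c("ham", "spam"), times = c(90, 10)))
toy.idx <- createDataPartition(toy.labels, times = 1, p = 0.7, list = FALSE)
prop.table(table(toy.labels[toy.idx]))   # roughly 0.9 ham / 0.1 spam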
103 | prop.table(table(train$Label)) 104 | prop.table(table(test$Label)) 105 | 106 | 107 | 108 | # Text analytics requires a lot of data exploration, data pre-processing 109 | # and data wrangling. Let's explore some examples. 110 | 111 | # HTML-escaped ampersand character. 112 | train$Text[21] 113 | 114 | 115 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 116 | # is an actual person, but we will ignore the implications of this for 117 | # this introductory tutorial. 118 | train$Text[38] 119 | 120 | 121 | # A URL. 122 | train$Text[357] 123 | 124 | 125 | 126 | # There are many packages in the R ecosystem for performing text 127 | # analytics. One of the newer packages in quanteda. The quanteda 128 | # package has many useful functions for quickly and easily working 129 | # with text data. 130 | library(quanteda) 131 | help(package = "quanteda") 132 | 133 | 134 | # Tokenize SMS text messages. 135 | train.tokens <- tokens(train$Text, what = "word", 136 | remove_numbers = TRUE, remove_punct = TRUE, 137 | remove_symbols = TRUE, remove_hyphens = TRUE) 138 | 139 | # Take a look at a specific SMS message and see how it transforms. 140 | train.tokens[[357]] 141 | 142 | 143 | # Lower case the tokens. 144 | train.tokens <- tokens_tolower(train.tokens) 145 | train.tokens[[357]] 146 | 147 | 148 | # Use quanteda's built-in stopword list for English. 149 | # NOTE - You should always inspect stopword lists for applicability to 150 | # your problem/domain. 151 | train.tokens <- tokens_select(train.tokens, stopwords(), 152 | selection = "remove") 153 | train.tokens[[357]] 154 | 155 | 156 | # Perform stemming on the tokens. 157 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 158 | train.tokens[[357]] 159 | 160 | 161 | # Create our first bag-of-words model. 162 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 163 | 164 | 165 | # Transform to a matrix and inspect. 166 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 167 | View(train.tokens.matrix[1:20, 1:100]) 168 | dim(train.tokens.matrix) 169 | 170 | 171 | # Investigate the effects of stemming. 172 | colnames(train.tokens.matrix)[1:50] 173 | 174 | 175 | # Per best practices, we will leverage cross validation (CV) as 176 | # the basis of our modeling process. Using CV we can create 177 | # estimates of how well our model will do in Production on new, 178 | # unseen data. CV is powerful, but the downside is that it 179 | # requires more processing and therefore more time. 180 | # 181 | # If you are not familiar with CV, consult the following 182 | # Wikipedia article: 183 | # 184 | # https://en.wikipedia.org/wiki/Cross-validation_(statistics) 185 | # 186 | 187 | # Setup a the feature data frame with labels. 188 | train.tokens.df <- cbind(Label = train$Label, data.frame(train.tokens.dfm)) 189 | 190 | 191 | # Often, tokenization requires some additional pre-processing 192 | names(train.tokens.df)[c(146, 148, 235, 238)] 193 | 194 | 195 | # Cleanup column names. 196 | names(train.tokens.df) <- make.names(names(train.tokens.df)) 197 | 198 | 199 | # Use caret to create stratified folds for 10-fold cross validation repeated 200 | # 3 times (i.e., create 30 random stratified samples) 201 | set.seed(48743) 202 | cv.folds <- createMultiFolds(train$Label, k = 10, times = 3) 203 | 204 | cv.cntrl <- trainControl(method = "repeatedcv", number = 10, 205 | repeats = 3, index = cv.folds) 206 | 207 | 208 | # Our data frame is non-trivial in size. As such, CV runs will take 209 | # quite a long time to run. 
To cut down on total execution time, use 210 | # the doSNOW package to allow for multi-core training in parallel. 211 | # 212 | # WARNING - The following code is configured to run on a workstation- 213 | # or server-class machine (i.e., 12 logical cores). Alter 214 | # code to suit your HW environment. 215 | # 216 | #install.packages("doSNOW") 217 | library(doSNOW) 218 | 219 | 220 | # Time the code execution 221 | start.time <- Sys.time() 222 | 223 | 224 | # Create a cluster to work on 10 logical cores. 225 | cl <- makeCluster(10, type = "SOCK") 226 | registerDoSNOW(cl) 227 | 228 | 229 | # As our data is non-trivial in size at this point, use a single decision 230 | # tree alogrithm as our first model. We will graduate to using more 231 | # powerful algorithms later when we perform feature extraction to shrink 232 | # the size of our data. 233 | rpart.cv.1 <- train(Label ~ ., data = train.tokens.df, method = "rpart", 234 | trControl = cv.cntrl, tuneLength = 7) 235 | 236 | 237 | # Processing is done, stop cluster. 238 | stopCluster(cl) 239 | 240 | 241 | # Total time of execution on workstation was approximately 4 minutes. 242 | total.time <- Sys.time() - start.time 243 | total.time 244 | 245 | 246 | # Check out our results. 247 | rpart.cv.1 248 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part5.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 5 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=az7yf0IfWPM 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | 39 | # Clean up the data frame and view our handiwork. 40 | spam.raw <- spam.raw[, 1:2] 41 | names(spam.raw) <- c("Label", "Text") 42 | View(spam.raw) 43 | 44 | 45 | 46 | # Check data to see if there are missing values. 47 | length(which(!complete.cases(spam.raw))) 48 | 49 | 50 | 51 | # Convert our class label into a factor. 52 | spam.raw$Label <- as.factor(spam.raw$Label) 53 | 54 | 55 | 56 | # The first step, as always, is to explore the data. 57 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 58 | prop.table(table(spam.raw$Label)) 59 | 60 | 61 | 62 | # Next up, let's get a feel for the distribution of text lengths of the SMS 63 | # messages by adding a new feature for the length of each message. 
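# Note that nchar() counts characters (including spaces and punctuation),
# not words. Toy example with made-up strings:
nchar(c("Ok lar...", "WINNER!! Claim your prize now"))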
64 | spam.raw$TextLength <- nchar(spam.raw$Text) 65 | summary(spam.raw$TextLength) 66 | 67 | 68 | 69 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 70 | library(ggplot2) 71 | 72 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 73 | theme_bw() + 74 | geom_histogram(binwidth = 5) + 75 | labs(y = "Text Count", x = "Length of Text", 76 | title = "Distribution of Text Lengths with Class Labels") 77 | 78 | 79 | 80 | # At a minimum we need to split our data into a training set and a 81 | # test set. In a true project we would want to use a three-way split 82 | # of training, validation, and test. 83 | # 84 | # As we know that our data has non-trivial class imbalance, we'll 85 | # use the mighty caret package to create a randomg train/test split 86 | # that ensures the correct ham/spam class label proportions (i.e., 87 | # we'll use caret for a random stratified split). 88 | library(caret) 89 | help(package = "caret") 90 | 91 | 92 | # Use caret to create a 70%/30% stratified split. Set the random 93 | # seed for reproducibility. 94 | set.seed(32984) 95 | indexes <- createDataPartition(spam.raw$Label, times = 1, 96 | p = 0.7, list = FALSE) 97 | 98 | train <- spam.raw[indexes,] 99 | test <- spam.raw[-indexes,] 100 | 101 | 102 | # Verify proportions. 103 | prop.table(table(train$Label)) 104 | prop.table(table(test$Label)) 105 | 106 | 107 | 108 | # Text analytics requires a lot of data exploration, data pre-processing 109 | # and data wrangling. Let's explore some examples. 110 | 111 | # HTML-escaped ampersand character. 112 | train$Text[21] 113 | 114 | 115 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 116 | # is an actual person, but we will ignore the implications of this for 117 | # this introductory tutorial. 118 | train$Text[38] 119 | 120 | 121 | # A URL. 122 | train$Text[357] 123 | 124 | 125 | 126 | # There are many packages in the R ecosystem for performing text 127 | # analytics. One of the newer packages in quanteda. The quanteda 128 | # package has many useful functions for quickly and easily working 129 | # with text data. 130 | library(quanteda) 131 | help(package = "quanteda") 132 | 133 | 134 | # Tokenize SMS text messages. 135 | train.tokens <- tokens(train$Text, what = "word", 136 | remove_numbers = TRUE, remove_punct = TRUE, 137 | remove_symbols = TRUE, remove_hyphens = TRUE) 138 | 139 | # Take a look at a specific SMS message and see how it transforms. 140 | train.tokens[[357]] 141 | 142 | 143 | # Lower case the tokens. 144 | train.tokens <- tokens_tolower(train.tokens) 145 | train.tokens[[357]] 146 | 147 | 148 | # Use quanteda's built-in stopword list for English. 149 | # NOTE - You should always inspect stopword lists for applicability to 150 | # your problem/domain. 151 | train.tokens <- tokens_select(train.tokens, stopwords(), 152 | selection = "remove") 153 | train.tokens[[357]] 154 | 155 | 156 | # Perform stemming on the tokens. 157 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 158 | train.tokens[[357]] 159 | 160 | 161 | # Create our first bag-of-words model. 162 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 163 | 164 | 165 | # Transform to a matrix and inspect. 166 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 167 | View(train.tokens.matrix[1:20, 1:100]) 168 | dim(train.tokens.matrix) 169 | 170 | 171 | # Investigate the effects of stemming. 
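# Another way to inspect the vocabulary (and the effect of stemming) without
# materializing the dense matrix is to ask the sparse dfm directly for its
# most frequent stems via quanteda's topfeatures():
topfeatures(train.tokens.dfm, 20)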
172 | colnames(train.tokens.matrix)[1:50] 173 | 174 | 175 | # Per best practices, we will leverage cross validation (CV) as 176 | # the basis of our modeling process. Using CV we can create 177 | # estimates of how well our model will do in Production on new, 178 | # unseen data. CV is powerful, but the downside is that it 179 | # requires more processing and therefore more time. 180 | # 181 | # If you are not familiar with CV, consult the following 182 | # Wikipedia article: 183 | # 184 | # https://en.wikipedia.org/wiki/Cross-validation_(statistics) 185 | # 186 | 187 | # Setup a the feature data frame with labels. 188 | train.tokens.df <- cbind(Label = train$Label, data.frame(train.tokens.dfm)) 189 | 190 | 191 | # Often, tokenization requires some additional pre-processing 192 | names(train.tokens.df)[c(146, 148, 235, 238)] 193 | 194 | 195 | # Cleanup column names. 196 | names(train.tokens.df) <- make.names(names(train.tokens.df)) 197 | 198 | 199 | # Use caret to create stratified folds for 10-fold cross validation repeated 200 | # 3 times (i.e., create 30 random stratified samples) 201 | set.seed(48743) 202 | cv.folds <- createMultiFolds(train$Label, k = 10, times = 3) 203 | 204 | cv.cntrl <- trainControl(method = "repeatedcv", number = 10, 205 | repeats = 3, index = cv.folds) 206 | 207 | 208 | # Our data frame is non-trivial in size. As such, CV runs will take 209 | # quite a long time to run. To cut down on total execution time, use 210 | # the doSNOW package to allow for multi-core training in parallel. 211 | # 212 | # WARNING - The following code is configured to run on a workstation- 213 | # or server-class machine (i.e., 12 logical cores). Alter 214 | # code to suit your HW environment. 215 | # 216 | #install.packages("doSNOW") 217 | library(doSNOW) 218 | 219 | 220 | # Time the code execution 221 | start.time <- Sys.time() 222 | 223 | 224 | # Create a cluster to work on 10 logical cores. 225 | cl <- makeCluster(10, type = "SOCK") 226 | registerDoSNOW(cl) 227 | 228 | 229 | # As our data is non-trivial in size at this point, use a single decision 230 | # tree alogrithm as our first model. We will graduate to using more 231 | # powerful algorithms later when we perform feature extraction to shrink 232 | # the size of our data. 233 | rpart.cv.1 <- train(Label ~ ., data = train.tokens.df, method = "rpart", 234 | trControl = cv.cntrl, tuneLength = 7) 235 | 236 | 237 | # Processing is done, stop cluster. 238 | stopCluster(cl) 239 | 240 | 241 | # Total time of execution on workstation was approximately 4 minutes. 242 | total.time <- Sys.time() - start.time 243 | total.time 244 | 245 | 246 | # Check out our results. 247 | rpart.cv.1 248 | 249 | 250 | 251 | # The use of Term Frequency-Inverse Document Frequency (TF-IDF) is a 252 | # powerful technique for enhancing the information/signal contained 253 | # within our document-frequency matrix. Specifically, the mathematics 254 | # behind TF-IDF accomplish the following goals: 255 | # 1 - The TF calculation accounts for the fact that longer 256 | # documents will have higher individual term counts. Applying 257 | # TF normalizes all documents in the corpus to be length 258 | # independent. 259 | # 2 - The IDF calculation accounts for the frequency of term 260 | # appearance in all documents in the corpus. The intuition 261 | # being that a term that appears in every document has no 262 | # predictive power. 
263 | # 3 - The multiplication of TF by IDF for each cell in the matrix 264 | # allows for weighting of #1 and #2 for each cell in the matrix. 265 | 266 | 267 | # Our function for calculating relative term frequency (TF) 268 | term.frequency <- function(row) { 269 | row / sum(row) 270 | } 271 | 272 | # Our function for calculating inverse document frequency (IDF) 273 | inverse.doc.freq <- function(col) { 274 | corpus.size <- length(col) 275 | doc.count <- length(which(col > 0)) 276 | 277 | log10(corpus.size / doc.count) 278 | } 279 | 280 | # Our function for calculating TF-IDF. 281 | tf.idf <- function(x, idf) { 282 | x * idf 283 | } 284 | 285 | 286 | # First step, normalize all documents via TF. 287 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 288 | dim(train.tokens.df) 289 | View(train.tokens.df[1:20, 1:100]) 290 | 291 | 292 | # Second step, calculate the IDF vector that we will use - both 293 | # for training data and for test data! 294 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 295 | str(train.tokens.idf) 296 | 297 | 298 | # Lastly, calculate TF-IDF for our training corpus. 299 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, idf = train.tokens.idf) 300 | dim(train.tokens.tfidf) 301 | View(train.tokens.tfidf[1:25, 1:25]) 302 | 303 | 304 | # Transpose the matrix 305 | train.tokens.tfidf <- t(train.tokens.tfidf) 306 | dim(train.tokens.tfidf) 307 | View(train.tokens.tfidf[1:25, 1:25]) 308 | 309 | 310 | # Check for incopmlete cases. 311 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 312 | train$Text[incomplete.cases] 313 | 314 | 315 | # Fix incomplete cases 316 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 317 | dim(train.tokens.tfidf) 318 | sum(which(!complete.cases(train.tokens.tfidf))) 319 | 320 | 321 | # Make a clean data frame using the same process as before. 322 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 323 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 324 | 325 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part6.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 6 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=neiW5Ugsob8 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 
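# The read.csv() call below assumes spam.csv is UTF-16 encoded. If you hit
# encoding errors with your own copy of the file, one option is to let the
# readr package estimate the encoding (readr is not otherwise used in this
# tutorial, so this check is left commented out):
# install.packages("readr")
# readr::guess_encoding("spam.csv")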
34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | 39 | # Clean up the data frame and view our handiwork. 40 | spam.raw <- spam.raw[, 1:2] 41 | names(spam.raw) <- c("Label", "Text") 42 | View(spam.raw) 43 | 44 | 45 | 46 | # Check data to see if there are missing values. 47 | length(which(!complete.cases(spam.raw))) 48 | 49 | 50 | 51 | # Convert our class label into a factor. 52 | spam.raw$Label <- as.factor(spam.raw$Label) 53 | 54 | 55 | 56 | # The first step, as always, is to explore the data. 57 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 58 | prop.table(table(spam.raw$Label)) 59 | 60 | 61 | 62 | # Next up, let's get a feel for the distribution of text lengths of the SMS 63 | # messages by adding a new feature for the length of each message. 64 | spam.raw$TextLength <- nchar(spam.raw$Text) 65 | summary(spam.raw$TextLength) 66 | 67 | 68 | 69 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 70 | library(ggplot2) 71 | 72 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 73 | theme_bw() + 74 | geom_histogram(binwidth = 5) + 75 | labs(y = "Text Count", x = "Length of Text", 76 | title = "Distribution of Text Lengths with Class Labels") 77 | 78 | 79 | 80 | # At a minimum we need to split our data into a training set and a 81 | # test set. In a true project we would want to use a three-way split 82 | # of training, validation, and test. 83 | # 84 | # As we know that our data has non-trivial class imbalance, we'll 85 | # use the mighty caret package to create a randomg train/test split 86 | # that ensures the correct ham/spam class label proportions (i.e., 87 | # we'll use caret for a random stratified split). 88 | library(caret) 89 | help(package = "caret") 90 | 91 | 92 | # Use caret to create a 70%/30% stratified split. Set the random 93 | # seed for reproducibility. 94 | set.seed(32984) 95 | indexes <- createDataPartition(spam.raw$Label, times = 1, 96 | p = 0.7, list = FALSE) 97 | 98 | train <- spam.raw[indexes,] 99 | test <- spam.raw[-indexes,] 100 | 101 | 102 | # Verify proportions. 103 | prop.table(table(train$Label)) 104 | prop.table(table(test$Label)) 105 | 106 | 107 | 108 | # Text analytics requires a lot of data exploration, data pre-processing 109 | # and data wrangling. Let's explore some examples. 110 | 111 | # HTML-escaped ampersand character. 112 | train$Text[21] 113 | 114 | 115 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 116 | # is an actual person, but we will ignore the implications of this for 117 | # this introductory tutorial. 118 | train$Text[38] 119 | 120 | 121 | # A URL. 122 | train$Text[357] 123 | 124 | 125 | 126 | # There are many packages in the R ecosystem for performing text 127 | # analytics. One of the newer packages in quanteda. The quanteda 128 | # package has many useful functions for quickly and easily working 129 | # with text data. 130 | library(quanteda) 131 | help(package = "quanteda") 132 | 133 | 134 | # Tokenize SMS text messages. 135 | train.tokens <- tokens(train$Text, what = "word", 136 | remove_numbers = TRUE, remove_punct = TRUE, 137 | remove_symbols = TRUE, remove_hyphens = TRUE) 138 | 139 | # Take a look at a specific SMS message and see how it transforms. 140 | train.tokens[[357]] 141 | 142 | 143 | # Lower case the tokens. 144 | train.tokens <- tokens_tolower(train.tokens) 145 | train.tokens[[357]] 146 | 147 | 148 | # Use quanteda's built-in stopword list for English. 
149 | # NOTE - You should always inspect stopword lists for applicability to 150 | # your problem/domain. 151 | train.tokens <- tokens_select(train.tokens, stopwords(), 152 | selection = "remove") 153 | train.tokens[[357]] 154 | 155 | 156 | # Perform stemming on the tokens. 157 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 158 | train.tokens[[357]] 159 | 160 | 161 | # Create our first bag-of-words model. 162 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 163 | 164 | 165 | # Transform to a matrix and inspect. 166 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 167 | View(train.tokens.matrix[1:20, 1:100]) 168 | dim(train.tokens.matrix) 169 | 170 | 171 | # Investigate the effects of stemming. 172 | colnames(train.tokens.matrix)[1:50] 173 | 174 | 175 | # Per best practices, we will leverage cross validation (CV) as 176 | # the basis of our modeling process. Using CV we can create 177 | # estimates of how well our model will do in Production on new, 178 | # unseen data. CV is powerful, but the downside is that it 179 | # requires more processing and therefore more time. 180 | # 181 | # If you are not familiar with CV, consult the following 182 | # Wikipedia article: 183 | # 184 | # https://en.wikipedia.org/wiki/Cross-validation_(statistics) 185 | # 186 | 187 | # Setup a the feature data frame with labels. 188 | train.tokens.df <- cbind(Label = train$Label, data.frame(train.tokens.dfm)) 189 | 190 | 191 | # Often, tokenization requires some additional pre-processing 192 | names(train.tokens.df)[c(146, 148, 235, 238)] 193 | 194 | 195 | # Cleanup column names. 196 | names(train.tokens.df) <- make.names(names(train.tokens.df)) 197 | 198 | 199 | # Use caret to create stratified folds for 10-fold cross validation repeated 200 | # 3 times (i.e., create 30 random stratified samples) 201 | set.seed(48743) 202 | cv.folds <- createMultiFolds(train$Label, k = 10, times = 3) 203 | 204 | cv.cntrl <- trainControl(method = "repeatedcv", number = 10, 205 | repeats = 3, index = cv.folds) 206 | 207 | 208 | # Our data frame is non-trivial in size. As such, CV runs will take 209 | # quite a long time to run. To cut down on total execution time, use 210 | # the doSNOW package to allow for multi-core training in parallel. 211 | # 212 | # WARNING - The following code is configured to run on a workstation- 213 | # or server-class machine (i.e., 12 logical cores). Alter 214 | # code to suit your HW environment. 215 | # 216 | #install.packages("doSNOW") 217 | library(doSNOW) 218 | 219 | 220 | # Time the code execution 221 | start.time <- Sys.time() 222 | 223 | 224 | # Create a cluster to work on 10 logical cores. 225 | cl <- makeCluster(10, type = "SOCK") 226 | registerDoSNOW(cl) 227 | 228 | 229 | # As our data is non-trivial in size at this point, use a single decision 230 | # tree alogrithm as our first model. We will graduate to using more 231 | # powerful algorithms later when we perform feature extraction to shrink 232 | # the size of our data. 233 | rpart.cv.1 <- train(Label ~ ., data = train.tokens.df, method = "rpart", 234 | trControl = cv.cntrl, tuneLength = 7) 235 | 236 | 237 | # Processing is done, stop cluster. 238 | stopCluster(cl) 239 | 240 | 241 | # Total time of execution on workstation was approximately 4 minutes. 242 | total.time <- Sys.time() - start.time 243 | total.time 244 | 245 | 246 | # Check out our results. 
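# In addition to printing the train object below, caret can summarize the
# cross-validated confusion matrix for the same model (cell values are
# averaged over the 30 resamples and shown as percentages):
confusionMatrix(rpart.cv.1)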
247 | rpart.cv.1 248 | 249 | 250 | 251 | # The use of Term Frequency-Inverse Document Frequency (TF-IDF) is a 252 | # powerful technique for enhancing the information/signal contained 253 | # within our document-frequency matrix. Specifically, the mathematics 254 | # behind TF-IDF accomplish the following goals: 255 | # 1 - The TF calculation accounts for the fact that longer 256 | # documents will have higher individual term counts. Applying 257 | # TF normalizes all documents in the corpus to be length 258 | # independent. 259 | # 2 - The IDF calculation accounts for the frequency of term 260 | # appearance in all documents in the corpus. The intuition 261 | # being that a term that appears in every document has no 262 | # predictive power. 263 | # 3 - The multiplication of TF by IDF for each cell in the matrix 264 | # allows for weighting of #1 and #2 for each cell in the matrix. 265 | 266 | 267 | # Our function for calculating relative term frequency (TF) 268 | term.frequency <- function(row) { 269 | row / sum(row) 270 | } 271 | 272 | # Our function for calculating inverse document frequency (IDF) 273 | inverse.doc.freq <- function(col) { 274 | corpus.size <- length(col) 275 | doc.count <- length(which(col > 0)) 276 | 277 | log10(corpus.size / doc.count) 278 | } 279 | 280 | # Our function for calculating TF-IDF. 281 | tf.idf <- function(x, idf) { 282 | x * idf 283 | } 284 | 285 | 286 | # First step, normalize all documents via TF. 287 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 288 | dim(train.tokens.df) 289 | View(train.tokens.df[1:20, 1:100]) 290 | 291 | 292 | # Second step, calculate the IDF vector that we will use - both 293 | # for training data and for test data! 294 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 295 | str(train.tokens.idf) 296 | 297 | 298 | # Lastly, calculate TF-IDF for our training corpus. 299 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, idf = train.tokens.idf) 300 | dim(train.tokens.tfidf) 301 | View(train.tokens.tfidf[1:25, 1:25]) 302 | 303 | 304 | # Transpose the matrix 305 | train.tokens.tfidf <- t(train.tokens.tfidf) 306 | dim(train.tokens.tfidf) 307 | View(train.tokens.tfidf[1:25, 1:25]) 308 | 309 | 310 | # Check for incopmlete cases. 311 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 312 | train$Text[incomplete.cases] 313 | 314 | 315 | # Fix incomplete cases 316 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 317 | dim(train.tokens.tfidf) 318 | sum(which(!complete.cases(train.tokens.tfidf))) 319 | 320 | 321 | # Make a clean data frame using the same process as before. 322 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 323 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 324 | 325 | 326 | # Time the code execution 327 | start.time <- Sys.time() 328 | 329 | # Create a cluster to work on 10 logical cores. 330 | cl <- makeCluster(3, type = "SOCK") 331 | registerDoSNOW(cl) 332 | 333 | # As our data is non-trivial in size at this point, use a single decision 334 | # tree alogrithm as our first model. We will graduate to using more 335 | # powerful algorithms later when we perform feature extraction to shrink 336 | # the size of our data. 337 | rpart.cv.2 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 338 | trControl = cv.cntrl, tuneLength = 7) 339 | 340 | # Processing is done, stop cluster. 
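# NOTE - stopCluster() below shuts the workers down, but foreach (attached by
#        doSNOW) is still registered against that cluster. If you keep working
#        in the same session, it is safest to also point foreach back at
#        sequential execution:
registerDoSEQ()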
341 | stopCluster(cl) 342 | 343 | # Total time of execution on workstation was 344 | total.time <- Sys.time() - start.time 345 | total.time 346 | 347 | # Check out our results. 348 | rpart.cv.2 349 | 350 | 351 | 352 | # N-grams allow us to augment our document-term frequency matrices with 353 | # word ordering. This often leads to increased performance (e.g., accuracy) 354 | # for machine learning models trained with more than just unigrams (i.e., 355 | # single terms). Let's add bigrams to our training data and the TF-IDF 356 | # transform the expanded featre matrix to see if accuracy improves. 357 | 358 | # Add bigrams to our feature matrix. 359 | train.tokens <- tokens_ngrams(train.tokens, n = 1:2) 360 | train.tokens[[357]] 361 | 362 | 363 | # Transform to dfm and then a matrix. 364 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 365 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 366 | train.tokens.dfm 367 | 368 | 369 | # Normalize all documents via TF. 370 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 371 | 372 | 373 | # Calculate the IDF vector that we will use for training and test data! 374 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 375 | 376 | 377 | # Calculate TF-IDF for our training corpus 378 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, 379 | idf = train.tokens.idf) 380 | 381 | 382 | # Transpose the matrix 383 | train.tokens.tfidf <- t(train.tokens.tfidf) 384 | 385 | 386 | # Fix incomplete cases 387 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 388 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 389 | 390 | 391 | # Make a clean data frame. 392 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 393 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 394 | 395 | 396 | # Clean up unused objects in memory. 397 | gc() -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part7.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 7 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=Fza5szojsU8 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | # Clean up the data frame and view our handiwork. 
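# Keeping only the first two columns below assumes the remaining columns of
# the raw file are essentially empty. A quick way to confirm that before
# dropping anything is to count the non-blank entries per column (the extra
# column names depend on your copy of spam.csv):
sapply(spam.raw, function(x) sum(!is.na(x) & nzchar(trimws(x))))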
39 | spam.raw <- spam.raw[, 1:2] 40 | names(spam.raw) <- c("Label", "Text") 41 | View(spam.raw) 42 | 43 | 44 | 45 | # Check data to see if there are missing values. 46 | length(which(!complete.cases(spam.raw))) 47 | 48 | 49 | 50 | # Convert our class label into a factor. 51 | spam.raw$Label <- as.factor(spam.raw$Label) 52 | 53 | 54 | 55 | # The first step, as always, is to explore the data. 56 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 57 | prop.table(table(spam.raw$Label)) 58 | 59 | 60 | 61 | # Next up, let's get a feel for the distribution of text lengths of the SMS 62 | # messages by adding a new feature for the length of each message. 63 | spam.raw$TextLength <- nchar(spam.raw$Text) 64 | summary(spam.raw$TextLength) 65 | 66 | 67 | 68 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 69 | library(ggplot2) 70 | 71 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 72 | theme_bw() + 73 | geom_histogram(binwidth = 5) + 74 | labs(y = "Text Count", x = "Length of Text", 75 | title = "Distribution of Text Lengths with Class Labels") 76 | 77 | 78 | 79 | # At a minimum we need to split our data into a training set and a 80 | # test set. In a true project we would want to use a three-way split 81 | # of training, validation, and test. 82 | # 83 | # As we know that our data has non-trivial class imbalance, we'll 84 | # use the mighty caret package to create a randomg train/test split 85 | # that ensures the correct ham/spam class label proportions (i.e., 86 | # we'll use caret for a random stratified split). 87 | library(caret) 88 | help(package = "caret") 89 | 90 | 91 | # Use caret to create a 70%/30% stratified split. Set the random 92 | # seed for reproducibility. 93 | set.seed(32984) 94 | indexes <- createDataPartition(spam.raw$Label, times = 1, 95 | p = 0.7, list = FALSE) 96 | 97 | train <- spam.raw[indexes,] 98 | test <- spam.raw[-indexes,] 99 | 100 | 101 | # Verify proportions. 102 | prop.table(table(train$Label)) 103 | prop.table(table(test$Label)) 104 | 105 | 106 | 107 | # Text analytics requires a lot of data exploration, data pre-processing 108 | # and data wrangling. Let's explore some examples. 109 | 110 | # HTML-escaped ampersand character. 111 | train$Text[21] 112 | 113 | 114 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 115 | # is an actual person, but we will ignore the implications of this for 116 | # this introductory tutorial. 117 | train$Text[38] 118 | 119 | 120 | # A URL. 121 | train$Text[357] 122 | 123 | 124 | 125 | # There are many packages in the R ecosystem for performing text 126 | # analytics. One of the newer packages in quanteda. The quanteda 127 | # package has many useful functions for quickly and easily working 128 | # with text data. 129 | library(quanteda) 130 | help(package = "quanteda") 131 | 132 | 133 | # Tokenize SMS text messages. 134 | train.tokens <- tokens(train$Text, what = "word", 135 | remove_numbers = TRUE, remove_punct = TRUE, 136 | remove_symbols = TRUE, remove_hyphens = TRUE) 137 | 138 | # Take a look at a specific SMS message and see how it transforms. 139 | train.tokens[[357]] 140 | 141 | 142 | # Lower case the tokens. 143 | train.tokens <- tokens_tolower(train.tokens) 144 | train.tokens[[357]] 145 | 146 | 147 | # Use quanteda's built-in stopword list for English. 148 | # NOTE - You should always inspect stopword lists for applicability to 149 | # your problem/domain. 
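# For example, the default English list used below (re-exported by quanteda
# from the stopwords package) is easy to eyeball:
length(stopwords())
head(stopwords(), 25)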
150 | train.tokens <- tokens_select(train.tokens, stopwords(), 151 | selection = "remove") 152 | train.tokens[[357]] 153 | 154 | 155 | # Perform stemming on the tokens. 156 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 157 | train.tokens[[357]] 158 | 159 | 160 | # Create our first bag-of-words model. 161 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 162 | 163 | 164 | # Transform to a matrix and inspect. 165 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 166 | View(train.tokens.matrix[1:20, 1:100]) 167 | dim(train.tokens.matrix) 168 | 169 | 170 | # Investigate the effects of stemming. 171 | colnames(train.tokens.matrix)[1:50] 172 | 173 | 174 | # Per best practices, we will leverage cross validation (CV) as 175 | # the basis of our modeling process. Using CV we can create 176 | # estimates of how well our model will do in Production on new, 177 | # unseen data. CV is powerful, but the downside is that it 178 | # requires more processing and therefore more time. 179 | # 180 | # If you are not familiar with CV, consult the following 181 | # Wikipedia article: 182 | # 183 | # https://en.wikipedia.org/wiki/Cross-validation_(statistics) 184 | # 185 | 186 | # Setup a the feature data frame with labels. 187 | train.tokens.df <- cbind(Label = train$Label, data.frame(train.tokens.dfm)) 188 | 189 | 190 | # Often, tokenization requires some additional pre-processing 191 | names(train.tokens.df)[c(146, 148, 235, 238)] 192 | 193 | 194 | # Cleanup column names. 195 | names(train.tokens.df) <- make.names(names(train.tokens.df)) 196 | 197 | 198 | # Use caret to create stratified folds for 10-fold cross validation repeated 199 | # 3 times (i.e., create 30 random stratified samples) 200 | set.seed(48743) 201 | cv.folds <- createMultiFolds(train$Label, k = 10, times = 3) 202 | 203 | cv.cntrl <- trainControl(method = "repeatedcv", number = 10, 204 | repeats = 3, index = cv.folds) 205 | 206 | 207 | # Our data frame is non-trivial in size. As such, CV runs will take 208 | # quite a long time to run. To cut down on total execution time, use 209 | # the doSNOW package to allow for multi-core training in parallel. 210 | # 211 | # WARNING - The following code is configured to run on a workstation- 212 | # or server-class machine (i.e., 12 logical cores). Alter 213 | # code to suit your HW environment. 214 | # 215 | #install.packages("doSNOW") 216 | library(doSNOW) 217 | 218 | 219 | # Time the code execution 220 | start.time <- Sys.time() 221 | 222 | 223 | # Create a cluster to work on 10 logical cores. 224 | cl <- makeCluster(10, type = "SOCK") 225 | registerDoSNOW(cl) 226 | 227 | 228 | # As our data is non-trivial in size at this point, use a single decision 229 | # tree alogrithm as our first model. We will graduate to using more 230 | # powerful algorithms later when we perform feature extraction to shrink 231 | # the size of our data. 232 | rpart.cv.1 <- train(Label ~ ., data = train.tokens.df, method = "rpart", 233 | trControl = cv.cntrl, tuneLength = 7) 234 | 235 | 236 | # Processing is done, stop cluster. 237 | stopCluster(cl) 238 | 239 | 240 | # Total time of execution on workstation was approximately 4 minutes. 241 | total.time <- Sys.time() - start.time 242 | total.time 243 | 244 | 245 | # Check out our results. 246 | rpart.cv.1 247 | 248 | 249 | 250 | # The use of Term Frequency-Inverse Document Frequency (TF-IDF) is a 251 | # powerful technique for enhancing the information/signal contained 252 | # within our document-frequency matrix. 
Specifically, the mathematics 253 | # behind TF-IDF accomplish the following goals: 254 | # 1 - The TF calculation accounts for the fact that longer 255 | # documents will have higher individual term counts. Applying 256 | # TF normalizes all documents in the corpus to be length 257 | # independent. 258 | # 2 - The IDF calculation accounts for the frequency of term 259 | # appearance in all documents in the corpus. The intuition 260 | # being that a term that appears in every document has no 261 | # predictive power. 262 | # 3 - The multiplication of TF by IDF for each cell in the matrix 263 | # allows for weighting of #1 and #2 for each cell in the matrix. 264 | 265 | 266 | # Our function for calculating relative term frequency (TF) 267 | term.frequency <- function(row) { 268 | row / sum(row) 269 | } 270 | 271 | # Our function for calculating inverse document frequency (IDF) 272 | inverse.doc.freq <- function(col) { 273 | corpus.size <- length(col) 274 | doc.count <- length(which(col > 0)) 275 | 276 | log10(corpus.size / doc.count) 277 | } 278 | 279 | # Our function for calculating TF-IDF. 280 | tf.idf <- function(x, idf) { 281 | x * idf 282 | } 283 | 284 | 285 | # First step, normalize all documents via TF. 286 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 287 | dim(train.tokens.df) 288 | View(train.tokens.df[1:20, 1:100]) 289 | 290 | 291 | # Second step, calculate the IDF vector that we will use - both 292 | # for training data and for test data! 293 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 294 | str(train.tokens.idf) 295 | 296 | 297 | # Lastly, calculate TF-IDF for our training corpus. 298 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, idf = train.tokens.idf) 299 | dim(train.tokens.tfidf) 300 | View(train.tokens.tfidf[1:25, 1:25]) 301 | 302 | 303 | # Transpose the matrix 304 | train.tokens.tfidf <- t(train.tokens.tfidf) 305 | dim(train.tokens.tfidf) 306 | View(train.tokens.tfidf[1:25, 1:25]) 307 | 308 | 309 | # Check for incopmlete cases. 310 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 311 | train$Text[incomplete.cases] 312 | 313 | 314 | # Fix incomplete cases 315 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 316 | dim(train.tokens.tfidf) 317 | sum(which(!complete.cases(train.tokens.tfidf))) 318 | 319 | 320 | # Make a clean data frame using the same process as before. 321 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 322 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 323 | 324 | 325 | # Time the code execution 326 | start.time <- Sys.time() 327 | 328 | # Create a cluster to work on 10 logical cores. 329 | cl <- makeCluster(3, type = "SOCK") 330 | registerDoSNOW(cl) 331 | 332 | # As our data is non-trivial in size at this point, use a single decision 333 | # tree alogrithm as our first model. We will graduate to using more 334 | # powerful algorithms later when we perform feature extraction to shrink 335 | # the size of our data. 336 | rpart.cv.2 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 337 | trControl = cv.cntrl, tuneLength = 7) 338 | 339 | # Processing is done, stop cluster. 340 | stopCluster(cl) 341 | 342 | # Total time of execution on workstation was 343 | total.time <- Sys.time() - start.time 344 | total.time 345 | 346 | # Check out our results. 
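# Because rpart.cv.1 and rpart.cv.2 were trained on the exact same cv.folds,
# caret's resamples() can line the two CV runs up for a direct comparison
# (the list names here are just labels for the output):
summary(resamples(list(BagOfWords = rpart.cv.1, TFIDF = rpart.cv.2)))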
347 | rpart.cv.2 348 | 349 | 350 | 351 | # N-grams allow us to augment our document-term frequency matrices with 352 | # word ordering. This often leads to increased performance (e.g., accuracy) 353 | # for machine learning models trained with more than just unigrams (i.e., 354 | # single terms). Let's add bigrams to our training data and the TF-IDF 355 | # transform the expanded featre matrix to see if accuracy improves. 356 | 357 | # Add bigrams to our feature matrix. 358 | train.tokens <- tokens_ngrams(train.tokens, n = 1:2) 359 | train.tokens[[357]] 360 | 361 | 362 | # Transform to dfm and then a matrix. 363 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 364 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 365 | train.tokens.dfm 366 | 367 | 368 | # Normalize all documents via TF. 369 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 370 | 371 | 372 | # Calculate the IDF vector that we will use for training and test data! 373 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 374 | 375 | 376 | # Calculate TF-IDF for our training corpus 377 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, 378 | idf = train.tokens.idf) 379 | 380 | 381 | # Transpose the matrix 382 | train.tokens.tfidf <- t(train.tokens.tfidf) 383 | 384 | 385 | # Fix incomplete cases 386 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 387 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 388 | 389 | 390 | # Make a clean data frame. 391 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 392 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 393 | 394 | 395 | # Clean up unused objects in memory. 396 | gc() 397 | 398 | 399 | 400 | 401 | # 402 | # NOTE - The following code requires the use of command-line R to execute 403 | # due to the large number of features (i.e., columns) in the matrix. 404 | # Please consult the following link for more details if you wish 405 | # to run the code yourself: 406 | # 407 | # https://stackoverflow.com/questions/28728774/how-to-set-max-ppsize-in-r 408 | # 409 | # Also note that running the following code required approximately 410 | # 38GB of RAM and more than 4.5 hours to execute on a 10-core 411 | # workstation! 412 | # 413 | 414 | 415 | # Time the code execution 416 | # start.time <- Sys.time() 417 | 418 | # Leverage single decision trees to evaluate if adding bigrams improves the 419 | # the effectiveness of the model. 420 | # rpart.cv.3 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 421 | # trControl = cv.cntrl, tuneLength = 7) 422 | 423 | # Total time of execution on workstation was 424 | # total.time <- Sys.time() - start.time 425 | # total.time 426 | 427 | # Check out our results. 428 | # rpart.cv.3 429 | 430 | # 431 | # The results of the above processing show a slight decline in rpart 432 | # effectiveness with a 10-fold CV repeated 3 times accuracy of 0.9457. 433 | # As we will discuss later, while the addition of bigrams appears to 434 | # negatively impact a single decision tree, it helps with the mighty 435 | # random forest! 
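# If you want to experiment with bigrams on more modest hardware, one common
# tactic (not used in this tutorial) is to drop very rare n-grams from the
# sparse dfm before densifying, for example with quanteda's dfm_trim(). The
# threshold below is purely illustrative, and the TF-IDF steps would need to
# be re-run afterwards:
# train.tokens.dfm <- dfm_trim(train.tokens.dfm, min_docfreq = 2)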
436 | # 437 | 438 | 439 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part8.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 8 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=4DI68P4hicQ 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | # Clean up the data frame and view our handiwork. 39 | spam.raw <- spam.raw[, 1:2] 40 | names(spam.raw) <- c("Label", "Text") 41 | View(spam.raw) 42 | 43 | 44 | 45 | # Check data to see if there are missing values. 46 | length(which(!complete.cases(spam.raw))) 47 | 48 | 49 | 50 | # Convert our class label into a factor. 51 | spam.raw$Label <- as.factor(spam.raw$Label) 52 | 53 | 54 | 55 | # The first step, as always, is to explore the data. 56 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 57 | prop.table(table(spam.raw$Label)) 58 | 59 | 60 | 61 | # Next up, let's get a feel for the distribution of text lengths of the SMS 62 | # messages by adding a new feature for the length of each message. 63 | spam.raw$TextLength <- nchar(spam.raw$Text) 64 | summary(spam.raw$TextLength) 65 | 66 | 67 | 68 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 69 | library(ggplot2) 70 | 71 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 72 | theme_bw() + 73 | geom_histogram(binwidth = 5) + 74 | labs(y = "Text Count", x = "Length of Text", 75 | title = "Distribution of Text Lengths with Class Labels") 76 | 77 | 78 | 79 | # At a minimum we need to split our data into a training set and a 80 | # test set. In a true project we would want to use a three-way split 81 | # of training, validation, and test. 82 | # 83 | # As we know that our data has non-trivial class imbalance, we'll 84 | # use the mighty caret package to create a randomg train/test split 85 | # that ensures the correct ham/spam class label proportions (i.e., 86 | # we'll use caret for a random stratified split). 87 | library(caret) 88 | help(package = "caret") 89 | 90 | 91 | # Use caret to create a 70%/30% stratified split. Set the random 92 | # seed for reproducibility. 93 | set.seed(32984) 94 | indexes <- createDataPartition(spam.raw$Label, times = 1, 95 | p = 0.7, list = FALSE) 96 | 97 | train <- spam.raw[indexes,] 98 | test <- spam.raw[-indexes,] 99 | 100 | 101 | # Verify proportions. 
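# The proportions below also give us a baseline to beat: a model that always
# predicts the majority class ("ham") is already right roughly that fraction
# of the time, so any accuracy we report should be judged against it.
max(prop.table(table(train$Label)))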
102 | prop.table(table(train$Label)) 103 | prop.table(table(test$Label)) 104 | 105 | 106 | 107 | # Text analytics requires a lot of data exploration, data pre-processing 108 | # and data wrangling. Let's explore some examples. 109 | 110 | # HTML-escaped ampersand character. 111 | train$Text[21] 112 | 113 | 114 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 115 | # is an actual person, but we will ignore the implications of this for 116 | # this introductory tutorial. 117 | train$Text[38] 118 | 119 | 120 | # A URL. 121 | train$Text[357] 122 | 123 | 124 | 125 | # There are many packages in the R ecosystem for performing text 126 | # analytics. One of the newer packages in quanteda. The quanteda 127 | # package has many useful functions for quickly and easily working 128 | # with text data. 129 | library(quanteda) 130 | help(package = "quanteda") 131 | 132 | 133 | # Tokenize SMS text messages. 134 | train.tokens <- tokens(train$Text, what = "word", 135 | remove_numbers = TRUE, remove_punct = TRUE, 136 | remove_symbols = TRUE, remove_hyphens = TRUE) 137 | 138 | # Take a look at a specific SMS message and see how it transforms. 139 | train.tokens[[357]] 140 | 141 | 142 | # Lower case the tokens. 143 | train.tokens <- tokens_tolower(train.tokens) 144 | train.tokens[[357]] 145 | 146 | 147 | # Use quanteda's built-in stopword list for English. 148 | # NOTE - You should always inspect stopword lists for applicability to 149 | # your problem/domain. 150 | train.tokens <- tokens_select(train.tokens, stopwords(), 151 | selection = "remove") 152 | train.tokens[[357]] 153 | 154 | 155 | # Perform stemming on the tokens. 156 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 157 | train.tokens[[357]] 158 | 159 | 160 | # Create our first bag-of-words model. 161 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 162 | 163 | 164 | # Transform to a matrix and inspect. 165 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 166 | View(train.tokens.matrix[1:20, 1:100]) 167 | dim(train.tokens.matrix) 168 | 169 | 170 | # Investigate the effects of stemming. 171 | colnames(train.tokens.matrix)[1:50] 172 | 173 | 174 | # Per best practices, we will leverage cross validation (CV) as 175 | # the basis of our modeling process. Using CV we can create 176 | # estimates of how well our model will do in Production on new, 177 | # unseen data. CV is powerful, but the downside is that it 178 | # requires more processing and therefore more time. 179 | # 180 | # If you are not familiar with CV, consult the following 181 | # Wikipedia article: 182 | # 183 | # https://en.wikipedia.org/wiki/Cross-validation_(statistics) 184 | # 185 | 186 | # Setup a the feature data frame with labels. 187 | train.tokens.df <- cbind(Label = train$Label, data.frame(train.tokens.dfm)) 188 | 189 | 190 | # Often, tokenization requires some additional pre-processing 191 | names(train.tokens.df)[c(146, 148, 235, 238)] 192 | 193 | 194 | # Cleanup column names. 195 | names(train.tokens.df) <- make.names(names(train.tokens.df)) 196 | 197 | 198 | # Use caret to create stratified folds for 10-fold cross validation repeated 199 | # 3 times (i.e., create 30 random stratified samples) 200 | set.seed(48743) 201 | cv.folds <- createMultiFolds(train$Label, k = 10, times = 3) 202 | 203 | cv.cntrl <- trainControl(method = "repeatedcv", number = 10, 204 | repeats = 3, index = cv.folds) 205 | 206 | 207 | # Our data frame is non-trivial in size. As such, CV runs will take 208 | # quite a long time to run. 
To cut down on total execution time, use 209 | # the doSNOW package to allow for multi-core training in parallel. 210 | # 211 | # WARNING - The following code is configured to run on a workstation- 212 | # or server-class machine (i.e., 12 logical cores). Alter 213 | # code to suit your HW environment. 214 | # 215 | #install.packages("doSNOW") 216 | library(doSNOW) 217 | 218 | 219 | # Time the code execution 220 | start.time <- Sys.time() 221 | 222 | 223 | # Create a cluster to work on 10 logical cores. 224 | cl <- makeCluster(10, type = "SOCK") 225 | registerDoSNOW(cl) 226 | 227 | 228 | # As our data is non-trivial in size at this point, use a single decision 229 | # tree alogrithm as our first model. We will graduate to using more 230 | # powerful algorithms later when we perform feature extraction to shrink 231 | # the size of our data. 232 | rpart.cv.1 <- train(Label ~ ., data = train.tokens.df, method = "rpart", 233 | trControl = cv.cntrl, tuneLength = 7) 234 | 235 | 236 | # Processing is done, stop cluster. 237 | stopCluster(cl) 238 | 239 | 240 | # Total time of execution on workstation was approximately 4 minutes. 241 | total.time <- Sys.time() - start.time 242 | total.time 243 | 244 | 245 | # Check out our results. 246 | rpart.cv.1 247 | 248 | 249 | 250 | # The use of Term Frequency-Inverse Document Frequency (TF-IDF) is a 251 | # powerful technique for enhancing the information/signal contained 252 | # within our document-frequency matrix. Specifically, the mathematics 253 | # behind TF-IDF accomplish the following goals: 254 | # 1 - The TF calculation accounts for the fact that longer 255 | # documents will have higher individual term counts. Applying 256 | # TF normalizes all documents in the corpus to be length 257 | # independent. 258 | # 2 - The IDF calculation accounts for the frequency of term 259 | # appearance in all documents in the corpus. The intuition 260 | # being that a term that appears in every document has no 261 | # predictive power. 262 | # 3 - The multiplication of TF by IDF for each cell in the matrix 263 | # allows for weighting of #1 and #2 for each cell in the matrix. 264 | 265 | 266 | # Our function for calculating relative term frequency (TF) 267 | term.frequency <- function(row) { 268 | row / sum(row) 269 | } 270 | 271 | # Our function for calculating inverse document frequency (IDF) 272 | inverse.doc.freq <- function(col) { 273 | corpus.size <- length(col) 274 | doc.count <- length(which(col > 0)) 275 | 276 | log10(corpus.size / doc.count) 277 | } 278 | 279 | # Our function for calculating TF-IDF. 280 | tf.idf <- function(x, idf) { 281 | x * idf 282 | } 283 | 284 | 285 | # First step, normalize all documents via TF. 286 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 287 | dim(train.tokens.df) 288 | View(train.tokens.df[1:20, 1:100]) 289 | 290 | 291 | # Second step, calculate the IDF vector that we will use - both 292 | # for training data and for test data! 293 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 294 | str(train.tokens.idf) 295 | 296 | 297 | # Lastly, calculate TF-IDF for our training corpus. 298 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, idf = train.tokens.idf) 299 | dim(train.tokens.tfidf) 300 | View(train.tokens.tfidf[1:25, 1:25]) 301 | 302 | 303 | # Transpose the matrix 304 | train.tokens.tfidf <- t(train.tokens.tfidf) 305 | dim(train.tokens.tfidf) 306 | View(train.tokens.tfidf[1:25, 1:25]) 307 | 308 | 309 | # Check for incopmlete cases. 
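# Where do incomplete cases come from? If every token of a message was
# stripped out by the pre-processing above (stopwords, punctuation, numbers,
# etc.), its row sum in the term-count matrix is 0, so the TF step divides
# by zero and produces NaN values. Toy illustration using our own
# term.frequency() function (made-up term names):
term.frequency(c(free = 0, call = 0, now = 0))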
310 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 311 | train$Text[incomplete.cases] 312 | 313 | 314 | # Fix incomplete cases 315 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 316 | dim(train.tokens.tfidf) 317 | sum(which(!complete.cases(train.tokens.tfidf))) 318 | 319 | 320 | # Make a clean data frame using the same process as before. 321 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 322 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 323 | 324 | 325 | # Time the code execution 326 | start.time <- Sys.time() 327 | 328 | # Create a cluster to work on 10 logical cores. 329 | cl <- makeCluster(3, type = "SOCK") 330 | registerDoSNOW(cl) 331 | 332 | # As our data is non-trivial in size at this point, use a single decision 333 | # tree alogrithm as our first model. We will graduate to using more 334 | # powerful algorithms later when we perform feature extraction to shrink 335 | # the size of our data. 336 | rpart.cv.2 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 337 | trControl = cv.cntrl, tuneLength = 7) 338 | 339 | # Processing is done, stop cluster. 340 | stopCluster(cl) 341 | 342 | # Total time of execution on workstation was 343 | total.time <- Sys.time() - start.time 344 | total.time 345 | 346 | # Check out our results. 347 | rpart.cv.2 348 | 349 | 350 | 351 | # N-grams allow us to augment our document-term frequency matrices with 352 | # word ordering. This often leads to increased performance (e.g., accuracy) 353 | # for machine learning models trained with more than just unigrams (i.e., 354 | # single terms). Let's add bigrams to our training data and the TF-IDF 355 | # transform the expanded featre matrix to see if accuracy improves. 356 | 357 | # Add bigrams to our feature matrix. 358 | train.tokens <- tokens_ngrams(train.tokens, n = 1:2) 359 | train.tokens[[357]] 360 | 361 | 362 | # Transform to dfm and then a matrix. 363 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 364 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 365 | train.tokens.dfm 366 | 367 | 368 | # Normalize all documents via TF. 369 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 370 | 371 | 372 | # Calculate the IDF vector that we will use for training and test data! 373 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 374 | 375 | 376 | # Calculate TF-IDF for our training corpus 377 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, 378 | idf = train.tokens.idf) 379 | 380 | 381 | # Transpose the matrix 382 | train.tokens.tfidf <- t(train.tokens.tfidf) 383 | 384 | 385 | # Fix incomplete cases 386 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 387 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 388 | 389 | 390 | # Make a clean data frame. 391 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 392 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 393 | 394 | 395 | # Clean up unused objects in memory. 396 | gc() 397 | 398 | 399 | 400 | 401 | # 402 | # NOTE - The following code requires the use of command-line R to execute 403 | # due to the large number of features (i.e., columns) in the matrix. 
404 | # Please consult the following link for more details if you wish 405 | # to run the code yourself: 406 | # 407 | # https://stackoverflow.com/questions/28728774/how-to-set-max-ppsize-in-r 408 | # 409 | # Also note that running the following code required approximately 410 | # 38GB of RAM and more than 4.5 hours to execute on a 10-core 411 | # workstation! 412 | # 413 | 414 | 415 | # Time the code execution 416 | # start.time <- Sys.time() 417 | 418 | # Leverage single decision trees to evaluate if adding bigrams improves the 419 | # the effectiveness of the model. 420 | # rpart.cv.3 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 421 | # trControl = cv.cntrl, tuneLength = 7) 422 | 423 | # Total time of execution on workstation was 424 | # total.time <- Sys.time() - start.time 425 | # total.time 426 | 427 | # Check out our results. 428 | # rpart.cv.3 429 | 430 | # 431 | # The results of the above processing show a slight decline in rpart 432 | # effectiveness with a 10-fold CV repeated 3 times accuracy of 0.9457. 433 | # As we will discuss later, while the addition of bigrams appears to 434 | # negatively impact a single decision tree, it helps with the mighty 435 | # random forest! 436 | # 437 | 438 | 439 | 440 | 441 | # We'll leverage the irlba package for our singular value 442 | # decomposition (SVD). The irlba package allows us to specify 443 | # the number of the most important singular vectors we wish to 444 | # calculate and retain for features. 445 | library(irlba) 446 | 447 | 448 | # Time the code execution 449 | start.time <- Sys.time() 450 | 451 | # Perform SVD. Specifically, reduce dimensionality down to 300 columns 452 | # for our latent semantic analysis (LSA). 453 | train.irlba <- irlba(t(train.tokens.tfidf), nv = 300, maxit = 600) 454 | 455 | # Total time of execution on workstation was 456 | total.time <- Sys.time() - start.time 457 | total.time 458 | 459 | 460 | # Take a look at the new feature data up close. 461 | View(train.irlba$v) 462 | 463 | 464 | # As with TF-IDF, we will need to project new data (e.g., the test data) 465 | # into the SVD semantic space. The following code illustrates how to do 466 | # this using a row of the training data that has already been transformed 467 | # by TF-IDF, per the mathematics illustrated in the slides. 468 | # 469 | # 470 | sigma.inverse <- 1 / train.irlba$d 471 | u.transpose <- t(train.irlba$u) 472 | document <- train.tokens.tfidf[1,] 473 | document.hat <- sigma.inverse * u.transpose %*% document 474 | 475 | # Look at the first 10 components of projected document and the corresponding 476 | # row in our document semantic space (i.e., the V matrix) 477 | document.hat[1:10] 478 | train.irlba$v[1, 1:10] 479 | 480 | 481 | 482 | # 483 | # Create new feature data frame using our document semantic space of 300 484 | # features (i.e., the V matrix from our SVD). 485 | # 486 | train.svd <- data.frame(Label = train$Label, train.irlba$v) 487 | 488 | 489 | # Create a cluster to work on 10 logical cores. 490 | cl <- makeCluster(10, type = "SOCK") 491 | registerDoSNOW(cl) 492 | 493 | # Time the code execution 494 | start.time <- Sys.time() 495 | 496 | # This will be the last run using single decision trees. With a much smaller 497 | # feature matrix we can now use more powerful methods like the mighty Random 498 | # Forest from now on! 499 | rpart.cv.4 <- train(Label ~ ., data = train.svd, method = "rpart", 500 | trControl = cv.cntrl, tuneLength = 7) 501 | 502 | # Processing is done, stop cluster. 
503 | stopCluster(cl) 504 | 505 | # Total time of execution on workstation was 506 | total.time <- Sys.time() - start.time 507 | total.time 508 | 509 | # Check out our results. 510 | rpart.cv.4 511 | 512 | 513 | 514 | 515 | # 516 | # NOTE - The following code takes a long time to run. Here's the math. 517 | # We are performing 10-fold CV repeated 3 times. That means we 518 | # need to build 30 models. We are also asking caret to try 7 519 | # different values of the mtry parameter. Next up by default 520 | # a mighty random forest leverages 500 trees. Lastly, caret will 521 | # build 1 final model at the end of the process with the best 522 | # mtry value over all the training data. Here's the number of 523 | # tree we're building: 524 | # 525 | # (10 * 3 * 7 * 500) + 500 = 105,500 trees! 526 | # 527 | # On a workstation using 10 cores the following code took 28 minutes 528 | # to execute. 529 | # 530 | 531 | 532 | # Create a cluster to work on 10 logical cores. 533 | # cl <- makeCluster(10, type = "SOCK") 534 | # registerDoSNOW(cl) 535 | 536 | # Time the code execution 537 | # start.time <- Sys.time() 538 | 539 | # We have reduced the dimensionality of our data using SVD. Also, the 540 | # application of SVD allows us to use LSA to simultaneously increase the 541 | # information density of each feature. To prove this out, leverage a 542 | # mighty Random Forest with the default of 500 trees. We'll also ask 543 | # caret to try 7 different values of mtry to find the mtry value that 544 | # gives the best result! 545 | # rf.cv.1 <- train(Label ~ ., data = train.svd, method = "rf", 546 | # trControl = cv.cntrl, tuneLength = 7) 547 | 548 | # Processing is done, stop cluster. 549 | # stopCluster(cl) 550 | 551 | # Total time of execution on workstation was 552 | # total.time <- Sys.time() - start.time 553 | # total.time 554 | 555 | 556 | # Load processing results from disk! 557 | load("rf.cv.1.RData") 558 | 559 | # Check out our results. 560 | rf.cv.1 561 | 562 | # Let's drill-down on the results. 563 | confusionMatrix(train.svd$Label, rf.cv.1$finalModel$predicted) 564 | 565 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part9.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 9 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=SgrLE6WQzkE 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 
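# (The read.csv() call below assumes the copy of spam.csv shipped with this
#  repo, which the script reads as UTF-16 encoded. If your copy came from
#  elsewhere and characters look garbled after loading, peeking at the raw
#  file first is a cheap sanity check - an optional, illustrative line:)
# readLines(file("spam.csv", encoding = "UTF-16"), n = 2)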
34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | # Clean up the data frame and view our handiwork. 39 | spam.raw <- spam.raw[, 1:2] 40 | names(spam.raw) <- c("Label", "Text") 41 | View(spam.raw) 42 | 43 | 44 | 45 | # Check data to see if there are missing values. 46 | length(which(!complete.cases(spam.raw))) 47 | 48 | 49 | 50 | # Convert our class label into a factor. 51 | spam.raw$Label <- as.factor(spam.raw$Label) 52 | 53 | 54 | 55 | # The first step, as always, is to explore the data. 56 | # First, let's take a look at the distribution of the class labels (i.e., ham vs. spam). 57 | prop.table(table(spam.raw$Label)) 58 | 59 | 60 | 61 | # Next up, let's get a feel for the distribution of text lengths of the SMS 62 | # messages by adding a new feature for the length of each message. 63 | spam.raw$TextLength <- nchar(spam.raw$Text) 64 | summary(spam.raw$TextLength) 65 | 66 | 67 | 68 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 69 | library(ggplot2) 70 | 71 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 72 | theme_bw() + 73 | geom_histogram(binwidth = 5) + 74 | labs(y = "Text Count", x = "Length of Text", 75 | title = "Distribution of Text Lengths with Class Labels") 76 | 77 | 78 | 79 | # At a minimum we need to split our data into a training set and a 80 | # test set. In a true project we would want to use a three-way split 81 | # of training, validation, and test. 82 | # 83 | # As we know that our data has non-trivial class imbalance, we'll 84 | # use the mighty caret package to create a random train/test split 85 | # that ensures the correct ham/spam class label proportions (i.e., 86 | # we'll use caret for a random stratified split). 87 | library(caret) 88 | help(package = "caret") 89 | 90 | 91 | # Use caret to create a 70%/30% stratified split. Set the random 92 | # seed for reproducibility. 93 | set.seed(32984) 94 | indexes <- createDataPartition(spam.raw$Label, times = 1, 95 | p = 0.7, list = FALSE) 96 | 97 | train <- spam.raw[indexes,] 98 | test <- spam.raw[-indexes,] 99 | 100 | 101 | # Verify proportions. 102 | prop.table(table(train$Label)) 103 | prop.table(table(test$Label)) 104 | 105 | 106 | 107 | # Text analytics requires a lot of data exploration, data pre-processing 108 | # and data wrangling. Let's explore some examples. 109 | 110 | # HTML-escaped ampersand character. 111 | train$Text[21] 112 | 113 | 114 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 115 | # is an actual person, but we will ignore the implications of this for 116 | # this introductory tutorial. 117 | train$Text[38] 118 | 119 | 120 | # A URL. 121 | train$Text[357] 122 | 123 | 124 | 125 | # There are many packages in the R ecosystem for performing text 126 | # analytics. One of the newer packages is quanteda. The quanteda 127 | # package has many useful functions for quickly and easily working 128 | # with text data. 129 | library(quanteda) 130 | help(package = "quanteda") 131 | 132 | 133 | # Tokenize SMS text messages. 134 | train.tokens <- tokens(train$Text, what = "word", 135 | remove_numbers = TRUE, remove_punct = TRUE, 136 | remove_symbols = TRUE, remove_hyphens = TRUE) 137 | 138 | # Take a look at a specific SMS message and see how it transforms. 139 | train.tokens[[357]] 140 | 141 | 142 | # Lower case the tokens. 143 | train.tokens <- tokens_tolower(train.tokens) 144 | train.tokens[[357]] 145 | 146 | 147 | # Use quanteda's built-in stopword list for English.
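# (As the NOTE just below advises, it is worth eyeballing what will be thrown
#  away before removing it. A minimal peek - stopwords() with no arguments
#  returns the English list in the quanteda versions this series uses:)
head(stopwords(), 20)
length(stopwords())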
148 | # NOTE - You should always inspect stopword lists for applicability to 149 | # your problem/domain. 150 | train.tokens <- tokens_select(train.tokens, stopwords(), 151 | selection = "remove") 152 | train.tokens[[357]] 153 | 154 | 155 | # Perform stemming on the tokens. 156 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 157 | train.tokens[[357]] 158 | 159 | 160 | # Create our first bag-of-words model. 161 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 162 | 163 | 164 | # Transform to a matrix and inspect. 165 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 166 | View(train.tokens.matrix[1:20, 1:100]) 167 | dim(train.tokens.matrix) 168 | 169 | 170 | # Investigate the effects of stemming. 171 | colnames(train.tokens.matrix)[1:50] 172 | 173 | 174 | # Per best practices, we will leverage cross validation (CV) as 175 | # the basis of our modeling process. Using CV we can create 176 | # estimates of how well our model will do in Production on new, 177 | # unseen data. CV is powerful, but the downside is that it 178 | # requires more processing and therefore more time. 179 | # 180 | # If you are not familiar with CV, consult the following 181 | # Wikipedia article: 182 | # 183 | # https://en.wikipedia.org/wiki/Cross-validation_(statistics) 184 | # 185 | 186 | # Set up the feature data frame with labels. 187 | train.tokens.df <- cbind(Label = train$Label, data.frame(train.tokens.dfm)) 188 | 189 | 190 | # Often, tokenization requires some additional pre-processing. 191 | names(train.tokens.df)[c(146, 148, 235, 238)] 192 | 193 | 194 | # Clean up column names. 195 | names(train.tokens.df) <- make.names(names(train.tokens.df)) 196 | 197 | 198 | # Use caret to create stratified folds for 10-fold cross validation repeated 199 | # 3 times (i.e., create 30 random stratified samples) 200 | set.seed(48743) 201 | cv.folds <- createMultiFolds(train$Label, k = 10, times = 3) 202 | 203 | cv.cntrl <- trainControl(method = "repeatedcv", number = 10, 204 | repeats = 3, index = cv.folds) 205 | 206 | 207 | # Our data frame is non-trivial in size. As such, CV runs will take 208 | # quite a long time to run. To cut down on total execution time, use 209 | # the doSNOW package to allow for multi-core training in parallel. 210 | # 211 | # WARNING - The following code is configured to run on a workstation- 212 | # or server-class machine (i.e., 12 logical cores). Alter 213 | # code to suit your HW environment. 214 | # 215 | #install.packages("doSNOW") 216 | library(doSNOW) 217 | 218 | 219 | # Time the code execution 220 | start.time <- Sys.time() 221 | 222 | 223 | # Create a cluster to work on 10 logical cores. 224 | cl <- makeCluster(10, type = "SOCK") 225 | registerDoSNOW(cl) 226 | 227 | 228 | # As our data is non-trivial in size at this point, use a single decision 229 | # tree algorithm as our first model. We will graduate to using more 230 | # powerful algorithms later when we perform feature extraction to shrink 231 | # the size of our data. 232 | rpart.cv.1 <- train(Label ~ ., data = train.tokens.df, method = "rpart", 233 | trControl = cv.cntrl, tuneLength = 7) 234 | 235 | 236 | # Processing is done, stop cluster. 237 | stopCluster(cl) 238 | 239 | 240 | # Total time of execution on workstation was approximately 4 minutes. 241 | total.time <- Sys.time() - start.time 242 | total.time 243 | 244 | 245 | # Check out our results.
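# (Reading the printout below: caret reports the cross-validated accuracy and
#  kappa for each of the 7 candidate cp values it tried and notes the cp it
#  selected for the final model. If you prefer a picture, plot(rpart.cv.1)
#  charts accuracy against cp - an optional convenience, not something the
#  rest of the script depends on.)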
246 | rpart.cv.1 247 | 248 | 249 | 250 | # The use of Term Frequency-Inverse Document Frequency (TF-IDF) is a 251 | # powerful technique for enhancing the information/signal contained 252 | # within our document-frequency matrix. Specifically, the mathematics 253 | # behind TF-IDF accomplish the following goals: 254 | # 1 - The TF calculation accounts for the fact that longer 255 | # documents will have higher individual term counts. Applying 256 | # TF normalizes all documents in the corpus to be length 257 | # independent. 258 | # 2 - The IDF calculation accounts for the frequency of term 259 | # appearance in all documents in the corpus. The intuition 260 | # being that a term that appears in every document has no 261 | # predictive power. 262 | # 3 - The multiplication of TF by IDF for each cell in the matrix 263 | # allows for weighting of #1 and #2 for each cell in the matrix. 264 | 265 | 266 | # Our function for calculating relative term frequency (TF) 267 | term.frequency <- function(row) { 268 | row / sum(row) 269 | } 270 | 271 | # Our function for calculating inverse document frequency (IDF) 272 | inverse.doc.freq <- function(col) { 273 | corpus.size <- length(col) 274 | doc.count <- length(which(col > 0)) 275 | 276 | log10(corpus.size / doc.count) 277 | } 278 | 279 | # Our function for calculating TF-IDF. 280 | tf.idf <- function(x, idf) { 281 | x * idf 282 | } 283 | 284 | 285 | # First step, normalize all documents via TF. 286 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 287 | dim(train.tokens.df) 288 | View(train.tokens.df[1:20, 1:100]) 289 | 290 | 291 | # Second step, calculate the IDF vector that we will use - both 292 | # for training data and for test data! 293 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 294 | str(train.tokens.idf) 295 | 296 | 297 | # Lastly, calculate TF-IDF for our training corpus. 298 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, idf = train.tokens.idf) 299 | dim(train.tokens.tfidf) 300 | View(train.tokens.tfidf[1:25, 1:25]) 301 | 302 | 303 | # Transpose the matrix 304 | train.tokens.tfidf <- t(train.tokens.tfidf) 305 | dim(train.tokens.tfidf) 306 | View(train.tokens.tfidf[1:25, 1:25]) 307 | 308 | 309 | # Check for incomplete cases. 310 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 311 | train$Text[incomplete.cases] 312 | 313 | 314 | # Fix incomplete cases 315 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 316 | dim(train.tokens.tfidf) 317 | sum(which(!complete.cases(train.tokens.tfidf))) 318 | 319 | 320 | # Make a clean data frame using the same process as before. 321 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 322 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 323 | 324 | 325 | # Time the code execution 326 | start.time <- Sys.time() 327 | 328 | # Create a cluster to work on 3 logical cores. 329 | cl <- makeCluster(3, type = "SOCK") 330 | registerDoSNOW(cl) 331 | 332 | # As our data is non-trivial in size at this point, use a single decision 333 | # tree algorithm as our first model. We will graduate to using more 334 | # powerful algorithms later when we perform feature extraction to shrink 335 | # the size of our data. 336 | rpart.cv.2 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 337 | trControl = cv.cntrl, tuneLength = 7) 338 | 339 | # Processing is done, stop cluster.
340 | stopCluster(cl) 341 | 342 | # Total time of execution on workstation was 343 | total.time <- Sys.time() - start.time 344 | total.time 345 | 346 | # Check out our results. 347 | rpart.cv.2 348 | 349 | 350 | 351 | # N-grams allow us to augment our document-term frequency matrices with 352 | # word ordering. This often leads to increased performance (e.g., accuracy) 353 | # for machine learning models trained with more than just unigrams (i.e., 354 | # single terms). Let's add bigrams to our training data and then TF-IDF 355 | # transform the expanded feature matrix to see if accuracy improves. 356 | 357 | # Add bigrams to our feature matrix. 358 | train.tokens <- tokens_ngrams(train.tokens, n = 1:2) 359 | train.tokens[[357]] 360 | 361 | 362 | # Transform to dfm and then a matrix. 363 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 364 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 365 | train.tokens.dfm 366 | 367 | 368 | # Normalize all documents via TF. 369 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 370 | 371 | 372 | # Calculate the IDF vector that we will use for training and test data! 373 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 374 | 375 | 376 | # Calculate TF-IDF for our training corpus 377 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, 378 | idf = train.tokens.idf) 379 | 380 | 381 | # Transpose the matrix 382 | train.tokens.tfidf <- t(train.tokens.tfidf) 383 | 384 | 385 | # Fix incomplete cases 386 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 387 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 388 | 389 | 390 | # Make a clean data frame. 391 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 392 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 393 | 394 | 395 | # Clean up unused objects in memory. 396 | gc() 397 | 398 | 399 | 400 | 401 | # 402 | # NOTE - The following code requires the use of command-line R to execute 403 | # due to the large number of features (i.e., columns) in the matrix. 404 | # Please consult the following link for more details if you wish 405 | # to run the code yourself: 406 | # 407 | # https://stackoverflow.com/questions/28728774/how-to-set-max-ppsize-in-r 408 | # 409 | # Also note that running the following code required approximately 410 | # 38GB of RAM and more than 4.5 hours to execute on a 10-core 411 | # workstation! 412 | # 413 | 414 | 415 | # Time the code execution 416 | # start.time <- Sys.time() 417 | 418 | # Leverage single decision trees to evaluate if adding bigrams improves 419 | # the effectiveness of the model. 420 | # rpart.cv.3 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 421 | # trControl = cv.cntrl, tuneLength = 7) 422 | 423 | # Total time of execution on workstation was 424 | # total.time <- Sys.time() - start.time 425 | # total.time 426 | 427 | # Check out our results. 428 | # rpart.cv.3 429 | 430 | # 431 | # The results of the above processing show a slight decline in rpart 432 | # effectiveness, with an accuracy of 0.9457 under 10-fold CV repeated 3 times. 433 | # As we will discuss later, while the addition of bigrams appears to 434 | # negatively impact a single decision tree, it helps with the mighty 435 | # random forest! 436 | # 437 | 438 | 439 | 440 | 441 | # We'll leverage the irlba package for our singular value 442 | # decomposition (SVD).
The irlba package allows us to specify 443 | # the number of the most important singular vectors we wish to 444 | # calculate and retain for features. 445 | library(irlba) 446 | 447 | 448 | # Time the code execution 449 | start.time <- Sys.time() 450 | 451 | # Perform SVD. Specifically, reduce dimensionality down to 300 columns 452 | # for our latent semantic analysis (LSA). 453 | train.irlba <- irlba(t(train.tokens.tfidf), nv = 300, maxit = 600) 454 | 455 | # Total time of execution on workstation was 456 | total.time <- Sys.time() - start.time 457 | total.time 458 | 459 | 460 | # Take a look at the new feature data up close. 461 | View(train.irlba$v) 462 | 463 | 464 | # As with TF-IDF, we will need to project new data (e.g., the test data) 465 | # into the SVD semantic space. The following code illustrates how to do 466 | # this using a row of the training data that has already been transformed 467 | # by TF-IDF, per the mathematics illustrated in the slides. 468 | # 469 | # 470 | sigma.inverse <- 1 / train.irlba$d 471 | u.transpose <- t(train.irlba$u) 472 | document <- train.tokens.tfidf[1,] 473 | document.hat <- sigma.inverse * u.transpose %*% document 474 | 475 | # Look at the first 10 components of projected document and the corresponding 476 | # row in our document semantic space (i.e., the V matrix) 477 | document.hat[1:10] 478 | train.irlba$v[1, 1:10] 479 | 480 | 481 | 482 | # 483 | # Create new feature data frame using our document semantic space of 300 484 | # features (i.e., the V matrix from our SVD). 485 | # 486 | train.svd <- data.frame(Label = train$Label, train.irlba$v) 487 | 488 | 489 | # Create a cluster to work on 10 logical cores. 490 | cl <- makeCluster(10, type = "SOCK") 491 | registerDoSNOW(cl) 492 | 493 | # Time the code execution 494 | start.time <- Sys.time() 495 | 496 | # This will be the last run using single decision trees. With a much smaller 497 | # feature matrix we can now use more powerful methods like the mighty Random 498 | # Forest from now on! 499 | rpart.cv.4 <- train(Label ~ ., data = train.svd, method = "rpart", 500 | trControl = cv.cntrl, tuneLength = 7) 501 | 502 | # Processing is done, stop cluster. 503 | stopCluster(cl) 504 | 505 | # Total time of execution on workstation was 506 | total.time <- Sys.time() - start.time 507 | total.time 508 | 509 | # Check out our results. 510 | rpart.cv.4 511 | 512 | 513 | 514 | 515 | # 516 | # NOTE - The following code takes a long time to run. Here's the math. 517 | # We are performing 10-fold CV repeated 3 times. That means we 518 | # need to build 30 models. We are also asking caret to try 7 519 | # different values of the mtry parameter. Next up by default 520 | # a mighty random forest leverages 500 trees. Lastly, caret will 521 | # build 1 final model at the end of the process with the best 522 | # mtry value over all the training data. Here's the number of 523 | # tree we're building: 524 | # 525 | # (10 * 3 * 7 * 500) + 500 = 105,500 trees! 526 | # 527 | # On a workstation using 10 cores the following code took 28 minutes 528 | # to execute. 529 | # 530 | 531 | 532 | # Create a cluster to work on 10 logical cores. 533 | # cl <- makeCluster(10, type = "SOCK") 534 | # registerDoSNOW(cl) 535 | 536 | # Time the code execution 537 | # start.time <- Sys.time() 538 | 539 | # We have reduced the dimensionality of our data using SVD. Also, the 540 | # application of SVD allows us to use LSA to simultaneously increase the 541 | # information density of each feature. 
To prove this out, leverage a 542 | # mighty Random Forest with the default of 500 trees. We'll also ask 543 | # caret to try 7 different values of mtry to find the mtry value that 544 | # gives the best result! 545 | # rf.cv.1 <- train(Label ~ ., data = train.svd, method = "rf", 546 | # trControl = cv.cntrl, tuneLength = 7) 547 | 548 | # Processing is done, stop cluster. 549 | # stopCluster(cl) 550 | 551 | # Total time of execution on workstation was 552 | # total.time <- Sys.time() - start.time 553 | # total.time 554 | 555 | 556 | # Load processing results from disk! 557 | load("rf.cv.1.RData") 558 | 559 | # Check out our results. 560 | rf.cv.1 561 | 562 | # Let's drill-down on the results. 563 | confusionMatrix(train.svd$Label, rf.cv.1$finalModel$predicted) 564 | 565 | 566 | 567 | 568 | 569 | # OK, now let's add in the feature we engineered previously for SMS 570 | # text length to see if it improves things. 571 | train.svd$TextLength <- train$TextLength 572 | 573 | 574 | # Create a cluster to work on 10 logical cores. 575 | # cl <- makeCluster(10, type = "SOCK") 576 | # registerDoSNOW(cl) 577 | 578 | # Time the code execution 579 | # start.time <- Sys.time() 580 | 581 | # Re-run the training process with the additional feature. 582 | # rf.cv.2 <- train(Label ~ ., data = train.svd, method = "rf", 583 | # trControl = cv.cntrl, tuneLength = 7, 584 | # importance = TRUE) 585 | 586 | # Processing is done, stop cluster. 587 | # stopCluster(cl) 588 | 589 | # Total time of execution on workstation was 590 | # total.time <- Sys.time() - start.time 591 | # total.time 592 | 593 | # Load results from disk. 594 | load("rf.cv.2.RData") 595 | 596 | # Check the results. 597 | rf.cv.2 598 | 599 | # Drill-down on the results. 600 | confusionMatrix(train.svd$Label, rf.cv.2$finalModel$predicted) 601 | 602 | # How important was the new feature? 603 | library(randomForest) 604 | varImpPlot(rf.cv.1$finalModel) 605 | varImpPlot(rf.cv.2$finalModel) 606 | 607 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IntroToTextAnalyticsWithR 2 | Public repo for the Data Science Dojo YouTube tutorial series [Introduction to Text Analytics with R](https://www.youtube.com/playlist?list=PL8eNk_zTBST8olxIRFoo0YeXxEOkYdoxi). 
This tutorial series leverages the [Kaggle SMS Spam Collection Dataset](https://www.kaggle.com/uciml/sms-spam-collection-dataset) originally published by [UCI ML Repository](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection) 3 | 4 | 5 | - [Introduction to Text Analytics with R - Part 1](https://www.youtube.com/watch?v=4vuw0AsHeGw) 6 | - [Introduction to Text Analytics with R - Part 2](https://www.youtube.com/watch?v=Y7385dGRNLM) 7 | - [Introduction to Text Analytics with R - Part 3](https://www.youtube.com/watch?v=CQsyVDxK7_g) 8 | - [Introduction to Text Analytics with R - Part 4](https://www.youtube.com/watch?v=IFhDlHKRHno) 9 | - [Introduction to Text Analytics with R - Part 5](https://www.youtube.com/watch?v=az7yf0IfWPM) 10 | - [Introduction to Text Analytics with R - Part 6](https://www.youtube.com/watch?v=neiW5Ugsob8) 11 | - [Introduction to Text Analytics with R - Part 7](https://www.youtube.com/watch?v=Fza5szojsU8) 12 | - [Introduction to Text Analytics with R - Part 8](https://www.youtube.com/watch?v=4DI68P4hicQ) 13 | - [Introduction to Text Analytics with R - Part 9](https://www.youtube.com/watch?v=SgrLE6WQzkE) 14 | - [Introduction to Text Analytics with R - Part 10](https://www.youtube.com/watch?v=7cwBhWYHgsA) 15 | - [Introduction to Text Analytics with R - Part 11](https://www.youtube.com/watch?v=XWUi7RivDJY) 16 | - [Introduction to Text Analytics with R - Part 12](https://www.youtube.com/watch?v=-wCrClheObk) -------------------------------------------------------------------------------- /rf.cv.1.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/rf.cv.1.RData -------------------------------------------------------------------------------- /rf.cv.2.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/rf.cv.2.RData -------------------------------------------------------------------------------- /rf.cv.3.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/rf.cv.3.RData -------------------------------------------------------------------------------- /rf.cv.4.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/rf.cv.4.RData -------------------------------------------------------------------------------- /spam.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/spam.csv --------------------------------------------------------------------------------