├── .Rhistory ├── IntroToTextAnalyticsWithR_Part1.pdf ├── IntroToTextAnalyticsWithR_Part10.pdf ├── IntroToTextAnalyticsWithR_Part12.pdf ├── IntroToTextAnalyticsWithR_Part2.pdf ├── IntroToTextAnalyticsWithR_Part5.pdf ├── IntroToTextAnalyticsWithR_Part6.pdf ├── IntroToTextAnalyticsWithR_Part7.pdf ├── IntroToTextAnalyticsWithR_Part9.pdf ├── IntroToTextAnalytics_Part1.R ├── IntroToTextAnalytics_Part10.R ├── IntroToTextAnalytics_Part11.R ├── IntroToTextAnalytics_Part12.R ├── IntroToTextAnalytics_Part2.R ├── IntroToTextAnalytics_Part3.R ├── IntroToTextAnalytics_Part4.R ├── IntroToTextAnalytics_Part5.R ├── IntroToTextAnalytics_Part6.R ├── IntroToTextAnalytics_Part7.R ├── IntroToTextAnalytics_Part8.R ├── IntroToTextAnalytics_Part9.R ├── README.md ├── rf.cv.1.RData ├── rf.cv.2.RData ├── rf.cv.3.RData ├── rf.cv.4.RData └── spam.csv /.Rhistory: -------------------------------------------------------------------------------- 1 | names(new.theta) <- names(features) 2 | alpha <- 0.01 3 | # Utility functon that calculates the prediction for an observation 4 | # given the current state of the hypothesis function. 5 | #h.theta <- function(theta, observation) { 6 | # return(sum(theta * observation)) 7 | # prediction <- 0.0 8 | # for(i in 1:length(theta)) { 9 | # prediction <- prediction + (theta[i] * observation[i]) 10 | # } 11 | # return(prediction) 12 | #} 13 | # for(k in 1:2){ 14 | # m <- ncol(features) 15 | # 16 | # for(j in 1:m) { 17 | # n <- nrow(features) 18 | # summation <- 0.0 19 | # 20 | # for(i in 1:n) { 21 | # # prediction <- h.theta(theta, features[i,]) 22 | # prediction <- sum(theta * features[i,]) 23 | # residual <- prediction - y[i] 24 | # update.value <- residual * features[i, j] 25 | # summation <- summation + update.value 26 | # } 27 | # 28 | # new.theta[j] <- theta[j] + (alpha * summation) 29 | # } 30 | # 31 | # theta <- new.theta 32 | # 33 | # # print(theta) 34 | # } 35 | # 36 | # print(theta) 37 | iterations <- 250 38 | X <- features 39 | for(k in 1:iterations) { 40 | for(j in 1:ncol(X)) { 41 | summation <- 0 42 | for(i in 1:nrow(X)) { 43 | residual <- sum(X[i,] * theta[j]) - y[i] 44 | summation <- summation + (residual * X[i, j]) 45 | } 46 | new.theta[j] <- theta[j] - (alpha / nrow(X) * summation) 47 | } 48 | theta <- new.theta 49 | } 50 | lm.model 51 | set.seed(1234) 52 | x <- runif(1000, -5, 5) 53 | y <- x + rnorm(1000) + 3 54 | intercept <- rep(1, length(x)) 55 | lm.model <- lm(y ~ x) 56 | summary(lm.model) 57 | features <- data.frame(intercept = intercept, x = x) 58 | theta <- rep(0, ncol(features)) 59 | names(theta) <- names(features) 60 | theta 61 | new.theta <- rep(0, ncol(features)) 62 | names(new.theta) <- names(features) 63 | alpha <- 0.025 64 | # Utility functon that calculates the prediction for an observation 65 | # given the current state of the hypothesis function. 
66 | #h.theta <- function(theta, observation) { 67 | # return(sum(theta * observation)) 68 | # prediction <- 0.0 69 | # for(i in 1:length(theta)) { 70 | # prediction <- prediction + (theta[i] * observation[i]) 71 | # } 72 | # return(prediction) 73 | #} 74 | # for(k in 1:2){ 75 | # m <- ncol(features) 76 | # 77 | # for(j in 1:m) { 78 | # n <- nrow(features) 79 | # summation <- 0.0 80 | # 81 | # for(i in 1:n) { 82 | # # prediction <- h.theta(theta, features[i,]) 83 | # prediction <- sum(theta * features[i,]) 84 | # residual <- prediction - y[i] 85 | # update.value <- residual * features[i, j] 86 | # summation <- summation + update.value 87 | # } 88 | # 89 | # new.theta[j] <- theta[j] + (alpha * summation) 90 | # } 91 | # 92 | # theta <- new.theta 93 | # 94 | # # print(theta) 95 | # } 96 | # 97 | # print(theta) 98 | iterations <- 250 99 | X <- features 100 | for(k in 1:iterations) { 101 | for(j in 1:ncol(X)) { 102 | summation <- 0 103 | for(i in 1:nrow(X)) { 104 | residual <- sum(X[i,] * theta[j]) - y[i] 105 | summation <- summation + (residual * X[i, j]) 106 | } 107 | new.theta[j] <- theta[j] - (alpha / nrow(X) * summation) 108 | } 109 | theta <- new.theta 110 | } 111 | set.seed(1234) 112 | x <- runif(1000, -5, 5) 113 | y <- x + rnorm(1000) + 3 114 | intercept <- rep(1, length(x)) 115 | lm.model <- lm(y ~ x) 116 | summary(lm.model) 117 | features <- data.frame(intercept = intercept, x = x) 118 | theta <- rep(0, ncol(features)) 119 | names(theta) <- names(features) 120 | theta 121 | new.theta <- rep(0, ncol(features)) 122 | names(new.theta) <- names(features) 123 | alpha <- 0.05 124 | # Utility functon that calculates the prediction for an observation 125 | # given the current state of the hypothesis function. 126 | #h.theta <- function(theta, observation) { 127 | # return(sum(theta * observation)) 128 | # prediction <- 0.0 129 | # for(i in 1:length(theta)) { 130 | # prediction <- prediction + (theta[i] * observation[i]) 131 | # } 132 | # return(prediction) 133 | #} 134 | # for(k in 1:2){ 135 | # m <- ncol(features) 136 | # 137 | # for(j in 1:m) { 138 | # n <- nrow(features) 139 | # summation <- 0.0 140 | # 141 | # for(i in 1:n) { 142 | # # prediction <- h.theta(theta, features[i,]) 143 | # prediction <- sum(theta * features[i,]) 144 | # residual <- prediction - y[i] 145 | # update.value <- residual * features[i, j] 146 | # summation <- summation + update.value 147 | # } 148 | # 149 | # new.theta[j] <- theta[j] + (alpha * summation) 150 | # } 151 | # 152 | # theta <- new.theta 153 | # 154 | # # print(theta) 155 | # } 156 | # 157 | # print(theta) 158 | iterations <- 250 159 | X <- features 160 | for(k in 1:iterations) { 161 | for(j in 1:ncol(X)) { 162 | summation <- 0 163 | for(i in 1:nrow(X)) { 164 | residual <- sum(X[i,] * theta[j]) - y[i] 165 | summation <- summation + (residual * X[i, j]) 166 | } 167 | new.theta[j] <- theta[j] - (alpha / nrow(X) * summation) 168 | } 169 | theta <- new.theta 170 | } 171 | set.seed(1234) 172 | x <- runif(1000, -5, 5) 173 | y <- x + rnorm(1000) + 3 174 | intercept <- rep(1, length(x)) 175 | lm.model <- lm(y ~ x) 176 | summary(lm.model) 177 | features <- data.frame(intercept = intercept, x = x) 178 | theta <- rep(0, ncol(features)) 179 | names(theta) <- names(features) 180 | theta 181 | new.theta <- rep(0, ncol(features)) 182 | names(new.theta) <- names(features) 183 | alpha <- 0.05 184 | # Utility functon that calculates the prediction for an observation 185 | # given the current state of the hypothesis function. 
186 | #h.theta <- function(theta, observation) { 187 | # return(sum(theta * observation)) 188 | # prediction <- 0.0 189 | # for(i in 1:length(theta)) { 190 | # prediction <- prediction + (theta[i] * observation[i]) 191 | # } 192 | # return(prediction) 193 | #} 194 | # for(k in 1:2){ 195 | # m <- ncol(features) 196 | # 197 | # for(j in 1:m) { 198 | # n <- nrow(features) 199 | # summation <- 0.0 200 | # 201 | # for(i in 1:n) { 202 | # # prediction <- h.theta(theta, features[i,]) 203 | # prediction <- sum(theta * features[i,]) 204 | # residual <- prediction - y[i] 205 | # update.value <- residual * features[i, j] 206 | # summation <- summation + update.value 207 | # } 208 | # 209 | # new.theta[j] <- theta[j] + (alpha * summation) 210 | # } 211 | # 212 | # theta <- new.theta 213 | # 214 | # # print(theta) 215 | # } 216 | # 217 | # print(theta) 218 | iterations <- 300 219 | X <- features 220 | for(k in 1:iterations) { 221 | for(j in 1:ncol(X)) { 222 | summation <- 0 223 | for(i in 1:nrow(X)) { 224 | residual <- sum(X[i,] * theta[j]) - y[i] 225 | summation <- summation + (residual * X[i, j]) 226 | } 227 | new.theta[j] <- theta[j] - (alpha / nrow(X) * summation) 228 | } 229 | theta <- new.theta 230 | } 231 | set.seed(1234) 232 | x <- runif(1000, -5, 5) 233 | y <- x + rnorm(1000) + 3 234 | intercept <- rep(1, length(x)) 235 | lm.model <- lm(y ~ x) 236 | summary(lm.model) 237 | features <- data.frame(intercept = intercept, x = x) 238 | theta <- rep(0, ncol(features)) 239 | names(theta) <- names(features) 240 | theta 241 | new.theta <- rep(0, ncol(features)) 242 | names(new.theta) <- names(features) 243 | alpha <- 0.05 244 | # Utility functon that calculates the prediction for an observation 245 | # given the current state of the hypothesis function. 246 | #h.theta <- function(theta, observation) { 247 | # return(sum(theta * observation)) 248 | # prediction <- 0.0 249 | # for(i in 1:length(theta)) { 250 | # prediction <- prediction + (theta[i] * observation[i]) 251 | # } 252 | # return(prediction) 253 | #} 254 | # for(k in 1:2){ 255 | # m <- ncol(features) 256 | # 257 | # for(j in 1:m) { 258 | # n <- nrow(features) 259 | # summation <- 0.0 260 | # 261 | # for(i in 1:n) { 262 | # # prediction <- h.theta(theta, features[i,]) 263 | # prediction <- sum(theta * features[i,]) 264 | # residual <- prediction - y[i] 265 | # update.value <- residual * features[i, j] 266 | # summation <- summation + update.value 267 | # } 268 | # 269 | # new.theta[j] <- theta[j] + (alpha * summation) 270 | # } 271 | # 272 | # theta <- new.theta 273 | # 274 | # # print(theta) 275 | # } 276 | # 277 | # print(theta) 278 | iterations <- 500 279 | X <- features 280 | for(k in 1:iterations) { 281 | for(j in 1:ncol(X)) { 282 | summation <- 0 283 | for(i in 1:nrow(X)) { 284 | residual <- sum(X[i,] * theta[j]) - y[i] 285 | summation <- summation + (residual * X[i, j]) 286 | } 287 | new.theta[j] <- theta[j] - (alpha / nrow(X) * summation) 288 | } 289 | theta <- new.theta 290 | } 291 | data(iria) 292 | data(iris) 293 | install.packages(c("lmtest", "mgcv", "nlme")) 294 | data("iris") 295 | library(GGally) 296 | ggpairs(iris) 297 | remove.packages("tibble") 298 | library(GGally) 299 | remove.packages("GGally") 300 | remove.packages("plotly") 301 | install.pacakges("GGally") 302 | install.packages("GGally") 303 | library(GGally) 304 | install.packages(tibble) 305 | install.packages("tibble") 306 | library(GGally) 307 | data("iris") 308 | ggpairs(iris) 309 | debugSource('~/Dropbox/AmsterdamBootcamp/GradientDescentExample.R', echo=TRUE) 310 | 
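# --- Editor's sketch (not part of the original .Rhistory) --------------------
# The gradient descent attempts recorded above compute the prediction as
# sum(X[i,] * theta[j]), which uses only the j-th coefficient. The commented-out
# h.theta() helper earlier in this history shows the intended hypothesis,
# sum(theta * observation), i.e. the full dot product. A minimal corrected
# batch gradient descent, assuming the features, y and lm.model objects already
# created in this session, would look like this:
X <- as.matrix(features)          # numeric matrix so X[i, ] is a plain vector
theta <- rep(0, ncol(X))
new.theta <- theta
alpha <- 0.05
iterations <- 500
for (k in 1:iterations) {
  for (j in 1:ncol(X)) {
    summation <- 0
    for (i in 1:nrow(X)) {
      residual <- sum(X[i, ] * theta) - y[i]   # full dot product, not theta[j] alone
      summation <- summation + residual * X[i, j]
    }
    new.theta[j] <- theta[j] - (alpha / nrow(X)) * summation
  }
  theta <- new.theta                           # simultaneous update of all coefficients
}
theta                                          # should be close to coef(lm.model)
# ------------------------------------------------------------------------------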
debugSource('~/Dropbox/AmsterdamBootcamp/GradientDescentExample.R', echo=TRUE) 311 | # 312 | # Copyright 2017 Dave Langer 313 | # 314 | # Licensed under the Apache License, Version 2.0 (the "License"); 315 | # you may not use this file except in compliance with the License. 316 | # You may obtain a copy of the License at 317 | # 318 | # http://www.apache.org/licenses/LICENSE-2.0 319 | # 320 | # Unless required by applicable law or agreed to in writing, software 321 | # distributed under the License is distributed on an "AS IS" BASIS, 322 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 323 | # See the License for the specific language governing permissions and 324 | # limitations under the License. 325 | # 326 | # 327 | # This R source code file corresponds to video 10 of the YouTube series 328 | # "R Programming for Excel Users" located at the following URL: 329 | # https://youtu.be/gYt05xI2Fm8 330 | # 331 | #=========================================================================== 332 | # Numeric Vectors 333 | # 334 | # Create a vector of integer values 335 | my_vector <- 1:10 336 | my_vector 337 | # Inspect the vector more closely 338 | class(my_vector) 339 | str(my_vector) 340 | summary(my_vector) 341 | # Add 1 to each value of the vector 342 | my_vector_plus1 <- my_vector + 1 343 | my_vector_plus1 344 | # Divide each value of the vector by 2 345 | half_my_vector <- my_vector / 2 346 | half_my_vector 347 | # Make the vector whole again 348 | whole_my_vector <- half_my_vector + half_my_vector 349 | whole_my_vector 350 | # Square the value of each vector 351 | my_vector_squared1 <- my_vector * my_vector 352 | my_vector_squared1 353 | # Square the value of each vector 354 | my_vector_squared2 <- my_vector ^ 2 355 | my_vector_squared2 356 | # Take the square root of each value 357 | sqrt_my_vector <- sqrt(my_vector) 358 | sqrt_my_vector 359 | # More vectorized functions 360 | sum(my_vector) 361 | mean(my_vector) 362 | sd(my_vector) 363 | #=========================================================================== 364 | # Logical Vectors 365 | # 366 | # Which values are greater than 3.5? 367 | larger_than_3.5 <- my_vector > 3.5 368 | larger_than_3.5 369 | # Inspect vector more closely 370 | class(larger_than_3.5) 371 | str(larger_than_3.5) 372 | summary(larger_than_3.5) 373 | # Grab only the values larger than 3.5 374 | my_vector2 <- my_vector[larger_than_3.5] 375 | my_vector2 376 | # Grab only the values larger than 3.5 377 | my_vector3 <- my_vector[my_vector > 3.5] 378 | my_vector3 379 | # Grow the vector 380 | my_bigger_vector <- c(my_vector, 11:15, 16, 17, 18, 19, 20) 381 | my_bigger_vector 382 | # How big is it now? 383 | length(my_bigger_vector) 384 | dim(my_bigger_vector) 385 | #=========================================================================== 386 | # String Vectors 387 | # 388 | # Create a vector of strings 389 | force_users <- c("Yoda", "Darth Vader", "Obi Wan", "Mace Windu", 390 | "Darth Maul", "Luke Skywalker", "Darth Sidious") 391 | # Inspect vector more closely 392 | class(force_users) 393 | str(force_users) 394 | summary(force_users) 395 | # Add 1 to string vector 396 | force_users + 1 397 | # Add another force user 398 | force_users <- force_users + "Kylo Ren" 399 | # Add more force users 400 | more_force_users <- c(force_users, "Qui-Gon Jinn", "Darth Tyranus") 401 | more_force_users 402 | # How big is the vector? 403 | length(more_force_users) 404 | # How long is each string in the vector? 
405 | name_lengths <- nchar(more_force_users) 406 | name_lengths 407 | #=========================================================================== 408 | # Missing Values 409 | # 410 | # Build a vector with missing values 411 | birthplaces <- c(NA, "Tatooine", "Stewjon", "Haruun Kal", "Dathomir", 412 | "Polis Massa", "Naboo", "Coruscant", "Serenno") 413 | birthplaces 414 | # Inspect closer 415 | class(birthplaces) 416 | str(birthplaces) 417 | summary(birthplaces) 418 | # Vectorized operation 419 | is.na(birthplaces) 420 | nchar(birthplaces) 421 | nchar("") 422 | # Logical operations 423 | birthplaces[!is.na(birthplaces)] 424 | #=========================================================================== 425 | # Factor Vectors 426 | # 427 | # Create factor (categorical) vector 428 | affiliation <- as.factor(c("Jedi", "Sith", "Rogue")) 429 | affiliation 430 | # Inspect 431 | class(affiliation) 432 | str(affiliation) 433 | summary(affiliation) 434 | levels(affiliation) 435 | # Explore representations 436 | as.numeric(affiliation) 437 | as.character(affiliation) 438 | #=========================================================================== 439 | # Data Frames 440 | # 441 | star_wars <- data.frame(id = 1:length(more_force_users), 442 | more_force_users, 443 | birthplaces = as.factor(birthplaces), 444 | affiliation = c("Jedi", "Sith", 445 | "Jedi", "Jedi", 446 | "Sith", "Jedi", 447 | "Sith", "Jedi", 448 | "Sith"), 449 | stringsAsFactors = FALSE) 450 | # Inspect 451 | View(star_wars) 452 | head(star_wars) 453 | summary(star_wars) 454 | str(star_wars) 455 | # Set up factors 456 | star_wars$affiliation <- as.factor(star_wars$affiliation) 457 | # Reinspect 458 | str(star_wars) 459 | # Additional slicing syntax 460 | star_wars$more_force_users[3] 461 | star_wars$more_force_users[star_wars$affiliation == "Sith"] 462 | # Load-up some built in data 463 | data(iris) 464 | data(mtcars) 465 | # Get help on built-in data 466 | ?mtcars 467 | # Understand the shape of a data frame 468 | nrow(mtcars) 469 | ncol(mtcars) 470 | dim(mtcars) 471 | # Understand the metadata of a data frame 472 | names(mtcars) 473 | names(mtcars)[3] 474 | colnames(mtcars) 475 | colnames(mtcars)[3:5] 476 | rownames(mtcars) 477 | rownames(mtcars)[c(3, 4, 5)] 478 | # Cool RStudio feature - spreadsheet view of a data frame 479 | View(mtcars) 480 | # See a few rows at the top and bottom of a data frame 481 | head(mtcars) 482 | tail(mtcars) 483 | # All-up view of a data frame 484 | summary(mtcars) 485 | # Understand the data type of a data frame 486 | class(mtcars) 487 | str(mtcars) 488 | setwd("~/Dropbox/DataScienceDojo/IntroToTextAnalyticsWithR") 489 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 490 | View(spam.raw) 491 | # Clean up the data frame and view our handiwork. 492 | spam.raw <- spam.raw[, 1:2] 493 | names(spam.raw) <- c("Label", "Text") 494 | View(spam.raw) 495 | # Check data to see if there are missing values. 496 | length(which(!complete.cases(spam.raw))) 497 | # Convert our class label into a factor. 498 | spam.raw$Label <- as.factor(spam.raw$Label) 499 | # The first step, as always, is to explore the data. 500 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 501 | prop.table(table(spam.raw$Label)) 502 | # Next up, let's get a feel for the distribution of text lengths of the SMS 503 | # messages by adding a new feature for the length of each message. 
504 | spam.raw$TextLength <- nchar(spam.raw$Text) 505 | summary(spam.raw$TextLength) 506 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 507 | library(ggplot2) 508 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 509 | theme_bw() + 510 | geom_histogram(binwidth = 5) + 511 | labs(y = "Text Count", x = "Length of Text", 512 | title = "Distribution of Text Lengths with Class Labels") 513 | -------------------------------------------------------------------------------- /IntroToTextAnalyticsWithR_Part1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/IntroToTextAnalyticsWithR_Part1.pdf -------------------------------------------------------------------------------- /IntroToTextAnalyticsWithR_Part10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/IntroToTextAnalyticsWithR_Part10.pdf -------------------------------------------------------------------------------- /IntroToTextAnalyticsWithR_Part12.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/IntroToTextAnalyticsWithR_Part12.pdf -------------------------------------------------------------------------------- /IntroToTextAnalyticsWithR_Part2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/IntroToTextAnalyticsWithR_Part2.pdf -------------------------------------------------------------------------------- /IntroToTextAnalyticsWithR_Part5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/IntroToTextAnalyticsWithR_Part5.pdf -------------------------------------------------------------------------------- /IntroToTextAnalyticsWithR_Part6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/IntroToTextAnalyticsWithR_Part6.pdf -------------------------------------------------------------------------------- /IntroToTextAnalyticsWithR_Part7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/IntroToTextAnalyticsWithR_Part7.pdf -------------------------------------------------------------------------------- /IntroToTextAnalyticsWithR_Part9.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/IntroToTextAnalyticsWithR_Part9.pdf -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part1.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the 
"License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 1 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | 39 | # Clean up the data frame and view our handiwork. 40 | spam.raw <- spam.raw[, 1:2] 41 | names(spam.raw) <- c("Label", "Text") 42 | View(spam.raw) 43 | 44 | 45 | 46 | # Check data to see if there are missing values. 47 | length(which(!complete.cases(spam.raw))) 48 | 49 | 50 | 51 | # Convert our class label into a factor. 52 | spam.raw$Label <- as.factor(spam.raw$Label) 53 | 54 | 55 | 56 | # The first step, as always, is to explore the data. 57 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 58 | prop.table(table(spam.raw$Label)) 59 | 60 | 61 | 62 | # Next up, let's get a feel for the distribution of text lengths of the SMS 63 | # messages by adding a new feature for the length of each message. 64 | spam.raw$TextLength <- nchar(spam.raw$Text) 65 | summary(spam.raw$TextLength) 66 | 67 | 68 | 69 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 70 | library(ggplot2) 71 | 72 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 73 | theme_bw() + 74 | geom_histogram(binwidth = 5) + 75 | labs(y = "Text Count", x = "Length of Text", 76 | title = "Distribution of Text Lengths with Class Labels") 77 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part10.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 10 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=7cwBhWYHgsA 23 | # 24 | 25 | 26 | # Install all required packages. 
27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | # Clean up the data frame and view our handiwork. 39 | spam.raw <- spam.raw[, 1:2] 40 | names(spam.raw) <- c("Label", "Text") 41 | View(spam.raw) 42 | 43 | 44 | 45 | # Check data to see if there are missing values. 46 | length(which(!complete.cases(spam.raw))) 47 | 48 | 49 | 50 | # Convert our class label into a factor. 51 | spam.raw$Label <- as.factor(spam.raw$Label) 52 | 53 | 54 | 55 | # The first step, as always, is to explore the data. 56 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 57 | prop.table(table(spam.raw$Label)) 58 | 59 | 60 | 61 | # Next up, let's get a feel for the distribution of text lengths of the SMS 62 | # messages by adding a new feature for the length of each message. 63 | spam.raw$TextLength <- nchar(spam.raw$Text) 64 | summary(spam.raw$TextLength) 65 | 66 | 67 | 68 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 69 | library(ggplot2) 70 | 71 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 72 | theme_bw() + 73 | geom_histogram(binwidth = 5) + 74 | labs(y = "Text Count", x = "Length of Text", 75 | title = "Distribution of Text Lengths with Class Labels") 76 | 77 | 78 | 79 | # At a minimum we need to split our data into a training set and a 80 | # test set. In a true project we would want to use a three-way split 81 | # of training, validation, and test. 82 | # 83 | # As we know that our data has non-trivial class imbalance, we'll 84 | # use the mighty caret package to create a randomg train/test split 85 | # that ensures the correct ham/spam class label proportions (i.e., 86 | # we'll use caret for a random stratified split). 87 | library(caret) 88 | help(package = "caret") 89 | 90 | 91 | # Use caret to create a 70%/30% stratified split. Set the random 92 | # seed for reproducibility. 93 | set.seed(32984) 94 | indexes <- createDataPartition(spam.raw$Label, times = 1, 95 | p = 0.7, list = FALSE) 96 | 97 | train <- spam.raw[indexes,] 98 | test <- spam.raw[-indexes,] 99 | 100 | 101 | # Verify proportions. 102 | prop.table(table(train$Label)) 103 | prop.table(table(test$Label)) 104 | 105 | 106 | 107 | # Text analytics requires a lot of data exploration, data pre-processing 108 | # and data wrangling. Let's explore some examples. 109 | 110 | # HTML-escaped ampersand character. 111 | train$Text[21] 112 | 113 | 114 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 115 | # is an actual person, but we will ignore the implications of this for 116 | # this introductory tutorial. 117 | train$Text[38] 118 | 119 | 120 | # A URL. 121 | train$Text[357] 122 | 123 | 124 | 125 | # There are many packages in the R ecosystem for performing text 126 | # analytics. One of the newer packages in quanteda. The quanteda 127 | # package has many useful functions for quickly and easily working 128 | # with text data. 129 | library(quanteda) 130 | help(package = "quanteda") 131 | 132 | 133 | # Tokenize SMS text messages. 134 | train.tokens <- tokens(train$Text, what = "word", 135 | remove_numbers = TRUE, remove_punct = TRUE, 136 | remove_symbols = TRUE, remove_hyphens = TRUE) 137 | 138 | # Take a look at a specific SMS message and see how it transforms. 
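# Editor's compatibility note (not part of the original script): the tokens()
# call above uses the 2017-era quanteda argument names. On newer quanteda
# releases (2.x and later) remove_hyphens was renamed, and an assumed-equivalent
# call would be something like:
#
#   train.tokens <- tokens(train$Text, what = "word",
#                          remove_numbers = TRUE, remove_punct = TRUE,
#                          remove_symbols = TRUE, split_hyphens = TRUE)
#
# Check ?tokens for the exact arguments of the version you have installed; the
# rest of the pipeline below is unchanged.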
139 | train.tokens[[357]] 140 | 141 | 142 | # Lower case the tokens. 143 | train.tokens <- tokens_tolower(train.tokens) 144 | train.tokens[[357]] 145 | 146 | 147 | # Use quanteda's built-in stopword list for English. 148 | # NOTE - You should always inspect stopword lists for applicability to 149 | # your problem/domain. 150 | train.tokens <- tokens_select(train.tokens, stopwords(), 151 | selection = "remove") 152 | train.tokens[[357]] 153 | 154 | 155 | # Perform stemming on the tokens. 156 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 157 | train.tokens[[357]] 158 | 159 | 160 | # Create our first bag-of-words model. 161 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 162 | 163 | 164 | # Transform to a matrix and inspect. 165 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 166 | View(train.tokens.matrix[1:20, 1:100]) 167 | dim(train.tokens.matrix) 168 | 169 | 170 | # Investigate the effects of stemming. 171 | colnames(train.tokens.matrix)[1:50] 172 | 173 | 174 | # Per best practices, we will leverage cross validation (CV) as 175 | # the basis of our modeling process. Using CV we can create 176 | # estimates of how well our model will do in Production on new, 177 | # unseen data. CV is powerful, but the downside is that it 178 | # requires more processing and therefore more time. 179 | # 180 | # If you are not familiar with CV, consult the following 181 | # Wikipedia article: 182 | # 183 | # https://en.wikipedia.org/wiki/Cross-validation_(statistics) 184 | # 185 | 186 | # Setup a the feature data frame with labels. 187 | train.tokens.df <- cbind(Label = train$Label, data.frame(train.tokens.dfm)) 188 | 189 | 190 | # Often, tokenization requires some additional pre-processing 191 | names(train.tokens.df)[c(146, 148, 235, 238)] 192 | 193 | 194 | # Cleanup column names. 195 | names(train.tokens.df) <- make.names(names(train.tokens.df)) 196 | 197 | 198 | # Use caret to create stratified folds for 10-fold cross validation repeated 199 | # 3 times (i.e., create 30 random stratified samples) 200 | set.seed(48743) 201 | cv.folds <- createMultiFolds(train$Label, k = 10, times = 3) 202 | 203 | cv.cntrl <- trainControl(method = "repeatedcv", number = 10, 204 | repeats = 3, index = cv.folds) 205 | 206 | 207 | # Our data frame is non-trivial in size. As such, CV runs will take 208 | # quite a long time to run. To cut down on total execution time, use 209 | # the doSNOW package to allow for multi-core training in parallel. 210 | # 211 | # WARNING - The following code is configured to run on a workstation- 212 | # or server-class machine (i.e., 12 logical cores). Alter 213 | # code to suit your HW environment. 214 | # 215 | #install.packages("doSNOW") 216 | library(doSNOW) 217 | 218 | 219 | # Time the code execution 220 | start.time <- Sys.time() 221 | 222 | 223 | # Create a cluster to work on 10 logical cores. 224 | cl <- makeCluster(10, type = "SOCK") 225 | registerDoSNOW(cl) 226 | 227 | 228 | # As our data is non-trivial in size at this point, use a single decision 229 | # tree alogrithm as our first model. We will graduate to using more 230 | # powerful algorithms later when we perform feature extraction to shrink 231 | # the size of our data. 232 | rpart.cv.1 <- train(Label ~ ., data = train.tokens.df, method = "rpart", 233 | trControl = cv.cntrl, tuneLength = 7) 234 | 235 | 236 | # Processing is done, stop cluster. 237 | stopCluster(cl) 238 | 239 | 240 | # Total time of execution on workstation was approximately 4 minutes. 
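# Editor's note (not part of the original script): the cluster above is
# hard-coded to 10 logical cores per the workstation warning earlier. When
# adapting this to your own machine, one option is to size the cluster from
# the hardware rather than a fixed number, e.g.:
#
#   n.cores <- max(1, parallel::detectCores() - 1)
#   cl <- makeCluster(n.cores, type = "SOCK")
#   registerDoSNOW(cl)
#
# and keep the matching stopCluster(cl) call once training finishes.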
241 | total.time <- Sys.time() - start.time 242 | total.time 243 | 244 | 245 | # Check out our results. 246 | rpart.cv.1 247 | 248 | 249 | 250 | # The use of Term Frequency-Inverse Document Frequency (TF-IDF) is a 251 | # powerful technique for enhancing the information/signal contained 252 | # within our document-frequency matrix. Specifically, the mathematics 253 | # behind TF-IDF accomplish the following goals: 254 | # 1 - The TF calculation accounts for the fact that longer 255 | # documents will have higher individual term counts. Applying 256 | # TF normalizes all documents in the corpus to be length 257 | # independent. 258 | # 2 - The IDF calculation accounts for the frequency of term 259 | # appearance in all documents in the corpus. The intuition 260 | # being that a term that appears in every document has no 261 | # predictive power. 262 | # 3 - The multiplication of TF by IDF for each cell in the matrix 263 | # allows for weighting of #1 and #2 for each cell in the matrix. 264 | 265 | 266 | # Our function for calculating relative term frequency (TF) 267 | term.frequency <- function(row) { 268 | row / sum(row) 269 | } 270 | 271 | # Our function for calculating inverse document frequency (IDF) 272 | inverse.doc.freq <- function(col) { 273 | corpus.size <- length(col) 274 | doc.count <- length(which(col > 0)) 275 | 276 | log10(corpus.size / doc.count) 277 | } 278 | 279 | # Our function for calculating TF-IDF. 280 | tf.idf <- function(x, idf) { 281 | x * idf 282 | } 283 | 284 | 285 | # First step, normalize all documents via TF. 286 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 287 | dim(train.tokens.df) 288 | View(train.tokens.df[1:20, 1:100]) 289 | 290 | 291 | # Second step, calculate the IDF vector that we will use - both 292 | # for training data and for test data! 293 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 294 | str(train.tokens.idf) 295 | 296 | 297 | # Lastly, calculate TF-IDF for our training corpus. 298 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, idf = train.tokens.idf) 299 | dim(train.tokens.tfidf) 300 | View(train.tokens.tfidf[1:25, 1:25]) 301 | 302 | 303 | # Transpose the matrix 304 | train.tokens.tfidf <- t(train.tokens.tfidf) 305 | dim(train.tokens.tfidf) 306 | View(train.tokens.tfidf[1:25, 1:25]) 307 | 308 | 309 | # Check for incopmlete cases. 310 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 311 | train$Text[incomplete.cases] 312 | 313 | 314 | # Fix incomplete cases 315 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 316 | dim(train.tokens.tfidf) 317 | sum(which(!complete.cases(train.tokens.tfidf))) 318 | 319 | 320 | # Make a clean data frame using the same process as before. 321 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 322 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 323 | 324 | 325 | # Time the code execution 326 | start.time <- Sys.time() 327 | 328 | # Create a cluster to work on 10 logical cores. 329 | cl <- makeCluster(3, type = "SOCK") 330 | registerDoSNOW(cl) 331 | 332 | # As our data is non-trivial in size at this point, use a single decision 333 | # tree alogrithm as our first model. We will graduate to using more 334 | # powerful algorithms later when we perform feature extraction to shrink 335 | # the size of our data. 
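# Editor's sketch (not part of the original script): before the next CV run
# below, a tiny sanity check of the term.frequency / inverse.doc.freq / tf.idf
# helpers defined above, using a made-up 3-document count matrix.
toy <- matrix(c(2, 1, 0,
                0, 1, 1,
                1, 3, 0),
              nrow = 3, byrow = TRUE,
              dimnames = list(paste0("doc", 1:3), c("free", "call", "now")))
toy.tf    <- apply(toy, 1, term.frequency)               # note: result is terms x docs
toy.idf   <- apply(toy, 2, inverse.doc.freq)             # one IDF value per term
toy.tfidf <- t(apply(toy.tf, 2, tf.idf, idf = toy.idf))  # back to docs x terms
toy.tfidf
# "call" appears in every toy document, so its IDF is log10(3/3) = 0 and its
# TF-IDF column is all zeros -- the "no predictive power" case described in the
# comment block above. The final t() mirrors the transpose applied to the real
# training matrix, since apply() returns its results column-wise.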
336 | rpart.cv.2 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 337 | trControl = cv.cntrl, tuneLength = 7) 338 | 339 | # Processing is done, stop cluster. 340 | stopCluster(cl) 341 | 342 | # Total time of execution on workstation was 343 | total.time <- Sys.time() - start.time 344 | total.time 345 | 346 | # Check out our results. 347 | rpart.cv.2 348 | 349 | 350 | 351 | # N-grams allow us to augment our document-term frequency matrices with 352 | # word ordering. This often leads to increased performance (e.g., accuracy) 353 | # for machine learning models trained with more than just unigrams (i.e., 354 | # single terms). Let's add bigrams to our training data and the TF-IDF 355 | # transform the expanded featre matrix to see if accuracy improves. 356 | 357 | # Add bigrams to our feature matrix. 358 | train.tokens <- tokens_ngrams(train.tokens, n = 1:2) 359 | train.tokens[[357]] 360 | 361 | 362 | # Transform to dfm and then a matrix. 363 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 364 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 365 | train.tokens.dfm 366 | 367 | 368 | # Normalize all documents via TF. 369 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 370 | 371 | 372 | # Calculate the IDF vector that we will use for training and test data! 373 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 374 | 375 | 376 | # Calculate TF-IDF for our training corpus 377 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, 378 | idf = train.tokens.idf) 379 | 380 | 381 | # Transpose the matrix 382 | train.tokens.tfidf <- t(train.tokens.tfidf) 383 | 384 | 385 | # Fix incomplete cases 386 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 387 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 388 | 389 | 390 | # Make a clean data frame. 391 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 392 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 393 | 394 | 395 | # Clean up unused objects in memory. 396 | gc() 397 | 398 | 399 | 400 | 401 | # 402 | # NOTE - The following code requires the use of command-line R to execute 403 | # due to the large number of features (i.e., columns) in the matrix. 404 | # Please consult the following link for more details if you wish 405 | # to run the code yourself: 406 | # 407 | # https://stackoverflow.com/questions/28728774/how-to-set-max-ppsize-in-r 408 | # 409 | # Also note that running the following code required approximately 410 | # 38GB of RAM and more than 4.5 hours to execute on a 10-core 411 | # workstation! 412 | # 413 | 414 | 415 | # Time the code execution 416 | # start.time <- Sys.time() 417 | 418 | # Leverage single decision trees to evaluate if adding bigrams improves the 419 | # the effectiveness of the model. 420 | # rpart.cv.3 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 421 | # trControl = cv.cntrl, tuneLength = 7) 422 | 423 | # Total time of execution on workstation was 424 | # total.time <- Sys.time() - start.time 425 | # total.time 426 | 427 | # Check out our results. 428 | # rpart.cv.3 429 | 430 | # 431 | # The results of the above processing show a slight decline in rpart 432 | # effectiveness with a 10-fold CV repeated 3 times accuracy of 0.9457. 433 | # As we will discuss later, while the addition of bigrams appears to 434 | # negatively impact a single decision tree, it helps with the mighty 435 | # random forest! 
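# Editor's note (not part of the original script): the memory and run-time
# figures quoted above come from the very wide bigram feature space -- the dfm
# itself is stored sparsely by quanteda, but it is converted to a dense matrix
# with as.matrix() and then carried through the TF-IDF apply() calls and caret.
# If you want to experiment with the bigram model on a smaller machine, one
# option (an assumption on my part, not something this tutorial does) is to
# drop very rare n-grams before going dense, e.g.:
#
#   train.tokens.dfm <- dfm_trim(train.tokens.dfm, min_docfreq = 2)
#
# Argument names differ a bit across quanteda versions, so check ?dfm_trim.
# Trimming changes the feature set, so accuracy figures will not match the
# numbers quoted in these comments exactly.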
436 | # 437 | 438 | 439 | 440 | 441 | # We'll leverage the irlba package for our singular value 442 | # decomposition (SVD). The irlba package allows us to specify 443 | # the number of the most important singular vectors we wish to 444 | # calculate and retain for features. 445 | library(irlba) 446 | 447 | 448 | # Time the code execution 449 | start.time <- Sys.time() 450 | 451 | # Perform SVD. Specifically, reduce dimensionality down to 300 columns 452 | # for our latent semantic analysis (LSA). 453 | train.irlba <- irlba(t(train.tokens.tfidf), nv = 300, maxit = 600) 454 | 455 | # Total time of execution on workstation was 456 | total.time <- Sys.time() - start.time 457 | total.time 458 | 459 | 460 | # Take a look at the new feature data up close. 461 | View(train.irlba$v) 462 | 463 | 464 | # As with TF-IDF, we will need to project new data (e.g., the test data) 465 | # into the SVD semantic space. The following code illustrates how to do 466 | # this using a row of the training data that has already been transformed 467 | # by TF-IDF, per the mathematics illustrated in the slides. 468 | # 469 | # 470 | sigma.inverse <- 1 / train.irlba$d 471 | u.transpose <- t(train.irlba$u) 472 | document <- train.tokens.tfidf[1,] 473 | document.hat <- sigma.inverse * u.transpose %*% document 474 | 475 | # Look at the first 10 components of projected document and the corresponding 476 | # row in our document semantic space (i.e., the V matrix) 477 | document.hat[1:10] 478 | train.irlba$v[1, 1:10] 479 | 480 | 481 | 482 | # 483 | # Create new feature data frame using our document semantic space of 300 484 | # features (i.e., the V matrix from our SVD). 485 | # 486 | train.svd <- data.frame(Label = train$Label, train.irlba$v) 487 | 488 | 489 | # Create a cluster to work on 10 logical cores. 490 | cl <- makeCluster(10, type = "SOCK") 491 | registerDoSNOW(cl) 492 | 493 | # Time the code execution 494 | start.time <- Sys.time() 495 | 496 | # This will be the last run using single decision trees. With a much smaller 497 | # feature matrix we can now use more powerful methods like the mighty Random 498 | # Forest from now on! 499 | rpart.cv.4 <- train(Label ~ ., data = train.svd, method = "rpart", 500 | trControl = cv.cntrl, tuneLength = 7) 501 | 502 | # Processing is done, stop cluster. 503 | stopCluster(cl) 504 | 505 | # Total time of execution on workstation was 506 | total.time <- Sys.time() - start.time 507 | total.time 508 | 509 | # Check out our results. 510 | rpart.cv.4 511 | 512 | 513 | 514 | 515 | # 516 | # NOTE - The following code takes a long time to run. Here's the math. 517 | # We are performing 10-fold CV repeated 3 times. That means we 518 | # need to build 30 models. We are also asking caret to try 7 519 | # different values of the mtry parameter. Next up by default 520 | # a mighty random forest leverages 500 trees. Lastly, caret will 521 | # build 1 final model at the end of the process with the best 522 | # mtry value over all the training data. Here's the number of 523 | # tree we're building: 524 | # 525 | # (10 * 3 * 7 * 500) + 500 = 105,500 trees! 526 | # 527 | # On a workstation using 10 cores the following code took 28 minutes 528 | # to execute. 529 | # 530 | 531 | 532 | # Create a cluster to work on 10 logical cores. 533 | # cl <- makeCluster(10, type = "SOCK") 534 | # registerDoSNOW(cl) 535 | 536 | # Time the code execution 537 | # start.time <- Sys.time() 538 | 539 | # We have reduced the dimensionality of our data using SVD. 
Also, the 540 | # application of SVD allows us to use LSA to simultaneously increase the 541 | # information density of each feature. To prove this out, leverage a 542 | # mighty Random Forest with the default of 500 trees. We'll also ask 543 | # caret to try 7 different values of mtry to find the mtry value that 544 | # gives the best result! 545 | # rf.cv.1 <- train(Label ~ ., data = train.svd, method = "rf", 546 | # trControl = cv.cntrl, tuneLength = 7) 547 | 548 | # Processing is done, stop cluster. 549 | # stopCluster(cl) 550 | 551 | # Total time of execution on workstation was 552 | # total.time <- Sys.time() - start.time 553 | # total.time 554 | 555 | 556 | # Load processing results from disk! 557 | load("rf.cv.1.RData") 558 | 559 | # Check out our results. 560 | rf.cv.1 561 | 562 | # Let's drill-down on the results. 563 | confusionMatrix(train.svd$Label, rf.cv.1$finalModel$predicted) 564 | 565 | 566 | 567 | 568 | 569 | # OK, now let's add in the feature we engineered previously for SMS 570 | # text length to see if it improves things. 571 | train.svd$TextLength <- train$TextLength 572 | 573 | 574 | # Create a cluster to work on 10 logical cores. 575 | # cl <- makeCluster(10, type = "SOCK") 576 | # registerDoSNOW(cl) 577 | 578 | # Time the code execution 579 | # start.time <- Sys.time() 580 | 581 | # Re-run the training process with the additional feature. 582 | # rf.cv.2 <- train(Label ~ ., data = train.svd, method = "rf", 583 | # trControl = cv.cntrl, tuneLength = 7, 584 | # importance = TRUE) 585 | 586 | # Processing is done, stop cluster. 587 | # stopCluster(cl) 588 | 589 | # Total time of execution on workstation was 590 | # total.time <- Sys.time() - start.time 591 | # total.time 592 | 593 | # Load results from disk. 594 | load("rf.cv.2.RData") 595 | 596 | # Check the results. 597 | rf.cv.2 598 | 599 | # Drill-down on the results. 600 | confusionMatrix(train.svd$Label, rf.cv.2$finalModel$predicted) 601 | 602 | # How important was the new feature? 603 | library(randomForest) 604 | varImpPlot(rf.cv.1$finalModel) 605 | varImpPlot(rf.cv.2$finalModel) 606 | 607 | 608 | 609 | 610 | # Turns out that our TextLength feature is very predictive and pushed our 611 | # overall accuracy over the training data to 97.1%. We can also use the 612 | # power of cosine similarity to engineer a feature for calculating, on 613 | # average, how alike each SMS text message is to all of the spam messages. 614 | # The hypothesis here is that our use of bigrams, tf-idf, and LSA have 615 | # produced a representation where ham SMS messages should have low cosine 616 | # similarities with spam SMS messages and vice versa. 617 | 618 | # Use the lsa package's cosine function for our calculations. 619 | #install.packages("lsa") 620 | library(lsa) 621 | 622 | train.similarities <- cosine(t(as.matrix(train.svd[, -c(1, ncol(train.svd))]))) 623 | 624 | 625 | # Next up - take each SMS text message and find what the mean cosine 626 | # similarity is for each SMS text mean with each of the spam SMS messages. 627 | # Per our hypothesis, ham SMS text messages should have relatively low 628 | # cosine similarities with spam messages and vice versa! 
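# Editor's note (not part of the original script): the loop just below fills in
# SpamSimilarity one message at a time. An equivalent vectorized form over the
# same train.similarities matrix is:
#
#   spam.indexes <- which(train$Label == "spam")
#   train.svd$SpamSimilarity <- rowMeans(train.similarities[, spam.indexes])
#
# which gives, for every message, its mean cosine similarity to all spam
# messages in a single call.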
629 | spam.indexes <- which(train$Label == "spam") 630 | 631 | train.svd$SpamSimilarity <- rep(0.0, nrow(train.svd)) 632 | for(i in 1:nrow(train.svd)) { 633 | train.svd$SpamSimilarity[i] <- mean(train.similarities[i, spam.indexes]) 634 | } 635 | 636 | 637 | # As always, let's visualize our results using the mighty ggplot2 638 | ggplot(train.svd, aes(x = SpamSimilarity, fill = Label)) + 639 | theme_bw() + 640 | geom_histogram(binwidth = 0.05) + 641 | labs(y = "Message Count", 642 | x = "Mean Spam Message Cosine Similarity", 643 | title = "Distribution of Ham vs. Spam Using Spam Cosine Similarity") 644 | 645 | 646 | # Per our analysis of mighty random forest results, we are interested in 647 | # in features that can raise model performance with respect to sensitivity. 648 | # Perform another CV process using the new spam cosine similarity feature. 649 | 650 | # Create a cluster to work on 10 logical cores. 651 | # cl <- makeCluster(10, type = "SOCK") 652 | # registerDoSNOW(cl) 653 | 654 | # Time the code execution 655 | # start.time <- Sys.time() 656 | 657 | # Re-run the training process with the additional feature. 658 | # rf.cv.3 <- train(Label ~ ., data = train.svd, method = "rf", 659 | # trControl = cv.cntrl, tuneLength = 7, 660 | # importance = TRUE) 661 | 662 | # Processing is done, stop cluster. 663 | # stopCluster(cl) 664 | 665 | # Total time of execution on workstation was 666 | # total.time <- Sys.time() - start.time 667 | # total.time 668 | 669 | 670 | # Load results from disk. 671 | load("rf.cv.3.RData") 672 | 673 | # Check the results. 674 | rf.cv.3 675 | 676 | # Drill-down on the results. 677 | confusionMatrix(train.svd$Label, rf.cv.3$finalModel$predicted) 678 | 679 | # How important was this feature? 680 | library(randomForest) 681 | varImpPlot(rf.cv.3$finalModel) 682 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part11.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 11 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=XWUi7RivDJY 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | # Clean up the data frame and view our handiwork. 39 | spam.raw <- spam.raw[, 1:2] 40 | names(spam.raw) <- c("Label", "Text") 41 | View(spam.raw) 42 | 43 | 44 | 45 | # Check data to see if there are missing values. 
46 | length(which(!complete.cases(spam.raw))) 47 | 48 | 49 | 50 | # Convert our class label into a factor. 51 | spam.raw$Label <- as.factor(spam.raw$Label) 52 | 53 | 54 | 55 | # The first step, as always, is to explore the data. 56 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 57 | prop.table(table(spam.raw$Label)) 58 | 59 | 60 | 61 | # Next up, let's get a feel for the distribution of text lengths of the SMS 62 | # messages by adding a new feature for the length of each message. 63 | spam.raw$TextLength <- nchar(spam.raw$Text) 64 | summary(spam.raw$TextLength) 65 | 66 | 67 | 68 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 69 | library(ggplot2) 70 | 71 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 72 | theme_bw() + 73 | geom_histogram(binwidth = 5) + 74 | labs(y = "Text Count", x = "Length of Text", 75 | title = "Distribution of Text Lengths with Class Labels") 76 | 77 | 78 | 79 | # At a minimum we need to split our data into a training set and a 80 | # test set. In a true project we would want to use a three-way split 81 | # of training, validation, and test. 82 | # 83 | # As we know that our data has non-trivial class imbalance, we'll 84 | # use the mighty caret package to create a randomg train/test split 85 | # that ensures the correct ham/spam class label proportions (i.e., 86 | # we'll use caret for a random stratified split). 87 | library(caret) 88 | help(package = "caret") 89 | 90 | 91 | # Use caret to create a 70%/30% stratified split. Set the random 92 | # seed for reproducibility. 93 | set.seed(32984) 94 | indexes <- createDataPartition(spam.raw$Label, times = 1, 95 | p = 0.7, list = FALSE) 96 | 97 | train <- spam.raw[indexes,] 98 | test <- spam.raw[-indexes,] 99 | 100 | 101 | # Verify proportions. 102 | prop.table(table(train$Label)) 103 | prop.table(table(test$Label)) 104 | 105 | 106 | 107 | # Text analytics requires a lot of data exploration, data pre-processing 108 | # and data wrangling. Let's explore some examples. 109 | 110 | # HTML-escaped ampersand character. 111 | train$Text[21] 112 | 113 | 114 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 115 | # is an actual person, but we will ignore the implications of this for 116 | # this introductory tutorial. 117 | train$Text[38] 118 | 119 | 120 | # A URL. 121 | train$Text[357] 122 | 123 | 124 | 125 | # There are many packages in the R ecosystem for performing text 126 | # analytics. One of the newer packages in quanteda. The quanteda 127 | # package has many useful functions for quickly and easily working 128 | # with text data. 129 | library(quanteda) 130 | help(package = "quanteda") 131 | 132 | 133 | # Tokenize SMS text messages. 134 | train.tokens <- tokens(train$Text, what = "word", 135 | remove_numbers = TRUE, remove_punct = TRUE, 136 | remove_symbols = TRUE, remove_hyphens = TRUE) 137 | 138 | # Take a look at a specific SMS message and see how it transforms. 139 | train.tokens[[357]] 140 | 141 | 142 | # Lower case the tokens. 143 | train.tokens <- tokens_tolower(train.tokens) 144 | train.tokens[[357]] 145 | 146 | 147 | # Use quanteda's built-in stopword list for English. 148 | # NOTE - You should always inspect stopword lists for applicability to 149 | # your problem/domain. 150 | train.tokens <- tokens_select(train.tokens, stopwords(), 151 | selection = "remove") 152 | train.tokens[[357]] 153 | 154 | 155 | # Perform stemming on the tokens. 
156 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 157 | train.tokens[[357]] 158 | 159 | 160 | # Create our first bag-of-words model. 161 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 162 | 163 | 164 | # Transform to a matrix and inspect. 165 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 166 | View(train.tokens.matrix[1:20, 1:100]) 167 | dim(train.tokens.matrix) 168 | 169 | 170 | # Investigate the effects of stemming. 171 | colnames(train.tokens.matrix)[1:50] 172 | 173 | 174 | # Per best practices, we will leverage cross validation (CV) as 175 | # the basis of our modeling process. Using CV we can create 176 | # estimates of how well our model will do in Production on new, 177 | # unseen data. CV is powerful, but the downside is that it 178 | # requires more processing and therefore more time. 179 | # 180 | # If you are not familiar with CV, consult the following 181 | # Wikipedia article: 182 | # 183 | # https://en.wikipedia.org/wiki/Cross-validation_(statistics) 184 | # 185 | 186 | # Setup a the feature data frame with labels. 187 | train.tokens.df <- cbind(Label = train$Label, data.frame(train.tokens.dfm)) 188 | 189 | 190 | # Often, tokenization requires some additional pre-processing 191 | names(train.tokens.df)[c(146, 148, 235, 238)] 192 | 193 | 194 | # Cleanup column names. 195 | names(train.tokens.df) <- make.names(names(train.tokens.df)) 196 | 197 | 198 | # Use caret to create stratified folds for 10-fold cross validation repeated 199 | # 3 times (i.e., create 30 random stratified samples) 200 | set.seed(48743) 201 | cv.folds <- createMultiFolds(train$Label, k = 10, times = 3) 202 | 203 | cv.cntrl <- trainControl(method = "repeatedcv", number = 10, 204 | repeats = 3, index = cv.folds) 205 | 206 | 207 | # Our data frame is non-trivial in size. As such, CV runs will take 208 | # quite a long time to run. To cut down on total execution time, use 209 | # the doSNOW package to allow for multi-core training in parallel. 210 | # 211 | # WARNING - The following code is configured to run on a workstation- 212 | # or server-class machine (i.e., 12 logical cores). Alter 213 | # code to suit your HW environment. 214 | # 215 | #install.packages("doSNOW") 216 | library(doSNOW) 217 | 218 | 219 | # Time the code execution 220 | start.time <- Sys.time() 221 | 222 | 223 | # Create a cluster to work on 10 logical cores. 224 | cl <- makeCluster(10, type = "SOCK") 225 | registerDoSNOW(cl) 226 | 227 | 228 | # As our data is non-trivial in size at this point, use a single decision 229 | # tree alogrithm as our first model. We will graduate to using more 230 | # powerful algorithms later when we perform feature extraction to shrink 231 | # the size of our data. 232 | rpart.cv.1 <- train(Label ~ ., data = train.tokens.df, method = "rpart", 233 | trControl = cv.cntrl, tuneLength = 7) 234 | 235 | 236 | # Processing is done, stop cluster. 237 | stopCluster(cl) 238 | 239 | 240 | # Total time of execution on workstation was approximately 4 minutes. 241 | total.time <- Sys.time() - start.time 242 | total.time 243 | 244 | 245 | # Check out our results. 246 | rpart.cv.1 247 | 248 | 249 | 250 | # The use of Term Frequency-Inverse Document Frequency (TF-IDF) is a 251 | # powerful technique for enhancing the information/signal contained 252 | # within our document-frequency matrix. 
Specifically, the mathematics 253 | # behind TF-IDF accomplish the following goals: 254 | # 1 - The TF calculation accounts for the fact that longer 255 | # documents will have higher individual term counts. Applying 256 | # TF normalizes all documents in the corpus to be length 257 | # independent. 258 | # 2 - The IDF calculation accounts for the frequency of term 259 | # appearance in all documents in the corpus. The intuition 260 | # being that a term that appears in every document has no 261 | # predictive power. 262 | # 3 - The multiplication of TF by IDF for each cell in the matrix 263 | # allows for weighting of #1 and #2 for each cell in the matrix. 264 | 265 | 266 | # Our function for calculating relative term frequency (TF) 267 | term.frequency <- function(row) { 268 | row / sum(row) 269 | } 270 | 271 | # Our function for calculating inverse document frequency (IDF) 272 | inverse.doc.freq <- function(col) { 273 | corpus.size <- length(col) 274 | doc.count <- length(which(col > 0)) 275 | 276 | log10(corpus.size / doc.count) 277 | } 278 | 279 | # Our function for calculating TF-IDF. 280 | tf.idf <- function(x, idf) { 281 | x * idf 282 | } 283 | 284 | 285 | # First step, normalize all documents via TF. 286 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 287 | dim(train.tokens.df) 288 | View(train.tokens.df[1:20, 1:100]) 289 | 290 | 291 | # Second step, calculate the IDF vector that we will use - both 292 | # for training data and for test data! 293 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 294 | str(train.tokens.idf) 295 | 296 | 297 | # Lastly, calculate TF-IDF for our training corpus. 298 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, idf = train.tokens.idf) 299 | dim(train.tokens.tfidf) 300 | View(train.tokens.tfidf[1:25, 1:25]) 301 | 302 | 303 | # Transpose the matrix 304 | train.tokens.tfidf <- t(train.tokens.tfidf) 305 | dim(train.tokens.tfidf) 306 | View(train.tokens.tfidf[1:25, 1:25]) 307 | 308 | 309 | # Check for incopmlete cases. 310 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 311 | train$Text[incomplete.cases] 312 | 313 | 314 | # Fix incomplete cases 315 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 316 | dim(train.tokens.tfidf) 317 | sum(which(!complete.cases(train.tokens.tfidf))) 318 | 319 | 320 | # Make a clean data frame using the same process as before. 321 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 322 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 323 | 324 | 325 | # Time the code execution 326 | start.time <- Sys.time() 327 | 328 | # Create a cluster to work on 10 logical cores. 329 | cl <- makeCluster(10, type = "SOCK") 330 | registerDoSNOW(cl) 331 | 332 | # As our data is non-trivial in size at this point, use a single decision 333 | # tree alogrithm as our first model. We will graduate to using more 334 | # powerful algorithms later when we perform feature extraction to shrink 335 | # the size of our data. 336 | rpart.cv.2 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 337 | trControl = cv.cntrl, tuneLength = 7) 338 | 339 | # Processing is done, stop cluster. 340 | stopCluster(cl) 341 | 342 | # Total time of execution on workstation was 343 | total.time <- Sys.time() - start.time 344 | total.time 345 | 346 | # Check out our results. 
347 | rpart.cv.2 348 | 349 | 350 | 351 | # N-grams allow us to augment our document-term frequency matrices with 352 | # word ordering. This often leads to increased performance (e.g., accuracy) 353 | # for machine learning models trained with more than just unigrams (i.e., 354 | # single terms). Let's add bigrams to our training data and the TF-IDF 355 | # transform the expanded featre matrix to see if accuracy improves. 356 | 357 | # Add bigrams to our feature matrix. 358 | train.tokens <- tokens_ngrams(train.tokens, n = 1:2) 359 | train.tokens[[357]] 360 | 361 | 362 | # Transform to dfm and then a matrix. 363 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 364 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 365 | train.tokens.dfm 366 | 367 | 368 | # Normalize all documents via TF. 369 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 370 | 371 | 372 | # Calculate the IDF vector that we will use for training and test data! 373 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 374 | 375 | 376 | # Calculate TF-IDF for our training corpus 377 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, 378 | idf = train.tokens.idf) 379 | 380 | 381 | # Transpose the matrix 382 | train.tokens.tfidf <- t(train.tokens.tfidf) 383 | 384 | 385 | # Fix incomplete cases 386 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 387 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 388 | 389 | 390 | # Make a clean data frame. 391 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 392 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 393 | 394 | 395 | # Clean up unused objects in memory. 396 | gc() 397 | 398 | 399 | 400 | 401 | # 402 | # NOTE - The following code requires the use of command-line R to execute 403 | # due to the large number of features (i.e., columns) in the matrix. 404 | # Please consult the following link for more details if you wish 405 | # to run the code yourself: 406 | # 407 | # https://stackoverflow.com/questions/28728774/how-to-set-max-ppsize-in-r 408 | # 409 | # Also note that running the following code required approximately 410 | # 38GB of RAM and more than 4.5 hours to execute on a 10-core 411 | # workstation! 412 | # 413 | 414 | 415 | # Time the code execution 416 | # start.time <- Sys.time() 417 | 418 | # Leverage single decision trees to evaluate if adding bigrams improves the 419 | # the effectiveness of the model. 420 | # rpart.cv.3 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 421 | # trControl = cv.cntrl, tuneLength = 7) 422 | 423 | # Total time of execution on workstation was 424 | # total.time <- Sys.time() - start.time 425 | # total.time 426 | 427 | # Check out our results. 428 | # rpart.cv.3 429 | 430 | # 431 | # The results of the above processing show a slight decline in rpart 432 | # effectiveness with a 10-fold CV repeated 3 times accuracy of 0.9457. 433 | # As we will discuss later, while the addition of bigrams appears to 434 | # negatively impact a single decision tree, it helps with the mighty 435 | # random forest! 436 | # 437 | 438 | 439 | 440 | 441 | # We'll leverage the irlba package for our singular value 442 | # decomposition (SVD). The irlba package allows us to specify 443 | # the number of the most important singular vectors we wish to 444 | # calculate and retain for features. 
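#
# OPTIONAL - A tiny, self-contained illustration (not in the original video)
#            of what a truncated SVD returns. On a small random matrix the
#            singular values from irlba closely match the leading singular
#            values from base svd(). Assumes the irlba package is installed.
#
library(irlba)

set.seed(42)
toy <- matrix(rnorm(200), nrow = 20)   # toy 20 x 10 "document-term" matrix

toy.irlba <- irlba(toy, nv = 3)        # keep only the top 3 singular vectors
toy.svd <- svd(toy)

# Singular values should agree to within numerical precision.
round(toy.irlba$d, 4)
round(toy.svd$d[1:3], 4)

# u is 20 x 3 and v is 10 x 3 - the reduced-dimension feature spaces.
dim(toy.irlba$u)
dim(toy.irlba$v)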
445 | library(irlba) 446 | 447 | 448 | # Time the code execution 449 | start.time <- Sys.time() 450 | 451 | # Perform SVD. Specifically, reduce dimensionality down to 300 columns 452 | # for our latent semantic analysis (LSA). 453 | train.irlba <- irlba(t(train.tokens.tfidf), nv = 300, maxit = 600) 454 | 455 | # Total time of execution on workstation was 456 | total.time <- Sys.time() - start.time 457 | total.time 458 | 459 | 460 | # Take a look at the new feature data up close. 461 | View(train.irlba$v) 462 | 463 | 464 | # As with TF-IDF, we will need to project new data (e.g., the test data) 465 | # into the SVD semantic space. The following code illustrates how to do 466 | # this using a row of the training data that has already been transformed 467 | # by TF-IDF, per the mathematics illustrated in the slides. 468 | # 469 | # 470 | sigma.inverse <- 1 / train.irlba$d 471 | u.transpose <- t(train.irlba$u) 472 | document <- train.tokens.tfidf[1,] 473 | document.hat <- sigma.inverse * u.transpose %*% document 474 | 475 | # Look at the first 10 components of projected document and the corresponding 476 | # row in our document semantic space (i.e., the V matrix) 477 | document.hat[1:10] 478 | train.irlba$v[1, 1:10] 479 | 480 | 481 | 482 | # 483 | # Create new feature data frame using our document semantic space of 300 484 | # features (i.e., the V matrix from our SVD). 485 | # 486 | train.svd <- data.frame(Label = train$Label, train.irlba$v) 487 | 488 | 489 | # Create a cluster to work on 10 logical cores. 490 | cl <- makeCluster(10, type = "SOCK") 491 | registerDoSNOW(cl) 492 | 493 | # Time the code execution 494 | start.time <- Sys.time() 495 | 496 | # This will be the last run using single decision trees. With a much smaller 497 | # feature matrix we can now use more powerful methods like the mighty Random 498 | # Forest from now on! 499 | rpart.cv.4 <- train(Label ~ ., data = train.svd, method = "rpart", 500 | trControl = cv.cntrl, tuneLength = 7) 501 | 502 | # Processing is done, stop cluster. 503 | stopCluster(cl) 504 | 505 | # Total time of execution on workstation was 506 | total.time <- Sys.time() - start.time 507 | total.time 508 | 509 | # Check out our results. 510 | rpart.cv.4 511 | 512 | 513 | 514 | 515 | # 516 | # NOTE - The following code takes a long time to run. Here's the math. 517 | # We are performing 10-fold CV repeated 3 times. That means we 518 | # need to build 30 models. We are also asking caret to try 7 519 | # different values of the mtry parameter. Next up by default 520 | # a mighty random forest leverages 500 trees. Lastly, caret will 521 | # build 1 final model at the end of the process with the best 522 | # mtry value over all the training data. Here's the number of 523 | # tree we're building: 524 | # 525 | # (10 * 3 * 7 * 500) + 500 = 105,500 trees! 526 | # 527 | # On a workstation using 10 cores the following code took 28 minutes 528 | # to execute. 529 | # 530 | 531 | 532 | # Create a cluster to work on 10 logical cores. 533 | # cl <- makeCluster(10, type = "SOCK") 534 | # registerDoSNOW(cl) 535 | 536 | # Time the code execution 537 | # start.time <- Sys.time() 538 | 539 | # We have reduced the dimensionality of our data using SVD. Also, the 540 | # application of SVD allows us to use LSA to simultaneously increase the 541 | # information density of each feature. To prove this out, leverage a 542 | # mighty Random Forest with the default of 500 trees. 
We'll also ask 543 | # caret to try 7 different values of mtry to find the mtry value that 544 | # gives the best result! 545 | # rf.cv.1 <- train(Label ~ ., data = train.svd, method = "rf", 546 | # trControl = cv.cntrl, tuneLength = 7) 547 | 548 | # Processing is done, stop cluster. 549 | # stopCluster(cl) 550 | 551 | # Total time of execution on workstation was 552 | # total.time <- Sys.time() - start.time 553 | # total.time 554 | 555 | 556 | # Load processing results from disk! 557 | load("rf.cv.1.RData") 558 | 559 | # Check out our results. 560 | rf.cv.1 561 | 562 | # Let's drill-down on the results. 563 | confusionMatrix(train.svd$Label, rf.cv.1$finalModel$predicted) 564 | 565 | 566 | 567 | 568 | 569 | # OK, now let's add in the feature we engineered previously for SMS 570 | # text length to see if it improves things. 571 | train.svd$TextLength <- train$TextLength 572 | 573 | 574 | # Create a cluster to work on 10 logical cores. 575 | # cl <- makeCluster(10, type = "SOCK") 576 | # registerDoSNOW(cl) 577 | 578 | # Time the code execution 579 | # start.time <- Sys.time() 580 | 581 | # Re-run the training process with the additional feature. 582 | # rf.cv.2 <- train(Label ~ ., data = train.svd, method = "rf", 583 | # trControl = cv.cntrl, tuneLength = 7, 584 | # importance = TRUE) 585 | 586 | # Processing is done, stop cluster. 587 | # stopCluster(cl) 588 | 589 | # Total time of execution on workstation was 590 | # total.time <- Sys.time() - start.time 591 | # total.time 592 | 593 | # Load results from disk. 594 | load("rf.cv.2.RData") 595 | 596 | # Check the results. 597 | rf.cv.2 598 | 599 | # Drill-down on the results. 600 | confusionMatrix(train.svd$Label, rf.cv.2$finalModel$predicted) 601 | 602 | # How important was the new feature? 603 | library(randomForest) 604 | varImpPlot(rf.cv.1$finalModel) 605 | varImpPlot(rf.cv.2$finalModel) 606 | 607 | 608 | 609 | 610 | # Turns out that our TextLength feature is very predictive and pushed our 611 | # overall accuracy over the training data to 97.1%. We can also use the 612 | # power of cosine similarity to engineer a feature for calculating, on 613 | # average, how alike each SMS text message is to all of the spam messages. 614 | # The hypothesis here is that our use of bigrams, tf-idf, and LSA have 615 | # produced a representation where ham SMS messages should have low cosine 616 | # similarities with spam SMS messages and vice versa. 617 | 618 | # Use the lsa package's cosine function for our calculations. 619 | #install.packages("lsa") 620 | library(lsa) 621 | 622 | train.similarities <- cosine(t(as.matrix(train.svd[, -c(1, ncol(train.svd))]))) 623 | 624 | 625 | # Next up - take each SMS text message and find what the mean cosine 626 | # similarity is for each SMS text mean with each of the spam SMS messages. 627 | # Per our hypothesis, ham SMS text messages should have relatively low 628 | # cosine similarities with spam messages and vice versa! 629 | spam.indexes <- which(train$Label == "spam") 630 | 631 | train.svd$SpamSimilarity <- rep(0.0, nrow(train.svd)) 632 | for(i in 1:nrow(train.svd)) { 633 | train.svd$SpamSimilarity[i] <- mean(train.similarities[i, spam.indexes]) 634 | } 635 | 636 | 637 | # As always, let's visualize our results using the mighty ggplot2 638 | ggplot(train.svd, aes(x = SpamSimilarity, fill = Label)) + 639 | theme_bw() + 640 | geom_histogram(binwidth = 0.05) + 641 | labs(y = "Message Count", 642 | x = "Mean Spam Message Cosine Similarity", 643 | title = "Distribution of Ham vs. 
Spam Using Spam Cosine Similarity") 644 | 645 | 646 | # Per our analysis of mighty random forest results, we are interested in 647 | # in features that can raise model performance with respect to sensitivity. 648 | # Perform another CV process using the new spam cosine similarity feature. 649 | 650 | # Create a cluster to work on 10 logical cores. 651 | # cl <- makeCluster(10, type = "SOCK") 652 | # registerDoSNOW(cl) 653 | 654 | # Time the code execution 655 | # start.time <- Sys.time() 656 | 657 | # Re-run the training process with the additional feature. 658 | # set.seed(932847) 659 | # rf.cv.3 <- train(Label ~ ., data = train.svd, method = "rf", 660 | # trControl = cv.cntrl, tuneLength = 7, 661 | # importance = TRUE) 662 | 663 | # Processing is done, stop cluster. 664 | # stopCluster(cl) 665 | 666 | # Total time of execution on workstation was 667 | # total.time <- Sys.time() - start.time 668 | # total.time 669 | 670 | 671 | # Load results from disk. 672 | load("rf.cv.3.RData") 673 | 674 | # Check the results. 675 | rf.cv.3 676 | 677 | # Drill-down on the results. 678 | confusionMatrix(train.svd$Label, rf.cv.3$finalModel$predicted) 679 | 680 | # How important was this feature? 681 | library(randomForest) 682 | varImpPlot(rf.cv.3$finalModel) 683 | 684 | 685 | 686 | 687 | # We've built what appears to be an effective predictive model. Time to verify 688 | # using the test holdout data we set aside at the beginning of the project. 689 | # First stage of this verification is running the test data through our pre- 690 | # processing pipeline of: 691 | # 1 - Tokenization 692 | # 2 - Lower casing 693 | # 3 - Stopword removal 694 | # 4 - Stemming 695 | # 5 - Adding bigrams 696 | # 6 - Transform to dfm 697 | # 7 - Ensure test dfm has same features as train dfm 698 | 699 | # Tokenization. 700 | test.tokens <- tokens(test$Text, what = "word", 701 | remove_numbers = TRUE, remove_punct = TRUE, 702 | remove_symbols = TRUE, remove_hyphens = TRUE) 703 | 704 | # Lower case the tokens. 705 | test.tokens <- tokens_tolower(test.tokens) 706 | 707 | # Stopword removal. 708 | test.tokens <- tokens_select(test.tokens, stopwords(), 709 | selection = "remove") 710 | 711 | # Stemming. 712 | test.tokens <- tokens_wordstem(test.tokens, language = "english") 713 | 714 | # Add bigrams. 715 | test.tokens <- tokens_ngrams(test.tokens, n = 1:2) 716 | 717 | # Convert n-grams to quanteda document-term frequency matrix. 718 | test.tokens.dfm <- dfm(test.tokens, tolower = FALSE) 719 | 720 | # Explore the train and test quanteda dfm objects. 721 | train.tokens.dfm 722 | test.tokens.dfm 723 | 724 | # Ensure the test dfm has the same n-grams as the training dfm. 725 | # 726 | # NOTE - In production we should expect that new text messages will 727 | # contain n-grams that did not exist in the original training 728 | # data. As such, we need to strip those n-grams out. 729 | # 730 | test.tokens.dfm <- dfm_select(test.tokens.dfm, pattern = train.tokens.dfm, 731 | selection = "keep") 732 | test.tokens.matrix <- as.matrix(test.tokens.dfm) 733 | test.tokens.dfm 734 | 735 | 736 | 737 | 738 | # With the raw test features in place next up is the projecting the term 739 | # counts for the unigrams into the same TF-IDF vector space as our training 740 | # data. The high level process is as follows: 741 | # 1 - Normalize each document (i.e, each row) 742 | # 2 - Perform IDF multiplication using training IDF values 743 | 744 | # Normalize all documents via TF. 
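#
# OPTIONAL - Why a "fix incomplete cases" step is needed below: if a test
#            message loses every token (e.g., it contained only stopwords or
#            terms never seen in training), its count row is all zeros and
#            the TF normalization divides by zero. A minimal illustration
#            with a made-up row:
#
empty.row <- c(free = 0, win = 0, call = 0)
term.frequency(empty.row)   # 0 / 0 produces NaN for every term
# Those NaN entries are exactly what the is.na() replacement below cleans up.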
745 | test.tokens.df <- apply(test.tokens.matrix, 1, term.frequency) 746 | str(test.tokens.df) 747 | 748 | # Lastly, calculate TF-IDF for our training corpus. 749 | test.tokens.tfidf <- apply(test.tokens.df, 2, tf.idf, idf = train.tokens.idf) 750 | dim(test.tokens.tfidf) 751 | View(test.tokens.tfidf[1:25, 1:25]) 752 | 753 | # Transpose the matrix 754 | test.tokens.tfidf <- t(test.tokens.tfidf) 755 | 756 | # Fix incomplete cases 757 | summary(test.tokens.tfidf[1,]) 758 | test.tokens.tfidf[is.na(test.tokens.tfidf)] <- 0.0 759 | summary(test.tokens.tfidf[1,]) 760 | 761 | 762 | 763 | 764 | # With the test data projected into the TF-IDF vector space of the training 765 | # data we can now to the final projection into the training LSA semantic 766 | # space (i.e. the SVD matrix factorization). 767 | test.svd.raw <- t(sigma.inverse * u.transpose %*% t(test.tokens.tfidf)) 768 | 769 | 770 | # Lastly, we can now build the test data frame to feed into our trained 771 | # machine learning model for predictions. First up, add Label and TextLength. 772 | test.svd <- data.frame(Label = test$Label, test.svd.raw, 773 | TextLength = test$TextLength) 774 | 775 | 776 | # Next step, calculate SpamSimilarity for all the test documents. First up, 777 | # create a spam similarity matrix. 778 | test.similarities <- rbind(test.svd.raw, train.irlba$v[spam.indexes,]) 779 | test.similarities <- cosine(t(test.similarities)) 780 | 781 | 782 | # 783 | # NOTE - The following code was updated post-video recoding due to a bug. 784 | # 785 | test.svd$SpamSimilarity <- rep(0.0, nrow(test.svd)) 786 | spam.cols <- (nrow(test.svd) + 1):ncol(test.similarities) 787 | for(i in 1:nrow(test.svd)) { 788 | # The following line has the bug fix. 789 | test.svd$SpamSimilarity[i] <- mean(test.similarities[i, spam.cols]) 790 | } 791 | 792 | 793 | # Some SMS text messages become empty as a result of stopword and special 794 | # character removal. This results in spam similarity measures of 0. Correct. 795 | # This code as added post-video as part of the bug fix. 796 | test.svd$SpamSimilarity[!is.finite(test.svd$SpamSimilarity)] <- 0 797 | 798 | 799 | # Now we can make predictions on the test data set using our trained mighty 800 | # random forest. 801 | preds <- predict(rf.cv.3, test.svd) 802 | 803 | 804 | # Drill-in on results 805 | confusionMatrix(preds, test.svd$Label) 806 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part12.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 12 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=-wCrClheObk 23 | # 24 | 25 | 26 | # Install all required packages. 
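#
# OPTIONAL - A convenience, not part of the original video: install.packages()
#            below re-installs everything on every run. To install only what
#            is missing (including doSNOW and lsa, which are used later in
#            this script), something like the following works:
#
pkgs <- c("ggplot2", "e1071", "caret", "quanteda", "irlba", "randomForest",
          "doSNOW", "lsa")
missing.pkgs <- setdiff(pkgs, rownames(installed.packages()))
if (length(missing.pkgs) > 0) install.packages(missing.pkgs)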
27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | # Clean up the data frame and view our handiwork. 39 | spam.raw <- spam.raw[, 1:2] 40 | names(spam.raw) <- c("Label", "Text") 41 | View(spam.raw) 42 | 43 | 44 | 45 | # Check data to see if there are missing values. 46 | length(which(!complete.cases(spam.raw))) 47 | 48 | 49 | 50 | # Convert our class label into a factor. 51 | spam.raw$Label <- as.factor(spam.raw$Label) 52 | 53 | 54 | 55 | # The first step, as always, is to explore the data. 56 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 57 | prop.table(table(spam.raw$Label)) 58 | 59 | 60 | 61 | # Next up, let's get a feel for the distribution of text lengths of the SMS 62 | # messages by adding a new feature for the length of each message. 63 | spam.raw$TextLength <- nchar(spam.raw$Text) 64 | summary(spam.raw$TextLength) 65 | 66 | 67 | 68 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 69 | library(ggplot2) 70 | 71 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 72 | theme_bw() + 73 | geom_histogram(binwidth = 5) + 74 | labs(y = "Text Count", x = "Length of Text", 75 | title = "Distribution of Text Lengths with Class Labels") 76 | 77 | 78 | 79 | # At a minimum we need to split our data into a training set and a 80 | # test set. In a true project we would want to use a three-way split 81 | # of training, validation, and test. 82 | # 83 | # As we know that our data has non-trivial class imbalance, we'll 84 | # use the mighty caret package to create a randomg train/test split 85 | # that ensures the correct ham/spam class label proportions (i.e., 86 | # we'll use caret for a random stratified split). 87 | library(caret) 88 | help(package = "caret") 89 | 90 | 91 | # Use caret to create a 70%/30% stratified split. Set the random 92 | # seed for reproducibility. 93 | set.seed(32984) 94 | indexes <- createDataPartition(spam.raw$Label, times = 1, 95 | p = 0.7, list = FALSE) 96 | 97 | train <- spam.raw[indexes,] 98 | test <- spam.raw[-indexes,] 99 | 100 | 101 | # Verify proportions. 102 | prop.table(table(train$Label)) 103 | prop.table(table(test$Label)) 104 | 105 | 106 | 107 | # Text analytics requires a lot of data exploration, data pre-processing 108 | # and data wrangling. Let's explore some examples. 109 | 110 | # HTML-escaped ampersand character. 111 | train$Text[21] 112 | 113 | 114 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 115 | # is an actual person, but we will ignore the implications of this for 116 | # this introductory tutorial. 117 | train$Text[38] 118 | 119 | 120 | # A URL. 121 | train$Text[357] 122 | 123 | 124 | 125 | # There are many packages in the R ecosystem for performing text 126 | # analytics. One of the newer packages in quanteda. The quanteda 127 | # package has many useful functions for quickly and easily working 128 | # with text data. 129 | library(quanteda) 130 | help(package = "quanteda") 131 | 132 | 133 | # Tokenize SMS text messages. 134 | train.tokens <- tokens(train$Text, what = "word", 135 | remove_numbers = TRUE, remove_punct = TRUE, 136 | remove_symbols = TRUE, remove_hyphens = TRUE) 137 | 138 | # Take a look at a specific SMS message and see how it transforms. 
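#
# OPTIONAL - A toy sentence (not from the data set) showing what the tokens()
#            options above do before any further processing. The exact output
#            can vary slightly by quanteda version.
#
tokens("Win a $1000 prize!!! Txt WIN to 80082 now",
       what = "word", remove_numbers = TRUE, remove_punct = TRUE,
       remove_symbols = TRUE, remove_hyphens = TRUE)
# Roughly: "Win" "a" "prize" "Txt" "WIN" "to" "now" - the numbers, currency
# symbol, and punctuation are gone; casing is handled next by tokens_tolower().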
139 | train.tokens[[357]] 140 | 141 | 142 | # Lower case the tokens. 143 | train.tokens <- tokens_tolower(train.tokens) 144 | train.tokens[[357]] 145 | 146 | 147 | # Use quanteda's built-in stopword list for English. 148 | # NOTE - You should always inspect stopword lists for applicability to 149 | # your problem/domain. 150 | train.tokens <- tokens_select(train.tokens, stopwords(), 151 | selection = "remove") 152 | train.tokens[[357]] 153 | 154 | 155 | # Perform stemming on the tokens. 156 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 157 | train.tokens[[357]] 158 | 159 | 160 | # Create our first bag-of-words model. 161 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 162 | 163 | 164 | # Transform to a matrix and inspect. 165 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 166 | View(train.tokens.matrix[1:20, 1:100]) 167 | dim(train.tokens.matrix) 168 | 169 | 170 | # Investigate the effects of stemming. 171 | colnames(train.tokens.matrix)[1:50] 172 | 173 | 174 | # Per best practices, we will leverage cross validation (CV) as 175 | # the basis of our modeling process. Using CV we can create 176 | # estimates of how well our model will do in Production on new, 177 | # unseen data. CV is powerful, but the downside is that it 178 | # requires more processing and therefore more time. 179 | # 180 | # If you are not familiar with CV, consult the following 181 | # Wikipedia article: 182 | # 183 | # https://en.wikipedia.org/wiki/Cross-validation_(statistics) 184 | # 185 | 186 | # Setup a the feature data frame with labels. 187 | train.tokens.df <- cbind(Label = train$Label, data.frame(train.tokens.dfm)) 188 | 189 | 190 | # Often, tokenization requires some additional pre-processing 191 | names(train.tokens.df)[c(146, 148, 235, 238)] 192 | 193 | 194 | # Cleanup column names. 195 | names(train.tokens.df) <- make.names(names(train.tokens.df)) 196 | 197 | 198 | # Use caret to create stratified folds for 10-fold cross validation repeated 199 | # 3 times (i.e., create 30 random stratified samples) 200 | set.seed(48743) 201 | cv.folds <- createMultiFolds(train$Label, k = 10, times = 3) 202 | 203 | cv.cntrl <- trainControl(method = "repeatedcv", number = 10, 204 | repeats = 3, index = cv.folds) 205 | 206 | 207 | # Our data frame is non-trivial in size. As such, CV runs will take 208 | # quite a long time to run. To cut down on total execution time, use 209 | # the doSNOW package to allow for multi-core training in parallel. 210 | # 211 | # WARNING - The following code is configured to run on a workstation- 212 | # or server-class machine (i.e., 12 logical cores). Alter 213 | # code to suit your HW environment. 214 | # 215 | #install.packages("doSNOW") 216 | library(doSNOW) 217 | 218 | 219 | # Time the code execution 220 | start.time <- Sys.time() 221 | 222 | 223 | # Create a cluster to work on 10 logical cores. 224 | cl <- makeCluster(10, type = "SOCK") 225 | registerDoSNOW(cl) 226 | 227 | 228 | # As our data is non-trivial in size at this point, use a single decision 229 | # tree alogrithm as our first model. We will graduate to using more 230 | # powerful algorithms later when we perform feature extraction to shrink 231 | # the size of our data. 232 | rpart.cv.1 <- train(Label ~ ., data = train.tokens.df, method = "rpart", 233 | trControl = cv.cntrl, tuneLength = 7) 234 | 235 | 236 | # Processing is done, stop cluster. 237 | stopCluster(cl) 238 | 239 | 240 | # Total time of execution on workstation was approximately 4 minutes. 
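#
# OPTIONAL - The ~4 minute figure above assumes a 10-core cluster. If you are
#            following along on a laptop, a safer way to size the later
#            makeCluster() calls (instead of hard-coding 10) is to ask R how
#            many logical cores exist and leave a couple free for the OS.
#            This is a suggested tweak, not part of the original tutorial.
#
library(parallel)
n.cores <- max(1, detectCores(logical = TRUE) - 2)
# cl <- makeCluster(n.cores, type = "SOCK")
# registerDoSNOW(cl)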
241 | total.time <- Sys.time() - start.time 242 | total.time 243 | 244 | 245 | # Check out our results. 246 | rpart.cv.1 247 | 248 | 249 | 250 | # The use of Term Frequency-Inverse Document Frequency (TF-IDF) is a 251 | # powerful technique for enhancing the information/signal contained 252 | # within our document-frequency matrix. Specifically, the mathematics 253 | # behind TF-IDF accomplish the following goals: 254 | # 1 - The TF calculation accounts for the fact that longer 255 | # documents will have higher individual term counts. Applying 256 | # TF normalizes all documents in the corpus to be length 257 | # independent. 258 | # 2 - The IDF calculation accounts for the frequency of term 259 | # appearance in all documents in the corpus. The intuition 260 | # being that a term that appears in every document has no 261 | # predictive power. 262 | # 3 - The multiplication of TF by IDF for each cell in the matrix 263 | # allows for weighting of #1 and #2 for each cell in the matrix. 264 | 265 | 266 | # Our function for calculating relative term frequency (TF) 267 | term.frequency <- function(row) { 268 | row / sum(row) 269 | } 270 | 271 | # Our function for calculating inverse document frequency (IDF) 272 | inverse.doc.freq <- function(col) { 273 | corpus.size <- length(col) 274 | doc.count <- length(which(col > 0)) 275 | 276 | log10(corpus.size / doc.count) 277 | } 278 | 279 | # Our function for calculating TF-IDF. 280 | tf.idf <- function(x, idf) { 281 | x * idf 282 | } 283 | 284 | 285 | # First step, normalize all documents via TF. 286 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 287 | dim(train.tokens.df) 288 | View(train.tokens.df[1:20, 1:100]) 289 | 290 | 291 | # Second step, calculate the IDF vector that we will use - both 292 | # for training data and for test data! 293 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 294 | str(train.tokens.idf) 295 | 296 | 297 | # Lastly, calculate TF-IDF for our training corpus. 298 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, idf = train.tokens.idf) 299 | dim(train.tokens.tfidf) 300 | View(train.tokens.tfidf[1:25, 1:25]) 301 | 302 | 303 | # Transpose the matrix 304 | train.tokens.tfidf <- t(train.tokens.tfidf) 305 | dim(train.tokens.tfidf) 306 | View(train.tokens.tfidf[1:25, 1:25]) 307 | 308 | 309 | # Check for incopmlete cases. 310 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 311 | train$Text[incomplete.cases] 312 | 313 | 314 | # Fix incomplete cases 315 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 316 | dim(train.tokens.tfidf) 317 | sum(which(!complete.cases(train.tokens.tfidf))) 318 | 319 | 320 | # Make a clean data frame using the same process as before. 321 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 322 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 323 | 324 | 325 | # Time the code execution 326 | start.time <- Sys.time() 327 | 328 | # Create a cluster to work on 10 logical cores. 329 | cl <- makeCluster(10, type = "SOCK") 330 | registerDoSNOW(cl) 331 | 332 | # As our data is non-trivial in size at this point, use a single decision 333 | # tree alogrithm as our first model. We will graduate to using more 334 | # powerful algorithms later when we perform feature extraction to shrink 335 | # the size of our data. 
336 | rpart.cv.2 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 337 | trControl = cv.cntrl, tuneLength = 7) 338 | 339 | # Processing is done, stop cluster. 340 | stopCluster(cl) 341 | 342 | # Total time of execution on workstation was 343 | total.time <- Sys.time() - start.time 344 | total.time 345 | 346 | # Check out our results. 347 | rpart.cv.2 348 | 349 | 350 | 351 | # N-grams allow us to augment our document-term frequency matrices with 352 | # word ordering. This often leads to increased performance (e.g., accuracy) 353 | # for machine learning models trained with more than just unigrams (i.e., 354 | # single terms). Let's add bigrams to our training data and the TF-IDF 355 | # transform the expanded featre matrix to see if accuracy improves. 356 | 357 | # Add bigrams to our feature matrix. 358 | train.tokens <- tokens_ngrams(train.tokens, n = 1:2) 359 | train.tokens[[357]] 360 | 361 | 362 | # Transform to dfm and then a matrix. 363 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 364 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 365 | train.tokens.dfm 366 | 367 | 368 | # Normalize all documents via TF. 369 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 370 | 371 | 372 | # Calculate the IDF vector that we will use for training and test data! 373 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 374 | 375 | 376 | # Calculate TF-IDF for our training corpus 377 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, 378 | idf = train.tokens.idf) 379 | 380 | 381 | # Transpose the matrix 382 | train.tokens.tfidf <- t(train.tokens.tfidf) 383 | 384 | 385 | # Fix incomplete cases 386 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 387 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 388 | 389 | 390 | # Make a clean data frame. 391 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 392 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 393 | 394 | 395 | # Clean up unused objects in memory. 396 | gc() 397 | 398 | 399 | 400 | 401 | # 402 | # NOTE - The following code requires the use of command-line R to execute 403 | # due to the large number of features (i.e., columns) in the matrix. 404 | # Please consult the following link for more details if you wish 405 | # to run the code yourself: 406 | # 407 | # https://stackoverflow.com/questions/28728774/how-to-set-max-ppsize-in-r 408 | # 409 | # Also note that running the following code required approximately 410 | # 38GB of RAM and more than 4.5 hours to execute on a 10-core 411 | # workstation! 412 | # 413 | 414 | 415 | # Time the code execution 416 | # start.time <- Sys.time() 417 | 418 | # Leverage single decision trees to evaluate if adding bigrams improves the 419 | # the effectiveness of the model. 420 | # rpart.cv.3 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 421 | # trControl = cv.cntrl, tuneLength = 7) 422 | 423 | # Total time of execution on workstation was 424 | # total.time <- Sys.time() - start.time 425 | # total.time 426 | 427 | # Check out our results. 428 | # rpart.cv.3 429 | 430 | # 431 | # The results of the above processing show a slight decline in rpart 432 | # effectiveness with a 10-fold CV repeated 3 times accuracy of 0.9457. 433 | # As we will discuss later, while the addition of bigrams appears to 434 | # negatively impact a single decision tree, it helps with the mighty 435 | # random forest! 
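#
# OPTIONAL - A tiny, standalone look (not in the original video) at what the
#            tokens_ngrams() call earlier in this section produces. With
#            n = 1:2 each document keeps its unigrams and gains bigram
#            features concatenated with an underscore.
#
toy.tokens <- tokens("free entry in a weekly competition")
tokens_ngrams(toy.tokens, n = 1:2)
# text1 contains the unigrams ("free", "entry", ...) plus bigrams such as
# "free_entry", "entry_in", "in_a", and so on.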
436 | # 437 | 438 | 439 | 440 | 441 | # We'll leverage the irlba package for our singular value 442 | # decomposition (SVD). The irlba package allows us to specify 443 | # the number of the most important singular vectors we wish to 444 | # calculate and retain for features. 445 | library(irlba) 446 | 447 | 448 | # Time the code execution 449 | start.time <- Sys.time() 450 | 451 | # Perform SVD. Specifically, reduce dimensionality down to 300 columns 452 | # for our latent semantic analysis (LSA). 453 | train.irlba <- irlba(t(train.tokens.tfidf), nv = 300, maxit = 600) 454 | 455 | # Total time of execution on workstation was 456 | total.time <- Sys.time() - start.time 457 | total.time 458 | 459 | 460 | # Take a look at the new feature data up close. 461 | View(train.irlba$v) 462 | 463 | 464 | # As with TF-IDF, we will need to project new data (e.g., the test data) 465 | # into the SVD semantic space. The following code illustrates how to do 466 | # this using a row of the training data that has already been transformed 467 | # by TF-IDF, per the mathematics illustrated in the slides. 468 | # 469 | # 470 | sigma.inverse <- 1 / train.irlba$d 471 | u.transpose <- t(train.irlba$u) 472 | document <- train.tokens.tfidf[1,] 473 | document.hat <- sigma.inverse * u.transpose %*% document 474 | 475 | # Look at the first 10 components of projected document and the corresponding 476 | # row in our document semantic space (i.e., the V matrix) 477 | document.hat[1:10] 478 | train.irlba$v[1, 1:10] 479 | 480 | 481 | 482 | # 483 | # Create new feature data frame using our document semantic space of 300 484 | # features (i.e., the V matrix from our SVD). 485 | # 486 | train.svd <- data.frame(Label = train$Label, train.irlba$v) 487 | 488 | 489 | # Create a cluster to work on 10 logical cores. 490 | cl <- makeCluster(10, type = "SOCK") 491 | registerDoSNOW(cl) 492 | 493 | # Time the code execution 494 | start.time <- Sys.time() 495 | 496 | # This will be the last run using single decision trees. With a much smaller 497 | # feature matrix we can now use more powerful methods like the mighty Random 498 | # Forest from now on! 499 | rpart.cv.4 <- train(Label ~ ., data = train.svd, method = "rpart", 500 | trControl = cv.cntrl, tuneLength = 7) 501 | 502 | # Processing is done, stop cluster. 503 | stopCluster(cl) 504 | 505 | # Total time of execution on workstation was 506 | total.time <- Sys.time() - start.time 507 | total.time 508 | 509 | # Check out our results. 510 | rpart.cv.4 511 | 512 | 513 | 514 | 515 | # 516 | # NOTE - The following code takes a long time to run. Here's the math. 517 | # We are performing 10-fold CV repeated 3 times. That means we 518 | # need to build 30 models. We are also asking caret to try 7 519 | # different values of the mtry parameter. Next up by default 520 | # a mighty random forest leverages 500 trees. Lastly, caret will 521 | # build 1 final model at the end of the process with the best 522 | # mtry value over all the training data. Here's the number of 523 | # tree we're building: 524 | # 525 | # (10 * 3 * 7 * 500) + 500 = 105,500 trees! 526 | # 527 | # On a workstation using 10 cores the following code took 28 minutes 528 | # to execute. 529 | # 530 | 531 | 532 | # Create a cluster to work on 10 logical cores. 533 | # cl <- makeCluster(10, type = "SOCK") 534 | # registerDoSNOW(cl) 535 | 536 | # Time the code execution 537 | # start.time <- Sys.time() 538 | 539 | # We have reduced the dimensionality of our data using SVD. 
Also, the 540 | # application of SVD allows us to use LSA to simultaneously increase the 541 | # information density of each feature. To prove this out, leverage a 542 | # mighty Random Forest with the default of 500 trees. We'll also ask 543 | # caret to try 7 different values of mtry to find the mtry value that 544 | # gives the best result! 545 | # rf.cv.1 <- train(Label ~ ., data = train.svd, method = "rf", 546 | # trControl = cv.cntrl, tuneLength = 7) 547 | 548 | # Processing is done, stop cluster. 549 | # stopCluster(cl) 550 | 551 | # Total time of execution on workstation was 552 | # total.time <- Sys.time() - start.time 553 | # total.time 554 | 555 | 556 | # Load processing results from disk! 557 | load("rf.cv.1.RData") 558 | 559 | # Check out our results. 560 | rf.cv.1 561 | 562 | # Let's drill-down on the results. 563 | confusionMatrix(train.svd$Label, rf.cv.1$finalModel$predicted) 564 | 565 | 566 | 567 | 568 | 569 | # OK, now let's add in the feature we engineered previously for SMS 570 | # text length to see if it improves things. 571 | train.svd$TextLength <- train$TextLength 572 | 573 | 574 | # Create a cluster to work on 10 logical cores. 575 | # cl <- makeCluster(10, type = "SOCK") 576 | # registerDoSNOW(cl) 577 | 578 | # Time the code execution 579 | # start.time <- Sys.time() 580 | 581 | # Re-run the training process with the additional feature. 582 | # rf.cv.2 <- train(Label ~ ., data = train.svd, method = "rf", 583 | # trControl = cv.cntrl, tuneLength = 7, 584 | # importance = TRUE) 585 | 586 | # Processing is done, stop cluster. 587 | # stopCluster(cl) 588 | 589 | # Total time of execution on workstation was 590 | # total.time <- Sys.time() - start.time 591 | # total.time 592 | 593 | # Load results from disk. 594 | load("rf.cv.2.RData") 595 | 596 | # Check the results. 597 | rf.cv.2 598 | 599 | # Drill-down on the results. 600 | confusionMatrix(train.svd$Label, rf.cv.2$finalModel$predicted) 601 | 602 | # How important was the new feature? 603 | library(randomForest) 604 | varImpPlot(rf.cv.1$finalModel) 605 | varImpPlot(rf.cv.2$finalModel) 606 | 607 | 608 | 609 | 610 | # Turns out that our TextLength feature is very predictive and pushed our 611 | # overall accuracy over the training data to 97.1%. We can also use the 612 | # power of cosine similarity to engineer a feature for calculating, on 613 | # average, how alike each SMS text message is to all of the spam messages. 614 | # The hypothesis here is that our use of bigrams, tf-idf, and LSA have 615 | # produced a representation where ham SMS messages should have low cosine 616 | # similarities with spam SMS messages and vice versa. 617 | 618 | # Use the lsa package's cosine function for our calculations. 619 | #install.packages("lsa") 620 | library(lsa) 621 | 622 | train.similarities <- cosine(t(as.matrix(train.svd[, -c(1, ncol(train.svd))]))) 623 | 624 | 625 | # Next up - take each SMS text message and find what the mean cosine 626 | # similarity is for each SMS text mean with each of the spam SMS messages. 627 | # Per our hypothesis, ham SMS text messages should have relatively low 628 | # cosine similarities with spam messages and vice versa! 
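#
# OPTIONAL - A minimal sketch of the mean-cosine-similarity idea using
#            made-up numbers rather than the real SVD features. lsa's
#            cosine() expects variables in columns, which is why the real
#            code above and below transposes with t().
#
toy.docs <- rbind(ham1  = c(0.9, 0.1, 0.0),
                  ham2  = c(0.8, 0.2, 0.1),
                  spam1 = c(0.1, 0.9, 0.8),
                  spam2 = c(0.0, 0.8, 0.9))
toy.sims <- cosine(t(toy.docs))    # 4 x 4 document-by-document similarities
toy.spam.idx <- 3:4                # rows we pretend are spam
rowMeans(toy.sims[, toy.spam.idx]) # "ham" rows score low, "spam" rows high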
629 | spam.indexes <- which(train$Label == "spam") 630 | 631 | train.svd$SpamSimilarity <- rep(0.0, nrow(train.svd)) 632 | for(i in 1:nrow(train.svd)) { 633 | train.svd$SpamSimilarity[i] <- mean(train.similarities[i, spam.indexes]) 634 | } 635 | 636 | 637 | # As always, let's visualize our results using the mighty ggplot2 638 | ggplot(train.svd, aes(x = SpamSimilarity, fill = Label)) + 639 | theme_bw() + 640 | geom_histogram(binwidth = 0.05) + 641 | labs(y = "Message Count", 642 | x = "Mean Spam Message Cosine Similarity", 643 | title = "Distribution of Ham vs. Spam Using Spam Cosine Similarity") 644 | 645 | 646 | # Per our analysis of mighty random forest results, we are interested in 647 | # in features that can raise model performance with respect to sensitivity. 648 | # Perform another CV process using the new spam cosine similarity feature. 649 | 650 | # Create a cluster to work on 10 logical cores. 651 | # cl <- makeCluster(10, type = "SOCK") 652 | # registerDoSNOW(cl) 653 | 654 | # Time the code execution 655 | # start.time <- Sys.time() 656 | 657 | # Re-run the training process with the additional feature. 658 | # set.seed(932847) 659 | # rf.cv.3 <- train(Label ~ ., data = train.svd, method = "rf", 660 | # trControl = cv.cntrl, tuneLength = 7, 661 | # importance = TRUE) 662 | 663 | # Processing is done, stop cluster. 664 | # stopCluster(cl) 665 | 666 | # Total time of execution on workstation was 667 | # total.time <- Sys.time() - start.time 668 | # total.time 669 | 670 | 671 | # Load results from disk. 672 | load("rf.cv.3.RData") 673 | 674 | # Check the results. 675 | rf.cv.3 676 | 677 | # Drill-down on the results. 678 | confusionMatrix(train.svd$Label, rf.cv.3$finalModel$predicted) 679 | 680 | # How important was this feature? 681 | library(randomForest) 682 | varImpPlot(rf.cv.3$finalModel) 683 | 684 | 685 | 686 | 687 | # We've built what appears to be an effective predictive model. Time to verify 688 | # using the test holdout data we set aside at the beginning of the project. 689 | # First stage of this verification is running the test data through our pre- 690 | # processing pipeline of: 691 | # 1 - Tokenization 692 | # 2 - Lower casing 693 | # 3 - Stopword removal 694 | # 4 - Stemming 695 | # 5 - Adding bigrams 696 | # 6 - Transform to dfm 697 | # 7 - Ensure test dfm has same features as train dfm 698 | 699 | # Tokenization. 700 | test.tokens <- tokens(test$Text, what = "word", 701 | remove_numbers = TRUE, remove_punct = TRUE, 702 | remove_symbols = TRUE, remove_hyphens = TRUE) 703 | 704 | # Lower case the tokens. 705 | test.tokens <- tokens_tolower(test.tokens) 706 | 707 | # Stopword removal. 708 | test.tokens <- tokens_select(test.tokens, stopwords(), 709 | selection = "remove") 710 | 711 | # Stemming. 712 | test.tokens <- tokens_wordstem(test.tokens, language = "english") 713 | 714 | # Add bigrams. 715 | test.tokens <- tokens_ngrams(test.tokens, n = 1:2) 716 | 717 | # Convert n-grams to quanteda document-term frequency matrix. 718 | test.tokens.dfm <- dfm(test.tokens, tolower = FALSE) 719 | 720 | # Explore the train and test quanteda dfm objects. 721 | train.tokens.dfm 722 | test.tokens.dfm 723 | 724 | # Ensure the test dfm has the same n-grams as the training dfm. 725 | # 726 | # NOTE - In production we should expect that new text messages will 727 | # contain n-grams that did not exist in the original training 728 | # data. As such, we need to strip those n-grams out. 
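#
# OPTIONAL - A toy illustration (not in the original video) of the feature
#            alignment performed below: when the pattern argument of
#            dfm_select() is itself a dfm, only features present in that
#            reference dfm are kept. The example sentences are made up.
#
toy.train.dfm <- dfm(tokens(c("win a free prize", "see you at lunch")))
toy.test.dfm <- dfm(tokens("free prize draw tomorrow"))
dfm_select(toy.test.dfm, pattern = toy.train.dfm, selection = "keep")
# Only "free" and "prize" survive; the unseen terms "draw" and "tomorrow"
# are dropped, which is exactly what we want for scoring new messages.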
729 | # 730 | test.tokens.dfm <- dfm_select(test.tokens.dfm, pattern = train.tokens.dfm, 731 | selection = "keep") 732 | test.tokens.matrix <- as.matrix(test.tokens.dfm) 733 | test.tokens.dfm 734 | 735 | 736 | 737 | 738 | # With the raw test features in place, next up is projecting the term 739 | # counts for the unigrams into the same TF-IDF vector space as our training 740 | # data. The high-level process is as follows: 741 | # 1 - Normalize each document (i.e., each row) 742 | # 2 - Perform IDF multiplication using training IDF values 743 | 744 | # Normalize all documents via TF. 745 | test.tokens.df <- apply(test.tokens.matrix, 1, term.frequency) 746 | str(test.tokens.df) 747 | 748 | # Next, calculate TF-IDF for our test corpus using the training IDF values. 749 | test.tokens.tfidf <- apply(test.tokens.df, 2, tf.idf, idf = train.tokens.idf) 750 | dim(test.tokens.tfidf) 751 | View(test.tokens.tfidf[1:25, 1:25]) 752 | 753 | # Transpose the matrix 754 | test.tokens.tfidf <- t(test.tokens.tfidf) 755 | 756 | # Fix incomplete cases 757 | summary(test.tokens.tfidf[1,]) 758 | test.tokens.tfidf[is.na(test.tokens.tfidf)] <- 0.0 759 | summary(test.tokens.tfidf[1,]) 760 | 761 | 762 | 763 | 764 | # With the test data projected into the TF-IDF vector space of the training 765 | # data, we can now do the final projection into the training LSA semantic 766 | # space (i.e., the SVD matrix factorization). 767 | test.svd.raw <- t(sigma.inverse * u.transpose %*% t(test.tokens.tfidf)) 768 | 769 | 770 | # Lastly, we can now build the test data frame to feed into our trained 771 | # machine learning model for predictions. First up, add Label and TextLength. 772 | test.svd <- data.frame(Label = test$Label, test.svd.raw, 773 | TextLength = test$TextLength) 774 | 775 | 776 | # Next step, calculate SpamSimilarity for all the test documents. First up, 777 | # create a spam similarity matrix. 778 | test.similarities <- rbind(test.svd.raw, train.irlba$v[spam.indexes,]) 779 | test.similarities <- cosine(t(test.similarities)) 780 | 781 | 782 | # 783 | # NOTE - The following code was updated post-video recording due to a bug. 784 | # 785 | test.svd$SpamSimilarity <- rep(0.0, nrow(test.svd)) 786 | spam.cols <- (nrow(test.svd) + 1):ncol(test.similarities) 787 | for(i in 1:nrow(test.svd)) { 788 | # The following line has the bug fix. 789 | test.svd$SpamSimilarity[i] <- mean(test.similarities[i, spam.cols]) 790 | } 791 | 792 | 793 | # Some SMS text messages become empty as a result of stopword and special 794 | # character removal. This results in spam similarity measures that are not 795 | # finite, so correct them here. This code was added post-video as part of the bug fix. 796 | test.svd$SpamSimilarity[!is.finite(test.svd$SpamSimilarity)] <- 0 797 | 798 | 799 | # Now we can make predictions on the test data set using our trained mighty 800 | # random forest. 801 | preds <- predict(rf.cv.3, test.svd) 802 | 803 | 804 | # Drill-in on results 805 | confusionMatrix(preds, test.svd$Label) 806 | 807 | 808 | 809 | 810 | # Overfitting means doing far better on the training data (as evidenced 811 | # by CV) than on a hold-out dataset (i.e., our test dataset). 812 | # One potential explanation of this overfitting is the use of the spam similarity 813 | # feature. The hypothesis here is that spam features (i.e., text content) vary 814 | # highly, especially over time. As such, our average spam cosine similarity 815 | # is likely to overfit to the training data. To combat this, let's rebuild a 816 | # mighty random forest without the spam similarity feature.
817 | train.svd$SpamSimilarity <- NULL 818 | test.svd$SpamSimilarity <- NULL 819 | 820 | 821 | # Create a cluster to work on 10 logical cores. 822 | # cl <- makeCluster(10, type = "SOCK") 823 | # registerDoSNOW(cl) 824 | 825 | # Time the code execution 826 | # start.time <- Sys.time() 827 | 828 | # Re-run the training process with the additional feature. 829 | # set.seed(254812) 830 | # rf.cv.4 <- train(Label ~ ., data = train.svd, method = "rf", 831 | # trControl = cv.cntrl, tuneLength = 7, 832 | # importance = TRUE) 833 | 834 | # Processing is done, stop cluster. 835 | # stopCluster(cl) 836 | 837 | # Total time of execution on workstation was 838 | # total.time <- Sys.time() - start.time 839 | # total.time 840 | 841 | 842 | # Load results from disk. 843 | load("rf.cv.4.RData") 844 | 845 | 846 | # Make predictions and drill-in on the results 847 | preds <- predict(rf.cv.4, test.svd) 848 | confusionMatrix(preds, test.svd$Label) 849 | 850 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part2.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 2 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=Y7385dGRNLM 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | 39 | # Clean up the data frame and view our handiwork. 40 | spam.raw <- spam.raw[, 1:2] 41 | names(spam.raw) <- c("Label", "Text") 42 | View(spam.raw) 43 | 44 | 45 | 46 | # Check data to see if there are missing values. 47 | length(which(!complete.cases(spam.raw))) 48 | 49 | 50 | 51 | # Convert our class label into a factor. 52 | spam.raw$Label <- as.factor(spam.raw$Label) 53 | 54 | 55 | 56 | # The first step, as always, is to explore the data. 57 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 58 | prop.table(table(spam.raw$Label)) 59 | 60 | 61 | 62 | # Next up, let's get a feel for the distribution of text lengths of the SMS 63 | # messages by adding a new feature for the length of each message. 64 | spam.raw$TextLength <- nchar(spam.raw$Text) 65 | summary(spam.raw$TextLength) 66 | 67 | 68 | 69 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 
70 | library(ggplot2) 71 | 72 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 73 | theme_bw() + 74 | geom_histogram(binwidth = 5) + 75 | labs(y = "Text Count", x = "Length of Text", 76 | title = "Distribution of Text Lengths with Class Labels") 77 | 78 | 79 | 80 | # At a minimum we need to split our data into a training set and a 81 | # test set. In a true project we would want to use a three-way split 82 | # of training, validation, and test. 83 | # 84 | # As we know that our data has non-trivial class imbalance, we'll 85 | # use the mighty caret package to create a randomg train/test split 86 | # that ensures the correct ham/spam class label proportions (i.e., 87 | # we'll use caret for a random stratified split). 88 | library(caret) 89 | help(package = "caret") 90 | 91 | 92 | # Use caret to create a 70%/30% stratified split. Set the random 93 | # seed for reproducibility. 94 | set.seed(32984) 95 | indexes <- createDataPartition(spam.raw$Label, times = 1, 96 | p = 0.7, list = FALSE) 97 | 98 | train <- spam.raw[indexes,] 99 | test <- spam.raw[-indexes,] 100 | 101 | 102 | # Verify proportions. 103 | prop.table(table(train$Label)) 104 | prop.table(table(test$Label)) 105 | 106 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part3.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 3 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=CQsyVDxK7_g 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | 39 | # Clean up the data frame and view our handiwork. 40 | spam.raw <- spam.raw[, 1:2] 41 | names(spam.raw) <- c("Label", "Text") 42 | View(spam.raw) 43 | 44 | 45 | 46 | # Check data to see if there are missing values. 47 | length(which(!complete.cases(spam.raw))) 48 | 49 | 50 | 51 | # Convert our class label into a factor. 52 | spam.raw$Label <- as.factor(spam.raw$Label) 53 | 54 | 55 | 56 | # The first step, as always, is to explore the data. 57 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 58 | prop.table(table(spam.raw$Label)) 59 | 60 | 61 | 62 | # Next up, let's get a feel for the distribution of text lengths of the SMS 63 | # messages by adding a new feature for the length of each message. 
64 | spam.raw$TextLength <- nchar(spam.raw$Text) 65 | summary(spam.raw$TextLength) 66 | 67 | 68 | 69 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 70 | library(ggplot2) 71 | 72 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 73 | theme_bw() + 74 | geom_histogram(binwidth = 5) + 75 | labs(y = "Text Count", x = "Length of Text", 76 | title = "Distribution of Text Lengths with Class Labels") 77 | 78 | 79 | 80 | # At a minimum we need to split our data into a training set and a 81 | # test set. In a true project we would want to use a three-way split 82 | # of training, validation, and test. 83 | # 84 | # As we know that our data has non-trivial class imbalance, we'll 85 | # use the mighty caret package to create a randomg train/test split 86 | # that ensures the correct ham/spam class label proportions (i.e., 87 | # we'll use caret for a random stratified split). 88 | library(caret) 89 | help(package = "caret") 90 | 91 | 92 | # Use caret to create a 70%/30% stratified split. Set the random 93 | # seed for reproducibility. 94 | set.seed(32984) 95 | indexes <- createDataPartition(spam.raw$Label, times = 1, 96 | p = 0.7, list = FALSE) 97 | 98 | train <- spam.raw[indexes,] 99 | test <- spam.raw[-indexes,] 100 | 101 | 102 | # Verify proportions. 103 | prop.table(table(train$Label)) 104 | prop.table(table(test$Label)) 105 | 106 | 107 | 108 | # Text analytics requires a lot of data exploration, data pre-processing 109 | # and data wrangling. Let's explore some examples. 110 | 111 | # HTML-escaped ampersand character. 112 | train$Text[21] 113 | 114 | 115 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 116 | # is an actual person, but we will ignore the implications of this for 117 | # this introductory tutorial. 118 | train$Text[38] 119 | 120 | 121 | # A URL. 122 | train$Text[357] 123 | 124 | 125 | 126 | # There are many packages in the R ecosystem for performing text 127 | # analytics. One of the newer packages in quanteda. The quanteda 128 | # package has many useful functions for quickly and easily working 129 | # with text data. 130 | library(quanteda) 131 | help(package = "quanteda") 132 | 133 | 134 | # Tokenize SMS text messages. 135 | train.tokens <- tokens(train$Text, what = "word", 136 | remove_numbers = TRUE, remove_punct = TRUE, 137 | remove_symbols = TRUE, remove_hyphens = TRUE) 138 | 139 | # Take a look at a specific SMS message and see how it transforms. 140 | train.tokens[[357]] 141 | 142 | 143 | # Lower case the tokens. 144 | train.tokens <- tokens_tolower(train.tokens) 145 | train.tokens[[357]] 146 | 147 | 148 | # Use quanteda's built-in stopword list for English. 149 | # NOTE - You should always inspect stopword lists for applicability to 150 | # your problem/domain. 151 | train.tokens <- tokens_select(train.tokens, stopwords(), 152 | selection = "remove") 153 | train.tokens[[357]] 154 | 155 | 156 | # Perform stemming on the tokens. 157 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 158 | train.tokens[[357]] 159 | 160 | 161 | # Create our first bag-of-words model. 162 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 163 | 164 | 165 | # Transform to a matrix and inspect. 166 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 167 | View(train.tokens.matrix[1:20, 1:100]) 168 | dim(train.tokens.matrix) 169 | 170 | 171 | # Investigate the effects of stemming. 
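#
# OPTIONAL - A small illustration (not in the original video) of what the
#            stemmer does: related word forms collapse onto a single column
#            of the document-frequency matrix.
#
tokens_wordstem(tokens("connect connected connecting connection"),
                language = "english")
# All four forms reduce to the single stem "connect".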
172 | colnames(train.tokens.matrix)[1:50] 173 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part4.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 4 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=IFhDlHKRHno 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | 39 | # Clean up the data frame and view our handiwork. 40 | spam.raw <- spam.raw[, 1:2] 41 | names(spam.raw) <- c("Label", "Text") 42 | View(spam.raw) 43 | 44 | 45 | 46 | # Check data to see if there are missing values. 47 | length(which(!complete.cases(spam.raw))) 48 | 49 | 50 | 51 | # Convert our class label into a factor. 52 | spam.raw$Label <- as.factor(spam.raw$Label) 53 | 54 | 55 | 56 | # The first step, as always, is to explore the data. 57 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 58 | prop.table(table(spam.raw$Label)) 59 | 60 | 61 | 62 | # Next up, let's get a feel for the distribution of text lengths of the SMS 63 | # messages by adding a new feature for the length of each message. 64 | spam.raw$TextLength <- nchar(spam.raw$Text) 65 | summary(spam.raw$TextLength) 66 | 67 | 68 | 69 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 70 | library(ggplot2) 71 | 72 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 73 | theme_bw() + 74 | geom_histogram(binwidth = 5) + 75 | labs(y = "Text Count", x = "Length of Text", 76 | title = "Distribution of Text Lengths with Class Labels") 77 | 78 | 79 | 80 | # At a minimum we need to split our data into a training set and a 81 | # test set. In a true project we would want to use a three-way split 82 | # of training, validation, and test. 83 | # 84 | # As we know that our data has non-trivial class imbalance, we'll 85 | # use the mighty caret package to create a randomg train/test split 86 | # that ensures the correct ham/spam class label proportions (i.e., 87 | # we'll use caret for a random stratified split). 88 | library(caret) 89 | help(package = "caret") 90 | 91 | 92 | # Use caret to create a 70%/30% stratified split. Set the random 93 | # seed for reproducibility. 94 | set.seed(32984) 95 | indexes <- createDataPartition(spam.raw$Label, times = 1, 96 | p = 0.7, list = FALSE) 97 | 98 | train <- spam.raw[indexes,] 99 | test <- spam.raw[-indexes,] 100 | 101 | 102 | # Verify proportions. 
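#
# OPTIONAL - Why stratify? A quick toy demonstration (not in the original
#            video): with a 90/10 imbalanced label, createDataPartition()
#            keeps roughly the same class mix in the 70% sample.
#
toy.labels <- factor(rep(c("ham", "spam"), times = c(90, 10)))
toy.idx <- createDataPartition(toy.labels, times = 1, p = 0.7, list = FALSE)
prop.table(table(toy.labels[toy.idx]))   # roughly 0.9 ham / 0.1 spam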
103 | prop.table(table(train$Label)) 104 | prop.table(table(test$Label)) 105 | 106 | 107 | 108 | # Text analytics requires a lot of data exploration, data pre-processing 109 | # and data wrangling. Let's explore some examples. 110 | 111 | # HTML-escaped ampersand character. 112 | train$Text[21] 113 | 114 | 115 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 116 | # is an actual person, but we will ignore the implications of this for 117 | # this introductory tutorial. 118 | train$Text[38] 119 | 120 | 121 | # A URL. 122 | train$Text[357] 123 | 124 | 125 | 126 | # There are many packages in the R ecosystem for performing text 127 | # analytics. One of the newer packages in quanteda. The quanteda 128 | # package has many useful functions for quickly and easily working 129 | # with text data. 130 | library(quanteda) 131 | help(package = "quanteda") 132 | 133 | 134 | # Tokenize SMS text messages. 135 | train.tokens <- tokens(train$Text, what = "word", 136 | remove_numbers = TRUE, remove_punct = TRUE, 137 | remove_symbols = TRUE, remove_hyphens = TRUE) 138 | 139 | # Take a look at a specific SMS message and see how it transforms. 140 | train.tokens[[357]] 141 | 142 | 143 | # Lower case the tokens. 144 | train.tokens <- tokens_tolower(train.tokens) 145 | train.tokens[[357]] 146 | 147 | 148 | # Use quanteda's built-in stopword list for English. 149 | # NOTE - You should always inspect stopword lists for applicability to 150 | # your problem/domain. 151 | train.tokens <- tokens_select(train.tokens, stopwords(), 152 | selection = "remove") 153 | train.tokens[[357]] 154 | 155 | 156 | # Perform stemming on the tokens. 157 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 158 | train.tokens[[357]] 159 | 160 | 161 | # Create our first bag-of-words model. 162 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 163 | 164 | 165 | # Transform to a matrix and inspect. 166 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 167 | View(train.tokens.matrix[1:20, 1:100]) 168 | dim(train.tokens.matrix) 169 | 170 | 171 | # Investigate the effects of stemming. 172 | colnames(train.tokens.matrix)[1:50] 173 | 174 | 175 | # Per best practices, we will leverage cross validation (CV) as 176 | # the basis of our modeling process. Using CV we can create 177 | # estimates of how well our model will do in Production on new, 178 | # unseen data. CV is powerful, but the downside is that it 179 | # requires more processing and therefore more time. 180 | # 181 | # If you are not familiar with CV, consult the following 182 | # Wikipedia article: 183 | # 184 | # https://en.wikipedia.org/wiki/Cross-validation_(statistics) 185 | # 186 | 187 | # Setup a the feature data frame with labels. 188 | train.tokens.df <- cbind(Label = train$Label, data.frame(train.tokens.dfm)) 189 | 190 | 191 | # Often, tokenization requires some additional pre-processing 192 | names(train.tokens.df)[c(146, 148, 235, 238)] 193 | 194 | 195 | # Cleanup column names. 196 | names(train.tokens.df) <- make.names(names(train.tokens.df)) 197 | 198 | 199 | # Use caret to create stratified folds for 10-fold cross validation repeated 200 | # 3 times (i.e., create 30 random stratified samples) 201 | set.seed(48743) 202 | cv.folds <- createMultiFolds(train$Label, k = 10, times = 3) 203 | 204 | cv.cntrl <- trainControl(method = "repeatedcv", number = 10, 205 | repeats = 3, index = cv.folds) 206 | 207 | 208 | # Our data frame is non-trivial in size. As such, CV runs will take 209 | # quite a long time to run. 
To cut down on total execution time, use 210 | # the doSNOW package to allow for multi-core training in parallel. 211 | # 212 | # WARNING - The following code is configured to run on a workstation- 213 | # or server-class machine (i.e., 12 logical cores). Alter 214 | # code to suit your HW environment. 215 | # 216 | #install.packages("doSNOW") 217 | library(doSNOW) 218 | 219 | 220 | # Time the code execution 221 | start.time <- Sys.time() 222 | 223 | 224 | # Create a cluster to work on 10 logical cores. 225 | cl <- makeCluster(10, type = "SOCK") 226 | registerDoSNOW(cl) 227 | 228 | 229 | # As our data is non-trivial in size at this point, use a single decision 230 | # tree alogrithm as our first model. We will graduate to using more 231 | # powerful algorithms later when we perform feature extraction to shrink 232 | # the size of our data. 233 | rpart.cv.1 <- train(Label ~ ., data = train.tokens.df, method = "rpart", 234 | trControl = cv.cntrl, tuneLength = 7) 235 | 236 | 237 | # Processing is done, stop cluster. 238 | stopCluster(cl) 239 | 240 | 241 | # Total time of execution on workstation was approximately 4 minutes. 242 | total.time <- Sys.time() - start.time 243 | total.time 244 | 245 | 246 | # Check out our results. 247 | rpart.cv.1 248 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part5.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 5 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=az7yf0IfWPM 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | 39 | # Clean up the data frame and view our handiwork. 40 | spam.raw <- spam.raw[, 1:2] 41 | names(spam.raw) <- c("Label", "Text") 42 | View(spam.raw) 43 | 44 | 45 | 46 | # Check data to see if there are missing values. 47 | length(which(!complete.cases(spam.raw))) 48 | 49 | 50 | 51 | # Convert our class label into a factor. 52 | spam.raw$Label <- as.factor(spam.raw$Label) 53 | 54 | 55 | 56 | # The first step, as always, is to explore the data. 57 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 58 | prop.table(table(spam.raw$Label)) 59 | 60 | 61 | 62 | # Next up, let's get a feel for the distribution of text lengths of the SMS 63 | # messages by adding a new feature for the length of each message. 
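# Note that nchar() counts characters (including spaces and punctuation),
# not words. Toy example with made-up strings:
nchar(c("Ok lar...", "WINNER!! Claim your prize now"))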
64 | spam.raw$TextLength <- nchar(spam.raw$Text) 65 | summary(spam.raw$TextLength) 66 | 67 | 68 | 69 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 70 | library(ggplot2) 71 | 72 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 73 | theme_bw() + 74 | geom_histogram(binwidth = 5) + 75 | labs(y = "Text Count", x = "Length of Text", 76 | title = "Distribution of Text Lengths with Class Labels") 77 | 78 | 79 | 80 | # At a minimum we need to split our data into a training set and a 81 | # test set. In a true project we would want to use a three-way split 82 | # of training, validation, and test. 83 | # 84 | # As we know that our data has non-trivial class imbalance, we'll 85 | # use the mighty caret package to create a randomg train/test split 86 | # that ensures the correct ham/spam class label proportions (i.e., 87 | # we'll use caret for a random stratified split). 88 | library(caret) 89 | help(package = "caret") 90 | 91 | 92 | # Use caret to create a 70%/30% stratified split. Set the random 93 | # seed for reproducibility. 94 | set.seed(32984) 95 | indexes <- createDataPartition(spam.raw$Label, times = 1, 96 | p = 0.7, list = FALSE) 97 | 98 | train <- spam.raw[indexes,] 99 | test <- spam.raw[-indexes,] 100 | 101 | 102 | # Verify proportions. 103 | prop.table(table(train$Label)) 104 | prop.table(table(test$Label)) 105 | 106 | 107 | 108 | # Text analytics requires a lot of data exploration, data pre-processing 109 | # and data wrangling. Let's explore some examples. 110 | 111 | # HTML-escaped ampersand character. 112 | train$Text[21] 113 | 114 | 115 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 116 | # is an actual person, but we will ignore the implications of this for 117 | # this introductory tutorial. 118 | train$Text[38] 119 | 120 | 121 | # A URL. 122 | train$Text[357] 123 | 124 | 125 | 126 | # There are many packages in the R ecosystem for performing text 127 | # analytics. One of the newer packages in quanteda. The quanteda 128 | # package has many useful functions for quickly and easily working 129 | # with text data. 130 | library(quanteda) 131 | help(package = "quanteda") 132 | 133 | 134 | # Tokenize SMS text messages. 135 | train.tokens <- tokens(train$Text, what = "word", 136 | remove_numbers = TRUE, remove_punct = TRUE, 137 | remove_symbols = TRUE, remove_hyphens = TRUE) 138 | 139 | # Take a look at a specific SMS message and see how it transforms. 140 | train.tokens[[357]] 141 | 142 | 143 | # Lower case the tokens. 144 | train.tokens <- tokens_tolower(train.tokens) 145 | train.tokens[[357]] 146 | 147 | 148 | # Use quanteda's built-in stopword list for English. 149 | # NOTE - You should always inspect stopword lists for applicability to 150 | # your problem/domain. 151 | train.tokens <- tokens_select(train.tokens, stopwords(), 152 | selection = "remove") 153 | train.tokens[[357]] 154 | 155 | 156 | # Perform stemming on the tokens. 157 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 158 | train.tokens[[357]] 159 | 160 | 161 | # Create our first bag-of-words model. 162 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 163 | 164 | 165 | # Transform to a matrix and inspect. 166 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 167 | View(train.tokens.matrix[1:20, 1:100]) 168 | dim(train.tokens.matrix) 169 | 170 | 171 | # Investigate the effects of stemming. 
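# Another way to inspect the vocabulary (and the effect of stemming) without
# materializing the dense matrix is to ask the sparse dfm directly for its
# most frequent stems via quanteda's topfeatures():
topfeatures(train.tokens.dfm, 20)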
172 | colnames(train.tokens.matrix)[1:50] 173 | 174 | 175 | # Per best practices, we will leverage cross validation (CV) as 176 | # the basis of our modeling process. Using CV we can create 177 | # estimates of how well our model will do in Production on new, 178 | # unseen data. CV is powerful, but the downside is that it 179 | # requires more processing and therefore more time. 180 | # 181 | # If you are not familiar with CV, consult the following 182 | # Wikipedia article: 183 | # 184 | # https://en.wikipedia.org/wiki/Cross-validation_(statistics) 185 | # 186 | 187 | # Setup a the feature data frame with labels. 188 | train.tokens.df <- cbind(Label = train$Label, data.frame(train.tokens.dfm)) 189 | 190 | 191 | # Often, tokenization requires some additional pre-processing 192 | names(train.tokens.df)[c(146, 148, 235, 238)] 193 | 194 | 195 | # Cleanup column names. 196 | names(train.tokens.df) <- make.names(names(train.tokens.df)) 197 | 198 | 199 | # Use caret to create stratified folds for 10-fold cross validation repeated 200 | # 3 times (i.e., create 30 random stratified samples) 201 | set.seed(48743) 202 | cv.folds <- createMultiFolds(train$Label, k = 10, times = 3) 203 | 204 | cv.cntrl <- trainControl(method = "repeatedcv", number = 10, 205 | repeats = 3, index = cv.folds) 206 | 207 | 208 | # Our data frame is non-trivial in size. As such, CV runs will take 209 | # quite a long time to run. To cut down on total execution time, use 210 | # the doSNOW package to allow for multi-core training in parallel. 211 | # 212 | # WARNING - The following code is configured to run on a workstation- 213 | # or server-class machine (i.e., 12 logical cores). Alter 214 | # code to suit your HW environment. 215 | # 216 | #install.packages("doSNOW") 217 | library(doSNOW) 218 | 219 | 220 | # Time the code execution 221 | start.time <- Sys.time() 222 | 223 | 224 | # Create a cluster to work on 10 logical cores. 225 | cl <- makeCluster(10, type = "SOCK") 226 | registerDoSNOW(cl) 227 | 228 | 229 | # As our data is non-trivial in size at this point, use a single decision 230 | # tree alogrithm as our first model. We will graduate to using more 231 | # powerful algorithms later when we perform feature extraction to shrink 232 | # the size of our data. 233 | rpart.cv.1 <- train(Label ~ ., data = train.tokens.df, method = "rpart", 234 | trControl = cv.cntrl, tuneLength = 7) 235 | 236 | 237 | # Processing is done, stop cluster. 238 | stopCluster(cl) 239 | 240 | 241 | # Total time of execution on workstation was approximately 4 minutes. 242 | total.time <- Sys.time() - start.time 243 | total.time 244 | 245 | 246 | # Check out our results. 247 | rpart.cv.1 248 | 249 | 250 | 251 | # The use of Term Frequency-Inverse Document Frequency (TF-IDF) is a 252 | # powerful technique for enhancing the information/signal contained 253 | # within our document-frequency matrix. Specifically, the mathematics 254 | # behind TF-IDF accomplish the following goals: 255 | # 1 - The TF calculation accounts for the fact that longer 256 | # documents will have higher individual term counts. Applying 257 | # TF normalizes all documents in the corpus to be length 258 | # independent. 259 | # 2 - The IDF calculation accounts for the frequency of term 260 | # appearance in all documents in the corpus. The intuition 261 | # being that a term that appears in every document has no 262 | # predictive power. 
263 | # 3 - The multiplication of TF by IDF for each cell in the matrix 264 | # allows for weighting of #1 and #2 for each cell in the matrix. 265 | 266 | 267 | # Our function for calculating relative term frequency (TF) 268 | term.frequency <- function(row) { 269 | row / sum(row) 270 | } 271 | 272 | # Our function for calculating inverse document frequency (IDF) 273 | inverse.doc.freq <- function(col) { 274 | corpus.size <- length(col) 275 | doc.count <- length(which(col > 0)) 276 | 277 | log10(corpus.size / doc.count) 278 | } 279 | 280 | # Our function for calculating TF-IDF. 281 | tf.idf <- function(x, idf) { 282 | x * idf 283 | } 284 | 285 | 286 | # First step, normalize all documents via TF. 287 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 288 | dim(train.tokens.df) 289 | View(train.tokens.df[1:20, 1:100]) 290 | 291 | 292 | # Second step, calculate the IDF vector that we will use - both 293 | # for training data and for test data! 294 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 295 | str(train.tokens.idf) 296 | 297 | 298 | # Lastly, calculate TF-IDF for our training corpus. 299 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, idf = train.tokens.idf) 300 | dim(train.tokens.tfidf) 301 | View(train.tokens.tfidf[1:25, 1:25]) 302 | 303 | 304 | # Transpose the matrix 305 | train.tokens.tfidf <- t(train.tokens.tfidf) 306 | dim(train.tokens.tfidf) 307 | View(train.tokens.tfidf[1:25, 1:25]) 308 | 309 | 310 | # Check for incopmlete cases. 311 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 312 | train$Text[incomplete.cases] 313 | 314 | 315 | # Fix incomplete cases 316 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 317 | dim(train.tokens.tfidf) 318 | sum(which(!complete.cases(train.tokens.tfidf))) 319 | 320 | 321 | # Make a clean data frame using the same process as before. 322 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 323 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 324 | 325 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part6.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 6 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=neiW5Ugsob8 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 
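# The read.csv() call below assumes spam.csv is UTF-16 encoded. If you hit
# encoding errors with your own copy of the file, one option is to let the
# readr package estimate the encoding (readr is not otherwise used in this
# tutorial, so this check is left commented out):
# install.packages("readr")
# readr::guess_encoding("spam.csv")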
34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | 39 | # Clean up the data frame and view our handiwork. 40 | spam.raw <- spam.raw[, 1:2] 41 | names(spam.raw) <- c("Label", "Text") 42 | View(spam.raw) 43 | 44 | 45 | 46 | # Check data to see if there are missing values. 47 | length(which(!complete.cases(spam.raw))) 48 | 49 | 50 | 51 | # Convert our class label into a factor. 52 | spam.raw$Label <- as.factor(spam.raw$Label) 53 | 54 | 55 | 56 | # The first step, as always, is to explore the data. 57 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 58 | prop.table(table(spam.raw$Label)) 59 | 60 | 61 | 62 | # Next up, let's get a feel for the distribution of text lengths of the SMS 63 | # messages by adding a new feature for the length of each message. 64 | spam.raw$TextLength <- nchar(spam.raw$Text) 65 | summary(spam.raw$TextLength) 66 | 67 | 68 | 69 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 70 | library(ggplot2) 71 | 72 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 73 | theme_bw() + 74 | geom_histogram(binwidth = 5) + 75 | labs(y = "Text Count", x = "Length of Text", 76 | title = "Distribution of Text Lengths with Class Labels") 77 | 78 | 79 | 80 | # At a minimum we need to split our data into a training set and a 81 | # test set. In a true project we would want to use a three-way split 82 | # of training, validation, and test. 83 | # 84 | # As we know that our data has non-trivial class imbalance, we'll 85 | # use the mighty caret package to create a randomg train/test split 86 | # that ensures the correct ham/spam class label proportions (i.e., 87 | # we'll use caret for a random stratified split). 88 | library(caret) 89 | help(package = "caret") 90 | 91 | 92 | # Use caret to create a 70%/30% stratified split. Set the random 93 | # seed for reproducibility. 94 | set.seed(32984) 95 | indexes <- createDataPartition(spam.raw$Label, times = 1, 96 | p = 0.7, list = FALSE) 97 | 98 | train <- spam.raw[indexes,] 99 | test <- spam.raw[-indexes,] 100 | 101 | 102 | # Verify proportions. 103 | prop.table(table(train$Label)) 104 | prop.table(table(test$Label)) 105 | 106 | 107 | 108 | # Text analytics requires a lot of data exploration, data pre-processing 109 | # and data wrangling. Let's explore some examples. 110 | 111 | # HTML-escaped ampersand character. 112 | train$Text[21] 113 | 114 | 115 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 116 | # is an actual person, but we will ignore the implications of this for 117 | # this introductory tutorial. 118 | train$Text[38] 119 | 120 | 121 | # A URL. 122 | train$Text[357] 123 | 124 | 125 | 126 | # There are many packages in the R ecosystem for performing text 127 | # analytics. One of the newer packages in quanteda. The quanteda 128 | # package has many useful functions for quickly and easily working 129 | # with text data. 130 | library(quanteda) 131 | help(package = "quanteda") 132 | 133 | 134 | # Tokenize SMS text messages. 135 | train.tokens <- tokens(train$Text, what = "word", 136 | remove_numbers = TRUE, remove_punct = TRUE, 137 | remove_symbols = TRUE, remove_hyphens = TRUE) 138 | 139 | # Take a look at a specific SMS message and see how it transforms. 140 | train.tokens[[357]] 141 | 142 | 143 | # Lower case the tokens. 144 | train.tokens <- tokens_tolower(train.tokens) 145 | train.tokens[[357]] 146 | 147 | 148 | # Use quanteda's built-in stopword list for English. 
149 | # NOTE - You should always inspect stopword lists for applicability to 150 | # your problem/domain. 151 | train.tokens <- tokens_select(train.tokens, stopwords(), 152 | selection = "remove") 153 | train.tokens[[357]] 154 | 155 | 156 | # Perform stemming on the tokens. 157 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 158 | train.tokens[[357]] 159 | 160 | 161 | # Create our first bag-of-words model. 162 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 163 | 164 | 165 | # Transform to a matrix and inspect. 166 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 167 | View(train.tokens.matrix[1:20, 1:100]) 168 | dim(train.tokens.matrix) 169 | 170 | 171 | # Investigate the effects of stemming. 172 | colnames(train.tokens.matrix)[1:50] 173 | 174 | 175 | # Per best practices, we will leverage cross validation (CV) as 176 | # the basis of our modeling process. Using CV we can create 177 | # estimates of how well our model will do in Production on new, 178 | # unseen data. CV is powerful, but the downside is that it 179 | # requires more processing and therefore more time. 180 | # 181 | # If you are not familiar with CV, consult the following 182 | # Wikipedia article: 183 | # 184 | # https://en.wikipedia.org/wiki/Cross-validation_(statistics) 185 | # 186 | 187 | # Setup a the feature data frame with labels. 188 | train.tokens.df <- cbind(Label = train$Label, data.frame(train.tokens.dfm)) 189 | 190 | 191 | # Often, tokenization requires some additional pre-processing 192 | names(train.tokens.df)[c(146, 148, 235, 238)] 193 | 194 | 195 | # Cleanup column names. 196 | names(train.tokens.df) <- make.names(names(train.tokens.df)) 197 | 198 | 199 | # Use caret to create stratified folds for 10-fold cross validation repeated 200 | # 3 times (i.e., create 30 random stratified samples) 201 | set.seed(48743) 202 | cv.folds <- createMultiFolds(train$Label, k = 10, times = 3) 203 | 204 | cv.cntrl <- trainControl(method = "repeatedcv", number = 10, 205 | repeats = 3, index = cv.folds) 206 | 207 | 208 | # Our data frame is non-trivial in size. As such, CV runs will take 209 | # quite a long time to run. To cut down on total execution time, use 210 | # the doSNOW package to allow for multi-core training in parallel. 211 | # 212 | # WARNING - The following code is configured to run on a workstation- 213 | # or server-class machine (i.e., 12 logical cores). Alter 214 | # code to suit your HW environment. 215 | # 216 | #install.packages("doSNOW") 217 | library(doSNOW) 218 | 219 | 220 | # Time the code execution 221 | start.time <- Sys.time() 222 | 223 | 224 | # Create a cluster to work on 10 logical cores. 225 | cl <- makeCluster(10, type = "SOCK") 226 | registerDoSNOW(cl) 227 | 228 | 229 | # As our data is non-trivial in size at this point, use a single decision 230 | # tree alogrithm as our first model. We will graduate to using more 231 | # powerful algorithms later when we perform feature extraction to shrink 232 | # the size of our data. 233 | rpart.cv.1 <- train(Label ~ ., data = train.tokens.df, method = "rpart", 234 | trControl = cv.cntrl, tuneLength = 7) 235 | 236 | 237 | # Processing is done, stop cluster. 238 | stopCluster(cl) 239 | 240 | 241 | # Total time of execution on workstation was approximately 4 minutes. 242 | total.time <- Sys.time() - start.time 243 | total.time 244 | 245 | 246 | # Check out our results. 
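# In addition to printing the train object below, caret can summarize the
# cross-validated confusion matrix for the same model (cell values are
# averaged over the 30 resamples and shown as percentages):
confusionMatrix(rpart.cv.1)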
247 | rpart.cv.1 248 | 249 | 250 | 251 | # The use of Term Frequency-Inverse Document Frequency (TF-IDF) is a 252 | # powerful technique for enhancing the information/signal contained 253 | # within our document-frequency matrix. Specifically, the mathematics 254 | # behind TF-IDF accomplish the following goals: 255 | # 1 - The TF calculation accounts for the fact that longer 256 | # documents will have higher individual term counts. Applying 257 | # TF normalizes all documents in the corpus to be length 258 | # independent. 259 | # 2 - The IDF calculation accounts for the frequency of term 260 | # appearance in all documents in the corpus. The intuition 261 | # being that a term that appears in every document has no 262 | # predictive power. 263 | # 3 - The multiplication of TF by IDF for each cell in the matrix 264 | # allows for weighting of #1 and #2 for each cell in the matrix. 265 | 266 | 267 | # Our function for calculating relative term frequency (TF) 268 | term.frequency <- function(row) { 269 | row / sum(row) 270 | } 271 | 272 | # Our function for calculating inverse document frequency (IDF) 273 | inverse.doc.freq <- function(col) { 274 | corpus.size <- length(col) 275 | doc.count <- length(which(col > 0)) 276 | 277 | log10(corpus.size / doc.count) 278 | } 279 | 280 | # Our function for calculating TF-IDF. 281 | tf.idf <- function(x, idf) { 282 | x * idf 283 | } 284 | 285 | 286 | # First step, normalize all documents via TF. 287 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 288 | dim(train.tokens.df) 289 | View(train.tokens.df[1:20, 1:100]) 290 | 291 | 292 | # Second step, calculate the IDF vector that we will use - both 293 | # for training data and for test data! 294 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 295 | str(train.tokens.idf) 296 | 297 | 298 | # Lastly, calculate TF-IDF for our training corpus. 299 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, idf = train.tokens.idf) 300 | dim(train.tokens.tfidf) 301 | View(train.tokens.tfidf[1:25, 1:25]) 302 | 303 | 304 | # Transpose the matrix 305 | train.tokens.tfidf <- t(train.tokens.tfidf) 306 | dim(train.tokens.tfidf) 307 | View(train.tokens.tfidf[1:25, 1:25]) 308 | 309 | 310 | # Check for incopmlete cases. 311 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 312 | train$Text[incomplete.cases] 313 | 314 | 315 | # Fix incomplete cases 316 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 317 | dim(train.tokens.tfidf) 318 | sum(which(!complete.cases(train.tokens.tfidf))) 319 | 320 | 321 | # Make a clean data frame using the same process as before. 322 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 323 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 324 | 325 | 326 | # Time the code execution 327 | start.time <- Sys.time() 328 | 329 | # Create a cluster to work on 10 logical cores. 330 | cl <- makeCluster(3, type = "SOCK") 331 | registerDoSNOW(cl) 332 | 333 | # As our data is non-trivial in size at this point, use a single decision 334 | # tree alogrithm as our first model. We will graduate to using more 335 | # powerful algorithms later when we perform feature extraction to shrink 336 | # the size of our data. 337 | rpart.cv.2 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 338 | trControl = cv.cntrl, tuneLength = 7) 339 | 340 | # Processing is done, stop cluster. 
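# NOTE - stopCluster() below shuts the workers down, but foreach (attached by
#        doSNOW) is still registered against that cluster. If you keep working
#        in the same session, it is safest to also point foreach back at
#        sequential execution:
registerDoSEQ()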
341 | stopCluster(cl) 342 | 343 | # Total time of execution on workstation was 344 | total.time <- Sys.time() - start.time 345 | total.time 346 | 347 | # Check out our results. 348 | rpart.cv.2 349 | 350 | 351 | 352 | # N-grams allow us to augment our document-term frequency matrices with 353 | # word ordering. This often leads to increased performance (e.g., accuracy) 354 | # for machine learning models trained with more than just unigrams (i.e., 355 | # single terms). Let's add bigrams to our training data and the TF-IDF 356 | # transform the expanded featre matrix to see if accuracy improves. 357 | 358 | # Add bigrams to our feature matrix. 359 | train.tokens <- tokens_ngrams(train.tokens, n = 1:2) 360 | train.tokens[[357]] 361 | 362 | 363 | # Transform to dfm and then a matrix. 364 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 365 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 366 | train.tokens.dfm 367 | 368 | 369 | # Normalize all documents via TF. 370 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 371 | 372 | 373 | # Calculate the IDF vector that we will use for training and test data! 374 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 375 | 376 | 377 | # Calculate TF-IDF for our training corpus 378 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, 379 | idf = train.tokens.idf) 380 | 381 | 382 | # Transpose the matrix 383 | train.tokens.tfidf <- t(train.tokens.tfidf) 384 | 385 | 386 | # Fix incomplete cases 387 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 388 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 389 | 390 | 391 | # Make a clean data frame. 392 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 393 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 394 | 395 | 396 | # Clean up unused objects in memory. 397 | gc() -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part7.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 7 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=Fza5szojsU8 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | # Clean up the data frame and view our handiwork. 
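# Keeping only the first two columns below assumes the remaining columns of
# the raw file are essentially empty. A quick way to confirm that before
# dropping anything is to count the non-blank entries per column (the extra
# column names depend on your copy of spam.csv):
sapply(spam.raw, function(x) sum(!is.na(x) & nzchar(trimws(x))))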
39 | spam.raw <- spam.raw[, 1:2] 40 | names(spam.raw) <- c("Label", "Text") 41 | View(spam.raw) 42 | 43 | 44 | 45 | # Check data to see if there are missing values. 46 | length(which(!complete.cases(spam.raw))) 47 | 48 | 49 | 50 | # Convert our class label into a factor. 51 | spam.raw$Label <- as.factor(spam.raw$Label) 52 | 53 | 54 | 55 | # The first step, as always, is to explore the data. 56 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 57 | prop.table(table(spam.raw$Label)) 58 | 59 | 60 | 61 | # Next up, let's get a feel for the distribution of text lengths of the SMS 62 | # messages by adding a new feature for the length of each message. 63 | spam.raw$TextLength <- nchar(spam.raw$Text) 64 | summary(spam.raw$TextLength) 65 | 66 | 67 | 68 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 69 | library(ggplot2) 70 | 71 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 72 | theme_bw() + 73 | geom_histogram(binwidth = 5) + 74 | labs(y = "Text Count", x = "Length of Text", 75 | title = "Distribution of Text Lengths with Class Labels") 76 | 77 | 78 | 79 | # At a minimum we need to split our data into a training set and a 80 | # test set. In a true project we would want to use a three-way split 81 | # of training, validation, and test. 82 | # 83 | # As we know that our data has non-trivial class imbalance, we'll 84 | # use the mighty caret package to create a randomg train/test split 85 | # that ensures the correct ham/spam class label proportions (i.e., 86 | # we'll use caret for a random stratified split). 87 | library(caret) 88 | help(package = "caret") 89 | 90 | 91 | # Use caret to create a 70%/30% stratified split. Set the random 92 | # seed for reproducibility. 93 | set.seed(32984) 94 | indexes <- createDataPartition(spam.raw$Label, times = 1, 95 | p = 0.7, list = FALSE) 96 | 97 | train <- spam.raw[indexes,] 98 | test <- spam.raw[-indexes,] 99 | 100 | 101 | # Verify proportions. 102 | prop.table(table(train$Label)) 103 | prop.table(table(test$Label)) 104 | 105 | 106 | 107 | # Text analytics requires a lot of data exploration, data pre-processing 108 | # and data wrangling. Let's explore some examples. 109 | 110 | # HTML-escaped ampersand character. 111 | train$Text[21] 112 | 113 | 114 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 115 | # is an actual person, but we will ignore the implications of this for 116 | # this introductory tutorial. 117 | train$Text[38] 118 | 119 | 120 | # A URL. 121 | train$Text[357] 122 | 123 | 124 | 125 | # There are many packages in the R ecosystem for performing text 126 | # analytics. One of the newer packages in quanteda. The quanteda 127 | # package has many useful functions for quickly and easily working 128 | # with text data. 129 | library(quanteda) 130 | help(package = "quanteda") 131 | 132 | 133 | # Tokenize SMS text messages. 134 | train.tokens <- tokens(train$Text, what = "word", 135 | remove_numbers = TRUE, remove_punct = TRUE, 136 | remove_symbols = TRUE, remove_hyphens = TRUE) 137 | 138 | # Take a look at a specific SMS message and see how it transforms. 139 | train.tokens[[357]] 140 | 141 | 142 | # Lower case the tokens. 143 | train.tokens <- tokens_tolower(train.tokens) 144 | train.tokens[[357]] 145 | 146 | 147 | # Use quanteda's built-in stopword list for English. 148 | # NOTE - You should always inspect stopword lists for applicability to 149 | # your problem/domain. 
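# For example, the default English list used below (re-exported by quanteda
# from the stopwords package) is easy to eyeball:
length(stopwords())
head(stopwords(), 25)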
150 | train.tokens <- tokens_select(train.tokens, stopwords(), 151 | selection = "remove") 152 | train.tokens[[357]] 153 | 154 | 155 | # Perform stemming on the tokens. 156 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 157 | train.tokens[[357]] 158 | 159 | 160 | # Create our first bag-of-words model. 161 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 162 | 163 | 164 | # Transform to a matrix and inspect. 165 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 166 | View(train.tokens.matrix[1:20, 1:100]) 167 | dim(train.tokens.matrix) 168 | 169 | 170 | # Investigate the effects of stemming. 171 | colnames(train.tokens.matrix)[1:50] 172 | 173 | 174 | # Per best practices, we will leverage cross validation (CV) as 175 | # the basis of our modeling process. Using CV we can create 176 | # estimates of how well our model will do in Production on new, 177 | # unseen data. CV is powerful, but the downside is that it 178 | # requires more processing and therefore more time. 179 | # 180 | # If you are not familiar with CV, consult the following 181 | # Wikipedia article: 182 | # 183 | # https://en.wikipedia.org/wiki/Cross-validation_(statistics) 184 | # 185 | 186 | # Setup a the feature data frame with labels. 187 | train.tokens.df <- cbind(Label = train$Label, data.frame(train.tokens.dfm)) 188 | 189 | 190 | # Often, tokenization requires some additional pre-processing 191 | names(train.tokens.df)[c(146, 148, 235, 238)] 192 | 193 | 194 | # Cleanup column names. 195 | names(train.tokens.df) <- make.names(names(train.tokens.df)) 196 | 197 | 198 | # Use caret to create stratified folds for 10-fold cross validation repeated 199 | # 3 times (i.e., create 30 random stratified samples) 200 | set.seed(48743) 201 | cv.folds <- createMultiFolds(train$Label, k = 10, times = 3) 202 | 203 | cv.cntrl <- trainControl(method = "repeatedcv", number = 10, 204 | repeats = 3, index = cv.folds) 205 | 206 | 207 | # Our data frame is non-trivial in size. As such, CV runs will take 208 | # quite a long time to run. To cut down on total execution time, use 209 | # the doSNOW package to allow for multi-core training in parallel. 210 | # 211 | # WARNING - The following code is configured to run on a workstation- 212 | # or server-class machine (i.e., 12 logical cores). Alter 213 | # code to suit your HW environment. 214 | # 215 | #install.packages("doSNOW") 216 | library(doSNOW) 217 | 218 | 219 | # Time the code execution 220 | start.time <- Sys.time() 221 | 222 | 223 | # Create a cluster to work on 10 logical cores. 224 | cl <- makeCluster(10, type = "SOCK") 225 | registerDoSNOW(cl) 226 | 227 | 228 | # As our data is non-trivial in size at this point, use a single decision 229 | # tree alogrithm as our first model. We will graduate to using more 230 | # powerful algorithms later when we perform feature extraction to shrink 231 | # the size of our data. 232 | rpart.cv.1 <- train(Label ~ ., data = train.tokens.df, method = "rpart", 233 | trControl = cv.cntrl, tuneLength = 7) 234 | 235 | 236 | # Processing is done, stop cluster. 237 | stopCluster(cl) 238 | 239 | 240 | # Total time of execution on workstation was approximately 4 minutes. 241 | total.time <- Sys.time() - start.time 242 | total.time 243 | 244 | 245 | # Check out our results. 246 | rpart.cv.1 247 | 248 | 249 | 250 | # The use of Term Frequency-Inverse Document Frequency (TF-IDF) is a 251 | # powerful technique for enhancing the information/signal contained 252 | # within our document-frequency matrix. 
Specifically, the mathematics 253 | # behind TF-IDF accomplish the following goals: 254 | # 1 - The TF calculation accounts for the fact that longer 255 | # documents will have higher individual term counts. Applying 256 | # TF normalizes all documents in the corpus to be length 257 | # independent. 258 | # 2 - The IDF calculation accounts for the frequency of term 259 | # appearance in all documents in the corpus. The intuition 260 | # being that a term that appears in every document has no 261 | # predictive power. 262 | # 3 - The multiplication of TF by IDF for each cell in the matrix 263 | # allows for weighting of #1 and #2 for each cell in the matrix. 264 | 265 | 266 | # Our function for calculating relative term frequency (TF) 267 | term.frequency <- function(row) { 268 | row / sum(row) 269 | } 270 | 271 | # Our function for calculating inverse document frequency (IDF) 272 | inverse.doc.freq <- function(col) { 273 | corpus.size <- length(col) 274 | doc.count <- length(which(col > 0)) 275 | 276 | log10(corpus.size / doc.count) 277 | } 278 | 279 | # Our function for calculating TF-IDF. 280 | tf.idf <- function(x, idf) { 281 | x * idf 282 | } 283 | 284 | 285 | # First step, normalize all documents via TF. 286 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 287 | dim(train.tokens.df) 288 | View(train.tokens.df[1:20, 1:100]) 289 | 290 | 291 | # Second step, calculate the IDF vector that we will use - both 292 | # for training data and for test data! 293 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 294 | str(train.tokens.idf) 295 | 296 | 297 | # Lastly, calculate TF-IDF for our training corpus. 298 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, idf = train.tokens.idf) 299 | dim(train.tokens.tfidf) 300 | View(train.tokens.tfidf[1:25, 1:25]) 301 | 302 | 303 | # Transpose the matrix 304 | train.tokens.tfidf <- t(train.tokens.tfidf) 305 | dim(train.tokens.tfidf) 306 | View(train.tokens.tfidf[1:25, 1:25]) 307 | 308 | 309 | # Check for incopmlete cases. 310 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 311 | train$Text[incomplete.cases] 312 | 313 | 314 | # Fix incomplete cases 315 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 316 | dim(train.tokens.tfidf) 317 | sum(which(!complete.cases(train.tokens.tfidf))) 318 | 319 | 320 | # Make a clean data frame using the same process as before. 321 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 322 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 323 | 324 | 325 | # Time the code execution 326 | start.time <- Sys.time() 327 | 328 | # Create a cluster to work on 10 logical cores. 329 | cl <- makeCluster(3, type = "SOCK") 330 | registerDoSNOW(cl) 331 | 332 | # As our data is non-trivial in size at this point, use a single decision 333 | # tree alogrithm as our first model. We will graduate to using more 334 | # powerful algorithms later when we perform feature extraction to shrink 335 | # the size of our data. 336 | rpart.cv.2 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 337 | trControl = cv.cntrl, tuneLength = 7) 338 | 339 | # Processing is done, stop cluster. 340 | stopCluster(cl) 341 | 342 | # Total time of execution on workstation was 343 | total.time <- Sys.time() - start.time 344 | total.time 345 | 346 | # Check out our results. 
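# Because rpart.cv.1 and rpart.cv.2 were trained on the exact same cv.folds,
# caret's resamples() can line the two CV runs up for a direct comparison
# (the list names here are just labels for the output):
summary(resamples(list(BagOfWords = rpart.cv.1, TFIDF = rpart.cv.2)))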
347 | rpart.cv.2 348 | 349 | 350 | 351 | # N-grams allow us to augment our document-term frequency matrices with 352 | # word ordering. This often leads to increased performance (e.g., accuracy) 353 | # for machine learning models trained with more than just unigrams (i.e., 354 | # single terms). Let's add bigrams to our training data and the TF-IDF 355 | # transform the expanded featre matrix to see if accuracy improves. 356 | 357 | # Add bigrams to our feature matrix. 358 | train.tokens <- tokens_ngrams(train.tokens, n = 1:2) 359 | train.tokens[[357]] 360 | 361 | 362 | # Transform to dfm and then a matrix. 363 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 364 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 365 | train.tokens.dfm 366 | 367 | 368 | # Normalize all documents via TF. 369 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 370 | 371 | 372 | # Calculate the IDF vector that we will use for training and test data! 373 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 374 | 375 | 376 | # Calculate TF-IDF for our training corpus 377 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, 378 | idf = train.tokens.idf) 379 | 380 | 381 | # Transpose the matrix 382 | train.tokens.tfidf <- t(train.tokens.tfidf) 383 | 384 | 385 | # Fix incomplete cases 386 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 387 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 388 | 389 | 390 | # Make a clean data frame. 391 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 392 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 393 | 394 | 395 | # Clean up unused objects in memory. 396 | gc() 397 | 398 | 399 | 400 | 401 | # 402 | # NOTE - The following code requires the use of command-line R to execute 403 | # due to the large number of features (i.e., columns) in the matrix. 404 | # Please consult the following link for more details if you wish 405 | # to run the code yourself: 406 | # 407 | # https://stackoverflow.com/questions/28728774/how-to-set-max-ppsize-in-r 408 | # 409 | # Also note that running the following code required approximately 410 | # 38GB of RAM and more than 4.5 hours to execute on a 10-core 411 | # workstation! 412 | # 413 | 414 | 415 | # Time the code execution 416 | # start.time <- Sys.time() 417 | 418 | # Leverage single decision trees to evaluate if adding bigrams improves the 419 | # the effectiveness of the model. 420 | # rpart.cv.3 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 421 | # trControl = cv.cntrl, tuneLength = 7) 422 | 423 | # Total time of execution on workstation was 424 | # total.time <- Sys.time() - start.time 425 | # total.time 426 | 427 | # Check out our results. 428 | # rpart.cv.3 429 | 430 | # 431 | # The results of the above processing show a slight decline in rpart 432 | # effectiveness with a 10-fold CV repeated 3 times accuracy of 0.9457. 433 | # As we will discuss later, while the addition of bigrams appears to 434 | # negatively impact a single decision tree, it helps with the mighty 435 | # random forest! 
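# If you want to experiment with bigrams on more modest hardware, one common
# tactic (not used in this tutorial) is to drop very rare n-grams from the
# sparse dfm before densifying, for example with quanteda's dfm_trim(). The
# threshold below is purely illustrative, and the TF-IDF steps would need to
# be re-run afterwards:
# train.tokens.dfm <- dfm_trim(train.tokens.dfm, min_docfreq = 2)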
436 | # 437 | 438 | 439 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part8.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 8 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=4DI68P4hicQ 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | # Clean up the data frame and view our handiwork. 39 | spam.raw <- spam.raw[, 1:2] 40 | names(spam.raw) <- c("Label", "Text") 41 | View(spam.raw) 42 | 43 | 44 | 45 | # Check data to see if there are missing values. 46 | length(which(!complete.cases(spam.raw))) 47 | 48 | 49 | 50 | # Convert our class label into a factor. 51 | spam.raw$Label <- as.factor(spam.raw$Label) 52 | 53 | 54 | 55 | # The first step, as always, is to explore the data. 56 | # First, let's take a look at distibution of the class labels (i.e., ham vs. spam). 57 | prop.table(table(spam.raw$Label)) 58 | 59 | 60 | 61 | # Next up, let's get a feel for the distribution of text lengths of the SMS 62 | # messages by adding a new feature for the length of each message. 63 | spam.raw$TextLength <- nchar(spam.raw$Text) 64 | summary(spam.raw$TextLength) 65 | 66 | 67 | 68 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 69 | library(ggplot2) 70 | 71 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 72 | theme_bw() + 73 | geom_histogram(binwidth = 5) + 74 | labs(y = "Text Count", x = "Length of Text", 75 | title = "Distribution of Text Lengths with Class Labels") 76 | 77 | 78 | 79 | # At a minimum we need to split our data into a training set and a 80 | # test set. In a true project we would want to use a three-way split 81 | # of training, validation, and test. 82 | # 83 | # As we know that our data has non-trivial class imbalance, we'll 84 | # use the mighty caret package to create a randomg train/test split 85 | # that ensures the correct ham/spam class label proportions (i.e., 86 | # we'll use caret for a random stratified split). 87 | library(caret) 88 | help(package = "caret") 89 | 90 | 91 | # Use caret to create a 70%/30% stratified split. Set the random 92 | # seed for reproducibility. 93 | set.seed(32984) 94 | indexes <- createDataPartition(spam.raw$Label, times = 1, 95 | p = 0.7, list = FALSE) 96 | 97 | train <- spam.raw[indexes,] 98 | test <- spam.raw[-indexes,] 99 | 100 | 101 | # Verify proportions. 
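# The proportions below also give us a baseline to beat: a model that always
# predicts the majority class ("ham") is already right roughly that fraction
# of the time, so any accuracy we report should be judged against it.
max(prop.table(table(train$Label)))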
102 | prop.table(table(train$Label)) 103 | prop.table(table(test$Label)) 104 | 105 | 106 | 107 | # Text analytics requires a lot of data exploration, data pre-processing 108 | # and data wrangling. Let's explore some examples. 109 | 110 | # HTML-escaped ampersand character. 111 | train$Text[21] 112 | 113 | 114 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 115 | # is an actual person, but we will ignore the implications of this for 116 | # this introductory tutorial. 117 | train$Text[38] 118 | 119 | 120 | # A URL. 121 | train$Text[357] 122 | 123 | 124 | 125 | # There are many packages in the R ecosystem for performing text 126 | # analytics. One of the newer packages in quanteda. The quanteda 127 | # package has many useful functions for quickly and easily working 128 | # with text data. 129 | library(quanteda) 130 | help(package = "quanteda") 131 | 132 | 133 | # Tokenize SMS text messages. 134 | train.tokens <- tokens(train$Text, what = "word", 135 | remove_numbers = TRUE, remove_punct = TRUE, 136 | remove_symbols = TRUE, remove_hyphens = TRUE) 137 | 138 | # Take a look at a specific SMS message and see how it transforms. 139 | train.tokens[[357]] 140 | 141 | 142 | # Lower case the tokens. 143 | train.tokens <- tokens_tolower(train.tokens) 144 | train.tokens[[357]] 145 | 146 | 147 | # Use quanteda's built-in stopword list for English. 148 | # NOTE - You should always inspect stopword lists for applicability to 149 | # your problem/domain. 150 | train.tokens <- tokens_select(train.tokens, stopwords(), 151 | selection = "remove") 152 | train.tokens[[357]] 153 | 154 | 155 | # Perform stemming on the tokens. 156 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 157 | train.tokens[[357]] 158 | 159 | 160 | # Create our first bag-of-words model. 161 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 162 | 163 | 164 | # Transform to a matrix and inspect. 165 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 166 | View(train.tokens.matrix[1:20, 1:100]) 167 | dim(train.tokens.matrix) 168 | 169 | 170 | # Investigate the effects of stemming. 171 | colnames(train.tokens.matrix)[1:50] 172 | 173 | 174 | # Per best practices, we will leverage cross validation (CV) as 175 | # the basis of our modeling process. Using CV we can create 176 | # estimates of how well our model will do in Production on new, 177 | # unseen data. CV is powerful, but the downside is that it 178 | # requires more processing and therefore more time. 179 | # 180 | # If you are not familiar with CV, consult the following 181 | # Wikipedia article: 182 | # 183 | # https://en.wikipedia.org/wiki/Cross-validation_(statistics) 184 | # 185 | 186 | # Setup a the feature data frame with labels. 187 | train.tokens.df <- cbind(Label = train$Label, data.frame(train.tokens.dfm)) 188 | 189 | 190 | # Often, tokenization requires some additional pre-processing 191 | names(train.tokens.df)[c(146, 148, 235, 238)] 192 | 193 | 194 | # Cleanup column names. 195 | names(train.tokens.df) <- make.names(names(train.tokens.df)) 196 | 197 | 198 | # Use caret to create stratified folds for 10-fold cross validation repeated 199 | # 3 times (i.e., create 30 random stratified samples) 200 | set.seed(48743) 201 | cv.folds <- createMultiFolds(train$Label, k = 10, times = 3) 202 | 203 | cv.cntrl <- trainControl(method = "repeatedcv", number = 10, 204 | repeats = 3, index = cv.folds) 205 | 206 | 207 | # Our data frame is non-trivial in size. As such, CV runs will take 208 | # quite a long time to run. 
To cut down on total execution time, use 209 | # the doSNOW package to allow for multi-core training in parallel. 210 | # 211 | # WARNING - The following code is configured to run on a workstation- 212 | # or server-class machine (i.e., 12 logical cores). Alter 213 | # code to suit your HW environment. 214 | # 215 | #install.packages("doSNOW") 216 | library(doSNOW) 217 | 218 | 219 | # Time the code execution 220 | start.time <- Sys.time() 221 | 222 | 223 | # Create a cluster to work on 10 logical cores. 224 | cl <- makeCluster(10, type = "SOCK") 225 | registerDoSNOW(cl) 226 | 227 | 228 | # As our data is non-trivial in size at this point, use a single decision 229 | # tree alogrithm as our first model. We will graduate to using more 230 | # powerful algorithms later when we perform feature extraction to shrink 231 | # the size of our data. 232 | rpart.cv.1 <- train(Label ~ ., data = train.tokens.df, method = "rpart", 233 | trControl = cv.cntrl, tuneLength = 7) 234 | 235 | 236 | # Processing is done, stop cluster. 237 | stopCluster(cl) 238 | 239 | 240 | # Total time of execution on workstation was approximately 4 minutes. 241 | total.time <- Sys.time() - start.time 242 | total.time 243 | 244 | 245 | # Check out our results. 246 | rpart.cv.1 247 | 248 | 249 | 250 | # The use of Term Frequency-Inverse Document Frequency (TF-IDF) is a 251 | # powerful technique for enhancing the information/signal contained 252 | # within our document-frequency matrix. Specifically, the mathematics 253 | # behind TF-IDF accomplish the following goals: 254 | # 1 - The TF calculation accounts for the fact that longer 255 | # documents will have higher individual term counts. Applying 256 | # TF normalizes all documents in the corpus to be length 257 | # independent. 258 | # 2 - The IDF calculation accounts for the frequency of term 259 | # appearance in all documents in the corpus. The intuition 260 | # being that a term that appears in every document has no 261 | # predictive power. 262 | # 3 - The multiplication of TF by IDF for each cell in the matrix 263 | # allows for weighting of #1 and #2 for each cell in the matrix. 264 | 265 | 266 | # Our function for calculating relative term frequency (TF) 267 | term.frequency <- function(row) { 268 | row / sum(row) 269 | } 270 | 271 | # Our function for calculating inverse document frequency (IDF) 272 | inverse.doc.freq <- function(col) { 273 | corpus.size <- length(col) 274 | doc.count <- length(which(col > 0)) 275 | 276 | log10(corpus.size / doc.count) 277 | } 278 | 279 | # Our function for calculating TF-IDF. 280 | tf.idf <- function(x, idf) { 281 | x * idf 282 | } 283 | 284 | 285 | # First step, normalize all documents via TF. 286 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 287 | dim(train.tokens.df) 288 | View(train.tokens.df[1:20, 1:100]) 289 | 290 | 291 | # Second step, calculate the IDF vector that we will use - both 292 | # for training data and for test data! 293 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 294 | str(train.tokens.idf) 295 | 296 | 297 | # Lastly, calculate TF-IDF for our training corpus. 298 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, idf = train.tokens.idf) 299 | dim(train.tokens.tfidf) 300 | View(train.tokens.tfidf[1:25, 1:25]) 301 | 302 | 303 | # Transpose the matrix 304 | train.tokens.tfidf <- t(train.tokens.tfidf) 305 | dim(train.tokens.tfidf) 306 | View(train.tokens.tfidf[1:25, 1:25]) 307 | 308 | 309 | # Check for incopmlete cases. 
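# Where do incomplete cases come from? If every token of a message was
# stripped out by the pre-processing above (stopwords, punctuation, numbers,
# etc.), its row sum in the term-count matrix is 0, so the TF step divides
# by zero and produces NaN values. Toy illustration using our own
# term.frequency() function (made-up term names):
term.frequency(c(free = 0, call = 0, now = 0))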
310 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 311 | train$Text[incomplete.cases] 312 | 313 | 314 | # Fix incomplete cases 315 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 316 | dim(train.tokens.tfidf) 317 | sum(which(!complete.cases(train.tokens.tfidf))) 318 | 319 | 320 | # Make a clean data frame using the same process as before. 321 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 322 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 323 | 324 | 325 | # Time the code execution 326 | start.time <- Sys.time() 327 | 328 | # Create a cluster to work on 10 logical cores. 329 | cl <- makeCluster(3, type = "SOCK") 330 | registerDoSNOW(cl) 331 | 332 | # As our data is non-trivial in size at this point, use a single decision 333 | # tree alogrithm as our first model. We will graduate to using more 334 | # powerful algorithms later when we perform feature extraction to shrink 335 | # the size of our data. 336 | rpart.cv.2 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 337 | trControl = cv.cntrl, tuneLength = 7) 338 | 339 | # Processing is done, stop cluster. 340 | stopCluster(cl) 341 | 342 | # Total time of execution on workstation was 343 | total.time <- Sys.time() - start.time 344 | total.time 345 | 346 | # Check out our results. 347 | rpart.cv.2 348 | 349 | 350 | 351 | # N-grams allow us to augment our document-term frequency matrices with 352 | # word ordering. This often leads to increased performance (e.g., accuracy) 353 | # for machine learning models trained with more than just unigrams (i.e., 354 | # single terms). Let's add bigrams to our training data and the TF-IDF 355 | # transform the expanded featre matrix to see if accuracy improves. 356 | 357 | # Add bigrams to our feature matrix. 358 | train.tokens <- tokens_ngrams(train.tokens, n = 1:2) 359 | train.tokens[[357]] 360 | 361 | 362 | # Transform to dfm and then a matrix. 363 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 364 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 365 | train.tokens.dfm 366 | 367 | 368 | # Normalize all documents via TF. 369 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 370 | 371 | 372 | # Calculate the IDF vector that we will use for training and test data! 373 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 374 | 375 | 376 | # Calculate TF-IDF for our training corpus 377 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, 378 | idf = train.tokens.idf) 379 | 380 | 381 | # Transpose the matrix 382 | train.tokens.tfidf <- t(train.tokens.tfidf) 383 | 384 | 385 | # Fix incomplete cases 386 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 387 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 388 | 389 | 390 | # Make a clean data frame. 391 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 392 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 393 | 394 | 395 | # Clean up unused objects in memory. 396 | gc() 397 | 398 | 399 | 400 | 401 | # 402 | # NOTE - The following code requires the use of command-line R to execute 403 | # due to the large number of features (i.e., columns) in the matrix. 
404 | # Please consult the following link for more details if you wish 405 | # to run the code yourself: 406 | # 407 | # https://stackoverflow.com/questions/28728774/how-to-set-max-ppsize-in-r 408 | # 409 | # Also note that running the following code required approximately 410 | # 38GB of RAM and more than 4.5 hours to execute on a 10-core 411 | # workstation! 412 | # 413 | 414 | 415 | # Time the code execution 416 | # start.time <- Sys.time() 417 | 418 | # Leverage single decision trees to evaluate if adding bigrams improves the 419 | # the effectiveness of the model. 420 | # rpart.cv.3 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 421 | # trControl = cv.cntrl, tuneLength = 7) 422 | 423 | # Total time of execution on workstation was 424 | # total.time <- Sys.time() - start.time 425 | # total.time 426 | 427 | # Check out our results. 428 | # rpart.cv.3 429 | 430 | # 431 | # The results of the above processing show a slight decline in rpart 432 | # effectiveness with a 10-fold CV repeated 3 times accuracy of 0.9457. 433 | # As we will discuss later, while the addition of bigrams appears to 434 | # negatively impact a single decision tree, it helps with the mighty 435 | # random forest! 436 | # 437 | 438 | 439 | 440 | 441 | # We'll leverage the irlba package for our singular value 442 | # decomposition (SVD). The irlba package allows us to specify 443 | # the number of the most important singular vectors we wish to 444 | # calculate and retain for features. 445 | library(irlba) 446 | 447 | 448 | # Time the code execution 449 | start.time <- Sys.time() 450 | 451 | # Perform SVD. Specifically, reduce dimensionality down to 300 columns 452 | # for our latent semantic analysis (LSA). 453 | train.irlba <- irlba(t(train.tokens.tfidf), nv = 300, maxit = 600) 454 | 455 | # Total time of execution on workstation was 456 | total.time <- Sys.time() - start.time 457 | total.time 458 | 459 | 460 | # Take a look at the new feature data up close. 461 | View(train.irlba$v) 462 | 463 | 464 | # As with TF-IDF, we will need to project new data (e.g., the test data) 465 | # into the SVD semantic space. The following code illustrates how to do 466 | # this using a row of the training data that has already been transformed 467 | # by TF-IDF, per the mathematics illustrated in the slides. 468 | # 469 | # 470 | sigma.inverse <- 1 / train.irlba$d 471 | u.transpose <- t(train.irlba$u) 472 | document <- train.tokens.tfidf[1,] 473 | document.hat <- sigma.inverse * u.transpose %*% document 474 | 475 | # Look at the first 10 components of projected document and the corresponding 476 | # row in our document semantic space (i.e., the V matrix) 477 | document.hat[1:10] 478 | train.irlba$v[1, 1:10] 479 | 480 | 481 | 482 | # 483 | # Create new feature data frame using our document semantic space of 300 484 | # features (i.e., the V matrix from our SVD). 485 | # 486 | train.svd <- data.frame(Label = train$Label, train.irlba$v) 487 | 488 | 489 | # Create a cluster to work on 10 logical cores. 490 | cl <- makeCluster(10, type = "SOCK") 491 | registerDoSNOW(cl) 492 | 493 | # Time the code execution 494 | start.time <- Sys.time() 495 | 496 | # This will be the last run using single decision trees. With a much smaller 497 | # feature matrix we can now use more powerful methods like the mighty Random 498 | # Forest from now on! 499 | rpart.cv.4 <- train(Label ~ ., data = train.svd, method = "rpart", 500 | trControl = cv.cntrl, tuneLength = 7) 501 | 502 | # Processing is done, stop cluster. 
503 | stopCluster(cl) 504 | 505 | # Total time of execution on workstation was 506 | total.time <- Sys.time() - start.time 507 | total.time 508 | 509 | # Check out our results. 510 | rpart.cv.4 511 | 512 | 513 | 514 | 515 | # 516 | # NOTE - The following code takes a long time to run. Here's the math. 517 | # We are performing 10-fold CV repeated 3 times. That means we 518 | # need to build 30 models. We are also asking caret to try 7 519 | # different values of the mtry parameter. Next up by default 520 | # a mighty random forest leverages 500 trees. Lastly, caret will 521 | # build 1 final model at the end of the process with the best 522 | # mtry value over all the training data. Here's the number of 523 | # tree we're building: 524 | # 525 | # (10 * 3 * 7 * 500) + 500 = 105,500 trees! 526 | # 527 | # On a workstation using 10 cores the following code took 28 minutes 528 | # to execute. 529 | # 530 | 531 | 532 | # Create a cluster to work on 10 logical cores. 533 | # cl <- makeCluster(10, type = "SOCK") 534 | # registerDoSNOW(cl) 535 | 536 | # Time the code execution 537 | # start.time <- Sys.time() 538 | 539 | # We have reduced the dimensionality of our data using SVD. Also, the 540 | # application of SVD allows us to use LSA to simultaneously increase the 541 | # information density of each feature. To prove this out, leverage a 542 | # mighty Random Forest with the default of 500 trees. We'll also ask 543 | # caret to try 7 different values of mtry to find the mtry value that 544 | # gives the best result! 545 | # rf.cv.1 <- train(Label ~ ., data = train.svd, method = "rf", 546 | # trControl = cv.cntrl, tuneLength = 7) 547 | 548 | # Processing is done, stop cluster. 549 | # stopCluster(cl) 550 | 551 | # Total time of execution on workstation was 552 | # total.time <- Sys.time() - start.time 553 | # total.time 554 | 555 | 556 | # Load processing results from disk! 557 | load("rf.cv.1.RData") 558 | 559 | # Check out our results. 560 | rf.cv.1 561 | 562 | # Let's drill-down on the results. 563 | confusionMatrix(train.svd$Label, rf.cv.1$finalModel$predicted) 564 | 565 | -------------------------------------------------------------------------------- /IntroToTextAnalytics_Part9.R: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Data Science Dojo 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | # 19 | # This R source code file corresponds to video 9 of the Data Science 20 | # Dojo YouTube series "Introduction to Text Analytics with R" located 21 | # at the following URL: 22 | # https://www.youtube.com/watch?v=SgrLE6WQzkE 23 | # 24 | 25 | 26 | # Install all required packages. 27 | install.packages(c("ggplot2", "e1071", "caret", "quanteda", 28 | "irlba", "randomForest")) 29 | 30 | 31 | 32 | 33 | # Load up the .CSV data and explore in RStudio. 
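# (The read.csv() call below assumes the copy of spam.csv shipped with this
#  repo, which the script reads as UTF-16 encoded. If your copy came from
#  elsewhere and characters look garbled after loading, peeking at the raw
#  file first is a cheap sanity check - an optional, illustrative line:)
# readLines(file("spam.csv", encoding = "UTF-16"), n = 2)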
34 | spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16") 35 | View(spam.raw) 36 | 37 | 38 | # Clean up the data frame and view our handiwork. 39 | spam.raw <- spam.raw[, 1:2] 40 | names(spam.raw) <- c("Label", "Text") 41 | View(spam.raw) 42 | 43 | 44 | 45 | # Check data to see if there are missing values. 46 | length(which(!complete.cases(spam.raw))) 47 | 48 | 49 | 50 | # Convert our class label into a factor. 51 | spam.raw$Label <- as.factor(spam.raw$Label) 52 | 53 | 54 | 55 | # The first step, as always, is to explore the data. 56 | # First, let's take a look at the distribution of the class labels (i.e., ham vs. spam). 57 | prop.table(table(spam.raw$Label)) 58 | 59 | 60 | 61 | # Next up, let's get a feel for the distribution of text lengths of the SMS 62 | # messages by adding a new feature for the length of each message. 63 | spam.raw$TextLength <- nchar(spam.raw$Text) 64 | summary(spam.raw$TextLength) 65 | 66 | 67 | 68 | # Visualize distribution with ggplot2, adding segmentation for ham/spam. 69 | library(ggplot2) 70 | 71 | ggplot(spam.raw, aes(x = TextLength, fill = Label)) + 72 | theme_bw() + 73 | geom_histogram(binwidth = 5) + 74 | labs(y = "Text Count", x = "Length of Text", 75 | title = "Distribution of Text Lengths with Class Labels") 76 | 77 | 78 | 79 | # At a minimum we need to split our data into a training set and a 80 | # test set. In a true project we would want to use a three-way split 81 | # of training, validation, and test. 82 | # 83 | # As we know that our data has non-trivial class imbalance, we'll 84 | # use the mighty caret package to create a random train/test split 85 | # that ensures the correct ham/spam class label proportions (i.e., 86 | # we'll use caret for a random stratified split). 87 | library(caret) 88 | help(package = "caret") 89 | 90 | 91 | # Use caret to create a 70%/30% stratified split. Set the random 92 | # seed for reproducibility. 93 | set.seed(32984) 94 | indexes <- createDataPartition(spam.raw$Label, times = 1, 95 | p = 0.7, list = FALSE) 96 | 97 | train <- spam.raw[indexes,] 98 | test <- spam.raw[-indexes,] 99 | 100 | 101 | # Verify proportions. 102 | prop.table(table(train$Label)) 103 | prop.table(table(test$Label)) 104 | 105 | 106 | 107 | # Text analytics requires a lot of data exploration, data pre-processing 108 | # and data wrangling. Let's explore some examples. 109 | 110 | # HTML-escaped ampersand character. 111 | train$Text[21] 112 | 113 | 114 | # HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat 115 | # is an actual person, but we will ignore the implications of this for 116 | # this introductory tutorial. 117 | train$Text[38] 118 | 119 | 120 | # A URL. 121 | train$Text[357] 122 | 123 | 124 | 125 | # There are many packages in the R ecosystem for performing text 126 | # analytics. One of the newer packages is quanteda. The quanteda 127 | # package has many useful functions for quickly and easily working 128 | # with text data. 129 | library(quanteda) 130 | help(package = "quanteda") 131 | 132 | 133 | # Tokenize SMS text messages. 134 | train.tokens <- tokens(train$Text, what = "word", 135 | remove_numbers = TRUE, remove_punct = TRUE, 136 | remove_symbols = TRUE, remove_hyphens = TRUE) 137 | 138 | # Take a look at a specific SMS message and see how it transforms. 139 | train.tokens[[357]] 140 | 141 | 142 | # Lower case the tokens. 143 | train.tokens <- tokens_tolower(train.tokens) 144 | train.tokens[[357]] 145 | 146 | 147 | # Use quanteda's built-in stopword list for English.
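# (As the NOTE just below advises, it is worth eyeballing what will be thrown
#  away before removing it. A minimal peek - stopwords() with no arguments
#  returns the English list in the quanteda versions this series uses:)
head(stopwords(), 20)
length(stopwords())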
148 | # NOTE - You should always inspect stopword lists for applicability to 149 | # your problem/domain. 150 | train.tokens <- tokens_select(train.tokens, stopwords(), 151 | selection = "remove") 152 | train.tokens[[357]] 153 | 154 | 155 | # Perform stemming on the tokens. 156 | train.tokens <- tokens_wordstem(train.tokens, language = "english") 157 | train.tokens[[357]] 158 | 159 | 160 | # Create our first bag-of-words model. 161 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 162 | 163 | 164 | # Transform to a matrix and inspect. 165 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 166 | View(train.tokens.matrix[1:20, 1:100]) 167 | dim(train.tokens.matrix) 168 | 169 | 170 | # Investigate the effects of stemming. 171 | colnames(train.tokens.matrix)[1:50] 172 | 173 | 174 | # Per best practices, we will leverage cross validation (CV) as 175 | # the basis of our modeling process. Using CV we can create 176 | # estimates of how well our model will do in Production on new, 177 | # unseen data. CV is powerful, but the downside is that it 178 | # requires more processing and therefore more time. 179 | # 180 | # If you are not familiar with CV, consult the following 181 | # Wikipedia article: 182 | # 183 | # https://en.wikipedia.org/wiki/Cross-validation_(statistics) 184 | # 185 | 186 | # Set up the feature data frame with labels. 187 | train.tokens.df <- cbind(Label = train$Label, data.frame(train.tokens.dfm)) 188 | 189 | 190 | # Often, tokenization requires some additional pre-processing. 191 | names(train.tokens.df)[c(146, 148, 235, 238)] 192 | 193 | 194 | # Clean up column names. 195 | names(train.tokens.df) <- make.names(names(train.tokens.df)) 196 | 197 | 198 | # Use caret to create stratified folds for 10-fold cross validation repeated 199 | # 3 times (i.e., create 30 random stratified samples) 200 | set.seed(48743) 201 | cv.folds <- createMultiFolds(train$Label, k = 10, times = 3) 202 | 203 | cv.cntrl <- trainControl(method = "repeatedcv", number = 10, 204 | repeats = 3, index = cv.folds) 205 | 206 | 207 | # Our data frame is non-trivial in size. As such, CV runs will take 208 | # quite a long time to run. To cut down on total execution time, use 209 | # the doSNOW package to allow for multi-core training in parallel. 210 | # 211 | # WARNING - The following code is configured to run on a workstation- 212 | # or server-class machine (i.e., 12 logical cores). Alter 213 | # code to suit your HW environment. 214 | # 215 | #install.packages("doSNOW") 216 | library(doSNOW) 217 | 218 | 219 | # Time the code execution 220 | start.time <- Sys.time() 221 | 222 | 223 | # Create a cluster to work on 10 logical cores. 224 | cl <- makeCluster(10, type = "SOCK") 225 | registerDoSNOW(cl) 226 | 227 | 228 | # As our data is non-trivial in size at this point, use a single decision 229 | # tree algorithm as our first model. We will graduate to using more 230 | # powerful algorithms later when we perform feature extraction to shrink 231 | # the size of our data. 232 | rpart.cv.1 <- train(Label ~ ., data = train.tokens.df, method = "rpart", 233 | trControl = cv.cntrl, tuneLength = 7) 234 | 235 | 236 | # Processing is done, stop cluster. 237 | stopCluster(cl) 238 | 239 | 240 | # Total time of execution on workstation was approximately 4 minutes. 241 | total.time <- Sys.time() - start.time 242 | total.time 243 | 244 | 245 | # Check out our results.
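# (Reading the printout below: caret reports the cross-validated accuracy and
#  kappa for each of the 7 candidate cp values it tried and notes the cp it
#  selected for the final model. If you prefer a picture, plot(rpart.cv.1)
#  charts accuracy against cp - an optional convenience, not something the
#  rest of the script depends on.)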
246 | rpart.cv.1 247 | 248 | 249 | 250 | # The use of Term Frequency-Inverse Document Frequency (TF-IDF) is a 251 | # powerful technique for enhancing the information/signal contained 252 | # within our document-frequency matrix. Specifically, the mathematics 253 | # behind TF-IDF accomplish the following goals: 254 | # 1 - The TF calculation accounts for the fact that longer 255 | # documents will have higher individual term counts. Applying 256 | # TF normalizes all documents in the corpus to be length 257 | # independent. 258 | # 2 - The IDF calculation accounts for the frequency of term 259 | # appearance in all documents in the corpus. The intuition 260 | # being that a term that appears in every document has no 261 | # predictive power. 262 | # 3 - The multiplication of TF by IDF for each cell in the matrix 263 | # allows for weighting of #1 and #2 for each cell in the matrix. 264 | 265 | 266 | # Our function for calculating relative term frequency (TF) 267 | term.frequency <- function(row) { 268 | row / sum(row) 269 | } 270 | 271 | # Our function for calculating inverse document frequency (IDF) 272 | inverse.doc.freq <- function(col) { 273 | corpus.size <- length(col) 274 | doc.count <- length(which(col > 0)) 275 | 276 | log10(corpus.size / doc.count) 277 | } 278 | 279 | # Our function for calculating TF-IDF. 280 | tf.idf <- function(x, idf) { 281 | x * idf 282 | } 283 | 284 | 285 | # First step, normalize all documents via TF. 286 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 287 | dim(train.tokens.df) 288 | View(train.tokens.df[1:20, 1:100]) 289 | 290 | 291 | # Second step, calculate the IDF vector that we will use - both 292 | # for training data and for test data! 293 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 294 | str(train.tokens.idf) 295 | 296 | 297 | # Lastly, calculate TF-IDF for our training corpus. 298 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, idf = train.tokens.idf) 299 | dim(train.tokens.tfidf) 300 | View(train.tokens.tfidf[1:25, 1:25]) 301 | 302 | 303 | # Transpose the matrix 304 | train.tokens.tfidf <- t(train.tokens.tfidf) 305 | dim(train.tokens.tfidf) 306 | View(train.tokens.tfidf[1:25, 1:25]) 307 | 308 | 309 | # Check for incomplete cases. 310 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 311 | train$Text[incomplete.cases] 312 | 313 | 314 | # Fix incomplete cases 315 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 316 | dim(train.tokens.tfidf) 317 | sum(which(!complete.cases(train.tokens.tfidf))) 318 | 319 | 320 | # Make a clean data frame using the same process as before. 321 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 322 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 323 | 324 | 325 | # Time the code execution 326 | start.time <- Sys.time() 327 | 328 | # Create a cluster to work on 3 logical cores. 329 | cl <- makeCluster(3, type = "SOCK") 330 | registerDoSNOW(cl) 331 | 332 | # As our data is non-trivial in size at this point, use a single decision 333 | # tree algorithm as our first model. We will graduate to using more 334 | # powerful algorithms later when we perform feature extraction to shrink 335 | # the size of our data. 336 | rpart.cv.2 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 337 | trControl = cv.cntrl, tuneLength = 7) 338 | 339 | # Processing is done, stop cluster.
340 | stopCluster(cl) 341 | 342 | # Total time of execution on workstation was 343 | total.time <- Sys.time() - start.time 344 | total.time 345 | 346 | # Check out our results. 347 | rpart.cv.2 348 | 349 | 350 | 351 | # N-grams allow us to augment our document-term frequency matrices with 352 | # word ordering. This often leads to increased performance (e.g., accuracy) 353 | # for machine learning models trained with more than just unigrams (i.e., 354 | # single terms). Let's add bigrams to our training data and then TF-IDF 355 | # transform the expanded feature matrix to see if accuracy improves. 356 | 357 | # Add bigrams to our feature matrix. 358 | train.tokens <- tokens_ngrams(train.tokens, n = 1:2) 359 | train.tokens[[357]] 360 | 361 | 362 | # Transform to dfm and then a matrix. 363 | train.tokens.dfm <- dfm(train.tokens, tolower = FALSE) 364 | train.tokens.matrix <- as.matrix(train.tokens.dfm) 365 | train.tokens.dfm 366 | 367 | 368 | # Normalize all documents via TF. 369 | train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency) 370 | 371 | 372 | # Calculate the IDF vector that we will use for training and test data! 373 | train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq) 374 | 375 | 376 | # Calculate TF-IDF for our training corpus 377 | train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf, 378 | idf = train.tokens.idf) 379 | 380 | 381 | # Transpose the matrix 382 | train.tokens.tfidf <- t(train.tokens.tfidf) 383 | 384 | 385 | # Fix incomplete cases 386 | incomplete.cases <- which(!complete.cases(train.tokens.tfidf)) 387 | train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf)) 388 | 389 | 390 | # Make a clean data frame. 391 | train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf)) 392 | names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df)) 393 | 394 | 395 | # Clean up unused objects in memory. 396 | gc() 397 | 398 | 399 | 400 | 401 | # 402 | # NOTE - The following code requires the use of command-line R to execute 403 | # due to the large number of features (i.e., columns) in the matrix. 404 | # Please consult the following link for more details if you wish 405 | # to run the code yourself: 406 | # 407 | # https://stackoverflow.com/questions/28728774/how-to-set-max-ppsize-in-r 408 | # 409 | # Also note that running the following code required approximately 410 | # 38GB of RAM and more than 4.5 hours to execute on a 10-core 411 | # workstation! 412 | # 413 | 414 | 415 | # Time the code execution 416 | # start.time <- Sys.time() 417 | 418 | # Leverage single decision trees to evaluate if adding bigrams improves 419 | # the effectiveness of the model. 420 | # rpart.cv.3 <- train(Label ~ ., data = train.tokens.tfidf.df, method = "rpart", 421 | # trControl = cv.cntrl, tuneLength = 7) 422 | 423 | # Total time of execution on workstation was 424 | # total.time <- Sys.time() - start.time 425 | # total.time 426 | 427 | # Check out our results. 428 | # rpart.cv.3 429 | 430 | # 431 | # The results of the above processing show a slight decline in rpart 432 | # effectiveness, with an accuracy of 0.9457 under 10-fold CV repeated 3 times. 433 | # As we will discuss later, while the addition of bigrams appears to 434 | # negatively impact a single decision tree, it helps with the mighty 435 | # random forest! 436 | # 437 | 438 | 439 | 440 | 441 | # We'll leverage the irlba package for our singular value 442 | # decomposition (SVD).
The irlba package allows us to specify 443 | # the number of the most important singular vectors we wish to 444 | # calculate and retain for features. 445 | library(irlba) 446 | 447 | 448 | # Time the code execution 449 | start.time <- Sys.time() 450 | 451 | # Perform SVD. Specifically, reduce dimensionality down to 300 columns 452 | # for our latent semantic analysis (LSA). 453 | train.irlba <- irlba(t(train.tokens.tfidf), nv = 300, maxit = 600) 454 | 455 | # Total time of execution on workstation was 456 | total.time <- Sys.time() - start.time 457 | total.time 458 | 459 | 460 | # Take a look at the new feature data up close. 461 | View(train.irlba$v) 462 | 463 | 464 | # As with TF-IDF, we will need to project new data (e.g., the test data) 465 | # into the SVD semantic space. The following code illustrates how to do 466 | # this using a row of the training data that has already been transformed 467 | # by TF-IDF, per the mathematics illustrated in the slides. 468 | # 469 | # 470 | sigma.inverse <- 1 / train.irlba$d 471 | u.transpose <- t(train.irlba$u) 472 | document <- train.tokens.tfidf[1,] 473 | document.hat <- sigma.inverse * u.transpose %*% document 474 | 475 | # Look at the first 10 components of projected document and the corresponding 476 | # row in our document semantic space (i.e., the V matrix) 477 | document.hat[1:10] 478 | train.irlba$v[1, 1:10] 479 | 480 | 481 | 482 | # 483 | # Create new feature data frame using our document semantic space of 300 484 | # features (i.e., the V matrix from our SVD). 485 | # 486 | train.svd <- data.frame(Label = train$Label, train.irlba$v) 487 | 488 | 489 | # Create a cluster to work on 10 logical cores. 490 | cl <- makeCluster(10, type = "SOCK") 491 | registerDoSNOW(cl) 492 | 493 | # Time the code execution 494 | start.time <- Sys.time() 495 | 496 | # This will be the last run using single decision trees. With a much smaller 497 | # feature matrix we can now use more powerful methods like the mighty Random 498 | # Forest from now on! 499 | rpart.cv.4 <- train(Label ~ ., data = train.svd, method = "rpart", 500 | trControl = cv.cntrl, tuneLength = 7) 501 | 502 | # Processing is done, stop cluster. 503 | stopCluster(cl) 504 | 505 | # Total time of execution on workstation was 506 | total.time <- Sys.time() - start.time 507 | total.time 508 | 509 | # Check out our results. 510 | rpart.cv.4 511 | 512 | 513 | 514 | 515 | # 516 | # NOTE - The following code takes a long time to run. Here's the math. 517 | # We are performing 10-fold CV repeated 3 times. That means we 518 | # need to build 30 models. We are also asking caret to try 7 519 | # different values of the mtry parameter. Next up by default 520 | # a mighty random forest leverages 500 trees. Lastly, caret will 521 | # build 1 final model at the end of the process with the best 522 | # mtry value over all the training data. Here's the number of 523 | # tree we're building: 524 | # 525 | # (10 * 3 * 7 * 500) + 500 = 105,500 trees! 526 | # 527 | # On a workstation using 10 cores the following code took 28 minutes 528 | # to execute. 529 | # 530 | 531 | 532 | # Create a cluster to work on 10 logical cores. 533 | # cl <- makeCluster(10, type = "SOCK") 534 | # registerDoSNOW(cl) 535 | 536 | # Time the code execution 537 | # start.time <- Sys.time() 538 | 539 | # We have reduced the dimensionality of our data using SVD. Also, the 540 | # application of SVD allows us to use LSA to simultaneously increase the 541 | # information density of each feature. 
To prove this out, leverage a 542 | # mighty Random Forest with the default of 500 trees. We'll also ask 543 | # caret to try 7 different values of mtry to find the mtry value that 544 | # gives the best result! 545 | # rf.cv.1 <- train(Label ~ ., data = train.svd, method = "rf", 546 | # trControl = cv.cntrl, tuneLength = 7) 547 | 548 | # Processing is done, stop cluster. 549 | # stopCluster(cl) 550 | 551 | # Total time of execution on workstation was 552 | # total.time <- Sys.time() - start.time 553 | # total.time 554 | 555 | 556 | # Load processing results from disk! 557 | load("rf.cv.1.RData") 558 | 559 | # Check out our results. 560 | rf.cv.1 561 | 562 | # Let's drill-down on the results. 563 | confusionMatrix(train.svd$Label, rf.cv.1$finalModel$predicted) 564 | 565 | 566 | 567 | 568 | 569 | # OK, now let's add in the feature we engineered previously for SMS 570 | # text length to see if it improves things. 571 | train.svd$TextLength <- train$TextLength 572 | 573 | 574 | # Create a cluster to work on 10 logical cores. 575 | # cl <- makeCluster(10, type = "SOCK") 576 | # registerDoSNOW(cl) 577 | 578 | # Time the code execution 579 | # start.time <- Sys.time() 580 | 581 | # Re-run the training process with the additional feature. 582 | # rf.cv.2 <- train(Label ~ ., data = train.svd, method = "rf", 583 | # trControl = cv.cntrl, tuneLength = 7, 584 | # importance = TRUE) 585 | 586 | # Processing is done, stop cluster. 587 | # stopCluster(cl) 588 | 589 | # Total time of execution on workstation was 590 | # total.time <- Sys.time() - start.time 591 | # total.time 592 | 593 | # Load results from disk. 594 | load("rf.cv.2.RData") 595 | 596 | # Check the results. 597 | rf.cv.2 598 | 599 | # Drill-down on the results. 600 | confusionMatrix(train.svd$Label, rf.cv.2$finalModel$predicted) 601 | 602 | # How important was the new feature? 603 | library(randomForest) 604 | varImpPlot(rf.cv.1$finalModel) 605 | varImpPlot(rf.cv.2$finalModel) 606 | 607 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IntroToTextAnalyticsWithR 2 | Public repo for the Data Science Dojo YouTube tutorial series [Introduction to Text Analytics with R](https://www.youtube.com/playlist?list=PL8eNk_zTBST8olxIRFoo0YeXxEOkYdoxi). 
This tutorial series leverages the [Kaggle SMS Spam Collection Dataset](https://www.kaggle.com/uciml/sms-spam-collection-dataset) originally published by [UCI ML Repository](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection) 3 | 4 | 5 | - [Introduction to Text Analytics with R - Part 1](https://www.youtube.com/watch?v=4vuw0AsHeGw) 6 | - [Introduction to Text Analytics with R - Part 2](https://www.youtube.com/watch?v=Y7385dGRNLM) 7 | - [Introduction to Text Analytics with R - Part 3](https://www.youtube.com/watch?v=CQsyVDxK7_g) 8 | - [Introduction to Text Analytics with R - Part 4](https://www.youtube.com/watch?v=IFhDlHKRHno) 9 | - [Introduction to Text Analytics with R - Part 5](https://www.youtube.com/watch?v=az7yf0IfWPM) 10 | - [Introduction to Text Analytics with R - Part 6](https://www.youtube.com/watch?v=neiW5Ugsob8) 11 | - [Introduction to Text Analytics with R - Part 7](https://www.youtube.com/watch?v=Fza5szojsU8) 12 | - [Introduction to Text Analytics with R - Part 8](https://www.youtube.com/watch?v=4DI68P4hicQ) 13 | - [Introduction to Text Analytics with R - Part 9](https://www.youtube.com/watch?v=SgrLE6WQzkE) 14 | - [Introduction to Text Analytics with R - Part 10](https://www.youtube.com/watch?v=7cwBhWYHgsA) 15 | - [Introduction to Text Analytics with R - Part 11](https://www.youtube.com/watch?v=XWUi7RivDJY) 16 | - [Introduction to Text Analytics with R - Part 12](https://www.youtube.com/watch?v=-wCrClheObk) -------------------------------------------------------------------------------- /rf.cv.1.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/rf.cv.1.RData -------------------------------------------------------------------------------- /rf.cv.2.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/rf.cv.2.RData -------------------------------------------------------------------------------- /rf.cv.3.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/rf.cv.3.RData -------------------------------------------------------------------------------- /rf.cv.4.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/rf.cv.4.RData -------------------------------------------------------------------------------- /spam.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencedojo/IntroToTextAnalyticsWithR/b71d0c5cc95860a0e51fbc3c4d9ffd4289c1e876/spam.csv --------------------------------------------------------------------------------