├── NAMESPACE ├── TextRegression_0.1-3.tar.gz ├── upload.sh ├── ChangeLog ├── TODO.md ├── man ├── dtm.to.Matrix.Rd ├── TextRegression.Rd └── regress.text.Rd ├── R ├── dtm.to.Matrix.R ├── help.R └── regress.text.R ├── DESCRIPTION ├── tests └── 1.R └── README.md /NAMESPACE: -------------------------------------------------------------------------------- 1 | export(dtm.to.Matrix) 2 | export(regress.text) 3 | -------------------------------------------------------------------------------- /TextRegression_0.1-3.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnmyleswhite/TextRegression/HEAD/TextRegression_0.1-3.tar.gz -------------------------------------------------------------------------------- /upload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ftp ftp://cran.r-project.org 4 | # cd incoming 5 | # bin 6 | # put TextRegression_*.tar.gz 7 | # exit 8 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | 2011-10-30 John Myles White 2 | 3 | * v0.1-1 4 | * First draft of the TextRegression package, which automates text regression analyses. 5 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | * Improve devtools usage. 2 | * Allow greater configuration of analysis. 3 | * Expand on returned outputs. 4 | * Use heuristics to control cross-validation. 5 | 6 | * Figure out why glmnet used to tolerate y being a matrix and now requires a vector. 7 | * Add mechanism to predict from novel text. 
#' Transform a tm-style DTM into a sparse Matrix.
#'
#' Transform a DTM produced by the tm package into a sparse Matrix for use
#' with the glmnet package.
#'
#' @param dtm A document term matrix of class 'DocumentTermMatrix'. Any
#'   simple-triplet-style list with integer slots \code{i}, \code{j},
#'   numeric \code{v}, and dimensions \code{nrow}, \code{ncol} is accepted.
#'
#' @return A sparse matrix (of class 'Matrix') representation of the DTM.
#'
#' @export
#'
#' @examples
#' \dontrun{dtm.to.Matrix(dtm)}
dtm.to.Matrix <- function(dtm)
{
  # A DocumentTermMatrix is already stored in triplet form (i, j, v), so
  # construct the sparse matrix in a single call instead of assigning one
  # entry at a time: sub-assignment into a sparse Matrix rebuilds its
  # internal column-compressed structure on every write, which is
  # quadratic in the number of non-zero entries. This form also handles
  # an empty DTM correctly, where the old `for (index in 1:length(dtm$i))`
  # loop ran with the out-of-bounds indices 1 and 0.
  # (A valid DTM has no duplicated (i, j) pairs, so sparseMatrix's
  # sum-duplicates behavior cannot change the result.)
  sparseMatrix(i = dtm$i,
               j = dtm$j,
               x = dtm$v,
               dims = c(dtm$nrow, dtm$ncol))
}
5 | Version: 0.1-3 6 | Date: 2012-05-12 7 | Author: John Myles White 8 | Maintainer: John Myles White 9 | Description: Predict continuous valued outputs associated 10 | with text documents. The input corpus of text documents 11 | is transformed into a document-term matrix (DTM) and 12 | then a regularized linear regression is fit that uses 13 | this matrix as predictors to predict the continuous 14 | valued output. The corpus's terms, coefficients for all 15 | terms and an estimate of the model's predictive power 16 | are returned in a list. 17 | License: Artistic-2.0 18 | LazyLoad: yes 19 | Suggests: testthat 20 | Depends: 21 | tm, 22 | Matrix, 23 | glmnet, 24 | plyr 25 | Collate: 26 | 'dtm.to.Matrix.R' 27 | 'help.R' 28 | 'regress.text.R' 29 | -------------------------------------------------------------------------------- /man/TextRegression.Rd: -------------------------------------------------------------------------------- 1 | \docType{package} 2 | \name{TextRegression} 3 | \alias{"package-TextRegression"} 4 | \alias{TextRegression} 5 | \title{Predict continuous valued outputs associated with text documents.} 6 | \description{ 7 | Predict continuous valued outputs associated with text 8 | documents. The input corpus of text documents is 9 | transformed into a document-term matrix (DTM) and then a 10 | regularized linear regression is fit that uses this 11 | matrix as predictors to predict the continuous valued 12 | output. The corpus's terms, coefficients for all terms 13 | and an estimate of the model's predictive power are 14 | returned in a list. 
15 | } 16 | \examples{ 17 | library('TextRegression') 18 | 19 | library('TextRegression') 20 | 21 | text <- c('saying text is good', 22 | 'saying text once and saying text twice is better', 23 | 'saying text text text is best', 24 | 'saying text once is still ok', 25 | 'not saying it at all is bad', 26 | 'because text is a good thing', 27 | 'we all like text', 28 | 'even though sometimes it is missing') 29 | 30 | y <- c(1, 2, 3, 1, 0, 1, 1, 0) 31 | 32 | results <- regress.text(text, y) 33 | 34 | print(results) 35 | } 36 | \references{ 37 | This code is inspired by Noah Smith's work. 38 | } 39 | 40 | -------------------------------------------------------------------------------- /R/help.R: -------------------------------------------------------------------------------- 1 | #' Predict continuous valued outputs associated with text documents. 2 | #' 3 | #' Predict continuous valued outputs associated with text documents. The input 4 | #' corpus of text documents is transformed into a document-term matrix (DTM) 5 | #' and then a regularized linear regression is fit that uses this matrix as 6 | #' predictors to predict the continuous valued output. The corpus's terms, 7 | #' coefficients for all terms and an estimate of the model's predictive 8 | #' power are returned in a list. 9 | #' 10 | #' @references This code is inspired by Noah Smith's work. 
11 | #' @docType package 12 | #' @name TextRegression 13 | #' @aliases TextRegression package-TextRegression 14 | #' @examples 15 | #' library('TextRegression') 16 | #' 17 | #' library('TextRegression') 18 | #' 19 | #' text <- c('saying text is good', 20 | #' 'saying text once and saying text twice is better', 21 | #' 'saying text text text is best', 22 | #' 'saying text once is still ok', 23 | #' 'not saying it at all is bad', 24 | #' 'because text is a good thing', 25 | #' 'we all like text', 26 | #' 'even though sometimes it is missing') 27 | #' 28 | #' y <- c(1, 2, 3, 1, 0, 1, 1, 0) 29 | #' 30 | #' results <- regress.text(text, y) 31 | #' 32 | #' print(results) 33 | NULL 34 | -------------------------------------------------------------------------------- /tests/1.R: -------------------------------------------------------------------------------- 1 | library('TextRegression') 2 | library('testthat') 3 | 4 | text <- c('this is text', 5 | 'this is more text', 6 | 'both contained some text', 7 | 'text is good', 8 | 'and more text is better', 9 | 'but endless text is best', 10 | 'one day we will have enough text', 11 | 'until then we can only hope', 12 | 'in the text valhalla there are no stopwords', 13 | 'and draughts of text flow from the castle walls') 14 | 15 | documents <- data.frame(Text = text) 16 | row.names(documents) <- 1:nrow(documents) 17 | 18 | corpus <- Corpus(DataframeSource(documents)) 19 | corpus <- tm_map(corpus, tolower) 20 | corpus <- tm_map(corpus, stripWhitespace) 21 | corpus <- tm_map(corpus, removeWords, stopwords('english')) 22 | 23 | dtm <- DocumentTermMatrix(corpus) 24 | 25 | x <- dtm.to.Matrix(dtm) 26 | 27 | for (i in 1:3) 28 | { 29 | set.seed(i) 30 | 31 | beta <- rnorm(ncol(x), 0, 10) 32 | beta[sample(1:ncol(x), ncol(x) - 1, replace = FALSE)] <- 0 33 | 34 | intercept <- 100 35 | 36 | y <- x %*% beta + intercept + rnorm(nrow(x), 0, 0.0001) 37 | 38 | results <- regress.text(text, y) 39 | errors <- abs(results$coefficients - c(intercept, 
beta)) 40 | 41 | print(paste(i, max(errors))) 42 | 43 | expect_that(max(errors) < 10, is_true()) 44 | expect_that(length(results$coefficients) == length(results$terms), is_true()) 45 | } 46 | # What else should be tested? 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DEPRECATION NOTICE 2 | 3 | This package is abandoned. No maintenance work will be done on it. 4 | 5 | # Introduction 6 | This repository contains the development version of the TextRegression 7 | package, which makes it easy to predict continuous outputs using text inputs. 8 | To get started, install the package using the instructions immediately below. 9 | Then you can try out the examples later on to learn how to use the package 10 | to solve your text analysis problems. 11 | 12 | Be warned: text regression can take a while, because the data needs to be 13 | resampled to set the hyperparameters used in the final regression analysis. 
14 | 15 | # Installation 16 | Use the code below inside an R session to install the TextRegression package: 17 | 18 | install.packages('devtools') 19 | library('devtools') 20 | install_github('TextRegression', username = 'johnmyleswhite') 21 | 22 | # Examples 23 | Try out this toy example to see how to perform a text regression: 24 | 25 | library('TextRegression') 26 | 27 | text <- c('saying text is good', 28 | 'saying text once and saying text twice is better', 29 | 'saying text text text is best', 30 | 'saying text once is still ok', 31 | 'not saying it at all is bad', 32 | 'because text is a good thing', 33 | 'we all like text', 34 | 'even though sometimes it is missing') 35 | 36 | y <- c(1, 2, 3, 1, 0, 1, 1, 0) 37 | 38 | results <- regress.text(text, y) 39 | 40 | print(results) 41 | -------------------------------------------------------------------------------- /man/regress.text.Rd: -------------------------------------------------------------------------------- 1 | \name{regress.text} 2 | \alias{regress.text} 3 | \title{Fit regularized regressions to text data given a corpus and outputs.} 4 | \usage{ 5 | regress.text(text, y, n.splits = 10, size = 0.8, 6 | standardizeCase = TRUE, stripSpace = TRUE, 7 | removeStopwords = TRUE) 8 | } 9 | \arguments{ 10 | \item{text}{A character vector containing the documents 11 | for analysis.} 12 | 13 | \item{y}{A numeric vector of outputs associated with the 14 | documents.} 15 | 16 | \item{n.splits}{How many resampling steps should be used 17 | to set lambda?} 18 | 19 | \item{size}{How much of the data should be used during 20 | resampling for model fitting?} 21 | 22 | \item{standardizeCase}{Should all of the text be 23 | standardized on lowercase?} 24 | 25 | \item{stripSpace}{Should all whitespace be stripped from 26 | the text?} 27 | 28 | \item{removeStopwords}{Should tm's list of English 29 | stopwords be pulled out of the text?} 30 | } 31 | \value{ 32 | A list containing regression coefficients, the terms used 33 | with 
those coefficients, the value of lambda used for 34 | model assessment, and an estimate of the RMSE associated 35 | with that model. 36 | } 37 | \description{ 38 | This function will fit regularized regressions to text 39 | data given a corpus and outputs. 40 | } 41 | \examples{ 42 | library('TextRegression') 43 | 44 | text <- c('saying text is good', 45 | 'saying text once and saying text twice is better', 46 | 'saying text text text is best', 47 | 'saying text once is still ok', 48 | 'not saying it at all is bad', 49 | 'because text is a good thing', 50 | 'we all like text', 51 | 'even though sometimes it is missing') 52 | 53 | y <- c(1, 2, 3, 1, 0, 1, 1, 0) 54 | 55 | results <- regress.text(text, y) 56 | 57 | print(results) 58 | } 59 | 60 | -------------------------------------------------------------------------------- /R/regress.text.R: -------------------------------------------------------------------------------- 1 | #' Fit regularized regressions to text data given a corpus and outputs. 2 | #' 3 | #' This function will fit regularized regressions to text data given 4 | #' a corpus and outputs. 5 | #' 6 | #' @param text A character vector containing the documents for analysis. 7 | #' @param y A numeric vector of outputs associated with the documents. 8 | #' @param n.splits How many resampling steps should be used to set lambda? 9 | #' @param size How much of the data should be used during resampling for model fitting? 10 | #' @param standardizeCase Should all of the text be standardized on lowercase? 11 | #' @param stripSpace Should all whitespace be stripped from the text? 12 | #' @param removeStopwords Should tm's list of English stopwords be pulled out of the text? 13 | #' 14 | #' @return A list containing regression coefficients, the terms used with those coefficients, the value of lambda used for model assessment, and an estimate of the RMSE associated with that model. 
#'
#' @export
#'
#' @examples
#' library('TextRegression')
#'
#' text <- c('saying text is good',
#'           'saying text once and saying text twice is better',
#'           'saying text text text is best',
#'           'saying text once is still ok',
#'           'not saying it at all is bad',
#'           'because text is a good thing',
#'           'we all like text',
#'           'even though sometimes it is missing')
#'
#' y <- c(1, 2, 3, 1, 0, 1, 1, 0)
#'
#' results <- regress.text(text, y)
#'
#' print(results)
regress.text <- function(text,
                         y,
                         n.splits = 10,
                         size = 0.8,
                         standardizeCase = TRUE,
                         stripSpace = TRUE,
                         removeStopwords = TRUE)
{
  # Build a one-column data frame so tm's DataframeSource can ingest the
  # raw character vector as a corpus.
  documents <- data.frame(Text = text)
  row.names(documents) <- seq_len(nrow(documents))

  corpus <- Corpus(DataframeSource(documents))

  # Optional, user-controlled text preprocessing steps.
  if (standardizeCase)
  {
    corpus <- tm_map(corpus, tolower)
  }
  if (stripSpace)
  {
    corpus <- tm_map(corpus, stripWhitespace)
  }
  if (removeStopwords)
  {
    corpus <- tm_map(corpus, removeWords, stopwords('english'))
  }

  dtm <- DocumentTermMatrix(corpus)

  x <- dtm.to.Matrix(dtm)

  # glmnet requires y to be a plain vector, not a 1-column matrix.
  y <- as.vector(y)

  # Fit once on the full data to obtain the lambda path that will be
  # assessed by resampling below; this fit is also reused at the end to
  # extract the final coefficients.
  regularized.fit <- glmnet(x, y)

  lambdas <- regularized.fit$lambda

  # Accumulate one data frame per split and bind them once at the end;
  # rbind()-ing onto a growing data frame inside a double loop is
  # quadratic in the number of (split, lambda) pairs.
  split.results <- vector('list', n.splits)

  for (i in seq_len(n.splits))
  {
    indices <- sample(1:nrow(x), round(size * nrow(x)))

    training.x <- x[indices, ]
    training.y <- y[indices]
    test.x <- x[-indices, ]
    test.y <- y[-indices]

    # Fit the full regularization path once per split. The original code
    # refit this identical model inside the per-lambda loop, repeating the
    # same work length(lambdas) times without changing the results (the
    # fit does not depend on the lambda being evaluated, and glmnet
    # consumes no RNG, so the resampling stream is unchanged).
    resampling.fit <- glmnet(training.x, training.y)

    # Held-out RMSE for every candidate lambda on this split.
    rmses <- vapply(lambdas,
                    function(lambda)
                    {
                      predicted.y <- as.numeric(predict(resampling.fit,
                                                        newx = test.x,
                                                        s = lambda))
                      sqrt(mean((predicted.y - test.y) ^ 2))
                    },
                    numeric(1))

    split.results[[i]] <- data.frame(Split = i,
                                     Lambda = lambdas,
                                     RMSE = rmses)
  }

  performance <- do.call(rbind, split.results)

  # Average RMSE over splits for each candidate lambda.
  mean.rmse <- ddply(performance,
                     'Lambda',
                     function (df)
                     {
                       with(df, data.frame(RMSE = mean(RMSE)))
                     })

  # Among lambdas tied for the minimum mean RMSE, prefer the largest
  # (i.e. most regularized) one.
  optimal.lambda <- with(mean.rmse, max(Lambda[which(RMSE == min(RMSE))]))
  optimal.rmse <- with(subset(mean.rmse, Lambda == optimal.lambda), RMSE)

  # Extract coefficients from the full-data fit at the chosen lambda.
  coefficients <- as.numeric(coef(regularized.fit, s = optimal.lambda)[, 1])
  terms <- c('(Intercept)', colnames(dtm))

  return(list(coefficients = coefficients,
              terms = terms,
              lambda = optimal.lambda,
              rmse = optimal.rmse))
}