├── NAMESPACE ├── TextRegression_0.1-3.tar.gz ├── upload.sh ├── ChangeLog ├── TODO.md ├── man ├── dtm.to.Matrix.Rd ├── TextRegression.Rd └── regress.text.Rd ├── R ├── dtm.to.Matrix.R ├── help.R └── regress.text.R ├── DESCRIPTION ├── tests └── 1.R └── README.md /NAMESPACE: -------------------------------------------------------------------------------- 1 | export(dtm.to.Matrix) 2 | export(regress.text) 3 | -------------------------------------------------------------------------------- /TextRegression_0.1-3.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnmyleswhite/TextRegression/HEAD/TextRegression_0.1-3.tar.gz -------------------------------------------------------------------------------- /upload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ftp ftp://cran.r-project.org 4 | # cd incoming 5 | # bin 6 | # put TextRegression_*.tar.gz 7 | # exit 8 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | 2011-10-30 John Myles White 2 | 3 | * v0.1-1 4 | * First draft of the TextRegression package, which automates text regression analyses. 5 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | * Improve devtools usage. 2 | * Allow greater configuration of analysis. 3 | * Expand on returned outputs. 4 | * Use heuristics to control cross-validation. 5 | 6 | * Figure out why glmnet used to tolerate y being a matrix and now requires a vector. 7 | * Add mechanism to predict from novel text. 
#' Transform a tm-style DTM into a sparse Matrix.
#'
#' Transform a DTM produced by the tm package into a sparse Matrix for use
#' with the glmnet package.
#'
#' @param dtm A document term matrix of class 'DocumentTermMatrix'. Any
#'   simple-triplet-style list with integer slots \code{i}, \code{j},
#'   numeric \code{v}, and dimensions \code{nrow}, \code{ncol} is accepted.
#'
#' @return A sparse matrix (of class 'Matrix') representation of the DTM.
#'
#' @export
#'
#' @examples
#' \dontrun{dtm.to.Matrix(dtm)}
dtm.to.Matrix <- function(dtm)
{
  # A DocumentTermMatrix is already stored in triplet form (i, j, v), so
  # construct the sparse matrix in a single call instead of assigning one
  # entry at a time: sub-assignment into a sparse Matrix rebuilds its
  # internal column-compressed structure on every write, which is
  # quadratic in the number of non-zero entries. This form also handles
  # an empty DTM correctly, where the old `for (index in 1:length(dtm$i))`
  # loop ran with the out-of-bounds indices 1 and 0.
  # (A valid DTM has no duplicated (i, j) pairs, so sparseMatrix's
  # sum-duplicates behavior cannot change the result.)
  sparseMatrix(i = dtm$i,
               j = dtm$j,
               x = dtm$v,
               dims = c(dtm$nrow, dtm$ncol))
}
5 | Version: 0.1-3 6 | Date: 2012-05-12 7 | Author: John Myles White 8 | Maintainer: John Myles White 9 | Description: Predict continuous valued outputs associated 10 | with text documents. The input corpus of text documents 11 | is transformed into a document-term matrix (DTM) and 12 | then a regularized linear regression is fit that uses 13 | this matrix as predictors to predict the continuous 14 | valued output. The corpus's terms, coefficients for all 15 | terms and an estimate of the model's predictive power 16 | are returned in a list. 17 | License: Artistic-2.0 18 | LazyLoad: yes 19 | Suggests: testthat 20 | Depends: 21 | tm, 22 | Matrix, 23 | glmnet, 24 | plyr 25 | Collate: 26 | 'dtm.to.Matrix.R' 27 | 'help.R' 28 | 'regress.text.R' 29 | -------------------------------------------------------------------------------- /man/TextRegression.Rd: -------------------------------------------------------------------------------- 1 | \docType{package} 2 | \name{TextRegression} 3 | \alias{"package-TextRegression"} 4 | \alias{TextRegression} 5 | \title{Predict continuous valued outputs associated with text documents.} 6 | \description{ 7 | Predict continuous valued outputs associated with text 8 | documents. The input corpus of text documents is 9 | transformed into a document-term matrix (DTM) and then a 10 | regularized linear regression is fit that uses this 11 | matrix as predictors to predict the continuous valued 12 | output. The corpus's terms, coefficients for all terms 13 | and an estimate of the model's predictive power are 14 | returned in a list. 
15 | } 16 | \examples{ 17 | library('TextRegression') 18 | 19 | library('TextRegression') 20 | 21 | text <- c('saying text is good', 22 | 'saying text once and saying text twice is better', 23 | 'saying text text text is best', 24 | 'saying text once is still ok', 25 | 'not saying it at all is bad', 26 | 'because text is a good thing', 27 | 'we all like text', 28 | 'even though sometimes it is missing') 29 | 30 | y <- c(1, 2, 3, 1, 0, 1, 1, 0) 31 | 32 | results <- regress.text(text, y) 33 | 34 | print(results) 35 | } 36 | \references{ 37 | This code is inspired by Noah Smith's work. 38 | } 39 | 40 | -------------------------------------------------------------------------------- /R/help.R: -------------------------------------------------------------------------------- 1 | #' Predict continuous valued outputs associated with text documents. 2 | #' 3 | #' Predict continuous valued outputs associated with text documents. The input 4 | #' corpus of text documents is transformed into a document-term matrix (DTM) 5 | #' and then a regularized linear regression is fit that uses this matrix as 6 | #' predictors to predict the continuous valued output. The corpus's terms, 7 | #' coefficients for all terms and an estimate of the model's predictive 8 | #' power are returned in a list. 9 | #' 10 | #' @references This code is inspired by Noah Smith's work. 
11 | #' @docType package 12 | #' @name TextRegression 13 | #' @aliases TextRegression package-TextRegression 14 | #' @examples 15 | #' library('TextRegression') 16 | #' 17 | #' library('TextRegression') 18 | #' 19 | #' text <- c('saying text is good', 20 | #' 'saying text once and saying text twice is better', 21 | #' 'saying text text text is best', 22 | #' 'saying text once is still ok', 23 | #' 'not saying it at all is bad', 24 | #' 'because text is a good thing', 25 | #' 'we all like text', 26 | #' 'even though sometimes it is missing') 27 | #' 28 | #' y <- c(1, 2, 3, 1, 0, 1, 1, 0) 29 | #' 30 | #' results <- regress.text(text, y) 31 | #' 32 | #' print(results) 33 | NULL 34 | -------------------------------------------------------------------------------- /tests/1.R: -------------------------------------------------------------------------------- 1 | library('TextRegression') 2 | library('testthat') 3 | 4 | text <- c('this is text', 5 | 'this is more text', 6 | 'both contained some text', 7 | 'text is good', 8 | 'and more text is better', 9 | 'but endless text is best', 10 | 'one day we will have enough text', 11 | 'until then we can only hope', 12 | 'in the text valhalla there are no stopwords', 13 | 'and draughts of text flow from the castle walls') 14 | 15 | documents <- data.frame(Text = text) 16 | row.names(documents) <- 1:nrow(documents) 17 | 18 | corpus <- Corpus(DataframeSource(documents)) 19 | corpus <- tm_map(corpus, tolower) 20 | corpus <- tm_map(corpus, stripWhitespace) 21 | corpus <- tm_map(corpus, removeWords, stopwords('english')) 22 | 23 | dtm <- DocumentTermMatrix(corpus) 24 | 25 | x <- dtm.to.Matrix(dtm) 26 | 27 | for (i in 1:3) 28 | { 29 | set.seed(i) 30 | 31 | beta <- rnorm(ncol(x), 0, 10) 32 | beta[sample(1:ncol(x), ncol(x) - 1, replace = FALSE)] <- 0 33 | 34 | intercept <- 100 35 | 36 | y <- x %*% beta + intercept + rnorm(nrow(x), 0, 0.0001) 37 | 38 | results <- regress.text(text, y) 39 | errors <- abs(results$coefficients - c(intercept, 
beta)) 40 | 41 | print(paste(i, max(errors))) 42 | 43 | expect_that(max(errors) < 10, is_true()) 44 | expect_that(length(results$coefficients) == length(results$terms), is_true()) 45 | } 46 | # What else should be tested? 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DEPRECATION NOTICE 2 | 3 | This package is abandoned. No maintenance work will be done on it. 4 | 5 | # Introduction 6 | This repository contains the development version of the TextRegression 7 | package, which makes it easy to predict continuous outputs using text inputs. 8 | To get started, install the package using the instructions immediately below. 9 | Then you can try out the examples later on to learn how to use the package 10 | to solve your text analysis problems. 11 | 12 | Be warned: text regression can take a while, because the data needs to be 13 | resampled to set the hyperparameters used in the final regression analysis. 
14 | 15 | # Installation 16 | Use the code below inside an R session to install the TextRegression package: 17 | 18 | install.packages('devtools') 19 | library('devtools') 20 | install_github('TextRegression', username = 'johnmyleswhite') 21 | 22 | # Examples 23 | Try out this toy example to see how to perform a text regression: 24 | 25 | library('TextRegression') 26 | 27 | text <- c('saying text is good', 28 | 'saying text once and saying text twice is better', 29 | 'saying text text text is best', 30 | 'saying text once is still ok', 31 | 'not saying it at all is bad', 32 | 'because text is a good thing', 33 | 'we all like text', 34 | 'even though sometimes it is missing') 35 | 36 | y <- c(1, 2, 3, 1, 0, 1, 1, 0) 37 | 38 | results <- regress.text(text, y) 39 | 40 | print(results) 41 | -------------------------------------------------------------------------------- /man/regress.text.Rd: -------------------------------------------------------------------------------- 1 | \name{regress.text} 2 | \alias{regress.text} 3 | \title{Fit regularized regressions to text data given a corpus and outputs.} 4 | \usage{ 5 | regress.text(text, y, n.splits = 10, size = 0.8, 6 | standardizeCase = TRUE, stripSpace = TRUE, 7 | removeStopwords = TRUE) 8 | } 9 | \arguments{ 10 | \item{text}{A character vector containing the documents 11 | for analysis.} 12 | 13 | \item{y}{A numeric vector of outputs associated with the 14 | documents.} 15 | 16 | \item{n.splits}{How many resampling steps should be used 17 | to set lambda?} 18 | 19 | \item{size}{How much of the data should be used during 20 | resampling for model fitting?} 21 | 22 | \item{standardizeCase}{Should all of the text be 23 | standardized on lowercase?} 24 | 25 | \item{stripSpace}{Should all whitespace be stripped from 26 | the text?} 27 | 28 | \item{removeStopwords}{Should tm's list of English 29 | stopwords be pulled out of the text?} 30 | } 31 | \value{ 32 | A list containing regression coefficients, the terms used 33 | with 
those coefficients, the value of lambda used for 34 | model assessment, and an estimate of the RMSE associated 35 | with that model. 36 | } 37 | \description{ 38 | This function will fit regularized regressions to text 39 | data given a corpus and outputs. 40 | } 41 | \examples{ 42 | library('TextRegression') 43 | 44 | text <- c('saying text is good', 45 | 'saying text once and saying text twice is better', 46 | 'saying text text text is best', 47 | 'saying text once is still ok', 48 | 'not saying it at all is bad', 49 | 'because text is a good thing', 50 | 'we all like text', 51 | 'even though sometimes it is missing') 52 | 53 | y <- c(1, 2, 3, 1, 0, 1, 1, 0) 54 | 55 | results <- regress.text(text, y) 56 | 57 | print(results) 58 | } 59 | 60 | -------------------------------------------------------------------------------- /R/regress.text.R: -------------------------------------------------------------------------------- 1 | #' Fit regularized regressions to text data given a corpus and outputs. 2 | #' 3 | #' This function will fit regularized regressions to text data given 4 | #' a corpus and outputs. 5 | #' 6 | #' @param text A character vector containing the documents for analysis. 7 | #' @param y A numeric vector of outputs associated with the documents. 8 | #' @param n.splits How many resampling steps should be used to set lambda? 9 | #' @param size How much of the data should be used during resampling for model fitting? 10 | #' @param standardizeCase Should all of the text be standardized on lowercase? 11 | #' @param stripSpace Should all whitespace be stripped from the text? 12 | #' @param removeStopwords Should tm's list of English stopwords be pulled out of the text? 13 | #' 14 | #' @return A list containing regression coefficients, the terms used with those coefficients, the value of lambda used for model assessment, and an estimate of the RMSE associated with that model. 
#'
#' @export
#'
#' @examples
#' library('TextRegression')
#'
#' text <- c('saying text is good',
#'           'saying text once and saying text twice is better',
#'           'saying text text text is best',
#'           'saying text once is still ok',
#'           'not saying it at all is bad',
#'           'because text is a good thing',
#'           'we all like text',
#'           'even though sometimes it is missing')
#'
#' y <- c(1, 2, 3, 1, 0, 1, 1, 0)
#'
#' results <- regress.text(text, y)
#'
#' print(results)
regress.text <- function(text,
                         y,
                         n.splits = 10,
                         size = 0.8,
                         standardizeCase = TRUE,
                         stripSpace = TRUE,
                         removeStopwords = TRUE)
{
  # Build a one-column data frame so tm's DataframeSource can ingest the
  # raw character vector as a corpus.
  documents <- data.frame(Text = text)
  row.names(documents) <- seq_len(nrow(documents))

  corpus <- Corpus(DataframeSource(documents))

  # Optional, user-controlled text preprocessing steps.
  if (standardizeCase)
  {
    corpus <- tm_map(corpus, tolower)
  }
  if (stripSpace)
  {
    corpus <- tm_map(corpus, stripWhitespace)
  }
  if (removeStopwords)
  {
    corpus <- tm_map(corpus, removeWords, stopwords('english'))
  }

  dtm <- DocumentTermMatrix(corpus)

  x <- dtm.to.Matrix(dtm)

  # glmnet requires y to be a plain vector, not a 1-column matrix.
  y <- as.vector(y)

  # Fit once on the full data to obtain the lambda path that will be
  # assessed by resampling below; this fit is also reused at the end to
  # extract the final coefficients.
  regularized.fit <- glmnet(x, y)

  lambdas <- regularized.fit$lambda

  # Accumulate one data frame per split and bind them once at the end;
  # rbind()-ing onto a growing data frame inside a double loop is
  # quadratic in the number of (split, lambda) pairs.
  split.results <- vector('list', n.splits)

  for (i in seq_len(n.splits))
  {
    indices <- sample(1:nrow(x), round(size * nrow(x)))

    training.x <- x[indices, ]
    training.y <- y[indices]
    test.x <- x[-indices, ]
    test.y <- y[-indices]

    # Fit the full regularization path once per split. The original code
    # refit this identical model inside the per-lambda loop, repeating the
    # same work length(lambdas) times without changing the results (the
    # fit does not depend on the lambda being evaluated, and glmnet
    # consumes no RNG, so the resampling stream is unchanged).
    resampling.fit <- glmnet(training.x, training.y)

    # Held-out RMSE for every candidate lambda on this split.
    rmses <- vapply(lambdas,
                    function(lambda)
                    {
                      predicted.y <- as.numeric(predict(resampling.fit,
                                                        newx = test.x,
                                                        s = lambda))
                      sqrt(mean((predicted.y - test.y) ^ 2))
                    },
                    numeric(1))

    split.results[[i]] <- data.frame(Split = i,
                                     Lambda = lambdas,
                                     RMSE = rmses)
  }

  performance <- do.call(rbind, split.results)

  # Average RMSE over splits for each candidate lambda.
  mean.rmse <- ddply(performance,
                     'Lambda',
                     function (df)
                     {
                       with(df, data.frame(RMSE = mean(RMSE)))
                     })

  # Among lambdas tied for the minimum mean RMSE, prefer the largest
  # (i.e. most regularized) one.
  optimal.lambda <- with(mean.rmse, max(Lambda[which(RMSE == min(RMSE))]))
  optimal.rmse <- with(subset(mean.rmse, Lambda == optimal.lambda), RMSE)

  # Extract coefficients from the full-data fit at the chosen lambda.
  coefficients <- as.numeric(coef(regularized.fit, s = optimal.lambda)[, 1])
  terms <- c('(Intercept)', colnames(dtm))

  return(list(coefficients = coefficients,
              terms = terms,
              lambda = optimal.lambda,
              rmse = optimal.rmse))
}