├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ └── test-coverage.yaml ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── NAMESPACE ├── NEWS.md ├── R ├── crossval.R ├── data.R ├── performance.R ├── quanteda.classifiers-package.R ├── textmodel_cnnlstmemb.R ├── textmodel_evaluate.R ├── textmodel_mlp.R ├── tokens2sequences.R └── zzz.R ├── README.Rmd ├── README.md ├── data ├── data_corpus_LMRD.rda └── data_corpus_manifestosentsUK.rda ├── inst └── WORDLIST ├── man ├── crossval.Rd ├── data_corpus_LMRD.Rd ├── data_corpus_manifestosentsUK.Rd ├── is.tokens2sequences.Rd ├── performance.Rd ├── predict.textmodel_cnnlstmemb.Rd ├── predict.textmodel_mlp.Rd ├── quanteda.classifiers-package.Rd ├── save.textmodel_mlp.Rd ├── summary.textmodel_cnnlstmemb.Rd ├── summary.textmodel_mlp.Rd ├── textmodel_cnnlstmemb.Rd ├── textmodel_evaluate.Rd ├── textmodel_mlp.Rd ├── tokens2sequences.Rd └── tokens2sequences_conform.Rd ├── quanteda.classifiers.Rproj └── tests ├── data_creation ├── create_data_corpus_manifestosentUK.R ├── data_uk_immigration.zip ├── data_uk_manifestos_2015-2019.zip ├── data_uk_policyarea.zip └── master.sentences.Rdata ├── misc ├── test-LMRD.Rmd └── test-LMRD.nb.html ├── spelling.R ├── testthat.R └── testthat ├── test-performance.R ├── test-textmodel_cnnlstmemb.R ├── test-textmodel_evaluate.R ├── test-textmodel_mlp.R └── test-tokens2sequences.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^codecov\.yml$ 2 | ^appveyor\.yml$ 3 | ^\.travis\.yml$ 4 | ^.*\.Rproj$ 5 | ^\.Rproj\.user$ 6 | README.Rmd 7 | tests/misc/ 8 | \.github/ 9 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: 
-------------------------------------------------------------------------------- 1 | # For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. 2 | # https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | name: R-CMD-check 12 | 13 | jobs: 14 | R-CMD-check: 15 | runs-on: ${{ matrix.config.os }} 16 | 17 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 18 | 19 | strategy: 20 | fail-fast: false 21 | matrix: 22 | config: 23 | - {os: windows-latest, r: 'release'} 24 | - {os: macOS-latest, r: 'release'} 25 | - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} 26 | - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} 27 | 28 | env: 29 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 30 | RSPM: ${{ matrix.config.rspm }} 31 | 32 | steps: 33 | - uses: actions/checkout@v2 34 | 35 | - uses: r-lib/actions/setup-r@v2 36 | with: 37 | r-version: ${{ matrix.config.r }} 38 | 39 | - uses: r-lib/actions/setup-pandoc@v2 40 | 41 | - name: Query dependencies 42 | run: | 43 | install.packages('remotes') 44 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 45 | writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") 46 | shell: Rscript {0} 47 | 48 | - name: Cache R packages 49 | if: runner.os != 'Windows' 50 | uses: actions/cache@v1 51 | with: 52 | path: ${{ env.R_LIBS_USER }} 53 | key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} 54 | restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- 55 | 56 | - name: Install system dependencies 57 | if: runner.os == 'Linux' 58 | run: | 59 | while read -r cmd 60 | do 61 | eval sudo $cmd 62 | done < <(Rscript -e 
'cat(remotes::system_requirements("ubuntu", "20.04"), sep = "\n")') 63 | 64 | - name: Install dependencies 65 | run: | 66 | remotes::install_deps(dependencies = TRUE) 67 | remotes::install_cran("rcmdcheck") 68 | shell: Rscript {0} 69 | 70 | - name: Check 71 | env: 72 | _R_CHECK_CRAN_INCOMING_REMOTE_: false 73 | run: rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") 74 | shell: Rscript {0} 75 | 76 | - name: Upload check results 77 | if: failure() 78 | uses: actions/upload-artifact@main 79 | with: 80 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 81 | path: check 82 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | pull_request: 6 | branches: 7 | - master 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: macOS-latest 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - uses: r-lib/actions/setup-r@v2 18 | with: 19 | r-version: 'release' 20 | 21 | - uses: r-lib/actions/setup-pandoc@v2 22 | 23 | - name: Query dependencies 24 | run: | 25 | install.packages('remotes') 26 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 27 | shell: Rscript {0} 28 | 29 | - name: Cache R packages 30 | uses: actions/cache@v1 31 | with: 32 | path: ${{ env.R_LIBS_USER }} 33 | key: macOS-r-4.0-2-${{ hashFiles('.github/depends.Rds') }} 34 | restore-keys: macOS-r-4.0-2- 35 | 36 | - name: Install dependencies 37 | run: | 38 | install.packages(c("remotes")) 39 | remotes::install_deps(dependencies = TRUE) 40 | remotes::install_cran("covr") 41 | shell: Rscript {0} 42 | 43 | - name: Test coverage 44 | run: covr::codecov() 45 | shell: Rscript {0} 46 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | # Session Data files 5 | .RData 6 | # Example code in package build process 7 | *-Ex.R 8 | # Output files from R CMD build 9 | /*.tar.gz 10 | # Output files from R CMD check 11 | /*.Rcheck/ 12 | # RStudio files 13 | .Rproj.user/ 14 | # produced vignettes 15 | vignettes/*.html 16 | vignettes/*.pdf 17 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 18 | .httr-oauth 19 | # knitr and R markdown default cache directories 20 | /*_cache/ 21 | /cache/ 22 | # Temporary files created by R markdown 23 | *.utf8.md 24 | *.knit.md 25 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html 26 | rsconnect/ 27 | .Rproj.user 28 | quanteda.svm.Rproj 29 | .DS_Store 30 | 31 | tests/misc/test-LMRD.md 32 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | language: R 3 | 4 | dist: xenial 5 | sudo: false 6 | 7 | cache: 8 | packages: true 9 | directories: 10 | - $HOME/.keras 11 | - $HOME/.cache/pip 12 | 13 | warnings_are_errors: true 14 | 15 | matrix: 16 | include: 17 | - name: "Backend: TensorFlow | Implementation: Keras" 18 | env: 19 | - KERAS_BACKEND="tensorflow" 20 | - KERAS_IMPLEMENTATION="tensorflow" 21 | - TENSORFLOW_VERSION="default" 22 | 23 | before_script: 24 | - sudo apt-get update 25 | - sudo apt-get install python3 python3-pip 26 | - pip3 install --upgrade --ignore-installed --user travis virtualenv 27 | - R CMD INSTALL . 
28 | - R -e 'Sys.setenv(PIP_QUIET=1); keras::install_keras(tensorflow = Sys.getenv("TENSORFLOW_VERSION"), extra_packages = "IPython")' 29 | - R -e 'tensorflow::tf_config()' 30 | 31 | after_success: 32 | - Rscript -e 'covr::codecov()' 33 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: quanteda.classifiers 2 | Type: Package 3 | Title: Models for supervised text classification 4 | Version: 0.4 5 | Authors@R: c(person("Kenneth", "Benoit", email = "kbenoit@lse.ac.uk", role = c("aut", "cre")), 6 | person("Patrick", "Chester", email = "pjc468@nyu.edu", role = c("aut")), 7 | person("Stefan", "Müller", email = "mueller@ipz.uzh.ch", role = c("aut"))) 8 | Description: A test package for developing additional text classifier models (textmodels) including neural network-based methods. 9 | Encoding: UTF-8 10 | License: GPL-3 11 | Depends: 12 | R (>= 3.5.0) 13 | Imports: 14 | quanteda (>= 2.0), 15 | keras (>= 2.1.0), 16 | quanteda.textmodels, 17 | groupdata2 18 | Suggests: 19 | spelling, 20 | testthat, 21 | covr 22 | URL: https://github.com/quanteda/quanteda.classifiers 23 | BugReports: https://github.com/quanteda/quanteda.classifiers/issues 24 | LazyData: TRUE 25 | LazyDataCompression: bzip2 26 | RoxygenNote: 7.2.3 27 | Language: en-GB 28 | Roxygen: list(markdown = TRUE) 29 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(accuracy,default) 4 | S3method(accuracy,table) 5 | S3method(balanced_accuracy,default) 6 | S3method(balanced_accuracy,list) 7 | S3method(balanced_accuracy,table) 8 | S3method(crossval,textmodel) 9 | S3method(f1_score,default) 10 | S3method(f1_score,table) 11 | S3method(head,textmodel_evaluate) 12 | 
S3method(load,textmodel_cnnlstmemb) 13 | S3method(load,textmodel_mlp) 14 | S3method(performance,default) 15 | S3method(performance,table) 16 | S3method(precision,default) 17 | S3method(precision,table) 18 | S3method(predict,textmodel_cnnlstmemb) 19 | S3method(predict,textmodel_mlp) 20 | S3method(print,predict.textmodel_cnnlstmemb) 21 | S3method(print,predict.textmodel_mlp) 22 | S3method(print,textmodel_cnnlstmemb) 23 | S3method(print,textmodel_evaluate) 24 | S3method(print,textmodel_mlp) 25 | S3method(print,tokens2sequences) 26 | S3method(recall,default) 27 | S3method(recall,table) 28 | S3method(save,textmodel_cnnlstmemb) 29 | S3method(save,textmodel_mlp) 30 | S3method(summary,textmodel_cnnlstmemb) 31 | S3method(summary,textmodel_mlp) 32 | S3method(textmodel_cnnlstmemb,tokens) 33 | S3method(textmodel_cnnlstmemb,tokens2sequences) 34 | S3method(textmodel_evaluate,dfm) 35 | S3method(textmodel_evaluate,tokens) 36 | S3method(textmodel_mlp,dfm) 37 | S3method(tokens2sequences,character) 38 | S3method(tokens2sequences,tokens) 39 | S3method(tokens2sequences,tokens2sequences) 40 | S3method(tokens2sequences_conform,tokens2sequences) 41 | export(accuracy) 42 | export(balanced_accuracy) 43 | export(crossval) 44 | export(f1_score) 45 | export(is.tokens2sequences) 46 | export(performance) 47 | export(precision) 48 | export(recall) 49 | export(textmodel_cnnlstmemb) 50 | export(textmodel_evaluate) 51 | export(textmodel_mlp) 52 | export(tokens2sequences) 53 | export(tokens2sequences_conform) 54 | import(quanteda) 55 | import(quanteda.textmodels) 56 | importFrom(groupdata2,fold) 57 | importFrom(keras,bidirectional) 58 | importFrom(keras,compile) 59 | importFrom(keras,fit) 60 | importFrom(keras,keras_model_sequential) 61 | importFrom(keras,layer_activation) 62 | importFrom(keras,layer_conv_1d) 63 | importFrom(keras,layer_dense) 64 | importFrom(keras,layer_dropout) 65 | importFrom(keras,layer_embedding) 66 | importFrom(keras,layer_lstm) 67 | importFrom(keras,layer_max_pooling_1d) 68 | 
importFrom(keras,predict_classes) 69 | importFrom(keras,predict_proba) 70 | importFrom(keras,serialize_model) 71 | importFrom(keras,to_categorical) 72 | importFrom(keras,unserialize_model) 73 | importFrom(quanteda,dfm) 74 | importFrom(quanteda,dfm_subset) 75 | importFrom(stats,na.omit) 76 | importFrom(stats,predict) 77 | importFrom(utils,head) 78 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # quanteda.classifiers 0.4 2 | 3 | * Fixed a performance issue in `crossval()`; now it's much improved in speed. 4 | * Added a `NEWS.md` file to track changes to the package. 5 | -------------------------------------------------------------------------------- /R/crossval.R: -------------------------------------------------------------------------------- 1 | #' Cross-validate a fitted textmodel 2 | #' 3 | #' Cross-validate a fitted textmodel using _k_-fold cross-validation. 
4 | #' @param x a fitted textmodel 5 | #' @param k number of folds 6 | #' @inheritParams performance 7 | #' @param verbose logical; if `TRUE`, output results to the console 8 | #' @export 9 | #' @examples 10 | #' library("quanteda") 11 | #' library("quanteda.textmodels") 12 | #' dfmat <- tokens(data_corpus_moviereviews) |> 13 | #' dfm() 14 | #' tmod <- textmodel_nb(dfmat, y = data_corpus_moviereviews$sentiment) 15 | #' crossval(tmod, k = 5, by_class = TRUE) 16 | #' crossval(tmod, k = 5, by_class = FALSE) 17 | #' crossval(tmod, k = 5, by_class = FALSE, verbose = TRUE) 18 | crossval <- function(x, k = 5, by_class = FALSE, verbose = FALSE) { 19 | UseMethod("crossval") 20 | } 21 | 22 | #' @importFrom groupdata2 fold 23 | #' @importFrom quanteda dfm dfm_subset 24 | #' @import quanteda.textmodels 25 | #' @export 26 | crossval.textmodel <- function(x, k = 5, by_class = FALSE, verbose = FALSE) { 27 | # create folds vector - many ways to do this, I chose something available 28 | folds <- fold(data.frame(doc_id = docnames(x)), k = k)[[".folds"]] 29 | 30 | # result list (could be a df, I'm old-fashioned though) 31 | results <- list() 32 | 33 | # loop across folds and refit model, add to results list 34 | for (i in seq_len(k)) { 35 | this_mod <- do.call(class(x)[1], 36 | args = list(x = dfm_subset(x$x, folds != i), 37 | y = x$y[folds != i])) 38 | this_pred <- predict(this_mod, newdata = dfm_subset(x$x, folds == i), 39 | type = "class") 40 | results <- c(results, 41 | structure(list(c(performance(this_pred, x$y[folds == i]), 42 | list(obs = split(seq_len(ndoc(x)), folds)[[i]]))), 43 | names = paste0("fold_", i))) 44 | } 45 | 46 | summ <- summarize_results(results) 47 | 48 | # this may not be the "correct" way to do it - here it averages across 49 | # class-specific averages. Should we average across classes first within 50 | # folds and then average across folds? 
51 | if (!by_class) 52 | summ <- apply(summ, 2, mean) 53 | 54 | if (verbose) { 55 | cat("Cross-validation:\n\nMean results for k =", k, "folds:\n\n") 56 | print(summ) 57 | invisible(summ) 58 | } else { 59 | summ 60 | } 61 | } 62 | 63 | # old-skool function to aggregate across a 3-D array 64 | summarize_results <- function(x) { 65 | # remove the "obs" 66 | x <- lapply(x, function(y) y[-which(names(y) == "obs")]) 67 | 68 | # make into a 3D array 69 | x_df <- lapply(x, data.frame) 70 | x_array <- array(unlist(x_df), dim = c(dim(x_df[[1]]), length(x_df)), 71 | dimnames = c(dimnames(x_df[[1]]), list(names(x_df)))) 72 | 73 | apply(x_array, c(1, 2), mean) 74 | } 75 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' Sentence-level corpus of UK party manifestos 1945--2019, partially annotated 2 | #' 3 | #' @description A text corpus of sentences from publicly available party manifestos from the 4 | #' United Kingdom, published between 1945 and 2019. Some manifesto sentences 5 | #' have been rated in terms of the direction of policy using crowd-sourced coders. 6 | #' 7 | #' @description The manifestos from the 8 | #' three main parties (Labour Party, Conservatives, Liberal Democrats) between 9 | #' 1987 and 2010 have been labelled as Economic Policy, Social 10 | #' Policy, or Other, and rated in terms of the direction of Economic Policy and 11 | #' Social Policy. All party 12 | #' manifestos from the 2010 General Election have been crowd-coded in terms of 13 | #' immigration policy, and the direction of immigration policy. For more 14 | #' information on the coding approach see 15 | #' [Benoit et al. (2016)](https://doi.org/10.1017/S0003055416000058). 16 | #' 17 | #' @description The 18 | #' corpus contains the aggregated crowd coding values on the level of sentences.
19 | #' Note that the segmentation into sentences does not always work correctly due 20 | #' to missing punctuation. See Examples for how to remove very short and very 21 | #' long sentences using [quanteda::corpus_trim()]. 22 | #' 23 | #' @format 24 | #' The corpus consists of 88,954 documents (i.e. sentences) and includes the following 25 | #' document-level variables: \describe{ 26 | #' \item{party}{factor; abbreviation of the party that wrote the manifesto.} 27 | #' \item{partyname}{factor; party that wrote the manifesto.} 28 | #' \item{year}{integer; 4-digit year of the election.} 29 | #' \item{crowd_econsocial_label}{factor; indicates the majority label assigned 30 | #' by crowd workers (Economic Policy, Social Policy, or Neither). The variable 31 | #' has missing values (`NA`) for all non-annotated manifestos.} 32 | #' \item{crowd_econsocial_mean}{numeric; the direction of statements coded as 33 | #' "Economic Policy" or "Social Policy" based on the aggregated crowd codings. 34 | #' The variable is the mean of the scores assigned by the workers who 35 | #' coded the sentence and who allocated the sentence to the "majority" 36 | #' category. The variable ranges from -2 to +2. 37 | #' 38 | #' For the statements aggregated as "Economic Policy", -2 corresponds to "Very 39 | #' left"; +2 corresponds to "Very right". For the statements aggregated as 40 | #' "Social Policy", -2 corresponds to "Very liberal"; +2 corresponds to "Very 41 | #' conservative". The variable has missing values (`NA`) for all sentences that 42 | #' were aggregated as "Neither" and for all non-annotated manifestos.} 43 | #' \item{crowd_econsocial_n}{integer; the number of coders who contributed to the 44 | #' mean score `crowd_econsocial_mean`.} 45 | #' \item{crowd_immigration_label}{Factor indicating whether the majority of 46 | #' crowd workers labelled a sentence as referring to immigration or not.
The 47 | #' variable has missing values (`NA`) for all non-annotated manifestos.} 48 | #' \item{crowd_immigration_mean}{numeric; the direction 49 | #' of statements coded as "Immigration" based on the aggregated crowd codings. 50 | #' The variable is the mean of the scores assigned by workers who coded a 51 | #' sentence and who allocated the sentence to the "Immigration" category. The 52 | #' variable ranges from -1 ("Negative and closed immigration policy") to +1 53 | #' (Favorable and open immigration policy). The variable has missing values 54 | #' (`NA`) for all non-annotated manifestos or if a sentence was not coded as 55 | #' referring to immigration policy based on the aggregation of crowd codings.} 56 | #' \item{crowd_immigration_n}{integer; the number of coders who 57 | #' contributed to the 58 | #' mean score `crowd_immigration_mean`.} 59 | #' } 60 | #' @examples 61 | #' \donttest{ 62 | #' library("quanteda") 63 | #' 64 | #' # keep only crowd coded manifestos (with respect to economic and social policy) 65 | #' corp_crowdeconsocial <- 66 | #' corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_econsocial_label)) 67 | #' 68 | #' # keep only crowd coded manifestos (with respect to immigration policy) 69 | #' corp_crowdimmig <- 70 | #' corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label)) 71 | #' } 72 | #' @references Benoit, K., Conway, D., Lauderdale, B.E., Laver, M., & Mikhaylov, S. (2016). 73 | #' [Crowd-sourced Text Analysis: 74 | #' Reproducible and Agile Production of Political Data](https://doi.org/10.1017/S0003055416000058). 75 | #' *American Political Science Review*, 100,(2), 278--295. 76 | #' @format 77 | #' A [corpus][quanteda::corpus] object. 78 | #' @keywords data 79 | "data_corpus_manifestosentsUK" 80 | 81 | #' Large Movie Review Dataset from Maas et. al. 
(2011) 82 | #' 83 | #' A corpus object containing a dataset for sentiment classification containing 84 | #' 25,000 highly polar movie reviews for training, and 25,000 for testing, from 85 | #' Maas et al. (2011). 86 | #' @format The corpus docvars consist of: 87 | #' \describe{ 88 | #' \item{docnumber}{serial (within set and polarity) document number} 89 | #' \item{rating}{user-assigned movie rating on a 1-10 point integer scale} 90 | #' \item{set}{used for test v. training set} 91 | #' \item{polarity}{either `neg` or `pos` to indicate whether the 92 | #' movie review was negative or positive. See Maas et al. (2011) for the 93 | #' cut-off values that governed this assignment.} 94 | #' } 95 | #' @references Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew 96 | #' Y. Ng, and Christopher Potts. (2011). 97 | #' "[Learning 98 | #' Word Vectors for Sentiment Analysis](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf)". The 49th Annual Meeting of the 99 | #' Association for Computational Linguistics (ACL 2011). 100 | #' @source <http://ai.stanford.edu/~amaas/data/sentiment/> 101 | #' @keywords data 102 | "data_corpus_LMRD" 103 | -------------------------------------------------------------------------------- /R/performance.R: -------------------------------------------------------------------------------- 1 | # overall performance ---------- 2 | 3 | #' Performance statistics for prediction 4 | #' 5 | #' @description Functions for computing performance statistics used for model 6 | #' evaluation.
7 | #' 8 | #' @description `performance()` computes all of the following, which are also 9 | #' available via specific functions: 10 | #' 11 | #' Given a 2 x 2 table with notation 12 | #' 13 | #' \tabular{rcc}{ \tab Truth \tab \cr Predicted \tab Positive \tab 14 | #' Negative \cr Positive \tab _A_ \tab _B_ \cr Negative \tab _C_ \tab _D_ \cr } 15 | #' 16 | #' The metrics computed here are: 17 | #' \itemize{ 18 | #' \item{precision: }{\eqn{A / (A + B)}} 19 | #' \item{recall: }{\eqn{A / (A + C)}} 20 | #' \item{_F1_: }{\eqn{2 / (recall^{-1} + precision^{-1})}} 21 | #' \item{accuracy: }{\eqn{(A + D) / (A + B + C + D)}, or correctly predicted / all} 22 | #' \item{balanced_accuracy: }{mean(recall) for all categories} 23 | #' } 24 | #' @param data a table of predicted by truth, or vector of predicted labels 25 | #' @param truth vector of "true" labels, or if a table, `2` to indicate that the 26 | #' "true" values are in columns, or `1` if in rows. 27 | #' @param by_class logical; if `TRUE`, estimate performance score separately for 28 | #' each class, otherwise average across classes 29 | #' @param ... not used 30 | #' @return named list consisting of the selected measure(s), where each element 31 | #' is a scalar if `by_class = FALSE`, or a vector named by class if `by_class 32 | #' = TRUE`. 33 | #' @references 34 | #' Powers, D. (2007). "Evaluation: From Precision, Recall and F Factor to ROC, 35 | #' Informedness, Markedness and Correlation." _Technical Report SIE-07-001_, 36 | #' Flinders University. 
37 | #' @examples 38 | #' ## Data in Table 2 of Powers (2007) 39 | #' 40 | #' lvs <- c("Relevant", "Irrelevant") 41 | #' tbl_2_1_pred <- factor(rep(lvs, times = c(42, 58)), levels = lvs) 42 | #' tbl_2_1_truth <- factor(c(rep(lvs, times = c(30, 12)), 43 | #' rep(lvs, times = c(30, 28))), 44 | #' levels = lvs) 45 | #' 46 | #' performance(tbl_2_1_pred, tbl_2_1_truth) 47 | #' performance(tbl_2_1_pred, tbl_2_1_truth, by_class = FALSE) 48 | #' performance(table(tbl_2_1_pred, tbl_2_1_truth), by_class = TRUE) 49 | #' 50 | #' @export 51 | performance <- function(data, truth, by_class = TRUE, ...) { 52 | UseMethod("performance") 53 | } 54 | 55 | #' @export 56 | performance.default <- function(data, truth, by_class = TRUE, ...) { 57 | performance(build_table(data, truth), by_class = by_class) 58 | } 59 | 60 | #' @export 61 | performance.table <- function(data, truth = 2, by_class = TRUE, ...) { 62 | data <- check_table(data, truth) 63 | result <- as.list(c(precision(data, by_class = by_class), 64 | recall(data, by_class = by_class), 65 | accuracy(data), 66 | balanced_accuracy(data), 67 | f1_score(data, by_class = by_class))) 68 | result[c("precision", "recall", "f1", "accuracy", "balanced_accuracy")] 69 | } 70 | 71 | 72 | # precision, recall, f1 ---------- 73 | 74 | #' @rdname performance 75 | #' @export 76 | #' @examples 77 | #' precision(tbl_2_1_pred, tbl_2_1_truth) 78 | #' 79 | precision <- function(data, truth, by_class = TRUE, ...) { 80 | UseMethod("precision") 81 | } 82 | 83 | #' @export 84 | precision.default <- function(data, truth, by_class = TRUE, ...) { 85 | precision(build_table(data, truth), by_class = by_class) 86 | } 87 | 88 | #' @export 89 | precision.table <- function(data, truth = 2, by_class = TRUE, ...) 
{ 90 | data <- check_table(data, truth) 91 | prec <- sapply(seq_along(diag(data)), 92 | function(x) diag(data)[x] / sum(data[x, ])) 93 | prec <- list(precision = prec) 94 | if (by_class) prec else sapply(prec, mean) 95 | } 96 | 97 | 98 | #' @rdname performance 99 | #' @export 100 | #' @examples 101 | #' recall(tbl_2_1_pred, tbl_2_1_truth) 102 | #' 103 | recall <- function(data, truth, by_class = TRUE, ...) { 104 | UseMethod("recall") 105 | } 106 | 107 | #' @export 108 | recall.default <- function(data, truth, by_class = TRUE, ...) { 109 | recall(build_table(data, truth), by_class = by_class) 110 | } 111 | 112 | #' @export 113 | recall.table <- function(data, truth = 2, by_class = TRUE, ...) { 114 | data <- check_table(data, truth) 115 | prec <- sapply(seq_along(diag(data)), 116 | function(x) diag(data)[x] / sum(data[, x])) 117 | prec <- list(recall = prec) 118 | if (by_class) prec else sapply(prec, mean) 119 | } 120 | 121 | #' @rdname performance 122 | #' @export 123 | #' @examples 124 | #' f1_score(tbl_2_1_pred, tbl_2_1_truth) 125 | #' 126 | f1_score <- function(data, truth, by_class = TRUE, ...) { 127 | UseMethod("f1_score") 128 | } 129 | 130 | #' @export 131 | f1_score.default <- function(data, truth, by_class = TRUE, ...) { 132 | f1_score(build_table(data, truth), by_class = by_class) 133 | } 134 | 135 | #' @export 136 | f1_score.table <- function(data, truth = 2, by_class = TRUE, ...) { 137 | data <- check_table(data, truth) 138 | pr <- data.frame(precision = precision(data, by_class = TRUE)[[1]], 139 | recall = recall(data, by_class = TRUE)[[1]]) 140 | #f1_score(pr) 141 | f1 <- list(f1 = apply(pr[c("precision", "recall")], 1, 142 | function(y) 2 / sum(y^(-1)))) 143 | if (by_class) f1 else sapply(f1, mean) 144 | } 145 | 146 | # #' @export 147 | # f1_score.list <- function(data, ...) 
{ 148 | # if (!all(c("precision", "recall") %in% names(data))) 149 | # stop("list must contain both precision and recall") 150 | # pr <- list(precision = precision(data, by_class = by_class), 151 | # recall = recall(data, by_class = by_class)) 152 | # 153 | # result <- list(f1 = apply(data.frame(data[c("precision", "recall")]), 1, 154 | # function(y) 2 / sum(y^(-1)))) 155 | # if (length(result[[1]]) == 1) result[[1]] <- unname(result[[1]]) 156 | # result 157 | #} 158 | 159 | # accuracy ---------- 160 | 161 | #' @rdname performance 162 | #' @export 163 | #' @examples 164 | #' accuracy(tbl_2_1_pred, tbl_2_1_truth) 165 | #' 166 | accuracy <- function(data, truth, ...) { 167 | UseMethod("accuracy") 168 | } 169 | 170 | #' @export 171 | accuracy.default <- function(data, truth, ...) { 172 | accuracy(build_table(data, truth)) 173 | } 174 | 175 | #' @export 176 | accuracy.table <- function(data, truth = 2, ...) { 177 | data <- check_table(data, truth) 178 | list(accuracy = sum(diag(data)) / sum(data)) 179 | } 180 | 181 | #' @rdname performance 182 | #' @export 183 | #' @examples 184 | #' balanced_accuracy(tbl_2_1_pred, tbl_2_1_truth) 185 | #' 186 | balanced_accuracy <- function(data, ...) { 187 | UseMethod("balanced_accuracy") 188 | } 189 | 190 | #' @export 191 | balanced_accuracy.default <- function(data, truth, by_class = TRUE, ...) { 192 | balanced_accuracy(build_table(data, truth)) 193 | } 194 | 195 | #' @export 196 | balanced_accuracy.table <- function(data, truth = 2, ...) { 197 | data <- check_table(data, truth) 198 | rec <- recall(data, by_class = TRUE) 199 | balanced_accuracy(rec) 200 | } 201 | 202 | #' @export 203 | balanced_accuracy.list <- function(data, ...) { 204 | if (! 
"recall" %in% names(data)) 205 | stop("list must include recall") 206 | if (length(data[["recall"]]) < 2) 207 | stop("recall must be computed by class") 208 | list(balanced_accuracy = mean(unlist(data["recall"]))) 209 | } 210 | 211 | # utility functions ------------- 212 | 213 | check_table <- function(data, truth) { 214 | if (!truth %in% c(1, 2)) 215 | stop("truth must be 2 for columns or 1 for rows") 216 | if (!identical(rownames(data), colnames(data))) 217 | stop("predicted and truth values must have the same order and names") 218 | if (truth == 1) data <- t(data) 219 | data 220 | } 221 | 222 | build_table <- function(data, truth) { 223 | truth <- as.factor(truth) 224 | data <- factor(data, levels = levels(truth)) 225 | table(data, truth) 226 | } 227 | -------------------------------------------------------------------------------- /R/quanteda.classifiers-package.R: -------------------------------------------------------------------------------- 1 | #' quanteda.classifiers 2 | #' 3 | #' Extensions to \pkg{quanteda} that provide supervised machine learning models 4 | #' for document-feature matrices. 5 | #' @import quanteda.textmodels 6 | #' @import quanteda 7 | #' @keywords internal 8 | "_PACKAGE" 9 | 10 | # The following block is used by usethis to automatically manage 11 | # roxygen namespace tags. Modify with care! 12 | ## usethis namespace: start 13 | ## usethis namespace: end 14 | NULL 15 | -------------------------------------------------------------------------------- /R/textmodel_cnnlstmemb.R: -------------------------------------------------------------------------------- 1 | #' \[Experimental\] Convolutional NN + LSTM model fitted to word embeddings 2 | #' 3 | #' A function that combines a convolutional neural network layer with a long 4 | #' short-term memory layer. It is designed to incorporate word sequences, 5 | #' represented as sequentially ordered word embeddings, into text 6 | #' classification. 
#' The model takes as an input a \pkg{quanteda} tokens object.
#'
#' @param x tokens object
#' @inheritParams quanteda.textmodels::textmodel_svm
#' @param dropout1 A floating variable bound between 0 and 1. It determines the
#'   rate at which units are dropped for the linear transformation of the
#'   inputs for the embedding layer.
#' @param dropout2 A floating variable bound between 0 and 1. It determines the
#'   rate at which units are dropped for the linear transformation of the
#'   inputs for the CNN layer.
#' @param dropout3 A floating variable bound between 0 and 1. It determines the
#'   rate at which units are dropped for the linear transformation of the
#'   inputs for the recurrent layer.
#' @param dropout4 A floating variable bound between 0 and 1. It determines the
#'   rate at which units are dropped for the linear transformation of the
#'   inputs for the recurrent layer.
#' @param wordembeddim The number of word embedding dimensions to be fit
#' @param cnnlayer A logical parameter that allows user to include or exclude a
#'   convolutional layer in the neural network model
#' @param filter The number of output filters in the convolution
#' @param kernel_size An integer or list of a single integer, specifying the
#'   length of the 1D convolution window
#' @param pool_size Size of the max pooling windows.
#'   [keras::layer_max_pooling_1d()]
#' @param units_lstm The number of nodes of the lstm layer
#' @param words The maximum number of words used to train model. Defaults to
#'   the number of features in `x`
#' @param maxsenlen The maximum sentence length of training data
#' @param optimizer optimizer used to fit model to training data, see
#'   [keras::compile.keras.engine.training.Model()]
#' @param loss objective loss function, see
#'   [keras::compile.keras.engine.training.Model()]
#' @param metrics metric used to train algorithm, see
#'   [keras::compile.keras.engine.training.Model()]
#' @param ... additional options passed to
#'   [keras::fit.keras.engine.training.Model()]
#' @keywords textmodel
#' @importFrom keras keras_model_sequential to_categorical
#' @importFrom keras layer_dense layer_activation layer_dropout compile fit
#' @importFrom keras layer_embedding layer_conv_1d layer_max_pooling_1d
#'   layer_lstm bidirectional
#' @seealso [save.textmodel_cnnlstmemb()], [load.textmodel_cnnlstmemb()]
#' @export
#' @examples
#' \dontrun{
#' # create dataset with evenly balanced coded & uncoded immigration sentences
#' corpcoded <- corpus_subset(data_corpus_manifestosentsUK,
#'                            !is.na(crowd_immigration_label))
#' corpuncoded <- data_corpus_manifestosentsUK %>%
#'     corpus_subset(is.na(crowd_immigration_label) & year > 1980) %>%
#'     corpus_sample(size = ndoc(corpcoded))
#' corp <- corpcoded + corpuncoded
#'
#' tok <- tokens(corp)
#'
#' tmod <- textmodel_cnnlstmemb(tok,
#'                              y = docvars(tok, "crowd_immigration_label"),
#'                              epochs = 5, verbose = 1)
#'
#' newdata = tokens_subset(tok, subset = is.na(crowd_immigration_label))
#' pred <- predict(tmod, newdata = newdata)
#' table(pred)
#' tail(texts(corpuncoded)[pred == "Immigration"], 10)
#'
#' }
textmodel_cnnlstmemb <- function(x, y, dropout1 = 0.2, dropout2 = 0.2,
                                 dropout3 = 0.2, dropout4 = 0.2,
                                 wordembeddim = 30, cnnlayer = TRUE,
                                 filter = 48, kernel_size = 5, pool_size = 4,
                                 units_lstm = 128, words = NULL,
                                 maxsenlen = 100, optimizer = "adam",
                                 loss = "categorical_crossentropy",
                                 metrics = "categorical_accuracy", ...) {
    UseMethod("textmodel_cnnlstmemb")
}

#' @export
textmodel_cnnlstmemb.tokens <- function(x, y, dropout1 = 0.2, dropout2 = 0.2,
                                        dropout3 = 0.2, dropout4 = 0.2,
                                        wordembeddim = 30, cnnlayer = TRUE,
                                        filter = 48, kernel_size = 5,
                                        pool_size = 4, units_lstm = 128,
                                        words = NULL, maxsenlen = 100,
                                        optimizer = "adam",
                                        loss = "categorical_crossentropy",
                                        metrics = "categorical_accuracy", ...) {
    stopifnot(ndoc(x) == length(y))
    stopifnot(is.tokens(x))
    y <- as.factor(y)
    result <- list(x = x, y = y, call = match.call(), classnames = levels(y))

    # drop documents with missing labels before fitting
    na_ind <- which(is.na(y))
    if (length(na_ind) > 0) {
        # BUG FIX: the keep mask must be built on the *original* length; the
        # previous code shortened y first, so the mask was one element short
        # per NA and misaligned with x.
        keep <- rep(TRUE, length(y))
        keep[na_ind] <- FALSE
        y <- y[-na_ind]
        # logical indexing as a workaround: negative indexing is broken for
        # quanteda v2 tokens objects
        x <- x[keep]
    }

    x <- tokens2sequences(x, maxsenlen = maxsenlen, keepn = words)

    if (is.null(words))
        words <- x$nfeatures
    # "one-hot" encode y
    y2 <- to_categorical(as.integer(y) - 1, num_classes = nlevels(y))

    # use keras to fit the model
    model <- keras_model_sequential()

    model %>%
        layer_embedding(input_dim = words + 1, output_dim = wordembeddim,
                        input_length = maxsenlen) %>%
        layer_dropout(rate = dropout1)

    if (isTRUE(cnnlayer)) {
        model %>%
            layer_conv_1d(filters = filter, kernel_size = kernel_size,
                          activation = "relu") %>%
            layer_max_pooling_1d(pool_size = pool_size) %>%
            layer_dropout(rate = dropout2)
    }

    model %>%
        bidirectional(layer_lstm(units = units_lstm, dropout = dropout3,
                                 recurrent_dropout = dropout4)) %>%
        layer_dense(units = nlevels(y), activation = "softmax")

    compile(model, loss = loss, optimizer = optimizer, metrics = metrics)
    history <- fit(model, x$matrix, y2, ...)

    # assemble, class, and return the result
    result <- c(result,
                nfeatures = x$nfeatures,
                maxsenlen = maxsenlen,
                list(clefitted = model))
    class(result) <- c("textmodel_cnnlstmemb", "textmodel", "list")
    result
}

#' @export
textmodel_cnnlstmemb.tokens2sequences <- function(x, y, dropout1 = 0.2,
                                                  dropout2 = 0.2,
                                                  dropout3 = 0.2,
                                                  dropout4 = 0.2,
                                                  wordembeddim = 30,
                                                  cnnlayer = TRUE, filter = 48,
                                                  kernel_size = 5,
                                                  pool_size = 4,
                                                  units_lstm = 128,
                                                  words = NULL,
                                                  maxsenlen = 100,
                                                  optimizer = "adam",
                                                  loss = "categorical_crossentropy",
                                                  metrics = "categorical_accuracy",
                                                  ...) {
    stopifnot(nrow(x$matrix) == length(y))
    stopifnot(is.tokens2sequences(x))
    x <- tokens2sequences(x, maxsenlen = maxsenlen, keepn = words)
    y <- as.factor(y)
    result <- list(x = x, y = y, call = match.call(), classnames = levels(y))

    # drop documents with missing labels before fitting
    na_ind <- which(is.na(y))
    if (length(na_ind) > 0) {
        # BUG FIX: as in the tokens method, build the keep mask before
        # shortening y so its length matches the rows of x$matrix
        keep <- rep(TRUE, length(y))
        keep[na_ind] <- FALSE
        y <- y[-na_ind]
        x$matrix <- x$matrix[keep, , drop = FALSE]
    }

    words <- x$nfeatures
    maxsenlen <- ncol(x$matrix)
    # "one-hot" encode y
    y2 <- to_categorical(as.integer(y) - 1, num_classes = nlevels(y))

    # use keras to fit the model
    model <- keras_model_sequential()

    model %>%
        layer_embedding(input_dim = words + 1, output_dim = wordembeddim,
                        input_length = maxsenlen) %>%
        layer_dropout(rate = dropout1)

    if (isTRUE(cnnlayer)) {
        model %>%
            layer_conv_1d(filters = filter, kernel_size = kernel_size,
                          activation = "relu") %>%
            layer_max_pooling_1d(pool_size = pool_size) %>%
            layer_dropout(rate = dropout2)
    }

    # NOTE(review): unlike the tokens method above, this method fits a
    # *unidirectional* LSTM (no bidirectional() wrapper); confirm whether the
    # asymmetry is intentional before unifying.
    model %>%
        layer_lstm(units = units_lstm, dropout = dropout3,
                   recurrent_dropout = dropout4) %>%
        layer_dense(units = nlevels(y), activation = "softmax")

    compile(model, loss = loss, optimizer = optimizer, metrics = metrics)
    history <- fit(model, x$matrix, y2, ...)

    # assemble, class, and return the result
    result <- c(result,
                nfeatures = x$nfeatures,
                maxsenlen = maxsenlen,
                list(clefitted = model))
    class(result) <- c("textmodel_cnnlstmemb", "textmodel", "list")
    result
}

#' Prediction from a fitted textmodel_cnnlstmemb object
#'
#' `predict.textmodel_cnnlstmemb()` implements class predictions from a
#' fitted long short-term memory neural network model.
#' @param object a fitted [textmodel_cnnlstmemb] model
#' @param newdata dfm on which prediction should be made
#' @param type the type of predicted values to be returned; see Value
#' @param force make `newdata`'s feature set conformant to the model terms
#' @param ... not used
#' @return `predict.textmodel_cnnlstmemb` returns either a vector of class
#'   predictions for each row of `newdata` (when `type = "class"`), or
#'   a document-by-class matrix of class probabilities (when `type =
#'   "probability"`).
#' @seealso [textmodel_cnnlstmemb()]
#' @keywords textmodel internal
#' @importFrom keras predict_classes predict_proba
#' @importFrom stats predict
#' @export
predict.textmodel_cnnlstmemb <- function(object, newdata = NULL,
                                         type = c("class", "probability"),
                                         force = TRUE,
                                         ...) {
    quanteda:::unused_dots(...)

    type <- match.arg(type)

    if (!is.null(newdata)) {
        if (is.tokens(newdata)) {
            data <- tokens2sequences(newdata, maxsenlen = object$maxsenlen,
                                     keepn = object$nfeatures)
        } else {
            data <- newdata
        }
        # conform new data to the feature mapping the model was trained on
        t2s_object <- tokens2sequences(object$x, maxsenlen = object$maxsenlen,
                                       keepn = object$nfeatures)
        data <- tokens2sequences_conform(data, t2s_object)
    } else {
        # no new data: predict on the training documents
        data <- tokens2sequences(object$x, maxsenlen = object$maxsenlen,
                                 keepn = object$nfeatures)
    }

    # NOTE(review): keras::predict_classes()/predict_proba() are deprecated in
    # recent keras/TensorFlow releases; consider migrating to predict() when
    # the package's minimum keras version allows.
    if (type == "class") {
        pred_y <- predict_classes(object$clefitted, x = data$matrix)
        pred_y <- factor(pred_y, labels = object$classnames,
                         levels = (seq_along(object$classnames) - 1))
        names(pred_y) <- rownames(data$matrix)
    } else if (type == "probability") {
        pred_y <- predict_proba(object$clefitted, x = data$matrix)
        colnames(pred_y) <- object$classnames
        rownames(pred_y) <- rownames(data$matrix)
    }

    pred_y
}

#' @export
#' @importFrom stats na.omit
#' @method print textmodel_cnnlstmemb
print.textmodel_cnnlstmemb <- function(x, ...) {
    layer_names <- gsub(pattern = "_\\d*", "",
                        lapply(x$clefitted$layers, function(z) z$name))
    cat("\nCall:\n")
    print(x$call)
    # BUG FIX: x$nfeatures is already a count; length(x$nfeatures) always
    # printed "1 fitted features"
    cat("\n",
        format(length(na.omit(x$y)), big.mark = ","), " training documents; ",
        format(x$nfeatures, big.mark = ","), " fitted features",
        ".\n",
        "Structure: ", paste(layer_names, collapse = " -> "), "\n",
        sep = "")
}

#' summary method for textmodel_cnnlstmemb objects
#' @param object output from [textmodel_cnnlstmemb()]
#' @param ... additional arguments not used
#' @keywords textmodel internal
#' @method summary textmodel_cnnlstmemb
#' @export
summary.textmodel_cnnlstmemb <- function(object, ...) {
    # strip keras's numeric suffixes ("dense_3" -> "dense") for readability
    layer_names <- gsub(pattern = "_\\d*", "",
                        lapply(object$clefitted$layers, function(lyr) lyr$name))

    result <- list(
        "call" = object$call,
        "model structure" = paste(layer_names, collapse = " -> ")
    )
    as.summary.textmodel(result)
}

#' @export
#' @method print predict.textmodel_cnnlstmemb
print.predict.textmodel_cnnlstmemb <- function(x, ...) {
    print(unclass(x))
}


#' @rdname save.textmodel_mlp
#' @importFrom keras serialize_model
#' @method save textmodel_cnnlstmemb
#' @export
save.textmodel_cnnlstmemb <- function(x, ...) {
    # keras models are external references; serialize before save()
    x$clefitted <- serialize_model(x$clefitted)
    save(x, ...)
}

#' @rdname save.textmodel_mlp
#' @importFrom keras unserialize_model
#' @method load textmodel_cnnlstmemb
#' @export
load.textmodel_cnnlstmemb <- function(x, ...) {
    # load() restores the object under the name it was saved with ("x"),
    # overwriting the path argument in this frame
    load(x, ...)
    x$clefitted <- unserialize_model(x$clefitted)
    # BUG FIX: return the restored textmodel; previously the function's value
    # was that of the assignment above, i.e. only the keras model
    x
}

# ---------------------------------------------------------------------------
# R/textmodel_evaluate.R

#' Model evaluation function
#'
#' Designed to streamline the parameter tuning and evaluation process. Users
#' choose a function to evaluate and include parameter values as a list. If
#' multiple parameter values are provided, the function will perform a grid
#' search by estimating a separate model for every combination of parameters.
#'
#' @param x the \link{dfm} or \link{tokens} object on which the model will be
#'   fit. Does not need to contain only the training documents.
#' @param y vector of training labels associated with each document identified
#'   in \code{train}. (These will be converted to factors if not already
#'   factors.)
#' @param model the name of the machine learning function that will be evaluated
#' @param fun the name of the function that will be used to evaluate the machine
#'   learning model. Can take the values "accuracy", "precision", "recall", or
#'   "f1_score"
#' @param k number of folds
#' @param parameters model hyperparameters
#' @param seed a seed that can allow for replication of k training data splits.
#'   If seed is not provided a seed is chosen based on the current time.
#' @param time a logical parameter that determines whether output will include
#'   training time (in seconds) of model
#' @param by_class estimates a separate value of provided evaluation function
#'   for every class of the true vector
#' @importFrom stats predict
#' @examples
#' # evaluate immigration classification performance
#' \dontrun{
#' dfmat <- dfm(data_corpus_manifestosentsUK)
#' codes <- docvars(data_corpus_manifestosentsUK, "crowd_immigration_label")
#' evaluation <- textmodel_evaluate(dfmat, codes, k = 3,
#'                                  model = "textmodel_mlp", fun = "f1_score",
#'                                  parameters = list(epochs = c(3, 4)))
#' head(evaluation)
#' aggregate(evaluation, by = list(evaluation$cost), FUN = "mean")
#' }
#' @export
#'
textmodel_evaluate <- function(x, y,
                               model,
                               fun = "f1_score",
                               k = 5,
                               parameters = list(),
                               seed = as.numeric(Sys.time()),
                               time = TRUE,
                               by_class = FALSE) {
    UseMethod("textmodel_evaluate")
}

# Shared k-fold evaluation engine used by both the dfm and tokens methods
# (previously duplicated wholesale in each method).
# slice: function(data, index) that subsets the documents of x; dfm objects
#   need d[i, ], tokens objects need d[i].  Negative indices are passed
#   through for the training split.
# NOTE(review): the `time` argument of the methods is currently not consulted;
# a "time" column is always emitted, as in the original implementation.
evaluate_folds <- function(x, y, model, fun, k, parameters, seed, by_class,
                           slice) {
    if ("accuracy" %in% fun && by_class) {
        cat("No class oriented accuracy score defined. Calculating average accuracy across all classes.\n")
    }
    total_start <- Sys.time()
    set.seed(seed)
    y <- as.factor(y)
    # assign each document to one of k folds, in random order
    folds <- cut(seq_len(length(y)), breaks = k, labels = FALSE)
    folds <- sample(folds, length(folds), replace = FALSE)
    output <- list()
    params_df <- expand.grid(parameters, stringsAsFactors = FALSE)
    param_len <- if (length(parameters) != 0) nrow(params_df) else 1
    w <- 1
    for (t in seq_len(param_len)) {
        # drop = FALSE ensures that params_df remains a data.frame even if
        # there is only a single input parameter
        param_list <- as.list(params_df[t, , drop = FALSE])
        for (i in seq_len(k)) {
            test_set <- which(folds == i)
            x_train <- slice(x, -test_set)
            y_train <- y[-test_set]
            x_test <- slice(x, test_set)
            y_test <- y[test_set]
            start <- Sys.time()
            mod <- do.call(what = model,
                           args = c(list(x = x_train, y = y_train), param_list))
            # training time in seconds, surfaced as a "time" column
            elapsed <- round(as.numeric(difftime(Sys.time(), start,
                                                 units = "secs")), 2)
            names(elapsed) <- "time"
            y_pred <- predict(mod, x_test)
            # accepts any evaluation function that takes predicted and test
            # vectors (plus by_class) as inputs
            met <- lapply(fun, function(f)
                do.call(what = f, args = list(y_pred, y_test, by_class)))
            if (is.null(names(met))) names(met) <- fun
            if (length(parameters) != 0) {
                output[[w]] <- data.frame(k = i, met, param_list,
                                          as.list(elapsed), seed)
            } else {
                output[[w]] <- data.frame(k = i, met, as.list(elapsed), seed)
            }
            if (by_class) {
                output[[w]]$class <- rownames(output[[w]])
            }
            w <- w + 1
        }
    }
    output <- do.call(rbind, output)
    rownames(output) <- NULL
    total_time <- round(as.numeric(difftime(Sys.time(), total_start,
                                            units = "secs")), 2)
    attr(output, "model") <- model
    attr(output, "fun") <- fun
    attr(output, "k") <- k
    attr(output, "parameters") <- parameters
    attr(output, "nparameters") <- param_len
    attr(output, "total_time") <- total_time
    class(output) <- c("textmodel_evaluate", "data.frame")
    output
}

#' @export
textmodel_evaluate.dfm <- function(x, y, model, fun = "f1_score", k = 5,
                                   parameters = list(),
                                   seed = as.numeric(Sys.time()),
                                   time = TRUE, by_class = FALSE) {
    stopifnot(is.dfm(x))
    # dfm documents are subset on rows
    evaluate_folds(x, y, model, fun, k, parameters, seed, by_class,
                   slice = function(d, i) d[i, ])
}

#' @export
textmodel_evaluate.tokens <- function(x, y, model, fun = "f1_score", k = 5,
                                      parameters = list(),
                                      seed = as.numeric(Sys.time()),
                                      time = TRUE, by_class = FALSE) {
    stopifnot(is.tokens(x))
    # tokens objects are subset with single-bracket indexing
    evaluate_folds(x, y, model, fun, k, parameters, seed, by_class,
                   slice = function(d, i) d[i])
}

#' @seealso [textmodel_evaluate()]
#' @importFrom utils head
#' @method head textmodel_evaluate
#' @export
#'
head.textmodel_evaluate <- function(x, n = 5, ...) {
    head(as.data.frame(x), n)
}

#' @seealso [textmodel_evaluate()]
#' @method print textmodel_evaluate
#' @export
print.textmodel_evaluate <- function(x, ...) {
    cat("Evaluation of", attr(x, "model"), "using the", attr(x, "fun"),
        "function.", "\n")
}

# ---------------------------------------------------------------------------
# R/textmodel_mlp.R

#' Multilayer perceptron network (MLP) model for text classification
#'
#' This function is a wrapper for a multilayer perceptron network model with a
#' single hidden layer network with two layers, implemented in the \pkg{keras}
#' package.
#' @inheritParams quanteda.textmodels::textmodel_svm
#' @param units The number of network nodes used in the first layer of the
#'   sequential model
#' @param dropout A floating variable bound between 0 and 1.
#'   It determines the rate at which units are dropped for the linear
#'   transformation of the inputs.
#' @param optimizer optimizer used to fit model to training data, see
#'   [keras::compile.keras.engine.training.Model()]
#' @param loss objective loss function, see
#'   [keras::compile.keras.engine.training.Model()]
#' @param metrics metric used to train algorithm, see
#'   [keras::compile.keras.engine.training.Model()]
#' @param ... additional options passed to
#'   [keras::fit.keras.engine.training.Model()]
#' @keywords textmodel
#' @importFrom keras keras_model_sequential to_categorical
#' @importFrom keras layer_dense layer_activation layer_dropout compile fit
#' @seealso [save.textmodel_mlp()], [load.textmodel_mlp()]
#' @export
#' @examples
#' \dontrun{
#' # create a dataset with evenly balanced coded and uncoded immigration sentences
#' corpcoded <- corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label))
#' corpuncoded <- data_corpus_manifestosentsUK %>%
#'     corpus_subset(is.na(crowd_immigration_label) & year > 1980) %>%
#'     corpus_sample(size = ndoc(corpcoded))
#' corp <- corpcoded + corpuncoded
#'
#' # form a tf-idf-weighted dfm
#' dfmat <- dfm(corp) %>%
#'     dfm_tfidf()
#'
#' set.seed(1000)
#' tmod <- textmodel_mlp(dfmat, y = docvars(dfmat, "crowd_immigration_label"),
#'                       epochs = 5, verbose = 1)
#' pred <- predict(tmod, newdata = dfm_subset(dfmat, is.na(crowd_immigration_label)))
#' table(pred)
#' tail(texts(corpuncoded)[pred == "Immigration"], 10)
#' }
textmodel_mlp <- function(x, y, units = 512, dropout = .2,
                          optimizer = "adam",
                          loss = "categorical_crossentropy",
                          metrics = "categorical_accuracy",
                          ...) {
    UseMethod("textmodel_mlp")
}

#' @export
textmodel_mlp.dfm <- function(x, y, units = 512, dropout = .2,
                              optimizer = "adam",
                              loss = "categorical_crossentropy",
                              metrics = "categorical_accuracy", ...) {
    stopifnot(ndoc(x) == length(y))

    x <- as.dfm(x)
    y <- as.factor(y)
    result <- list(x = x, y = y, call = match.call(), classnames = levels(y))

    # drop documents with missing labels before fitting
    na_ind <- which(is.na(y))
    if (length(na_ind) > 0) {
        # message(length(na_ind), "observations with the value 'NA' were removed.")
        y <- y[-na_ind]
        x <- x[-na_ind, ]
    }

    # "one-hot" encode y
    y2 <- to_categorical(as.integer(y) - 1, num_classes = nlevels(y))

    # use keras to fit the model: dense -> dropout -> softmax output
    model <- keras_model_sequential() %>%
        layer_dense(units = units, input_shape = nfeat(x),
                    activation = "relu") %>%
        layer_dropout(rate = dropout) %>%
        layer_dense(units = nlevels(y), activation = "softmax")
    compile(model, loss = loss, optimizer = optimizer, metrics = metrics)
    history <- fit(model, x, y2, ...)

    # assemble, class, and return the result
    result <- c(result, nfeatures = nfeat(x), list(seqfitted = model))
    class(result) <- c("textmodel_mlp", "textmodel", "list")
    result
}

#' Prediction from a fitted textmodel_mlp object
#'
#' `predict.textmodel_mlp()` implements class predictions from a fitted
#' multilayer perceptron network model.
#' @param object a fitted [textmodel_mlp] model
#' @param newdata dfm on which prediction should be made
#' @param type the type of predicted values to be returned; see Value
#' @param force make `newdata`'s feature set conformant to the model terms
#' @param ... not used
#' @return `predict.textmodel_mlp` returns either a vector of class
#'   predictions for each row of `newdata` (when `type = "class"`), or
#'   a document-by-class matrix of class probabilities (when `type =
#'   "probability"`).
#' @seealso [textmodel_mlp()]
#' @keywords textmodel internal
#' @importFrom keras predict_classes predict_proba
#' @importFrom stats predict
#' @export
predict.textmodel_mlp <- function(object, newdata = NULL,
                                  type = c("class", "probability"),
                                  force = TRUE,
                                  ...) {
    quanteda:::unused_dots(...)

    type <- match.arg(type)

    if (!is.null(newdata)) {
        data <- as.dfm(newdata)
    } else {
        data <- as.dfm(object$x)
    }
    model_featnames <- colnames(object$x)
    # when predicting on the training dfm the features match by construction,
    # so conformance warnings are noise and are suppressed
    data <- if (is.null(newdata)) {
        suppressWarnings(
            quanteda.textmodels:::force_conformance(data, model_featnames,
                                                    force))
    } else {
        quanteda.textmodels:::force_conformance(data, model_featnames, force)
    }

    # NOTE(review): keras::predict_classes()/predict_proba() are deprecated in
    # recent keras/TensorFlow releases; consider migrating to predict().
    if (type == "class") {
        pred_y <- predict_classes(object$seqfitted, x = data)
        pred_y <- factor(pred_y, labels = object$classnames,
                         levels = (seq_along(object$classnames) - 1))
        names(pred_y) <- docnames(data)
    } else if (type == "probability") {
        pred_y <- predict_proba(object$seqfitted, x = data)
        colnames(pred_y) <- object$classnames
        rownames(pred_y) <- docnames(data)
    }

    pred_y
}

#' @export
#' @importFrom stats na.omit
#' @method print textmodel_mlp
print.textmodel_mlp <- function(x, ...) {
    layer_names <- gsub(pattern = "_\\d*", "",
                        lapply(x$seqfitted$layers, function(z) z$name))
    cat("\nCall:\n")
    print(x$call)
    # BUG FIX: x$nfeatures is already a count; length(x$nfeatures) always
    # printed "1 fitted features"
    cat("\n",
        format(length(na.omit(x$y)), big.mark = ","), " training documents; ",
        format(x$nfeatures, big.mark = ","), " fitted features",
        ".\n",
        "Structure: ", paste(layer_names, collapse = " -> "), "\n",
        sep = "")
}

#' summary method for textmodel_mlp objects
#' @param object output from [textmodel_mlp()]
#' @param ... additional arguments not used
#' @keywords textmodel internal
#' @method summary textmodel_mlp
#' @export
summary.textmodel_mlp <- function(object, ...) {
    # strip keras's numeric suffixes ("dense_3" -> "dense") for readability
    layer_names <- gsub(pattern = "_\\d*", "",
                        lapply(object$seqfitted$layers, function(lyr) lyr$name))

    result <- list(
        "call" = object$call,
        "model structure" = paste(layer_names, collapse = " -> ")
    )
    as.summary.textmodel(result)
}

#' @export
#' @method print predict.textmodel_mlp
print.predict.textmodel_mlp <- function(x, ...) {
    print(unclass(x))
}

#' Load or save keras-based textmodels
#'
#' Functions for loading and saving \pkg{keras}-based models. Because these are
#' stored as references, they need to be "serialized" prior to saving, or
#' serialized upon loading. This applies to models fit using
#' [textmodel_cnnlstmemb()] and [textmodel_mlp()].
#' @param x a \pkg{keras}-based fitted textmodel
#' @param ... additional arguments passed to [save()] or [load()]
#' @importFrom keras serialize_model
#' @keywords internal
#' @export
#' @method save textmodel_mlp
save.textmodel_mlp <- function(x, ...) {
    # keras models are external references; serialize before save()
    x$seqfitted <- serialize_model(x$seqfitted)
    save(x, ...)
}

#' @rdname save.textmodel_mlp
#' @importFrom keras unserialize_model
#' @method load textmodel_mlp
#' @export
load.textmodel_mlp <- function(x, ...) {
    # load() restores the object under the name it was saved with ("x"),
    # overwriting the path argument in this frame
    load(x, ...)
    x$seqfitted <- unserialize_model(x$seqfitted)
    # BUG FIX: return the restored textmodel; previously the function's value
    # was that of the assignment above, i.e. only the keras model
    x
}

# ---------------------------------------------------------------------------
# R/tokens2sequences.R

#' \[Experimental\] Convert quanteda tokens to keras sequences
#'
#' This function converts a \pkg{quanteda} [quanteda::tokens()] object
#' into a tokens sequence object as expected by some functions in the
#' \pkg{keras} package.
#' @param x [quanteda::tokens()] object
#' @param maxsenlen the maximum sentence length kept in output matrix
#' @param keepn the maximum number of features to keep
#' @return [tokens2sequences()] The output matrix has a number of rows
#'   which represent each tokenized sentence input into the function and a
#'   number of columns determined by `maxsenlen`. The matrix contains a
#'   numeric code for every unique token kept (determined by `keepn`) and
#'   they are arranged in the same sequence indicated by the original
#'   [quanteda::tokens()] object.
#' @seealso [is.tokens2sequences()], [tokens2sequences_conform()]
#' @export
#' @examples
#' library("quanteda")
#' corp <- corpus_subset(data_corpus_inaugural, Year <= 1793)
#' corptok <- tokens(corp)
#' print(corp)
#' seqs <- tokens2sequences(corptok, maxsenlen = 200)
#' print(seqs)
tokens2sequences <- function(x, maxsenlen = 100, keepn = NULL) {
    UseMethod("tokens2sequences")
}

# Shared worker used by all tokens2sequences() methods (previously the same
# ~30 lines were duplicated in each method).
# ids:  list of integer vectors of original token ids, one per document;
#       rows of `feat` must be ordered so that row i has label1 == i, because
#       new labels are looked up positionally via feat$label[ids].
# feat: data.frame with columns features, label1 (original id), freq1
#       (corpus-wide frequency)
# Returns a "tokens2sequences" object: left-zero-padded id matrix, feature
# dictionary, and the proportions of dropped types/tokens.
t2s_build <- function(ids, feat, doc_nam, maxsenlen, keepn) {
    # truncate each document to the first maxsenlen tokens
    ids <- lapply(ids, function(d) if (length(d) > maxsenlen) d[1:maxsenlen] else d)
    # recount frequencies on the truncated sequences
    counts <- data.frame(table(unlist(ids)))
    names(counts) <- c("label1", "freq")
    feat <- merge(feat, counts, by = "label1", all.x = TRUE)
    # (merge() re-sorts by label1, so no pre-merge ordering is needed;
    # the original code's early sort on the partially-matched `data$freq`
    # column -- actually freq1 -- had no effect for the same reason)
    feat <- feat[order(feat$freq, decreasing = TRUE), ]
    feat$label <- NA
    n_avail <- sum(!is.na(feat$freq))
    if (!is.null(keepn)) {
        # never try to keep more features than actually occur
        if (keepn > n_avail) keepn <- n_avail
        feat$label[seq_len(keepn)] <- seq_len(keepn)
    } else {
        feat$label[seq_len(n_avail)] <- seq_len(n_avail)
    }
    # restore original id order so feat$label can be indexed positionally by
    # the original ids
    feat <- feat[order(feat$label1, decreasing = FALSE), ]
    # remap each document to the new frequency-based ids, dropping removed types
    ids <- lapply(ids, function(d) as.integer(na.omit(feat$label[d])))
    # left-pad with zeros so every row has exactly maxsenlen columns
    mat <- do.call("rbind", lapply(ids, function(d) {
        if (length(d) < maxsenlen)
            d <- c(rep(0L, times = maxsenlen - length(d)), d)
        d
    }))
    rownames(mat) <- doc_nam
    colnames(mat) <- as.character(seq_len(maxsenlen))
    dropped_tokens <- 1 - with(feat, sum(freq[!is.na(label)], na.rm = TRUE) /
                                   sum(freq1, na.rm = TRUE))
    dropped_types <- 1 - with(feat, length(na.omit(label)) /
                                  length(na.omit(label1)))
    # final dictionary: kept features only, ordered by new id
    feat <- feat[!is.na(feat$label), ]
    feat <- feat[order(feat$label, decreasing = FALSE),
                 c("features", "label", "freq")]
    rownames(feat) <- NULL
    output <- list(matrix = mat, nfeatures = nrow(feat), features = feat,
                   dropped_types = dropped_types,
                   dropped_tokens = dropped_tokens)
    class(output) <- "tokens2sequences"
    output
}

#' @export
tokens2sequences.tokens <- function(x, maxsenlen = 100, keepn = NULL) {
    stopifnot(is.tokens(x))
    # corpus-wide type frequencies, most common first
    tfeq <- sort(table(unlist(x)), decreasing = TRUE)
    doc_nam <- docnames(x)
    x <- unclass(x)                  # expose the underlying integer ids
    features <- attr(x, "types")
    feat <- data.frame(features = features,
                       label1 = seq_along(features),
                       freq1 = as.integer(tfeq[features]),
                       stringsAsFactors = FALSE)
    attributes(x) <- NULL
    t2s_build(x, feat, doc_nam, maxsenlen, keepn)
}

#' @export
tokens2sequences.character <- function(x, maxsenlen = 100, keepn = NULL) {
    stopifnot(is.character(x))
    # tokenize and delegate; previously this method duplicated the tokens
    # method line for line
    tokens2sequences(tokens(x), maxsenlen = maxsenlen, keepn = keepn)
}

#' @export
tokens2sequences.tokens2sequences <- function(x, maxsenlen = 100, keepn = NULL) {
    stopifnot(is.tokens2sequences(x))
    doc_nam <- rownames(x$matrix)
    feat <- x$features
    # the existing labels/frequencies become the "original" ids for rebuilding
    names(feat)[names(feat) %in% c("label", "freq")] <- c("label1", "freq1")
    m <- x$matrix
    # recover per-document id sequences, discarding the zero padding
    ids <- lapply(seq_len(nrow(m)), function(i) {
        row <- m[i, ]
        row[row != 0]
    })
    t2s_build(ids, feat, doc_nam, maxsenlen, keepn)
}

#' @seealso [tokens2sequences()]
#' @export
#' @importFrom utils head
#' @method print tokens2sequences
print.tokens2sequences <- function(x, removed = FALSE, ...)
{ 170 | # calculate % sparse 171 | zeros <- sum(colSums(x$matrix == 0)) 172 | tot <- nrow(x$matrix) * ncol(x$matrix) 173 | sparse_pct <- round(zeros / tot * 100, 1) 174 | 175 | # determine max number of features to print 176 | max_n <- ifelse(ncol(x$matrix) > 10, 10, ncol(x$matrix)) 177 | 178 | # output 179 | cat("Ordered feature matrix of: ", format(nrow(x$matrix), big.mark = ","), 180 | " documents, ", format(x$nfeatures, big.mark = ","), " features ", 181 | "(", sparse_pct, "% sparse).\n\n", sep = "") 182 | #cat(nrow(x$matrix), " x ", ncol(x$matrix), 183 | # " Matrix of class \"tokens2sequences\" \n\n", sep = "") 184 | if(removed) cat("Current parameter settings resulted in the removal of", round(100*x$dropped_types, 1), "percent of types\nand", round(100*x$dropped_tokens, 1), "percent of tokens.\n\n") 185 | print(head(x$matrix[, 1:max_n], 4)) 186 | } 187 | 188 | #' Match the feature names of one tokens2sequences object to another 189 | #' 190 | #' Converts the feature names of one tokens2sequences object to those of 191 | #' another. Useful in aligning training and test sets. 
192 | #' @param x [tokens2sequences()] object that will be forced to conform 193 | #' @param y [tokens2sequences()] object whose feature names will be 194 | #' used to change token labels for `x` 195 | #' @seealso [tokens2sequences()] 196 | #' @keywords internal 197 | #' @export 198 | #' @examples 199 | #' \dontrun{ 200 | #' corpcoded <- corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label)) 201 | #' corpuncoded <- data_corpus_manifestosentsUK %>% 202 | #' corpus_subset(is.na(crowd_immigration_label) & year > 1980) %>% 203 | #' corpus_sample(size = ndoc(corpcoded)) 204 | #' 205 | #' tokx <- tokens(corpuncoded) 206 | #' toky <- tokens(corpcoded) 207 | #' 208 | #' seqx <- tokens2sequences(tokx, maxsenlen = 50, keepn = 5000) 209 | #' seqy <- tokens2sequences(toky, maxsenlen = 50, keepn = 5000) 210 | #' tokens2sequences_conform(seqx, seqy) 211 | #' } 212 | tokens2sequences_conform <- function(x, y) { 213 | UseMethod("tokens2sequences_conform") 214 | } 215 | 216 | #' @export 217 | #' @importFrom stats na.omit 218 | tokens2sequences_conform.tokens2sequences <- function(x, y) { 219 | stopifnot(is.tokens2sequences(x) & is.tokens2sequences(y)) 220 | joint_feat <- merge(x$features, y$features[, -3], by = "features", 221 | all.x = TRUE) 222 | joint_feat <- joint_feat[order(joint_feat$label.x, decreasing = FALSE), ] 223 | mat <- apply(x$matrix, 1, 224 | function(x) as.integer(na.omit(joint_feat$label.y[x]))) 225 | mat <- do.call("rbind", lapply(mat, function(y) 226 | if (length(y) >= ncol(x$matrix)) 227 | y[1:ncol(x$matrix)] 228 | else 229 | c(rep(0, times = ncol(x$matrix) - length(y)), y) 230 | )) 231 | rownames(mat) <- rownames(x$matrix) 232 | colnames(mat) <- colnames(x$matrix) 233 | joint_feat <- joint_feat[, c("features", "label.y", "freq")] 234 | names(joint_feat)[2] <- "label" 235 | joint_feat <- joint_feat[order(joint_feat$label, decreasing = F), ] 236 | joint_feat <- joint_feat[!is.na(joint_feat$label), ] 237 | rownames(joint_feat) <- NULL 238 | 
239 | output <- 240 | list(matrix = mat, nfeatures = nrow(joint_feat), features = joint_feat) 241 | class(output) <- c("tokens2sequences") 242 | return(output) 243 | } 244 | 245 | #' Check to see if function is a tokens2sequences type 246 | #' 247 | #' 248 | #' @param x Object that will be checked to see if it is of the type [tokens2sequences()] 249 | #' @seealso [tokens2sequences()] 250 | #' @keywords internal 251 | #' @export 252 | is.tokens2sequences <- function(x) { 253 | "tokens2sequences" %in% class(x) 254 | } 255 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | .onAttach <- function(...) { 2 | options(keras.fit_verbose = 0) 3 | options(keras.view_metrics = FALSE) 4 | } 5 | 6 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | ```{r, echo = FALSE} 6 | knitr::opts_chunk$set( 7 | collapse = TRUE, 8 | comment = "##", 9 | fig.path = "man/images/" 10 | ) 11 | ``` 12 | 13 | # quanteda.classifiers: Text classification textmodel extensions for quanteda 14 | 15 | [![CRAN Version](https://www.r-pkg.org/badges/version/quanteda.classifiers)](https://CRAN.R-project.org/package=quanteda.classifiers) 16 | [![R build status](https://github.com/quanteda/quanteda.classifiers/workflows/R-CMD-check/badge.svg)](https://github.com/quanteda/quanteda.classifiers/actions) 17 | [![Coverage status](https://codecov.io/gh/quanteda/quanteda.classifiers/branch/master/graph/badge.svg)](https://codecov.io/github/quanteda/quanteda.classifiers?branch=master) 18 | [![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://www.tidyverse.org/lifecycle/#experimental) 19 | 20 | 21 | ## Installation 22 | 23 | To install this package, use the following, which 
also installs what the R **keras** package needs in order to run. 24 | ```{r eval = FALSE} 25 | # devtools package required to install quanteda from Github 26 | devtools::install_github("quanteda/quanteda.classifiers") 27 | 28 | keras::install_keras(method = "conda") 29 | ``` 30 | 31 | ## Available classifiers 32 | 33 | This package contains two experimental methods that are built on top of the **keras** package. (The SVM models have been moved to [**quanteda.textmodels**](https://github.com/quanteda/quanteda.textmodels).) 34 | 35 | Classifier | Command 36 | --|-- 37 | Multilayer perceptron network | `textmodel_mlp()` 38 | Convolutional neural network + LSTM model fitted to word embeddings | `textmodel_cnnlstmemb()` 39 | 40 | ## Available human-annotated corpora 41 | 42 | Corpus | Name 43 | --|-- 44 | Sentence-level corpus of UK party manifestos 1945–2019, partially annotated | `data_corpus_manifestosentsUK` 45 | Large Movie Review Dataset of 50,000 annotated highly polar movie reviews for training and testing, from Maas et al. (2011) | `data_corpus_LMRD` 46 | 47 | ## Demonstration 48 | 49 | See this (very preliminary!) [performance comparison](https://htmlpreview.github.io/?https://github.com/quanteda/quanteda.classifiers/blob/master/tests/misc/test-LMRD.nb.html). 50 | 51 | 52 | ## How to cite 53 | 54 | Benoit, Kenneth, Patrick Chester, and Stefan Müller (2019). quanteda.classifiers: Models for supervised text classification. R package version 0.2. URL: https://github.com/quanteda/quanteda.classifiers. 55 | 56 | For a BibTeX entry, use the output from citation(package = "quanteda.classifiers"). 57 | 58 | ## Issues 59 | 60 | * Please file an issue (with a bug, wish list, etc.) [via GitHub](https://github.com/quanteda/quanteda.classifiers/issues). 
61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # quanteda.classifiers: Text classification textmodel extensions for quanteda 3 | 4 | [![CRAN 5 | Version](https://www.r-pkg.org/badges/version/quanteda.classifiers)](https://CRAN.R-project.org/package=quanteda.classifiers) 6 | [![R build 7 | status](https://github.com/quanteda/quanteda.classifiers/workflows/R-CMD-check/badge.svg)](https://github.com/quanteda/quanteda.classifiers/actions) 8 | [![Coverage 9 | status](https://codecov.io/gh/quanteda/quanteda.classifiers/branch/master/graph/badge.svg)](https://codecov.io/github/quanteda/quanteda.classifiers?branch=master) 10 | [![Lifecycle: 11 | experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://www.tidyverse.org/lifecycle/#experimental) 12 | 13 | ## Installation 14 | 15 | To install this package, use the following, which also installs what the 16 | R **keras** package needs in order to run. 17 | 18 | ``` r 19 | # devtools package required to install quanteda from Github 20 | devtools::install_github("quanteda/quanteda.classifiers") 21 | 22 | keras::install_keras(method = "conda") 23 | ``` 24 | 25 | ## Available classifiers 26 | 27 | This package contains two experimental methods that are built on top of 28 | the **keras** package. (The SVM models have been moved to 29 | [**quanteda.textmodels**](https://github.com/quanteda/quanteda.textmodels).) 
30 | 31 | | Classifier | Command | 32 | |---------------------------------------------------------------------|--------------------------| 33 | | Multilevel perceptron network | `textmodel_mlp()` | 34 | | Convolutional neural network + LSTM model fitted to word embeddings | `textmodel_cnnlstmemb()` | 35 | 36 | ## Available human-annotated corpora 37 | 38 | | Corpus | Name | 39 | |------------------------------------------------------------------------------------------------------------------------------|--------------------------------| 40 | | Sentence-level corpus of UK party manifestos 1945–2019, partially annotated | `data_corpus_manifestosentsUK` | 41 | | Large Movie Review Dataset of 50,000 annotated highly polar movie reviews for training and testing, from Maas et. al. (2011) | `data_corpus_LMRD` | 42 | 43 | ## Demonstration 44 | 45 | See this (very preliminary!) [performance 46 | comparison](https://htmlpreview.github.io/?https://github.com/quanteda/quanteda.classifiers/blob/master/tests/misc/test-LMRD.nb.html). 47 | 48 | ## How to cite 49 | 50 | Benoit, Kenneth, Patrick Chester, and Stefan Müller (2019). 51 | quanteda.classifiers: Models for supervised text classification. R 52 | package version 0.2. URL: . 53 | 54 | For a BibTeX entry, use the output from citation(package = 55 | “quanteda.classifiers”). 56 | 57 | ## Issues 58 | 59 | - Please file an issue (with a bug, wish list, etc.) [via 60 | GitHub](https://github.com/quanteda/quanteda.classifiers/issues). 
61 | -------------------------------------------------------------------------------- /data/data_corpus_LMRD.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quanteda/quanteda.classifiers/3aabdb708468932c4bd5d1a3d3005ae3d653c07d/data/data_corpus_LMRD.rda -------------------------------------------------------------------------------- /data/data_corpus_manifestosentsUK.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quanteda/quanteda.classifiers/3aabdb708468932c4bd5d1a3d3005ae3d653c07d/data/data_corpus_manifestosentsUK.rda -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | ACL 2 | cnnlstmemb 3 | com 4 | Crowdflower 5 | dfm 6 | docvars 7 | etc 8 | Favorable 9 | github 10 | Informedness 11 | keras 12 | Lifecycle 13 | lstm 14 | LSTM 15 | Maas 16 | Mikhaylov 17 | mlp 18 | MLP 19 | NN 20 | perceptron 21 | Pham 22 | quanteda 23 | SIE 24 | svm 25 | textmodel 26 | textmodels 27 | th 28 | -------------------------------------------------------------------------------- /man/crossval.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/crossval.R 3 | \name{crossval} 4 | \alias{crossval} 5 | \title{Cross-validate a fitted textmodel} 6 | \usage{ 7 | crossval(x, k = 5, by_class = FALSE, verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{a fitted textmodel} 11 | 12 | \item{k}{number of folds} 13 | 14 | \item{by_class}{logical; if \code{TRUE}, estimate performance score separately for 15 | each class, otherwise average across classes} 16 | 17 | \item{verbose}{logical; if \code{TRUE}, output results to the console} 18 | } 19 | \description{ 20 | Cross-validate a fitted textmodel using 
\emph{k}-fold cross-validation. 21 | } 22 | \examples{ 23 | library("quanteda") 24 | library("quanteda.textmodels") 25 | dfmat <- tokens(data_corpus_moviereviews) |> 26 | dfm() 27 | tmod <- textmodel_nb(dfmat, y = data_corpus_moviereviews$sentiment) 28 | crossval(tmod, k = 5, by_class = TRUE) 29 | crossval(tmod, k = 5, by_class = FALSE) 30 | crossval(tmod, k = 5, by_class = FALSE, verbose = TRUE) 31 | } 32 | -------------------------------------------------------------------------------- /man/data_corpus_LMRD.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{data_corpus_LMRD} 5 | \alias{data_corpus_LMRD} 6 | \title{Large Movie Review Dataset from Maas et. al. (2011)} 7 | \format{ 8 | The corpus docvars consist of: 9 | \describe{ 10 | \item{docnumber}{serial (within set and polarity) document number} 11 | \item{rating}{user-assigned movie rating on a 1-10 point integer scale} 12 | \item{set}{used for test v. training set} 13 | \item{polarity}{either \code{neg} or \code{pos} to indicate whether the 14 | movie review was negative or positive. See Maas et al (2011) for the 15 | cut-off values that governed this assignment.} 16 | } 17 | } 18 | \source{ 19 | \url{http://ai.stanford.edu/~amaas/data/sentiment/} 20 | } 21 | \usage{ 22 | data_corpus_LMRD 23 | } 24 | \description{ 25 | A corpus object containing a dataset for sentiment classification containing 26 | 25,000 highly polar movie reviews for training, and 25,000 for testing, from 27 | Maas et. al. (2011). 28 | } 29 | \references{ 30 | Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew 31 | Y. Ng, and Christopher Potts. (2011). 32 | "\href{http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf}{Learning Word Vectors for Sentiment Analysis}". The 49th Annual Meeting of the 33 | Association for Computational Linguistics (ACL 2011). 
34 | } 35 | \keyword{data} 36 | -------------------------------------------------------------------------------- /man/data_corpus_manifestosentsUK.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{data_corpus_manifestosentsUK} 5 | \alias{data_corpus_manifestosentsUK} 6 | \title{Sentence-level corpus of UK party manifestos 1945--2019, partially annotated} 7 | \format{ 8 | The corpus consists of 88,954 documents (i.e. sentences) and includes the following 9 | document-level variables: \describe{ 10 | \item{party}{factor; abbreviation of the party that wrote the manifesto.} 11 | \item{partyname}{factor; party that wrote the manifesto.} 12 | \item{year}{integer; 4-digit year of the election.} 13 | \item{crowd_econsocial_label}{factor; indicates the majority label assigned 14 | by crowd workers (Economic Policy, Social Policy, or Neither). The variable 15 | has missing values (\code{NA}) for all non-annotated manifestos.} 16 | \item{crowd_econsocial_mean}{numeric; the direction of statements coded as 17 | "Economic Policy" or "Social Policy" based on the aggregated crowd codings. 18 | The variable is the mean of the scores assigned by the workers who 19 | coded the sentence and who allocated the sentence to the "majority" 20 | category. The variable ranges from -2 to +2. 21 | 22 | For the statements aggregated as "Economic Policy", -2 corresponds to "Very 23 | left"; +2 corresponds to "Very right". For the statements aggregated as 24 | "Social Policy", -2 corresponds to "Very liberal"; +2 corresponds to "Very 25 | conservative". 
The variable has missing values (NA) for all sentences that 26 | were aggregated as "Neither" and for all non-annotated manifestos.} 27 | \item{crowd_econsocial_n}{integer; the number of coders who contributed to the 28 | mean score \code{crowd_econsocial_mean}.} 29 | \item{crowd_immigration_label}{Factor indicating whether the majority of 30 | crowd workers labelled a sentence as referring to immigration or not. The 31 | variable has missing values (\code{NA}) for all non-annotated manifestos.} 32 | \item{crowd_immigration_mean}{numeric; the direction 33 | of statements coded as "Immigration" based on the aggregated crowd codings. 34 | The variable is the mean of the scores assigned by workers who coded a 35 | sentence and who allocated the sentence to the "Immigration" category. The 36 | variable ranges from -1 ("Negative and closed immigration policy") to +1 37 | ("Favorable and open immigration policy"). The variable has missing values 38 | (\code{NA}) for all non-annotated manifestos or if a sentence was not coded as 39 | referring to immigration policy based on the aggregation of crowd codings.} 40 | \item{crowd_immigration_n}{integer; the number of coders who 41 | contributed to the 42 | mean score \code{crowd_immigration_mean}.} 43 | } 44 | 45 | A \link[quanteda:corpus]{corpus} object. 46 | } 47 | \usage{ 48 | data_corpus_manifestosentsUK 49 | } 50 | \description{ 51 | A text corpus of sentences from publicly available party manifestos from the 52 | United Kingdom, published between 1945 and 2019. Some manifesto sentences 53 | have been rated in terms of the direction of policy using crowd-sourced coders. 54 | 55 | The manifestos from the 56 | three main parties (Labour Party, Conservatives, Liberal Democrats) between 57 | 1987 and 2010 have been labelled as Economic Policy, Social 58 | Policy, or Other, and rated in terms of the direction of Economic Policy and 59 | Social Policy. 
All party 60 | manifestos from the 2010 General Election have been crowd-coded in terms of 61 | immigration policy, and the direction of immigration policy. For more 62 | information on the coding approach see 63 | \href{https://doi.org/10.1017/S0003055416000058}{Benoit et al. (2016)}. 64 | 65 | The 66 | corpus contains the aggregated crowd coding values on the level of sentences. 67 | Note that the segmentation into sentences does not always work correctly due 68 | to missing punctuation. See Examples for how to remove very short and very 69 | long sentences using \code{\link[quanteda:corpus_trim]{quanteda::corpus_trim()}}. 70 | } 71 | \examples{ 72 | \donttest{ 73 | library("quanteda") 74 | 75 | # keep only crowd coded manifestos (with respect to economic and social policy) 76 | corp_crowdeconsocial <- 77 | corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_econsocial_label)) 78 | 79 | # keep only crowd coded manifestos (with respect to immigration policy) 80 | corp_crowdimmig <- 81 | corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label)) 82 | } 83 | } 84 | \references{ 85 | Benoit, K., Conway, D., Lauderdale, B.E., Laver, M., & Mikhaylov, S. (2016). 86 | \href{https://doi.org/10.1017/S0003055416000058}{Crowd-sourced Text Analysis: Reproducible and Agile Production of Political Data}. 87 | \emph{American Political Science Review}, 100,(2), 278--295. 
88 | } 89 | \keyword{data} 90 | -------------------------------------------------------------------------------- /man/is.tokens2sequences.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokens2sequences.R 3 | \name{is.tokens2sequences} 4 | \alias{is.tokens2sequences} 5 | \title{Check to see if function is a tokens2sequences type} 6 | \usage{ 7 | is.tokens2sequences(x) 8 | } 9 | \arguments{ 10 | \item{x}{Object that will be checked to see if it is of the type \code{\link[=tokens2sequences]{tokens2sequences()}}} 11 | } 12 | \description{ 13 | Check to see if function is a tokens2sequences type 14 | } 15 | \seealso{ 16 | \code{\link[=tokens2sequences]{tokens2sequences()}} 17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /man/performance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/performance.R 3 | \name{performance} 4 | \alias{performance} 5 | \alias{precision} 6 | \alias{recall} 7 | \alias{f1_score} 8 | \alias{accuracy} 9 | \alias{balanced_accuracy} 10 | \title{Performance statistics for prediction} 11 | \usage{ 12 | performance(data, truth, by_class = TRUE, ...) 13 | 14 | precision(data, truth, by_class = TRUE, ...) 15 | 16 | recall(data, truth, by_class = TRUE, ...) 17 | 18 | f1_score(data, truth, by_class = TRUE, ...) 19 | 20 | accuracy(data, truth, ...) 21 | 22 | balanced_accuracy(data, ...) 
23 | } 24 | \arguments{ 25 | \item{data}{a table of predicted by truth, or vector of predicted labels} 26 | 27 | \item{truth}{vector of "true" labels, or if a table, \code{2} to indicate that the 28 | "true" values are in columns, or \code{1} if in rows.} 29 | 30 | \item{by_class}{logical; if \code{TRUE}, estimate performance score separately for 31 | each class, otherwise average across classes} 32 | 33 | \item{...}{not used} 34 | } 35 | \value{ 36 | named list consisting of the selected measure(s), where each element 37 | is a scalar if \code{by_class = FALSE}, or a vector named by class if \code{by_class = TRUE}. 38 | } 39 | \description{ 40 | Functions for computing performance statistics used for model 41 | evaluation. 42 | 43 | \code{performance()} computes all of the following, which are also 44 | available via specific functions: 45 | 46 | Given a 2 x 2 table with notation 47 | 48 | \tabular{rcc}{ \tab Truth \tab \cr Predicted \tab Positive \tab 49 | Negative \cr Positive \tab \emph{A} \tab \emph{B} \cr Negative \tab \emph{C} \tab \emph{D} \cr } 50 | 51 | The metrics computed here are: 52 | \itemize{ 53 | \item{precision: }{\eqn{A / (A + B)}} 54 | \item{recall: }{\eqn{A / (A + C)}} 55 | \item{\emph{F1}: }{\eqn{2 / (recall^{-1} + precision^{-1})}} 56 | \item{accuracy: }{\eqn{(A + D) / (A + B + C + D)}, or correctly predicted / all} 57 | \item{balanced_accuracy: }{mean(recall) for all categories} 58 | } 59 | } 60 | \examples{ 61 | ## Data in Table 2 of Powers (2007) 62 | 63 | lvs <- c("Relevant", "Irrelevant") 64 | tbl_2_1_pred <- factor(rep(lvs, times = c(42, 58)), levels = lvs) 65 | tbl_2_1_truth <- factor(c(rep(lvs, times = c(30, 12)), 66 | rep(lvs, times = c(30, 28))), 67 | levels = lvs) 68 | 69 | performance(tbl_2_1_pred, tbl_2_1_truth) 70 | performance(tbl_2_1_pred, tbl_2_1_truth, by_class = FALSE) 71 | performance(table(tbl_2_1_pred, tbl_2_1_truth), by_class = TRUE) 72 | 73 | precision(tbl_2_1_pred, tbl_2_1_truth) 74 | 75 | recall(tbl_2_1_pred, 
tbl_2_1_truth) 76 | 77 | f1_score(tbl_2_1_pred, tbl_2_1_truth) 78 | 79 | accuracy(tbl_2_1_pred, tbl_2_1_truth) 80 | 81 | balanced_accuracy(tbl_2_1_pred, tbl_2_1_truth) 82 | 83 | } 84 | \references{ 85 | Powers, D. (2007). "Evaluation: From Precision, Recall and F Factor to ROC, 86 | Informedness, Markedness and Correlation." \emph{Technical Report SIE-07-001}, 87 | Flinders University. 88 | } 89 | -------------------------------------------------------------------------------- /man/predict.textmodel_cnnlstmemb.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel_cnnlstmemb.R 3 | \name{predict.textmodel_cnnlstmemb} 4 | \alias{predict.textmodel_cnnlstmemb} 5 | \title{Prediction from a fitted textmodel_cnnlstmemb object} 6 | \usage{ 7 | \method{predict}{textmodel_cnnlstmemb}( 8 | object, 9 | newdata = NULL, 10 | type = c("class", "probability"), 11 | force = TRUE, 12 | ... 13 | ) 14 | } 15 | \arguments{ 16 | \item{object}{a fitted \link{textmodel_cnnlstmemb} model} 17 | 18 | \item{newdata}{dfm on which prediction should be made} 19 | 20 | \item{type}{the type of predicted values to be returned; see Value} 21 | 22 | \item{force}{make \code{newdata}'s feature set conformant to the model terms} 23 | 24 | \item{...}{not used} 25 | } 26 | \value{ 27 | \code{predict.textmodel_cnnlstmemb} returns either a vector of class 28 | predictions for each row of \code{newdata} (when \code{type = "class"}), or 29 | a document-by-class matrix of class probabilities (when \code{type = "probability"}). 30 | } 31 | \description{ 32 | \code{predict.textmodel_cnnlstmemb()} implements class predictions from a 33 | fitted long short-term memory neural network model. 
34 | } 35 | \seealso{ 36 | \code{\link[=textmodel_cnnlstmemb]{textmodel_cnnlstmemb()}} 37 | } 38 | \keyword{internal} 39 | \keyword{textmodel} 40 | -------------------------------------------------------------------------------- /man/predict.textmodel_mlp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel_mlp.R 3 | \name{predict.textmodel_mlp} 4 | \alias{predict.textmodel_mlp} 5 | \title{Prediction from a fitted textmodel_mlp object} 6 | \usage{ 7 | \method{predict}{textmodel_mlp}( 8 | object, 9 | newdata = NULL, 10 | type = c("class", "probability"), 11 | force = TRUE, 12 | ... 13 | ) 14 | } 15 | \arguments{ 16 | \item{object}{a fitted \link{textmodel_mlp} model} 17 | 18 | \item{newdata}{dfm on which prediction should be made} 19 | 20 | \item{type}{the type of predicted values to be returned; see Value} 21 | 22 | \item{force}{make \code{newdata}'s feature set conformant to the model terms} 23 | 24 | \item{...}{not used} 25 | } 26 | \value{ 27 | \code{predict.textmodel_mlp} returns either a vector of class 28 | predictions for each row of \code{newdata} (when \code{type = "class"}), or 29 | a document-by-class matrix of class probabilities (when \code{type = "probability"}). 30 | } 31 | \description{ 32 | \code{predict.textmodel_mlp()} implements class predictions from a fitted 33 | multilayer perceptron network model. 
34 | } 35 | \seealso{ 36 | \code{\link[=textmodel_mlp]{textmodel_mlp()}} 37 | } 38 | \keyword{internal} 39 | \keyword{textmodel} 40 | -------------------------------------------------------------------------------- /man/quanteda.classifiers-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/quanteda.classifiers-package.R 3 | \docType{package} 4 | \name{quanteda.classifiers-package} 5 | \alias{quanteda.classifiers} 6 | \alias{quanteda.classifiers-package} 7 | \title{quanteda.classifiers} 8 | \description{ 9 | Extensions to \pkg{quanteda} that provide supervised machine learning models 10 | for document-feature matrices. 11 | } 12 | \seealso{ 13 | Useful links: 14 | \itemize{ 15 | \item \url{https://github.com/quanteda/quanteda.classifiers} 16 | \item Report bugs at \url{https://github.com/quanteda/quanteda.classifiers/issues} 17 | } 18 | 19 | } 20 | \author{ 21 | \strong{Maintainer}: Kenneth Benoit \email{kbenoit@lse.ac.uk} 22 | 23 | Authors: 24 | \itemize{ 25 | \item Patrick Chester \email{pjc468@nyu.edu} 26 | \item Müller Stefan \email{mueller@ipz.uzh.ch} 27 | } 28 | 29 | } 30 | \keyword{internal} 31 | -------------------------------------------------------------------------------- /man/save.textmodel_mlp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel_cnnlstmemb.R, R/textmodel_mlp.R 3 | \name{save.textmodel_cnnlstmemb} 4 | \alias{save.textmodel_cnnlstmemb} 5 | \alias{load.textmodel_cnnlstmemb} 6 | \alias{save.textmodel_mlp} 7 | \alias{load.textmodel_mlp} 8 | \title{Load or save keras-based textmodels} 9 | \usage{ 10 | \method{save}{textmodel_cnnlstmemb}(x, ...) 11 | 12 | \method{load}{textmodel_cnnlstmemb}(x, ...) 13 | 14 | \method{save}{textmodel_mlp}(x, ...) 15 | 16 | \method{load}{textmodel_mlp}(x, ...) 
17 | } 18 | \arguments{ 19 | \item{x}{a \pkg{keras}-based fitted textmodel} 20 | 21 | \item{...}{additional arguments passed to \code{\link[=save]{save()}} or \code{\link[=load]{load()}}} 22 | } 23 | \description{ 24 | Functions for loading and saving \pkg{keras}-based models. Because these are 25 | stored as references, they need to be "serialized" prior to saving, or 26 | serialized upon loading. This applies to models fit using 27 | \code{\link[=textmodel_cnnlstmemb]{textmodel_cnnlstmemb()}} and \code{\link[=textmodel_mlp]{textmodel_mlp()}}. 28 | } 29 | \keyword{internal} 30 | -------------------------------------------------------------------------------- /man/summary.textmodel_cnnlstmemb.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel_cnnlstmemb.R 3 | \name{summary.textmodel_cnnlstmemb} 4 | \alias{summary.textmodel_cnnlstmemb} 5 | \title{summary method for textmodel_cnnlstmemb objects} 6 | \usage{ 7 | \method{summary}{textmodel_cnnlstmemb}(object, ...) 8 | } 9 | \arguments{ 10 | \item{object}{output from \code{\link[=textmodel_cnnlstmemb]{textmodel_cnnlstmemb()}}} 11 | 12 | \item{...}{additional arguments not used} 13 | } 14 | \description{ 15 | summary method for textmodel_cnnlstmemb objects 16 | } 17 | \keyword{internal} 18 | \keyword{textmodel} 19 | -------------------------------------------------------------------------------- /man/summary.textmodel_mlp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel_mlp.R 3 | \name{summary.textmodel_mlp} 4 | \alias{summary.textmodel_mlp} 5 | \title{summary method for textmodel_mlp objects} 6 | \usage{ 7 | \method{summary}{textmodel_mlp}(object, ...) 
8 | } 9 | \arguments{ 10 | \item{object}{output from \code{\link[=textmodel_mlp]{textmodel_mlp()}}} 11 | 12 | \item{...}{additional arguments not used} 13 | } 14 | \description{ 15 | summary method for textmodel_mlp objects 16 | } 17 | \keyword{internal} 18 | \keyword{textmodel} 19 | -------------------------------------------------------------------------------- /man/textmodel_cnnlstmemb.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel_cnnlstmemb.R 3 | \name{textmodel_cnnlstmemb} 4 | \alias{textmodel_cnnlstmemb} 5 | \title{[Experimental] Convolutional NN + LSTM model fitted to word embeddings} 6 | \usage{ 7 | textmodel_cnnlstmemb( 8 | x, 9 | y, 10 | dropout1 = 0.2, 11 | dropout2 = 0.2, 12 | dropout3 = 0.2, 13 | dropout4 = 0.2, 14 | wordembeddim = 30, 15 | cnnlayer = TRUE, 16 | filter = 48, 17 | kernel_size = 5, 18 | pool_size = 4, 19 | units_lstm = 128, 20 | words = NULL, 21 | maxsenlen = 100, 22 | optimizer = "adam", 23 | loss = "categorical_crossentropy", 24 | metrics = "categorical_accuracy", 25 | ... 26 | ) 27 | } 28 | \arguments{ 29 | \item{x}{tokens object} 30 | 31 | \item{y}{vector of training labels associated with each document identified 32 | in \code{train}. (These will be converted to factors if not already 33 | factors.)} 34 | 35 | \item{dropout1}{A floating variable bound between 0 and 1. It determines the 36 | rate at which units are dropped for the linear transformation of the 37 | inputs for the embedding layer.} 38 | 39 | \item{dropout2}{A floating variable bound between 0 and 1. It determines the 40 | rate at which units are dropped for the linear transformation of the 41 | inputs for the CNN layer.} 42 | 43 | \item{dropout3}{A floating variable bound between 0 and 1. 
It determines the 44 | rate at which units are dropped for the linear transformation of the 45 | inputs for the recurrent layer.} 46 | 47 | \item{dropout4}{A floating variable bound between 0 and 1. It determines the 48 | rate at which units are dropped for the linear transformation of the 49 | inputs for the recurrent layer.} 50 | 51 | \item{wordembeddim}{The number of word embedding dimensions to be fit} 52 | 53 | \item{cnnlayer}{A logical parameter that allows user to include or exclude a 54 | convolutional layer in the neural network model} 55 | 56 | \item{filter}{The number of output filters in the convolution} 57 | 58 | \item{kernel_size}{An integer or list of a single integer, specifying the 59 | length of the 1D convolution window} 60 | 61 | \item{pool_size}{Size of the max pooling windows. 62 | \code{\link[keras:layer_max_pooling_1d]{keras::layer_max_pooling_1d()}}} 63 | 64 | \item{units_lstm}{The number of nodes of the lstm layer} 65 | 66 | \item{words}{The maximum number of words used to train model. Defaults to the 67 | number of features in \code{x}} 68 | 69 | \item{maxsenlen}{The maximum sentence length of training data} 70 | 71 | \item{optimizer}{optimizer used to fit model to training data, see 72 | \code{\link[keras:compile.keras.engine.training.Model]{keras::compile.keras.engine.training.Model()}}} 73 | 74 | \item{loss}{objective loss function, see 75 | \code{\link[keras:compile.keras.engine.training.Model]{keras::compile.keras.engine.training.Model()}}} 76 | 77 | \item{metrics}{metric used to train algorithm, see 78 | \code{\link[keras:compile.keras.engine.training.Model]{keras::compile.keras.engine.training.Model()}}} 79 | 80 | \item{...}{additional options passed to 81 | \code{\link[keras:fit.keras.engine.training.Model]{keras::fit.keras.engine.training.Model()}}} 82 | } 83 | \description{ 84 | A function that combines a convolutional neural network layer with a long 85 | short-term memory layer. 
It is designed to incorporate word sequences, 86 | represented as sequentially ordered word embeddings, into text 87 | classification. The model takes as an input a \pkg{quanteda} tokens object. 88 | } 89 | \examples{ 90 | \dontrun{ 91 | # create dataset with evenly balanced coded & uncoded immigration sentences 92 | corpcoded <- corpus_subset(data_corpus_manifestosentsUK, 93 | !is.na(crowd_immigration_label)) 94 | corpuncoded <- data_corpus_manifestosentsUK \%>\% 95 | corpus_subset(is.na(crowd_immigration_label) & year > 1980) \%>\% 96 | corpus_sample(size = ndoc(corpcoded)) 97 | corp <- corpcoded + corpuncoded 98 | 99 | tok <- tokens(corp) 100 | 101 | tmod <- textmodel_cnnlstmemb(tok, 102 | y = docvars(tok, "crowd_immigration_label"), 103 | epochs = 5, verbose = 1) 104 | 105 | newdata = tokens_subset(tok, subset = is.na(crowd_immigration_label)) 106 | pred <- predict(tmod, newdata = newdata) 107 | table(pred) 108 | tail(texts(corpuncoded)[pred == "Immigration"], 10) 109 | 110 | } 111 | } 112 | \seealso{ 113 | \code{\link[=save.textmodel_cnnlstmemb]{save.textmodel_cnnlstmemb()}}, \code{\link[=load.textmodel_cnnlstmemb]{load.textmodel_cnnlstmemb()}} 114 | } 115 | \keyword{textmodel} 116 | -------------------------------------------------------------------------------- /man/textmodel_evaluate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel_evaluate.R 3 | \name{textmodel_evaluate} 4 | \alias{textmodel_evaluate} 5 | \title{Model evaluation function} 6 | \usage{ 7 | textmodel_evaluate( 8 | x, 9 | y, 10 | model, 11 | fun = "f1_score", 12 | k = 5, 13 | parameters = list(), 14 | seed = as.numeric(Sys.time()), 15 | time = TRUE, 16 | by_class = FALSE 17 | ) 18 | } 19 | \arguments{ 20 | \item{x}{the \link{dfm} or \link{tokens} object on which the model will be 21 | fit. 
Does not need to contain only the training documents.} 22 | 23 | \item{y}{vector of training labels associated with each document identified 24 | in \code{train}. (These will be converted to factors if not already 25 | factors.)} 26 | 27 | \item{model}{the name of the machine learning function that will be evaluated} 28 | 29 | \item{fun}{the name of the function that will be used to evaluate the machine 30 | learning model. Can take the values "accuracy", "precision", "recall", or 31 | "f1_score"} 32 | 33 | \item{k}{number of folds} 34 | 35 | \item{parameters}{model hyperparameters} 36 | 37 | \item{seed}{a seed that can allow for replication of k training data splits. 38 | If seed is not provided a seed is chosen based on the current time.} 39 | 40 | \item{time}{a logical parameter that determines whether output will include 41 | training time (in seconds) of model} 42 | 43 | \item{by_class}{estimates a separate value of provided evaluation function 44 | for every class of the true vector} 45 | } 46 | \description{ 47 | Designed to streamline the parameter tuning and evaluation process. Users 48 | chose a function to evaluate and include parameter values as a list. If 49 | multiple parameter values are provided, the function will perform a grid 50 | search by estimating a separate model for every combination of parameters. 
51 | } 52 | \examples{ 53 | # evaluate immigration classification performance 54 | \dontrun{ 55 | dfmat <- dfm(data_corpus_manifestosentsUK) 56 | codes <- docvars(data_corpus_manifestosentsUK, "crowd_immigration_label") 57 | evaluation <- textmodel_evaluate(dfmat, codes, k = 3, 58 | model = "textmodel_mlp", fun = "f1_score", 59 | parameters = list(epochs = c(3, 4))) 60 | head(evaluation) 61 | aggregate(evaluation, by = list(evaluation$cost), FUN = "mean") 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /man/textmodel_mlp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel_mlp.R 3 | \name{textmodel_mlp} 4 | \alias{textmodel_mlp} 5 | \title{Multilayer perceptron network (MLP) model for text classification} 6 | \usage{ 7 | textmodel_mlp( 8 | x, 9 | y, 10 | units = 512, 11 | dropout = 0.2, 12 | optimizer = "adam", 13 | loss = "categorical_crossentropy", 14 | metrics = "categorical_accuracy", 15 | ... 16 | ) 17 | } 18 | \arguments{ 19 | \item{x}{the \link{dfm} on which the model will be fit. Does not need to 20 | contain only the training documents.} 21 | 22 | \item{y}{vector of training labels associated with each document identified 23 | in \code{train}. (These will be converted to factors if not already 24 | factors.)} 25 | 26 | \item{units}{The number of network nodes used in the first layer of the 27 | sequential model} 28 | 29 | \item{dropout}{A floating variable bound between 0 and 1. 
It determines the 30 | rate at which units are dropped for the linear transformation of the 31 | inputs.} 32 | 33 | \item{optimizer}{optimizer used to fit model to training data, see 34 | \code{\link[keras:compile.keras.engine.training.Model]{keras::compile.keras.engine.training.Model()}}} 35 | 36 | \item{loss}{objective loss function, see 37 | \code{\link[keras:compile.keras.engine.training.Model]{keras::compile.keras.engine.training.Model()}}} 38 | 39 | \item{metrics}{metric used to train algorithm, see 40 | \code{\link[keras:compile.keras.engine.training.Model]{keras::compile.keras.engine.training.Model()}}} 41 | 42 | \item{...}{additional options passed to 43 | \code{\link[keras:fit.keras.engine.training.Model]{keras::fit.keras.engine.training.Model()}}} 44 | } 45 | \description{ 46 | This function is a wrapper for a multilayer perceptron network model with a 47 | single hidden layer network with two layers, implemented in the \pkg{keras} 48 | package. 49 | } 50 | \examples{ 51 | \dontrun{ 52 | # create a dataset with evenly balanced coded and uncoded immigration sentences 53 | corpcoded <- corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label)) 54 | corpuncoded <- data_corpus_manifestosentsUK \%>\% 55 | corpus_subset(is.na(crowd_immigration_label) & year > 1980) \%>\% 56 | corpus_sample(size = ndoc(corpcoded)) 57 | corp <- corpcoded + corpuncoded 58 | 59 | # form a tf-idf-weighted dfm 60 | dfmat <- dfm(corp) \%>\% 61 | dfm_tfidf() 62 | 63 | set.seed(1000) 64 | tmod <- textmodel_mlp(dfmat, y = docvars(dfmat, "crowd_immigration_label"), 65 | epochs = 5, verbose = 1) 66 | pred <- predict(tmod, newdata = dfm_subset(dfmat, is.na(crowd_immigration_label))) 67 | table(pred) 68 | tail(texts(corpuncoded)[pred == "Immigration"], 10) 69 | } 70 | } 71 | \seealso{ 72 | \code{\link[=save.textmodel_mlp]{save.textmodel_mlp()}}, \code{\link[=load.textmodel_mlp]{load.textmodel_mlp()}} 73 | } 74 | \keyword{textmodel} 75 | 
-------------------------------------------------------------------------------- /man/tokens2sequences.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokens2sequences.R 3 | \name{tokens2sequences} 4 | \alias{tokens2sequences} 5 | \title{[Experimental] Convert quanteda tokens to keras sequences} 6 | \usage{ 7 | tokens2sequences(x, maxsenlen = 100, keepn = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{\code{\link[quanteda:tokens]{quanteda::tokens()}} object} 11 | 12 | \item{maxsenlen}{the maximum sentence length kept in output matrix} 13 | 14 | \item{keepn}{the maximum number of features to keep} 15 | } 16 | \value{ 17 | \code{\link[=tokens2sequences]{tokens2sequences()}} The output matrix has a number of rows 18 | which represent each tokenized sentence input into the function and a 19 | number of columns determined by \code{maxsenlen}. The matrix contains a 20 | numeric code for every unique token kept (determined by \code{keepn}) and 21 | they are arranged in the same sequence indicated by the original 22 | \code{\link[quanteda:tokens]{quanteda::tokens()}} object. 23 | } 24 | \description{ 25 | This function converts a \pkg{quanteda} \code{\link[quanteda:tokens]{quanteda::tokens()}} object 26 | into a tokens sequence object as expected by some functions in the 27 | \pkg{keras} package. 
28 | } 29 | \examples{ 30 | library("quanteda") 31 | corp <- corpus_subset(data_corpus_inaugural, Year <= 1793) 32 | corptok <- tokens(corp) 33 | print(corp) 34 | seqs <- tokens2sequences(corptok, maxsenlen = 200) 35 | print(seqs) 36 | } 37 | \seealso{ 38 | \code{\link[=is.tokens2sequences]{is.tokens2sequences()}}, \code{\link[=tokens2sequences_conform]{tokens2sequences_conform()}} 39 | } 40 | -------------------------------------------------------------------------------- /man/tokens2sequences_conform.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokens2sequences.R 3 | \name{tokens2sequences_conform} 4 | \alias{tokens2sequences_conform} 5 | \title{Match the feature names of one tokens2sequences object to another} 6 | \usage{ 7 | tokens2sequences_conform(x, y) 8 | } 9 | \arguments{ 10 | \item{x}{\code{\link[=tokens2sequences]{tokens2sequences()}} object that will be forced to conform} 11 | 12 | \item{y}{\code{\link[=tokens2sequences]{tokens2sequences()}} object whose feature names will be 13 | used to change token labels for \code{x}} 14 | } 15 | \description{ 16 | Converts the feature names of one tokens2sequences object to those of 17 | another. Useful in aligning training and test sets. 
18 | } 19 | \examples{ 20 | \dontrun{ 21 | corpcoded <- corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label)) 22 | corpuncoded <- data_corpus_manifestosentsUK \%>\% 23 | corpus_subset(is.na(crowd_immigration_label) & year > 1980) \%>\% 24 | corpus_sample(size = ndoc(corpcoded)) 25 | 26 | tokx <- tokens(corpuncoded) 27 | toky <- tokens(corpcoded) 28 | 29 | seqx <- tokens2sequences(tokx, maxsenlen = 50, keepn = 5000) 30 | seqy <- tokens2sequences(toky, maxsenlen = 50, keepn = 5000) 31 | tokens2sequences_conform(seqx, seqy) 32 | } 33 | } 34 | \seealso{ 35 | \code{\link[=tokens2sequences]{tokens2sequences()}} 36 | } 37 | \keyword{internal} 38 | -------------------------------------------------------------------------------- /quanteda.classifiers.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | PackageCheckArgs: --as-cran 19 | PackageRoxygenize: rd,collate,namespace,vignette 20 | -------------------------------------------------------------------------------- /tests/data_creation/create_data_corpus_manifestosentUK.R: -------------------------------------------------------------------------------- 1 | ######################## 2 | ### Create UK election manifesto dataset on the level of sentences 3 | ######################## 4 | 5 | # load packages 6 | library(dplyr) 7 | library(tidyr) 8 | library(car) 9 | library(readtext) 10 | library(quanteda) 11 | library(quanteda.corpora) 12 | library(spacyr) 13 | 14 | ######################## 15 | ### 1. 
Load and aggregate crowdcoded data ---- 16 | ######################## 17 | 18 | ## 1.1 Load APSR data on economic policy/social policy/neither ---- 19 | 20 | data_readtext_uk_econsocial <- readtext("tests/data_creation/data_uk_policyarea.zip", 21 | ignore_missing_files = TRUE, encoding = "utf-8") 22 | 23 | # exclude screeners from data frame using a regular expression 24 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>% 25 | dplyr::filter(!grepl('Code this sentence as', sentence_text)) 26 | 27 | ## For some sentences, special characters are displayed differently depending on the 28 | ## crowd coding job. Therefore, I remove sentence_text here and merge it later 29 | ## using "master.sentences.Rdata" which does not contain these encoding errors 30 | 31 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>% 32 | select(-sentence_text) 33 | 34 | # load "master sentences" from the APSR replication data with metadata on manifestos and years 35 | load("tests/data_creation/master.sentences.Rdata") 36 | 37 | sentences_metadata <- sentences %>% 38 | select(sentenceid, manifestoid, party, year) # select relevant variables 39 | 40 | # merge metadata on manifestos with crowd coded texts 41 | data_readtext_uk_econsocial <- left_join(data_readtext_uk_econsocial, sentences_metadata, by = "sentenceid") 42 | 43 | # create numeric indicator for aggregation of policy area 44 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>% 45 | mutate(class_policyarea_num = ifelse(policy_area == 1, 0, 46 | ifelse(policy_area == 2, -1, 47 | ifelse(policy_area == 3, 1, NA)))) 48 | 49 | # create numeric indicator for aggregation of direction 50 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>% 51 | mutate(class_policyarea_direction_num = ifelse(!is.na(soc_scale), soc_scale, 52 | ifelse(!is.na(econ_scale), econ_scale, NA))) 53 | 54 | # aggregate data to the level of sentences 55 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>% 56 | 
group_by(manifestoid, sentenceid) %>% 57 | mutate(crowd_econsocial_n = n(), 58 | class_policyarea_mean = mean(class_policyarea_num, na.rm = TRUE)) 59 | 60 | # create variable with the policy area based on the aggregated coding 61 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>% 62 | mutate(crowd_econsocial_label = ifelse(class_policyarea_mean < 0, "Economic", 63 | ifelse(class_policyarea_mean == 0, "Not Economic or Social", 64 | ifelse(class_policyarea_mean > 0, "Social", NA)))) 65 | 66 | 67 | # here I make sure that the policy direction mean only takes 68 | # into account the majority category and not also the minority category position values 69 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>% 70 | mutate(class_policy_direction_num = 71 | ifelse(crowd_econsocial_label == "Economic" & class_policyarea_num == -1, class_policyarea_direction_num, 72 | ifelse(crowd_econsocial_label == "Social" & class_policyarea_num == 1, class_policyarea_direction_num, 73 | NA))) 74 | 75 | # aggregate data to the level of sentences 76 | data_uk_econsocial <- data_readtext_uk_econsocial %>% 77 | group_by(manifestoid, sentenceid, crowd_econsocial_n, crowd_econsocial_label) %>% 78 | summarise(crowd_econsocial_mean = mean(class_policy_direction_num, na.rm = TRUE)) %>% 79 | ungroup() 80 | 81 | # separate the variable manifestoid into Party and Year 82 | data_uk_econsocial <- data_uk_econsocial %>% 83 | separate(manifestoid, into = c("Party", "Year"), 84 | remove = FALSE) 85 | 86 | # merge text from "sentences" data frame which does not contain encoding errors 87 | dat_sentences <- sentences %>% 88 | select(sentence_text, sentenceid) 89 | 90 | data_uk_econsocial <- left_join(data_uk_econsocial, dat_sentences, 91 | by = "sentenceid") 92 | 93 | # select some of the variables and add three additional variables 94 | data_uk_man_econsocial <- data_uk_econsocial %>% 95 | ungroup() %>% 96 | select(text = sentence_text, sentenceid, Party, Year, 97 | 
crowd_econsocial_label, crowd_econsocial_mean, 98 | crowd_econsocial_n) 99 | 100 | # replace nan values in class_direction_mean with na 101 | data_uk_man_econsocial$crowd_econsocial_mean[is.nan(data_uk_man_econsocial$crowd_econsocial_mean)] <- NA 102 | 103 | 104 | ## 1.2 Load APSR data on immigration ---- 105 | data_readtext_uk_immigration <- readtext("tests/data_creation/data_uk_immigration.zip", 106 | ignore_missing_files = TRUE) 107 | 108 | # exclude screeners 109 | data_readtext_uk_immigration <- data_readtext_uk_immigration %>% 110 | subset(manifestoid != "screener") 111 | 112 | # use manifestoid to create a party and year variable 113 | data_readtext_uk_immigration <- data_readtext_uk_immigration %>% 114 | separate(manifestoid, into = c("Party", "Year")) 115 | 116 | # get the class 117 | data_readtext_uk_immigration <- data_readtext_uk_immigration %>% 118 | mutate(class_immigration_num = ifelse(policy_area == 4, 1, 0)) 119 | 120 | # aggregate to the level of sentences 121 | data_uk_immigration <- data_readtext_uk_immigration %>% 122 | group_by(Party, Year, sentenceid, sentence_text) %>% 123 | summarise(crowd_immigration_n = n(), 124 | class_immigration_num = mean(class_immigration_num, na.rm = TRUE), 125 | crowd_immigration_mean = mean(immigr_scale, na.rm = TRUE)) 126 | 127 | # replace nan values in class_direction_mean with na 128 | data_uk_immigration$crowd_immigration_mean[is.nan(data_uk_immigration$crowd_immigration_mean)] <- NA 129 | 130 | # use the average evaluations to construct the majority-rule based classification 131 | data_uk_immigration <- data_uk_immigration %>% 132 | mutate(crowd_immigration_label = ifelse(class_immigration_num >= 0.5, "Immigration", "Not immigration")) 133 | 134 | # select some of the variables and add three additional variables 135 | data_uk_immig_man_2010 <- data_uk_immigration %>% 136 | ungroup() %>% 137 | select(text = sentence_text, sentenceid, Party, Year, crowd_immigration_label, 138 | crowd_immigration_mean, 139 | 
crowd_immigration_n) 140 | 141 | 142 | ######################## 143 | ## 1.3 Merge both crowdcoded datasets ---- 144 | ######################## 145 | 146 | # Note that the 2010 manifestos have been coded both with regards to 147 | # immigration AND econ/social/neither 148 | 149 | # full join of both crowdcoded datasets 150 | dat_uk_crowdcoded <- full_join(data_uk_man_econsocial, 151 | data_uk_immig_man_2010, by = c("sentenceid")) 152 | 153 | table(dat_uk_crowdcoded$Party.x) 154 | table(dat_uk_crowdcoded$Party.y) 155 | 156 | # rename and remove unnecessary variables 157 | dat_uk_man_crowdcoded_clean <- dat_uk_crowdcoded %>% 158 | mutate(Party = ifelse(is.na(Party.x), Party.y, Party.x)) %>% 159 | mutate(Year = ifelse(is.na(Year.x), Year.y, Year.x)) %>% 160 | mutate(text = ifelse(is.na(text.x), text.y, text.x)) %>% 161 | select(-c(Party.x, Party.y, Year.x, Year.y, text.x, text.y)) 162 | 163 | ######################## 164 | ## 2. Add data_corpus_ukmanifestos from quanteda.corpora package ---- 165 | ######################## 166 | 167 | # Since some of the manifestos are already coded in terms of social/economic/neither, 168 | # we remove these documents from the non-annotated corpus 169 | 170 | # get overview of crowdcoded manifestos 171 | manifestos_ukseconsocial <- dat_uk_man_crowdcoded_clean %>% 172 | ungroup() %>% 173 | select(Party, Year) %>% 174 | unique() %>% 175 | mutate(party_year = paste(Party, Year, sep = "_")) 176 | 177 | # create new docvar used for removing crowdcoded manifestos 178 | # from not-annotated corpus 179 | docvars(data_corpus_ukmanifestos, "party_year") <- paste( 180 | docvars(data_corpus_ukmanifestos, "Party"), 181 | docvars(data_corpus_ukmanifestos, "Year"), sep = "_" 182 | ) 183 | 184 | # remove manifestos that have been crowdcoded 185 | data_corpus_ukmanifestos_subset <- data_corpus_ukmanifestos %>% 186 | corpus_subset(!party_year %in% manifestos_ukseconsocial$party_year) 187 | 188 | ndoc(data_corpus_ukmanifestos) 189 | 
ndoc(data_corpus_ukmanifestos_subset) 190 | 191 | # check whether Lab, Con, Lib manifestos are excluded for time between 1987 and 2010 (yes!) 192 | table(docvars(data_corpus_ukmanifestos_subset, "party_year")) 193 | 194 | # only keep manifestos from national general elections (Type == "natl") 195 | data_corpus_ukmanifestos_sentences <- data_corpus_ukmanifestos_subset %>% 196 | corpus_subset(Type == "natl") 197 | 198 | # transform to data frame for easier adjustments of document-level variables 199 | data_uk_man <- data.frame( 200 | doc_id = docnames(data_corpus_ukmanifestos_sentences), 201 | text = texts(data_corpus_ukmanifestos_sentences), 202 | docvars(data_corpus_ukmanifestos_sentences), 203 | stringsAsFactors = FALSE 204 | ) 205 | 206 | # use spacyr to tokenize manifestos to sentence-level 207 | spacy_initialize(model = "en") 208 | 209 | data_uk_man_sentences <- spacy_tokenize(data_uk_man, 210 | remove_separators = FALSE, 211 | what = "sentence", 212 | output = "data.frame") 213 | 214 | data_uk_man_sentences <- rename(data_uk_man_sentences, text = token) 215 | 216 | 217 | # merge metadata 218 | data_uk_man_sentences <- left_join(data_uk_man_sentences, 219 | select(data_uk_man, -c("text")), 220 | by = "doc_id") 221 | 222 | 223 | ######################## 224 | ## 3. 
Import additional manifestos from 2010, 2015, and 2017 elections ---- 225 | ######################## 226 | 227 | ## Note: data retrieved from http://polidoc.net (2010 and 2015) and from parties' websites (2019) 228 | 229 | data_uk_1519 <- readtext("tests/data_creation/data_uk_manifestos_2015-2019.zip", 230 | ignore_missing_files = TRUE, 231 | encoding = "utf-8", 232 | docvarsfrom = "filenames", 233 | docvarnames = c("Year", "Party")) 234 | 235 | 236 | # tokenize documents to the level of sentences 237 | data_uk_man_1519_sentences <- spacy_tokenize(data_uk_1519, 238 | remove_separators = FALSE, 239 | what = "sentence", 240 | output = "data.frame") 241 | 242 | # merge metadata 243 | data_uk_man_1519_sentences <- left_join(data_uk_man_1519_sentences, 244 | select(data_uk_1519, -c("text")), 245 | by = "doc_id") 246 | 247 | data_uk_man_1519_sentences <- rename(data_uk_man_1519_sentences, text = token) 248 | 249 | 250 | ######################## 251 | ## 4. Combine manifestos and create corpus ---- 252 | ######################## 253 | 254 | dat_uk_man_crowdcoded_clean$Year <- as.integer(dat_uk_man_crowdcoded_clean$Year) 255 | 256 | data_uk_manifestos <- bind_rows(data_uk_man_sentences, 257 | dat_uk_man_crowdcoded_clean, 258 | data_uk_man_1519_sentences) 259 | 260 | 261 | # create unique id for each sentence and select only necessary variables 262 | data_uk_manifestos_selectvars <- data_uk_manifestos %>% 263 | select(-c(party_year, sentenceid)) %>% 264 | mutate(year = as.factor(Year)) %>% 265 | select(text, 266 | party = Party, 267 | year, 268 | starts_with("crowd_")) 269 | 270 | # create a quanteda corpus 271 | corp <- corpus(data_uk_manifestos_selectvars) 272 | 273 | # count the number of tokens per sentence (after removing punctuation characters) 274 | docvars(corp, "ntoken_sent") <- ntoken(corp, remove_punct = TRUE) 275 | 276 | # remove observation with 0 tokens (i.e. 
only punctuation character) 277 | corp_small <- corp %>% 278 | corpus_subset(ntoken_sent > 0) 279 | 280 | # transform back to data frame to create nice doc_id after having removed "empty" sentences 281 | dat_corpus <- data.frame( 282 | text = texts(corp_small), 283 | docvars(corp_small), 284 | stringsAsFactors = FALSE 285 | ) 286 | 287 | dat_corpus <- dat_corpus %>% 288 | arrange(year, party) %>% 289 | group_by(party, year) %>% 290 | mutate(sentence_no = 1:n()) %>% 291 | mutate(doc_id = paste(party, year, sentence_no, sep = "_")) %>% 292 | ungroup() 293 | 294 | # harmonize party abbreviation aliases before recoding to full party names 295 | dat_corpus_renamed <- dat_corpus %>% 296 | mutate(party = ifelse(party == "Comm", "CP", party)) %>% 297 | mutate(party = ifelse(party == "Gr", "Greens", party)) %>% 298 | mutate(party = ifelse(party == "OMRL", "MRLP", party)) %>% 299 | mutate(party = ifelse(party == "PCy", "PC", party)) 300 | 301 | # lookup table mapping party abbreviations to full names, for use with car::recode() 302 | recode_party <- c("'Coalition'='Coalition Agreement'; 303 | 'BNP'='British National Party'; 304 | 'CAP'='Community Action Party'; 305 | 'Con'='Conservative Party'; 306 | 'Comm'='Communist Party'; 307 | 'CP'='Communist Party'; 308 | 'Dem'='Democratic Party'; 309 | 'DUP'='Democratic Unionist Party'; 310 | 'EDP'='English Democrats'; 311 | 'EIP'='English Independence Party'; 312 | 'FSP'='Free Scotland Party'; 313 | 'FW'='Forward Wales'; 314 | 'Greens'='Green Party'; 315 | 'IGV'='Independent Green Voice'; 316 | 'LA'='Left Alliance'; 317 | 'Lab'='Labour Party'; 318 | 'Lib'='Liberal Party'; 319 | 'LD'='Liberal Democrats'; 320 | 'LibSDP'='Social Democratic Party'; 321 | 'MK'='Mebyon Kernow - the Party for Cornwall'; 322 | 'MRLP'='Official Monster Raving Loony Party'; 323 | 'ND'='National Democrats'; 324 | 'NIA'='Northern Ireland Alliance'; 325 | 'PA'='Prolife Alliance'; 326 | 'PC'='Plaid Cymru'; 327 | 'PP'='Peace Party'; 328 | 'PUP'='Progressive Unionist Party'; 329 | 'PVP'='Protest Vote Party'; 330 | 'Resp'='Respect'; 331 | 'RT'='Richard Taylor Personal Manifesto'; 332 | 'Scon'='Scottish 
Conservative Party'; 333 | 'SDLP'='Social Democratic and Labour Party'; 334 | 'SEP'='Socialist Equality Party'; 335 | 'SF'='Sinn Féin'; 336 | 'SGr'='Scottish Green Party'; 337 | 'SLab'='Scottish Labour Party'; 338 | 'SLD'='Scottish Liberal Democrats'; 339 | 'SNP'='Scottish National Party'; 340 | 'SP'='Socialist Party'; 341 | 'SCon'='Scottish Conservative Party'; 342 | 'SSoc'='Scottish Socialist Party'; 343 | 'SSP'='Scottish Socialist Party'; 344 | 'Stuck'='Stuckist Party'; 345 | 'TW'='Third Way'; 346 | 'UKIP'='UK Independence Party'; 347 | 'UUP'='Ulster Unionist Party'; 348 | 'Ver'='Veritas Party'") 349 | 350 | 351 | dat_corpus_renamed <- dat_corpus_renamed %>% 352 | mutate(partyname = car::recode(party, recode_party)) 353 | 354 | dat_corpus_renamed <- dat_corpus_renamed %>% 355 | select(doc_id, text, party, partyname, year, 356 | crowd_econsocial_label, crowd_econsocial_mean, 357 | crowd_econsocial_n, crowd_immigration_label, 358 | crowd_immigration_mean, 359 | crowd_immigration_n) 360 | 361 | dat_corpus_renamed$crowd_econsocial_label <- factor(dat_corpus_renamed$crowd_econsocial_label) 362 | dat_corpus_renamed$crowd_immigration_label <- factor(dat_corpus_renamed$crowd_immigration_label) 363 | dat_corpus_renamed$year <- as.integer(as.character(dat_corpus_renamed$year)) 364 | dat_corpus_renamed$party <- as.factor(dat_corpus_renamed$party) 365 | dat_corpus_renamed$partyname <- as.factor(dat_corpus_renamed$partyname) 366 | dat_corpus_renamed$crowd_immigration_n <- as.integer(dat_corpus_renamed$crowd_immigration_n) 367 | dat_corpus_renamed$crowd_econsocial_n <- as.integer(dat_corpus_renamed$crowd_econsocial_n) 368 | 369 | # create final corpus 370 | data_corpus_manifestosentsUK <- corpus(dat_corpus_renamed, 371 | docid_field = "doc_id") 372 | 373 | # add corpus to package 374 | usethis::use_data(data_corpus_manifestosentsUK, overwrite = TRUE) 375 | -------------------------------------------------------------------------------- 
/tests/data_creation/data_uk_immigration.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quanteda/quanteda.classifiers/3aabdb708468932c4bd5d1a3d3005ae3d653c07d/tests/data_creation/data_uk_immigration.zip -------------------------------------------------------------------------------- /tests/data_creation/data_uk_manifestos_2015-2019.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quanteda/quanteda.classifiers/3aabdb708468932c4bd5d1a3d3005ae3d653c07d/tests/data_creation/data_uk_manifestos_2015-2019.zip -------------------------------------------------------------------------------- /tests/data_creation/data_uk_policyarea.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quanteda/quanteda.classifiers/3aabdb708468932c4bd5d1a3d3005ae3d653c07d/tests/data_creation/data_uk_policyarea.zip -------------------------------------------------------------------------------- /tests/data_creation/master.sentences.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quanteda/quanteda.classifiers/3aabdb708468932c4bd5d1a3d3005ae3d653c07d/tests/data_creation/master.sentences.Rdata -------------------------------------------------------------------------------- /tests/misc/test-LMRD.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Testing text classification using movie reviews" 3 | author: "Kenneth Benoit" 4 | output: 5 | html_notebook: 6 | toc: yes 7 | toc_float: yes 8 | --- 9 | 10 | 11 | Tests using the Large Movie Review Dataset, a dataset for sentiment classification containing 25,000 highly polar movie reviews for training, and 25,000 for testing, from Maas et. al. (2011). 12 | 13 | Source: Andrew L. Maas, Raymond E. Daly, Peter T. 
Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. (2011). "Learning Word Vectors for Sentiment Analysis". The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).

# Setting up {.tabset .tabset-fade}

```{r setup, echo=FALSE}
knitr::opts_chunk$set(echo=FALSE, warning=FALSE, message=FALSE)
```


## Test and training data

```{r}
dfmat <- dfm(data_corpus_LMRD)
dfmat_train <- dfm_trim(dfmat, min_termfreq = 100) %>%
    dfm_subset(set == "train")
dfmat_test <- dfm_trim(dfmat, min_termfreq = 100) %>%
    dfm_subset(set == "test")
```

## Some functions for evaluation

```{r}
# Summarize a confusion matrix whose [1, 1] cell is the true positives
# (predictions in rows, truth in columns).  Prints precision, recall,
# accuracy, and balanced accuracy when verbose, and invisibly returns
# c(precision, recall) so it can be used inside pipelines.
performance <- function(mytable, verbose = TRUE) {
    truePositives <- mytable[1, 1]
    trueNegatives <- sum(diag(mytable)[-1])
    falsePositives <- sum(mytable[1, ]) - truePositives
    falseNegatives <- sum(mytable[, 1]) - truePositives
    precision <- truePositives / (truePositives + falsePositives)
    recall <- truePositives / (truePositives + falseNegatives)
    accuracy <- sum(diag(mytable)) / sum(mytable)
    tnr <- trueNegatives / (trueNegatives + falsePositives)
    # balanced accuracy is the mean of sensitivity (recall/TPR) and
    # specificity (TNR); the previous version averaged precision with TNR
    balanced_accuracy <- sum(c(recall, tnr), na.rm = TRUE) / 2
    if (verbose) {
        print(mytable)
        cat(
            "\n precision =", round(precision, 2),
            "\n recall =", round(recall, 2),
            "\n accuracy =", round(accuracy, 2),
            "\n bal. acc. =", round(balanced_accuracy, 2),
            "\n"
        )
    }
    invisible(c(precision, recall))
}
```



# Naive Bayes {.tabset .tabset-fade}

## No weights

```{r}
system.time({
    tmod_nb <- textmodel_nb(dfmat_train, y = docvars(dfmat_train, "polarity"))
    pred_nb <- predict(tmod_nb, newdata = dfmat_test, type = "class")
    table(pred_nb, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```

## tf-idf weighting

```{r}
system.time({
    tmod_nb <- textmodel_nb(dfm_tfidf(dfmat_train),
                            y = docvars(dfmat_train, "polarity"))
    pred_nb <- predict(tmod_nb, newdata = dfm_tfidf(dfmat_test), type = "class")
    table(pred_nb, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```

# SVM {.tabset .tabset-fade}

## SVM

```{r}
system.time({
    tmod_svm <- textmodel_svm(dfmat_train, y = docvars(dfmat_train, "polarity"))
    pred_svm <- predict(tmod_svm, newdata = dfmat_test, type = "class")
    table(pred_svm, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```

## SVM w/tf-idf weights

```{r}
tmod_svm2 <- textmodel_svm(dfm_tfidf(dfmat_train),
                           y = docvars(dfmat_train, "polarity"))
pred_svm2 <- predict(tmod_svm2, newdata = dfm_tfidf(dfmat_test), type = "class")
table(pred_svm2, dfmat_test$polarity)[2:1, 2:1] %>% performance()
```

## SVMlin

```{r}
system.time({
    tmod_svmlin <- textmodel_svmlin(dfmat_train,
                                    y = docvars(dfmat_train, "polarity"))
    pred_svmlin <- predict(tmod_svmlin, newdata = dfmat_test, type = "class")
    # was evaluating pred_svm (copy-paste error); score the svmlin predictions
    table(pred_svmlin, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```


# Multilayer Perceptron Network (MLP)

```{r}
system.time({
    tmod_mlp <- textmodel_mlp(dfmat_train,
                              y = docvars(dfmat_train, "polarity"), epochs = 10)
    pred_mlp <-
predict(tmod_mlp, newdata = dfmat_test, type = "class") 124 | table(pred_mlp, dfmat_test$polarity)[2:1, 2:1] %>% performance() 125 | }) 126 | ``` 127 | 128 | 129 | # CNN-embedding-LSTM model 130 | 131 | ```{r} 132 | system.time({ 133 | toks_train <- data_corpus_LMRD %>% 134 | corpus_subset(set == "test") %>% 135 | tokens() %>% 136 | tokens_remove("\\p{P}", valuetype = "regex", padding = TRUE) 137 | tmod_cnn <- textmodel_cnnlstmemb(toks_train, 138 | y = docvars(dfmat_train, "polarity"), 139 | epochs = 10) 140 | 141 | toks_test <- data_corpus_LMRD %>% 142 | corpus_subset(set == "train") %>% 143 | tokens() %>% 144 | tokens_remove("\\p{P}", valuetype = "regex", padding = TRUE) 145 | pred_cnn <- predict(tmod_cnn, newdata = toks_test, type = "class") 146 | table(pred_cnn, dfmat_test$polarity) %>% performance() 147 | }) 148 | ``` 149 | 150 | -------------------------------------------------------------------------------- /tests/spelling.R: -------------------------------------------------------------------------------- 1 | if(requireNamespace('spelling', quietly = TRUE)) 2 | spelling::spell_check_test(vignettes = TRUE, error = TRUE, 3 | skip_on_cran = TRUE) 4 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library("testthat") 2 | library("quanteda.classifiers") 3 | 4 | library("quanteda") 5 | library("quanteda.textmodels") 6 | 7 | # library("keras") 8 | # use_session_with_seed(42) 9 | # see https://github.com/rstudio/keras/issues/890#issuecomment-539044011 10 | # tensorflow::tf$random$set_seed(42) 11 | 12 | test_check("quanteda.classifiers") 13 | -------------------------------------------------------------------------------- /tests/testthat/test-performance.R: -------------------------------------------------------------------------------- 1 | context("test evaluation functions") 2 | 3 | lvs <- c("Relevant", "Irrelevant") 4 | pred <- 
factor(rep(lvs, times = c(42, 58)), levels = lvs) 5 | true <- factor(c(rep(lvs, times = c(30, 12)), 6 | rep(lvs, times = c(30, 28))), levels = lvs) 7 | tab <- table(pred, true) 8 | 9 | test_that("performance works by_class = TRUE", { 10 | perf <- performance(tab, by_class = TRUE) 11 | expect_equal(perf, 12 | list(precision = c(Relevant = 0.714, 13 | Irrelevant = 0.483), 14 | recall = c(Relevant = 0.5, Irrelevant = 0.7), 15 | f1 = c(Relevant = 0.588, 16 | Irrelevant = 0.571), 17 | accuracy = 0.58, 18 | balanced_accuracy = 0.6), 19 | tol = .001 20 | ) 21 | }) 22 | 23 | test_that("performance works by_class = FALSE", { 24 | perf <- performance(tab, by_class = FALSE) 25 | expect_equal(perf, 26 | list(precision = 0.599, recall = 0.6, f1 = 0.580, 27 | accuracy = 0.58, balanced_accuracy = 0.6), 28 | tol = .001 29 | ) 30 | }) 31 | 32 | test_that("exceptions work", { 33 | perf <- performance(tab, by_class = TRUE) 34 | #expect_error( 35 | # f1_score(perf[c("precision", "accuracy")]), 36 | # "list must contain both precision and recall" 37 | #) 38 | expect_error( 39 | balanced_accuracy(perf[c("precision", "accuracy")]), 40 | "list must include recall" 41 | ) 42 | 43 | expect_error( 44 | balanced_accuracy(performance(tab, by_class = FALSE)), 45 | "recall must be computed by class" 46 | ) 47 | 48 | expect_error( 49 | quanteda.classifiers:::check_table(tab, 3), 50 | "truth must be 2 for columns or 1 for rows" 51 | ) 52 | tab2 <- tab 53 | colnames(tab2)[2] <- "dummy" 54 | expect_error( 55 | quanteda.classifiers:::check_table(tab2, 2), 56 | "predicted and truth values must have the same order and names" 57 | ) 58 | }) 59 | -------------------------------------------------------------------------------- /tests/testthat/test-textmodel_cnnlstmemb.R: -------------------------------------------------------------------------------- 1 | context("test textmodel_cnnlstmemb") 2 | 3 | test_that("the cnnlstmemb model works", { 4 | skip() 5 | skip_on_cran() 6 | 7 | 
data(data_corpus_EPcoaldebate, package = "quanteda.textmodels")
    corp <- corpus_subset(data_corpus_EPcoaldebate,
                          subset = language == "English") %>%
        corpus_sample(500)

    toks <- tokens(corp)
    # binary outcome: 1 = Pro-Subsidy, 0 = anything else
    label <- ifelse(docvars(corp, "crowd_subsidy_label") == "Pro-Subsidy", 1, 0)
    tmod <- textmodel_cnnlstmemb(toks, y = label, epochs = 8)

    expect_output(
        print(tmod),
        "Call:"
    )
    expect_equal(names(summary(tmod)), c("call", "model structure"))
    con_mat <- table(predict(tmod, type = "class"), label)
    accuracy <- sum(diag(con_mat)) / sum(con_mat)
    expect_equal(
        accuracy,
        0.87,
        tolerance = 0.1
    )
    set.seed(10)
    pred_out <- predict(tmod, type = "probability")
    pred_max <- apply(pred_out, 1, function(x) colnames(pred_out)[which.max(x)])
    con_mat <- table(pred_max, label)
    accuracy <- sum(diag(con_mat)) / sum(con_mat)
    expect_equal(
        accuracy,
        0.87,
        tolerance = 0.1
    )
})

test_that("multiclass prediction works", {
    skip()
    skip_on_cran()

    data(data_corpus_irishbudget2010, package = "quanteda.textmodels")
    toks <- tokens(data_corpus_irishbudget2010)
    y <- docvars(data_corpus_irishbudget2010, "party")
    # one missing label: the model must still predict for that document
    y[5] <- NA
    tmod2 <- textmodel_cnnlstmemb(toks, y = y)
    expect_equal(
        names(predict(tmod2, type = "class"))[5],
        "Cowen, Brian (FF)"
    )

    probmat <- predict(tmod2, type = "probability")
    expect_equal(dim(probmat), c(14, 5))
    expect_equal(rownames(probmat), docnames(toks))
    expect_equal(colnames(probmat), tmod2$classnames)
    # rows are probability distributions and must each sum to one
    expect_equal(unname(rowSums(probmat)), rep(1, nrow(probmat)),
                 tolerance = .000001)
})

test_that("cnnlstmemb works with tokens2sequences", {
    skip()
    skip_on_cran()

    data(data_corpus_irishbudget2010, package = "quanteda.textmodels")
    toks1 <- tokens2sequences(x = tokens(data_corpus_irishbudget2010),
                              keepn = 100)
    y <- docvars(data_corpus_irishbudget2010, "party")
    y[5] <- NA
    tmod2 <- textmodel_cnnlstmemb(x = toks1, y = y)
    expect_equal(
        names(predict(tmod2, type = "class"))[5],
        "Cowen, Brian (FF)"
    )

    probmat <- predict(tmod2, type = "probability")
    expect_equal(dim(probmat), c(14, 5))
    expect_equal(rownames(probmat), rownames(toks1$matrix))
    expect_equal(colnames(probmat), tmod2$classnames)
    expect_equal(unname(rowSums(probmat)), rep(1, nrow(probmat)),
                 tolerance = .000001)
})
--------------------------------------------------------------------------------
/tests/testthat/test-textmodel_evaluate.R:
--------------------------------------------------------------------------------
context("test textmodel_evaluate")

test_that("textmodel_evaluate works", {
    skip("until rewritten")
    skip_on_cran()

    data(data_corpus_EPcoaldebate, package = "quanteda.textmodels")

    set.seed(100)
    corp <- corpus_sample(data_corpus_EPcoaldebate, size = 500,
                          by = "crowd_subsidy_label")
    dfmat <- dfm(corp) %>%
        dfm_trim(min_termfreq = 10)
    labels <- docvars(dfmat, "crowd_subsidy_label")
    model_eval <- textmodel_evaluate(x = dfmat, y = labels,
                                     model = "textmodel_mlp",
                                     fun = "f1_score", k = 3, seed = 5)

    # Check outputs for consistency
    expect_equal(dim(model_eval), c(3, 4))
    expect_equal(names(model_eval), c("k", "f1_score", "time", "seed"))
    expect_equal(max(model_eval$k), 3)


    model_eval2 <- textmodel_evaluate(x = dfmat, y = labels,
                                      model = "textmodel_mlp",
                                      fun = "f1_score", k = 2,
                                      parameters = list(epochs = c(3, 4)),
                                      seed = 5)

    # Check outputs for consistency
    expect_equal(dim(model_eval2), c(4, 5))
    expect_equal(names(model_eval2),
                 c("k", "f1_score", "epochs", "time", "seed"))
    expect_equal(max(model_eval2$k), 2)

    # Check by_class
    model_eval3 <- textmodel_evaluate(x = dfmat, y = labels,
                                      model = "textmodel_mlp",
                                      fun = "recall", k =
2, seed = 5, by_class = TRUE) 31 | expect_true("class" %in% names(model_eval3)) 32 | expect_true(sum(levels(labels) %in% model_eval3$class) == 3) 33 | 34 | # Check if it works with textmodel_cnnlstm 35 | 36 | corp_tok <- corpus_sample(data_corpus_EPcoaldebate, size = 500, by = "crowd_subsidy_label") 37 | tok <- tokens(corp_tok) 38 | labels <- docvars(corp_tok, "crowd_subsidy_label") 39 | model_eval4 <- textmodel_evaluate(x = tok, y = labels, model = "textmodel_cnnlstmemb", fun = "f1_score", k = 3, seed = 5) 40 | expect_equal(dim(model_eval4), c(3, 4)) 41 | expect_equal(names(model_eval4), c("k", "f1_score", "time", "seed")) 42 | expect_equal(max(model_eval4$k), 3) 43 | expect_true(min(model_eval4$f1_score) > 0.1 & max(model_eval4$f1_score) < 1) 44 | }) 45 | 46 | -------------------------------------------------------------------------------- /tests/testthat/test-textmodel_mlp.R: -------------------------------------------------------------------------------- 1 | context("test textmodel_mlp") 2 | 3 | test_that("the mlp model works", { 4 | skip("because of tensorflow install problems") 5 | skip_on_cran() 6 | 7 | data(data_corpus_EPcoaldebate, package = "quanteda.textmodels") 8 | 9 | set.seed(100) 10 | corp_train <- corpus_sample(data_corpus_EPcoaldebate, size = 3000, by = "crowd_subsidy_label") 11 | corp_test <- corpus_sample(data_corpus_EPcoaldebate, size = 10, by = "crowd_subsidy_label") 12 | dfmat_train <- dfm(corp_train) 13 | dfmat_test <- dfm(corp_test) 14 | 15 | tmod <- textmodel_mlp(dfmat_train, y = docvars(dfmat_train, "crowd_subsidy_label"), epoch = 5) 16 | 17 | # label 18 | pred <- predict(tmod, newdata = dfmat_test, type = "class") 19 | tab <- table(pred, dfmat_test$crowd_subsidy_label) 20 | acc <- sum(diag(tab)) / sum(tab) 21 | expect_gte(acc, .6) 22 | 23 | # predicted prob 24 | prob <- predict(tmod, newdata = dfmat_test, type = "probability") 25 | expect_gte(prob["PL_Lamberts_3_3", "Anti-Subsidy"], .95) 26 | 27 | expect_output( 28 | print(tmod), 29 | 
"Call:" 30 | ) 31 | 32 | expect_equal(names(summary(tmod)), c("call", "model structure")) 33 | set.seed(10) 34 | pred_max <- apply(prob, 1, function(x) colnames(prob)[which.max(x)]) 35 | expect_equivalent( 36 | pred_max, 37 | as.character(pred) 38 | ) 39 | }) 40 | -------------------------------------------------------------------------------- /tests/testthat/test-tokens2sequences.R: -------------------------------------------------------------------------------- 1 | context("test tokens2sequences") 2 | 3 | test_that("tokens2sequences works", { 4 | skip() 5 | skip_on_cran() 6 | 7 | ## Example from 13.1 of _An Introduction to Information Retrieval_ 8 | text <- c("Chinese Beijing Chinese", 9 | "Chinese Chinese Shanghai", 10 | "Chinese Macao", 11 | "Tokyo Japan", 12 | "Chinese Chinese Chinese Tokyo Japan") 13 | text_tokens <- tokens(text) 14 | seq <- tokens2sequences(text_tokens, maxsenlen = 10, keepn = 5) 15 | 16 | # Check ouptuts for consistency 17 | expect_equal(dim(seq$matrix), c(5, 10)) 18 | expect_equal(seq$nfeatures, 5) 19 | expect_equal(max(seq$matrix), 5) 20 | expect_equal(min(seq$matrix), 0) 21 | expect_equal(as.integer(apply(seq$matrix, 1, function(x) sum(x != 0))), c(3, 3, 1, 2, 5)) 22 | 23 | # Compare with keras's texts_to_sequences function 24 | tok <- keras::text_tokenizer(filters = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r",lower = T,num_words = 6) %>% # Note: Keras includes 0 as a word. 
tokens2sequences does not 25 | keras::fit_text_tokenizer(text) 26 | tok_mat <- keras::texts_to_sequences(tok,texts = text) %>% keras::pad_sequences(maxlen = 10) 27 | seq_mat <- unname(seq$matrix) 28 | expect_equal(seq_mat, tok_mat) 29 | 30 | # Check whether it works on character vectors 31 | seq2 <- tokens2sequences(text, maxsenlen = 10, keepn = 5) 32 | expect_equal(dim(seq$matrix), dim(seq2$matrix)) 33 | expect_equal(seq$nfeatures, seq2$nfeatures) 34 | 35 | # Check whether tokens2sequences performs recursively 36 | seq_short <- tokens2sequences(x = seq2, maxsenlen = 2, keepn = 5) 37 | expect_equal(ncol(seq_short$matrix), 2) 38 | expect_equal(seq_short$nfeatures, 4) 39 | }) 40 | 41 | test_that("tokens2sequences_conform works", { 42 | skip() 43 | txt1 <- "This is sentence one. And here is sentence two." 44 | txt2 <- "This is sentence 3. Sentence 4! A fifth and final example." 45 | toks1 <- corpus(txt1) %>% 46 | corpus_reshape(to = "sentence") %>% 47 | tokens() %>% 48 | tokens_tolower() 49 | toks2 <- corpus(txt2) %>% 50 | corpus_reshape(to = "sentence") %>% 51 | tokens() %>% 52 | tokens_tolower() 53 | 54 | seqx <- tokens2sequences(toks1, maxsenlen = 4, keepn = 4) 55 | seqy <- tokens2sequences(toks2, maxsenlen = 4, keepn = 4) 56 | 57 | seqxy <- tokens2sequences_conform(seqx, seqy) 58 | expect_equal(dim(seqxy$matrix), c(2, 4)) 59 | expect_equal(ncol(seqxy$features), 3) 60 | }) 61 | --------------------------------------------------------------------------------