├── .Rbuildignore
├── .github
│   ├── .gitignore
│   └── workflows
│       ├── R-CMD-check.yaml
│       └── test-coverage.yaml
├── .gitignore
├── .travis.yml
├── DESCRIPTION
├── NAMESPACE
├── NEWS.md
├── R
│   ├── crossval.R
│   ├── data.R
│   ├── performance.R
│   ├── quanteda.classifiers-package.R
│   ├── textmodel_cnnlstmemb.R
│   ├── textmodel_evaluate.R
│   ├── textmodel_mlp.R
│   ├── tokens2sequences.R
│   └── zzz.R
├── README.Rmd
├── README.md
├── data
│   ├── data_corpus_LMRD.rda
│   └── data_corpus_manifestosentsUK.rda
├── inst
│   └── WORDLIST
├── man
│   ├── crossval.Rd
│   ├── data_corpus_LMRD.Rd
│   ├── data_corpus_manifestosentsUK.Rd
│   ├── is.tokens2sequences.Rd
│   ├── performance.Rd
│   ├── predict.textmodel_cnnlstmemb.Rd
│   ├── predict.textmodel_mlp.Rd
│   ├── quanteda.classifiers-package.Rd
│   ├── save.textmodel_mlp.Rd
│   ├── summary.textmodel_cnnlstmemb.Rd
│   ├── summary.textmodel_mlp.Rd
│   ├── textmodel_cnnlstmemb.Rd
│   ├── textmodel_evaluate.Rd
│   ├── textmodel_mlp.Rd
│   ├── tokens2sequences.Rd
│   └── tokens2sequences_conform.Rd
├── quanteda.classifiers.Rproj
└── tests
    ├── data_creation
    │   ├── create_data_corpus_manifestosentUK.R
    │   ├── data_uk_immigration.zip
    │   ├── data_uk_manifestos_2015-2019.zip
    │   ├── data_uk_policyarea.zip
    │   └── master.sentences.Rdata
    ├── misc
    │   ├── test-LMRD.Rmd
    │   └── test-LMRD.nb.html
    ├── spelling.R
    ├── testthat.R
    └── testthat
        ├── test-performance.R
        ├── test-textmodel_cnnlstmemb.R
        ├── test-textmodel_evaluate.R
        ├── test-textmodel_mlp.R
        └── test-tokens2sequences.R
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^codecov\.yml$
2 | ^appveyor\.yml$
3 | ^\.travis\.yml$
4 | ^.*\.Rproj$
5 | ^\.Rproj\.user$
6 | README.Rmd
7 | tests/misc/
8 | \.github/
9 |
--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 |
--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
1 | # For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag.
2 | # https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions
3 | on:
4 | push:
5 | branches:
6 | - master
7 | pull_request:
8 | branches:
9 | - master
10 |
11 | name: R-CMD-check
12 |
13 | jobs:
14 | R-CMD-check:
15 | runs-on: ${{ matrix.config.os }}
16 |
17 | name: ${{ matrix.config.os }} (${{ matrix.config.r }})
18 |
19 | strategy:
20 | fail-fast: false
21 | matrix:
22 | config:
23 | - {os: windows-latest, r: 'release'}
24 | - {os: macOS-latest, r: 'release'}
25 | - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}
26 | - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}
27 |
28 | env:
29 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
30 | RSPM: ${{ matrix.config.rspm }}
31 |
32 | steps:
33 | - uses: actions/checkout@v2
34 |
35 | - uses: r-lib/actions/setup-r@v2
36 | with:
37 | r-version: ${{ matrix.config.r }}
38 |
39 | - uses: r-lib/actions/setup-pandoc@v2
40 |
41 | - name: Query dependencies
42 | run: |
43 | install.packages('remotes')
44 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
45 | writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
46 | shell: Rscript {0}
47 |
48 | - name: Cache R packages
49 | if: runner.os != 'Windows'
50 | uses: actions/cache@v1
51 | with:
52 | path: ${{ env.R_LIBS_USER }}
53 | key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
54 | restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-
55 |
56 | - name: Install system dependencies
57 | if: runner.os == 'Linux'
58 | run: |
59 | while read -r cmd
60 | do
61 | eval sudo $cmd
62 | done < <(Rscript -e 'cat(remotes::system_requirements("ubuntu", "20.04"), sep = "\n")')
63 |
64 | - name: Install dependencies
65 | run: |
66 | remotes::install_deps(dependencies = TRUE)
67 | remotes::install_cran("rcmdcheck")
68 | shell: Rscript {0}
69 |
70 | - name: Check
71 | env:
72 | _R_CHECK_CRAN_INCOMING_REMOTE_: false
73 | run: rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check")
74 | shell: Rscript {0}
75 |
76 | - name: Upload check results
77 | if: failure()
78 | uses: actions/upload-artifact@main
79 | with:
80 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results
81 | path: check
82 |
--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
1 | on:
2 | push:
3 | branches:
4 | - master
5 | pull_request:
6 | branches:
7 | - master
8 |
9 | name: test-coverage
10 |
11 | jobs:
12 | test-coverage:
13 | runs-on: macOS-latest
14 | steps:
15 | - uses: actions/checkout@v2
16 |
17 | - uses: r-lib/actions/setup-r@v2
18 | with:
19 | r-version: 'release'
20 |
21 | - uses: r-lib/actions/setup-pandoc@v2
22 |
23 | - name: Query dependencies
24 | run: |
25 | install.packages('remotes')
26 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
27 | shell: Rscript {0}
28 |
29 | - name: Cache R packages
30 | uses: actions/cache@v1
31 | with:
32 | path: ${{ env.R_LIBS_USER }}
33 | key: macOS-r-4.0-2-${{ hashFiles('.github/depends.Rds') }}
34 | restore-keys: macOS-r-4.0-2-
35 |
36 | - name: Install dependencies
37 | run: |
38 | install.packages(c("remotes"))
39 | remotes::install_deps(dependencies = TRUE)
40 | remotes::install_cran("covr")
41 | shell: Rscript {0}
42 |
43 | - name: Test coverage
44 | run: covr::codecov()
45 | shell: Rscript {0}
46 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # History files
2 | .Rhistory
3 | .Rapp.history
4 | # Session Data files
5 | .RData
6 | # Example code in package build process
7 | *-Ex.R
8 | # Output files from R CMD build
9 | /*.tar.gz
10 | # Output files from R CMD check
11 | /*.Rcheck/
12 | # RStudio files
13 | .Rproj.user/
14 | # produced vignettes
15 | vignettes/*.html
16 | vignettes/*.pdf
17 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
18 | .httr-oauth
19 | # knitr and R markdown default cache directories
20 | /*_cache/
21 | /cache/
22 | # Temporary files created by R markdown
23 | *.utf8.md
24 | *.knit.md
25 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
26 | rsconnect/
27 | .Rproj.user
28 | quanteda.svm.Rproj
29 | .DS_Store
30 |
31 | tests/misc/test-LMRD.md
32 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r
2 | language: R
3 |
4 | dist: xenial
5 | sudo: false
6 |
7 | cache:
8 | packages: true
9 | directories:
10 | - $HOME/.keras
11 | - $HOME/.cache/pip
12 |
13 | warnings_are_errors: true
14 |
15 | matrix:
16 | include:
17 | - name: "Backend: TensorFlow | Implementation: Keras"
18 | env:
19 | - KERAS_BACKEND="tensorflow"
20 | - KERAS_IMPLEMENTATION="tensorflow"
21 | - TENSORFLOW_VERSION="default"
22 |
23 | before_script:
24 | - sudo apt-get update
25 | - sudo apt-get install python3 python3-pip
26 | - pip3 install --upgrade --ignore-installed --user travis virtualenv
27 | - R CMD INSTALL .
28 | - R -e 'Sys.setenv(PIP_QUIET=1); keras::install_keras(tensorflow = Sys.getenv("TENSORFLOW_VERSION"), extra_packages = "IPython")'
29 | - R -e 'tensorflow::tf_config()'
30 |
31 | after_success:
32 | - Rscript -e 'covr::codecov()'
33 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: quanteda.classifiers
2 | Type: Package
3 | Title: Models for supervised text classification
4 | Version: 0.4
5 | Authors@R: c(person("Kenneth", "Benoit", email = "kbenoit@lse.ac.uk", role = c("aut", "cre")),
6 | person("Patrick", "Chester", email = "pjc468@nyu.edu", role = c("aut")),
7 | person("Müller", "Stefan", email = "mueller@ipz.uzh.ch", role = c("aut")))
8 | Description: A test package for developing additional text classifier models (textmodels) including neural network-based methods.
9 | Encoding: UTF-8
10 | License: GPL-3
11 | Depends:
12 |     R (>= 3.5.0)
13 | Imports:
14 | quanteda (>= 2.0),
15 | keras (>= 2.1.0),
16 | quanteda.textmodels,
17 | groupdata2
18 | Suggests:
19 | spelling,
20 | testthat,
21 | covr
22 | URL: https://github.com/quanteda/quanteda.classifiers
23 | BugReports: https://github.com/quanteda/quanteda.classifiers/issues
24 | LazyData: TRUE
25 | LazyDataCompression: bzip2
26 | RoxygenNote: 7.2.3
27 | Language: en-GB
28 | Roxygen: list(markdown = TRUE)
29 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | S3method(accuracy,default)
4 | S3method(accuracy,table)
5 | S3method(balanced_accuracy,default)
6 | S3method(balanced_accuracy,list)
7 | S3method(balanced_accuracy,table)
8 | S3method(crossval,textmodel)
9 | S3method(f1_score,default)
10 | S3method(f1_score,table)
11 | S3method(head,textmodel_evaluate)
12 | S3method(load,textmodel_cnnlstmemb)
13 | S3method(load,textmodel_mlp)
14 | S3method(performance,default)
15 | S3method(performance,table)
16 | S3method(precision,default)
17 | S3method(precision,table)
18 | S3method(predict,textmodel_cnnlstmemb)
19 | S3method(predict,textmodel_mlp)
20 | S3method(print,predict.textmodel_cnnlstmemb)
21 | S3method(print,predict.textmodel_mlp)
22 | S3method(print,textmodel_cnnlstmemb)
23 | S3method(print,textmodel_evaluate)
24 | S3method(print,textmodel_mlp)
25 | S3method(print,tokens2sequences)
26 | S3method(recall,default)
27 | S3method(recall,table)
28 | S3method(save,textmodel_cnnlstmemb)
29 | S3method(save,textmodel_mlp)
30 | S3method(summary,textmodel_cnnlstmemb)
31 | S3method(summary,textmodel_mlp)
32 | S3method(textmodel_cnnlstmemb,tokens)
33 | S3method(textmodel_cnnlstmemb,tokens2sequences)
34 | S3method(textmodel_evaluate,dfm)
35 | S3method(textmodel_evaluate,tokens)
36 | S3method(textmodel_mlp,dfm)
37 | S3method(tokens2sequences,character)
38 | S3method(tokens2sequences,tokens)
39 | S3method(tokens2sequences,tokens2sequences)
40 | S3method(tokens2sequences_conform,tokens2sequences)
41 | export(accuracy)
42 | export(balanced_accuracy)
43 | export(crossval)
44 | export(f1_score)
45 | export(is.tokens2sequences)
46 | export(performance)
47 | export(precision)
48 | export(recall)
49 | export(textmodel_cnnlstmemb)
50 | export(textmodel_evaluate)
51 | export(textmodel_mlp)
52 | export(tokens2sequences)
53 | export(tokens2sequences_conform)
54 | import(quanteda)
55 | import(quanteda.textmodels)
56 | importFrom(groupdata2,fold)
57 | importFrom(keras,bidirectional)
58 | importFrom(keras,compile)
59 | importFrom(keras,fit)
60 | importFrom(keras,keras_model_sequential)
61 | importFrom(keras,layer_activation)
62 | importFrom(keras,layer_conv_1d)
63 | importFrom(keras,layer_dense)
64 | importFrom(keras,layer_dropout)
65 | importFrom(keras,layer_embedding)
66 | importFrom(keras,layer_lstm)
67 | importFrom(keras,layer_max_pooling_1d)
68 | importFrom(keras,predict_classes)
69 | importFrom(keras,predict_proba)
70 | importFrom(keras,serialize_model)
71 | importFrom(keras,to_categorical)
72 | importFrom(keras,unserialize_model)
73 | importFrom(quanteda,dfm)
74 | importFrom(quanteda,dfm_subset)
75 | importFrom(stats,na.omit)
76 | importFrom(stats,predict)
77 | importFrom(utils,head)
78 |
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # quanteda.classifiers 0.4
2 |
3 | * Fixed a performance issue in `crossval()`, which is now much faster.
4 | * Added a `NEWS.md` file to track changes to the package.
5 |
--------------------------------------------------------------------------------
/R/crossval.R:
--------------------------------------------------------------------------------
1 | #' Cross-validate a fitted textmodel
2 | #'
3 | #' Cross-validate a fitted textmodel using _k_-fold cross-validation.
4 | #' @param x a fitted textmodel
5 | #' @param k number of folds
6 | #' @inheritParams performance
7 | #' @param verbose logical; if `TRUE`, output results to the console
8 | #' @export
9 | #' @examples
10 | #' library("quanteda")
11 | #' library("quanteda.textmodels")
12 | #' dfmat <- tokens(data_corpus_moviereviews) |>
13 | #' dfm()
14 | #' tmod <- textmodel_nb(dfmat, y = data_corpus_moviereviews$sentiment)
15 | #' crossval(tmod, k = 5, by_class = TRUE)
16 | #' crossval(tmod, k = 5, by_class = FALSE)
17 | #' crossval(tmod, k = 5, by_class = FALSE, verbose = TRUE)
18 | crossval <- function(x, k = 5, by_class = FALSE, verbose = FALSE) {
19 | UseMethod("crossval")
20 | }
21 |
22 | #' @importFrom groupdata2 fold
23 | #' @importFrom quanteda dfm dfm_subset
24 | #' @import quanteda.textmodels
25 | #' @export
26 | crossval.textmodel <- function(x, k = 5, by_class = FALSE, verbose = FALSE) {
27 | # create folds vector - many ways to do this, I chose something available
28 | folds <- fold(data.frame(doc_id = docnames(x)), k = k)[[".folds"]]
29 |
30 | # result list (could be a df, I'm old-fashioned though)
31 | results <- list()
32 |
33 | # loop across folds and refit model, add to results list
34 | for (i in seq_len(k)) {
35 | this_mod <- do.call(class(x)[1],
36 | args = list(x = dfm_subset(x$x, folds != i),
37 | y = x$y[folds != i]))
38 | this_pred <- predict(this_mod, newdata = dfm_subset(x$x, folds == i),
39 | type = "class")
40 | results <- c(results,
41 | structure(list(c(performance(this_pred, x$y[folds == i]),
42 | list(obs = split(seq_len(ndoc(x)), folds)[[i]]))),
43 | names = paste0("fold_", i)))
44 | }
45 |
46 | summ <- summarize_results(results)
47 |
48 | # this may not be the "correct" way to do it - here it averages across
49 | # class-specific averages. Should we average across classes first within
50 | # folds and then average across folds?
51 | if (!by_class)
52 | summ <- apply(summ, 2, mean)
53 |
54 | if (verbose) {
55 | cat("Cross-validation:\n\nMean results for k =", k, "folds:\n\n")
56 | print(summ)
57 | invisible(summ)
58 | } else {
59 | summ
60 | }
61 | }
62 |
63 | # old-skool function to aggregate across a 3-D array
64 | summarize_results <- function(x) {
65 | # remove the "obs"
66 | x <- lapply(x, function(y) y[-which(names(y) == "obs")])
67 |
68 | # make into a 3D array
69 | x_df <- lapply(x, data.frame)
70 |     x_array <- array(unlist(x_df), dim = c(dim(x_df[[1]]), length(x_df)),
71 | dimnames = c(dimnames(x_df[[1]]), list(names(x_df))))
72 |
73 | apply(x_array, c(1, 2), mean)
74 | }
75 |
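# Illustrative sketch (comments only): summarize_results() expects a named list
# with one element per fold, each combining the output of performance() with an
# "obs" element recording the indices of the held-out documents, e.g.
#   list(fold_1 = c(performance(pred_1, truth_1), list(obs = idx_1)),
#        fold_2 = c(performance(pred_2, truth_2), list(obs = idx_2)))
# where pred_*, truth_*, and idx_* are placeholders. The "obs" element is
# dropped before the per-fold metrics are averaged across folds.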
--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
1 | #' Sentence-level corpus of UK party manifestos 1945--2019, partially annotated
2 | #'
3 | #' @description A text corpus of sentences from publicly available party manifestos from the
4 | #' United Kingdom, published between 1945 and 2019. Some manifesto sentences
5 | #' have been rated in terms of the direction of policy by crowd-sourced coders.
6 | #'
7 | #' @description The manifestos from the
8 | #' three main parties (Labour Party, Conservatives, Liberal Democrats) between
9 | #' 1987 and 2010 have been labelled as Economic Policy, Social
10 | #' Policy, or Other, and rated in terms of the direction of Economic Policy and
11 | #' Social Policy. All party
12 | #' manifestos from the 2010 General Election have been crowd-coded in terms of
13 | #' immigration policy, and the direction of immigration policy. For more
14 | #' information on the coding approach see
15 | #' [Benoit et al. (2016)](https://doi.org/10.1017/S0003055416000058).
16 | #'
17 | #' @description The
18 | #' corpus contains the aggregated crowd coding values on the level of sentences.
19 | #' Note that the segmentation into sentences does not always work correctly due
20 | #' to missing punctuation. See Examples for how to remove very short and very
21 | #' long sentences using [quanteda::corpus_trim()].
22 | #'
23 | #' @format
24 | #' The corpus consists of 88,954 documents (i.e. sentences) and includes the following
25 | #' document-level variables: \describe{
26 | #' \item{party}{factor; abbreviation of the party that wrote the manifesto.}
27 | #' \item{partyname}{factor; party that wrote the manifesto.}
28 | #' \item{year}{integer; 4-digit year of the election.}
29 | #' \item{crowd_econsocial_label}{factor; indicates the majority label assigned
30 | #' by crowd workers (Economic Policy, Social Policy, or Neither). The variable
31 | #' has missing values (`NA`) for all non-annotated manifestos.}
32 | #' \item{crowd_econsocial_mean}{numeric; the direction of statements coded as
33 | #' "Economic Policy" or "Social Policy" based on the aggregated crowd codings.
34 | #' The variable is the mean of the scores assigned by the workers who
35 | #' coded the sentence and who allocated the sentence to the "majority"
36 | #' category. The variable ranges from -2 to +2.
37 | #'
38 | #' For the statements aggregated as "Economic" Policy, -2 corresponds to "Very
39 | #' left"; +2 corresponds to "Very right". For the statements aggregated as
40 | #' "Social Policy" -2 corresponds to "Very liberal"; +2 corresponds to "Very
41 | #' conservative". The variable has missing values (NA) for all sentences that
42 | #' were aggregated as "Neither" and for all non-annotated manifestos.}
43 | #' \item{crowd_econsocial_n}{integer; the number of coders who contributed to the
44 | #' mean score `crowd_econsocial_mean`.}
45 | #' \item{crowd_immigration_label}{Factor indicating whether the majority of
46 | #' crowd workers labelled a sentence as referring to immigration or not. The
47 | #' variable has missing values (`NA`) for all non-annotated manifestos.}
48 | #' \item{crowd_immigration_mean}{numeric; the direction
49 | #' of statements coded as "Immigration" based on the aggregated crowd codings.
50 | #' The variable is the mean of the scores assigned by workers who coded a
51 | #' sentence and who allocated the sentence to the "Immigration" category. The
52 | #' variable ranges from -1 ("Negative and closed immigration policy") to +1
53 | #' ("Favorable and open immigration policy"). The variable has missing values
54 | #' (`NA`) for all non-annotated manifestos or if a sentence was not coded as
55 | #' referring to immigration policy based on the aggregation of crowd codings.}
56 | #' \item{crowd_immigration_n}{integer; the number of coders who
57 | #' contributed to the
58 | #' mean score `crowd_immigration_mean`.}
59 | #' }
60 | #' @examples
61 | #' \donttest{
62 | #' library("quanteda")
63 | #'
64 | #' # keep only crowd coded manifestos (with respect to economic and social policy)
65 | #' corp_crowdeconsocial <-
66 | #' corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_econsocial_label))
67 | #'
68 | #' # keep only crowd coded manifestos (with respect to immigration policy)
69 | #' corp_crowdimmig <-
70 | #' corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label))
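#'
#' # remove very short and very long sentences, as mentioned above; the
#' # thresholds here are illustrative only
#' corp_trimmed <- corpus_trim(data_corpus_manifestosentsUK, what = "sentences",
#'                             min_ntoken = 3, max_ntoken = 80)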
71 | #' }
72 | #' @references Benoit, K., Conway, D., Lauderdale, B.E., Laver, M., & Mikhaylov, S. (2016).
73 | #' [Crowd-sourced Text Analysis:
74 | #' Reproducible and Agile Production of Political Data](https://doi.org/10.1017/S0003055416000058).
75 | #' *American Political Science Review*, 110(2), 278--295.
76 | #' @format
77 | #' A [corpus][quanteda::corpus] object.
78 | #' @keywords data
79 | "data_corpus_manifestosentsUK"
80 |
81 | #' Large Movie Review Dataset from Maas et al. (2011)
82 | #'
83 | #' A corpus object containing a dataset for sentiment classification, with
84 | #' 25,000 highly polar movie reviews for training, and 25,000 for testing, from
85 | #' Maas et al. (2011).
86 | #' @format The corpus docvars consist of:
87 | #' \describe{
88 | #' \item{docnumber}{serial (within set and polarity) document number}
89 | #' \item{rating}{user-assigned movie rating on a 1-10 point integer scale}
90 | #' \item{set}{indicates whether the document belongs to the test or training set}
91 | #' \item{polarity}{either `neg` or `pos` to indicate whether the
92 | #' movie review was negative or positive. See Maas et al. (2011) for the
93 | #' cut-off values that governed this assignment.}
94 | #' }
95 | #' @references Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew
96 | #' Y. Ng, and Christopher Potts. (2011).
97 | #' "[Learning
98 | #' Word Vectors for Sentiment Analysis](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf)". The 49th Annual Meeting of the
99 | #' Association for Computational Linguistics (ACL 2011).
100 | #' @source <http://ai.stanford.edu/~amaas/data/sentiment/>
101 | #' @keywords data
102 | "data_corpus_LMRD"
103 |
--------------------------------------------------------------------------------
/R/performance.R:
--------------------------------------------------------------------------------
1 | # overall performance ----------
2 |
3 | #' Performance statistics for prediction
4 | #'
5 | #' @description Functions for computing performance statistics used for model
6 | #' evaluation.
7 | #'
8 | #' @description `performance()` computes all of the following, which are also
9 | #' available via specific functions:
10 | #'
11 | #' Given a 2 x 2 table with notation
12 | #'
13 | #' \tabular{rcc}{ \tab Truth \tab \cr Predicted \tab Positive \tab
14 | #' Negative \cr Positive \tab _A_ \tab _B_ \cr Negative \tab _C_ \tab _D_ \cr }
15 | #'
16 | #' The metrics computed here are:
17 | #' \itemize{
18 | #' \item{precision: }{\eqn{A / (A + B)}}
19 | #' \item{recall: }{\eqn{A / (A + C)}}
20 | #' \item{_F1_: }{\eqn{2 / (recall^{-1} + precision^{-1})}}
21 | #' \item{accuracy: }{\eqn{(A + D) / (A + B + C + D)}, or correctly predicted / all}
22 | #' \item{balanced_accuracy: }{mean(recall) for all categories}
23 | #' }
24 | #' @param data a table of predicted by truth, or vector of predicted labels
25 | #' @param truth vector of "true" labels, or if a table, `2` to indicate that the
26 | #' "true" values are in columns, or `1` if in rows.
27 | #' @param by_class logical; if `TRUE`, estimate performance score separately for
28 | #' each class, otherwise average across classes
29 | #' @param ... not used
30 | #' @return named list consisting of the selected measure(s), where each element
31 | #' is a scalar if `by_class = FALSE`, or a vector named by class if `by_class
32 | #' = TRUE`.
33 | #' @references
34 | #' Powers, D. (2007). "Evaluation: From Precision, Recall and F Factor to ROC,
35 | #' Informedness, Markedness and Correlation." _Technical Report SIE-07-001_,
36 | #' Flinders University.
37 | #' @examples
38 | #' ## Data in Table 2 of Powers (2007)
39 | #'
40 | #' lvs <- c("Relevant", "Irrelevant")
41 | #' tbl_2_1_pred <- factor(rep(lvs, times = c(42, 58)), levels = lvs)
42 | #' tbl_2_1_truth <- factor(c(rep(lvs, times = c(30, 12)),
43 | #' rep(lvs, times = c(30, 28))),
44 | #' levels = lvs)
45 | #'
46 | #' performance(tbl_2_1_pred, tbl_2_1_truth)
47 | #' performance(tbl_2_1_pred, tbl_2_1_truth, by_class = FALSE)
48 | #' performance(table(tbl_2_1_pred, tbl_2_1_truth), by_class = TRUE)
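#'
#' # hand-computed check of the formulas above (illustrative), using the 2 x 2
#' # layout with predictions in rows and the truth in columns:
#' tbl <- table(tbl_2_1_pred, tbl_2_1_truth)
#' tbl[1, 1] / sum(tbl[1, ])  # precision for "Relevant": A / (A + B)
#' tbl[1, 1] / sum(tbl[, 1])  # recall for "Relevant": A / (A + C)
#' sum(diag(tbl)) / sum(tbl)  # accuracy: (A + D) / (A + B + C + D)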
49 | #'
50 | #' @export
51 | performance <- function(data, truth, by_class = TRUE, ...) {
52 | UseMethod("performance")
53 | }
54 |
55 | #' @export
56 | performance.default <- function(data, truth, by_class = TRUE, ...) {
57 | performance(build_table(data, truth), by_class = by_class)
58 | }
59 |
60 | #' @export
61 | performance.table <- function(data, truth = 2, by_class = TRUE, ...) {
62 | data <- check_table(data, truth)
63 | result <- as.list(c(precision(data, by_class = by_class),
64 | recall(data, by_class = by_class),
65 | accuracy(data),
66 | balanced_accuracy(data),
67 | f1_score(data, by_class = by_class)))
68 | result[c("precision", "recall", "f1", "accuracy", "balanced_accuracy")]
69 | }
70 |
71 |
72 | # precision, recall, f1 ----------
73 |
74 | #' @rdname performance
75 | #' @export
76 | #' @examples
77 | #' precision(tbl_2_1_pred, tbl_2_1_truth)
78 | #'
79 | precision <- function(data, truth, by_class = TRUE, ...) {
80 | UseMethod("precision")
81 | }
82 |
83 | #' @export
84 | precision.default <- function(data, truth, by_class = TRUE, ...) {
85 | precision(build_table(data, truth), by_class = by_class)
86 | }
87 |
88 | #' @export
89 | precision.table <- function(data, truth = 2, by_class = TRUE, ...) {
90 | data <- check_table(data, truth)
91 | prec <- sapply(seq_along(diag(data)),
92 | function(x) diag(data)[x] / sum(data[x, ]))
93 | prec <- list(precision = prec)
94 | if (by_class) prec else sapply(prec, mean)
95 | }
96 |
97 |
98 | #' @rdname performance
99 | #' @export
100 | #' @examples
101 | #' recall(tbl_2_1_pred, tbl_2_1_truth)
102 | #'
103 | recall <- function(data, truth, by_class = TRUE, ...) {
104 | UseMethod("recall")
105 | }
106 |
107 | #' @export
108 | recall.default <- function(data, truth, by_class = TRUE, ...) {
109 | recall(build_table(data, truth), by_class = by_class)
110 | }
111 |
112 | #' @export
113 | recall.table <- function(data, truth = 2, by_class = TRUE, ...) {
114 | data <- check_table(data, truth)
115 | prec <- sapply(seq_along(diag(data)),
116 | function(x) diag(data)[x] / sum(data[, x]))
117 | prec <- list(recall = prec)
118 | if (by_class) prec else sapply(prec, mean)
119 | }
120 |
121 | #' @rdname performance
122 | #' @export
123 | #' @examples
124 | #' f1_score(tbl_2_1_pred, tbl_2_1_truth)
125 | #'
126 | f1_score <- function(data, truth, by_class = TRUE, ...) {
127 | UseMethod("f1_score")
128 | }
129 |
130 | #' @export
131 | f1_score.default <- function(data, truth, by_class = TRUE, ...) {
132 | f1_score(build_table(data, truth), by_class = by_class)
133 | }
134 |
135 | #' @export
136 | f1_score.table <- function(data, truth = 2, by_class = TRUE, ...) {
137 | data <- check_table(data, truth)
138 | pr <- data.frame(precision = precision(data, by_class = TRUE)[[1]],
139 | recall = recall(data, by_class = TRUE)[[1]])
140 | #f1_score(pr)
141 | f1 <- list(f1 = apply(pr[c("precision", "recall")], 1,
142 | function(y) 2 / sum(y^(-1))))
143 | if (by_class) f1 else sapply(f1, mean)
144 | }
145 |
146 | # #' @export
147 | # f1_score.list <- function(data, ...) {
148 | # if (!all(c("precision", "recall") %in% names(data)))
149 | # stop("list must contain both precision and recall")
150 | # pr <- list(precision = precision(data, by_class = by_class),
151 | # recall = recall(data, by_class = by_class))
152 | #
153 | # result <- list(f1 = apply(data.frame(data[c("precision", "recall")]), 1,
154 | # function(y) 2 / sum(y^(-1))))
155 | # if (length(result[[1]]) == 1) result[[1]] <- unname(result[[1]])
156 | # result
157 | #}
158 |
159 | # accuracy ----------
160 |
161 | #' @rdname performance
162 | #' @export
163 | #' @examples
164 | #' accuracy(tbl_2_1_pred, tbl_2_1_truth)
165 | #'
166 | accuracy <- function(data, truth, ...) {
167 | UseMethod("accuracy")
168 | }
169 |
170 | #' @export
171 | accuracy.default <- function(data, truth, ...) {
172 | accuracy(build_table(data, truth))
173 | }
174 |
175 | #' @export
176 | accuracy.table <- function(data, truth = 2, ...) {
177 | data <- check_table(data, truth)
178 | list(accuracy = sum(diag(data)) / sum(data))
179 | }
180 |
181 | #' @rdname performance
182 | #' @export
183 | #' @examples
184 | #' balanced_accuracy(tbl_2_1_pred, tbl_2_1_truth)
185 | #'
186 | balanced_accuracy <- function(data, ...) {
187 | UseMethod("balanced_accuracy")
188 | }
189 |
190 | #' @export
191 | balanced_accuracy.default <- function(data, truth, by_class = TRUE, ...) {
192 | balanced_accuracy(build_table(data, truth))
193 | }
194 |
195 | #' @export
196 | balanced_accuracy.table <- function(data, truth = 2, ...) {
197 | data <- check_table(data, truth)
198 | rec <- recall(data, by_class = TRUE)
199 | balanced_accuracy(rec)
200 | }
201 |
202 | #' @export
203 | balanced_accuracy.list <- function(data, ...) {
204 | if (! "recall" %in% names(data))
205 | stop("list must include recall")
206 | if (length(data[["recall"]]) < 2)
207 | stop("recall must be computed by class")
208 | list(balanced_accuracy = mean(unlist(data["recall"])))
209 | }
210 |
211 | # utility functions -------------
212 |
213 | check_table <- function(data, truth) {
214 | if (!truth %in% c(1, 2))
215 | stop("truth must be 2 for columns or 1 for rows")
216 | if (!identical(rownames(data), colnames(data)))
217 | stop("predicted and truth values must have the same order and names")
218 | if (truth == 1) data <- t(data)
219 | data
220 | }
221 |
222 | build_table <- function(data, truth) {
223 | truth <- as.factor(truth)
224 | data <- factor(data, levels = levels(truth))
225 | table(data, truth)
226 | }
227 |
--------------------------------------------------------------------------------
/R/quanteda.classifiers-package.R:
--------------------------------------------------------------------------------
1 | #' quanteda.classifiers
2 | #'
3 | #' Extensions to \pkg{quanteda} that provide supervised machine learning models
4 | #' for document-feature matrices.
5 | #' @import quanteda.textmodels
6 | #' @import quanteda
7 | #' @keywords internal
8 | "_PACKAGE"
9 |
10 | # The following block is used by usethis to automatically manage
11 | # roxygen namespace tags. Modify with care!
12 | ## usethis namespace: start
13 | ## usethis namespace: end
14 | NULL
15 |
--------------------------------------------------------------------------------
/R/textmodel_cnnlstmemb.R:
--------------------------------------------------------------------------------
1 | #' \[Experimental\] Convolutional NN + LSTM model fitted to word embeddings
2 | #'
3 | #' A function that combines a convolutional neural network layer with a long
4 | #' short-term memory layer. It is designed to incorporate word sequences,
5 | #' represented as sequentially ordered word embeddings, into text
6 | #' classification. The model takes as an input a \pkg{quanteda} tokens object.
7 | #'
8 | #' @param x tokens object
9 | #' @inheritParams quanteda.textmodels::textmodel_svm
10 | #' @param dropout1 A floating variable bound between 0 and 1. It determines the
11 | #' rate at which units are dropped for the linear transformation of the
12 | #' inputs for the embedding layer.
13 | #' @param dropout2 A floating variable bound between 0 and 1. It determines the
14 | #' rate at which units are dropped for the linear transformation of the
15 | #' inputs for the CNN layer.
16 | #' @param dropout3 A floating variable bound between 0 and 1. It determines the
17 | #' rate at which units are dropped for the linear transformation of the
18 | #' inputs for the recurrent layer.
19 | #' @param dropout4 A floating variable bound between 0 and 1. It determines the
20 | #'   rate at which units are dropped for the linear transformation of the
21 | #'   recurrent state in the LSTM layer (passed to `recurrent_dropout`).
22 | #' @param wordembeddim The number of word embedding dimensions to be fit
23 | #' @param cnnlayer A logical parameter that allows the user to include or exclude a
24 | #' convolutional layer in the neural network model
25 | #' @param filter The number of output filters in the convolution
26 | #' @param kernel_size An integer or list of a single integer, specifying the
27 | #' length of the 1D convolution window
28 | #' @param pool_size Size of the max pooling windows.
29 | #' [keras::layer_max_pooling_1d()]
30 | #' @param units_lstm The number of nodes of the lstm layer
31 | #' @param words The maximum number of words used to train model. Defaults to the
32 | #' number of features in `x`
33 | #' @param maxsenlen The maximum sentence length of training data
34 | #' @param optimizer optimizer used to fit model to training data, see
35 | #' [keras::compile.keras.engine.training.Model()]
36 | #' @param loss objective loss function, see
37 | #' [keras::compile.keras.engine.training.Model()]
38 | #' @param metrics metric used to train algorithm, see
39 | #' [keras::compile.keras.engine.training.Model()]
40 | #' @param ... additional options passed to
41 | #' [keras::fit.keras.engine.training.Model()]
42 | #' @keywords textmodel
43 | #' @importFrom keras keras_model_sequential to_categorical
44 | #' @importFrom keras layer_dense layer_activation layer_dropout compile fit
45 | #' @importFrom keras layer_embedding layer_conv_1d layer_max_pooling_1d
46 | #' layer_lstm bidirectional
47 | #' @seealso [save.textmodel_cnnlstmemb()], [load.textmodel_cnnlstmemb()]
48 | #' @export
49 | #' @examples
50 | #' \dontrun{
51 | #' # create dataset with evenly balanced coded & uncoded immigration sentences
52 | #' corpcoded <- corpus_subset(data_corpus_manifestosentsUK,
53 | #' !is.na(crowd_immigration_label))
54 | #' corpuncoded <- data_corpus_manifestosentsUK %>%
55 | #' corpus_subset(is.na(crowd_immigration_label) & year > 1980) %>%
56 | #' corpus_sample(size = ndoc(corpcoded))
57 | #' corp <- corpcoded + corpuncoded
58 | #'
59 | #' tok <- tokens(corp)
60 | #'
61 | #' tmod <- textmodel_cnnlstmemb(tok,
62 | #' y = docvars(tok, "crowd_immigration_label"),
63 | #' epochs = 5, verbose = 1)
64 | #'
65 | #' newdata = tokens_subset(tok, subset = is.na(crowd_immigration_label))
66 | #' pred <- predict(tmod, newdata = newdata)
67 | #' table(pred)
68 | #' tail(texts(corpuncoded)[pred == "Immigration"], 10)
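#'
#' # class probabilities rather than discrete class predictions (illustrative)
#' head(predict(tmod, newdata = newdata, type = "probability"))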
69 | #'
70 | #' }
71 | textmodel_cnnlstmemb <- function(x, y, dropout1 = 0.2, dropout2 = 0.2,
72 | dropout3 = 0.2, dropout4 = 0.2,
73 | wordembeddim = 30, cnnlayer = TRUE,
74 | filter = 48, kernel_size = 5, pool_size = 4,
75 | units_lstm = 128, words = NULL,
76 | maxsenlen = 100, optimizer = "adam",
77 | loss = "categorical_crossentropy",
78 | metrics = "categorical_accuracy", ...) {
79 | UseMethod("textmodel_cnnlstmemb")
80 | }
81 |
82 | #' @export
83 | textmodel_cnnlstmemb.tokens <- function(x, y, dropout1 = 0.2, dropout2 = 0.2, dropout3 = 0.2,
84 | dropout4 = 0.2, wordembeddim = 30, cnnlayer = TRUE, filter = 48,
85 | kernel_size = 5, pool_size = 4, units_lstm = 128, words = NULL,
86 | maxsenlen = 100, optimizer = "adam",
87 | loss = "categorical_crossentropy",
88 | metrics = "categorical_accuracy", ...) {
89 | stopifnot(ndoc(x) == length(y))
90 | stopifnot(is.tokens(x))
91 | y <- as.factor(y)
92 | result <- list(x = x, y = y, call = match.call(), classnames = levels(y))
93 | # trim missings for fitting model
94 | na_ind <- which(is.na(y))
95 | if (length(na_ind) > 0) {
96 | y <- y[-na_ind]
97 | # workaround just because negative indexing is broken in v2 for now
98 |         na_ind_logical <- rep(TRUE, ndoc(x))  # must be ndoc(x), not the already-shortened y
99 | na_ind_logical[na_ind] <- FALSE
100 | x <- x[na_ind_logical]
101 | }
102 |
103 | x <- tokens2sequences(x, maxsenlen = maxsenlen, keepn = words)
104 |
105 | if (is.null(words))
106 | words <- x$nfeatures
107 | # "one-hot" encode y
108 | y2 <- to_categorical(as.integer(y) - 1, num_classes = nlevels(y))
109 |
110 | # use keras to fit the model
111 | model <- keras_model_sequential()
112 |
113 | model %>%
114 | layer_embedding(input_dim = words + 1, output_dim = wordembeddim,
115 | input_length = maxsenlen) %>%
116 | layer_dropout(rate = dropout1)
117 |
118 | if (cnnlayer == TRUE) {
119 | model %>%
120 | layer_conv_1d(filters = filter, kernel_size = kernel_size,
121 | activation = "relu") %>%
122 | layer_max_pooling_1d(pool_size = pool_size) %>%
123 | layer_dropout(rate = dropout2)
124 | }
125 |
126 | model %>%
127 | bidirectional(layer_lstm(units = units_lstm, dropout = dropout3,
128 | recurrent_dropout = dropout4)) %>%
129 | layer_dense(units = nlevels(y), activation = "softmax")
130 |
131 | compile(model, loss = loss, optimizer = optimizer, metrics = metrics)
132 | history <- fit(model, x$matrix, y2, ...)
133 |
134 | # compile, class, and return the result
135 | result <- c(result,
136 | nfeatures = x$nfeatures,
137 | maxsenlen = maxsenlen,
138 | list(clefitted = model))
139 | class(result) <- c("textmodel_cnnlstmemb", "textmodel", "list")
140 | return(result)
141 | }
142 |
143 | #' @export
144 | textmodel_cnnlstmemb.tokens2sequences <- function(x, y, dropout1 = 0.2, dropout2 = 0.2, dropout3 = 0.2,
145 | dropout4 = 0.2, wordembeddim = 30, cnnlayer = TRUE, filter = 48,
146 | kernel_size = 5, pool_size = 4, units_lstm = 128, words = NULL,
147 | maxsenlen = 100,
148 | optimizer = "adam",
149 | loss = "categorical_crossentropy",
150 | metrics = "categorical_accuracy", ...) {
151 | stopifnot(nrow(x$matrix) == length(y))
152 | stopifnot(is.tokens2sequences(x))
153 | x <- tokens2sequences(x, maxsenlen = maxsenlen, keepn = words)
154 | y <- as.factor(y)
155 | result <- list(x = x, y = y, call = match.call(), classnames = levels(y))
156 |
157 | # trim missings for fitting model
158 | na_ind <- which(is.na(y))
159 | if (length(na_ind) > 0) {
160 | y <- y[-na_ind]
161 | # workaround just because negative indexing is broken in v2 for now
162 |         na_ind_logical <- rep(TRUE, nrow(x$matrix))  # must match nrow(x$matrix), not the shortened y
163 | na_ind_logical[na_ind] <- FALSE
164 | x$matrix <- x$matrix[na_ind_logical, ]
165 | }
166 |
167 | words <- x$nfeatures
168 | maxsenlen <- ncol(x$matrix)
169 | # "one-hot" encode y
170 | y2 <- to_categorical(as.integer(y) - 1, num_classes = nlevels(y))
171 |
172 | # use keras to fit the model
173 | model <- keras_model_sequential()
174 |
175 | model %>%
176 | layer_embedding(input_dim = words + 1, output_dim = wordembeddim,
177 | input_length = maxsenlen) %>%
178 | layer_dropout(rate = dropout1)
179 |
180 | if (cnnlayer == TRUE) {
181 | model %>%
182 | layer_conv_1d(filters = filter, kernel_size = kernel_size,
183 | activation = "relu") %>%
184 | layer_max_pooling_1d(pool_size = pool_size) %>%
185 | layer_dropout(rate = dropout2)
186 | }
187 |
188 | model %>%
189 | layer_lstm(units = units_lstm, dropout = dropout3,
190 | recurrent_dropout = dropout4) %>%
191 | layer_dense(units = nlevels(y), activation = "softmax")
192 |
193 | compile(model, loss = loss, optimizer = optimizer, metrics = metrics)
194 | history <- fit(model, x$matrix, y2, ...)
195 |
196 | # compile, class, and return the result
197 | result <- c(result,
198 | nfeatures = x$nfeatures,
199 | maxsenlen = maxsenlen,
200 | list(clefitted = model))
201 | class(result) <- c("textmodel_cnnlstmemb", "textmodel", "list")
202 | return(result)
203 | }
204 |
205 | #' Prediction from a fitted textmodel_cnnlstmemb object
206 | #'
207 | #' `predict.textmodel_cnnlstmemb()` implements class predictions from a
208 | #' fitted long short-term memory neural network model.
209 | #' @param object a fitted [textmodel_cnnlstmemb] model
210 | #' @param newdata dfm on which prediction should be made
211 | #' @param type the type of predicted values to be returned; see Value
212 | #' @param force make `newdata`'s feature set conformant to the model terms
213 | #' @param ... not used
214 | #' @return `predict.textmodel_cnnlstmemb` returns either a vector of class
215 | #' predictions for each row of `newdata` (when `type = "class"`), or
216 | #' a document-by-class matrix of class probabilities (when `type =
217 | #' "probability"`).
218 | #' @seealso [textmodel_cnnlstmemb()]
219 | #' @keywords textmodel internal
220 | #' @importFrom keras predict_classes predict_proba
221 | #' @importFrom stats predict
222 | #' @export
223 | predict.textmodel_cnnlstmemb <- function(object, newdata = NULL,
224 | type = c("class", "probability"),
225 | force = TRUE,
226 | ...) {
227 | quanteda:::unused_dots(...)
228 |
229 | type <- match.arg(type)
230 |
231 | if (!is.null(newdata)) {
232 | if(is.tokens(newdata)) {
233 | data <- tokens2sequences(newdata, maxsenlen = object$maxsenlen,
234 | keepn = object$nfeatures)
235 | } else {
236 | data <- newdata
237 | }
238 | t2s_object <- tokens2sequences(object$x, maxsenlen = object$maxsenlen,
239 | keepn = object$nfeatures)
240 | data <- tokens2sequences_conform(data, t2s_object)
241 | } else {
242 | data <- tokens2sequences(object$x, maxsenlen = object$maxsenlen,
243 | keepn = object$nfeatures)
244 | }
245 |
246 | if (type == "class") {
247 | pred_y <- predict_classes(object$clefitted, x = data$matrix)
248 | pred_y <- factor(pred_y, labels = object$classnames,
249 | levels = (seq_along(object$classnames) - 1))
250 | names(pred_y) <- rownames(data$matrix)
251 | } else if (type == "probability") {
252 | pred_y <- predict_proba(object$clefitted, x = data$matrix)
253 | colnames(pred_y) <- object$classnames
254 | rownames(pred_y) <- rownames(data$matrix)
255 | }
256 |
257 | pred_y
258 | }
259 |
260 | #' @export
261 | #' @importFrom stats na.omit
262 | #' @method print textmodel_cnnlstmemb
263 | print.textmodel_cnnlstmemb <- function(x, ...) {
264 | layer_names <- gsub(pattern = "_\\d*", "",
265 | lapply(x$clefitted$layers, function(z) z$name))
266 | cat("\nCall:\n")
267 | print(x$call)
268 | cat("\n",
269 | format(length(na.omit(x$y)), big.mark = ","), " training documents; ",
270 |         format(x$nfeatures, big.mark = ","), " fitted features",
271 | ".\n",
272 | "Structure: ", paste(layer_names, collapse = " -> "), "\n",
273 | sep = "")
274 | }
275 |
276 | #' summary method for textmodel_cnnlstmemb objects
277 | #' @param object output from [textmodel_cnnlstmemb()]
278 | #' @param ... additional arguments not used
279 | #' @keywords textmodel internal
280 | #' @method summary textmodel_cnnlstmemb
281 | #' @export
282 | summary.textmodel_cnnlstmemb <- function(object, ...) {
283 | layer_names <- gsub(pattern = "_\\d*", "",
284 | lapply(object$clefitted$layers, function(x) x$name))
285 |
286 | result <- list(
287 | "call" = object$call,
288 | "model structure" = paste(layer_names, collapse = " -> ")
289 | )
290 | as.summary.textmodel(result)
291 | }
292 |
293 | #' @export
294 | #' @method print predict.textmodel_cnnlstmemb
295 | print.predict.textmodel_cnnlstmemb <- function(x, ...) {
296 | print(unclass(x))
297 | }
298 |
299 |
300 | #' @rdname save.textmodel_mlp
301 | #' @importFrom keras serialize_model
302 | #' @method save textmodel_cnnlstmemb
303 | #' @export
304 | save.textmodel_cnnlstmemb <- function(x, ...) {
305 | x$clefitted <- serialize_model(x$clefitted)
306 | save(x, ...)
307 | }
308 |
309 | #' @rdname save.textmodel_mlp
310 | #' @importFrom keras unserialize_model
311 | #' @method load textmodel_cnnlstmemb
312 | #' @export
313 | load.textmodel_cnnlstmemb <- function(x, ...) {
314 | load(x, ...)
315 | x$clefitted <- unserialize_model(x$clefitted)
316 | }
317 |
--------------------------------------------------------------------------------
/R/textmodel_evaluate.R:
--------------------------------------------------------------------------------
1 | #' Model evaluation function
2 | #'
3 | #' Designed to streamline the parameter tuning and evaluation process. Users
4 | #' choose a function to evaluate and include parameter values as a list. If
5 | #' multiple parameter values are provided, the function will perform a grid
6 | #' search by estimating a separate model for every combination of parameters.
7 | #'
8 | #' @param x the \link{dfm} or \link{tokens} object on which the model will be
9 | #' fit. Does not need to contain only the training documents.
10 | #' @param y vector of training labels associated with each document in
11 | #'   \code{x}. (These will be converted to factors if not already
12 | #' factors.)
13 | #' @param model the name of the machine learning function that will be evaluated
14 | #' @param fun the name of the function that will be used to evaluate the machine
15 | #' learning model. Can take the values "accuracy", "precision", "recall", or
16 | #' "f1_score"
17 | #' @param k number of folds
18 | #' @param parameters model hyperparameters
19 | #' @param seed a seed that can allow for replication of k training data splits.
20 | #' If seed is not provided a seed is chosen based on the current time.
21 | #' @param time logical; whether the output will include the training time (in
22 | #'   seconds) of the model
23 | #' @param by_class logical; if `TRUE`, estimate a separate value of the
24 | #'   evaluation function for every class of the true vector
25 | #' @importFrom stats predict
26 | #' @examples
27 | #' # evaluate immigration classification performance
28 | #' \dontrun{
29 | #' dfmat <- dfm(data_corpus_manifestosentsUK)
30 | #' codes <- docvars(data_corpus_manifestosentsUK, "crowd_immigration_label")
31 | #' evaluation <- textmodel_evaluate(dfmat, codes, k = 3,
32 | #' model = "textmodel_mlp", fun = "f1_score",
33 | #' parameters = list(epochs = c(3, 4)))
34 | #' head(evaluation)
35 | #' aggregate(evaluation, by = list(evaluation$epochs), FUN = "mean")
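#'
#' # illustrative grid search over two hyperparameters at once; the values
#' # below are arbitrary examples, not recommended settings
#' evaluation2 <- textmodel_evaluate(dfmat, codes, k = 3,
#'     model = "textmodel_mlp", fun = "accuracy",
#'     parameters = list(epochs = c(3, 4), units = c(256, 512)))
#' head(evaluation2)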
36 | #' }
37 | #' @export
38 | #'
39 | textmodel_evaluate <- function(x, y,
40 | model,
41 | fun = "f1_score",
42 | k = 5,
43 | parameters = list(),
44 | seed = as.numeric(Sys.time()),
45 | time = TRUE,
46 | by_class = FALSE) {
47 | UseMethod("textmodel_evaluate")
48 | }
49 |
50 | #' @export
51 | textmodel_evaluate.dfm <- function(x, y, model, fun = "f1_score", k = 5,
52 | parameters = list(),
53 | seed = as.numeric(Sys.time()),
54 | time = TRUE, by_class = FALSE) {
55 | stopifnot(is.dfm(x))
56 | if ("accuracy" %in% fun & by_class) {
57 | cat("No class oriented accuracy score defined. Calculating average accuracy accross all classes.\n")
58 | }
59 | total_start <- Sys.time()
60 | set.seed(seed)
61 | y <- as.factor(y)
62 | folds <- cut(seq(1, length(y)), breaks = k, labels = FALSE)
63 | folds <- sample(folds, length(folds), replace = FALSE)
64 | output <- list()
65 | params_df <- expand.grid(parameters, stringsAsFactors = FALSE)
66 | param_len <- ifelse(length(parameters) != 0, nrow(params_df), 1)
67 | w <- 1
68 | for (t in 1:param_len) {
69 | # drop = FALSE ensures that params_df remains a data.frame even if
70 | # there is only a single input parameter
71 | param_list <- as.list(params_df[t, , drop = FALSE])
72 | for (i in 1:k) {
73 | test_set <- which(folds == i)
74 | x_train <- x[-test_set, ]
75 | y_train <- y[-test_set]
76 | x_test <- x[test_set, ]
77 | y_test <- y[test_set]
78 | start <- Sys.time()
79 | mod <- do.call(what = model, args = c(list(x = x_train, y = y_train), param_list))
80 | time <- round(as.numeric(difftime(Sys.time(), start, units = "secs")), 2) # Outputs time in seconds
81 | names(time) <- "time"
82 | y_pred <- predict(mod, x_test)
83 | met <- lapply(fun, function(x) do.call(what = x, args = list(y_pred, y_test, by_class))) # Accepts any evaluation function that takes predicted and test vectors as inputs
84 | #met <- as.list(met)
85 | if ( is.null(names(met))) names(met) <- fun
86 | if (length(parameters) != 0) {
87 | output[[w]] <- data.frame(k = i, met, param_list, as.list(time), seed)
88 | } else {
89 | output[[w]] <- data.frame(k = i, met, as.list(time), seed)
90 | }
91 | if (by_class) {
92 | output[[w]]$class <- rownames(output[[w]])
93 | }
94 | w <- w + 1
95 | }
96 | }
97 | output <- do.call(rbind, output)
98 | rownames(output) <- NULL
99 | total_time <- round(as.numeric(difftime(Sys.time(), total_start, units = "secs")), 2) # Outputs time in seconds
100 | attr(output, "model") <- model
101 | attr(output, "fun") <- fun
102 | attr(output, "k") <- k
103 | attr(output, "parameters") <- parameters
104 | attr(output, "nparameters") <- param_len
105 | attr(output, "total_time") <- total_time
106 | class(output) <- c("textmodel_evaluate", "data.frame")
107 | return(output)
108 | }
109 |
110 | #' @export
111 | textmodel_evaluate.tokens <- function(x, y, model, fun = "f1_score", k = 5,
112 | parameters = list(),
113 | seed = as.numeric(Sys.time()),
114 | time = TRUE, by_class = FALSE) {
115 | stopifnot(is.tokens(x))
116 | if ("accuracy" %in% fun & by_class) {
117 | cat("No class oriented accuracy score defined. Calculating average accuracy accross all classes.\n")
118 | }
119 | total_start <- Sys.time()
120 | set.seed(seed)
121 | y <- as.factor(y)
122 | folds <- cut(seq(1, length(y)), breaks = k, labels = FALSE)
123 | folds <- sample(folds, length(folds), replace = FALSE)
124 | output <- list()
125 | params_df <- expand.grid(parameters, stringsAsFactors = FALSE)
126 | param_len <- ifelse(length(parameters) != 0, nrow(params_df), 1)
127 | w <- 1
128 | for (t in 1:param_len) {
129 | param_list <- as.list(params_df[t, , drop = FALSE]) # drop = FALSE ensures that params_df remains a data.frame even if there is only a single input parameter
130 | for (i in 1:k) {
131 | test_set <- which(folds == i)
132 | x_train <- x[-test_set]
133 | y_train <- y[-test_set]
134 | x_test <- x[test_set]
135 | y_test <- y[test_set]
136 | start <- Sys.time()
137 | mod <- do.call(what = model, args = c(list(x = x_train, y = y_train), param_list))
138 | time <- round(as.numeric(difftime(Sys.time(), start, units = "secs")), 2) # Outputs time in seconds
139 | names(time) <- "time"
140 | y_pred <- predict(mod, x_test)
141 | met <- lapply(fun, function(x) do.call(what = x, args = list(y_pred, y_test, by_class))) # Accepts any evaluation function that takes predicted and test vectors as inputs
142 | #met <- as.list(met)
143 | if (is.null(names(met))) names(met) <- fun
144 | if (length(parameters) != 0) {
145 | output[[w]] <- data.frame(k = i, met, param_list, as.list(time), seed)
146 | } else {
147 | output[[w]] <- data.frame(k = i, met, as.list(time), seed)
148 | }
149 | if (by_class) {
150 | output[[w]]$class <- rownames(output[[w]])
151 | }
152 | w <- w + 1
153 | }
154 | }
155 | output <- do.call(rbind, output)
156 | rownames(output) <- NULL
157 | total_time <- round(as.numeric(difftime(Sys.time(), total_start, units = "secs")), 2) # Outputs time in seconds
158 | attr(output, "model") <- model
159 | attr(output, "fun") <- fun
160 | attr(output, "k") <- k
161 | attr(output, "parameters") <- parameters
162 | attr(output, "nparameters") <- param_len
163 | attr(output, "total_time") <- total_time
164 | class(output) <- c("textmodel_evaluate", "data.frame")
165 | return(output)
166 | }
167 |
168 | #' @seealso [textmodel_evaluate()]
169 | #' @importFrom utils head
170 | #' @method head textmodel_evaluate
171 | #' @export
172 | #'
173 | head.textmodel_evaluate <- function(x, n = 5, ...) {
174 | return(head(as.data.frame(x), n))
175 | }
176 |
177 | #' @seealso [textmodel_evaluate()]
178 | #' @method print textmodel_evaluate
179 | #' @export
180 | print.textmodel_evaluate <- function(x, ...) {
181 | # output
182 | cat("Evaluation of", attr(x, "model"), "using the", attr(x, "fun"), "function.",
183 | "\n")
184 | #return(head(x, 4))
185 | }
186 |
--------------------------------------------------------------------------------
/R/textmodel_mlp.R:
--------------------------------------------------------------------------------
1 | #' Multilayer perceptron network (MLP) model for text classification
2 | #'
3 | #' This function is a wrapper for a multilayer perceptron network model with a
4 | #' single hidden layer, implemented in the \pkg{keras}
5 | #' package.
6 | #' @inheritParams quanteda.textmodels::textmodel_svm
7 | #' @param units The number of network nodes used in the first layer of the
8 | #' sequential model
9 | #' @param dropout A floating variable bound between 0 and 1. It determines the
10 | #' rate at which units are dropped for the linear transformation of the
11 | #' inputs.
12 | #' @param optimizer optimizer used to fit model to training data, see
13 | #' [keras::compile.keras.engine.training.Model()]
14 | #' @param loss objective loss function, see
15 | #' [keras::compile.keras.engine.training.Model()]
16 | #' @param metrics metric used to train algorithm, see
17 | #' [keras::compile.keras.engine.training.Model()]
18 | #' @param ... additional options passed to
19 | #' [keras::fit.keras.engine.training.Model()]
20 | #' @keywords textmodel
21 | #' @importFrom keras keras_model_sequential to_categorical
22 | #' @importFrom keras layer_dense layer_activation layer_dropout compile fit
23 | #' @seealso [save.textmodel_mlp()], [load.textmodel_mlp()]
24 | #' @export
25 | #' @examples
26 | #' \dontrun{
27 | #' # create a dataset with evenly balanced coded and uncoded immigration sentences
28 | #' corpcoded <- corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label))
29 | #' corpuncoded <- data_corpus_manifestosentsUK %>%
30 | #' corpus_subset(is.na(crowd_immigration_label) & year > 1980) %>%
31 | #' corpus_sample(size = ndoc(corpcoded))
32 | #' corp <- corpcoded + corpuncoded
33 | #'
34 | #' # form a tf-idf-weighted dfm
35 | #' dfmat <- dfm(corp) %>%
36 | #' dfm_tfidf()
37 | #'
38 | #' set.seed(1000)
39 | #' tmod <- textmodel_mlp(dfmat, y = docvars(dfmat, "crowd_immigration_label"),
40 | #' epochs = 5, verbose = 1)
41 | #' pred <- predict(tmod, newdata = dfm_subset(dfmat, is.na(crowd_immigration_label)))
42 | #' table(pred)
43 | #' tail(texts(corpuncoded)[pred == "Immigration"], 10)
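#'
#' # class probabilities rather than discrete class predictions (illustrative)
#' head(predict(tmod, newdata = dfm_subset(dfmat, is.na(crowd_immigration_label)),
#'              type = "probability"))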
44 | #' }
45 | textmodel_mlp <- function(x, y, units = 512, dropout = .2,
46 | optimizer = "adam",
47 | loss = "categorical_crossentropy",
48 | metrics = "categorical_accuracy",
49 | ...) {
50 | UseMethod("textmodel_mlp")
51 | }
52 |
53 | #' @export
54 | textmodel_mlp.dfm <- function(x, y, units = 512, dropout = .2,
55 | optimizer = "adam",
56 | loss = "categorical_crossentropy",
57 | metrics = "categorical_accuracy", ...) {
58 | stopifnot(ndoc(x) == length(y))
59 |
60 | x <- as.dfm(x)
61 | y <- as.factor(y)
62 | result <- list(x = x, y = y, call = match.call(), classnames = levels(y))
63 |
64 | # trim missings for fitting model
65 | na_ind <- which(is.na(y))
66 | if (length(na_ind) > 0) {
67 | # message(length(na_ind), "observations with the value 'NA' were removed.")
68 | y <- y[-na_ind]
69 | x <- x[-na_ind, ]
70 | }
71 |
72 | # "one-hot" encode y
73 | y2 <- to_categorical(as.integer(y) - 1, num_classes = nlevels(y))
74 |
75 | # use keras to fit the model
76 | model <- keras_model_sequential() %>%
77 | layer_dense(units = units, input_shape = nfeat(x), activation = "relu") %>%
78 | layer_dropout(rate = dropout) %>%
79 | layer_dense(units = nlevels(y), activation = "softmax")
80 | compile(model, loss = loss, optimizer = optimizer, metrics = metrics)
81 | history <- fit(model, x, y2, ...)
82 |
83 | # compile, class, and return the result
84 | result <- c(result, nfeatures = nfeat(x), list(seqfitted = model))
85 | class(result) <- c("textmodel_mlp", "textmodel", "list")
86 | return(result)
87 | }
88 |
89 | #' Prediction from a fitted textmodel_mlp object
90 | #'
91 | #' `predict.textmodel_mlp()` implements class predictions from a fitted
92 | #' multilayer perceptron network model.
93 | #' @param object a fitted [textmodel_mlp] model
94 | #' @param newdata dfm on which prediction should be made
95 | #' @param type the type of predicted values to be returned; see Value
96 | #' @param force make `newdata`'s feature set conformant to the model terms
97 | #' @param ... not used
98 | #' @return `predict.textmodel_mlp` returns either a vector of class
99 | #' predictions for each row of `newdata` (when `type = "class"`), or
100 | #' a document-by-class matrix of class probabilities (when `type =
101 | #' "probability"`).
102 | #' @seealso [textmodel_mlp()]
103 | #' @keywords textmodel internal
104 | #' @importFrom keras predict_classes predict_proba
105 | #' @importFrom stats predict
106 | #' @export
107 | predict.textmodel_mlp <- function(object, newdata = NULL,
108 | type = c("class", "probability"),
109 | force = TRUE,
110 | ...) {
111 | quanteda:::unused_dots(...)
112 |
113 | type <- match.arg(type)
114 |
115 | if (!is.null(newdata)) {
116 | data <- as.dfm(newdata)
117 | } else {
118 | data <- as.dfm(object$x)
119 | }
120 | model_featnames <- colnames(object$x)
121 | data <- if (is.null(newdata)) {
122 | suppressWarnings(quanteda.textmodels:::force_conformance(data, model_featnames, force))
123 | } else {
124 | quanteda.textmodels:::force_conformance(data, model_featnames, force)
125 | }
126 |
127 | if (type == "class") {
128 | pred_y <- predict_classes(object$seqfitted, x = data)
129 | pred_y <- factor(pred_y, labels = object$classnames, levels = (seq_along(object$classnames) - 1))
130 | names(pred_y) <- docnames(data)
131 | } else if (type == "probability") {
132 | pred_y <- predict_proba(object$seqfitted, x = data)
133 | colnames(pred_y) <- object$classnames
134 | rownames(pred_y) <- docnames(data)
135 | }
136 |
137 | pred_y
138 | }
139 |
140 | #' @export
141 | #' @importFrom stats na.omit
142 | #' @method print textmodel_mlp
143 | print.textmodel_mlp <- function(x, ...) {
144 | layer_names <- gsub(pattern = "_\\d*", "", lapply(x$seqfitted$layers, function(z) z$name))
145 | cat("\nCall:\n")
146 | print(x$call)
147 | cat("\n",
148 | format(length(na.omit(x$y)), big.mark = ","), " training documents; ",
149 |         format(x$nfeatures, big.mark = ","), " fitted features",
150 | ".\n",
151 | "Structure: ", paste(layer_names, collapse = " -> "), "\n",
152 | sep = "")
153 | }
154 |
155 | #' summary method for textmodel_mlp objects
156 | #' @param object output from [textmodel_mlp()]
157 | #' @param ... additional arguments not used
158 | #' @keywords textmodel internal
159 | #' @method summary textmodel_mlp
160 | #' @export
161 | summary.textmodel_mlp <- function(object, ...) {
162 | layer_names <- gsub(pattern = "_\\d*", "", lapply(object$seqfitted$layers, function(x) x$name))
163 |
164 | result <- list(
165 | "call" = object$call,
166 | "model structure" = paste(layer_names, collapse = " -> ")
167 | )
168 | as.summary.textmodel(result)
169 | }
170 |
171 | #' @export
172 | #' @method print predict.textmodel_mlp
173 | print.predict.textmodel_mlp <- function(x, ...) {
174 | print(unclass(x))
175 | }
176 |
177 | #' Load or save keras-based textmodels
178 | #'
179 | #' Functions for loading and saving \pkg{keras}-based models. Because these are
180 | #' stored as references, they need to be "serialized" prior to saving, and
181 | #' unserialized upon loading. This applies to models fit using
182 | #' [textmodel_cnnlstmemb()] and [textmodel_mlp()].
183 | #' @param x a \pkg{keras}-based fitted textmodel
184 | #' @param ... additional arguments passed to [save()] or [load()]
185 | #' @importFrom keras serialize_model
186 | #' @keywords internal
187 | #' @export
188 | #' @method save textmodel_mlp
189 | save.textmodel_mlp <- function(x, ...) {
190 | x$seqfitted <- serialize_model(x$seqfitted)
191 | save(x, ...)
192 | }
193 |
194 | #' @rdname save.textmodel_mlp
195 | #' @importFrom keras unserialize_model
196 | #' @method load textmodel_mlp
197 | #' @export
198 | load.textmodel_mlp <- function(x, ...) {
199 | load(x, ...)
200 | x$seqfitted <- unserialize_model(x$seqfitted)
201 |     invisible(x)
202 | }
203 |
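204 | # Usage sketch (illustrative only, kept as a comment so it is never run):
205 | # `dfmat` and the docvar "label" are placeholders, and the file name is
206 | # arbitrary.
207 | #
208 | #   tmod <- textmodel_mlp(dfmat, y = docvars(dfmat, "label"), epochs = 5)
209 | #   save.textmodel_mlp(tmod, file = "mlp_model.rda")  # serializes the keras model before saving
210 | #   tmod2 <- load.textmodel_mlp("mlp_model.rda")      # reloads and unserializes it
211 | #   predict(tmod2, newdata = dfmat, type = "class")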
--------------------------------------------------------------------------------
/R/tokens2sequences.R:
--------------------------------------------------------------------------------
1 | #' \[Experimental\] Convert quanteda tokens to keras sequences
2 | #'
3 | #' This function converts a \pkg{quanteda} [quanteda::tokens()] object
4 | #' into a tokens sequence object as expected by some functions in the
5 | #' \pkg{keras} package.
6 | #' @param x [quanteda::tokens()] object
7 | #' @param maxsenlen the maximum sentence length kept in output matrix
8 | #' @param keepn the maximum number of features to keep
9 | #' @return [tokens2sequences()] returns a tokens2sequences object whose matrix
10 | #'   component has one row for each tokenized document passed to the function
11 | #'   and a number of columns determined by `maxsenlen`. The matrix contains a
12 | #'   numeric code for every unique token kept (determined by `keepn`), and the
13 | #'   codes are arranged in the same sequence as in the original
14 | #'   [quanteda::tokens()] object.
15 | #' @seealso [is.tokens2sequences()], [tokens2sequences_conform()]
16 | #' @export
17 | #' @examples
18 | #' library("quanteda")
19 | #' corp <- corpus_subset(data_corpus_inaugural, Year <= 1793)
20 | #' corptok <- tokens(corp)
21 | #' print(corp)
22 | #' seqs <- tokens2sequences(corptok, maxsenlen = 200)
23 | #' print(seqs)
24 | tokens2sequences <- function(x, maxsenlen = 100, keepn = NULL) {
25 | UseMethod("tokens2sequences")
26 | }
27 |
28 | #' @export
29 | tokens2sequences.tokens <- function(x, maxsenlen = 100, keepn = NULL) {
30 | stopifnot(is.tokens(x))
31 | tfeq <- sort(table(unlist(x)), decreasing = T) # Creates a table of tokens and their frequencies sorted from most common to least
32 | doc_nam <- docnames(x) # Store docnames from tokens object for future use
33 | x <- unclass(x) # Convert tokens to integer IDs
34 | features <- attr(x, "types") # Store feature names
35 | data <- data.frame(features = features, # Create a dataframe that maps each token to its id and frequency
36 | label1 = 1:length(features),
37 | freq1 = as.integer(tfeq[features]),
38 | stringsAsFactors = FALSE)
39 | attributes(x) <- NULL
40 |   data <- data[order(data$freq1, decreasing = TRUE), ] # Reorders feature dictionary by overall frequency (freq1)
41 | x <- lapply(x, function(y) if(length(y) > maxsenlen) y[1:maxsenlen] else y)
42 | words <- data.frame(table(unlist(x)))
43 | names(words) <- c("label1", "freq")
44 | data <- merge(data, words, by = "label1", all.x = TRUE)
45 | data <- data[order(data$freq, decreasing = TRUE), ]
46 | data$label <- NA
47 | if (!is.null(keepn)) {
48 | if (keepn > sum(!is.na(data$freq))) keepn <- sum(!is.na(data$freq)) # Makes sure that we are not attempting to keep more features than exist
49 | data$label[1:keepn] <- 1:keepn # Subsets tokens to include only the n most common
50 |
51 | } else {
52 | data$label[1:sum(!is.na(data$freq))] <- 1:sum(!is.na(data$freq))
53 | }
54 | data <- data[order(data$label1, decreasing = FALSE), ] # Orders by original numeric labels. This is done to allow 1:1 mapping of dictionary index numbers to original IDs
55 | x <- lapply(x, function(y) as.integer(na.omit(data$label[y]))) # Assign new, frequency-based IDs to word sequence list
56 | mat <- do.call("rbind", lapply(x, function(y) {
57 |     if (length(y) < maxsenlen) {y <- c(rep(0L, times = maxsenlen - length(y)), y)} # Left-pads with zeros so every sequence has length maxsenlen; rbind() then combines them into a single matrix
58 | return(y)
59 | }
60 | ))
61 | rownames(mat) <- doc_nam # Adds docname to each row of the matrix
62 | colnames(mat) <- as.character(1:maxsenlen) # Adds a numeric label to each column
63 | dropped_tokens <- 1 - with(data, sum(freq[!is.na(label)], na.rm = T) / sum(freq1, na.rm = T))
64 | dropped_types <- 1 - with(data, length(na.omit(label)) / length(na.omit(label1)))
65 | data <- data[!is.na(data$label), ] # Removes words that were not assigned numeric ids from the dictionary
66 | data <- data[order(data$label, decreasing = FALSE),
67 | c("features", "label", "freq")] # selects feature names, ids, and frequency for dictionary and orders by frequency-based ID
68 | rownames(data) <- NULL # Resets rownames of dictionary
69 | output <- list(matrix = mat, nfeatures = nrow(data), features = data, dropped_types = dropped_types, dropped_tokens = dropped_tokens)
70 | class(output) <- "tokens2sequences"
71 | return(output)
72 | }
73 |
74 | #' @export
75 | tokens2sequences.character <- function(x, maxsenlen = 100, keepn = NULL) {
76 | stopifnot(is.character(x))
77 | x <- tokens(x)
78 | tfeq <- sort(table(unlist(x)), decreasing = T) # Creates a table of tokens and their frequencies sorted from most common to least
79 | doc_nam <- docnames(x) # Store docnames from tokens object for future use
80 | x <- unclass(x) # Convert tokens to integer IDs
81 | features <- attr(x, "types") # Store feature names
82 | data <- data.frame(features = features, # Create a dataframe that maps each token to its id and frequency
83 | label1 = 1:length(features),
84 | freq1 = as.integer(tfeq[features]),
85 | stringsAsFactors = FALSE)
86 | attributes(x) <- NULL
87 |   data <- data[order(data$freq1, decreasing = TRUE), ] # Reorders feature dictionary by overall frequency (freq1)
88 | x <- lapply(x, function(y) if(length(y) > maxsenlen) y[1:maxsenlen] else y)
89 | words <- data.frame(table(unlist(x)))
90 | names(words) <- c("label1", "freq")
91 | data <- merge(data, words, by = "label1", all.x = TRUE)
92 | data <- data[order(data$freq, decreasing = TRUE), ]
93 | data$label <- NA
94 | if (!is.null(keepn)) {
95 | if (keepn > sum(!is.na(data$freq))) keepn <- sum(!is.na(data$freq)) # Makes sure that we are not attempting to keep more features than exist
96 | data$label[1:keepn] <- 1:keepn # Subsets tokens to include only the n most common
97 |
98 | } else {
99 | data$label[1:sum(!is.na(data$freq))] <- 1:sum(!is.na(data$freq))
100 | }
101 | data <- data[order(data$label1, decreasing = FALSE), ] # Orders by original numeric labels. This is done to allow 1:1 mapping of dictionary index numbers to original IDs
102 | x <- lapply(x, function(y) as.integer(na.omit(data$label[y]))) # Assign new, frequency-based IDs to word sequence list
103 | mat <- do.call("rbind", lapply(x, function(y) {
104 |     if (length(y) < maxsenlen) {y <- c(rep(0L, times = maxsenlen - length(y)), y)} # Left-pads with zeros so every sequence has length maxsenlen; rbind() then combines them into a single matrix
105 | return(y)
106 | }
107 | ))
108 | rownames(mat) <- doc_nam # Adds docname to each row of the matrix
109 | colnames(mat) <- as.character(1:maxsenlen) # Adds a numeric label to each column
110 | dropped_tokens <- 1 - with(data, sum(freq[!is.na(label)], na.rm = T) / sum(freq1, na.rm = T))
111 | dropped_types <- 1 - with(data, length(na.omit(label)) / length(na.omit(label1)))
112 | data <- data[!is.na(data$label), ] # Removes words that were not assigned numeric ids from the dictionary
113 | data <- data[order(data$label, decreasing = FALSE),
114 | c("features", "label", "freq")] # selects feature names, ids, and frequency for dictionary and orders by frequency-based ID
115 | rownames(data) <- NULL # Resets rownames of dictionary
116 | output <- list(matrix = mat, nfeatures = nrow(data), features = data, dropped_types = dropped_types, dropped_tokens = dropped_tokens)
117 | class(output) <- "tokens2sequences"
118 | return(output)
119 | }
120 |
121 | #' @export
122 | tokens2sequences.tokens2sequences <- function(x, maxsenlen = 100, keepn = NULL) {
123 | stopifnot(is.tokens2sequences(x))
124 | doc_nam <- rownames(x$matrix) # Store docnames from tokens object for future use
125 | data <- x$features
126 | names(data)[names(data) %in% c("label", "freq")] <- c('label1', "freq1")
127 | x <- x$matrix
128 | x <- lapply(1:nrow(x), function(y) {
129 | j <- x[y, ]
130 | return(j[j != 0])
131 | })
132 | x <- lapply(x, function(y) if(length(y) > maxsenlen) y[1:maxsenlen] else y)
133 | words <- data.frame(table(unlist(x)))
134 | names(words) <- c("label1", "freq")
135 | data <- merge(data, words, by = "label1", all.x = TRUE)
136 | data <- data[order(data$freq, decreasing = TRUE), ]
137 | data$label <- NA
138 | if (!is.null(keepn)) {
139 | if (keepn > sum(!is.na(data$freq))) keepn <- sum(!is.na(data$freq)) # Makes sure that we are not attempting to keep more features than exist
140 | data$label[1:keepn] <- 1:keepn # Subsets tokens to include only the n most common
141 |
142 | } else {
143 | data$label[1:sum(!is.na(data$freq))] <- 1:sum(!is.na(data$freq))
144 | }
145 | data <- data[order(data$label1, decreasing = FALSE), ] # Orders by original numeric labels. This is done to allow 1:1 mapping of dictionary index numbers to original IDs
146 | x <- lapply(x, function(y) as.integer(na.omit(data$label[y]))) # Assign new, frequency-based IDs to word sequence list
147 | mat <- do.call("rbind", lapply(x, function(y) {
148 |     if (length(y) < maxsenlen) {y <- c(rep(0L, times = maxsenlen - length(y)), y)} # Left-pads with zeros so every sequence has length maxsenlen; rbind() then combines them into a single matrix
149 | return(y)
150 | }
151 | ))
152 | rownames(mat) <- doc_nam # Adds docname to each row of the matrix
153 | colnames(mat) <- as.character(1:maxsenlen) # Adds a numeric label to each column
154 | dropped_tokens <- 1 - with(data, sum(freq[!is.na(label)], na.rm = T) / sum(freq1, na.rm = T))
155 | dropped_types <- 1 - with(data, length(na.omit(label)) / length(na.omit(label1)))
156 | data <- data[!is.na(data$label), ] # Removes words that were not assigned numeric ids from the dictionary
157 | data <- data[order(data$label, decreasing = FALSE),
158 | c("features", "label", "freq")] # selects feature names, ids, and frequency for dictionary and orders by frequency-based ID
159 | rownames(data) <- NULL # Resets rownames of dictionary
160 | output <- list(matrix = mat, nfeatures = nrow(data), features = data, dropped_types = dropped_types, dropped_tokens = dropped_tokens)
161 | class(output) <- "tokens2sequences"
162 | return(output)
163 | }
164 |
165 | #' @seealso [tokens2sequences()]
166 | #' @export
167 | #' @importFrom utils head
168 | #' @method print tokens2sequences
169 | print.tokens2sequences <- function(x, removed = FALSE, ...) {
170 | # calculate % sparse
171 | zeros <- sum(colSums(x$matrix == 0))
172 | tot <- nrow(x$matrix) * ncol(x$matrix)
173 | sparse_pct <- round(zeros / tot * 100, 1)
174 |
175 | # determine max number of features to print
176 | max_n <- ifelse(ncol(x$matrix) > 10, 10, ncol(x$matrix))
177 |
178 | # output
179 | cat("Ordered feature matrix of: ", format(nrow(x$matrix), big.mark = ","),
180 | " documents, ", format(x$nfeatures, big.mark = ","), " features ",
181 | "(", sparse_pct, "% sparse).\n\n", sep = "")
182 | #cat(nrow(x$matrix), " x ", ncol(x$matrix),
183 | # " Matrix of class \"tokens2sequences\" \n\n", sep = "")
184 | if(removed) cat("Current parameter settings resulted in the removal of", round(100*x$dropped_types, 1), "percent of types\nand", round(100*x$dropped_tokens, 1), "percent of tokens.\n\n")
185 | print(head(x$matrix[, 1:max_n], 4))
186 | }
187 |
188 | #' Match the feature names of one tokens2sequences object to another
189 | #'
190 | #' Converts the feature names of one tokens2sequences object to those of
191 | #' another. Useful in aligning training and test sets.
192 | #' @param x [tokens2sequences()] object that will be forced to conform
193 | #' @param y [tokens2sequences()] object whose feature names will be
194 | #' used to change token labels for `x`
195 | #' @seealso [tokens2sequences()]
196 | #' @keywords internal
197 | #' @export
198 | #' @examples
199 | #' \dontrun{
200 | #' corpcoded <- corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label))
201 | #' corpuncoded <- data_corpus_manifestosentsUK %>%
202 | #' corpus_subset(is.na(crowd_immigration_label) & year > 1980) %>%
203 | #' corpus_sample(size = ndoc(corpcoded))
204 | #'
205 | #' tokx <- tokens(corpuncoded)
206 | #' toky <- tokens(corpcoded)
207 | #'
208 | #' seqx <- tokens2sequences(tokx, maxsenlen = 50, keepn = 5000)
209 | #' seqy <- tokens2sequences(toky, maxsenlen = 50, keepn = 5000)
210 | #' tokens2sequences_conform(seqx, seqy)
211 | #' }
212 | tokens2sequences_conform <- function(x, y) {
213 | UseMethod("tokens2sequences_conform")
214 | }
215 |
216 | #' @export
217 | #' @importFrom stats na.omit
218 | tokens2sequences_conform.tokens2sequences <- function(x, y) {
219 | stopifnot(is.tokens2sequences(x) & is.tokens2sequences(y))
220 | joint_feat <- merge(x$features, y$features[, -3], by = "features",
221 | all.x = TRUE)
222 | joint_feat <- joint_feat[order(joint_feat$label.x, decreasing = FALSE), ]
223 | mat <- apply(x$matrix, 1,
224 | function(x) as.integer(na.omit(joint_feat$label.y[x])))
225 | mat <- do.call("rbind", lapply(mat, function(y)
226 | if (length(y) >= ncol(x$matrix))
227 | y[1:ncol(x$matrix)]
228 | else
229 | c(rep(0, times = ncol(x$matrix) - length(y)), y)
230 | ))
231 | rownames(mat) <- rownames(x$matrix)
232 | colnames(mat) <- colnames(x$matrix)
233 | joint_feat <- joint_feat[, c("features", "label.y", "freq")]
234 | names(joint_feat)[2] <- "label"
235 | joint_feat <- joint_feat[order(joint_feat$label, decreasing = F), ]
236 | joint_feat <- joint_feat[!is.na(joint_feat$label), ]
237 | rownames(joint_feat) <- NULL
238 |
239 | output <-
240 | list(matrix = mat, nfeatures = nrow(joint_feat), features = joint_feat)
241 | class(output) <- c("tokens2sequences")
242 | return(output)
243 | }
244 |
245 | #' Check whether an object is a tokens2sequences object
246 | #'
247 | #'
248 | #' @param x Object that will be checked to see if it is of the type [tokens2sequences()]
249 | #' @seealso [tokens2sequences()]
250 | #' @keywords internal
251 | #' @export
252 | is.tokens2sequences <- function(x) {
253 | "tokens2sequences" %in% class(x)
254 | }
255 |
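256 | # Usage sketch (illustrative only, kept as a comment so it is never run):
257 | # aligning training and test sequences before fitting a keras model.
258 | # `toks_train` and `toks_test` are placeholder tokens objects.
259 | #
260 | #   seq_train <- tokens2sequences(toks_train, maxsenlen = 50, keepn = 5000)
261 | #   seq_test  <- tokens2sequences(toks_test, maxsenlen = 50, keepn = 5000)
262 | #   seq_test  <- tokens2sequences_conform(seq_test, seq_train)  # recode test IDs to the training vocabulary
263 | #   is.tokens2sequences(seq_test)  # TRUE
264 | #   dim(seq_train$matrix)          # one row per document, 50 zero-padded columns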
--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
1 | .onAttach <- function(...) {
2 | options(keras.fit_verbose = 0)
3 | options(keras.view_metrics = FALSE)
4 | }
5 |
6 |
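7 | # These defaults silence keras's per-epoch fitting output package-wide. A user
8 | # can restore interactive progress output after attaching with, for example:
9 | #   options(keras.fit_verbose = 1, keras.view_metrics = TRUE)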
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output: github_document
3 | ---
4 |
5 | ```{r, echo = FALSE}
6 | knitr::opts_chunk$set(
7 | collapse = TRUE,
8 | comment = "##",
9 | fig.path = "man/images/"
10 | )
11 | ```
12 |
13 | # quanteda.classifiers: Text classification textmodel extensions for quanteda
14 |
15 | [](https://CRAN.R-project.org/package=quanteda.classifiers)
16 | [](https://github.com/quanteda/quanteda.classifiers/actions)
17 | [](https://codecov.io/github/quanteda/quanteda.classifiers?branch=master)
18 | [](https://www.tidyverse.org/lifecycle/#experimental)
19 |
20 |
21 | ## Installation
22 |
23 | To install this package, run the commands below, which also install the Python back end that the R **keras** package needs in order to run.
24 | ```{r eval = FALSE}
25 | # devtools package required to install quanteda.classifiers from GitHub
26 | devtools::install_github("quanteda/quanteda.classifiers")
27 |
28 | keras::install_keras(method = "conda")
29 | ```
30 |
31 | ## Available classifiers
32 |
33 | This package contains two experimental methods that are built on top of the **keras** package. (The SVM models have been moved to [**quanteda.textmodels**](https://github.com/quanteda/quanteda.textmodels).)
34 |
35 | Classifier | Command
36 | --|--
37 | Multilayer perceptron network | `textmodel_mlp()`
38 | Convolutional neural network + LSTM model fitted to word embeddings | `textmodel_cnnlstmemb()`
39 |
40 | ## Available human-annotated corpora
41 |
42 | Corpus | Name
43 | --|--
44 | Sentence-level corpus of UK party manifestos 1945–2019, partially annotated | `data_corpus_manifestosentsUK`
45 | Large Movie Review Dataset of 50,000 annotated highly polar movie reviews for training and testing, from Maas et al. (2011) | `data_corpus_LMRD`
46 |
47 | ## Demonstration
48 |
49 | See this (very preliminary!) [performance comparison](https://htmlpreview.github.io/?https://github.com/quanteda/quanteda.classifiers/blob/master/tests/misc/test-LMRD.nb.html).
50 |
51 |
52 | ## How to cite
53 |
54 | Benoit, Kenneth, Patrick Chester, and Stefan Müller (2019). quanteda.classifiers: Models for supervised text classification. R package version 0.2. URL: https://github.com/quanteda/quanteda.classifiers.
55 |
56 | For a BibTeX entry, use the output from `citation(package = "quanteda.classifiers")`.
57 |
58 | ## Issues
59 |
60 | * Please file an issue (with a bug, wish list, etc.) [via GitHub](https://github.com/quanteda/quanteda.classifiers/issues).
61 |
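62 | ## Minimal example
63 | 
64 | A minimal sketch of the workflow, adapted from the package documentation examples (not evaluated here; it assumes a working **keras** installation):
65 | 
66 | ```{r eval = FALSE}
67 | library("quanteda")
68 | library("quanteda.classifiers")
69 | 
70 | # fit an MLP classifier to the crowd-coded immigration sentences
71 | corp <- corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label))
72 | dfmat <- dfm(tokens(corp))
73 | tmod <- textmodel_mlp(dfmat, y = docvars(dfmat, "crowd_immigration_label"), epochs = 5)
74 | 
75 | # predicted classes for the documents used in training
76 | head(predict(tmod, type = "class"))
77 | ```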
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # quanteda.classifiers: Text classification textmodel extensions for quanteda
3 |
4 | [](https://CRAN.R-project.org/package=quanteda.classifiers)
6 | [](https://github.com/quanteda/quanteda.classifiers/actions)
8 | [](https://codecov.io/github/quanteda/quanteda.classifiers?branch=master)
10 | [](https://www.tidyverse.org/lifecycle/#experimental)
12 |
13 | ## Installation
14 |
15 | To install this package, run the commands below, which also install the
16 | Python back end that the R **keras** package needs in order to run.
17 |
18 | ``` r
19 | # devtools package required to install quanteda.classifiers from GitHub
20 | devtools::install_github("quanteda/quanteda.classifiers")
21 |
22 | keras::install_keras(method = "conda")
23 | ```
24 |
25 | ## Available classifiers
26 |
27 | This package contains two experimental methods that are built on top of
28 | the **keras** package. (The SVM models have been moved to
29 | [**quanteda.textmodels**](https://github.com/quanteda/quanteda.textmodels).)
30 |
31 | | Classifier | Command |
32 | |---------------------------------------------------------------------|--------------------------|
33 | | Multilayer perceptron network                                        | `textmodel_mlp()`        |
34 | | Convolutional neural network + LSTM model fitted to word embeddings | `textmodel_cnnlstmemb()` |
35 |
36 | ## Available human-annotated corpora
37 |
38 | | Corpus | Name |
39 | |------------------------------------------------------------------------------------------------------------------------------|--------------------------------|
40 | | Sentence-level corpus of UK party manifestos 1945–2019, partially annotated | `data_corpus_manifestosentsUK` |
41 | | Large Movie Review Dataset of 50,000 annotated highly polar movie reviews for training and testing, from Maas et al. (2011)  | `data_corpus_LMRD`             |
42 |
43 | ## Demonstration
44 |
45 | See this (very preliminary!) [performance
46 | comparison](https://htmlpreview.github.io/?https://github.com/quanteda/quanteda.classifiers/blob/master/tests/misc/test-LMRD.nb.html).
47 |
48 | ## How to cite
49 |
50 | Benoit, Kenneth, Patrick Chester, and Stefan Müller (2019).
51 | quanteda.classifiers: Models for supervised text classification. R
52 | package version 0.2. URL: <https://github.com/quanteda/quanteda.classifiers>.
53 |
54 | For a BibTeX entry, use the output from
55 | `citation(package = "quanteda.classifiers")`.
56 |
57 | ## Issues
58 |
59 | - Please file an issue (with a bug, wish list, etc.) [via
60 | GitHub](https://github.com/quanteda/quanteda.classifiers/issues).
61 |
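62 | ## Minimal example
63 | 
64 | A minimal sketch of the workflow, adapted from the package documentation
65 | examples (not evaluated here; it assumes a working **keras** installation):
66 | 
67 | ``` r
68 | library("quanteda")
69 | library("quanteda.classifiers")
70 | 
71 | # fit an MLP classifier to the crowd-coded immigration sentences
72 | corp <- corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label))
73 | dfmat <- dfm(tokens(corp))
74 | tmod <- textmodel_mlp(dfmat, y = docvars(dfmat, "crowd_immigration_label"), epochs = 5)
75 | 
76 | # predicted classes for the documents used in training
77 | head(predict(tmod, type = "class"))
78 | ```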
--------------------------------------------------------------------------------
/data/data_corpus_LMRD.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quanteda/quanteda.classifiers/3aabdb708468932c4bd5d1a3d3005ae3d653c07d/data/data_corpus_LMRD.rda
--------------------------------------------------------------------------------
/data/data_corpus_manifestosentsUK.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quanteda/quanteda.classifiers/3aabdb708468932c4bd5d1a3d3005ae3d653c07d/data/data_corpus_manifestosentsUK.rda
--------------------------------------------------------------------------------
/inst/WORDLIST:
--------------------------------------------------------------------------------
1 | ACL
2 | cnnlstmemb
3 | com
4 | Crowdflower
5 | dfm
6 | docvars
7 | etc
8 | Favorable
9 | github
10 | Informedness
11 | keras
12 | Lifecycle
13 | lstm
14 | LSTM
15 | Maas
16 | Mikhaylov
17 | mlp
18 | MLP
19 | NN
20 | perceptron
21 | Pham
22 | quanteda
23 | SIE
24 | svm
25 | textmodel
26 | textmodels
27 | th
28 |
--------------------------------------------------------------------------------
/man/crossval.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/crossval.R
3 | \name{crossval}
4 | \alias{crossval}
5 | \title{Cross-validate a fitted textmodel}
6 | \usage{
7 | crossval(x, k = 5, by_class = FALSE, verbose = FALSE)
8 | }
9 | \arguments{
10 | \item{x}{a fitted textmodel}
11 |
12 | \item{k}{number of folds}
13 |
14 | \item{by_class}{logical; if \code{TRUE}, estimate performance score separately for
15 | each class, otherwise average across classes}
16 |
17 | \item{verbose}{logical; if \code{TRUE}, output results to the console}
18 | }
19 | \description{
20 | Cross-validate a fitted textmodel using \emph{k}-fold cross-validation.
21 | }
22 | \examples{
23 | library("quanteda")
24 | library("quanteda.textmodels")
25 | dfmat <- tokens(data_corpus_moviereviews) |>
26 | dfm()
27 | tmod <- textmodel_nb(dfmat, y = data_corpus_moviereviews$sentiment)
28 | crossval(tmod, k = 5, by_class = TRUE)
29 | crossval(tmod, k = 5, by_class = FALSE)
30 | crossval(tmod, k = 5, by_class = FALSE, verbose = TRUE)
31 | }
32 |
--------------------------------------------------------------------------------
/man/data_corpus_LMRD.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{data_corpus_LMRD}
5 | \alias{data_corpus_LMRD}
6 | \title{Large Movie Review Dataset from Maas et al. (2011)}
7 | \format{
8 | The corpus docvars consist of:
9 | \describe{
10 | \item{docnumber}{serial (within set and polarity) document number}
11 | \item{rating}{user-assigned movie rating on a 1-10 point integer scale}
12 | \item{set}{used for test v. training set}
13 | \item{polarity}{either \code{neg} or \code{pos} to indicate whether the
14 |   movie review was negative or positive. See Maas et al. (2011) for the
15 | cut-off values that governed this assignment.}
16 | }
17 | }
18 | \source{
19 | \url{http://ai.stanford.edu/~amaas/data/sentiment/}
20 | }
21 | \usage{
22 | data_corpus_LMRD
23 | }
24 | \description{
25 | A corpus object containing a dataset for sentiment classification containing
26 | 25,000 highly polar movie reviews for training, and 25,000 for testing, from
27 | Maas et al. (2011).
28 | }
29 | \references{
30 | Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew
31 | Y. Ng, and Christopher Potts. (2011).
32 | "\href{http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf}{Learning Word Vectors for Sentiment Analysis}". The 49th Annual Meeting of the
33 | Association for Computational Linguistics (ACL 2011).
34 | }
35 | \keyword{data}
36 |
--------------------------------------------------------------------------------
/man/data_corpus_manifestosentsUK.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{data_corpus_manifestosentsUK}
5 | \alias{data_corpus_manifestosentsUK}
6 | \title{Sentence-level corpus of UK party manifestos 1945--2019, partially annotated}
7 | \format{
8 | The corpus consists of 88,954 documents (i.e. sentences) and includes the following
9 | document-level variables: \describe{
10 | \item{party}{factor; abbreviation of the party that wrote the manifesto.}
11 | \item{partyname}{factor; party that wrote the manifesto.}
12 | \item{year}{integer; 4-digit year of the election.}
13 | \item{crowd_econsocial_label}{factor; indicates the majority label assigned
14 | by crowd workers (Economic Policy, Social Policy, or Neither). The variable
15 | has missing values (\code{NA}) for all non-annotated manifestos.}
16 | \item{crowd_econsocial_mean}{numeric; the direction of statements coded as
17 | "Economic Policy" or "Social Policy" based on the aggregated crowd codings.
18 | The variable is the mean of the scores assigned by the workers who
19 | coded the sentence and who allocated the sentence to the "majority"
20 | category. The variable ranges from -2 to +2.
21 |
22 | For the statements aggregated as "Economic Policy", -2 corresponds to "Very
23 | left"; +2 corresponds to "Very right". For the statements aggregated as
24 | "Social Policy" -2 corresponds to "Very liberal"; +2 corresponds to "Very
25 | conservative". The variable has missing values (NA) for all sentences that
26 | were aggregated as "Neither" and for all non-annotated manifestos.}
27 | \item{crowd_econsocial_n}{integer; the number of coders who contributed to the
28 | mean score \code{crowd_econsocial_mean}.}
29 | \item{crowd_immigration_label}{Factor indicating whether the majority of
30 | crowd workers labelled a sentence as referring to immigration or not. The
31 | variable has missing values (\code{NA}) for all non-annotated manifestos.}
32 | \item{crowd_immigration_mean}{numeric; the direction
33 | of statements coded as "Immigration" based on the aggregated crowd codings.
34 | The variable is the mean of the scores assigned by workers who coded a
35 | sentence and who allocated the sentence to the "Immigration" category. The
36 | variable ranges from -1 ("Negative and closed immigration policy") to +1
37 | ("Favorable and open immigration policy"). The variable has missing values
38 | (\code{NA}) for all non-annotated manifestos or if a sentence was not coded as
39 | referring to immigration policy based on the aggregation of crowd codings.}
40 | \item{crowd_immigration_n}{integer; the number of coders who
41 | contributed to the
42 | mean score \code{crowd_immigration_mean}.}
43 | }
44 |
45 | A \link[quanteda:corpus]{corpus} object.
46 | }
47 | \usage{
48 | data_corpus_manifestosentsUK
49 | }
50 | \description{
51 | A text corpus of sentences from publicly available party manifestos from the
52 | United Kingdom, published between 1945 and 2019. Some manifesto sentences
53 | have been rated in terms of the direction of policy by crowd-sourced coders.
54 |
55 | The manifestos from the
56 | three main parties (Labour Party, Conservatives, Liberal Democrats) between
57 | 1987 and 2010 have been labelled as Economic Policy, Social
58 | Policy, or Other, and rated in terms of the direction of Economic Policy and
59 | Social Policy. All party
60 | manifestos from the 2010 General Election have been crowd-coded in terms of
61 | immigration policy, and the direction of immigration policy. For more
62 | information on the coding approach see
63 | \href{https://doi.org/10.1017/S0003055416000058}{Benoit et al. (2016)}.
64 |
65 | The
66 | corpus contains the aggregated crowd coding values on the level of sentences.
67 | Note that the segmentation into sentences does not always work correctly due
68 | to missing punctuation. Very short and very long sentences can be removed
69 | using \code{\link[quanteda:corpus_trim]{quanteda::corpus_trim()}}.
70 | }
71 | \examples{
72 | \donttest{
73 | library("quanteda")
74 |
75 | # keep only crowd coded manifestos (with respect to economic and social policy)
76 | corp_crowdeconsocial <-
77 | corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_econsocial_label))
78 |
79 | # keep only crowd coded manifestos (with respect to immigration policy)
80 | corp_crowdimmig <-
81 | corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label))
82 | }
83 | }
84 | \references{
85 | Benoit, K., Conway, D., Lauderdale, B.E., Laver, M., & Mikhaylov, S. (2016).
86 | \href{https://doi.org/10.1017/S0003055416000058}{Crowd-sourced Text Analysis: Reproducible and Agile Production of Political Data}.
87 | \emph{American Political Science Review}, 110(2), 278--295.
88 | }
89 | \keyword{data}
90 |
--------------------------------------------------------------------------------
/man/is.tokens2sequences.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/tokens2sequences.R
3 | \name{is.tokens2sequences}
4 | \alias{is.tokens2sequences}
5 | \title{Check whether an object is a tokens2sequences object}
6 | \usage{
7 | is.tokens2sequences(x)
8 | }
9 | \arguments{
10 | \item{x}{Object that will be checked to see if it is of the type \code{\link[=tokens2sequences]{tokens2sequences()}}}
11 | }
12 | \description{
13 | Check whether an object is a tokens2sequences object
14 | }
15 | \seealso{
16 | \code{\link[=tokens2sequences]{tokens2sequences()}}
17 | }
18 | \keyword{internal}
19 |
--------------------------------------------------------------------------------
/man/performance.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/performance.R
3 | \name{performance}
4 | \alias{performance}
5 | \alias{precision}
6 | \alias{recall}
7 | \alias{f1_score}
8 | \alias{accuracy}
9 | \alias{balanced_accuracy}
10 | \title{Performance statistics for prediction}
11 | \usage{
12 | performance(data, truth, by_class = TRUE, ...)
13 |
14 | precision(data, truth, by_class = TRUE, ...)
15 |
16 | recall(data, truth, by_class = TRUE, ...)
17 |
18 | f1_score(data, truth, by_class = TRUE, ...)
19 |
20 | accuracy(data, truth, ...)
21 |
22 | balanced_accuracy(data, ...)
23 | }
24 | \arguments{
25 | \item{data}{a table of predicted by truth, or vector of predicted labels}
26 |
27 | \item{truth}{vector of "true" labels, or if a table, \code{2} to indicate that the
28 | "true" values are in columns, or \code{1} if in rows.}
29 |
30 | \item{by_class}{logical; if \code{TRUE}, estimate performance score separately for
31 | each class, otherwise average across classes}
32 |
33 | \item{...}{not used}
34 | }
35 | \value{
36 | named list consisting of the selected measure(s), where each element
37 | is a scalar if \code{by_class = FALSE}, or a vector named by class if \code{by_class = TRUE}.
38 | }
39 | \description{
40 | Functions for computing performance statistics used for model
41 | evaluation.
42 |
43 | \code{performance()} computes all of the following, which are also
44 | available via specific functions:
45 |
46 | Given a 2 x 2 table with notation
47 |
48 | \tabular{rcc}{ \tab Truth \tab \cr Predicted \tab Positive \tab
49 | Negative \cr Positive \tab \emph{A} \tab \emph{B} \cr Negative \tab \emph{C} \tab \emph{D} \cr }
50 |
51 | The metrics computed here are:
52 | \itemize{
53 | \item{precision: }{\eqn{A / (A + B)}}
54 | \item{recall: }{\eqn{A / (A + C)}}
55 | \item{\emph{F1}: }{\eqn{2 / (recall^{-1} + precision^{-1})}}
56 | \item{accuracy: }{\eqn{(A + D) / (A + B + C + D)}, or correctly predicted / all}
57 | \item{balanced_accuracy: }{mean(recall) for all categories}
58 | }
59 | }
60 | \examples{
61 | ## Data in Table 2 of Powers (2007)
62 |
63 | lvs <- c("Relevant", "Irrelevant")
64 | tbl_2_1_pred <- factor(rep(lvs, times = c(42, 58)), levels = lvs)
65 | tbl_2_1_truth <- factor(c(rep(lvs, times = c(30, 12)),
66 | rep(lvs, times = c(30, 28))),
67 | levels = lvs)
68 |
69 | performance(tbl_2_1_pred, tbl_2_1_truth)
70 | performance(tbl_2_1_pred, tbl_2_1_truth, by_class = FALSE)
71 | performance(table(tbl_2_1_pred, tbl_2_1_truth), by_class = TRUE)
72 |
73 | precision(tbl_2_1_pred, tbl_2_1_truth)
74 |
75 | recall(tbl_2_1_pred, tbl_2_1_truth)
76 |
77 | f1_score(tbl_2_1_pred, tbl_2_1_truth)
78 |
79 | accuracy(tbl_2_1_pred, tbl_2_1_truth)
80 |
81 | balanced_accuracy(tbl_2_1_pred, tbl_2_1_truth)
82 |
83 | }
84 | \references{
85 | Powers, D. (2007). "Evaluation: From Precision, Recall and F Factor to ROC,
86 | Informedness, Markedness and Correlation." \emph{Technical Report SIE-07-001},
87 | Flinders University.
88 | }
89 |
--------------------------------------------------------------------------------
/man/predict.textmodel_cnnlstmemb.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/textmodel_cnnlstmemb.R
3 | \name{predict.textmodel_cnnlstmemb}
4 | \alias{predict.textmodel_cnnlstmemb}
5 | \title{Prediction from a fitted textmodel_cnnlstmemb object}
6 | \usage{
7 | \method{predict}{textmodel_cnnlstmemb}(
8 | object,
9 | newdata = NULL,
10 | type = c("class", "probability"),
11 | force = TRUE,
12 | ...
13 | )
14 | }
15 | \arguments{
16 | \item{object}{a fitted \link{textmodel_cnnlstmemb} model}
17 |
18 | \item{newdata}{dfm on which prediction should be made}
19 |
20 | \item{type}{the type of predicted values to be returned; see Value}
21 |
22 | \item{force}{make \code{newdata}'s feature set conformant to the model terms}
23 |
24 | \item{...}{not used}
25 | }
26 | \value{
27 | \code{predict.textmodel_cnnlstmemb} returns either a vector of class
28 | predictions for each row of \code{newdata} (when \code{type = "class"}), or
29 | a document-by-class matrix of class probabilities (when \code{type = "probability"}).
30 | }
31 | \description{
32 | \code{predict.textmodel_cnnlstmemb()} implements class predictions from a
33 | fitted long short-term memory neural network model.
34 | }
35 | \seealso{
36 | \code{\link[=textmodel_cnnlstmemb]{textmodel_cnnlstmemb()}}
37 | }
38 | \keyword{internal}
39 | \keyword{textmodel}
40 |
--------------------------------------------------------------------------------
/man/predict.textmodel_mlp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/textmodel_mlp.R
3 | \name{predict.textmodel_mlp}
4 | \alias{predict.textmodel_mlp}
5 | \title{Prediction from a fitted textmodel_mlp object}
6 | \usage{
7 | \method{predict}{textmodel_mlp}(
8 | object,
9 | newdata = NULL,
10 | type = c("class", "probability"),
11 | force = TRUE,
12 | ...
13 | )
14 | }
15 | \arguments{
16 | \item{object}{a fitted \link{textmodel_mlp} model}
17 |
18 | \item{newdata}{dfm on which prediction should be made}
19 |
20 | \item{type}{the type of predicted values to be returned; see Value}
21 |
22 | \item{force}{make \code{newdata}'s feature set conformant to the model terms}
23 |
24 | \item{...}{not used}
25 | }
26 | \value{
27 | \code{predict.textmodel_mlp} returns either a vector of class
28 | predictions for each row of \code{newdata} (when \code{type = "class"}), or
29 | a document-by-class matrix of class probabilities (when \code{type = "probability"}).
30 | }
31 | \description{
32 | \code{predict.textmodel_mlp()} implements class predictions from a fitted
33 | multilayer perceptron network model.
34 | }
35 | \seealso{
36 | \code{\link[=textmodel_mlp]{textmodel_mlp()}}
37 | }
38 | \keyword{internal}
39 | \keyword{textmodel}
40 |
--------------------------------------------------------------------------------
/man/quanteda.classifiers-package.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/quanteda.classifiers-package.R
3 | \docType{package}
4 | \name{quanteda.classifiers-package}
5 | \alias{quanteda.classifiers}
6 | \alias{quanteda.classifiers-package}
7 | \title{quanteda.classifiers}
8 | \description{
9 | Extensions to \pkg{quanteda} that provide supervised machine learning models
10 | for document-feature matrices.
11 | }
12 | \seealso{
13 | Useful links:
14 | \itemize{
15 | \item \url{https://github.com/quanteda/quanteda.classifiers}
16 | \item Report bugs at \url{https://github.com/quanteda/quanteda.classifiers/issues}
17 | }
18 |
19 | }
20 | \author{
21 | \strong{Maintainer}: Kenneth Benoit \email{kbenoit@lse.ac.uk}
22 |
23 | Authors:
24 | \itemize{
25 | \item Patrick Chester \email{pjc468@nyu.edu}
26 | \item Stefan Müller \email{mueller@ipz.uzh.ch}
27 | }
28 |
29 | }
30 | \keyword{internal}
31 |
--------------------------------------------------------------------------------
/man/save.textmodel_mlp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/textmodel_cnnlstmemb.R, R/textmodel_mlp.R
3 | \name{save.textmodel_cnnlstmemb}
4 | \alias{save.textmodel_cnnlstmemb}
5 | \alias{load.textmodel_cnnlstmemb}
6 | \alias{save.textmodel_mlp}
7 | \alias{load.textmodel_mlp}
8 | \title{Load or save keras-based textmodels}
9 | \usage{
10 | \method{save}{textmodel_cnnlstmemb}(x, ...)
11 |
12 | \method{load}{textmodel_cnnlstmemb}(x, ...)
13 |
14 | \method{save}{textmodel_mlp}(x, ...)
15 |
16 | \method{load}{textmodel_mlp}(x, ...)
17 | }
18 | \arguments{
19 | \item{x}{a \pkg{keras}-based fitted textmodel}
20 |
21 | \item{...}{additional arguments passed to \code{\link[=save]{save()}} or \code{\link[=load]{load()}}}
22 | }
23 | \description{
24 | Functions for loading and saving \pkg{keras}-based models. Because these are
25 | stored as references, they need to be "serialized" prior to saving, and
26 | unserialized upon loading. This applies to models fit using
27 | \code{\link[=textmodel_cnnlstmemb]{textmodel_cnnlstmemb()}} and \code{\link[=textmodel_mlp]{textmodel_mlp()}}.
28 | }
29 | \keyword{internal}
30 |
--------------------------------------------------------------------------------
/man/summary.textmodel_cnnlstmemb.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/textmodel_cnnlstmemb.R
3 | \name{summary.textmodel_cnnlstmemb}
4 | \alias{summary.textmodel_cnnlstmemb}
5 | \title{summary method for textmodel_cnnlstmemb objects}
6 | \usage{
7 | \method{summary}{textmodel_cnnlstmemb}(object, ...)
8 | }
9 | \arguments{
10 | \item{object}{output from \code{\link[=textmodel_cnnlstmemb]{textmodel_cnnlstmemb()}}}
11 |
12 | \item{...}{additional arguments not used}
13 | }
14 | \description{
15 | summary method for textmodel_cnnlstmemb objects
16 | }
17 | \keyword{internal}
18 | \keyword{textmodel}
19 |
--------------------------------------------------------------------------------
/man/summary.textmodel_mlp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/textmodel_mlp.R
3 | \name{summary.textmodel_mlp}
4 | \alias{summary.textmodel_mlp}
5 | \title{summary method for textmodel_mlp objects}
6 | \usage{
7 | \method{summary}{textmodel_mlp}(object, ...)
8 | }
9 | \arguments{
10 | \item{object}{output from \code{\link[=textmodel_mlp]{textmodel_mlp()}}}
11 |
12 | \item{...}{additional arguments not used}
13 | }
14 | \description{
15 | summary method for textmodel_mlp objects
16 | }
17 | \keyword{internal}
18 | \keyword{textmodel}
19 |
--------------------------------------------------------------------------------
/man/textmodel_cnnlstmemb.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/textmodel_cnnlstmemb.R
3 | \name{textmodel_cnnlstmemb}
4 | \alias{textmodel_cnnlstmemb}
5 | \title{[Experimental] Convolutional NN + LSTM model fitted to word embeddings}
6 | \usage{
7 | textmodel_cnnlstmemb(
8 | x,
9 | y,
10 | dropout1 = 0.2,
11 | dropout2 = 0.2,
12 | dropout3 = 0.2,
13 | dropout4 = 0.2,
14 | wordembeddim = 30,
15 | cnnlayer = TRUE,
16 | filter = 48,
17 | kernel_size = 5,
18 | pool_size = 4,
19 | units_lstm = 128,
20 | words = NULL,
21 | maxsenlen = 100,
22 | optimizer = "adam",
23 | loss = "categorical_crossentropy",
24 | metrics = "categorical_accuracy",
25 | ...
26 | )
27 | }
28 | \arguments{
29 | \item{x}{tokens object}
30 |
31 | \item{y}{vector of training labels associated with each document identified
32 |   in \code{x}. (These will be converted to factors if not already
33 | factors.)}
34 |
35 | \item{dropout1}{A floating variable bound between 0 and 1. It determines the
36 | rate at which units are dropped for the linear transformation of the
37 | inputs for the embedding layer.}
38 |
39 | \item{dropout2}{A floating variable bound between 0 and 1. It determines the
40 | rate at which units are dropped for the linear transformation of the
41 | inputs for the CNN layer.}
42 |
43 | \item{dropout3}{A floating variable bound between 0 and 1. It determines the
44 | rate at which units are dropped for the linear transformation of the
45 | inputs for the recurrent layer.}
46 |
47 | \item{dropout4}{A floating variable bound between 0 and 1. It determines the
48 | rate at which units are dropped for the linear transformation of the
49 | inputs for the recurrent layer.}
50 |
51 | \item{wordembeddim}{The number of word embedding dimensions to be fit}
52 |
53 | \item{cnnlayer}{A logical parameter that allows user to include or exclude a
54 | convolutional layer in the neural network model}
55 |
56 | \item{filter}{The number of output filters in the convolution}
57 |
58 | \item{kernel_size}{An integer or list of a single integer, specifying the
59 | length of the 1D convolution window}
60 |
61 | \item{pool_size}{Size of the max pooling windows.
62 | \code{\link[keras:layer_max_pooling_1d]{keras::layer_max_pooling_1d()}}}
63 |
64 | \item{units_lstm}{The number of nodes of the lstm layer}
65 |
66 | \item{words}{The maximum number of words used to train model. Defaults to the
67 | number of features in \code{x}}
68 |
69 | \item{maxsenlen}{The maximum sentence length of training data}
70 |
71 | \item{optimizer}{optimizer used to fit model to training data, see
72 | \code{\link[keras:compile.keras.engine.training.Model]{keras::compile.keras.engine.training.Model()}}}
73 |
74 | \item{loss}{objective loss function, see
75 | \code{\link[keras:compile.keras.engine.training.Model]{keras::compile.keras.engine.training.Model()}}}
76 |
77 | \item{metrics}{metric used to train algorithm, see
78 | \code{\link[keras:compile.keras.engine.training.Model]{keras::compile.keras.engine.training.Model()}}}
79 |
80 | \item{...}{additional options passed to
81 | \code{\link[keras:fit.keras.engine.training.Model]{keras::fit.keras.engine.training.Model()}}}
82 | }
83 | \description{
84 | A function that combines a convolutional neural network layer with a long
85 | short-term memory layer. It is designed to incorporate word sequences,
86 | represented as sequentially ordered word embeddings, into text
87 | classification. The model takes as an input a \pkg{quanteda} tokens object.
88 | }
89 | \examples{
90 | \dontrun{
91 | # create dataset with evenly balanced coded & uncoded immigration sentences
92 | corpcoded <- corpus_subset(data_corpus_manifestosentsUK,
93 | !is.na(crowd_immigration_label))
94 | corpuncoded <- data_corpus_manifestosentsUK \%>\%
95 | corpus_subset(is.na(crowd_immigration_label) & year > 1980) \%>\%
96 | corpus_sample(size = ndoc(corpcoded))
97 | corp <- corpcoded + corpuncoded
98 |
99 | tok <- tokens(corp)
100 |
101 | tmod <- textmodel_cnnlstmemb(tok,
102 | y = docvars(tok, "crowd_immigration_label"),
103 | epochs = 5, verbose = 1)
104 |
105 | newdata = tokens_subset(tok, subset = is.na(crowd_immigration_label))
106 | pred <- predict(tmod, newdata = newdata)
107 | table(pred)
108 | tail(texts(corpuncoded)[pred == "Immigration"], 10)
109 |
110 | }
111 | }
112 | \seealso{
113 | \code{\link[=save.textmodel_cnnlstmemb]{save.textmodel_cnnlstmemb()}}, \code{\link[=load.textmodel_cnnlstmemb]{load.textmodel_cnnlstmemb()}}
114 | }
115 | \keyword{textmodel}
116 |
--------------------------------------------------------------------------------
/man/textmodel_evaluate.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/textmodel_evaluate.R
3 | \name{textmodel_evaluate}
4 | \alias{textmodel_evaluate}
5 | \title{Model evaluation function}
6 | \usage{
7 | textmodel_evaluate(
8 | x,
9 | y,
10 | model,
11 | fun = "f1_score",
12 | k = 5,
13 | parameters = list(),
14 | seed = as.numeric(Sys.time()),
15 | time = TRUE,
16 | by_class = FALSE
17 | )
18 | }
19 | \arguments{
20 | \item{x}{the \link{dfm} or \link{tokens} object on which the model will be
21 | fit. Does not need to contain only the training documents.}
22 |
23 | \item{y}{vector of training labels associated with each document identified
24 |   in \code{x}. (These will be converted to factors if not already
25 | factors.)}
26 |
27 | \item{model}{the name of the machine learning function that will be evaluated}
28 |
29 | \item{fun}{the name of the function that will be used to evaluate the machine
30 | learning model. Can take the values "accuracy", "precision", "recall", or
31 | "f1_score"}
32 |
33 | \item{k}{number of folds}
34 |
35 | \item{parameters}{model hyperparameters}
36 |
37 | \item{seed}{a seed that can allow for replication of k training data splits.
38 | If seed is not provided a seed is chosen based on the current time.}
39 |
40 | \item{time}{a logical parameter that determines whether output will include
41 | training time (in seconds) of model}
42 |
43 | \item{by_class}{estimates a separate value of provided evaluation function
44 | for every class of the true vector}
45 | }
46 | \description{
47 | Designed to streamline the parameter tuning and evaluation process. Users
48 | chose a function to evaluate and include parameter values as a list. If
49 | multiple parameter values are provided, the function will perform a grid
50 | search by estimating a separate model for every combination of parameters.
51 | }
52 | \examples{
53 | # evaluate immigration classification performance
54 | \dontrun{
55 | dfmat <- dfm(data_corpus_manifestosentsUK)
56 | codes <- docvars(data_corpus_manifestosentsUK, "crowd_immigration_label")
57 | evaluation <- textmodel_evaluate(dfmat, codes, k = 3,
58 | model = "textmodel_mlp", fun = "f1_score",
59 | parameters = list(epochs = c(3, 4)))
60 | head(evaluation)
61 | aggregate(evaluation, by = list(evaluation$epochs), FUN = "mean")
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/man/textmodel_mlp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/textmodel_mlp.R
3 | \name{textmodel_mlp}
4 | \alias{textmodel_mlp}
5 | \title{Multilayer perceptron network (MLP) model for text classification}
6 | \usage{
7 | textmodel_mlp(
8 | x,
9 | y,
10 | units = 512,
11 | dropout = 0.2,
12 | optimizer = "adam",
13 | loss = "categorical_crossentropy",
14 | metrics = "categorical_accuracy",
15 | ...
16 | )
17 | }
18 | \arguments{
19 | \item{x}{the \link{dfm} on which the model will be fit. Does not need to
20 | contain only the training documents.}
21 |
22 | \item{y}{vector of training labels associated with each document identified
23 |   in \code{x}. (These will be converted to factors if not already
24 | factors.)}
25 |
26 | \item{units}{The number of network nodes used in the first layer of the
27 | sequential model}
28 |
29 | \item{dropout}{A floating variable bound between 0 and 1. It determines the
30 | rate at which units are dropped for the linear transformation of the
31 | inputs.}
32 |
33 | \item{optimizer}{optimizer used to fit model to training data, see
34 | \code{\link[keras:compile.keras.engine.training.Model]{keras::compile.keras.engine.training.Model()}}}
35 |
36 | \item{loss}{objective loss function, see
37 | \code{\link[keras:compile.keras.engine.training.Model]{keras::compile.keras.engine.training.Model()}}}
38 |
39 | \item{metrics}{metric used to train algorithm, see
40 | \code{\link[keras:compile.keras.engine.training.Model]{keras::compile.keras.engine.training.Model()}}}
41 |
42 | \item{...}{additional options passed to
43 | \code{\link[keras:fit.keras.engine.training.Model]{keras::fit.keras.engine.training.Model()}}}
44 | }
45 | \description{
46 | This function is a wrapper for a multilayer perceptron network model with a
47 | single hidden layer network with two layers, implemented in the \pkg{keras}
48 | package.
49 | }
50 | \examples{
51 | \dontrun{
52 | # create a dataset with evenly balanced coded and uncoded immigration sentences
53 | corpcoded <- corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label))
54 | corpuncoded <- data_corpus_manifestosentsUK \%>\%
55 | corpus_subset(is.na(crowd_immigration_label) & year > 1980) \%>\%
56 | corpus_sample(size = ndoc(corpcoded))
57 | corp <- corpcoded + corpuncoded
58 |
59 | # form a tf-idf-weighted dfm
60 | dfmat <- dfm(corp) \%>\%
61 | dfm_tfidf()
62 |
63 | set.seed(1000)
64 | tmod <- textmodel_mlp(dfmat, y = docvars(dfmat, "crowd_immigration_label"),
65 | epochs = 5, verbose = 1)
66 | pred <- predict(tmod, newdata = dfm_subset(dfmat, is.na(crowd_immigration_label)))
67 | table(pred)
68 | tail(texts(corpuncoded)[pred == "Immigration"], 10)
69 | }
70 | }
71 | \seealso{
72 | \code{\link[=save.textmodel_mlp]{save.textmodel_mlp()}}, \code{\link[=load.textmodel_mlp]{load.textmodel_mlp()}}
73 | }
74 | \keyword{textmodel}
75 |
--------------------------------------------------------------------------------
/man/tokens2sequences.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/tokens2sequences.R
3 | \name{tokens2sequences}
4 | \alias{tokens2sequences}
5 | \title{[Experimental] Convert quanteda tokens to keras sequences}
6 | \usage{
7 | tokens2sequences(x, maxsenlen = 100, keepn = NULL)
8 | }
9 | \arguments{
10 | \item{x}{\code{\link[quanteda:tokens]{quanteda::tokens()}} object}
11 |
12 | \item{maxsenlen}{the maximum sentence length kept in output matrix}
13 |
14 | \item{keepn}{the maximum number of features to keep}
15 | }
16 | \value{
17 | \code{\link[=tokens2sequences]{tokens2sequences()}} returns a tokens2sequences object whose matrix
18 |   component has one row for each tokenized document passed to the function
19 |   and a number of columns determined by \code{maxsenlen}. The matrix contains a
20 |   numeric code for every unique token kept (determined by \code{keepn}), and the
21 |   codes are arranged in the same sequence as in the original
22 |   \code{\link[quanteda:tokens]{quanteda::tokens()}} object.
23 | }
24 | \description{
25 | This function converts a \pkg{quanteda} \code{\link[quanteda:tokens]{quanteda::tokens()}} object
26 | into a tokens sequence object as expected by some functions in the
27 | \pkg{keras} package.
28 | }
29 | \examples{
30 | library("quanteda")
31 | corp <- corpus_subset(data_corpus_inaugural, Year <= 1793)
32 | corptok <- tokens(corp)
33 | print(corp)
34 | seqs <- tokens2sequences(corptok, maxsenlen = 200)
35 | print(seqs)
36 | }
37 | \seealso{
38 | \code{\link[=is.tokens2sequences]{is.tokens2sequences()}}, \code{\link[=tokens2sequences_conform]{tokens2sequences_conform()}}
39 | }
40 |
--------------------------------------------------------------------------------
/man/tokens2sequences_conform.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/tokens2sequences.R
3 | \name{tokens2sequences_conform}
4 | \alias{tokens2sequences_conform}
5 | \title{Match the feature names of one tokens2sequences object to another}
6 | \usage{
7 | tokens2sequences_conform(x, y)
8 | }
9 | \arguments{
10 | \item{x}{\code{\link[=tokens2sequences]{tokens2sequences()}} object that will be forced to conform}
11 |
12 | \item{y}{\code{\link[=tokens2sequences]{tokens2sequences()}} object whose feature names will be
13 | used to change token labels for \code{x}}
14 | }
15 | \description{
16 | Converts the feature names of one tokens2sequences object to those of
17 | another. Useful in aligning training and test sets.
18 | }
19 | \examples{
20 | \dontrun{
21 | corpcoded <- corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label))
22 | corpuncoded <- data_corpus_manifestosentsUK \%>\%
23 | corpus_subset(is.na(crowd_immigration_label) & year > 1980) \%>\%
24 | corpus_sample(size = ndoc(corpcoded))
25 |
26 | tokx <- tokens(corpuncoded)
27 | toky <- tokens(corpcoded)
28 |
29 | seqx <- tokens2sequences(tokx, maxsenlen = 50, keepn = 5000)
30 | seqy <- tokens2sequences(toky, maxsenlen = 50, keepn = 5000)
31 | tokens2sequences_conform(seqx, seqy)
32 | }
33 | }
34 | \seealso{
35 | \code{\link[=tokens2sequences]{tokens2sequences()}}
36 | }
37 | \keyword{internal}
38 |
--------------------------------------------------------------------------------
/quanteda.classifiers.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 4
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | BuildType: Package
16 | PackageUseDevtools: Yes
17 | PackageInstallArgs: --no-multiarch --with-keep.source
18 | PackageCheckArgs: --as-cran
19 | PackageRoxygenize: rd,collate,namespace,vignette
20 |
--------------------------------------------------------------------------------
/tests/data_creation/create_data_corpus_manifestosentUK.R:
--------------------------------------------------------------------------------
1 | ########################
2 | ### Create UK election manifesto dataset on the level of sentences
3 | ########################
4 |
5 | # load packages
6 | library(dplyr)
7 | library(tidyr)
8 | library(car)
9 | library(readtext)
10 | library(quanteda)
11 | library(quanteda.corpora)
12 | library(spacyr)
13 |
14 | ########################
15 | ### 1. Load and aggregate crowdcoded data ----
16 | ########################
17 |
18 | ## 1.1 Load APSR data on economic policy/social policy/neither ----
19 |
20 | data_readtext_uk_econsocial <- readtext("tests/data_creation/data_uk_policyarea.zip",
21 | ignore_missing_files = TRUE, encoding = "utf-8")
22 |
23 | # exclude screeners from data frame using a regular expression
24 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>%
25 | dplyr::filter(!grepl('Code this sentence as', sentence_text))
26 |
27 | ## For some sentences, special characters are displayed differently depending on the
28 | ## crowd coding job. Therefore, I remove sentence_text here and merge it later
29 | ## using "master.sentences.Rdata" which does not contain these encoding errors
30 |
31 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>%
32 | select(-sentence_text)
33 |
34 | # load "master sentences" from the APSR replication data with metadata on manifestos and years
35 | load("tests/data_creation/master.sentences.Rdata")
36 |
37 | sentences_metadata <- sentences %>%
38 | select(sentenceid, manifestoid, party, year) # select relevant variables
39 |
40 | # merge metadata on manifestos with crowd coded texts
41 | data_readtext_uk_econsocial <- left_join(data_readtext_uk_econsocial, sentences_metadata, by = "sentenceid")
42 |
43 | # create numeric indicator for aggregation of policy area
44 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>%
45 | mutate(class_policyarea_num = ifelse(policy_area == 1, 0,
46 | ifelse(policy_area == 2, -1,
47 | ifelse(policy_area == 3, 1, NA))))
48 |
49 | # create numeric indicator for aggregation of direction
50 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>%
51 | mutate(class_policyarea_direction_num = ifelse(!is.na(soc_scale), soc_scale,
52 | ifelse(!is.na(econ_scale), econ_scale, NA)))
53 |
54 | # aggregate data to the level of sentences
55 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>%
56 | group_by(manifestoid, sentenceid) %>%
57 | mutate(crowd_econsocial_n = n(),
58 | class_policyarea_mean = mean(class_policyarea_num, na.rm = TRUE))
59 |
60 | # create variable with the policy area based on the aggregated coding
61 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>%
62 | mutate(crowd_econsocial_label = ifelse(class_policyarea_mean < 0, "Economic",
63 | ifelse(class_policyarea_mean == 0, "Not Economic or Social",
64 | ifelse(class_policyarea_mean > 0, "Social", NA))))
65 |
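# worked illustration of the labelling rule above (hypothetical codings,
# not used in the pipeline):
# mean(c(-1, -1, 0, 1, -1))  # = -0.4 < 0  -> "Economic"
# mean(c(0, 0, 1, -1, 0))    # =  0.0      -> "Not Economic or Social"
# mean(c(1, 1, 0, 1, -1))    # =  0.4 > 0  -> "Social"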
66 |
67 | # here I make sure that the policy direction mean only takes into account
68 | # codings from the majority category, not position values from the minority category
69 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>%
70 | mutate(class_policy_direction_num =
71 | ifelse(crowd_econsocial_label == "Economic" & class_policyarea_num == -1, class_policyarea_direction_num,
72 | ifelse(crowd_econsocial_label == "Social" & class_policyarea_num == 1, class_policyarea_direction_num,
73 | NA)))
74 |
75 | # aggregate data to the level of sentences
76 | data_uk_econsocial <- data_readtext_uk_econsocial %>%
77 | group_by(manifestoid, sentenceid, crowd_econsocial_n, crowd_econsocial_label) %>%
78 | summarise(crowd_econsocial_mean = mean(class_policy_direction_num, na.rm = TRUE)) %>%
79 | ungroup()
80 |
81 | # separate the variable manifestoid into Party and Year
82 | data_uk_econsocial <- data_uk_econsocial %>%
83 | separate(manifestoid, into = c("Party", "Year"),
84 | remove = FALSE)
85 |
86 | # merge text from "sentences" data frame which does not contain encoding errors
87 | dat_sentences <- sentences %>%
88 | select(sentence_text, sentenceid)
89 |
90 | data_uk_econsocial <- left_join(data_uk_econsocial, dat_sentences,
91 | by = "sentenceid")
92 |
93 | # select and rename the relevant variables
94 | data_uk_man_econsocial <- data_uk_econsocial %>%
95 | ungroup() %>%
96 | select(text = sentence_text, sentenceid, Party, Year,
97 | crowd_econsocial_label, crowd_econsocial_mean,
98 | crowd_econsocial_n)
99 |
100 | # replace NaN values in crowd_econsocial_mean with NA
101 | data_uk_man_econsocial$crowd_econsocial_mean[is.nan(data_uk_man_econsocial$crowd_econsocial_mean)] <- NA
102 |
103 |
104 | ## 1.2 Load APSR data on immigration ----
105 | data_readtext_uk_immigration <- readtext("tests/data_creation/data_uk_immigration.zip",
106 | ignore_missing_files = TRUE)
107 |
108 | # exclude screeners
109 | data_readtext_uk_immigration <- data_readtext_uk_immigration %>%
110 | subset(manifestoid != "screener")
111 |
112 | # use manifestoid to create a party and year variable
113 | data_readtext_uk_immigration <- data_readtext_uk_immigration %>%
114 | separate(manifestoid, into = c("Party", "Year"))
115 |
116 | # create a binary indicator for whether the sentence was coded as immigration (policy_area == 4)
117 | data_readtext_uk_immigration <- data_readtext_uk_immigration %>%
118 | mutate(class_immigration_num = ifelse(policy_area == 4, 1, 0))
119 |
120 | # aggregate to the level of sentences
121 | data_uk_immigration <- data_readtext_uk_immigration %>%
122 | group_by(Party, Year, sentenceid, sentence_text) %>%
123 | summarise(crowd_immigration_n = n(),
124 | class_immigration_num = mean(class_immigration_num, na.rm = TRUE),
125 | crowd_immigration_mean = mean(immigr_scale, na.rm = TRUE))
126 |
127 | # replace NaN values in crowd_immigration_mean with NA
128 | data_uk_immigration$crowd_immigration_mean[is.nan(data_uk_immigration$crowd_immigration_mean)] <- NA
129 |
130 | # use the average evaluations to construct the majority-rule based classification
131 | data_uk_immigration <- data_uk_immigration %>%
132 | mutate(crowd_immigration_label = ifelse(class_immigration_num >= 0.5, "Immigration", "Not immigration"))
133 |
134 | # select and rename the relevant variables
135 | data_uk_immig_man_2010 <- data_uk_immigration %>%
136 | ungroup() %>%
137 | select(text = sentence_text, sentenceid, Party, Year, crowd_immigration_label,
138 | crowd_immigration_mean,
139 | crowd_immigration_n)
140 |
141 |
142 | ########################
143 | ## 1.3 Merge both crowdcoded datasets ----
144 | ########################
145 |
146 | # Note that the 2010 manifestos have been coded both with regard to
147 | # immigration AND econ/social/neither
148 |
149 | # full join of both crowdcoded datasets
150 | dat_uk_crowdcoded <- full_join(data_uk_man_econsocial,
151 | data_uk_immig_man_2010, by = c("sentenceid"))
152 |
153 | table(dat_uk_crowdcoded$Party.x)
154 | table(dat_uk_crowdcoded$Party.y)
155 |
156 | # rename and remove unnecessary variables
157 | dat_uk_man_crowdcoded_clean <- dat_uk_crowdcoded %>%
158 | mutate(Party = ifelse(is.na(Party.x), Party.y, Party.x)) %>%
159 | mutate(Year = ifelse(is.na(Year.x), Year.y, Year.x)) %>%
160 | mutate(text = ifelse(is.na(text.x), text.y, text.x)) %>%
161 | select(-c(Party.x, Party.y, Year.x, Year.y, text.x, text.y))
162 |
163 | ########################
164 | ## 2. Add data_corpus_ukmanifestos from quanteda.corpora package ----
165 | ########################
166 |
167 | # Since some of the manifestos are already coded in terms of social/economic/neither,
168 | # we remove these documents from the non-annotated corpus
169 |
170 | # get overview of crowdcoded manifestos
171 | manifestos_ukseconsocial <- dat_uk_man_crowdcoded_clean %>%
172 | ungroup() %>%
173 | select(Party, Year) %>%
174 | unique() %>%
175 | mutate(party_year = paste(Party, Year, sep = "_"))
176 |
177 | # create new docvar used for removing crowdcoded manifestos
178 | # from the non-annotated corpus
179 | docvars(data_corpus_ukmanifestos, "party_year") <- paste(
180 | docvars(data_corpus_ukmanifestos, "Party"),
181 | docvars(data_corpus_ukmanifestos, "Year"), sep = "_"
182 | )
183 |
184 | # remove manifestos that have been crowdcoded
185 | data_corpus_ukmanifestos_subset <- data_corpus_ukmanifestos %>%
186 | corpus_subset(!party_year %in% manifestos_ukseconsocial$party_year)
187 |
188 | ndoc(data_corpus_ukmanifestos)
189 | ndoc(data_corpus_ukmanifestos_subset)
190 |
191 | # check that the Lab, Con, and Lib manifestos between 1987 and 2010 have been excluded (yes!)
192 | table(docvars(data_corpus_ukmanifestos_subset, "party_year"))
193 |
194 | # only keep manifestos from national general elections (Type == "natl")
195 | data_corpus_ukmanifestos_sentences <- data_corpus_ukmanifestos_subset %>%
196 | corpus_subset(Type == "natl")
197 |
198 | # transform to data frame for easier adjustments of document-level variables
199 | data_uk_man <- data.frame(
200 | doc_id = docnames(data_corpus_ukmanifestos_sentences),
201 | text = texts(data_corpus_ukmanifestos_sentences),
202 | docvars(data_corpus_ukmanifestos_sentences),
203 | stringsAsFactors = FALSE
204 | )
205 |
206 | # use spacyr to tokenize manifestos to sentence-level
207 | spacy_initialize(model = "en")
208 |
209 | data_uk_man_sentences <- spacy_tokenize(data_uk_man,
210 | remove_separators = FALSE,
211 | what = "sentence",
212 | output = "data.frame")
213 |
214 | data_uk_man_sentences <- rename(data_uk_man_sentences, text = token)
215 |
216 |
217 | # merge metadata
218 | data_uk_man_sentences <- left_join(data_uk_man_sentences,
219 | select(data_uk_man, -c("text")),
220 | by = "doc_id")
221 |
222 |
223 | ########################
224 | ## 3. Import additional manifestos from the 2015-2019 elections ----
225 | ########################
226 |
227 | ## Note: data retrieved from http://polidoc.net (2010 and 2015) and from parties' websites (2019)
228 |
229 | data_uk_1519 <- readtext("tests/data_creation/data_uk_manifestos_2015-2019.zip",
230 | ignore_missing_files = TRUE,
231 | encoding = "utf-8",
232 | docvarsfrom = "filenames",
233 | docvarnames = c("Year", "Party"))
234 |
235 |
236 | # tokenize documents to the level of sentences
237 | data_uk_man_1519_sentences <- spacy_tokenize(data_uk_1519,
238 | remove_separators = FALSE,
239 | what = "sentence",
240 | output = "data.frame")
241 |
242 | # merge metadata
243 | data_uk_man_1519_sentences <- left_join(data_uk_man_1519_sentences,
244 | select(data_uk_1519, -c("text")),
245 | by = "doc_id")
246 |
247 | data_uk_man_1519_sentences <- rename(data_uk_man_1519_sentences, text = token)
248 |
249 |
250 | ########################
251 | ## 4. Combine manifestos and create corpus ----
252 | ########################
253 |
254 | dat_uk_man_crowdcoded_clean$Year <- as.integer(dat_uk_man_crowdcoded_clean$Year)
255 |
256 | data_uk_manifestos <- bind_rows(data_uk_man_sentences,
257 | dat_uk_man_crowdcoded_clean,
258 | data_uk_man_1519_sentences)
259 |
260 |
261 | # drop dataset-specific identifiers and select only the necessary variables
262 | data_uk_manifestos_selectvars <- data_uk_manifestos %>%
263 | select(-c(party_year, sentenceid)) %>%
264 | mutate(year = as.factor(Year)) %>%
265 | select(text,
266 | party = Party,
267 | year,
268 | starts_with("crowd_"))
269 |
270 | # create a quanteda corpus
271 | corp <- corpus(data_uk_manifestos_selectvars)
272 |
273 | # count the number of tokens per sentence (after removing punctuation characters)
274 | docvars(corp, "ntoken_sent") <- ntoken(corp, remove_punct = TRUE)
275 |
276 | # remove observations with 0 tokens (i.e., sentences consisting only of punctuation)
277 | corp_small <- corp %>%
278 | corpus_subset(ntoken_sent > 0)
279 |
280 | # transform back to a data frame to create clean doc_ids after removing "empty" sentences
281 | dat_corpus <- data.frame(
282 | text = texts(corp_small),
283 | docvars(corp_small),
284 | stringsAsFactors = FALSE
285 | )
286 |
287 | dat_corpus <- dat_corpus %>%
288 | arrange(year, party) %>%
289 | group_by(party, year) %>%
290 | mutate(sentence_no = 1:n()) %>%
291 | mutate(doc_id = paste(party, year, sentence_no, sep = "_")) %>%
292 | ungroup()
293 |
294 | # harmonize party abbreviations across the datasets
295 | dat_corpus_renamed <- dat_corpus %>%
296 | mutate(party = ifelse(party == "Comm", "CP", party)) %>%
297 | mutate(party = ifelse(party == "Gr", "Greens", party)) %>%
298 | mutate(party = ifelse(party == "OMRL", "MRLP", party)) %>%
299 | mutate(party = ifelse(party == "PCy", "PC", party))
300 |
301 |
302 | recode_party <- c("'Coalition'='Coalition Agreement';
303 | 'BNP'='British National Party';
304 | 'CAP'='Community Action Party';
305 | 'Con'='Conservative Party';
306 | 'Comm'='Communist Party';
307 | 'CP'='Communist Party';
308 | 'Dem'='Democratic Party';
309 | 'DUP'='Democratic Unionist Party';
310 | 'EDP'='English Democrats';
311 | 'EIP'='English Independence Party';
312 | 'FSP'='Free Scotland Party';
313 | 'FW'='Forward Wales';
314 | 'Greens'='Green Party';
315 | 'IGV'='Independent Green Voice';
316 | 'LA'='Left Alliance';
317 | 'Lab'='Labour Party';
318 | 'Lib'='Liberal Party';
319 | 'LD'='Liberal Democrats';
320 | 'LibSDP'='Social Democratic Party';
321 | 'MK'='Mebyon Kernow - the Party for Cornwall';
322 | 'MRLP'='Official Monster Raving Loony Party';
323 | 'ND'='National Democrats';
324 | 'NIA'='Northern Ireland Alliance';
325 | 'PA'='Prolife Alliance';
326 | 'PC'='Plaid Cymru';
327 | 'PP'='Peace Party';
328 | 'PUP'='Progressive Unionist Party';
329 | 'PVP'='Protest Vote Party';
330 | 'Resp'='Respect';
331 | 'RT'='Richard Taylor Personal Manifesto';
332 | 'Scon'='Scottish Conservative Party';
333 | 'SDLP'='Social Democratic and Labour Party';
334 | 'SEP'='Socialist Equality Party';
335 | 'SF'='Sinn Féin';
336 | 'SGr'='Scottish Green Party';
337 | 'SLab'='Scottish Labour Party';
338 | 'SLD'='Scottish Liberal Democrats';
339 | 'SNP'='Scottish National Party';
340 | 'SP'='Socialist Party';
341 | 'SCon'='Scottish Conservative Party';
342 | 'SSoc'='Scottish Socialist Party';
343 | 'SSP'='Scottish Socialist Party';
344 | 'Stuck'='Stuckist Party';
345 | 'TW'='Third Way';
346 | 'UKIP'='UK Independence Party';
347 | 'UUP'='Ulster Unionist Party';
348 | 'Ver'='Veritas Party'")
349 |
350 |
351 | dat_corpus_renamed <- dat_corpus_renamed %>%
352 | mutate(partyname = car::recode(party, recode_party))
353 |
354 | dat_corpus_renamed <- dat_corpus_renamed %>%
355 | select(doc_id, text, party, partyname, year,
356 | crowd_econsocial_label, crowd_econsocial_mean,
357 | crowd_econsocial_n, crowd_immigration_label,
358 | crowd_immigration_mean,
359 | crowd_immigration_n)
360 |
361 | dat_corpus_renamed$crowd_econsocial_label <- factor(dat_corpus_renamed$crowd_econsocial_label)
362 | dat_corpus_renamed$crowd_immigration_label <- factor(dat_corpus_renamed$crowd_immigration_label)
363 | dat_corpus_renamed$year <- as.integer(as.character(dat_corpus_renamed$year))
364 | dat_corpus_renamed$party <- as.factor(dat_corpus_renamed$party)
365 | dat_corpus_renamed$partyname <- as.factor(dat_corpus_renamed$partyname)
366 | dat_corpus_renamed$crowd_immigration_n <- as.integer(dat_corpus_renamed$crowd_immigration_n)
367 | dat_corpus_renamed$crowd_econsocial_n <- as.integer(dat_corpus_renamed$crowd_econsocial_n)
368 |
369 | # create final corpus
370 | data_corpus_manifestosentsUK <- corpus(dat_corpus_renamed,
371 | docid_field = "doc_id")
372 |
373 | # add corpus to package
374 | usethis::use_data(data_corpus_manifestosentsUK, overwrite = TRUE)
375 |
--------------------------------------------------------------------------------
/tests/data_creation/data_uk_immigration.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quanteda/quanteda.classifiers/3aabdb708468932c4bd5d1a3d3005ae3d653c07d/tests/data_creation/data_uk_immigration.zip
--------------------------------------------------------------------------------
/tests/data_creation/data_uk_manifestos_2015-2019.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quanteda/quanteda.classifiers/3aabdb708468932c4bd5d1a3d3005ae3d653c07d/tests/data_creation/data_uk_manifestos_2015-2019.zip
--------------------------------------------------------------------------------
/tests/data_creation/data_uk_policyarea.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quanteda/quanteda.classifiers/3aabdb708468932c4bd5d1a3d3005ae3d653c07d/tests/data_creation/data_uk_policyarea.zip
--------------------------------------------------------------------------------
/tests/data_creation/master.sentences.Rdata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quanteda/quanteda.classifiers/3aabdb708468932c4bd5d1a3d3005ae3d653c07d/tests/data_creation/master.sentences.Rdata
--------------------------------------------------------------------------------
/tests/misc/test-LMRD.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Testing text classification using movie reviews"
3 | author: "Kenneth Benoit"
4 | output:
5 | html_notebook:
6 | toc: yes
7 | toc_float: yes
8 | ---
9 |
10 |
11 | Tests using the Large Movie Review Dataset, a dataset for sentiment classification containing 25,000 highly polar movie reviews for training and 25,000 for testing, from Maas et al. (2011).
12 |
13 | Source: Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. (2011). "Learning Word Vectors for Sentiment Analysis". The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).
14 |
15 | # Setting up {.tabset .tabset-fade}
16 |
17 | ```{r setup, echo=FALSE}
18 | library("quanteda"); library("quanteda.textmodels"); library("quanteda.classifiers"); knitr::opts_chunk$set(echo=FALSE, warning=FALSE, message=FALSE)
19 | ```
20 |
21 |
22 | ## Test and training data
23 |
24 | ```{r}
25 | dfmat <- dfm(data_corpus_LMRD)
26 | dfmat_train <- dfm_trim(dfmat, min_termfreq = 100) %>%
27 | dfm_subset(set == "train")
28 | dfmat_test <- dfm_trim(dfmat, min_termfreq = 100) %>%
29 | dfm_subset(set == "test")
30 | ```
31 |
32 | ## Some functions for evaluation
33 |
34 | ```{r}
35 | performance <- function(mytable, verbose = TRUE) {
36 | truePositives <- mytable[1, 1]
37 | trueNegatives <- sum(diag(mytable)[-1])
38 | falsePositives <- sum(mytable[1, ]) - truePositives
39 | falseNegatives <- sum(mytable[, 1]) - truePositives
40 | precision <- truePositives / (truePositives + falsePositives)
41 | recall <- truePositives / (truePositives + falseNegatives)
42 | accuracy <- sum(diag(mytable)) / sum(mytable)
43 | tnr <- trueNegatives / (trueNegatives + falsePositives)
44 |   balanced_accuracy <- sum(c(recall, tnr), na.rm = TRUE) / 2  # mean of TPR and TNR
45 | if (verbose) {
46 | print(mytable)
47 | cat(
48 | "\n precision =", round(precision, 2),
49 | "\n recall =", round(recall, 2),
50 | "\n accuracy =", round(accuracy, 2),
51 | "\n bal. acc. =", round(balanced_accuracy, 2),
52 | "\n"
53 | )
54 | }
55 | invisible(c(precision, recall))
56 | }
57 | ```
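
The helper assumes that predictions are in rows, the truth is in columns, and that the positive class comes first. Because the `polarity` factor sorts its levels alphabetically (`neg` before `pos`), the calls below reorder the confusion table with `[2:1, 2:1]`. A minimal sketch with made-up counts:

```r
tbl <- table(
  pred = factor(c("pos", "pos", "neg", "neg", "neg"), levels = c("neg", "pos")),
  true = factor(c("pos", "neg", "neg", "neg", "pos"), levels = c("neg", "pos"))
)
tbl                          # "neg" sorts first ...
performance(tbl[2:1, 2:1])   # ... so reorder to make "pos" the positive class
```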
58 |
59 |
60 |
61 | # Naive Bayes {.tabset .tabset-fade}
62 |
63 | ## No weights
64 |
65 | ```{r}
66 | system.time({
67 | tmod_nb <- textmodel_nb(dfmat_train, y = docvars(dfmat_train, "polarity"))
68 | pred_nb <- predict(tmod_nb, newdata = dfmat_test, type = "class")
69 | table(pred_nb, dfmat_test$polarity)[2:1, 2:1] %>% performance()
70 | })
71 | ```
72 |
73 | ## tf-idf weighting
74 |
75 | ```{r}
76 | system.time({
77 | tmod_nb <- textmodel_nb(dfm_tfidf(dfmat_train),
78 | y = docvars(dfmat_train, "polarity"))
79 | pred_nb <- predict(tmod_nb, newdata = dfm_tfidf(dfmat_test), type = "class")
80 | table(pred_nb, dfmat_test$polarity)[2:1, 2:1] %>% performance()
81 | })
82 | ```
83 |
84 | # SVM {.tabset .tabset-fade}
85 |
86 | ## SVM
87 |
88 | ```{r}
89 | system.time({
90 | tmod_svm <- textmodel_svm(dfmat_train, y = docvars(dfmat_train, "polarity"))
91 | pred_svm <- predict(tmod_svm, newdata = dfmat_test, type = "class")
92 | table(pred_svm, dfmat_test$polarity)[2:1, 2:1] %>% performance()
93 | })
94 | ```
95 |
96 | ## SVM w/tf-idf weights
97 |
98 | ```{r}
99 | tmod_svm2 <- textmodel_svm(dfm_tfidf(dfmat_train),
100 | y = docvars(dfmat_train, "polarity"))
101 | pred_svm2 <- predict(tmod_svm2, newdata = dfm_tfidf(dfmat_test), type = "class")
102 | table(pred_svm2, dfmat_test$polarity)[2:1, 2:1] %>% performance()
103 | ```
104 |
105 | ## SVMlin
106 |
107 | ```{r}
108 | system.time({
109 | tmod_svmlin <- textmodel_svmlin(dfmat_train,
110 | y = docvars(dfmat_train, "polarity"))
111 | pred_svmlin <- predict(tmod_svmlin, newdata = dfmat_test, type = "class")
112 |   table(pred_svmlin, dfmat_test$polarity)[2:1, 2:1] %>% performance()
113 | })
114 | ```
115 |
116 |
117 | # Multilayer Perceptron Network (MLP)
118 |
119 | ```{r}
120 | system.time({
121 | tmod_mlp <- textmodel_mlp(dfmat_train,
122 | y = docvars(dfmat_train, "polarity"), epochs = 10)
123 | pred_mlp <- predict(tmod_mlp, newdata = dfmat_test, type = "class")
124 | table(pred_mlp, dfmat_test$polarity)[2:1, 2:1] %>% performance()
125 | })
126 | ```
127 |
128 |
129 | # CNN-embedding-LSTM model
130 |
131 | ```{r}
132 | system.time({
133 | toks_train <- data_corpus_LMRD %>%
134 |     corpus_subset(set == "train") %>%
135 | tokens() %>%
136 | tokens_remove("\\p{P}", valuetype = "regex", padding = TRUE)
137 | tmod_cnn <- textmodel_cnnlstmemb(toks_train,
138 | y = docvars(dfmat_train, "polarity"),
139 | epochs = 10)
140 |
141 | toks_test <- data_corpus_LMRD %>%
142 |     corpus_subset(set == "test") %>%
143 | tokens() %>%
144 | tokens_remove("\\p{P}", valuetype = "regex", padding = TRUE)
145 | pred_cnn <- predict(tmod_cnn, newdata = toks_test, type = "class")
146 | table(pred_cnn, dfmat_test$polarity) %>% performance()
147 | })
148 | ```
149 |
150 |
--------------------------------------------------------------------------------
/tests/spelling.R:
--------------------------------------------------------------------------------
1 | if(requireNamespace('spelling', quietly = TRUE))
2 | spelling::spell_check_test(vignettes = TRUE, error = TRUE,
3 | skip_on_cran = TRUE)
4 |
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library("testthat")
2 | library("quanteda.classifiers")
3 |
4 | library("quanteda")
5 | library("quanteda.textmodels")
6 |
7 | # library("keras")
8 | # use_session_with_seed(42)
9 | # see https://github.com/rstudio/keras/issues/890#issuecomment-539044011
10 | # tensorflow::tf$random$set_seed(42)
11 |
12 | test_check("quanteda.classifiers")
13 |
--------------------------------------------------------------------------------
/tests/testthat/test-performance.R:
--------------------------------------------------------------------------------
1 | context("test evaluation functions")
2 |
3 | lvs <- c("Relevant", "Irrelevant")
4 | pred <- factor(rep(lvs, times = c(42, 58)), levels = lvs)
5 | true <- factor(c(rep(lvs, times = c(30, 12)),
6 | rep(lvs, times = c(30, 28))), levels = lvs)
7 | tab <- table(pred, true)
8 |
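# The table built above is:
#             true
# pred         Relevant Irrelevant
#   Relevant         30         12
#   Irrelevant       30         28
# so, for the expectations below: precision = 30/42 = 0.714 and 28/58 = 0.483,
# recall = 30/60 = 0.5 and 28/40 = 0.7, accuracy = 58/100 = 0.58, and
# balanced accuracy = mean(recall) = 0.6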
9 | test_that("performance works by_class = TRUE", {
10 | perf <- performance(tab, by_class = TRUE)
11 | expect_equal(perf,
12 | list(precision = c(Relevant = 0.714,
13 | Irrelevant = 0.483),
14 | recall = c(Relevant = 0.5, Irrelevant = 0.7),
15 | f1 = c(Relevant = 0.588,
16 | Irrelevant = 0.571),
17 | accuracy = 0.58,
18 | balanced_accuracy = 0.6),
19 | tol = .001
20 | )
21 | })
22 |
23 | test_that("performance works by_class = FALSE", {
24 | perf <- performance(tab, by_class = FALSE)
25 | expect_equal(perf,
26 | list(precision = 0.599, recall = 0.6, f1 = 0.580,
27 | accuracy = 0.58, balanced_accuracy = 0.6),
28 | tol = .001
29 | )
30 | })
31 |
32 | test_that("exceptions work", {
33 | perf <- performance(tab, by_class = TRUE)
34 | #expect_error(
35 | # f1_score(perf[c("precision", "accuracy")]),
36 | # "list must contain both precision and recall"
37 | #)
38 | expect_error(
39 | balanced_accuracy(perf[c("precision", "accuracy")]),
40 | "list must include recall"
41 | )
42 |
43 | expect_error(
44 | balanced_accuracy(performance(tab, by_class = FALSE)),
45 | "recall must be computed by class"
46 | )
47 |
48 | expect_error(
49 | quanteda.classifiers:::check_table(tab, 3),
50 | "truth must be 2 for columns or 1 for rows"
51 | )
52 | tab2 <- tab
53 | colnames(tab2)[2] <- "dummy"
54 | expect_error(
55 | quanteda.classifiers:::check_table(tab2, 2),
56 | "predicted and truth values must have the same order and names"
57 | )
58 | })
59 |
--------------------------------------------------------------------------------
/tests/testthat/test-textmodel_cnnlstmemb.R:
--------------------------------------------------------------------------------
1 | context("test textmodel_cnnlstmemb")
2 |
3 | test_that("the cnnlstmemb model works", {
4 | skip()
5 | skip_on_cran()
6 |
7 | data(data_corpus_EPcoaldebate, package = "quanteda.textmodels")
8 | corp <- corpus_subset(data_corpus_EPcoaldebate,
9 | subset = language == "English") %>%
10 | corpus_sample(500)
11 |
12 | toks <- tokens(corp)
13 | label <- ifelse(docvars(corp, "crowd_subsidy_label") == "Pro-Subsidy", 1, 0)
14 | tmod <- textmodel_cnnlstmemb(toks, y = label, epochs = 8)
15 |
16 | expect_output(
17 | print(tmod),
18 | "Call:"
19 | )
20 | expect_equal(names(summary(tmod)), c("call", "model structure"))
21 | con_mat <- table(predict(tmod, type = "class"), label)
22 | accuracy <- sum(diag(con_mat)) / sum(con_mat)
23 | expect_equal(
24 | accuracy,
25 | 0.87,
26 | tolerance = 0.1
27 | )
28 | set.seed(10)
29 | pred_out <- predict(tmod, type = "probability")
30 | pred_max <- apply(pred_out, 1, function(x) colnames(pred_out)[which.max(x)])
31 | con_mat <- table(pred_max, label)
32 | accuracy <- sum(diag(con_mat)) / sum(con_mat)
33 | expect_equal(
34 | accuracy,
35 | 0.87,
36 | tolerance = 0.1
37 | )
38 | })
39 |
40 | test_that("multiclass prediction works", {
41 | skip()
42 | skip_on_cran()
43 |
44 | data(data_corpus_irishbudget2010, package = "quanteda.textmodels")
45 | toks <- tokens(data_corpus_irishbudget2010)
46 | y <- docvars(data_corpus_irishbudget2010, "party")
47 | y[5] <- NA
48 | tmod2 <- textmodel_cnnlstmemb(toks, y = y)
49 | expect_equal(
50 | names(predict(tmod2, type = "class"))[5],
51 | "Cowen, Brian (FF)"
52 | )
53 |
54 | probmat <- predict(tmod2, type = "probability")
55 | expect_equal(dim(probmat), c(14, 5))
56 | expect_equal(rownames(probmat), docnames(toks))
57 | expect_equal(colnames(probmat), tmod2$classnames)
58 | expect_equal(unname(rowSums(probmat)), rep(1, nrow(probmat)), tol = .000001)
59 | })
60 |
61 | test_that("cnnlstmemb works with tokens2sequences", {
62 | skip()
63 | skip_on_cran()
64 |
65 | data(data_corpus_irishbudget2010, package = "quanteda.textmodels")
66 |   toks1 <- tokens2sequences(x = tokens(data_corpus_irishbudget2010), keepn = 100)
67 | y <- docvars(data_corpus_irishbudget2010, "party")
68 | y[5] <- NA
69 | tmod2 <- textmodel_cnnlstmemb(x = toks1, y = y)
70 | expect_equal(
71 | names(predict(tmod2, type = "class"))[5],
72 | "Cowen, Brian (FF)"
73 | )
74 |
75 | probmat <- predict(tmod2, type = "probability")
76 | expect_equal(dim(probmat), c(14, 5))
77 | expect_equal(rownames(probmat), rownames(toks1$matrix))
78 | expect_equal(colnames(probmat), tmod2$classnames)
79 | expect_equal(unname(rowSums(probmat)), rep(1, nrow(probmat)), tol = .000001)
80 | })
81 |
--------------------------------------------------------------------------------
/tests/testthat/test-textmodel_evaluate.R:
--------------------------------------------------------------------------------
1 | context("test textmodel_evaluate")
2 |
3 | test_that("textmodel_evaluate works", {
4 | skip("until rewritten")
5 | skip_on_cran()
6 |
7 | data(data_corpus_EPcoaldebate, package = "quanteda.textmodels")
8 |
9 | set.seed(100)
10 | corp <- corpus_sample(data_corpus_EPcoaldebate, size = 500, by = "crowd_subsidy_label")
11 | dfmat <- dfm(corp) %>%
12 | dfm_trim(min_termfreq = 10)
13 | labels <- docvars(dfmat, "crowd_subsidy_label")
14 | model_eval <- textmodel_evaluate(x = dfmat, y = labels, model = "textmodel_mlp", fun = "f1_score", k = 3, seed = 5)
15 |
16 |   # Check outputs for consistency
17 | expect_equal(dim(model_eval), c(3, 4))
18 | expect_equal(names(model_eval), c("k", "f1_score", "time", "seed"))
19 | expect_equal(max(model_eval$k), 3)
20 |
21 |
22 | model_eval2 <- textmodel_evaluate(x = dfmat, y = labels, model = "textmodel_mlp", fun = "f1_score", k = 2, parameters = list(epochs = c(3, 4)), seed = 5)
23 |
24 |   # Check outputs for consistency
25 | expect_equal(dim(model_eval2), c(4, 5))
26 | expect_equal(names(model_eval2), c("k", "f1_score", "epochs", "time", "seed"))
27 | expect_equal(max(model_eval2$k), 2)
28 |
29 | # Check by_class
30 | model_eval3 <- textmodel_evaluate(x = dfmat, y = labels, model = "textmodel_mlp", fun = "recall", k = 2, seed = 5, by_class = TRUE)
31 | expect_true("class" %in% names(model_eval3))
32 | expect_true(sum(levels(labels) %in% model_eval3$class) == 3)
33 |
34 |   # Check if it works with textmodel_cnnlstmemb
35 |
36 | corp_tok <- corpus_sample(data_corpus_EPcoaldebate, size = 500, by = "crowd_subsidy_label")
37 | tok <- tokens(corp_tok)
38 | labels <- docvars(corp_tok, "crowd_subsidy_label")
39 | model_eval4 <- textmodel_evaluate(x = tok, y = labels, model = "textmodel_cnnlstmemb", fun = "f1_score", k = 3, seed = 5)
40 | expect_equal(dim(model_eval4), c(3, 4))
41 | expect_equal(names(model_eval4), c("k", "f1_score", "time", "seed"))
42 | expect_equal(max(model_eval4$k), 3)
43 | expect_true(min(model_eval4$f1_score) > 0.1 & max(model_eval4$f1_score) < 1)
44 | })
45 |
46 |
--------------------------------------------------------------------------------
/tests/testthat/test-textmodel_mlp.R:
--------------------------------------------------------------------------------
1 | context("test textmodel_mlp")
2 |
3 | test_that("the mlp model works", {
4 | skip("because of tensorflow install problems")
5 | skip_on_cran()
6 |
7 | data(data_corpus_EPcoaldebate, package = "quanteda.textmodels")
8 |
9 | set.seed(100)
10 | corp_train <- corpus_sample(data_corpus_EPcoaldebate, size = 3000, by = "crowd_subsidy_label")
11 | corp_test <- corpus_sample(data_corpus_EPcoaldebate, size = 10, by = "crowd_subsidy_label")
12 | dfmat_train <- dfm(corp_train)
13 | dfmat_test <- dfm(corp_test)
14 |
15 |   tmod <- textmodel_mlp(dfmat_train, y = docvars(dfmat_train, "crowd_subsidy_label"), epochs = 5)
16 |
17 | # label
18 | pred <- predict(tmod, newdata = dfmat_test, type = "class")
19 | tab <- table(pred, dfmat_test$crowd_subsidy_label)
20 | acc <- sum(diag(tab)) / sum(tab)
21 | expect_gte(acc, .6)
22 |
23 | # predicted prob
24 | prob <- predict(tmod, newdata = dfmat_test, type = "probability")
25 | expect_gte(prob["PL_Lamberts_3_3", "Anti-Subsidy"], .95)
26 |
27 | expect_output(
28 | print(tmod),
29 | "Call:"
30 | )
31 |
32 | expect_equal(names(summary(tmod)), c("call", "model structure"))
33 | set.seed(10)
34 | pred_max <- apply(prob, 1, function(x) colnames(prob)[which.max(x)])
35 | expect_equivalent(
36 | pred_max,
37 | as.character(pred)
38 | )
39 | })
40 |
--------------------------------------------------------------------------------
/tests/testthat/test-tokens2sequences.R:
--------------------------------------------------------------------------------
1 | context("test tokens2sequences")
2 |
3 | test_that("tokens2sequences works", {
4 | skip()
5 | skip_on_cran()
6 |
7 | ## Example from 13.1 of _An Introduction to Information Retrieval_
8 | text <- c("Chinese Beijing Chinese",
9 | "Chinese Chinese Shanghai",
10 | "Chinese Macao",
11 | "Tokyo Japan",
12 | "Chinese Chinese Chinese Tokyo Japan")
13 | text_tokens <- tokens(text)
14 | seq <- tokens2sequences(text_tokens, maxsenlen = 10, keepn = 5)
15 |
16 |   # Check outputs for consistency
17 | expect_equal(dim(seq$matrix), c(5, 10))
18 | expect_equal(seq$nfeatures, 5)
19 | expect_equal(max(seq$matrix), 5)
20 | expect_equal(min(seq$matrix), 0)
21 | expect_equal(as.integer(apply(seq$matrix, 1, function(x) sum(x != 0))), c(3, 3, 1, 2, 5))
22 |
23 | # Compare with keras's texts_to_sequences function
24 |   tok <- keras::text_tokenizer(filters = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r", lower = TRUE, num_words = 6) %>% # Note: Keras includes 0 as a word. tokens2sequences does not
25 | keras::fit_text_tokenizer(text)
26 |   tok_mat <- keras::texts_to_sequences(tok, texts = text) %>% keras::pad_sequences(maxlen = 10)
27 | seq_mat <- unname(seq$matrix)
28 | expect_equal(seq_mat, tok_mat)
29 |
30 | # Check whether it works on character vectors
31 | seq2 <- tokens2sequences(text, maxsenlen = 10, keepn = 5)
32 | expect_equal(dim(seq$matrix), dim(seq2$matrix))
33 | expect_equal(seq$nfeatures, seq2$nfeatures)
34 |
35 | # Check whether tokens2sequences performs recursively
36 | seq_short <- tokens2sequences(x = seq2, maxsenlen = 2, keepn = 5)
37 | expect_equal(ncol(seq_short$matrix), 2)
38 | expect_equal(seq_short$nfeatures, 4)
39 | })
40 |
41 | test_that("tokens2sequences_conform works", {
42 | skip()
43 | txt1 <- "This is sentence one. And here is sentence two."
44 | txt2 <- "This is sentence 3. Sentence 4! A fifth and final example."
45 | toks1 <- corpus(txt1) %>%
46 | corpus_reshape(to = "sentence") %>%
47 | tokens() %>%
48 | tokens_tolower()
49 | toks2 <- corpus(txt2) %>%
50 | corpus_reshape(to = "sentence") %>%
51 | tokens() %>%
52 | tokens_tolower()
53 |
54 | seqx <- tokens2sequences(toks1, maxsenlen = 4, keepn = 4)
55 | seqy <- tokens2sequences(toks2, maxsenlen = 4, keepn = 4)
56 |
57 | seqxy <- tokens2sequences_conform(seqx, seqy)
58 | expect_equal(dim(seqxy$matrix), c(2, 4))
59 | expect_equal(ncol(seqxy$features), 3)
60 | })
61 |
--------------------------------------------------------------------------------