├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ └── test-coverage.yaml ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── NAMESPACE ├── NEWS.md ├── R ├── crossval.R ├── data.R ├── performance.R ├── quanteda.classifiers-package.R ├── textmodel_cnnlstmemb.R ├── textmodel_evaluate.R ├── textmodel_mlp.R ├── tokens2sequences.R └── zzz.R ├── README.Rmd ├── README.md ├── data ├── data_corpus_LMRD.rda └── data_corpus_manifestosentsUK.rda ├── inst └── WORDLIST ├── man ├── crossval.Rd ├── data_corpus_LMRD.Rd ├── data_corpus_manifestosentsUK.Rd ├── is.tokens2sequences.Rd ├── performance.Rd ├── predict.textmodel_cnnlstmemb.Rd ├── predict.textmodel_mlp.Rd ├── quanteda.classifiers-package.Rd ├── save.textmodel_mlp.Rd ├── summary.textmodel_cnnlstmemb.Rd ├── summary.textmodel_mlp.Rd ├── textmodel_cnnlstmemb.Rd ├── textmodel_evaluate.Rd ├── textmodel_mlp.Rd ├── tokens2sequences.Rd └── tokens2sequences_conform.Rd ├── quanteda.classifiers.Rproj └── tests ├── data_creation ├── create_data_corpus_manifestosentUK.R ├── data_uk_immigration.zip ├── data_uk_manifestos_2015-2019.zip ├── data_uk_policyarea.zip └── master.sentences.Rdata ├── misc ├── test-LMRD.Rmd └── test-LMRD.nb.html ├── spelling.R ├── testthat.R └── testthat ├── test-performance.R ├── test-textmodel_cnnlstmemb.R ├── test-textmodel_evaluate.R ├── test-textmodel_mlp.R └── test-tokens2sequences.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^codecov\.yml$ 2 | ^appveyor\.yml$ 3 | ^\.travis\.yml$ 4 | ^.*\.Rproj$ 5 | ^\.Rproj\.user$ 6 | README.Rmd 7 | tests/misc/ 8 | \.github/ 9 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: 
-------------------------------------------------------------------------------- 1 | # For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. 2 | # https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | name: R-CMD-check 12 | 13 | jobs: 14 | R-CMD-check: 15 | runs-on: ${{ matrix.config.os }} 16 | 17 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 18 | 19 | strategy: 20 | fail-fast: false 21 | matrix: 22 | config: 23 | - {os: windows-latest, r: 'release'} 24 | - {os: macOS-latest, r: 'release'} 25 | - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} 26 | - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} 27 | 28 | env: 29 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 30 | RSPM: ${{ matrix.config.rspm }} 31 | 32 | steps: 33 | - uses: actions/checkout@v2 34 | 35 | - uses: r-lib/actions/setup-r@v2 36 | with: 37 | r-version: ${{ matrix.config.r }} 38 | 39 | - uses: r-lib/actions/setup-pandoc@v2 40 | 41 | - name: Query dependencies 42 | run: | 43 | install.packages('remotes') 44 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 45 | writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") 46 | shell: Rscript {0} 47 | 48 | - name: Cache R packages 49 | if: runner.os != 'Windows' 50 | uses: actions/cache@v1 51 | with: 52 | path: ${{ env.R_LIBS_USER }} 53 | key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} 54 | restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- 55 | 56 | - name: Install system dependencies 57 | if: runner.os == 'Linux' 58 | run: | 59 | while read -r cmd 60 | do 61 | eval sudo $cmd 62 | done < <(Rscript -e 
'cat(remotes::system_requirements("ubuntu", "20.04"), sep = "\n")') 63 | 64 | - name: Install dependencies 65 | run: | 66 | remotes::install_deps(dependencies = TRUE) 67 | remotes::install_cran("rcmdcheck") 68 | shell: Rscript {0} 69 | 70 | - name: Check 71 | env: 72 | _R_CHECK_CRAN_INCOMING_REMOTE_: false 73 | run: rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") 74 | shell: Rscript {0} 75 | 76 | - name: Upload check results 77 | if: failure() 78 | uses: actions/upload-artifact@main 79 | with: 80 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 81 | path: check 82 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | pull_request: 6 | branches: 7 | - master 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: macOS-latest 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - uses: r-lib/actions/setup-r@v2 18 | with: 19 | r-version: 'release' 20 | 21 | - uses: r-lib/actions/setup-pandoc@v2 22 | 23 | - name: Query dependencies 24 | run: | 25 | install.packages('remotes') 26 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 27 | shell: Rscript {0} 28 | 29 | - name: Cache R packages 30 | uses: actions/cache@v1 31 | with: 32 | path: ${{ env.R_LIBS_USER }} 33 | key: macOS-r-4.0-2-${{ hashFiles('.github/depends.Rds') }} 34 | restore-keys: macOS-r-4.0-2- 35 | 36 | - name: Install dependencies 37 | run: | 38 | install.packages(c("remotes")) 39 | remotes::install_deps(dependencies = TRUE) 40 | remotes::install_cran("covr") 41 | shell: Rscript {0} 42 | 43 | - name: Test coverage 44 | run: covr::codecov() 45 | shell: Rscript {0} 46 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | # Session Data files 5 | .RData 6 | # Example code in package build process 7 | *-Ex.R 8 | # Output files from R CMD build 9 | /*.tar.gz 10 | # Output files from R CMD check 11 | /*.Rcheck/ 12 | # RStudio files 13 | .Rproj.user/ 14 | # produced vignettes 15 | vignettes/*.html 16 | vignettes/*.pdf 17 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 18 | .httr-oauth 19 | # knitr and R markdown default cache directories 20 | /*_cache/ 21 | /cache/ 22 | # Temporary files created by R markdown 23 | *.utf8.md 24 | *.knit.md 25 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html 26 | rsconnect/ 27 | .Rproj.user 28 | quanteda.svm.Rproj 29 | .DS_Store 30 | 31 | tests/misc/test-LMRD.md 32 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | language: R 3 | 4 | dist: xenial 5 | sudo: false 6 | 7 | cache: 8 | packages: true 9 | directories: 10 | - $HOME/.keras 11 | - $HOME/.cache/pip 12 | 13 | warnings_are_errors: true 14 | 15 | matrix: 16 | include: 17 | - name: "Backend: TensorFlow | Implementation: Keras" 18 | env: 19 | - KERAS_BACKEND="tensorflow" 20 | - KERAS_IMPLEMENTATION="tensorflow" 21 | - TENSORFLOW_VERSION="default" 22 | 23 | before_script: 24 | - sudo apt-get update 25 | - sudo apt-get install python3 python3-pip 26 | - pip3 install --upgrade --ignore-installed --user travis virtualenv 27 | - R CMD INSTALL . 
28 | - R -e 'Sys.setenv(PIP_QUIET=1); keras::install_keras(tensorflow = Sys.getenv("TENSORFLOW_VERSION"), extra_packages = "IPython")' 29 | - R -e 'tensorflow::tf_config()' 30 | 31 | after_success: 32 | - Rscript -e 'covr::codecov()' 33 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: quanteda.classifiers 2 | Type: Package 3 | Title: Models for supervised text classification 4 | Version: 0.4 5 | Authors@R: c(person("Kenneth", "Benoit", email = "kbenoit@lse.ac.uk", role = c("aut", "cre")), 6 | person("Patrick", "Chester", email = "pjc468@nyu.edu", role = c("aut")), 7 | person("Stefan", "Müller", email = "mueller@ipz.uzh.ch", role = c("aut"))) 8 | Description: A test package for developing additional text classifier models (textmodels) including neural network-based methods. 9 | Encoding: UTF-8 10 | License: GPL-3 11 | Depends: 12 | R (>= 3.5.0) 13 | Imports: 14 | quanteda (>= 2.0), 15 | keras (>= 2.1.0), 16 | quanteda.textmodels, 17 | groupdata2 18 | Suggests: 19 | spelling, 20 | testthat, 21 | covr 22 | URL: https://github.com/quanteda/quanteda.classifiers 23 | BugReports: https://github.com/quanteda/quanteda.classifiers/issues 24 | LazyData: TRUE 25 | LazyDataCompression: bzip2 26 | RoxygenNote: 7.2.3 27 | Language: en-GB 28 | Roxygen: list(markdown = TRUE) 29 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(accuracy,default) 4 | S3method(accuracy,table) 5 | S3method(balanced_accuracy,default) 6 | S3method(balanced_accuracy,list) 7 | S3method(balanced_accuracy,table) 8 | S3method(crossval,textmodel) 9 | S3method(f1_score,default) 10 | S3method(f1_score,table) 11 | S3method(head,textmodel_evaluate) 12 | 
S3method(load,textmodel_cnnlstmemb) 13 | S3method(load,textmodel_mlp) 14 | S3method(performance,default) 15 | S3method(performance,table) 16 | S3method(precision,default) 17 | S3method(precision,table) 18 | S3method(predict,textmodel_cnnlstmemb) 19 | S3method(predict,textmodel_mlp) 20 | S3method(print,predict.textmodel_cnnlstmemb) 21 | S3method(print,predict.textmodel_mlp) 22 | S3method(print,textmodel_cnnlstmemb) 23 | S3method(print,textmodel_evaluate) 24 | S3method(print,textmodel_mlp) 25 | S3method(print,tokens2sequences) 26 | S3method(recall,default) 27 | S3method(recall,table) 28 | S3method(save,textmodel_cnnlstmemb) 29 | S3method(save,textmodel_mlp) 30 | S3method(summary,textmodel_cnnlstmemb) 31 | S3method(summary,textmodel_mlp) 32 | S3method(textmodel_cnnlstmemb,tokens) 33 | S3method(textmodel_cnnlstmemb,tokens2sequences) 34 | S3method(textmodel_evaluate,dfm) 35 | S3method(textmodel_evaluate,tokens) 36 | S3method(textmodel_mlp,dfm) 37 | S3method(tokens2sequences,character) 38 | S3method(tokens2sequences,tokens) 39 | S3method(tokens2sequences,tokens2sequences) 40 | S3method(tokens2sequences_conform,tokens2sequences) 41 | export(accuracy) 42 | export(balanced_accuracy) 43 | export(crossval) 44 | export(f1_score) 45 | export(is.tokens2sequences) 46 | export(performance) 47 | export(precision) 48 | export(recall) 49 | export(textmodel_cnnlstmemb) 50 | export(textmodel_evaluate) 51 | export(textmodel_mlp) 52 | export(tokens2sequences) 53 | export(tokens2sequences_conform) 54 | import(quanteda) 55 | import(quanteda.textmodels) 56 | importFrom(groupdata2,fold) 57 | importFrom(keras,bidirectional) 58 | importFrom(keras,compile) 59 | importFrom(keras,fit) 60 | importFrom(keras,keras_model_sequential) 61 | importFrom(keras,layer_activation) 62 | importFrom(keras,layer_conv_1d) 63 | importFrom(keras,layer_dense) 64 | importFrom(keras,layer_dropout) 65 | importFrom(keras,layer_embedding) 66 | importFrom(keras,layer_lstm) 67 | importFrom(keras,layer_max_pooling_1d) 68 | 
importFrom(keras,predict_classes) 69 | importFrom(keras,predict_proba) 70 | importFrom(keras,serialize_model) 71 | importFrom(keras,to_categorical) 72 | importFrom(keras,unserialize_model) 73 | importFrom(quanteda,dfm) 74 | importFrom(quanteda,dfm_subset) 75 | importFrom(stats,na.omit) 76 | importFrom(stats,predict) 77 | importFrom(utils,head) 78 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # quanteda.classifiers 0.4 2 | 3 | * Fixed a performance issue in `crossval()`; now it's much improved in speed. 4 | * Added a `NEWS.md` file to track changes to the package. 5 | -------------------------------------------------------------------------------- /R/crossval.R: -------------------------------------------------------------------------------- 1 | #' Cross-validate a fitted textmodel 2 | #' 3 | #' Cross-validate a fitted textmodel using _k_-fold cross-validation. 
4 | #' @param x a fitted textmodel 5 | #' @param k number of folds 6 | #' @inheritParams performance 7 | #' @param verbose logical; if `TRUE`, output results to the console 8 | #' @export 9 | #' @examples 10 | #' library("quanteda") 11 | #' library("quanteda.textmodels") 12 | #' dfmat <- tokens(data_corpus_moviereviews) |> 13 | #' dfm() 14 | #' tmod <- textmodel_nb(dfmat, y = data_corpus_moviereviews$sentiment) 15 | #' crossval(tmod, k = 5, by_class = TRUE) 16 | #' crossval(tmod, k = 5, by_class = FALSE) 17 | #' crossval(tmod, k = 5, by_class = FALSE, verbose = TRUE) 18 | crossval <- function(x, k = 5, by_class = FALSE, verbose = FALSE) { 19 | UseMethod("crossval") 20 | } 21 | 22 | #' @importFrom groupdata2 fold 23 | #' @importFrom quanteda dfm dfm_subset 24 | #' @import quanteda.textmodels 25 | #' @export 26 | crossval.textmodel <- function(x, k = 5, by_class = FALSE, verbose = FALSE) { 27 | # create folds vector - many ways to do this, I chose something available 28 | folds <- fold(data.frame(doc_id = docnames(x)), k = k)[[".folds"]] 29 | 30 | # result list (could be a df, I'm old-fashioned though) 31 | results <- list() 32 | 33 | # loop across folds and refit model, add to results list 34 | for (i in seq_len(k)) { 35 | this_mod <- do.call(class(x)[1], 36 | args = list(x = dfm_subset(x$x, folds != i), 37 | y = x$y[folds != i])) 38 | this_pred <- predict(this_mod, newdata = dfm_subset(x$x, folds == i), 39 | type = "class") 40 | results <- c(results, 41 | structure(list(c(performance(this_pred, x$y[folds == i]), 42 | list(obs = split(seq_len(ndoc(x)), folds)[[i]]))), 43 | names = paste0("fold_", i))) 44 | } 45 | 46 | summ <- summarize_results(results) 47 | 48 | # this may not be the "correct" way to do it - here it averages across 49 | # class-specific averages. Should we average across classes first within 50 | # folds and then average across folds? 
51 | if (!by_class) 52 | summ <- apply(summ, 2, mean) 53 | 54 | if (verbose) { 55 | cat("Cross-validation:\n\nMean results for k =", k, "folds:\n\n") 56 | print(summ) 57 | invisible(summ) 58 | } else { 59 | summ 60 | } 61 | } 62 | 63 | # old-skool function to aggregate across a 3-D array 64 | summarize_results <- function(x) { 65 | # remove the "obs" 66 | x <- lapply(x, function(y) y[-which(names(y) == "obs")]) 67 | 68 | # make into a 3D array 69 | x_df <- lapply(x, data.frame) 70 | x_array <- array(unlist(x_df), dim = c(dim(x_df[[1]]), length(x_df)), 71 | dimnames = c(dimnames(x_df[[1]]), list(names(x_df)))) 72 | 73 | apply(x_array, c(1, 2), mean) 74 | } 75 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' Sentence-level corpus of UK party manifestos 1945--2019, partially annotated 2 | #' 3 | #' @description A text corpus of sentences from publicly available party manifestos from the 4 | #' United Kingdom, published between 1945 and 2019. Some manifesto sentences 5 | #' have been rated in terms of the direction of policy using crowd-sourced coders. 6 | #' 7 | #' @description The manifestos from the 8 | #' three main parties (Labour Party, Conservatives, Liberal Democrats) between 9 | #' 1987 and 2010 have been labelled as Economic Policy, Social 10 | #' Policy, or Other, and rated in terms of the direction of Economic Policy and 11 | #' Social Policy. All party 12 | #' manifestos from the 2010 General Election have been crowd-coded in terms of 13 | #' immigration policy, and the direction of immigration policy. For more 14 | #' information on the coding approach see 15 | #' [Benoit et al. (2016)](https://doi.org/10.1017/S0003055416000058). 16 | #' 17 | #' @description The 18 | #' corpus contains the aggregated crowd coding values on the level of sentences.
19 | #' Note that the segmentation into sentences does not always work correctly due 20 | #' to missing punctuation. See Examples for how to remove very short and very 21 | #' long sentences using [quanteda::corpus_trim()]. 22 | #' 23 | #' @format 24 | #' The corpus consists of 88,954 documents (i.e. sentences) and includes the following 25 | #' document-level variables: \describe{ 26 | #' \item{party}{factor; abbreviation of the party that wrote the manifesto.} 27 | #' \item{partyname}{factor; party that wrote the manifesto.} 28 | #' \item{year}{integer; 4-digit year of the election.} 29 | #' \item{crowd_econsocial_label}{factor; indicates the majority label assigned 30 | #' by crowd workers (Economic Policy, Social Policy, or Neither). The variable 31 | #' has missing values (`NA`) for all non-annotated manifestos.} 32 | #' \item{crowd_econsocial_mean}{numeric; the direction of statements coded as 33 | #' "Economic Policy" or "Social Policy" based on the aggregated crowd codings. 34 | #' The variable is the mean of the scores assigned by the workers who 35 | #' coded the sentence and who allocated the sentence to the "majority" 36 | #' category. The variable ranges from -2 to +2. 37 | #' 38 | #' For the statements aggregated as "Economic Policy", -2 corresponds to "Very 39 | #' left"; +2 corresponds to "Very right". For the statements aggregated as 40 | #' "Social Policy", -2 corresponds to "Very liberal"; +2 corresponds to "Very 41 | #' conservative". The variable has missing values (`NA`) for all sentences that 42 | #' were aggregated as "Neither" and for all non-annotated manifestos.} 43 | #' \item{crowd_econsocial_n}{integer; the number of coders who contributed to the 44 | #' mean score `crowd_econsocial_mean`.} 45 | #' \item{crowd_immigration_label}{Factor indicating whether the majority of 46 | #' crowd workers labelled a sentence as referring to immigration or not.
The 47 | #' variable has missing values (`NA`) for all non-annotated manifestos.} 48 | #' \item{crowd_immigration_mean}{numeric; the direction 49 | #' of statements coded as "Immigration" based on the aggregated crowd codings. 50 | #' The variable is the mean of the scores assigned by workers who coded a 51 | #' sentence and who allocated the sentence to the "Immigration" category. The 52 | #' variable ranges from -1 ("Negative and closed immigration policy") to +1 53 | #' (Favorable and open immigration policy). The variable has missing values 54 | #' (`NA`) for all non-annotated manifestos or if a sentence was not coded as 55 | #' referring to immigration policy based on the aggregation of crowd codings.} 56 | #' \item{crowd_immigration_n}{integer; the number of coders who 57 | #' contributed to the 58 | #' mean score `crowd_immigration_mean`.} 59 | #' } 60 | #' @examples 61 | #' \donttest{ 62 | #' library("quanteda") 63 | #' 64 | #' # keep only crowd coded manifestos (with respect to economic and social policy) 65 | #' corp_crowdeconsocial <- 66 | #' corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_econsocial_label)) 67 | #' 68 | #' # keep only crowd coded manifestos (with respect to immigration policy) 69 | #' corp_crowdimmig <- 70 | #' corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label)) 71 | #' } 72 | #' @references Benoit, K., Conway, D., Lauderdale, B.E., Laver, M., & Mikhaylov, S. (2016). 73 | #' [Crowd-sourced Text Analysis: 74 | #' Reproducible and Agile Production of Political Data](https://doi.org/10.1017/S0003055416000058). 75 | #' *American Political Science Review*, 100,(2), 278--295. 76 | #' @format 77 | #' A [corpus][quanteda::corpus] object. 78 | #' @keywords data 79 | "data_corpus_manifestosentsUK" 80 | 81 | #' Large Movie Review Dataset from Maas et. al. 
(2011) 82 | #' 83 | #' A corpus object containing a dataset for sentiment classification containing 84 | #' 25,000 highly polar movie reviews for training, and 25,000 for testing, from 85 | #' Maas et al. (2011). 86 | #' @format The corpus docvars consist of: 87 | #' \describe{ 88 | #' \item{docnumber}{serial (within set and polarity) document number} 89 | #' \item{rating}{user-assigned movie rating on a 1-10 point integer scale} 90 | #' \item{set}{used for test v. training set} 91 | #' \item{polarity}{either `neg` or `pos` to indicate whether the 92 | #' movie review was negative or positive. See Maas et al. (2011) for the 93 | #' cut-off values that governed this assignment.} 94 | #' } 95 | #' @references Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew 96 | #' Y. Ng, and Christopher Potts. (2011). 97 | #' "[Learning 98 | #' Word Vectors for Sentiment Analysis](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf)". The 49th Annual Meeting of the 99 | #' Association for Computational Linguistics (ACL 2011). 100 | #' @source <http://ai.stanford.edu/~amaas/data/sentiment/> 101 | #' @keywords data 102 | "data_corpus_LMRD" 103 | -------------------------------------------------------------------------------- /R/performance.R: -------------------------------------------------------------------------------- 1 | # overall performance ---------- 2 | 3 | #' Performance statistics for prediction 4 | #' 5 | #' @description Functions for computing performance statistics used for model 6 | #' evaluation.
7 | #' 8 | #' @description `performance()` computes all of the following, which are also 9 | #' available via specific functions: 10 | #' 11 | #' Given a 2 x 2 table with notation 12 | #' 13 | #' \tabular{rcc}{ \tab Truth \tab \cr Predicted \tab Positive \tab 14 | #' Negative \cr Positive \tab _A_ \tab _B_ \cr Negative \tab _C_ \tab _D_ \cr } 15 | #' 16 | #' The metrics computed here are: 17 | #' \itemize{ 18 | #' \item{precision: }{\eqn{A / (A + B)}} 19 | #' \item{recall: }{\eqn{A / (A + C)}} 20 | #' \item{_F1_: }{\eqn{2 / (recall^{-1} + precision^{-1})}} 21 | #' \item{accuracy: }{\eqn{(A + D) / (A + B + C + D)}, or correctly predicted / all} 22 | #' \item{balanced_accuracy: }{mean(recall) for all categories} 23 | #' } 24 | #' @param data a table of predicted by truth, or vector of predicted labels 25 | #' @param truth vector of "true" labels, or if a table, `2` to indicate that the 26 | #' "true" values are in columns, or `1` if in rows. 27 | #' @param by_class logical; if `TRUE`, estimate performance score separately for 28 | #' each class, otherwise average across classes 29 | #' @param ... not used 30 | #' @return named list consisting of the selected measure(s), where each element 31 | #' is a scalar if `by_class = FALSE`, or a vector named by class if `by_class 32 | #' = TRUE`. 33 | #' @references 34 | #' Powers, D. (2007). "Evaluation: From Precision, Recall and F Factor to ROC, 35 | #' Informedness, Markedness and Correlation." _Technical Report SIE-07-001_, 36 | #' Flinders University. 
37 | #' @examples 38 | #' ## Data in Table 2 of Powers (2007) 39 | #' 40 | #' lvs <- c("Relevant", "Irrelevant") 41 | #' tbl_2_1_pred <- factor(rep(lvs, times = c(42, 58)), levels = lvs) 42 | #' tbl_2_1_truth <- factor(c(rep(lvs, times = c(30, 12)), 43 | #' rep(lvs, times = c(30, 28))), 44 | #' levels = lvs) 45 | #' 46 | #' performance(tbl_2_1_pred, tbl_2_1_truth) 47 | #' performance(tbl_2_1_pred, tbl_2_1_truth, by_class = FALSE) 48 | #' performance(table(tbl_2_1_pred, tbl_2_1_truth), by_class = TRUE) 49 | #' 50 | #' @export 51 | performance <- function(data, truth, by_class = TRUE, ...) { 52 | UseMethod("performance") 53 | } 54 | 55 | #' @export 56 | performance.default <- function(data, truth, by_class = TRUE, ...) { 57 | performance(build_table(data, truth), by_class = by_class) 58 | } 59 | 60 | #' @export 61 | performance.table <- function(data, truth = 2, by_class = TRUE, ...) { 62 | data <- check_table(data, truth) 63 | result <- as.list(c(precision(data, by_class = by_class), 64 | recall(data, by_class = by_class), 65 | accuracy(data), 66 | balanced_accuracy(data), 67 | f1_score(data, by_class = by_class))) 68 | result[c("precision", "recall", "f1", "accuracy", "balanced_accuracy")] 69 | } 70 | 71 | 72 | # precision, recall, f1 ---------- 73 | 74 | #' @rdname performance 75 | #' @export 76 | #' @examples 77 | #' precision(tbl_2_1_pred, tbl_2_1_truth) 78 | #' 79 | precision <- function(data, truth, by_class = TRUE, ...) { 80 | UseMethod("precision") 81 | } 82 | 83 | #' @export 84 | precision.default <- function(data, truth, by_class = TRUE, ...) { 85 | precision(build_table(data, truth), by_class = by_class) 86 | } 87 | 88 | #' @export 89 | precision.table <- function(data, truth = 2, by_class = TRUE, ...) 
{ 90 | data <- check_table(data, truth) 91 | prec <- sapply(seq_along(diag(data)), 92 | function(x) diag(data)[x] / sum(data[x, ])) 93 | prec <- list(precision = prec) 94 | if (by_class) prec else sapply(prec, mean) 95 | } 96 | 97 | 98 | #' @rdname performance 99 | #' @export 100 | #' @examples 101 | #' recall(tbl_2_1_pred, tbl_2_1_truth) 102 | #' 103 | recall <- function(data, truth, by_class = TRUE, ...) { 104 | UseMethod("recall") 105 | } 106 | 107 | #' @export 108 | recall.default <- function(data, truth, by_class = TRUE, ...) { 109 | recall(build_table(data, truth), by_class = by_class) 110 | } 111 | 112 | #' @export 113 | recall.table <- function(data, truth = 2, by_class = TRUE, ...) { 114 | data <- check_table(data, truth) 115 | prec <- sapply(seq_along(diag(data)), 116 | function(x) diag(data)[x] / sum(data[, x])) 117 | prec <- list(recall = prec) 118 | if (by_class) prec else sapply(prec, mean) 119 | } 120 | 121 | #' @rdname performance 122 | #' @export 123 | #' @examples 124 | #' f1_score(tbl_2_1_pred, tbl_2_1_truth) 125 | #' 126 | f1_score <- function(data, truth, by_class = TRUE, ...) { 127 | UseMethod("f1_score") 128 | } 129 | 130 | #' @export 131 | f1_score.default <- function(data, truth, by_class = TRUE, ...) { 132 | f1_score(build_table(data, truth), by_class = by_class) 133 | } 134 | 135 | #' @export 136 | f1_score.table <- function(data, truth = 2, by_class = TRUE, ...) { 137 | data <- check_table(data, truth) 138 | pr <- data.frame(precision = precision(data, by_class = TRUE)[[1]], 139 | recall = recall(data, by_class = TRUE)[[1]]) 140 | #f1_score(pr) 141 | f1 <- list(f1 = apply(pr[c("precision", "recall")], 1, 142 | function(y) 2 / sum(y^(-1)))) 143 | if (by_class) f1 else sapply(f1, mean) 144 | } 145 | 146 | # #' @export 147 | # f1_score.list <- function(data, ...) 
{ 148 | # if (!all(c("precision", "recall") %in% names(data))) 149 | # stop("list must contain both precision and recall") 150 | # pr <- list(precision = precision(data, by_class = by_class), 151 | # recall = recall(data, by_class = by_class)) 152 | # 153 | # result <- list(f1 = apply(data.frame(data[c("precision", "recall")]), 1, 154 | # function(y) 2 / sum(y^(-1)))) 155 | # if (length(result[[1]]) == 1) result[[1]] <- unname(result[[1]]) 156 | # result 157 | #} 158 | 159 | # accuracy ---------- 160 | 161 | #' @rdname performance 162 | #' @export 163 | #' @examples 164 | #' accuracy(tbl_2_1_pred, tbl_2_1_truth) 165 | #' 166 | accuracy <- function(data, truth, ...) { 167 | UseMethod("accuracy") 168 | } 169 | 170 | #' @export 171 | accuracy.default <- function(data, truth, ...) { 172 | accuracy(build_table(data, truth)) 173 | } 174 | 175 | #' @export 176 | accuracy.table <- function(data, truth = 2, ...) { 177 | data <- check_table(data, truth) 178 | list(accuracy = sum(diag(data)) / sum(data)) 179 | } 180 | 181 | #' @rdname performance 182 | #' @export 183 | #' @examples 184 | #' balanced_accuracy(tbl_2_1_pred, tbl_2_1_truth) 185 | #' 186 | balanced_accuracy <- function(data, ...) { 187 | UseMethod("balanced_accuracy") 188 | } 189 | 190 | #' @export 191 | balanced_accuracy.default <- function(data, truth, by_class = TRUE, ...) { 192 | balanced_accuracy(build_table(data, truth)) 193 | } 194 | 195 | #' @export 196 | balanced_accuracy.table <- function(data, truth = 2, ...) { 197 | data <- check_table(data, truth) 198 | rec <- recall(data, by_class = TRUE) 199 | balanced_accuracy(rec) 200 | } 201 | 202 | #' @export 203 | balanced_accuracy.list <- function(data, ...) { 204 | if (! 
"recall" %in% names(data)) 205 | stop("list must include recall") 206 | if (length(data[["recall"]]) < 2) 207 | stop("recall must be computed by class") 208 | list(balanced_accuracy = mean(unlist(data["recall"]))) 209 | } 210 | 211 | # utility functions ------------- 212 | 213 | check_table <- function(data, truth) { 214 | if (!truth %in% c(1, 2)) 215 | stop("truth must be 2 for columns or 1 for rows") 216 | if (!identical(rownames(data), colnames(data))) 217 | stop("predicted and truth values must have the same order and names") 218 | if (truth == 1) data <- t(data) 219 | data 220 | } 221 | 222 | build_table <- function(data, truth) { 223 | truth <- as.factor(truth) 224 | data <- factor(data, levels = levels(truth)) 225 | table(data, truth) 226 | } 227 | -------------------------------------------------------------------------------- /R/quanteda.classifiers-package.R: -------------------------------------------------------------------------------- 1 | #' quanteda.classifiers 2 | #' 3 | #' Extensions to \pkg{quanteda} that provide supervised machine learning models 4 | #' for document-feature matrices. 5 | #' @import quanteda.textmodels 6 | #' @import quanteda 7 | #' @keywords internal 8 | "_PACKAGE" 9 | 10 | # The following block is used by usethis to automatically manage 11 | # roxygen namespace tags. Modify with care! 12 | ## usethis namespace: start 13 | ## usethis namespace: end 14 | NULL 15 | -------------------------------------------------------------------------------- /R/textmodel_cnnlstmemb.R: -------------------------------------------------------------------------------- 1 | #' \[Experimental\] Convolutional NN + LSTM model fitted to word embeddings 2 | #' 3 | #' A function that combines a convolutional neural network layer with a long 4 | #' short-term memory layer. It is designed to incorporate word sequences, 5 | #' represented as sequentially ordered word embeddings, into text 6 | #' classification. 
#' The model takes as an input a \pkg{quanteda} tokens object.
#'
#' @param x tokens object
#' @inheritParams quanteda.textmodels::textmodel_svm
#' @param dropout1 A floating variable bound between 0 and 1. It determines the
#'   rate at which units are dropped for the linear transformation of the
#'   inputs for the embedding layer.
#' @param dropout2 A floating variable bound between 0 and 1. It determines the
#'   rate at which units are dropped for the linear transformation of the
#'   inputs for the CNN layer.
#' @param dropout3 A floating variable bound between 0 and 1. It determines the
#'   rate at which units are dropped for the linear transformation of the
#'   inputs for the recurrent layer.
#' @param dropout4 A floating variable bound between 0 and 1. It determines the
#'   rate at which units are dropped for the linear transformation of the
#'   inputs for the recurrent layer.
#' @param wordembeddim The number of word embedding dimensions to be fit
#' @param cnnlayer A logical parameter that allows user to include or exclude a
#'   convolutional layer in the neural network model
#' @param filter The number of output filters in the convolution
#' @param kernel_size An integer or list of a single integer, specifying the
#'   length of the 1D convolution window
#' @param pool_size Size of the max pooling windows.
#'   [keras::layer_max_pooling_1d()]
#' @param units_lstm The number of nodes of the lstm layer
#' @param words The maximum number of words used to train model. Defaults to
#'   the number of features in `x`
#' @param maxsenlen The maximum sentence length of training data
#' @param optimizer optimizer used to fit model to training data, see
#'   [keras::compile.keras.engine.training.Model()]
#' @param loss objective loss function, see
#'   [keras::compile.keras.engine.training.Model()]
#' @param metrics metric used to train algorithm, see
#'   [keras::compile.keras.engine.training.Model()]
#' @param ... additional options passed to
#'   [keras::fit.keras.engine.training.Model()]
#' @keywords textmodel
#' @importFrom keras keras_model_sequential to_categorical
#' @importFrom keras layer_dense layer_activation layer_dropout compile fit
#' @importFrom keras layer_embedding layer_conv_1d layer_max_pooling_1d
#'   layer_lstm bidirectional
#' @seealso [save.textmodel_cnnlstmemb()], [load.textmodel_cnnlstmemb()]
#' @export
#' @examples
#' \dontrun{
#' # create dataset with evenly balanced coded & uncoded immigration sentences
#' corpcoded <- corpus_subset(data_corpus_manifestosentsUK,
#'                            !is.na(crowd_immigration_label))
#' corpuncoded <- data_corpus_manifestosentsUK %>%
#'     corpus_subset(is.na(crowd_immigration_label) & year > 1980) %>%
#'     corpus_sample(size = ndoc(corpcoded))
#' corp <- corpcoded + corpuncoded
#'
#' tok <- tokens(corp)
#'
#' tmod <- textmodel_cnnlstmemb(tok,
#'                              y = docvars(tok, "crowd_immigration_label"),
#'                              epochs = 5, verbose = 1)
#'
#' newdata = tokens_subset(tok, subset = is.na(crowd_immigration_label))
#' pred <- predict(tmod, newdata = newdata)
#' table(pred)
#' tail(texts(corpuncoded)[pred == "Immigration"], 10)
#'
#' }
textmodel_cnnlstmemb <- function(x, y, dropout1 = 0.2, dropout2 = 0.2,
                                 dropout3 = 0.2, dropout4 = 0.2,
                                 wordembeddim = 30, cnnlayer = TRUE,
                                 filter = 48, kernel_size = 5, pool_size = 4,
                                 units_lstm = 128, words = NULL,
                                 maxsenlen = 100, optimizer = "adam",
                                 loss = "categorical_crossentropy",
                                 metrics = "categorical_accuracy", ...) {
    UseMethod("textmodel_cnnlstmemb")
}

#' @export
textmodel_cnnlstmemb.tokens <- function(x, y, dropout1 = 0.2, dropout2 = 0.2,
                                        dropout3 = 0.2, dropout4 = 0.2,
                                        wordembeddim = 30, cnnlayer = TRUE,
                                        filter = 48, kernel_size = 5,
                                        pool_size = 4, units_lstm = 128,
                                        words = NULL, maxsenlen = 100,
                                        optimizer = "adam",
                                        loss = "categorical_crossentropy",
                                        metrics = "categorical_accuracy", ...) {
    stopifnot(ndoc(x) == length(y))
    stopifnot(is.tokens(x))
    y <- as.factor(y)
    result <- list(x = x, y = y, call = match.call(), classnames = levels(y))

    # drop documents with missing labels before fitting
    na_ind <- which(is.na(y))
    if (length(na_ind) > 0) {
        # BUG FIX: the keep mask must be built on the *original* length; the
        # previous code shortened y first, so the mask was one element short
        # per NA and misaligned with x.
        keep <- rep(TRUE, length(y))
        keep[na_ind] <- FALSE
        y <- y[-na_ind]
        # logical indexing as a workaround: negative indexing is broken for
        # quanteda v2 tokens objects
        x <- x[keep]
    }

    x <- tokens2sequences(x, maxsenlen = maxsenlen, keepn = words)

    if (is.null(words))
        words <- x$nfeatures
    # "one-hot" encode y
    y2 <- to_categorical(as.integer(y) - 1, num_classes = nlevels(y))

    # use keras to fit the model
    model <- keras_model_sequential()

    model %>%
        layer_embedding(input_dim = words + 1, output_dim = wordembeddim,
                        input_length = maxsenlen) %>%
        layer_dropout(rate = dropout1)

    if (isTRUE(cnnlayer)) {
        model %>%
            layer_conv_1d(filters = filter, kernel_size = kernel_size,
                          activation = "relu") %>%
            layer_max_pooling_1d(pool_size = pool_size) %>%
            layer_dropout(rate = dropout2)
    }

    model %>%
        bidirectional(layer_lstm(units = units_lstm, dropout = dropout3,
                                 recurrent_dropout = dropout4)) %>%
        layer_dense(units = nlevels(y), activation = "softmax")

    compile(model, loss = loss, optimizer = optimizer, metrics = metrics)
    history <- fit(model, x$matrix, y2, ...)

    # assemble, class, and return the result
    result <- c(result,
                nfeatures = x$nfeatures,
                maxsenlen = maxsenlen,
                list(clefitted = model))
    class(result) <- c("textmodel_cnnlstmemb", "textmodel", "list")
    result
}

#' @export
textmodel_cnnlstmemb.tokens2sequences <- function(x, y, dropout1 = 0.2,
                                                  dropout2 = 0.2,
                                                  dropout3 = 0.2,
                                                  dropout4 = 0.2,
                                                  wordembeddim = 30,
                                                  cnnlayer = TRUE, filter = 48,
                                                  kernel_size = 5,
                                                  pool_size = 4,
                                                  units_lstm = 128,
                                                  words = NULL,
                                                  maxsenlen = 100,
                                                  optimizer = "adam",
                                                  loss = "categorical_crossentropy",
                                                  metrics = "categorical_accuracy",
                                                  ...) {
    stopifnot(nrow(x$matrix) == length(y))
    stopifnot(is.tokens2sequences(x))
    x <- tokens2sequences(x, maxsenlen = maxsenlen, keepn = words)
    y <- as.factor(y)
    result <- list(x = x, y = y, call = match.call(), classnames = levels(y))

    # drop documents with missing labels before fitting
    na_ind <- which(is.na(y))
    if (length(na_ind) > 0) {
        # BUG FIX: as in the tokens method, build the keep mask before
        # shortening y so its length matches the rows of x$matrix
        keep <- rep(TRUE, length(y))
        keep[na_ind] <- FALSE
        y <- y[-na_ind]
        x$matrix <- x$matrix[keep, , drop = FALSE]
    }

    words <- x$nfeatures
    maxsenlen <- ncol(x$matrix)
    # "one-hot" encode y
    y2 <- to_categorical(as.integer(y) - 1, num_classes = nlevels(y))

    # use keras to fit the model
    model <- keras_model_sequential()

    model %>%
        layer_embedding(input_dim = words + 1, output_dim = wordembeddim,
                        input_length = maxsenlen) %>%
        layer_dropout(rate = dropout1)

    if (isTRUE(cnnlayer)) {
        model %>%
            layer_conv_1d(filters = filter, kernel_size = kernel_size,
                          activation = "relu") %>%
            layer_max_pooling_1d(pool_size = pool_size) %>%
            layer_dropout(rate = dropout2)
    }

    # NOTE(review): unlike the tokens method above, this method fits a
    # *unidirectional* LSTM (no bidirectional() wrapper); confirm whether the
    # asymmetry is intentional before unifying.
    model %>%
        layer_lstm(units = units_lstm, dropout = dropout3,
                   recurrent_dropout = dropout4) %>%
        layer_dense(units = nlevels(y), activation = "softmax")

    compile(model, loss = loss, optimizer = optimizer, metrics = metrics)
    history <- fit(model, x$matrix, y2, ...)

    # assemble, class, and return the result
    result <- c(result,
                nfeatures = x$nfeatures,
                maxsenlen = maxsenlen,
                list(clefitted = model))
    class(result) <- c("textmodel_cnnlstmemb", "textmodel", "list")
    result
}

#' Prediction from a fitted textmodel_cnnlstmemb object
#'
#' `predict.textmodel_cnnlstmemb()` implements class predictions from a
#' fitted long short-term memory neural network model.
#' @param object a fitted [textmodel_cnnlstmemb] model
#' @param newdata dfm on which prediction should be made
#' @param type the type of predicted values to be returned; see Value
#' @param force make `newdata`'s feature set conformant to the model terms
#' @param ... not used
#' @return `predict.textmodel_cnnlstmemb` returns either a vector of class
#'   predictions for each row of `newdata` (when `type = "class"`), or
#'   a document-by-class matrix of class probabilities (when `type =
#'   "probability"`).
#' @seealso [textmodel_cnnlstmemb()]
#' @keywords textmodel internal
#' @importFrom keras predict_classes predict_proba
#' @importFrom stats predict
#' @export
predict.textmodel_cnnlstmemb <- function(object, newdata = NULL,
                                         type = c("class", "probability"),
                                         force = TRUE,
                                         ...) {
    quanteda:::unused_dots(...)

    type <- match.arg(type)

    if (!is.null(newdata)) {
        if (is.tokens(newdata)) {
            data <- tokens2sequences(newdata, maxsenlen = object$maxsenlen,
                                     keepn = object$nfeatures)
        } else {
            data <- newdata
        }
        # conform new data to the feature mapping the model was trained on
        t2s_object <- tokens2sequences(object$x, maxsenlen = object$maxsenlen,
                                       keepn = object$nfeatures)
        data <- tokens2sequences_conform(data, t2s_object)
    } else {
        # no new data: predict on the training documents
        data <- tokens2sequences(object$x, maxsenlen = object$maxsenlen,
                                 keepn = object$nfeatures)
    }

    # NOTE(review): keras::predict_classes()/predict_proba() are deprecated in
    # recent keras/TensorFlow releases; consider migrating to predict() when
    # the package's minimum keras version allows.
    if (type == "class") {
        pred_y <- predict_classes(object$clefitted, x = data$matrix)
        pred_y <- factor(pred_y, labels = object$classnames,
                         levels = (seq_along(object$classnames) - 1))
        names(pred_y) <- rownames(data$matrix)
    } else if (type == "probability") {
        pred_y <- predict_proba(object$clefitted, x = data$matrix)
        colnames(pred_y) <- object$classnames
        rownames(pred_y) <- rownames(data$matrix)
    }

    pred_y
}

#' @export
#' @importFrom stats na.omit
#' @method print textmodel_cnnlstmemb
print.textmodel_cnnlstmemb <- function(x, ...) {
    layer_names <- gsub(pattern = "_\\d*", "",
                        lapply(x$clefitted$layers, function(z) z$name))
    cat("\nCall:\n")
    print(x$call)
    # BUG FIX: x$nfeatures is already a count; length(x$nfeatures) always
    # printed "1 fitted features"
    cat("\n",
        format(length(na.omit(x$y)), big.mark = ","), " training documents; ",
        format(x$nfeatures, big.mark = ","), " fitted features",
        ".\n",
        "Structure: ", paste(layer_names, collapse = " -> "), "\n",
        sep = "")
}

#' summary method for textmodel_cnnlstmemb objects
#' @param object output from [textmodel_cnnlstmemb()]
#' @param ... additional arguments not used
#' @keywords textmodel internal
#' @method summary textmodel_cnnlstmemb
#' @export
summary.textmodel_cnnlstmemb <- function(object, ...) {
    # strip keras's numeric suffixes ("dense_3" -> "dense") for readability
    layer_names <- gsub(pattern = "_\\d*", "",
                        lapply(object$clefitted$layers, function(lyr) lyr$name))

    result <- list(
        "call" = object$call,
        "model structure" = paste(layer_names, collapse = " -> ")
    )
    as.summary.textmodel(result)
}

#' @export
#' @method print predict.textmodel_cnnlstmemb
print.predict.textmodel_cnnlstmemb <- function(x, ...) {
    print(unclass(x))
}


#' @rdname save.textmodel_mlp
#' @importFrom keras serialize_model
#' @method save textmodel_cnnlstmemb
#' @export
save.textmodel_cnnlstmemb <- function(x, ...) {
    # keras models are external references; serialize before save()
    x$clefitted <- serialize_model(x$clefitted)
    save(x, ...)
}

#' @rdname save.textmodel_mlp
#' @importFrom keras unserialize_model
#' @method load textmodel_cnnlstmemb
#' @export
load.textmodel_cnnlstmemb <- function(x, ...) {
    # load() restores the object under the name it was saved with ("x"),
    # overwriting the path argument in this frame
    load(x, ...)
    x$clefitted <- unserialize_model(x$clefitted)
    # BUG FIX: return the restored textmodel; previously the function's value
    # was that of the assignment above, i.e. only the keras model
    x
}

# ---------------------------------------------------------------------------
# R/textmodel_evaluate.R

#' Model evaluation function
#'
#' Designed to streamline the parameter tuning and evaluation process. Users
#' choose a function to evaluate and include parameter values as a list. If
#' multiple parameter values are provided, the function will perform a grid
#' search by estimating a separate model for every combination of parameters.
#'
#' @param x the \link{dfm} or \link{tokens} object on which the model will be
#'   fit. Does not need to contain only the training documents.
#' @param y vector of training labels associated with each document identified
#'   in \code{train}. (These will be converted to factors if not already
#'   factors.)
#' @param model the name of the machine learning function that will be evaluated
#' @param fun the name of the function that will be used to evaluate the machine
#'   learning model. Can take the values "accuracy", "precision", "recall", or
#'   "f1_score"
#' @param k number of folds
#' @param parameters model hyperparameters
#' @param seed a seed that can allow for replication of k training data splits.
#'   If seed is not provided a seed is chosen based on the current time.
#' @param time a logical parameter that determines whether output will include
#'   training time (in seconds) of model
#' @param by_class estimates a separate value of provided evaluation function
#'   for every class of the true vector
#' @importFrom stats predict
#' @examples
#' # evaluate immigration classification performance
#' \dontrun{
#' dfmat <- dfm(data_corpus_manifestosentsUK)
#' codes <- docvars(data_corpus_manifestosentsUK, "crowd_immigration_label")
#' evaluation <- textmodel_evaluate(dfmat, codes, k = 3,
#'                                  model = "textmodel_mlp", fun = "f1_score",
#'                                  parameters = list(epochs = c(3, 4)))
#' head(evaluation)
#' aggregate(evaluation, by = list(evaluation$cost), FUN = "mean")
#' }
#' @export
#'
textmodel_evaluate <- function(x, y,
                               model,
                               fun = "f1_score",
                               k = 5,
                               parameters = list(),
                               seed = as.numeric(Sys.time()),
                               time = TRUE,
                               by_class = FALSE) {
    UseMethod("textmodel_evaluate")
}

# Shared k-fold evaluation engine used by both the dfm and tokens methods
# (previously duplicated wholesale in each method).
# slice: function(data, index) that subsets the documents of x; dfm objects
#   need d[i, ], tokens objects need d[i].  Negative indices are passed
#   through for the training split.
# NOTE(review): the `time` argument of the methods is currently not consulted;
# a "time" column is always emitted, as in the original implementation.
evaluate_folds <- function(x, y, model, fun, k, parameters, seed, by_class,
                           slice) {
    if ("accuracy" %in% fun && by_class) {
        cat("No class oriented accuracy score defined. Calculating average accuracy across all classes.\n")
    }
    total_start <- Sys.time()
    set.seed(seed)
    y <- as.factor(y)
    # assign each document to one of k folds, in random order
    folds <- cut(seq_len(length(y)), breaks = k, labels = FALSE)
    folds <- sample(folds, length(folds), replace = FALSE)
    output <- list()
    params_df <- expand.grid(parameters, stringsAsFactors = FALSE)
    param_len <- if (length(parameters) != 0) nrow(params_df) else 1
    w <- 1
    for (t in seq_len(param_len)) {
        # drop = FALSE ensures that params_df remains a data.frame even if
        # there is only a single input parameter
        param_list <- as.list(params_df[t, , drop = FALSE])
        for (i in seq_len(k)) {
            test_set <- which(folds == i)
            x_train <- slice(x, -test_set)
            y_train <- y[-test_set]
            x_test <- slice(x, test_set)
            y_test <- y[test_set]
            start <- Sys.time()
            mod <- do.call(what = model,
                           args = c(list(x = x_train, y = y_train), param_list))
            # training time in seconds, surfaced as a "time" column
            elapsed <- round(as.numeric(difftime(Sys.time(), start,
                                                 units = "secs")), 2)
            names(elapsed) <- "time"
            y_pred <- predict(mod, x_test)
            # accepts any evaluation function that takes predicted and test
            # vectors (plus by_class) as inputs
            met <- lapply(fun, function(f)
                do.call(what = f, args = list(y_pred, y_test, by_class)))
            if (is.null(names(met))) names(met) <- fun
            if (length(parameters) != 0) {
                output[[w]] <- data.frame(k = i, met, param_list,
                                          as.list(elapsed), seed)
            } else {
                output[[w]] <- data.frame(k = i, met, as.list(elapsed), seed)
            }
            if (by_class) {
                output[[w]]$class <- rownames(output[[w]])
            }
            w <- w + 1
        }
    }
    output <- do.call(rbind, output)
    rownames(output) <- NULL
    total_time <- round(as.numeric(difftime(Sys.time(), total_start,
                                            units = "secs")), 2)
    attr(output, "model") <- model
    attr(output, "fun") <- fun
    attr(output, "k") <- k
    attr(output, "parameters") <- parameters
    attr(output, "nparameters") <- param_len
    attr(output, "total_time") <- total_time
    class(output) <- c("textmodel_evaluate", "data.frame")
    output
}

#' @export
textmodel_evaluate.dfm <- function(x, y, model, fun = "f1_score", k = 5,
                                   parameters = list(),
                                   seed = as.numeric(Sys.time()),
                                   time = TRUE, by_class = FALSE) {
    stopifnot(is.dfm(x))
    # dfm documents are subset on rows
    evaluate_folds(x, y, model, fun, k, parameters, seed, by_class,
                   slice = function(d, i) d[i, ])
}

#' @export
textmodel_evaluate.tokens <- function(x, y, model, fun = "f1_score", k = 5,
                                      parameters = list(),
                                      seed = as.numeric(Sys.time()),
                                      time = TRUE, by_class = FALSE) {
    stopifnot(is.tokens(x))
    # tokens objects are subset with single-bracket indexing
    evaluate_folds(x, y, model, fun, k, parameters, seed, by_class,
                   slice = function(d, i) d[i])
}

#' @seealso [textmodel_evaluate()]
#' @importFrom utils head
#' @method head textmodel_evaluate
#' @export
#'
head.textmodel_evaluate <- function(x, n = 5, ...) {
    head(as.data.frame(x), n)
}

#' @seealso [textmodel_evaluate()]
#' @method print textmodel_evaluate
#' @export
print.textmodel_evaluate <- function(x, ...) {
    cat("Evaluation of", attr(x, "model"), "using the", attr(x, "fun"),
        "function.", "\n")
}

# ---------------------------------------------------------------------------
# R/textmodel_mlp.R

#' Multilayer perceptron network (MLP) model for text classification
#'
#' This function is a wrapper for a multilayer perceptron network model with a
#' single hidden layer network with two layers, implemented in the \pkg{keras}
#' package.
#' @inheritParams quanteda.textmodels::textmodel_svm
#' @param units The number of network nodes used in the first layer of the
#'   sequential model
#' @param dropout A floating variable bound between 0 and 1.
#'   It determines the rate at which units are dropped for the linear
#'   transformation of the inputs.
#' @param optimizer optimizer used to fit model to training data, see
#'   [keras::compile.keras.engine.training.Model()]
#' @param loss objective loss function, see
#'   [keras::compile.keras.engine.training.Model()]
#' @param metrics metric used to train algorithm, see
#'   [keras::compile.keras.engine.training.Model()]
#' @param ... additional options passed to
#'   [keras::fit.keras.engine.training.Model()]
#' @keywords textmodel
#' @importFrom keras keras_model_sequential to_categorical
#' @importFrom keras layer_dense layer_activation layer_dropout compile fit
#' @seealso [save.textmodel_mlp()], [load.textmodel_mlp()]
#' @export
#' @examples
#' \dontrun{
#' # create a dataset with evenly balanced coded and uncoded immigration sentences
#' corpcoded <- corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label))
#' corpuncoded <- data_corpus_manifestosentsUK %>%
#'     corpus_subset(is.na(crowd_immigration_label) & year > 1980) %>%
#'     corpus_sample(size = ndoc(corpcoded))
#' corp <- corpcoded + corpuncoded
#'
#' # form a tf-idf-weighted dfm
#' dfmat <- dfm(corp) %>%
#'     dfm_tfidf()
#'
#' set.seed(1000)
#' tmod <- textmodel_mlp(dfmat, y = docvars(dfmat, "crowd_immigration_label"),
#'                       epochs = 5, verbose = 1)
#' pred <- predict(tmod, newdata = dfm_subset(dfmat, is.na(crowd_immigration_label)))
#' table(pred)
#' tail(texts(corpuncoded)[pred == "Immigration"], 10)
#' }
textmodel_mlp <- function(x, y, units = 512, dropout = .2,
                          optimizer = "adam",
                          loss = "categorical_crossentropy",
                          metrics = "categorical_accuracy",
                          ...) {
    UseMethod("textmodel_mlp")
}

#' @export
textmodel_mlp.dfm <- function(x, y, units = 512, dropout = .2,
                              optimizer = "adam",
                              loss = "categorical_crossentropy",
                              metrics = "categorical_accuracy", ...) {
    stopifnot(ndoc(x) == length(y))

    x <- as.dfm(x)
    y <- as.factor(y)
    result <- list(x = x, y = y, call = match.call(), classnames = levels(y))

    # drop documents with missing labels before fitting
    na_ind <- which(is.na(y))
    if (length(na_ind) > 0) {
        # message(length(na_ind), "observations with the value 'NA' were removed.")
        y <- y[-na_ind]
        x <- x[-na_ind, ]
    }

    # "one-hot" encode y
    y2 <- to_categorical(as.integer(y) - 1, num_classes = nlevels(y))

    # use keras to fit the model: dense -> dropout -> softmax output
    model <- keras_model_sequential() %>%
        layer_dense(units = units, input_shape = nfeat(x),
                    activation = "relu") %>%
        layer_dropout(rate = dropout) %>%
        layer_dense(units = nlevels(y), activation = "softmax")
    compile(model, loss = loss, optimizer = optimizer, metrics = metrics)
    history <- fit(model, x, y2, ...)

    # assemble, class, and return the result
    result <- c(result, nfeatures = nfeat(x), list(seqfitted = model))
    class(result) <- c("textmodel_mlp", "textmodel", "list")
    result
}

#' Prediction from a fitted textmodel_mlp object
#'
#' `predict.textmodel_mlp()` implements class predictions from a fitted
#' multilayer perceptron network model.
#' @param object a fitted [textmodel_mlp] model
#' @param newdata dfm on which prediction should be made
#' @param type the type of predicted values to be returned; see Value
#' @param force make `newdata`'s feature set conformant to the model terms
#' @param ... not used
#' @return `predict.textmodel_mlp` returns either a vector of class
#'   predictions for each row of `newdata` (when `type = "class"`), or
#'   a document-by-class matrix of class probabilities (when `type =
#'   "probability"`).
#' @seealso [textmodel_mlp()]
#' @keywords textmodel internal
#' @importFrom keras predict_classes predict_proba
#' @importFrom stats predict
#' @export
predict.textmodel_mlp <- function(object, newdata = NULL,
                                  type = c("class", "probability"),
                                  force = TRUE,
                                  ...) {
    quanteda:::unused_dots(...)

    type <- match.arg(type)

    if (!is.null(newdata)) {
        data <- as.dfm(newdata)
    } else {
        data <- as.dfm(object$x)
    }
    model_featnames <- colnames(object$x)
    # when predicting on the training dfm the features match by construction,
    # so conformance warnings are noise and are suppressed
    data <- if (is.null(newdata)) {
        suppressWarnings(
            quanteda.textmodels:::force_conformance(data, model_featnames,
                                                    force))
    } else {
        quanteda.textmodels:::force_conformance(data, model_featnames, force)
    }

    # NOTE(review): keras::predict_classes()/predict_proba() are deprecated in
    # recent keras/TensorFlow releases; consider migrating to predict().
    if (type == "class") {
        pred_y <- predict_classes(object$seqfitted, x = data)
        pred_y <- factor(pred_y, labels = object$classnames,
                         levels = (seq_along(object$classnames) - 1))
        names(pred_y) <- docnames(data)
    } else if (type == "probability") {
        pred_y <- predict_proba(object$seqfitted, x = data)
        colnames(pred_y) <- object$classnames
        rownames(pred_y) <- docnames(data)
    }

    pred_y
}

#' @export
#' @importFrom stats na.omit
#' @method print textmodel_mlp
print.textmodel_mlp <- function(x, ...) {
    layer_names <- gsub(pattern = "_\\d*", "",
                        lapply(x$seqfitted$layers, function(z) z$name))
    cat("\nCall:\n")
    print(x$call)
    # BUG FIX: x$nfeatures is already a count; length(x$nfeatures) always
    # printed "1 fitted features"
    cat("\n",
        format(length(na.omit(x$y)), big.mark = ","), " training documents; ",
        format(x$nfeatures, big.mark = ","), " fitted features",
        ".\n",
        "Structure: ", paste(layer_names, collapse = " -> "), "\n",
        sep = "")
}

#' summary method for textmodel_mlp objects
#' @param object output from [textmodel_mlp()]
#' @param ... additional arguments not used
#' @keywords textmodel internal
#' @method summary textmodel_mlp
#' @export
summary.textmodel_mlp <- function(object, ...) {
    # strip keras's numeric suffixes ("dense_3" -> "dense") for readability
    layer_names <- gsub(pattern = "_\\d*", "",
                        lapply(object$seqfitted$layers, function(lyr) lyr$name))

    result <- list(
        "call" = object$call,
        "model structure" = paste(layer_names, collapse = " -> ")
    )
    as.summary.textmodel(result)
}

#' @export
#' @method print predict.textmodel_mlp
print.predict.textmodel_mlp <- function(x, ...) {
    print(unclass(x))
}

#' Load or save keras-based textmodels
#'
#' Functions for loading and saving \pkg{keras}-based models. Because these are
#' stored as references, they need to be "serialized" prior to saving, or
#' serialized upon loading. This applies to models fit using
#' [textmodel_cnnlstmemb()] and [textmodel_mlp()].
#' @param x a \pkg{keras}-based fitted textmodel
#' @param ... additional arguments passed to [save()] or [load()]
#' @importFrom keras serialize_model
#' @keywords internal
#' @export
#' @method save textmodel_mlp
save.textmodel_mlp <- function(x, ...) {
    # keras models are external references; serialize before save()
    x$seqfitted <- serialize_model(x$seqfitted)
    save(x, ...)
}

#' @rdname save.textmodel_mlp
#' @importFrom keras unserialize_model
#' @method load textmodel_mlp
#' @export
load.textmodel_mlp <- function(x, ...) {
    # load() restores the object under the name it was saved with ("x"),
    # overwriting the path argument in this frame
    load(x, ...)
    x$seqfitted <- unserialize_model(x$seqfitted)
    # BUG FIX: return the restored textmodel; previously the function's value
    # was that of the assignment above, i.e. only the keras model
    x
}

# ---------------------------------------------------------------------------
# R/tokens2sequences.R

#' \[Experimental\] Convert quanteda tokens to keras sequences
#'
#' This function converts a \pkg{quanteda} [quanteda::tokens()] object
#' into a tokens sequence object as expected by some functions in the
#' \pkg{keras} package.
#' @param x [quanteda::tokens()] object
#' @param maxsenlen the maximum sentence length kept in output matrix
#' @param keepn the maximum number of features to keep
#' @return [tokens2sequences()] The output matrix has a number of rows
#'   which represent each tokenized sentence input into the function and a
#'   number of columns determined by `maxsenlen`. The matrix contains a
#'   numeric code for every unique token kept (determined by `keepn`) and
#'   they are arranged in the same sequence indicated by the original
#'   [quanteda::tokens()] object.
#' @seealso [is.tokens2sequences()], [tokens2sequences_conform()]
#' @export
#' @examples
#' library("quanteda")
#' corp <- corpus_subset(data_corpus_inaugural, Year <= 1793)
#' corptok <- tokens(corp)
#' print(corp)
#' seqs <- tokens2sequences(corptok, maxsenlen = 200)
#' print(seqs)
tokens2sequences <- function(x, maxsenlen = 100, keepn = NULL) {
    UseMethod("tokens2sequences")
}

# Shared worker used by all tokens2sequences() methods (previously the same
# ~30 lines were duplicated in each method).
# ids:  list of integer vectors of original token ids, one per document;
#       rows of `feat` must be ordered so that row i has label1 == i, because
#       new labels are looked up positionally via feat$label[ids].
# feat: data.frame with columns features, label1 (original id), freq1
#       (corpus-wide frequency)
# Returns a "tokens2sequences" object: left-zero-padded id matrix, feature
# dictionary, and the proportions of dropped types/tokens.
t2s_build <- function(ids, feat, doc_nam, maxsenlen, keepn) {
    # truncate each document to the first maxsenlen tokens
    ids <- lapply(ids, function(d) if (length(d) > maxsenlen) d[1:maxsenlen] else d)
    # recount frequencies on the truncated sequences
    counts <- data.frame(table(unlist(ids)))
    names(counts) <- c("label1", "freq")
    feat <- merge(feat, counts, by = "label1", all.x = TRUE)
    # (merge() re-sorts by label1, so no pre-merge ordering is needed;
    # the original code's early sort on the partially-matched `data$freq`
    # column -- actually freq1 -- had no effect for the same reason)
    feat <- feat[order(feat$freq, decreasing = TRUE), ]
    feat$label <- NA
    n_avail <- sum(!is.na(feat$freq))
    if (!is.null(keepn)) {
        # never try to keep more features than actually occur
        if (keepn > n_avail) keepn <- n_avail
        feat$label[seq_len(keepn)] <- seq_len(keepn)
    } else {
        feat$label[seq_len(n_avail)] <- seq_len(n_avail)
    }
    # restore original id order so feat$label can be indexed positionally by
    # the original ids
    feat <- feat[order(feat$label1, decreasing = FALSE), ]
    # remap each document to the new frequency-based ids, dropping removed types
    ids <- lapply(ids, function(d) as.integer(na.omit(feat$label[d])))
    # left-pad with zeros so every row has exactly maxsenlen columns
    mat <- do.call("rbind", lapply(ids, function(d) {
        if (length(d) < maxsenlen)
            d <- c(rep(0L, times = maxsenlen - length(d)), d)
        d
    }))
    rownames(mat) <- doc_nam
    colnames(mat) <- as.character(seq_len(maxsenlen))
    dropped_tokens <- 1 - with(feat, sum(freq[!is.na(label)], na.rm = TRUE) /
                                   sum(freq1, na.rm = TRUE))
    dropped_types <- 1 - with(feat, length(na.omit(label)) /
                                  length(na.omit(label1)))
    # final dictionary: kept features only, ordered by new id
    feat <- feat[!is.na(feat$label), ]
    feat <- feat[order(feat$label, decreasing = FALSE),
                 c("features", "label", "freq")]
    rownames(feat) <- NULL
    output <- list(matrix = mat, nfeatures = nrow(feat), features = feat,
                   dropped_types = dropped_types,
                   dropped_tokens = dropped_tokens)
    class(output) <- "tokens2sequences"
    output
}

#' @export
tokens2sequences.tokens <- function(x, maxsenlen = 100, keepn = NULL) {
    stopifnot(is.tokens(x))
    # corpus-wide type frequencies, most common first
    tfeq <- sort(table(unlist(x)), decreasing = TRUE)
    doc_nam <- docnames(x)
    x <- unclass(x)                  # expose the underlying integer ids
    features <- attr(x, "types")
    feat <- data.frame(features = features,
                       label1 = seq_along(features),
                       freq1 = as.integer(tfeq[features]),
                       stringsAsFactors = FALSE)
    attributes(x) <- NULL
    t2s_build(x, feat, doc_nam, maxsenlen, keepn)
}

#' @export
tokens2sequences.character <- function(x, maxsenlen = 100, keepn = NULL) {
    stopifnot(is.character(x))
    # tokenize and delegate; previously this method duplicated the tokens
    # method line for line
    tokens2sequences(tokens(x), maxsenlen = maxsenlen, keepn = keepn)
}

#' @export
tokens2sequences.tokens2sequences <- function(x, maxsenlen = 100, keepn = NULL) {
    stopifnot(is.tokens2sequences(x))
    doc_nam <- rownames(x$matrix)
    feat <- x$features
    # the existing labels/frequencies become the "original" ids for rebuilding
    names(feat)[names(feat) %in% c("label", "freq")] <- c("label1", "freq1")
    m <- x$matrix
    # recover per-document id sequences, discarding the zero padding
    ids <- lapply(seq_len(nrow(m)), function(i) {
        row <- m[i, ]
        row[row != 0]
    })
    t2s_build(ids, feat, doc_nam, maxsenlen, keepn)
}

#' @seealso [tokens2sequences()]
#' @export
#' @importFrom utils head
#' @method print tokens2sequences
print.tokens2sequences <- function(x, removed = FALSE, ...)
{ 170 | # calculate % sparse 171 | zeros <- sum(colSums(x$matrix == 0)) 172 | tot <- nrow(x$matrix) * ncol(x$matrix) 173 | sparse_pct <- round(zeros / tot * 100, 1) 174 | 175 | # determine max number of features to print 176 | max_n <- ifelse(ncol(x$matrix) > 10, 10, ncol(x$matrix)) 177 | 178 | # output 179 | cat("Ordered feature matrix of: ", format(nrow(x$matrix), big.mark = ","), 180 | " documents, ", format(x$nfeatures, big.mark = ","), " features ", 181 | "(", sparse_pct, "% sparse).\n\n", sep = "") 182 | #cat(nrow(x$matrix), " x ", ncol(x$matrix), 183 | # " Matrix of class \"tokens2sequences\" \n\n", sep = "") 184 | if(removed) cat("Current parameter settings resulted in the removal of", round(100*x$dropped_types, 1), "percent of types\nand", round(100*x$dropped_tokens, 1), "percent of tokens.\n\n") 185 | print(head(x$matrix[, 1:max_n], 4)) 186 | } 187 | 188 | #' Match the feature names of one tokens2sequences object to another 189 | #' 190 | #' Converts the feature names of one tokens2sequences object to those of 191 | #' another. Useful in aligning training and test sets. 
192 | #' @param x [tokens2sequences()] object that will be forced to conform 193 | #' @param y [tokens2sequences()] object whose feature names will be 194 | #' used to change token labels for `x` 195 | #' @seealso [tokens2sequences()] 196 | #' @keywords internal 197 | #' @export 198 | #' @examples 199 | #' \dontrun{ 200 | #' corpcoded <- corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label)) 201 | #' corpuncoded <- data_corpus_manifestosentsUK %>% 202 | #' corpus_subset(is.na(crowd_immigration_label) & year > 1980) %>% 203 | #' corpus_sample(size = ndoc(corpcoded)) 204 | #' 205 | #' tokx <- tokens(corpuncoded) 206 | #' toky <- tokens(corpcoded) 207 | #' 208 | #' seqx <- tokens2sequences(tokx, maxsenlen = 50, keepn = 5000) 209 | #' seqy <- tokens2sequences(toky, maxsenlen = 50, keepn = 5000) 210 | #' tokens2sequences_conform(seqx, seqy) 211 | #' } 212 | tokens2sequences_conform <- function(x, y) { 213 | UseMethod("tokens2sequences_conform") 214 | } 215 | 216 | #' @export 217 | #' @importFrom stats na.omit 218 | tokens2sequences_conform.tokens2sequences <- function(x, y) { 219 | stopifnot(is.tokens2sequences(x) & is.tokens2sequences(y)) 220 | joint_feat <- merge(x$features, y$features[, -3], by = "features", 221 | all.x = TRUE) 222 | joint_feat <- joint_feat[order(joint_feat$label.x, decreasing = FALSE), ] 223 | mat <- apply(x$matrix, 1, 224 | function(x) as.integer(na.omit(joint_feat$label.y[x]))) 225 | mat <- do.call("rbind", lapply(mat, function(y) 226 | if (length(y) >= ncol(x$matrix)) 227 | y[1:ncol(x$matrix)] 228 | else 229 | c(rep(0, times = ncol(x$matrix) - length(y)), y) 230 | )) 231 | rownames(mat) <- rownames(x$matrix) 232 | colnames(mat) <- colnames(x$matrix) 233 | joint_feat <- joint_feat[, c("features", "label.y", "freq")] 234 | names(joint_feat)[2] <- "label" 235 | joint_feat <- joint_feat[order(joint_feat$label, decreasing = F), ] 236 | joint_feat <- joint_feat[!is.na(joint_feat$label), ] 237 | rownames(joint_feat) <- NULL 238 | 
239 | output <- 240 | list(matrix = mat, nfeatures = nrow(joint_feat), features = joint_feat) 241 | class(output) <- c("tokens2sequences") 242 | return(output) 243 | } 244 | 245 | #' Check to see if function is a tokens2sequences type 246 | #' 247 | #' 248 | #' @param x Object that will be checked to see if it is of the type [tokens2sequences()] 249 | #' @seealso [tokens2sequences()] 250 | #' @keywords internal 251 | #' @export 252 | is.tokens2sequences <- function(x) { 253 | "tokens2sequences" %in% class(x) 254 | } 255 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | .onAttach <- function(...) { 2 | options(keras.fit_verbose = 0) 3 | options(keras.view_metrics = FALSE) 4 | } 5 | 6 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | ```{r, echo = FALSE} 6 | knitr::opts_chunk$set( 7 | collapse = TRUE, 8 | comment = "##", 9 | fig.path = "man/images/" 10 | ) 11 | ``` 12 | 13 | # quanteda.classifiers: Text classification textmodel extensions for quanteda 14 | 15 | [![CRAN Version](https://www.r-pkg.org/badges/version/quanteda.classifiers)](https://CRAN.R-project.org/package=quanteda.classifiers) 16 | [![R build status](https://github.com/quanteda/quanteda.classifiers/workflows/R-CMD-check/badge.svg)](https://github.com/quanteda/quanteda.classifiers/actions) 17 | [![Coverage status](https://codecov.io/gh/quanteda/quanteda.classifiers/branch/master/graph/badge.svg)](https://codecov.io/github/quanteda/quanteda.classifiers?branch=master) 18 | [![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://www.tidyverse.org/lifecycle/#experimental) 19 | 20 | 21 | ## Installation 22 | 23 | To install this package, use the following, which 
also installs what the R **keras** package needs in order to run. 24 | ```{r eval = FALSE} 25 | # devtools package required to install quanteda from Github 26 | devtools::install_github("quanteda/quanteda.classifiers") 27 | 28 | keras::install_keras(method = "conda") 29 | ``` 30 | 31 | ## Available classifiers 32 | 33 | This package contains two experimental methods that are built on top of the **keras** package. (The SVM models have been moved to [**quanteda.textmodels**](https://github.com/quanteda/quanteda.textmodels).) 34 | 35 | Classifier | Command 36 | --|-- 37 | Multilayer perceptron network | `textmodel_mlp()` 38 | Convolutional neural network + LSTM model fitted to word embeddings | `textmodel_cnnlstmemb()` 39 | 40 | ## Available human-annotated corpora 41 | 42 | Corpus | Name 43 | --|-- 44 | Sentence-level corpus of UK party manifestos 1945–2019, partially annotated | `data_corpus_manifestosentsUK` 45 | Large Movie Review Dataset of 50,000 annotated highly polar movie reviews for training and testing, from Maas et al. (2011) | `data_corpus_LMRD` 46 | 47 | ## Demonstration 48 | 49 | See this (very preliminary!) [performance comparison](https://htmlpreview.github.io/?https://github.com/quanteda/quanteda.classifiers/blob/master/tests/misc/test-LMRD.nb.html). 50 | 51 | 52 | ## How to cite 53 | 54 | Benoit, Kenneth, Patrick Chester, and Stefan Müller (2019). quanteda.classifiers: Models for supervised text classification. R package version 0.2. URL: https://github.com/quanteda/quanteda.classifiers. 55 | 56 | For a BibTeX entry, use the output from citation(package = "quanteda.classifiers"). 57 | 58 | ## Issues 59 | 60 | * Please file an issue (with a bug, wish list, etc.) [via GitHub](https://github.com/quanteda/quanteda.classifiers/issues). 
61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # quanteda.classifiers: Text classification textmodel extensions for quanteda 3 | 4 | [![CRAN 5 | Version](https://www.r-pkg.org/badges/version/quanteda.classifiers)](https://CRAN.R-project.org/package=quanteda.classifiers) 6 | [![R build 7 | status](https://github.com/quanteda/quanteda.classifiers/workflows/R-CMD-check/badge.svg)](https://github.com/quanteda/quanteda.classifiers/actions) 8 | [![Coverage 9 | status](https://codecov.io/gh/quanteda/quanteda.classifiers/branch/master/graph/badge.svg)](https://codecov.io/github/quanteda/quanteda.classifiers?branch=master) 10 | [![Lifecycle: 11 | experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://www.tidyverse.org/lifecycle/#experimental) 12 | 13 | ## Installation 14 | 15 | To install this package, use the following, which also installs what the 16 | R **keras** package needs in order to run. 17 | 18 | ``` r 19 | # devtools package required to install quanteda from Github 20 | devtools::install_github("quanteda/quanteda.classifiers") 21 | 22 | keras::install_keras(method = "conda") 23 | ``` 24 | 25 | ## Available classifiers 26 | 27 | This package contains two experimental methods that are built on top of 28 | the **keras** package. (The SVM models have been moved to 29 | [**quanteda.textmodels**](https://github.com/quanteda/quanteda.textmodels).) 
30 | 31 | | Classifier | Command | 32 | |---------------------------------------------------------------------|--------------------------| 33 | | Multilevel perceptron network | `textmodel_mlp()` | 34 | | Convolutional neural network + LSTM model fitted to word embeddings | `textmodel_cnnlstmemb()` | 35 | 36 | ## Available human-annotated corpora 37 | 38 | | Corpus | Name | 39 | |------------------------------------------------------------------------------------------------------------------------------|--------------------------------| 40 | | Sentence-level corpus of UK party manifestos 1945–2019, partially annotated | `data_corpus_manifestosentsUK` | 41 | | Large Movie Review Dataset of 50,000 annotated highly polar movie reviews for training and testing, from Maas et. al. (2011) | `data_corpus_LMRD` | 42 | 43 | ## Demonstration 44 | 45 | See this (very preliminary!) [performance 46 | comparison](https://htmlpreview.github.io/?https://github.com/quanteda/quanteda.classifiers/blob/master/tests/misc/test-LMRD.nb.html). 47 | 48 | ## How to cite 49 | 50 | Benoit, Kenneth, Patrick Chester, and Stefan Müller (2019). 51 | quanteda.classifiers: Models for supervised text classification. R 52 | package version 0.2. URL: . 53 | 54 | For a BibTeX entry, use the output from citation(package = 55 | “quanteda.classifiers”). 56 | 57 | ## Issues 58 | 59 | - Please file an issue (with a bug, wish list, etc.) [via 60 | GitHub](https://github.com/quanteda/quanteda.classifiers/issues). 
61 | -------------------------------------------------------------------------------- /data/data_corpus_LMRD.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quanteda/quanteda.classifiers/3aabdb708468932c4bd5d1a3d3005ae3d653c07d/data/data_corpus_LMRD.rda -------------------------------------------------------------------------------- /data/data_corpus_manifestosentsUK.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quanteda/quanteda.classifiers/3aabdb708468932c4bd5d1a3d3005ae3d653c07d/data/data_corpus_manifestosentsUK.rda -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | ACL 2 | cnnlstmemb 3 | com 4 | Crowdflower 5 | dfm 6 | docvars 7 | etc 8 | Favorable 9 | github 10 | Informedness 11 | keras 12 | Lifecycle 13 | lstm 14 | LSTM 15 | Maas 16 | Mikhaylov 17 | mlp 18 | MLP 19 | NN 20 | perceptron 21 | Pham 22 | quanteda 23 | SIE 24 | svm 25 | textmodel 26 | textmodels 27 | th 28 | -------------------------------------------------------------------------------- /man/crossval.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/crossval.R 3 | \name{crossval} 4 | \alias{crossval} 5 | \title{Cross-validate a fitted textmodel} 6 | \usage{ 7 | crossval(x, k = 5, by_class = FALSE, verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{a fitted textmodel} 11 | 12 | \item{k}{number of folds} 13 | 14 | \item{by_class}{logical; if \code{TRUE}, estimate performance score separately for 15 | each class, otherwise average across classes} 16 | 17 | \item{verbose}{logical; if \code{TRUE}, output results to the console} 18 | } 19 | \description{ 20 | Cross-validate a fitted textmodel using 
\emph{k}-fold cross-validation. 21 | } 22 | \examples{ 23 | library("quanteda") 24 | library("quanteda.textmodels") 25 | dfmat <- tokens(data_corpus_moviereviews) |> 26 | dfm() 27 | tmod <- textmodel_nb(dfmat, y = data_corpus_moviereviews$sentiment) 28 | crossval(tmod, k = 5, by_class = TRUE) 29 | crossval(tmod, k = 5, by_class = FALSE) 30 | crossval(tmod, k = 5, by_class = FALSE, verbose = TRUE) 31 | } 32 | -------------------------------------------------------------------------------- /man/data_corpus_LMRD.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{data_corpus_LMRD} 5 | \alias{data_corpus_LMRD} 6 | \title{Large Movie Review Dataset from Maas et. al. (2011)} 7 | \format{ 8 | The corpus docvars consist of: 9 | \describe{ 10 | \item{docnumber}{serial (within set and polarity) document number} 11 | \item{rating}{user-assigned movie rating on a 1-10 point integer scale} 12 | \item{set}{used for test v. training set} 13 | \item{polarity}{either \code{neg} or \code{pos} to indicate whether the 14 | movie review was negative or positive. See Maas et al (2011) for the 15 | cut-off values that governed this assignment.} 16 | } 17 | } 18 | \source{ 19 | \url{http://ai.stanford.edu/~amaas/data/sentiment/} 20 | } 21 | \usage{ 22 | data_corpus_LMRD 23 | } 24 | \description{ 25 | A corpus object containing a dataset for sentiment classification containing 26 | 25,000 highly polar movie reviews for training, and 25,000 for testing, from 27 | Maas et. al. (2011). 28 | } 29 | \references{ 30 | Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew 31 | Y. Ng, and Christopher Potts. (2011). 32 | "\href{http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf}{Learning Word Vectors for Sentiment Analysis}". The 49th Annual Meeting of the 33 | Association for Computational Linguistics (ACL 2011). 
34 | } 35 | \keyword{data} 36 | -------------------------------------------------------------------------------- /man/data_corpus_manifestosentsUK.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{data_corpus_manifestosentsUK} 5 | \alias{data_corpus_manifestosentsUK} 6 | \title{Sentence-level corpus of UK party manifestos 1945--2019, partially annotated} 7 | \format{ 8 | The corpus consists of 88,954 documents (i.e. sentences) and includes the following 9 | document-level variables: \describe{ 10 | \item{party}{factor; abbreviation of the party that wrote the manifesto.} 11 | \item{partyname}{factor; party that wrote the manifesto.} 12 | \item{year}{integer; 4-digit year of the election.} 13 | \item{crowd_econsocial_label}{factor; indicates the majority label assigned 14 | by crowd workers (Economic Policy, Social Policy, or Neither). The variable 15 | has missing values (\code{NA}) for all non-annotated manifestos.} 16 | \item{crowd_econsocial_mean}{numeric; the direction of statements coded as 17 | "Economic Policy" or "Social Policy" based on the aggregated crowd codings. 18 | The variable is the mean of the scores assigned by the workers who 19 | coded the sentence and who allocated the sentence to the "majority" 20 | category. The variable ranges from -2 to +2. 21 | 22 | For the statements aggregated as "Economic Policy", -2 corresponds to "Very 23 | left"; +2 corresponds to "Very right". For the statements aggregated as 24 | "Social Policy", -2 corresponds to "Very liberal"; +2 corresponds to "Very 25 | conservative". 
The variable has missing values (NA) for all sentences that 26 | were aggregated as "Neither" and for all non-annotated manifestos.} 27 | \item{crowd_econsocial_n}{integer; the number of coders who contributed to the 28 | mean score \code{crowd_econsocial_mean}.} 29 | \item{crowd_immigration_label}{Factor indicating whether the majority of 30 | crowd workers labelled a sentence as referring to immigration or not. The 31 | variable has missing values (\code{NA}) for all non-annotated manifestos.} 32 | \item{crowd_immigration_mean}{numeric; the direction 33 | of statements coded as "Immigration" based on the aggregated crowd codings. 34 | The variable is the mean of the scores assigned by workers who coded a 35 | sentence and who allocated the sentence to the "Immigration" category. The 36 | variable ranges from -1 ("Negative and closed immigration policy") to +1 37 | ("Favorable and open immigration policy"). The variable has missing values 38 | (\code{NA}) for all non-annotated manifestos or if a sentence was not coded as 39 | referring to immigration policy based on the aggregation of crowd codings.} 40 | \item{crowd_immigration_n}{integer; the number of coders who 41 | contributed to the 42 | mean score \code{crowd_immigration_mean}.} 43 | } 44 | 45 | A \link[quanteda:corpus]{corpus} object. 46 | } 47 | \usage{ 48 | data_corpus_manifestosentsUK 49 | } 50 | \description{ 51 | A text corpus of sentences from publicly available party manifestos from the 52 | United Kingdom, published between 1945 and 2019. Some manifesto sentences 53 | have been rated in terms of the direction of policy using crowd-sourced coders. 54 | 55 | The manifestos from the 56 | three main parties (Labour Party, Conservatives, Liberal Democrats) between 57 | 1987 and 2010 have been labelled as Economic Policy, Social 58 | Policy, or Other, and rated in terms of the direction of Economic Policy and 59 | Social Policy. 
All party 60 | manifestos from the 2010 General Election have been crowd-coded in terms of 61 | immigration policy, and the direction of immigration policy. For more 62 | information on the coding approach see 63 | \href{https://doi.org/10.1017/S0003055416000058}{Benoit et al. (2016)}. 64 | 65 | The 66 | corpus contains the aggregated crowd coding values on the level of sentences. 67 | Note that the segmentation into sentences does not always work correctly due 68 | to missing punctuation. See Examples for how to remove very short and very 69 | long sentences using \code{\link[quanteda:corpus_trim]{quanteda::corpus_trim()}}. 70 | } 71 | \examples{ 72 | \donttest{ 73 | library("quanteda") 74 | 75 | # keep only crowd coded manifestos (with respect to economic and social policy) 76 | corp_crowdeconsocial <- 77 | corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_econsocial_label)) 78 | 79 | # keep only crowd coded manifestos (with respect to immigration policy) 80 | corp_crowdimmig <- 81 | corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label)) 82 | } 83 | } 84 | \references{ 85 | Benoit, K., Conway, D., Lauderdale, B.E., Laver, M., & Mikhaylov, S. (2016). 86 | \href{https://doi.org/10.1017/S0003055416000058}{Crowd-sourced Text Analysis: Reproducible and Agile Production of Political Data}. 87 | \emph{American Political Science Review}, 100,(2), 278--295. 
88 | } 89 | \keyword{data} 90 | -------------------------------------------------------------------------------- /man/is.tokens2sequences.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokens2sequences.R 3 | \name{is.tokens2sequences} 4 | \alias{is.tokens2sequences} 5 | \title{Check to see if function is a tokens2sequences type} 6 | \usage{ 7 | is.tokens2sequences(x) 8 | } 9 | \arguments{ 10 | \item{x}{Object that will be checked to see if it is of the type \code{\link[=tokens2sequences]{tokens2sequences()}}} 11 | } 12 | \description{ 13 | Check to see if function is a tokens2sequences type 14 | } 15 | \seealso{ 16 | \code{\link[=tokens2sequences]{tokens2sequences()}} 17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /man/performance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/performance.R 3 | \name{performance} 4 | \alias{performance} 5 | \alias{precision} 6 | \alias{recall} 7 | \alias{f1_score} 8 | \alias{accuracy} 9 | \alias{balanced_accuracy} 10 | \title{Performance statistics for prediction} 11 | \usage{ 12 | performance(data, truth, by_class = TRUE, ...) 13 | 14 | precision(data, truth, by_class = TRUE, ...) 15 | 16 | recall(data, truth, by_class = TRUE, ...) 17 | 18 | f1_score(data, truth, by_class = TRUE, ...) 19 | 20 | accuracy(data, truth, ...) 21 | 22 | balanced_accuracy(data, ...) 
23 | } 24 | \arguments{ 25 | \item{data}{a table of predicted by truth, or vector of predicted labels} 26 | 27 | \item{truth}{vector of "true" labels, or if a table, \code{2} to indicate that the 28 | "true" values are in columns, or \code{1} if in rows.} 29 | 30 | \item{by_class}{logical; if \code{TRUE}, estimate performance score separately for 31 | each class, otherwise average across classes} 32 | 33 | \item{...}{not used} 34 | } 35 | \value{ 36 | named list consisting of the selected measure(s), where each element 37 | is a scalar if \code{by_class = FALSE}, or a vector named by class if \code{by_class = TRUE}. 38 | } 39 | \description{ 40 | Functions for computing performance statistics used for model 41 | evaluation. 42 | 43 | \code{performance()} computes all of the following, which are also 44 | available via specific functions: 45 | 46 | Given a 2 x 2 table with notation 47 | 48 | \tabular{rcc}{ \tab Truth \tab \cr Predicted \tab Positive \tab 49 | Negative \cr Positive \tab \emph{A} \tab \emph{B} \cr Negative \tab \emph{C} \tab \emph{D} \cr } 50 | 51 | The metrics computed here are: 52 | \itemize{ 53 | \item{precision: }{\eqn{A / (A + B)}} 54 | \item{recall: }{\eqn{A / (A + C)}} 55 | \item{\emph{F1}: }{\eqn{2 / (recall^{-1} + precision^{-1})}} 56 | \item{accuracy: }{\eqn{(A + D) / (A + B + C + D)}, or correctly predicted / all} 57 | \item{balanced_accuracy: }{mean(recall) for all categories} 58 | } 59 | } 60 | \examples{ 61 | ## Data in Table 2 of Powers (2007) 62 | 63 | lvs <- c("Relevant", "Irrelevant") 64 | tbl_2_1_pred <- factor(rep(lvs, times = c(42, 58)), levels = lvs) 65 | tbl_2_1_truth <- factor(c(rep(lvs, times = c(30, 12)), 66 | rep(lvs, times = c(30, 28))), 67 | levels = lvs) 68 | 69 | performance(tbl_2_1_pred, tbl_2_1_truth) 70 | performance(tbl_2_1_pred, tbl_2_1_truth, by_class = FALSE) 71 | performance(table(tbl_2_1_pred, tbl_2_1_truth), by_class = TRUE) 72 | 73 | precision(tbl_2_1_pred, tbl_2_1_truth) 74 | 75 | recall(tbl_2_1_pred, 
tbl_2_1_truth) 76 | 77 | f1_score(tbl_2_1_pred, tbl_2_1_truth) 78 | 79 | accuracy(tbl_2_1_pred, tbl_2_1_truth) 80 | 81 | balanced_accuracy(tbl_2_1_pred, tbl_2_1_truth) 82 | 83 | } 84 | \references{ 85 | Powers, D. (2007). "Evaluation: From Precision, Recall and F Factor to ROC, 86 | Informedness, Markedness and Correlation." \emph{Technical Report SIE-07-001}, 87 | Flinders University. 88 | } 89 | -------------------------------------------------------------------------------- /man/predict.textmodel_cnnlstmemb.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel_cnnlstmemb.R 3 | \name{predict.textmodel_cnnlstmemb} 4 | \alias{predict.textmodel_cnnlstmemb} 5 | \title{Prediction from a fitted textmodel_cnnlstmemb object} 6 | \usage{ 7 | \method{predict}{textmodel_cnnlstmemb}( 8 | object, 9 | newdata = NULL, 10 | type = c("class", "probability"), 11 | force = TRUE, 12 | ... 13 | ) 14 | } 15 | \arguments{ 16 | \item{object}{a fitted \link{textmodel_cnnlstmemb} model} 17 | 18 | \item{newdata}{dfm on which prediction should be made} 19 | 20 | \item{type}{the type of predicted values to be returned; see Value} 21 | 22 | \item{force}{make \code{newdata}'s feature set conformant to the model terms} 23 | 24 | \item{...}{not used} 25 | } 26 | \value{ 27 | \code{predict.textmodel_cnnlstmemb} returns either a vector of class 28 | predictions for each row of \code{newdata} (when \code{type = "class"}), or 29 | a document-by-class matrix of class probabilities (when \code{type = "probability"}). 30 | } 31 | \description{ 32 | \code{predict.textmodel_cnnlstmemb()} implements class predictions from a 33 | fitted long short-term memory neural network model. 
34 | } 35 | \seealso{ 36 | \code{\link[=textmodel_cnnlstmemb]{textmodel_cnnlstmemb()}} 37 | } 38 | \keyword{internal} 39 | \keyword{textmodel} 40 | -------------------------------------------------------------------------------- /man/predict.textmodel_mlp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel_mlp.R 3 | \name{predict.textmodel_mlp} 4 | \alias{predict.textmodel_mlp} 5 | \title{Prediction from a fitted textmodel_mlp object} 6 | \usage{ 7 | \method{predict}{textmodel_mlp}( 8 | object, 9 | newdata = NULL, 10 | type = c("class", "probability"), 11 | force = TRUE, 12 | ... 13 | ) 14 | } 15 | \arguments{ 16 | \item{object}{a fitted \link{textmodel_mlp} model} 17 | 18 | \item{newdata}{dfm on which prediction should be made} 19 | 20 | \item{type}{the type of predicted values to be returned; see Value} 21 | 22 | \item{force}{make \code{newdata}'s feature set conformant to the model terms} 23 | 24 | \item{...}{not used} 25 | } 26 | \value{ 27 | \code{predict.textmodel_mlp} returns either a vector of class 28 | predictions for each row of \code{newdata} (when \code{type = "class"}), or 29 | a document-by-class matrix of class probabilities (when \code{type = "probability"}). 30 | } 31 | \description{ 32 | \code{predict.textmodel_mlp()} implements class predictions from a fitted 33 | multilayer perceptron network model. 
34 | } 35 | \seealso{ 36 | \code{\link[=textmodel_mlp]{textmodel_mlp()}} 37 | } 38 | \keyword{internal} 39 | \keyword{textmodel} 40 | -------------------------------------------------------------------------------- /man/quanteda.classifiers-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/quanteda.classifiers-package.R 3 | \docType{package} 4 | \name{quanteda.classifiers-package} 5 | \alias{quanteda.classifiers} 6 | \alias{quanteda.classifiers-package} 7 | \title{quanteda.classifiers} 8 | \description{ 9 | Extensions to \pkg{quanteda} that provide supervised machine learning models 10 | for document-feature matrices. 11 | } 12 | \seealso{ 13 | Useful links: 14 | \itemize{ 15 | \item \url{https://github.com/quanteda/quanteda.classifiers} 16 | \item Report bugs at \url{https://github.com/quanteda/quanteda.classifiers/issues} 17 | } 18 | 19 | } 20 | \author{ 21 | \strong{Maintainer}: Kenneth Benoit \email{kbenoit@lse.ac.uk} 22 | 23 | Authors: 24 | \itemize{ 25 | \item Patrick Chester \email{pjc468@nyu.edu} 26 | \item Müller Stefan \email{mueller@ipz.uzh.ch} 27 | } 28 | 29 | } 30 | \keyword{internal} 31 | -------------------------------------------------------------------------------- /man/save.textmodel_mlp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel_cnnlstmemb.R, R/textmodel_mlp.R 3 | \name{save.textmodel_cnnlstmemb} 4 | \alias{save.textmodel_cnnlstmemb} 5 | \alias{load.textmodel_cnnlstmemb} 6 | \alias{save.textmodel_mlp} 7 | \alias{load.textmodel_mlp} 8 | \title{Load or save keras-based textmodels} 9 | \usage{ 10 | \method{save}{textmodel_cnnlstmemb}(x, ...) 11 | 12 | \method{load}{textmodel_cnnlstmemb}(x, ...) 13 | 14 | \method{save}{textmodel_mlp}(x, ...) 15 | 16 | \method{load}{textmodel_mlp}(x, ...) 
17 | } 18 | \arguments{ 19 | \item{x}{a \pkg{keras}-based fitted textmodel} 20 | 21 | \item{...}{additional arguments passed to \code{\link[=save]{save()}} or \code{\link[=load]{load()}}} 22 | } 23 | \description{ 24 | Functions for loading and saving \pkg{keras}-based models. Because these are 25 | stored as references, they need to be "serialized" prior to saving, or 26 | serialized upon loading. This applies to models fit using 27 | \code{\link[=textmodel_cnnlstmemb]{textmodel_cnnlstmemb()}} and \code{\link[=textmodel_mlp]{textmodel_mlp()}}. 28 | } 29 | \keyword{internal} 30 | -------------------------------------------------------------------------------- /man/summary.textmodel_cnnlstmemb.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel_cnnlstmemb.R 3 | \name{summary.textmodel_cnnlstmemb} 4 | \alias{summary.textmodel_cnnlstmemb} 5 | \title{summary method for textmodel_cnnlstmemb objects} 6 | \usage{ 7 | \method{summary}{textmodel_cnnlstmemb}(object, ...) 8 | } 9 | \arguments{ 10 | \item{object}{output from \code{\link[=textmodel_cnnlstmemb]{textmodel_cnnlstmemb()}}} 11 | 12 | \item{...}{additional arguments not used} 13 | } 14 | \description{ 15 | summary method for textmodel_cnnlstmemb objects 16 | } 17 | \keyword{internal} 18 | \keyword{textmodel} 19 | -------------------------------------------------------------------------------- /man/summary.textmodel_mlp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel_mlp.R 3 | \name{summary.textmodel_mlp} 4 | \alias{summary.textmodel_mlp} 5 | \title{summary method for textmodel_mlp objects} 6 | \usage{ 7 | \method{summary}{textmodel_mlp}(object, ...) 
8 | } 9 | \arguments{ 10 | \item{object}{output from \code{\link[=textmodel_mlp]{textmodel_mlp()}}} 11 | 12 | \item{...}{additional arguments not used} 13 | } 14 | \description{ 15 | summary method for textmodel_mlp objects 16 | } 17 | \keyword{internal} 18 | \keyword{textmodel} 19 | -------------------------------------------------------------------------------- /man/textmodel_cnnlstmemb.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel_cnnlstmemb.R 3 | \name{textmodel_cnnlstmemb} 4 | \alias{textmodel_cnnlstmemb} 5 | \title{[Experimental] Convolutional NN + LSTM model fitted to word embeddings} 6 | \usage{ 7 | textmodel_cnnlstmemb( 8 | x, 9 | y, 10 | dropout1 = 0.2, 11 | dropout2 = 0.2, 12 | dropout3 = 0.2, 13 | dropout4 = 0.2, 14 | wordembeddim = 30, 15 | cnnlayer = TRUE, 16 | filter = 48, 17 | kernel_size = 5, 18 | pool_size = 4, 19 | units_lstm = 128, 20 | words = NULL, 21 | maxsenlen = 100, 22 | optimizer = "adam", 23 | loss = "categorical_crossentropy", 24 | metrics = "categorical_accuracy", 25 | ... 26 | ) 27 | } 28 | \arguments{ 29 | \item{x}{tokens object} 30 | 31 | \item{y}{vector of training labels associated with each document identified 32 | in \code{train}. (These will be converted to factors if not already 33 | factors.)} 34 | 35 | \item{dropout1}{A floating variable bound between 0 and 1. It determines the 36 | rate at which units are dropped for the linear transformation of the 37 | inputs for the embedding layer.} 38 | 39 | \item{dropout2}{A floating variable bound between 0 and 1. It determines the 40 | rate at which units are dropped for the linear transformation of the 41 | inputs for the CNN layer.} 42 | 43 | \item{dropout3}{A floating variable bound between 0 and 1. 
It determines the 44 | rate at which units are dropped for the linear transformation of the 45 | inputs for the recurrent layer.} 46 | 47 | \item{dropout4}{A floating variable bound between 0 and 1. It determines the 48 | rate at which units are dropped for the linear transformation of the 49 | inputs for the recurrent layer.} 50 | 51 | \item{wordembeddim}{The number of word embedding dimensions to be fit} 52 | 53 | \item{cnnlayer}{A logical parameter that allows user to include or exclude a 54 | convolutional layer in the neural network model} 55 | 56 | \item{filter}{The number of output filters in the convolution} 57 | 58 | \item{kernel_size}{An integer or list of a single integer, specifying the 59 | length of the 1D convolution window} 60 | 61 | \item{pool_size}{Size of the max pooling windows. 62 | \code{\link[keras:layer_max_pooling_1d]{keras::layer_max_pooling_1d()}}} 63 | 64 | \item{units_lstm}{The number of nodes of the lstm layer} 65 | 66 | \item{words}{The maximum number of words used to train model. Defaults to the 67 | number of features in \code{x}} 68 | 69 | \item{maxsenlen}{The maximum sentence length of training data} 70 | 71 | \item{optimizer}{optimizer used to fit model to training data, see 72 | \code{\link[keras:compile.keras.engine.training.Model]{keras::compile.keras.engine.training.Model()}}} 73 | 74 | \item{loss}{objective loss function, see 75 | \code{\link[keras:compile.keras.engine.training.Model]{keras::compile.keras.engine.training.Model()}}} 76 | 77 | \item{metrics}{metric used to train algorithm, see 78 | \code{\link[keras:compile.keras.engine.training.Model]{keras::compile.keras.engine.training.Model()}}} 79 | 80 | \item{...}{additional options passed to 81 | \code{\link[keras:fit.keras.engine.training.Model]{keras::fit.keras.engine.training.Model()}}} 82 | } 83 | \description{ 84 | A function that combines a convolutional neural network layer with a long 85 | short-term memory layer. 
It is designed to incorporate word sequences, 86 | represented as sequentially ordered word embeddings, into text 87 | classification. The model takes as an input a \pkg{quanteda} tokens object. 88 | } 89 | \examples{ 90 | \dontrun{ 91 | # create dataset with evenly balanced coded & uncoded immigration sentences 92 | corpcoded <- corpus_subset(data_corpus_manifestosentsUK, 93 | !is.na(crowd_immigration_label)) 94 | corpuncoded <- data_corpus_manifestosentsUK \%>\% 95 | corpus_subset(is.na(crowd_immigration_label) & year > 1980) \%>\% 96 | corpus_sample(size = ndoc(corpcoded)) 97 | corp <- corpcoded + corpuncoded 98 | 99 | tok <- tokens(corp) 100 | 101 | tmod <- textmodel_cnnlstmemb(tok, 102 | y = docvars(tok, "crowd_immigration_label"), 103 | epochs = 5, verbose = 1) 104 | 105 | newdata = tokens_subset(tok, subset = is.na(crowd_immigration_label)) 106 | pred <- predict(tmod, newdata = newdata) 107 | table(pred) 108 | tail(texts(corpuncoded)[pred == "Immigration"], 10) 109 | 110 | } 111 | } 112 | \seealso{ 113 | \code{\link[=save.textmodel_cnnlstmemb]{save.textmodel_cnnlstmemb()}}, \code{\link[=load.textmodel_cnnlstmemb]{load.textmodel_cnnlstmemb()}} 114 | } 115 | \keyword{textmodel} 116 | -------------------------------------------------------------------------------- /man/textmodel_evaluate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel_evaluate.R 3 | \name{textmodel_evaluate} 4 | \alias{textmodel_evaluate} 5 | \title{Model evaluation function} 6 | \usage{ 7 | textmodel_evaluate( 8 | x, 9 | y, 10 | model, 11 | fun = "f1_score", 12 | k = 5, 13 | parameters = list(), 14 | seed = as.numeric(Sys.time()), 15 | time = TRUE, 16 | by_class = FALSE 17 | ) 18 | } 19 | \arguments{ 20 | \item{x}{the \link{dfm} or \link{tokens} object on which the model will be 21 | fit. 
Does not need to contain only the training documents.} 22 | 23 | \item{y}{vector of training labels associated with each document identified 24 | in \code{train}. (These will be converted to factors if not already 25 | factors.)} 26 | 27 | \item{model}{the name of the machine learning function that will be evaluated} 28 | 29 | \item{fun}{the name of the function that will be used to evaluate the machine 30 | learning model. Can take the values "accuracy", "precision", "recall", or 31 | "f1_score"} 32 | 33 | \item{k}{number of folds} 34 | 35 | \item{parameters}{model hyperparameters} 36 | 37 | \item{seed}{a seed that can allow for replication of k training data splits. 38 | If seed is not provided a seed is chosen based on the current time.} 39 | 40 | \item{time}{a logical parameter that determines whether output will include 41 | training time (in seconds) of model} 42 | 43 | \item{by_class}{estimates a separate value of provided evaluation function 44 | for every class of the true vector} 45 | } 46 | \description{ 47 | Designed to streamline the parameter tuning and evaluation process. Users 48 | chose a function to evaluate and include parameter values as a list. If 49 | multiple parameter values are provided, the function will perform a grid 50 | search by estimating a separate model for every combination of parameters. 
51 | } 52 | \examples{ 53 | # evaluate immigration classification performance 54 | \dontrun{ 55 | dfmat <- dfm(data_corpus_manifestosentsUK) 56 | codes <- docvars(data_corpus_manifestosentsUK, "crowd_immigration_label") 57 | evaluation <- textmodel_evaluate(dfmat, codes, k = 3, 58 | model = "textmodel_mlp", fun = "f1_score", 59 | parameters = list(epochs = c(3, 4))) 60 | head(evaluation) 61 | aggregate(evaluation, by = list(evaluation$cost), FUN = "mean") 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /man/textmodel_mlp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel_mlp.R 3 | \name{textmodel_mlp} 4 | \alias{textmodel_mlp} 5 | \title{Multilayer perceptron network (MLP) model for text classification} 6 | \usage{ 7 | textmodel_mlp( 8 | x, 9 | y, 10 | units = 512, 11 | dropout = 0.2, 12 | optimizer = "adam", 13 | loss = "categorical_crossentropy", 14 | metrics = "categorical_accuracy", 15 | ... 16 | ) 17 | } 18 | \arguments{ 19 | \item{x}{the \link{dfm} on which the model will be fit. Does not need to 20 | contain only the training documents.} 21 | 22 | \item{y}{vector of training labels associated with each document identified 23 | in \code{train}. (These will be converted to factors if not already 24 | factors.)} 25 | 26 | \item{units}{The number of network nodes used in the first layer of the 27 | sequential model} 28 | 29 | \item{dropout}{A floating variable bound between 0 and 1. 
It determines the 30 | rate at which units are dropped for the linear transformation of the 31 | inputs.} 32 | 33 | \item{optimizer}{optimizer used to fit model to training data, see 34 | \code{\link[keras:compile.keras.engine.training.Model]{keras::compile.keras.engine.training.Model()}}} 35 | 36 | \item{loss}{objective loss function, see 37 | \code{\link[keras:compile.keras.engine.training.Model]{keras::compile.keras.engine.training.Model()}}} 38 | 39 | \item{metrics}{metric used to train algorithm, see 40 | \code{\link[keras:compile.keras.engine.training.Model]{keras::compile.keras.engine.training.Model()}}} 41 | 42 | \item{...}{additional options passed to 43 | \code{\link[keras:fit.keras.engine.training.Model]{keras::fit.keras.engine.training.Model()}}} 44 | } 45 | \description{ 46 | This function is a wrapper for a multilayer perceptron network model with a 47 | single hidden layer network with two layers, implemented in the \pkg{keras} 48 | package. 49 | } 50 | \examples{ 51 | \dontrun{ 52 | # create a dataset with evenly balanced coded and uncoded immigration sentences 53 | corpcoded <- corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label)) 54 | corpuncoded <- data_corpus_manifestosentsUK \%>\% 55 | corpus_subset(is.na(crowd_immigration_label) & year > 1980) \%>\% 56 | corpus_sample(size = ndoc(corpcoded)) 57 | corp <- corpcoded + corpuncoded 58 | 59 | # form a tf-idf-weighted dfm 60 | dfmat <- dfm(corp) \%>\% 61 | dfm_tfidf() 62 | 63 | set.seed(1000) 64 | tmod <- textmodel_mlp(dfmat, y = docvars(dfmat, "crowd_immigration_label"), 65 | epochs = 5, verbose = 1) 66 | pred <- predict(tmod, newdata = dfm_subset(dfmat, is.na(crowd_immigration_label))) 67 | table(pred) 68 | tail(texts(corpuncoded)[pred == "Immigration"], 10) 69 | } 70 | } 71 | \seealso{ 72 | \code{\link[=save.textmodel_mlp]{save.textmodel_mlp()}}, \code{\link[=load.textmodel_mlp]{load.textmodel_mlp()}} 73 | } 74 | \keyword{textmodel} 75 | 
-------------------------------------------------------------------------------- /man/tokens2sequences.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokens2sequences.R 3 | \name{tokens2sequences} 4 | \alias{tokens2sequences} 5 | \title{[Experimental] Convert quanteda tokens to keras sequences} 6 | \usage{ 7 | tokens2sequences(x, maxsenlen = 100, keepn = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{\code{\link[quanteda:tokens]{quanteda::tokens()}} object} 11 | 12 | \item{maxsenlen}{the maximum sentence length kept in output matrix} 13 | 14 | \item{keepn}{the maximum number of features to keep} 15 | } 16 | \value{ 17 | \code{\link[=tokens2sequences]{tokens2sequences()}} The output matrix has a number of rows 18 | which represent each tokenized sentence input into the function and a 19 | number of columns determined by \code{maxsenlen}. The matrix contains a 20 | numeric code for every unique token kept (determined by \code{keepn}) and 21 | they are arranged in the same sequence indicated by the original 22 | \code{\link[quanteda:tokens]{quanteda::tokens()}} object. 23 | } 24 | \description{ 25 | This function converts a \pkg{quanteda} \code{\link[quanteda:tokens]{quanteda::tokens()}} object 26 | into a tokens sequence object as expected by some functions in the 27 | \pkg{keras} package. 
28 | } 29 | \examples{ 30 | library("quanteda") 31 | corp <- corpus_subset(data_corpus_inaugural, Year <= 1793) 32 | corptok <- tokens(corp) 33 | print(corp) 34 | seqs <- tokens2sequences(corptok, maxsenlen = 200) 35 | print(seqs) 36 | } 37 | \seealso{ 38 | \code{\link[=is.tokens2sequences]{is.tokens2sequences()}}, \code{\link[=tokens2sequences_conform]{tokens2sequences_conform()}} 39 | } 40 | -------------------------------------------------------------------------------- /man/tokens2sequences_conform.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokens2sequences.R 3 | \name{tokens2sequences_conform} 4 | \alias{tokens2sequences_conform} 5 | \title{Match the feature names of one tokens2sequences object to another} 6 | \usage{ 7 | tokens2sequences_conform(x, y) 8 | } 9 | \arguments{ 10 | \item{x}{\code{\link[=tokens2sequences]{tokens2sequences()}} object that will be forced to conform} 11 | 12 | \item{y}{\code{\link[=tokens2sequences]{tokens2sequences()}} object whose feature names will be 13 | used to change token labels for \code{x}} 14 | } 15 | \description{ 16 | Converts the feature names of one tokens2sequences object to those of 17 | another. Useful in aligning training and test sets. 
18 | } 19 | \examples{ 20 | \dontrun{ 21 | corpcoded <- corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label)) 22 | corpuncoded <- data_corpus_manifestosentsUK \%>\% 23 | corpus_subset(is.na(crowd_immigration_label) & year > 1980) \%>\% 24 | corpus_sample(size = ndoc(corpcoded)) 25 | 26 | tokx <- tokens(corpuncoded) 27 | toky <- tokens(corpcoded) 28 | 29 | seqx <- tokens2sequences(tokx, maxsenlen = 50, keepn = 5000) 30 | seqy <- tokens2sequences(toky, maxsenlen = 50, keepn = 5000) 31 | tokens2sequences_conform(seqx, seqy) 32 | } 33 | } 34 | \seealso{ 35 | \code{\link[=tokens2sequences]{tokens2sequences()}} 36 | } 37 | \keyword{internal} 38 | -------------------------------------------------------------------------------- /quanteda.classifiers.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | PackageCheckArgs: --as-cran 19 | PackageRoxygenize: rd,collate,namespace,vignette 20 | -------------------------------------------------------------------------------- /tests/data_creation/create_data_corpus_manifestosentUK.R: -------------------------------------------------------------------------------- 1 | ######################## 2 | ### Create UK election manifesto dataset on the level of sentences 3 | ######################## 4 | 5 | # load packages 6 | library(dplyr) 7 | library(tidyr) 8 | library(car) 9 | library(readtext) 10 | library(quanteda) 11 | library(quanteda.corpora) 12 | library(spacyr) 13 | 14 | ######################## 15 | ### 1. 
Load and aggregate crowdcoded data ---- 16 | ######################## 17 | 18 | ## 1.1 Load APSR data on economic policy/social policy/neither ---- 19 | 20 | data_readtext_uk_econsocial <- readtext("tests/data_creation/data_uk_policyarea.zip", 21 | ignore_missing_files = TRUE, encoding = "utf-8") 22 | 23 | # exclude screeners from data frame using a regular expression 24 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>% 25 | dplyr::filter(!grepl('Code this sentence as', sentence_text)) 26 | 27 | ## For some sentences, special characters are displayed differently depending on the 28 | ## crowd coding job. Therefore, I remove sentence_text here and merge it later 29 | ## using "master.sentences.Rdata" which does not contain these encoding errors 30 | 31 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>% 32 | select(-sentence_text) 33 | 34 | # load "master sentences" from the APSR replication data with metadata on manifestos and years 35 | load("tests/data_creation/master.sentences.Rdata") 36 | 37 | sentences_metadata <- sentences %>% 38 | select(sentenceid, manifestoid, party, year) # select relevant variables 39 | 40 | # merge metadata on manifestos with crowd coded texts 41 | data_readtext_uk_econsocial <- left_join(data_readtext_uk_econsocial, sentences_metadata, by = "sentenceid") 42 | 43 | # create numeric indicator for aggregation of policy area 44 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>% 45 | mutate(class_policyarea_num = ifelse(policy_area == 1, 0, 46 | ifelse(policy_area == 2, -1, 47 | ifelse(policy_area == 3, 1, NA)))) 48 | 49 | # create numeric indicator for aggregation of direction 50 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>% 51 | mutate(class_policyarea_direction_num = ifelse(!is.na(soc_scale), soc_scale, 52 | ifelse(!is.na(econ_scale), econ_scale, NA))) 53 | 54 | # aggregate data to the level of sentences 55 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>% 56 | 
group_by(manifestoid, sentenceid) %>% 57 | mutate(crowd_econsocial_n = n(), 58 | class_policyarea_mean = mean(class_policyarea_num, na.rm = TRUE)) 59 | 60 | # create variable with the policy area based on the aggregated coding 61 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>% 62 | mutate(crowd_econsocial_label = ifelse(class_policyarea_mean < 0, "Economic", 63 | ifelse(class_policyarea_mean == 0, "Not Economic or Social", 64 | ifelse(class_policyarea_mean > 0, "Social", NA)))) 65 | 66 | 67 | # here I make sure that the policy direction mean only takes 68 | # into account the majority category and not also the minority category position values 69 | data_readtext_uk_econsocial <- data_readtext_uk_econsocial %>% 70 | mutate(class_policy_direction_num = 71 | ifelse(crowd_econsocial_label == "Economic" & class_policyarea_num == -1, class_policyarea_direction_num, 72 | ifelse(crowd_econsocial_label == "Social" & class_policyarea_num == 1, class_policyarea_direction_num, 73 | NA))) 74 | 75 | # aggregate data to the level of sentences 76 | data_uk_econsocial <- data_readtext_uk_econsocial %>% 77 | group_by(manifestoid, sentenceid, crowd_econsocial_n, crowd_econsocial_label) %>% 78 | summarise(crowd_econsocial_mean = mean(class_policy_direction_num, na.rm = TRUE)) %>% 79 | ungroup() 80 | 81 | # separate the variable manifestoid into Party and Year 82 | data_uk_econsocial <- data_uk_econsocial %>% 83 | separate(manifestoid, into = c("Party", "Year"), 84 | remove = FALSE) 85 | 86 | # merge text from "sentences" data frame which does not contain encoding errors 87 | dat_sentences <- sentences %>% 88 | select(sentence_text, sentenceid) 89 | 90 | data_uk_econsocial <- left_join(data_uk_econsocial, dat_sentences, 91 | by = "sentenceid") 92 | 93 | # select some of the variables and add three additional variables 94 | data_uk_man_econsocial <- data_uk_econsocial %>% 95 | ungroup() %>% 96 | select(text = sentence_text, sentenceid, Party, Year, 97 | 
crowd_econsocial_label, crowd_econsocial_mean, 98 | crowd_econsocial_n) 99 | 100 | # replace nan values in class_direction_mean with na 101 | data_uk_man_econsocial$crowd_econsocial_mean[is.nan(data_uk_man_econsocial$crowd_econsocial_mean)] <- NA 102 | 103 | 104 | ## 1.2 Load APSR data on immigration ---- 105 | data_readtext_uk_immigration <- readtext("tests/data_creation/data_uk_immigration.zip", 106 | ignore_missing_files = TRUE) 107 | 108 | # exclude screeners 109 | data_readtext_uk_immigration <- data_readtext_uk_immigration %>% 110 | subset(manifestoid != "screener") 111 | 112 | # use manifestoid to create a party and year variable 113 | data_readtext_uk_immigration <- data_readtext_uk_immigration %>% 114 | separate(manifestoid, into = c("Party", "Year")) 115 | 116 | # get the class 117 | data_readtext_uk_immigration <- data_readtext_uk_immigration %>% 118 | mutate(class_immigration_num = ifelse(policy_area == 4, 1, 0)) 119 | 120 | # aggregate to the level of sentences 121 | data_uk_immigration <- data_readtext_uk_immigration %>% 122 | group_by(Party, Year, sentenceid, sentence_text) %>% 123 | summarise(crowd_immigration_n = n(), 124 | class_immigration_num = mean(class_immigration_num, na.rm = TRUE), 125 | crowd_immigration_mean = mean(immigr_scale, na.rm = TRUE)) 126 | 127 | # replace nan values in class_direction_mean with na 128 | data_uk_immigration$crowd_immigration_mean[is.nan(data_uk_immigration$crowd_immigration_mean)] <- NA 129 | 130 | # use the average evaluations to construct the majority-rule based classification 131 | data_uk_immigration <- data_uk_immigration %>% 132 | mutate(crowd_immigration_label = ifelse(class_immigration_num >= 0.5, "Immigration", "Not immigration")) 133 | 134 | # select some of the variables and add three additional variables 135 | data_uk_immig_man_2010 <- data_uk_immigration %>% 136 | ungroup() %>% 137 | select(text = sentence_text, sentenceid, Party, Year, crowd_immigration_label, 138 | crowd_immigration_mean, 139 | 
crowd_immigration_n) 140 | 141 | 142 | ######################## 143 | ## 1.3 Merge both crowdcoded datasets ---- 144 | ######################## 145 | 146 | # Note that the 2010 manifestos have been coded both with regards to 147 | # immigration AND econ/social/neither 148 | 149 | # full join of both crowdcoded datasets 150 | dat_uk_crowdcoded <- full_join(data_uk_man_econsocial, 151 | data_uk_immig_man_2010, by = c("sentenceid")) 152 | 153 | table(dat_uk_crowdcoded$Party.x) 154 | table(dat_uk_crowdcoded$Party.y) 155 | 156 | # rename and remove unnecessary variables 157 | dat_uk_man_crowdcoded_clean <- dat_uk_crowdcoded %>% 158 | mutate(Party = ifelse(is.na(Party.x), Party.y, Party.x)) %>% 159 | mutate(Year = ifelse(is.na(Year.x), Year.y, Year.x)) %>% 160 | mutate(text = ifelse(is.na(text.x), text.y, text.x)) %>% 161 | select(-c(Party.x, Party.y, Year.x, Year.y, text.x, text.y)) 162 | 163 | ######################## 164 | ## 2. Add data_corpus_ukmanifestos from quanteda.corpora package ---- 165 | ######################## 166 | 167 | # Since some of the manifestos are already coded in terms of social/economic/neither, 168 | # we remove these documents from the non-annotated corpus 169 | 170 | # get overview of crowdcoded manifestos 171 | manifestos_ukseconsocial <- dat_uk_man_crowdcoded_clean %>% 172 | ungroup() %>% 173 | select(Party, Year) %>% 174 | unique() %>% 175 | mutate(party_year = paste(Party, Year, sep = "_")) 176 | 177 | # create new docvar used for removing crowdcoded manifestos 178 | # from not-annotated corpus 179 | docvars(data_corpus_ukmanifestos, "party_year") <- paste( 180 | docvars(data_corpus_ukmanifestos, "Party"), 181 | docvars(data_corpus_ukmanifestos, "Year"), sep = "_" 182 | ) 183 | 184 | # remove manifestos that have been crowdcoded 185 | data_corpus_ukmanifestos_subset <- data_corpus_ukmanifestos %>% 186 | corpus_subset(!party_year %in% manifestos_ukseconsocial$party_year) 187 | 188 | ndoc(data_corpus_ukmanifestos) 189 | 
ndoc(data_corpus_ukmanifestos_subset) 190 | 191 | # check whether Lab, Con, Lib manifestos are excluded for time between 1987 and 2010 (yes!) 192 | table(docvars(data_corpus_ukmanifestos_subset, "party_year")) 193 | 194 | # only keep manifestos from national general elections (Type == "natl") 195 | data_corpus_ukmanifestos_sentences <- data_corpus_ukmanifestos_subset %>% 196 | corpus_subset(Type == "natl") 197 | 198 | # transform to data frame for easier adjustments of document-level variables 199 | data_uk_man <- data.frame( 200 | doc_id = docnames(data_corpus_ukmanifestos_sentences), 201 | text = texts(data_corpus_ukmanifestos_sentences), 202 | docvars(data_corpus_ukmanifestos_sentences), 203 | stringsAsFactors = FALSE 204 | ) 205 | 206 | # use spacyr to tokenize manifestos to sentence-level 207 | spacy_initialize(model = "en") 208 | 209 | data_uk_man_sentences <- spacy_tokenize(data_uk_man, 210 | remove_separators = FALSE, 211 | what = "sentence", 212 | output = "data.frame") 213 | 214 | data_uk_man_sentences <- rename(data_uk_man_sentences, text = token) 215 | 216 | 217 | # merge metadata 218 | data_uk_man_sentences <- left_join(data_uk_man_sentences, 219 | select(data_uk_man, -c("text")), 220 | by = "doc_id") 221 | 222 | 223 | ######################## 224 | ## 3. 
Import additional manifestos from 2010, 2015, and 2017 elections ---- 225 | ######################## 226 | 227 | ## Note: data retrieved from http://polidoc.net (2010 and 2015) and from parties' websites (2019) 228 | 229 | data_uk_1519 <- readtext("tests/data_creation/data_uk_manifestos_2015-2019.zip", 230 | ignore_missing_files = TRUE, 231 | encoding = "utf-8", 232 | docvarsfrom = "filenames", 233 | docvarnames = c("Year", "Party")) 234 | 235 | 236 | # tokenize documents to the level of sentences 237 | data_uk_man_1519_sentences <- spacy_tokenize(data_uk_1519, 238 | remove_separators = FALSE, 239 | what = "sentence", 240 | output = "data.frame") 241 | 242 | # merge metadata 243 | data_uk_man_1519_sentences <- left_join(data_uk_man_1519_sentences, 244 | select(data_uk_1519, -c("text")), 245 | by = "doc_id") 246 | 247 | data_uk_man_1519_sentences <- rename(data_uk_man_1519_sentences, text = token) 248 | 249 | 250 | ######################## 251 | ## 4. Combine manifestos and create corpus ---- 252 | ######################## 253 | 254 | dat_uk_man_crowdcoded_clean$Year <- as.integer(dat_uk_man_crowdcoded_clean$Year) 255 | 256 | data_uk_manifestos <- bind_rows(data_uk_man_sentences, 257 | dat_uk_man_crowdcoded_clean, 258 | data_uk_man_1519_sentences) 259 | 260 | 261 | # create unique id for each sentence and select only necessary variables 262 | data_uk_manifestos_selectvars <- data_uk_manifestos %>% 263 | select(-c(party_year, sentenceid)) %>% 264 | mutate(year = as.factor(Year)) %>% 265 | select(text, 266 | party = Party, 267 | year, 268 | starts_with("crowd_")) 269 | 270 | # create a quanteda corpus 271 | corp <- corpus(data_uk_manifestos_selectvars) 272 | 273 | # count the number of tokens per sentence (after removing punctuation characters) 274 | docvars(corp, "ntoken_sent") <- ntoken(corp, remove_punct = TRUE) 275 | 276 | # remove observation with 0 tokens (i.e. 
only punctuation character) 277 | corp_small <- corp %>% 278 | corpus_subset(ntoken_sent > 0) 279 | 280 | # transform back to data frame to create nice doc_id after having removed "empty" sentences 281 | dat_corpus <- data.frame( 282 | text = texts(corp_small), 283 | docvars(corp_small), 284 | stringsAsFactors = FALSE 285 | ) 286 | 287 | dat_corpus <- dat_corpus %>% 288 | arrange(year, party) %>% 289 | group_by(party, year) %>% 290 | mutate(sentence_no = 1:n()) %>% 291 | mutate(doc_id = paste(party, year, sentence_no, sep = "_")) %>% 292 | ungroup() 293 | 294 | # harmonize party abbreviation aliases before recoding to full party names 295 | dat_corpus_renamed <- dat_corpus %>% 296 | mutate(party = ifelse(party == "Comm", "CP", party)) %>% 297 | mutate(party = ifelse(party == "Gr", "Greens", party)) %>% 298 | mutate(party = ifelse(party == "OMRL", "MRLP", party)) %>% 299 | mutate(party = ifelse(party == "PCy", "PC", party)) 300 | 301 | # lookup table mapping party abbreviations to full names, for use with car::recode() 302 | recode_party <- c("'Coalition'='Coalition Agreement'; 303 | 'BNP'='British National Party'; 304 | 'CAP'='Community Action Party'; 305 | 'Con'='Conservative Party'; 306 | 'Comm'='Communist Party'; 307 | 'CP'='Communist Party'; 308 | 'Dem'='Democratic Party'; 309 | 'DUP'='Democratic Unionist Party'; 310 | 'EDP'='English Democrats'; 311 | 'EIP'='English Independence Party'; 312 | 'FSP'='Free Scotland Party'; 313 | 'FW'='Forward Wales'; 314 | 'Greens'='Green Party'; 315 | 'IGV'='Independent Green Voice'; 316 | 'LA'='Left Alliance'; 317 | 'Lab'='Labour Party'; 318 | 'Lib'='Liberal Party'; 319 | 'LD'='Liberal Democrats'; 320 | 'LibSDP'='Social Democratic Party'; 321 | 'MK'='Mebyon Kernow - the Party for Cornwall'; 322 | 'MRLP'='Official Monster Raving Loony Party'; 323 | 'ND'='National Democrats'; 324 | 'NIA'='Northern Ireland Alliance'; 325 | 'PA'='Prolife Alliance'; 326 | 'PC'='Plaid Cymru'; 327 | 'PP'='Peace Party'; 328 | 'PUP'='Progressive Unionist Party'; 329 | 'PVP'='Protest Vote Party'; 330 | 'Resp'='Respect'; 331 | 'RT'='Richard Taylor Personal Manifesto'; 332 | 'Scon'='Scottish 
Conservative Party'; 333 | 'SDLP'='Social Democratic and Labour Party'; 334 | 'SEP'='Socialist Equality Party'; 335 | 'SF'='Sinn Féin'; 336 | 'SGr'='Scottish Green Party'; 337 | 'SLab'='Scottish Labour Party'; 338 | 'SLD'='Scottish Liberal Democrats'; 339 | 'SNP'='Scottish National Party'; 340 | 'SP'='Socialist Party'; 341 | 'SCon'='Scottish Conservative Party'; 342 | 'SSoc'='Scottish Socialist Party'; 343 | 'SSP'='Scottish Socialist Party'; 344 | 'Stuck'='Stuckist Party'; 345 | 'TW'='Third Way'; 346 | 'UKIP'='UK Independence Party'; 347 | 'UUP'='Ulster Unionist Party'; 348 | 'Ver'='Veritas Party'") 349 | 350 | 351 | dat_corpus_renamed <- dat_corpus_renamed %>% 352 | mutate(partyname = car::recode(party, recode_party)) 353 | 354 | dat_corpus_renamed <- dat_corpus_renamed %>% 355 | select(doc_id, text, party, partyname, year, 356 | crowd_econsocial_label, crowd_econsocial_mean, 357 | crowd_econsocial_n, crowd_immigration_label, 358 | crowd_immigration_mean, 359 | crowd_immigration_n) 360 | 361 | dat_corpus_renamed$crowd_econsocial_label <- factor(dat_corpus_renamed$crowd_econsocial_label) 362 | dat_corpus_renamed$crowd_immigration_label <- factor(dat_corpus_renamed$crowd_immigration_label) 363 | dat_corpus_renamed$year <- as.integer(as.character(dat_corpus_renamed$year)) 364 | dat_corpus_renamed$party <- as.factor(dat_corpus_renamed$party) 365 | dat_corpus_renamed$partyname <- as.factor(dat_corpus_renamed$partyname) 366 | dat_corpus_renamed$crowd_immigration_n <- as.integer(dat_corpus_renamed$crowd_immigration_n) 367 | dat_corpus_renamed$crowd_econsocial_n <- as.integer(dat_corpus_renamed$crowd_econsocial_n) 368 | 369 | # create final corpus 370 | data_corpus_manifestosentsUK <- corpus(dat_corpus_renamed, 371 | docid_field = "doc_id") 372 | 373 | # add corpus to package 374 | usethis::use_data(data_corpus_manifestosentsUK, overwrite = TRUE) 375 | -------------------------------------------------------------------------------- 
/tests/data_creation/data_uk_immigration.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quanteda/quanteda.classifiers/3aabdb708468932c4bd5d1a3d3005ae3d653c07d/tests/data_creation/data_uk_immigration.zip -------------------------------------------------------------------------------- /tests/data_creation/data_uk_manifestos_2015-2019.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quanteda/quanteda.classifiers/3aabdb708468932c4bd5d1a3d3005ae3d653c07d/tests/data_creation/data_uk_manifestos_2015-2019.zip -------------------------------------------------------------------------------- /tests/data_creation/data_uk_policyarea.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quanteda/quanteda.classifiers/3aabdb708468932c4bd5d1a3d3005ae3d653c07d/tests/data_creation/data_uk_policyarea.zip -------------------------------------------------------------------------------- /tests/data_creation/master.sentences.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quanteda/quanteda.classifiers/3aabdb708468932c4bd5d1a3d3005ae3d653c07d/tests/data_creation/master.sentences.Rdata -------------------------------------------------------------------------------- /tests/misc/test-LMRD.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Testing text classification using movie reviews" 3 | author: "Kenneth Benoit" 4 | output: 5 | html_notebook: 6 | toc: yes 7 | toc_float: yes 8 | --- 9 | 10 | 11 | Tests using the Large Movie Review Dataset, a dataset for sentiment classification containing 25,000 highly polar movie reviews for training, and 25,000 for testing, from Maas et. al. (2011). 12 | 13 | Source: Andrew L. Maas, Raymond E. Daly, Peter T. 
Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. (2011). "Learning Word Vectors for Sentiment Analysis". The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).

# Setting up {.tabset .tabset-fade}

```{r setup, echo=FALSE}
knitr::opts_chunk$set(echo=FALSE, warning=FALSE, message=FALSE)
```


## Test and training data

```{r}
dfmat <- dfm(data_corpus_LMRD)
dfmat_train <- dfm_trim(dfmat, min_termfreq = 100) %>%
    dfm_subset(set == "train")
dfmat_test <- dfm_trim(dfmat, min_termfreq = 100) %>%
    dfm_subset(set == "test")
```

## Some functions for evaluation

```{r}
# Summarize a confusion matrix whose [1, 1] cell is the true positives
# (predictions in rows, truth in columns).  Prints precision, recall,
# accuracy, and balanced accuracy when verbose, and invisibly returns
# c(precision, recall) so it can be used inside pipelines.
performance <- function(mytable, verbose = TRUE) {
    truePositives <- mytable[1, 1]
    trueNegatives <- sum(diag(mytable)[-1])
    falsePositives <- sum(mytable[1, ]) - truePositives
    falseNegatives <- sum(mytable[, 1]) - truePositives
    precision <- truePositives / (truePositives + falsePositives)
    recall <- truePositives / (truePositives + falseNegatives)
    accuracy <- sum(diag(mytable)) / sum(mytable)
    tnr <- trueNegatives / (trueNegatives + falsePositives)
    # balanced accuracy is the mean of sensitivity (recall/TPR) and
    # specificity (TNR); the previous version averaged precision with TNR
    balanced_accuracy <- sum(c(recall, tnr), na.rm = TRUE) / 2
    if (verbose) {
        print(mytable)
        cat(
            "\n precision =", round(precision, 2),
            "\n recall =", round(recall, 2),
            "\n accuracy =", round(accuracy, 2),
            "\n bal. acc. =", round(balanced_accuracy, 2),
            "\n"
        )
    }
    invisible(c(precision, recall))
}
```



# Naive Bayes {.tabset .tabset-fade}

## No weights

```{r}
system.time({
    tmod_nb <- textmodel_nb(dfmat_train, y = docvars(dfmat_train, "polarity"))
    pred_nb <- predict(tmod_nb, newdata = dfmat_test, type = "class")
    table(pred_nb, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```

## tf-idf weighting

```{r}
system.time({
    tmod_nb <- textmodel_nb(dfm_tfidf(dfmat_train),
                            y = docvars(dfmat_train, "polarity"))
    pred_nb <- predict(tmod_nb, newdata = dfm_tfidf(dfmat_test), type = "class")
    table(pred_nb, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```

# SVM {.tabset .tabset-fade}

## SVM

```{r}
system.time({
    tmod_svm <- textmodel_svm(dfmat_train, y = docvars(dfmat_train, "polarity"))
    pred_svm <- predict(tmod_svm, newdata = dfmat_test, type = "class")
    table(pred_svm, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```

## SVM w/tf-idf weights

```{r}
tmod_svm2 <- textmodel_svm(dfm_tfidf(dfmat_train),
                           y = docvars(dfmat_train, "polarity"))
pred_svm2 <- predict(tmod_svm2, newdata = dfm_tfidf(dfmat_test), type = "class")
table(pred_svm2, dfmat_test$polarity)[2:1, 2:1] %>% performance()
```

## SVMlin

```{r}
system.time({
    tmod_svmlin <- textmodel_svmlin(dfmat_train,
                                    y = docvars(dfmat_train, "polarity"))
    pred_svmlin <- predict(tmod_svmlin, newdata = dfmat_test, type = "class")
    # was evaluating pred_svm (copy-paste error); score the svmlin predictions
    table(pred_svmlin, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```


# Multilayer Perceptron Network (MLP)

```{r}
system.time({
    tmod_mlp <- textmodel_mlp(dfmat_train,
                              y = docvars(dfmat_train, "polarity"), epochs = 10)
    pred_mlp <-
predict(tmod_mlp, newdata = dfmat_test, type = "class") 124 | table(pred_mlp, dfmat_test$polarity)[2:1, 2:1] %>% performance() 125 | }) 126 | ``` 127 | 128 | 129 | # CNN-embedding-LSTM model 130 | 131 | ```{r} 132 | system.time({ 133 | toks_train <- data_corpus_LMRD %>% 134 | corpus_subset(set == "test") %>% 135 | tokens() %>% 136 | tokens_remove("\\p{P}", valuetype = "regex", padding = TRUE) 137 | tmod_cnn <- textmodel_cnnlstmemb(toks_train, 138 | y = docvars(dfmat_train, "polarity"), 139 | epochs = 10) 140 | 141 | toks_test <- data_corpus_LMRD %>% 142 | corpus_subset(set == "train") %>% 143 | tokens() %>% 144 | tokens_remove("\\p{P}", valuetype = "regex", padding = TRUE) 145 | pred_cnn <- predict(tmod_cnn, newdata = toks_test, type = "class") 146 | table(pred_cnn, dfmat_test$polarity) %>% performance() 147 | }) 148 | ``` 149 | 150 | -------------------------------------------------------------------------------- /tests/spelling.R: -------------------------------------------------------------------------------- 1 | if(requireNamespace('spelling', quietly = TRUE)) 2 | spelling::spell_check_test(vignettes = TRUE, error = TRUE, 3 | skip_on_cran = TRUE) 4 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library("testthat") 2 | library("quanteda.classifiers") 3 | 4 | library("quanteda") 5 | library("quanteda.textmodels") 6 | 7 | # library("keras") 8 | # use_session_with_seed(42) 9 | # see https://github.com/rstudio/keras/issues/890#issuecomment-539044011 10 | # tensorflow::tf$random$set_seed(42) 11 | 12 | test_check("quanteda.classifiers") 13 | -------------------------------------------------------------------------------- /tests/testthat/test-performance.R: -------------------------------------------------------------------------------- 1 | context("test evaluation functions") 2 | 3 | lvs <- c("Relevant", "Irrelevant") 4 | pred <- 
factor(rep(lvs, times = c(42, 58)), levels = lvs) 5 | true <- factor(c(rep(lvs, times = c(30, 12)), 6 | rep(lvs, times = c(30, 28))), levels = lvs) 7 | tab <- table(pred, true) 8 | 9 | test_that("performance works by_class = TRUE", { 10 | perf <- performance(tab, by_class = TRUE) 11 | expect_equal(perf, 12 | list(precision = c(Relevant = 0.714, 13 | Irrelevant = 0.483), 14 | recall = c(Relevant = 0.5, Irrelevant = 0.7), 15 | f1 = c(Relevant = 0.588, 16 | Irrelevant = 0.571), 17 | accuracy = 0.58, 18 | balanced_accuracy = 0.6), 19 | tol = .001 20 | ) 21 | }) 22 | 23 | test_that("performance works by_class = FALSE", { 24 | perf <- performance(tab, by_class = FALSE) 25 | expect_equal(perf, 26 | list(precision = 0.599, recall = 0.6, f1 = 0.580, 27 | accuracy = 0.58, balanced_accuracy = 0.6), 28 | tol = .001 29 | ) 30 | }) 31 | 32 | test_that("exceptions work", { 33 | perf <- performance(tab, by_class = TRUE) 34 | #expect_error( 35 | # f1_score(perf[c("precision", "accuracy")]), 36 | # "list must contain both precision and recall" 37 | #) 38 | expect_error( 39 | balanced_accuracy(perf[c("precision", "accuracy")]), 40 | "list must include recall" 41 | ) 42 | 43 | expect_error( 44 | balanced_accuracy(performance(tab, by_class = FALSE)), 45 | "recall must be computed by class" 46 | ) 47 | 48 | expect_error( 49 | quanteda.classifiers:::check_table(tab, 3), 50 | "truth must be 2 for columns or 1 for rows" 51 | ) 52 | tab2 <- tab 53 | colnames(tab2)[2] <- "dummy" 54 | expect_error( 55 | quanteda.classifiers:::check_table(tab2, 2), 56 | "predicted and truth values must have the same order and names" 57 | ) 58 | }) 59 | -------------------------------------------------------------------------------- /tests/testthat/test-textmodel_cnnlstmemb.R: -------------------------------------------------------------------------------- 1 | context("test textmodel_cnnlstmemb") 2 | 3 | test_that("the cnnlstmemb model works", { 4 | skip() 5 | skip_on_cran() 6 | 7 | 
data(data_corpus_EPcoaldebate, package = "quanteda.textmodels")
    corp <- corpus_subset(data_corpus_EPcoaldebate,
                          subset = language == "English") %>%
        corpus_sample(500)

    toks <- tokens(corp)
    # binary outcome: 1 = Pro-Subsidy, 0 = anything else
    label <- ifelse(docvars(corp, "crowd_subsidy_label") == "Pro-Subsidy", 1, 0)
    tmod <- textmodel_cnnlstmemb(toks, y = label, epochs = 8)

    expect_output(
        print(tmod),
        "Call:"
    )
    expect_equal(names(summary(tmod)), c("call", "model structure"))
    con_mat <- table(predict(tmod, type = "class"), label)
    accuracy <- sum(diag(con_mat)) / sum(con_mat)
    expect_equal(
        accuracy,
        0.87,
        tolerance = 0.1
    )
    set.seed(10)
    pred_out <- predict(tmod, type = "probability")
    pred_max <- apply(pred_out, 1, function(x) colnames(pred_out)[which.max(x)])
    con_mat <- table(pred_max, label)
    accuracy <- sum(diag(con_mat)) / sum(con_mat)
    expect_equal(
        accuracy,
        0.87,
        tolerance = 0.1
    )
})

test_that("multiclass prediction works", {
    skip()
    skip_on_cran()

    data(data_corpus_irishbudget2010, package = "quanteda.textmodels")
    toks <- tokens(data_corpus_irishbudget2010)
    y <- docvars(data_corpus_irishbudget2010, "party")
    # one missing label: the model must still predict for that document
    y[5] <- NA
    tmod2 <- textmodel_cnnlstmemb(toks, y = y)
    expect_equal(
        names(predict(tmod2, type = "class"))[5],
        "Cowen, Brian (FF)"
    )

    probmat <- predict(tmod2, type = "probability")
    expect_equal(dim(probmat), c(14, 5))
    expect_equal(rownames(probmat), docnames(toks))
    expect_equal(colnames(probmat), tmod2$classnames)
    # rows are probability distributions and must each sum to one
    expect_equal(unname(rowSums(probmat)), rep(1, nrow(probmat)),
                 tolerance = .000001)
})

test_that("cnnlstmemb works with tokens2sequences", {
    skip()
    skip_on_cran()

    data(data_corpus_irishbudget2010, package = "quanteda.textmodels")
    toks1 <- tokens2sequences(x = tokens(data_corpus_irishbudget2010),
                              keepn = 100)
    y <- docvars(data_corpus_irishbudget2010, "party")
    y[5] <- NA
    tmod2 <- textmodel_cnnlstmemb(x = toks1, y = y)
    expect_equal(
        names(predict(tmod2, type = "class"))[5],
        "Cowen, Brian (FF)"
    )

    probmat <- predict(tmod2, type = "probability")
    expect_equal(dim(probmat), c(14, 5))
    expect_equal(rownames(probmat), rownames(toks1$matrix))
    expect_equal(colnames(probmat), tmod2$classnames)
    expect_equal(unname(rowSums(probmat)), rep(1, nrow(probmat)),
                 tolerance = .000001)
})
--------------------------------------------------------------------------------
/tests/testthat/test-textmodel_evaluate.R:
--------------------------------------------------------------------------------
context("test textmodel_evaluate")

test_that("textmodel_evaluate works", {
    skip("until rewritten")
    skip_on_cran()

    data(data_corpus_EPcoaldebate, package = "quanteda.textmodels")

    set.seed(100)
    corp <- corpus_sample(data_corpus_EPcoaldebate, size = 500,
                          by = "crowd_subsidy_label")
    dfmat <- dfm(corp) %>%
        dfm_trim(min_termfreq = 10)
    labels <- docvars(dfmat, "crowd_subsidy_label")
    model_eval <- textmodel_evaluate(x = dfmat, y = labels,
                                     model = "textmodel_mlp",
                                     fun = "f1_score", k = 3, seed = 5)

    # Check outputs for consistency
    expect_equal(dim(model_eval), c(3, 4))
    expect_equal(names(model_eval), c("k", "f1_score", "time", "seed"))
    expect_equal(max(model_eval$k), 3)


    model_eval2 <- textmodel_evaluate(x = dfmat, y = labels,
                                      model = "textmodel_mlp",
                                      fun = "f1_score", k = 2,
                                      parameters = list(epochs = c(3, 4)),
                                      seed = 5)

    # Check outputs for consistency
    expect_equal(dim(model_eval2), c(4, 5))
    expect_equal(names(model_eval2),
                 c("k", "f1_score", "epochs", "time", "seed"))
    expect_equal(max(model_eval2$k), 2)

    # Check by_class
    model_eval3 <- textmodel_evaluate(x = dfmat, y = labels,
                                      model = "textmodel_mlp",
                                      fun = "recall", k =
2, seed = 5, by_class = TRUE) 31 | expect_true("class" %in% names(model_eval3)) 32 | expect_true(sum(levels(labels) %in% model_eval3$class) == 3) 33 | 34 | # Check if it works with textmodel_cnnlstm 35 | 36 | corp_tok <- corpus_sample(data_corpus_EPcoaldebate, size = 500, by = "crowd_subsidy_label") 37 | tok <- tokens(corp_tok) 38 | labels <- docvars(corp_tok, "crowd_subsidy_label") 39 | model_eval4 <- textmodel_evaluate(x = tok, y = labels, model = "textmodel_cnnlstmemb", fun = "f1_score", k = 3, seed = 5) 40 | expect_equal(dim(model_eval4), c(3, 4)) 41 | expect_equal(names(model_eval4), c("k", "f1_score", "time", "seed")) 42 | expect_equal(max(model_eval4$k), 3) 43 | expect_true(min(model_eval4$f1_score) > 0.1 & max(model_eval4$f1_score) < 1) 44 | }) 45 | 46 | -------------------------------------------------------------------------------- /tests/testthat/test-textmodel_mlp.R: -------------------------------------------------------------------------------- 1 | context("test textmodel_mlp") 2 | 3 | test_that("the mlp model works", { 4 | skip("because of tensorflow install problems") 5 | skip_on_cran() 6 | 7 | data(data_corpus_EPcoaldebate, package = "quanteda.textmodels") 8 | 9 | set.seed(100) 10 | corp_train <- corpus_sample(data_corpus_EPcoaldebate, size = 3000, by = "crowd_subsidy_label") 11 | corp_test <- corpus_sample(data_corpus_EPcoaldebate, size = 10, by = "crowd_subsidy_label") 12 | dfmat_train <- dfm(corp_train) 13 | dfmat_test <- dfm(corp_test) 14 | 15 | tmod <- textmodel_mlp(dfmat_train, y = docvars(dfmat_train, "crowd_subsidy_label"), epoch = 5) 16 | 17 | # label 18 | pred <- predict(tmod, newdata = dfmat_test, type = "class") 19 | tab <- table(pred, dfmat_test$crowd_subsidy_label) 20 | acc <- sum(diag(tab)) / sum(tab) 21 | expect_gte(acc, .6) 22 | 23 | # predicted prob 24 | prob <- predict(tmod, newdata = dfmat_test, type = "probability") 25 | expect_gte(prob["PL_Lamberts_3_3", "Anti-Subsidy"], .95) 26 | 27 | expect_output( 28 | print(tmod), 29 | 
"Call:" 30 | ) 31 | 32 | expect_equal(names(summary(tmod)), c("call", "model structure")) 33 | set.seed(10) 34 | pred_max <- apply(prob, 1, function(x) colnames(prob)[which.max(x)]) 35 | expect_equivalent( 36 | pred_max, 37 | as.character(pred) 38 | ) 39 | }) 40 | -------------------------------------------------------------------------------- /tests/testthat/test-tokens2sequences.R: -------------------------------------------------------------------------------- 1 | context("test tokens2sequences") 2 | 3 | test_that("tokens2sequences works", { 4 | skip() 5 | skip_on_cran() 6 | 7 | ## Example from 13.1 of _An Introduction to Information Retrieval_ 8 | text <- c("Chinese Beijing Chinese", 9 | "Chinese Chinese Shanghai", 10 | "Chinese Macao", 11 | "Tokyo Japan", 12 | "Chinese Chinese Chinese Tokyo Japan") 13 | text_tokens <- tokens(text) 14 | seq <- tokens2sequences(text_tokens, maxsenlen = 10, keepn = 5) 15 | 16 | # Check ouptuts for consistency 17 | expect_equal(dim(seq$matrix), c(5, 10)) 18 | expect_equal(seq$nfeatures, 5) 19 | expect_equal(max(seq$matrix), 5) 20 | expect_equal(min(seq$matrix), 0) 21 | expect_equal(as.integer(apply(seq$matrix, 1, function(x) sum(x != 0))), c(3, 3, 1, 2, 5)) 22 | 23 | # Compare with keras's texts_to_sequences function 24 | tok <- keras::text_tokenizer(filters = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r",lower = T,num_words = 6) %>% # Note: Keras includes 0 as a word. 
tokens2sequences does not 25 | keras::fit_text_tokenizer(text) 26 | tok_mat <- keras::texts_to_sequences(tok,texts = text) %>% keras::pad_sequences(maxlen = 10) 27 | seq_mat <- unname(seq$matrix) 28 | expect_equal(seq_mat, tok_mat) 29 | 30 | # Check whether it works on character vectors 31 | seq2 <- tokens2sequences(text, maxsenlen = 10, keepn = 5) 32 | expect_equal(dim(seq$matrix), dim(seq2$matrix)) 33 | expect_equal(seq$nfeatures, seq2$nfeatures) 34 | 35 | # Check whether tokens2sequences performs recursively 36 | seq_short <- tokens2sequences(x = seq2, maxsenlen = 2, keepn = 5) 37 | expect_equal(ncol(seq_short$matrix), 2) 38 | expect_equal(seq_short$nfeatures, 4) 39 | }) 40 | 41 | test_that("tokens2sequences_conform works", { 42 | skip() 43 | txt1 <- "This is sentence one. And here is sentence two." 44 | txt2 <- "This is sentence 3. Sentence 4! A fifth and final example." 45 | toks1 <- corpus(txt1) %>% 46 | corpus_reshape(to = "sentence") %>% 47 | tokens() %>% 48 | tokens_tolower() 49 | toks2 <- corpus(txt2) %>% 50 | corpus_reshape(to = "sentence") %>% 51 | tokens() %>% 52 | tokens_tolower() 53 | 54 | seqx <- tokens2sequences(toks1, maxsenlen = 4, keepn = 4) 55 | seqy <- tokens2sequences(toks2, maxsenlen = 4, keepn = 4) 56 | 57 | seqxy <- tokens2sequences_conform(seqx, seqy) 58 | expect_equal(dim(seqxy$matrix), c(2, 4)) 59 | expect_equal(ncol(seqxy$features), 3) 60 | }) 61 | --------------------------------------------------------------------------------