├── .Rbuildignore ├── .github └── workflows │ ├── check-standard.yaml │ └── test-coverage.yaml ├── .gitignore ├── DESCRIPTION ├── LSX.Rproj ├── NAMESPACE ├── NEWS.md ├── R ├── as.textmodel.R ├── bootstrap.R ├── data.R ├── predict.R ├── textmodel-methods.R ├── textmodel.R ├── textplot.R ├── textstat.R └── utils.R ├── README.Rmd ├── README.md ├── _pkgdown.yml ├── cran-comments.md ├── data-raw └── generate.R ├── data ├── data_dictionary_ideology.RData ├── data_dictionary_sentiment.RData └── data_textmodel_lss_russianprotests.rda ├── docs ├── 404.html ├── articles │ ├── index.html │ └── pkgdown │ │ ├── application.html │ │ ├── application_files │ │ └── figure-html │ │ │ ├── plot-term-1.png │ │ │ └── plot-trend-1.png │ │ ├── basic.html │ │ ├── basic_files │ │ └── figure-html │ │ │ ├── plot-hostility-term-1.png │ │ │ ├── plot-hostility-trend-1.png │ │ │ ├── plot-sentiment-dict-1.png │ │ │ ├── plot-sentiment-term-1.png │ │ │ ├── plot-sentiment-trend-1.png │ │ │ ├── plot-sentimnet-emoji-1.png │ │ │ └── plot-sentimnet-random-1.png │ │ ├── introduction.html │ │ ├── introduction_files │ │ └── figure-html │ │ │ ├── plot-hostility-term-1.png │ │ │ ├── plot-hostility-trend-1.png │ │ │ ├── plot-sentiment-term-1.png │ │ │ ├── plot-sentiment-trend-1.png │ │ │ └── plot-sentimnet-emoji-1.png │ │ ├── research.html │ │ ├── research_files │ │ └── figure-html │ │ │ ├── plot-term-1.png │ │ │ └── plot-trend-1.png │ │ └── seedwords.html ├── authors.html ├── deps │ ├── bootstrap-5.2.2 │ │ ├── bootstrap.bundle.min.js │ │ ├── bootstrap.bundle.min.js.map │ │ └── bootstrap.min.css │ ├── bootstrap-5.3.1 │ │ ├── bootstrap.bundle.min.js │ │ ├── bootstrap.bundle.min.js.map │ │ └── bootstrap.min.css │ ├── data-deps.txt │ └── jquery-3.6.0 │ │ ├── jquery-3.6.0.js │ │ ├── jquery-3.6.0.min.js │ │ └── jquery-3.6.0.min.map ├── index.html ├── link.svg ├── news │ └── index.html ├── pkgdown.js ├── pkgdown.yml ├── reference │ ├── Rplot001.png │ ├── as.coefficients_textmodel.html │ ├── as.seedwords.html │ ├── as.statistics_textmodel.html │ ├── as.summary.textmodel.html │ ├── as.textmodel_lss.html │ ├── bootstrap_lss.html │ ├── coef.textmodel_lss.html │ ├── cohesion.html │ ├── data_dictionary_ideology.html │ ├── data_dictionary_sentiment.html │ ├── data_textmodel_lss_russianprotests.html │ ├── diagnosys.html │ ├── index.html │ ├── optimize_lss.html │ ├── predict.textmodel_lss.html │ ├── print.coefficients_textmodel.html │ ├── print.statistics_textmodel.html │ ├── print.summary.textmodel.html │ ├── seedwords.html │ ├── smooth_lss.html │ ├── textmodel_lss.html │ ├── textplot_components.html │ ├── textplot_simil.html │ ├── textplot_terms.html │ ├── textstat_context.html │ └── weight_seeds.html ├── search.json └── sitemap.xml ├── inst ├── CITATION └── WORDLIST ├── man ├── as.coefficients_textmodel.Rd ├── as.seedwords.Rd ├── as.statistics_textmodel.Rd ├── as.summary.textmodel.Rd ├── as.textmodel_lss.Rd ├── bootstrap_lss.Rd ├── coef.textmodel_lss.Rd ├── cohesion.Rd ├── data_dictionary_ideology.Rd ├── data_dictionary_sentiment.Rd ├── data_textmodel_lss_russianprotests.Rd ├── diagnosys.Rd ├── optimize_lss.Rd ├── predict.textmodel_lss.Rd ├── print.coefficients_textmodel.Rd ├── print.statistics_textmodel.Rd ├── print.summary.textmodel.Rd ├── seedwords.Rd ├── smooth_lss.Rd ├── textmodel_lss.Rd ├── textplot_components.Rd ├── textplot_simil.Rd ├── textplot_terms.Rd ├── textstat_context.Rd └── weight_seeds.Rd ├── tests ├── data │ ├── data_dictionary_ideology.RData │ ├── data_dictionary_sentiment.RData │ ├── lss_k300.RDS │ ├── matrix_k100.RDS │ ├── 
prediction_v0.93.RDA │ ├── prediction_v0.99.RDA │ ├── save.R │ ├── tokens.RDS │ ├── word2vec-prob.RDS │ └── word2vec.RDS ├── misc │ └── replication.R ├── spelling.R ├── testthat.R └── testthat │ ├── test-as.textmodel.R │ ├── test-bootstrap.R │ ├── test-textmodel.R │ ├── test-textplot.R │ ├── test-textstat.R │ └── test-utils.R └── vignettes └── pkgdown ├── .gitignore ├── basic.Rmd ├── dictionary.yml ├── research.Rmd └── seedwords.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.github/ 4 | ^.*_cache/ 5 | ^README.Rmd 6 | ^README.md 7 | ^data-raw/ 8 | ^images/ 9 | ^cran-comments.md$ 10 | ^lss_cache 11 | ^vignettes/.*\.rds$ 12 | ^_pkgdown\.yml$ 13 | ^docs$ 14 | ^vignettes/pkgdown$ 15 | -------------------------------------------------------------------------------- /.github/workflows/check-standard.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macos-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v3 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: any::rcmdcheck 45 | needs: check 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | permissions: read-all 12 | 13 | jobs: 14 | test-coverage: 15 | runs-on: ubuntu-latest 16 | env: 17 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | 22 | - uses: r-lib/actions/setup-r@v2 23 | with: 24 | use-public-rspm: true 25 | 26 | - uses: r-lib/actions/setup-r-dependencies@v2 27 | with: 28 | extra-packages: any::covr, any::xml2 29 | needs: coverage 30 | 31 | - name: Test coverage 32 | run: | 33 | cov <- covr::package_coverage( 34 | quiet = FALSE, 35 | clean = FALSE, 36 | install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package") 37 | ) 38 | covr::to_cobertura(cov) 39 | shell: Rscript {0} 40 | 41 | - uses: codecov/codecov-action@v4 42 | with: 43 | fail_ci_if_error: ${{ github.event_name != 'pull_request' && true || false }} 44 | file: ./cobertura.xml 45 | plugin: noop 46 | disable_search: true 47 | token: ${{ secrets.CODECOV_TOKEN }} 48 | 49 | - name: Show testthat output 50 | if: always() 51 | run: | 52 | ## -------------------------------------------------------------------- 53 | find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true 54 | shell: bash 55 | 56 | - name: Upload test results 57 | if: failure() 58 | uses: actions/upload-artifact@v4 59 | with: 60 | name: coverage-test-failures 61 | path: ${{ runner.temp }}/package 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | LSS.Rproj 6 | *_cache 7 | Rplots.pdf 8 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: LSX 2 | Type: Package 3 | Title: Semi-Supervised Algorithm for Document Scaling 4 | Version: 1.4.5 5 | Authors@R: person("Kohei", "Watanabe", email = "watanabe.kohei@gmail.com", role = c("aut", "cre", "cph")) 6 | Description: A word embeddings-based semi-supervised model for document scaling Watanabe (2020) <doi:10.1080/19312458.2020.1832976>. 7 | LSS allows users to analyze large and complex corpora on arbitrary dimensions with seed words, exploiting the efficiency of word embeddings (SVD, GloVe). 8 | It can generate word vectors from a user-provided corpus or incorporate pre-trained word vectors.
9 | License: GPL-3 10 | LazyData: TRUE 11 | Encoding: UTF-8 12 | Depends: 13 | R (>= 3.5.0) 14 | Imports: 15 | methods, 16 | quanteda (>= 2.0), 17 | quanteda.textstats, 18 | stringi, 19 | digest, 20 | Matrix, 21 | RSpectra, 22 | proxyC, 23 | stats, 24 | ggplot2, 25 | ggrepel, 26 | reshape2, 27 | locfit 28 | Suggests: 29 | testthat, 30 | spelling, 31 | knitr, 32 | rmarkdown, 33 | wordvector, 34 | irlba, 35 | rsvd, 36 | rsparse 37 | RoxygenNote: 7.3.2 38 | Roxygen: list(markdown = TRUE) 39 | BugReports: https://github.com/koheiw/LSX/issues 40 | URL: https://koheiw.github.io/LSX/ 41 | Language: en-US 42 | -------------------------------------------------------------------------------- /LSX.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | ProjectId: 7985635d-e1dc-4855-b781-313f42a839cc 3 | 4 | RestoreWorkspace: Default 5 | SaveWorkspace: Default 6 | AlwaysSaveHistory: Default 7 | 8 | EnableCodeIndexing: Yes 9 | UseSpacesForTab: Yes 10 | NumSpacesForTab: 2 11 | Encoding: UTF-8 12 | 13 | RnwWeave: Sweave 14 | LaTeX: pdfLaTeX 15 | 16 | AutoAppendNewline: Yes 17 | StripTrailingWhitespace: Yes 18 | 19 | BuildType: Package 20 | PackageUseDevtools: Yes 21 | PackageInstallArgs: --no-multiarch --with-keep.source 22 | PackageCheckArgs: --as-cran 23 | PackageRoxygenize: rd,collate,namespace,vignette 24 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(as.coefficients_textmodel,data.frame) 4 | S3method(as.coefficients_textmodel,matrix) 5 | S3method(as.coefficients_textmodel,numeric) 6 | S3method(as.statistics_textmodel,data.frame) 7 | S3method(as.statistics_textmodel,matrix) 8 | S3method(as.textmodel_lss,matrix) 9 | S3method(as.textmodel_lss,numeric) 10 | S3method(as.textmodel_lss,textmodel_lss) 11 | S3method(as.textmodel_lss,textmodel_wordvector) 12 | S3method(coef,textmodel_lss) 13 | S3method(diagnosys,character) 14 | S3method(diagnosys,corpus) 15 | S3method(predict,textmodel_lss) 16 | S3method(print,coefficients_textmodel) 17 | S3method(print,statistics_textmodel) 18 | S3method(print,summary.textmodel) 19 | S3method(print,textmodel_lss) 20 | S3method(summary,textmodel_lss) 21 | S3method(textmodel_lss,dfm) 22 | S3method(textmodel_lss,fcm) 23 | S3method(textplot_components,textmodel_lss) 24 | S3method(textplot_simil,textmodel_lss) 25 | S3method(textplot_terms,textmodel_lss) 26 | export(as.coefficients_textmodel) 27 | export(as.seedwords) 28 | export(as.statistics_textmodel) 29 | export(as.summary.textmodel) 30 | export(as.textmodel_lss) 31 | export(bootstrap_lss) 32 | export(char_context) 33 | export(coefficients.textmodel_lss) 34 | export(cohesion) 35 | export(diagnosys) 36 | export(optimize_lss) 37 | export(seedwords) 38 | export(smooth_lss) 39 | export(textmodel_lss) 40 | export(textplot_components) 41 | export(textplot_simil) 42 | export(textplot_terms) 43 | export(textstat_context) 44 | import(ggplot2) 45 | import(ggrepel) 46 | import(locfit) 47 | import(methods) 48 | import(stats) 49 | import(stringi) 50 | importFrom(Matrix,Matrix) 51 | importFrom(Matrix,colSums) 52 | importFrom(Matrix,rowMeans) 53 | importFrom(Matrix,rowSums) 54 | importFrom(Matrix,t) 55 | importFrom(Matrix,tcrossprod) 56 | importFrom(Matrix,tril) 57 | importFrom(ggrepel,geom_text_repel) 58 | importFrom(quanteda,as.dfm) 59 | importFrom(quanteda,check_character) 60 | 
importFrom(quanteda,check_double) 61 | importFrom(quanteda,check_integer) 62 | importFrom(quanteda,check_logical) 63 | importFrom(quanteda,convert) 64 | importFrom(quanteda,corpus) 65 | importFrom(quanteda,dfm) 66 | importFrom(quanteda,dfm_group) 67 | importFrom(quanteda,dfm_lookup) 68 | importFrom(quanteda,dfm_match) 69 | importFrom(quanteda,dfm_remove) 70 | importFrom(quanteda,dfm_select) 71 | importFrom(quanteda,dfm_trim) 72 | importFrom(quanteda,dictionary) 73 | importFrom(quanteda,featnames) 74 | importFrom(quanteda,is.dfm) 75 | importFrom(quanteda,is.dictionary) 76 | importFrom(quanteda,is.tokens) 77 | importFrom(quanteda,meta) 78 | importFrom(quanteda,ntoken) 79 | importFrom(quanteda,texts) 80 | importFrom(quanteda,tokens) 81 | importFrom(quanteda,tokens_ngrams) 82 | importFrom(quanteda,tokens_remove) 83 | importFrom(quanteda,tokens_select) 84 | importFrom(quanteda.textstats,textstat_keyness) 85 | importFrom(stats,coef) 86 | importFrom(stats,coefficients) 87 | importFrom(stats,cutree) 88 | importFrom(stats,hclust) 89 | importFrom(stringi,stri_detect_fixed) 90 | importFrom(stringi,stri_replace_all_fixed) 91 | importFrom(stringi,stri_trans_totitle) 92 | importFrom(utils,head) 93 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | ## Changes in v1.4.5 2 | 3 | * Enable grouping by multiple variables using `smooth_lss()`. 4 | 5 | ## Changes in v1.4.4 6 | 7 | * Fix a bug in `as.textmodel_lss()` when a `textmodel_wordvector` object is given. 8 | * Add `sampling` to `textplot_terms()` to improve highlighting of words when the distribution of polarity scores is asymmetric. 9 | 10 | ## Changes in v1.4.3 11 | 12 | * Improve the handling of `textmodel_wordvector` objects from the **wordvector** package in `as.textmodel_lss()`. 13 | * Deprecate `auto_weight` in `textmodel_lss()`. 14 | * Deprecate `textplot_simil()`. 15 | 16 | ## Changes in v1.4.2 17 | 18 | * Add `as.textmodel_lss()` for objects from the **wordvector** package. 19 | * Reduce dependent packages by moving **rsparse**, **irlba** and **rsvd** to Suggests. 20 | * Fix handling of phrasal patterns in `textplot_terms()`. 21 | * Improve objects created by `as.textmodel_lss.textmodel_lss()`. 22 | 23 | ## Changes in v1.4.1 24 | 25 | * Add `group` to `smooth_lss()` to smooth LSS scores by group. 26 | * Add `optimize_lss()` as an experimental function. 27 | 28 | ## Changes in v1.4.0 29 | 30 | * Change the default value to `max_highlighted = 1000` in `textplot_terms()`. 31 | * Add `...` to customize text labels to `textplot_terms()`. 32 | * Highlight words in different colors when a dictionary is passed to `highlighted`. 33 | * Add `mode = "predict"` and `remove = FALSE` to `bootstrap_lss()`. 34 | 35 | ## Changes in v1.3.2 36 | 37 | * Fix the error in `textplot_terms()` when the frequency of terms are zero (#85). 38 | 39 | ## Changes in v1.3.1 40 | 41 | * Fix the range of scores when `cut` is used. 42 | * Add `bootstrap_lss()` as an experimental function. 43 | 44 | ## Changes in v1.3.0 45 | 46 | * Add `cut` to `predict`. 47 | * Move examples to the new package website: http://koheiw.github.io/LSX. 48 | * Rename "rescaling" to "rescale" for simplicity and consistency. 49 | * Improve random sampling of words to highlight in `textplot_terms()` to avoid congestion. 50 | 51 | ## Changes in v1.2.0 52 | 53 | * Add `group_data` to `textmodel_lss()` to simplify the workflow. 
54 | * Add `max_highlighted` to `textplot_terms()` to automatically highlight polarity words. 55 | 56 | ## Changes in v1.1.4 57 | 58 | * Update `as.textmodel_lss()` to avoid errors in `textplot_terms()` when `terms` is used. 59 | 60 | ## Changes in v1.1.3 61 | 62 | * Restore examples for `textmodel_lss()`. 63 | * Defunct `char_keyness()` that has been deprecated for long. 64 | 65 | ## Changes in v1.1.2 66 | 67 | * Update examples to pass CRAN tests. 68 | 69 | ## Changes in v1.1.1 70 | 71 | * Add `min_n` to `predict()` to make polarity scores of short documents more stable. 72 | 73 | ## Changes in v1.1.0 74 | 75 | * Add `as.textmodel_lss()` for textmodel_lss objects to allow modifying existing models. 76 | * Allow `terms` in `textmodel_lss()` to be a named numeric vector to give arbitrary weights. 77 | 78 | ## Changes in v1.0.2 79 | 80 | * Add the `auto_weight` argument to `textmodel_lss()` and `as.textmodel_lss()` to improve the accuracy of scaling. 81 | * Remove the `group` argument from `textplot_simil()` to simplify the object. 82 | * Make `as.seedwords()` to accept multiple indices for `upper` and `lower`. 83 | 84 | ## Changes in v1.0.0 85 | 86 | * Add `max_count` to `textmodel_lss.fcm()` that will be passed to `x_max` in `rsparse::GloVe$new()`. 87 | * Add `max_words` to `textplot_terms()` to avoid overcrowding. 88 | * Make `textplot_terms()` to work with objects from `textmodel_lss.fcm()`. 89 | * Add `concatenator` to `as.seedwords()`. 90 | 91 | ## Changes in v0.9.9 92 | 93 | * Correct how `textstat_context()` and `char_context()` computes statistics. 94 | * Deprecate `char_keyness()`. 95 | 96 | ## Changes in v0.9.8 97 | 98 | * Stop using functions and arguments deprecated in quanteda v3.0.0. 99 | 100 | ## Changes in v0.9.7 101 | 102 | * Make `as.textmodel_lss.matrix()` more reliable. 103 | * Remove **quanteda.textplots** from dependencies. 104 | 105 | ## Changes in v0.9.6 106 | 107 | * Updated to reflect changes in **quanteda** (creation of **quanteda.textstats**). 108 | 109 | ## Changes in v0.9.4 110 | 111 | * Fix `char_context()` to always return more frequent words in context. 112 | * Experimental `textplot_factor()` has been removed. 113 | * `as.textmodel_lss()` takes a pre-trained word-embedding. 114 | 115 | ## Changes in v0.9.3 116 | 117 | * Add `textstat_context()` and `char_context()` to replace `char_keyness()`. 118 | * Make the absolute sum of seed weight equal to 1.0 in both upper and lower ends. 119 | * `textplot_terms()` takes glob patterns in character vector or a dictionary object. 120 | * `char_keyness()` no longer raise error when no patter is found in tokens object. 121 | * Add `engine` to `smooth_lss()` to apply `locfit()` to large datasets. 122 | 123 | ## Changes in v0.9.2 124 | 125 | * Updated unit tests for the new versions of stringi and quanteda. 126 | 127 | ## Changes in v0.9.0 128 | 129 | * Renamed from LSS to LSX for CRAN submission. 130 | 131 | ## Changes in v0.8.7 132 | 133 | * Added `textplot_terms()` to improve visualization of model terms. 134 | -------------------------------------------------------------------------------- /R/as.textmodel.R: -------------------------------------------------------------------------------- 1 | #' Create a Latent Semantic Scaling model from various objects 2 | #' 3 | #' Create a new [textmodel_lss] object from an existing or foreign objects. 4 | #' @param x an object from which a new [textmodel_lss] object is created. See details. 5 | #' @param ... arguments used to create a new object. 
`seeds` must be given 6 | #' when `x` is a dense matrix or a fitted textmodel_lss. 7 | #' @details 8 | #' If `x` is a [textmodel_lss], the original word vectors are reused to compute polarity 9 | #' scores with new seed words. It is also possible to subset word vectors via `slice` 10 | #' if it was originally trained using SVD. 11 | #' 12 | #' If `x` is a dense matrix, it is treated as column-oriented word vectors with which 13 | #' the polarity of words is computed. If `x` is a named numeric vector, the values are treated 14 | #' as polarity scores of the words in the names. 15 | #' 16 | #' If `x` is a normalized [wordvector::textmodel_word2vec], it returns a spatial model; 17 | #' if not normalized, a probabilistic model. While the polarity scores of words are 18 | #' their cosine similarity to seed words in spatial models, they are 19 | #' the predicted probability that the seed words occur in their proximity. 20 | #' 21 | #' @export 22 | #' @return a dummy [textmodel_lss] object 23 | as.textmodel_lss <- function(x, ...) { 24 | UseMethod("as.textmodel_lss") 25 | } 26 | 27 | #' @export 28 | #' @method as.textmodel_lss matrix 29 | as.textmodel_lss.matrix <- function(x, seeds, 30 | terms = NULL, slice = NULL, 31 | simil_method = "cosine", 32 | auto_weight = FALSE, 33 | verbose = FALSE, ...) { 34 | 35 | args <- list(terms = terms, seeds = seeds) 36 | if (is.null(colnames(x))) 37 | stop("x must have column names for features") 38 | if (any(is.na(colnames(x)))) 39 | stop("x must not have NA in the column names") 40 | if (any(is.na(x))) 41 | stop("x must not have NA") 42 | 43 | seeds <- expand_seeds(seeds, colnames(x), verbose) 44 | seed <- unlist(unname(seeds)) 45 | theta <- get_theta(terms, colnames(x)) 46 | 47 | if (is.null(slice)) { 48 | slice <- nrow(x) 49 | } else { 50 | slice <- check_integer(slice, min_len = 1, max_len = nrow(x), min = 1, max = nrow(x)) 51 | } 52 | if (length(slice) == 1) 53 | slice <- seq_len(slice) 54 | 55 | simil <- get_simil(x, names(seed), names(theta), slice, simil_method) 56 | if (auto_weight) 57 | seed <- optimize_weight(seed, simil, verbose) 58 | beta <- get_beta(simil, seed) * theta 59 | 60 | result <- build_lss( 61 | beta = beta, 62 | k = nrow(x), 63 | slice = slice, 64 | terms = args$terms, 65 | seeds = args$seeds, 66 | seeds_weighted = seed, 67 | embedding = x, 68 | similarity = simil$seed, 69 | call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE), 70 | version = utils::packageVersion("LSX") 71 | ) 72 | return(result) 73 | } 74 | 75 | #' @export 76 | #' @method as.textmodel_lss numeric 77 | as.textmodel_lss.numeric <- function(x, ...) { 78 | 79 | if (is.null(names(x))) 80 | stop("x must have names for features") 81 | if (any(is.na(names(x)))) 82 | stop("x must not have NA in the names") 83 | if (any(is.na(x))) 84 | stop("x must not have NA") 85 | 86 | result <- build_lss( 87 | beta = x, 88 | terms = names(x), 89 | call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE) 90 | ) 91 | return(result) 92 | } 93 | 94 | #' @export 95 | #' @method as.textmodel_lss textmodel_lss 96 | as.textmodel_lss.textmodel_lss <- function(x, ...) { 97 | if (is.null(x$embedding)) 98 | stop("x must be a valid textmodel_lss object") 99 | result <- as.textmodel_lss(x$embedding, ...)
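    # carry over metadata from the original model (concatenator, fitted data and
    # term frequencies) so that methods such as predict() and textplot_terms()
    # work on the rebuilt object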
100 | result$concatenator <- x$concatenator 101 | result$data <- x$data 102 | result$frequency <- x$frequency[names(result$beta)] 103 | return(result) 104 | } 105 | 106 | #' @export 107 | #' @method as.textmodel_lss textmodel_wordvector 108 | as.textmodel_lss.textmodel_wordvector <- function(x, seeds, 109 | terms = NULL, 110 | verbose = FALSE, 111 | ...) { 112 | 113 | args <- list(terms = terms, seeds = seeds) 114 | if (x$normalize) { 115 | 116 | if (x$version == as.numeric_version("0.1.0")) { 117 | v <- t(x$vector) 118 | } else { 119 | v <- t(x$values) 120 | } 121 | result <- as.textmodel_lss(v, seeds = seeds, terms = terms, ...) 122 | result$frequency <- x$frequency[names(result$beta)] 123 | result$call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE) 124 | 125 | } else { 126 | 127 | if (!requireNamespace("wordvector")) 128 | stop("wordvector package must be installed") 129 | if (x$version < as.numeric_version("0.2.0")) 130 | stop("wordvector package must be v0.2.0 or later") 131 | 132 | seeds <- expand_seeds(seeds, rownames(x$values), verbose) 133 | seed <- unlist(unname(seeds)) 134 | theta <- get_theta(terms, rownames(x$values)) 135 | 136 | suppressWarnings({ 137 | prob <- wordvector::probability(x, names(seed), "values") 138 | }) 139 | beta <- rowSums(prob[names(theta),,drop = FALSE] %*% seed) * theta 140 | 141 | result <- build_lss( 142 | beta = beta, 143 | beta_type = "probability", 144 | k = x$dim, 145 | terms = args$terms, 146 | seeds = args$seeds, 147 | seeds_weighted = seed, 148 | frequency = x$frequency[names(beta)], 149 | call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE) 150 | ) 151 | } 152 | return(result) 153 | } 154 | -------------------------------------------------------------------------------- /R/bootstrap.R: -------------------------------------------------------------------------------- 1 | #' \[experimental\] Compute polarity scores with different hyper-parameters 2 | #' 3 | #' A function to compute polarity scores of words and documents by resampling 4 | #' hyper-parameters from a fitted LSS model. 5 | #' @param x a fitted textmodel_lss object. 6 | #' @param what choose the hyper-parameter to resample in bootstrapping. 7 | #' @param mode choose the type of the result of bootstrapping. If `coef`, 8 | #' returns the polarity scores of words; if `terms`, returns words sorted by 9 | #' the polarity scores in descending order; if `predict`, returns the polarity 10 | #' scores of documents. 11 | #' @param remove if `TRUE`, remove each seed word in turn when `what = "seeds"`. 12 | #' @param from,to,by passed to `seq()` to generate values for `k`; only used 13 | #' when `what = "k"`. 14 | #' @param ... additional arguments passed to [as.textmodel_lss()] and 15 | #' [predict()]. 16 | #' @param verbose show messages if `TRUE`. 17 | #' @details `bootstrap_lss()` internally creates fitted textmodel_lss objects by 18 | #' resampling hyper-parameters and computes the polarity of words or documents. 19 | #' The resulting matrix can be used to assess the validity and the reliability 20 | #' of seeds or k. 21 | #' 22 | #' Note that the objects created by [as.textmodel_lss()] do not contain data, so users 23 | #' must pass `newdata` via `...` when `mode = "predict"`. 24 | #' @export 25 | #' @importFrom quanteda check_integer check_logical 26 | bootstrap_lss <- function(x, what = c("seeds", "k"), 27 | mode = c("terms", "coef", "predict"), 28 | remove = FALSE, 29 | from = 100, to = NULL, by = 50, verbose = FALSE, ...)
{ 30 | 31 | what <- match.arg(what) 32 | mode <- match.arg(mode) 33 | from <- check_integer(from, min = 1, max = x$k) 34 | remove <- check_logical(remove) 35 | if (!is.null(to)) { 36 | to <- check_integer(to, min = 1, max = x$k) 37 | } else { 38 | to <- x$k 39 | } 40 | by <- check_integer(by, min = 1, max = x$k) 41 | if (verbose) 42 | cat(sprintf("Call %s(x) with different hyper-parameters...\n", mode)) 43 | if (what == "seeds") { 44 | param <- names(x$seeds_weighted) 45 | beta <- lapply(param, function(y) { 46 | if (remove) { 47 | seed <- setdiff(param, y) 48 | if (verbose) cat(sprintf(' seeds != "%s"\n', y)) 49 | } else { 50 | seed <- y 51 | if (verbose) cat(sprintf(' seeds = "%s"\n', y)) 52 | } 53 | 54 | as.textmodel_lss(x, seeds = seed, terms = x$terms, ...)$beta 55 | }) 56 | names(beta) <- param 57 | } else { 58 | param <- seq(from, to, by = by) 59 | beta <- lapply(param, function(y) { 60 | if (verbose) cat(sprintf(' k = %d\n', y)) 61 | as.textmodel_lss(x, seeds = x$seeds, terms = x$terms, slice = y, ...)$beta 62 | }) 63 | names(beta) <- as.character(param) 64 | 65 | } 66 | if (mode == "terms") { 67 | result <- sapply(beta, function(y) names(sort(y, decreasing = TRUE))) 68 | } else if (mode == "predict") { 69 | result <- sapply(beta, function(x) { 70 | suppressWarnings({ 71 | predict(as.textmodel_lss(x), ..., se_fit = FALSE, density = FALSE) 72 | }) 73 | }) 74 | } else { 75 | result <- do.call(cbind, beta) 76 | } 77 | 78 | attr(result, "what") <- what 79 | attr(result, "values") <- param 80 | return(result) 81 | } 82 | 83 | 84 | #' \[experimental\] Compute variance ratios with different hyper-parameters 85 | #' @param x a fitted textmodel_lss object. 86 | #' @param ... additional arguments passed to [bootstrap_lss]. 87 | #' @export 88 | #' @details `optimize_lss()` computes variance ratios with different values of 89 | #' hyper-parameters using [bootstrap_lss]. The variance ratio \eqn{v} is defined 90 | #' as \deqn{v = \sigma^2_{documents} / \sigma^2_{words}.} It is maximized 91 | #' when the model best distinguishes between the documents on the latent scale. 92 | #' @examples 93 | #' \dontrun{ 94 | #' # the unit of analysis is not sentences 95 | #' dfmt_grp <- dfm_group(dfmt) 96 | #' 97 | #' # choose best k 98 | #' v1 <- optimize_lss(lss, what = "k", from = 50, 99 | #' newdata = dfmt_grp, verbose = TRUE) 100 | #' plot(names(v1), v1) 101 | #' 102 | #' # find bad seed words 103 | #' v2 <- optimize_lss(lss, what = "seeds", remove = TRUE, 104 | #' newdata = dfmt_grp, verbose = TRUE) 105 | #' barplot(v2, las = 2) 106 | #' } 107 | #' 108 | optimize_lss <- function(x, ...) { 109 | beta <- bootstrap_lss(x, mode = "coef", ...) 110 | pred <- bootstrap_lss(x, mode = "predict", ..., rescale = FALSE) 111 | disc <- apply(pred, 2, var, na.rm = TRUE) / apply(beta, 2, var, na.rm = TRUE) 112 | return(disc) 113 | } 114 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' A fitted LSS model on street protests in Russia 2 | #' 3 | #' This model was trained on a Russian media corpus (newspapers, TV transcripts 4 | #' and newswires) to analyze the framing of street protests. The scale is protests 5 | #' as "freedom of expression" (high) vs "social disorder" (low). Although some 6 | #' slots are missing in this object (because the model was imported from the 7 | #' original Python implementation), it allows you to scale texts using 8 | #' `predict`.
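#' @examples
#' \dontrun{
#' # a minimal usage sketch: `dfmt_ru` is a hypothetical dfm built from
#' # Russian-language news texts, not an object shipped with the package
#' pred <- predict(data_textmodel_lss_russianprotests, newdata = dfmt_ru)
#' }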
9 | #' @name data_textmodel_lss_russianprotests 10 | #' @docType data 11 | #' @keywords data 12 | #' @references Lankina, Tomila, and Kohei Watanabe. “'Russian Spring' or 'Spring 13 | #' Betrayal'? The Media as a Mirror of Putin's Evolving Strategy in Ukraine.” 14 | #' Europe-Asia Studies 69, no. 10 (2017): 1526–56. 15 | #' \doi{10.1080/09668136.2017.1397603}. 16 | NULL 17 | 18 | #' Seed words for analysis of positive-negative sentiment 19 | #' 20 | #' @examples 21 | #' as.seedwords(data_dictionary_sentiment) 22 | #' @name data_dictionary_sentiment 23 | #' @docType data 24 | #' @references Turney, P. D., & Littman, M. L. (2003). Measuring Praise and 25 | #' Criticism: Inference of Semantic Orientation from Association. ACM Trans. 26 | #' Inf. Syst., 21(4), 315–346. \doi{10.1145/944012.944013} 27 | NULL 28 | 29 | #' Seed words for analysis of left-right political ideology 30 | #' 31 | #' @examples 32 | #' as.seedwords(data_dictionary_ideology) 33 | #' @name data_dictionary_ideology 34 | #' @docType data 35 | NULL 36 | -------------------------------------------------------------------------------- /R/predict.R: -------------------------------------------------------------------------------- 1 | #' Prediction method for textmodel_lss 2 | #' 3 | #' @method predict textmodel_lss 4 | #' @param object a fitted LSS textmodel. 5 | #' @param newdata a dfm on which prediction should be made. 6 | #' @param se_fit if `TRUE`, returns the standard errors of document scores. 7 | #' @param density if `TRUE`, returns the frequency of polarity words in documents. 8 | #' @param cut a vector of one or two percentile values to dichotomize polarity 9 | #' scores of words. When two values are given, words between them receive zero 10 | #' polarity. 11 | #' @param rescale if `TRUE`, normalizes polarity scores using `scale()`. 12 | #' @param min_n set the minimum number of polarity words in documents. 13 | #' @param ... not used 14 | #' @details Polarity scores of documents are the means of polarity scores of 15 | #' words weighted by their frequency. When `se_fit = TRUE`, this function 16 | #' returns the weighted means, their standard errors, and the number of 17 | #' polarity words in the documents. When `rescale = TRUE`, it converts the raw 18 | #' polarity scores to z-scores for easier interpretation. When `rescale = 19 | #' FALSE` and `cut` is used, polarity scores of documents are bounded by 20 | #' \[-1.0, 1.0\]. 21 | #' 22 | #' Documents tend to receive extreme polarity scores when they have only a few 23 | #' polarity words. This is problematic when LSS is applied to short documents 24 | #' (e.g. social media posts) or individual sentences, but users can alleviate 25 | #' this problem by adding zero polarity words to short documents using 26 | #' `min_n`. This setting does not affect empty documents. 27 | #' @import methods 28 | #' @importFrom Matrix Matrix rowSums t 29 | #' @importFrom quanteda is.dfm dfm_select check_integer check_double 30 | #' @export 31 | predict.textmodel_lss <- function(object, newdata = NULL, se_fit = FALSE, 32 | density = FALSE, rescale = TRUE, 33 | cut = NULL, min_n = 0L, ...){ 34 | 35 | 36 | (function(se.fit, rescaling, ...) unused_dots(...))(...) # trap deprecated args 37 | args <- list(...)
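    # map the deprecated 'se.fit' and 'rescaling' argument names to 'se_fit' and 'rescale'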
38 | if ("se.fit" %in% names(args)) { 39 | .Deprecated(msg = "'se.fit' is deprecated; use 'se_fit'\n") 40 | se_fit <- args$se.fit 41 | } 42 | if ("rescaling" %in% names(args)) { 43 | .Deprecated(msg = "'rescaling' is deprecated; use 'rescale'\n") 44 | rescale <- args$rescaling 45 | } 46 | min_n <- check_integer(min_n, min = 0) 47 | 48 | if (!is.null(cut)) { 49 | cut <- check_double(cut, min = 0, max = 1, min_len = 1, max_len = 2) 50 | object$beta <- cut_beta(object$beta, cut) 51 | } 52 | 53 | beta <- Matrix(object$beta, nrow = 1, sparse = TRUE, 54 | dimnames = list(NULL, names(object$beta))) 55 | 56 | if (is.null(newdata)) { 57 | if (is.null(object$data)) 58 | stop("The model includes no data, use newdata to supply a dfm.\n", 59 | call. = FALSE) 60 | data <- object$data 61 | } else { 62 | if (!is.dfm(newdata)) 63 | stop("newdata must be a dfm\n", call. = FALSE) 64 | data <- newdata 65 | } 66 | 67 | if (density) 68 | den <- unname(rowSums(dfm_select(data, object$terms)) / rowSums(data)) 69 | 70 | data <- dfm_match(data, colnames(beta)) 71 | len <- unname(rowSums(data)) 72 | n <- ifelse(len == 0, 0, pmax(len, min_n)) 73 | fit <- ifelse(len == 0, NA, rowSums(data %*% t(beta)) / n) 74 | names(fit) <- rownames(data) 75 | 76 | if (rescale) { 77 | fit_scaled <- scale(fit) 78 | result <- list(fit = rowSums(fit_scaled)) 79 | } else { 80 | result <- list(fit = fit) 81 | } 82 | 83 | if (se_fit) { 84 | # sparse variance computation 85 | weight <- t(t(data > 0) * colSums(beta)) 86 | var <- (rowSums(weight ^ 2 * data) / n) - (rowSums(weight * data) / n) ^ 2 87 | var <- zapsmall(var) 88 | se <- ifelse(n > 1, unname(sqrt(var) / sqrt(n)), NA) 89 | if (rescale) 90 | se <- se / attr(fit_scaled, "scaled:scale") 91 | result$se.fit <- se 92 | result$n <- n 93 | } 94 | if (density) 95 | result$density <- den 96 | 97 | if (!se_fit && !density) { 98 | return(result$fit) 99 | } else { 100 | return(result) 101 | } 102 | } 103 | 104 | cut_beta <- function(x, p = 0.5) { 105 | q <- c(-Inf, quantile(x, p, na.rm = TRUE), Inf) 106 | v <- as.integer(cut(x, q)) 107 | beta <- double(length(x)) 108 | beta[v == min(v)] <- -1.0 109 | beta[v == max(v)] <- 1.0 110 | names(beta) <- names(x) 111 | return(beta) 112 | } 113 | 114 | -------------------------------------------------------------------------------- /R/textmodel-methods.R: -------------------------------------------------------------------------------- 1 | # generic methods ----------- 2 | 3 | #' print method for summary.textmodel 4 | #' 5 | #' @param x a `summary.textmodel` object 6 | #' @param digits minimal number of *significant digits*, see 7 | #' [print.default()] 8 | #' @param ... additional arguments not used 9 | #' @method print summary.textmodel 10 | #' @importFrom stringi stri_trans_totitle stri_replace_all_fixed 11 | #' stri_detect_fixed 12 | #' @keywords textmodel internal 13 | #' @export 14 | print.summary.textmodel <- function(x, digits = max(3L, getOption("digits") - 3L), ...) 
{ 15 | label <- stri_trans_totitle(stri_replace_all_fixed(names(x), ".", " ")) 16 | # print the formatted remaining elements 17 | for (i in seq_along(x)) { 18 | cat("\n") 19 | cat(label[i], ':\n', sep = '') 20 | if (stri_detect_fixed(label[i], "Feature")) { 21 | print(t(x[[i]]), digits = digits) 22 | } else { 23 | print(x[[i]], digits = digits) 24 | } 25 | 26 | } 27 | } 28 | 29 | #' Assign the summary.textmodel class to a list 30 | #' @param x a named list 31 | #' @keywords internal 32 | #' @export 33 | as.summary.textmodel <- function(x) { 34 | class(x) <- c("summary.textmodel", "list") 35 | x 36 | } 37 | 38 | # 39 | # #' Print methods for textmodel features estimates 40 | # #' 41 | # #' @param x a textmodel_features object 42 | # #' @param digits minimal number of \emph{significant digits}, see 43 | # #' \code{\link{print.default}} 44 | # #' @param n how many coefficients to print before truncating 45 | # #' @param ... additional arguments not used 46 | # #' @method print coef.textmodel 47 | # #' @export 48 | # print.coef.textmodel <- function(x, digits = max(3L, getOption("digits") - 3L), n = 30L, ...) { 49 | # x <- unclass(x) 50 | # if (length(x) > n) 51 | # cat("(showing first", length(x), "elements)\n") 52 | # NextMethod(digits = digits) 53 | # } 54 | # 55 | # #' Assign the textmodel_coefficients class to a numeric vector 56 | # #' @param x a numeric vector 57 | # #' @keywords internal 58 | # as.coef.textmodel <- function(x) { 59 | # class(x) <- c("coef.textmodel", "numeric") 60 | # return(x) 61 | # } 62 | 63 | #' Print methods for textmodel features estimates 64 | 65 | #' This is a helper function used in `print.summary.textmodel`. 66 | #' @param x a coefficients_textmodel object 67 | #' @param digits minimal number of *significant digits*, see 68 | #' [print.default()] 69 | #' @param ... additional arguments not used 70 | #' @method print coefficients_textmodel 71 | #' @keywords internal textmodel 72 | #' @export 73 | print.coefficients_textmodel <- function(x, digits = max(3L, getOption("digits") - 3L), ...) { 74 | if (is.data.frame(x)) { 75 | n <- nrow(x) 76 | x <- as.data.frame(x) 77 | } else { 78 | n <- length(x) 79 | x <- unclass(x) 80 | } 81 | cat("(showing first", n, "elements)\n") 82 | print(x, digits = digits) 83 | } 84 | 85 | #' Coerce various objects to coefficients_textmodel 86 | 87 | #' This is a helper function used in `summary.textmodel_*`. 
88 | #' @param x an object to be coerced 89 | #' @importFrom stats coefficients 90 | #' @importFrom stats coef 91 | #' @keywords internal 92 | #' @export 93 | as.coefficients_textmodel <- function(x) { 94 | UseMethod('as.coefficients_textmodel') 95 | } 96 | 97 | #' @noRd 98 | #' @method as.coefficients_textmodel data.frame 99 | #' @keywords internal 100 | #' @export 101 | as.coefficients_textmodel.data.frame <- function(x) { 102 | class(x) <- c("coefficients_textmodel", "data.frame") 103 | return(x) 104 | } 105 | 106 | #' @noRd 107 | #' @method as.coefficients_textmodel numeric 108 | #' @keywords internal 109 | #' @export 110 | as.coefficients_textmodel.numeric <- function(x) { 111 | class(x) <- c("coefficients_textmodel", "numeric") 112 | return(x) 113 | } 114 | 115 | #' @noRd 116 | #' @method as.coefficients_textmodel matrix 117 | #' @keywords internal 118 | #' @export 119 | as.coefficients_textmodel.matrix <- function(x) { 120 | as.coefficients_textmodel(as.data.frame(x)) 121 | } 122 | 123 | #' Implements print methods for textmodel_statistics 124 | #' 125 | #' @param x a textmodel_wordscore_statistics object 126 | #' @param digits minimal number of *significant digits*, see 127 | #' [print.default()] 128 | #' @param ... further arguments passed to or from other methods 129 | #' @method print statistics_textmodel 130 | #' @keywords internal textmodel 131 | #' @export 132 | print.statistics_textmodel <- function(x, digits = max(3L, getOption("digits") - 3L), ...) { 133 | NextMethod(digits = digits, row.names = TRUE) 134 | } 135 | 136 | #' Coerce various objects to statistics_textmodel 137 | #' 138 | #' This is a helper function used in `summary.textmodel_*`. 139 | #' @param x an object to be coerced 140 | #' @keywords internal textmodel 141 | #' @export 142 | as.statistics_textmodel <- function(x) { 143 | UseMethod("as.statistics_textmodel") 144 | } 145 | 146 | #' @noRd 147 | #' @method as.statistics_textmodel data.frame 148 | #' @keywords internal textmodel 149 | #' @export 150 | as.statistics_textmodel.data.frame <- function(x) { 151 | class(x) <- c("statistics_textmodel", "data.frame") 152 | return(x) 153 | } 154 | 155 | #' @noRd 156 | #' @method as.statistics_textmodel matrix 157 | #' @keywords internal textmodel 158 | #' @export 159 | as.statistics_textmodel.matrix <- function(x) { 160 | as.statistics_textmodel(as.data.frame(x)) 161 | } 162 | -------------------------------------------------------------------------------- /R/textstat.R: -------------------------------------------------------------------------------- 1 | 2 | #' Identify context words 3 | #' 4 | #' Identify context words using user-provided patterns. 5 | #' @param x a tokens object created by [quanteda::tokens()]. 6 | #' @param pattern [quanteda::pattern()] to specify target words. 7 | #' @param valuetype the type of pattern matching: `"glob"` for "glob"-style 8 | #' wildcard expressions; `"regex"` for regular expressions; or `"fixed"` for 9 | #' exact matching. See [quanteda::valuetype()] for details. 10 | #' @param case_insensitive if `TRUE`, ignore case when matching. 11 | #' @param window size of window for collocation analysis. 12 | #' @param p threshold for statistical significance of collocations. 13 | #' @param min_count minimum frequency of words within the window to be 14 | #' considered as collocations. 15 | #' @param remove_pattern if `TRUE`, keywords do not contain target words. 16 | #' @inheritParams quanteda::tokens_ngrams 17 | #' @param ... 
additional arguments passed to [quanteda.textstats::textstat_keyness()]. 18 | #' @importFrom quanteda.textstats textstat_keyness 19 | #' @importFrom quanteda is.tokens tokens_remove tokens_select tokens_ngrams dfm 20 | #' dfm_trim dfm_match featnames as.dfm dfm_remove 21 | #' @export 22 | #' @seealso [quanteda.textstats::textstat_keyness()] 23 | textstat_context <- function(x, pattern, valuetype = c("glob", "regex", "fixed"), 24 | case_insensitive = TRUE, window = 10, min_count = 10, 25 | remove_pattern = TRUE, n = 1, skip = 0, ...) { 26 | 27 | valuetype <- match.arg(valuetype) 28 | if (!is.tokens(x)) 29 | stop("x must be a tokens object\n", call. = FALSE) 30 | 31 | # reference 32 | y <- tokens_remove(x, pattern, valuetype = valuetype, 33 | case_insensitive = case_insensitive, 34 | window = window, padding = FALSE) 35 | if (any(n > 1)) 36 | y <- tokens_ngrams(y, n = n, skip = skip) 37 | 38 | # target 39 | x <- tokens_select(x, pattern, valuetype = valuetype, 40 | case_insensitive = case_insensitive, 41 | window = window, padding = FALSE) 42 | if (remove_pattern) 43 | x <- tokens_remove(x, pattern, valuetype = valuetype, 44 | case_insensitive = case_insensitive) 45 | if (any(n > 1)) 46 | x <- tokens_ngrams(x, n = n, skip = skip) 47 | 48 | y <- dfm_remove(dfm(y), pattern = "") 49 | x <- dfm_remove(dfm(x), pattern = "") 50 | 51 | f <- union(featnames(x), featnames(y)) 52 | x <- dfm_match(x, f) 53 | y <- dfm_match(y, f) 54 | 55 | if (sum(x) > 0) { 56 | result <- textstat_keyness(as.dfm(rbind(quanteda::colSums(x), quanteda::colSums(y))), ...) 57 | result <- result[result$n_target >= min_count,] 58 | } else { 59 | result <- head(textstat_keyness(as.dfm(matrix(c(1, 0))), ...), 0) # dummy object 60 | } 61 | colnames(result)[c(4, 5)] <- c("n_inside", "n_outside") 62 | return(result) 63 | } 64 | 65 | #' @rdname textstat_context 66 | #' @export 67 | char_context <- function(x, pattern, valuetype = c("glob", "regex", "fixed"), 68 | case_insensitive = TRUE, window = 10, min_count = 10, 69 | remove_pattern = TRUE, p = 0.001, n = 1, skip = 0) { 70 | result <- textstat_context(x, pattern = pattern, valuetype = valuetype, 71 | case_insensitive = case_insensitive, window = window, 72 | min_count = min_count, remove_pattern = remove_pattern, 73 | n = n, skip = skip) 74 | result <- result[result[[2]] > 0 & result$p < p,] 75 | return(result$feature) 76 | } 77 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | rmarkdown::github_document 4 | --- 5 | 6 | ```{r, echo=FALSE} 7 | knitr::opts_chunk$set( 8 | collapse = FALSE, 9 | comment = "##", 10 | fig.path = "images/", 11 | dpi = 150, 12 | fig.height = 5, 13 | fig.width = 10 14 | ) 15 | ``` 16 | 17 | # LSS: Semi-supervised algorithm for document scaling 18 | 19 | 20 | [![CRAN 21 | Version](https://www.r-pkg.org/badges/version/LSX)](https://CRAN.R-project.org/package=LSX) 22 | [![Downloads](https://cranlogs.r-pkg.org/badges/LSX)](https://CRAN.R-project.org/package=LSX) 23 | [![Total 24 | Downloads](https://cranlogs.r-pkg.org/badges/grand-total/LSX?color=orange)](https://CRAN.R-project.org/package=LSX) 25 | [![R build 26 | status](https://github.com/koheiw/LSX/workflows/R-CMD-check/badge.svg)](https://github.com/koheiw/LSX/actions) 27 | [![codecov](https://codecov.io/gh/koheiw/LSX/branch/master/graph/badge.svg)](https://app.codecov.io/gh/koheiw/LSX) 28 | 29 | 30 | In quantitative text analysis, the cost of training 
supervised machine learning models tend to be very high when the corpus is large. Latent Semantic Scaling (LSS) is a semi-supervised document scaling technique that I developed to perform large scale analysis at low cost. Taking user-provided *seed words* as weak supervision, it estimates polarity of words in the corpus by latent semantic analysis and locates documents on a unidimensional scale (e.g. sentiment). 31 | 32 | ## Installation 33 | 34 | From CRAN: 35 | 36 | ```{r, eval=FALSE} 37 | install.packages("LSX") 38 | ``` 39 | 40 | From Github: 41 | 42 | ```{r, eval=FALSE} 43 | devtools::install_github("koheiw/LSX") 44 | ``` 45 | 46 | ## Examples 47 | 48 | Please visit the package website to understand the usage of the functions: 49 | 50 | - [Introduction to LSX](https://koheiw.github.io/LSX/articles/pkgdown/basic.html) 51 | - [Application in research](https://koheiw.github.io/LSX/articles/pkgdown/research.html) 52 | - [Selection of seed words](https://koheiw.github.io/LSX/articles/pkgdown/seedwords.html) 53 | 54 | Please read the following papers for the algorithm and methodology, and its application to non-English texts (Japanese and Hebrew): 55 | 56 | - Watanabe, Kohei. 2020. ["Latent Semantic Scaling: A Semisupervised Text Analysis Technique for New Domains and Languages"](https://www.tandfonline.com/doi/full/10.1080/19312458.2020.1832976), *Communication Methods and Measures*. 57 | - Watanabe, Kohei, Segev, Elad, & Tago, Atsushi. (2022). ["Discursive diversion: Manipulation of nuclear threats by the conservative leaders in Japan and Israel"](https://journals.sagepub.com/doi/full/10.1177/17480485221097967), *International Communication Gazette*. 58 | 59 | ## Other publications 60 | 61 | LSS has been used for research in various fields of social science. 62 | 63 | - Nakamura, Kentaro. 2022 [Balancing Opportunities and Incentives: How Rising China’s Mediated Public Diplomacy Changes Under Crisis](https://ijoc.org/index.php/ijoc/article/view/18676/3968), *International Journal of Communication*. 64 | - Zollinger, Delia. 2022 [Cleavage Identities in Voters’ Own Words: Harnessing Open-Ended Survey Responses](https://onlinelibrary.wiley.com/doi/10.1111/ajps.12743), *American Journal of Political Science*. 65 | - Brändle, Verena K., and Olga Eisele. 2022. ["A Thin Line: Governmental Border Communication in Times of European Crises"](https://onlinelibrary.wiley.com/doi/full/10.1111/jcms.13398) *Journal of Common Market Studies*. 66 | - Umansky, Natalia. 2022. ["Who gets a say in this? Speaking security on social media"](https://journals.sagepub.com/doi/10.1177/14614448221111009). *New Media & Society*. 67 | - Rauh, Christian, 2022. ["Supranational emergency politics? What executives’ public crisis communication may tell us"](https://www.tandfonline.com/doi/full/10.1080/13501763.2021.1916058), *Journal of European Public Policy*. 68 | - Trubowitz, Peter and Watanabe, Kohei. 2021. ["The Geopolitical Threat Index: A Text-Based Computational Approach to Identifying Foreign Threats"](https://academic.oup.com/isq/advance-article/doi/10.1093/isq/sqab029/6278490), *International Studies Quarterly*. 69 | - Vydra, Simon and Kantorowicz, Jaroslaw. 2020. ["Tracing Policy-relevant Information in Social Media: The Case of Twitter before and during the COVID-19 Crisis"](https://www.degruyter.com/document/doi/10.1515/spp-2020-0013/html). *Statistics, Politics and Policy*. 70 | - Watanabe, Kohei. 2017. 
["Measuring News Bias: Russia's Official News Agency ITAR-TASS’s Coverage of the Ukraine Crisis"](http://journals.sagepub.com/eprint/TBc9miIc89njZvY3gyAt/full), *European Journal Communication*. 71 | 72 | More publications are available on [Google Scholar](https://scholar.google.com/scholar?oi=bibs&hl=en&cites=5312969973901591795). 73 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # LSS: Semi-supervised algorithm for document scaling 3 | 4 | 5 | 6 | [![CRAN 7 | Version](https://www.r-pkg.org/badges/version/LSX)](https://CRAN.R-project.org/package=LSX) 8 | [![Downloads](https://cranlogs.r-pkg.org/badges/LSX)](https://CRAN.R-project.org/package=LSX) 9 | [![Total 10 | Downloads](https://cranlogs.r-pkg.org/badges/grand-total/LSX?color=orange)](https://CRAN.R-project.org/package=LSX) 11 | [![R build 12 | status](https://github.com/koheiw/LSX/workflows/R-CMD-check/badge.svg)](https://github.com/koheiw/LSX/actions) 13 | [![codecov](https://codecov.io/gh/koheiw/LSX/branch/master/graph/badge.svg)](https://app.codecov.io/gh/koheiw/LSX) 14 | 15 | 16 | In quantitative text analysis, the cost of training supervised machine 17 | learning models tend to be very high when the corpus is large. Latent 18 | Semantic Scaling (LSS) is a semi-supervised document scaling technique 19 | that I developed to perform large scale analysis at low cost. Taking 20 | user-provided *seed words* as weak supervision, it estimates polarity of 21 | words in the corpus by latent semantic analysis and locates documents on 22 | a unidimensional scale (e.g. sentiment). 23 | 24 | ## Installation 25 | 26 | From CRAN: 27 | 28 | ``` r 29 | install.packages("LSX") 30 | ``` 31 | 32 | From Github: 33 | 34 | ``` r 35 | devtools::install_github("koheiw/LSX") 36 | ``` 37 | 38 | ## Examples 39 | 40 | Please visit the package website to understand the usage of the 41 | functions: 42 | 43 | - [Introduction to 44 | LSX](https://koheiw.github.io/LSX/articles/pkgdown/basic.html) 45 | - [Application in 46 | research](https://koheiw.github.io/LSX/articles/pkgdown/research.html) 47 | - [Selection of seed 48 | words](https://koheiw.github.io/LSX/articles/pkgdown/seedwords.html) 49 | 50 | Please read the following papers for the algorithm and methodology, and 51 | its application to non-English texts (Japanese and Hebrew): 52 | 53 | - Watanabe, Kohei. 2020. [“Latent Semantic Scaling: A Semisupervised 54 | Text Analysis Technique for New Domains and 55 | Languages”](https://www.tandfonline.com/doi/full/10.1080/19312458.2020.1832976), 56 | *Communication Methods and Measures*. 57 | - Watanabe, Kohei, Segev, Elad, & Tago, Atsushi. (2022). [“Discursive 58 | diversion: Manipulation of nuclear threats by the conservative leaders 59 | in Japan and 60 | Israel”](https://journals.sagepub.com/doi/full/10.1177/17480485221097967), 61 | *International Communication Gazette*. 62 | 63 | ## Other publications 64 | 65 | LSS has been used for research in various fields of social science. 66 | 67 | - Nakamura, Kentaro. 2022 [Balancing Opportunities and Incentives: How 68 | Rising China’s Mediated Public Diplomacy Changes Under 69 | Crisis](https://ijoc.org/index.php/ijoc/article/view/18676/3968), 70 | *International Journal of Communication*. 71 | - Zollinger, Delia. 
2022 [Cleavage Identities in Voters’ Own Words: 72 | Harnessing Open-Ended Survey 73 | Responses](https://onlinelibrary.wiley.com/doi/10.1111/ajps.12743), 74 | *American Journal of Political Science*. 75 | - Brändle, Verena K., and Olga Eisele. 2022. [“A Thin Line: Governmental 76 | Border Communication in Times of European 77 | Crises”](https://onlinelibrary.wiley.com/doi/full/10.1111/jcms.13398) 78 | *Journal of Common Market Studies*. 79 | - Umansky, Natalia. 2022. [“Who gets a say in this? Speaking security on 80 | social 81 | media”](https://journals.sagepub.com/doi/10.1177/14614448221111009). 82 | *New Media & Society*. 83 | - Rauh, Christian, 2022. [“Supranational emergency politics? What 84 | executives’ public crisis communication may tell 85 | us”](https://www.tandfonline.com/doi/full/10.1080/13501763.2021.1916058), 86 | *Journal of European Public Policy*. 87 | - Trubowitz, Peter and Watanabe, Kohei. 2021. [“The Geopolitical Threat 88 | Index: A Text-Based Computational Approach to Identifying Foreign 89 | Threats”](https://academic.oup.com/isq/advance-article/doi/10.1093/isq/sqab029/6278490), 90 | *International Studies Quarterly*. 91 | - Vydra, Simon and Kantorowicz, Jaroslaw. 2020. [“Tracing 92 | Policy-relevant Information in Social Media: The Case of Twitter 93 | before and during the COVID-19 94 | Crisis”](https://www.degruyter.com/document/doi/10.1515/spp-2020-0013/html). 95 | *Statistics, Politics and Policy*. 96 | - Watanabe, Kohei. 2017. [“Measuring News Bias: Russia’s Official News 97 | Agency ITAR-TASS’s Coverage of the Ukraine 98 | Crisis”](http://journals.sagepub.com/eprint/TBc9miIc89njZvY3gyAt/full), 99 | *European Journal Communication*. 100 | 101 | More publications are available on [Google 102 | Scholar](https://scholar.google.com/scholar?oi=bibs&hl=en&cites=5312969973901591795). 103 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: ~ 2 | template: 3 | bootstrap: 5 4 | bootswatch: litera 5 | includes: 6 | in_header: | 7 | 8 | 9 | 16 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | # Submission notes 2 | 3 | ## Purpose 4 | 5 | - Update to reflect new modular structure of quanteda and quanteda.textstats 6 | - Remove quanteda.textmodels from dependencies 7 | 8 | ## Test environments 9 | 10 | * local KDE neon User Edition 5.13, R 4.0.3 11 | * local Windows 10, R 4.0.3 12 | * Github Actions Windows latest (release) 13 | * Github Actions MacOS latest (release) 14 | * Github Actions Linux 18.04 (release) 15 | * Github Actions Linux 18.04 (devel) 16 | 17 | ## R CMD check results 18 | 19 | 0 errors | 0 warnings | 1 notes 20 | 21 | ## Downstream dependencies 22 | 23 | There are no reverse dependencies on this package. 
24 | -------------------------------------------------------------------------------- /data-raw/generate.R: -------------------------------------------------------------------------------- 1 | require(quanteda) 2 | 3 | data_dictionary_sentiment <- dictionary( 4 | list( 5 | "positive" = c("good", "nice", "excellent", "positive", "fortunate", "correct", "superior"), 6 | "negative" = c("bad", "nasty", "poor", "negative", "unfortunate", "wrong", "inferior") 7 | ) 8 | ) 9 | save(data_dictionary_sentiment, file = "data/data_dictionary_sentiment.RData") 10 | 11 | 12 | data_dictionary_ideology <- dictionary( 13 | list( 14 | "right" = c("deficit", "austerity", "unstable", "recession", "inflation", "currency", "workforce"), 15 | "left" = c("poor", "poverty", "free", "benefits", "prices", "money", "workers") 16 | ) 17 | ) 18 | save(data_dictionary_ideology, file = "data/data_dictionary_ideology.RData") 19 | -------------------------------------------------------------------------------- /data/data_dictionary_ideology.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/data/data_dictionary_ideology.RData -------------------------------------------------------------------------------- /data/data_dictionary_sentiment.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/data/data_dictionary_sentiment.RData -------------------------------------------------------------------------------- /data/data_textmodel_lss_russianprotests.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/data/data_textmodel_lss_russianprotests.rda -------------------------------------------------------------------------------- /docs/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Page not found (404) • LSX 9 | 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | Skip to contents 27 | 28 | 29 |
[docs/404.html, continued: rendered pkgdown 404 page with its HTML markup stripped in this dump; the only visible text is "Content not found. Please use links in the navbar." plus navigation and footer boilerplate.]
96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /docs/articles/index.html: -------------------------------------------------------------------------------- 1 | 2 | Articles • LSX 12 | Skip to contents 13 | 14 | 15 |
[docs/articles/index.html, continued: rendered pkgdown article index with its markup stripped; under "All vignettes" it lists "Introduction to LSX", "Application in research", and "Selection of seed words", plus navigation and footer boilerplate.]
83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /docs/articles/pkgdown/application_files/figure-html/plot-term-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/docs/articles/pkgdown/application_files/figure-html/plot-term-1.png -------------------------------------------------------------------------------- /docs/articles/pkgdown/application_files/figure-html/plot-trend-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/docs/articles/pkgdown/application_files/figure-html/plot-trend-1.png -------------------------------------------------------------------------------- /docs/articles/pkgdown/basic_files/figure-html/plot-hostility-term-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/docs/articles/pkgdown/basic_files/figure-html/plot-hostility-term-1.png -------------------------------------------------------------------------------- /docs/articles/pkgdown/basic_files/figure-html/plot-hostility-trend-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/docs/articles/pkgdown/basic_files/figure-html/plot-hostility-trend-1.png -------------------------------------------------------------------------------- /docs/articles/pkgdown/basic_files/figure-html/plot-sentiment-dict-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/docs/articles/pkgdown/basic_files/figure-html/plot-sentiment-dict-1.png -------------------------------------------------------------------------------- /docs/articles/pkgdown/basic_files/figure-html/plot-sentiment-term-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/docs/articles/pkgdown/basic_files/figure-html/plot-sentiment-term-1.png -------------------------------------------------------------------------------- /docs/articles/pkgdown/basic_files/figure-html/plot-sentiment-trend-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/docs/articles/pkgdown/basic_files/figure-html/plot-sentiment-trend-1.png -------------------------------------------------------------------------------- /docs/articles/pkgdown/basic_files/figure-html/plot-sentimnet-emoji-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/docs/articles/pkgdown/basic_files/figure-html/plot-sentimnet-emoji-1.png -------------------------------------------------------------------------------- /docs/articles/pkgdown/basic_files/figure-html/plot-sentimnet-random-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/docs/articles/pkgdown/basic_files/figure-html/plot-sentimnet-random-1.png -------------------------------------------------------------------------------- /docs/articles/pkgdown/introduction_files/figure-html/plot-hostility-term-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/docs/articles/pkgdown/introduction_files/figure-html/plot-hostility-term-1.png -------------------------------------------------------------------------------- /docs/articles/pkgdown/introduction_files/figure-html/plot-hostility-trend-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/docs/articles/pkgdown/introduction_files/figure-html/plot-hostility-trend-1.png -------------------------------------------------------------------------------- /docs/articles/pkgdown/introduction_files/figure-html/plot-sentiment-term-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/docs/articles/pkgdown/introduction_files/figure-html/plot-sentiment-term-1.png -------------------------------------------------------------------------------- /docs/articles/pkgdown/introduction_files/figure-html/plot-sentiment-trend-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/docs/articles/pkgdown/introduction_files/figure-html/plot-sentiment-trend-1.png -------------------------------------------------------------------------------- /docs/articles/pkgdown/introduction_files/figure-html/plot-sentimnet-emoji-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/docs/articles/pkgdown/introduction_files/figure-html/plot-sentimnet-emoji-1.png -------------------------------------------------------------------------------- /docs/articles/pkgdown/research_files/figure-html/plot-term-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/docs/articles/pkgdown/research_files/figure-html/plot-term-1.png -------------------------------------------------------------------------------- /docs/articles/pkgdown/research_files/figure-html/plot-trend-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/docs/articles/pkgdown/research_files/figure-html/plot-trend-1.png -------------------------------------------------------------------------------- /docs/authors.html: -------------------------------------------------------------------------------- 1 | 2 | Authors and Citation • LSX 12 | Skip to contents 13 | 14 | 15 |
[docs/authors.html, continued: rendered pkgdown "Authors and Citation" page with its markup stripped. Authors: Kohei Watanabe (author, maintainer, copyright holder). Citation (source: DESCRIPTION): Watanabe K (2024). LSX: Semi-Supervised Algorithm for Document Scaling. R package version 1.4.1, https://koheiw.github.io/LSX/. BibTeX: @Manual{, title = {LSX: Semi-Supervised Algorithm for Document Scaling}, author = {Kohei Watanabe}, year = {2024}, note = {R package version 1.4.1}, url = {https://koheiw.github.io/LSX/}, }.]
98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /docs/deps/data-deps.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /docs/link.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /docs/pkgdown.js: -------------------------------------------------------------------------------- 1 | /* http://gregfranko.com/blog/jquery-best-practices/ */ 2 | (function($) { 3 | $(function() { 4 | 5 | $('nav.navbar').headroom(); 6 | 7 | Toc.init({ 8 | $nav: $("#toc"), 9 | $scope: $("main h2, main h3, main h4, main h5, main h6") 10 | }); 11 | 12 | if ($('#toc').length) { 13 | $('body').scrollspy({ 14 | target: '#toc', 15 | offset: $("nav.navbar").outerHeight() + 1 16 | }); 17 | } 18 | 19 | // Activate popovers 20 | $('[data-bs-toggle="popover"]').popover({ 21 | container: 'body', 22 | html: true, 23 | trigger: 'focus', 24 | placement: "top", 25 | sanitize: false, 26 | }); 27 | 28 | $('[data-bs-toggle="tooltip"]').tooltip(); 29 | 30 | /* Clipboard --------------------------*/ 31 | 32 | function changeTooltipMessage(element, msg) { 33 | var tooltipOriginalTitle=element.getAttribute('data-bs-original-title'); 34 | element.setAttribute('data-bs-original-title', msg); 35 | $(element).tooltip('show'); 36 | element.setAttribute('data-bs-original-title', tooltipOriginalTitle); 37 | } 38 | 39 | if(ClipboardJS.isSupported()) { 40 | $(document).ready(function() { 41 | var copyButton = ""; 42 | 43 | $("div.sourceCode").addClass("hasCopyButton"); 44 | 45 | // Insert copy buttons: 46 | $(copyButton).prependTo(".hasCopyButton"); 47 | 48 | // Initialize tooltips: 49 | $('.btn-copy-ex').tooltip({container: 'body'}); 50 | 51 | // Initialize clipboard: 52 | var clipboard = new ClipboardJS('[data-clipboard-copy]', { 53 | text: function(trigger) { 54 | return trigger.parentNode.textContent.replace(/\n#>[^\n]*/g, ""); 55 | } 56 | }); 57 | 58 | clipboard.on('success', function(e) { 59 | changeTooltipMessage(e.trigger, 'Copied!'); 60 | e.clearSelection(); 61 | }); 62 | 63 | clipboard.on('error', function(e) { 64 | changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); 65 | }); 66 | 67 | }); 68 | } 69 | 70 | /* Search marking --------------------------*/ 71 | var url = new URL(window.location.href); 72 | var toMark = url.searchParams.get("q"); 73 | var mark = new Mark("main#main"); 74 | if (toMark) { 75 | mark.mark(toMark, { 76 | accuracy: { 77 | value: "complementary", 78 | limiters: [",", ".", ":", "/"], 79 | } 80 | }); 81 | } 82 | 83 | /* Search --------------------------*/ 84 | /* Adapted from https://github.com/rstudio/bookdown/blob/2d692ba4b61f1e466c92e78fd712b0ab08c11d31/inst/resources/bs4_book/bs4_book.js#L25 */ 85 | // Initialise search index on focus 86 | var fuse; 87 | $("#search-input").focus(async function(e) { 88 | if (fuse) { 89 | return; 90 | } 91 | 92 | $(e.target).addClass("loading"); 93 | var response = await fetch($("#search-input").data("search-index")); 94 | var data = await response.json(); 95 | 96 | var options = { 97 | keys: ["what", "text", "code"], 98 | ignoreLocation: true, 99 | threshold: 0.1, 100 | includeMatches: true, 101 | includeScore: true, 102 | }; 103 | fuse = new Fuse(data, options); 104 | 105 | 
$(e.target).removeClass("loading"); 106 | }); 107 | 108 | // Use algolia autocomplete 109 | var options = { 110 | autoselect: true, 111 | debug: true, 112 | hint: false, 113 | minLength: 2, 114 | }; 115 | var q; 116 | async function searchFuse(query, callback) { 117 | await fuse; 118 | 119 | var items; 120 | if (!fuse) { 121 | items = []; 122 | } else { 123 | q = query; 124 | var results = fuse.search(query, { limit: 20 }); 125 | items = results 126 | .filter((x) => x.score <= 0.75) 127 | .map((x) => x.item); 128 | if (items.length === 0) { 129 | items = [{dir:"Sorry 😿",previous_headings:"",title:"No results found.",what:"No results found.",path:window.location.href}]; 130 | } 131 | } 132 | callback(items); 133 | } 134 | $("#search-input").autocomplete(options, [ 135 | { 136 | name: "content", 137 | source: searchFuse, 138 | templates: { 139 | suggestion: (s) => { 140 | if (s.title == s.what) { 141 | return `${s.dir} >
${s.title}
`; 142 | } else if (s.previous_headings == "") { 143 | return `${s.dir} >
${s.title}
> ${s.what}`; 144 | } else { 145 | return `${s.dir} >
${s.title}
> ${s.previous_headings} > ${s.what}`; 146 | } 147 | }, 148 | }, 149 | }, 150 | ]).on('autocomplete:selected', function(event, s) { 151 | window.location.href = s.path + "?q=" + q + "#" + s.id; 152 | }); 153 | }); 154 | })(window.jQuery || window.$) 155 | 156 | 157 | -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | pandoc: 3.1.11 2 | pkgdown: 2.0.9 3 | pkgdown_sha: ~ 4 | articles: 5 | basic: pkgdown/basic.html 6 | research: pkgdown/research.html 7 | seedwords: pkgdown/seedwords.html 8 | last_built: 2024-06-17T03:28Z 9 | 10 | -------------------------------------------------------------------------------- /docs/reference/Rplot001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/docs/reference/Rplot001.png -------------------------------------------------------------------------------- /docs/reference/as.coefficients_textmodel.html: -------------------------------------------------------------------------------- 1 | 2 | Coerce various objects to coefficients_textmodel This is a helper function used in summary.textmodel_*. — as.coefficients_textmodel • LSX 14 | Skip to contents 15 | 16 | 17 |
[docs/reference/as.coefficients_textmodel.html, continued: rendered pkgdown reference page with its markup stripped; it mirrors man/as.coefficients_textmodel.Rd (usage: as.coefficients_textmodel(x); argument x: an object to be coerced).]
94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /docs/reference/as.statistics_textmodel.html: -------------------------------------------------------------------------------- 1 | 2 | Coerce various objects to statistics_textmodel — as.statistics_textmodel • LSX 12 | Skip to contents 13 | 14 | 15 |
[docs/reference/as.statistics_textmodel.html, continued: rendered pkgdown reference page with its markup stripped; it mirrors man/as.statistics_textmodel.Rd (usage: as.statistics_textmodel(x); argument x: an object to be coerced).]
91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /docs/reference/as.summary.textmodel.html: -------------------------------------------------------------------------------- 1 | 2 | Assign the summary.textmodel class to a list — as.summary.textmodel • LSX 12 | Skip to contents 13 | 14 | 15 |
[docs/reference/as.summary.textmodel.html, continued: rendered pkgdown reference page with its markup stripped; it mirrors man/as.summary.textmodel.Rd (usage: as.summary.textmodel(x); argument x: a named list).]
91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /docs/reference/cohesion.html: -------------------------------------------------------------------------------- 1 | 2 | Computes cohesion of components of latent semantic analysis — cohesion • LSX 12 | Skip to contents 13 | 14 | 15 |
[docs/reference/cohesion.html, continued: rendered pkgdown reference page with its markup stripped; it mirrors man/cohesion.Rd (usage: cohesion(x, bandwidth = 10); arguments x: a fitted textmodel_lss, bandwidth: size of window for smoothing).]
95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /docs/reference/data_dictionary_ideology.html: -------------------------------------------------------------------------------- 1 | 2 | Seed words for analysis of left-right political ideology — data_dictionary_ideology • LSX 12 | Skip to contents 13 | 14 | 15 |
[docs/reference/data_dictionary_ideology.html, continued: rendered pkgdown reference page with its markup stripped; it mirrors man/data_dictionary_ideology.Rd and shows the example output of as.seedwords(data_dictionary_ideology): deficit, austerity, unstable, recession, inflation, currency and workforce score 1; poor, poverty, free, benefits, prices, money and workers score -1.]
89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /docs/reference/diagnosys.html: -------------------------------------------------------------------------------- 1 | 2 | Identify noisy documents in a corpus — diagnosys • LSX 12 | Skip to contents 13 | 14 | 15 |
[docs/reference/diagnosys.html, continued: rendered pkgdown reference page with its markup stripped; it mirrors man/diagnosys.Rd (usage: diagnosys(x, ...); arguments x: character or corpus object whose texts will be diagnosed, ...: extra arguments passed to tokens).]
95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /docs/reference/textplot_simil.html: -------------------------------------------------------------------------------- 1 | 2 | Plot similarity between seed words — textplot_simil • LSX 12 | Skip to contents 13 | 14 | 15 |
[docs/reference/textplot_simil.html, continued: rendered pkgdown reference page with its markup stripped; it mirrors man/textplot_simil.Rd (usage: textplot_simil(x); argument x: a fitted textmodel_lss object).]
91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /docs/reference/weight_seeds.html: -------------------------------------------------------------------------------- 1 | 2 | Internal function to generate equally-weighted seed set — weight_seeds • LSX 12 | Skip to contents 13 | 14 | 15 |
[docs/reference/weight_seeds.html, continued: rendered pkgdown reference page with its markup stripped; it mirrors man/weight_seeds.Rd (usage: weight_seeds(seeds, type); internal function to generate an equally-weighted seed set).]
84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /docs/sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | /404.html 5 | 6 | 7 | /articles/index.html 8 | 9 | 10 | /articles/pkgdown/application.html 11 | 12 | 13 | /articles/pkgdown/basic.html 14 | 15 | 16 | /articles/pkgdown/introduction.html 17 | 18 | 19 | /articles/pkgdown/research.html 20 | 21 | 22 | /articles/pkgdown/seedwords.html 23 | 24 | 25 | /authors.html 26 | 27 | 28 | /index.html 29 | 30 | 31 | /news/index.html 32 | 33 | 34 | /reference/as.coefficients_textmodel.html 35 | 36 | 37 | /reference/as.seedwords.html 38 | 39 | 40 | /reference/as.statistics_textmodel.html 41 | 42 | 43 | /reference/as.summary.textmodel.html 44 | 45 | 46 | /reference/as.textmodel_lss.html 47 | 48 | 49 | /reference/bootstrap_lss.html 50 | 51 | 52 | /reference/coef.textmodel_lss.html 53 | 54 | 55 | /reference/cohesion.html 56 | 57 | 58 | /reference/data_dictionary_ideology.html 59 | 60 | 61 | /reference/data_dictionary_sentiment.html 62 | 63 | 64 | /reference/data_textmodel_lss_russianprotests.html 65 | 66 | 67 | /reference/diagnosys.html 68 | 69 | 70 | /reference/index.html 71 | 72 | 73 | /reference/optimize_lss.html 74 | 75 | 76 | /reference/predict.textmodel_lss.html 77 | 78 | 79 | /reference/print.coefficients_textmodel.html 80 | 81 | 82 | /reference/print.statistics_textmodel.html 83 | 84 | 85 | /reference/print.summary.textmodel.html 86 | 87 | 88 | /reference/seedwords.html 89 | 90 | 91 | /reference/smooth_lss.html 92 | 93 | 94 | /reference/textmodel_lss.html 95 | 96 | 97 | /reference/textplot_components.html 98 | 99 | 100 | /reference/textplot_simil.html 101 | 102 | 103 | /reference/textplot_terms.html 104 | 105 | 106 | /reference/textstat_context.html 107 | 108 | 109 | /reference/weight_seeds.html 110 | 111 | 112 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | bibentry( 2 | bibtype = "Article", 3 | title = "Latent Semantic Scaling: A Semisupervised Text Analysis Technique for New Domains and Languages", 4 | author = person("Kohei", "Watanabe"), 5 | journal = "Communication Methods and Measures", 6 | year = "2020", 7 | volume = "15", 8 | number = "2", 9 | pages = "81-102", 10 | doi = "10.1080/19312458.2020.1832976" 11 | ) 12 | -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | ACM 2 | Atsushi 3 | Brändle 4 | COVID 5 | DFM 6 | Eisele 7 | Elad 8 | Github 9 | ITAR 10 | Jaroslaw 11 | Kantorowicz 12 | Kentaro 13 | LSS 14 | Lankina 15 | Littman 16 | Nakamura 17 | Pre 18 | Preperation 19 | Rauh 20 | Segev 21 | Semisupervised 22 | Syst 23 | TASS’ 24 | TASS’s 25 | Tago 26 | Tomila 27 | Trubowitz 28 | Turney 29 | Umansky 30 | Verena 31 | Voters’ 32 | Vydra 33 | Zhou 34 | Zollinger 35 | al 36 | ceiled 37 | codecov 38 | cooccurrence 39 | dfm 40 | diagnosys 41 | doi 42 | embeddings 43 | et 44 | executives’ 45 | fcm 46 | finland 47 | irlba 48 | locfit 49 | lss 50 | nato 51 | neighbouring 52 | newsmap 53 | newswires 54 | polarty 55 | pre 56 | quanteda 57 | rescale 58 | rescaling 59 | rsparse 60 | rsvd 61 | word2vec 62 | russia 63 | semisupervised 64 | stringi 65 | sweden 66 | textmodel 67 | textplots 68 | textstats 69 | ukraine 70 | unidimensional 71 | unipolar 72 | valuetype 
73 | wordscore 74 | wordvector 75 | vec 76 | β 77 | 78 | 79 | -------------------------------------------------------------------------------- /man/as.coefficients_textmodel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel-methods.R 3 | \name{as.coefficients_textmodel} 4 | \alias{as.coefficients_textmodel} 5 | \title{Coerce various objects to coefficients_textmodel 6 | This is a helper function used in \verb{summary.textmodel_*}.} 7 | \usage{ 8 | as.coefficients_textmodel(x) 9 | } 10 | \arguments{ 11 | \item{x}{an object to be coerced} 12 | } 13 | \description{ 14 | Coerce various objects to coefficients_textmodel 15 | This is a helper function used in \verb{summary.textmodel_*}. 16 | } 17 | \keyword{internal} 18 | -------------------------------------------------------------------------------- /man/as.seedwords.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{as.seedwords} 4 | \alias{as.seedwords} 5 | \title{Convert a list or a dictionary to seed words} 6 | \usage{ 7 | as.seedwords(x, upper = 1, lower = 2, concatenator = "_") 8 | } 9 | \arguments{ 10 | \item{x}{a list of characters vectors or a \link[quanteda:dictionary]{dictionary} object.} 11 | 12 | \item{upper}{numeric index or key for seed words for higher scores.} 13 | 14 | \item{lower}{numeric index or key for seed words for lower scores.} 15 | 16 | \item{concatenator}{character to replace separators of multi-word seed words.} 17 | } 18 | \value{ 19 | named numeric vector for seed words with polarity scores 20 | } 21 | \description{ 22 | Convert a list or a dictionary to seed words 23 | } 24 | -------------------------------------------------------------------------------- /man/as.statistics_textmodel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel-methods.R 3 | \name{as.statistics_textmodel} 4 | \alias{as.statistics_textmodel} 5 | \title{Coerce various objects to statistics_textmodel} 6 | \usage{ 7 | as.statistics_textmodel(x) 8 | } 9 | \arguments{ 10 | \item{x}{an object to be coerced} 11 | } 12 | \description{ 13 | This is a helper function used in \verb{summary.textmodel_*}. 
14 | } 15 | \keyword{internal} 16 | \keyword{textmodel} 17 | -------------------------------------------------------------------------------- /man/as.summary.textmodel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel-methods.R 3 | \name{as.summary.textmodel} 4 | \alias{as.summary.textmodel} 5 | \title{Assign the summary.textmodel class to a list} 6 | \usage{ 7 | as.summary.textmodel(x) 8 | } 9 | \arguments{ 10 | \item{x}{a named list} 11 | } 12 | \description{ 13 | Assign the summary.textmodel class to a list 14 | } 15 | \keyword{internal} 16 | -------------------------------------------------------------------------------- /man/as.textmodel_lss.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/as.textmodel.R 3 | \name{as.textmodel_lss} 4 | \alias{as.textmodel_lss} 5 | \title{Create a Latent Semantic Scaling model from various objects} 6 | \usage{ 7 | as.textmodel_lss(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{an object from which a new \link{textmodel_lss} object is created. See details.} 11 | 12 | \item{...}{arguments used to create a new object. \code{seeds} must be given 13 | when \code{x} is a dense matrix or a fitted textmodel_lss.} 14 | } 15 | \value{ 16 | a dummy \link{textmodel_lss} object 17 | } 18 | \description{ 19 | Create a new \link{textmodel_lss} object from an existing or a foreign object. 20 | } 21 | \details{ 22 | If \code{x} is a \link{textmodel_lss}, the original word vectors are reused to compute polarity 23 | scores with new seed words. It is also possible to subset the word vectors via \code{slice} 24 | if the model was originally trained using SVD. 25 | 26 | If \code{x} is a dense matrix, it is treated as a matrix of column-oriented word vectors with which 27 | the polarity of words is computed. If \code{x} is a named numeric vector, the values are treated 28 | as polarity scores of the words in the names. 29 | 30 | If \code{x} is a normalized \link[wordvector:textmodel_word2vec]{wordvector::textmodel_word2vec}, it returns a spatial model; 31 | if not normalized, a probabilistic model. While the polarity scores of words are 32 | their cosine similarity to seed words in spatial models, in probabilistic models they are 33 | the predicted probabilities that the seed words occur in their proximity. 34 | } 35 | -------------------------------------------------------------------------------- /man/bootstrap_lss.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bootstrap.R 3 | \name{bootstrap_lss} 4 | \alias{bootstrap_lss} 5 | \title{[experimental] Compute polarity scores with different hyper-parameters} 6 | \usage{ 7 | bootstrap_lss( 8 | x, 9 | what = c("seeds", "k"), 10 | mode = c("terms", "coef", "predict"), 11 | remove = FALSE, 12 | from = 100, 13 | to = NULL, 14 | by = 50, 15 | verbose = FALSE, 16 | ... 17 | ) 18 | } 19 | \arguments{ 20 | \item{x}{a fitted textmodel_lss object.} 21 | 22 | \item{what}{choose the hyper-parameter to resample in bootstrapping.} 23 | 24 | \item{mode}{choose the type of the result of bootstrapping.
If \code{coef}, 25 | returns the polarity scores of words; if \code{terms}, returns words sorted by 26 | the polarity scores in descending order; if \code{predict}, returns the polarity 27 | scores of documents.} 28 | 29 | \item{remove}{if \code{TRUE}, remove each seed word when \code{what = "seeds"}.} 30 | 31 | \item{from, to, by}{passed to \code{seq()} to generate values for \code{k}; only used 32 | when \code{what = "k"}.} 33 | 34 | \item{verbose}{show messages if \code{TRUE}.} 35 | 36 | \item{...}{additional arguments passed to \code{\link[=as.textmodel_lss]{as.textmodel_lss()}} and 37 | \code{\link[=predict]{predict()}}.} 38 | } 39 | \description{ 40 | A function to compute polarity scores of words and documents by resampling 41 | hyper-parameters from a fitted LSS model. 42 | } 43 | \details{ 44 | \code{bootstrap_lss()} creates fitted textmodel_lss objects internally by 45 | resampling hyper-parameters and computes the polarity of words or documents. 46 | The resulting matrix can be used to assess the validity and the reliability 47 | of seeds or k. 48 | 49 | Note that the objects created by \code{\link[=as.textmodel_lss]{as.textmodel_lss()}} do not contain data, so users 50 | must pass \code{newdata} via \code{...} when \code{mode = "predict"}. 51 | } 52 | -------------------------------------------------------------------------------- /man/coef.textmodel_lss.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel.R 3 | \name{coef.textmodel_lss} 4 | \alias{coef.textmodel_lss} 5 | \alias{coefficients.textmodel_lss} 6 | \title{Extract model coefficients from a fitted textmodel_lss object} 7 | \usage{ 8 | \method{coef}{textmodel_lss}(object, ...) 9 | 10 | coefficients.textmodel_lss(object, ...) 11 | } 12 | \arguments{ 13 | \item{object}{a fitted \link{textmodel_lss} object.} 14 | 15 | \item{...}{not used.} 16 | } 17 | \description{ 18 | \code{coef()} extracts model coefficients from a fitted \code{textmodel_lss} 19 | object. \code{coefficients()} is an alias.
20 | } 21 | -------------------------------------------------------------------------------- /man/cohesion.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{cohesion} 4 | \alias{cohesion} 5 | \title{Computes cohesion of components of latent semantic analysis} 6 | \usage{ 7 | cohesion(x, bandwidth = 10) 8 | } 9 | \arguments{ 10 | \item{x}{a fitted \code{textmodel_lss}} 11 | 12 | \item{bandwidth}{size of window for smoothing} 13 | } 14 | \description{ 15 | Computes cohesion of components of latent semantic analysis 16 | } 17 | \keyword{internal} 18 | -------------------------------------------------------------------------------- /man/data_dictionary_ideology.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{data_dictionary_ideology} 5 | \alias{data_dictionary_ideology} 6 | \title{Seed words for analysis of left-right political ideology} 7 | \description{ 8 | Seed words for analysis of left-right political ideology 9 | } 10 | \examples{ 11 | as.seedwords(data_dictionary_ideology) 12 | } 13 | -------------------------------------------------------------------------------- /man/data_dictionary_sentiment.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{data_dictionary_sentiment} 5 | \alias{data_dictionary_sentiment} 6 | \title{Seed words for analysis of positive-negative sentiment} 7 | \description{ 8 | Seed words for analysis of positive-negative sentiment 9 | } 10 | \examples{ 11 | as.seedwords(data_dictionary_sentiment) 12 | } 13 | \references{ 14 | Turney, P. D., & Littman, M. L. (2003). Measuring Praise and 15 | Criticism: Inference of Semantic Orientation from Association. ACM Trans. 16 | Inf. Syst., 21(4), 315–346. \doi{10.1145/944012.944013} 17 | } 18 | -------------------------------------------------------------------------------- /man/data_textmodel_lss_russianprotests.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{data_textmodel_lss_russianprotests} 5 | \alias{data_textmodel_lss_russianprotests} 6 | \title{A fitted LSS model on street protest in Russia} 7 | \description{ 8 | This model was trained on a Russian media corpus (newspapers, TV transcripts 9 | and newswires) to analyze framing of street protests. The scale is protests 10 | as "freedom of expression" (high) vs "social disorder" (low). Although some 11 | slots are missing in this object (because the model was imported from the 12 | original Python implementation), it allows you to scale texts using 13 | \code{predict}. 14 | } 15 | \references{ 16 | Lankina, Tomila, and Kohei Watanabe. “'Russian Spring' or 'Spring 17 | Betrayal'? The Media as a Mirror of Putin's Evolving Strategy in Ukraine.” 18 | Europe-Asia Studies 69, no. 10 (2017): 1526–56. 19 | \doi{10.1080/09668136.2017.1397603}. 
20 | } 21 | \keyword{data} 22 | -------------------------------------------------------------------------------- /man/diagnosys.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{diagnosys} 4 | \alias{diagnosys} 5 | \title{Identify noisy documents in a corpus} 6 | \usage{ 7 | diagnosys(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{character or \code{\link[quanteda:corpus]{quanteda::corpus()}} object whose texts will be diagnosed.} 11 | 12 | \item{...}{extra arguments passed to \code{\link[quanteda:tokens]{quanteda::tokens()}}.} 13 | } 14 | \description{ 15 | Identify noisy documents in a corpus 16 | } 17 | \keyword{internal} 18 | -------------------------------------------------------------------------------- /man/optimize_lss.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bootstrap.R 3 | \name{optimize_lss} 4 | \alias{optimize_lss} 5 | \title{[experimental] Compute variance ratios with different hyper-parameters} 6 | \usage{ 7 | optimize_lss(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{a fitted textmodel_lss object.} 11 | 12 | \item{...}{additional arguments passed to \link{bootstrap_lss}.} 13 | } 14 | \description{ 15 | [experimental] Compute variance ratios with different hyper-parameters 16 | } 17 | \details{ 18 | \code{optimize_lss()} computes variance ratios with different values of 19 | hyper-parameters using \link{bootstrap_lss}. The variance ratio \eqn{v} is defined 20 | as \deqn{v = \sigma^2_{documents} / \sigma^2_{words}.} It is maximized 21 | when the model best distinguishes between the documents on the latent scale. 22 | } 23 | \examples{ 24 | \dontrun{ 25 | # group the dfm so that the unit of analysis is documents, not sentences 26 | dfmt_grp <- dfm_group(dfmt) 27 | 28 | # choose the best k 29 | v1 <- optimize_lss(lss, what = "k", from = 50, 30 | newdata = dfmt_grp, verbose = TRUE) 31 | plot(names(v1), v1) 32 | 33 | # find bad seed words 34 | v2 <- optimize_lss(lss, what = "seeds", remove = TRUE, 35 | newdata = dfmt_grp, verbose = TRUE) 36 | barplot(v2, las = 2) 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /man/predict.textmodel_lss.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/predict.R 3 | \name{predict.textmodel_lss} 4 | \alias{predict.textmodel_lss} 5 | \title{Prediction method for textmodel_lss} 6 | \usage{ 7 | \method{predict}{textmodel_lss}( 8 | object, 9 | newdata = NULL, 10 | se_fit = FALSE, 11 | density = FALSE, 12 | rescale = TRUE, 13 | cut = NULL, 14 | min_n = 0L, 15 | ... 16 | ) 17 | } 18 | \arguments{ 19 | \item{object}{a fitted LSS textmodel.} 20 | 21 | \item{newdata}{a dfm on which prediction should be made.} 22 | 23 | \item{se_fit}{if \code{TRUE}, returns standard errors of document scores.} 24 | 25 | \item{density}{if \code{TRUE}, returns the frequency of polarity words in documents.} 26 | 27 | \item{rescale}{if \code{TRUE}, normalizes polarity scores using \code{scale()}.} 28 | 29 | \item{cut}{a vector of one or two percentile values to dichotomize polarity 30 | scores of words.
When two values are given, words between them receive zero 31 | polarity.} 32 | 33 | \item{min_n}{set the minimum number of polarity words in documents.} 34 | 35 | \item{...}{not used} 36 | } 37 | \description{ 38 | Prediction method for textmodel_lss 39 | } 40 | \details{ 41 | Polarity scores of documents are the means of the polarity scores of 42 | words weighted by their frequency. When \code{se_fit = TRUE}, this function 43 | returns the weighted means, their standard errors, and the number of 44 | polarity words in the documents. When \code{rescale = TRUE}, it converts the raw 45 | polarity scores to z scores for easier interpretation. When \code{rescale = FALSE} and \code{cut} is used, polarity scores of documents are bounded by 46 | [-1.0, 1.0]. 47 | 48 | Documents tend to receive extreme polarity scores when they have only a few 49 | polarity words. This is problematic when LSS is applied to short documents 50 | (e.g. social media posts) or individual sentences, but users can alleviate 51 | this problem by adding zero-polarity words to short documents using 52 | \code{min_n}. This setting does not affect empty documents. 53 | } 54 | -------------------------------------------------------------------------------- /man/print.coefficients_textmodel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel-methods.R 3 | \name{print.coefficients_textmodel} 4 | \alias{print.coefficients_textmodel} 5 | \title{Print methods for textmodel feature estimates 6 | This is a helper function used in \code{print.summary.textmodel}.} 7 | \usage{ 8 | \method{print}{coefficients_textmodel}(x, digits = max(3L, getOption("digits") - 3L), ...) 9 | } 10 | \arguments{ 11 | \item{x}{a coefficients_textmodel object} 12 | 13 | \item{digits}{minimal number of \emph{significant digits}, see 14 | \code{\link[=print.default]{print.default()}}} 15 | 16 | \item{...}{additional arguments not used} 17 | } 18 | \description{ 19 | Print methods for textmodel feature estimates 20 | This is a helper function used in \code{print.summary.textmodel}. 21 | } 22 | \keyword{internal} 23 | \keyword{textmodel} 24 | -------------------------------------------------------------------------------- /man/print.statistics_textmodel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel-methods.R 3 | \name{print.statistics_textmodel} 4 | \alias{print.statistics_textmodel} 5 | \title{Implements print methods for textmodel_statistics} 6 | \usage{ 7 | \method{print}{statistics_textmodel}(x, digits = max(3L, getOption("digits") - 3L), ...)
8 | } 9 | \arguments{ 10 | \item{x}{a textmodel_wordscore_statistics object} 11 | 12 | \item{digits}{minimal number of \emph{significant digits}, see 13 | \code{\link[=print.default]{print.default()}}} 14 | 15 | \item{...}{further arguments passed to or from other methods} 16 | } 17 | \description{ 18 | Implements print methods for textmodel_statistics 19 | } 20 | \keyword{internal} 21 | \keyword{textmodel} 22 | -------------------------------------------------------------------------------- /man/print.summary.textmodel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel-methods.R 3 | \name{print.summary.textmodel} 4 | \alias{print.summary.textmodel} 5 | \title{print method for summary.textmodel} 6 | \usage{ 7 | \method{print}{summary.textmodel}(x, digits = max(3L, getOption("digits") - 3L), ...) 8 | } 9 | \arguments{ 10 | \item{x}{a \code{summary.textmodel} object} 11 | 12 | \item{digits}{minimal number of \emph{significant digits}, see 13 | \code{\link[=print.default]{print.default()}}} 14 | 15 | \item{...}{additional arguments not used} 16 | } 17 | \description{ 18 | print method for summary.textmodel 19 | } 20 | \keyword{internal} 21 | \keyword{textmodel} 22 | -------------------------------------------------------------------------------- /man/seedwords.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{seedwords} 4 | \alias{seedwords} 5 | \title{Seed words for Latent Semantic Analysis} 6 | \usage{ 7 | seedwords(type) 8 | } 9 | \arguments{ 10 | \item{type}{type of seed words currently only for sentiment (\code{sentiment}) 11 | or political ideology (\code{ideology}).} 12 | } 13 | \description{ 14 | Seed words for Latent Semantic Analysis 15 | } 16 | \examples{ 17 | seedwords('sentiment') 18 | } 19 | \references{ 20 | Turney, P. D., & Littman, M. L. (2003). Measuring Praise and 21 | Criticism: Inference of Semantic Orientation from Association. ACM Trans. 22 | Inf. Syst., 21(4), 315–346. \doi{10.1145/944012.944013} 23 | } 24 | -------------------------------------------------------------------------------- /man/smooth_lss.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{smooth_lss} 4 | \alias{smooth_lss} 5 | \title{Smooth predicted polarity scores} 6 | \usage{ 7 | smooth_lss( 8 | x, 9 | lss_var = "fit", 10 | date_var = "date", 11 | span = 0.1, 12 | groups = NULL, 13 | from = NULL, 14 | to = NULL, 15 | by = "day", 16 | engine = c("loess", "locfit"), 17 | ... 
18 | ) 19 | } 20 | \arguments{ 21 | \item{x}{a \link{data.frame} containing polarity scores and dates.} 22 | 23 | \item{lss_var}{the name of the column in \code{x} for polarity scores.} 24 | 25 | \item{date_var}{the name of the column in \code{x} for dates.} 26 | 27 | \item{span}{the level of smoothing.} 28 | 29 | \item{groups}{specify the columns in \code{x} to smooth separately 30 | by the group; the columns must be factor, character or logical.} 31 | 32 | \item{from, to, by}{the range and the interval of the smoothed scores; 33 | passed to \link{seq.Date}.} 34 | 35 | \item{engine}{specifies the function to be used for smoothing.} 36 | 37 | \item{...}{additional arguments passed to the smoothing function.} 38 | } 39 | \description{ 40 | Smooth predicted polarity scores by local polynomial regression. 41 | } 42 | \details{ 43 | Smoothing is performed using \code{\link[stats:loess]{stats::loess()}} or \code{\link[locfit:locfit]{locfit::locfit()}}. 44 | When \code{x} has more than 10,000 rows, it is usually better to choose 45 | the latter by setting \code{engine = "locfit"}. In this case, \code{span} is passed to 46 | \code{locfit::lp(nn = span)}. 47 | } 48 | -------------------------------------------------------------------------------- /man/textmodel_lss.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel.R 3 | \name{textmodel_lss} 4 | \alias{textmodel_lss} 5 | \alias{textmodel_lss.dfm} 6 | \alias{textmodel_lss.fcm} 7 | \title{Fit a Latent Semantic Scaling model} 8 | \usage{ 9 | textmodel_lss(x, ...) 10 | 11 | \method{textmodel_lss}{dfm}( 12 | x, 13 | seeds, 14 | terms = NULL, 15 | k = 300, 16 | slice = NULL, 17 | weight = "count", 18 | cache = FALSE, 19 | simil_method = "cosine", 20 | engine = c("RSpectra", "irlba", "rsvd"), 21 | auto_weight = FALSE, 22 | include_data = FALSE, 23 | group_data = FALSE, 24 | verbose = FALSE, 25 | ... 26 | ) 27 | 28 | \method{textmodel_lss}{fcm}( 29 | x, 30 | seeds, 31 | terms = NULL, 32 | w = 50, 33 | max_count = 10, 34 | weight = "count", 35 | cache = FALSE, 36 | simil_method = "cosine", 37 | engine = c("rsparse"), 38 | auto_weight = FALSE, 39 | verbose = FALSE, 40 | ... 41 | ) 42 | } 43 | \arguments{ 44 | \item{x}{a dfm or fcm created by \code{\link[quanteda:dfm]{quanteda::dfm()}} or \code{\link[quanteda:fcm]{quanteda::fcm()}}} 45 | 46 | \item{...}{additional arguments passed to the underlying engine.} 47 | 48 | \item{seeds}{a character vector or named numeric vector that contains seed 49 | words. If seed words contain "*", they are interpreted as glob patterns. 50 | See \link[quanteda:valuetype]{quanteda::valuetype}.} 51 | 52 | \item{terms}{a character vector or named numeric vector that specifies words 53 | for which polarity scores will be computed; if a numeric vector, words' polarity 54 | scores will be weighted accordingly; if \code{NULL}, all the features of 55 | \code{\link[quanteda:dfm]{quanteda::dfm()}} or \code{\link[quanteda:fcm]{quanteda::fcm()}} will be used.} 56 | 57 | \item{k}{the number of singular values requested from the SVD engine. Only used 58 | when \code{x} is a \code{dfm}.} 59 | 60 | \item{slice}{a number or indices of the components of word vectors used to 61 | compute similarity; \code{slice < k} to further truncate word vectors; useful 62 | for diagnosis and simulation.} 63 | 64 | \item{weight}{weighting scheme passed to \code{\link[quanteda:dfm_weight]{quanteda::dfm_weight()}}.
Ignored 65 | when \code{engine} is "rsparse".} 66 | 67 | \item{cache}{if \code{TRUE}, save the result of the SVD for the next execution with identical 68 | \code{x} and settings. Use \code{base::options(lss_cache_dir)} to change the 69 | location where cache files are saved.} 70 | 71 | \item{simil_method}{specifies the method to compute similarity between features. 72 | The value is passed to \code{\link[quanteda.textstats:textstat_simil]{quanteda.textstats::textstat_simil()}}; "cosine" is 73 | used otherwise.} 74 | 75 | \item{engine}{select the engine to factorize \code{x} to generate word vectors. Choose 76 | from \code{\link[RSpectra:svds]{RSpectra::svds()}}, \code{\link[irlba:irlba]{irlba::irlba()}}, \code{\link[rsvd:rsvd]{rsvd::rsvd()}}, and 77 | \code{\link[rsparse:GloVe]{rsparse::GloVe()}}.} 78 | 79 | \item{auto_weight}{automatically determine weights to approximate the 80 | polarity of terms to seed words. Deprecated.} 81 | 82 | \item{include_data}{if \code{TRUE}, the fitted model includes the dfm supplied as \code{x}.} 83 | 84 | \item{group_data}{if \code{TRUE}, apply \code{dfm_group(x)} before saving the dfm.} 85 | 86 | \item{verbose}{show messages if \code{TRUE}.} 87 | 88 | \item{w}{the size of word vectors. Used only when \code{x} is a \code{fcm}.} 89 | 90 | \item{max_count}{passed to \code{x_max} in \code{rsparse::GloVe$new()}, where cooccurrence 91 | counts are ceiled to this threshold. It should be changed according to the 92 | size of the corpus. Used only when \code{x} is a \code{fcm}.} 93 | } 94 | \description{ 95 | Latent Semantic Scaling (LSS) is a word embedding-based semisupervised algorithm 96 | for document scaling. 97 | } 98 | \details{ 99 | Latent Semantic Scaling (LSS) is a semisupervised document scaling 100 | method. \code{textmodel_lss()} constructs word vectors from user-provided 101 | documents (\code{x}) and weights words (\code{terms}) based on their semantic 102 | proximity to seed words (\code{seeds}). Seed words are any known polarity words 103 | (e.g. sentiment words) that users should manually choose. The required 104 | number of seed words is usually 5 to 10 for each end of the scale. 105 | 106 | If \code{seeds} is a named numeric vector with positive and negative values, a 107 | bipolar LSS model is constructed; if \code{seeds} is a character vector, a 108 | unipolar LSS model. Usually bipolar models perform better in document 109 | scaling because both ends of the scale are defined by the user. 110 | 111 | A seed word's polarity score computed by \code{textmodel_lss()} tends to diverge 112 | from its original score given by the user because its score is affected 113 | not only by its original score but also by the original scores of all other 114 | seed words. If \code{auto_weight = TRUE}, the original scores are weighted 115 | automatically using \code{\link[stats:optim]{stats::optim()}} to minimize the squared difference 116 | between seed words' computed and original scores. Weighted scores are saved 117 | in \code{seed_weighted} in the object. 118 | 119 | Please visit the \href{https://koheiw.github.io/LSX/}{package website} for examples. 120 | } 121 | \references{ 122 | Watanabe, Kohei. 2020. "Latent Semantic Scaling: A Semisupervised 123 | Text Analysis Technique for New Domains and Languages", Communication 124 | Methods and Measures. \doi{10.1080/19312458.2020.1832976}. 125 | 126 | Watanabe, Kohei. 2017. "Measuring News Bias: Russia's Official News Agency 127 | ITAR-TASS' Coverage of the Ukraine Crisis", European Journal of 128 | Communication.
\doi{10.1177/0267323117695735}. 129 | } 130 | -------------------------------------------------------------------------------- /man/textplot_components.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textplot.R 3 | \name{textplot_components} 4 | \alias{textplot_components} 5 | \title{[experimental] Plot clusters of word vectors} 6 | \usage{ 7 | textplot_components( 8 | x, 9 | n = 5, 10 | method = "ward.D2", 11 | scale = c("absolute", "relative") 12 | ) 13 | } 14 | \arguments{ 15 | \item{x}{a fitted \code{textmodel_lss}.} 16 | 17 | \item{n}{the number of cluster.} 18 | 19 | \item{method}{the method for hierarchical clustering.} 20 | 21 | \item{scale}{change the scale of y-axis.} 22 | } 23 | \description{ 24 | Experimental function to find clusters of word vectors 25 | } 26 | \keyword{internal} 27 | -------------------------------------------------------------------------------- /man/textplot_simil.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textplot.R 3 | \name{textplot_simil} 4 | \alias{textplot_simil} 5 | \title{Plot similarity between seed words} 6 | \usage{ 7 | textplot_simil(x) 8 | } 9 | \arguments{ 10 | \item{x}{fitted textmodel_lss object.} 11 | } 12 | \description{ 13 | Plot similarity between seed words 14 | } 15 | -------------------------------------------------------------------------------- /man/textplot_terms.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textplot.R 3 | \name{textplot_terms} 4 | \alias{textplot_terms} 5 | \title{Plot polarity scores of words} 6 | \usage{ 7 | textplot_terms( 8 | x, 9 | highlighted = NULL, 10 | max_highlighted = 50, 11 | max_words = 1000, 12 | sampling = c("absolute", "relative"), 13 | ... 14 | ) 15 | } 16 | \arguments{ 17 | \item{x}{a fitted textmodel_lss object.} 18 | 19 | \item{highlighted}{\link[quanteda:pattern]{quanteda::pattern} to select words to highlight. If a 20 | \link[quanteda:dictionary]{quanteda::dictionary} is passed, words in the top-level categories are 21 | highlighted in different colors.} 22 | 23 | \item{max_highlighted}{the maximum number of words to highlight. When 24 | \code{highlighted = NULL}, words are randomly sampled proportionally to 25 | \code{beta ^ 2 * log(frequency)} for highlighting.} 26 | 27 | \item{max_words}{the maximum number of words to plot. Words are randomly 28 | sampled to keep the number below the limit.} 29 | 30 | \item{sampling}{if "relative", words are sampled based on their squared deviation 31 | from the mean for highlighting; if "absolute", they are sampled 32 | based on the squared distance from zero.} 33 | 34 | \item{...}{passed to underlying functions. See the Details.} 35 | } 36 | \description{ 37 | Plot polarity scores of words 38 | } 39 | \details{ 40 | Users can customize the plots through \code{...}, which is 41 | passed to \code{\link[ggplot2:geom_text]{ggplot2::geom_text()}} and \code{\link[ggrepel:geom_text_repel]{ggrepel::geom_text_repel()}}. The 42 | colors are specified internally but users can override the settings by appending 43 | \code{\link[ggplot2:scale_manual]{ggplot2::scale_colour_manual()}} or \code{\link[ggplot2:scale_brewer]{ggplot2::scale_colour_brewer()}}. 
The 44 | legend title can also be modified using \code{\link[ggplot2:labs]{ggplot2::labs()}}. 45 | } 46 | -------------------------------------------------------------------------------- /man/textstat_context.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textstat.R 3 | \name{textstat_context} 4 | \alias{textstat_context} 5 | \alias{char_context} 6 | \title{Identify context words} 7 | \usage{ 8 | textstat_context( 9 | x, 10 | pattern, 11 | valuetype = c("glob", "regex", "fixed"), 12 | case_insensitive = TRUE, 13 | window = 10, 14 | min_count = 10, 15 | remove_pattern = TRUE, 16 | n = 1, 17 | skip = 0, 18 | ... 19 | ) 20 | 21 | char_context( 22 | x, 23 | pattern, 24 | valuetype = c("glob", "regex", "fixed"), 25 | case_insensitive = TRUE, 26 | window = 10, 27 | min_count = 10, 28 | remove_pattern = TRUE, 29 | p = 0.001, 30 | n = 1, 31 | skip = 0 32 | ) 33 | } 34 | \arguments{ 35 | \item{x}{a tokens object created by \code{\link[quanteda:tokens]{quanteda::tokens()}}.} 36 | 37 | \item{pattern}{\code{\link[quanteda:pattern]{quanteda::pattern()}} to specify target words.} 38 | 39 | \item{valuetype}{the type of pattern matching: \code{"glob"} for "glob"-style 40 | wildcard expressions; \code{"regex"} for regular expressions; or \code{"fixed"} for 41 | exact matching. See \code{\link[quanteda:valuetype]{quanteda::valuetype()}} for details.} 42 | 43 | \item{case_insensitive}{if \code{TRUE}, ignore case when matching.} 44 | 45 | \item{window}{size of window for collocation analysis.} 46 | 47 | \item{min_count}{minimum frequency of words within the window to be 48 | considered as collocations.} 49 | 50 | \item{remove_pattern}{if \code{TRUE}, keywords do not contain target words.} 51 | 52 | \item{n}{integer vector specifying the number of elements to be concatenated 53 | in each n-gram. Each element of this vector will define a \eqn{n} in the 54 | \eqn{n}-gram(s) that are produced.} 55 | 56 | \item{skip}{integer vector specifying the adjacency skip size for tokens 57 | forming the n-grams, default is 0 for only immediately neighbouring words. 58 | For \code{skipgrams}, \code{skip} can be a vector of integers, as the 59 | "classic" approach to forming skip-grams is to set skip = \eqn{k} where 60 | \eqn{k} is the distance for which \eqn{k} or fewer skips are used to 61 | construct the \eqn{n}-gram. Thus a "4-skip-n-gram" defined as \code{skip = 0:4} produces results that include 4 skips, 3 skips, 2 skips, 1 skip, and 0 62 | skips (where 0 skips are typical n-grams formed from adjacent words). See 63 | Guthrie et al (2006).} 64 | 65 | \item{...}{additional arguments passed to \code{\link[quanteda.textstats:textstat_keyness]{quanteda.textstats::textstat_keyness()}}.} 66 | 67 | \item{p}{threshold for statistical significance of collocations.} 68 | } 69 | \description{ 70 | Identify context words using user-provided patterns. 
71 | } 72 | \seealso{ 73 | \code{\link[quanteda.textstats:textstat_keyness]{quanteda.textstats::textstat_keyness()}} 74 | } 75 | -------------------------------------------------------------------------------- /man/weight_seeds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textmodel.R 3 | \name{weight_seeds} 4 | \alias{weight_seeds} 5 | \title{Internal function to generate equally-weighted seed set} 6 | \usage{ 7 | weight_seeds(seeds, type) 8 | } 9 | \description{ 10 | Internal function to generate equally-weighted seed set 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /tests/data/data_dictionary_ideology.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/tests/data/data_dictionary_ideology.RData -------------------------------------------------------------------------------- /tests/data/data_dictionary_sentiment.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/tests/data/data_dictionary_sentiment.RData -------------------------------------------------------------------------------- /tests/data/lss_k300.RDS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/tests/data/lss_k300.RDS -------------------------------------------------------------------------------- /tests/data/matrix_k100.RDS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/tests/data/matrix_k100.RDS -------------------------------------------------------------------------------- /tests/data/prediction_v0.93.RDA: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/tests/data/prediction_v0.93.RDA -------------------------------------------------------------------------------- /tests/data/prediction_v0.99.RDA: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/tests/data/prediction_v0.99.RDA -------------------------------------------------------------------------------- /tests/data/save.R: -------------------------------------------------------------------------------- 1 | require(quanteda) 2 | require(wordvector) 3 | 4 | toks <- readRDS("tests/data/tokens.RDS") %>% 5 | tokens_remove(stopwords("en"), min_nchar = 2) %>% 6 | tokens_tolower() 7 | 8 | feat <- head(char_context(toks, "america*", min_count = 1, p = 0.05), 100) 9 | dfmt <- dfm(toks) 10 | seed <- as.seedwords(data_dictionary_sentiment) 11 | 12 | lss <- textmodel_lss(dfmt, seed, terms = feat, k = 300) 13 | saveRDS(lss, "tests/data/lss_k300.RDS") 14 | 15 | wdv <- textmodel_word2vec(head(toks, 10), min_count = 1) 16 | saveRDS(wdv, "tests/data/word2vec.RDS") 17 | 18 | wdv2 <- textmodel_word2vec(head(toks, 10), min_count = 1, normalize = FALSE) 19 | saveRDS(wdv2, "tests/data/word2vec-prob.RDS") 20 | -------------------------------------------------------------------------------- 
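[Editorial sketch, not a repository file: save.R above regenerates the RDS fixtures used by the test suite. The snippet below shows how a model saved this way is typically loaded and applied for document scaling; the file paths are simply the fixture locations assumed by save.R, and the prediction step follows predict.textmodel_lss() as documented in man/predict.textmodel_lss.Rd.]

require(quanteda)
require(LSX)

# hypothetical usage of the fixtures created by save.R
toks <- readRDS("tests/data/tokens.RDS")          # tokenized corpus fixture
lss  <- readRDS("tests/data/lss_k300.RDS")        # fitted textmodel_lss (k = 300)

dfmt <- dfm(tokens_tolower(toks))                 # document-feature matrix to be scored
pred <- predict(lss, newdata = dfmt)              # polarity scores, z-scored by default
head(pred)

Documents that contain none of the model's polarity words receive NA, which is exactly what the assertions in test-as.textmodel.R below check against rowSums() of the dfm.
--------------------------------------------------------------------------------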
/tests/data/tokens.RDS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/tests/data/tokens.RDS -------------------------------------------------------------------------------- /tests/data/word2vec-prob.RDS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/tests/data/word2vec-prob.RDS -------------------------------------------------------------------------------- /tests/data/word2vec.RDS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koheiw/LSX/6e4b64fa51c3b32b384c473b57ea68854eeb4cb9/tests/data/word2vec.RDS -------------------------------------------------------------------------------- /tests/misc/replication.R: -------------------------------------------------------------------------------- 1 | require(quanteda) 2 | require(wordvector) 3 | require(LSX) 4 | require(testthat) 5 | 6 | # original function 7 | as.textmodel_lss0 <- function(x, seeds) { 8 | 9 | prob <- probability(x, names(seeds), "values") 10 | seeds <- seeds[names(seeds) %in% rownames(prob)] 11 | res <- list(model = prob, 12 | seeds = seeds, 13 | beta = rowMeans(prob %*% diag(seeds)), 14 | frequency = x$frequency) 15 | class(res) <- "textmodel_lss" 16 | return(res) 17 | } 18 | 19 | toks <- tokens(data_corpus_news2014) %>% 20 | tokens_remove(stopwords(), min_nchar = 2) %>% 21 | tokens_tolower() 22 | 23 | seed <- as.seedwords(data_dictionary_sentiment) 24 | 25 | wdv <- textmodel_word2vec(toks, dim = 100, type = "skip-gram", normalize = FALSE, 26 | verbose = TRUE) 27 | lss_wdv0 <- as.textmodel_lss0(wdv, seed) 28 | lss_wdv <- as.textmodel_lss(wdv, seed, spatial = FALSE) 29 | 30 | expect_setequal( 31 | names(lss_wdv0$beta), 32 | names(lss_wdv$beta) 33 | ) 34 | 35 | expect_equal( 36 | cor(coef(lss_wdv0), coef(lss_wdv)), 37 | 1.0 38 | ) 39 | 40 | -------------------------------------------------------------------------------- /tests/spelling.R: -------------------------------------------------------------------------------- 1 | if(requireNamespace('spelling', quietly = TRUE)) 2 | spelling::spell_check_test(vignettes = TRUE, error = FALSE, 3 | skip_on_cran = TRUE) 4 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | require(testthat) 2 | require(quanteda) 3 | quanteda_options(threads = 2) 4 | test_check("LSX") 5 | quanteda_options(reset = TRUE) 6 | -------------------------------------------------------------------------------- /tests/testthat/test-as.textmodel.R: -------------------------------------------------------------------------------- 1 | require(quanteda) 2 | 3 | mat_test <- readRDS("../data/matrix_k100.RDS") 4 | toks_test <- readRDS("../data/tokens.RDS") 5 | feat_test <- head(char_context(toks_test, "america*", min_count = 1, p = 0.05), 100) 6 | dfmt_test <- dfm(toks_test) 7 | seed <- as.seedwords(data_dictionary_sentiment) 8 | lss_test <- textmodel_lss(dfmt_test, seed, terms = feat_test, k = 50, 9 | include_data = FALSE) 10 | 11 | test_that("as.textmodel_lss works with matrix", { 12 | 13 | term <- c("decision", "instance", "universal", "foundations", "the") 14 | 15 | # with terms 16 | lss1 <- as.textmodel_lss(mat_test, seed, term) 17 | expect_equal(names(lss1), names(LSX:::build_lss()))
18 | expect_identical(lss1$embedding, mat_test) 19 | expect_false(any(duplicated(names(coef(lss1))))) 20 | pred1 <- predict(lss1, dfmt_test) 21 | expect_equal(names(pred1), rownames(dfmt_test)) 22 | expect_equal(rowSums(dfmt_test[,names(lss1$beta)]) == 0, 23 | is.na(pred1)) 24 | 25 | # without terms 26 | lss2 <- as.textmodel_lss(mat_test, seed) 27 | expect_equal(names(lss2), names(LSX:::build_lss())) 28 | expect_identical(lss2$embedding, mat_test) 29 | expect_false(any(duplicated(names(coef(lss2))))) 30 | pred2 <- predict(lss2, dfmt_test) 31 | expect_equal(names(pred2), rownames(dfmt_test)) 32 | expect_equal(rowSums(dfmt_test[,names(lss2$beta)]) == 0, 33 | is.na(pred2)) 34 | 35 | # with special features 36 | mat_special <- mat_test 37 | colnames(mat_special)[1:2] <- c("", "*") 38 | lss3 <- as.textmodel_lss(mat_special, seed) 39 | expect_equal(sum("" == names(coef(lss3))), 0) 40 | expect_equal(sum("*" == names(coef(lss3))), 1) 41 | 42 | # with slice 43 | lss4 <- as.textmodel_lss(mat_test, seed, slice = 50) 44 | expect_error( 45 | as.textmodel_lss(mat_test, seed, slice = 150), 46 | "The value of slice must be between 1 and 100" 47 | ) 48 | expect_error( 49 | as.textmodel_lss(mat_test, seed, slice = 1:150), 50 | "The length of slice must be between 1 and 100" 51 | ) 52 | expect_identical(coef(lss4), 53 | coef(as.textmodel_lss(mat_test, seed, slice = 1:50))) 54 | expect_identical(lss4$embedding, mat_test) 55 | }) 56 | 57 | test_that("as.textmodel_lss errors with invalid columns", { 58 | seed <- as.seedwords(data_dictionary_sentiment) 59 | mat_nocol <- mat_nacol <- mat_na <- mat_test 60 | colnames(mat_nocol) <- NULL 61 | expect_error(as.textmodel_lss(mat_nocol, seed), 62 | "x must have column names for features") 63 | colnames(mat_nacol)[1] <- NA 64 | expect_error(as.textmodel_lss(mat_nacol, seed), 65 | "x must not have NA in the column names") 66 | mat_na[1,1] <- NA 67 | expect_error(as.textmodel_lss(mat_na, seed), 68 | "x must not have NA") 69 | }) 70 | 71 | test_that("as.textmodel_lss works with textmodel_lss", { 72 | 73 | # with fitted model 74 | lss <- as.textmodel_lss(lss_test, seed, terms = feat_test, slice = 10) 75 | expect_equal(lss$embedding, lss_test$embedding) 76 | expect_identical(lss$data, lss_test$data) 77 | expect_identical(lss$frequency, lss_test$frequency) 78 | expect_identical(lss$concatenator, lss_test$concatenator) 79 | 80 | expect_error( 81 | as.textmodel_lss(lss_test, seed, slice = 100), 82 | "The value of slice must be between 1 and 50" 83 | ) 84 | expect_error( 85 | as.textmodel_lss(lss_test, seed, slice = 1:100), 86 | "The length of slice must be between 1 and 50" 87 | ) 88 | 89 | # with dummy LSS 90 | weight <- c("decision" = 0.1, "instance" = -0.1, 91 | "foundations" = 0.3, "the" = 0) 92 | lss_dummy <- as.textmodel_lss(weight) 93 | expect_error( 94 | as.textmodel_lss(lss_dummy, seed), 95 | "x must be a valid textmodel_lss object" 96 | ) 97 | }) 98 | 99 | test_that("as.textmodel_lss works with textmodel_wordvector", { 100 | 101 | # spatial 102 | wdv <- readRDS("../data/word2vec.RDS") 103 | lss <- as.textmodel_lss(wdv, seed) 104 | 105 | expect_equal(lss$beta_type, "similarity") 106 | expect_equal(lss$embedding, t(wdv$values)) 107 | expect_identical(lss$frequency, wdv$frequency) 108 | expect_identical(names(lss$frequency), names(lss$frequency)) 109 | expect_identical(names(lss$beta), names(lss$frequency)) 110 | 111 | # probabilistic 112 | wdv2 <- readRDS("../data/word2vec-prob.RDS") 113 | lss2 <- as.textmodel_lss(wdv2, seed) 114 | 115 | expect_equal(lss2$beta_type, 
"probability") 116 | expect_true(is.null(lss2$embedding)) 117 | expect_identical(lss2$frequency, wdv2$frequency) 118 | expect_identical(names(lss2$frequency), names(wdv2$frequency)) 119 | expect_identical(names(lss2$beta), names(lss2$frequency)) 120 | 121 | 122 | lss3 <- as.textmodel_lss(wdv2, "good") # single seed 123 | expect_true(is.null(lss3$embedding)) 124 | expect_identical(lss3$frequency, wdv2$frequency) 125 | expect_identical(names(lss3$frequency), names(wdv2$frequency)) 126 | expect_identical(names(lss3$beta), names(lss3$frequency)) 127 | }) 128 | 129 | test_that("as.textmodel_lss works with vector", { 130 | weight <- c("decision" = 0.1, "instance" = -0.1, 131 | "foundations" = 0.3, "the" = 0) 132 | lss <- as.textmodel_lss(weight) 133 | expect_equal(names(lss), names(LSX:::build_lss())) 134 | pred <- predict(lss, dfmt_test) 135 | expect_equal(names(pred), rownames(dfmt_test)) 136 | expect_equal(rowSums(dfmt_test[,names(lss$beta)]) == 0, 137 | is.na(pred)) 138 | }) 139 | 140 | test_that("as.textmodel_lss errors with vector", { 141 | weight <- c("decision" = 0.1, "instance" = -0.1, 142 | "foundations" = 0.3, "the" = 0) 143 | weight_noname <- weight_naname <- weight_na <- weight 144 | names(weight_noname) <- NULL 145 | expect_error(as.textmodel_lss(weight_noname), 146 | "x must have names for features") 147 | names(weight_naname)[1] <- NA 148 | expect_error(as.textmodel_lss(weight_naname), 149 | "x must not have NA in the names") 150 | weight_na[1] <- NA 151 | expect_error(as.textmodel_lss(weight_na), 152 | "x must not have NA") 153 | }) 154 | 155 | test_that("auto_weight is working", { 156 | skip_on_cran() 157 | 158 | lss1 <- as.textmodel_lss(mat_test, seed) 159 | suppressWarnings({ 160 | lss2 <- as.textmodel_lss(mat_test, seed, auto_weight = TRUE) 161 | }) 162 | expect_true( 163 | all(lss1$seeds_weighted != lss2$seeds_weighted) 164 | ) 165 | expect_true( 166 | all(sign(lss1$seeds_weighted) == sign(lss2$seeds_weighted)) 167 | ) 168 | expect_true( 169 | all(abs(lss2$beta[names(lss2$seeds_weighted)] - lss1$seeds_weighted) < 0.05) 170 | ) 171 | expect_warning( 172 | as.textmodel_lss(mat_test, seed, auto_weight = TRUE, verbose = FALSE), 173 | "'auto_weight' is deprecated" 174 | ) 175 | }) 176 | 177 | test_that("terms is working", { 178 | skip_on_cran() 179 | 180 | lss <- textmodel_lss(dfmt_test, seed, k = 50) 181 | 182 | # glob pattern 183 | lss1 <- as.textmodel_lss(lss, seed, terms = "poli*") 184 | expect_equal(sum(stringi::stri_startswith_fixed(names(lss1$beta), "poli")), 11) 185 | expect_identical(names(lss1$beta), names(lss1$frequency)) 186 | 187 | # numeric vector 188 | weight <- sample(1:10, length(lss1$beta), replace = TRUE) / 10 189 | names(weight) <- names(lss1$beta) 190 | lss2 <- as.textmodel_lss(lss, seed, terms = weight) 191 | expect_true(all(lss2$beta == lss1$beta * weight)) 192 | expect_error(as.textmodel_lss(lss, seed, terms = c("polity" = 0.2, "politic" = -0.1)), 193 | "terms must be positive values without NA") 194 | expect_error(as.textmodel_lss(lss, seed, terms = c("polity" = 0.2, "politic" = NA)), 195 | "terms must be positive values without NA") 196 | expect_error(as.textmodel_lss(lss, seed, terms = c(01, 0.2)), 197 | "terms must be named") 198 | 199 | }) 200 | 201 | -------------------------------------------------------------------------------- /tests/testthat/test-bootstrap.R: -------------------------------------------------------------------------------- 1 | require(quanteda) 2 | 3 | toks_test <- readRDS("../data/tokens.RDS") 4 | dfmt_test <- dfm(toks_test) %>% 
5 | dfm_group() 6 | lss_test <- readRDS("../data/lss_k300.RDS") 7 | 8 | test_that("bootstrap_lss works with what = seeds", { 9 | 10 | bs1 <- bootstrap_lss(lss_test, "seeds") 11 | expect_true(is.character(as.vector(bs1))) 12 | expect_equal(class(as.vector(bs1)), "character") 13 | expect_equal(ncol(bs1), 12) 14 | expect_equal(nrow(bs1), length(lss_test$beta)) 15 | expect_equal(attr(bs1, "values"), names(lss_test$seeds_weighted)) 16 | 17 | bs2 <- bootstrap_lss(lss_test, "seeds", remove = TRUE) 18 | expect_true(is.character(as.vector(bs2))) 19 | expect_equal(class(as.vector(bs1)), "character") 20 | expect_equal(ncol(bs2), 12) 21 | expect_equal(nrow(bs2), length(lss_test$beta)) 22 | expect_equal(attr(bs2, "values"), names(lss_test$seeds_weighted)) 23 | 24 | bs3 <- bootstrap_lss(lss_test, mode = "coef") 25 | expect_equal(class(as.vector(bs3)), "numeric") 26 | expect_equal(ncol(bs3), 12) 27 | expect_equal(nrow(bs3), length(lss_test$beta)) 28 | expect_equal(attr(bs3, "values"), names(lss_test$seeds_weighted)) 29 | 30 | bs4 <- bootstrap_lss(lss_test, mode = "predict", newdata = dfmt_test) 31 | expect_equal(class(as.vector(bs4)), "numeric") 32 | expect_equal(ncol(bs4), 12) 33 | expect_equal(attr(bs4, "values"), names(lss_test$seeds_weighted)) 34 | expect_equal(rownames(bs4), docnames(dfmt_test)) 35 | expect_error( 36 | bootstrap_lss(lss_test, mode = "predict", newdata = dfmt_test, se_fit = TRUE), 37 | 'formal argument "se_fit" matched by multiple actual arguments' 38 | ) 39 | }) 40 | 41 | test_that("bootstrap_lss works with remove = TRUE", { 42 | lss_test$terms <- NULL 43 | 44 | bs4 <- bootstrap_lss(lss_test, "seeds") 45 | expect_true(identical(colnames(bs4), unname(bs4[1,]))) 46 | bs5 <- bootstrap_lss(lss_test, "seeds", remove = TRUE) 47 | expect_false(identical(colnames(bs5), unname(bs5[1,]))) 48 | 49 | expect_error(bootstrap_lss(lss_test, what = "seed", remove = NULL), 50 | "remove cannot be NULL") 51 | }) 52 | 53 | test_that("bootstrap_lss with what = k", { 54 | 55 | bs1 <- bootstrap_lss(lss_test, what = "k") 56 | expect_equal(class(as.vector(bs1)), "character") 57 | expect_equal(ncol(bs1), 5) 58 | expect_equal(nrow(bs1), length(lss_test$beta)) 59 | expect_equal(attr(bs1, "values"), seq(100, 300, 50)) 60 | 61 | bs2 <- bootstrap_lss(lss_test, what = "k", by = 10) 62 | expect_equal(ncol(bs2), 21) 63 | expect_equal(attr(bs2, "values"), seq(100, 300, 10)) 64 | 65 | bs3 <- bootstrap_lss(lss_test, what = "k", from = 150, to = 200, by = 10) 66 | expect_equal(ncol(bs3), 6) 67 | expect_equal(attr(bs3, "values"), seq(150, 200, 10)) 68 | 69 | expect_error(bootstrap_lss(lss_test, what = "k", from = 0), 70 | "The value of from must be between 1 and 300") 71 | expect_error(bootstrap_lss(lss_test, what = "k", to = 0), 72 | "The value of to must be between 1 and 300") 73 | expect_error(bootstrap_lss(lss_test, what = "k", by = -1), 74 | "The value of by must be between 1 and 300") 75 | expect_error(bootstrap_lss(lss_test, what = "k", by = 1000), 76 | "The value of by must be between 1 and 300") 77 | }) 78 | 79 | test_that("bootstrap_lss show messages", { 80 | 81 | expect_silent( 82 | bootstrap_lss(lss_test, "k", verbose = FALSE) 83 | ) 84 | expect_output( 85 | bootstrap_lss(lss_test, "seeds", verbose = TRUE), 86 | "Call terms\\(x\\) with different hyper-parameters.*" 87 | ) 88 | expect_output( 89 | bootstrap_lss(lss_test, "k", verbose = TRUE), 90 | "Call terms\\(x\\) with different hyper-parameters.*" 91 | ) 92 | expect_output( 93 | bootstrap_lss(lss_test, "seeds", verbose = TRUE), 94 | 'seeds = "good"' 95 | ) 
96 | expect_output( 97 | bootstrap_lss(lss_test, "seeds", remove = TRUE, verbose = TRUE), 98 | 'seeds != "good"' 99 | ) 100 | }) 101 | 102 | test_that("optimize works", { 103 | 104 | r1 <- optimize_lss(lss_test, newdata = dfmt_test, what = "k") 105 | expect_identical(names(r1), c("100", "150", "200", "250", "300")) 106 | 107 | r2 <- optimize_lss(lss_test, newdata = dfmt_test, what = "seed", remove = TRUE) 108 | expect_identical(names(r2), names(lss_test$seeds_weighted)) 109 | 110 | }) 111 | -------------------------------------------------------------------------------- /tests/testthat/test-textplot.R: -------------------------------------------------------------------------------- 1 | 2 | require(quanteda) 3 | toks_test <- readRDS("../data/tokens.RDS") 4 | toks_test <- tokens_remove(toks_test, stopwords()) 5 | feat_test <- head(char_context(toks_test, "america*", min_count = 1, p = 0.05), 100) 6 | dict <- dictionary(list("keywords" = c("positive", "bad", "xxxx"))) 7 | 8 | test_that("textplot_* works with SVD", { 9 | dfmt <- dfm(toks_test) 10 | seed <- c("nice*" = 1, "positive*" = 1, "bad*" = -1, "negative*" = -1) 11 | lss <- textmodel_lss(dfmt, seed, k = 10) 12 | suppressWarnings({ 13 | expect_equal(class(textplot_simil(lss)), c("gg", "ggplot")) 14 | }) 15 | expect_equal(class(textplot_terms(lss, highlighted = dict$keywords)), 16 | c("gg", "ggplot")) 17 | expect_equal(class(textplot_terms(lss, highlighted = dict$keywords, max_words = 2)), 18 | c("gg", "ggplot")) 19 | expect_equal(class(textplot_terms(lss, highlighted = dict$keywords, max_highlighted = 10)), 20 | c("gg", "ggplot")) 21 | expect_equal(class(textplot_terms(lss, highlighted = dict$keywords, max_highlighted = 0)), 22 | c("gg", "ggplot")) 23 | expect_equal(class(textplot_terms(lss, highlighted = dict)), 24 | c("gg", "ggplot")) 25 | expect_equal(class(textplot_terms(lss, highlighted = character())), 26 | c("gg", "ggplot")) 27 | expect_equal(class(textplot_terms(lss)), c("gg", "ggplot")) 28 | expect_equal(class(textplot_terms(lss, max_highlighted = 10)), c("gg", "ggplot")) 29 | expect_equal(class(textplot_terms(lss, sampling = "relative")), c("gg", "ggplot")) 30 | expect_equal(class(textplot_terms(lss, sampling = "absolute")), c("gg", "ggplot")) 31 | expect_error(textplot_terms(lss, sampling = "xxx")) 32 | 33 | lss2 <- textmodel_lss(dfmt, seed, terms = feat_test, k = 10) 34 | expect_equal(class(textplot_terms(lss2)), c("gg", "ggplot")) 35 | expect_equal(class(textplot_terms(lss2, sampling = "relative")), c("gg", "ggplot")) 36 | expect_equal(class(textplot_terms(lss2, sampling = "absolute")), c("gg", "ggplot")) 37 | expect_error(textplot_terms(lss2, sampling = "xxx")) 38 | }) 39 | 40 | test_that("textplot_* works even when frequency and beta do not match (#71)", { 41 | dfmt <- dfm(toks_test) 42 | seed <- c("nice*" = 1, "positive*" = 1, "bad*" = -1, "negative*" = -1) 43 | lss <- textmodel_lss(dfmt, seed, k = 10) 44 | lss$frequency <- c(lss$frequency, "xxx" = 1, "yyy" = 1) # replicate #71 45 | expect_equal(class(textplot_terms(lss)), c("gg", "ggplot")) 46 | }) 47 | 48 | test_that("textplot_* works with Glove", { 49 | fcmt <- fcm(toks_test) 50 | seed <- c("nice*" = 1, "positive*" = 1, "bad*" = -1, "negative*" = -1) 51 | lss <- textmodel_lss(fcmt, seed, w = 10) 52 | suppressWarnings({ 53 | expect_equal(class(textplot_simil(lss)), c("gg", "ggplot")) 54 | }) 55 | expect_equal(class(textplot_terms(lss, highlighted = dict$keywords)), 56 | c("gg", "ggplot")) 57 | expect_equal(class(textplot_terms(lss, highlighted = dict$keywords, max_words 
= 2)), 58 | c("gg", "ggplot")) 59 | expect_equal(class(textplot_terms(lss, highlighted = dict)), 60 | c("gg", "ggplot")) 61 | expect_equal(class(textplot_terms(lss)), c("gg", "ggplot")) 62 | expect_error(textplot_terms(lss, highlighted = dict, max_words = 100:200), 63 | "The length of max_words must be 1") 64 | 65 | lss2 <- textmodel_lss(fcmt, seed, terms = feat_test, w = 10) 66 | expect_equal(class(textplot_terms(lss2)), c("gg", "ggplot")) 67 | }) 68 | 69 | test_that("textplot_components() works", { 70 | 71 | seed <- c("nice*" = 1, "positive*" = 1, "bad*" = -1, "negative*" = -1) 72 | 73 | dfmt <- dfm(toks_test) 74 | lss_svd <- textmodel_lss(dfmt, seed, k = 10) 75 | fcmt <- fcm(toks_test) 76 | lss_glove <- textmodel_lss(fcmt, seed, w = 10) 77 | 78 | gg1 <- textplot_components(lss_svd, n = 5) 79 | expect_equal(length(levels(gg1$data$group)), 5) 80 | gg2 <- textplot_components(lss_svd, n = 3) 81 | expect_equal(length(levels(gg2$data$group)), 3) 82 | 83 | expect_equal(class(textplot_components(lss_svd, 3)), c("gg", "ggplot")) 84 | expect_equal(class(textplot_components(lss_svd, 3, scale = "relative")), c("gg", "ggplot")) 85 | expect_error(textplot_components(lss_svd, n = c(5, 6)), "The length of n must be 1") 86 | expect_error(textplot_components(lss_svd, n = 20), "The value of n must be between 2 and 10") 87 | expect_error(textplot_components(lss_glove), "SVD must be used to generate word vectors") 88 | }) 89 | 90 | test_that("textplot_* raise error when attributes are missing", { 91 | dfmt <- dfm(toks_test) 92 | coef <- rnorm(100) 93 | names(coef) <- topfeatures(dfmt, 100) 94 | lss <- as.textmodel_lss(coef) 95 | suppressWarnings({ 96 | expect_error(textplot_simil(lss), 97 | "textplot_simil() does not work with dummy models", fixed = TRUE) 98 | }) 99 | }) 100 | 101 | test_that("textplot_terms works even when frequency has zeros (#85)", { 102 | dfmt <- dfm(toks_test) %>% 103 | dfm_subset(Year > 2000) 104 | seed <- c("nice*" = 1, "positive*" = 1, "bad*" = -1, "negative*" = -1) 105 | suppressWarnings( 106 | lss <- textmodel_lss(dfmt, seed, k = 10) 107 | ) 108 | expect_true(any(lss$frequency == 0)) 109 | expect_equal(class(textplot_terms(lss)), c("gg", "ggplot")) 110 | expect_silent(print(textplot_terms(lss, max_highlighted = 10))) 111 | }) 112 | 113 | test_that("textplot_terms works with dictionary", { 114 | 115 | dict <- dictionary(list("american" = c("american *"), 116 | "president" = c("president *"))) 117 | toks <- tokens_subset(toks_test, Year > 2000) %>% 118 | tokens_compound(dict) 119 | dfmt <- dfm(toks) 120 | seed <- c("nice*" = 1, "positive*" = 1, "bad*" = -1, "negative*" = -1) 121 | suppressWarnings( 122 | lss <- textmodel_lss(dfmt, seed, k = 10) 123 | ) 124 | expect_silent(print( 125 | textplot_terms(lss, dict, max_highlighted = 10) 126 | )) 127 | expect_silent(print( 128 | textplot_terms(lss, dictionary(list(none = "xxxxx"))) 129 | )) 130 | }) 131 | 132 | -------------------------------------------------------------------------------- /tests/testthat/test-textstat.R: -------------------------------------------------------------------------------- 1 | 2 | require(quanteda) 3 | toks_test <- readRDS("../data/tokens.RDS") 4 | 5 | test_that("textstat_context works", { 6 | 7 | char <- char_context(toks_test, phrase("united states"), 8 | min_count = 1, window = 10) 9 | dat <- textstat_context(toks_test, phrase("united states"), 10 | min_count = 1, window = 10) 11 | 12 | expect_identical(head(dat$feature, 10), head(char, 10)) 13 | expect_identical(names(dat), c("feature", "chi2", "p", 
"n_inside", "n_outside")) 14 | }) 15 | 16 | test_that("char_context removes multi-word target", { 17 | 18 | key_rp <- textstat_context(toks_test, phrase("united states"), 19 | min_count = 1, window = 0) 20 | expect_equal(nrow(key_rp), 0) 21 | suppressWarnings({ 22 | feat_rp <- char_context(toks_test, phrase("united states"), 23 | min_count = 1, p = 0.05, window = 0) 24 | }) 25 | expect_equal(length(feat_rp), 0) 26 | 27 | key_kp <- textstat_context(toks_test, phrase("united states"), 28 | min_count = 1, window = 0, remove_pattern = FALSE) 29 | expect_equal(nrow(key_kp), 2) 30 | 31 | feat_kp <- char_context(toks_test, phrase("united states"), 32 | min_count = 1, p = 0.05, window = 0, remove_pattern = FALSE) 33 | expect_identical(feat_kp, c("united", "states")) 34 | }) 35 | 36 | test_that("char_context removes multi-word target", { 37 | 38 | # unigram 39 | txt <- "a a b b z b c c d d" 40 | toks <- tokens(txt) 41 | cont_uni <- textstat_context(toks, "z", window = 2, min_count = 0) 42 | dfmt_uni <- tokens(c(inside = "b b b c", outside = "a a c d d")) %>% dfm() 43 | key_uni <- textstat_keyness(dfmt_uni) 44 | expect_equivalent(cont_uni, key_uni) 45 | 46 | # bigram 47 | cont_bi <- textstat_context(toks, "z", window = 2, min_count = 0, n = 2) 48 | dfmt_bi <- tokens(c(inside = "b b b c", outside = "a a c d d")) %>% 49 | tokens_ngrams(n = 2) %>% 50 | dfm() 51 | key_bi <- textstat_keyness(dfmt_bi) 52 | expect_equivalent(cont_bi, cont_bi) 53 | }) 54 | -------------------------------------------------------------------------------- /tests/testthat/test-utils.R: -------------------------------------------------------------------------------- 1 | require(quanteda) 2 | require(ggplot2) 3 | 4 | lss_test <- readRDS("../data/lss_k300.RDS") 5 | 6 | test_that("diagnosys works", { 7 | skip_on_travis() 8 | txt <- c("a b c d 0.2 . (xxx) \u2700", "a b_c 1st 2nd k100@gmail.com", 9 | "Aa bb cc. Xx yy zz.", "Aa bb cc. 
Xx yy zz.") 10 | dat <- data.frame(doc_id = paste0("text", 1:4), 11 | number = c(1, 3, 0, 0), 12 | punct = c(4, 2, 2, 2), 13 | symbol = c(1, 0, 0, 0), 14 | any = c(5, 4, 2, 2), 15 | n_sent = c(1, 1, 2, 2), 16 | n_token = c(10, 5, 8, 8), 17 | dupli = c(FALSE, FALSE, FALSE, TRUE), 18 | noise = c(0.5, 0.8, 0.25, 0.25), 19 | stringsAsFactors = FALSE) 20 | suppressWarnings( 21 | expect_equal(diagnosys(txt), dat) 22 | ) 23 | expect_warning(diagnosys(txt), 24 | "'diagnosys.corpus' is deprecated") 25 | 26 | }) 27 | 28 | test_that("as.seedwords works", { 29 | lis1 <- list(c("a", "b", "c"), c("d", "e", "f")) 30 | expect_equal(as.seedwords(lis1), 31 | c("a" = 1, "b" = 1, "c" = 1, "d" = -1, "e" = -1, "f" = -1)) 32 | lis2 <- list(c("a", "b", "c"), c("d", "e", "f")) 33 | expect_equal(as.seedwords(lis2, upper = 2, lower = 1), 34 | c("d" = 1, "e" = 1, "f" = 1, "a" = -1, "b" = -1, "c" = -1)) 35 | lis3 <- list("pos" = c("a", "b", "c"), "neg" = c("d", "e", "f")) 36 | expect_equal(as.seedwords(lis3, upper = "pos", lower = "neg"), 37 | c("a" = 1, "b" = 1, "c" = 1, "d" = -1, "e" = -1, "f" = -1)) 38 | lis4 <- list("pos" = c("a", "a"), "neg" = c("b", "b")) 39 | expect_equal(as.seedwords(lis4, upper = "pos", lower = "neg"), 40 | c("a" = 1, "b" = -1)) 41 | lis5 <- list("pos1" = c("a", "b"), "pos2" = c("c"), "neg" = c("d", "e", "f")) 42 | expect_equal(as.seedwords(lis5, upper = c("pos1", "pos2"), lower = "neg"), 43 | c("a" = 1, "b" = 1, "c" = 1, "d" = -1, "e" = -1, "f" = -1)) 44 | 45 | dict1 <- dictionary(lis3) 46 | expect_equal(as.seedwords(dict1, upper = "pos", lower = "neg"), 47 | c("a" = 1, "b" = 1, "c" = 1, "d" = -1, "e" = -1, "f" = -1)) 48 | expect_error(as.seedwords(data.frame(1:3)), "x must be a list or dictionary object") 49 | 50 | dict2 <- dictionary(list("pos" = "very good", "neg" = "very bad")) 51 | expect_equal(as.seedwords(dict2), 52 | c("very_good" = 1, "very_bad" = -1)) 53 | expect_equal(as.seedwords(dict2, concatenator = "+"), 54 | c("very+good" = 1, "very+bad" = -1)) 55 | 56 | }) 57 | 58 | 59 | 60 | test_that("smooth_lss works", { 61 | 62 | skip_on_cran() # takes to much time 63 | 64 | corp <- corpus_reshape(data_corpus_inaugural) 65 | toks <- tokens(corp) 66 | dfmt <- dfm(toks, remove_padding = TRUE) %>% 67 | dfm_subset(Party %in% c("Democratic", "Republican")) %>% 68 | dfm_trim() 69 | seed <- as.seedwords(data_dictionary_ideology) 70 | lss <- textmodel_lss(dfmt, seed, k = 150, include_data = TRUE, 71 | group_data = TRUE) 72 | 73 | dat <- docvars(lss$data) 74 | dat$lss <- predict(lss) 75 | dat$date <- as.Date(paste0(dat$Year, "-01-20")) 76 | 77 | smo_le <- smooth_lss(dat, lss_var = "lss", by = "year", 78 | span = 0.1, engine = "loess") 79 | expect_equal(colnames(smo_le), 80 | c("date", "time", "fit", "se.fit")) 81 | 82 | smo_lf <- smooth_lss(dat, lss_var = "lss", by = "year", 83 | span = 0.1, engine = "locfit") 84 | expect_equal(colnames(smo_lf), 85 | c("date", "time", "fit", "se.fit")) 86 | 87 | expect_true(cor(smo_le$fit, smo_lf$fit, use = "pair") > 0.90) 88 | 89 | # group by variable 90 | smo_gr_le <- smooth_lss(dat, lss_var = "lss", by = "year", 91 | span = 0.1, groups = "Party", engine = "loess") 92 | expect_equal(colnames(smo_gr_le), 93 | c("date", "time", "fit", "se.fit", "Party")) 94 | expect_equal(levels(smo_gr_le$Party), 95 | c("Democratic", "Republican")) 96 | 97 | smo_gr_lf <- smooth_lss(dat, lss_var = "lss", by = "year", 98 | span = 0.1, groups = "Party", engine = "locfit") 99 | expect_equal(colnames(smo_gr_lf), 100 | c("date", "time", "fit", "se.fit", "Party")) 101 | 
expect_equal(levels(smo_gr_lf$Party), 102 | c("Democratic", "Republican")) 103 | 104 | expect_true(cor(smo_gr_le$fit, smo_gr_lf$fit, use = "pair") > 0.90) 105 | 106 | # check input values 107 | expect_error( 108 | smooth_lss(dat), 109 | "fit does not exist in x" 110 | ) 111 | expect_error( 112 | smooth_lss(smooth_lss(dat, lss_var = "President")), 113 | "lss_var must be a numeric column" 114 | ) 115 | expect_error( 116 | smooth_lss(dat, lss_var = "lss", date_var = "xxx"), 117 | "xxx does not exist in x" 118 | ) 119 | expect_error( 120 | smooth_lss(dat, lss_var = "lss", date_var = "Year"), 121 | "date_var must be a date column" 122 | ) 123 | expect_error( 124 | smooth_lss(dat, lss_var = "lss", groups = "xxx"), 125 | "xxx does not exist in x" 126 | ) 127 | }) 128 | 129 | test_that("smooth_lss works with multiple grouping variables", { 130 | 131 | date <- seq(as.Date("2025-01-01"), as.Date("2025-01-31"), by = "1 day") 132 | n <- 1000 133 | dat <- data.frame(fit = rnorm(n), 134 | date = sample(date, n, replace = TRUE), 135 | class1 = factor(sample(c("a", "b", "c"), n, replace = TRUE), 136 | levels = c("a", "b", "c", "d")), 137 | class2 = sample(c("A", "B"), n, replace = TRUE), 138 | number = sample(1:10000, n)) 139 | smo1 <- smooth_lss(dat) 140 | smo2 <- smooth_lss(dat, groups = "class1") 141 | smo3 <- smooth_lss(dat, groups = c("class1", "class2")) 142 | 143 | expect_equal( 144 | nrow(smo1), 31 145 | ) 146 | expect_equal( 147 | sapply(smo1, class), 148 | c(date = "Date", time = "numeric", fit = "numeric", se.fit = "numeric") 149 | ) 150 | 151 | expect_equal( 152 | sapply(smo2, class), 153 | c(date = "Date", time = "numeric", fit = "numeric", se.fit = "numeric", 154 | class1 = "factor") 155 | ) 156 | expect_equal( 157 | nrow(smo2), 31 * 3, 158 | ) 159 | expect_equal( 160 | levels(smo2$class1), 161 | c("a", "b", "c") 162 | ) 163 | 164 | expect_equal( 165 | sapply(smo3, class), 166 | c(date = "Date", time = "numeric", fit = "numeric", se.fit = "numeric", 167 | class1 = "factor", class2 = "character") 168 | ) 169 | expect_equal( 170 | nrow(smo3), 171 | 31 * 3 * 2 172 | ) 173 | expect_equal( 174 | levels(smo3$class1), 175 | c("a", "b", "c") 176 | ) 177 | 178 | # error 179 | expect_error( 180 | smooth_lss(dat, groups = c("class1", "xxxx")), 181 | "xxxx does not exist in x" 182 | ) 183 | 184 | expect_error( 185 | smooth_lss(dat, groups = c("class1", "number")), 186 | "columns for grouping cannot be numeric" 187 | ) 188 | }) 189 | -------------------------------------------------------------------------------- /vignettes/pkgdown/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | *.rds 4 | -------------------------------------------------------------------------------- /vignettes/pkgdown/dictionary.yml: -------------------------------------------------------------------------------- 1 | hostility: 2 | hostile: ["adversary", "adversaries", "enemy", "enemies", "foe", "foes", "hostile"] 3 | friendly: ["aid", "aids", "friends", "friend", "ally", "allies", "peaceful"] 4 | 5 | country: 6 | us: ["United States", "US", "American*", "Washington"] 7 | uk: ["United Kingdom", "UK", "British", "London"] 8 | eu: ["European Union", "EU", "European*", "Brussels"] 9 | se: ["Sweden", "Swedish", "Stockholm"] 10 | fi: ["Finland", "Finnish", "Helsinki"] 11 | ua: ["Ukraine", "Ukrainian*", "Kiev", "Kyiv"] 12 | ru: ["Russia", "Russian*", "Moscow"] 13 | 14 | energy: ["gas", "oil", "engery"] 15 | 
-------------------------------------------------------------------------------- /vignettes/pkgdown/research.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Application in research" 3 | subtitle: "Russian state media's coverage of energy issues in the US and the EU" 4 | output: 5 | html_document: 6 | toc: true 7 | --- 8 | 9 | ```{r, include = FALSE} 10 | knitr::opts_chunk$set(collapse = TRUE, comment = "#>", 11 | fig.width = 8, fig.height = 4, dpi = 150, out.width = 760) 12 | ``` 13 | 14 | In this example, we will analyze how much the proposal of [the European Gas Demand Reduction Plan](https://ec.europa.eu/commission/presscorner/detail/en/ip_22_4608) on 20 July 2022 affected Sputnik's coverage of energy issues in the United States and the European Union. 15 | 16 | ```{r setup} 17 | library(LSX) 18 | library(quanteda) 19 | library(ggplot2) 20 | ``` 21 | 22 | ### Preparation 23 | 24 | We will analyze the same [corpus](https://www.dropbox.com/s/abme18nlrwxgmz8/data_corpus_sputnik2022.rds?dl=1) as in the [introduction](./introduction.html) and apply the same pre-processing. 25 | 26 | ```{r include=FALSE} 27 | if (!file.exists("data_corpus_sputnik2022.rds")) { 28 | download.file("https://www.dropbox.com/s/abme18nlrwxgmz8/data_corpus_sputnik2022.rds?dl=1", 29 | "data_corpus_sputnik2022.rds", mode = "wb") 30 | } 31 | ``` 32 | 33 | ```{r} 34 | corp <- readRDS("data_corpus_sputnik2022.rds") |> 35 | corpus_reshape() 36 | toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE, 37 | remove_numbers = TRUE, remove_url = TRUE) 38 | dfmt <- dfm(toks) |> 39 | dfm_remove(stopwords("en")) 40 | ``` 41 | 42 | We will use a [dictionary](https://github.com/koheiw/LSX/tree/master/vignettes) of keywords in this example. 43 | 44 | ```{r} 45 | dict <- dictionary(file = "dictionary.yml") 46 | print(dict[c("country", "energy")]) 47 | ``` 48 | 49 | ### Estimate the polarity of words 50 | 51 | To measure the sentiment specifically about energy issues, we collect words that occur frequently around keywords such as "oil", "gas", and "energy", and pass them to `terms`. These keywords are called target words. 52 | 53 | ```{r message=FALSE} 54 | seed <- as.seedwords(data_dictionary_sentiment) 55 | term <- char_context(toks, pattern = dict$energy, p = 0.01) 56 | lss <- textmodel_lss(dfmt, seeds = seed, terms = term, cache = TRUE, 57 | include_data = TRUE, group_data = TRUE) 58 | ``` 59 | 60 | ```{r plot-term, warning=FALSE} 61 | textplot_terms(lss, highlighted = data_dictionary_LSD2015[1:2]) 62 | ``` 63 | 64 | ### Predict the polarity of documents 65 | 66 | We can extract the document variables from the DFM in the LSS model and save the predicted polarity scores as a new variable. 67 | 68 | ```{r} 69 | dat <- docvars(lss$data) 70 | dat$lss <- predict(lss) 71 | print(nrow(dat)) 72 | ``` 73 | 74 | ### Detect the mentions of country/region 75 | 76 | We can detect mentions of countries using the dictionary. If you want to classify texts by country more accurately, you should use the [newsmap package](https://cran.r-project.org/package=newsmap). 77 | 78 | ```{r} 79 | dfmt_dict <- dfm(tokens_lookup(toks, dict$country[c("us", "eu")])) 80 | print(head(dfmt_dict)) 81 | ``` 82 | 83 | We can create dummy variables for mentions of each country/region with `dfm_group(dfmt_dict) > 0`. We must group the documents because the unit of analysis is the article in this example (recall `textmodel_lss(group_data = TRUE)` above).
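The `cbind()` in the next chunk assumes that the grouped dictionary counts and the grouped LSS data describe the same articles in the same order. As a quick sanity check (a sketch; it assumes both objects keep the original article identifiers as document names), we can compare their document names before binding.

```{r}
# Sketch of an optional check: both grouped objects should list the same
# articles in the same order, so their document names should be identical.
stopifnot(identical(docnames(dfm_group(dfmt_dict)), docnames(lss$data)))
```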
84 | 85 | ```{r} 86 | mat <- as.matrix(dfm_group(dfmt_dict) > 0) 87 | print(head(mat)) 88 | dat <- cbind(dat, mat) 89 | ``` 90 | 91 | ### Results 92 | 93 | We must smooth the polarity scores of documents separately for each country/region using `smooth_lss()`. After smoothing, we can see that the difference between the US and the EU expanded soon after the proposal of the European Gas Demand Reduction Plan. 94 | 95 | ```{r} 96 | smo_us <- smooth_lss(subset(dat, us), lss_var = "lss", date_var = "date") 97 | smo_us$country <- "us" 98 | smo_eu <- smooth_lss(subset(dat, eu), lss_var = "lss", date_var = "date") 99 | smo_eu$country <- "eu" 100 | smo <- rbind(smo_us, smo_eu) 101 | ``` 102 | 103 | ```{r plot-trend} 104 | ggplot(smo, aes(x = date, y = fit, color = country)) + 105 | geom_line() + 106 | geom_ribbon(aes(ymin = fit - se.fit * 1.96, ymax = fit + se.fit * 1.96, fill = country), 107 | alpha = 0.1, colour = NA) + 108 | geom_vline(xintercept = as.Date("2022-06-26"), linetype = "dotted") + 109 | scale_x_date(date_breaks = "months", date_labels = "%b") + 110 | labs(title = "Sentiment on energy", x = "Date", y = "Sentiment", 111 | fill = "Country", color = "Country") 112 | ``` 113 | 114 | To test whether the changes after the proposal are statistically significant, we should create a dummy variable `after` for the period after the proposal and perform a regression analysis with its interactions with the country/region dummies. This is akin to [the difference-in-differences design](https://en.wikipedia.org/wiki/Difference_in_differences) that I often employ in the analysis of news (Watanabe 2017; Watanabe et al. 2022). 115 | 116 | ```{r} 117 | dat_war <- subset(dat, date >= as.Date("2022-02-24")) 118 | dat_war$after <- dat_war$date >= as.Date("2022-06-20") 119 | summary(dat_war[c("lss", "us", "eu", "after")]) 120 | ``` 121 | 122 | `dat_war` contains only the scores since the beginning of the war, so the `intercept` is the average sentiment of the articles without mentions of the US or the EU before the proposal during the war; `usTRUE` and `euTRUE` are the average sentiment of the articles with mentions of the US and the EU in the same period, respectively. 123 | 124 | The coefficient of `afterTRUE` indicates that the overall sentiment became more negative after the proposal (*β* = -0.11; *p* < 0.01). The insignificant coefficient of `euTRUE:afterTRUE` shows that the sentiment for the EU also decreased, but the large positive coefficient of `usTRUE:afterTRUE` suggests that the sentiment for the US increased (*β* = 0.22; *p* < 0.001) and became more positive than before the proposal. 125 | 126 | ```{r} 127 | reg <- lm(lss ~ us + eu + after + us * after + eu * after, dat_war) 128 | summary(reg) 129 | ``` 130 | 131 | ### Conclusions 132 | 133 | Our analysis shows that Sputnik covered energy issues in the US more positively and those in the EU more negatively after the proposal of the European Gas Demand Reduction Plan. Our findings are preliminary, but we can give them a tentative interpretation: the Russian government attempted to create divisions between the US and the EU by emphasizing the differing impacts of the Ukraine war and the sanctions against Russia on American and European lives. 134 | 135 | ### References 136 | 137 | - Watanabe, K. (2017). Measuring news bias: Russia’s official news agency ITAR-TASS’ coverage of the Ukraine crisis. *European Journal of Communication*. https://doi.org/10.1177/0267323117695735. 138 | - Watanabe, K., Segev, E., & Tago, A. (2022).
Discursive diversion: Manipulation of nuclear threats by the conservative leaders in Japan and Israel, *International Communication Gazette*. https://doi.org/10.1177/17480485221097967. 139 | -------------------------------------------------------------------------------- /vignettes/pkgdown/seedwords.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Selection of seed words" 3 | subtitle: "Evaluating seed words using utility functions" 4 | output: 5 | html_document: 6 | toc: true 7 | --- 8 | 9 | ```{r, include = FALSE} 10 | knitr::opts_chunk$set(collapse = TRUE, comment = "#>", 11 | fig.width = 8, fig.height = 4, dpi = 150, out.width = 760) 12 | ``` 13 | 14 | To apply LSS, we must define the measurement with seed words. If seed words are not available, we must create a list of seed words using thesauri and glossaries, but some of the words might be used in many different contexts, making them unsuitable as seed words. Good seed words are words that appear only in the contexts of the target concepts. 15 | 16 | We can evaluate the suitability of seed words by checking their synonyms identified in the corpus: fitting an LSS model with one seed word at a time and checking the words with the highest polarity scores. This repetitive process can be automated using `bootstrap_lss()`. This function extracts the seed words from a fitted LSS model, internally fits an LSS model with each seed word, and returns their synonyms or similarity scores. We use the LSS model fitted in the [introduction](basic.html) in this example. 17 | 18 | ```{r setup} 19 | library(LSX) 20 | lss <- readRDS("lss.rds") 21 | print(lss) 22 | ``` 23 | 24 | The model is fitted with the generic sentiment seed words. Their original polarity scores are weighted by the inverse of the number of seed words (`1/7 = 0.142`) to allow unequal numbers of seed words on the opposing ends. 25 | 26 | ```{r message=FALSE} 27 | print(lss$seeds) 28 | print(lss$seeds_weighted) 29 | ``` 30 | 31 | ## Evaluation with synonyms 32 | 33 | By default, `bootstrap_lss()` returns lists of synonyms for the seed words. Each column is a list of words sorted by their similarity to the seed word shown at the top. There are many proper names in the example, but we can see that many words are positive for the positive seed words and negative for the negative seed words. If a list is a mixture of positive and negative words, the seed word is probably too ambiguous. 34 | 35 | ```{r} 36 | bs_term <- bootstrap_lss(lss, mode = "terms") 37 | knitr::kable(head(bs_term, 10)) 38 | ``` 39 | 40 | ## Evaluation with similarity scores 41 | 42 | If `mode = "coef"`, the function returns the similarity scores of words for each seed word (the words in the lists above were sorted by these scores). We can use this matrix to evaluate the seed words more systematically if the lists of synonyms are not useful or sufficient. 43 | 44 | ```{r} 45 | bs_coef <- bootstrap_lss(lss, mode = "coef") 46 | knitr::kable(head(bs_coef, 10), digits = 3) 47 | ``` 48 | 49 | We can use words with known polarity such as "russia" and "ukraine" as anchor words when evaluating seed words. We know that "russia" is more positive than "ukraine" because the corpus is a collection of articles published by the Russian state media. 50 | 51 | We can confirm that the differences in similarity scores between the anchor words largely agree with the polarity scores of the seed words. However, "fortunate" and "negative" disagree with the expected differences, suggesting that they are more ambiguous than the other seed words.
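The same comparison can also be screened automatically. As a rough sketch (assuming, as above, that the columns of `bs_coef` are named after the seed words in `lss$seeds`), we can flag seed words whose anchor-word difference has the opposite sign to their assigned polarity:

```{r}
# Sketch: flag seed words whose "russia" minus "ukraine" similarity difference
# has the opposite sign to their assigned polarity; these deserve a closer look.
diff_anchor <- bs_coef["russia", ] - bs_coef["ukraine", ]
suspect <- names(which(sign(diff_anchor) != sign(lss$seeds[names(diff_anchor)])))
print(suspect)
```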
52 | 53 | ```{r} 54 | dat_seed <- data.frame(seed = lss$seeds, diff = bs_coef["russia",] - bs_coef["ukraine",]) 55 | print(dat_seed) 56 | ``` 57 | 58 | ## Conclusions 59 | 60 | We should make a list of seed words and evaluate them one by one using `bootstrap_lss()` to create an accurate measurement. However, seed words become much less ambiguous when they are used as a set, so we should not be too nervous about the results for individual seed words. Seed word selection should be motivated primarily by the theoretical framework. 61 | 62 | ### References 63 | 64 | - Watanabe, K., & Zhou, Y. (2020). Theory-Driven Analysis of Large Corpora: Semisupervised Topic Classification of the UN Speeches, *Social Science Computer Review*, https://doi.org/10.1177/0894439320907027. 65 | - Watanabe, K. (2021). Latent Semantic Scaling: A Semisupervised Text Analysis Technique for New Domains and Languages, *Communication Methods and Measures*, https://doi.org/10.1080/19312458.2020.1832976. 66 | 67 | 68 | 69 | --------------------------------------------------------------------------------