├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ ├── pkgdown.yaml │ └── pr-commands.yaml ├── .gitignore ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R ├── cache_info.R ├── dataset_ag_news.R ├── dataset_dbpedia.R ├── dataset_imdb.R ├── dataset_sentence_polarity.R ├── dataset_trec.R ├── download_functions.R ├── embedding_glove.R ├── info.R ├── lexicon_afinn.R ├── lexicon_bing.R ├── lexicon_loughran.R ├── lexicon_nrc.R ├── lexicon_nrc_eil.R ├── lexicon_nrc_vad.R ├── load_dataset.R ├── printer.R ├── process_functions.R └── textdata-package.R ├── README.Rmd ├── README.md ├── _pkgdown.yml ├── codecov.yml ├── cran-comments.md ├── man ├── cache_info.Rd ├── catalogue.Rd ├── dataset_ag_news.Rd ├── dataset_dbpedia.Rd ├── dataset_imdb.Rd ├── dataset_sentence_polarity.Rd ├── dataset_trec.Rd ├── embedding_glove.Rd ├── figures │ ├── .DS_Store │ ├── logo.png │ ├── screen-shot.png │ └── textdata_demo.gif ├── lexicon_afinn.Rd ├── lexicon_bing.Rd ├── lexicon_loughran.Rd ├── lexicon_nrc.Rd ├── lexicon_nrc_eil.Rd ├── lexicon_nrc_vad.Rd ├── load_dataset.Rd └── textdata-package.Rd ├── pkgdown └── favicon │ ├── apple-touch-icon-120x120.png │ ├── apple-touch-icon-152x152.png │ ├── apple-touch-icon-180x180.png │ ├── apple-touch-icon-60x60.png │ ├── apple-touch-icon-76x76.png │ ├── apple-touch-icon.png │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ └── favicon.ico ├── revdep ├── README.md ├── cran.md ├── failures.md └── problems.md ├── tests ├── testthat.R └── testthat │ ├── test-download_functions.R │ ├── test-info.R │ └── test-process_functions.R ├── textdata.Rproj └── vignettes ├── .gitignore └── How-to-add-a-data-set.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^README\.Rmd$ 2 | ^LICENSE\.md$ 3 | ^textdata\.Rproj$ 4 | ^\.Rproj\.user$ 5 | ^\.travis\.yml$ 6 | ^CODE_OF_CONDUCT\.md$ 7 | ^cran-comments\.md$ 8 | ^_pkgdown\.yml$ 9 | ^docs$ 10 | ^pkgdown$ 11 | ^CRAN-RELEASE$ 12 | ^revdep$ 13 | ^codecov\.yml$ 14 | ^\.github/workflows/R-CMD-check\.yaml$ 15 | ^\.github/workflows/pr-commands\.yaml$ 16 | ^\.github$ 17 | ^CRAN-SUBMISSION$ 18 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macOS-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v2 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: any::rcmdcheck 45 | needs: check 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | steps: 23 | - uses: actions/checkout@v2 24 | 25 | - uses: r-lib/actions/setup-pandoc@v2 26 | 27 | - uses: r-lib/actions/setup-r@v2 28 | with: 29 | use-public-rspm: true 30 | 31 | - uses: r-lib/actions/setup-r-dependencies@v2 32 | with: 33 | extra-packages: any::pkgdown, local::. 
34 | needs: website 35 | 36 | - name: Build site 37 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 38 | shell: Rscript {0} 39 | 40 | - name: Deploy to GitHub pages 🚀 41 | if: github.event_name != 'pull_request' 42 | uses: JamesIves/github-pages-deploy-action@4.1.4 43 | with: 44 | clean: false 45 | branch: gh-pages 46 | folder: docs 47 | -------------------------------------------------------------------------------- /.github/workflows/pr-commands.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | issue_comment: 3 | types: [created] 4 | name: Commands 5 | jobs: 6 | document: 7 | if: startsWith(github.event.comment.body, '/document') 8 | name: document 9 | runs-on: macOS-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - uses: r-lib/actions/pr-fetch@master 13 | with: 14 | repo-token: ${{ secrets.GITHUB_TOKEN }} 15 | - uses: r-lib/actions/setup-r@master 16 | - name: Install dependencies 17 | run: Rscript -e 'install.packages(c("remotes", "roxygen2"))' -e 'remotes::install_deps(dependencies = TRUE)' 18 | - name: Document 19 | run: Rscript -e 'roxygen2::roxygenise()' 20 | - name: commit 21 | run: | 22 | git add man/\* NAMESPACE 23 | git commit -m 'Document' 24 | - uses: r-lib/actions/pr-push@master 25 | with: 26 | repo-token: ${{ secrets.GITHUB_TOKEN }} 27 | style: 28 | if: startsWith(github.event.comment.body, '/style') 29 | name: style 30 | runs-on: macOS-latest 31 | steps: 32 | - uses: actions/checkout@v2 33 | - uses: r-lib/actions/pr-fetch@master 34 | with: 35 | repo-token: ${{ secrets.GITHUB_TOKEN }} 36 | - uses: r-lib/actions/setup-r@master 37 | - name: Install dependencies 38 | run: Rscript -e 'install.packages("styler")' 39 | - name: Style 40 | run: Rscript -e 'styler::style_pkg()' 41 | - name: commit 42 | run: | 43 | git add \*.R 44 | git commit -m 'Style' 45 | - uses: r-lib/actions/pr-push@master 46 | with: 47 | repo-token: ${{ secrets.GITHUB_TOKEN }} 48 | # A mock job just to ensure we have a successful build status 49 | finish: 50 | runs-on: ubuntu-latest 51 | steps: 52 | - run: true 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | .DS_Store 6 | inst/doc 7 | docs/ 8 | 9 | revdep/checks 10 | revdep/library 11 | revdep/checks.noindex 12 | revdep/library.noindex 13 | revdep/data.sqlite 14 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | cache: packages 5 | 6 | before_cache: Rscript -e 'remotes::install_cran("pkgdown")' 7 | deploy: 8 | provider: script 9 | script: Rscript -e 'pkgdown::deploy_site_github()' 10 | skip_cleanup: true 11 | 12 | after_success: 13 | - Rscript -e 'covr::codecov()' 14 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, we pledge to respect all people who 4 | contribute through reporting issues, posting feature requests, updating documentation, 5 | submitting pull requests or patches, and other activities. 
6 | 7 | We are committed to making participation in this project a harassment-free experience for 8 | everyone, regardless of level of experience, gender, gender identity and expression, 9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. 10 | 11 | Examples of unacceptable behavior by participants include the use of sexual language or 12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment, 13 | insults, or other unprofessional conduct. 14 | 15 | Project maintainers have the right and responsibility to remove, edit, or reject comments, 16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this 17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 18 | from the project team. 19 | 20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 21 | opening an issue or contacting one or more of the project maintainers. 22 | 23 | This Code of Conduct is adapted from the Contributor Covenant 24 | (https://www.contributor-covenant.org), version 1.0.0, available at 25 | https://contributor-covenant.org/version/1/0/0/. 26 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: textdata 2 | Title: Download and Load Various Text Datasets 3 | Version: 0.4.5.9000 4 | Authors@R: c( 5 | person("Emil", "Hvitfeldt", , "emilhhvitfeldt@gmail.com", role = c("aut", "cre"), 6 | comment = c(ORCID = "0000-0002-0679-1945")), 7 | person("Julia", "Silge", , "julia.silge@gmail.com", role = "ctb", 8 | comment = c(ORCID = "0000-0002-3671-836X")) 9 | ) 10 | Description: Provides a framework to download, parse, and store text 11 | datasets on the disk and load them when needed. Includes various 12 | sentiment lexicons and labeled text data sets for classification and 13 | analysis. 
14 | License: MIT + file LICENSE 15 | URL: https://emilhvitfeldt.github.io/textdata/, https://github.com/EmilHvitfeldt/textdata 16 | BugReports: https://github.com/EmilHvitfeldt/textdata/issues 17 | Imports: 18 | fs, 19 | rappdirs, 20 | readr, 21 | tibble 22 | Suggests: 23 | covr, 24 | knitr, 25 | rmarkdown, 26 | testthat (>= 2.1.0) 27 | VignetteBuilder: 28 | knitr 29 | Encoding: UTF-8 30 | RoxygenNote: 7.3.1.9000 31 | Collate: 32 | 'cache_info.R' 33 | 'dataset_ag_news.R' 34 | 'dataset_dbpedia.R' 35 | 'dataset_imdb.R' 36 | 'dataset_sentence_polarity.R' 37 | 'dataset_trec.R' 38 | 'embedding_glove.R' 39 | 'lexicon_nrc_vad.R' 40 | 'lexicon_nrc_eil.R' 41 | 'lexicon_nrc.R' 42 | 'lexicon_bing.R' 43 | 'lexicon_loughran.R' 44 | 'lexicon_afinn.R' 45 | 'download_functions.R' 46 | 'info.R' 47 | 'load_dataset.R' 48 | 'printer.R' 49 | 'process_functions.R' 50 | 'textdata-package.R' 51 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2018 2 | COPYRIGHT HOLDER: Emil Hvitfeldt 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2018 Emil Hvitfeldt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(cache_info) 4 | export(catalogue) 5 | export(dataset_ag_news) 6 | export(dataset_dbpedia) 7 | export(dataset_imdb) 8 | export(dataset_sentence_polarity) 9 | export(dataset_trec) 10 | export(embedding_glove27b) 11 | export(embedding_glove42b) 12 | export(embedding_glove6b) 13 | export(embedding_glove840b) 14 | export(lexicon_afinn) 15 | export(lexicon_bing) 16 | export(lexicon_loughran) 17 | export(lexicon_nrc) 18 | export(lexicon_nrc_eil) 19 | export(lexicon_nrc_vad) 20 | export(load_dataset) 21 | importFrom(fs,dir_create) 22 | importFrom(fs,dir_delete) 23 | importFrom(fs,dir_exists) 24 | importFrom(fs,dir_ls) 25 | importFrom(fs,file_delete) 26 | importFrom(fs,file_exists) 27 | importFrom(fs,path) 28 | importFrom(readr,col_character) 29 | importFrom(readr,col_double) 30 | importFrom(readr,cols) 31 | importFrom(readr,cols_only) 32 | importFrom(readr,read_csv) 33 | importFrom(readr,read_delim) 34 | importFrom(readr,read_lines) 35 | importFrom(readr,read_rds) 36 | importFrom(readr,read_tsv) 37 | importFrom(readr,write_rds) 38 | importFrom(tibble,tibble) 39 | importFrom(utils,download.file) 40 | importFrom(utils,menu) 41 | importFrom(utils,untar) 42 | importFrom(utils,unzip) 43 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # textdata (development version) 2 | 3 | # textdata 0.4.5 4 | 5 | * Fixed bug where `lexicon_nrc_vad()` didn't have column names. (#53) 6 | 7 | # textdata 0.4.4 8 | 9 | * Update path to correctly point to the source for the NRC lexicon. 10 | 11 | # textdata 0.4.3 12 | 13 | * Fixed documentation to be HTML5 friendly. 14 | 15 | # textdata 0.4.2 16 | 17 | * `cache_info()` function has been added to allow for a quick overview of cache size. 18 | * Update download URL for `lexicon_nrc()`. 19 | 20 | # textdata 0.4.1 21 | 22 | # textdata 0.4.0 23 | 24 | * `embedding_glove6b()`, `embedding_glove27b()`, `embedding_glove42b()`, and `embedding_glove840b()` have been added to give access to the Stanford NLP Global Vectors for Word Representation pre-trained word vectors (@jonthegeek, #26). 25 | * `manual_download` argument has been added to all functions to allow the user to manually place the downloaded file in the right location. 26 | 27 | # textdata 0.3.0 28 | 29 | * `lexicon_nrc_eil()` has been added to give access to the NRC Emotion Intensity Lexicon (aka Affect Intensity Lexicon) v0.5. 30 | * `lexicon_nrc_vad()` has been added to give access to the NRC Valence, Arousal, and Dominance Lexicon. 31 | * The argument `clean` has been added to all functions to allow deletion of intermediate files. 32 | * An optional information prompt has been implemented. It is turned off by default and turned on at the original authors' request. 33 | * `lexicon_nrc()` got an improved URL for faster and more reliable downloads. 34 | 35 | # textdata 0.2.0 36 | 37 | * `dataset_imdb()` has been added to give access to the IMDb Large Movie Review Dataset. 38 | * `dataset_trec()` has been added to give access to the TREC-6 and TREC-50 classification datasets. 39 | * `dataset_dbpedia()` has been added to give access to the DBpedia Ontology classification dataset. 40 | * `dataset_ag_news()` has been added to give access to AG's News Topic classification dataset.
41 | * Functions will now notify the user about the download mechanism (http, https, etc.) used to download the data. (#12) 42 | * `lexicon_nrc()` has been added to give access to the NRC Emotion lexicon (@juliasilge, #11). 43 | 44 | # textdata 0.1.0 45 | -------------------------------------------------------------------------------- /R/cache_info.R: -------------------------------------------------------------------------------- 1 | #' List folders and their sizes in cache 2 | #' 3 | #' This function will return a tibble with the names and sizes of all folders in 4 | #' the specified directory. Defaults to textdata's default cache directory. 5 | #' 6 | #' @inheritParams lexicon_afinn 7 | #' 8 | #' @return A tibble with 2 variables: 9 | #' \describe{ 10 | #' \item{name}{Name of the folder} 11 | #' \item{size}{Size of the folder} 12 | #' } 13 | #' @export 14 | #' 15 | #' @examples 16 | #' \dontrun{ 17 | #' cache_info() 18 | #' } 19 | cache_info <- function(dir = NULL) { 20 | dir <- ifelse(is.null(dir), rappdirs::user_cache_dir("textdata"), dir) 21 | 22 | folders <- fs::dir_info(dir) 23 | 24 | folders <- folders$path[folders$type == "directory"] 25 | 26 | sizes <- vapply(folders, folder_size, numeric(1)) 27 | 28 | tibble::tibble( 29 | name = basename(folders), 30 | size = fs::as_fs_bytes(sizes) 31 | ) 32 | } 33 | 34 | folder_size <- function(x) { 35 | sum(fs::dir_info(x)$size) 36 | } 37 | -------------------------------------------------------------------------------- /R/dataset_ag_news.R: -------------------------------------------------------------------------------- 1 | #' AG's News Topic Classification Dataset 2 | #' 3 | #' The AG's news topic classification dataset is constructed by choosing the 4 4 | #' largest classes from the original corpus. Each class contains 30,000 training 5 | #' samples and 1,900 testing samples. The total number of training samples is 6 | #' 120,000, and the total number of testing samples is 7,600. 7 | 8 | #' Version 3, Updated 09/09/2015 9 | #' 10 | #' The classes in this dataset are 11 | #' 12 | #' \itemize{ 13 | #' \item World 14 | #' \item Sports 15 | #' \item Business 16 | #' \item Sci/Tech 17 | #' } 18 | #' 19 | #' @inheritParams lexicon_afinn 20 | #' @param split Character. Return training ("train") data or testing ("test") 21 | #' data. Defaults to "train".
22 | #' @return A tibble with 120,000 or 7,600 rows for "train" and "test" 23 | #' respectively and 3 variables: 24 | #' \describe{ 25 | #' \item{class}{Character, denoting the news class} 26 | #' \item{title}{Character, title of article} 27 | #' \item{description}{Character, description of article} 28 | #' } 29 | #' @source \url{http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html} 30 | #' @source \url{https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz} 31 | #' @keywords datasets 32 | #' @family topic 33 | #' @export 34 | #' @examples 35 | #' \dontrun{ 36 | #' dataset_ag_news() 37 | #' 38 | #' # Custom directory 39 | #' dataset_ag_news(dir = "data/") 40 | #' 41 | #' # Deleting dataset 42 | #' dataset_ag_news(delete = TRUE) 43 | #' 44 | #' # Returning filepath of data 45 | #' dataset_ag_news(return_path = TRUE) 46 | #' 47 | #' # Access both training and testing dataset 48 | #' train <- dataset_ag_news(split = "train") 49 | #' test <- dataset_ag_news(split = "test") 50 | #' } 51 | #' 52 | #' @importFrom fs file_exists dir_exists dir_create path 53 | #' @importFrom readr read_rds 54 | #' @importFrom utils menu 55 | dataset_ag_news <- function(dir = NULL, split = c("train", "test"), 56 | delete = FALSE, return_path = FALSE, 57 | clean = FALSE, manual_download = FALSE) { 58 | all_files <- paste0("ag_news_", c("train", "test"), ".rds") 59 | split <- match.arg(split) 60 | name <- paste0("ag_news_", split, ".rds") 61 | load_dataset( 62 | data_name = "ag_news", name = name, dir = dir, 63 | delete = delete, return_path = return_path, clean = clean, 64 | clean_manual = all_files, 65 | manual_download = manual_download 66 | ) 67 | } 68 | 69 | #' @importFrom utils download.file 70 | download_ag_news <- function(folder_path) { 71 | file_path_test <- path(folder_path, "ag_news_test.csv") 72 | file_path_train <- path(folder_path, "ag_news_train.csv") 73 | 74 | if (file_exists(file_path_test) & file_exists(file_path_train)) { 75 | return(invisible()) 76 | } 77 | 78 | download.file( 79 | url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv", 80 | destfile = file_path_test 81 | ) 82 | download.file( 83 | url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv", 84 | destfile = file_path_train 85 | ) 86 | } 87 | 88 | #' @importFrom readr read_tsv write_rds cols col_character col_double 89 | #' @importFrom tibble tibble 90 | process_ag_news <- function(folder_path, name_path) { 91 | file_path_test <- path(folder_path, "ag_news_test.csv") 92 | file_path_train <- path(folder_path, "ag_news_train.csv") 93 | 94 | data_test <- read_csv(file_path_test, 95 | col_names = c("class", "title", "description"), 96 | col_types = cols( 97 | class = col_double(), 98 | title = col_character(), 99 | description = col_character() 100 | ) 101 | ) 102 | data_train <- read_csv(file_path_train, 103 | col_names = c("class", "title", "description"), 104 | col_types = cols( 105 | class = col_double(), 106 | title = col_character(), 107 | description = col_character() 108 | ) 109 | ) 110 | 111 | classes <- c("World", "Sports", "Business", "Sci/Tech") 112 | 113 | data_test$class <- classes[data_test$class] 114 | data_train$class <- classes[data_train$class] 115 | 116 | write_rds(data_test, path(folder_path, "ag_news_test.rds")) 117 | write_rds(data_train, path(folder_path, "ag_news_train.rds")) 118 | } 119 | -------------------------------------------------------------------------------- /R/dataset_dbpedia.R:
-------------------------------------------------------------------------------- 1 | #' DBpedia Ontology Dataset 2 | #' 3 | #' The DBpedia ontology classification dataset. It contains 560,000 training 4 | #' samples and 70,000 testing samples in total, drawn from 14 non-overlapping 5 | #' classes from DBpedia. 6 | #' 7 | #' The classes are 8 | #' 9 | #' \itemize{ 10 | #' \item Company 11 | #' \item EducationalInstitution 12 | #' \item Artist 13 | #' \item Athlete 14 | #' \item OfficeHolder 15 | #' \item MeanOfTransportation 16 | #' \item Building 17 | #' \item NaturalPlace 18 | #' \item Village 19 | #' \item Animal 20 | #' \item Plant 21 | #' \item Album 22 | #' \item Film 23 | #' \item WrittenWork 24 | #' } 25 | #' 26 | #' @source \url{https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf} 27 | #' @source \url{https://www.dbpedia.org/} 28 | #' @source \url{https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz} 29 | #' @inheritParams lexicon_afinn 30 | #' @param split Character. Return training ("train") data or testing ("test") 31 | #' data. Defaults to "train". 32 | #' @return A tibble with 560,000 or 70,000 rows for "train" and "test" 33 | #' respectively and 3 variables: 34 | #' \describe{ 35 | #' \item{class}{Character, denoting the class} 36 | #' \item{title}{Character, title of article} 37 | #' \item{description}{Character, description of article} 38 | #' } 39 | #' @keywords datasets 40 | #' @family topic 41 | #' @export 42 | #' @examples 43 | #' \dontrun{ 44 | #' dataset_dbpedia() 45 | #' 46 | #' # Custom directory 47 | #' dataset_dbpedia(dir = "data/") 48 | #' 49 | #' # Deleting dataset 50 | #' dataset_dbpedia(delete = TRUE) 51 | #' 52 | #' # Returning filepath of data 53 | #' dataset_dbpedia(return_path = TRUE) 54 | #' 55 | #' # Access both training and testing dataset 56 | #' train <- dataset_dbpedia(split = "train") 57 | #' test <- dataset_dbpedia(split = "test") 58 | #' } 59 | #' 60 | #' @importFrom fs file_exists dir_exists dir_create path 61 | #' @importFrom readr read_rds 62 | #' @importFrom utils menu untar 63 | dataset_dbpedia <- function(dir = NULL, split = c("train", "test"), 64 | delete = FALSE, return_path = FALSE, 65 | clean = FALSE, manual_download = FALSE) { 66 | all_files <- paste0("dbpedia_", c("train", "test"), ".rds") 67 | split <- match.arg(split) 68 | name <- paste0("dbpedia_", split, ".rds") 69 | load_dataset( 70 | data_name = "dbpedia", name = name, dir = dir, 71 | delete = delete, return_path = return_path, clean = clean, 72 | clean_manual = all_files, 73 | manual_download = manual_download 74 | ) 75 | } 76 | 77 | #' @importFrom utils download.file 78 | download_dbpedia <- function(folder_path) { 79 | file_path <- path(folder_path, "dbpedia_csv.tar.gz") 80 | if (file_exists(file_path)) { 81 | return(invisible()) 82 | } 83 | download.file( 84 | url = "https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz", 85 | destfile = file_path 86 | ) 87 | } 88 | 89 | #' @importFrom readr read_tsv write_rds cols col_character col_double 90 | #' @importFrom tibble tibble 91 | process_dbpedia <- function(folder_path, name_path) { 92 | file_path_test <- path(folder_path, "dbpedia_csv/test.csv") 93 | file_path_train <- path(folder_path, "dbpedia_csv/train.csv") 94 | 95 | zip_path <- path(folder_path, "dbpedia_csv.tar.gz") 96 | 97 | untar(zip_path, files = c( 98 | "dbpedia_csv/test.csv", 99 | "dbpedia_csv/train.csv" 100 | ), exdir = folder_path) 101 | 102 | data_test <-
read_csv(file_path_test, 103 | col_names = c("class", "title", "description"), 104 | col_types = cols( 105 | class = col_double(), 106 | title = col_character(), 107 | description = col_character() 108 | ) 109 | ) 110 | data_train <- read_csv(file_path_train, 111 | col_names = c("class", "title", "description"), 112 | col_types = cols( 113 | class = col_double(), 114 | title = col_character(), 115 | description = col_character() 116 | ) 117 | ) 118 | 119 | classes <- c( 120 | "Company", "EducationalInstitution", "Artist", "Athlete", 121 | "OfficeHolder", "MeanOfTransportation", "Building", 122 | "NaturalPlace", "Village", "Animal", "Plant", "Album", "Film", 123 | "WrittenWork" 124 | ) 125 | 126 | data_test$class <- classes[data_test$class] 127 | data_train$class <- classes[data_train$class] 128 | 129 | write_rds(data_test, path(folder_path, "dbpedia_test.rds")) 130 | write_rds(data_train, path(folder_path, "dbpedia_train.rds")) 131 | 132 | fs::file_delete(path = file_path_test) 133 | fs::file_delete(path = file_path_train) 134 | } 135 | -------------------------------------------------------------------------------- /R/dataset_imdb.R: -------------------------------------------------------------------------------- 1 | #' IMDB Large Movie Review Dataset 2 | #' 3 | #' The core dataset contains 50,000 reviews split evenly into 25k train and 4 | #' 25k test sets. The overall distribution of labels is balanced (25k pos and 5 | #' 25k neg). 6 | #' 7 | #' In the entire collection, no more than 30 reviews are allowed for any 8 | #' given movie because reviews for the same movie tend to have correlated 9 | #' ratings. Further, the train and test sets contain a disjoint set of 10 | #' movies, so no significant performance is obtained by memorizing 11 | #' movie-unique terms and their associated with observed labels. In the 12 | #' labeled train/test sets, a negative review has a score <= 4 out of 10, 13 | #' and a positive review has a score >= 7 out of 10. Thus reviews with 14 | #' more neutral ratings are not included in the train/test sets. In the 15 | #' unsupervised set, reviews of any rating are included and there are an 16 | #' even number of reviews > 5 and <= 5. 17 | #' 18 | #' When using this dataset, please cite the ACL 2011 paper 19 | #' 20 | #' InProceedings\{maas-EtAl:2011:ACL-HLT2011, \cr 21 | #' author = \{Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher\}, \cr 22 | #' title = \{Learning Word Vectors for Sentiment Analysis\}, \cr 23 | #' booktitle = \{Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies\}, \cr 24 | #' month = \{June\}, \cr 25 | #' year = \{2011\}, \cr 26 | #' address = \{Portland, Oregon, USA\}, \cr 27 | #' publisher = \{Association for Computational Linguistics\}, \cr 28 | #' pages = \{142--150\}, \cr 29 | #' url = \{http://www.aclweb.org/anthology/P11-1015\} 30 | #' \} 31 | #' 32 | #' @source \url{http://ai.stanford.edu/~amaas/data/sentiment/} 33 | #' @inheritParams lexicon_afinn 34 | #' @param split Character. Return training ("train") data or testing ("test") 35 | #' data. Defaults to "train". 
36 | #' @return A tibble with 25,000 rows and 2 variables: 37 | #' \describe{ 38 | #' \item{Sentiment}{Character, denoting the sentiment} 39 | #' \item{text}{Character, text of the review} 40 | #' } 41 | #' @keywords datasets 42 | #' @family topic sentiment 43 | #' @export 44 | #' @examples 45 | #' \dontrun{ 46 | #' dataset_imdb() 47 | #' 48 | #' # Custom directory 49 | #' dataset_imdb(dir = "data/") 50 | #' 51 | #' # Deleting dataset 52 | #' dataset_imdb(delete = TRUE) 53 | #' 54 | #' # Returning filepath of data 55 | #' dataset_imdb(return_path = TRUE) 56 | #' 57 | #' # Access both training and testing dataset 58 | #' train <- dataset_imdb(split = "train") 59 | #' test <- dataset_imdb(split = "test") 60 | #' } 61 | #' 62 | #' @importFrom fs file_exists dir_exists dir_create path 63 | #' @importFrom readr read_rds 64 | #' @importFrom utils menu untar 65 | dataset_imdb <- function(dir = NULL, split = c("train", "test"), 66 | delete = FALSE, return_path = FALSE, clean = FALSE, 67 | manual_download = FALSE) { 68 | all_files <- paste0("imdb_", c("train", "test"), ".rds") 69 | split <- match.arg(split) 70 | name <- paste0("imdb_", split, ".rds") 71 | load_dataset( 72 | data_name = "imdb", name = name, dir = dir, 73 | delete = delete, return_path = return_path, clean = clean, 74 | clean_manual = all_files, 75 | manual_download = manual_download 76 | ) 77 | } 78 | 79 | #' @importFrom utils download.file 80 | download_imdb <- function(folder_path) { 81 | file_path <- path(folder_path, "imdb.tar.gz") 82 | if (file_exists(file_path)) { 83 | return(invisible()) 84 | } 85 | download.file( 86 | url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 87 | destfile = file_path 88 | ) 89 | } 90 | 91 | #' @importFrom readr read_tsv write_rds cols col_character col_double 92 | #' @importFrom fs dir_ls 93 | #' @importFrom tibble tibble 94 | process_imdb <- function(folder_path, name_path) { 95 | file_path_test <- path(folder_path, "imdb_csv/test.csv") 96 | file_path_train <- path(folder_path, "imdb_csv/train.csv") 97 | 98 | zip_path <- path(folder_path, "imdb.tar.gz") 99 | 100 | untar(zip_path, exdir = folder_path) 101 | 102 | files_test_neg <- dir_ls(path(folder_path, "aclimdb", "test", "neg")) 103 | files_test_pos <- dir_ls(path(folder_path, "aclimdb", "test", "pos")) 104 | 105 | data_test <- tibble( 106 | sentiment = rep( 107 | c("neg", "pos"), 108 | c( 109 | length(files_test_neg), 110 | length(files_test_pos) 111 | ) 112 | ), 113 | text = c( 114 | vapply(files_test_neg, read_lines, character(1)), 115 | vapply(files_test_pos, read_lines, character(1)) 116 | ) 117 | ) 118 | 119 | files_train_neg <- dir_ls(path(folder_path, "aclimdb", "train", "neg")) 120 | files_train_pos <- dir_ls(path(folder_path, "aclimdb", "train", "pos")) 121 | 122 | data_train <- tibble( 123 | sentiment = rep( 124 | c("neg", "pos"), 125 | c( 126 | length(files_train_neg), 127 | length(files_train_pos) 128 | ) 129 | ), 130 | text = c( 131 | vapply(files_train_neg, read_lines, character(1)), 132 | vapply(files_train_pos, read_lines, character(1)) 133 | ) 134 | ) 135 | 136 | write_rds(data_test, path(folder_path, "imdb_test.rds")) 137 | write_rds(data_train, path(folder_path, "imdb_train.rds")) 138 | } 139 | -------------------------------------------------------------------------------- /R/dataset_sentence_polarity.R: -------------------------------------------------------------------------------- 1 | #' v1.0 sentence polarity dataset 2 | #' 3 | #' 5331 positive and 5331 negative processed sentences / snippets. 
4 | #' Introduced in Pang/Lee ACL 2005. Released July 2005. 5 | #' 6 | #' Citation info: 7 | #' 8 | #' This data was first used in Bo Pang and Lillian Lee, 9 | #' ``Seeing stars: Exploiting class relationships for sentiment categorization 10 | #' with respect to rating scales.'', Proceedings of the ACL, 2005. 11 | #' 12 | #' InProceedings\{pang05, \cr 13 | #' author = \{Bo Pang and Lillian Lee\}, \cr 14 | #' title = \{Seeing stars: Exploiting class relationships for sentiment \cr 15 | #' categorization with respect to rating scales\}, \cr 16 | #' booktitle = \{Proceedings of the ACL\}, \cr 17 | #' year = 2005 \cr 18 | #' \} 19 | #' 20 | #' @inheritParams lexicon_afinn 21 | #' @return A tibble with 10,662 rows and 2 variables: 22 | #' \describe{ 23 | #' \item{text}{Sentences or snippets} 24 | #' \item{sentiment}{Indicator for sentiment, "neg" for negative and "pos" 25 | #' for positive} 26 | #' } 27 | #' @source \url{https://www.cs.cornell.edu/people/pabo/movie-review-data/} 28 | #' @keywords datasets 29 | #' @family sentiment 30 | #' @export 31 | #' @examples 32 | #' \dontrun{ 33 | #' dataset_sentence_polarity() 34 | #' 35 | #' # Custom directory 36 | #' dataset_sentence_polarity(dir = "data/") 37 | #' 38 | #' # Deleting dataset 39 | #' dataset_sentence_polarity(delete = TRUE) 40 | #' 41 | #' # Returning filepath of data 42 | #' dataset_sentence_polarity(return_path = TRUE) 43 | #' } 44 | #' 45 | #' @importFrom fs file_exists dir_exists dir_create path 46 | #' @importFrom readr read_rds 47 | #' @importFrom utils menu 48 | dataset_sentence_polarity <- function(dir = NULL, delete = FALSE, 49 | return_path = FALSE, clean = FALSE, 50 | manual_download = FALSE) { 51 | load_dataset( 52 | data_name = "sentence_polarity", name = "rt-polarity.rds", 53 | dir = dir, delete = delete, return_path = return_path, 54 | clean = clean, manual_download = manual_download 55 | ) 56 | } 57 | 58 | #' @importFrom utils download.file 59 | download_sentence_polarity <- function(folder_path) { 60 | file_path <- path(folder_path, "rt-polaritydata.tar.gz") 61 | if (file_exists(file_path)) { 62 | return(invisible()) 63 | } 64 | download.file( 65 | url = "https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz", 66 | destfile = file_path 67 | ) 68 | } 69 | 70 | #' @importFrom readr read_tsv write_rds cols col_character col_double 71 | #' @importFrom tibble tibble 72 | process_sentence_polarity <- function(folder_path, name_path) { 73 | full_text <- read_lines(path(folder_path, "rt-polaritydata.tar.gz")) 74 | 75 | neq_text <- full_text[55:5385] 76 | neq_text[1] <- "simplistic , silly and tedious . " 77 | pos_text <- full_text[5386:10716] 78 | pos_text[1] <- "othe rock is destined to be the 21st century's new \" conan \" and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . " 79 | 80 | data <- tibble( 81 | text = c(neq_text, pos_text), 82 | sentiment = c( 83 | rep("neg", length(neq_text)), 84 | rep("pos", length(pos_text)) 85 | ) 86 | ) 87 | write_rds(data, name_path) 88 | } 89 | -------------------------------------------------------------------------------- /R/dataset_trec.R: -------------------------------------------------------------------------------- 1 | #' TREC dataset 2 | #' 3 | #' The TREC dataset is dataset for question classification consisting of 4 | #' open-domain, fact-based questions divided into broad semantic categories. 5 | #' It has both a six-class (TREC-6) and a fifty-class (TREC-50) version. 
Both 6 | #' have 5,452 training examples and 500 test examples, but TREC-50 has 7 | #' finer-grained labels. Models are evaluated based on accuracy. 8 | #' 9 | #' The classes in TREC-6 are 10 | #' 11 | #' \itemize{ 12 | #' \item ABBR - Abbreviation 13 | #' \item DESC - Description and abstract concepts 14 | #' \item ENTY - Entities 15 | #' \item HUM - Human beings 16 | #' \item LOC - Locations 17 | #' \item NUM - Numeric values 18 | #' } 19 | #' 20 | #' The classes in TREC-50 can be found here 21 | #' \url{https://cogcomp.seas.upenn.edu/Data/QA/QC/definition.html}. 22 | #' 23 | #' @source \url{https://cogcomp.seas.upenn.edu/Data/QA/QC/} 24 | #' @source \url{https://trec.nist.gov/data/qa.html} 25 | #' @inheritParams lexicon_afinn 26 | #' @param split Character. Return training ("train") data or testing ("test") 27 | #' data. Defaults to "train". 28 | #' @param version Character. Version 6 ("6") or version 50 ("50"). Defaults to 29 | #' "6". 30 | #' @return A tibble with 5,452 or 500 rows for "train" and "test" 31 | #' respectively and 2 variables: 32 | #' \describe{ 33 | #' \item{class}{Character, denoting the class} 34 | #' \item{text}{Character, question text} 35 | #' } 36 | #' @keywords datasets 37 | #' @family topic 38 | #' @export 39 | #' @examples 40 | #' \dontrun{ 41 | #' dataset_trec() 42 | #' 43 | #' # Custom directory 44 | #' dataset_trec(dir = "data/") 45 | #' 46 | #' # Deleting dataset 47 | #' dataset_trec(delete = TRUE) 48 | #' 49 | #' # Returning filepath of data 50 | #' dataset_trec(return_path = TRUE) 51 | #' 52 | #' # Access both training and testing dataset 53 | #' train_6 <- dataset_trec(split = "train") 54 | #' test_6 <- dataset_trec(split = "test") 55 | #' 56 | #' train_50 <- dataset_trec(split = "train", version = "50") 57 | #' test_50 <- dataset_trec(split = "test", version = "50") 58 | #' } 59 | #' 60 | #' @importFrom fs file_exists dir_exists dir_create path 61 | #' @importFrom readr read_rds 62 | #' @importFrom utils menu untar 63 | dataset_trec <- function(dir = NULL, split = c("train", "test"), 64 | version = c("6", "50"), delete = FALSE, 65 | return_path = FALSE, clean = FALSE, 66 | manual_download = FALSE) { 67 | all_files <- paste0( 68 | "trec_", rep(c("6", "50"), 2), "_", 69 | rep(c("train", "test"), each = 2), ".rds" 70 | ) 71 | split <- match.arg(split) 72 | version <- match.arg(version) 73 | name <- paste0("trec_", version, "_", split, ".rds") 74 | load_dataset( 75 | data_name = "trec", name = name, dir = dir, 76 | delete = delete, return_path = return_path, clean = clean, 77 | clean_manual = all_files, 78 | manual_download = manual_download 79 | ) 80 | } 81 | 82 | #' @importFrom utils download.file 83 | download_trec <- function(folder_path) { 84 | file_path_train <- path(folder_path, "train_5500.label") 85 | file_path_test <- path(folder_path, "TREC_10.label") 86 | 87 | if (file_exists(file_path_train) & file_exists(file_path_test)) { 88 | return(invisible()) 89 | } 90 | download.file( 91 | url = "https://cogcomp.seas.upenn.edu/Data/QA/QC/train_5500.label", 92 | destfile = file_path_train 93 | ) 94 | download.file( 95 | url = "https://cogcomp.seas.upenn.edu/Data/QA/QC/TREC_10.label", 96 | destfile = file_path_test 97 | ) 98 | } 99 | 100 | #' @importFrom readr read_tsv write_rds cols col_character col_double 101 | #' @importFrom tibble tibble 102 | process_trec <- function(folder_path, name_path) { 103 | file_path_train <- path(folder_path, "train_5500.label") 104 | file_path_test <- path(folder_path, "TREC_10.label") 105 | 106 | # Test data 107 | data_test
<- read_lines(file_path_test) 108 | 109 | text_test <- gsub("^\\S* ", "", data_test) 110 | 111 | label_test <- sub("\\s.*", "", data_test) 112 | 113 | trec6_label_test <- sapply(strsplit(label_test, ":"), function(x) x[1]) 114 | trec50_label_test <- sapply(strsplit(label_test, ":"), function(x) x[2]) 115 | 116 | trec_6_test <- tibble( 117 | class = trec6_label_test, 118 | text = text_test 119 | ) 120 | trec_50_test <- tibble( 121 | class = trec50_label_test, 122 | text = text_test 123 | ) 124 | # train data 125 | data_train <- read_lines(file_path_train) 126 | 127 | text_train <- gsub("^\\S* ", "", data_train) 128 | 129 | label_train <- sub("\\s.*", "", data_train) 130 | 131 | trec6_label_train <- sapply(strsplit(label_train, ":"), function(x) x[1]) 132 | trec50_label_train <- sapply(strsplit(label_train, ":"), function(x) x[2]) 133 | 134 | trec_6_train <- tibble( 135 | class = trec6_label_train, 136 | text = text_train 137 | ) 138 | trec_50_train <- tibble( 139 | class = trec50_label_train, 140 | text = text_train 141 | ) 142 | 143 | write_rds(trec_6_test, path(folder_path, "trec_6_test.rds")) 144 | write_rds(trec_6_train, path(folder_path, "trec_6_train.rds")) 145 | 146 | write_rds(trec_50_test, path(folder_path, "trec_50_test.rds")) 147 | write_rds(trec_50_train, path(folder_path, "trec_50_train.rds")) 148 | } 149 | -------------------------------------------------------------------------------- /R/download_functions.R: -------------------------------------------------------------------------------- 1 | #' List of all download functions used in load_dataset 2 | #' 3 | #' @format Named list of all download functions 4 | #' @include lexicon_afinn.R lexicon_loughran.R lexicon_bing.R lexicon_nrc.R 5 | #' @include dataset_sentence_polarity.R dataset_ag_news.R dataset_dbpedia.R 6 | #' @include dataset_trec.R dataset_imdb.R lexicon_nrc_eil.R lexicon_nrc_vad.R 7 | #' @include embedding_glove.R 8 | #' 9 | #' @name download_functions 10 | #' @noRd 11 | NULL 12 | 13 | download_functions <- list( 14 | afinn = download_afinn, 15 | sentence_polarity = download_sentence_polarity, 16 | loughran = download_loughran, 17 | bing = download_bing, 18 | nrc = download_nrc, 19 | nrc_eil = download_nrc_eil, 20 | nrc_vad = download_nrc_vad, 21 | ag_news = download_ag_news, 22 | dbpedia = download_dbpedia, 23 | trec = download_trec, 24 | imdb = download_imdb, 25 | glove6b = download_glove6b, 26 | glove27b = download_glove27b, 27 | glove42b = download_glove42b, 28 | glove840b = download_glove840b 29 | ) 30 | -------------------------------------------------------------------------------- /R/embedding_glove.R: -------------------------------------------------------------------------------- 1 | #' Global Vectors for Word Representation 2 | #' 3 | #' The GloVe pre-trained word vectors provide word embeddings created using 4 | #' varying numbers of tokens. 5 | #' 6 | #' Citation info: 7 | #' 8 | #' InProceedings\{pennington2014glove, \cr 9 | #' author = \{Jeffrey Pennington and Richard Socher and Christopher D. \cr 10 | #' Manning\}, \cr 11 | #' title = \{GloVe: Global Vectors for Word Representation\}, \cr 12 | #' booktitle = \{Empirical Methods in Natural Language Processing (EMNLP)\}, \cr 13 | #' year = 2014 \cr 14 | #' pages = \{1532-1543\} \cr 15 | #' url = \{http://www.aclweb.org/anthology/D14-1162\} \cr 16 | #' \} 17 | #' 18 | #' @references Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 19 | #' 2014. GloVe: Global Vectors for Word Representation. 
20 | #' 21 | #' @inheritParams lexicon_afinn 22 | #' @param dimensions A number indicating the number of vectors to include. One 23 | #' of 50, 100, 200, or 300 for glove6b, or one of 25, 50, 100, or 200 for 24 | #' glove27b. 25 | #' @return A tibble with 400k, 1.9m, 2.2m, or 1.2m rows (one row for each unique 26 | #' token in the vocabulary) and the following variables: 27 | #' \describe{ 28 | #' \item{token}{An individual token (usually a word)} 29 | #' \item{d1, d2, etc}{The embeddings for that token.} 30 | #' } 31 | #' @source \url{https://nlp.stanford.edu/projects/glove/} 32 | #' @keywords datasets 33 | #' @family embeddings 34 | #' @examples 35 | #' \dontrun{ 36 | #' embedding_glove6b(dimensions = 50) 37 | #' 38 | #' # Custom directory 39 | #' embedding_glove42b(dir = "data/") 40 | #' 41 | #' # Deleting dataset 42 | #' embedding_glove6b(delete = TRUE, dimensions = 300) 43 | #' 44 | #' # Returning filepath of data 45 | #' embedding_glove840b(return_path = TRUE) 46 | #' } 47 | #' @name embedding_glove 48 | NULL 49 | 50 | #' @rdname embedding_glove 51 | #' @export 52 | #' @importFrom fs file_exists dir_exists dir_create path 53 | #' @importFrom readr read_rds 54 | #' @importFrom utils menu 55 | embedding_glove6b <- function(dir = NULL, 56 | dimensions = c(50, 100, 200, 300), 57 | delete = FALSE, 58 | return_path = FALSE, 59 | clean = FALSE, 60 | manual_download = FALSE) { 61 | this_glove <- "6b" 62 | available_dims <- c(50, 100, 200, 300) 63 | all_names <- construct_glove_name(this_glove, available_dims) 64 | dimensions <- as.character(dimensions) 65 | dimensions <- match.arg(dimensions, as.character(available_dims)) 66 | name <- construct_glove_name(this_glove, dimensions) 67 | load_dataset( 68 | data_name = "glove6b", name = name, dir = dir, 69 | delete = delete, return_path = return_path, clean = clean, 70 | clean_manual = all_names, 71 | manual_download = manual_download 72 | ) 73 | } 74 | 75 | #' @keywords internal 76 | construct_glove_name <- function(tokens = c("6b", "27b"), 77 | dimensions = c(25, 50, 100, 200, 300)) { 78 | tokens <- match.arg(tokens) 79 | dimensions <- as.character(dimensions) 80 | dimensions <- match.arg( 81 | dimensions, 82 | choices = as.character(c(25, 50, 100, 200, 300)), 83 | several.ok = TRUE 84 | ) 85 | paste0( 86 | paste( 87 | "glove", 88 | tokens, 89 | dimensions, 90 | sep = "_" 91 | ), 92 | ".rds" 93 | ) 94 | } 95 | 96 | #' @rdname embedding_glove 97 | #' @export 98 | #' @importFrom fs file_exists dir_exists dir_create path 99 | #' @importFrom readr read_rds 100 | #' @importFrom utils menu 101 | embedding_glove27b <- function(dir = NULL, 102 | dimensions = c(25, 50, 100, 200), 103 | delete = FALSE, 104 | return_path = FALSE, 105 | clean = FALSE, 106 | manual_download = FALSE) { 107 | this_glove <- "27b" 108 | available_dims <- c(25, 50, 100, 200) 109 | all_names <- construct_glove_name(this_glove, available_dims) 110 | dimensions <- as.character(dimensions) 111 | dimensions <- match.arg(dimensions, as.character(available_dims)) 112 | name <- construct_glove_name(this_glove, dimensions) 113 | load_dataset( 114 | data_name = "glove27b", name = name, dir = dir, 115 | delete = delete, return_path = return_path, clean = clean, 116 | clean_manual = all_names, 117 | manual_download = manual_download 118 | ) 119 | } 120 | 121 | #' @rdname embedding_glove 122 | #' @export 123 | #' @importFrom fs file_exists dir_exists dir_create path 124 | #' @importFrom readr read_rds 125 | #' @importFrom utils menu 126 | embedding_glove42b <- function(dir = NULL, 127 | 
delete = FALSE, 128 | return_path = FALSE, 129 | clean = FALSE, 130 | manual_download = FALSE) { 131 | name <- "glove_42b.rds" 132 | load_dataset( 133 | data_name = "glove42b", name = name, dir = dir, 134 | delete = delete, return_path = return_path, clean = clean, 135 | manual_download = manual_download 136 | ) 137 | } 138 | 139 | #' @rdname embedding_glove 140 | #' @export 141 | #' @importFrom fs file_exists dir_exists dir_create path 142 | #' @importFrom readr read_rds 143 | #' @importFrom utils menu 144 | embedding_glove840b <- function(dir = NULL, 145 | delete = FALSE, 146 | return_path = FALSE, 147 | clean = FALSE, 148 | manual_download = FALSE) { 149 | name <- "glove_840b.rds" 150 | load_dataset( 151 | data_name = "glove840b", name = name, dir = dir, 152 | delete = delete, return_path = return_path, clean = clean, 153 | manual_download = manual_download 154 | ) 155 | } 156 | 157 | #' @importFrom utils download.file 158 | #' @keywords internal 159 | download_glove6b <- function(folder_path) { 160 | file_path <- path(folder_path, "glove.6B.zip") 161 | if (file_exists(file_path)) { 162 | return(invisible()) 163 | } 164 | download.file( 165 | url = "http://nlp.stanford.edu/data/glove.6B.zip", 166 | destfile = file_path 167 | ) 168 | } 169 | 170 | #' @importFrom utils download.file 171 | #' @keywords internal 172 | download_glove42b <- function(folder_path) { 173 | file_path <- path(folder_path, "glove.42B.300d.zip") 174 | if (file_exists(file_path)) { 175 | return(invisible()) 176 | } 177 | download.file( 178 | url = "http://nlp.stanford.edu/data/glove.42B.300d.zip", 179 | destfile = file_path 180 | ) 181 | } 182 | 183 | #' @importFrom utils download.file 184 | #' @keywords internal 185 | download_glove840b <- function(folder_path) { 186 | file_path <- path(folder_path, "glove.840B.300d.zip") 187 | if (file_exists(file_path)) { 188 | return(invisible()) 189 | } 190 | download.file( 191 | url = "http://nlp.stanford.edu/data/glove.840B.300d.zip", 192 | destfile = file_path 193 | ) 194 | } 195 | 196 | #' @importFrom utils download.file 197 | #' @keywords internal 198 | download_glove27b <- function(folder_path) { 199 | file_path <- path(folder_path, "glove.twitter.27B.zip") 200 | if (file_exists(file_path)) { 201 | return(invisible()) 202 | } 203 | download.file( 204 | url = "http://nlp.stanford.edu/data/glove.twitter.27B.zip", 205 | destfile = file_path 206 | ) 207 | } 208 | 209 | #' @keywords internal 210 | process_glove6b <- function(folder_path, name_path) { 211 | # Processing all datasets when they only need one adds time. We'll 212 | # specifically deal with the one they requested, which means we need to 213 | # extract the dimensions back out of the name to build the raw filename. 
214 | filename <- gsub(folder_path, "", name_path) 215 | dimensions <- unlist(strsplit(filename, "_|\\."))[[3]] 216 | raw_name <- paste0("glove.6B.", dimensions, "d.txt") 217 | file <- unz(path(folder_path, "glove.6B.zip"), raw_name) 218 | 219 | write_glove(file, name_path, dimensions) 220 | } 221 | 222 | #' @keywords internal 223 | process_glove42b <- function(folder_path, name_path) { 224 | dimensions <- 300 225 | raw_name <- "glove.42B.300d.txt" 226 | file <- unz(path(folder_path, "glove.42B.300d.zip"), raw_name) 227 | 228 | write_glove(file, name_path, dimensions) 229 | } 230 | 231 | #' @keywords internal 232 | process_glove840b <- function(folder_path, name_path) { 233 | dimensions <- 300 234 | raw_name <- "glove.840B.300d.txt" 235 | file <- unz(path(folder_path, "glove.840B.300d.zip"), raw_name) 236 | 237 | write_glove(file, name_path, dimensions) 238 | } 239 | 240 | #' @keywords internal 241 | process_glove27b <- function(folder_path, name_path) { 242 | filename <- gsub(folder_path, "", name_path) 243 | dimensions <- unlist(strsplit(filename, "_|\\."))[[3]] 244 | raw_name <- paste0("glove.twitter.27B.", dimensions, "d.txt") 245 | 246 | file <- unz(path(folder_path, "glove.twitter.27B.zip"), raw_name) 247 | 248 | write_glove(file, name_path, dimensions) 249 | } 250 | 251 | #' @importFrom readr read_delim write_rds 252 | #' @keywords internal 253 | write_glove <- function(file, name_path, dimensions) { 254 | embeddings <- read_delim( 255 | file, 256 | delim = " ", 257 | quote = "", 258 | col_names = c( 259 | "token", 260 | paste0("d", seq_len(dimensions)) 261 | ), 262 | col_types = paste0( 263 | c( 264 | "c", 265 | rep("d", dimensions) 266 | ), 267 | collapse = "" 268 | ) 269 | ) 270 | 271 | write_rds(embeddings, name_path) 272 | } 273 | -------------------------------------------------------------------------------- /R/info.R: -------------------------------------------------------------------------------- 1 | print_info <- list( 2 | afinn = 3 | list( 4 | name = "AFINN-111", 5 | url = "http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010", 6 | license = "Open Database License (ODbL) v1.0", 7 | size = "78 KB (cleaned 59 KB)", 8 | type = "lexicon", 9 | download_mech = "https", 10 | description = "", 11 | citation = NA 12 | ), 13 | sentence_polarity = 14 | list( 15 | name = "v1.0 sentence polarity", 16 | url = "http://www.cs.cornell.edu/people/pabo/movie-review-data", 17 | license = "Cite the paper when used.", 18 | size = "2 MB (cleaned 1.4 MB)", 19 | type = "dataset", 20 | download_mech = "https", 21 | description = "Dataset with sentences labeled with negative or positive sentiment.", 22 | citation = NA 23 | ), 24 | loughran = 25 | list( 26 | name = "Loughran-McDonald Sentiment lexicon", 27 | url = "https://sraf.nd.edu/textual-analysis/resources/", 28 | license = "License required for commercial use. 
Please contact tloughra@nd.edu.", 29 | size = "6.7 MB (cleaned 142 KB)", 30 | type = "lexicon", 31 | download_mech = "https", 32 | description = "", 33 | citation = NA 34 | ), 35 | bing = 36 | list( 37 | name = "Bing Sentiment Lexicon", 38 | url = "https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html", 39 | license = "May be used (research, commercial, etc) with attribution.", 40 | size = "287 KB (cleaned 220 KB)", 41 | type = "lexicon", 42 | download_mech = "http", 43 | description = "", 44 | citation = NA 45 | ), 46 | nrc = 47 | list( 48 | name = "NRC Word-Emotion Association Lexicon", 49 | url = "http://saifmohammad.com/WebPages/lexicons.html", 50 | license = "License required for commercial use. Please contact Saif M. Mohammad (saif.mohammad@nrc-cnrc.gc.ca).", 51 | size = "22.8 MB (cleaned 424 KB)", 52 | type = "lexicon", 53 | download_mech = "http", 54 | description = "", 55 | citation = "Citation info: 56 | 57 | This dataset was published in Saif M. Mohammad and Peter Turney. (2013), ``Crowdsourcing a Word-Emotion Association Lexicon.'' Computational Intelligence, 29(3): 436-465. 58 | 59 | article{mohammad13, 60 | author = {Mohammad, Saif M. and Turney, Peter D.}, 61 | title = {Crowdsourcing a Word-Emotion Association Lexicon}, 62 | journal = {Computational Intelligence}, 63 | volume = {29}, 64 | number = {3}, 65 | pages = {436-465}, 66 | doi = {10.1111/j.1467-8640.2012.00460.x}, 67 | url = {https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-8640.2012.00460.x}, 68 | eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1467-8640.2012.00460.x}, 69 | year = {2013} 70 | } 71 | If you use this lexicon, then please cite it." 72 | ), 73 | nrc_eil = 74 | list( 75 | name = "NRC Emotion Intensity Lexicon (aka Affect Intensity Lexicon)", 76 | url = "www.saifmohammad.com/WebPages/AffectIntensity.htm", 77 | license = "License required for commercial use. Please contact Saif M. Mohammad (saif.mohammad@nrc-cnrc.gc.ca).", 78 | size = "333 KB (cleaned 212 KB)", 79 | type = "lexicon", 80 | download_mech = "http", 81 | description = "", 82 | citation = "Citation info: 83 | Details of the lexicon are in this paper. 84 | Word Affect Intensities. Saif M. Mohammad. arXiv preprint arXiv, April 2017. 85 | 86 | inproceedings{LREC18-AIL, 87 | author = {Mohammad, Saif M.}, 88 | title = {Word Affect Intensities}, 89 | booktitle = {Proceedings of the 11th Edition of the Language Resources and Evaluation Conference (LREC-2018)}, 90 | year = {2018}, 91 | address={Miyazaki, Japan} 92 | } 93 | 94 | If you use this lexicon, then please cite it." 95 | ), 96 | nrc_vad = 97 | list( 98 | name = "The NRC Valence, Arousal, and Dominance Lexicon", 99 | url = "https://saifmohammad.com/WebPages/nrc-vad.html", 100 | license = "License required for commercial use. Please contact Saif M. Mohammad (saif.mohammad@nrc-cnrc.gc.ca).", 101 | size = "150.8 MB (cleaned 792 KB)", 102 | type = "lexicon", 103 | download_mech = "http", 104 | description = "", 105 | citation = "Citation info: 106 | 107 | inproceedings{vad-acl2018, 108 | title={Obtaining Reliable Human Ratings of Valence, Arousal, and Dominance for 20,000 English Words}, 109 | author={Mohammad, Saif M.}, 110 | booktitle={Proceedings of The Annual Conference of the Association for Computational Linguistics (ACL)}, 111 | year={2018}, 112 | address={Melbourne, Australia} 113 | } 114 | 115 | If you use this lexicon, then please cite it." 
116 | ), 117 | ag_news = 118 | list( 119 | name = "AG News", 120 | url = "https://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html", 121 | license = "You are encouraged to download this corpus for any non-commercial use.", 122 | size = "64.4 MB (cleaned 33.9 MB)", 123 | type = "dataset", 124 | download_mech = "https", 125 | description = "", 126 | citation = NA 127 | ), 128 | dbpedia = 129 | list( 130 | name = "DBpedia", 131 | url = "https://wiki.dbpedia.org/", 132 | license = "Creative Commons Attribution-ShareAlike 3.0 License", 133 | size = "279.5 MB (cleaned 211.1 MB)", 134 | type = "dataset", 135 | download_mech = "https", 136 | description = "", 137 | citation = NA 138 | ), 139 | trec = 140 | list( 141 | name = "TREC-6 & TREC-50", 142 | url = "https://cogcomp.seas.upenn.edu/Data/QA/QC/", 143 | license = "Freely reusable public information licence", 144 | size = "1.2 MB (cleaned 827 KB)", 145 | type = "dataset", 146 | download_mech = "https", 147 | description = "", 148 | citation = NA 149 | ), 150 | imdb = 151 | list( 152 | name = "IMDb Large Movie Review Dataset", 153 | url = "http://ai.stanford.edu/~amaas/data/sentiment/", 154 | license = "No license specified, the work may be protected by copyright.", 155 | size = "376.4 MB (cleaned 71 MB)", 156 | type = "dataset", 157 | download_mech = "http", 158 | description = "", 159 | citation = NA 160 | ), 161 | glove6b = 162 | list( 163 | name = "GloVe 6B", 164 | url = "https://nlp.stanford.edu/projects/glove/", 165 | license = "Public Domain Dedication and License v1.0", 166 | size = "822.2 MB (158MB, 311MB, 616MB, and 921MB processed)", 167 | type = "embeddings", 168 | download_mech = "https", 169 | description = "Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 50d, 100d, 200d, & 300d vectors)", 170 | citation = "Citation info: 171 | inproceedings{pennington2014glove, 172 | author = {Jeffrey Pennington and Richard Socher and Christopher D. Manning}, 173 | booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, 174 | title = {GloVe: Global Vectors for Word Representation}, 175 | year = {2014}, 176 | pages = {1532--1543}, 177 | url = {http://www.aclweb.org/anthology/D14-1162}, 178 | }" 179 | ), 180 | glove27b = 181 | list( 182 | name = "GloVe Twitter 27B", 183 | url = "https://nlp.stanford.edu/projects/glove/", 184 | license = "Public Domain Dedication and License v1.0", 185 | size = "1.42 GB (248MB, 476MB, 931MB, and 1.79GB processed)", 186 | type = "embeddings", 187 | download_mech = "https", 188 | description = "Twitter (2B tweets, 27B tokens, 1.2M vocab, uncased, 25d, 50d, 100d, & 200d vectors)", 189 | citation = "Citation info: 190 | inproceedings{pennington2014glove, 191 | author = {Jeffrey Pennington and Richard Socher and Christopher D. 
Manning}, 192 | booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, 193 | title = {GloVe: Global Vectors for Word Representation}, 194 | year = {2014}, 195 | pages = {1532--1543}, 196 | url = {http://www.aclweb.org/anthology/D14-1162}, 197 | }" 198 | ), 199 | glove42b = 200 | list( 201 | name = "GloVe Common Crawl 42B", 202 | url = "https://nlp.stanford.edu/projects/glove/", 203 | license = "Public Domain Dedication and License v1.0", 204 | size = "1.75 GB (4.31GB processed)", 205 | type = "embeddings", 206 | download_mech = "https", 207 | description = "Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors)", 208 | citation = "Citation info: 209 | inproceedings{pennington2014glove, 210 | author = {Jeffrey Pennington and Richard Socher and Christopher D. Manning}, 211 | booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, 212 | title = {GloVe: Global Vectors for Word Representation}, 213 | year = {2014}, 214 | pages = {1532--1543}, 215 | url = {http://www.aclweb.org/anthology/D14-1162}, 216 | }" 217 | ), 218 | glove840b = 219 | list( 220 | name = "GloVe Common Crawl 840B", 221 | url = "https://nlp.stanford.edu/projects/glove/", 222 | license = "Public Domain Dedication and License v1.0", 223 | size = "2.03 GB (4.94GB processed)", 224 | type = "embeddings", 225 | download_mech = "https", 226 | description = "Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors)", 227 | citation = "Citation info: 228 | inproceedings{pennington2014glove, 229 | author = {Jeffrey Pennington and Richard Socher and Christopher D. Manning}, 230 | booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, 231 | title = {GloVe: Global Vectors for Word Representation}, 232 | year = {2014}, 233 | pages = {1532--1543}, 234 | url = {http://www.aclweb.org/anthology/D14-1162}, 235 | }" 236 | ) 237 | ) 238 | 239 | #' Catalogue of all available data sources 240 | #' @export 241 | "catalogue" 242 | catalogue <- Reduce(rbind, lapply(print_info, as.data.frame, 243 | stringsAsFactors = FALSE 244 | )) 245 | -------------------------------------------------------------------------------- /R/lexicon_afinn.R: -------------------------------------------------------------------------------- 1 | #' AFINN-111 dataset 2 | #' 3 | #' AFINN is a lexicon of English words rated for valence with an integer 4 | #' between minus five (negative) and plus five (positive). The words have 5 | #' been manually labeled by Finn Årup Nielsen in 2009-2011. 6 | #' 7 | #' This dataset is the newest version with 2477 words and phrases. 8 | #' 9 | #' Citation info: 10 | #' 11 | #' This dataset was published in Finn Ärup Nielsen (2011), 12 | #' ``A new Evaluation of a word list for sentiment analysis in 13 | #' microblogs'', Proceedings of the ESWC2011 Workshop on 14 | #' 'Making Sense of Microposts': Big things come in small packages (2011) 93-98. 
15 | #' 16 | #' article\{nielsen11, \cr 17 | #' author = \{Finn Äruprup Nielsen\}, \cr 18 | #' title = \{A new Evaluation of a word list for sentiment analysis in microblogs\}, \cr 19 | #' journal = \{CoRR\}, \cr 20 | #' volume = \{abs/1103.2903\}, \cr 21 | #' year = \{2011\}, \cr 22 | #' url = \{http://arxiv.org/abs/1103.2903\}, \cr 23 | #' archivePrefix = \{arXiv\}, \cr 24 | #' eprint = \{1103.2903\}, \cr 25 | #' biburl = \{https://dblp.org/rec/bib/journals/corr/abs-1103-2903\}, \cr 26 | #' bibsource = \{dblp computer science bibliography, https://dblp.org\} \cr 27 | #' \} 28 | #' 29 | #' @param dir Character, path to directory where data will be stored. If 30 | #' \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path. 31 | #' @param delete Logical, set \code{TRUE} to delete dataset. 32 | #' @param return_path Logical, set \code{TRUE} to return the path of the dataset. 33 | #' @param clean Logical, set \code{TRUE} to remove intermediate files. This can 34 | #' greatly reduce the size. Defaults to FALSE. 35 | #' @param manual_download Logical, set \code{TRUE} if you have manually 36 | #' downloaded the file and placed it in the folder designated by running 37 | #' this function with \code{return_path = TRUE}. 38 | #' @return A tibble with 2,477 rows and 2 variables: 39 | #' \describe{ 40 | #' \item{word}{An English word} 41 | #' \item{score}{Indicator for sentiment: integer between -5 and +5} 42 | #' } 43 | #' 44 | #' @keywords datasets 45 | #' @family lexicon 46 | #' @importFrom fs file_exists dir_exists dir_create 47 | #' @importFrom readr read_rds 48 | #' @importFrom utils menu 49 | #' @export 50 | #' @examples 51 | #' \dontrun{ 52 | #' lexicon_afinn() 53 | #' 54 | #' # Custom directory 55 | #' lexicon_afinn(dir = "data/") 56 | #' 57 | #' # Deleting dataset 58 | #' lexicon_afinn(delete = TRUE) 59 | #' 60 | #' # Returning filepath of data 61 | #' lexicon_afinn(return_path = TRUE) 62 | #' } 63 | lexicon_afinn <- function(dir = NULL, delete = FALSE, return_path = FALSE, 64 | clean = FALSE, manual_download = FALSE) { 65 | load_dataset( 66 | data_name = "afinn", name = "afinn_111.rds", dir = dir, 67 | delete = delete, return_path = return_path, clean = clean, 68 | manual_download = manual_download 69 | ) 70 | } 71 | 72 | #' @importFrom utils download.file 73 | download_afinn <- function(folder_path) { 74 | file_path <- path(folder_path, "imm6010.zip") 75 | if (file_exists(file_path)) { 76 | return(invisible()) 77 | } 78 | download.file( 79 | url = "http://www2.imm.dtu.dk/pubdb/views/edoc_download.php/6010/zip/imm6010.zip", 80 | destfile = file_path 81 | ) 82 | } 83 | 84 | #' @importFrom readr read_tsv write_rds cols col_character col_double 85 | process_afinn <- function(folder_path, name_path) { 86 | file <- unz(path(folder_path, "imm6010.zip"), "AFINN/AFINN-111.txt") 87 | data <- read_tsv(file, 88 | col_types = cols( 89 | word = col_character(), 90 | value = col_double() 91 | ), 92 | col_names = c("word", "value") 93 | ) 94 | write_rds(data, name_path) 95 | } 96 | -------------------------------------------------------------------------------- /R/lexicon_bing.R: -------------------------------------------------------------------------------- 1 | #' Bing sentiment lexicon 2 | #' 3 | #' General purpose English sentiment lexicon that categorizes words in a 4 | #' binary fashion, either positive or negative 5 | #' 6 | #' Citation info: 7 | #' 8 | #' This dataset was first published in Minqing Hu and Bing Liu, ``Mining and 9 | #' summarizing customer reviews.'', 
Proceedings of the ACM SIGKDD International 10 | #' Conference on Knowledge Discovery & Data Mining (KDD-2004), 2004. 11 | #' 12 | #' inproceedings\{Hu04, \cr 13 | #' author = \{Hu, Minqing and Liu, Bing\}, \cr 14 | #' title = \{Mining and Summarizing Customer Reviews\}, \cr 15 | #' booktitle = \{Proceedings of the Tenth ACM SIGKDD International Conference 16 | #' on Knowledge Discovery and Data Mining\}, \cr 17 | #' series = \{KDD '04\}, \cr 18 | #' year = \{2004\}, \cr 19 | #' isbn = \{1-58113-888-1\}, \cr 20 | #' location = \{Seattle, WA, USA\}, \cr 21 | #' pages = \{168--177\}, \cr 22 | #' numpages = \{10\}, \cr 23 | #' url = \{http://doi.acm.org/10.1145/1014052.1014073\}, \cr 24 | #' doi = \{10.1145/1014052.1014073\}, \cr 25 | #' acmid = \{1014073\}, \cr 26 | #' publisher = \{ACM\}, \cr 27 | #' address = \{New York, NY, USA\}, \cr 28 | #' keywords = \{reviews, sentiment classification, summarization, text mining\}, \cr 29 | #' \} 30 | #' 31 | #' @inheritParams lexicon_afinn 32 | #' @return A tibble with 6,787 rows and 2 variables: 33 | #' \describe{ 34 | #' \item{word}{An English word} 35 | #' \item{sentiment}{Indicator for sentiment: "negative" or "positive"} 36 | #' } 37 | #' 38 | #' @source \url{https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html} 39 | #' @keywords datasets 40 | #' @family lexicon 41 | #' @importFrom fs file_exists dir_exists dir_create 42 | #' @importFrom readr read_rds 43 | #' @importFrom utils menu 44 | #' @export 45 | #' @examples 46 | #' \dontrun{ 47 | #' lexicon_bing() 48 | #' 49 | #' # Custom directory 50 | #' lexicon_bing(dir = "data/") 51 | #' 52 | #' # Deleting dataset 53 | #' lexicon_bing(delete = TRUE) 54 | #' 55 | #' # Returning filepath of data 56 | #' lexicon_bing(return_path = TRUE) 57 | #' } 58 | lexicon_bing <- function(dir = NULL, delete = FALSE, return_path = FALSE, 59 | clean = FALSE, manual_download = FALSE) { 60 | load_dataset( 61 | data_name = "bing", name = "bing.rds", dir = dir, 62 | delete = delete, return_path = return_path, clean = clean, 63 | manual_download = manual_download 64 | ) 65 | } 66 | 67 | 68 | #' @importFrom utils download.file 69 | #' @importFrom fs path 70 | download_bing <- function(folder_path) { 71 | file_path_neg <- path(folder_path, "negative-words.txt") 72 | file_path_pos <- path(folder_path, "positive-words.txt") 73 | 74 | if (file_exists(file_path_pos) & file_exists(file_path_neg)) { 75 | return(invisible()) 76 | } 77 | 78 | download.file( 79 | url = "http://ptrckprry.com/course/ssd/data/negative-words.txt", 80 | destfile = file_path_neg 81 | ) 82 | download.file( 83 | url = "http://ptrckprry.com/course/ssd/data/positive-words.txt", 84 | destfile = file_path_pos 85 | ) 86 | } 87 | 88 | #' @importFrom readr read_lines 89 | process_bing <- function(folder_path, name_path) { 90 | file_path_neg <- path(folder_path, "negative-words.txt") 91 | file_path_pos <- path(folder_path, "positive-words.txt") 92 | 93 | neg_words <- read_lines(file_path_neg, skip = 35) 94 | pos_words <- read_lines(file_path_pos, skip = 35) 95 | 96 | data <- tibble( 97 | word = c(neg_words, pos_words), 98 | sentiment = rep( 99 | c("negative", "positive"), 100 | c(length(neg_words), length(pos_words)) 101 | ) 102 | ) 103 | 104 | write_rds(data, name_path) 105 | } 106 | -------------------------------------------------------------------------------- /R/lexicon_loughran.R: -------------------------------------------------------------------------------- 1 | #' Loughran-McDonald sentiment lexicon 2 | #' 3 | #' English sentiment lexicon created for 
use with financial documents. This 4 | #' lexicon labels words with six possible sentiments important in financial 5 | #' contexts: "negative", "positive", "litigious", "uncertainty", "constraining", 6 | #' or "superfluous". 7 | #' 8 | #' Citation info: 9 | #' 10 | #' This dataset was published in Loughran, T. and McDonald, B. (2011), 11 | #' ``When Is a Liability Not a Liability? Textual Analysis, Dictionaries, and 12 | #' 10-Ks.'' The Journal of Finance, 66: 35-65. 13 | #' 14 | #' article\{loughran11, \cr 15 | #' author = \{Loughran, Tim and McDonald, Bill\}, \cr 16 | #' title = \{When Is a Liability Not a Liability? Textual Analysis, Dictionaries, and 10-Ks\}, \cr 17 | #' journal = \{The Journal of Finance\}, \cr 18 | #' volume = \{66\}, \cr 19 | #' number = \{1\}, \cr 20 | #' pages = \{35-65\}, \cr 21 | #' doi = \{10.1111/j.1540-6261.2010.01625.x\}, \cr 22 | #' url = \{https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1540-6261.2010.01625.x\}, \cr 23 | #' eprint = \{https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1540-6261.2010.01625.x\}, \cr 24 | #' year = \{2011\} \cr 25 | #' \} 26 | #' 27 | #' 28 | #' @inheritParams lexicon_afinn 29 | #' @return A tibble with 4,150 rows and 2 variables: 30 | #' \describe{ 31 | #' \item{word}{An English word} 32 | #' \item{sentiment}{Indicator for sentiment: "negative", "positive", 33 | #' "litigious", "uncertainty", "constraining", or "superfluous"} 34 | #' } 35 | #' 36 | #' @source \url{https://sraf.nd.edu/loughranmcdonald-master-dictionary/} 37 | #' @keywords datasets 38 | #' @family lexicon 39 | #' @importFrom fs file_exists dir_exists dir_create path 40 | #' @importFrom readr read_rds 41 | #' @importFrom utils menu 42 | #' @export 43 | #' @examples 44 | #' \dontrun{ 45 | #' lexicon_loughran() 46 | #' 47 | #' # Custom directory 48 | #' lexicon_loughran(dir = "data/") 49 | #' 50 | #' # Deleting dataset 51 | #' lexicon_loughran(delete = TRUE) 52 | #' 53 | #' # Returning filepath of data 54 | #' lexicon_loughran(return_path = TRUE) 55 | #' } 56 | lexicon_loughran <- function(dir = NULL, delete = FALSE, return_path = FALSE, 57 | clean = FALSE, manual_download = FALSE) { 58 | load_dataset( 59 | data_name = "loughran", name = "LoughranMcDonald.rds", dir = dir, 60 | delete = delete, return_path = return_path, clean = clean, 61 | manual_download = manual_download 62 | ) 63 | } 64 | 65 | #' @importFrom utils download.file 66 | download_loughran <- function(folder_path) { 67 | file_path <- path( 68 | folder_path, 69 | "LoughranMcDonald_MasterDictionary_2018 - LoughranMcDonald_MasterDictionary_2018.csv" 70 | ) 71 | if (file_exists(file_path)) { 72 | return(invisible()) 73 | } 74 | download.file( 75 | url = "https://drive.google.com/uc?id=12ECPJMxV2wSalXG8ykMmkpa1fq_ur0Rf&export=download", 76 | destfile = file_path 77 | ) 78 | } 79 | #' @importFrom readr read_csv cols_only col_character col_double 80 | process_loughran <- function(folder_path, name_path) { 81 | data <- read_csv(path(folder_path, "LoughranMcDonald_MasterDictionary_2018 - LoughranMcDonald_MasterDictionary_2018.csv"), 82 | col_types = cols_only( 83 | Word = col_character(), 84 | Negative = col_double(), 85 | Positive = col_double(), 86 | Uncertainty = col_double(), 87 | Litigious = col_double(), 88 | Constraining = col_double(), 89 | Superfluous = col_double() 90 | ) 91 | ) 92 | 93 | types <- c("Negative", "Positive", "Uncertainty", "Litigious", "Constraining", "Superfluous") 94 | 95 | out <- list() 96 | for (type in types) { 97 | out[[type]] <- tibble( 98 | word = 
tolower(as.character(data$Word[data[[type]] != 0])), 99 | sentiment = tolower(type) 100 | ) 101 | } 102 | 103 | write_rds(Reduce(rbind, out), name_path) 104 | } 105 | -------------------------------------------------------------------------------- /R/lexicon_nrc.R: -------------------------------------------------------------------------------- 1 | #' NRC word-emotion association lexicon 2 | #' 3 | #' General purpose English sentiment/emotion lexicon. This lexicon labels words 4 | #' with six possible sentiments or emotions: "negative", "positive", "anger", 5 | #' "anticipation", "disgust", "fear", "joy", "sadness", "surprise", or "trust". 6 | #' The annotations were manually done through Amazon's Mechanical Turk. 7 | #' 8 | #' License required for commercial use. Please contact Saif M. Mohammad 9 | #' (saif.mohammad@nrc-cnrc.gc.ca). 10 | #' 11 | #' Citation info: 12 | #' 13 | #' This dataset was published in Saif Mohammad and Peter Turney. (2013), 14 | #' ``Crowdsourcing a Word-Emotion Association Lexicon.'' Computational 15 | #' Intelligence, 29(3): 436-465. 16 | #' 17 | #' article\{mohammad13, \cr 18 | #' author = \{Mohammad, Saif M. and Turney, Peter D.\}, \cr 19 | #' title = \{CROWDSOURCING A WORD–EMOTION ASSOCIATION LEXICON\}, \cr 20 | #' journal = \{Computational Intelligence\}, \cr 21 | #' volume = \{29\}, \cr 22 | #' number = \{3\}, \cr 23 | #' pages = \{436-465\}, \cr 24 | #' doi = \{10.1111/j.1467-8640.2012.00460.x\}, \cr 25 | #' url = \{https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-8640.2012.00460.x\}, \cr 26 | #' eprint = \{https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1467-8640.2012.00460.x\}, \cr 27 | #' year = \{2013\} \cr 28 | #' \} 29 | #' 30 | #' 31 | #' 32 | #' @inheritParams lexicon_afinn 33 | #' @return A tibble with 13,901 rows and 2 variables: 34 | #' \describe{ 35 | #' \item{word}{An English word} 36 | #' \item{sentiment}{Indicator for sentiment or emotion: "negative", 37 | #' "positive", "anger", "anticipation", "disgust", "fear", "joy", "sadness", 38 | #' "surprise", or "trust"} 39 | #' } 40 | #' 41 | #' @source \url{http://saifmohammad.com/WebPages/lexicons.html} 42 | #' @keywords datasets 43 | #' @family lexicon 44 | #' @importFrom fs file_exists dir_exists dir_create path 45 | #' @importFrom readr read_rds 46 | #' @importFrom utils menu 47 | #' @export 48 | #' @examples 49 | #' \dontrun{ 50 | #' lexicon_nrc() 51 | #' 52 | #' # Custom directory 53 | #' lexicon_nrc(dir = "data/") 54 | #' 55 | #' # Deleting dataset 56 | #' lexicon_nrc(delete = TRUE) 57 | #' 58 | #' # Returning filepath of data 59 | #' lexicon_nrc(return_path = TRUE) 60 | #' } 61 | lexicon_nrc <- function(dir = NULL, delete = FALSE, return_path = FALSE, 62 | clean = FALSE, manual_download = FALSE) { 63 | load_dataset( 64 | data_name = "nrc", name = "NRCWordEmotion.rds", dir = dir, 65 | delete = delete, return_path = return_path, clean = clean, 66 | manual_download = manual_download 67 | ) 68 | } 69 | 70 | #' @importFrom utils download.file 71 | download_nrc <- function(folder_path) { 72 | file_path <- path( 73 | folder_path, 74 | "NRC-Emotion-Lexicon.zip" 75 | ) 76 | if (file_exists(file_path)) { 77 | return(invisible()) 78 | } 79 | download.file( 80 | url = "http://saifmohammad.com/WebDocs/Lexicons/NRC-Emotion-Lexicon.zip", 81 | destfile = file_path 82 | ) 83 | unzip(path(folder_path, "NRC-Emotion-Lexicon.zip"), 84 | exdir = folder_path 85 | ) 86 | } 87 | 88 | #' @importFrom readr read_tsv 89 | #' @importFrom utils unzip 90 | 91 | process_nrc <- function(folder_path, name_path) { 92 
| data <- read_tsv(path( 93 | folder_path, 94 | "NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt" 95 | ), 96 | col_names = FALSE, col_types = cols( 97 | X1 = col_character(), 98 | X2 = col_character(), 99 | X3 = col_double() 100 | ) 101 | ) 102 | 103 | data <- data[data$X3 == 1, ] 104 | data <- tibble( 105 | word = data$X1, 106 | sentiment = data$X2 107 | ) 108 | 109 | write_rds(data, name_path) 110 | } 111 | -------------------------------------------------------------------------------- /R/lexicon_nrc_eil.R: -------------------------------------------------------------------------------- 1 | #' NRC Emotion Intensity Lexicon (aka Affect Intensity Lexicon) v0.5 2 | #' 3 | #' General purpose English sentiment/emotion lexicon. The NRC Affect Intensity 4 | #' Lexicon is a list of English words and their associations with four basic 5 | #' emotions (anger, fear, sadness, joy). 6 | #' 7 | #' For a given word and emotion X, the scores range from 0 to 1. A score of 1 8 | #' means that the word conveys the highest amount of emotion X. A score of 0 9 | #' means that the word conveys the lowest amount of emotion X. 10 | #' 11 | #' License required for commercial use. Please contact Saif M. Mohammad 12 | #' (saif.mohammad@nrc-cnrc.gc.ca). 13 | #' 14 | #' Citation info: 15 | #' 16 | #' Details of the lexicon are in this paper. 17 | #' Word Affect Intensities. Saif M. Mohammad. In Proceedings of the 11th Edition 18 | #' of the Language Resources and Evaluation Conference (LREC-2018), May 2018, 19 | #' Miyazaki, Japan. 20 | #' 21 | #' inproceedings\{LREC18-AIL, \cr 22 | #' author = \{Mohammad, Saif M.\}, \cr 23 | #' title = \{Word Affect Intensities\}, \cr 24 | #' booktitle = \{Proceedings of the 11th Edition of the Language Resources and Evaluation Conference (LREC-2018)\}, \cr 25 | #' year = \{2018\}, \cr 26 | #' address=\{Miyazaki, Japan\} \cr 27 | #' \} \cr 28 | #' 29 | #' @inheritParams lexicon_afinn 30 | #' @return A tibble with 5.814 rows and 3 variables: 31 | #' \describe{ 32 | #' \item{term}{An English word} 33 | #' \item{score}{Value between 0 and 1} 34 | #' \item{AffectDimension}{Indicator for sentiment or emotion: ("anger", 35 | #' "fear", "sadness", "joy")} 36 | #' } 37 | #' 38 | #' @source \url{https://saifmohammad.com/WebPages/AffectIntensity.htm} 39 | #' @keywords datasets 40 | #' @family lexicon 41 | #' @importFrom fs file_exists dir_exists dir_create path 42 | #' @importFrom readr read_rds 43 | #' @importFrom utils menu 44 | #' @export 45 | #' @examples 46 | #' \dontrun{ 47 | #' lexicon_nrc_eil() 48 | #' 49 | #' # Custom directory 50 | #' lexicon_nrc_eil(dir = "data/") 51 | #' 52 | #' # Deleting dataset 53 | #' lexicon_nrc_eil(delete = TRUE) 54 | #' 55 | #' # Returning filepath of data 56 | #' lexicon_nrc_eil(return_path = TRUE) 57 | #' } 58 | lexicon_nrc_eil <- function(dir = NULL, delete = FALSE, return_path = FALSE, 59 | clean = FALSE, manual_download = FALSE) { 60 | load_dataset( 61 | data_name = "nrc_eil", name = "nrc_eil.rds", dir = dir, 62 | delete = delete, return_path = return_path, clean = clean, 63 | manual_download = manual_download 64 | ) 65 | } 66 | 67 | #' @importFrom utils download.file 68 | download_nrc_eil <- function(folder_path) { 69 | file_path <- path( 70 | folder_path, 71 | "NRC-AffectIntensity-Lexicon.txt" 72 | ) 73 | if (file_exists(file_path)) { 74 | return(invisible()) 75 | } 76 | download.file( 77 | url = "http://saifmohammad.com/WebDocs/NRC-AffectIntensity-Lexicon.txt", 78 | destfile = file_path 79 | ) 80 | } 81 | 82 | #' @importFrom readr 
read_tsv 83 | #' @importFrom utils unzip 84 | 85 | process_nrc_eil <- function(folder_path, name_path) { 86 | data <- read_tsv( 87 | file = path(folder_path, "NRC-AffectIntensity-Lexicon.txt"), 88 | skip = 36, 89 | col_types = cols( 90 | term = col_character(), 91 | score = col_double(), 92 | AffectDimension = col_character() 93 | ) 94 | ) 95 | write_rds(data, name_path) 96 | } 97 | -------------------------------------------------------------------------------- /R/lexicon_nrc_vad.R: -------------------------------------------------------------------------------- 1 | #' The NRC Valence, Arousal, and Dominance Lexicon 2 | #' 3 | #' The NRC Valence, Arousal, and Dominance (VAD) Lexicon includes a list of 4 | #' more than 20,000 English words and their valence, arousal, and dominance 5 | #' scores. For a given word and a dimension (V/A/D), the scores range from 0 6 | #' (lowest V/A/D) to 1 (highest V/A/D). The lexicon with its fine-grained real- 7 | #' valued scores was created by manual annotation using best--worst scaling. 8 | #' The lexicon is markedly larger than any of the existing VAD lexicons. We also 9 | #' show that the ratings obtained are substantially more reliable than those in 10 | #' existing lexicons. 11 | #' 12 | #' License required for commercial use. Please contact Saif M. Mohammad 13 | #' (saif.mohammad@nrc-cnrc.gc.ca). 14 | #' 15 | #' Citation info: 16 | #' 17 | #' Details of the NRC VAD Lexicon are available in this paper: 18 | #' 19 | #' Obtaining Reliable Human Ratings of Valence, Arousal, and Dominance for 20 | #' 20,000 English Words. Saif M. Mohammad. In Proceedings of the 56th Annual 21 | #' Meeting of the Association for Computational Linguistics, Melbourne, 22 | #' Australia, July 2018. 23 | #' 24 | #' inproceedings\{vad-acl2018, \cr 25 | #' title=\{Obtaining Reliable Human Ratings of Valence, Arousal, and Dominance for 20,000 English Words\}, \cr 26 | #' author=\{Mohammad, Saif M.\}, \cr 27 | #' booktitle=\{Proceedings of The Annual Conference of the Association for Computational Linguistics (ACL)\}, \cr 28 | #' year=\{2018\}, \cr 29 | #' address=\{Melbourne, Australia\} \cr 30 | #' \} 31 | #' 32 | #' @inheritParams lexicon_afinn 33 | #' @return A tibble with 20.007 rows and 4 variables: 34 | #' \describe{ 35 | #' \item{word}{An English word} 36 | #' \item{Valence}{valence score of the word} 37 | #' \item{Arousal}{arousal score of the word} 38 | #' \item{Dominance}{dominance score of the word} 39 | #' } 40 | #' 41 | #' @source \url{https://saifmohammad.com/WebPages/nrc-vad.html} 42 | #' @keywords datasets 43 | #' @family lexicon 44 | #' @importFrom fs file_exists dir_exists dir_create path 45 | #' @importFrom readr read_rds 46 | #' @importFrom utils menu 47 | #' @export 48 | #' @examples 49 | #' \dontrun{ 50 | #' lexicon_nrc_vad() 51 | #' 52 | #' # Custom directory 53 | #' lexicon_nrc_vad(dir = "data/") 54 | #' 55 | #' # Deleting dataset 56 | #' lexicon_nrc_vad(delete = TRUE) 57 | #' 58 | #' # Returning filepath of data 59 | #' lexicon_nrc_vad(return_path = TRUE) 60 | #' } 61 | lexicon_nrc_vad <- function(dir = NULL, delete = FALSE, return_path = FALSE, 62 | clean = FALSE, manual_download = FALSE) { 63 | load_dataset( 64 | data_name = "nrc_vad", name = "nrc_vad.rds", dir = dir, 65 | delete = delete, return_path = return_path, clean = clean, 66 | manual_download = manual_download 67 | ) 68 | } 69 | 70 | #' @importFrom utils download.file 71 | download_nrc_vad <- function(folder_path) { 72 | file_path <- path( 73 | folder_path, 74 | 
"NRC-VAD-Lexicon-Aug2018Release.zip" 75 | ) 76 | if (file_exists(file_path)) { 77 | return(invisible()) 78 | } 79 | download.file( 80 | url = "http://saifmohammad.com/WebDocs/VAD/NRC-VAD-Lexicon-Aug2018Release.zip", 81 | destfile = file_path 82 | ) 83 | unzip(path(folder_path, "NRC-VAD-Lexicon-Aug2018Release.zip"), 84 | exdir = folder_path 85 | ) 86 | } 87 | 88 | #' @importFrom readr read_tsv 89 | #' @importFrom utils unzip 90 | 91 | process_nrc_vad <- function(folder_path, name_path) { 92 | data <- read_tsv(path( 93 | folder_path, 94 | "NRC-VAD-Lexicon-Aug2018Release/NRC-VAD-Lexicon.txt" 95 | ), 96 | col_names = FALSE, 97 | show_col_types = FALSE) 98 | data <- stats::setNames(data, c("Word", "Valence", "Arousal", "Dominance")) 99 | 100 | write_rds(data, name_path) 101 | } 102 | -------------------------------------------------------------------------------- /R/load_dataset.R: -------------------------------------------------------------------------------- 1 | #' Internal Functions 2 | #' 3 | #' These are not to be used directly by the users. 4 | #' @export 5 | #' @importFrom fs dir_delete path file_delete 6 | #' @keywords internal 7 | load_dataset <- function(data_name, name, dir, delete, return_path, clean, 8 | clean_manual = NULL, manual_download) { 9 | dir <- ifelse(is.null(dir), rappdirs::user_cache_dir("textdata"), dir) 10 | 11 | name_path <- path(dir, data_name, name) 12 | folder_path <- path(dir, data_name) 13 | 14 | if (!manual_download) { 15 | if (return_path) { 16 | return(folder_path) 17 | } 18 | 19 | if (delete) { 20 | dir_delete(folder_path) 21 | return(invisible()) 22 | } 23 | 24 | if (file_exists(name_path)) { 25 | return(read_rds(name_path)) 26 | } 27 | 28 | if (printer(data_name) == 2) { 29 | return(invisible()) 30 | } 31 | 32 | if (!dir_exists(folder_path)) { 33 | dir_create(folder_path) 34 | } 35 | 36 | download_functions[[data_name]](folder_path) 37 | } 38 | 39 | process_functions[[data_name]](folder_path, name_path) 40 | 41 | if (clean) { 42 | if (!is.null(clean_manual)) { 43 | intermediate_files <- setdiff( 44 | dir_ls(folder_path), 45 | path(folder_path, clean_manual) 46 | ) 47 | } else { 48 | intermediate_files <- setdiff(dir_ls(folder_path), name_path) 49 | } 50 | file_delete(intermediate_files) 51 | } 52 | 53 | read_rds(name_path) 54 | } 55 | -------------------------------------------------------------------------------- /R/printer.R: -------------------------------------------------------------------------------- 1 | #' Internal Functions 2 | #' 3 | #' These are not to be used directly by the users. 
4 | #' @keywords internal 5 | #' @noRd 6 | printer <- function(name) { 7 | title <- cat( 8 | "Do you want to download:\n", 9 | "Name:", print_info[[name]][["name"]], "\n", 10 | "URL:", print_info[[name]][["url"]], "\n", 11 | "License:", print_info[[name]][["license"]], "\n", 12 | "Size:", print_info[[name]][["size"]], "\n", 13 | "Download mechanism:", print_info[[name]][["download_mech"]], "\n" 14 | ) 15 | 16 | if (!is.na(print_info[[name]][["citation"]])) { 17 | title <- cat( 18 | title, 19 | print_info[[name]][["citation"]], "\n" 20 | ) 21 | } 22 | 23 | menu(choices = c("Yes", "No"), title = title) 24 | } 25 | -------------------------------------------------------------------------------- /R/process_functions.R: -------------------------------------------------------------------------------- 1 | #' List of all process functions used in load_dataset 2 | #' 3 | #' @format Named list of all process functions 4 | #' @include download_functions.R 5 | #' 6 | #' @name process_functions 7 | #' @noRd 8 | NULL 9 | 10 | process_functions <- list( 11 | afinn = process_afinn, 12 | sentence_polarity = process_sentence_polarity, 13 | loughran = process_loughran, 14 | bing = process_bing, 15 | nrc = process_nrc, 16 | nrc_eil = process_nrc_eil, 17 | nrc_vad = process_nrc_vad, 18 | ag_news = process_ag_news, 19 | dbpedia = process_dbpedia, 20 | trec = process_trec, 21 | imdb = process_imdb, 22 | glove6b = process_glove6b, 23 | glove27b = process_glove27b, 24 | glove42b = process_glove42b, 25 | glove840b = process_glove840b 26 | ) 27 | -------------------------------------------------------------------------------- /R/textdata-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | ## usethis namespace: start 5 | ## usethis namespace: end 6 | NULL 7 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-" 12 | ) 13 | ``` 14 | 15 | # textdata 16 | 17 | 18 | [![R-CMD-check](https://github.com/EmilHvitfeldt/textdata/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/EmilHvitfeldt/textdata/actions/workflows/R-CMD-check.yaml) 19 | [![CRAN status](https://www.r-pkg.org/badges/version/textdata)](https://CRAN.R-project.org/package=textdata) 20 | [![Downloads](http://cranlogs.r-pkg.org/badges/textdata)](https://cran.r-project.org/package=textdata) 21 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3244433.svg)](https://doi.org/10.5281/zenodo.3244433) 22 | [![Codecov test coverage](https://codecov.io/gh/EmilHvitfeldt/textdata/branch/main/graph/badge.svg)](https://app.codecov.io/gh/EmilHvitfeldt/textdata?branch=main) 23 | [![Lifecycle: stable](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://lifecycle.r-lib.org/articles/stages.html) 24 | 25 | 26 | The goal of textdata is to provide easy access to text-related data sets without bundling them inside a package. Some text datasets are too large to store within an R package or are licensed in such a way that prevents them from being included in an OSS-licensed package. Instead, this package provides a framework to download, parse, and store the datasets on the disk and load them when needed. 
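As a quick illustration, the basic workflow looks roughly like this (a minimal sketch; every function shown is exported by textdata, and the first `lexicon_afinn()` call asks for confirmation interactively before downloading anything):

``` r
library(textdata)

# Browse every lexicon, dataset, and embedding that textdata can fetch
catalogue[, c("name", "type", "size")]

# First call: confirm the license terms, download, and cache the lexicon.
# Later calls simply read the cached copy from disk.
afinn <- lexicon_afinn()

# See which folders are in the cache and how much space they use
cache_info()
```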
27 | 28 | ## Installation 29 | 30 | You can install the not yet released version of textdata from [CRAN](https://CRAN.R-project.org) with: 31 | 32 | ``` r 33 | install.packages("textdata") 34 | ``` 35 | 36 | And the development version from [GitHub](https://github.com/) with: 37 | 38 | ``` r 39 | # install.packages("remotes") 40 | remotes::install_github("EmilHvitfeldt/textdata") 41 | ``` 42 | ## Example 43 | 44 | The first time you use one of the functions for accessing an included text dataset, such as `lexicon_afinn()` or `dataset_sentence_polarity()`, the function will prompt you to agree that you understand the dataset's license or terms of use and then download the dataset to your computer. 45 | 46 | ![](man/figures/textdata_demo.gif) 47 | 48 | After the first use, each time you use a function like `lexicon_afinn()`, the function will load the dataset from disk. 49 | 50 | ## Included text datasets 51 | 52 | As of today, the datasets included in textdata are: 53 | 54 | | Dataset | Function | 55 | | --------------------------------------------------------------- | ----------------------------- | 56 | | v1.0 sentence polarity dataset | `dataset_sentence_polarity()` | 57 | | AFINN-111 sentiment lexicon | `lexicon_afinn()` | 58 | | Hu and Liu's opinion lexicon | `lexicon_bing()` | 59 | | NRC word-emotion association lexicon | `lexicon_nrc()` | 60 | | NRC Emotion Intensity Lexicon | `lexicon_nrc_eil()` | 61 | | The NRC Valence, Arousal, and Dominance Lexicon | `lexicon_nrc_vad()` | 62 | | Loughran and McDonald's opinion lexicon for financial documents | `lexicon_loughran()` | 63 | | AG's News | `dataset_ag_news()` | 64 | | DBpedia ontology | `dataset_dbpedia()` | 65 | | Trec-6 and Trec-50 | `dataset_trec()` | 66 | | IMDb Large Movie Review Dataset | `dataset_imdb()` | 67 | | Stanford NLP GloVe pre-trained word vectors | `embedding_glove6b()` | 68 | | | `embedding_glove27b()` | 69 | | | `embedding_glove42b()` | 70 | | | `embedding_glove840b()` | 71 | 72 | Check out each function's documentation for detailed information (including citations) for the relevant dataset. 73 | 74 | ## Community Guidelines 75 | 76 | Note that this project is released with a 77 | [Contributor Code of Conduct](https://github.com/EmilHvitfeldt/textdata/blob/main/CODE_OF_CONDUCT.md). 78 | By contributing to this project, you agree to abide by its terms. 79 | Feedback, bug reports (and fixes!), and feature requests are welcome; file 80 | issues or seek support [here](https://github.com/EmilHvitfeldt/textdata/issues). 81 | For details on how to add a new dataset to this package, check out the vignette! 
82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # textdata 5 | 6 | 7 | 8 | [![R-CMD-check](https://github.com/EmilHvitfeldt/textdata/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/EmilHvitfeldt/textdata/actions/workflows/R-CMD-check.yaml) 9 | [![CRAN 10 | status](https://www.r-pkg.org/badges/version/textdata)](https://CRAN.R-project.org/package=textdata) 11 | [![Downloads](http://cranlogs.r-pkg.org/badges/textdata)](https://cran.r-project.org/package=textdata) 12 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3244433.svg)](https://doi.org/10.5281/zenodo.3244433) 13 | [![Codecov test 14 | coverage](https://codecov.io/gh/EmilHvitfeldt/textdata/branch/main/graph/badge.svg)](https://app.codecov.io/gh/EmilHvitfeldt/textdata?branch=main) 15 | [![Lifecycle: 16 | stable](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://lifecycle.r-lib.org/articles/stages.html) 17 | 18 | 19 | The goal of textdata is to provide easy access to text-related data 20 | sets without bundling them inside a package. Some text datasets 21 | are too large to store within an R package or are licensed in such a way 22 | that prevents them from being included in an OSS-licensed package. 23 | Instead, this package provides a framework to download, parse, and store 24 | the datasets on the disk and load them when needed. 25 | 26 | ## Installation 27 | 28 | You can install the released version of textdata from 29 | [CRAN](https://CRAN.R-project.org) with: 30 | 31 | ``` r 32 | install.packages("textdata") 33 | ``` 34 | 35 | And the development version from [GitHub](https://github.com/) with: 36 | 37 | ``` r 38 | # install.packages("remotes") 39 | remotes::install_github("EmilHvitfeldt/textdata") 40 | ``` 41 | 42 | ## Example 43 | 44 | The first time you use one of the functions for accessing an included 45 | text dataset, such as `lexicon_afinn()` or 46 | `dataset_sentence_polarity()`, the function will prompt you to agree 47 | that you understand the dataset’s license or terms of use and then 48 | download the dataset to your computer. 49 | 50 | ![](man/figures/textdata_demo.gif) 51 | 52 | After the first use, each time you use a function like 53 | `lexicon_afinn()`, the function will load the dataset from disk. 
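In code, the pattern looks like the following sketch. It uses `lexicon_afinn()`, but the same caching arguments are available on the other `lexicon_*()`, `dataset_*()`, and `embedding_*()` functions:

``` r
library(textdata)

# First call: prompts for agreement, downloads, and caches the lexicon.
# Subsequent calls read the cached copy from disk.
afinn <- lexicon_afinn()

# Store the data somewhere other than the default cache directory
afinn <- lexicon_afinn(dir = "data/")

# Locate the folder the data lives in (useful with manual_download = TRUE)
lexicon_afinn(return_path = TRUE)

# Delete the cached copy when you no longer need it
lexicon_afinn(delete = TRUE)
```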
54 | 55 | ## Included text datasets 56 | 57 | As of today, the datasets included in textdata are: 58 | 59 | | Dataset | Function | 60 | |-----------------------------------------------------------------|-------------------------------| 61 | | v1.0 sentence polarity dataset | `dataset_sentence_polarity()` | 62 | | AFINN-111 sentiment lexicon | `lexicon_afinn()` | 63 | | Hu and Liu’s opinion lexicon | `lexicon_bing()` | 64 | | NRC word-emotion association lexicon | `lexicon_nrc()` | 65 | | NRC Emotion Intensity Lexicon | `lexicon_nrc_eil()` | 66 | | The NRC Valence, Arousal, and Dominance Lexicon | `lexicon_nrc_vad()` | 67 | | Loughran and McDonald’s opinion lexicon for financial documents | `lexicon_loughran()` | 68 | | AG’s News | `dataset_ag_news()` | 69 | | DBpedia ontology | `dataset_dbpedia()` | 70 | | Trec-6 and Trec-50 | `dataset_trec()` | 71 | | IMDb Large Movie Review Dataset | `dataset_imdb()` | 72 | | Stanford NLP GloVe pre-trained word vectors | `embedding_glove6b()` | 73 | | | `embedding_glove27b()` | 74 | | | `embedding_glove42b()` | 75 | | | `embedding_glove840b()` | 76 | 77 | Check out each function’s documentation for detailed information 78 | (including citations) for the relevant dataset. 79 | 80 | ## Community Guidelines 81 | 82 | Note that this project is released with a [Contributor Code of 83 | Conduct](https://github.com/EmilHvitfeldt/textdata/blob/main/CODE_OF_CONDUCT.md). 84 | By contributing to this project, you agree to abide by its terms. 85 | Feedback, bug reports (and fixes!), and feature requests are welcome; 86 | file issues or seek support 87 | [here](https://github.com/EmilHvitfeldt/textdata/issues). For details on 88 | how to add a new dataset to this package, check out the vignette! 89 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | template: 2 | bootstrap: 5 3 | 4 | development: 5 | mode: auto 6 | 7 | reference: 8 | - title: Lexicons 9 | contents: 10 | - lexicon_afinn 11 | - lexicon_bing 12 | - lexicon_nrc 13 | - lexicon_nrc_eil 14 | - lexicon_nrc_vad 15 | - lexicon_loughran 16 | - title: Data Sets 17 | contents: 18 | - dataset_sentence_polarity 19 | - dataset_ag_news 20 | - dataset_dbpedia 21 | - dataset_trec 22 | - dataset_imdb 23 | - title: Embeddings 24 | contents: 25 | - embedding_glove6b 26 | - embedding_glove27b 27 | - embedding_glove42b 28 | - embedding_glove840b 29 | - title: Other 30 | contents: 31 | - catalogue 32 | - cache_info 33 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | patch: 10 | default: 11 | target: auto 12 | threshold: 1% 13 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Release Summary 2 | 3 | This is the 7th CRAN release of textdata. Fixes a bug that produces data with no column names. 4 | 5 | ## R CMD check results 6 | 7 | 0 errors | 0 warnings | 0 note 8 | 9 | ## Downstream dependencies 10 | 11 | I ran R CMD check on the 3 downstream dependencies and there were no problems related to textdata. 
12 | -------------------------------------------------------------------------------- /man/cache_info.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cache_info.R 3 | \name{cache_info} 4 | \alias{cache_info} 5 | \title{List folders and their sizes in cache} 6 | \usage{ 7 | cache_info(dir = NULL) 8 | } 9 | \arguments{ 10 | \item{dir}{Character, path to directory where data will be stored. If 11 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 12 | } 13 | \value{ 14 | A tibble with 2 variables: 15 | \describe{ 16 | \item{name}{Name of the folder} 17 | \item{size}{Size of the folder} 18 | } 19 | } 20 | \description{ 21 | This function will return a tibble with the name and sizes of all folder in 22 | specified directory. Will default to textdata's default cache. 23 | } 24 | \examples{ 25 | \dontrun{ 26 | cache_info() 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /man/catalogue.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/info.R 3 | \docType{data} 4 | \name{catalogue} 5 | \alias{catalogue} 6 | \title{Catalogue of all available data sources} 7 | \format{ 8 | An object of class \code{data.frame} with 15 rows and 8 columns. 9 | } 10 | \usage{ 11 | catalogue 12 | } 13 | \description{ 14 | Catalogue of all available data sources 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /man/dataset_ag_news.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataset_ag_news.R 3 | \name{dataset_ag_news} 4 | \alias{dataset_ag_news} 5 | \title{AG's News Topic Classification Dataset} 6 | \source{ 7 | \url{http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html} 8 | 9 | \url{https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz} 10 | } 11 | \usage{ 12 | dataset_ag_news( 13 | dir = NULL, 14 | split = c("train", "test"), 15 | delete = FALSE, 16 | return_path = FALSE, 17 | clean = FALSE, 18 | manual_download = FALSE 19 | ) 20 | } 21 | \arguments{ 22 | \item{dir}{Character, path to directory where data will be stored. If 23 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 24 | 25 | \item{split}{Character. Return training ("train") data or testing ("test") 26 | data. Defaults to "train".} 27 | 28 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 29 | 30 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 31 | 32 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 33 | greatly reduce the size. 
Defaults to FALSE.} 34 | 35 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 36 | downloaded the file and placed it in the folder designated by running 37 | this function with \code{return_path = TRUE}.} 38 | } 39 | \value{ 40 | A tibble with 120,000 or 7,600 rows for "train" and "test" 41 | respectively and 3 variables: 42 | \describe{ 43 | \item{class}{Character, denoting the news class} 44 | \item{title}{Character, title of article} 45 | \item{description}{Character, description of article} 46 | } 47 | } 48 | \description{ 49 | The AG's news topic classification dataset is constructed by choosing 4 50 | largest classes from the original corpus. Each class contains 30,000 training 51 | samples and 1,900 testing samples. The total number of training samples is 52 | 120,000 and testing 7,600. 53 | Version 3, Updated 09/09/2015 54 | } 55 | \details{ 56 | The classes in this dataset are 57 | 58 | \itemize{ 59 | \item World 60 | \item Sports 61 | \item Business 62 | \item Sci/Tech 63 | } 64 | } 65 | \examples{ 66 | \dontrun{ 67 | dataset_ag_news() 68 | 69 | # Custom directory 70 | dataset_ag_news(dir = "data/") 71 | 72 | # Deleting dataset 73 | dataset_ag_news(delete = TRUE) 74 | 75 | # Returning filepath of data 76 | dataset_ag_news(return_path = TRUE) 77 | 78 | # Access both training and testing dataset 79 | train <- dataset_ag_news(split = "train") 80 | test <- dataset_ag_news(split = "test") 81 | } 82 | 83 | } 84 | \seealso{ 85 | Other topic: 86 | \code{\link{dataset_dbpedia}()}, 87 | \code{\link{dataset_trec}()} 88 | } 89 | \concept{topic} 90 | \keyword{datasets} 91 | -------------------------------------------------------------------------------- /man/dataset_dbpedia.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataset_dbpedia.R 3 | \name{dataset_dbpedia} 4 | \alias{dataset_dbpedia} 5 | \title{DBpedia Ontology Dataset} 6 | \source{ 7 | \url{https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf} 8 | 9 | \url{https://www.dbpedia.org/} 10 | 11 | \url{https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz} 12 | } 13 | \usage{ 14 | dataset_dbpedia( 15 | dir = NULL, 16 | split = c("train", "test"), 17 | delete = FALSE, 18 | return_path = FALSE, 19 | clean = FALSE, 20 | manual_download = FALSE 21 | ) 22 | } 23 | \arguments{ 24 | \item{dir}{Character, path to directory where data will be stored. If 25 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 26 | 27 | \item{split}{Character. Return training ("train") data or testing ("test") 28 | data. Defaults to "train".} 29 | 30 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 31 | 32 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 33 | 34 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 35 | greatly reduce the size. 
Defaults to FALSE.} 36 | 37 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 38 | downloaded the file and placed it in the folder designated by running 39 | this function with \code{return_path = TRUE}.} 40 | } 41 | \value{ 42 | A tibble with 560,000 or 70,000 rows for "train" and "test" 43 | respectively and 3 variables: 44 | \describe{ 45 | \item{class}{Character, denoting the class} 46 | \item{title}{Character, title of article} 47 | \item{description}{Character, description of article} 48 | } 49 | } 50 | \description{ 51 | DBpedia ontology classification dataset. It contains 560,000 training 52 | samples and 70,000 testing samples across 14 nonoverlapping classes 53 | from DBpedia. 54 | } 55 | \details{ 56 | The classes are 57 | 58 | \itemize{ 59 | \item Company 60 | \item EducationalInstitution 61 | \item Artist 62 | \item Athlete 63 | \item OfficeHolder 64 | \item MeanOfTransportation 65 | \item Building 66 | \item NaturalPlace 67 | \item Village 68 | \item Animal 69 | \item Plant 70 | \item Album 71 | \item Film 72 | \item WrittenWork 73 | } 74 | } 75 | \examples{ 76 | \dontrun{ 77 | dataset_dbpedia() 78 | 79 | # Custom directory 80 | dataset_dbpedia(dir = "data/") 81 | 82 | # Deleting dataset 83 | dataset_dbpedia(delete = TRUE) 84 | 85 | # Returning filepath of data 86 | dataset_dbpedia(return_path = TRUE) 87 | 88 | # Access both training and testing dataset 89 | train <- dataset_dbpedia(split = "train") 90 | test <- dataset_dbpedia(split = "test") 91 | } 92 | 93 | } 94 | \seealso{ 95 | Other topic: 96 | \code{\link{dataset_ag_news}()}, 97 | \code{\link{dataset_trec}()} 98 | } 99 | \concept{topic} 100 | \keyword{datasets} 101 | -------------------------------------------------------------------------------- /man/dataset_imdb.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataset_imdb.R 3 | \name{dataset_imdb} 4 | \alias{dataset_imdb} 5 | \title{IMDB Large Movie Review Dataset} 6 | \source{ 7 | \url{http://ai.stanford.edu/~amaas/data/sentiment/} 8 | } 9 | \usage{ 10 | dataset_imdb( 11 | dir = NULL, 12 | split = c("train", "test"), 13 | delete = FALSE, 14 | return_path = FALSE, 15 | clean = FALSE, 16 | manual_download = FALSE 17 | ) 18 | } 19 | \arguments{ 20 | \item{dir}{Character, path to directory where data will be stored. If 21 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 22 | 23 | \item{split}{Character. Return training ("train") data or testing ("test") 24 | data. Defaults to "train".} 25 | 26 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 27 | 28 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 29 | 30 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 31 | greatly reduce the size. Defaults to FALSE.} 32 | 33 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 34 | downloaded the file and placed it in the folder designated by running 35 | this function with \code{return_path = TRUE}.} 36 | } 37 | \value{ 38 | A tibble with 25,000 rows and 2 variables: 39 | \describe{ 40 | \item{Sentiment}{Character, denoting the sentiment} 41 | \item{text}{Character, text of the review} 42 | } 43 | } 44 | \description{ 45 | The core dataset contains 50,000 reviews split evenly into 25k train and 46 | 25k test sets. The overall distribution of labels is balanced (25k pos and 47 | 25k neg). 
48 | } 49 | \details{ 50 | In the entire collection, no more than 30 reviews are allowed for any 51 | given movie because reviews for the same movie tend to have correlated 52 | ratings. Further, the train and test sets contain a disjoint set of 53 | movies, so no significant performance is obtained by memorizing 54 | movie-unique terms and their associated with observed labels. In the 55 | labeled train/test sets, a negative review has a score <= 4 out of 10, 56 | and a positive review has a score >= 7 out of 10. Thus reviews with 57 | more neutral ratings are not included in the train/test sets. In the 58 | unsupervised set, reviews of any rating are included and there are an 59 | even number of reviews > 5 and <= 5. 60 | 61 | When using this dataset, please cite the ACL 2011 paper 62 | 63 | InProceedings\{maas-EtAl:2011:ACL-HLT2011, \cr 64 | author = \{Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher\}, \cr 65 | title = \{Learning Word Vectors for Sentiment Analysis\}, \cr 66 | booktitle = \{Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies\}, \cr 67 | month = \{June\}, \cr 68 | year = \{2011\}, \cr 69 | address = \{Portland, Oregon, USA\}, \cr 70 | publisher = \{Association for Computational Linguistics\}, \cr 71 | pages = \{142--150\}, \cr 72 | url = \{http://www.aclweb.org/anthology/P11-1015\} 73 | \} 74 | } 75 | \examples{ 76 | \dontrun{ 77 | dataset_imdb() 78 | 79 | # Custom directory 80 | dataset_imdb(dir = "data/") 81 | 82 | # Deleting dataset 83 | dataset_imdb(delete = TRUE) 84 | 85 | # Returning filepath of data 86 | dataset_imdb(return_path = TRUE) 87 | 88 | # Access both training and testing dataset 89 | train <- dataset_imdb(split = "train") 90 | test <- dataset_imdb(split = "test") 91 | } 92 | 93 | } 94 | \concept{topic sentiment} 95 | \keyword{datasets} 96 | -------------------------------------------------------------------------------- /man/dataset_sentence_polarity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataset_sentence_polarity.R 3 | \name{dataset_sentence_polarity} 4 | \alias{dataset_sentence_polarity} 5 | \title{v1.0 sentence polarity dataset} 6 | \source{ 7 | \url{https://www.cs.cornell.edu/people/pabo/movie-review-data/} 8 | } 9 | \usage{ 10 | dataset_sentence_polarity( 11 | dir = NULL, 12 | delete = FALSE, 13 | return_path = FALSE, 14 | clean = FALSE, 15 | manual_download = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{dir}{Character, path to directory where data will be stored. If 20 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 21 | 22 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 23 | 24 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 25 | 26 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 27 | greatly reduce the size. 
Defaults to FALSE.} 28 | 29 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 30 | downloaded the file and placed it in the folder designated by running 31 | this function with \code{return_path = TRUE}.} 32 | } 33 | \value{ 34 | A tibble with 10,662 rows and 2 variables: 35 | \describe{ 36 | \item{text}{Sentences or snippets} 37 | \item{sentiment}{Indicator for sentiment, "neg" for negative and "pos" 38 | for positive} 39 | } 40 | } 41 | \description{ 42 | 5331 positive and 5331 negative processed sentences / snippets. 43 | Introduced in Pang/Lee ACL 2005. Released July 2005. 44 | } 45 | \details{ 46 | Citation info: 47 | 48 | This data was first used in Bo Pang and Lillian Lee, 49 | ``Seeing stars: Exploiting class relationships for sentiment categorization 50 | with respect to rating scales.'', Proceedings of the ACL, 2005. 51 | 52 | InProceedings\{pang05, \cr 53 | author = \{Bo Pang and Lillian Lee\}, \cr 54 | title = \{Seeing stars: Exploiting class relationships for sentiment \cr 55 | categorization with respect to rating scales\}, \cr 56 | booktitle = \{Proceedings of the ACL\}, \cr 57 | year = 2005 \cr 58 | \} 59 | } 60 | \examples{ 61 | \dontrun{ 62 | dataset_sentence_polarity() 63 | 64 | # Custom directory 65 | dataset_sentence_polarity(dir = "data/") 66 | 67 | # Deleting dataset 68 | dataset_sentence_polarity(delete = TRUE) 69 | 70 | # Returning filepath of data 71 | dataset_sentence_polarity(return_path = TRUE) 72 | } 73 | 74 | } 75 | \concept{sentiment} 76 | \keyword{datasets} 77 | -------------------------------------------------------------------------------- /man/dataset_trec.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataset_trec.R 3 | \name{dataset_trec} 4 | \alias{dataset_trec} 5 | \title{TREC dataset} 6 | \source{ 7 | \url{https://cogcomp.seas.upenn.edu/Data/QA/QC/} 8 | 9 | \url{https://trec.nist.gov/data/qa.html} 10 | } 11 | \usage{ 12 | dataset_trec( 13 | dir = NULL, 14 | split = c("train", "test"), 15 | version = c("6", "50"), 16 | delete = FALSE, 17 | return_path = FALSE, 18 | clean = FALSE, 19 | manual_download = FALSE 20 | ) 21 | } 22 | \arguments{ 23 | \item{dir}{Character, path to directory where data will be stored. If 24 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 25 | 26 | \item{split}{Character. Return training ("train") data or testing ("test") 27 | data. Defaults to "train".} 28 | 29 | \item{version}{Character. Version 6("6") or version 50("50"). Defaults to 30 | "6".} 31 | 32 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 33 | 34 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 35 | 36 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 37 | greatly reduce the size. Defaults to FALSE.} 38 | 39 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 40 | downloaded the file and placed it in the folder designated by running 41 | this function with \code{return_path = TRUE}.} 42 | } 43 | \value{ 44 | A tibble with 5,452 or 500 rows for "train" and "test" 45 | respectively and 2 variables: 46 | \describe{ 47 | \item{class}{Character, denoting the class} 48 | \item{text}{Character, question text} 49 | } 50 | } 51 | \description{ 52 | The TREC dataset is dataset for question classification consisting of 53 | open-domain, fact-based questions divided into broad semantic categories. 
54 | It has both a six-class (TREC-6) and a fifty-class (TREC-50) version. Both 55 | have 5,452 training examples and 500 test examples, but TREC-50 has 56 | finer-grained labels. Models are evaluated based on accuracy. 57 | } 58 | \details{ 59 | The classes in TREC-6 are 60 | 61 | \itemize{ 62 | \item ABBR - Abbreviation 63 | \item DESC - Description and abstract concepts 64 | \item ENTY - Entities 65 | \item HUM - Human beings 66 | \item LOC - Locations 67 | \item NYM - Numeric values 68 | } 69 | 70 | the classes in TREC-50 can be found here 71 | \url{https://cogcomp.seas.upenn.edu/Data/QA/QC/definition.html}. 72 | } 73 | \examples{ 74 | \dontrun{ 75 | dataset_trec() 76 | 77 | # Custom directory 78 | dataset_trec(dir = "data/") 79 | 80 | # Deleting dataset 81 | dataset_trec(delete = TRUE) 82 | 83 | # Returning filepath of data 84 | dataset_trec(return_path = TRUE) 85 | 86 | # Access both training and testing dataset 87 | train_6 <- dataset_trec(split = "train") 88 | test_6 <- dataset_trec(split = "test") 89 | 90 | train_50 <- dataset_trec(split = "train", version = "50") 91 | test_50 <- dataset_trec(split = "test", version = "50") 92 | } 93 | 94 | } 95 | \seealso{ 96 | Other topic: 97 | \code{\link{dataset_ag_news}()}, 98 | \code{\link{dataset_dbpedia}()} 99 | } 100 | \concept{topic} 101 | \keyword{datasets} 102 | -------------------------------------------------------------------------------- /man/embedding_glove.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/embedding_glove.R 3 | \name{embedding_glove} 4 | \alias{embedding_glove} 5 | \alias{embedding_glove6b} 6 | \alias{embedding_glove27b} 7 | \alias{embedding_glove42b} 8 | \alias{embedding_glove840b} 9 | \title{Global Vectors for Word Representation} 10 | \source{ 11 | \url{https://nlp.stanford.edu/projects/glove/} 12 | } 13 | \usage{ 14 | embedding_glove6b( 15 | dir = NULL, 16 | dimensions = c(50, 100, 200, 300), 17 | delete = FALSE, 18 | return_path = FALSE, 19 | clean = FALSE, 20 | manual_download = FALSE 21 | ) 22 | 23 | embedding_glove27b( 24 | dir = NULL, 25 | dimensions = c(25, 50, 100, 200), 26 | delete = FALSE, 27 | return_path = FALSE, 28 | clean = FALSE, 29 | manual_download = FALSE 30 | ) 31 | 32 | embedding_glove42b( 33 | dir = NULL, 34 | delete = FALSE, 35 | return_path = FALSE, 36 | clean = FALSE, 37 | manual_download = FALSE 38 | ) 39 | 40 | embedding_glove840b( 41 | dir = NULL, 42 | delete = FALSE, 43 | return_path = FALSE, 44 | clean = FALSE, 45 | manual_download = FALSE 46 | ) 47 | } 48 | \arguments{ 49 | \item{dir}{Character, path to directory where data will be stored. If 50 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 51 | 52 | \item{dimensions}{A number indicating the number of vectors to include. One 53 | of 50, 100, 200, or 300 for glove6b, or one of 25, 50, 100, or 200 for 54 | glove27b.} 55 | 56 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 57 | 58 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 59 | 60 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 61 | greatly reduce the size. 
Defaults to FALSE.} 62 | 63 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 64 | downloaded the file and placed it in the folder designated by running 65 | this function with \code{return_path = TRUE}.} 66 | } 67 | \value{ 68 | A tibble with 400k, 1.9m, 2.2m, or 1.2m rows (one row for each unique 69 | token in the vocabulary) and the following variables: 70 | \describe{ 71 | \item{token}{An individual token (usually a word)} 72 | \item{d1, d2, etc}{The embeddings for that token.} 73 | } 74 | } 75 | \description{ 76 | The GloVe pre-trained word vectors provide word embeddings created using 77 | varying numbers of tokens. 78 | } 79 | \details{ 80 | Citation info: 81 | 82 | InProceedings\{pennington2014glove, \cr 83 | author = \{Jeffrey Pennington and Richard Socher and Christopher D. \cr 84 | Manning\}, \cr 85 | title = \{GloVe: Global Vectors for Word Representation\}, \cr 86 | booktitle = \{Empirical Methods in Natural Language Processing (EMNLP)\}, \cr 87 | year = 2014 \cr 88 | pages = \{1532-1543\} \cr 89 | url = \{http://www.aclweb.org/anthology/D14-1162\} \cr 90 | \} 91 | } 92 | \examples{ 93 | \dontrun{ 94 | embedding_glove6b(dimensions = 50) 95 | 96 | # Custom directory 97 | embedding_glove42b(dir = "data/") 98 | 99 | # Deleting dataset 100 | embedding_glove6b(delete = TRUE, dimensions = 300) 101 | 102 | # Returning filepath of data 103 | embedding_glove840b(return_path = TRUE) 104 | } 105 | } 106 | \references{ 107 | Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 108 | 2014. GloVe: Global Vectors for Word Representation. 109 | } 110 | \concept{embeddings} 111 | \keyword{datasets} 112 | -------------------------------------------------------------------------------- /man/figures/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/man/figures/.DS_Store -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/man/figures/logo.png -------------------------------------------------------------------------------- /man/figures/screen-shot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/man/figures/screen-shot.png -------------------------------------------------------------------------------- /man/figures/textdata_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/man/figures/textdata_demo.gif -------------------------------------------------------------------------------- /man/lexicon_afinn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lexicon_afinn.R 3 | \name{lexicon_afinn} 4 | \alias{lexicon_afinn} 5 | \title{AFINN-111 dataset} 6 | \usage{ 7 | lexicon_afinn( 8 | dir = NULL, 9 | delete = FALSE, 10 | return_path = FALSE, 11 | clean = FALSE, 12 | manual_download = FALSE 13 | ) 14 | } 15 | \arguments{ 16 | \item{dir}{Character, path to directory where data will be stored. 
If 17 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 18 | 19 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 20 | 21 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 22 | 23 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 24 | greatly reduce the size. Defaults to FALSE.} 25 | 26 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 27 | downloaded the file and placed it in the folder designated by running 28 | this function with \code{return_path = TRUE}.} 29 | } 30 | \value{ 31 | A tibble with 2,477 rows and 2 variables: 32 | \describe{ 33 | \item{word}{An English word} 34 | \item{score}{Indicator for sentiment: integer between -5 and +5} 35 | } 36 | } 37 | \description{ 38 | AFINN is a lexicon of English words rated for valence with an integer 39 | between minus five (negative) and plus five (positive). The words have 40 | been manually labeled by Finn Årup Nielsen in 2009-2011. 41 | } 42 | \details{ 43 | This dataset is the newest version with 2,477 words and phrases. 44 | 45 | Citation info: 46 | 47 | This dataset was published in Finn Årup Nielsen (2011), 48 | ``A new Evaluation of a word list for sentiment analysis in 49 | microblogs'', Proceedings of the ESWC2011 Workshop on 50 | 'Making Sense of Microposts': Big things come in small packages (2011) 93-98. 51 | 52 | article\{nielsen11, \cr 53 | author = \{Finn Årup Nielsen\}, \cr 54 | title = \{A new Evaluation of a word list for sentiment analysis in microblogs\}, \cr 55 | journal = \{CoRR\}, \cr 56 | volume = \{abs/1103.2903\}, \cr 57 | year = \{2011\}, \cr 58 | url = \{http://arxiv.org/abs/1103.2903\}, \cr 59 | archivePrefix = \{arXiv\}, \cr 60 | eprint = \{1103.2903\}, \cr 61 | biburl = \{https://dblp.org/rec/bib/journals/corr/abs-1103-2903\}, \cr 62 | bibsource = \{dblp computer science bibliography, https://dblp.org\} \cr 63 | \} 64 | } 65 | \examples{ 66 | \dontrun{ 67 | lexicon_afinn() 68 | 69 | # Custom directory 70 | lexicon_afinn(dir = "data/") 71 | 72 | # Deleting dataset 73 | lexicon_afinn(delete = TRUE) 74 | 75 | # Returning filepath of data 76 | lexicon_afinn(return_path = TRUE) 77 | } 78 | } 79 | \seealso{ 80 | Other lexicon: 81 | \code{\link{lexicon_bing}()}, 82 | \code{\link{lexicon_loughran}()}, 83 | \code{\link{lexicon_nrc}()}, 84 | \code{\link{lexicon_nrc_eil}()}, 85 | \code{\link{lexicon_nrc_vad}()} 86 | } 87 | \concept{lexicon} 88 | \keyword{datasets} 89 | -------------------------------------------------------------------------------- /man/lexicon_bing.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lexicon_bing.R 3 | \name{lexicon_bing} 4 | \alias{lexicon_bing} 5 | \title{Bing sentiment lexicon} 6 | \source{ 7 | \url{https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html} 8 | } 9 | \usage{ 10 | lexicon_bing( 11 | dir = NULL, 12 | delete = FALSE, 13 | return_path = FALSE, 14 | clean = FALSE, 15 | manual_download = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{dir}{Character, path to directory where data will be stored. If 20 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 21 | 22 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 23 | 24 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 25 | 26 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files.
This can 27 | greatly reduce the size. Defaults to FALSE.} 28 | 29 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 30 | downloaded the file and placed it in the folder designated by running 31 | this function with \code{return_path = TRUE}.} 32 | } 33 | \value{ 34 | A tibble with 6,787 rows and 2 variables: 35 | \describe{ 36 | \item{word}{An English word} 37 | \item{sentiment}{Indicator for sentiment: "negative" or "positive"} 38 | } 39 | } 40 | \description{ 41 | General purpose English sentiment lexicon that categorizes words in a 42 | binary fashion, either positive or negative 43 | } 44 | \details{ 45 | Citation info: 46 | 47 | This dataset was first published in Minqing Hu and Bing Liu, ``Mining and 48 | summarizing customer reviews.'', Proceedings of the ACM SIGKDD International 49 | Conference on Knowledge Discovery & Data Mining (KDD-2004), 2004. 50 | 51 | inproceedings\{Hu04, \cr 52 | author = \{Hu, Minqing and Liu, Bing\}, \cr 53 | title = \{Mining and Summarizing Customer Reviews\}, \cr 54 | booktitle = \{Proceedings of the Tenth ACM SIGKDD International Conference 55 | on Knowledge Discovery and Data Mining\}, \cr 56 | series = \{KDD '04\}, \cr 57 | year = \{2004\}, \cr 58 | isbn = \{1-58113-888-1\}, \cr 59 | location = \{Seattle, WA, USA\}, \cr 60 | pages = \{168--177\}, \cr 61 | numpages = \{10\}, \cr 62 | url = \{http://doi.acm.org/10.1145/1014052.1014073\}, \cr 63 | doi = \{10.1145/1014052.1014073\}, \cr 64 | acmid = \{1014073\}, \cr 65 | publisher = \{ACM\}, \cr 66 | address = \{New York, NY, USA\}, \cr 67 | keywords = \{reviews, sentiment classification, summarization, text mining\}, \cr 68 | \} 69 | } 70 | \examples{ 71 | \dontrun{ 72 | lexicon_bing() 73 | 74 | # Custom directory 75 | lexicon_bing(dir = "data/") 76 | 77 | # Deleting dataset 78 | lexicon_bing(delete = TRUE) 79 | 80 | # Returning filepath of data 81 | lexicon_bing(return_path = TRUE) 82 | } 83 | } 84 | \seealso{ 85 | Other lexicon: 86 | \code{\link{lexicon_afinn}()}, 87 | \code{\link{lexicon_loughran}()}, 88 | \code{\link{lexicon_nrc}()}, 89 | \code{\link{lexicon_nrc_eil}()}, 90 | \code{\link{lexicon_nrc_vad}()} 91 | } 92 | \concept{lexicon} 93 | \keyword{datasets} 94 | -------------------------------------------------------------------------------- /man/lexicon_loughran.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lexicon_loughran.R 3 | \name{lexicon_loughran} 4 | \alias{lexicon_loughran} 5 | \title{Loughran-McDonald sentiment lexicon} 6 | \source{ 7 | \url{https://sraf.nd.edu/loughranmcdonald-master-dictionary/} 8 | } 9 | \usage{ 10 | lexicon_loughran( 11 | dir = NULL, 12 | delete = FALSE, 13 | return_path = FALSE, 14 | clean = FALSE, 15 | manual_download = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{dir}{Character, path to directory where data will be stored. If 20 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 21 | 22 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 23 | 24 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 25 | 26 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 27 | greatly reduce the size. 
Defaults to FALSE.} 28 | 29 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 30 | downloaded the file and placed it in the folder designated by running 31 | this function with \code{return_path = TRUE}.} 32 | } 33 | \value{ 34 | A tibble with 4,150 rows and 2 variables: 35 | \describe{ 36 | \item{word}{An English word} 37 | \item{sentiment}{Indicator for sentiment: "negative", "positive", 38 | "litigious", "uncertainty", "constraining", or "superfluous"} 39 | } 40 | } 41 | \description{ 42 | English sentiment lexicon created for use with financial documents. This 43 | lexicon labels words with six possible sentiments important in financial 44 | contexts: "negative", "positive", "litigious", "uncertainty", "constraining", 45 | or "superfluous". 46 | } 47 | \details{ 48 | Citation info: 49 | 50 | This dataset was published in Loughran, T. and McDonald, B. (2011), 51 | ``When Is a Liability Not a Liability? Textual Analysis, Dictionaries, and 52 | 10-Ks.'' The Journal of Finance, 66: 35-65. 53 | 54 | article\{loughran11, \cr 55 | author = \{Loughran, Tim and McDonald, Bill\}, \cr 56 | title = \{When Is a Liability Not a Liability? Textual Analysis, Dictionaries, and 10-Ks\}, \cr 57 | journal = \{The Journal of Finance\}, \cr 58 | volume = \{66\}, \cr 59 | number = \{1\}, \cr 60 | pages = \{35-65\}, \cr 61 | doi = \{10.1111/j.1540-6261.2010.01625.x\}, \cr 62 | url = \{https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1540-6261.2010.01625.x\}, \cr 63 | eprint = \{https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1540-6261.2010.01625.x\}, \cr 64 | year = \{2011\} \cr 65 | \} 66 | } 67 | \examples{ 68 | \dontrun{ 69 | lexicon_loughran() 70 | 71 | # Custom directory 72 | lexicon_loughran(dir = "data/") 73 | 74 | # Deleting dataset 75 | lexicon_loughran(delete = TRUE) 76 | 77 | # Returning filepath of data 78 | lexicon_loughran(return_path = TRUE) 79 | } 80 | } 81 | \seealso{ 82 | Other lexicon: 83 | \code{\link{lexicon_afinn}()}, 84 | \code{\link{lexicon_bing}()}, 85 | \code{\link{lexicon_nrc}()}, 86 | \code{\link{lexicon_nrc_eil}()}, 87 | \code{\link{lexicon_nrc_vad}()} 88 | } 89 | \concept{lexicon} 90 | \keyword{datasets} 91 | -------------------------------------------------------------------------------- /man/lexicon_nrc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lexicon_nrc.R 3 | \name{lexicon_nrc} 4 | \alias{lexicon_nrc} 5 | \title{NRC word-emotion association lexicon} 6 | \source{ 7 | \url{http://saifmohammad.com/WebPages/lexicons.html} 8 | } 9 | \usage{ 10 | lexicon_nrc( 11 | dir = NULL, 12 | delete = FALSE, 13 | return_path = FALSE, 14 | clean = FALSE, 15 | manual_download = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{dir}{Character, path to directory where data will be stored. If 20 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 21 | 22 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 23 | 24 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 25 | 26 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 27 | greatly reduce the size. 
Defaults to FALSE.} 28 | 29 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 30 | downloaded the file and placed it in the folder designated by running 31 | this function with \code{return_path = TRUE}.} 32 | } 33 | \value{ 34 | A tibble with 13,901 rows and 2 variables: 35 | \describe{ 36 | \item{word}{An English word} 37 | \item{sentiment}{Indicator for sentiment or emotion: "negative", 38 | "positive", "anger", "anticipation", "disgust", "fear", "joy", "sadness", 39 | "surprise", or "trust"} 40 | } 41 | } 42 | \description{ 43 | General purpose English sentiment/emotion lexicon. This lexicon labels words 44 | with ten possible sentiments or emotions: "negative", "positive", "anger", 45 | "anticipation", "disgust", "fear", "joy", "sadness", "surprise", or "trust". 46 | The annotations were manually done through Amazon's Mechanical Turk. 47 | } 48 | \details{ 49 | License required for commercial use. Please contact Saif M. Mohammad 50 | (saif.mohammad@nrc-cnrc.gc.ca). 51 | 52 | Citation info: 53 | 54 | This dataset was published in Saif Mohammad and Peter Turney (2013), 55 | ``Crowdsourcing a Word-Emotion Association Lexicon.'' Computational 56 | Intelligence, 29(3): 436-465. 57 | 58 | article\{mohammad13, \cr 59 | author = \{Mohammad, Saif M. and Turney, Peter D.\}, \cr 60 | title = \{CROWDSOURCING A WORD–EMOTION ASSOCIATION LEXICON\}, \cr 61 | journal = \{Computational Intelligence\}, \cr 62 | volume = \{29\}, \cr 63 | number = \{3\}, \cr 64 | pages = \{436-465\}, \cr 65 | doi = \{10.1111/j.1467-8640.2012.00460.x\}, \cr 66 | url = \{https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-8640.2012.00460.x\}, \cr 67 | eprint = \{https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1467-8640.2012.00460.x\}, \cr 68 | year = \{2013\} \cr 69 | \} 70 | } 71 | \examples{ 72 | \dontrun{ 73 | lexicon_nrc() 74 | 75 | # Custom directory 76 | lexicon_nrc(dir = "data/") 77 | 78 | # Deleting dataset 79 | lexicon_nrc(delete = TRUE) 80 | 81 | # Returning filepath of data 82 | lexicon_nrc(return_path = TRUE) 83 | } 84 | } 85 | \seealso{ 86 | Other lexicon: 87 | \code{\link{lexicon_afinn}()}, 88 | \code{\link{lexicon_bing}()}, 89 | \code{\link{lexicon_loughran}()}, 90 | \code{\link{lexicon_nrc_eil}()}, 91 | \code{\link{lexicon_nrc_vad}()} 92 | } 93 | \concept{lexicon} 94 | \keyword{datasets} 95 | -------------------------------------------------------------------------------- /man/lexicon_nrc_eil.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lexicon_nrc_eil.R 3 | \name{lexicon_nrc_eil} 4 | \alias{lexicon_nrc_eil} 5 | \title{NRC Emotion Intensity Lexicon (aka Affect Intensity Lexicon) v0.5} 6 | \source{ 7 | \url{https://saifmohammad.com/WebPages/AffectIntensity.htm} 8 | } 9 | \usage{ 10 | lexicon_nrc_eil( 11 | dir = NULL, 12 | delete = FALSE, 13 | return_path = FALSE, 14 | clean = FALSE, 15 | manual_download = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{dir}{Character, path to directory where data will be stored. If 20 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 21 | 22 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 23 | 24 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 25 | 26 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 27 | greatly reduce the size.
Defaults to FALSE.} 28 | 29 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 30 | downloaded the file and placed it in the folder designated by running 31 | this function with \code{return_path = TRUE}.} 32 | } 33 | \value{ 34 | A tibble with 5,814 rows and 3 variables: 35 | \describe{ 36 | \item{term}{An English word} 37 | \item{score}{Value between 0 and 1} 38 | \item{AffectDimension}{Indicator for sentiment or emotion: ("anger", 39 | "fear", "sadness", "joy")} 40 | } 41 | } 42 | \description{ 43 | General purpose English sentiment/emotion lexicon. The NRC Affect Intensity 44 | Lexicon is a list of English words and their associations with four basic 45 | emotions (anger, fear, sadness, joy). 46 | } 47 | \details{ 48 | For a given word and emotion X, the scores range from 0 to 1. A score of 1 49 | means that the word conveys the highest amount of emotion X. A score of 0 50 | means that the word conveys the lowest amount of emotion X. 51 | 52 | License required for commercial use. Please contact Saif M. Mohammad 53 | (saif.mohammad@nrc-cnrc.gc.ca). 54 | 55 | Citation info: 56 | 57 | Details of the lexicon are in this paper: 58 | Word Affect Intensities. Saif M. Mohammad. In Proceedings of the 11th Edition 59 | of the Language Resources and Evaluation Conference (LREC-2018), May 2018, 60 | Miyazaki, Japan. 61 | 62 | inproceedings\{LREC18-AIL, \cr 63 | author = \{Mohammad, Saif M.\}, \cr 64 | title = \{Word Affect Intensities\}, \cr 65 | booktitle = \{Proceedings of the 11th Edition of the Language Resources and Evaluation Conference (LREC-2018)\}, \cr 66 | year = \{2018\}, \cr 67 | address=\{Miyazaki, Japan\} \cr 68 | \} \cr 69 | } 70 | \examples{ 71 | \dontrun{ 72 | lexicon_nrc_eil() 73 | 74 | # Custom directory 75 | lexicon_nrc_eil(dir = "data/") 76 | 77 | # Deleting dataset 78 | lexicon_nrc_eil(delete = TRUE) 79 | 80 | # Returning filepath of data 81 | lexicon_nrc_eil(return_path = TRUE) 82 | } 83 | } 84 | \seealso{ 85 | Other lexicon: 86 | \code{\link{lexicon_afinn}()}, 87 | \code{\link{lexicon_bing}()}, 88 | \code{\link{lexicon_loughran}()}, 89 | \code{\link{lexicon_nrc}()}, 90 | \code{\link{lexicon_nrc_vad}()} 91 | } 92 | \concept{lexicon} 93 | \keyword{datasets} 94 | -------------------------------------------------------------------------------- /man/lexicon_nrc_vad.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lexicon_nrc_vad.R 3 | \name{lexicon_nrc_vad} 4 | \alias{lexicon_nrc_vad} 5 | \title{The NRC Valence, Arousal, and Dominance Lexicon} 6 | \source{ 7 | \url{https://saifmohammad.com/WebPages/nrc-vad.html} 8 | } 9 | \usage{ 10 | lexicon_nrc_vad( 11 | dir = NULL, 12 | delete = FALSE, 13 | return_path = FALSE, 14 | clean = FALSE, 15 | manual_download = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{dir}{Character, path to directory where data will be stored. If 20 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 21 | 22 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 23 | 24 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 25 | 26 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 27 | greatly reduce the size.
Defaults to FALSE.} 28 | 29 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 30 | downloaded the file and placed it in the folder designated by running 31 | this function with \code{return_path = TRUE}.} 32 | } 33 | \value{ 34 | A tibble with 20,007 rows and 4 variables: 35 | \describe{ 36 | \item{word}{An English word} 37 | \item{Valence}{valence score of the word} 38 | \item{Arousal}{arousal score of the word} 39 | \item{Dominance}{dominance score of the word} 40 | } 41 | } 42 | \description{ 43 | The NRC Valence, Arousal, and Dominance (VAD) Lexicon includes a list of 44 | more than 20,000 English words and their valence, arousal, and dominance 45 | scores. For a given word and a dimension (V/A/D), the scores range from 0 46 | (lowest V/A/D) to 1 (highest V/A/D). The lexicon with its fine-grained 47 | real-valued scores was created by manual annotation using best--worst scaling. 48 | The lexicon is markedly larger than any of the existing VAD lexicons. We also 49 | show that the ratings obtained are substantially more reliable than those in 50 | existing lexicons. 51 | } 52 | \details{ 53 | License required for commercial use. Please contact Saif M. Mohammad 54 | (saif.mohammad@nrc-cnrc.gc.ca). 55 | 56 | Citation info: 57 | 58 | Details of the NRC VAD Lexicon are available in this paper: 59 | 60 | Obtaining Reliable Human Ratings of Valence, Arousal, and Dominance for 61 | 20,000 English Words. Saif M. Mohammad. In Proceedings of the 56th Annual 62 | Meeting of the Association for Computational Linguistics, Melbourne, 63 | Australia, July 2018. 64 | 65 | inproceedings\{vad-acl2018, \cr 66 | title=\{Obtaining Reliable Human Ratings of Valence, Arousal, and Dominance for 20,000 English Words\}, \cr 67 | author=\{Mohammad, Saif M.\}, \cr 68 | booktitle=\{Proceedings of The Annual Conference of the Association for Computational Linguistics (ACL)\}, \cr 69 | year=\{2018\}, \cr 70 | address=\{Melbourne, Australia\} \cr 71 | \} 72 | } 73 | \examples{ 74 | \dontrun{ 75 | lexicon_nrc_vad() 76 | 77 | # Custom directory 78 | lexicon_nrc_vad(dir = "data/") 79 | 80 | # Deleting dataset 81 | lexicon_nrc_vad(delete = TRUE) 82 | 83 | # Returning filepath of data 84 | lexicon_nrc_vad(return_path = TRUE) 85 | } 86 | } 87 | \seealso{ 88 | Other lexicon: 89 | \code{\link{lexicon_afinn}()}, 90 | \code{\link{lexicon_bing}()}, 91 | \code{\link{lexicon_loughran}()}, 92 | \code{\link{lexicon_nrc}()}, 93 | \code{\link{lexicon_nrc_eil}()} 94 | } 95 | \concept{lexicon} 96 | \keyword{datasets} 97 | -------------------------------------------------------------------------------- /man/load_dataset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/load_dataset.R 3 | \name{load_dataset} 4 | \alias{load_dataset} 5 | \title{Internal Functions} 6 | \usage{ 7 | load_dataset( 8 | data_name, 9 | name, 10 | dir, 11 | delete, 12 | return_path, 13 | clean, 14 | clean_manual = NULL, 15 | manual_download 16 | ) 17 | } 18 | \description{ 19 | These are not to be used directly by the users.
20 | } 21 | \keyword{internal} 22 | -------------------------------------------------------------------------------- /man/textdata-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textdata-package.R 3 | \docType{package} 4 | \name{textdata-package} 5 | \alias{textdata} 6 | \alias{textdata-package} 7 | \title{textdata: Download and Load Various Text Datasets} 8 | \description{ 9 | \if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} 10 | 11 | Provides a framework to download, parse, and store text datasets on the disk and load them when needed. Includes various sentiment lexicons and labeled text data sets for classification and analysis. 12 | } 13 | \seealso{ 14 | Useful links: 15 | \itemize{ 16 | \item \url{https://emilhvitfeldt.github.io/textdata/} 17 | \item \url{https://github.com/EmilHvitfeldt/textdata} 18 | \item Report bugs at \url{https://github.com/EmilHvitfeldt/textdata/issues} 19 | } 20 | 21 | } 22 | \author{ 23 | \strong{Maintainer}: Emil Hvitfeldt \email{emilhhvitfeldt@gmail.com} (\href{https://orcid.org/0000-0002-0679-1945}{ORCID}) 24 | 25 | Other contributors: 26 | \itemize{ 27 | \item Julia Silge \email{julia.silge@gmail.com} (\href{https://orcid.org/0000-0002-3671-836X}{ORCID}) [contributor] 28 | } 29 | 30 | } 31 | \keyword{internal} 32 | -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-120x120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/apple-touch-icon-120x120.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-152x152.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/apple-touch-icon-152x152.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-180x180.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/apple-touch-icon-180x180.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-60x60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/apple-touch-icon-60x60.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-76x76.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/apple-touch-icon-76x76.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/apple-touch-icon.png 
-------------------------------------------------------------------------------- /pkgdown/favicon/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/favicon-16x16.png -------------------------------------------------------------------------------- /pkgdown/favicon/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/favicon-32x32.png -------------------------------------------------------------------------------- /pkgdown/favicon/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/favicon.ico -------------------------------------------------------------------------------- /revdep/README.md: -------------------------------------------------------------------------------- 1 | # Platform 2 | 3 | |field |value | 4 | |:--------|:----------------------------------------------------------------------------------------| 5 | |version |R version 4.3.3 (2024-02-29) | 6 | |os |macOS Sonoma 14.4.1 | 7 | |system |aarch64, darwin20 | 8 | |ui |X11 | 9 | |language |(EN) | 10 | |collate |en_US.UTF-8 | 11 | |ctype |en_US.UTF-8 | 12 | |tz |America/Los_Angeles | 13 | |date |2024-05-28 | 14 | |pandoc |3.1.12.3 @ /Applications/Positron.app/Contents/Resources/app/bin/pandoc/ (via rmarkdown) | 15 | 16 | # Dependencies 17 | 18 | |package |old |new |Δ | 19 | |:-----------|:-----|:----------|:--| 20 | |textdata |0.4.4 |0.4.4.9000 |* | 21 | |bit |4.0.5 |4.0.5 | | 22 | |bit64 |4.0.5 |4.0.5 | | 23 | |cli |3.6.2 |3.6.2 | | 24 | |clipr |0.8.0 |0.8.0 | | 25 | |cpp11 |0.4.7 |0.4.7 | | 26 | |crayon |1.5.2 |1.5.2 | | 27 | |fansi |1.0.6 |1.0.6 | | 28 | |fs |1.6.4 |1.6.4 | | 29 | |glue |1.7.0 |1.7.0 | | 30 | |hms |1.1.3 |1.1.3 | | 31 | |lifecycle |1.0.4 |1.0.4 | | 32 | |magrittr |2.0.3 |2.0.3 | | 33 | |pillar |1.9.0 |1.9.0 | | 34 | |pkgconfig |2.0.3 |2.0.3 | | 35 | |prettyunits |1.2.0 |1.2.0 | | 36 | |progress |1.2.3 |1.2.3 | | 37 | |R6 |2.5.1 |2.5.1 | | 38 | |rappdirs |0.3.3 |0.3.3 | | 39 | |readr |2.1.5 |2.1.5 | | 40 | |rlang |1.1.3 |1.1.3 | | 41 | |tibble |3.2.1 |3.2.1 | | 42 | |tidyselect |1.2.1 |1.2.1 | | 43 | |tzdb |0.4.0 |0.4.0 | | 44 | |utf8 |1.2.4 |1.2.4 | | 45 | |vctrs |0.6.5 |0.6.5 | | 46 | |vroom |1.6.5 |1.6.5 | | 47 | |withr |3.0.0 |3.0.0 | | 48 | 49 | # Revdeps 50 | 51 | -------------------------------------------------------------------------------- /revdep/cran.md: -------------------------------------------------------------------------------- 1 | ## revdepcheck results 2 | 3 | We checked 3 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package. 4 | 5 | * We saw 0 new problems 6 | * We failed to check 0 packages 7 | 8 | -------------------------------------------------------------------------------- /revdep/failures.md: -------------------------------------------------------------------------------- 1 | *Wow, no problems at all. :)* -------------------------------------------------------------------------------- /revdep/problems.md: -------------------------------------------------------------------------------- 1 | *Wow, no problems at all. 
:)* -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(textdata) 3 | 4 | test_check("textdata") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-download_functions.R: -------------------------------------------------------------------------------- 1 | downloads <- setdiff( 2 | ls(getNamespace("textdata"), pattern = "^download_"), 3 | "download_functions" 4 | ) 5 | 6 | test_that("All download functions are included in download_functions", { 7 | expect_equal( 8 | length(downloads), 9 | length(textdata:::download_functions) 10 | ) 11 | }) 12 | 13 | test_that("All download functions has the folder_path argument", { 14 | for (fun in downloads) { 15 | expect_equal( 16 | names(formals(get(fun, getNamespace("textdata")))), 17 | "folder_path" 18 | ) 19 | } 20 | }) 21 | 22 | test_that("the download functions are named right according to print_info", { 23 | testthat::expect_setequal( 24 | paste0("download_", names(textdata:::print_info)), 25 | downloads 26 | ) 27 | }) 28 | -------------------------------------------------------------------------------- /tests/testthat/test-info.R: -------------------------------------------------------------------------------- 1 | test_that("print_info has right names", { 2 | lapply( 3 | textdata:::print_info, 4 | function(x) expect_true(all(names(x) == c("name", "url", "license", "size", "type", "download_mech", "description", "citation"))) 5 | ) 6 | }) 7 | -------------------------------------------------------------------------------- /tests/testthat/test-process_functions.R: -------------------------------------------------------------------------------- 1 | processs <- setdiff( 2 | ls(getNamespace("textdata"), pattern = "^process_"), 3 | "process_functions" 4 | ) 5 | 6 | test_that("All process functions are included in process_functions", { 7 | expect_equal( 8 | length(processs), 9 | length(textdata:::process_functions) 10 | ) 11 | }) 12 | 13 | test_that("All process functions has the folder_path argument", { 14 | for (fun in processs) { 15 | expect_equal( 16 | names(formals(get(fun, getNamespace("textdata")))), 17 | c("folder_path", "name_path") 18 | ) 19 | } 20 | }) 21 | 22 | test_that("the process functions are named right according to print_info", { 23 | testthat::expect_setequal( 24 | paste0("process_", names(textdata:::print_info)), 25 | processs 26 | ) 27 | }) 28 | -------------------------------------------------------------------------------- /textdata.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/How-to-add-a-data-set.Rmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "How to add a data set" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{How to add a data set} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>" 14 | ) 15 | ``` 16 | 17 | ```{r setup} 18 | library(textdata) 19 | ``` 20 | 21 | This package provides infrastructure to make text datasets available within R, even when they are too large to store within an R package or are licensed in such a way that prevents them from being included in OSS-licensed packages. 22 | 23 | Do you want to add a new dataset to the textdata package? 24 | 25 | - Create an R file named `prefix_*.R` in the `R/` folder, where `*` is the name of the dataset. Supported prefixes include 26 | - `dataset_` 27 | - `lexicon_` 28 | - Inside that file, create three functions named `download_*()`, `process_*()`, and `dataset_*()` (a minimal sketch of all three is shown below). 29 | - The `download_*()` function should take one argument named `folder_path`. It has two tasks: first, check whether the file has already been downloaded, and if so return `invisible()`; otherwise, download the file to that path. 30 | - The `process_*()` function should take two arguments, `folder_path` and `name_path`. `folder_path` denotes the path to the file returned by `download_*()` and `name_path` is the path where the polished data should live. The main purpose of `process_*()` is to turn the downloaded file into a .rds file containing a tidy tibble. 31 | - The `dataset_*()` function should wrap `load_dataset()`. 32 | - Add the `process_*()` function to the named list `process_functions` in the file process_functions.R. 33 | - Add the `download_*()` function to the named list `download_functions` in the file download_functions.R. 34 | - Modify the `print_info` list in the info.R file. 35 | - Add `dataset_*.R` to the @include tags in `download_functions.R`. 36 | - Add the dataset to the table in `README.Rmd`. 37 | - Add the dataset to `_pkgdown.yml`. 38 | - Write a bullet in the `NEWS.md` file. 39 | 40 | What are the guidelines for adding datasets? 41 | 42 | # Guidelines for textdata datasets 43 | 44 | - All datasets must have a license or terms of use clearly specified. 45 | - Data should be a vector or tibble. 46 | - Use `word` instead of `words` for column names. 47 | 48 | # Classification datasets 49 | 50 | For datasets that come with both a training and a testing set, let the user pick which one to retrieve with a `split` argument, similar to how `dataset_ag_news()` does it. 51 | --------------------------------------------------------------------------------
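To make the three-function pattern from the vignette concrete, here is a minimal, hypothetical sketch for a made-up dataset called "foo". The URL, file names, and column names below are placeholders rather than anything shipped with textdata; only the argument names (which follow the vignette) and `load_dataset()`, the internal loader documented in `man/load_dataset.Rd` above, correspond to the real package.

```r
# Hypothetical example: "foo", the URL, and the column names do not exist in
# textdata; they only illustrate the shape of the three functions.

download_foo <- function(folder_path) {
  file_path <- file.path(folder_path, "foo.csv")
  # Task 1: if the raw file is already in the cache folder, do nothing.
  if (file.exists(file_path)) {
    return(invisible())
  }
  # Task 2: otherwise download the raw file into the cache folder.
  utils::download.file("https://example.com/foo.csv", destfile = file_path)
}

process_foo <- function(folder_path, name_path) {
  raw <- readr::read_csv(file.path(folder_path, "foo.csv"), show_col_types = FALSE)
  # Polish the raw file into a tidy tibble (note `word`, not `words`).
  tidy <- tibble::tibble(word = raw$word, sentiment = raw$sentiment)
  # Save the tidy tibble where textdata expects to find it.
  saveRDS(tidy, name_path)
}

dataset_foo <- function(dir = NULL, delete = FALSE, return_path = FALSE,
                        clean = FALSE, manual_download = FALSE) {
  # Thin wrapper around the internal loader shown in man/load_dataset.Rd.
  load_dataset(
    data_name = "foo",
    name = "foo.rds",
    dir = dir,
    delete = delete,
    return_path = return_path,
    clean = clean,
    manual_download = manual_download
  )
}
```

The split of responsibilities is deliberate: `download_foo()` only fetches and never re-downloads an existing file, `process_foo()` turns the raw download into a single tidy .rds, and `dataset_foo()` stays a thin wrapper, which is what lets `load_dataset()` handle caching, deletion, and path reporting uniformly for every dataset.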