├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ ├── pkgdown.yaml │ └── pr-commands.yaml ├── .gitignore ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R ├── cache_info.R ├── dataset_ag_news.R ├── dataset_dbpedia.R ├── dataset_imdb.R ├── dataset_sentence_polarity.R ├── dataset_trec.R ├── download_functions.R ├── embedding_glove.R ├── info.R ├── lexicon_afinn.R ├── lexicon_bing.R ├── lexicon_loughran.R ├── lexicon_nrc.R ├── lexicon_nrc_eil.R ├── lexicon_nrc_vad.R ├── load_dataset.R ├── printer.R ├── process_functions.R └── textdata-package.R ├── README.Rmd ├── README.md ├── _pkgdown.yml ├── codecov.yml ├── cran-comments.md ├── man ├── cache_info.Rd ├── catalogue.Rd ├── dataset_ag_news.Rd ├── dataset_dbpedia.Rd ├── dataset_imdb.Rd ├── dataset_sentence_polarity.Rd ├── dataset_trec.Rd ├── embedding_glove.Rd ├── figures │ ├── .DS_Store │ ├── logo.png │ ├── screen-shot.png │ └── textdata_demo.gif ├── lexicon_afinn.Rd ├── lexicon_bing.Rd ├── lexicon_loughran.Rd ├── lexicon_nrc.Rd ├── lexicon_nrc_eil.Rd ├── lexicon_nrc_vad.Rd ├── load_dataset.Rd └── textdata-package.Rd ├── pkgdown └── favicon │ ├── apple-touch-icon-120x120.png │ ├── apple-touch-icon-152x152.png │ ├── apple-touch-icon-180x180.png │ ├── apple-touch-icon-60x60.png │ ├── apple-touch-icon-76x76.png │ ├── apple-touch-icon.png │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ └── favicon.ico ├── revdep ├── README.md ├── cran.md ├── failures.md └── problems.md ├── tests ├── testthat.R └── testthat │ ├── test-download_functions.R │ ├── test-info.R │ └── test-process_functions.R ├── textdata.Rproj └── vignettes ├── .gitignore └── How-to-add-a-data-set.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^README\.Rmd$ 2 | ^LICENSE\.md$ 3 | ^textdata\.Rproj$ 4 | ^\.Rproj\.user$ 5 | ^\.travis\.yml$ 6 | ^CODE_OF_CONDUCT\.md$ 7 | ^cran-comments\.md$ 8 | ^_pkgdown\.yml$ 9 | ^docs$ 10 | ^pkgdown$ 11 | ^CRAN-RELEASE$ 12 | ^revdep$ 13 | ^codecov\.yml$ 14 | ^\.github/workflows/R-CMD-check\.yaml$ 15 | ^\.github/workflows/pr-commands\.yaml$ 16 | ^\.github$ 17 | ^CRAN-SUBMISSION$ 18 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macOS-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v2 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: any::rcmdcheck 45 | needs: check 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | steps: 23 | - uses: actions/checkout@v2 24 | 25 | - uses: r-lib/actions/setup-pandoc@v2 26 | 27 | - uses: r-lib/actions/setup-r@v2 28 | with: 29 | use-public-rspm: true 30 | 31 | - uses: r-lib/actions/setup-r-dependencies@v2 32 | with: 33 | extra-packages: any::pkgdown, local::. 
34 | needs: website 35 | 36 | - name: Build site 37 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 38 | shell: Rscript {0} 39 | 40 | - name: Deploy to GitHub pages 🚀 41 | if: github.event_name != 'pull_request' 42 | uses: JamesIves/github-pages-deploy-action@4.1.4 43 | with: 44 | clean: false 45 | branch: gh-pages 46 | folder: docs 47 | -------------------------------------------------------------------------------- /.github/workflows/pr-commands.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | issue_comment: 3 | types: [created] 4 | name: Commands 5 | jobs: 6 | document: 7 | if: startsWith(github.event.comment.body, '/document') 8 | name: document 9 | runs-on: macOS-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - uses: r-lib/actions/pr-fetch@master 13 | with: 14 | repo-token: ${{ secrets.GITHUB_TOKEN }} 15 | - uses: r-lib/actions/setup-r@master 16 | - name: Install dependencies 17 | run: Rscript -e 'install.packages(c("remotes", "roxygen2"))' -e 'remotes::install_deps(dependencies = TRUE)' 18 | - name: Document 19 | run: Rscript -e 'roxygen2::roxygenise()' 20 | - name: commit 21 | run: | 22 | git add man/\* NAMESPACE 23 | git commit -m 'Document' 24 | - uses: r-lib/actions/pr-push@master 25 | with: 26 | repo-token: ${{ secrets.GITHUB_TOKEN }} 27 | style: 28 | if: startsWith(github.event.comment.body, '/style') 29 | name: style 30 | runs-on: macOS-latest 31 | steps: 32 | - uses: actions/checkout@v2 33 | - uses: r-lib/actions/pr-fetch@master 34 | with: 35 | repo-token: ${{ secrets.GITHUB_TOKEN }} 36 | - uses: r-lib/actions/setup-r@master 37 | - name: Install dependencies 38 | run: Rscript -e 'install.packages("styler")' 39 | - name: Style 40 | run: Rscript -e 'styler::style_pkg()' 41 | - name: commit 42 | run: | 43 | git add \*.R 44 | git commit -m 'Style' 45 | - uses: r-lib/actions/pr-push@master 46 | with: 47 | repo-token: ${{ secrets.GITHUB_TOKEN }} 48 | # A mock job just to ensure we have a successful build status 49 | finish: 50 | runs-on: ubuntu-latest 51 | steps: 52 | - run: true 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | .DS_Store 6 | inst/doc 7 | docs/ 8 | 9 | revdep/checks 10 | revdep/library 11 | revdep/checks.noindex 12 | revdep/library.noindex 13 | revdep/data.sqlite 14 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | cache: packages 5 | 6 | before_cache: Rscript -e 'remotes::install_cran("pkgdown")' 7 | deploy: 8 | provider: script 9 | script: Rscript -e 'pkgdown::deploy_site_github()' 10 | skip_cleanup: true 11 | 12 | after_success: 13 | - Rscript -e 'covr::codecov()' 14 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, we pledge to respect all people who 4 | contribute through reporting issues, posting feature requests, updating documentation, 5 | submitting pull requests or patches, and other activities. 
6 | 7 | We are committed to making participation in this project a harassment-free experience for 8 | everyone, regardless of level of experience, gender, gender identity and expression, 9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. 10 | 11 | Examples of unacceptable behavior by participants include the use of sexual language or 12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment, 13 | insults, or other unprofessional conduct. 14 | 15 | Project maintainers have the right and responsibility to remove, edit, or reject comments, 16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this 17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 18 | from the project team. 19 | 20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 21 | opening an issue or contacting one or more of the project maintainers. 22 | 23 | This Code of Conduct is adapted from the Contributor Covenant 24 | (https://www.contributor-covenant.org), version 1.0.0, available at 25 | https://contributor-covenant.org/version/1/0/0/. 26 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: textdata 2 | Title: Download and Load Various Text Datasets 3 | Version: 0.4.5.9000 4 | Authors@R: c( 5 | person("Emil", "Hvitfeldt", , "emilhhvitfeldt@gmail.com", role = c("aut", "cre"), 6 | comment = c(ORCID = "0000-0002-0679-1945")), 7 | person("Julia", "Silge", , "julia.silge@gmail.com", role = "ctb", 8 | comment = c(ORCID = "0000-0002-3671-836X")) 9 | ) 10 | Description: Provides a framework to download, parse, and store text 11 | datasets on the disk and load them when needed. Includes various 12 | sentiment lexicons and labeled text data sets for classification and 13 | analysis. 
14 | License: MIT + file LICENSE 15 | URL: https://emilhvitfeldt.github.io/textdata/, https://github.com/EmilHvitfeldt/textdata 16 | BugReports: https://github.com/EmilHvitfeldt/textdata/issues 17 | Imports: 18 | fs, 19 | rappdirs, 20 | readr, 21 | tibble 22 | Suggests: 23 | covr, 24 | knitr, 25 | rmarkdown, 26 | testthat (>= 2.1.0) 27 | VignetteBuilder: 28 | knitr 29 | Encoding: UTF-8 30 | RoxygenNote: 7.3.1.9000 31 | Collate: 32 | 'cache_info.R' 33 | 'dataset_ag_news.R' 34 | 'dataset_dbpedia.R' 35 | 'dataset_imdb.R' 36 | 'dataset_sentence_polarity.R' 37 | 'dataset_trec.R' 38 | 'embedding_glove.R' 39 | 'lexicon_nrc_vad.R' 40 | 'lexicon_nrc_eil.R' 41 | 'lexicon_nrc.R' 42 | 'lexicon_bing.R' 43 | 'lexicon_loughran.R' 44 | 'lexicon_afinn.R' 45 | 'download_functions.R' 46 | 'info.R' 47 | 'load_dataset.R' 48 | 'printer.R' 49 | 'process_functions.R' 50 | 'textdata-package.R' 51 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2018 2 | COPYRIGHT HOLDER: Emil Hvitfeldt 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2018 Emil Hvitfeldt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(cache_info) 4 | export(catalogue) 5 | export(dataset_ag_news) 6 | export(dataset_dbpedia) 7 | export(dataset_imdb) 8 | export(dataset_sentence_polarity) 9 | export(dataset_trec) 10 | export(embedding_glove27b) 11 | export(embedding_glove42b) 12 | export(embedding_glove6b) 13 | export(embedding_glove840b) 14 | export(lexicon_afinn) 15 | export(lexicon_bing) 16 | export(lexicon_loughran) 17 | export(lexicon_nrc) 18 | export(lexicon_nrc_eil) 19 | export(lexicon_nrc_vad) 20 | export(load_dataset) 21 | importFrom(fs,dir_create) 22 | importFrom(fs,dir_delete) 23 | importFrom(fs,dir_exists) 24 | importFrom(fs,dir_ls) 25 | importFrom(fs,file_delete) 26 | importFrom(fs,file_exists) 27 | importFrom(fs,path) 28 | importFrom(readr,col_character) 29 | importFrom(readr,col_double) 30 | importFrom(readr,cols) 31 | importFrom(readr,cols_only) 32 | importFrom(readr,read_csv) 33 | importFrom(readr,read_delim) 34 | importFrom(readr,read_lines) 35 | importFrom(readr,read_rds) 36 | importFrom(readr,read_tsv) 37 | importFrom(readr,write_rds) 38 | importFrom(tibble,tibble) 39 | importFrom(utils,download.file) 40 | importFrom(utils,menu) 41 | importFrom(utils,untar) 42 | importFrom(utils,unzip) 43 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # textdata (development version) 2 | 3 | # textdata 0.4.5 4 | 5 | * Fixed bug where `lexicon_nrc_vad()` didn't have column names. (#53) 6 | 7 | # textdata 0.4.4 8 | 9 | * Update path to correctly point to the source for the NRC lexicon. 10 | 11 | # textdata 0.4.3 12 | 13 | * Fixed documentation to be HTML5 friendly. 14 | 15 | # textdata 0.4.2 16 | 17 | * `cache_info()` function has been added to allow for a quick overview of cache size. 18 | * Update download URL for `lexicon_nrc()`. 19 | 20 | # textdata 0.4.1 21 | 22 | # textdata 0.4.0 23 | 24 | * `embedding_glove6b()`, `embedding_glove27b()`, `embedding_glove42b()`, and `embedding_glove840b()` have been added to give access to the Stanford NLP Global Vectors for Word Representation pre-trained word vectors (@jonthegeek, #26). 25 | * `manual_download` argument has been added to all functions to allow the user to manually place the downloaded file in the right location. 26 | 27 | # textdata 0.3.0 28 | 29 | * `lexicon_nrc_eil()` has been added to give access to the NRC Emotion Intensity Lexicon (aka Affect Intensity Lexicon) v0.5. 30 | * `lexicon_nrc_vad()` has been added to give access to the NRC Valence, Arousal, and Dominance Lexicon. 31 | * The argument `clean` has been added to all functions to allow deletion of intermediate files. 32 | * An optional information prompt has been implemented. It is turned off by default and turned on at the original authors' request. 33 | * `lexicon_nrc()` got an improved URL for faster and more reliable downloads. 34 | 35 | # textdata 0.2.0 36 | 37 | * `dataset_imdb()` has been added to give access to the IMDb Large Movie Review Dataset. 38 | * `dataset_trec()` has been added to give access to the TREC-6 and TREC-50 classification datasets. 39 | * `dataset_dbpedia()` has been added to give access to the DBpedia Ontology classification dataset. 40 | * `dataset_ag_news()` has been added to give access to AG's News Topic classification dataset.
41 | * Functions will now notify the user about the download mechanism (http, https, etc.) used to download the data. (#12) 42 | * `lexicon_nrc()` has been added to give access to the NRC Emotion lexicon (@juliasilge, #11). 43 | 44 | # textdata 0.1.0 45 | -------------------------------------------------------------------------------- /R/cache_info.R: -------------------------------------------------------------------------------- 1 | #' List folders and their sizes in cache 2 | #' 3 | #' This function will return a tibble with the names and sizes of all folders in 4 | #' the specified directory. Defaults to textdata's default cache directory. 5 | #' 6 | #' @inheritParams lexicon_afinn 7 | #' 8 | #' @return A tibble with 2 variables: 9 | #' \describe{ 10 | #' \item{name}{Name of the folder} 11 | #' \item{size}{Size of the folder} 12 | #' } 13 | #' @export 14 | #' 15 | #' @examples 16 | #' \dontrun{ 17 | #' cache_info() 18 | #' } 19 | cache_info <- function(dir = NULL) { 20 | dir <- ifelse(is.null(dir), rappdirs::user_cache_dir("textdata"), dir) 21 | 22 | folders <- fs::dir_info(dir) 23 | 24 | folders <- folders$path[folders$type == "directory"] 25 | 26 | sizes <- vapply(folders, folder_size, numeric(1)) 27 | 28 | tibble::tibble( 29 | name = basename(folders), 30 | size = fs::as_fs_bytes(sizes) 31 | ) 32 | } 33 | 34 | folder_size <- function(x) { 35 | sum(fs::dir_info(x)$size) 36 | } 37 | -------------------------------------------------------------------------------- /R/dataset_ag_news.R: -------------------------------------------------------------------------------- 1 | #' AG's News Topic Classification Dataset 2 | #' 3 | #' The AG's news topic classification dataset is constructed by choosing the 4 4 | #' largest classes from the original corpus. Each class contains 30,000 training 5 | #' samples and 1,900 testing samples. The total number of training samples is 6 | #' 120,000, and the total number of testing samples is 7,600. 7 | 8 | #' Version 3, Updated 09/09/2015 9 | #' 10 | #' The classes in this dataset are 11 | #' 12 | #' \itemize{ 13 | #' \item World 14 | #' \item Sports 15 | #' \item Business 16 | #' \item Sci/Tech 17 | #' } 18 | #' 19 | #' @inheritParams lexicon_afinn 20 | #' @param split Character. Return training ("train") data or testing ("test") 21 | #' data. Defaults to "train".
22 | #' @return A tibble with 120,000 or 7,600 rows for "train" and "test" 23 | #' respectively and 3 variables: 24 | #' \describe{ 25 | #' \item{class}{Character, denoting the news class} 26 | #' \item{title}{Character, title of article} 27 | #' \item{description}{Character, description of article} 28 | #' } 29 | #' @source \url{http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html} 30 | #' @source \url{https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz} 31 | #' @keywords datasets 32 | #' @family topic 33 | #' @export 34 | #' @examples 35 | #' \dontrun{ 36 | #' dataset_ag_news() 37 | #' 38 | #' # Custom directory 39 | #' dataset_ag_news(dir = "data/") 40 | #' 41 | #' # Deleting dataset 42 | #' dataset_ag_news(delete = TRUE) 43 | #' 44 | #' # Returning filepath of data 45 | #' dataset_ag_news(return_path = TRUE) 46 | #' 47 | #' # Access both training and testing dataset 48 | #' train <- dataset_ag_news(split = "train") 49 | #' test <- dataset_ag_news(split = "test") 50 | #' } 51 | #' 52 | #' @importFrom fs file_exists dir_exists dir_create path 53 | #' @importFrom readr read_rds 54 | #' @importFrom utils menu 55 | dataset_ag_news <- function(dir = NULL, split = c("train", "test"), 56 | delete = FALSE, return_path = FALSE, 57 | clean = FALSE, manual_download = FALSE) { 58 | all_files <- paste0("ag_news_", c("train", "test"), ".rds") 59 | split <- match.arg(split) 60 | name <- paste0("ag_news_", split, ".rds") 61 | load_dataset( 62 | data_name = "ag_news", name = name, dir = dir, 63 | delete = delete, return_path = return_path, clean = clean, 64 | clean_manual = all_files, 65 | manual_download = manual_download 66 | ) 67 | } 68 | 69 | #' @importFrom utils download.file 70 | download_ag_news <- function(folder_path) { 71 | file_path_test <- path(folder_path, "ag_news_test.csv") 72 | file_path_train <- path(folder_path, "ag_news_train.csv") 73 | 74 | if (file_exists(file_path_test) & file_exists(file_path_train)) { 75 | return(invisible()) 76 | } 77 | 78 | download.file( 79 | url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv", 80 | destfile = file_path_test 81 | ) 82 | download.file( 83 | url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv", 84 | destfile = file_path_train 85 | ) 86 | } 87 | 88 | #' @importFrom readr read_tsv write_rds cols col_character col_double 89 | #' @importFrom tibble tibble 90 | process_ag_news <- function(folder_path, name_path) { 91 | file_path_test <- path(folder_path, "ag_news_test.csv") 92 | file_path_train <- path(folder_path, "ag_news_train.csv") 93 | 94 | data_test <- read_csv(file_path_test, 95 | col_names = c("class", "title", "description"), 96 | col_types = cols( 97 | class = col_double(), 98 | title = col_character(), 99 | description = col_character() 100 | ) 101 | ) 102 | data_train <- read_csv(file_path_train, 103 | col_names = c("class", "title", "description"), 104 | col_types = cols( 105 | class = col_double(), 106 | title = col_character(), 107 | description = col_character() 108 | ) 109 | ) 110 | 111 | classes <- c("World", "Sports", "Business", "Sci/Tech") 112 | 113 | data_test$class <- classes[data_test$class] 114 | data_train$class <- classes[data_train$class] 115 | 116 | write_rds(data_test, path(folder_path, "ag_news_test.rds")) 117 | write_rds(data_train, path(folder_path, "ag_news_train.rds")) 118 | } 119 | -------------------------------------------------------------------------------- /R/dataset_dbpedia.R:
-------------------------------------------------------------------------------- 1 | #' DBpedia Ontology Dataset 2 | #' 3 | #' The DBpedia ontology classification dataset. It contains 560,000 training 4 | #' samples and 70,000 testing samples in total, drawn from 14 non-overlapping 5 | #' classes from DBpedia. 6 | #' 7 | #' The classes are 8 | #' 9 | #' \itemize{ 10 | #' \item Company 11 | #' \item EducationalInstitution 12 | #' \item Artist 13 | #' \item Athlete 14 | #' \item OfficeHolder 15 | #' \item MeanOfTransportation 16 | #' \item Building 17 | #' \item NaturalPlace 18 | #' \item Village 19 | #' \item Animal 20 | #' \item Plant 21 | #' \item Album 22 | #' \item Film 23 | #' \item WrittenWork 24 | #' } 25 | #' 26 | #' @source \url{https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf} 27 | #' @source \url{https://www.dbpedia.org/} 28 | #' @source \url{https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz} 29 | #' @inheritParams lexicon_afinn 30 | #' @param split Character. Return training ("train") data or testing ("test") 31 | #' data. Defaults to "train". 32 | #' @return A tibble with 560,000 or 70,000 rows for "train" and "test" 33 | #' respectively and 3 variables: 34 | #' \describe{ 35 | #' \item{class}{Character, denoting the class} 36 | #' \item{title}{Character, title of article} 37 | #' \item{description}{Character, description of article} 38 | #' } 39 | #' @keywords datasets 40 | #' @family topic 41 | #' @export 42 | #' @examples 43 | #' \dontrun{ 44 | #' dataset_dbpedia() 45 | #' 46 | #' # Custom directory 47 | #' dataset_dbpedia(dir = "data/") 48 | #' 49 | #' # Deleting dataset 50 | #' dataset_dbpedia(delete = TRUE) 51 | #' 52 | #' # Returning filepath of data 53 | #' dataset_dbpedia(return_path = TRUE) 54 | #' 55 | #' # Access both training and testing dataset 56 | #' train <- dataset_dbpedia(split = "train") 57 | #' test <- dataset_dbpedia(split = "test") 58 | #' } 59 | #' 60 | #' @importFrom fs file_exists dir_exists dir_create path 61 | #' @importFrom readr read_rds 62 | #' @importFrom utils menu untar 63 | dataset_dbpedia <- function(dir = NULL, split = c("train", "test"), 64 | delete = FALSE, return_path = FALSE, 65 | clean = FALSE, manual_download = FALSE) { 66 | all_files <- paste0("dbpedia_", c("train", "test"), ".rds") 67 | split <- match.arg(split) 68 | name <- paste0("dbpedia_", split, ".rds") 69 | load_dataset( 70 | data_name = "dbpedia", name = name, dir = dir, 71 | delete = delete, return_path = return_path, clean = clean, 72 | clean_manual = all_files, 73 | manual_download = manual_download 74 | ) 75 | } 76 | 77 | #' @importFrom utils download.file 78 | download_dbpedia <- function(folder_path) { 79 | file_path <- path(folder_path, "dbpedia_csv.tar.gz") 80 | if (file_exists(file_path)) { 81 | return(invisible()) 82 | } 83 | download.file( 84 | url = "https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz", 85 | destfile = file_path 86 | ) 87 | } 88 | 89 | #' @importFrom readr read_tsv write_rds cols col_character col_double 90 | #' @importFrom tibble tibble 91 | process_dbpedia <- function(folder_path, name_path) { 92 | file_path_test <- path(folder_path, "dbpedia_csv/test.csv") 93 | file_path_train <- path(folder_path, "dbpedia_csv/train.csv") 94 | 95 | zip_path <- path(folder_path, "dbpedia_csv.tar.gz") 96 | 97 | untar(zip_path, files = c( 98 | "dbpedia_csv/test.csv", 99 | "dbpedia_csv/train.csv" 100 | ), exdir = folder_path) 101 | 102 | data_test <-
read_csv(file_path_test, 103 | col_names = c("class", "title", "description"), 104 | col_types = cols( 105 | class = col_double(), 106 | title = col_character(), 107 | description = col_character() 108 | ) 109 | ) 110 | data_train <- read_csv(file_path_train, 111 | col_names = c("class", "title", "description"), 112 | col_types = cols( 113 | class = col_double(), 114 | title = col_character(), 115 | description = col_character() 116 | ) 117 | ) 118 | 119 | classes <- c( 120 | "Company", "EducationalInstitution", "Artist", "Athlete", 121 | "OfficeHolder", "MeanOfTransportation", "Building", 122 | "NaturalPlace", "Village", "Animal", "Plant", "Album", "Film", 123 | "WrittenWork" 124 | ) 125 | 126 | data_test$class <- classes[data_test$class] 127 | data_train$class <- classes[data_train$class] 128 | 129 | write_rds(data_test, path(folder_path, "dbpedia_test.rds")) 130 | write_rds(data_train, path(folder_path, "dbpedia_train.rds")) 131 | 132 | fs::file_delete(path = file_path_test) 133 | fs::file_delete(path = file_path_train) 134 | } 135 | -------------------------------------------------------------------------------- /R/dataset_imdb.R: -------------------------------------------------------------------------------- 1 | #' IMDB Large Movie Review Dataset 2 | #' 3 | #' The core dataset contains 50,000 reviews split evenly into 25k train and 4 | #' 25k test sets. The overall distribution of labels is balanced (25k pos and 5 | #' 25k neg). 6 | #' 7 | #' In the entire collection, no more than 30 reviews are allowed for any 8 | #' given movie because reviews for the same movie tend to have correlated 9 | #' ratings. Further, the train and test sets contain a disjoint set of 10 | #' movies, so no significant performance is obtained by memorizing 11 | #' movie-unique terms and their associated with observed labels. In the 12 | #' labeled train/test sets, a negative review has a score <= 4 out of 10, 13 | #' and a positive review has a score >= 7 out of 10. Thus reviews with 14 | #' more neutral ratings are not included in the train/test sets. In the 15 | #' unsupervised set, reviews of any rating are included and there are an 16 | #' even number of reviews > 5 and <= 5. 17 | #' 18 | #' When using this dataset, please cite the ACL 2011 paper 19 | #' 20 | #' InProceedings\{maas-EtAl:2011:ACL-HLT2011, \cr 21 | #' author = \{Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher\}, \cr 22 | #' title = \{Learning Word Vectors for Sentiment Analysis\}, \cr 23 | #' booktitle = \{Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies\}, \cr 24 | #' month = \{June\}, \cr 25 | #' year = \{2011\}, \cr 26 | #' address = \{Portland, Oregon, USA\}, \cr 27 | #' publisher = \{Association for Computational Linguistics\}, \cr 28 | #' pages = \{142--150\}, \cr 29 | #' url = \{http://www.aclweb.org/anthology/P11-1015\} 30 | #' \} 31 | #' 32 | #' @source \url{http://ai.stanford.edu/~amaas/data/sentiment/} 33 | #' @inheritParams lexicon_afinn 34 | #' @param split Character. Return training ("train") data or testing ("test") 35 | #' data. Defaults to "train". 
36 | #' @return A tibble with 25,000 rows and 2 variables: 37 | #' \describe{ 38 | #' \item{Sentiment}{Character, denoting the sentiment} 39 | #' \item{text}{Character, text of the review} 40 | #' } 41 | #' @keywords datasets 42 | #' @family topic sentiment 43 | #' @export 44 | #' @examples 45 | #' \dontrun{ 46 | #' dataset_imdb() 47 | #' 48 | #' # Custom directory 49 | #' dataset_imdb(dir = "data/") 50 | #' 51 | #' # Deleting dataset 52 | #' dataset_imdb(delete = TRUE) 53 | #' 54 | #' # Returning filepath of data 55 | #' dataset_imdb(return_path = TRUE) 56 | #' 57 | #' # Access both training and testing dataset 58 | #' train <- dataset_imdb(split = "train") 59 | #' test <- dataset_imdb(split = "test") 60 | #' } 61 | #' 62 | #' @importFrom fs file_exists dir_exists dir_create path 63 | #' @importFrom readr read_rds 64 | #' @importFrom utils menu untar 65 | dataset_imdb <- function(dir = NULL, split = c("train", "test"), 66 | delete = FALSE, return_path = FALSE, clean = FALSE, 67 | manual_download = FALSE) { 68 | all_files <- paste0("imdb_", c("train", "test"), ".rds") 69 | split <- match.arg(split) 70 | name <- paste0("imdb_", split, ".rds") 71 | load_dataset( 72 | data_name = "imdb", name = name, dir = dir, 73 | delete = delete, return_path = return_path, clean = clean, 74 | clean_manual = all_files, 75 | manual_download = manual_download 76 | ) 77 | } 78 | 79 | #' @importFrom utils download.file 80 | download_imdb <- function(folder_path) { 81 | file_path <- path(folder_path, "imdb.tar.gz") 82 | if (file_exists(file_path)) { 83 | return(invisible()) 84 | } 85 | download.file( 86 | url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 87 | destfile = file_path 88 | ) 89 | } 90 | 91 | #' @importFrom readr read_tsv write_rds cols col_character col_double 92 | #' @importFrom fs dir_ls 93 | #' @importFrom tibble tibble 94 | process_imdb <- function(folder_path, name_path) { 95 | file_path_test <- path(folder_path, "imdb_csv/test.csv") 96 | file_path_train <- path(folder_path, "imdb_csv/train.csv") 97 | 98 | zip_path <- path(folder_path, "imdb.tar.gz") 99 | 100 | untar(zip_path, exdir = folder_path) 101 | 102 | files_test_neg <- dir_ls(path(folder_path, "aclimdb", "test", "neg")) 103 | files_test_pos <- dir_ls(path(folder_path, "aclimdb", "test", "pos")) 104 | 105 | data_test <- tibble( 106 | sentiment = rep( 107 | c("neg", "pos"), 108 | c( 109 | length(files_test_neg), 110 | length(files_test_pos) 111 | ) 112 | ), 113 | text = c( 114 | vapply(files_test_neg, read_lines, character(1)), 115 | vapply(files_test_pos, read_lines, character(1)) 116 | ) 117 | ) 118 | 119 | files_train_neg <- dir_ls(path(folder_path, "aclimdb", "train", "neg")) 120 | files_train_pos <- dir_ls(path(folder_path, "aclimdb", "train", "pos")) 121 | 122 | data_train <- tibble( 123 | sentiment = rep( 124 | c("neg", "pos"), 125 | c( 126 | length(files_train_neg), 127 | length(files_train_pos) 128 | ) 129 | ), 130 | text = c( 131 | vapply(files_train_neg, read_lines, character(1)), 132 | vapply(files_train_pos, read_lines, character(1)) 133 | ) 134 | ) 135 | 136 | write_rds(data_test, path(folder_path, "imdb_test.rds")) 137 | write_rds(data_train, path(folder_path, "imdb_train.rds")) 138 | } 139 | -------------------------------------------------------------------------------- /R/dataset_sentence_polarity.R: -------------------------------------------------------------------------------- 1 | #' v1.0 sentence polarity dataset 2 | #' 3 | #' 5331 positive and 5331 negative processed sentences / snippets. 
4 | #' Introduced in Pang/Lee ACL 2005. Released July 2005. 5 | #' 6 | #' Citation info: 7 | #' 8 | #' This data was first used in Bo Pang and Lillian Lee, 9 | #' ``Seeing stars: Exploiting class relationships for sentiment categorization 10 | #' with respect to rating scales.'', Proceedings of the ACL, 2005. 11 | #' 12 | #' InProceedings\{pang05, \cr 13 | #' author = \{Bo Pang and Lillian Lee\}, \cr 14 | #' title = \{Seeing stars: Exploiting class relationships for sentiment \cr 15 | #' categorization with respect to rating scales\}, \cr 16 | #' booktitle = \{Proceedings of the ACL\}, \cr 17 | #' year = 2005 \cr 18 | #' \} 19 | #' 20 | #' @inheritParams lexicon_afinn 21 | #' @return A tibble with 10,662 rows and 2 variables: 22 | #' \describe{ 23 | #' \item{text}{Sentences or snippets} 24 | #' \item{sentiment}{Indicator for sentiment, "neg" for negative and "pos" 25 | #' for positive} 26 | #' } 27 | #' @source \url{https://www.cs.cornell.edu/people/pabo/movie-review-data/} 28 | #' @keywords datasets 29 | #' @family sentiment 30 | #' @export 31 | #' @examples 32 | #' \dontrun{ 33 | #' dataset_sentence_polarity() 34 | #' 35 | #' # Custom directory 36 | #' dataset_sentence_polarity(dir = "data/") 37 | #' 38 | #' # Deleting dataset 39 | #' dataset_sentence_polarity(delete = TRUE) 40 | #' 41 | #' # Returning filepath of data 42 | #' dataset_sentence_polarity(return_path = TRUE) 43 | #' } 44 | #' 45 | #' @importFrom fs file_exists dir_exists dir_create path 46 | #' @importFrom readr read_rds 47 | #' @importFrom utils menu 48 | dataset_sentence_polarity <- function(dir = NULL, delete = FALSE, 49 | return_path = FALSE, clean = FALSE, 50 | manual_download = FALSE) { 51 | load_dataset( 52 | data_name = "sentence_polarity", name = "rt-polarity.rds", 53 | dir = dir, delete = delete, return_path = return_path, 54 | clean = clean, manual_download = manual_download 55 | ) 56 | } 57 | 58 | #' @importFrom utils download.file 59 | download_sentence_polarity <- function(folder_path) { 60 | file_path <- path(folder_path, "rt-polaritydata.tar.gz") 61 | if (file_exists(file_path)) { 62 | return(invisible()) 63 | } 64 | download.file( 65 | url = "https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz", 66 | destfile = file_path 67 | ) 68 | } 69 | 70 | #' @importFrom readr read_tsv write_rds cols col_character col_double 71 | #' @importFrom tibble tibble 72 | process_sentence_polarity <- function(folder_path, name_path) { 73 | full_text <- read_lines(path(folder_path, "rt-polaritydata.tar.gz")) 74 | 75 | neq_text <- full_text[55:5385] 76 | neq_text[1] <- "simplistic , silly and tedious . " 77 | pos_text <- full_text[5386:10716] 78 | pos_text[1] <- "othe rock is destined to be the 21st century's new \" conan \" and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . " 79 | 80 | data <- tibble( 81 | text = c(neq_text, pos_text), 82 | sentiment = c( 83 | rep("neg", length(neq_text)), 84 | rep("pos", length(pos_text)) 85 | ) 86 | ) 87 | write_rds(data, name_path) 88 | } 89 | -------------------------------------------------------------------------------- /R/dataset_trec.R: -------------------------------------------------------------------------------- 1 | #' TREC dataset 2 | #' 3 | #' The TREC dataset is dataset for question classification consisting of 4 | #' open-domain, fact-based questions divided into broad semantic categories. 5 | #' It has both a six-class (TREC-6) and a fifty-class (TREC-50) version. 
Both 6 | #' have 5,452 training examples and 500 test examples, but TREC-50 has 7 | #' finer-grained labels. Models are evaluated based on accuracy. 8 | #' 9 | #' The classes in TREC-6 are 10 | #' 11 | #' \itemize{ 12 | #' \item ABBR - Abbreviation 13 | #' \item DESC - Description and abstract concepts 14 | #' \item ENTY - Entities 15 | #' \item HUM - Human beings 16 | #' \item LOC - Locations 17 | #' \item NUM - Numeric values 18 | #' } 19 | #' 20 | #' The classes in TREC-50 can be found here 21 | #' \url{https://cogcomp.seas.upenn.edu/Data/QA/QC/definition.html}. 22 | #' 23 | #' @source \url{https://cogcomp.seas.upenn.edu/Data/QA/QC/} 24 | #' @source \url{https://trec.nist.gov/data/qa.html} 25 | #' @inheritParams lexicon_afinn 26 | #' @param split Character. Return training ("train") data or testing ("test") 27 | #' data. Defaults to "train". 28 | #' @param version Character. Version 6 ("6") or version 50 ("50"). Defaults to 29 | #' "6". 30 | #' @return A tibble with 5,452 or 500 rows for "train" and "test" 31 | #' respectively and 2 variables: 32 | #' \describe{ 33 | #' \item{class}{Character, denoting the class} 34 | #' \item{text}{Character, question text} 35 | #' } 36 | #' @keywords datasets 37 | #' @family topic 38 | #' @export 39 | #' @examples 40 | #' \dontrun{ 41 | #' dataset_trec() 42 | #' 43 | #' # Custom directory 44 | #' dataset_trec(dir = "data/") 45 | #' 46 | #' # Deleting dataset 47 | #' dataset_trec(delete = TRUE) 48 | #' 49 | #' # Returning filepath of data 50 | #' dataset_trec(return_path = TRUE) 51 | #' 52 | #' # Access both training and testing dataset 53 | #' train_6 <- dataset_trec(split = "train") 54 | #' test_6 <- dataset_trec(split = "test") 55 | #' 56 | #' train_50 <- dataset_trec(split = "train", version = "50") 57 | #' test_50 <- dataset_trec(split = "test", version = "50") 58 | #' } 59 | #' 60 | #' @importFrom fs file_exists dir_exists dir_create path 61 | #' @importFrom readr read_rds 62 | #' @importFrom utils menu untar 63 | dataset_trec <- function(dir = NULL, split = c("train", "test"), 64 | version = c("6", "50"), delete = FALSE, 65 | return_path = FALSE, clean = FALSE, 66 | manual_download = FALSE) { 67 | all_files <- paste0( 68 | "trec_", rep(c("6", "50"), 2), "_", 69 | rep(c("train", "test"), each = 2), ".rds" 70 | ) 71 | split <- match.arg(split) 72 | version <- match.arg(version) 73 | name <- paste0("trec_", version, "_", split, ".rds") 74 | load_dataset( 75 | data_name = "trec", name = name, dir = dir, 76 | delete = delete, return_path = return_path, clean = clean, 77 | clean_manual = all_files, 78 | manual_download = manual_download 79 | ) 80 | } 81 | 82 | #' @importFrom utils download.file 83 | download_trec <- function(folder_path) { 84 | file_path_train <- path(folder_path, "train_5500.label") 85 | file_path_test <- path(folder_path, "TREC_10.label") 86 | 87 | if (file_exists(file_path_train) & file_exists(file_path_test)) { 88 | return(invisible()) 89 | } 90 | download.file( 91 | url = "https://cogcomp.seas.upenn.edu/Data/QA/QC/train_5500.label", 92 | destfile = file_path_train 93 | ) 94 | download.file( 95 | url = "https://cogcomp.seas.upenn.edu/Data/QA/QC/TREC_10.label", 96 | destfile = file_path_test 97 | ) 98 | } 99 | 100 | #' @importFrom readr read_tsv write_rds cols col_character col_double 101 | #' @importFrom tibble tibble 102 | process_trec <- function(folder_path, name_path) { 103 | file_path_train <- path(folder_path, "train_5500.label") 104 | file_path_test <- path(folder_path, "TREC_10.label") 105 | 106 | # Test data 107 | data_test
<- read_lines(file_path_test) 108 | 109 | text_test <- gsub("^\\S* ", "", data_test) 110 | 111 | label_test <- sub("\\s.*", "", data_test) 112 | 113 | trec6_label_test <- sapply(strsplit(label_test, ":"), function(x) x[1]) 114 | trec50_label_test <- sapply(strsplit(label_test, ":"), function(x) x[2]) 115 | 116 | trec_6_test <- tibble( 117 | class = trec6_label_test, 118 | text = text_test 119 | ) 120 | trec_50_test <- tibble( 121 | class = trec50_label_test, 122 | text = text_test 123 | ) 124 | # train data 125 | data_train <- read_lines(file_path_train) 126 | 127 | text_train <- gsub("^\\S* ", "", data_train) 128 | 129 | label_train <- sub("\\s.*", "", data_train) 130 | 131 | trec6_label_train <- sapply(strsplit(label_train, ":"), function(x) x[1]) 132 | trec50_label_train <- sapply(strsplit(label_train, ":"), function(x) x[2]) 133 | 134 | trec_6_train <- tibble( 135 | class = trec6_label_train, 136 | text = text_train 137 | ) 138 | trec_50_train <- tibble( 139 | class = trec50_label_train, 140 | text = text_train 141 | ) 142 | 143 | write_rds(trec_6_test, path(folder_path, "trec_6_test.rds")) 144 | write_rds(trec_6_train, path(folder_path, "trec_6_train.rds")) 145 | 146 | write_rds(trec_50_test, path(folder_path, "trec_50_test.rds")) 147 | write_rds(trec_50_train, path(folder_path, "trec_50_train.rds")) 148 | } 149 | -------------------------------------------------------------------------------- /R/download_functions.R: -------------------------------------------------------------------------------- 1 | #' List of all download functions used in load_dataset 2 | #' 3 | #' @format Named list of all download functions 4 | #' @include lexicon_afinn.R lexicon_loughran.R lexicon_bing.R lexicon_nrc.R 5 | #' @include dataset_sentence_polarity.R dataset_ag_news.R dataset_dbpedia.R 6 | #' @include dataset_trec.R dataset_imdb.R lexicon_nrc_eil.R lexicon_nrc_vad.R 7 | #' @include embedding_glove.R 8 | #' 9 | #' @name download_functions 10 | #' @noRd 11 | NULL 12 | 13 | download_functions <- list( 14 | afinn = download_afinn, 15 | sentence_polarity = download_sentence_polarity, 16 | loughran = download_loughran, 17 | bing = download_bing, 18 | nrc = download_nrc, 19 | nrc_eil = download_nrc_eil, 20 | nrc_vad = download_nrc_vad, 21 | ag_news = download_ag_news, 22 | dbpedia = download_dbpedia, 23 | trec = download_trec, 24 | imdb = download_imdb, 25 | glove6b = download_glove6b, 26 | glove27b = download_glove27b, 27 | glove42b = download_glove42b, 28 | glove840b = download_glove840b 29 | ) 30 | -------------------------------------------------------------------------------- /R/embedding_glove.R: -------------------------------------------------------------------------------- 1 | #' Global Vectors for Word Representation 2 | #' 3 | #' The GloVe pre-trained word vectors provide word embeddings created using 4 | #' varying numbers of tokens. 5 | #' 6 | #' Citation info: 7 | #' 8 | #' InProceedings\{pennington2014glove, \cr 9 | #' author = \{Jeffrey Pennington and Richard Socher and Christopher D. \cr 10 | #' Manning\}, \cr 11 | #' title = \{GloVe: Global Vectors for Word Representation\}, \cr 12 | #' booktitle = \{Empirical Methods in Natural Language Processing (EMNLP)\}, \cr 13 | #' year = 2014 \cr 14 | #' pages = \{1532-1543\} \cr 15 | #' url = \{http://www.aclweb.org/anthology/D14-1162\} \cr 16 | #' \} 17 | #' 18 | #' @references Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 19 | #' 2014. GloVe: Global Vectors for Word Representation. 
20 | #' 21 | #' @inheritParams lexicon_afinn 22 | #' @param dimensions A number indicating the number of vectors to include. One 23 | #' of 50, 100, 200, or 300 for glove6b, or one of 25, 50, 100, or 200 for 24 | #' glove27b. 25 | #' @return A tibble with 400k, 1.9m, 2.2m, or 1.2m rows (one row for each unique 26 | #' token in the vocabulary) and the following variables: 27 | #' \describe{ 28 | #' \item{token}{An individual token (usually a word)} 29 | #' \item{d1, d2, etc}{The embeddings for that token.} 30 | #' } 31 | #' @source \url{https://nlp.stanford.edu/projects/glove/} 32 | #' @keywords datasets 33 | #' @family embeddings 34 | #' @examples 35 | #' \dontrun{ 36 | #' embedding_glove6b(dimensions = 50) 37 | #' 38 | #' # Custom directory 39 | #' embedding_glove42b(dir = "data/") 40 | #' 41 | #' # Deleting dataset 42 | #' embedding_glove6b(delete = TRUE, dimensions = 300) 43 | #' 44 | #' # Returning filepath of data 45 | #' embedding_glove840b(return_path = TRUE) 46 | #' } 47 | #' @name embedding_glove 48 | NULL 49 | 50 | #' @rdname embedding_glove 51 | #' @export 52 | #' @importFrom fs file_exists dir_exists dir_create path 53 | #' @importFrom readr read_rds 54 | #' @importFrom utils menu 55 | embedding_glove6b <- function(dir = NULL, 56 | dimensions = c(50, 100, 200, 300), 57 | delete = FALSE, 58 | return_path = FALSE, 59 | clean = FALSE, 60 | manual_download = FALSE) { 61 | this_glove <- "6b" 62 | available_dims <- c(50, 100, 200, 300) 63 | all_names <- construct_glove_name(this_glove, available_dims) 64 | dimensions <- as.character(dimensions) 65 | dimensions <- match.arg(dimensions, as.character(available_dims)) 66 | name <- construct_glove_name(this_glove, dimensions) 67 | load_dataset( 68 | data_name = "glove6b", name = name, dir = dir, 69 | delete = delete, return_path = return_path, clean = clean, 70 | clean_manual = all_names, 71 | manual_download = manual_download 72 | ) 73 | } 74 | 75 | #' @keywords internal 76 | construct_glove_name <- function(tokens = c("6b", "27b"), 77 | dimensions = c(25, 50, 100, 200, 300)) { 78 | tokens <- match.arg(tokens) 79 | dimensions <- as.character(dimensions) 80 | dimensions <- match.arg( 81 | dimensions, 82 | choices = as.character(c(25, 50, 100, 200, 300)), 83 | several.ok = TRUE 84 | ) 85 | paste0( 86 | paste( 87 | "glove", 88 | tokens, 89 | dimensions, 90 | sep = "_" 91 | ), 92 | ".rds" 93 | ) 94 | } 95 | 96 | #' @rdname embedding_glove 97 | #' @export 98 | #' @importFrom fs file_exists dir_exists dir_create path 99 | #' @importFrom readr read_rds 100 | #' @importFrom utils menu 101 | embedding_glove27b <- function(dir = NULL, 102 | dimensions = c(25, 50, 100, 200), 103 | delete = FALSE, 104 | return_path = FALSE, 105 | clean = FALSE, 106 | manual_download = FALSE) { 107 | this_glove <- "27b" 108 | available_dims <- c(25, 50, 100, 200) 109 | all_names <- construct_glove_name(this_glove, available_dims) 110 | dimensions <- as.character(dimensions) 111 | dimensions <- match.arg(dimensions, as.character(available_dims)) 112 | name <- construct_glove_name(this_glove, dimensions) 113 | load_dataset( 114 | data_name = "glove27b", name = name, dir = dir, 115 | delete = delete, return_path = return_path, clean = clean, 116 | clean_manual = all_names, 117 | manual_download = manual_download 118 | ) 119 | } 120 | 121 | #' @rdname embedding_glove 122 | #' @export 123 | #' @importFrom fs file_exists dir_exists dir_create path 124 | #' @importFrom readr read_rds 125 | #' @importFrom utils menu 126 | embedding_glove42b <- function(dir = NULL, 127 | 
delete = FALSE, 128 | return_path = FALSE, 129 | clean = FALSE, 130 | manual_download = FALSE) { 131 | name <- "glove_42b.rds" 132 | load_dataset( 133 | data_name = "glove42b", name = name, dir = dir, 134 | delete = delete, return_path = return_path, clean = clean, 135 | manual_download = manual_download 136 | ) 137 | } 138 | 139 | #' @rdname embedding_glove 140 | #' @export 141 | #' @importFrom fs file_exists dir_exists dir_create path 142 | #' @importFrom readr read_rds 143 | #' @importFrom utils menu 144 | embedding_glove840b <- function(dir = NULL, 145 | delete = FALSE, 146 | return_path = FALSE, 147 | clean = FALSE, 148 | manual_download = FALSE) { 149 | name <- "glove_840b.rds" 150 | load_dataset( 151 | data_name = "glove840b", name = name, dir = dir, 152 | delete = delete, return_path = return_path, clean = clean, 153 | manual_download = manual_download 154 | ) 155 | } 156 | 157 | #' @importFrom utils download.file 158 | #' @keywords internal 159 | download_glove6b <- function(folder_path) { 160 | file_path <- path(folder_path, "glove.6B.zip") 161 | if (file_exists(file_path)) { 162 | return(invisible()) 163 | } 164 | download.file( 165 | url = "http://nlp.stanford.edu/data/glove.6B.zip", 166 | destfile = file_path 167 | ) 168 | } 169 | 170 | #' @importFrom utils download.file 171 | #' @keywords internal 172 | download_glove42b <- function(folder_path) { 173 | file_path <- path(folder_path, "glove.42B.300d.zip") 174 | if (file_exists(file_path)) { 175 | return(invisible()) 176 | } 177 | download.file( 178 | url = "http://nlp.stanford.edu/data/glove.42B.300d.zip", 179 | destfile = file_path 180 | ) 181 | } 182 | 183 | #' @importFrom utils download.file 184 | #' @keywords internal 185 | download_glove840b <- function(folder_path) { 186 | file_path <- path(folder_path, "glove.840B.300d.zip") 187 | if (file_exists(file_path)) { 188 | return(invisible()) 189 | } 190 | download.file( 191 | url = "http://nlp.stanford.edu/data/glove.840B.300d.zip", 192 | destfile = file_path 193 | ) 194 | } 195 | 196 | #' @importFrom utils download.file 197 | #' @keywords internal 198 | download_glove27b <- function(folder_path) { 199 | file_path <- path(folder_path, "glove.twitter.27B.zip") 200 | if (file_exists(file_path)) { 201 | return(invisible()) 202 | } 203 | download.file( 204 | url = "http://nlp.stanford.edu/data/glove.twitter.27B.zip", 205 | destfile = file_path 206 | ) 207 | } 208 | 209 | #' @keywords internal 210 | process_glove6b <- function(folder_path, name_path) { 211 | # Processing all datasets when they only need one adds time. We'll 212 | # specifically deal with the one they requested, which means we need to 213 | # extract the dimensions back out of the name to build the raw filename. 
214 | filename <- gsub(folder_path, "", name_path) 215 | dimensions <- unlist(strsplit(filename, "_|\\."))[[3]] 216 | raw_name <- paste0("glove.6B.", dimensions, "d.txt") 217 | file <- unz(path(folder_path, "glove.6B.zip"), raw_name) 218 | 219 | write_glove(file, name_path, dimensions) 220 | } 221 | 222 | #' @keywords internal 223 | process_glove42b <- function(folder_path, name_path) { 224 | dimensions <- 300 225 | raw_name <- "glove.42B.300d.txt" 226 | file <- unz(path(folder_path, "glove.42B.300d.zip"), raw_name) 227 | 228 | write_glove(file, name_path, dimensions) 229 | } 230 | 231 | #' @keywords internal 232 | process_glove840b <- function(folder_path, name_path) { 233 | dimensions <- 300 234 | raw_name <- "glove.840B.300d.txt" 235 | file <- unz(path(folder_path, "glove.840B.300d.zip"), raw_name) 236 | 237 | write_glove(file, name_path, dimensions) 238 | } 239 | 240 | #' @keywords internal 241 | process_glove27b <- function(folder_path, name_path) { 242 | filename <- gsub(folder_path, "", name_path) 243 | dimensions <- unlist(strsplit(filename, "_|\\."))[[3]] 244 | raw_name <- paste0("glove.twitter.27B.", dimensions, "d.txt") 245 | 246 | file <- unz(path(folder_path, "glove.twitter.27B.zip"), raw_name) 247 | 248 | write_glove(file, name_path, dimensions) 249 | } 250 | 251 | #' @importFrom readr read_delim write_rds 252 | #' @keywords internal 253 | write_glove <- function(file, name_path, dimensions) { 254 | embeddings <- read_delim( 255 | file, 256 | delim = " ", 257 | quote = "", 258 | col_names = c( 259 | "token", 260 | paste0("d", seq_len(dimensions)) 261 | ), 262 | col_types = paste0( 263 | c( 264 | "c", 265 | rep("d", dimensions) 266 | ), 267 | collapse = "" 268 | ) 269 | ) 270 | 271 | write_rds(embeddings, name_path) 272 | } 273 | -------------------------------------------------------------------------------- /R/info.R: -------------------------------------------------------------------------------- 1 | print_info <- list( 2 | afinn = 3 | list( 4 | name = "AFINN-111", 5 | url = "http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010", 6 | license = "Open Database License (ODbL) v1.0", 7 | size = "78 KB (cleaned 59 KB)", 8 | type = "lexicon", 9 | download_mech = "https", 10 | description = "", 11 | citation = NA 12 | ), 13 | sentence_polarity = 14 | list( 15 | name = "v1.0 sentence polarity", 16 | url = "http://www.cs.cornell.edu/people/pabo/movie-review-data", 17 | license = "Cite the paper when used.", 18 | size = "2 MB (cleaned 1.4 MB)", 19 | type = "dataset", 20 | download_mech = "https", 21 | description = "Dataset with sentences labeled with negative or positive sentiment.", 22 | citation = NA 23 | ), 24 | loughran = 25 | list( 26 | name = "Loughran-McDonald Sentiment lexicon", 27 | url = "https://sraf.nd.edu/textual-analysis/resources/", 28 | license = "License required for commercial use. 
Please contact tloughra@nd.edu.", 29 | size = "6.7 MB (cleaned 142 KB)", 30 | type = "lexicon", 31 | download_mech = "https", 32 | description = "", 33 | citation = NA 34 | ), 35 | bing = 36 | list( 37 | name = "Bing Sentiment Lexicon", 38 | url = "https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html", 39 | license = "May be used (research, commercial, etc) with attribution.", 40 | size = "287 KB (cleaned 220 KB)", 41 | type = "lexicon", 42 | download_mech = "http", 43 | description = "", 44 | citation = NA 45 | ), 46 | nrc = 47 | list( 48 | name = "NRC Word-Emotion Association Lexicon", 49 | url = "http://saifmohammad.com/WebPages/lexicons.html", 50 | license = "License required for commercial use. Please contact Saif M. Mohammad (saif.mohammad@nrc-cnrc.gc.ca).", 51 | size = "22.8 MB (cleaned 424 KB)", 52 | type = "lexicon", 53 | download_mech = "http", 54 | description = "", 55 | citation = "Citation info: 56 | 57 | This dataset was published in Saif M. Mohammad and Peter Turney. (2013), ``Crowdsourcing a Word-Emotion Association Lexicon.'' Computational Intelligence, 29(3): 436-465. 58 | 59 | article{mohammad13, 60 | author = {Mohammad, Saif M. and Turney, Peter D.}, 61 | title = {Crowdsourcing a Word-Emotion Association Lexicon}, 62 | journal = {Computational Intelligence}, 63 | volume = {29}, 64 | number = {3}, 65 | pages = {436-465}, 66 | doi = {10.1111/j.1467-8640.2012.00460.x}, 67 | url = {https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-8640.2012.00460.x}, 68 | eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1467-8640.2012.00460.x}, 69 | year = {2013} 70 | } 71 | If you use this lexicon, then please cite it." 72 | ), 73 | nrc_eil = 74 | list( 75 | name = "NRC Emotion Intensity Lexicon (aka Affect Intensity Lexicon)", 76 | url = "www.saifmohammad.com/WebPages/AffectIntensity.htm", 77 | license = "License required for commercial use. Please contact Saif M. Mohammad (saif.mohammad@nrc-cnrc.gc.ca).", 78 | size = "333 KB (cleaned 212 KB)", 79 | type = "lexicon", 80 | download_mech = "http", 81 | description = "", 82 | citation = "Citation info: 83 | Details of the lexicon are in this paper. 84 | Word Affect Intensities. Saif M. Mohammad. arXiv preprint arXiv, April 2017. 85 | 86 | inproceedings{LREC18-AIL, 87 | author = {Mohammad, Saif M.}, 88 | title = {Word Affect Intensities}, 89 | booktitle = {Proceedings of the 11th Edition of the Language Resources and Evaluation Conference (LREC-2018)}, 90 | year = {2018}, 91 | address={Miyazaki, Japan} 92 | } 93 | 94 | If you use this lexicon, then please cite it." 95 | ), 96 | nrc_vad = 97 | list( 98 | name = "The NRC Valence, Arousal, and Dominance Lexicon", 99 | url = "https://saifmohammad.com/WebPages/nrc-vad.html", 100 | license = "License required for commercial use. Please contact Saif M. Mohammad (saif.mohammad@nrc-cnrc.gc.ca).", 101 | size = "150.8 MB (cleaned 792 KB)", 102 | type = "lexicon", 103 | download_mech = "http", 104 | description = "", 105 | citation = "Citation info: 106 | 107 | inproceedings{vad-acl2018, 108 | title={Obtaining Reliable Human Ratings of Valence, Arousal, and Dominance for 20,000 English Words}, 109 | author={Mohammad, Saif M.}, 110 | booktitle={Proceedings of The Annual Conference of the Association for Computational Linguistics (ACL)}, 111 | year={2018}, 112 | address={Melbourne, Australia} 113 | } 114 | 115 | If you use this lexicon, then please cite it." 
116 | ), 117 | ag_news = 118 | list( 119 | name = "AG News", 120 | url = "https://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html", 121 | license = "You are encouraged to download this corpus for any non-commercial use.", 122 | size = "64.4 MB (cleaned 33.9 MB)", 123 | type = "dataset", 124 | download_mech = "https", 125 | description = "", 126 | citation = NA 127 | ), 128 | dbpedia = 129 | list( 130 | name = "DBpedia", 131 | url = "https://wiki.dbpedia.org/", 132 | license = "Creative Commons Attribution-ShareAlike 3.0 License", 133 | size = "279.5 MB (cleaned 211.1 MB)", 134 | type = "dataset", 135 | download_mech = "https", 136 | description = "", 137 | citation = NA 138 | ), 139 | trec = 140 | list( 141 | name = "TREC-6 & TREC-50", 142 | url = "https://cogcomp.seas.upenn.edu/Data/QA/QC/", 143 | license = "Freely reusable public information licence", 144 | size = "1.2 MB (cleaned 827 KB)", 145 | type = "dataset", 146 | download_mech = "https", 147 | description = "", 148 | citation = NA 149 | ), 150 | imdb = 151 | list( 152 | name = "IMDb Large Movie Review Dataset", 153 | url = "http://ai.stanford.edu/~amaas/data/sentiment/", 154 | license = "No license specified, the work may be protected by copyright.", 155 | size = "376.4 MB (cleaned 71 MB)", 156 | type = "dataset", 157 | download_mech = "http", 158 | description = "", 159 | citation = NA 160 | ), 161 | glove6b = 162 | list( 163 | name = "GloVe 6B", 164 | url = "https://nlp.stanford.edu/projects/glove/", 165 | license = "Public Domain Dedication and License v1.0", 166 | size = "822.2 MB (158MB, 311MB, 616MB, and 921MB processed)", 167 | type = "embeddings", 168 | download_mech = "https", 169 | description = "Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 50d, 100d, 200d, & 300d vectors)", 170 | citation = "Citation info: 171 | inproceedings{pennington2014glove, 172 | author = {Jeffrey Pennington and Richard Socher and Christopher D. Manning}, 173 | booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, 174 | title = {GloVe: Global Vectors for Word Representation}, 175 | year = {2014}, 176 | pages = {1532--1543}, 177 | url = {http://www.aclweb.org/anthology/D14-1162}, 178 | }" 179 | ), 180 | glove27b = 181 | list( 182 | name = "GloVe Twitter 27B", 183 | url = "https://nlp.stanford.edu/projects/glove/", 184 | license = "Public Domain Dedication and License v1.0", 185 | size = "1.42 GB (248MB, 476MB, 931MB, and 1.79GB processed)", 186 | type = "embeddings", 187 | download_mech = "https", 188 | description = "Twitter (2B tweets, 27B tokens, 1.2M vocab, uncased, 25d, 50d, 100d, & 200d vectors)", 189 | citation = "Citation info: 190 | inproceedings{pennington2014glove, 191 | author = {Jeffrey Pennington and Richard Socher and Christopher D. 
Manning}, 192 | booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, 193 | title = {GloVe: Global Vectors for Word Representation}, 194 | year = {2014}, 195 | pages = {1532--1543}, 196 | url = {http://www.aclweb.org/anthology/D14-1162}, 197 | }" 198 | ), 199 | glove42b = 200 | list( 201 | name = "GloVe Common Crawl 42B", 202 | url = "https://nlp.stanford.edu/projects/glove/", 203 | license = "Public Domain Dedication and License v1.0", 204 | size = "1.75 GB (4.31GB processed)", 205 | type = "embeddings", 206 | download_mech = "https", 207 | description = "Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors)", 208 | citation = "Citation info: 209 | inproceedings{pennington2014glove, 210 | author = {Jeffrey Pennington and Richard Socher and Christopher D. Manning}, 211 | booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, 212 | title = {GloVe: Global Vectors for Word Representation}, 213 | year = {2014}, 214 | pages = {1532--1543}, 215 | url = {http://www.aclweb.org/anthology/D14-1162}, 216 | }" 217 | ), 218 | glove840b = 219 | list( 220 | name = "GloVe Common Crawl 840B", 221 | url = "https://nlp.stanford.edu/projects/glove/", 222 | license = "Public Domain Dedication and License v1.0", 223 | size = "2.03 GB (4.94GB processed)", 224 | type = "embeddings", 225 | download_mech = "https", 226 | description = "Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors)", 227 | citation = "Citation info: 228 | inproceedings{pennington2014glove, 229 | author = {Jeffrey Pennington and Richard Socher and Christopher D. Manning}, 230 | booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, 231 | title = {GloVe: Global Vectors for Word Representation}, 232 | year = {2014}, 233 | pages = {1532--1543}, 234 | url = {http://www.aclweb.org/anthology/D14-1162}, 235 | }" 236 | ) 237 | ) 238 | 239 | #' Catalogue of all available data sources 240 | #' @export 241 | "catalogue" 242 | catalogue <- Reduce(rbind, lapply(print_info, as.data.frame, 243 | stringsAsFactors = FALSE 244 | )) 245 | -------------------------------------------------------------------------------- /R/lexicon_afinn.R: -------------------------------------------------------------------------------- 1 | #' AFINN-111 dataset 2 | #' 3 | #' AFINN is a lexicon of English words rated for valence with an integer 4 | #' between minus five (negative) and plus five (positive). The words have 5 | #' been manually labeled by Finn Årup Nielsen in 2009-2011. 6 | #' 7 | #' This dataset is the newest version with 2477 words and phrases. 8 | #' 9 | #' Citation info: 10 | #' 11 | #' This dataset was published in Finn Ärup Nielsen (2011), 12 | #' ``A new Evaluation of a word list for sentiment analysis in 13 | #' microblogs'', Proceedings of the ESWC2011 Workshop on 14 | #' 'Making Sense of Microposts': Big things come in small packages (2011) 93-98. 
15 | #' 16 | #' article\{nielsen11, \cr 17 | #' author = \{Finn Äruprup Nielsen\}, \cr 18 | #' title = \{A new Evaluation of a word list for sentiment analysis in microblogs\}, \cr 19 | #' journal = \{CoRR\}, \cr 20 | #' volume = \{abs/1103.2903\}, \cr 21 | #' year = \{2011\}, \cr 22 | #' url = \{http://arxiv.org/abs/1103.2903\}, \cr 23 | #' archivePrefix = \{arXiv\}, \cr 24 | #' eprint = \{1103.2903\}, \cr 25 | #' biburl = \{https://dblp.org/rec/bib/journals/corr/abs-1103-2903\}, \cr 26 | #' bibsource = \{dblp computer science bibliography, https://dblp.org\} \cr 27 | #' \} 28 | #' 29 | #' @param dir Character, path to directory where data will be stored. If 30 | #' \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path. 31 | #' @param delete Logical, set \code{TRUE} to delete dataset. 32 | #' @param return_path Logical, set \code{TRUE} to return the path of the dataset. 33 | #' @param clean Logical, set \code{TRUE} to remove intermediate files. This can 34 | #' greatly reduce the size. Defaults to FALSE. 35 | #' @param manual_download Logical, set \code{TRUE} if you have manually 36 | #' downloaded the file and placed it in the folder designated by running 37 | #' this function with \code{return_path = TRUE}. 38 | #' @return A tibble with 2,477 rows and 2 variables: 39 | #' \describe{ 40 | #' \item{word}{An English word} 41 | #' \item{score}{Indicator for sentiment: integer between -5 and +5} 42 | #' } 43 | #' 44 | #' @keywords datasets 45 | #' @family lexicon 46 | #' @importFrom fs file_exists dir_exists dir_create 47 | #' @importFrom readr read_rds 48 | #' @importFrom utils menu 49 | #' @export 50 | #' @examples 51 | #' \dontrun{ 52 | #' lexicon_afinn() 53 | #' 54 | #' # Custom directory 55 | #' lexicon_afinn(dir = "data/") 56 | #' 57 | #' # Deleting dataset 58 | #' lexicon_afinn(delete = TRUE) 59 | #' 60 | #' # Returning filepath of data 61 | #' lexicon_afinn(return_path = TRUE) 62 | #' } 63 | lexicon_afinn <- function(dir = NULL, delete = FALSE, return_path = FALSE, 64 | clean = FALSE, manual_download = FALSE) { 65 | load_dataset( 66 | data_name = "afinn", name = "afinn_111.rds", dir = dir, 67 | delete = delete, return_path = return_path, clean = clean, 68 | manual_download = manual_download 69 | ) 70 | } 71 | 72 | #' @importFrom utils download.file 73 | download_afinn <- function(folder_path) { 74 | file_path <- path(folder_path, "imm6010.zip") 75 | if (file_exists(file_path)) { 76 | return(invisible()) 77 | } 78 | download.file( 79 | url = "http://www2.imm.dtu.dk/pubdb/views/edoc_download.php/6010/zip/imm6010.zip", 80 | destfile = file_path 81 | ) 82 | } 83 | 84 | #' @importFrom readr read_tsv write_rds cols col_character col_double 85 | process_afinn <- function(folder_path, name_path) { 86 | file <- unz(path(folder_path, "imm6010.zip"), "AFINN/AFINN-111.txt") 87 | data <- read_tsv(file, 88 | col_types = cols( 89 | word = col_character(), 90 | value = col_double() 91 | ), 92 | col_names = c("word", "value") 93 | ) 94 | write_rds(data, name_path) 95 | } 96 | -------------------------------------------------------------------------------- /R/lexicon_bing.R: -------------------------------------------------------------------------------- 1 | #' Bing sentiment lexicon 2 | #' 3 | #' General purpose English sentiment lexicon that categorizes words in a 4 | #' binary fashion, either positive or negative 5 | #' 6 | #' Citation info: 7 | #' 8 | #' This dataset was first published in Minqing Hu and Bing Liu, ``Mining and 9 | #' summarizing customer reviews.'', 
Proceedings of the ACM SIGKDD International 10 | #' Conference on Knowledge Discovery & Data Mining (KDD-2004), 2004. 11 | #' 12 | #' inproceedings\{Hu04, \cr 13 | #' author = \{Hu, Minqing and Liu, Bing\}, \cr 14 | #' title = \{Mining and Summarizing Customer Reviews\}, \cr 15 | #' booktitle = \{Proceedings of the Tenth ACM SIGKDD International Conference 16 | #' on Knowledge Discovery and Data Mining\}, \cr 17 | #' series = \{KDD '04\}, \cr 18 | #' year = \{2004\}, \cr 19 | #' isbn = \{1-58113-888-1\}, \cr 20 | #' location = \{Seattle, WA, USA\}, \cr 21 | #' pages = \{168--177\}, \cr 22 | #' numpages = \{10\}, \cr 23 | #' url = \{http://doi.acm.org/10.1145/1014052.1014073\}, \cr 24 | #' doi = \{10.1145/1014052.1014073\}, \cr 25 | #' acmid = \{1014073\}, \cr 26 | #' publisher = \{ACM\}, \cr 27 | #' address = \{New York, NY, USA\}, \cr 28 | #' keywords = \{reviews, sentiment classification, summarization, text mining\}, \cr 29 | #' \} 30 | #' 31 | #' @inheritParams lexicon_afinn 32 | #' @return A tibble with 6,787 rows and 2 variables: 33 | #' \describe{ 34 | #' \item{word}{An English word} 35 | #' \item{sentiment}{Indicator for sentiment: "negative" or "positive"} 36 | #' } 37 | #' 38 | #' @source \url{https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html} 39 | #' @keywords datasets 40 | #' @family lexicon 41 | #' @importFrom fs file_exists dir_exists dir_create 42 | #' @importFrom readr read_rds 43 | #' @importFrom utils menu 44 | #' @export 45 | #' @examples 46 | #' \dontrun{ 47 | #' lexicon_bing() 48 | #' 49 | #' # Custom directory 50 | #' lexicon_bing(dir = "data/") 51 | #' 52 | #' # Deleting dataset 53 | #' lexicon_bing(delete = TRUE) 54 | #' 55 | #' # Returning filepath of data 56 | #' lexicon_bing(return_path = TRUE) 57 | #' } 58 | lexicon_bing <- function(dir = NULL, delete = FALSE, return_path = FALSE, 59 | clean = FALSE, manual_download = FALSE) { 60 | load_dataset( 61 | data_name = "bing", name = "bing.rds", dir = dir, 62 | delete = delete, return_path = return_path, clean = clean, 63 | manual_download = manual_download 64 | ) 65 | } 66 | 67 | 68 | #' @importFrom utils download.file 69 | #' @importFrom fs path 70 | download_bing <- function(folder_path) { 71 | file_path_neg <- path(folder_path, "negative-words.txt") 72 | file_path_pos <- path(folder_path, "positive-words.txt") 73 | 74 | if (file_exists(file_path_pos) & file_exists(file_path_neg)) { 75 | return(invisible()) 76 | } 77 | 78 | download.file( 79 | url = "http://ptrckprry.com/course/ssd/data/negative-words.txt", 80 | destfile = file_path_neg 81 | ) 82 | download.file( 83 | url = "http://ptrckprry.com/course/ssd/data/positive-words.txt", 84 | destfile = file_path_pos 85 | ) 86 | } 87 | 88 | #' @importFrom readr read_lines 89 | process_bing <- function(folder_path, name_path) { 90 | file_path_neg <- path(folder_path, "negative-words.txt") 91 | file_path_pos <- path(folder_path, "positive-words.txt") 92 | 93 | neg_words <- read_lines(file_path_neg, skip = 35) 94 | pos_words <- read_lines(file_path_pos, skip = 35) 95 | 96 | data <- tibble( 97 | word = c(neg_words, pos_words), 98 | sentiment = rep( 99 | c("negative", "positive"), 100 | c(length(neg_words), length(pos_words)) 101 | ) 102 | ) 103 | 104 | write_rds(data, name_path) 105 | } 106 | -------------------------------------------------------------------------------- /R/lexicon_loughran.R: -------------------------------------------------------------------------------- 1 | #' Loughran-McDonald sentiment lexicon 2 | #' 3 | #' English sentiment lexicon created for 
use with financial documents. This 4 | #' lexicon labels words with six possible sentiments important in financial 5 | #' contexts: "negative", "positive", "litigious", "uncertainty", "constraining", 6 | #' or "superfluous". 7 | #' 8 | #' Citation info: 9 | #' 10 | #' This dataset was published in Loughran, T. and McDonald, B. (2011), 11 | #' ``When Is a Liability Not a Liability? Textual Analysis, Dictionaries, and 12 | #' 10-Ks.'' The Journal of Finance, 66: 35-65. 13 | #' 14 | #' article\{loughran11, \cr 15 | #' author = \{Loughran, Tim and McDonald, Bill\}, \cr 16 | #' title = \{When Is a Liability Not a Liability? Textual Analysis, Dictionaries, and 10-Ks\}, \cr 17 | #' journal = \{The Journal of Finance\}, \cr 18 | #' volume = \{66\}, \cr 19 | #' number = \{1\}, \cr 20 | #' pages = \{35-65\}, \cr 21 | #' doi = \{10.1111/j.1540-6261.2010.01625.x\}, \cr 22 | #' url = \{https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1540-6261.2010.01625.x\}, \cr 23 | #' eprint = \{https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1540-6261.2010.01625.x\}, \cr 24 | #' year = \{2011\} \cr 25 | #' \} 26 | #' 27 | #' 28 | #' @inheritParams lexicon_afinn 29 | #' @return A tibble with 4,150 rows and 2 variables: 30 | #' \describe{ 31 | #' \item{word}{An English word} 32 | #' \item{sentiment}{Indicator for sentiment: "negative", "positive", 33 | #' "litigious", "uncertainty", "constraining", or "superfluous"} 34 | #' } 35 | #' 36 | #' @source \url{https://sraf.nd.edu/loughranmcdonald-master-dictionary/} 37 | #' @keywords datasets 38 | #' @family lexicon 39 | #' @importFrom fs file_exists dir_exists dir_create path 40 | #' @importFrom readr read_rds 41 | #' @importFrom utils menu 42 | #' @export 43 | #' @examples 44 | #' \dontrun{ 45 | #' lexicon_loughran() 46 | #' 47 | #' # Custom directory 48 | #' lexicon_loughran(dir = "data/") 49 | #' 50 | #' # Deleting dataset 51 | #' lexicon_loughran(delete = TRUE) 52 | #' 53 | #' # Returning filepath of data 54 | #' lexicon_loughran(return_path = TRUE) 55 | #' } 56 | lexicon_loughran <- function(dir = NULL, delete = FALSE, return_path = FALSE, 57 | clean = FALSE, manual_download = FALSE) { 58 | load_dataset( 59 | data_name = "loughran", name = "LoughranMcDonald.rds", dir = dir, 60 | delete = delete, return_path = return_path, clean = clean, 61 | manual_download = manual_download 62 | ) 63 | } 64 | 65 | #' @importFrom utils download.file 66 | download_loughran <- function(folder_path) { 67 | file_path <- path( 68 | folder_path, 69 | "LoughranMcDonald_MasterDictionary_2018 - LoughranMcDonald_MasterDictionary_2018.csv" 70 | ) 71 | if (file_exists(file_path)) { 72 | return(invisible()) 73 | } 74 | download.file( 75 | url = "https://drive.google.com/uc?id=12ECPJMxV2wSalXG8ykMmkpa1fq_ur0Rf&export=download", 76 | destfile = file_path 77 | ) 78 | } 79 | #' @importFrom readr read_csv cols_only col_character col_double 80 | process_loughran <- function(folder_path, name_path) { 81 | data <- read_csv(path(folder_path, "LoughranMcDonald_MasterDictionary_2018 - LoughranMcDonald_MasterDictionary_2018.csv"), 82 | col_types = cols_only( 83 | Word = col_character(), 84 | Negative = col_double(), 85 | Positive = col_double(), 86 | Uncertainty = col_double(), 87 | Litigious = col_double(), 88 | Constraining = col_double(), 89 | Superfluous = col_double() 90 | ) 91 | ) 92 | 93 | types <- c("Negative", "Positive", "Uncertainty", "Litigious", "Constraining", "Superfluous") 94 | 95 | out <- list() 96 | for (type in types) { 97 | out[[type]] <- tibble( 98 | word = 
tolower(as.character(data$Word[data[[type]] != 0])), 99 | sentiment = tolower(type) 100 | ) 101 | } 102 | 103 | write_rds(Reduce(rbind, out), name_path) 104 | } 105 | -------------------------------------------------------------------------------- /R/lexicon_nrc.R: -------------------------------------------------------------------------------- 1 | #' NRC word-emotion association lexicon 2 | #' 3 | #' General purpose English sentiment/emotion lexicon. This lexicon labels words 4 | #' with six possible sentiments or emotions: "negative", "positive", "anger", 5 | #' "anticipation", "disgust", "fear", "joy", "sadness", "surprise", or "trust". 6 | #' The annotations were manually done through Amazon's Mechanical Turk. 7 | #' 8 | #' License required for commercial use. Please contact Saif M. Mohammad 9 | #' (saif.mohammad@nrc-cnrc.gc.ca). 10 | #' 11 | #' Citation info: 12 | #' 13 | #' This dataset was published in Saif Mohammad and Peter Turney. (2013), 14 | #' ``Crowdsourcing a Word-Emotion Association Lexicon.'' Computational 15 | #' Intelligence, 29(3): 436-465. 16 | #' 17 | #' article\{mohammad13, \cr 18 | #' author = \{Mohammad, Saif M. and Turney, Peter D.\}, \cr 19 | #' title = \{CROWDSOURCING A WORD–EMOTION ASSOCIATION LEXICON\}, \cr 20 | #' journal = \{Computational Intelligence\}, \cr 21 | #' volume = \{29\}, \cr 22 | #' number = \{3\}, \cr 23 | #' pages = \{436-465\}, \cr 24 | #' doi = \{10.1111/j.1467-8640.2012.00460.x\}, \cr 25 | #' url = \{https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-8640.2012.00460.x\}, \cr 26 | #' eprint = \{https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1467-8640.2012.00460.x\}, \cr 27 | #' year = \{2013\} \cr 28 | #' \} 29 | #' 30 | #' 31 | #' 32 | #' @inheritParams lexicon_afinn 33 | #' @return A tibble with 13,901 rows and 2 variables: 34 | #' \describe{ 35 | #' \item{word}{An English word} 36 | #' \item{sentiment}{Indicator for sentiment or emotion: "negative", 37 | #' "positive", "anger", "anticipation", "disgust", "fear", "joy", "sadness", 38 | #' "surprise", or "trust"} 39 | #' } 40 | #' 41 | #' @source \url{http://saifmohammad.com/WebPages/lexicons.html} 42 | #' @keywords datasets 43 | #' @family lexicon 44 | #' @importFrom fs file_exists dir_exists dir_create path 45 | #' @importFrom readr read_rds 46 | #' @importFrom utils menu 47 | #' @export 48 | #' @examples 49 | #' \dontrun{ 50 | #' lexicon_nrc() 51 | #' 52 | #' # Custom directory 53 | #' lexicon_nrc(dir = "data/") 54 | #' 55 | #' # Deleting dataset 56 | #' lexicon_nrc(delete = TRUE) 57 | #' 58 | #' # Returning filepath of data 59 | #' lexicon_nrc(return_path = TRUE) 60 | #' } 61 | lexicon_nrc <- function(dir = NULL, delete = FALSE, return_path = FALSE, 62 | clean = FALSE, manual_download = FALSE) { 63 | load_dataset( 64 | data_name = "nrc", name = "NRCWordEmotion.rds", dir = dir, 65 | delete = delete, return_path = return_path, clean = clean, 66 | manual_download = manual_download 67 | ) 68 | } 69 | 70 | #' @importFrom utils download.file 71 | download_nrc <- function(folder_path) { 72 | file_path <- path( 73 | folder_path, 74 | "NRC-Emotion-Lexicon.zip" 75 | ) 76 | if (file_exists(file_path)) { 77 | return(invisible()) 78 | } 79 | download.file( 80 | url = "http://saifmohammad.com/WebDocs/Lexicons/NRC-Emotion-Lexicon.zip", 81 | destfile = file_path 82 | ) 83 | unzip(path(folder_path, "NRC-Emotion-Lexicon.zip"), 84 | exdir = folder_path 85 | ) 86 | } 87 | 88 | #' @importFrom readr read_tsv 89 | #' @importFrom utils unzip 90 | 91 | process_nrc <- function(folder_path, name_path) { 92 
| data <- read_tsv(path( 93 | folder_path, 94 | "NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt" 95 | ), 96 | col_names = FALSE, col_types = cols( 97 | X1 = col_character(), 98 | X2 = col_character(), 99 | X3 = col_double() 100 | ) 101 | ) 102 | 103 | data <- data[data$X3 == 1, ] 104 | data <- tibble( 105 | word = data$X1, 106 | sentiment = data$X2 107 | ) 108 | 109 | write_rds(data, name_path) 110 | } 111 | -------------------------------------------------------------------------------- /R/lexicon_nrc_eil.R: -------------------------------------------------------------------------------- 1 | #' NRC Emotion Intensity Lexicon (aka Affect Intensity Lexicon) v0.5 2 | #' 3 | #' General purpose English sentiment/emotion lexicon. The NRC Affect Intensity 4 | #' Lexicon is a list of English words and their associations with four basic 5 | #' emotions (anger, fear, sadness, joy). 6 | #' 7 | #' For a given word and emotion X, the scores range from 0 to 1. A score of 1 8 | #' means that the word conveys the highest amount of emotion X. A score of 0 9 | #' means that the word conveys the lowest amount of emotion X. 10 | #' 11 | #' License required for commercial use. Please contact Saif M. Mohammad 12 | #' (saif.mohammad@nrc-cnrc.gc.ca). 13 | #' 14 | #' Citation info: 15 | #' 16 | #' Details of the lexicon are in this paper. 17 | #' Word Affect Intensities. Saif M. Mohammad. In Proceedings of the 11th Edition 18 | #' of the Language Resources and Evaluation Conference (LREC-2018), May 2018, 19 | #' Miyazaki, Japan. 20 | #' 21 | #' inproceedings\{LREC18-AIL, \cr 22 | #' author = \{Mohammad, Saif M.\}, \cr 23 | #' title = \{Word Affect Intensities\}, \cr 24 | #' booktitle = \{Proceedings of the 11th Edition of the Language Resources and Evaluation Conference (LREC-2018)\}, \cr 25 | #' year = \{2018\}, \cr 26 | #' address=\{Miyazaki, Japan\} \cr 27 | #' \} \cr 28 | #' 29 | #' @inheritParams lexicon_afinn 30 | #' @return A tibble with 5.814 rows and 3 variables: 31 | #' \describe{ 32 | #' \item{term}{An English word} 33 | #' \item{score}{Value between 0 and 1} 34 | #' \item{AffectDimension}{Indicator for sentiment or emotion: ("anger", 35 | #' "fear", "sadness", "joy")} 36 | #' } 37 | #' 38 | #' @source \url{https://saifmohammad.com/WebPages/AffectIntensity.htm} 39 | #' @keywords datasets 40 | #' @family lexicon 41 | #' @importFrom fs file_exists dir_exists dir_create path 42 | #' @importFrom readr read_rds 43 | #' @importFrom utils menu 44 | #' @export 45 | #' @examples 46 | #' \dontrun{ 47 | #' lexicon_nrc_eil() 48 | #' 49 | #' # Custom directory 50 | #' lexicon_nrc_eil(dir = "data/") 51 | #' 52 | #' # Deleting dataset 53 | #' lexicon_nrc_eil(delete = TRUE) 54 | #' 55 | #' # Returning filepath of data 56 | #' lexicon_nrc_eil(return_path = TRUE) 57 | #' } 58 | lexicon_nrc_eil <- function(dir = NULL, delete = FALSE, return_path = FALSE, 59 | clean = FALSE, manual_download = FALSE) { 60 | load_dataset( 61 | data_name = "nrc_eil", name = "nrc_eil.rds", dir = dir, 62 | delete = delete, return_path = return_path, clean = clean, 63 | manual_download = manual_download 64 | ) 65 | } 66 | 67 | #' @importFrom utils download.file 68 | download_nrc_eil <- function(folder_path) { 69 | file_path <- path( 70 | folder_path, 71 | "NRC-AffectIntensity-Lexicon.txt" 72 | ) 73 | if (file_exists(file_path)) { 74 | return(invisible()) 75 | } 76 | download.file( 77 | url = "http://saifmohammad.com/WebDocs/NRC-AffectIntensity-Lexicon.txt", 78 | destfile = file_path 79 | ) 80 | } 81 | 82 | #' @importFrom readr 
read_tsv 83 | #' @importFrom utils unzip 84 | 85 | process_nrc_eil <- function(folder_path, name_path) { 86 | data <- read_tsv( 87 | file = path(folder_path, "NRC-AffectIntensity-Lexicon.txt"), 88 | skip = 36, 89 | col_types = cols( 90 | term = col_character(), 91 | score = col_double(), 92 | AffectDimension = col_character() 93 | ) 94 | ) 95 | write_rds(data, name_path) 96 | } 97 | -------------------------------------------------------------------------------- /R/lexicon_nrc_vad.R: -------------------------------------------------------------------------------- 1 | #' The NRC Valence, Arousal, and Dominance Lexicon 2 | #' 3 | #' The NRC Valence, Arousal, and Dominance (VAD) Lexicon includes a list of 4 | #' more than 20,000 English words and their valence, arousal, and dominance 5 | #' scores. For a given word and a dimension (V/A/D), the scores range from 0 6 | #' (lowest V/A/D) to 1 (highest V/A/D). The lexicon with its fine-grained real- 7 | #' valued scores was created by manual annotation using best--worst scaling. 8 | #' The lexicon is markedly larger than any of the existing VAD lexicons. We also 9 | #' show that the ratings obtained are substantially more reliable than those in 10 | #' existing lexicons. 11 | #' 12 | #' License required for commercial use. Please contact Saif M. Mohammad 13 | #' (saif.mohammad@nrc-cnrc.gc.ca). 14 | #' 15 | #' Citation info: 16 | #' 17 | #' Details of the NRC VAD Lexicon are available in this paper: 18 | #' 19 | #' Obtaining Reliable Human Ratings of Valence, Arousal, and Dominance for 20 | #' 20,000 English Words. Saif M. Mohammad. In Proceedings of the 56th Annual 21 | #' Meeting of the Association for Computational Linguistics, Melbourne, 22 | #' Australia, July 2018. 23 | #' 24 | #' inproceedings\{vad-acl2018, \cr 25 | #' title=\{Obtaining Reliable Human Ratings of Valence, Arousal, and Dominance for 20,000 English Words\}, \cr 26 | #' author=\{Mohammad, Saif M.\}, \cr 27 | #' booktitle=\{Proceedings of The Annual Conference of the Association for Computational Linguistics (ACL)\}, \cr 28 | #' year=\{2018\}, \cr 29 | #' address=\{Melbourne, Australia\} \cr 30 | #' \} 31 | #' 32 | #' @inheritParams lexicon_afinn 33 | #' @return A tibble with 20.007 rows and 4 variables: 34 | #' \describe{ 35 | #' \item{word}{An English word} 36 | #' \item{Valence}{valence score of the word} 37 | #' \item{Arousal}{arousal score of the word} 38 | #' \item{Dominance}{dominance score of the word} 39 | #' } 40 | #' 41 | #' @source \url{https://saifmohammad.com/WebPages/nrc-vad.html} 42 | #' @keywords datasets 43 | #' @family lexicon 44 | #' @importFrom fs file_exists dir_exists dir_create path 45 | #' @importFrom readr read_rds 46 | #' @importFrom utils menu 47 | #' @export 48 | #' @examples 49 | #' \dontrun{ 50 | #' lexicon_nrc_vad() 51 | #' 52 | #' # Custom directory 53 | #' lexicon_nrc_vad(dir = "data/") 54 | #' 55 | #' # Deleting dataset 56 | #' lexicon_nrc_vad(delete = TRUE) 57 | #' 58 | #' # Returning filepath of data 59 | #' lexicon_nrc_vad(return_path = TRUE) 60 | #' } 61 | lexicon_nrc_vad <- function(dir = NULL, delete = FALSE, return_path = FALSE, 62 | clean = FALSE, manual_download = FALSE) { 63 | load_dataset( 64 | data_name = "nrc_vad", name = "nrc_vad.rds", dir = dir, 65 | delete = delete, return_path = return_path, clean = clean, 66 | manual_download = manual_download 67 | ) 68 | } 69 | 70 | #' @importFrom utils download.file 71 | download_nrc_vad <- function(folder_path) { 72 | file_path <- path( 73 | folder_path, 74 | 
"NRC-VAD-Lexicon-Aug2018Release.zip" 75 | ) 76 | if (file_exists(file_path)) { 77 | return(invisible()) 78 | } 79 | download.file( 80 | url = "http://saifmohammad.com/WebDocs/VAD/NRC-VAD-Lexicon-Aug2018Release.zip", 81 | destfile = file_path 82 | ) 83 | unzip(path(folder_path, "NRC-VAD-Lexicon-Aug2018Release.zip"), 84 | exdir = folder_path 85 | ) 86 | } 87 | 88 | #' @importFrom readr read_tsv 89 | #' @importFrom utils unzip 90 | 91 | process_nrc_vad <- function(folder_path, name_path) { 92 | data <- read_tsv(path( 93 | folder_path, 94 | "NRC-VAD-Lexicon-Aug2018Release/NRC-VAD-Lexicon.txt" 95 | ), 96 | col_names = FALSE, 97 | show_col_types = FALSE) 98 | data <- stats::setNames(data, c("Word", "Valence", "Arousal", "Dominance")) 99 | 100 | write_rds(data, name_path) 101 | } 102 | -------------------------------------------------------------------------------- /R/load_dataset.R: -------------------------------------------------------------------------------- 1 | #' Internal Functions 2 | #' 3 | #' These are not to be used directly by the users. 4 | #' @export 5 | #' @importFrom fs dir_delete path file_delete 6 | #' @keywords internal 7 | load_dataset <- function(data_name, name, dir, delete, return_path, clean, 8 | clean_manual = NULL, manual_download) { 9 | dir <- ifelse(is.null(dir), rappdirs::user_cache_dir("textdata"), dir) 10 | 11 | name_path <- path(dir, data_name, name) 12 | folder_path <- path(dir, data_name) 13 | 14 | if (!manual_download) { 15 | if (return_path) { 16 | return(folder_path) 17 | } 18 | 19 | if (delete) { 20 | dir_delete(folder_path) 21 | return(invisible()) 22 | } 23 | 24 | if (file_exists(name_path)) { 25 | return(read_rds(name_path)) 26 | } 27 | 28 | if (printer(data_name) == 2) { 29 | return(invisible()) 30 | } 31 | 32 | if (!dir_exists(folder_path)) { 33 | dir_create(folder_path) 34 | } 35 | 36 | download_functions[[data_name]](folder_path) 37 | } 38 | 39 | process_functions[[data_name]](folder_path, name_path) 40 | 41 | if (clean) { 42 | if (!is.null(clean_manual)) { 43 | intermediate_files <- setdiff( 44 | dir_ls(folder_path), 45 | path(folder_path, clean_manual) 46 | ) 47 | } else { 48 | intermediate_files <- setdiff(dir_ls(folder_path), name_path) 49 | } 50 | file_delete(intermediate_files) 51 | } 52 | 53 | read_rds(name_path) 54 | } 55 | -------------------------------------------------------------------------------- /R/printer.R: -------------------------------------------------------------------------------- 1 | #' Internal Functions 2 | #' 3 | #' These are not to be used directly by the users. 
4 | #' @keywords internal 5 | #' @noRd 6 | printer <- function(name) { 7 | title <- cat( 8 | "Do you want to download:\n", 9 | "Name:", print_info[[name]][["name"]], "\n", 10 | "URL:", print_info[[name]][["url"]], "\n", 11 | "License:", print_info[[name]][["license"]], "\n", 12 | "Size:", print_info[[name]][["size"]], "\n", 13 | "Download mechanism:", print_info[[name]][["download_mech"]], "\n" 14 | ) 15 | 16 | if (!is.na(print_info[[name]][["citation"]])) { 17 | title <- cat( 18 | title, 19 | print_info[[name]][["citation"]], "\n" 20 | ) 21 | } 22 | 23 | menu(choices = c("Yes", "No"), title = title) 24 | } 25 | -------------------------------------------------------------------------------- /R/process_functions.R: -------------------------------------------------------------------------------- 1 | #' List of all process functions used in load_dataset 2 | #' 3 | #' @format Named list of all process functions 4 | #' @include download_functions.R 5 | #' 6 | #' @name process_functions 7 | #' @noRd 8 | NULL 9 | 10 | process_functions <- list( 11 | afinn = process_afinn, 12 | sentence_polarity = process_sentence_polarity, 13 | loughran = process_loughran, 14 | bing = process_bing, 15 | nrc = process_nrc, 16 | nrc_eil = process_nrc_eil, 17 | nrc_vad = process_nrc_vad, 18 | ag_news = process_ag_news, 19 | dbpedia = process_dbpedia, 20 | trec = process_trec, 21 | imdb = process_imdb, 22 | glove6b = process_glove6b, 23 | glove27b = process_glove27b, 24 | glove42b = process_glove42b, 25 | glove840b = process_glove840b 26 | ) 27 | -------------------------------------------------------------------------------- /R/textdata-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | ## usethis namespace: start 5 | ## usethis namespace: end 6 | NULL 7 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-" 12 | ) 13 | ``` 14 | 15 | # textdata 16 | 17 | 18 | [![R-CMD-check](https://github.com/EmilHvitfeldt/textdata/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/EmilHvitfeldt/textdata/actions/workflows/R-CMD-check.yaml) 19 | [![CRAN status](https://www.r-pkg.org/badges/version/textdata)](https://CRAN.R-project.org/package=textdata) 20 | [![Downloads](http://cranlogs.r-pkg.org/badges/textdata)](https://cran.r-project.org/package=textdata) 21 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3244433.svg)](https://doi.org/10.5281/zenodo.3244433) 22 | [![Codecov test coverage](https://codecov.io/gh/EmilHvitfeldt/textdata/branch/main/graph/badge.svg)](https://app.codecov.io/gh/EmilHvitfeldt/textdata?branch=main) 23 | [![Lifecycle: stable](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://lifecycle.r-lib.org/articles/stages.html) 24 | 25 | 26 | The goal of textdata is to provide easy access to text-related data sets without bundling them inside a package. Some text datasets are too large to store within an R package or are licensed in such a way that prevents them from being included in an OSS-licensed package. Instead, this package provides a framework to download, parse, and store the datasets on the disk and load them when needed. 
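As a quick illustration, the basic workflow looks roughly like this (a minimal sketch; every function shown is exported by textdata, and the first `lexicon_afinn()` call asks for confirmation interactively before downloading anything):

``` r
library(textdata)

# Browse every lexicon, dataset, and embedding that textdata can fetch
catalogue[, c("name", "type", "size")]

# First call: confirm the license terms, download, and cache the lexicon.
# Later calls simply read the cached copy from disk.
afinn <- lexicon_afinn()

# See which folders are in the cache and how much space they use
cache_info()
```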
27 | 28 | ## Installation 29 | 30 | You can install the not yet released version of textdata from [CRAN](https://CRAN.R-project.org) with: 31 | 32 | ``` r 33 | install.packages("textdata") 34 | ``` 35 | 36 | And the development version from [GitHub](https://github.com/) with: 37 | 38 | ``` r 39 | # install.packages("remotes") 40 | remotes::install_github("EmilHvitfeldt/textdata") 41 | ``` 42 | ## Example 43 | 44 | The first time you use one of the functions for accessing an included text dataset, such as `lexicon_afinn()` or `dataset_sentence_polarity()`, the function will prompt you to agree that you understand the dataset's license or terms of use and then download the dataset to your computer. 45 | 46 | ![](man/figures/textdata_demo.gif) 47 | 48 | After the first use, each time you use a function like `lexicon_afinn()`, the function will load the dataset from disk. 49 | 50 | ## Included text datasets 51 | 52 | As of today, the datasets included in textdata are: 53 | 54 | | Dataset | Function | 55 | | --------------------------------------------------------------- | ----------------------------- | 56 | | v1.0 sentence polarity dataset | `dataset_sentence_polarity()` | 57 | | AFINN-111 sentiment lexicon | `lexicon_afinn()` | 58 | | Hu and Liu's opinion lexicon | `lexicon_bing()` | 59 | | NRC word-emotion association lexicon | `lexicon_nrc()` | 60 | | NRC Emotion Intensity Lexicon | `lexicon_nrc_eil()` | 61 | | The NRC Valence, Arousal, and Dominance Lexicon | `lexicon_nrc_vad()` | 62 | | Loughran and McDonald's opinion lexicon for financial documents | `lexicon_loughran()` | 63 | | AG's News | `dataset_ag_news()` | 64 | | DBpedia ontology | `dataset_dbpedia()` | 65 | | Trec-6 and Trec-50 | `dataset_trec()` | 66 | | IMDb Large Movie Review Dataset | `dataset_imdb()` | 67 | | Stanford NLP GloVe pre-trained word vectors | `embedding_glove6b()` | 68 | | | `embedding_glove27b()` | 69 | | | `embedding_glove42b()` | 70 | | | `embedding_glove840b()` | 71 | 72 | Check out each function's documentation for detailed information (including citations) for the relevant dataset. 73 | 74 | ## Community Guidelines 75 | 76 | Note that this project is released with a 77 | [Contributor Code of Conduct](https://github.com/EmilHvitfeldt/textdata/blob/main/CODE_OF_CONDUCT.md). 78 | By contributing to this project, you agree to abide by its terms. 79 | Feedback, bug reports (and fixes!), and feature requests are welcome; file 80 | issues or seek support [here](https://github.com/EmilHvitfeldt/textdata/issues). 81 | For details on how to add a new dataset to this package, check out the vignette! 
82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # textdata 5 | 6 | 7 | 8 | [![R-CMD-check](https://github.com/EmilHvitfeldt/textdata/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/EmilHvitfeldt/textdata/actions/workflows/R-CMD-check.yaml) 9 | [![CRAN 10 | status](https://www.r-pkg.org/badges/version/textdata)](https://CRAN.R-project.org/package=textdata) 11 | [![Downloads](http://cranlogs.r-pkg.org/badges/textdata)](https://cran.r-project.org/package=textdata) 12 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3244433.svg)](https://doi.org/10.5281/zenodo.3244433) 13 | [![Codecov test 14 | coverage](https://codecov.io/gh/EmilHvitfeldt/textdata/branch/main/graph/badge.svg)](https://app.codecov.io/gh/EmilHvitfeldt/textdata?branch=main) 15 | [![Lifecycle: 16 | stable](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://lifecycle.r-lib.org/articles/stages.html) 17 | 18 | 19 | The goal of textdata is to provide easy access to text-related data 20 | sets without bundling them inside a package. Some text datasets 21 | are too large to store within an R package or are licensed in such a way 22 | that prevents them from being included in an OSS-licensed package. 23 | Instead, this package provides a framework to download, parse, and store 24 | the datasets on the disk and load them when needed. 25 | 26 | ## Installation 27 | 28 | You can install the released version of textdata from 29 | [CRAN](https://CRAN.R-project.org) with: 30 | 31 | ``` r 32 | install.packages("textdata") 33 | ``` 34 | 35 | And the development version from [GitHub](https://github.com/) with: 36 | 37 | ``` r 38 | # install.packages("remotes") 39 | remotes::install_github("EmilHvitfeldt/textdata") 40 | ``` 41 | 42 | ## Example 43 | 44 | The first time you use one of the functions for accessing an included 45 | text dataset, such as `lexicon_afinn()` or 46 | `dataset_sentence_polarity()`, the function will prompt you to agree 47 | that you understand the dataset’s license or terms of use and then 48 | download the dataset to your computer. 49 | 50 | ![](man/figures/textdata_demo.gif) 51 | 52 | After the first use, each time you use a function like 53 | `lexicon_afinn()`, the function will load the dataset from disk. 
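In code, the pattern looks like the following sketch. It uses `lexicon_afinn()`, but the same caching arguments are available on the other `lexicon_*()`, `dataset_*()`, and `embedding_*()` functions:

``` r
library(textdata)

# First call: prompts for agreement, downloads, and caches the lexicon.
# Subsequent calls read the cached copy from disk.
afinn <- lexicon_afinn()

# Store the data somewhere other than the default cache directory
afinn <- lexicon_afinn(dir = "data/")

# Locate the folder the data lives in (useful with manual_download = TRUE)
lexicon_afinn(return_path = TRUE)

# Delete the cached copy when you no longer need it
lexicon_afinn(delete = TRUE)
```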
54 | 55 | ## Included text datasets 56 | 57 | As of today, the datasets included in textdata are: 58 | 59 | | Dataset | Function | 60 | |-----------------------------------------------------------------|-------------------------------| 61 | | v1.0 sentence polarity dataset | `dataset_sentence_polarity()` | 62 | | AFINN-111 sentiment lexicon | `lexicon_afinn()` | 63 | | Hu and Liu’s opinion lexicon | `lexicon_bing()` | 64 | | NRC word-emotion association lexicon | `lexicon_nrc()` | 65 | | NRC Emotion Intensity Lexicon | `lexicon_nrc_eil()` | 66 | | The NRC Valence, Arousal, and Dominance Lexicon | `lexicon_nrc_vad()` | 67 | | Loughran and McDonald’s opinion lexicon for financial documents | `lexicon_loughran()` | 68 | | AG’s News | `dataset_ag_news()` | 69 | | DBpedia ontology | `dataset_dbpedia()` | 70 | | Trec-6 and Trec-50 | `dataset_trec()` | 71 | | IMDb Large Movie Review Dataset | `dataset_imdb()` | 72 | | Stanford NLP GloVe pre-trained word vectors | `embedding_glove6b()` | 73 | | | `embedding_glove27b()` | 74 | | | `embedding_glove42b()` | 75 | | | `embedding_glove840b()` | 76 | 77 | Check out each function’s documentation for detailed information 78 | (including citations) for the relevant dataset. 79 | 80 | ## Community Guidelines 81 | 82 | Note that this project is released with a [Contributor Code of 83 | Conduct](https://github.com/EmilHvitfeldt/textdata/blob/main/CODE_OF_CONDUCT.md). 84 | By contributing to this project, you agree to abide by its terms. 85 | Feedback, bug reports (and fixes!), and feature requests are welcome; 86 | file issues or seek support 87 | [here](https://github.com/EmilHvitfeldt/textdata/issues). For details on 88 | how to add a new dataset to this package, check out the vignette! 89 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | template: 2 | bootstrap: 5 3 | 4 | development: 5 | mode: auto 6 | 7 | reference: 8 | - title: Lexicons 9 | contents: 10 | - lexicon_afinn 11 | - lexicon_bing 12 | - lexicon_nrc 13 | - lexicon_nrc_eil 14 | - lexicon_nrc_vad 15 | - lexicon_loughran 16 | - title: Data Sets 17 | contents: 18 | - dataset_sentence_polarity 19 | - dataset_ag_news 20 | - dataset_dbpedia 21 | - dataset_trec 22 | - dataset_imdb 23 | - title: Embeddings 24 | contents: 25 | - embedding_glove6b 26 | - embedding_glove27b 27 | - embedding_glove42b 28 | - embedding_glove840b 29 | - title: Other 30 | contents: 31 | - catalogue 32 | - cache_info 33 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | patch: 10 | default: 11 | target: auto 12 | threshold: 1% 13 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Release Summary 2 | 3 | This is the 7th CRAN release of textdata. Fixes a bug that produces data with no column names. 4 | 5 | ## R CMD check results 6 | 7 | 0 errors | 0 warnings | 0 note 8 | 9 | ## Downstream dependencies 10 | 11 | I ran R CMD check on the 3 downstream dependencies and there were no problems related to textdata. 
12 | -------------------------------------------------------------------------------- /man/cache_info.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cache_info.R 3 | \name{cache_info} 4 | \alias{cache_info} 5 | \title{List folders and their sizes in cache} 6 | \usage{ 7 | cache_info(dir = NULL) 8 | } 9 | \arguments{ 10 | \item{dir}{Character, path to directory where data will be stored. If 11 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 12 | } 13 | \value{ 14 | A tibble with 2 variables: 15 | \describe{ 16 | \item{name}{Name of the folder} 17 | \item{size}{Size of the folder} 18 | } 19 | } 20 | \description{ 21 | This function will return a tibble with the name and sizes of all folder in 22 | specified directory. Will default to textdata's default cache. 23 | } 24 | \examples{ 25 | \dontrun{ 26 | cache_info() 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /man/catalogue.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/info.R 3 | \docType{data} 4 | \name{catalogue} 5 | \alias{catalogue} 6 | \title{Catalogue of all available data sources} 7 | \format{ 8 | An object of class \code{data.frame} with 15 rows and 8 columns. 9 | } 10 | \usage{ 11 | catalogue 12 | } 13 | \description{ 14 | Catalogue of all available data sources 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /man/dataset_ag_news.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataset_ag_news.R 3 | \name{dataset_ag_news} 4 | \alias{dataset_ag_news} 5 | \title{AG's News Topic Classification Dataset} 6 | \source{ 7 | \url{http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html} 8 | 9 | \url{https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz} 10 | } 11 | \usage{ 12 | dataset_ag_news( 13 | dir = NULL, 14 | split = c("train", "test"), 15 | delete = FALSE, 16 | return_path = FALSE, 17 | clean = FALSE, 18 | manual_download = FALSE 19 | ) 20 | } 21 | \arguments{ 22 | \item{dir}{Character, path to directory where data will be stored. If 23 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 24 | 25 | \item{split}{Character. Return training ("train") data or testing ("test") 26 | data. Defaults to "train".} 27 | 28 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 29 | 30 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 31 | 32 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 33 | greatly reduce the size. 
Defaults to FALSE.} 34 | 35 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 36 | downloaded the file and placed it in the folder designated by running 37 | this function with \code{return_path = TRUE}.} 38 | } 39 | \value{ 40 | A tibble with 120,000 or 7,600 rows for "train" and "test" 41 | respectively and 3 variables: 42 | \describe{ 43 | \item{class}{Character, denoting the news class} 44 | \item{title}{Character, title of article} 45 | \item{description}{Character, description of article} 46 | } 47 | } 48 | \description{ 49 | The AG's news topic classification dataset is constructed by choosing 4 50 | largest classes from the original corpus. Each class contains 30,000 training 51 | samples and 1,900 testing samples. The total number of training samples is 52 | 120,000 and testing 7,600. 53 | Version 3, Updated 09/09/2015 54 | } 55 | \details{ 56 | The classes in this dataset are 57 | 58 | \itemize{ 59 | \item World 60 | \item Sports 61 | \item Business 62 | \item Sci/Tech 63 | } 64 | } 65 | \examples{ 66 | \dontrun{ 67 | dataset_ag_news() 68 | 69 | # Custom directory 70 | dataset_ag_news(dir = "data/") 71 | 72 | # Deleting dataset 73 | dataset_ag_news(delete = TRUE) 74 | 75 | # Returning filepath of data 76 | dataset_ag_news(return_path = TRUE) 77 | 78 | # Access both training and testing dataset 79 | train <- dataset_ag_news(split = "train") 80 | test <- dataset_ag_news(split = "test") 81 | } 82 | 83 | } 84 | \seealso{ 85 | Other topic: 86 | \code{\link{dataset_dbpedia}()}, 87 | \code{\link{dataset_trec}()} 88 | } 89 | \concept{topic} 90 | \keyword{datasets} 91 | -------------------------------------------------------------------------------- /man/dataset_dbpedia.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataset_dbpedia.R 3 | \name{dataset_dbpedia} 4 | \alias{dataset_dbpedia} 5 | \title{DBpedia Ontology Dataset} 6 | \source{ 7 | \url{https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf} 8 | 9 | \url{https://www.dbpedia.org/} 10 | 11 | \url{https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz} 12 | } 13 | \usage{ 14 | dataset_dbpedia( 15 | dir = NULL, 16 | split = c("train", "test"), 17 | delete = FALSE, 18 | return_path = FALSE, 19 | clean = FALSE, 20 | manual_download = FALSE 21 | ) 22 | } 23 | \arguments{ 24 | \item{dir}{Character, path to directory where data will be stored. If 25 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 26 | 27 | \item{split}{Character. Return training ("train") data or testing ("test") 28 | data. Defaults to "train".} 29 | 30 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 31 | 32 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 33 | 34 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 35 | greatly reduce the size. 
Defaults to FALSE.} 36 | 37 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 38 | downloaded the file and placed it in the folder designated by running 39 | this function with \code{return_path = TRUE}.} 40 | } 41 | \value{ 42 | A tibble with 560,000 or 70,000 rows for "train" and "test" 43 | respectively and 3 variables: 44 | \describe{ 45 | \item{class}{Character, denoting the class} 46 | \item{title}{Character, title of article} 47 | \item{description}{Character, description of article} 48 | } 49 | } 50 | \description{ 51 | DBpedia ontology classification dataset. It contains 560,000 training 52 | samples and 70,000 testing samples across 14 nonoverlapping classes 53 | from DBpedia. 54 | } 55 | \details{ 56 | The classes are 57 | 58 | \itemize{ 59 | \item Company 60 | \item EducationalInstitution 61 | \item Artist 62 | \item Athlete 63 | \item OfficeHolder 64 | \item MeanOfTransportation 65 | \item Building 66 | \item NaturalPlace 67 | \item Village 68 | \item Animal 69 | \item Plant 70 | \item Album 71 | \item Film 72 | \item WrittenWork 73 | } 74 | } 75 | \examples{ 76 | \dontrun{ 77 | dataset_dbpedia() 78 | 79 | # Custom directory 80 | dataset_dbpedia(dir = "data/") 81 | 82 | # Deleting dataset 83 | dataset_dbpedia(delete = TRUE) 84 | 85 | # Returning filepath of data 86 | dataset_dbpedia(return_path = TRUE) 87 | 88 | # Access both training and testing dataset 89 | train <- dataset_dbpedia(split = "train") 90 | test <- dataset_dbpedia(split = "test") 91 | } 92 | 93 | } 94 | \seealso{ 95 | Other topic: 96 | \code{\link{dataset_ag_news}()}, 97 | \code{\link{dataset_trec}()} 98 | } 99 | \concept{topic} 100 | \keyword{datasets} 101 | -------------------------------------------------------------------------------- /man/dataset_imdb.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataset_imdb.R 3 | \name{dataset_imdb} 4 | \alias{dataset_imdb} 5 | \title{IMDB Large Movie Review Dataset} 6 | \source{ 7 | \url{http://ai.stanford.edu/~amaas/data/sentiment/} 8 | } 9 | \usage{ 10 | dataset_imdb( 11 | dir = NULL, 12 | split = c("train", "test"), 13 | delete = FALSE, 14 | return_path = FALSE, 15 | clean = FALSE, 16 | manual_download = FALSE 17 | ) 18 | } 19 | \arguments{ 20 | \item{dir}{Character, path to directory where data will be stored. If 21 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 22 | 23 | \item{split}{Character. Return training ("train") data or testing ("test") 24 | data. Defaults to "train".} 25 | 26 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 27 | 28 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 29 | 30 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 31 | greatly reduce the size. Defaults to FALSE.} 32 | 33 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 34 | downloaded the file and placed it in the folder designated by running 35 | this function with \code{return_path = TRUE}.} 36 | } 37 | \value{ 38 | A tibble with 25,000 rows and 2 variables: 39 | \describe{ 40 | \item{Sentiment}{Character, denoting the sentiment} 41 | \item{text}{Character, text of the review} 42 | } 43 | } 44 | \description{ 45 | The core dataset contains 50,000 reviews split evenly into 25k train and 46 | 25k test sets. The overall distribution of labels is balanced (25k pos and 47 | 25k neg). 
48 | } 49 | \details{ 50 | In the entire collection, no more than 30 reviews are allowed for any 51 | given movie because reviews for the same movie tend to have correlated 52 | ratings. Further, the train and test sets contain a disjoint set of 53 | movies, so no significant performance is obtained by memorizing 54 | movie-unique terms and their associated with observed labels. In the 55 | labeled train/test sets, a negative review has a score <= 4 out of 10, 56 | and a positive review has a score >= 7 out of 10. Thus reviews with 57 | more neutral ratings are not included in the train/test sets. In the 58 | unsupervised set, reviews of any rating are included and there are an 59 | even number of reviews > 5 and <= 5. 60 | 61 | When using this dataset, please cite the ACL 2011 paper 62 | 63 | InProceedings\{maas-EtAl:2011:ACL-HLT2011, \cr 64 | author = \{Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher\}, \cr 65 | title = \{Learning Word Vectors for Sentiment Analysis\}, \cr 66 | booktitle = \{Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies\}, \cr 67 | month = \{June\}, \cr 68 | year = \{2011\}, \cr 69 | address = \{Portland, Oregon, USA\}, \cr 70 | publisher = \{Association for Computational Linguistics\}, \cr 71 | pages = \{142--150\}, \cr 72 | url = \{http://www.aclweb.org/anthology/P11-1015\} 73 | \} 74 | } 75 | \examples{ 76 | \dontrun{ 77 | dataset_imdb() 78 | 79 | # Custom directory 80 | dataset_imdb(dir = "data/") 81 | 82 | # Deleting dataset 83 | dataset_imdb(delete = TRUE) 84 | 85 | # Returning filepath of data 86 | dataset_imdb(return_path = TRUE) 87 | 88 | # Access both training and testing dataset 89 | train <- dataset_imdb(split = "train") 90 | test <- dataset_imdb(split = "test") 91 | } 92 | 93 | } 94 | \concept{topic sentiment} 95 | \keyword{datasets} 96 | -------------------------------------------------------------------------------- /man/dataset_sentence_polarity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataset_sentence_polarity.R 3 | \name{dataset_sentence_polarity} 4 | \alias{dataset_sentence_polarity} 5 | \title{v1.0 sentence polarity dataset} 6 | \source{ 7 | \url{https://www.cs.cornell.edu/people/pabo/movie-review-data/} 8 | } 9 | \usage{ 10 | dataset_sentence_polarity( 11 | dir = NULL, 12 | delete = FALSE, 13 | return_path = FALSE, 14 | clean = FALSE, 15 | manual_download = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{dir}{Character, path to directory where data will be stored. If 20 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 21 | 22 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 23 | 24 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 25 | 26 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 27 | greatly reduce the size. 
Defaults to FALSE.} 28 | 29 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 30 | downloaded the file and placed it in the folder designated by running 31 | this function with \code{return_path = TRUE}.} 32 | } 33 | \value{ 34 | A tibble with 10,662 rows and 2 variables: 35 | \describe{ 36 | \item{text}{Sentences or snippets} 37 | \item{sentiment}{Indicator for sentiment, "neg" for negative and "pos" 38 | for positive} 39 | } 40 | } 41 | \description{ 42 | 5331 positive and 5331 negative processed sentences / snippets. 43 | Introduced in Pang/Lee ACL 2005. Released July 2005. 44 | } 45 | \details{ 46 | Citation info: 47 | 48 | This data was first used in Bo Pang and Lillian Lee, 49 | ``Seeing stars: Exploiting class relationships for sentiment categorization 50 | with respect to rating scales.'', Proceedings of the ACL, 2005. 51 | 52 | InProceedings\{pang05, \cr 53 | author = \{Bo Pang and Lillian Lee\}, \cr 54 | title = \{Seeing stars: Exploiting class relationships for sentiment \cr 55 | categorization with respect to rating scales\}, \cr 56 | booktitle = \{Proceedings of the ACL\}, \cr 57 | year = 2005 \cr 58 | \} 59 | } 60 | \examples{ 61 | \dontrun{ 62 | dataset_sentence_polarity() 63 | 64 | # Custom directory 65 | dataset_sentence_polarity(dir = "data/") 66 | 67 | # Deleting dataset 68 | dataset_sentence_polarity(delete = TRUE) 69 | 70 | # Returning filepath of data 71 | dataset_sentence_polarity(return_path = TRUE) 72 | } 73 | 74 | } 75 | \concept{sentiment} 76 | \keyword{datasets} 77 | -------------------------------------------------------------------------------- /man/dataset_trec.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataset_trec.R 3 | \name{dataset_trec} 4 | \alias{dataset_trec} 5 | \title{TREC dataset} 6 | \source{ 7 | \url{https://cogcomp.seas.upenn.edu/Data/QA/QC/} 8 | 9 | \url{https://trec.nist.gov/data/qa.html} 10 | } 11 | \usage{ 12 | dataset_trec( 13 | dir = NULL, 14 | split = c("train", "test"), 15 | version = c("6", "50"), 16 | delete = FALSE, 17 | return_path = FALSE, 18 | clean = FALSE, 19 | manual_download = FALSE 20 | ) 21 | } 22 | \arguments{ 23 | \item{dir}{Character, path to directory where data will be stored. If 24 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 25 | 26 | \item{split}{Character. Return training ("train") data or testing ("test") 27 | data. Defaults to "train".} 28 | 29 | \item{version}{Character. Version 6("6") or version 50("50"). Defaults to 30 | "6".} 31 | 32 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 33 | 34 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 35 | 36 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 37 | greatly reduce the size. Defaults to FALSE.} 38 | 39 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 40 | downloaded the file and placed it in the folder designated by running 41 | this function with \code{return_path = TRUE}.} 42 | } 43 | \value{ 44 | A tibble with 5,452 or 500 rows for "train" and "test" 45 | respectively and 2 variables: 46 | \describe{ 47 | \item{class}{Character, denoting the class} 48 | \item{text}{Character, question text} 49 | } 50 | } 51 | \description{ 52 | The TREC dataset is dataset for question classification consisting of 53 | open-domain, fact-based questions divided into broad semantic categories. 
54 | It has both a six-class (TREC-6) and a fifty-class (TREC-50) version. Both 55 | have 5,452 training examples and 500 test examples, but TREC-50 has 56 | finer-grained labels. Models are evaluated based on accuracy. 57 | } 58 | \details{ 59 | The classes in TREC-6 are 60 | 61 | \itemize{ 62 | \item ABBR - Abbreviation 63 | \item DESC - Description and abstract concepts 64 | \item ENTY - Entities 65 | \item HUM - Human beings 66 | \item LOC - Locations 67 | \item NYM - Numeric values 68 | } 69 | 70 | the classes in TREC-50 can be found here 71 | \url{https://cogcomp.seas.upenn.edu/Data/QA/QC/definition.html}. 72 | } 73 | \examples{ 74 | \dontrun{ 75 | dataset_trec() 76 | 77 | # Custom directory 78 | dataset_trec(dir = "data/") 79 | 80 | # Deleting dataset 81 | dataset_trec(delete = TRUE) 82 | 83 | # Returning filepath of data 84 | dataset_trec(return_path = TRUE) 85 | 86 | # Access both training and testing dataset 87 | train_6 <- dataset_trec(split = "train") 88 | test_6 <- dataset_trec(split = "test") 89 | 90 | train_50 <- dataset_trec(split = "train", version = "50") 91 | test_50 <- dataset_trec(split = "test", version = "50") 92 | } 93 | 94 | } 95 | \seealso{ 96 | Other topic: 97 | \code{\link{dataset_ag_news}()}, 98 | \code{\link{dataset_dbpedia}()} 99 | } 100 | \concept{topic} 101 | \keyword{datasets} 102 | -------------------------------------------------------------------------------- /man/embedding_glove.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/embedding_glove.R 3 | \name{embedding_glove} 4 | \alias{embedding_glove} 5 | \alias{embedding_glove6b} 6 | \alias{embedding_glove27b} 7 | \alias{embedding_glove42b} 8 | \alias{embedding_glove840b} 9 | \title{Global Vectors for Word Representation} 10 | \source{ 11 | \url{https://nlp.stanford.edu/projects/glove/} 12 | } 13 | \usage{ 14 | embedding_glove6b( 15 | dir = NULL, 16 | dimensions = c(50, 100, 200, 300), 17 | delete = FALSE, 18 | return_path = FALSE, 19 | clean = FALSE, 20 | manual_download = FALSE 21 | ) 22 | 23 | embedding_glove27b( 24 | dir = NULL, 25 | dimensions = c(25, 50, 100, 200), 26 | delete = FALSE, 27 | return_path = FALSE, 28 | clean = FALSE, 29 | manual_download = FALSE 30 | ) 31 | 32 | embedding_glove42b( 33 | dir = NULL, 34 | delete = FALSE, 35 | return_path = FALSE, 36 | clean = FALSE, 37 | manual_download = FALSE 38 | ) 39 | 40 | embedding_glove840b( 41 | dir = NULL, 42 | delete = FALSE, 43 | return_path = FALSE, 44 | clean = FALSE, 45 | manual_download = FALSE 46 | ) 47 | } 48 | \arguments{ 49 | \item{dir}{Character, path to directory where data will be stored. If 50 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 51 | 52 | \item{dimensions}{A number indicating the number of vectors to include. One 53 | of 50, 100, 200, or 300 for glove6b, or one of 25, 50, 100, or 200 for 54 | glove27b.} 55 | 56 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 57 | 58 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 59 | 60 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 61 | greatly reduce the size. 
Defaults to FALSE.} 62 | 63 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 64 | downloaded the file and placed it in the folder designated by running 65 | this function with \code{return_path = TRUE}.} 66 | } 67 | \value{ 68 | A tibble with 400k, 1.9m, 2.2m, or 1.2m rows (one row for each unique 69 | token in the vocabulary) and the following variables: 70 | \describe{ 71 | \item{token}{An individual token (usually a word)} 72 | \item{d1, d2, etc}{The embeddings for that token.} 73 | } 74 | } 75 | \description{ 76 | The GloVe pre-trained word vectors provide word embeddings created using 77 | varying numbers of tokens. 78 | } 79 | \details{ 80 | Citation info: 81 | 82 | InProceedings\{pennington2014glove, \cr 83 | author = \{Jeffrey Pennington and Richard Socher and Christopher D. \cr 84 | Manning\}, \cr 85 | title = \{GloVe: Global Vectors for Word Representation\}, \cr 86 | booktitle = \{Empirical Methods in Natural Language Processing (EMNLP)\}, \cr 87 | year = 2014 \cr 88 | pages = \{1532-1543\} \cr 89 | url = \{http://www.aclweb.org/anthology/D14-1162\} \cr 90 | \} 91 | } 92 | \examples{ 93 | \dontrun{ 94 | embedding_glove6b(dimensions = 50) 95 | 96 | # Custom directory 97 | embedding_glove42b(dir = "data/") 98 | 99 | # Deleting dataset 100 | embedding_glove6b(delete = TRUE, dimensions = 300) 101 | 102 | # Returning filepath of data 103 | embedding_glove840b(return_path = TRUE) 104 | } 105 | } 106 | \references{ 107 | Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 108 | 2014. GloVe: Global Vectors for Word Representation. 109 | } 110 | \concept{embeddings} 111 | \keyword{datasets} 112 | -------------------------------------------------------------------------------- /man/figures/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/man/figures/.DS_Store -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/man/figures/logo.png -------------------------------------------------------------------------------- /man/figures/screen-shot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/man/figures/screen-shot.png -------------------------------------------------------------------------------- /man/figures/textdata_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/man/figures/textdata_demo.gif -------------------------------------------------------------------------------- /man/lexicon_afinn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lexicon_afinn.R 3 | \name{lexicon_afinn} 4 | \alias{lexicon_afinn} 5 | \title{AFINN-111 dataset} 6 | \usage{ 7 | lexicon_afinn( 8 | dir = NULL, 9 | delete = FALSE, 10 | return_path = FALSE, 11 | clean = FALSE, 12 | manual_download = FALSE 13 | ) 14 | } 15 | \arguments{ 16 | \item{dir}{Character, path to directory where data will be stored. 
If 17 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 18 | 19 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 20 | 21 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 22 | 23 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 24 | greatly reduce the size. Defaults to FALSE.} 25 | 26 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 27 | downloaded the file and placed it in the folder designated by running 28 | this function with \code{return_path = TRUE}.} 29 | } 30 | \value{ 31 | A tibble with 2,477 rows and 2 variables: 32 | \describe{ 33 | \item{word}{An English word} 34 | \item{score}{Indicator for sentiment: integer between -5 and +5} 35 | } 36 | } 37 | \description{ 38 | AFINN is a lexicon of English words rated for valence with an integer 39 | between minus five (negative) and plus five (positive). The words have 40 | been manually labeled by Finn Årup Nielsen in 2009-2011. 41 | } 42 | \details{ 43 | This dataset is the newest version with 2,477 words and phrases. 44 | 45 | Citation info: 46 | 47 | This dataset was published in Finn Årup Nielsen (2011), 48 | ``A new Evaluation of a word list for sentiment analysis in 49 | microblogs'', Proceedings of the ESWC2011 Workshop on 50 | 'Making Sense of Microposts': Big things come in small packages (2011) 93-98. 51 | 52 | article\{nielsen11, \cr 53 | author = \{Finn Årup Nielsen\}, \cr 54 | title = \{A new Evaluation of a word list for sentiment analysis in microblogs\}, \cr 55 | journal = \{CoRR\}, \cr 56 | volume = \{abs/1103.2903\}, \cr 57 | year = \{2011\}, \cr 58 | url = \{http://arxiv.org/abs/1103.2903\}, \cr 59 | archivePrefix = \{arXiv\}, \cr 60 | eprint = \{1103.2903\}, \cr 61 | biburl = \{https://dblp.org/rec/bib/journals/corr/abs-1103-2903\}, \cr 62 | bibsource = \{dblp computer science bibliography, https://dblp.org\} \cr 63 | \} 64 | } 65 | \examples{ 66 | \dontrun{ 67 | lexicon_afinn() 68 | 69 | # Custom directory 70 | lexicon_afinn(dir = "data/") 71 | 72 | # Deleting dataset 73 | lexicon_afinn(delete = TRUE) 74 | 75 | # Returning filepath of data 76 | lexicon_afinn(return_path = TRUE) 77 | } 78 | } 79 | \seealso{ 80 | Other lexicon: 81 | \code{\link{lexicon_bing}()}, 82 | \code{\link{lexicon_loughran}()}, 83 | \code{\link{lexicon_nrc}()}, 84 | \code{\link{lexicon_nrc_eil}()}, 85 | \code{\link{lexicon_nrc_vad}()} 86 | } 87 | \concept{lexicon} 88 | \keyword{datasets} 89 | -------------------------------------------------------------------------------- /man/lexicon_bing.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lexicon_bing.R 3 | \name{lexicon_bing} 4 | \alias{lexicon_bing} 5 | \title{Bing sentiment lexicon} 6 | \source{ 7 | \url{https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html} 8 | } 9 | \usage{ 10 | lexicon_bing( 11 | dir = NULL, 12 | delete = FALSE, 13 | return_path = FALSE, 14 | clean = FALSE, 15 | manual_download = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{dir}{Character, path to directory where data will be stored. If 20 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 21 | 22 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 23 | 24 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 25 | 26 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files.
This can 27 | greatly reduce the size. Defaults to FALSE.} 28 | 29 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 30 | downloaded the file and placed it in the folder designated by running 31 | this function with \code{return_path = TRUE}.} 32 | } 33 | \value{ 34 | A tibble with 6,787 rows and 2 variables: 35 | \describe{ 36 | \item{word}{An English word} 37 | \item{sentiment}{Indicator for sentiment: "negative" or "positive"} 38 | } 39 | } 40 | \description{ 41 | General purpose English sentiment lexicon that categorizes words in a 42 | binary fashion, either positive or negative 43 | } 44 | \details{ 45 | Citation info: 46 | 47 | This dataset was first published in Minqing Hu and Bing Liu, ``Mining and 48 | summarizing customer reviews.'', Proceedings of the ACM SIGKDD International 49 | Conference on Knowledge Discovery & Data Mining (KDD-2004), 2004. 50 | 51 | inproceedings\{Hu04, \cr 52 | author = \{Hu, Minqing and Liu, Bing\}, \cr 53 | title = \{Mining and Summarizing Customer Reviews\}, \cr 54 | booktitle = \{Proceedings of the Tenth ACM SIGKDD International Conference 55 | on Knowledge Discovery and Data Mining\}, \cr 56 | series = \{KDD '04\}, \cr 57 | year = \{2004\}, \cr 58 | isbn = \{1-58113-888-1\}, \cr 59 | location = \{Seattle, WA, USA\}, \cr 60 | pages = \{168--177\}, \cr 61 | numpages = \{10\}, \cr 62 | url = \{http://doi.acm.org/10.1145/1014052.1014073\}, \cr 63 | doi = \{10.1145/1014052.1014073\}, \cr 64 | acmid = \{1014073\}, \cr 65 | publisher = \{ACM\}, \cr 66 | address = \{New York, NY, USA\}, \cr 67 | keywords = \{reviews, sentiment classification, summarization, text mining\}, \cr 68 | \} 69 | } 70 | \examples{ 71 | \dontrun{ 72 | lexicon_bing() 73 | 74 | # Custom directory 75 | lexicon_bing(dir = "data/") 76 | 77 | # Deleting dataset 78 | lexicon_bing(delete = TRUE) 79 | 80 | # Returning filepath of data 81 | lexicon_bing(return_path = TRUE) 82 | } 83 | } 84 | \seealso{ 85 | Other lexicon: 86 | \code{\link{lexicon_afinn}()}, 87 | \code{\link{lexicon_loughran}()}, 88 | \code{\link{lexicon_nrc}()}, 89 | \code{\link{lexicon_nrc_eil}()}, 90 | \code{\link{lexicon_nrc_vad}()} 91 | } 92 | \concept{lexicon} 93 | \keyword{datasets} 94 | -------------------------------------------------------------------------------- /man/lexicon_loughran.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lexicon_loughran.R 3 | \name{lexicon_loughran} 4 | \alias{lexicon_loughran} 5 | \title{Loughran-McDonald sentiment lexicon} 6 | \source{ 7 | \url{https://sraf.nd.edu/loughranmcdonald-master-dictionary/} 8 | } 9 | \usage{ 10 | lexicon_loughran( 11 | dir = NULL, 12 | delete = FALSE, 13 | return_path = FALSE, 14 | clean = FALSE, 15 | manual_download = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{dir}{Character, path to directory where data will be stored. If 20 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 21 | 22 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 23 | 24 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 25 | 26 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 27 | greatly reduce the size. 
Defaults to FALSE.} 28 | 29 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 30 | downloaded the file and placed it in the folder designated by running 31 | this function with \code{return_path = TRUE}.} 32 | } 33 | \value{ 34 | A tibble with 4,150 rows and 2 variables: 35 | \describe{ 36 | \item{word}{An English word} 37 | \item{sentiment}{Indicator for sentiment: "negative", "positive", 38 | "litigious", "uncertainty", "constraining", or "superfluous"} 39 | } 40 | } 41 | \description{ 42 | English sentiment lexicon created for use with financial documents. This 43 | lexicon labels words with six possible sentiments important in financial 44 | contexts: "negative", "positive", "litigious", "uncertainty", "constraining", 45 | or "superfluous". 46 | } 47 | \details{ 48 | Citation info: 49 | 50 | This dataset was published in Loughran, T. and McDonald, B. (2011), 51 | ``When Is a Liability Not a Liability? Textual Analysis, Dictionaries, and 52 | 10-Ks.'' The Journal of Finance, 66: 35-65. 53 | 54 | article\{loughran11, \cr 55 | author = \{Loughran, Tim and McDonald, Bill\}, \cr 56 | title = \{When Is a Liability Not a Liability? Textual Analysis, Dictionaries, and 10-Ks\}, \cr 57 | journal = \{The Journal of Finance\}, \cr 58 | volume = \{66\}, \cr 59 | number = \{1\}, \cr 60 | pages = \{35-65\}, \cr 61 | doi = \{10.1111/j.1540-6261.2010.01625.x\}, \cr 62 | url = \{https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1540-6261.2010.01625.x\}, \cr 63 | eprint = \{https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1540-6261.2010.01625.x\}, \cr 64 | year = \{2011\} \cr 65 | \} 66 | } 67 | \examples{ 68 | \dontrun{ 69 | lexicon_loughran() 70 | 71 | # Custom directory 72 | lexicon_loughran(dir = "data/") 73 | 74 | # Deleting dataset 75 | lexicon_loughran(delete = TRUE) 76 | 77 | # Returning filepath of data 78 | lexicon_loughran(return_path = TRUE) 79 | } 80 | } 81 | \seealso{ 82 | Other lexicon: 83 | \code{\link{lexicon_afinn}()}, 84 | \code{\link{lexicon_bing}()}, 85 | \code{\link{lexicon_nrc}()}, 86 | \code{\link{lexicon_nrc_eil}()}, 87 | \code{\link{lexicon_nrc_vad}()} 88 | } 89 | \concept{lexicon} 90 | \keyword{datasets} 91 | -------------------------------------------------------------------------------- /man/lexicon_nrc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lexicon_nrc.R 3 | \name{lexicon_nrc} 4 | \alias{lexicon_nrc} 5 | \title{NRC word-emotion association lexicon} 6 | \source{ 7 | \url{http://saifmohammad.com/WebPages/lexicons.html} 8 | } 9 | \usage{ 10 | lexicon_nrc( 11 | dir = NULL, 12 | delete = FALSE, 13 | return_path = FALSE, 14 | clean = FALSE, 15 | manual_download = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{dir}{Character, path to directory where data will be stored. If 20 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 21 | 22 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 23 | 24 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 25 | 26 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 27 | greatly reduce the size. 
Defaults to FALSE.} 28 | 29 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 30 | downloaded the file and placed it in the folder designated by running 31 | this function with \code{return_path = TRUE}.} 32 | } 33 | \value{ 34 | A tibble with 13,901 rows and 2 variables: 35 | \describe{ 36 | \item{word}{An English word} 37 | \item{sentiment}{Indicator for sentiment or emotion: "negative", 38 | "positive", "anger", "anticipation", "disgust", "fear", "joy", "sadness", 39 | "surprise", or "trust"} 40 | } 41 | } 42 | \description{ 43 | General purpose English sentiment/emotion lexicon. This lexicon labels words 44 | with ten possible sentiments or emotions: "negative", "positive", "anger", 45 | "anticipation", "disgust", "fear", "joy", "sadness", "surprise", or "trust". 46 | The annotations were manually done through Amazon's Mechanical Turk. 47 | } 48 | \details{ 49 | License required for commercial use. Please contact Saif M. Mohammad 50 | (saif.mohammad@nrc-cnrc.gc.ca). 51 | 52 | Citation info: 53 | 54 | This dataset was published in Saif Mohammad and Peter Turney (2013), 55 | ``Crowdsourcing a Word-Emotion Association Lexicon.'' Computational 56 | Intelligence, 29(3): 436-465. 57 | 58 | article\{mohammad13, \cr 59 | author = \{Mohammad, Saif M. and Turney, Peter D.\}, \cr 60 | title = \{CROWDSOURCING A WORD–EMOTION ASSOCIATION LEXICON\}, \cr 61 | journal = \{Computational Intelligence\}, \cr 62 | volume = \{29\}, \cr 63 | number = \{3\}, \cr 64 | pages = \{436-465\}, \cr 65 | doi = \{10.1111/j.1467-8640.2012.00460.x\}, \cr 66 | url = \{https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-8640.2012.00460.x\}, \cr 67 | eprint = \{https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1467-8640.2012.00460.x\}, \cr 68 | year = \{2013\} \cr 69 | \} 70 | } 71 | \examples{ 72 | \dontrun{ 73 | lexicon_nrc() 74 | 75 | # Custom directory 76 | lexicon_nrc(dir = "data/") 77 | 78 | # Deleting dataset 79 | lexicon_nrc(delete = TRUE) 80 | 81 | # Returning filepath of data 82 | lexicon_nrc(return_path = TRUE) 83 | } 84 | } 85 | \seealso{ 86 | Other lexicon: 87 | \code{\link{lexicon_afinn}()}, 88 | \code{\link{lexicon_bing}()}, 89 | \code{\link{lexicon_loughran}()}, 90 | \code{\link{lexicon_nrc_eil}()}, 91 | \code{\link{lexicon_nrc_vad}()} 92 | } 93 | \concept{lexicon} 94 | \keyword{datasets} 95 | -------------------------------------------------------------------------------- /man/lexicon_nrc_eil.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lexicon_nrc_eil.R 3 | \name{lexicon_nrc_eil} 4 | \alias{lexicon_nrc_eil} 5 | \title{NRC Emotion Intensity Lexicon (aka Affect Intensity Lexicon) v0.5} 6 | \source{ 7 | \url{https://saifmohammad.com/WebPages/AffectIntensity.htm} 8 | } 9 | \usage{ 10 | lexicon_nrc_eil( 11 | dir = NULL, 12 | delete = FALSE, 13 | return_path = FALSE, 14 | clean = FALSE, 15 | manual_download = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{dir}{Character, path to directory where data will be stored. If 20 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 21 | 22 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 23 | 24 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 25 | 26 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 27 | greatly reduce the size.
Defaults to FALSE.} 28 | 29 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 30 | downloaded the file and placed it in the folder designated by running 31 | this function with \code{return_path = TRUE}.} 32 | } 33 | \value{ 34 | A tibble with 5,814 rows and 3 variables: 35 | \describe{ 36 | \item{term}{An English word} 37 | \item{score}{Value between 0 and 1} 38 | \item{AffectDimension}{Indicator for sentiment or emotion: ("anger", 39 | "fear", "sadness", "joy")} 40 | } 41 | } 42 | \description{ 43 | General purpose English sentiment/emotion lexicon. The NRC Affect Intensity 44 | Lexicon is a list of English words and their associations with four basic 45 | emotions (anger, fear, sadness, joy). 46 | } 47 | \details{ 48 | For a given word and emotion X, the scores range from 0 to 1. A score of 1 49 | means that the word conveys the highest amount of emotion X. A score of 0 50 | means that the word conveys the lowest amount of emotion X. 51 | 52 | License required for commercial use. Please contact Saif M. Mohammad 53 | (saif.mohammad@nrc-cnrc.gc.ca). 54 | 55 | Citation info: 56 | 57 | Details of the lexicon are in this paper: 58 | Word Affect Intensities. Saif M. Mohammad. In Proceedings of the 11th Edition 59 | of the Language Resources and Evaluation Conference (LREC-2018), May 2018, 60 | Miyazaki, Japan. 61 | 62 | inproceedings\{LREC18-AIL, \cr 63 | author = \{Mohammad, Saif M.\}, \cr 64 | title = \{Word Affect Intensities\}, \cr 65 | booktitle = \{Proceedings of the 11th Edition of the Language Resources and Evaluation Conference (LREC-2018)\}, \cr 66 | year = \{2018\}, \cr 67 | address=\{Miyazaki, Japan\} \cr 68 | \} \cr 69 | } 70 | \examples{ 71 | \dontrun{ 72 | lexicon_nrc_eil() 73 | 74 | # Custom directory 75 | lexicon_nrc_eil(dir = "data/") 76 | 77 | # Deleting dataset 78 | lexicon_nrc_eil(delete = TRUE) 79 | 80 | # Returning filepath of data 81 | lexicon_nrc_eil(return_path = TRUE) 82 | } 83 | } 84 | \seealso{ 85 | Other lexicon: 86 | \code{\link{lexicon_afinn}()}, 87 | \code{\link{lexicon_bing}()}, 88 | \code{\link{lexicon_loughran}()}, 89 | \code{\link{lexicon_nrc}()}, 90 | \code{\link{lexicon_nrc_vad}()} 91 | } 92 | \concept{lexicon} 93 | \keyword{datasets} 94 | -------------------------------------------------------------------------------- /man/lexicon_nrc_vad.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lexicon_nrc_vad.R 3 | \name{lexicon_nrc_vad} 4 | \alias{lexicon_nrc_vad} 5 | \title{The NRC Valence, Arousal, and Dominance Lexicon} 6 | \source{ 7 | \url{https://saifmohammad.com/WebPages/nrc-vad.html} 8 | } 9 | \usage{ 10 | lexicon_nrc_vad( 11 | dir = NULL, 12 | delete = FALSE, 13 | return_path = FALSE, 14 | clean = FALSE, 15 | manual_download = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{dir}{Character, path to directory where data will be stored. If 20 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.} 21 | 22 | \item{delete}{Logical, set \code{TRUE} to delete dataset.} 23 | 24 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.} 25 | 26 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can 27 | greatly reduce the size.
Defaults to FALSE.} 28 | 29 | \item{manual_download}{Logical, set \code{TRUE} if you have manually 30 | downloaded the file and placed it in the folder designated by running 31 | this function with \code{return_path = TRUE}.} 32 | } 33 | \value{ 34 | A tibble with 20,007 rows and 4 variables: 35 | \describe{ 36 | \item{word}{An English word} 37 | \item{Valence}{valence score of the word} 38 | \item{Arousal}{arousal score of the word} 39 | \item{Dominance}{dominance score of the word} 40 | } 41 | } 42 | \description{ 43 | The NRC Valence, Arousal, and Dominance (VAD) Lexicon includes a list of 44 | more than 20,000 English words and their valence, arousal, and dominance 45 | scores. For a given word and a dimension (V/A/D), the scores range from 0 46 | (lowest V/A/D) to 1 (highest V/A/D). The lexicon with its fine-grained 47 | real-valued scores was created by manual annotation using best--worst scaling. 48 | The lexicon is markedly larger than any of the existing VAD lexicons. We also 49 | show that the ratings obtained are substantially more reliable than those in 50 | existing lexicons. 51 | } 52 | \details{ 53 | License required for commercial use. Please contact Saif M. Mohammad 54 | (saif.mohammad@nrc-cnrc.gc.ca). 55 | 56 | Citation info: 57 | 58 | Details of the NRC VAD Lexicon are available in this paper: 59 | 60 | Obtaining Reliable Human Ratings of Valence, Arousal, and Dominance for 61 | 20,000 English Words. Saif M. Mohammad. In Proceedings of the 56th Annual 62 | Meeting of the Association for Computational Linguistics, Melbourne, 63 | Australia, July 2018. 64 | 65 | inproceedings\{vad-acl2018, \cr 66 | title=\{Obtaining Reliable Human Ratings of Valence, Arousal, and Dominance for 20,000 English Words\}, \cr 67 | author=\{Mohammad, Saif M.\}, \cr 68 | booktitle=\{Proceedings of The Annual Conference of the Association for Computational Linguistics (ACL)\}, \cr 69 | year=\{2018\}, \cr 70 | address=\{Melbourne, Australia\} \cr 71 | \} 72 | } 73 | \examples{ 74 | \dontrun{ 75 | lexicon_nrc_vad() 76 | 77 | # Custom directory 78 | lexicon_nrc_vad(dir = "data/") 79 | 80 | # Deleting dataset 81 | lexicon_nrc_vad(delete = TRUE) 82 | 83 | # Returning filepath of data 84 | lexicon_nrc_vad(return_path = TRUE) 85 | } 86 | } 87 | \seealso{ 88 | Other lexicon: 89 | \code{\link{lexicon_afinn}()}, 90 | \code{\link{lexicon_bing}()}, 91 | \code{\link{lexicon_loughran}()}, 92 | \code{\link{lexicon_nrc}()}, 93 | \code{\link{lexicon_nrc_eil}()} 94 | } 95 | \concept{lexicon} 96 | \keyword{datasets} 97 | -------------------------------------------------------------------------------- /man/load_dataset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/load_dataset.R 3 | \name{load_dataset} 4 | \alias{load_dataset} 5 | \title{Internal Functions} 6 | \usage{ 7 | load_dataset( 8 | data_name, 9 | name, 10 | dir, 11 | delete, 12 | return_path, 13 | clean, 14 | clean_manual = NULL, 15 | manual_download 16 | ) 17 | } 18 | \description{ 19 | These are not to be used directly by the users.
20 | } 21 | \keyword{internal} 22 | -------------------------------------------------------------------------------- /man/textdata-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textdata-package.R 3 | \docType{package} 4 | \name{textdata-package} 5 | \alias{textdata} 6 | \alias{textdata-package} 7 | \title{textdata: Download and Load Various Text Datasets} 8 | \description{ 9 | \if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} 10 | 11 | Provides a framework to download, parse, and store text datasets on the disk and load them when needed. Includes various sentiment lexicons and labeled text data sets for classification and analysis. 12 | } 13 | \seealso{ 14 | Useful links: 15 | \itemize{ 16 | \item \url{https://emilhvitfeldt.github.io/textdata/} 17 | \item \url{https://github.com/EmilHvitfeldt/textdata} 18 | \item Report bugs at \url{https://github.com/EmilHvitfeldt/textdata/issues} 19 | } 20 | 21 | } 22 | \author{ 23 | \strong{Maintainer}: Emil Hvitfeldt \email{emilhhvitfeldt@gmail.com} (\href{https://orcid.org/0000-0002-0679-1945}{ORCID}) 24 | 25 | Other contributors: 26 | \itemize{ 27 | \item Julia Silge \email{julia.silge@gmail.com} (\href{https://orcid.org/0000-0002-3671-836X}{ORCID}) [contributor] 28 | } 29 | 30 | } 31 | \keyword{internal} 32 | -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-120x120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/apple-touch-icon-120x120.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-152x152.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/apple-touch-icon-152x152.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-180x180.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/apple-touch-icon-180x180.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-60x60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/apple-touch-icon-60x60.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-76x76.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/apple-touch-icon-76x76.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/apple-touch-icon.png 
-------------------------------------------------------------------------------- /pkgdown/favicon/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/favicon-16x16.png -------------------------------------------------------------------------------- /pkgdown/favicon/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/favicon-32x32.png -------------------------------------------------------------------------------- /pkgdown/favicon/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/favicon.ico -------------------------------------------------------------------------------- /revdep/README.md: -------------------------------------------------------------------------------- 1 | # Platform 2 | 3 | |field |value | 4 | |:--------|:----------------------------------------------------------------------------------------| 5 | |version |R version 4.3.3 (2024-02-29) | 6 | |os |macOS Sonoma 14.4.1 | 7 | |system |aarch64, darwin20 | 8 | |ui |X11 | 9 | |language |(EN) | 10 | |collate |en_US.UTF-8 | 11 | |ctype |en_US.UTF-8 | 12 | |tz |America/Los_Angeles | 13 | |date |2024-05-28 | 14 | |pandoc |3.1.12.3 @ /Applications/Positron.app/Contents/Resources/app/bin/pandoc/ (via rmarkdown) | 15 | 16 | # Dependencies 17 | 18 | |package |old |new |Δ | 19 | |:-----------|:-----|:----------|:--| 20 | |textdata |0.4.4 |0.4.4.9000 |* | 21 | |bit |4.0.5 |4.0.5 | | 22 | |bit64 |4.0.5 |4.0.5 | | 23 | |cli |3.6.2 |3.6.2 | | 24 | |clipr |0.8.0 |0.8.0 | | 25 | |cpp11 |0.4.7 |0.4.7 | | 26 | |crayon |1.5.2 |1.5.2 | | 27 | |fansi |1.0.6 |1.0.6 | | 28 | |fs |1.6.4 |1.6.4 | | 29 | |glue |1.7.0 |1.7.0 | | 30 | |hms |1.1.3 |1.1.3 | | 31 | |lifecycle |1.0.4 |1.0.4 | | 32 | |magrittr |2.0.3 |2.0.3 | | 33 | |pillar |1.9.0 |1.9.0 | | 34 | |pkgconfig |2.0.3 |2.0.3 | | 35 | |prettyunits |1.2.0 |1.2.0 | | 36 | |progress |1.2.3 |1.2.3 | | 37 | |R6 |2.5.1 |2.5.1 | | 38 | |rappdirs |0.3.3 |0.3.3 | | 39 | |readr |2.1.5 |2.1.5 | | 40 | |rlang |1.1.3 |1.1.3 | | 41 | |tibble |3.2.1 |3.2.1 | | 42 | |tidyselect |1.2.1 |1.2.1 | | 43 | |tzdb |0.4.0 |0.4.0 | | 44 | |utf8 |1.2.4 |1.2.4 | | 45 | |vctrs |0.6.5 |0.6.5 | | 46 | |vroom |1.6.5 |1.6.5 | | 47 | |withr |3.0.0 |3.0.0 | | 48 | 49 | # Revdeps 50 | 51 | -------------------------------------------------------------------------------- /revdep/cran.md: -------------------------------------------------------------------------------- 1 | ## revdepcheck results 2 | 3 | We checked 3 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package. 4 | 5 | * We saw 0 new problems 6 | * We failed to check 0 packages 7 | 8 | -------------------------------------------------------------------------------- /revdep/failures.md: -------------------------------------------------------------------------------- 1 | *Wow, no problems at all. :)* -------------------------------------------------------------------------------- /revdep/problems.md: -------------------------------------------------------------------------------- 1 | *Wow, no problems at all. 
:)* -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(textdata) 3 | 4 | test_check("textdata") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-download_functions.R: -------------------------------------------------------------------------------- 1 | downloads <- setdiff( 2 | ls(getNamespace("textdata"), pattern = "^download_"), 3 | "download_functions" 4 | ) 5 | 6 | test_that("All download functions are included in download_functions", { 7 | expect_equal( 8 | length(downloads), 9 | length(textdata:::download_functions) 10 | ) 11 | }) 12 | 13 | test_that("All download functions has the folder_path argument", { 14 | for (fun in downloads) { 15 | expect_equal( 16 | names(formals(get(fun, getNamespace("textdata")))), 17 | "folder_path" 18 | ) 19 | } 20 | }) 21 | 22 | test_that("the download functions are named right according to print_info", { 23 | testthat::expect_setequal( 24 | paste0("download_", names(textdata:::print_info)), 25 | downloads 26 | ) 27 | }) 28 | -------------------------------------------------------------------------------- /tests/testthat/test-info.R: -------------------------------------------------------------------------------- 1 | test_that("print_info has right names", { 2 | lapply( 3 | textdata:::print_info, 4 | function(x) expect_true(all(names(x) == c("name", "url", "license", "size", "type", "download_mech", "description", "citation"))) 5 | ) 6 | }) 7 | -------------------------------------------------------------------------------- /tests/testthat/test-process_functions.R: -------------------------------------------------------------------------------- 1 | processs <- setdiff( 2 | ls(getNamespace("textdata"), pattern = "^process_"), 3 | "process_functions" 4 | ) 5 | 6 | test_that("All process functions are included in process_functions", { 7 | expect_equal( 8 | length(processs), 9 | length(textdata:::process_functions) 10 | ) 11 | }) 12 | 13 | test_that("All process functions has the folder_path argument", { 14 | for (fun in processs) { 15 | expect_equal( 16 | names(formals(get(fun, getNamespace("textdata")))), 17 | c("folder_path", "name_path") 18 | ) 19 | } 20 | }) 21 | 22 | test_that("the process functions are named right according to print_info", { 23 | testthat::expect_setequal( 24 | paste0("process_", names(textdata:::print_info)), 25 | processs 26 | ) 27 | }) 28 | -------------------------------------------------------------------------------- /textdata.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/How-to-add-a-data-set.Rmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "How to add a data set" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{How to add a data set} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>" 14 | ) 15 | ``` 16 | 17 | ```{r setup} 18 | library(textdata) 19 | ``` 20 | 21 | This package provides infrastructure to make text datasets available within R, even when they are too large to store within an R package or are licensed in such a way that prevents them from being included in OSS-licensed packages. 22 | 23 | Do you want to add a new dataset to the textdata package? 24 | 25 | - Create an R file named `prefix_*.R` in the `R/` folder, where `*` is the name of the dataset. Supported prefixes include 26 | - `dataset_` 27 | - `lexicon_` 28 | - Inside that file, create three functions named `download_*()`, `process_*()`, and `dataset_*()` (a minimal sketch of all three is shown below). 29 | - The `download_*()` function should take one argument named `folder_path`. It has two tasks: first, check whether the file has already been downloaded, and if so return `invisible()`; otherwise, download the file to that path. 30 | - The `process_*()` function should take two arguments, `folder_path` and `name_path`. `folder_path` denotes the path to the file returned by `download_*()` and `name_path` is the path where the polished data should live. The main purpose of `process_*()` is to turn the downloaded file into a .rds file containing a tidy tibble. 31 | - The `dataset_*()` function should wrap `load_dataset()`. 32 | - Add the `process_*()` function to the named list `process_functions` in the file process_functions.R. 33 | - Add the `download_*()` function to the named list `download_functions` in the file download_functions.R. 34 | - Modify the `print_info` list in the info.R file. 35 | - Add `dataset_*.R` to the @include tags in `download_functions.R`. 36 | - Add the dataset to the table in `README.Rmd`. 37 | - Add the dataset to `_pkgdown.yml`. 38 | - Write a bullet in the `NEWS.md` file. 39 | 40 | What are the guidelines for adding datasets? 41 | 42 | # Guidelines for textdata datasets 43 | 44 | - All datasets must have a license or terms of use clearly specified. 45 | - Data should be a vector or tibble. 46 | - Use `word` instead of `words` for column names. 47 | 48 | # Classification datasets 49 | 50 | For datasets that come with both a training and a testing set, let the user pick which one to retrieve with a `split` argument, similar to how `dataset_ag_news()` does it. 51 | --------------------------------------------------------------------------------
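To make the three-function pattern from the vignette concrete, here is a minimal, hypothetical sketch for a made-up dataset called "foo". The URL, file names, and column names below are placeholders rather than anything shipped with textdata; only the argument names (which follow the vignette) and `load_dataset()`, the internal loader documented in `man/load_dataset.Rd` above, correspond to the real package.

```r
# Hypothetical example: "foo", the URL, and the column names do not exist in
# textdata; they only illustrate the shape of the three functions.

download_foo <- function(folder_path) {
  file_path <- file.path(folder_path, "foo.csv")
  # Task 1: if the raw file is already in the cache folder, do nothing.
  if (file.exists(file_path)) {
    return(invisible())
  }
  # Task 2: otherwise download the raw file into the cache folder.
  utils::download.file("https://example.com/foo.csv", destfile = file_path)
}

process_foo <- function(folder_path, name_path) {
  raw <- readr::read_csv(file.path(folder_path, "foo.csv"), show_col_types = FALSE)
  # Polish the raw file into a tidy tibble (note `word`, not `words`).
  tidy <- tibble::tibble(word = raw$word, sentiment = raw$sentiment)
  # Save the tidy tibble where textdata expects to find it.
  saveRDS(tidy, name_path)
}

dataset_foo <- function(dir = NULL, delete = FALSE, return_path = FALSE,
                        clean = FALSE, manual_download = FALSE) {
  # Thin wrapper around the internal loader shown in man/load_dataset.Rd.
  load_dataset(
    data_name = "foo",
    name = "foo.rds",
    dir = dir,
    delete = delete,
    return_path = return_path,
    clean = clean,
    manual_download = manual_download
  )
}
```

The split of responsibilities is deliberate: `download_foo()` only fetches and never re-downloads an existing file, `process_foo()` turns the raw download into a single tidy .rds, and `dataset_foo()` stays a thin wrapper, which is what lets `load_dataset()` handle caching, deletion, and path reporting uniformly for every dataset.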