├── .Rbuildignore
├── .github
│   ├── .gitignore
│   └── workflows
│       ├── R-CMD-check.yaml
│       ├── pkgdown.yaml
│       └── pr-commands.yaml
├── .gitignore
├── .travis.yml
├── CODE_OF_CONDUCT.md
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R
│   ├── cache_info.R
│   ├── dataset_ag_news.R
│   ├── dataset_dbpedia.R
│   ├── dataset_imdb.R
│   ├── dataset_sentence_polarity.R
│   ├── dataset_trec.R
│   ├── download_functions.R
│   ├── embedding_glove.R
│   ├── info.R
│   ├── lexicon_afinn.R
│   ├── lexicon_bing.R
│   ├── lexicon_loughran.R
│   ├── lexicon_nrc.R
│   ├── lexicon_nrc_eil.R
│   ├── lexicon_nrc_vad.R
│   ├── load_dataset.R
│   ├── printer.R
│   ├── process_functions.R
│   └── textdata-package.R
├── README.Rmd
├── README.md
├── _pkgdown.yml
├── codecov.yml
├── cran-comments.md
├── man
│   ├── cache_info.Rd
│   ├── catalogue.Rd
│   ├── dataset_ag_news.Rd
│   ├── dataset_dbpedia.Rd
│   ├── dataset_imdb.Rd
│   ├── dataset_sentence_polarity.Rd
│   ├── dataset_trec.Rd
│   ├── embedding_glove.Rd
│   ├── figures
│   │   ├── .DS_Store
│   │   ├── logo.png
│   │   ├── screen-shot.png
│   │   └── textdata_demo.gif
│   ├── lexicon_afinn.Rd
│   ├── lexicon_bing.Rd
│   ├── lexicon_loughran.Rd
│   ├── lexicon_nrc.Rd
│   ├── lexicon_nrc_eil.Rd
│   ├── lexicon_nrc_vad.Rd
│   ├── load_dataset.Rd
│   └── textdata-package.Rd
├── pkgdown
│   └── favicon
│       ├── apple-touch-icon-120x120.png
│       ├── apple-touch-icon-152x152.png
│       ├── apple-touch-icon-180x180.png
│       ├── apple-touch-icon-60x60.png
│       ├── apple-touch-icon-76x76.png
│       ├── apple-touch-icon.png
│       ├── favicon-16x16.png
│       ├── favicon-32x32.png
│       └── favicon.ico
├── revdep
│   ├── README.md
│   ├── cran.md
│   ├── failures.md
│   └── problems.md
├── tests
│   ├── testthat.R
│   └── testthat
│       ├── test-download_functions.R
│       ├── test-info.R
│       └── test-process_functions.R
├── textdata.Rproj
└── vignettes
    ├── .gitignore
    └── How-to-add-a-data-set.Rmd
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^README\.Rmd$
2 | ^LICENSE\.md$
3 | ^textdata\.Rproj$
4 | ^\.Rproj\.user$
5 | ^\.travis\.yml$
6 | ^CODE_OF_CONDUCT\.md$
7 | ^cran-comments\.md$
8 | ^_pkgdown\.yml$
9 | ^docs$
10 | ^pkgdown$
11 | ^CRAN-RELEASE$
12 | ^revdep$
13 | ^codecov\.yml$
14 | ^\.github/workflows/R-CMD-check\.yaml$
15 | ^\.github/workflows/pr-commands\.yaml$
16 | ^\.github$
17 | ^CRAN-SUBMISSION$
18 |
--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 |
--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches: [main, master]
6 | pull_request:
7 | branches: [main, master]
8 |
9 | name: R-CMD-check
10 |
11 | jobs:
12 | R-CMD-check:
13 | runs-on: ${{ matrix.config.os }}
14 |
15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }})
16 |
17 | strategy:
18 | fail-fast: false
19 | matrix:
20 | config:
21 | - {os: macOS-latest, r: 'release'}
22 | - {os: windows-latest, r: 'release'}
23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
24 | - {os: ubuntu-latest, r: 'release'}
25 | - {os: ubuntu-latest, r: 'oldrel-1'}
26 |
27 | env:
28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
29 | R_KEEP_PKG_SOURCE: yes
30 |
31 | steps:
32 | - uses: actions/checkout@v2
33 |
34 | - uses: r-lib/actions/setup-pandoc@v2
35 |
36 | - uses: r-lib/actions/setup-r@v2
37 | with:
38 | r-version: ${{ matrix.config.r }}
39 | http-user-agent: ${{ matrix.config.http-user-agent }}
40 | use-public-rspm: true
41 |
42 | - uses: r-lib/actions/setup-r-dependencies@v2
43 | with:
44 | extra-packages: any::rcmdcheck
45 | needs: check
46 |
47 | - uses: r-lib/actions/check-r-package@v2
48 | with:
49 | upload-snapshots: true
50 |
--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches: [main, master]
6 | pull_request:
7 | branches: [main, master]
8 | release:
9 | types: [published]
10 | workflow_dispatch:
11 |
12 | name: pkgdown
13 |
14 | jobs:
15 | pkgdown:
16 | runs-on: ubuntu-latest
17 | # Only restrict concurrency for non-PR jobs
18 | concurrency:
19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
20 | env:
21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
22 | steps:
23 | - uses: actions/checkout@v2
24 |
25 | - uses: r-lib/actions/setup-pandoc@v2
26 |
27 | - uses: r-lib/actions/setup-r@v2
28 | with:
29 | use-public-rspm: true
30 |
31 | - uses: r-lib/actions/setup-r-dependencies@v2
32 | with:
33 | extra-packages: any::pkgdown, local::.
34 | needs: website
35 |
36 | - name: Build site
37 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
38 | shell: Rscript {0}
39 |
40 | - name: Deploy to GitHub pages 🚀
41 | if: github.event_name != 'pull_request'
42 | uses: JamesIves/github-pages-deploy-action@4.1.4
43 | with:
44 | clean: false
45 | branch: gh-pages
46 | folder: docs
47 |
--------------------------------------------------------------------------------
/.github/workflows/pr-commands.yaml:
--------------------------------------------------------------------------------
1 | on:
2 | issue_comment:
3 | types: [created]
4 | name: Commands
5 | jobs:
6 | document:
7 | if: startsWith(github.event.comment.body, '/document')
8 | name: document
9 | runs-on: macOS-latest
10 | steps:
11 | - uses: actions/checkout@v2
12 | - uses: r-lib/actions/pr-fetch@master
13 | with:
14 | repo-token: ${{ secrets.GITHUB_TOKEN }}
15 | - uses: r-lib/actions/setup-r@master
16 | - name: Install dependencies
17 | run: Rscript -e 'install.packages(c("remotes", "roxygen2"))' -e 'remotes::install_deps(dependencies = TRUE)'
18 | - name: Document
19 | run: Rscript -e 'roxygen2::roxygenise()'
20 | - name: commit
21 | run: |
22 | git add man/\* NAMESPACE
23 | git commit -m 'Document'
24 | - uses: r-lib/actions/pr-push@master
25 | with:
26 | repo-token: ${{ secrets.GITHUB_TOKEN }}
27 | style:
28 | if: startsWith(github.event.comment.body, '/style')
29 | name: style
30 | runs-on: macOS-latest
31 | steps:
32 | - uses: actions/checkout@v2
33 | - uses: r-lib/actions/pr-fetch@master
34 | with:
35 | repo-token: ${{ secrets.GITHUB_TOKEN }}
36 | - uses: r-lib/actions/setup-r@master
37 | - name: Install dependencies
38 | run: Rscript -e 'install.packages("styler")'
39 | - name: Style
40 | run: Rscript -e 'styler::style_pkg()'
41 | - name: commit
42 | run: |
43 | git add \*.R
44 | git commit -m 'Style'
45 | - uses: r-lib/actions/pr-push@master
46 | with:
47 | repo-token: ${{ secrets.GITHUB_TOKEN }}
48 | # A mock job just to ensure we have a successful build status
49 | finish:
50 | runs-on: ubuntu-latest
51 | steps:
52 | - run: true
53 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | .DS_Store
6 | inst/doc
7 | docs/
8 |
9 | revdep/checks
10 | revdep/library
11 | revdep/checks.noindex
12 | revdep/library.noindex
13 | revdep/data.sqlite
14 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r
2 |
3 | language: R
4 | cache: packages
5 |
6 | before_cache: Rscript -e 'remotes::install_cran("pkgdown")'
7 | deploy:
8 | provider: script
9 | script: Rscript -e 'pkgdown::deploy_site_github()'
10 | skip_cleanup: true
11 |
12 | after_success:
13 | - Rscript -e 'covr::codecov()'
14 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Code of Conduct
2 |
3 | As contributors and maintainers of this project, we pledge to respect all people who
4 | contribute through reporting issues, posting feature requests, updating documentation,
5 | submitting pull requests or patches, and other activities.
6 |
7 | We are committed to making participation in this project a harassment-free experience for
8 | everyone, regardless of level of experience, gender, gender identity and expression,
9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion.
10 |
11 | Examples of unacceptable behavior by participants include the use of sexual language or
12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment,
13 | insults, or other unprofessional conduct.
14 |
15 | Project maintainers have the right and responsibility to remove, edit, or reject comments,
16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this
17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed
18 | from the project team.
19 |
20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by
21 | opening an issue or contacting one or more of the project maintainers.
22 |
23 | This Code of Conduct is adapted from the Contributor Covenant
24 | (https://www.contributor-covenant.org), version 1.0.0, available at
25 | https://contributor-covenant.org/version/1/0/0/.
26 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: textdata
2 | Title: Download and Load Various Text Datasets
3 | Version: 0.4.5.9000
4 | Authors@R: c(
5 | person("Emil", "Hvitfeldt", , "emilhhvitfeldt@gmail.com", role = c("aut", "cre"),
6 | comment = c(ORCID = "0000-0002-0679-1945")),
7 | person("Julia", "Silge", , "julia.silge@gmail.com", role = "ctb",
8 | comment = c(ORCID = "0000-0002-3671-836X"))
9 | )
10 | Description: Provides a framework to download, parse, and store text
11 | datasets on the disk and load them when needed. Includes various
12 | sentiment lexicons and labeled text data sets for classification and
13 | analysis.
14 | License: MIT + file LICENSE
15 | URL: https://emilhvitfeldt.github.io/textdata/, https://github.com/EmilHvitfeldt/textdata
16 | BugReports: https://github.com/EmilHvitfeldt/textdata/issues
17 | Imports:
18 | fs,
19 | rappdirs,
20 | readr,
21 | tibble
22 | Suggests:
23 | covr,
24 | knitr,
25 | rmarkdown,
26 | testthat (>= 2.1.0)
27 | VignetteBuilder:
28 | knitr
29 | Encoding: UTF-8
30 | RoxygenNote: 7.3.1.9000
31 | Collate:
32 | 'cache_info.R'
33 | 'dataset_ag_news.R'
34 | 'dataset_dbpedia.R'
35 | 'dataset_imdb.R'
36 | 'dataset_sentence_polarity.R'
37 | 'dataset_trec.R'
38 | 'embedding_glove.R'
39 | 'lexicon_nrc_vad.R'
40 | 'lexicon_nrc_eil.R'
41 | 'lexicon_nrc.R'
42 | 'lexicon_bing.R'
43 | 'lexicon_loughran.R'
44 | 'lexicon_afinn.R'
45 | 'download_functions.R'
46 | 'info.R'
47 | 'load_dataset.R'
48 | 'printer.R'
49 | 'process_functions.R'
50 | 'textdata-package.R'
51 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2018
2 | COPYRIGHT HOLDER: Emil Hvitfeldt
3 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | Copyright (c) 2018 Emil Hvitfeldt
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(cache_info)
4 | export(catalogue)
5 | export(dataset_ag_news)
6 | export(dataset_dbpedia)
7 | export(dataset_imdb)
8 | export(dataset_sentence_polarity)
9 | export(dataset_trec)
10 | export(embedding_glove27b)
11 | export(embedding_glove42b)
12 | export(embedding_glove6b)
13 | export(embedding_glove840b)
14 | export(lexicon_afinn)
15 | export(lexicon_bing)
16 | export(lexicon_loughran)
17 | export(lexicon_nrc)
18 | export(lexicon_nrc_eil)
19 | export(lexicon_nrc_vad)
20 | export(load_dataset)
21 | importFrom(fs,dir_create)
22 | importFrom(fs,dir_delete)
23 | importFrom(fs,dir_exists)
24 | importFrom(fs,dir_ls)
25 | importFrom(fs,file_delete)
26 | importFrom(fs,file_exists)
27 | importFrom(fs,path)
28 | importFrom(readr,col_character)
29 | importFrom(readr,col_double)
30 | importFrom(readr,cols)
31 | importFrom(readr,cols_only)
32 | importFrom(readr,read_csv)
33 | importFrom(readr,read_delim)
34 | importFrom(readr,read_lines)
35 | importFrom(readr,read_rds)
36 | importFrom(readr,read_tsv)
37 | importFrom(readr,write_rds)
38 | importFrom(tibble,tibble)
39 | importFrom(utils,download.file)
40 | importFrom(utils,menu)
41 | importFrom(utils,untar)
42 | importFrom(utils,unzip)
43 |
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # textdata (development version)
2 |
3 | # textdata 0.4.5
4 |
5 | * Fixed bug where `lexicon_nrc_vad()` didn't have column names. (#53)
6 |
7 | # textdata 0.4.4
8 |
9 | * Updated the path to correctly point to the source for the NRC lexicon.
10 |
11 | # textdata 0.4.3
12 |
13 | * Fixed documentation to be HTML5 friendly.
14 |
15 | # textdata 0.4.2
16 |
17 | * `cache_info()` function has been added to allow for a quick overview of cache size.
18 | * Update download url for `lexicon_nrc()`.
19 |
20 | # textdata 0.4.1
21 |
22 | # textdata 0.4.0
23 |
24 | * `embedding_glove6b()`, `embedding_glove27b()`, `embedding_glove42b()`, and `embedding_glove840b()` have been added to give access to the Stanford NLP Global Vectors for Word Representations pre-trained word vectors (@jonthegeek, #26).
25 | * `manual_download` argument has been added to all functions to allow the user to manually place a downloaded file in the right place.
26 |
27 | # textdata 0.3.0
28 |
29 | * `lexicon_nrc_eil()` has been added to give access to the NRC Emotion Intensity Lexicon (aka Affect Intensity Lexicon) v0.5.
30 | * `lexicon_nrc_vad()` has been added to give access to the NRC Valence, Arousal, and Dominance Lexicon.
31 | * The argument `clean` has been added to all functions to allow deletion of intermediate files.
32 | * An optional information prompt is implemented. It is turned off by default and turned on at the original author's request.
33 | * `lexicon_nrc()` got an improved URL for faster and more reliable downloads.
34 |
35 | # textdata 0.2.0
36 |
37 | * `dataset_imdb()` has been added to give access to the IMDb Large Movie Review Dataset.
38 | * `dataset_trec()` has been added to give access to the TREC-6 and TREC-50 classification datasets.
39 | * `dataset_dbpedia()` has been added to give access to DBpedia Ontology classification dataset.
40 | * `dataset_ag_news()` has been added to give access to AG's News Topic classification dataset.
41 | * Functions will now notify the user about the download mechanism (http/https, etc.) used to download the data (#12).
42 | * `lexicon_nrc()` has been added to give access to the NRC Emotion lexicon (@juliasilge, #11).
43 |
44 | # textdata 0.1.0
45 |
--------------------------------------------------------------------------------
/R/cache_info.R:
--------------------------------------------------------------------------------
1 | #' List folders and their sizes in cache
2 | #'
3 | #' This function returns a tibble with the names and sizes of all folders in
4 | #' the specified directory. Defaults to textdata's default cache directory.
5 | #'
6 | #' @inheritParams lexicon_afinn
7 | #'
8 | #' @return A tibble with 2 variables:
9 | #' \describe{
10 | #' \item{name}{Name of the folder}
11 | #' \item{size}{Size of the folder}
12 | #' }
13 | #' @export
14 | #'
15 | #' @examples
16 | #' \dontrun{
17 | #' cache_info()
18 | #' }
19 | cache_info <- function(dir = NULL) {
20 | dir <- ifelse(is.null(dir), rappdirs::user_cache_dir("textdata"), dir)
21 |
22 | folders <- fs::dir_info(dir)
23 |
24 | folders <- folders$path[folders$type == "directory"]
25 |
26 | sizes <- vapply(folders, folder_size, numeric(1))
27 |
28 | tibble::tibble(
29 | name = basename(folders),
30 | size = fs::as_fs_bytes(sizes)
31 | )
32 | }
33 |
34 | folder_size <- function(x) {
35 | sum(fs::dir_info(x)$size)
36 | }
37 |
--------------------------------------------------------------------------------
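
Usage sketch for cache_info() above (illustrative, not part of the package sources; assumes at least one textdata dataset has already been downloaded, since fs::dir_info() errors on a missing cache directory):

    library(textdata)

    # One row per cached folder, with `name` and `size` columns
    info <- cache_info()
    info

    # Total size of everything textdata has cached
    sum(info$size)
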
/R/dataset_ag_news.R:
--------------------------------------------------------------------------------
1 | #' AG's News Topic Classification Dataset
2 | #'
3 | #' The AG's news topic classification dataset is constructed by choosing the 4
4 | #' largest classes from the original corpus. Each class contains 30,000 training
5 | #' samples and 1,900 testing samples. The total number of training samples is
6 | #' 120,000 and testing 7,600.
7 | #'
8 | #' Version 3, Updated 09/09/2015
9 | #'
10 | #' The classes in this dataset are
11 | #'
12 | #' \itemize{
13 | #' \item World
14 | #' \item Sports
15 | #' \item Business
16 | #' \item Sci/Tech
17 | #' }
18 | #'
19 | #' @inheritParams lexicon_afinn
20 | #' @param split Character. Return training ("train") data or testing ("test")
21 | #' data. Defaults to "train".
22 | #' @return A tibble with 120,000 or 30,000 rows for "train" and "test"
23 | #' respectively and 3 variables:
24 | #' \describe{
25 | #' \item{class}{Character, denoting the news class}
26 | #' \item{title}{Character, title of article}
27 | #' \item{description}{Character, description of article}
28 | #' }
29 | #' @source \url{http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html}
30 | #' @source \url{https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz}
31 | #' @keywords datasets
32 | #' @family topic
33 | #' @export
34 | #' @examples
35 | #' \dontrun{
36 | #' dataset_ag_news()
37 | #'
38 | #' # Custom directory
39 | #' dataset_ag_news(dir = "data/")
40 | #'
41 | #' # Deleting dataset
42 | #' dataset_ag_news(delete = TRUE)
43 | #'
44 | #' # Returning filepath of data
45 | #' dataset_ag_news(return_path = TRUE)
46 | #'
47 | #' # Access both training and testing dataset
48 | #' train <- dataset_ag_news(split = "train")
49 | #' test <- dataset_ag_news(split = "test")
50 | #' }
51 | #'
52 | #' @importFrom fs file_exists dir_exists dir_create path
53 | #' @importFrom readr read_rds
54 | #' @importFrom utils menu
55 | dataset_ag_news <- function(dir = NULL, split = c("train", "test"),
56 | delete = FALSE, return_path = FALSE,
57 | clean = FALSE, manual_download = FALSE) {
58 | all_files <- paste0("ag_news_", c("train", "test"), ".rds")
59 | split <- match.arg(split)
60 | name <- paste0("ag_news_", split, ".rds")
61 | load_dataset(
62 | data_name = "ag_news", name = name, dir = dir,
63 | delete = delete, return_path = return_path, clean = clean,
64 | clean_manual = all_files,
65 | manual_download = manual_download
66 | )
67 | }
68 |
69 | #' @importFrom utils download.file
70 | download_ag_news <- function(folder_path) {
71 | file_path_test <- path(folder_path, "ag_news_test.csv")
72 | file_path_train <- path(folder_path, "ag_news_train.csv")
73 |
74 | if (file_exists(file_path_test) & file_exists(file_path_train)) {
75 | return(invisible())
76 | }
77 |
78 | download.file(
79 | url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv",
80 | destfile = file_path_test
81 | )
82 | download.file(
83 | url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv",
84 | destfile = file_path_train
85 | )
86 | }
87 |
88 | #' @importFrom readr read_tsv write_rds cols col_character col_double
89 | #' @importFrom tibble tibble
90 | process_ag_news <- function(folder_path, name_path) {
91 | file_path_test <- path(folder_path, "ag_news_test.csv")
92 | file_path_train <- path(folder_path, "ag_news_train.csv")
93 |
94 | data_test <- read_csv(file_path_test,
95 | col_names = c("class", "title", "description"),
96 | col_types = cols(
97 | class = col_double(),
98 | title = col_character(),
99 | description = col_character()
100 | )
101 | )
102 | data_train <- read_csv(file_path_train,
103 | col_names = c("class", "title", "description"),
104 | col_types = cols(
105 | class = col_double(),
106 | title = col_character(),
107 | description = col_character()
108 | )
109 | )
110 |
111 | classes <- c("World", "Sports", "Business", "Sci/Tech")
112 |
113 | data_test$class <- classes[data_test$class]
114 | data_train$class <- classes[data_train$class]
115 |
116 | write_rds(data_test, path(folder_path, "ag_news_test.rds"))
117 | write_rds(data_train, path(folder_path, "ag_news_train.rds"))
118 | }
119 |
--------------------------------------------------------------------------------
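
Usage sketch for dataset_ag_news() (illustrative only; the first call prompts for and performs the download):

    library(textdata)

    # Training split: 120,000 rows with class, title, and description
    ag_train <- dataset_ag_news(split = "train")

    # 30,000 rows per class, as documented above
    table(ag_train$class)
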
/R/dataset_dbpedia.R:
--------------------------------------------------------------------------------
1 | #' DBpedia Ontology Dataset
2 | #'
3 | #' The DBpedia ontology classification dataset contains 560,000 training
4 | #' samples and 70,000 testing samples in total, drawn from 14 non-overlapping
5 | #' classes from DBpedia.
6 | #'
7 | #' The classes are
8 | #'
9 | #' \itemize{
10 | #' \item Company
11 | #' \item EducationalInstitution
12 | #' \item Artist
13 | #' \item Athlete
14 | #' \item OfficeHolder
15 | #' \item MeanOfTransportation
16 | #' \item Building
17 | #' \item NaturalPlace
18 | #' \item Village
19 | #' \item Animal
20 | #' \item Plant
21 | #' \item Album
22 | #' \item Film
23 | #' \item WrittenWork
24 | #' }
25 | #'
26 | #' @source \url{https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf}
27 | #' @source \url{https://www.dbpedia.org/}
28 | #' @source \url{https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz}
29 | #' @inheritParams lexicon_afinn
30 | #' @param split Character. Return training ("train") data or testing ("test")
31 | #' data. Defaults to "train".
32 | #' @return A tibble with 560,000 or 70,000 rows for "train" and "test"
33 | #' respectively and 3 variables:
34 | #' \describe{
35 | #' \item{class}{Character, denoting the class}
36 | #' \item{title}{Character, title of article}
37 | #' \item{description}{Character, description of article}
38 | #' }
39 | #' @keywords datasets
40 | #' @family topic
41 | #' @export
42 | #' @examples
43 | #' \dontrun{
44 | #' dataset_dbpedia()
45 | #'
46 | #' # Custom directory
47 | #' dataset_dbpedia(dir = "data/")
48 | #'
49 | #' # Deleting dataset
50 | #' dataset_dbpedia(delete = TRUE)
51 | #'
52 | #' # Returning filepath of data
53 | #' dataset_dbpedia(return_path = TRUE)
54 | #'
55 | #' # Access both training and testing dataset
56 | #' train <- dataset_dbpedia(split = "train")
57 | #' test <- dataset_dbpedia(split = "test")
58 | #' }
59 | #'
60 | #' @importFrom fs file_exists dir_exists dir_create path
61 | #' @importFrom readr read_rds
62 | #' @importFrom utils menu untar
63 | dataset_dbpedia <- function(dir = NULL, split = c("train", "test"),
64 | delete = FALSE, return_path = FALSE,
65 | clean = FALSE, manual_download = FALSE) {
66 | all_files <- paste0("dbpedia_", c("train", "test"), ".rds")
67 | split <- match.arg(split)
68 | name <- paste0("dbpedia_", split, ".rds")
69 | load_dataset(
70 | data_name = "dbpedia", name = name, dir = dir,
71 | delete = delete, return_path = return_path, clean = clean,
72 | clean_manual = all_files,
73 | manual_download = manual_download
74 | )
75 | }
76 |
77 | #' @importFrom utils download.file
78 | download_dbpedia <- function(folder_path) {
79 | file_path <- path(folder_path, "dbpedia_csv.tar.gz")
80 | if (file_exists(file_path)) {
81 | return(invisible())
82 | }
83 | download.file(
84 | url = "https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz",
85 | destfile = file_path
86 | )
87 | }
88 |
89 | #' @importFrom readr read_tsv write_rds cols col_character col_double
90 | #' @importFrom tibble tibble
91 | process_dbpedia <- function(folder_path, name_path) {
92 | file_path_test <- path(folder_path, "dbpedia_csv/test.csv")
93 | file_path_train <- path(folder_path, "dbpedia_csv/train.csv")
94 |
95 | zip_path <- path(folder_path, "dbpedia_csv.tar.gz")
96 |
97 | untar(zip_path, files = c(
98 | "dbpedia_csv/test.csv",
99 | "dbpedia_csv/train.csv"
100 | ), exdir = folder_path)
101 |
102 | data_test <- read_csv(file_path_test,
103 | col_names = c("class", "title", "description"),
104 | col_types = cols(
105 | class = col_double(),
106 | title = col_character(),
107 | description = col_character()
108 | )
109 | )
110 | data_train <- read_csv(file_path_train,
111 | col_names = c("class", "title", "description"),
112 | col_types = cols(
113 | class = col_double(),
114 | title = col_character(),
115 | description = col_character()
116 | )
117 | )
118 |
119 | classes <- c(
120 | "Company", "EducationalInstitution", "Artist", "Athlete",
121 | "OfficeHolder", "MeanOfTransportation", "Building",
122 | "NaturalPlace", "Village", "Animal", "Plant", "Album", "Film",
123 | "WrittenWork"
124 | )
125 |
126 | data_test$class <- classes[data_test$class]
127 | data_train$class <- classes[data_train$class]
128 |
129 | write_rds(data_test, path(folder_path, "dbpedia_test.rds"))
130 | write_rds(data_train, path(folder_path, "dbpedia_train.rds"))
131 |
132 | fs::file_delete(path = file_path_test)
133 | fs::file_delete(path = file_path_train)
134 | }
135 |
--------------------------------------------------------------------------------
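
Usage sketch for dataset_dbpedia() (illustrative only; the raw download is large, roughly 280 MB per the catalogue entry in R/info.R below):

    library(textdata)

    # Test split: 70,000 rows spread over the 14 classes listed above
    db_test <- dataset_dbpedia(split = "test")
    nrow(db_test)
    length(unique(db_test$class))
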
/R/dataset_imdb.R:
--------------------------------------------------------------------------------
1 | #' IMDB Large Movie Review Dataset
2 | #'
3 | #' The core dataset contains 50,000 reviews split evenly into 25k train and
4 | #' 25k test sets. The overall distribution of labels is balanced (25k pos and
5 | #' 25k neg).
6 | #'
7 | #' In the entire collection, no more than 30 reviews are allowed for any
8 | #' given movie because reviews for the same movie tend to have correlated
9 | #' ratings. Further, the train and test sets contain a disjoint set of
10 | #' movies, so no significant performance is obtained by memorizing
11 | #' movie-unique terms and their association with observed labels. In the
12 | #' labeled train/test sets, a negative review has a score <= 4 out of 10,
13 | #' and a positive review has a score >= 7 out of 10. Thus reviews with
14 | #' more neutral ratings are not included in the train/test sets. In the
15 | #' unsupervised set, reviews of any rating are included and there are an
16 | #' even number of reviews > 5 and <= 5.
17 | #'
18 | #' When using this dataset, please cite the ACL 2011 paper
19 | #'
20 | #' InProceedings\{maas-EtAl:2011:ACL-HLT2011, \cr
21 | #' author = \{Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher\}, \cr
22 | #' title = \{Learning Word Vectors for Sentiment Analysis\}, \cr
23 | #' booktitle = \{Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies\}, \cr
24 | #' month = \{June\}, \cr
25 | #' year = \{2011\}, \cr
26 | #' address = \{Portland, Oregon, USA\}, \cr
27 | #' publisher = \{Association for Computational Linguistics\}, \cr
28 | #' pages = \{142--150\}, \cr
29 | #' url = \{http://www.aclweb.org/anthology/P11-1015\}
30 | #' \}
31 | #'
32 | #' @source \url{http://ai.stanford.edu/~amaas/data/sentiment/}
33 | #' @inheritParams lexicon_afinn
34 | #' @param split Character. Return training ("train") data or testing ("test")
35 | #' data. Defaults to "train".
36 | #' @return A tibble with 25,000 rows and 2 variables:
37 | #' \describe{
38 | #' \item{sentiment}{Character, denoting the sentiment}
39 | #' \item{text}{Character, text of the review}
40 | #' }
41 | #' @keywords datasets
42 | #' @family topic sentiment
43 | #' @export
44 | #' @examples
45 | #' \dontrun{
46 | #' dataset_imdb()
47 | #'
48 | #' # Custom directory
49 | #' dataset_imdb(dir = "data/")
50 | #'
51 | #' # Deleting dataset
52 | #' dataset_imdb(delete = TRUE)
53 | #'
54 | #' # Returning filepath of data
55 | #' dataset_imdb(return_path = TRUE)
56 | #'
57 | #' # Access both training and testing dataset
58 | #' train <- dataset_imdb(split = "train")
59 | #' test <- dataset_imdb(split = "test")
60 | #' }
61 | #'
62 | #' @importFrom fs file_exists dir_exists dir_create path
63 | #' @importFrom readr read_rds
64 | #' @importFrom utils menu untar
65 | dataset_imdb <- function(dir = NULL, split = c("train", "test"),
66 | delete = FALSE, return_path = FALSE, clean = FALSE,
67 | manual_download = FALSE) {
68 | all_files <- paste0("imdb_", c("train", "test"), ".rds")
69 | split <- match.arg(split)
70 | name <- paste0("imdb_", split, ".rds")
71 | load_dataset(
72 | data_name = "imdb", name = name, dir = dir,
73 | delete = delete, return_path = return_path, clean = clean,
74 | clean_manual = all_files,
75 | manual_download = manual_download
76 | )
77 | }
78 |
79 | #' @importFrom utils download.file
80 | download_imdb <- function(folder_path) {
81 | file_path <- path(folder_path, "imdb.tar.gz")
82 | if (file_exists(file_path)) {
83 | return(invisible())
84 | }
85 | download.file(
86 | url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
87 | destfile = file_path
88 | )
89 | }
90 |
91 | #' @importFrom readr read_tsv write_rds cols col_character col_double
92 | #' @importFrom fs dir_ls
93 | #' @importFrom tibble tibble
94 | process_imdb <- function(folder_path, name_path) {
95 | file_path_test <- path(folder_path, "imdb_csv/test.csv")
96 | file_path_train <- path(folder_path, "imdb_csv/train.csv")
97 |
98 | zip_path <- path(folder_path, "imdb.tar.gz")
99 |
100 | untar(zip_path, exdir = folder_path)
101 |
102 | files_test_neg <- dir_ls(path(folder_path, "aclimdb", "test", "neg"))
103 | files_test_pos <- dir_ls(path(folder_path, "aclimdb", "test", "pos"))
104 |
105 | data_test <- tibble(
106 | sentiment = rep(
107 | c("neg", "pos"),
108 | c(
109 | length(files_test_neg),
110 | length(files_test_pos)
111 | )
112 | ),
113 | text = c(
114 | vapply(files_test_neg, read_lines, character(1)),
115 | vapply(files_test_pos, read_lines, character(1))
116 | )
117 | )
118 |
119 | files_train_neg <- dir_ls(path(folder_path, "aclimdb", "train", "neg"))
120 | files_train_pos <- dir_ls(path(folder_path, "aclimdb", "train", "pos"))
121 |
122 | data_train <- tibble(
123 | sentiment = rep(
124 | c("neg", "pos"),
125 | c(
126 | length(files_train_neg),
127 | length(files_train_pos)
128 | )
129 | ),
130 | text = c(
131 | vapply(files_train_neg, read_lines, character(1)),
132 | vapply(files_train_pos, read_lines, character(1))
133 | )
134 | )
135 |
136 | write_rds(data_test, path(folder_path, "imdb_test.rds"))
137 | write_rds(data_train, path(folder_path, "imdb_train.rds"))
138 | }
139 |
--------------------------------------------------------------------------------
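
Usage sketch for dataset_imdb() (illustrative only; the raw archive is several hundred megabytes and the first call prompts before downloading):

    library(textdata)

    # 25,000 labeled training reviews, evenly split between "neg" and "pos"
    imdb_train <- dataset_imdb(split = "train")
    table(imdb_train$sentiment)
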
/R/dataset_sentence_polarity.R:
--------------------------------------------------------------------------------
1 | #' v1.0 sentence polarity dataset
2 | #'
3 | #' 5331 positive and 5331 negative processed sentences / snippets.
4 | #' Introduced in Pang/Lee ACL 2005. Released July 2005.
5 | #'
6 | #' Citation info:
7 | #'
8 | #' This data was first used in Bo Pang and Lillian Lee,
9 | #' ``Seeing stars: Exploiting class relationships for sentiment categorization
10 | #' with respect to rating scales.'', Proceedings of the ACL, 2005.
11 | #'
12 | #' InProceedings\{pang05, \cr
13 | #' author = \{Bo Pang and Lillian Lee\}, \cr
14 | #' title = \{Seeing stars: Exploiting class relationships for sentiment \cr
15 | #' categorization with respect to rating scales\}, \cr
16 | #' booktitle = \{Proceedings of the ACL\}, \cr
17 | #' year = 2005 \cr
18 | #' \}
19 | #'
20 | #' @inheritParams lexicon_afinn
21 | #' @return A tibble with 10,662 rows and 2 variables:
22 | #' \describe{
23 | #' \item{text}{Sentences or snippets}
24 | #' \item{sentiment}{Indicator for sentiment, "neg" for negative and "pos"
25 | #' for positive}
26 | #' }
27 | #' @source \url{https://www.cs.cornell.edu/people/pabo/movie-review-data/}
28 | #' @keywords datasets
29 | #' @family sentiment
30 | #' @export
31 | #' @examples
32 | #' \dontrun{
33 | #' dataset_sentence_polarity()
34 | #'
35 | #' # Custom directory
36 | #' dataset_sentence_polarity(dir = "data/")
37 | #'
38 | #' # Deleting dataset
39 | #' dataset_sentence_polarity(delete = TRUE)
40 | #'
41 | #' # Returning filepath of data
42 | #' dataset_sentence_polarity(return_path = TRUE)
43 | #' }
44 | #'
45 | #' @importFrom fs file_exists dir_exists dir_create path
46 | #' @importFrom readr read_rds
47 | #' @importFrom utils menu
48 | dataset_sentence_polarity <- function(dir = NULL, delete = FALSE,
49 | return_path = FALSE, clean = FALSE,
50 | manual_download = FALSE) {
51 | load_dataset(
52 | data_name = "sentence_polarity", name = "rt-polarity.rds",
53 | dir = dir, delete = delete, return_path = return_path,
54 | clean = clean, manual_download = manual_download
55 | )
56 | }
57 |
58 | #' @importFrom utils download.file
59 | download_sentence_polarity <- function(folder_path) {
60 | file_path <- path(folder_path, "rt-polaritydata.tar.gz")
61 | if (file_exists(file_path)) {
62 | return(invisible())
63 | }
64 | download.file(
65 | url = "https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz",
66 | destfile = file_path
67 | )
68 | }
69 |
70 | #' @importFrom readr read_tsv write_rds cols col_character col_double
71 | #' @importFrom tibble tibble
72 | process_sentence_polarity <- function(folder_path, name_path) {
73 | full_text <- read_lines(path(folder_path, "rt-polaritydata.tar.gz"))
74 |
75 | neq_text <- full_text[55:5385]
76 | neq_text[1] <- "simplistic , silly and tedious . "
77 | pos_text <- full_text[5386:10716]
78 | pos_text[1] <- "othe rock is destined to be the 21st century's new \" conan \" and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . "
79 |
80 | data <- tibble(
81 | text = c(neq_text, pos_text),
82 | sentiment = c(
83 | rep("neg", length(neq_text)),
84 | rep("pos", length(pos_text))
85 | )
86 | )
87 | write_rds(data, name_path)
88 | }
89 |
--------------------------------------------------------------------------------
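
Usage sketch for dataset_sentence_polarity() (illustrative only):

    library(textdata)

    # 10,662 snippets: 5,331 negative and 5,331 positive
    sp <- dataset_sentence_polarity()
    table(sp$sentiment)
    head(sp$text, 3)
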
/R/dataset_trec.R:
--------------------------------------------------------------------------------
1 | #' TREC dataset
2 | #'
3 | #' The TREC dataset is a dataset for question classification consisting of
4 | #' open-domain, fact-based questions divided into broad semantic categories.
5 | #' It has both a six-class (TREC-6) and a fifty-class (TREC-50) version. Both
6 | #' have 5,452 training examples and 500 test examples, but TREC-50 has
7 | #' finer-grained labels. Models are evaluated based on accuracy.
8 | #'
9 | #' The classes in TREC-6 are
10 | #'
11 | #' \itemize{
12 | #' \item ABBR - Abbreviation
13 | #' \item DESC - Description and abstract concepts
14 | #' \item ENTY - Entities
15 | #' \item HUM - Human beings
16 | #' \item LOC - Locations
17 | #' \item NUM - Numeric values
18 | #' }
19 | #'
20 | #' The classes in TREC-50 can be found at
21 | #' \url{https://cogcomp.seas.upenn.edu/Data/QA/QC/definition.html}.
22 | #'
23 | #' @source \url{https://cogcomp.seas.upenn.edu/Data/QA/QC/}
24 | #' @source \url{https://trec.nist.gov/data/qa.html}
25 | #' @inheritParams lexicon_afinn
26 | #' @param split Character. Return training ("train") data or testing ("test")
27 | #' data. Defaults to "train".
28 | #' @param version Character. Version 6 ("6") or version 50 ("50"). Defaults to
29 | #' "6".
30 | #' @return A tibble with 5,452 or 500 rows for "train" and "test"
31 | #' respectively and 2 variables:
32 | #' \describe{
33 | #' \item{class}{Character, denoting the class}
34 | #' \item{text}{Character, question text}
35 | #' }
36 | #' @keywords datasets
37 | #' @family topic
38 | #' @export
39 | #' @examples
40 | #' \dontrun{
41 | #' dataset_trec()
42 | #'
43 | #' # Custom directory
44 | #' dataset_trec(dir = "data/")
45 | #'
46 | #' # Deleting dataset
47 | #' dataset_trec(delete = TRUE)
48 | #'
49 | #' # Returning filepath of data
50 | #' dataset_trec(return_path = TRUE)
51 | #'
52 | #' # Access both training and testing dataset
53 | #' train_6 <- dataset_trec(split = "train")
54 | #' test_6 <- dataset_trec(split = "test")
55 | #'
56 | #' train_50 <- dataset_trec(split = "train", version = "50")
57 | #' test_50 <- dataset_trec(split = "test", version = "50")
58 | #' }
59 | #'
60 | #' @importFrom fs file_exists dir_exists dir_create path
61 | #' @importFrom readr read_rds
62 | #' @importFrom utils menu untar
63 | dataset_trec <- function(dir = NULL, split = c("train", "test"),
64 | version = c("6", "50"), delete = FALSE,
65 | return_path = FALSE, clean = FALSE,
66 | manual_download = FALSE) {
67 | all_files <- paste0(
68 | "trec_", rep(c("6", "50"), 2), "_",
69 | rep(c("train", "test"), each = 2), ".rds"
70 | )
71 | split <- match.arg(split)
72 | version <- match.arg(version)
73 | name <- paste0("trec_", version, "_", split, ".rds")
74 | load_dataset(
75 | data_name = "trec", name = name, dir = dir,
76 | delete = delete, return_path = return_path, clean = clean,
77 | clean_manual = all_files,
78 | manual_download = manual_download
79 | )
80 | }
81 |
82 | #' @importFrom utils download.file
83 | download_trec <- function(folder_path) {
84 | file_path_train <- path(folder_path, "train_5500.label")
85 | file_path_test <- path(folder_path, "TREC_10.label")
86 |
87 | if (file_exists(file_path_train) & file_exists(file_path_test)) {
88 | return(invisible())
89 | }
90 | download.file(
91 | url = "https://cogcomp.seas.upenn.edu/Data/QA/QC/train_5500.label",
92 | destfile = file_path_train
93 | )
94 | download.file(
95 | url = "https://cogcomp.seas.upenn.edu/Data/QA/QC/TREC_10.label",
96 | destfile = file_path_test
97 | )
98 | }
99 |
100 | #' @importFrom readr read_tsv write_rds cols col_character col_double
101 | #' @importFrom tibble tibble
102 | process_trec <- function(folder_path, name_path) {
103 | file_path_train <- path(folder_path, "train_5500.label")
104 | file_path_test <- path(folder_path, "TREC_10.label")
105 |
106 | # Test data
107 | data_test <- read_lines(file_path_test)
108 |
109 | text_test <- gsub("^\\S* ", "", data_test)
110 |
111 | label_test <- sub("\\s.*", "", data_test)
112 |
113 | trec6_label_test <- sapply(strsplit(label_test, ":"), function(x) x[1])
114 | trec50_label_test <- sapply(strsplit(label_test, ":"), function(x) x[2])
115 |
116 | trec_6_test <- tibble(
117 | class = trec6_label_test,
118 | text = text_test
119 | )
120 | trec_50_test <- tibble(
121 | class = trec50_label_test,
122 | text = text_test
123 | )
124 | # train data
125 | data_train <- read_lines(file_path_train)
126 |
127 | text_train <- gsub("^\\S* ", "", data_train)
128 |
129 | label_train <- sub("\\s.*", "", data_train)
130 |
131 | trec6_label_train <- sapply(strsplit(label_train, ":"), function(x) x[1])
132 | trec50_label_train <- sapply(strsplit(label_train, ":"), function(x) x[2])
133 |
134 | trec_6_train <- tibble(
135 | class = trec6_label_train,
136 | text = text_train
137 | )
138 | trec_50_train <- tibble(
139 | class = trec50_label_train,
140 | text = text_train
141 | )
142 |
143 | write_rds(trec_6_test, path(folder_path, "trec_6_test.rds"))
144 | write_rds(trec_6_train, path(folder_path, "trec_6_train.rds"))
145 |
146 | write_rds(trec_50_test, path(folder_path, "trec_50_test.rds"))
147 | write_rds(trec_50_train, path(folder_path, "trec_50_train.rds"))
148 | }
149 |
--------------------------------------------------------------------------------
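
Usage sketch for dataset_trec() showing the two label granularities (illustrative only):

    library(textdata)

    # Same questions, coarse (TREC-6) versus fine-grained (TREC-50) labels
    trec6  <- dataset_trec(split = "train", version = "6")
    trec50 <- dataset_trec(split = "train", version = "50")
    length(unique(trec6$class))   # the six coarse classes
    length(unique(trec50$class))  # fine-grained labels, up to fifty
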
/R/download_functions.R:
--------------------------------------------------------------------------------
1 | #' List of all download functions used in load_dataset
2 | #'
3 | #' @format Named list of all download functions
4 | #' @include lexicon_afinn.R lexicon_loughran.R lexicon_bing.R lexicon_nrc.R
5 | #' @include dataset_sentence_polarity.R dataset_ag_news.R dataset_dbpedia.R
6 | #' @include dataset_trec.R dataset_imdb.R lexicon_nrc_eil.R lexicon_nrc_vad.R
7 | #' @include embedding_glove.R
8 | #'
9 | #' @name download_functions
10 | #' @noRd
11 | NULL
12 |
13 | download_functions <- list(
14 | afinn = download_afinn,
15 | sentence_polarity = download_sentence_polarity,
16 | loughran = download_loughran,
17 | bing = download_bing,
18 | nrc = download_nrc,
19 | nrc_eil = download_nrc_eil,
20 | nrc_vad = download_nrc_vad,
21 | ag_news = download_ag_news,
22 | dbpedia = download_dbpedia,
23 | trec = download_trec,
24 | imdb = download_imdb,
25 | glove6b = download_glove6b,
26 | glove27b = download_glove27b,
27 | glove42b = download_glove42b,
28 | glove840b = download_glove840b
29 | )
30 |
--------------------------------------------------------------------------------
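
The list above acts as a simple registry: load_dataset() can look up the right downloader by the same key it receives as data_name. A toy sketch of that pattern (the names downloaders and fetch are hypothetical, not the package's internals):

    # A named list of functions, selected by key at run time
    downloaders <- list(
      afinn = function(dir) message("would fetch AFINN into ", dir),
      bing  = function(dir) message("would fetch Bing into ", dir)
    )

    fetch <- function(data_name, dir) downloaders[[data_name]](dir)
    fetch("afinn", tempdir())
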
/R/embedding_glove.R:
--------------------------------------------------------------------------------
1 | #' Global Vectors for Word Representation
2 | #'
3 | #' The GloVe pre-trained word vectors provide word embeddings created using
4 | #' varying numbers of tokens.
5 | #'
6 | #' Citation info:
7 | #'
8 | #' InProceedings\{pennington2014glove, \cr
9 | #' author = \{Jeffrey Pennington and Richard Socher and Christopher D. \cr
10 | #' Manning\}, \cr
11 | #' title = \{GloVe: Global Vectors for Word Representation\}, \cr
12 | #' booktitle = \{Empirical Methods in Natural Language Processing (EMNLP)\}, \cr
13 | #' year = 2014 \cr
14 | #' pages = \{1532-1543\} \cr
15 | #' url = \{http://www.aclweb.org/anthology/D14-1162\} \cr
16 | #' \}
17 | #'
18 | #' @references Jeffrey Pennington, Richard Socher, and Christopher D. Manning.
19 | #' 2014. GloVe: Global Vectors for Word Representation.
20 | #'
21 | #' @inheritParams lexicon_afinn
22 | #' @param dimensions A number indicating the dimensionality of the vectors to
23 | #' include. One of 50, 100, 200, or 300 for glove6b, or one of 25, 50, 100, or
24 | #' 200 for glove27b.
25 | #' @return A tibble with 400k, 1.9m, 2.2m, or 1.2m rows (one row for each unique
26 | #' token in the vocabulary) and the following variables:
27 | #' \describe{
28 | #' \item{token}{An individual token (usually a word)}
29 | #' \item{d1, d2, etc}{The embeddings for that token.}
30 | #' }
31 | #' @source \url{https://nlp.stanford.edu/projects/glove/}
32 | #' @keywords datasets
33 | #' @family embeddings
34 | #' @examples
35 | #' \dontrun{
36 | #' embedding_glove6b(dimensions = 50)
37 | #'
38 | #' # Custom directory
39 | #' embedding_glove42b(dir = "data/")
40 | #'
41 | #' # Deleting dataset
42 | #' embedding_glove6b(delete = TRUE, dimensions = 300)
43 | #'
44 | #' # Returning filepath of data
45 | #' embedding_glove840b(return_path = TRUE)
46 | #' }
47 | #' @name embedding_glove
48 | NULL
49 |
50 | #' @rdname embedding_glove
51 | #' @export
52 | #' @importFrom fs file_exists dir_exists dir_create path
53 | #' @importFrom readr read_rds
54 | #' @importFrom utils menu
55 | embedding_glove6b <- function(dir = NULL,
56 | dimensions = c(50, 100, 200, 300),
57 | delete = FALSE,
58 | return_path = FALSE,
59 | clean = FALSE,
60 | manual_download = FALSE) {
61 | this_glove <- "6b"
62 | available_dims <- c(50, 100, 200, 300)
63 | all_names <- construct_glove_name(this_glove, available_dims)
64 | dimensions <- as.character(dimensions)
65 | dimensions <- match.arg(dimensions, as.character(available_dims))
66 | name <- construct_glove_name(this_glove, dimensions)
67 | load_dataset(
68 | data_name = "glove6b", name = name, dir = dir,
69 | delete = delete, return_path = return_path, clean = clean,
70 | clean_manual = all_names,
71 | manual_download = manual_download
72 | )
73 | }
74 |
75 | #' @keywords internal
76 | construct_glove_name <- function(tokens = c("6b", "27b"),
77 | dimensions = c(25, 50, 100, 200, 300)) {
78 | tokens <- match.arg(tokens)
79 | dimensions <- as.character(dimensions)
80 | dimensions <- match.arg(
81 | dimensions,
82 | choices = as.character(c(25, 50, 100, 200, 300)),
83 | several.ok = TRUE
84 | )
85 | paste0(
86 | paste(
87 | "glove",
88 | tokens,
89 | dimensions,
90 | sep = "_"
91 | ),
92 | ".rds"
93 | )
94 | }
95 |
96 | #' @rdname embedding_glove
97 | #' @export
98 | #' @importFrom fs file_exists dir_exists dir_create path
99 | #' @importFrom readr read_rds
100 | #' @importFrom utils menu
101 | embedding_glove27b <- function(dir = NULL,
102 | dimensions = c(25, 50, 100, 200),
103 | delete = FALSE,
104 | return_path = FALSE,
105 | clean = FALSE,
106 | manual_download = FALSE) {
107 | this_glove <- "27b"
108 | available_dims <- c(25, 50, 100, 200)
109 | all_names <- construct_glove_name(this_glove, available_dims)
110 | dimensions <- as.character(dimensions)
111 | dimensions <- match.arg(dimensions, as.character(available_dims))
112 | name <- construct_glove_name(this_glove, dimensions)
113 | load_dataset(
114 | data_name = "glove27b", name = name, dir = dir,
115 | delete = delete, return_path = return_path, clean = clean,
116 | clean_manual = all_names,
117 | manual_download = manual_download
118 | )
119 | }
120 |
121 | #' @rdname embedding_glove
122 | #' @export
123 | #' @importFrom fs file_exists dir_exists dir_create path
124 | #' @importFrom readr read_rds
125 | #' @importFrom utils menu
126 | embedding_glove42b <- function(dir = NULL,
127 | delete = FALSE,
128 | return_path = FALSE,
129 | clean = FALSE,
130 | manual_download = FALSE) {
131 | name <- "glove_42b.rds"
132 | load_dataset(
133 | data_name = "glove42b", name = name, dir = dir,
134 | delete = delete, return_path = return_path, clean = clean,
135 | manual_download = manual_download
136 | )
137 | }
138 |
139 | #' @rdname embedding_glove
140 | #' @export
141 | #' @importFrom fs file_exists dir_exists dir_create path
142 | #' @importFrom readr read_rds
143 | #' @importFrom utils menu
144 | embedding_glove840b <- function(dir = NULL,
145 | delete = FALSE,
146 | return_path = FALSE,
147 | clean = FALSE,
148 | manual_download = FALSE) {
149 | name <- "glove_840b.rds"
150 | load_dataset(
151 | data_name = "glove840b", name = name, dir = dir,
152 | delete = delete, return_path = return_path, clean = clean,
153 | manual_download = manual_download
154 | )
155 | }
156 |
157 | #' @importFrom utils download.file
158 | #' @keywords internal
159 | download_glove6b <- function(folder_path) {
160 | file_path <- path(folder_path, "glove.6B.zip")
161 | if (file_exists(file_path)) {
162 | return(invisible())
163 | }
164 | download.file(
165 | url = "http://nlp.stanford.edu/data/glove.6B.zip",
166 | destfile = file_path
167 | )
168 | }
169 |
170 | #' @importFrom utils download.file
171 | #' @keywords internal
172 | download_glove42b <- function(folder_path) {
173 | file_path <- path(folder_path, "glove.42B.300d.zip")
174 | if (file_exists(file_path)) {
175 | return(invisible())
176 | }
177 | download.file(
178 | url = "http://nlp.stanford.edu/data/glove.42B.300d.zip",
179 | destfile = file_path
180 | )
181 | }
182 |
183 | #' @importFrom utils download.file
184 | #' @keywords internal
185 | download_glove840b <- function(folder_path) {
186 | file_path <- path(folder_path, "glove.840B.300d.zip")
187 | if (file_exists(file_path)) {
188 | return(invisible())
189 | }
190 | download.file(
191 | url = "http://nlp.stanford.edu/data/glove.840B.300d.zip",
192 | destfile = file_path
193 | )
194 | }
195 |
196 | #' @importFrom utils download.file
197 | #' @keywords internal
198 | download_glove27b <- function(folder_path) {
199 | file_path <- path(folder_path, "glove.twitter.27B.zip")
200 | if (file_exists(file_path)) {
201 | return(invisible())
202 | }
203 | download.file(
204 | url = "http://nlp.stanford.edu/data/glove.twitter.27B.zip",
205 | destfile = file_path
206 | )
207 | }
208 |
209 | #' @keywords internal
210 | process_glove6b <- function(folder_path, name_path) {
211 | # Processing all datasets when they only need one adds time. We'll
212 | # specifically deal with the one they requested, which means we need to
213 | # extract the dimensions back out of the name to build the raw filename.
214 | filename <- gsub(folder_path, "", name_path)
215 | dimensions <- unlist(strsplit(filename, "_|\\."))[[3]]
216 | raw_name <- paste0("glove.6B.", dimensions, "d.txt")
217 | file <- unz(path(folder_path, "glove.6B.zip"), raw_name)
218 |
219 | write_glove(file, name_path, dimensions)
220 | }
221 |
222 | #' @keywords internal
223 | process_glove42b <- function(folder_path, name_path) {
224 | dimensions <- 300
225 | raw_name <- "glove.42B.300d.txt"
226 | file <- unz(path(folder_path, "glove.42B.300d.zip"), raw_name)
227 |
228 | write_glove(file, name_path, dimensions)
229 | }
230 |
231 | #' @keywords internal
232 | process_glove840b <- function(folder_path, name_path) {
233 | dimensions <- 300
234 | raw_name <- "glove.840B.300d.txt"
235 | file <- unz(path(folder_path, "glove.840B.300d.zip"), raw_name)
236 |
237 | write_glove(file, name_path, dimensions)
238 | }
239 |
240 | #' @keywords internal
241 | process_glove27b <- function(folder_path, name_path) {
242 | filename <- gsub(folder_path, "", name_path)
243 | dimensions <- unlist(strsplit(filename, "_|\\."))[[3]]
244 | raw_name <- paste0("glove.twitter.27B.", dimensions, "d.txt")
245 |
246 | file <- unz(path(folder_path, "glove.twitter.27B.zip"), raw_name)
247 |
248 | write_glove(file, name_path, dimensions)
249 | }
250 |
251 | #' @importFrom readr read_delim write_rds
252 | #' @keywords internal
253 | write_glove <- function(file, name_path, dimensions) {
254 | embeddings <- read_delim(
255 | file,
256 | delim = " ",
257 | quote = "",
258 | col_names = c(
259 | "token",
260 | paste0("d", seq_len(dimensions))
261 | ),
262 | col_types = paste0(
263 | c(
264 | "c",
265 | rep("d", dimensions)
266 | ),
267 | collapse = ""
268 | )
269 | )
270 |
271 | write_rds(embeddings, name_path)
272 | }
273 |
--------------------------------------------------------------------------------
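
Usage sketch for the GloVe loaders (illustrative only; the zip downloads range from roughly 800 MB to 2 GB, as listed in the catalogue entries in R/info.R below):

    library(textdata)

    # Smallest option: 6B tokens, 50-dimensional vectors
    glove <- embedding_glove6b(dimensions = 50)

    # Cosine similarity between two tokens, using the d1..d50 columns
    vec <- function(tok) unlist(glove[glove$token == tok, -1])
    cosine <- function(a, b) sum(a * b) / sqrt(sum(a^2) * sum(b^2))
    cosine(vec("king"), vec("queen"))
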
/R/info.R:
--------------------------------------------------------------------------------
1 | print_info <- list(
2 | afinn =
3 | list(
4 | name = "AFINN-111",
5 | url = "http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010",
6 | license = "Open Database License (ODbL) v1.0",
7 | size = "78 KB (cleaned 59 KB)",
8 | type = "lexicon",
9 | download_mech = "https",
10 | description = "",
11 | citation = NA
12 | ),
13 | sentence_polarity =
14 | list(
15 | name = "v1.0 sentence polarity",
16 | url = "http://www.cs.cornell.edu/people/pabo/movie-review-data",
17 | license = "Cite the paper when used.",
18 | size = "2 MB (cleaned 1.4 MB)",
19 | type = "dataset",
20 | download_mech = "https",
21 | description = "Dataset with sentences labeled with negative or positive sentiment.",
22 | citation = NA
23 | ),
24 | loughran =
25 | list(
26 | name = "Loughran-McDonald Sentiment lexicon",
27 | url = "https://sraf.nd.edu/textual-analysis/resources/",
28 | license = "License required for commercial use. Please contact tloughra@nd.edu.",
29 | size = "6.7 MB (cleaned 142 KB)",
30 | type = "lexicon",
31 | download_mech = "https",
32 | description = "",
33 | citation = NA
34 | ),
35 | bing =
36 | list(
37 | name = "Bing Sentiment Lexicon",
38 | url = "https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html",
39 | license = "May be used (research, commercial, etc) with attribution.",
40 | size = "287 KB (cleaned 220 KB)",
41 | type = "lexicon",
42 | download_mech = "http",
43 | description = "",
44 | citation = NA
45 | ),
46 | nrc =
47 | list(
48 | name = "NRC Word-Emotion Association Lexicon",
49 | url = "http://saifmohammad.com/WebPages/lexicons.html",
50 | license = "License required for commercial use. Please contact Saif M. Mohammad (saif.mohammad@nrc-cnrc.gc.ca).",
51 | size = "22.8 MB (cleaned 424 KB)",
52 | type = "lexicon",
53 | download_mech = "http",
54 | description = "",
55 | citation = "Citation info:
56 |
57 | This dataset was published in Saif M. Mohammad and Peter Turney. (2013), ``Crowdsourcing a Word-Emotion Association Lexicon.'' Computational Intelligence, 29(3): 436-465.
58 |
59 | article{mohammad13,
60 | author = {Mohammad, Saif M. and Turney, Peter D.},
61 | title = {Crowdsourcing a Word-Emotion Association Lexicon},
62 | journal = {Computational Intelligence},
63 | volume = {29},
64 | number = {3},
65 | pages = {436-465},
66 | doi = {10.1111/j.1467-8640.2012.00460.x},
67 | url = {https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-8640.2012.00460.x},
68 | eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1467-8640.2012.00460.x},
69 | year = {2013}
70 | }
71 | If you use this lexicon, then please cite it."
72 | ),
73 | nrc_eil =
74 | list(
75 | name = "NRC Emotion Intensity Lexicon (aka Affect Intensity Lexicon)",
76 | url = "www.saifmohammad.com/WebPages/AffectIntensity.htm",
77 | license = "License required for commercial use. Please contact Saif M. Mohammad (saif.mohammad@nrc-cnrc.gc.ca).",
78 | size = "333 KB (cleaned 212 KB)",
79 | type = "lexicon",
80 | download_mech = "http",
81 | description = "",
82 | citation = "Citation info:
83 | Details of the lexicon are in this paper.
84 | Word Affect Intensities. Saif M. Mohammad. arXiv preprint arXiv, April 2017.
85 |
86 | inproceedings{LREC18-AIL,
87 | author = {Mohammad, Saif M.},
88 | title = {Word Affect Intensities},
89 | booktitle = {Proceedings of the 11th Edition of the Language Resources and Evaluation Conference (LREC-2018)},
90 | year = {2018},
91 | address={Miyazaki, Japan}
92 | }
93 |
94 | If you use this lexicon, then please cite it."
95 | ),
96 | nrc_vad =
97 | list(
98 | name = "The NRC Valence, Arousal, and Dominance Lexicon",
99 | url = "https://saifmohammad.com/WebPages/nrc-vad.html",
100 | license = "License required for commercial use. Please contact Saif M. Mohammad (saif.mohammad@nrc-cnrc.gc.ca).",
101 | size = "150.8 MB (cleaned 792 KB)",
102 | type = "lexicon",
103 | download_mech = "http",
104 | description = "",
105 | citation = "Citation info:
106 |
107 | inproceedings{vad-acl2018,
108 | title={Obtaining Reliable Human Ratings of Valence, Arousal, and Dominance for 20,000 English Words},
109 | author={Mohammad, Saif M.},
110 | booktitle={Proceedings of The Annual Conference of the Association for Computational Linguistics (ACL)},
111 | year={2018},
112 | address={Melbourne, Australia}
113 | }
114 |
115 | If you use this lexicon, then please cite it."
116 | ),
117 | ag_news =
118 | list(
119 | name = "AG News",
120 | url = "https://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html",
121 | license = "You are encouraged to download this corpus for any non-commercial use.",
122 | size = "64.4 MB (cleaned 33.9 MB)",
123 | type = "dataset",
124 | download_mech = "https",
125 | description = "",
126 | citation = NA
127 | ),
128 | dbpedia =
129 | list(
130 | name = "DBpedia",
131 | url = "https://wiki.dbpedia.org/",
132 | license = "Creative Commons Attribution-ShareAlike 3.0 License",
133 | size = "279.5 MB (cleaned 211.1 MB)",
134 | type = "dataset",
135 | download_mech = "https",
136 | description = "",
137 | citation = NA
138 | ),
139 | trec =
140 | list(
141 | name = "TREC-6 & TREC-50",
142 | url = "https://cogcomp.seas.upenn.edu/Data/QA/QC/",
143 | license = "Freely reusable public information licence",
144 | size = "1.2 MB (cleaned 827 KB)",
145 | type = "dataset",
146 | download_mech = "https",
147 | description = "",
148 | citation = NA
149 | ),
150 | imdb =
151 | list(
152 | name = "IMDb Large Movie Review Dataset",
153 | url = "http://ai.stanford.edu/~amaas/data/sentiment/",
154 | license = "No license specified, the work may be protected by copyright.",
155 | size = "376.4 MB (cleaned 71 MB)",
156 | type = "dataset",
157 | download_mech = "http",
158 | description = "",
159 | citation = NA
160 | ),
161 | glove6b =
162 | list(
163 | name = "GloVe 6B",
164 | url = "https://nlp.stanford.edu/projects/glove/",
165 | license = "Public Domain Dedication and License v1.0",
166 | size = "822.2 MB (158MB, 311MB, 616MB, and 921MB processed)",
167 | type = "embeddings",
168 | download_mech = "https",
169 | description = "Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 50d, 100d, 200d, & 300d vectors)",
170 | citation = "Citation info:
171 | inproceedings{pennington2014glove,
172 | author = {Jeffrey Pennington and Richard Socher and Christopher D. Manning},
173 | booktitle = {Empirical Methods in Natural Language Processing (EMNLP)},
174 | title = {GloVe: Global Vectors for Word Representation},
175 | year = {2014},
176 | pages = {1532--1543},
177 | url = {http://www.aclweb.org/anthology/D14-1162},
178 | }"
179 | ),
180 | glove27b =
181 | list(
182 | name = "GloVe Twitter 27B",
183 | url = "https://nlp.stanford.edu/projects/glove/",
184 | license = "Public Domain Dedication and License v1.0",
185 | size = "1.42 GB (248MB, 476MB, 931MB, and 1.79GB processed)",
186 | type = "embeddings",
187 | download_mech = "https",
188 | description = "Twitter (2B tweets, 27B tokens, 1.2M vocab, uncased, 25d, 50d, 100d, & 200d vectors)",
189 | citation = "Citation info:
190 | inproceedings{pennington2014glove,
191 | author = {Jeffrey Pennington and Richard Socher and Christopher D. Manning},
192 | booktitle = {Empirical Methods in Natural Language Processing (EMNLP)},
193 | title = {GloVe: Global Vectors for Word Representation},
194 | year = {2014},
195 | pages = {1532--1543},
196 | url = {http://www.aclweb.org/anthology/D14-1162},
197 | }"
198 | ),
199 | glove42b =
200 | list(
201 | name = "GloVe Common Crawl 42B",
202 | url = "https://nlp.stanford.edu/projects/glove/",
203 | license = "Public Domain Dedication and License v1.0",
204 | size = "1.75 GB (4.31GB processed)",
205 | type = "embeddings",
206 | download_mech = "https",
207 | description = "Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors)",
208 | citation = "Citation info:
209 | inproceedings{pennington2014glove,
210 | author = {Jeffrey Pennington and Richard Socher and Christopher D. Manning},
211 | booktitle = {Empirical Methods in Natural Language Processing (EMNLP)},
212 | title = {GloVe: Global Vectors for Word Representation},
213 | year = {2014},
214 | pages = {1532--1543},
215 | url = {http://www.aclweb.org/anthology/D14-1162},
216 | }"
217 | ),
218 | glove840b =
219 | list(
220 | name = "GloVe Common Crawl 840B",
221 | url = "https://nlp.stanford.edu/projects/glove/",
222 | license = "Public Domain Dedication and License v1.0",
223 | size = "2.03 GB (4.94GB processed)",
224 | type = "embeddings",
225 | download_mech = "https",
226 | description = "Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors)",
227 | citation = "Citation info:
228 | inproceedings{pennington2014glove,
229 | author = {Jeffrey Pennington and Richard Socher and Christopher D. Manning},
230 | booktitle = {Empirical Methods in Natural Language Processing (EMNLP)},
231 | title = {GloVe: Global Vectors for Word Representation},
232 | year = {2014},
233 | pages = {1532--1543},
234 | url = {http://www.aclweb.org/anthology/D14-1162},
235 | }"
236 | )
237 | )
238 |
239 | #' Catalogue of all available data sources
240 | #' @export
241 | "catalogue"
242 | catalogue <- Reduce(rbind, lapply(print_info, as.data.frame,
243 | stringsAsFactors = FALSE
244 | ))
245 |
--------------------------------------------------------------------------------
/R/lexicon_afinn.R:
--------------------------------------------------------------------------------
1 | #' AFINN-111 dataset
2 | #'
3 | #' AFINN is a lexicon of English words rated for valence with an integer
4 | #' between minus five (negative) and plus five (positive). The words have
5 | #' been manually labeled by Finn Årup Nielsen in 2009-2011.
6 | #'
7 | #' This dataset is the newest version with 2477 words and phrases.
8 | #'
9 | #' Citation info:
10 | #'
11 | #' This dataset was published in Finn Årup Nielsen (2011),
12 | #' ``A new ANEW: Evaluation of a word list for sentiment analysis in
13 | #' microblogs'', Proceedings of the ESWC2011 Workshop on
14 | #' 'Making Sense of Microposts': Big things come in small packages (2011) 93-98.
15 | #'
16 | #' article\{nielsen11, \cr
17 | #' author    = \{Finn Årup Nielsen\}, \cr
18 | #' title     = \{A new ANEW: Evaluation of a word list for sentiment analysis in microblogs\}, \cr
19 | #' journal = \{CoRR\}, \cr
20 | #' volume = \{abs/1103.2903\}, \cr
21 | #' year = \{2011\}, \cr
22 | #' url = \{http://arxiv.org/abs/1103.2903\}, \cr
23 | #' archivePrefix = \{arXiv\}, \cr
24 | #' eprint = \{1103.2903\}, \cr
25 | #' biburl = \{https://dblp.org/rec/bib/journals/corr/abs-1103-2903\}, \cr
26 | #' bibsource = \{dblp computer science bibliography, https://dblp.org\} \cr
27 | #' \}
28 | #'
29 | #' @param dir Character, path to directory where data will be stored. If
30 | #' \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.
31 | #' @param delete Logical, set \code{TRUE} to delete dataset.
32 | #' @param return_path Logical, set \code{TRUE} to return the path of the dataset.
33 | #' @param clean Logical, set \code{TRUE} to remove intermediate files. This can
34 | #' greatly reduce the size. Defaults to FALSE.
35 | #' @param manual_download Logical, set \code{TRUE} if you have manually
36 | #' downloaded the file and placed it in the folder designated by running
37 | #' this function with \code{return_path = TRUE}.
38 | #' @return A tibble with 2,477 rows and 2 variables:
39 | #' \describe{
40 | #' \item{word}{An English word}
41 | #'   \item{value}{Indicator for sentiment: integer between -5 and +5}
42 | #' }
43 | #'
44 | #' @keywords datasets
45 | #' @family lexicon
46 | #' @importFrom fs file_exists dir_exists dir_create
47 | #' @importFrom readr read_rds
48 | #' @importFrom utils menu
49 | #' @export
50 | #' @examples
51 | #' \dontrun{
52 | #' lexicon_afinn()
53 | #'
54 | #' # Custom directory
55 | #' lexicon_afinn(dir = "data/")
56 | #'
57 | #' # Deleting dataset
58 | #' lexicon_afinn(delete = TRUE)
59 | #'
60 | #' # Returning filepath of data
61 | #' lexicon_afinn(return_path = TRUE)
62 | #' }
63 | lexicon_afinn <- function(dir = NULL, delete = FALSE, return_path = FALSE,
64 | clean = FALSE, manual_download = FALSE) {
65 | load_dataset(
66 | data_name = "afinn", name = "afinn_111.rds", dir = dir,
67 | delete = delete, return_path = return_path, clean = clean,
68 | manual_download = manual_download
69 | )
70 | }
71 |
72 | #' @importFrom utils download.file
73 | download_afinn <- function(folder_path) {
74 | file_path <- path(folder_path, "imm6010.zip")
75 | if (file_exists(file_path)) {
76 | return(invisible())
77 | }
78 | download.file(
79 | url = "http://www2.imm.dtu.dk/pubdb/views/edoc_download.php/6010/zip/imm6010.zip",
80 | destfile = file_path
81 | )
82 | }
83 |
84 | #' @importFrom readr read_tsv write_rds cols col_character col_double
85 | process_afinn <- function(folder_path, name_path) {
86 | file <- unz(path(folder_path, "imm6010.zip"), "AFINN/AFINN-111.txt")
87 | data <- read_tsv(file,
88 | col_types = cols(
89 | word = col_character(),
90 | value = col_double()
91 | ),
92 | col_names = c("word", "value")
93 | )
94 | write_rds(data, name_path)
95 | }
96 |
--------------------------------------------------------------------------------
/R/lexicon_bing.R:
--------------------------------------------------------------------------------
1 | #' Bing sentiment lexicon
2 | #'
3 | #' General purpose English sentiment lexicon that categorizes words in a
4 | #' binary fashion, either positive or negative
5 | #'
6 | #' Citation info:
7 | #'
8 | #' This dataset was first published in Minqing Hu and Bing Liu, ``Mining and
9 | #' summarizing customer reviews.'', Proceedings of the ACM SIGKDD International
10 | #' Conference on Knowledge Discovery & Data Mining (KDD-2004), 2004.
11 | #'
12 | #' inproceedings\{Hu04, \cr
13 | #' author = \{Hu, Minqing and Liu, Bing\}, \cr
14 | #' title = \{Mining and Summarizing Customer Reviews\}, \cr
15 | #' booktitle = \{Proceedings of the Tenth ACM SIGKDD International Conference
16 | #' on Knowledge Discovery and Data Mining\}, \cr
17 | #' series = \{KDD '04\}, \cr
18 | #' year = \{2004\}, \cr
19 | #' isbn = \{1-58113-888-1\}, \cr
20 | #' location = \{Seattle, WA, USA\}, \cr
21 | #' pages = \{168--177\}, \cr
22 | #' numpages = \{10\}, \cr
23 | #' url = \{http://doi.acm.org/10.1145/1014052.1014073\}, \cr
24 | #' doi = \{10.1145/1014052.1014073\}, \cr
25 | #' acmid = \{1014073\}, \cr
26 | #' publisher = \{ACM\}, \cr
27 | #' address = \{New York, NY, USA\}, \cr
28 | #' keywords = \{reviews, sentiment classification, summarization, text mining\}, \cr
29 | #' \}
30 | #'
31 | #' @inheritParams lexicon_afinn
32 | #' @return A tibble with 6,787 rows and 2 variables:
33 | #' \describe{
34 | #' \item{word}{An English word}
35 | #' \item{sentiment}{Indicator for sentiment: "negative" or "positive"}
36 | #' }
37 | #'
38 | #' @source \url{https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html}
39 | #' @keywords datasets
40 | #' @family lexicon
41 | #' @importFrom fs file_exists dir_exists dir_create
42 | #' @importFrom readr read_rds
43 | #' @importFrom utils menu
44 | #' @export
45 | #' @examples
46 | #' \dontrun{
47 | #' lexicon_bing()
48 | #'
49 | #' # Custom directory
50 | #' lexicon_bing(dir = "data/")
51 | #'
52 | #' # Deleting dataset
53 | #' lexicon_bing(delete = TRUE)
54 | #'
55 | #' # Returning filepath of data
56 | #' lexicon_bing(return_path = TRUE)
57 | #' }
58 | lexicon_bing <- function(dir = NULL, delete = FALSE, return_path = FALSE,
59 | clean = FALSE, manual_download = FALSE) {
60 | load_dataset(
61 | data_name = "bing", name = "bing.rds", dir = dir,
62 | delete = delete, return_path = return_path, clean = clean,
63 | manual_download = manual_download
64 | )
65 | }
66 |
67 |
68 | #' @importFrom utils download.file
69 | #' @importFrom fs path
70 | download_bing <- function(folder_path) {
71 | file_path_neg <- path(folder_path, "negative-words.txt")
72 | file_path_pos <- path(folder_path, "positive-words.txt")
73 |
74 |   if (file_exists(file_path_pos) && file_exists(file_path_neg)) {
75 | return(invisible())
76 | }
77 |
78 | download.file(
79 | url = "http://ptrckprry.com/course/ssd/data/negative-words.txt",
80 | destfile = file_path_neg
81 | )
82 | download.file(
83 | url = "http://ptrckprry.com/course/ssd/data/positive-words.txt",
84 | destfile = file_path_pos
85 | )
86 | }
87 |
88 | #' @importFrom readr read_lines
89 | process_bing <- function(folder_path, name_path) {
90 | file_path_neg <- path(folder_path, "negative-words.txt")
91 | file_path_pos <- path(folder_path, "positive-words.txt")
92 |
93 | neg_words <- read_lines(file_path_neg, skip = 35)
94 | pos_words <- read_lines(file_path_pos, skip = 35)
95 |
96 | data <- tibble(
97 | word = c(neg_words, pos_words),
98 | sentiment = rep(
99 | c("negative", "positive"),
100 | c(length(neg_words), length(pos_words))
101 | )
102 | )
103 |
104 | write_rds(data, name_path)
105 | }
106 |
--------------------------------------------------------------------------------
/R/lexicon_loughran.R:
--------------------------------------------------------------------------------
1 | #' Loughran-McDonald sentiment lexicon
2 | #'
3 | #' English sentiment lexicon created for use with financial documents. This
4 | #' lexicon labels words with six possible sentiments important in financial
5 | #' contexts: "negative", "positive", "litigious", "uncertainty", "constraining",
6 | #' or "superfluous".
7 | #'
8 | #' Citation info:
9 | #'
10 | #' This dataset was published in Loughran, T. and McDonald, B. (2011),
11 | #' ``When Is a Liability Not a Liability? Textual Analysis, Dictionaries, and
12 | #' 10-Ks.'' The Journal of Finance, 66: 35-65.
13 | #'
14 | #' article\{loughran11, \cr
15 | #' author = \{Loughran, Tim and McDonald, Bill\}, \cr
16 | #' title = \{When Is a Liability Not a Liability? Textual Analysis, Dictionaries, and 10-Ks\}, \cr
17 | #' journal = \{The Journal of Finance\}, \cr
18 | #' volume = \{66\}, \cr
19 | #' number = \{1\}, \cr
20 | #' pages = \{35-65\}, \cr
21 | #' doi = \{10.1111/j.1540-6261.2010.01625.x\}, \cr
22 | #' url = \{https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1540-6261.2010.01625.x\}, \cr
23 | #' eprint = \{https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1540-6261.2010.01625.x\}, \cr
24 | #' year = \{2011\} \cr
25 | #' \}
26 | #'
27 | #'
28 | #' @inheritParams lexicon_afinn
29 | #' @return A tibble with 4,150 rows and 2 variables:
30 | #' \describe{
31 | #' \item{word}{An English word}
32 | #' \item{sentiment}{Indicator for sentiment: "negative", "positive",
33 | #' "litigious", "uncertainty", "constraining", or "superfluous"}
34 | #' }
35 | #'
36 | #' @source \url{https://sraf.nd.edu/loughranmcdonald-master-dictionary/}
37 | #' @keywords datasets
38 | #' @family lexicon
39 | #' @importFrom fs file_exists dir_exists dir_create path
40 | #' @importFrom readr read_rds
41 | #' @importFrom utils menu
42 | #' @export
43 | #' @examples
44 | #' \dontrun{
45 | #' lexicon_loughran()
46 | #'
47 | #' # Custom directory
48 | #' lexicon_loughran(dir = "data/")
49 | #'
50 | #' # Deleting dataset
51 | #' lexicon_loughran(delete = TRUE)
52 | #'
53 | #' # Returning filepath of data
54 | #' lexicon_loughran(return_path = TRUE)
55 | #' }
56 | lexicon_loughran <- function(dir = NULL, delete = FALSE, return_path = FALSE,
57 | clean = FALSE, manual_download = FALSE) {
58 | load_dataset(
59 | data_name = "loughran", name = "LoughranMcDonald.rds", dir = dir,
60 | delete = delete, return_path = return_path, clean = clean,
61 | manual_download = manual_download
62 | )
63 | }
64 |
65 | #' @importFrom utils download.file
66 | download_loughran <- function(folder_path) {
67 | file_path <- path(
68 | folder_path,
69 | "LoughranMcDonald_MasterDictionary_2018 - LoughranMcDonald_MasterDictionary_2018.csv"
70 | )
71 | if (file_exists(file_path)) {
72 | return(invisible())
73 | }
74 | download.file(
75 | url = "https://drive.google.com/uc?id=12ECPJMxV2wSalXG8ykMmkpa1fq_ur0Rf&export=download",
76 | destfile = file_path
77 | )
78 | }
79 | #' @importFrom readr read_csv cols_only col_character col_double
80 | process_loughran <- function(folder_path, name_path) {
81 | data <- read_csv(path(folder_path, "LoughranMcDonald_MasterDictionary_2018 - LoughranMcDonald_MasterDictionary_2018.csv"),
82 | col_types = cols_only(
83 | Word = col_character(),
84 | Negative = col_double(),
85 | Positive = col_double(),
86 | Uncertainty = col_double(),
87 | Litigious = col_double(),
88 | Constraining = col_double(),
89 | Superfluous = col_double()
90 | )
91 | )
92 |
93 | types <- c("Negative", "Positive", "Uncertainty", "Litigious", "Constraining", "Superfluous")
94 |
95 | out <- list()
96 | for (type in types) {
97 | out[[type]] <- tibble(
98 | word = tolower(as.character(data$Word[data[[type]] != 0])),
99 | sentiment = tolower(type)
100 | )
101 | }
102 |
103 | write_rds(Reduce(rbind, out), name_path)
104 | }
105 |
--------------------------------------------------------------------------------
/R/lexicon_nrc.R:
--------------------------------------------------------------------------------
1 | #' NRC word-emotion association lexicon
2 | #'
3 | #' General purpose English sentiment/emotion lexicon. This lexicon labels words
4 | #' with ten possible sentiments or emotions: "negative", "positive", "anger",
5 | #' "anticipation", "disgust", "fear", "joy", "sadness", "surprise", or "trust".
6 | #' The annotations were manually done through Amazon's Mechanical Turk.
7 | #'
8 | #' License required for commercial use. Please contact Saif M. Mohammad
9 | #' (saif.mohammad@nrc-cnrc.gc.ca).
10 | #'
11 | #' Citation info:
12 | #'
13 | #' This dataset was published in Saif Mohammad and Peter Turney. (2013),
14 | #' ``Crowdsourcing a Word-Emotion Association Lexicon.'' Computational
15 | #' Intelligence, 29(3): 436-465.
16 | #'
17 | #' article\{mohammad13, \cr
18 | #' author = \{Mohammad, Saif M. and Turney, Peter D.\}, \cr
19 | #' title = \{CROWDSOURCING A WORD–EMOTION ASSOCIATION LEXICON\}, \cr
20 | #' journal = \{Computational Intelligence\}, \cr
21 | #' volume = \{29\}, \cr
22 | #' number = \{3\}, \cr
23 | #' pages = \{436-465\}, \cr
24 | #' doi = \{10.1111/j.1467-8640.2012.00460.x\}, \cr
25 | #' url = \{https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-8640.2012.00460.x\}, \cr
26 | #' eprint = \{https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1467-8640.2012.00460.x\}, \cr
27 | #' year = \{2013\} \cr
28 | #' \}
29 | #'
30 | #'
31 | #'
32 | #' @inheritParams lexicon_afinn
33 | #' @return A tibble with 13,901 rows and 2 variables:
34 | #' \describe{
35 | #' \item{word}{An English word}
36 | #' \item{sentiment}{Indicator for sentiment or emotion: "negative",
37 | #' "positive", "anger", "anticipation", "disgust", "fear", "joy", "sadness",
38 | #' "surprise", or "trust"}
39 | #' }
40 | #'
41 | #' @source \url{http://saifmohammad.com/WebPages/lexicons.html}
42 | #' @keywords datasets
43 | #' @family lexicon
44 | #' @importFrom fs file_exists dir_exists dir_create path
45 | #' @importFrom readr read_rds
46 | #' @importFrom utils menu
47 | #' @export
48 | #' @examples
49 | #' \dontrun{
50 | #' lexicon_nrc()
51 | #'
52 | #' # Custom directory
53 | #' lexicon_nrc(dir = "data/")
54 | #'
55 | #' # Deleting dataset
56 | #' lexicon_nrc(delete = TRUE)
57 | #'
58 | #' # Returning filepath of data
59 | #' lexicon_nrc(return_path = TRUE)
60 | #' }
61 | lexicon_nrc <- function(dir = NULL, delete = FALSE, return_path = FALSE,
62 | clean = FALSE, manual_download = FALSE) {
63 | load_dataset(
64 | data_name = "nrc", name = "NRCWordEmotion.rds", dir = dir,
65 | delete = delete, return_path = return_path, clean = clean,
66 | manual_download = manual_download
67 | )
68 | }
69 |
70 | #' @importFrom utils download.file
71 | download_nrc <- function(folder_path) {
72 | file_path <- path(
73 | folder_path,
74 | "NRC-Emotion-Lexicon.zip"
75 | )
76 | if (file_exists(file_path)) {
77 | return(invisible())
78 | }
79 | download.file(
80 | url = "http://saifmohammad.com/WebDocs/Lexicons/NRC-Emotion-Lexicon.zip",
81 | destfile = file_path
82 | )
83 | unzip(path(folder_path, "NRC-Emotion-Lexicon.zip"),
84 | exdir = folder_path
85 | )
86 | }
87 |
88 | #' @importFrom readr read_tsv
89 | #' @importFrom utils unzip
90 |
91 | process_nrc <- function(folder_path, name_path) {
92 | data <- read_tsv(path(
93 | folder_path,
94 | "NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"
95 | ),
96 | col_names = FALSE, col_types = cols(
97 | X1 = col_character(),
98 | X2 = col_character(),
99 | X3 = col_double()
100 | )
101 | )
102 |
103 | data <- data[data$X3 == 1, ]
104 | data <- tibble(
105 | word = data$X1,
106 | sentiment = data$X2
107 | )
108 |
109 | write_rds(data, name_path)
110 | }
111 |
--------------------------------------------------------------------------------
/R/lexicon_nrc_eil.R:
--------------------------------------------------------------------------------
1 | #' NRC Emotion Intensity Lexicon (aka Affect Intensity Lexicon) v0.5
2 | #'
3 | #' General purpose English sentiment/emotion lexicon. The NRC Affect Intensity
4 | #' Lexicon is a list of English words and their associations with four basic
5 | #' emotions (anger, fear, sadness, joy).
6 | #'
7 | #' For a given word and emotion X, the scores range from 0 to 1. A score of 1
8 | #' means that the word conveys the highest amount of emotion X. A score of 0
9 | #' means that the word conveys the lowest amount of emotion X.
10 | #'
11 | #' License required for commercial use. Please contact Saif M. Mohammad
12 | #' (saif.mohammad@nrc-cnrc.gc.ca).
13 | #'
14 | #' Citation info:
15 | #'
16 | #' Details of the lexicon are in this paper.
17 | #' Word Affect Intensities. Saif M. Mohammad. In Proceedings of the 11th Edition
18 | #' of the Language Resources and Evaluation Conference (LREC-2018), May 2018,
19 | #' Miyazaki, Japan.
20 | #'
21 | #' inproceedings\{LREC18-AIL, \cr
22 | #' author = \{Mohammad, Saif M.\}, \cr
23 | #' title = \{Word Affect Intensities\}, \cr
24 | #' booktitle = \{Proceedings of the 11th Edition of the Language Resources and Evaluation Conference (LREC-2018)\}, \cr
25 | #' year = \{2018\}, \cr
26 | #' address=\{Miyazaki, Japan\} \cr
27 | #' \} \cr
28 | #'
29 | #' @inheritParams lexicon_afinn
30 | #' @return A tibble with 5,814 rows and 3 variables:
31 | #' \describe{
32 | #' \item{term}{An English word}
33 | #' \item{score}{Value between 0 and 1}
34 | #' \item{AffectDimension}{Indicator for sentiment or emotion: ("anger",
35 | #' "fear", "sadness", "joy")}
36 | #' }
37 | #'
38 | #' @source \url{https://saifmohammad.com/WebPages/AffectIntensity.htm}
39 | #' @keywords datasets
40 | #' @family lexicon
41 | #' @importFrom fs file_exists dir_exists dir_create path
42 | #' @importFrom readr read_rds
43 | #' @importFrom utils menu
44 | #' @export
45 | #' @examples
46 | #' \dontrun{
47 | #' lexicon_nrc_eil()
48 | #'
49 | #' # Custom directory
50 | #' lexicon_nrc_eil(dir = "data/")
51 | #'
52 | #' # Deleting dataset
53 | #' lexicon_nrc_eil(delete = TRUE)
54 | #'
55 | #' # Returning filepath of data
56 | #' lexicon_nrc_eil(return_path = TRUE)
57 | #' }
58 | lexicon_nrc_eil <- function(dir = NULL, delete = FALSE, return_path = FALSE,
59 | clean = FALSE, manual_download = FALSE) {
60 | load_dataset(
61 | data_name = "nrc_eil", name = "nrc_eil.rds", dir = dir,
62 | delete = delete, return_path = return_path, clean = clean,
63 | manual_download = manual_download
64 | )
65 | }
66 |
67 | #' @importFrom utils download.file
68 | download_nrc_eil <- function(folder_path) {
69 | file_path <- path(
70 | folder_path,
71 | "NRC-AffectIntensity-Lexicon.txt"
72 | )
73 | if (file_exists(file_path)) {
74 | return(invisible())
75 | }
76 | download.file(
77 | url = "http://saifmohammad.com/WebDocs/NRC-AffectIntensity-Lexicon.txt",
78 | destfile = file_path
79 | )
80 | }
81 |
82 | #' @importFrom readr read_tsv
83 | #' @importFrom utils unzip
84 |
85 | process_nrc_eil <- function(folder_path, name_path) {
86 | data <- read_tsv(
87 | file = path(folder_path, "NRC-AffectIntensity-Lexicon.txt"),
88 | skip = 36,
89 | col_types = cols(
90 | term = col_character(),
91 | score = col_double(),
92 | AffectDimension = col_character()
93 | )
94 | )
95 | write_rds(data, name_path)
96 | }
97 |
--------------------------------------------------------------------------------
/R/lexicon_nrc_vad.R:
--------------------------------------------------------------------------------
1 | #' The NRC Valence, Arousal, and Dominance Lexicon
2 | #'
3 | #' The NRC Valence, Arousal, and Dominance (VAD) Lexicon includes a list of
4 | #' more than 20,000 English words and their valence, arousal, and dominance
5 | #' scores. For a given word and a dimension (V/A/D), the scores range from 0
6 | #' (lowest V/A/D) to 1 (highest V/A/D). The lexicon with its fine-grained real-
7 | #' valued scores was created by manual annotation using best--worst scaling.
8 | #' The lexicon is markedly larger than any of the existing VAD lexicons. We also
9 | #' show that the ratings obtained are substantially more reliable than those in
10 | #' existing lexicons.
11 | #'
12 | #' License required for commercial use. Please contact Saif M. Mohammad
13 | #' (saif.mohammad@nrc-cnrc.gc.ca).
14 | #'
15 | #' Citation info:
16 | #'
17 | #' Details of the NRC VAD Lexicon are available in this paper:
18 | #'
19 | #' Obtaining Reliable Human Ratings of Valence, Arousal, and Dominance for
20 | #' 20,000 English Words. Saif M. Mohammad. In Proceedings of the 56th Annual
21 | #' Meeting of the Association for Computational Linguistics, Melbourne,
22 | #' Australia, July 2018.
23 | #'
24 | #' inproceedings\{vad-acl2018, \cr
25 | #' title=\{Obtaining Reliable Human Ratings of Valence, Arousal, and Dominance for 20,000 English Words\}, \cr
26 | #' author=\{Mohammad, Saif M.\}, \cr
27 | #' booktitle=\{Proceedings of The Annual Conference of the Association for Computational Linguistics (ACL)\}, \cr
28 | #' year=\{2018\}, \cr
29 | #' address=\{Melbourne, Australia\} \cr
30 | #' \}
31 | #'
32 | #' @inheritParams lexicon_afinn
33 | #' @return A tibble with 20,007 rows and 4 variables:
34 | #' \describe{
35 | #'   \item{Word}{An English word}
36 | #' \item{Valence}{valence score of the word}
37 | #' \item{Arousal}{arousal score of the word}
38 | #' \item{Dominance}{dominance score of the word}
39 | #' }
40 | #'
41 | #' @source \url{https://saifmohammad.com/WebPages/nrc-vad.html}
42 | #' @keywords datasets
43 | #' @family lexicon
44 | #' @importFrom fs file_exists dir_exists dir_create path
45 | #' @importFrom readr read_rds
46 | #' @importFrom utils menu
47 | #' @export
48 | #' @examples
49 | #' \dontrun{
50 | #' lexicon_nrc_vad()
51 | #'
52 | #' # Custom directory
53 | #' lexicon_nrc_vad(dir = "data/")
54 | #'
55 | #' # Deleting dataset
56 | #' lexicon_nrc_vad(delete = TRUE)
57 | #'
58 | #' # Returning filepath of data
59 | #' lexicon_nrc_vad(return_path = TRUE)
60 | #' }
61 | lexicon_nrc_vad <- function(dir = NULL, delete = FALSE, return_path = FALSE,
62 | clean = FALSE, manual_download = FALSE) {
63 | load_dataset(
64 | data_name = "nrc_vad", name = "nrc_vad.rds", dir = dir,
65 | delete = delete, return_path = return_path, clean = clean,
66 | manual_download = manual_download
67 | )
68 | }
69 |
70 | #' @importFrom utils download.file
71 | download_nrc_vad <- function(folder_path) {
72 | file_path <- path(
73 | folder_path,
74 | "NRC-VAD-Lexicon-Aug2018Release.zip"
75 | )
76 | if (file_exists(file_path)) {
77 | return(invisible())
78 | }
79 | download.file(
80 | url = "http://saifmohammad.com/WebDocs/VAD/NRC-VAD-Lexicon-Aug2018Release.zip",
81 | destfile = file_path
82 | )
83 | unzip(path(folder_path, "NRC-VAD-Lexicon-Aug2018Release.zip"),
84 | exdir = folder_path
85 | )
86 | }
87 |
88 | #' @importFrom readr read_tsv
89 | #' @importFrom utils unzip
90 |
91 | process_nrc_vad <- function(folder_path, name_path) {
92 | data <- read_tsv(path(
93 | folder_path,
94 | "NRC-VAD-Lexicon-Aug2018Release/NRC-VAD-Lexicon.txt"
95 | ),
96 | col_names = FALSE,
97 | show_col_types = FALSE)
98 | data <- stats::setNames(data, c("Word", "Valence", "Arousal", "Dominance"))
99 |
100 | write_rds(data, name_path)
101 | }
102 |
--------------------------------------------------------------------------------
/R/load_dataset.R:
--------------------------------------------------------------------------------
1 | #' Internal Functions
2 | #'
3 | #' These are not to be used directly by the users.
4 | #' @export
5 | #' @importFrom fs dir_delete path file_delete
6 | #' @keywords internal
7 | load_dataset <- function(data_name, name, dir, delete, return_path, clean,
8 | clean_manual = NULL, manual_download) {
9 | dir <- ifelse(is.null(dir), rappdirs::user_cache_dir("textdata"), dir)
10 |
11 | name_path <- path(dir, data_name, name)
12 | folder_path <- path(dir, data_name)
13 |
14 | if (!manual_download) {
15 | if (return_path) {
16 | return(folder_path)
17 | }
18 |
19 | if (delete) {
20 | dir_delete(folder_path)
21 | return(invisible())
22 | }
23 |
24 | if (file_exists(name_path)) {
25 | return(read_rds(name_path))
26 | }
27 |
28 | if (printer(data_name) == 2) {
29 | return(invisible())
30 | }
31 |
32 | if (!dir_exists(folder_path)) {
33 | dir_create(folder_path)
34 | }
35 |
36 | download_functions[[data_name]](folder_path)
37 | }
38 |
39 | process_functions[[data_name]](folder_path, name_path)
40 |
41 | if (clean) {
42 | if (!is.null(clean_manual)) {
43 | intermediate_files <- setdiff(
44 | dir_ls(folder_path),
45 | path(folder_path, clean_manual)
46 | )
47 | } else {
48 | intermediate_files <- setdiff(dir_ls(folder_path), name_path)
49 | }
50 | file_delete(intermediate_files)
51 | }
52 |
53 | read_rds(name_path)
54 | }
55 |
--------------------------------------------------------------------------------
/R/printer.R:
--------------------------------------------------------------------------------
1 | #' Internal Functions
2 | #'
3 | #' These are not to be used directly by the users.
4 | #' @keywords internal
5 | #' @noRd
6 | printer <- function(name) {
7 | title <- cat(
8 | "Do you want to download:\n",
9 | "Name:", print_info[[name]][["name"]], "\n",
10 | "URL:", print_info[[name]][["url"]], "\n",
11 | "License:", print_info[[name]][["license"]], "\n",
12 | "Size:", print_info[[name]][["size"]], "\n",
13 | "Download mechanism:", print_info[[name]][["download_mech"]], "\n"
14 | )
15 |
16 | if (!is.na(print_info[[name]][["citation"]])) {
17 | title <- cat(
18 | title,
19 | print_info[[name]][["citation"]], "\n"
20 | )
21 | }
22 |
23 | menu(choices = c("Yes", "No"), title = title)
24 | }
25 |
--------------------------------------------------------------------------------
/R/process_functions.R:
--------------------------------------------------------------------------------
1 | #' List of all process functions used in load_dataset
2 | #'
3 | #' @format Named list of all process functions
4 | #' @include download_functions.R
5 | #'
6 | #' @name process_functions
7 | #' @noRd
8 | NULL
9 |
10 | process_functions <- list(
11 | afinn = process_afinn,
12 | sentence_polarity = process_sentence_polarity,
13 | loughran = process_loughran,
14 | bing = process_bing,
15 | nrc = process_nrc,
16 | nrc_eil = process_nrc_eil,
17 | nrc_vad = process_nrc_vad,
18 | ag_news = process_ag_news,
19 | dbpedia = process_dbpedia,
20 | trec = process_trec,
21 | imdb = process_imdb,
22 | glove6b = process_glove6b,
23 | glove27b = process_glove27b,
24 | glove42b = process_glove42b,
25 | glove840b = process_glove840b
26 | )
27 |
--------------------------------------------------------------------------------
/R/textdata-package.R:
--------------------------------------------------------------------------------
1 | #' @keywords internal
2 | "_PACKAGE"
3 |
4 | ## usethis namespace: start
5 | ## usethis namespace: end
6 | NULL
7 |
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output: github_document
3 | ---
4 |
5 |
6 |
7 | ```{r, include = FALSE}
8 | knitr::opts_chunk$set(
9 | collapse = TRUE,
10 | comment = "#>",
11 | fig.path = "man/figures/README-"
12 | )
13 | ```
14 |
15 | # textdata
16 |
17 |
18 | [](https://github.com/EmilHvitfeldt/textdata/actions/workflows/R-CMD-check.yaml)
19 | [](https://CRAN.R-project.org/package=textdata)
20 | [](https://cran.r-project.org/package=textdata)
21 | [](https://doi.org/10.5281/zenodo.3244433)
22 | [](https://app.codecov.io/gh/EmilHvitfeldt/textdata?branch=main)
23 | [](https://lifecycle.r-lib.org/articles/stages.html)
24 |
25 |
26 | The goal of textdata is to provide easy access to text-related data sets without bundling them inside a package. Some text datasets are too large to store within an R package or are licensed in such a way that they cannot be included in an OSS-licensed package. Instead, this package provides a framework to download, parse, and store the datasets on disk, and load them when needed.
27 |
28 | ## Installation
29 |
30 | You can install the released version of textdata from [CRAN](https://CRAN.R-project.org) with:
31 |
32 | ``` r
33 | install.packages("textdata")
34 | ```
35 |
36 | And the development version from [GitHub](https://github.com/) with:
37 |
38 | ``` r
39 | # install.packages("remotes")
40 | remotes::install_github("EmilHvitfeldt/textdata")
41 | ```
42 | ## Example
43 |
44 | The first time you use one of the functions for accessing an included text dataset, such as `lexicon_afinn()` or `dataset_sentence_polarity()`, the function will prompt you to agree that you understand the dataset's license or terms of use and then download the dataset to your computer.
45 |
46 | 
47 |
48 | After the first use, each time you use a function like `lexicon_afinn()`, the function will load the dataset from disk.
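
A minimal sketch of that workflow, using the AFINN lexicon as the example (the column names `word` and `value` come from its documentation in this package):

``` r
library(textdata)

# First call: you are prompted to accept the license terms, then the
# lexicon is downloaded and cached under rappdirs::user_cache_dir("textdata").
afinn <- lexicon_afinn()

# Later calls skip the prompt and read the cached file straight from disk.
afinn <- lexicon_afinn()
```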
49 |
50 | ## Included text datasets
51 |
52 | As of today, the datasets included in textdata are:
53 |
54 | | Dataset | Function |
55 | | --------------------------------------------------------------- | ----------------------------- |
56 | | v1.0 sentence polarity dataset | `dataset_sentence_polarity()` |
57 | | AFINN-111 sentiment lexicon | `lexicon_afinn()` |
58 | | Hu and Liu's opinion lexicon | `lexicon_bing()` |
59 | | NRC word-emotion association lexicon | `lexicon_nrc()` |
60 | | NRC Emotion Intensity Lexicon | `lexicon_nrc_eil()` |
61 | | The NRC Valence, Arousal, and Dominance Lexicon | `lexicon_nrc_vad()` |
62 | | Loughran and McDonald's opinion lexicon for financial documents | `lexicon_loughran()` |
63 | | AG's News | `dataset_ag_news()` |
64 | | DBpedia ontology | `dataset_dbpedia()` |
65 | | Trec-6 and Trec-50 | `dataset_trec()` |
66 | | IMDb Large Movie Review Dataset | `dataset_imdb()` |
67 | | Stanford NLP GloVe pre-trained word vectors | `embedding_glove6b()` |
68 | | | `embedding_glove27b()` |
69 | | | `embedding_glove42b()` |
70 | | | `embedding_glove840b()` |
71 |
72 | Check out each function's documentation for detailed information (including citations) for the relevant dataset.
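
For a programmatic overview of the same information, the exported `catalogue` data frame lists one row per data source; the column selection below is just one convenient view (it also contains `url`, `license`, `download_mech`, and more):

``` r
library(textdata)

# One row per data source: its name, type ("lexicon", "dataset", or
# "embeddings"), and download size.
catalogue[, c("name", "type", "size")]
```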
73 |
74 | ## Community Guidelines
75 |
76 | Note that this project is released with a
77 | [Contributor Code of Conduct](https://github.com/EmilHvitfeldt/textdata/blob/main/CODE_OF_CONDUCT.md).
78 | By contributing to this project, you agree to abide by its terms.
79 | Feedback, bug reports (and fixes!), and feature requests are welcome; file
80 | issues or seek support [here](https://github.com/EmilHvitfeldt/textdata/issues).
81 | For details on how to add a new dataset to this package, check out the vignette!
82 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | # textdata
5 |
6 |
7 |
8 | [](https://github.com/EmilHvitfeldt/textdata/actions/workflows/R-CMD-check.yaml)
9 | [](https://CRAN.R-project.org/package=textdata)
11 | [](https://cran.r-project.org/package=textdata)
12 | [](https://doi.org/10.5281/zenodo.3244433)
13 | [](https://app.codecov.io/gh/EmilHvitfeldt/textdata?branch=main)
15 | [](https://lifecycle.r-lib.org/articles/stages.html)
17 |
18 |
19 | The goal of textdata is to provide easy access to text-related data
20 | sets without bundling them inside a package. Some text datasets are too
21 | large to store within an R package or are licensed in such a way that
22 | they cannot be included in an OSS-licensed package. Instead, this
23 | package provides a framework to download, parse, and store the datasets
24 | on disk, and load them when needed.
25 |
26 | ## Installation
27 |
28 | You can install the released version of textdata from
29 | [CRAN](https://CRAN.R-project.org) with:
30 |
31 | ``` r
32 | install.packages("textdata")
33 | ```
34 |
35 | And the development version from [GitHub](https://github.com/) with:
36 |
37 | ``` r
38 | # install.packages("remotes")
39 | remotes::install_github("EmilHvitfeldt/textdata")
40 | ```
41 |
42 | ## Example
43 |
44 | The first time you use one of the functions for accessing an included
45 | text dataset, such as `lexicon_afinn()` or
46 | `dataset_sentence_polarity()`, the function will prompt you to agree
47 | that you understand the dataset’s license or terms of use and then
48 | download the dataset to your computer.
49 |
50 | 
51 |
52 | After the first use, each time you use a function like
53 | `lexicon_afinn()`, the function will load the dataset from disk.
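
If a dataset cannot be downloaded from within R (for example, on a
machine without direct internet access), every function also supports a
manual workflow through its `return_path` and `manual_download`
arguments. A minimal sketch, again using the AFINN lexicon:

``` r
library(textdata)

# Ask where textdata expects the raw file to be placed.
lexicon_afinn(return_path = TRUE)

# After copying the manually downloaded file into that folder, process
# and load it without triggering a download or a prompt.
afinn <- lexicon_afinn(manual_download = TRUE)
```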
54 |
55 | ## Included text datasets
56 |
57 | As of today, the datasets included in textdata are:
58 |
59 | | Dataset | Function |
60 | |-----------------------------------------------------------------|-------------------------------|
61 | | v1.0 sentence polarity dataset | `dataset_sentence_polarity()` |
62 | | AFINN-111 sentiment lexicon | `lexicon_afinn()` |
63 | | Hu and Liu’s opinion lexicon | `lexicon_bing()` |
64 | | NRC word-emotion association lexicon | `lexicon_nrc()` |
65 | | NRC Emotion Intensity Lexicon | `lexicon_nrc_eil()` |
66 | | The NRC Valence, Arousal, and Dominance Lexicon | `lexicon_nrc_vad()` |
67 | | Loughran and McDonald’s opinion lexicon for financial documents | `lexicon_loughran()` |
68 | | AG’s News | `dataset_ag_news()` |
69 | | DBpedia ontology | `dataset_dbpedia()` |
70 | | Trec-6 and Trec-50 | `dataset_trec()` |
71 | | IMDb Large Movie Review Dataset | `dataset_imdb()` |
72 | | Stanford NLP GloVe pre-trained word vectors | `embedding_glove6b()` |
73 | | | `embedding_glove27b()` |
74 | | | `embedding_glove42b()` |
75 | | | `embedding_glove840b()` |
76 |
77 | Check out each function’s documentation for detailed information
78 | (including citations) for the relevant dataset.
79 |
80 | ## Community Guidelines
81 |
82 | Note that this project is released with a [Contributor Code of
83 | Conduct](https://github.com/EmilHvitfeldt/textdata/blob/main/CODE_OF_CONDUCT.md).
84 | By contributing to this project, you agree to abide by its terms.
85 | Feedback, bug reports (and fixes!), and feature requests are welcome;
86 | file issues or seek support
87 | [here](https://github.com/EmilHvitfeldt/textdata/issues). For details on
88 | how to add a new dataset to this package, check out the vignette!
89 |
--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | template:
2 | bootstrap: 5
3 |
4 | development:
5 | mode: auto
6 |
7 | reference:
8 | - title: Lexicons
9 | contents:
10 | - lexicon_afinn
11 | - lexicon_bing
12 | - lexicon_nrc
13 | - lexicon_nrc_eil
14 | - lexicon_nrc_vad
15 | - lexicon_loughran
16 | - title: Data Sets
17 | contents:
18 | - dataset_sentence_polarity
19 | - dataset_ag_news
20 | - dataset_dbpedia
21 | - dataset_trec
22 | - dataset_imdb
23 | - title: Embeddings
24 | contents:
25 | - embedding_glove6b
26 | - embedding_glove27b
27 | - embedding_glove42b
28 | - embedding_glove840b
29 | - title: Other
30 | contents:
31 | - catalogue
32 | - cache_info
33 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | comment: false
2 |
3 | coverage:
4 | status:
5 | project:
6 | default:
7 | target: auto
8 | threshold: 1%
9 | patch:
10 | default:
11 | target: auto
12 | threshold: 1%
13 |
--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
1 | ## Release Summary
2 |
3 | This is the 7th CRAN release of textdata. It fixes a bug that produced data with no column names.
4 |
5 | ## R CMD check results
6 |
7 | 0 errors | 0 warnings | 0 notes
8 |
9 | ## Downstream dependencies
10 |
11 | I ran R CMD check on the 3 downstream dependencies and there were no problems related to textdata.
12 |
--------------------------------------------------------------------------------
/man/cache_info.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/cache_info.R
3 | \name{cache_info}
4 | \alias{cache_info}
5 | \title{List folders and their sizes in cache}
6 | \usage{
7 | cache_info(dir = NULL)
8 | }
9 | \arguments{
10 | \item{dir}{Character, path to directory where data will be stored. If
11 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.}
12 | }
13 | \value{
14 | A tibble with 2 variables:
15 | \describe{
16 | \item{name}{Name of the folder}
17 | \item{size}{Size of the folder}
18 | }
19 | }
20 | \description{
21 | This function will return a tibble with the names and sizes of all folders in
22 | the specified directory. Defaults to textdata's default cache directory.
23 | }
24 | \examples{
25 | \dontrun{
26 | cache_info()
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/man/catalogue.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/info.R
3 | \docType{data}
4 | \name{catalogue}
5 | \alias{catalogue}
6 | \title{Catalogue of all available data sources}
7 | \format{
8 | An object of class \code{data.frame} with 15 rows and 8 columns.
9 | }
10 | \usage{
11 | catalogue
12 | }
13 | \description{
14 | Catalogue of all available data sources
15 | }
16 | \keyword{datasets}
17 |
--------------------------------------------------------------------------------
/man/dataset_ag_news.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/dataset_ag_news.R
3 | \name{dataset_ag_news}
4 | \alias{dataset_ag_news}
5 | \title{AG's News Topic Classification Dataset}
6 | \source{
7 | \url{http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html}
8 |
9 | \url{https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz}
10 | }
11 | \usage{
12 | dataset_ag_news(
13 | dir = NULL,
14 | split = c("train", "test"),
15 | delete = FALSE,
16 | return_path = FALSE,
17 | clean = FALSE,
18 | manual_download = FALSE
19 | )
20 | }
21 | \arguments{
22 | \item{dir}{Character, path to directory where data will be stored. If
23 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.}
24 |
25 | \item{split}{Character. Return training ("train") data or testing ("test")
26 | data. Defaults to "train".}
27 |
28 | \item{delete}{Logical, set \code{TRUE} to delete dataset.}
29 |
30 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.}
31 |
32 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can
33 | greatly reduce the size. Defaults to FALSE.}
34 |
35 | \item{manual_download}{Logical, set \code{TRUE} if you have manually
36 | downloaded the file and placed it in the folder designated by running
37 | this function with \code{return_path = TRUE}.}
38 | }
39 | \value{
40 | A tibble with 120,000 or 7,600 rows for "train" and "test"
41 | respectively and 3 variables:
42 | \describe{
43 |   \item{class}{Character, denoting the news class}
44 | \item{title}{Character, title of article}
45 | \item{description}{Character, description of article}
46 | }
47 | }
48 | \description{
49 | The AG's news topic classification dataset is constructed by choosing the 4
50 | largest classes from the original corpus. Each class contains 30,000 training
51 | samples and 1,900 testing samples. The total number of training samples is
52 | 120,000 and testing 7,600.
53 | Version 3, Updated 09/09/2015
54 | }
55 | \details{
56 | The classes in this dataset are
57 |
58 | \itemize{
59 | \item World
60 | \item Sports
61 | \item Business
62 | \item Sci/Tech
63 | }
64 | }
65 | \examples{
66 | \dontrun{
67 | dataset_ag_news()
68 |
69 | # Custom directory
70 | dataset_ag_news(dir = "data/")
71 |
72 | # Deleting dataset
73 | dataset_ag_news(delete = TRUE)
74 |
75 | # Returning filepath of data
76 | dataset_ag_news(return_path = TRUE)
77 |
78 | # Access both training and testing dataset
79 | train <- dataset_ag_news(split = "train")
80 | test <- dataset_ag_news(split = "test")
81 | }
82 |
83 | }
84 | \seealso{
85 | Other topic:
86 | \code{\link{dataset_dbpedia}()},
87 | \code{\link{dataset_trec}()}
88 | }
89 | \concept{topic}
90 | \keyword{datasets}
91 |
--------------------------------------------------------------------------------
/man/dataset_dbpedia.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/dataset_dbpedia.R
3 | \name{dataset_dbpedia}
4 | \alias{dataset_dbpedia}
5 | \title{DBpedia Ontology Dataset}
6 | \source{
7 | \url{https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf}
8 |
9 | \url{https://www.dbpedia.org/}
10 |
11 | \url{https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz}
12 | }
13 | \usage{
14 | dataset_dbpedia(
15 | dir = NULL,
16 | split = c("train", "test"),
17 | delete = FALSE,
18 | return_path = FALSE,
19 | clean = FALSE,
20 | manual_download = FALSE
21 | )
22 | }
23 | \arguments{
24 | \item{dir}{Character, path to directory where data will be stored. If
25 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.}
26 |
27 | \item{split}{Character. Return training ("train") data or testing ("test")
28 | data. Defaults to "train".}
29 |
30 | \item{delete}{Logical, set \code{TRUE} to delete dataset.}
31 |
32 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.}
33 |
34 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can
35 | greatly reduce the size. Defaults to FALSE.}
36 |
37 | \item{manual_download}{Logical, set \code{TRUE} if you have manually
38 | downloaded the file and placed it in the folder designated by running
39 | this function with \code{return_path = TRUE}.}
40 | }
41 | \value{
42 | A tibble with 560,000 or 70,000 rows for "train" and "test"
43 | respectively and 3 variables:
44 | \describe{
45 |   \item{class}{Character, denoting the class}
46 | \item{title}{Character, title of article}
47 | \item{description}{Character, description of article}
48 | }
49 | }
50 | \description{
51 | DBpedia ontology classification dataset. It contains 560,000 training
52 | samples and 70,000 testing samples drawn from 14 nonoverlapping classes
53 | in DBpedia.
54 | }
55 | \details{
56 | The classes are
57 |
58 | \itemize{
59 | \item Company
60 | \item EducationalInstitution
61 | \item Artist
62 | \item Athlete
63 | \item OfficeHolder
64 | \item MeanOfTransportation
65 | \item Building
66 | \item NaturalPlace
67 | \item Village
68 | \item Animal
69 | \item Plant
70 | \item Album
71 | \item Film
72 | \item WrittenWork
73 | }
74 | }
75 | \examples{
76 | \dontrun{
77 | dataset_dbpedia()
78 |
79 | # Custom directory
80 | dataset_dbpedia(dir = "data/")
81 |
82 | # Deleting dataset
83 | dataset_dbpedia(delete = TRUE)
84 |
85 | # Returning filepath of data
86 | dataset_dbpedia(return_path = TRUE)
87 |
88 | # Access both training and testing dataset
89 | train <- dataset_dbpedia(split = "train")
90 | test <- dataset_dbpedia(split = "test")
91 | }
92 |
93 | }
94 | \seealso{
95 | Other topic:
96 | \code{\link{dataset_ag_news}()},
97 | \code{\link{dataset_trec}()}
98 | }
99 | \concept{topic}
100 | \keyword{datasets}
101 |
--------------------------------------------------------------------------------
/man/dataset_imdb.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/dataset_imdb.R
3 | \name{dataset_imdb}
4 | \alias{dataset_imdb}
5 | \title{IMDB Large Movie Review Dataset}
6 | \source{
7 | \url{http://ai.stanford.edu/~amaas/data/sentiment/}
8 | }
9 | \usage{
10 | dataset_imdb(
11 | dir = NULL,
12 | split = c("train", "test"),
13 | delete = FALSE,
14 | return_path = FALSE,
15 | clean = FALSE,
16 | manual_download = FALSE
17 | )
18 | }
19 | \arguments{
20 | \item{dir}{Character, path to directory where data will be stored. If
21 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.}
22 |
23 | \item{split}{Character. Return training ("train") data or testing ("test")
24 | data. Defaults to "train".}
25 |
26 | \item{delete}{Logical, set \code{TRUE} to delete dataset.}
27 |
28 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.}
29 |
30 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can
31 | greatly reduce the size. Defaults to FALSE.}
32 |
33 | \item{manual_download}{Logical, set \code{TRUE} if you have manually
34 | downloaded the file and placed it in the folder designated by running
35 | this function with \code{return_path = TRUE}.}
36 | }
37 | \value{
38 | A tibble with 25,000 rows and 2 variables:
39 | \describe{
40 | \item{Sentiment}{Character, denoting the sentiment}
41 | \item{text}{Character, text of the review}
42 | }
43 | }
44 | \description{
45 | The core dataset contains 50,000 reviews split evenly into 25k train and
46 | 25k test sets. The overall distribution of labels is balanced (25k pos and
47 | 25k neg).
48 | }
49 | \details{
50 | In the entire collection, no more than 30 reviews are allowed for any
51 | given movie because reviews for the same movie tend to have correlated
52 | ratings. Further, the train and test sets contain a disjoint set of
53 | movies, so no significant performance is obtained by memorizing
54 | movie-unique terms and their association with observed labels. In the
55 | labeled train/test sets, a negative review has a score <= 4 out of 10,
56 | and a positive review has a score >= 7 out of 10. Thus reviews with
57 | more neutral ratings are not included in the train/test sets. In the
58 | unsupervised set, reviews of any rating are included and there are
59 | equal numbers of reviews rated > 5 and <= 5.
60 |
61 | When using this dataset, please cite the ACL 2011 paper
62 |
63 | InProceedings\{maas-EtAl:2011:ACL-HLT2011, \cr
64 | author = \{Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher\}, \cr
65 | title = \{Learning Word Vectors for Sentiment Analysis\}, \cr
66 | booktitle = \{Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies\}, \cr
67 | month = \{June\}, \cr
68 | year = \{2011\}, \cr
69 | address = \{Portland, Oregon, USA\}, \cr
70 | publisher = \{Association for Computational Linguistics\}, \cr
71 | pages = \{142--150\}, \cr
72 | url = \{http://www.aclweb.org/anthology/P11-1015\}
73 | \}
74 | }
75 | \examples{
76 | \dontrun{
77 | dataset_imdb()
78 |
79 | # Custom directory
80 | dataset_imdb(dir = "data/")
81 |
82 | # Deleting dataset
83 | dataset_imdb(delete = TRUE)
84 |
85 | # Returning filepath of data
86 | dataset_imdb(return_path = TRUE)
87 |
88 | # Access both training and testing dataset
89 | train <- dataset_imdb(split = "train")
90 | test <- dataset_imdb(split = "test")
91 | }
92 |
93 | }
94 | \concept{topic sentiment}
95 | \keyword{datasets}
96 |
--------------------------------------------------------------------------------
/man/dataset_sentence_polarity.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/dataset_sentence_polarity.R
3 | \name{dataset_sentence_polarity}
4 | \alias{dataset_sentence_polarity}
5 | \title{v1.0 sentence polarity dataset}
6 | \source{
7 | \url{https://www.cs.cornell.edu/people/pabo/movie-review-data/}
8 | }
9 | \usage{
10 | dataset_sentence_polarity(
11 | dir = NULL,
12 | delete = FALSE,
13 | return_path = FALSE,
14 | clean = FALSE,
15 | manual_download = FALSE
16 | )
17 | }
18 | \arguments{
19 | \item{dir}{Character, path to directory where data will be stored. If
20 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.}
21 |
22 | \item{delete}{Logical, set \code{TRUE} to delete dataset.}
23 |
24 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.}
25 |
26 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can
27 | greatly reduce the size. Defaults to FALSE.}
28 |
29 | \item{manual_download}{Logical, set \code{TRUE} if you have manually
30 | downloaded the file and placed it in the folder designated by running
31 | this function with \code{return_path = TRUE}.}
32 | }
33 | \value{
34 | A tibble with 10,662 rows and 2 variables:
35 | \describe{
36 | \item{text}{Sentences or snippets}
37 | \item{sentiment}{Indicator for sentiment, "neg" for negative and "pos"
38 | for positive}
39 | }
40 | }
41 | \description{
42 | 5331 positive and 5331 negative processed sentences / snippets.
43 | Introduced in Pang/Lee ACL 2005. Released July 2005.
44 | }
45 | \details{
46 | Citation info:
47 |
48 | This data was first used in Bo Pang and Lillian Lee,
49 | ``Seeing stars: Exploiting class relationships for sentiment categorization
50 | with respect to rating scales.'', Proceedings of the ACL, 2005.
51 |
52 | InProceedings\{pang05, \cr
53 | author = \{Bo Pang and Lillian Lee\}, \cr
54 | title = \{Seeing stars: Exploiting class relationships for sentiment \cr
55 | categorization with respect to rating scales\}, \cr
56 | booktitle = \{Proceedings of the ACL\}, \cr
57 | year = 2005 \cr
58 | \}
59 | }
60 | \examples{
61 | \dontrun{
62 | dataset_sentence_polarity()
63 |
64 | # Custom directory
65 | dataset_sentence_polarity(dir = "data/")
66 |
67 | # Deleting dataset
68 | dataset_sentence_polarity(delete = TRUE)
69 |
70 | # Returning filepath of data
71 | dataset_sentence_polarity(return_path = TRUE)
72 | }
73 |
74 | }
75 | \concept{sentiment}
76 | \keyword{datasets}
77 |
--------------------------------------------------------------------------------
/man/dataset_trec.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/dataset_trec.R
3 | \name{dataset_trec}
4 | \alias{dataset_trec}
5 | \title{TREC dataset}
6 | \source{
7 | \url{https://cogcomp.seas.upenn.edu/Data/QA/QC/}
8 |
9 | \url{https://trec.nist.gov/data/qa.html}
10 | }
11 | \usage{
12 | dataset_trec(
13 | dir = NULL,
14 | split = c("train", "test"),
15 | version = c("6", "50"),
16 | delete = FALSE,
17 | return_path = FALSE,
18 | clean = FALSE,
19 | manual_download = FALSE
20 | )
21 | }
22 | \arguments{
23 | \item{dir}{Character, path to directory where data will be stored. If
24 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.}
25 |
26 | \item{split}{Character. Return training ("train") data or testing ("test")
27 | data. Defaults to "train".}
28 |
29 | \item{version}{Character. Version 6 ("6") or version 50 ("50"). Defaults to
30 | "6".}
31 |
32 | \item{delete}{Logical, set \code{TRUE} to delete dataset.}
33 |
34 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.}
35 |
36 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can
37 | greatly reduce the size. Defaults to FALSE.}
38 |
39 | \item{manual_download}{Logical, set \code{TRUE} if you have manually
40 | downloaded the file and placed it in the folder designated by running
41 | this function with \code{return_path = TRUE}.}
42 | }
43 | \value{
44 | A tibble with 5,452 or 500 rows for "train" and "test"
45 | respectively and 2 variables:
46 | \describe{
47 | \item{class}{Character, denoting the class}
48 | \item{text}{Character, question text}
49 | }
50 | }
51 | \description{
52 | The TREC dataset is a dataset for question classification consisting of
53 | open-domain, fact-based questions divided into broad semantic categories.
54 | It has both a six-class (TREC-6) and a fifty-class (TREC-50) version. Both
55 | have 5,452 training examples and 500 test examples, but TREC-50 has
56 | finer-grained labels. Models are evaluated based on accuracy.
57 | }
58 | \details{
59 | The classes in TREC-6 are
60 |
61 | \itemize{
62 | \item ABBR - Abbreviation
63 | \item DESC - Description and abstract concepts
64 | \item ENTY - Entities
65 | \item HUM - Human beings
66 | \item LOC - Locations
67 |   \item NUM - Numeric values
68 | }
69 |
70 | The classes in TREC-50 can be found at
71 | \url{https://cogcomp.seas.upenn.edu/Data/QA/QC/definition.html}.
72 | }
73 | \examples{
74 | \dontrun{
75 | dataset_trec()
76 |
77 | # Custom directory
78 | dataset_trec(dir = "data/")
79 |
80 | # Deleting dataset
81 | dataset_trec(delete = TRUE)
82 |
83 | # Returning filepath of data
84 | dataset_trec(return_path = TRUE)
85 |
86 | # Access both training and testing dataset
87 | train_6 <- dataset_trec(split = "train")
88 | test_6 <- dataset_trec(split = "test")
89 |
90 | train_50 <- dataset_trec(split = "train", version = "50")
91 | test_50 <- dataset_trec(split = "test", version = "50")
92 | }
93 |
94 | }
95 | \seealso{
96 | Other topic:
97 | \code{\link{dataset_ag_news}()},
98 | \code{\link{dataset_dbpedia}()}
99 | }
100 | \concept{topic}
101 | \keyword{datasets}
102 |
--------------------------------------------------------------------------------
/man/embedding_glove.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/embedding_glove.R
3 | \name{embedding_glove}
4 | \alias{embedding_glove}
5 | \alias{embedding_glove6b}
6 | \alias{embedding_glove27b}
7 | \alias{embedding_glove42b}
8 | \alias{embedding_glove840b}
9 | \title{Global Vectors for Word Representation}
10 | \source{
11 | \url{https://nlp.stanford.edu/projects/glove/}
12 | }
13 | \usage{
14 | embedding_glove6b(
15 | dir = NULL,
16 | dimensions = c(50, 100, 200, 300),
17 | delete = FALSE,
18 | return_path = FALSE,
19 | clean = FALSE,
20 | manual_download = FALSE
21 | )
22 |
23 | embedding_glove27b(
24 | dir = NULL,
25 | dimensions = c(25, 50, 100, 200),
26 | delete = FALSE,
27 | return_path = FALSE,
28 | clean = FALSE,
29 | manual_download = FALSE
30 | )
31 |
32 | embedding_glove42b(
33 | dir = NULL,
34 | delete = FALSE,
35 | return_path = FALSE,
36 | clean = FALSE,
37 | manual_download = FALSE
38 | )
39 |
40 | embedding_glove840b(
41 | dir = NULL,
42 | delete = FALSE,
43 | return_path = FALSE,
44 | clean = FALSE,
45 | manual_download = FALSE
46 | )
47 | }
48 | \arguments{
49 | \item{dir}{Character, path to directory where data will be stored. If
50 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.}
51 |
52 | \item{dimensions}{A number indicating the dimension of the word vectors. One
53 | of 50, 100, 200, or 300 for glove6b, or one of 25, 50, 100, or 200 for
54 | glove27b.}
55 |
56 | \item{delete}{Logical, set \code{TRUE} to delete dataset.}
57 |
58 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.}
59 |
60 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can
61 | greatly reduce the size. Defaults to FALSE.}
62 |
63 | \item{manual_download}{Logical, set \code{TRUE} if you have manually
64 | downloaded the file and placed it in the folder designated by running
65 | this function with \code{return_path = TRUE}.}
66 | }
67 | \value{
68 | A tibble with 400k (glove6b), 1.2m (glove27b), 1.9m (glove42b), or 2.2m (glove840b)
69 | rows (one row for each unique token in the vocabulary) and the following variables:
70 | \describe{
71 | \item{token}{An individual token (usually a word)}
72 | \item{d1, d2, etc}{The embeddings for that token.}
73 | }
74 | }
75 | \description{
76 | The GloVe pre-trained word vectors provide word embeddings trained on corpora
77 | ranging from 6 billion (glove6b) to 840 billion (glove840b) tokens.
78 | }
79 | \details{
80 | Citation info:
81 |
82 | InProceedings\{pennington2014glove, \cr
83 | author = \{Jeffrey Pennington and Richard Socher and Christopher D. \cr
84 | Manning\}, \cr
85 | title = \{GloVe: Global Vectors for Word Representation\}, \cr
86 | booktitle = \{Empirical Methods in Natural Language Processing (EMNLP)\}, \cr
87 | year = 2014, \cr
88 | pages = \{1532-1543\}, \cr
89 | url = \{http://www.aclweb.org/anthology/D14-1162\} \cr
90 | \}
91 | }
92 | \examples{
93 | \dontrun{
94 | embedding_glove6b(dimensions = 50)
95 |
96 | # Custom directory
97 | embedding_glove42b(dir = "data/")
98 |
99 | # Deleting dataset
100 | embedding_glove6b(delete = TRUE, dimensions = 300)
101 |
102 | # Returning filepath of data
103 | embedding_glove840b(return_path = TRUE)
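   |
   | # A hypothetical follow-up (not part of the original examples): find the tokens
   | # closest to "king" by cosine similarity, assuming the 50-dimensional glove6b
   | # embeddings have been downloaded.
   | glove <- embedding_glove6b(dimensions = 50)
   | mat <- as.matrix(glove[, -1])
   | rownames(mat) <- glove$token
   | unit <- mat / sqrt(rowSums(mat^2))
   | head(sort((unit %*% unit["king", ])[, 1], decreasing = TRUE), 10)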
104 | }
105 | }
106 | \references{
107 | Jeffrey Pennington, Richard Socher, and Christopher D. Manning.
108 | 2014. GloVe: Global Vectors for Word Representation.
109 | }
110 | \concept{embeddings}
111 | \keyword{datasets}
112 |
--------------------------------------------------------------------------------
/man/figures/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/man/figures/.DS_Store
--------------------------------------------------------------------------------
/man/figures/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/man/figures/logo.png
--------------------------------------------------------------------------------
/man/figures/screen-shot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/man/figures/screen-shot.png
--------------------------------------------------------------------------------
/man/figures/textdata_demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/man/figures/textdata_demo.gif
--------------------------------------------------------------------------------
/man/lexicon_afinn.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lexicon_afinn.R
3 | \name{lexicon_afinn}
4 | \alias{lexicon_afinn}
5 | \title{AFINN-111 dataset}
6 | \usage{
7 | lexicon_afinn(
8 | dir = NULL,
9 | delete = FALSE,
10 | return_path = FALSE,
11 | clean = FALSE,
12 | manual_download = FALSE
13 | )
14 | }
15 | \arguments{
16 | \item{dir}{Character, path to directory where data will be stored. If
17 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.}
18 |
19 | \item{delete}{Logical, set \code{TRUE} to delete dataset.}
20 |
21 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.}
22 |
23 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can
24 | greatly reduce the size. Defaults to FALSE.}
25 |
26 | \item{manual_download}{Logical, set \code{TRUE} if you have manually
27 | downloaded the file and placed it in the folder designated by running
28 | this function with \code{return_path = TRUE}.}
29 | }
30 | \value{
31 | A tibble with 2,477 rows and 2 variables:
32 | \describe{
33 | \item{word}{An English word}
34 | \item{score}{Indicator for sentiment: integer between -5 and +5}
35 | }
36 | }
37 | \description{
38 | AFINN is a lexicon of English words rated for valence with an integer
39 | between minus five (negative) and plus five (positive). The words have
40 | been manually labeled by Finn Årup Nielsen in 2009-2011.
41 | }
42 | \details{
43 | This dataset is the newest version with 2,477 words and phrases.
44 |
45 | Citation info:
46 |
47 | This dataset was published in Finn Årup Nielsen (2011),
48 | ``A new Evaluation of a word list for sentiment analysis in
49 | microblogs'', Proceedings of the ESWC2011 Workshop on
50 | 'Making Sense of Microposts': Big things come in small packages (2011) 93-98.
51 |
52 | article\{nielsen11, \cr
53 | author = \{Finn Årup Nielsen\}, \cr
54 | title = \{A new Evaluation of a word list for sentiment analysis in microblogs\}, \cr
55 | journal = \{CoRR\}, \cr
56 | volume = \{abs/1103.2903\}, \cr
57 | year = \{2011\}, \cr
58 | url = \{http://arxiv.org/abs/1103.2903\}, \cr
59 | archivePrefix = \{arXiv\}, \cr
60 | eprint = \{1103.2903\}, \cr
61 | biburl = \{https://dblp.org/rec/bib/journals/corr/abs-1103-2903\}, \cr
62 | bibsource = \{dblp computer science bibliography, https://dblp.org\} \cr
63 | \}
64 | }
65 | \examples{
66 | \dontrun{
67 | lexicon_afinn()
68 |
69 | # Custom directory
70 | lexicon_afinn(dir = "data/")
71 |
72 | # Deleting dataset
73 | lexicon_afinn(delete = TRUE)
74 |
75 | # Returning filepath of data
76 | lexicon_afinn(return_path = TRUE)
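   |
   | # A hypothetical follow-up (assumes the dplyr and tidytext packages, and uses
   | # the column names documented above): score a sentence by summing the AFINN
   | # values of its tokens.
   | library(dplyr)
   | library(tidytext)
   | tibble(text = "this is a wonderful but slightly scary example") |>
   |   unnest_tokens(word, text) |>
   |   inner_join(lexicon_afinn(), by = "word") |>
   |   summarise(sentiment = sum(score))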
77 | }
78 | }
79 | \seealso{
80 | Other lexicon:
81 | \code{\link{lexicon_bing}()},
82 | \code{\link{lexicon_loughran}()},
83 | \code{\link{lexicon_nrc}()},
84 | \code{\link{lexicon_nrc_eil}()},
85 | \code{\link{lexicon_nrc_vad}()}
86 | }
87 | \concept{lexicon}
88 | \keyword{datasets}
89 |
--------------------------------------------------------------------------------
/man/lexicon_bing.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lexicon_bing.R
3 | \name{lexicon_bing}
4 | \alias{lexicon_bing}
5 | \title{Bing sentiment lexicon}
6 | \source{
7 | \url{https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html}
8 | }
9 | \usage{
10 | lexicon_bing(
11 | dir = NULL,
12 | delete = FALSE,
13 | return_path = FALSE,
14 | clean = FALSE,
15 | manual_download = FALSE
16 | )
17 | }
18 | \arguments{
19 | \item{dir}{Character, path to directory where data will be stored. If
20 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.}
21 |
22 | \item{delete}{Logical, set \code{TRUE} to delete dataset.}
23 |
24 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.}
25 |
26 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can
27 | greatly reduce the size. Defaults to FALSE.}
28 |
29 | \item{manual_download}{Logical, set \code{TRUE} if you have manually
30 | downloaded the file and placed it in the folder designated by running
31 | this function with \code{return_path = TRUE}.}
32 | }
33 | \value{
34 | A tibble with 6,787 rows and 2 variables:
35 | \describe{
36 | \item{word}{An English word}
37 | \item{sentiment}{Indicator for sentiment: "negative" or "positive"}
38 | }
39 | }
40 | \description{
41 | General purpose English sentiment lexicon that categorizes words in a
42 | binary fashion, either positive or negative.
43 | }
44 | \details{
45 | Citation info:
46 |
47 | This dataset was first published in Minqing Hu and Bing Liu, ``Mining and
48 | summarizing customer reviews.'', Proceedings of the ACM SIGKDD International
49 | Conference on Knowledge Discovery & Data Mining (KDD-2004), 2004.
50 |
51 | inproceedings\{Hu04, \cr
52 | author = \{Hu, Minqing and Liu, Bing\}, \cr
53 | title = \{Mining and Summarizing Customer Reviews\}, \cr
54 | booktitle = \{Proceedings of the Tenth ACM SIGKDD International Conference
55 | on Knowledge Discovery and Data Mining\}, \cr
56 | series = \{KDD '04\}, \cr
57 | year = \{2004\}, \cr
58 | isbn = \{1-58113-888-1\}, \cr
59 | location = \{Seattle, WA, USA\}, \cr
60 | pages = \{168--177\}, \cr
61 | numpages = \{10\}, \cr
62 | url = \{http://doi.acm.org/10.1145/1014052.1014073\}, \cr
63 | doi = \{10.1145/1014052.1014073\}, \cr
64 | acmid = \{1014073\}, \cr
65 | publisher = \{ACM\}, \cr
66 | address = \{New York, NY, USA\}, \cr
67 | keywords = \{reviews, sentiment classification, summarization, text mining\}, \cr
68 | \}
69 | }
70 | \examples{
71 | \dontrun{
72 | lexicon_bing()
73 |
74 | # Custom directory
75 | lexicon_bing(dir = "data/")
76 |
77 | # Deleting dataset
78 | lexicon_bing(delete = TRUE)
79 |
80 | # Returning filepath of data
81 | lexicon_bing(return_path = TRUE)
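   |
   | # A possible next step (not part of the original examples; assumes dplyr is
   | # installed): tally how many words carry each sentiment.
   | dplyr::count(lexicon_bing(), sentiment)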
82 | }
83 | }
84 | \seealso{
85 | Other lexicon:
86 | \code{\link{lexicon_afinn}()},
87 | \code{\link{lexicon_loughran}()},
88 | \code{\link{lexicon_nrc}()},
89 | \code{\link{lexicon_nrc_eil}()},
90 | \code{\link{lexicon_nrc_vad}()}
91 | }
92 | \concept{lexicon}
93 | \keyword{datasets}
94 |
--------------------------------------------------------------------------------
/man/lexicon_loughran.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lexicon_loughran.R
3 | \name{lexicon_loughran}
4 | \alias{lexicon_loughran}
5 | \title{Loughran-McDonald sentiment lexicon}
6 | \source{
7 | \url{https://sraf.nd.edu/loughranmcdonald-master-dictionary/}
8 | }
9 | \usage{
10 | lexicon_loughran(
11 | dir = NULL,
12 | delete = FALSE,
13 | return_path = FALSE,
14 | clean = FALSE,
15 | manual_download = FALSE
16 | )
17 | }
18 | \arguments{
19 | \item{dir}{Character, path to directory where data will be stored. If
20 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.}
21 |
22 | \item{delete}{Logical, set \code{TRUE} to delete dataset.}
23 |
24 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.}
25 |
26 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can
27 | greatly reduce the size. Defaults to FALSE.}
28 |
29 | \item{manual_download}{Logical, set \code{TRUE} if you have manually
30 | downloaded the file and placed it in the folder designated by running
31 | this function with \code{return_path = TRUE}.}
32 | }
33 | \value{
34 | A tibble with 4,150 rows and 2 variables:
35 | \describe{
36 | \item{word}{An English word}
37 | \item{sentiment}{Indicator for sentiment: "negative", "positive",
38 | "litigious", "uncertainty", "constraining", or "superfluous"}
39 | }
40 | }
41 | \description{
42 | English sentiment lexicon created for use with financial documents. This
43 | lexicon labels words with six possible sentiments important in financial
44 | contexts: "negative", "positive", "litigious", "uncertainty", "constraining",
45 | or "superfluous".
46 | }
47 | \details{
48 | Citation info:
49 |
50 | This dataset was published in Loughran, T. and McDonald, B. (2011),
51 | ``When Is a Liability Not a Liability? Textual Analysis, Dictionaries, and
52 | 10-Ks.'' The Journal of Finance, 66: 35-65.
53 |
54 | article\{loughran11, \cr
55 | author = \{Loughran, Tim and McDonald, Bill\}, \cr
56 | title = \{When Is a Liability Not a Liability? Textual Analysis, Dictionaries, and 10-Ks\}, \cr
57 | journal = \{The Journal of Finance\}, \cr
58 | volume = \{66\}, \cr
59 | number = \{1\}, \cr
60 | pages = \{35-65\}, \cr
61 | doi = \{10.1111/j.1540-6261.2010.01625.x\}, \cr
62 | url = \{https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1540-6261.2010.01625.x\}, \cr
63 | eprint = \{https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1540-6261.2010.01625.x\}, \cr
64 | year = \{2011\} \cr
65 | \}
66 | }
67 | \examples{
68 | \dontrun{
69 | lexicon_loughran()
70 |
71 | # Custom directory
72 | lexicon_loughran(dir = "data/")
73 |
74 | # Deleting dataset
75 | lexicon_loughran(delete = TRUE)
76 |
77 | # Returning filepath of data
78 | lexicon_loughran(return_path = TRUE)
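   |
   | # A hypothetical follow-up (assumes dplyr is installed): keep only the words
   | # flagged as "uncertainty", one of the six documented categories.
   | dplyr::filter(lexicon_loughran(), sentiment == "uncertainty")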
79 | }
80 | }
81 | \seealso{
82 | Other lexicon:
83 | \code{\link{lexicon_afinn}()},
84 | \code{\link{lexicon_bing}()},
85 | \code{\link{lexicon_nrc}()},
86 | \code{\link{lexicon_nrc_eil}()},
87 | \code{\link{lexicon_nrc_vad}()}
88 | }
89 | \concept{lexicon}
90 | \keyword{datasets}
91 |
--------------------------------------------------------------------------------
/man/lexicon_nrc.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lexicon_nrc.R
3 | \name{lexicon_nrc}
4 | \alias{lexicon_nrc}
5 | \title{NRC word-emotion association lexicon}
6 | \source{
7 | \url{http://saifmohammad.com/WebPages/lexicons.html}
8 | }
9 | \usage{
10 | lexicon_nrc(
11 | dir = NULL,
12 | delete = FALSE,
13 | return_path = FALSE,
14 | clean = FALSE,
15 | manual_download = FALSE
16 | )
17 | }
18 | \arguments{
19 | \item{dir}{Character, path to directory where data will be stored. If
20 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.}
21 |
22 | \item{delete}{Logical, set \code{TRUE} to delete dataset.}
23 |
24 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.}
25 |
26 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can
27 | greatly reduce the size. Defaults to FALSE.}
28 |
29 | \item{manual_download}{Logical, set \code{TRUE} if you have manually
30 | downloaded the file and placed it in the folder designated by running
31 | this function with \code{return_path = TRUE}.}
32 | }
33 | \value{
34 | A tibble with 13,901 rows and 2 variables:
35 | \describe{
36 | \item{word}{An English word}
37 | \item{sentiment}{Indicator for sentiment or emotion: "negative",
38 | "positive", "anger", "anticipation", "disgust", "fear", "joy", "sadness",
39 | "surprise", or "trust"}
40 | }
41 | }
42 | \description{
43 | General purpose English sentiment/emotion lexicon. This lexicon labels words
44 | with ten possible sentiments or emotions: "negative", "positive", "anger",
45 | "anticipation", "disgust", "fear", "joy", "sadness", "surprise", or "trust".
46 | The annotations were manually done through Amazon's Mechanical Turk.
47 | }
48 | \details{
49 | License required for commercial use. Please contact Saif M. Mohammad
50 | (saif.mohammad@nrc-cnrc.gc.ca).
51 |
52 | Citation info:
53 |
54 | This dataset was published in Saif Mohammad and Peter Turney. (2013),
55 | ``Crowdsourcing a Word-Emotion Association Lexicon.'' Computational
56 | Intelligence, 29(3): 436-465.
57 |
58 | article\{mohammad13, \cr
59 | author = \{Mohammad, Saif M. and Turney, Peter D.\}, \cr
60 | title = \{CROWDSOURCING A WORD–EMOTION ASSOCIATION LEXICON\}, \cr
61 | journal = \{Computational Intelligence\}, \cr
62 | volume = \{29\}, \cr
63 | number = \{3\}, \cr
64 | pages = \{436-465\}, \cr
65 | doi = \{10.1111/j.1467-8640.2012.00460.x\}, \cr
66 | url = \{https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-8640.2012.00460.x\}, \cr
67 | eprint = \{https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1467-8640.2012.00460.x\}, \cr
68 | year = \{2013\} \cr
69 | \}
70 | }
71 | \examples{
72 | \dontrun{
73 | lexicon_nrc()
74 |
75 | # Custom directory
76 | lexicon_nrc(dir = "data/")
77 |
78 | # Deleting dataset
79 | lexicon_nrc(delete = TRUE)
80 |
81 | # Returning filepath of data
82 | lexicon_nrc(return_path = TRUE)
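   |
   | # A hypothetical follow-up (assumes the dplyr and tidytext packages): count
   | # which emotions the words of a sentence are associated with.
   | library(dplyr)
   | library(tidytext)
   | tibble(text = "an outrageous but ultimately joyful surprise") |>
   |   unnest_tokens(word, text) |>
   |   inner_join(lexicon_nrc(), by = "word") |>
   |   count(sentiment, sort = TRUE)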
83 | }
84 | }
85 | \seealso{
86 | Other lexicon:
87 | \code{\link{lexicon_afinn}()},
88 | \code{\link{lexicon_bing}()},
89 | \code{\link{lexicon_loughran}()},
90 | \code{\link{lexicon_nrc_eil}()},
91 | \code{\link{lexicon_nrc_vad}()}
92 | }
93 | \concept{lexicon}
94 | \keyword{datasets}
95 |
--------------------------------------------------------------------------------
/man/lexicon_nrc_eil.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lexicon_nrc_eil.R
3 | \name{lexicon_nrc_eil}
4 | \alias{lexicon_nrc_eil}
5 | \title{NRC Emotion Intensity Lexicon (aka Affect Intensity Lexicon) v0.5}
6 | \source{
7 | \url{https://saifmohammad.com/WebPages/AffectIntensity.htm}
8 | }
9 | \usage{
10 | lexicon_nrc_eil(
11 | dir = NULL,
12 | delete = FALSE,
13 | return_path = FALSE,
14 | clean = FALSE,
15 | manual_download = FALSE
16 | )
17 | }
18 | \arguments{
19 | \item{dir}{Character, path to directory where data will be stored. If
20 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.}
21 |
22 | \item{delete}{Logical, set \code{TRUE} to delete dataset.}
23 |
24 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.}
25 |
26 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can
27 | greatly reduce the size. Defaults to FALSE.}
28 |
29 | \item{manual_download}{Logical, set \code{TRUE} if you have manually
30 | downloaded the file and placed it in the folder designated by running
31 | this function with \code{return_path = TRUE}.}
32 | }
33 | \value{
34 | A tibble with 5,814 rows and 3 variables:
35 | \describe{
36 | \item{term}{An English word}
37 | \item{score}{Value between 0 and 1}
38 |   \item{AffectDimension}{Indicator for sentiment or emotion: "anger",
39 | "fear", "sadness", or "joy"}
40 | }
41 | }
42 | \description{
43 | General purpose English sentiment/emotion lexicon. The NRC Affect Intensity
44 | Lexicon is a list of English words and their associations with four basic
45 | emotions (anger, fear, sadness, joy).
46 | }
47 | \details{
48 | For a given word and emotion X, the scores range from 0 to 1. A score of 1
49 | means that the word conveys the highest amount of emotion X. A score of 0
50 | means that the word conveys the lowest amount of emotion X.
51 |
52 | License required for commercial use. Please contact Saif M. Mohammad
53 | (saif.mohammad@nrc-cnrc.gc.ca).
54 |
55 | Citation info:
56 |
57 | Details of the lexicon are in this paper:
58 | Word Affect Intensities. Saif M. Mohammad. In Proceedings of the 11th Edition
59 | of the Language Resources and Evaluation Conference (LREC-2018), May 2018,
60 | Miyazaki, Japan.
61 |
62 | inproceedings\{LREC18-AIL, \cr
63 | author = \{Mohammad, Saif M.\}, \cr
64 | title = \{Word Affect Intensities\}, \cr
65 | booktitle = \{Proceedings of the 11th Edition of the Language Resources and Evaluation Conference (LREC-2018)\}, \cr
66 | year = \{2018\}, \cr
67 | address=\{Miyazaki, Japan\} \cr
68 | \} \cr
69 | }
70 | \examples{
71 | \dontrun{
72 | lexicon_nrc_eil()
73 |
74 | # Custom directory
75 | lexicon_nrc_eil(dir = "data/")
76 |
77 | # Deleting dataset
78 | lexicon_nrc_eil(delete = TRUE)
79 |
80 | # Returning filepath of data
81 | lexicon_nrc_eil(return_path = TRUE)
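   |
   | # A possible next step (not part of the original examples; assumes dplyr is
   | # installed): list the terms that convey "anger" most intensely.
   | lexicon_nrc_eil() |>
   |   dplyr::filter(AffectDimension == "anger") |>
   |   dplyr::arrange(dplyr::desc(score)) |>
   |   head(10)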
82 | }
83 | }
84 | \seealso{
85 | Other lexicon:
86 | \code{\link{lexicon_afinn}()},
87 | \code{\link{lexicon_bing}()},
88 | \code{\link{lexicon_loughran}()},
89 | \code{\link{lexicon_nrc}()},
90 | \code{\link{lexicon_nrc_vad}()}
91 | }
92 | \concept{lexicon}
93 | \keyword{datasets}
94 |
--------------------------------------------------------------------------------
/man/lexicon_nrc_vad.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lexicon_nrc_vad.R
3 | \name{lexicon_nrc_vad}
4 | \alias{lexicon_nrc_vad}
5 | \title{The NRC Valence, Arousal, and Dominance Lexicon}
6 | \source{
7 | \url{https://saifmohammad.com/WebPages/nrc-vad.html}
8 | }
9 | \usage{
10 | lexicon_nrc_vad(
11 | dir = NULL,
12 | delete = FALSE,
13 | return_path = FALSE,
14 | clean = FALSE,
15 | manual_download = FALSE
16 | )
17 | }
18 | \arguments{
19 | \item{dir}{Character, path to directory where data will be stored. If
20 | \code{NULL}, \link[rappdirs]{user_cache_dir} will be used to determine path.}
21 |
22 | \item{delete}{Logical, set \code{TRUE} to delete dataset.}
23 |
24 | \item{return_path}{Logical, set \code{TRUE} to return the path of the dataset.}
25 |
26 | \item{clean}{Logical, set \code{TRUE} to remove intermediate files. This can
27 | greatly reduce the size. Defaults to FALSE.}
28 |
29 | \item{manual_download}{Logical, set \code{TRUE} if you have manually
30 | downloaded the file and placed it in the folder designated by running
31 | this function with \code{return_path = TRUE}.}
32 | }
33 | \value{
34 | A tibble with 20,007 rows and 4 variables:
35 | \describe{
36 | \item{word}{An English word}
37 | \item{Valence}{valence score of the word}
38 | \item{Arousal}{arousal score of the word}
39 | \item{Dominance}{dominance score of the word}
40 | }
41 | }
42 | \description{
43 | The NRC Valence, Arousal, and Dominance (VAD) Lexicon includes a list of
44 | more than 20,000 English words and their valence, arousal, and dominance
45 | scores. For a given word and a dimension (V/A/D), the scores range from 0
46 | (lowest V/A/D) to 1 (highest V/A/D). The lexicon, with its fine-grained,
47 | real-valued scores, was created by manual annotation using best--worst scaling.
48 | The lexicon is markedly larger than existing VAD lexicons, and its authors
49 | report that the ratings obtained are substantially more reliable than those in
50 | existing lexicons.
51 | }
52 | \details{
53 | License required for commercial use. Please contact Saif M. Mohammad
54 | (saif.mohammad@nrc-cnrc.gc.ca).
55 |
56 | Citation info:
57 |
58 | Details of the NRC VAD Lexicon are available in this paper:
59 |
60 | Obtaining Reliable Human Ratings of Valence, Arousal, and Dominance for
61 | 20,000 English Words. Saif M. Mohammad. In Proceedings of the 56th Annual
62 | Meeting of the Association for Computational Linguistics, Melbourne,
63 | Australia, July 2018.
64 |
65 | inproceedings\{vad-acl2018, \cr
66 | title=\{Obtaining Reliable Human Ratings of Valence, Arousal, and Dominance for 20,000 English Words\}, \cr
67 | author=\{Mohammad, Saif M.\}, \cr
68 | booktitle=\{Proceedings of The Annual Conference of the Association for Computational Linguistics (ACL)\}, \cr
69 | year=\{2018\}, \cr
70 | address=\{Melbourne, Australia\} \cr
71 | \}
72 | }
73 | \examples{
74 | \dontrun{
75 | lexicon_nrc_vad()
76 |
77 | # Custom directory
78 | lexicon_nrc_vad(dir = "data/")
79 |
80 | # Deleting dataset
81 | lexicon_nrc_vad(delete = TRUE)
82 |
83 | # Returning filepath of data
84 | lexicon_nrc_vad(return_path = TRUE)
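   |
   | # A hypothetical follow-up (assumes dplyr is installed): look at the words
   | # rated highest on the Valence dimension.
   | head(dplyr::arrange(lexicon_nrc_vad(), dplyr::desc(Valence)), 10)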
85 | }
86 | }
87 | \seealso{
88 | Other lexicon:
89 | \code{\link{lexicon_afinn}()},
90 | \code{\link{lexicon_bing}()},
91 | \code{\link{lexicon_loughran}()},
92 | \code{\link{lexicon_nrc}()},
93 | \code{\link{lexicon_nrc_eil}()}
94 | }
95 | \concept{lexicon}
96 | \keyword{datasets}
97 |
--------------------------------------------------------------------------------
/man/load_dataset.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/load_dataset.R
3 | \name{load_dataset}
4 | \alias{load_dataset}
5 | \title{Internal Functions}
6 | \usage{
7 | load_dataset(
8 | data_name,
9 | name,
10 | dir,
11 | delete,
12 | return_path,
13 | clean,
14 | clean_manual = NULL,
15 | manual_download
16 | )
17 | }
18 | \description{
19 | These are not to be used directly by the users.
20 | }
21 | \keyword{internal}
22 |
--------------------------------------------------------------------------------
/man/textdata-package.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/textdata-package.R
3 | \docType{package}
4 | \name{textdata-package}
5 | \alias{textdata}
6 | \alias{textdata-package}
7 | \title{textdata: Download and Load Various Text Datasets}
8 | \description{
9 | \if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}}
10 |
11 | Provides a framework to download, parse, and store text datasets on the disk and load them when needed. Includes various sentiment lexicons and labeled text data sets for classification and analysis.
12 | }
13 | \seealso{
14 | Useful links:
15 | \itemize{
16 | \item \url{https://emilhvitfeldt.github.io/textdata/}
17 | \item \url{https://github.com/EmilHvitfeldt/textdata}
18 | \item Report bugs at \url{https://github.com/EmilHvitfeldt/textdata/issues}
19 | }
20 |
21 | }
22 | \author{
23 | \strong{Maintainer}: Emil Hvitfeldt \email{emilhhvitfeldt@gmail.com} (\href{https://orcid.org/0000-0002-0679-1945}{ORCID})
24 |
25 | Other contributors:
26 | \itemize{
27 | \item Julia Silge \email{julia.silge@gmail.com} (\href{https://orcid.org/0000-0002-3671-836X}{ORCID}) [contributor]
28 | }
29 |
30 | }
31 | \keyword{internal}
32 |
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-120x120.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/apple-touch-icon-120x120.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-152x152.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/apple-touch-icon-152x152.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-180x180.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/apple-touch-icon-180x180.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-60x60.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/apple-touch-icon-60x60.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-76x76.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/apple-touch-icon-76x76.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/apple-touch-icon.png
--------------------------------------------------------------------------------
/pkgdown/favicon/favicon-16x16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/favicon-16x16.png
--------------------------------------------------------------------------------
/pkgdown/favicon/favicon-32x32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/favicon-32x32.png
--------------------------------------------------------------------------------
/pkgdown/favicon/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmilHvitfeldt/textdata/7a99a97b4e7f30927bc5509d5dfaafd2aa8b58d6/pkgdown/favicon/favicon.ico
--------------------------------------------------------------------------------
/revdep/README.md:
--------------------------------------------------------------------------------
1 | # Platform
2 |
3 | |field |value |
4 | |:--------|:----------------------------------------------------------------------------------------|
5 | |version |R version 4.3.3 (2024-02-29) |
6 | |os |macOS Sonoma 14.4.1 |
7 | |system |aarch64, darwin20 |
8 | |ui |X11 |
9 | |language |(EN) |
10 | |collate |en_US.UTF-8 |
11 | |ctype |en_US.UTF-8 |
12 | |tz |America/Los_Angeles |
13 | |date |2024-05-28 |
14 | |pandoc |3.1.12.3 @ /Applications/Positron.app/Contents/Resources/app/bin/pandoc/ (via rmarkdown) |
15 |
16 | # Dependencies
17 |
18 | |package |old |new |Δ |
19 | |:-----------|:-----|:----------|:--|
20 | |textdata |0.4.4 |0.4.4.9000 |* |
21 | |bit |4.0.5 |4.0.5 | |
22 | |bit64 |4.0.5 |4.0.5 | |
23 | |cli |3.6.2 |3.6.2 | |
24 | |clipr |0.8.0 |0.8.0 | |
25 | |cpp11 |0.4.7 |0.4.7 | |
26 | |crayon |1.5.2 |1.5.2 | |
27 | |fansi |1.0.6 |1.0.6 | |
28 | |fs |1.6.4 |1.6.4 | |
29 | |glue |1.7.0 |1.7.0 | |
30 | |hms |1.1.3 |1.1.3 | |
31 | |lifecycle |1.0.4 |1.0.4 | |
32 | |magrittr |2.0.3 |2.0.3 | |
33 | |pillar |1.9.0 |1.9.0 | |
34 | |pkgconfig |2.0.3 |2.0.3 | |
35 | |prettyunits |1.2.0 |1.2.0 | |
36 | |progress |1.2.3 |1.2.3 | |
37 | |R6 |2.5.1 |2.5.1 | |
38 | |rappdirs |0.3.3 |0.3.3 | |
39 | |readr |2.1.5 |2.1.5 | |
40 | |rlang |1.1.3 |1.1.3 | |
41 | |tibble |3.2.1 |3.2.1 | |
42 | |tidyselect |1.2.1 |1.2.1 | |
43 | |tzdb |0.4.0 |0.4.0 | |
44 | |utf8 |1.2.4 |1.2.4 | |
45 | |vctrs |0.6.5 |0.6.5 | |
46 | |vroom |1.6.5 |1.6.5 | |
47 | |withr |3.0.0 |3.0.0 | |
48 |
49 | # Revdeps
50 |
51 |
--------------------------------------------------------------------------------
/revdep/cran.md:
--------------------------------------------------------------------------------
1 | ## revdepcheck results
2 |
3 | We checked 3 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package.
4 |
5 | * We saw 0 new problems
6 | * We failed to check 0 packages
7 |
8 |
--------------------------------------------------------------------------------
/revdep/failures.md:
--------------------------------------------------------------------------------
1 | *Wow, no problems at all. :)*
--------------------------------------------------------------------------------
/revdep/problems.md:
--------------------------------------------------------------------------------
1 | *Wow, no problems at all. :)*
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(textdata)
3 |
4 | test_check("textdata")
5 |
--------------------------------------------------------------------------------
/tests/testthat/test-download_functions.R:
--------------------------------------------------------------------------------
1 | downloads <- setdiff(
2 | ls(getNamespace("textdata"), pattern = "^download_"),
3 | "download_functions"
4 | )
5 |
6 | test_that("All download functions are included in download_functions", {
7 | expect_equal(
8 | length(downloads),
9 | length(textdata:::download_functions)
10 | )
11 | })
12 |
13 | test_that("All download functions has the folder_path argument", {
14 | for (fun in downloads) {
15 | expect_equal(
16 | names(formals(get(fun, getNamespace("textdata")))),
17 | "folder_path"
18 | )
19 | }
20 | })
21 |
22 | test_that("the download functions are named right according to print_info", {
23 | testthat::expect_setequal(
24 | paste0("download_", names(textdata:::print_info)),
25 | downloads
26 | )
27 | })
28 |
--------------------------------------------------------------------------------
/tests/testthat/test-info.R:
--------------------------------------------------------------------------------
1 | test_that("print_info has right names", {
2 | lapply(
3 | textdata:::print_info,
4 | function(x) expect_true(all(names(x) == c("name", "url", "license", "size", "type", "download_mech", "description", "citation")))
5 | )
6 | })
7 |
--------------------------------------------------------------------------------
/tests/testthat/test-process_functions.R:
--------------------------------------------------------------------------------
1 | processs <- setdiff(
2 | ls(getNamespace("textdata"), pattern = "^process_"),
3 | "process_functions"
4 | )
5 |
6 | test_that("All process functions are included in process_functions", {
7 | expect_equal(
8 | length(processs),
9 | length(textdata:::process_functions)
10 | )
11 | })
12 |
13 | test_that("All process functions has the folder_path argument", {
14 | for (fun in processs) {
15 | expect_equal(
16 | names(formals(get(fun, getNamespace("textdata")))),
17 | c("folder_path", "name_path")
18 | )
19 | }
20 | })
21 |
22 | test_that("the process functions are named right according to print_info", {
23 | testthat::expect_setequal(
24 | paste0("process_", names(textdata:::print_info)),
25 | processs
26 | )
27 | })
28 |
--------------------------------------------------------------------------------
/textdata.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: No
4 | SaveWorkspace: No
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageRoxygenize: rd,collate,namespace
22 |
--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 |
--------------------------------------------------------------------------------
/vignettes/How-to-add-a-data-set.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "How to add a data set"
3 | output: rmarkdown::html_vignette
4 | vignette: >
5 | %\VignetteIndexEntry{How to add a data set}
6 | %\VignetteEngine{knitr::rmarkdown}
7 | %\VignetteEncoding{UTF-8}
8 | ---
9 |
10 | ```{r, include = FALSE}
11 | knitr::opts_chunk$set(
12 | collapse = TRUE,
13 | comment = "#>"
14 | )
15 | ```
16 |
17 | ```{r setup}
18 | library(textdata)
19 | ```
20 |
21 | This package provides infrastructure to make text datasets available within R, even when they are too large to store within an R package or are licensed in such a way that prevents them from being included in OSS-licensed packages.
22 |
23 | Do you want to add a new dataset to the textdata package?
24 |
25 | - Create an R file named `prefix_*.R` in the `R/` folder, where `*` is the name of the dataset. Supported prefixes include
26 | - `dataset_`
27 | - `lexicon_`
28 | - Inside that file, create three functions named `download_*()`, `process_*()`, and `dataset_*()` (a minimal sketch of all three is shown after this list).
29 | - The `download_*()` function should take one argument named `folder_path`. It has two tasks: first, check whether the file has already been downloaded, and if so return `invisible()`; otherwise, download the file to that path.
30 | - The `process_*()` function should take two arguments, `folder_path` and `name_path`. `folder_path` denotes the path to the file returned by `download_*()`, and `name_path` is the path where the processed data should live. The main job of `process_*()` is to turn the downloaded file into a .rds file containing a tidy tibble.
31 | - The `dataset_*()` function should wrap `load_dataset()`.
32 | - Add the `process_*()` function to the named list `process_functions` in the file process_functions.R.
33 | - Add the `download_*()` function to the named list `download_functions` in the file download_functions.R.
34 | - Modify the `print_info` list in the info.R file.
35 | - Add `dataset_*.R` to the @include tags in `download_functions.R`.
36 | - Add the dataset to the table in `README.Rmd`.
37 | - Add the dataset to `_pkgdown.yml`.
38 | - Write a bullet in the `NEWS.md` file.
39 |
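   | As a rough sketch, the three functions for a hypothetical data set called
   | `myset` could fit together as below. The URL, file name, and column handling
   | are placeholders; the existing files in `R/` are the authoritative pattern and
   | may use different helper functions.
   |
   | ```{r, eval = FALSE}
   | download_myset <- function(folder_path) {
   |   file_path <- file.path(folder_path, "myset.csv")
   |   # Task 1: if the raw file is already present, do nothing.
   |   if (file.exists(file_path)) {
   |     return(invisible())
   |   }
   |   # Task 2: otherwise download it to the cache folder (placeholder URL).
   |   utils::download.file("https://example.org/myset.csv", destfile = file_path)
   | }
   |
   | process_myset <- function(folder_path, name_path) {
   |   # Turn the raw download into a tidy tibble stored as an .rds file.
   |   raw <- utils::read.csv(file.path(folder_path, "myset.csv"))
   |   saveRDS(tibble::as_tibble(raw), name_path)
   | }
   |
   | dataset_myset <- function(dir = NULL, delete = FALSE, return_path = FALSE,
   |                           clean = FALSE, manual_download = FALSE) {
   |   # Thin wrapper around the internal load_dataset() helper.
   |   load_dataset(
   |     data_name = "myset", name = "myset.rds", dir = dir,
   |     delete = delete, return_path = return_path, clean = clean,
   |     manual_download = manual_download
   |   )
   | }
   | ```
   |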
40 | What are the guidelines for adding datasets?
41 |
42 | # Guidelines for textdata datasets
43 |
44 | - All datasets must have a license or terms of use clearly specified.
45 | - Data should be a vector or tibble.
46 | - Use `word` instead of `words` for column names.
47 |
48 | # Classification datasets
49 |
50 | For datasets that come with both a training and a testing split, let the user pick which one to retrieve with a `split` argument, similar to how `dataset_ag_news()` does.
51 |
--------------------------------------------------------------------------------