├── .Rbuildignore
├── .Rprofile
├── .gitignore
├── .gitmodules
├── .travis.yml
├── CONDUCT.md
├── DESCRIPTION
├── LICENSE
├── Makefile
├── NAMESPACE
├── NEWS.md
├── R
│   ├── coerce.R
│   ├── corpus_frame.R
│   ├── deprecated.R
│   ├── frame-stats.R
│   ├── frame.R
│   ├── gutenberg.R
│   ├── json.R
│   ├── logging.R
│   ├── sentence.R
│   ├── stem.R
│   ├── style.R
│   ├── term.R
│   ├── text-base.R
│   ├── text-primitive.R
│   ├── text-stats.R
│   ├── text-utils.R
│   ├── text.R
│   ├── text_filter.R
│   ├── text_locate.R
│   ├── text_split.R
│   ├── text_stats.R
│   ├── text_types.R
│   ├── token.R
│   ├── util.R
│   └── wordlist.R
├── README.md
├── TODO.md
├── _pkgdown.yml
├── appveyor.yml
├── bench
│   ├── bench-term_matrix.R
│   ├── bench-term_matrix.Rout
│   └── bench.R
├── data
│   ├── abbreviations.rda
│   ├── abbreviations
│   │   └── 01_make_abbreviations.R
│   ├── affect_wordnet.rda
│   ├── affect_wordnet
│   │   ├── 01_make_tsv.py
│   │   ├── 02_make_rda.R
│   │   ├── README
│   │   ├── wn-affect-1.1
│   │   │   ├── a-hierarchy.xml
│   │   │   ├── a-synsets.xml
│   │   │   └── readme-wn-affect-1.1.txt
│   │   ├── wnaffect.tsv
│   │   └── wordnet-1.6
│   │       ├── LICENSE
│   │       ├── dict
│   │       │   ├── Makefile
│   │       │   ├── adj.exc
│   │       │   ├── adv.exc
│   │       │   ├── cntlist
│   │       │   ├── cousin.exc
│   │       │   ├── cousin.tops
│   │       │   ├── data.adj
│   │       │   ├── data.adv
│   │       │   ├── data.noun
│   │       │   ├── data.verb
│   │       │   ├── index.adj
│   │       │   ├── index.adv
│   │       │   ├── index.gloss
│   │       │   ├── index.noun
│   │       │   ├── index.sense
│   │       │   ├── index.verb
│   │       │   ├── lexnames
│   │       │   ├── noun.exc
│   │       │   ├── sentidx.vrb
│   │       │   ├── sents.vrb
│   │       │   ├── stoplist.pl
│   │       │   └── verb.exc
│   │       └── man
│   │           └── html
│   │               ├── binsrch.htm
│   │               ├── cntlist.htm
│   │               ├── glossidx.htm
│   │               ├── lexnames.htm
│   │               ├── morph.htm
│   │               ├── morphy.htm
│   │               ├── senseidx.htm
│   │               ├── taglist.htm
│   │               ├── uniqbeg.htm
│   │               ├── wn.htm
│   │               ├── wnb.htm
│   │               ├── wndb.htm
│   │               ├── wngloss.htm
│   │               ├── wngroups.htm
│   │               ├── wninput.htm
│   │               ├── wnintro1.htm
│   │               ├── wnintro3.htm
│   │               ├── wnintro5.htm
│   │               ├── wnintro7.htm
│   │               ├── wnlicens.htm
│   │               ├── wnpkgs.htm
│   │               ├── wnsearch.htm
│   │               ├── wnstats.htm
│   │               └── wnutil.htm
│   ├── federalist.rda
│   ├── federalist
│   │   ├── .gitignore
│   │   ├── 01_download_raw.sh
│   │   ├── 02_make_json.py
│   │   └── 03_make_rda.R
│   ├── sentiment_afinn.rda
│   ├── sentiment_afinn
│   │   ├── 01_download_raw.sh
│   │   ├── 02_make_rda.R
│   │   └── AFINN
│   │       ├── AFINN-111.txt
│   │       ├── AFINN-96.txt
│   │       └── AFINN-README.txt
│   ├── stopwords.rda
│   └── stopwords
│       └── 01_make_stopwords.R
├── docs
│   ├── CNAME
│   ├── LICENSE.html
│   ├── articles
│   │   ├── chinese-wordcloud-1.png
│   │   ├── chinese.html
│   │   ├── corpus-emotion-1.png
│   │   ├── corpus-heapslaw-1.png
│   │   ├── corpus-witch-occurrences-1.png
│   │   ├── corpus.html
│   │   ├── gender-estimates-1.png
│   │   ├── gender-estimates_se-1.png
│   │   ├── gender-signif-1.png
│   │   ├── gender.html
│   │   ├── index.html
│   │   ├── stemmer.html
│   │   ├── textdata.html
│   │   └── unicode.html
│   ├── authors.html
│   ├── favicon.ico
│   ├── index.html
│   ├── jquery.sticky-kit.min.js
│   ├── link.svg
│   ├── logo.png
│   ├── news
│   │   └── index.html
│   ├── pkgdown.css
│   ├── pkgdown.js
│   └── reference
│       ├── abbreviations.html
│       ├── affect_wordnet.html
│       ├── corpus-deprecated.html
│       ├── corpus-package.html
│       ├── corpus_frame.html
│       ├── corpus_text.html
│       ├── federalist.html
│       ├── figures
│       │   ├── banner.png
│       │   ├── logo.png
│       │   └── logo
│       │       ├── 01_make_logo.R
│       │       ├── README
│       │       ├── c-07.jpg
│       │       ├── logo-slide.tiff
│       │       └── logo.key
│       ├── gutenberg_corpus.html
│       ├── index.html
│       ├── new_stemmer.html
│       ├── print.corpus_frame.html
│       ├── read_ndjson.html
│       ├── sentiment_afinn.html
│       ├── stem_snowball.html
│       ├── stopwords.html
│       ├── term_matrix.html
│       ├── term_stats.html
│       ├── text_filter.html
│       ├── text_locate.html
│       ├── text_split.html
│       ├── text_stats.html
│       ├── text_sub.html
│       ├── text_tokens.html
│       ├── text_types.html
│       └── utf8.html
├── inst
│   └── WORDLIST
├── man
│   ├── abbreviations.Rd
│   ├── affect_wordnet.Rd
│   ├── corpus-deprecated.Rd
│   ├── corpus-package.Rd
│   ├── corpus_frame.Rd
│   ├── corpus_text.Rd
│   ├── federalist.Rd
│   ├── figures
│   │   ├── banner.png
│   │   ├── logo.png
│   │   └── logo
│   │       ├── 01_make_logo.R
│   │       ├── README
│   │       ├── c-07.jpg
│   │       ├── logo-slide.tiff
│   │       └── logo.key
│   ├── gutenberg_corpus.Rd
│   ├── new_stemmer.Rd
│   ├── print.corpus_frame.Rd
│   ├── read_ndjson.Rd
│   ├── sentiment_afinn.Rd
│   ├── stem_snowball.Rd
│   ├── stopwords.Rd
│   ├── term_matrix.Rd
│   ├── term_stats.Rd
│   ├── text_filter.Rd
│   ├── text_locate.Rd
│   ├── text_split.Rd
│   ├── text_stats.Rd
│   ├── text_sub.Rd
│   ├── text_tokens.Rd
│   └── text_types.Rd
├── src
│   ├── Makevars
│   ├── context.c
│   ├── decode.c
│   ├── filebuf.c
│   ├── init.c
│   ├── json.c
│   ├── logging.c
│   ├── mkchar.c
│   ├── ndjson.c
│   ├── rcorpus.h
│   ├── search.c
│   ├── stemmer.c
│   ├── term_matrix.c
│   ├── term_stats.c
│   ├── termset.c
│   ├── text.c
│   ├── text_c.c
│   ├── text_filter.c
│   ├── text_locate.c
│   ├── text_methods.c
│   ├── text_nunit.c
│   ├── text_split.c
│   ├── text_sub.c
│   ├── text_tokens.c
│   ├── text_trunc.c
│   ├── text_types.c
│   ├── util.c
│   └── wordlist.c
├── tests
│   ├── testthat.R
│   └── testthat
│       ├── helper-capture_output.R
│       ├── helper-locale.R
│       ├── helper-options.R
│       ├── test-foreign.R
│       ├── test-frame-stats.R
│       ├── test-frame.R
│       ├── test-gutenberg_corpus.R
│       ├── test-json_record.R
│       ├── test-json_scalar.R
│       ├── test-json_serialize.R
│       ├── test-read_ndjson.R
│       ├── test-stemmer.R
│       ├── test-term_counts.R
│       ├── test-term_matrix.R
│       ├── test-term_stats.R
│       ├── test-text-stats.R
│       ├── test-text.R
│       ├── test-text_base.R
│       ├── test-text_c.R
│       ├── test-text_filter.R
│       ├── test-text_format.R
│       ├── test-text_index.R
│       ├── test-text_locate.R
│       ├── test-text_names.R
│       ├── test-text_nunit.R
│       ├── test-text_primitive.R
│       ├── test-text_print.R
│       ├── test-text_split_sentences.R
│       ├── test-text_split_tokens.R
│       ├── test-text_stats.R
│       ├── test-text_sub.R
│       ├── test-text_tokens.R
│       ├── test-text_types.R
│       └── test-wordlist.R
└── vignettes
    ├── chinese-wordcloud-1.png
    ├── chinese.Rmd
    ├── chinese.Rmd.in
    ├── corpus-emotion-1.png
    ├── corpus-heapslaw-1.png
    ├── corpus-witch-occurrences-1.png
    ├── corpus.Rmd
    ├── corpus.Rmd.in
    ├── gender-estimates-1.png
    ├── gender-estimates_se-1.png
    ├── gender-signif-1.png
    ├── gender.Rmd
    ├── gender.Rmd.in
    ├── stemmer.Rmd
    ├── stemmer.Rmd.in
    ├── textdata.Rmd
    └── textdata.Rmd.in
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | [.]a$
2 | [.]o$
3 | [.]so$
4 | ^[.]Rprofile$
5 | ^[.]git
6 | ^[.]travis[.]yml$
7 | ^_pkgdown[.]yml$
8 | ^CONDUCT[.]md$
9 | ^Makefile$
10 | ^NEWS[.]md$
11 | ^README[.]md$
12 | ^TODO[.]md$
13 | ^appveyor[.]yml$
14 | ^bench$
15 | ^data/abbreviations$
16 | ^data/abbreviations/
17 | ^data/affect_wordnet$
18 | ^data/affect_wordnet/
19 | ^data/federalist$
20 | ^data/federalist/
21 | ^data/sentiment_afinn$
22 | ^data/sentiment_afinn/
23 | ^data/stopwords$
24 | ^data/stopwords/
25 | ^dist$
26 | ^docs$
27 | ^man/figures/logo$
28 | ^man/figures/logo/
29 | ^src/corpus/[.]git$
30 | ^src/corpus/[.]travis[.]yml$
31 | ^src/corpus/Makefile$
32 | ^src/corpus/data$
33 | ^src/corpus/lib/utf8lite/[.]git$
34 | ^src/corpus/lib/utf8lite/[.]travis[.]yml$
35 | ^src/corpus/lib/utf8lite/Makefile$
36 | ^src/corpus/lib/utf8lite/data$
37 | ^vignettes/gender[.]Rmd
38 | ^vignettes/gender-.*png
39 | ^vignettes/.*[.]in$
40 | ^_pkgdown\.yml$
41 |
--------------------------------------------------------------------------------
/.Rprofile:
--------------------------------------------------------------------------------
1 | if (interactive()) {
2 | if (requireNamespace("devtools", quietly = TRUE)) {
3 | devtools::load_all(".")
4 | }
5 | }
6 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.a
2 | *.o
3 | *.so
4 | .Rhistory
5 | /NEWS
6 | /README
7 | /dist/
8 | /docs/articles/*.Rmd.in
9 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "src/corpus"]
2 | path = src/corpus
3 | url = https://github.com/patperry/corpus.git
4 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: R
2 | cache: packages
3 |
4 | # valgrind (disabled; takes too long, and errors don't affect the build)
5 | #
6 | # r_check_args: '--use-valgrind'
7 | #
8 | # addons:
9 | # apt:
10 | # packages:
11 | # - valgrind
12 |
13 | r_github_packages:
14 | - jimhester/covr
15 |
16 | before_install:
17 | - make NEWS
18 | - make README
19 |
20 | # for devel version of 'utf8', put the following in `before_install`:
21 | # - pushd $(mktemp -d)
22 | # - git clone --recursive https://github.com/patperry/r-utf8.git
23 | # - Rscript -e 'devtools::install("r-utf8")'
24 | # - popd
25 |
26 | matrix:
27 | include:
28 | - os: linux
29 | r: oldrel
30 | - os: linux
31 | dist: trusty
32 | r: release
33 | env: R_CODECOV=true
34 | - os: linux
35 | r: devel
36 |
37 | warnings_are_errors: true
38 |
39 | after_success:
40 | - export LC_ALL="C"
41 | - export TEST_WEB_RESOURCES="true"
42 | - if [[ "${R_CODECOV}" ]]; then Rscript -e 'covr::codecov(line_exclusions = c("R/deprecated.R", "R/wordlist.R", "src/wordlist.c", list.files("src/corpus", recursive = TRUE, full.names = TRUE)))'; fi
43 |
--------------------------------------------------------------------------------
/CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age,
8 | body size, disability, ethnicity, gender identity and expression, level of
9 | experience, nationality, personal appearance, race, religion, or sexual
10 | identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an
52 | appointed representative at an online or offline event. Representation of a
53 | project may be further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by opening an issue or contacting one or more of the project
59 | maintainers. All complaints will be reviewed and investigated and will result
60 | in a response that is deemed necessary and appropriate to the circumstances.
61 | The project team is obligated to maintain confidentiality with regard to the
62 | reporter of an incident. Further details of specific enforcement policies may
63 | be posted separately.
64 |
65 | Project maintainers who do not follow or enforce the Code of Conduct in good
66 | faith may face temporary or permanent repercussions as determined by other
67 | members of the project's leadership.
68 |
69 | ## Attribution
70 |
71 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
72 | version 1.4, available at [http://contributor-covenant.org/version/1/4][version]
73 |
74 | [homepage]: http://contributor-covenant.org
75 | [version]: http://contributor-covenant.org/version/1/4/
76 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: corpus
2 | Version: 0.10.0.9000
3 | Title: Text Corpus Analysis
4 | Authors@R: c(
5 | person(c("Patrick", "O."), "Perry",
6 | role = c("aut", "cph", "cre"),
7 | email = "pperry@stern.nyu.edu"),
8 | person(c("Finn", "\u00c5rup"), "Nielsen",
9 | role = c("cph", "dtc"),
10 | comment = "AFINN Sentiment Lexicon"),
11 | person("Martin Porter and Richard Boulton",
12 | role = c("ctb", "cph", "dtc"),
13 | comment = "Snowball Stemmer and Stopword Lists"),
14 | person("The Regents of the University of California",
15 | role = c("ctb", "cph"),
16 | comment = "Strtod Library Procedure"),
17 | person("Carlo Strapparava and Alessandro Valitutti",
18 | role = c("cph", "dtc"),
19 | comment = "WordNet-Affect Lexicon"),
20 | person("Unicode, Inc.",
21 | role = c("cph", "dtc"),
22 | comment = "Unicode Character Database"))
23 | Depends:
24 |     R (>= 3.3)
25 | Imports:
26 | stats,
27 | utf8 (>= 1.1.0)
28 | Suggests:
29 | knitr,
30 | Matrix,
31 | testthat
32 | Enhances:
33 | quanteda,
34 | tm
35 | Description: Text corpus data analysis, with full support for international text (Unicode). Functions for reading data from newline-delimited 'JSON' files, for normalizing and tokenizing text, for searching for term occurrences, and for computing term occurrence frequencies, including n-grams.
36 | License: Apache License (== 2.0) | file LICENSE
37 | URL: http://corpustext.com,
38 | https://github.com/patperry/r-corpus
39 | BugReports: https://github.com/patperry/r-corpus/issues
40 | LazyData: Yes
41 | Encoding: UTF-8
42 | VignetteBuilder: knitr
43 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | RSCRIPT= Rscript --vanilla
2 | CORPUS_LIB= src/corpus.so
3 | BUILT_VIGNETTES= \
4 | vignettes/chinese.Rmd vignettes/corpus.Rmd vignettes/gender.Rmd \
5 | vignettes/stemmer.Rmd vignettes/textdata.Rmd
6 |
7 | all: $(CORPUS_LIB) $(BUILT_VIGNETTES)
8 |
9 | $(CORPUS_LIB):
10 | $(RSCRIPT) -e 'devtools::compile_dll(".")'
11 |
12 | NEWS: NEWS.md
13 | sed -e 's/^### //g; s/`//g' $< > $@
14 |
15 | README: README.md
16 | sed -e '/\*Corpus\*/,$$!d' \
17 | -e 's/…../.../' \
18 | -e 's/..…/.../' \
19 | -e 's/⋮/./' $< > $@
20 |
21 | vignettes/%.Rmd: vignettes/%.Rmd.in
22 | $(RSCRIPT) -e 'devtools::load_all("."); setwd("vignettes"); knitr::knit(basename("$<"), basename("$@"))'
23 |
24 | bench:
25 | $(RSCRIPT) -e 'devtools::load_all("."); source("bench/bench.R")'
26 |
27 | check: $(CORPUS_LIB)
28 | $(RSCRIPT) -e 'devtools::test(".")'
29 |
30 | clean:
31 | $(RSCRIPT) -e 'devtools::clean_dll(".")'
32 |
33 | cov:
34 | $(RSCRIPT) -e 'covr::package_coverage(line_exclusions = c("R/deprecated.R", list.files("src/corpus", recursive = TRUE, full.names = TRUE)))'
35 |
36 | dist: $(BUILT_VIGNETTES) NEWS README
37 | mkdir -p dist && cd dist && R CMD build ..
38 |
39 | distclean: clean
40 | rm -rf $(BUILT_VIGNETTES)
41 |
42 | doc: $(BUILT_VIGNETTES) NEWS README
43 |
44 | install: $(CORPUS_LIB)
45 | $(RSCRIPT) -e 'devtools::install(".")'
46 |
47 | site: $(BUILT_VIGNETTES)
48 | $(RSCRIPT) -e 'pkgdown::build_site(".")'
49 |
50 | .PHONY: all bench check clean cov dist distclean doc install site
51 |
--------------------------------------------------------------------------------
/R/deprecated.R:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Patrick O. Perry.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/R/frame-stats.R:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Patrick O. Perry.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | na.fail.corpus_frame <- function(object, ...)
16 | {
17 | if (!any(vapply(object, anyNA, FALSE)))
18 | object
19 | else stop("missing values in object")
20 | }
21 |
22 |
23 | na.omit.corpus_frame <- function(object, ...)
24 | {
25 | if (!any(vapply(object, anyNA, FALSE))) {
26 | return(object)
27 | }
28 |
29 | # find the missing entries; cast to a matrix
30 | na <- matrix(c(lapply(object, is.na), recursive = TRUE),
31 | ncol = length(object))
32 |
33 | # find rows containing missing entries
34 | omit <- which(apply(na, 1, any))
35 | names(omit) <- rownames(object)[omit]
36 |
37 |     # drop the rows with missing observations
38 | object <- object[-omit,,drop = FALSE]
39 | attr(omit, "class") <- "omit"
40 | attr(object, "na.action") <- omit
41 | object
42 | }
43 |
44 |
45 | na.exclude.corpus_frame <- function(object, ...)
46 | {
47 | object <- na.omit.corpus_frame(object, ...)
48 | exclude <- attr(object, "na.action")
49 | if (is.null(exclude)) {
50 | return(object)
51 | }
52 |
53 | attr(exclude, "class") <- "exclude"
54 | attr(object, "na.action") <- exclude
55 | object
56 | }
57 |
--------------------------------------------------------------------------------
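
A minimal usage sketch for the `na.*` methods in R/frame-stats.R above,
assuming the package is loaded; `corpus_frame()` is the package's exported
constructor:

    library(corpus)
    x <- corpus_frame(text = c("A rose.", NA, "A daisy."),
                      score = c(1, 2, NA))
    y <- na.omit(x)       # drops rows 2 and 3, which contain NA entries
    attr(y, "na.action")  # named indices of the dropped rows, class "omit"
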
/R/logging.R:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Patrick O. Perry.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | logging_off <- function()
17 | {
18 | .Call(C_logging_off)
19 | }
20 |
21 |
22 | logging_on <- function()
23 | {
24 | .Call(C_logging_on)
25 | }
26 |
--------------------------------------------------------------------------------
/R/sentence.R:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Patrick O. Perry.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | text_nsentence <- function(x, filter = NULL, ...)
17 | {
18 | with_rethrow({
19 | x <- as_corpus_text(x, filter, ...)
20 | })
21 | .Call(C_text_nsentence, x)
22 | }
23 |
--------------------------------------------------------------------------------
/R/stem.R:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Patrick O. Perry.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | stem_snowball <- function(x, algorithm = "en")
17 | {
18 | with_rethrow({
19 | x <- as_character_vector("x", x)
20 | algorithm <- as_snowball_algorithm("algorithm", algorithm)
21 | })
22 |
23 | .Call(C_stem_snowball, x, algorithm)
24 | }
25 |
26 |
27 | new_stemmer <- function(term, stem, default = NULL, duplicates = "first",
28 | vectorize = TRUE)
29 | {
30 | call <- sys.call()
31 | with_rethrow({
32 | term <- as_character_vector("term", term)
33 | stem <- as_character_vector("stem", stem)
34 | default <- as_character_scalar("default", default)
35 | duplicates <- as_enum("duplicates", duplicates,
36 | c("first", "last", "omit", "fail"))
37 | })
38 |
39 | if (is.null(term)) {
40 | term <- character()
41 | }
42 |
43 | if (is.null(stem)) {
44 | stem <- character()
45 | }
46 |
47 | if (length(term) != length(stem)) {
48 | stop("'term' argument length must equal 'stem' argument length")
49 | }
50 |
51 | if (duplicates == "last") {
52 | term <- rev(term)
53 | stem <- rev(stem)
54 | duplicates <- "first"
55 | }
56 |
57 | if (duplicates != "first") {
58 | dup <- duplicated(term)
59 | if (duplicates == "omit") {
60 | dups <- term[dup]
61 | rm <- term %in% dups
62 | term <- term[!rm]
63 | stem <- stem[!rm]
64 | } else if (any(dup)) { # duplicates == "fail"
65 | stop("'term' argument entries must be unique")
66 | }
67 | }
68 |
69 | # parse dynamically so that we can add a comment with the function call
70 | comment <- paste(" #", deparse(call), collapse = "\n")
71 | if (is.null(default)) {
72 | src <- paste('function(x) {',
73 | comment,
74 | ' i <- match(x, term, 0L)',
75 | ' if (i > 0L)',
76 | ' stem[[i]]',
77 | ' else x',
78 | '}',
79 | sep = '\n')
80 | } else {
81 | src <- paste('function(x) {',
82 | comment,
83 | ' i <- match(x, term, 0L)',
84 | ' if (i > 0L)',
85 | ' stem[[i]]',
86 | ' else default',
87 | '}',
88 | sep = '\n')
89 | }
90 |
91 | env <- new.env()
92 | assign("term", term, env)
93 | assign("stem", stem, env)
94 | assign("default", default, env)
95 | stem_term <- eval(parse(text = src), env)
96 |
97 | if (vectorize) {
98 | vsrc <- paste('function(x) {',
99 | comment,
100 | ' vapply(x, stem_term, "", USE.NAMES = !is.null(names(x)))',
101 | '}',
102 | sep = '\n')
103 | assign("stem_term", stem_term, env)
104 | stem_term <- eval(parse(text = vsrc, keep.source = TRUE), env)
105 | }
106 |
107 | stem_term
108 | }
109 |
--------------------------------------------------------------------------------
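
A minimal usage sketch for `new_stemmer()` as defined above (the term/stem
pairs here are made up for illustration):

    library(corpus)
    stemmer <- new_stemmer(term = c("running", "ran"),
                           stem = c("run", "run"))
    stemmer(c("running", "ran", "walked"))
    # with default = NULL, unmatched tokens such as "walked" pass through
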
/R/style.R:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Patrick O. Perry.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | # RStudio doesn't support ANSI faint, use gray instead
17 | style_faint <- "38;5;246" #666666
18 |
19 | # RStudio ANSI bold is broken, use color instead
20 | # https://github.com/rstudio/rstudio/issues/1721
21 | style_bold <- "38;5;203" #FF3333
22 | #style_bold <- "36" # cyan
23 | #style_bold <- "38;5;63" #3333FF
24 |
--------------------------------------------------------------------------------
/R/text-stats.R:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Patrick O. Perry.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | na.fail.corpus_text <- function(object, ...)
16 | {
17 | if (!anyNA(object))
18 | object
19 | else stop("missing values in object")
20 | }
21 |
22 |
23 | na.omit.corpus_text <- function(object, ...)
24 | {
25 | if (!anyNA(object)) {
26 | return(object)
27 | }
28 |
29 | omit <- which(is.na(object))
30 | names(omit) <- names(object)[omit]
31 | object <- object[-omit]
32 | attr(omit, "class") <- "omit"
33 | attr(object, "na.action") <- omit
34 | object
35 | }
36 |
37 |
38 | na.exclude.corpus_text <- function(object, ...)
39 | {
40 | object <- na.omit.corpus_text(object, ...)
41 | exclude <- attr(object, "na.action")
42 | if (is.null(exclude)) {
43 | return(object)
44 | }
45 |
46 | attr(exclude, "class") <- "exclude"
47 | attr(object, "na.action") <- exclude
48 | object
49 | }
50 |
--------------------------------------------------------------------------------
/R/text-utils.R:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Patrick O. Perry.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | str.corpus_text <- function(object, ...)
16 | {
17 | n <- length(object)
18 | if (n == 0) {
19 | "text(0)"
20 | } else {
21 | paste0("text [1:", n, "]")
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/R/text.R:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Patrick O. Perry.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | as_corpus_text <- function(x, filter = NULL, ..., names = NULL)
16 | {
17 | UseMethod("as_corpus_text")
18 | }
19 |
20 |
21 | as_corpus_text.default <- function(x, filter = NULL, ..., names = NULL)
22 | {
23 | if (length(dim(x)) > 1) {
24 | stop("cannot convert multi-dimensional array to text")
25 | }
26 |
27 | x <- structure(as.character(x), names = names(x))
28 | as_corpus_text(x, filter = filter, ..., names = names)
29 | }
30 |
31 |
32 | as_corpus_text.character <- function(x, filter = NULL, ..., names = NULL)
33 | {
34 | if (length(dim(x)) > 1) {
35 | stop("cannot convert multi-dimensional array to text")
36 | }
37 |
38 | with_rethrow({
39 | x <- as_utf8(x)
40 | })
41 |
42 | if (is.null(names)) {
43 | names <- names(x)
44 | if (anyDuplicated(names)) {
45 | warning("renaming entries with duplicate names")
46 | names <- make.unique(names)
47 | }
48 | }
49 |
50 | x <- .Call(C_as_text_character, x, NULL)
51 | as_corpus_text(x, filter = filter, ..., names = names)
52 | }
53 |
54 |
55 | as_corpus_text.corpus_json <- function(x, filter = NULL, ..., names = NULL)
56 | {
57 | if (length(dim(x)) == 2) {
58 | if (!"text" %in% names(x)) {
59 | stop("no column named \"text\" in JSON object")
60 | }
61 | x <- x[["text"]]
62 | } else {
63 | x <- .Call(C_as_text_json, x, NULL)
64 | }
65 | as_corpus_text(x, filter = filter, ..., names = names)
66 | }
67 |
68 |
69 | as_corpus_text.corpus_text <- function(x, filter = NULL, ..., names = NULL)
70 | {
71 | if (!is_corpus_text(x)) {
72 | stop("argument is not a valid text object")
73 | }
74 |
75 | with_rethrow({
76 | filter <- as_filter("filter", filter)
77 | names <- as_names("names", names, length(x))
78 | })
79 |
80 | attrs <- attributes(x)
81 | for (a in names(attrs)) {
82 | if (!a %in% c("class", "names")) {
83 | attr(x, a) <- NULL
84 | }
85 | }
86 | attr(x, "class") <- "corpus_text"
87 |
88 | if (!is.null(names)) {
89 | names(x) <- names
90 | }
91 | if (!is.null(filter)) {
92 | text_filter(x) <- filter
93 | }
94 |
95 | props <- list(...)
96 | if (length(props) > 0) {
97 | pnames <- names(props)
98 | if (is.null(pnames) || any(pnames == "")) {
99 | stop("unnamed arguments are not allowed")
100 | }
101 | f <- text_filter(x)
102 | for (name in names(props)) {
103 | f[[name]] <- props[[name]]
104 | }
105 | text_filter(x) <- f
106 | }
107 |
108 | x
109 | }
110 |
111 |
112 | as_corpus_text.data.frame <- function(x, filter = NULL, ..., names = NULL)
113 | {
114 | if (!is.data.frame(x)) {
115 | stop("argument is not a valid data frame")
116 | }
117 | if (!"text" %in% names(x)) {
118 | stop("no column named \"text\" in data frame")
119 | }
120 |
121 | text <- x[["text"]]
122 | if (.row_names_info(x) > 0) {
123 | names(text) <- row.names(x)
124 | }
125 |
126 | as_corpus_text(text, filter = filter, ..., names = names)
127 | }
128 |
129 |
130 | # tm::Corpus
131 | as_corpus_text.Corpus <- function(x, filter = NULL, ..., names = NULL)
132 | {
133 | with_package("tm", {
134 | x <- vapply(x, as.character, "")
135 | })
136 | as_corpus_text(x, filter = filter, ..., names = names)
137 | }
138 |
139 | # quanteda::corpus
140 | as_corpus_text.corpus <- function(x, filter = NULL, ..., names = NULL)
141 | {
142 | with_package("quanteda", {
143 | text <- quanteda::texts(x)
144 | })
145 | as_corpus_text(text, filter = filter, ..., names = names)
146 | }
147 |
148 |
149 | is_corpus_text <- function(x)
150 | {
151 | if (!inherits(x, "corpus_text")) {
152 | return(FALSE)
153 | }
154 | .Call(C_text_valid, x)
155 | }
156 |
--------------------------------------------------------------------------------
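
A short sketch of the conversion generic above; note that extra `...`
arguments become text filter properties on the result:

    library(corpus)
    x <- as_corpus_text(c(a = "One sentence.", b = "Two. Sentences."))
    is_corpus_text(x)                        # TRUE
    df <- data.frame(text = c("First doc.", "Second doc."))
    y <- as_corpus_text(df, stemmer = "en")  # sets the filter's stemmer
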
/R/text_split.R:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Patrick O. Perry.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | text_split <- function(x, units = "sentences", size = 1, filter = NULL, ...)
17 | {
18 | with_rethrow({
19 | x <- as_corpus_text(x, filter, ...)
20 | units <- as_enum("units", units, choices = c("sentences", "tokens"))
21 | size <- as_size(size)
22 | })
23 |
24 | if (units == "sentences") {
25 | ans <- .Call(C_text_split_sentences, x, size)
26 | } else {
27 | stopifnot(units == "tokens")
28 | ans <- .Call(C_text_split_tokens, x, size)
29 | }
30 |
31 | ans$parent <- structure(as.integer(ans$parent), class = "factor",
32 | levels = labels(x))
33 | ans
34 | }
35 |
--------------------------------------------------------------------------------
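
A quick sketch of `text_split()`; the result is a data frame with `parent`,
`index`, and `text` columns:

    library(corpus)
    text_split("I came. I saw. I conquered.", units = "sentences", size = 2)
    # two blocks for the one parent text: sentences 1-2, then sentence 3
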
/R/text_stats.R:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Patrick O. Perry.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | text_stats <- function(x, filter = NULL, ...)
17 | {
18 | with_rethrow({
19 | x <- as_corpus_text(x, filter, ...)
20 | })
21 |
22 | ans <- data.frame(tokens = text_ntoken(x),
23 | types = text_ntype(x),
24 | sentences = text_nsentence(x),
25 | row.names = names(x))
26 | class(ans) <- c("corpus_frame", "data.frame")
27 | ans
28 | }
29 |
--------------------------------------------------------------------------------
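
For reference, a small example of the summary above (exact counts depend on
the active text filter):

    library(corpus)
    stats <- text_stats(c("One two three.", "One two. Three!"))
    names(stats)   # "tokens" "types" "sentences"
    class(stats)   # "corpus_frame" "data.frame"
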
/R/text_types.R:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Patrick O. Perry.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | text_ntype <- function(x, filter = NULL, collapse = FALSE, ...)
17 | {
18 | with_rethrow({
19 | x <- as_corpus_text(x, filter, ...)
20 | collapse <- as_option("collapse", collapse)
21 | })
22 | .Call(C_text_ntype, x, collapse)
23 | }
24 |
25 |
26 | text_types <- function(x, filter = NULL, collapse = FALSE, ...)
27 | {
28 | with_rethrow({
29 | x <- as_corpus_text(x, filter, ...)
30 | collapse <- as_option("collapse", collapse)
31 | })
32 | typs <- .Call(C_text_types, x, collapse)
33 | if (collapse) {
34 | typs <- sort(typs, method = "radix")
35 | } else {
36 | typs <- lapply(typs, sort, method = "radix")
37 | }
38 | typs
39 | }
40 |
--------------------------------------------------------------------------------
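
A small sketch contrasting the per-text and pooled forms above:

    library(corpus)
    x <- c("A rose is a rose.", "A daisy.")
    text_types(x)                   # list of sorted type vectors, one per text
    text_types(x, collapse = TRUE)  # one sorted vector pooled over all texts
    text_ntype(x, collapse = TRUE)  # size of the pooled type set
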
/R/token.R:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Patrick O. Perry.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | text_tokens <- function(x, filter = NULL, ...)
17 | {
18 | with_rethrow({
19 | x <- as_corpus_text(x, filter, ...)
20 | })
21 | .Call(C_text_tokens, x)
22 | }
23 |
24 |
25 | text_ntoken <- function(x, filter = NULL, ...)
26 | {
27 | with_rethrow({
28 | x <- as_corpus_text(x, filter, ...)
29 | })
30 | .Call(C_text_ntoken, x)
31 | }
32 |
33 |
34 | text_sub <- function(x, start = 1L, end = -1L, filter = NULL, ...)
35 | {
36 | with_rethrow({
37 | x <- as_corpus_text(x, filter, ...)
38 | })
39 | n <- length(x)
40 |
41 | if (!(is.numeric(start)
42 | && (length(dim(start)) <= 1
43 | || is.matrix(start) && ncol(start) == 2))) {
44 | stop("'start' must be an integer vector or two-column matrix")
45 | }
46 |
47 | nstart <- if (is.matrix(start)) nrow(start) else length(start)
48 | if ((nstart == 0 && n > 0) || (nstart > 0 && n %% nstart != 0)) {
49 | stop("'start' length does not evenly divide argument length")
50 | }
51 |
52 | if (is.matrix(start)) {
53 | if (!missing(end)) {
54 | warning("'end' argument is ignored when 'start' is a two-column matrix")
55 | }
56 | end <- as.integer(start[,2])
57 | start <- as.integer(start[,1])
58 | } else {
59 | start <- as.integer(start)
60 |
61 | if (!(is.numeric(end) && length(dim(end)) <= 1)) {
62 | stop("'end' must be an integer vector")
63 | }
64 |
65 | nend <- length(end)
66 | if ((nend == 0 && n > 0) || (nend > 0 && n %% nend != 0)) {
67 | stop("'end' length does not evenly divide argument length")
68 | }
69 | end <- as.integer(end)
70 | }
71 |
72 | .Call(C_text_sub, x, start, end)
73 | }
74 |
--------------------------------------------------------------------------------
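
A sketch of `text_sub()`, which slices by token position; a two-column
`start` matrix supplies per-text (start, end) pairs and is recycled
against `x`:

    library(corpus)
    x <- c("one two three four", "five six seven eight")
    text_sub(x, 2, 3)                     # tokens 2..3 of each text
    text_sub(x, cbind(c(1, 2), c(2, 4)))  # (1,2) for text 1, (2,4) for text 2
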
/R/util.R:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Patrick O. Perry.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | with_rethrow <- function(expr)
17 | {
18 | parentcall <- sys.call(-1)
19 | eval(envir = parent.frame(),
20 | withCallingHandlers(expr,
21 | error = function(e, call = parentcall) {
22 | e$call <- call
23 | stop(e)
24 | },
25 | warning = function(w, call = parentcall) {
26 | w$call <- call
27 | warning(w)
28 | invokeRestart("muffleWarning")
29 | },
30 |             message = function(m, call = parentcall) {
31 |                 m$call <- call
32 |                 message(m)
33 |                 invokeRestart("muffleMessage")
34 |             }
35 |         )
36 |     )
37 | }
38 |
39 |
40 | with_package <- function(package, expr)
41 | {
42 |     if (!isNamespaceLoaded(package)) {
43 |         if (!requireNamespace(package, quietly = TRUE)) {
44 |             stop(sprintf("Failed attaching name space for package '%s'",
45 |                          package))
46 |         }
47 |     }
48 |
49 |     force(expr)
50 |     expr
51 | }
52 |
--------------------------------------------------------------------------------
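
A sketch of how `with_rethrow()` is used; these helpers are internal, so
assume they are in scope (e.g. after `devtools::load_all(".")`):

    my_fn <- function(x) {
        with_rethrow({
            stopifnot(is.character(x))   # failure is re-reported as my_fn(x)
        })
        x
    }
    try(my_fn(1))  # the condition's call is rewritten to the user-facing call
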
/R/wordlist.R:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Patrick O. Perry.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | abbreviations <- function(kind = "english")
17 | {
18 | .Deprecated("abbreviations_en")
19 | with_rethrow({
20 | wordlist(kind, function(k) .Call(C_abbreviations, k))
21 | })
22 | }
23 |
24 |
25 | stopwords <- function(kind = "english")
26 | {
27 | .Deprecated("stopwords_en")
28 | with_rethrow({
29 | wordlist(kind, function(k) .Call(C_stopwords, k))
30 | })
31 | }
32 |
33 |
34 | wordlist <- function(kind, call)
35 | {
36 | kind <- as_kind(kind)
37 |
38 | words <- character()
39 | for (k in kind) {
40 | wk <- call(k)
41 | words <- c(words, wk)
42 | }
43 |
44 | if (length(words) == 0) {
45 | return(NULL)
46 | }
47 |
48 | words <- unique(sort(words, method = "radix"))
49 | words
50 | }
51 |
--------------------------------------------------------------------------------
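
Both exported wrappers above are deprecated; a sketch of the combined-kind
behavior (warnings suppressed, as in data/abbreviations/01_make_abbreviations.R):

    sw <- suppressWarnings(corpus::stopwords(c("english", "french")))
    head(sw)   # sorted, de-duplicated union of both word lists
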
/TODO.md:
--------------------------------------------------------------------------------
1 | To Do
2 | =====
3 |
4 | Bugs
5 | ----
6 |
7 | (no known bugs)
8 |
9 |
10 | Features
11 | --------
12 |
13 | * Add a `text_untoken()` function to turn token sequence into text:
14 | insert word-joiner (U+2060) to keep multi-word phrases together;
15 | put specified space character (ZWSP or SP) between tokens
16 |
17 | * wrap.pad, width arguments to `utf8_print`
18 |
19 | * `token_kind` and `token_map` functions (?)
20 |
21 | * Add demonstration of dictionary scaling with `text_match`:
22 |
23 | m <- text_match(x, dict$term)
24 | score <- tapply(dict$score[m$term], m$text, mean, default = 0)
25 |
--------------------------------------------------------------------------------
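
A runnable version of the dictionary-scaling sketch from the TODO item above
(the `dict` values are made up; `tapply()`'s `default` argument requires
R >= 3.4):

    library(corpus)
    dict <- data.frame(term  = c("good", "bad", "awful"),
                       score = c(1, -1, -2))
    x <- c("a good day", "a bad, awful day")
    m <- text_match(x, dict$term)
    score <- tapply(dict$score[m$term], m$text, mean, default = 0)
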
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | template:
2 | params:
3 | ganalytics: UA-4636081-3
4 |
--------------------------------------------------------------------------------
/appveyor.yml:
--------------------------------------------------------------------------------
1 | # DO NOT CHANGE the "init" and "install" sections below
2 |
3 | # Download script file from GitHub
4 | init:
5 | ps: |
6 | $ErrorActionPreference = "Stop"
7 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1"
8 | Import-Module '..\appveyor-tool.ps1'
9 |
10 | install:
11 | - ps: Bootstrap
12 | - git submodule update --init --recursive
13 |
14 | cache:
15 | - C:\RLibrary
16 |
17 | # Adapt as necessary starting from here
18 |
19 | build_script:
20 | - travis-tool.sh install_deps
21 |
22 | test_script:
23 | - travis-tool.sh run_tests
24 |
25 | on_failure:
26 | - 7z a failure.zip *.Rcheck\*
27 | - appveyor PushArtifact failure.zip
28 |
29 | artifacts:
30 | - path: '*.Rcheck\**\*.log'
31 | name: Logs
32 |
33 | - path: '*.Rcheck\**\*.out'
34 | name: Logs
35 |
36 | - path: '*.Rcheck\**\*.fail'
37 | name: Logs
38 |
39 | - path: '*.Rcheck\**\*.Rout'
40 | name: Logs
41 |
42 | - path: '\*_*.tar.gz'
43 | name: Bits
44 |
45 | - path: '\*_*.zip'
46 | name: Bits
47 |
--------------------------------------------------------------------------------
/bench/bench-term_matrix.R:
--------------------------------------------------------------------------------
1 | library("dplyr", warn.conflicts = FALSE)
2 | library("janeaustenr")
3 | library("magrittr")
4 | library("stringr")
5 |
6 | lines <- (austen_books()
7 | %>% group_by(book)
8 | %>% mutate(
9 | linenumber = row_number(),
10 | chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
11 | ignore_case = TRUE))))
12 | %>% ungroup())
13 |
14 | text <- c(tapply(lines$text, paste(lines$book, lines$chapter),
15 | paste, collapse = "\n"))
16 | if (packageVersion("janeaustenr") < '0.1.5') {
17 | text <- iconv(text, "latin1", "UTF-8")
18 | }
19 |
20 | stop_words <- stopwords("english")
21 |
22 | make_matrix <- function(text, ngrams = 1) {
23 | f <- corpus::token_filter(stemmer = "english", drop_punct = TRUE,
24 | drop_number = TRUE, drop = stop_words)
25 | stats <- corpus::term_counts(text, f, ngrams = ngrams, min = 5)
26 | x <- corpus::term_matrix(text, f, select = stats$term)
27 | x
28 | }
29 |
30 | results <- microbenchmark::microbenchmark(
31 | unigrams = make_matrix(text, 1),
32 | bigrams = make_matrix(text, 1:2),
33 | trigrams = make_matrix(text, 1:3),
34 | "4-grams" = make_matrix(text, 1:4),
35 | "5-grams" = make_matrix(text, 1:5),
36 | times = 5
37 | )
38 |
39 | print(results)
40 |
--------------------------------------------------------------------------------
/bench/bench-term_matrix.Rout:
--------------------------------------------------------------------------------
1 | Unit: milliseconds
2 | expr min lq mean median uq max neval
3 | unigrams 236.2898 236.8613 270.4002 247.5848 257.6114 373.6539 5
4 | bigrams 297.9806 300.2708 309.5385 301.1933 321.1791 327.0687 5
5 | trigrams 302.7908 314.1854 322.2455 315.6736 318.7499 359.8280 5
6 | 4-grams 307.0178 318.1708 318.1261 318.5333 323.0232 323.8856 5
7 | 5-grams 305.0779 314.1598 316.7992 314.9060 321.3384 328.5137 5
8 |
--------------------------------------------------------------------------------
/bench/bench.R:
--------------------------------------------------------------------------------
1 |
2 | Sys.setlocale(locale = "C")
3 | files <- dir("bench", "^bench-.*\\.[rR]$", full.names = TRUE)
4 | for (file in files) {
5 | name <- substr(file, 1, nchar(file) - 2)
6 | message("Running ", name, "...", appendLF = FALSE)
7 | time <- proc.time()
8 | sink(paste0(file, "out"))
9 | set.seed(0)
10 | NS <- new.env()
11 | source(file, local = NS)
12 | sink()
13 | new_time <- proc.time()
14 | diff <- summary(structure(new_time - time, class = "proc_time"))
15 | elapsed <- diff[["user"]] + diff[["system"]]
16 | message("done. (", elapsed, "s)")
17 | }
18 |
--------------------------------------------------------------------------------
/data/abbreviations.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/data/abbreviations.rda
--------------------------------------------------------------------------------
/data/abbreviations/01_make_abbreviations.R:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/Rscript --vanilla
2 |
3 | kinds <- c(de = "german",
4 | en = "english",
5 | es = "spanish",
6 | fr = "french",
7 | it = "italian",
8 | pt = "portuguese",
9 | ru = "russian")
10 |
11 | for (lang in names(kinds)) {
12 | words <- suppressWarnings(corpus:::abbreviations(kinds[[lang]]))
13 | words <- stringr::str_sort(words, locale = lang)
14 | assign(paste0("abbreviations_", lang), words)
15 | }
16 |
17 | filename <- file.path("..", "abbreviations.rda")
18 | save(list = paste0("abbreviations_", names(kinds)), file = filename)
19 | tools::resaveRdaFiles(filename)
20 |
21 |
--------------------------------------------------------------------------------
/data/affect_wordnet.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/data/affect_wordnet.rda
--------------------------------------------------------------------------------
/data/affect_wordnet/02_make_rda.R:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/Rscript --vanilla
2 |
3 | raw <- read.table("wnaffect.tsv", header = TRUE, stringsAsFactors = FALSE)
4 | raw$pos <- factor(raw$pos, levels = c("NOUN", "ADJ", "VERB", "ADV"))
5 | raw$category <- factor(raw$category, levels = unique(raw$category))
6 | raw$emotion <- factor(raw$emotion, levels = unique(raw$emotion))
7 |
8 | affect_wordnet <- raw
9 | class(affect_wordnet) <- c("corpus_frame", "data.frame")
10 |
11 | save(affect_wordnet, file = "../affect_wordnet.rda")
12 | tools::resaveRdaFiles("../affect_wordnet.rda")
13 |
--------------------------------------------------------------------------------
/data/affect_wordnet/README:
--------------------------------------------------------------------------------
1 | WordNet 1.6
2 | -----------
3 |
4 | URL: https://wordnet.princeton.edu
5 | License: custom (MIT-like)
6 | Path: wordnet-1.6
7 |
8 | Downloaded from https://wordnet.princeton.edu/wordnet/download/old-versions
9 |
10 |
11 | WordNet-Affect Lexicon
12 | ----------------------
13 |
14 | URL: http://wndomains.fbk.eu/wnaffect.html
15 | License: Creative Commons Attribution 3.0 Unported License
16 | Path: wn-affect-1.1
17 |
18 | Downloaded from https://github.com/larsmans/wordnet-domains-sentiwords
19 |
20 |
21 | Notes
22 | -----
23 |
24 | + wn-affect-1.1/a-hierarchy.xml organizes emotion aspects into a hierarchy
25 |
26 | + wn-affect-1.1/a-synsets.xml associates named aspects with wordnet-1.6 synsets
27 |
28 | + some named aspects do not have associated synsets, e.g. "merriment"
29 |
30 | + three categories in `a-synsets` do not appear in `a-hierarchy`. We remap
31 | them as follows:
32 |
33 | joy-pride -> self-pride
34 | levity-gaiety -> playfulness
35 | general-gaiety -> merriment
36 |
37 | + we merge neutral and ambiguous emotion categories into "neutral"
38 |
--------------------------------------------------------------------------------
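
A minimal sketch of the category remapping described in the notes above
(assuming `categ` is the vector of raw `a-synsets` category labels):

    remap <- c("joy-pride"      = "self-pride",
               "levity-gaiety"  = "playfulness",
               "general-gaiety" = "merriment")
    hit <- categ %in% names(remap)
    categ[hit] <- remap[categ[hit]]
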
/data/affect_wordnet/wn-affect-1.1/readme-wn-affect-1.1.txt:
--------------------------------------------------------------------------------
1 | Characteristics of WordNet-Affect 1.1
2 |
3 | This version includes a smaller number of synsets, but the semantic
4 | organization is better structured.
5 |
6 | a) Affective Hierarchy
7 |
8 | The affective label "emotion" is expanded in order to include a subset of
9 | new labels, identifying emotional states. These labels, named "affective
10 | categories", are hierarchically organized.
11 |
12 |
13 | b) Valence
14 |
15 | The hierarchy was initially obtained from the hyponym subtree of the
16 | synset "n#feeling#1", but some modifications were performed in order to
17 | classify affective synsets according to emotional valence. In particular,
18 | affective categories are partitioned in 4 classes: "positive" (e.g. joy),
19 | "negative" (e.g. sadness), "ambiguous" (e.g. surprise), and "neutral"
20 | (e.g. apathy).
21 |
22 | c) Causative/Stative Attribute
23 |
24 | Synsets of part of speech (pos) "adjective", "verb", and "adverb"
25 | present an additional label representing their "causative" or "stative"
26 | semantic function. For example, an emotional adjective is "causative"
27 | if it refers to some emotion that is caused by the entity represented by
28 | the modified noun (e.g. "amusing movie"). On the other hand, an emotional
29 | adjective is "stative" if it refers to the emotion owned or felt by the
30 | subject denoted by the modified noun (e.g. "cheerful/happy boy").
31 |
32 | ---------------------------------------------------------------------
33 | Differences with respect to WordNet-Affect 1.0
34 |
35 | - Source files are formatted in XML standard.
36 |
37 | - Previous affective labels were renamed and expressed without
38 | abbreviations. The mapping between previous and present labels is the
39 | following:
40 |
41 | phy -> physical-state
42 | beh -> behaviour
43 | sit -> (emotion eliciting) situation
44 | tra -> trait
45 | sen -> sensation
46 | cog -> cognitive-state
47 | moo -> mood
48 | emo -> emotion
49 | eds -> edonic-signal
50 |
51 | - We removed the label "core" (referring to manually annotated synsets)
52 | and other labels automatically added to synsets (by application of WordNet
53 | relations, such as "similar-to"). In fact, all synsets in WordNet-Affect
54 | 1.1 were manually reviewed and it is no longer useful to trace how they
55 | were collected.
56 |
57 | - Synsets that are not tagged with the label "emo(tion)" in the previous
58 | version are not present in the current release. In order to retrieve these
59 | synsets, you have to refer to the source files of WordNet-Affect 1.0.
60 |
61 | ---------------------------------------------------------------------
62 | File description
63 |
64 | a-hierarchy.xml:
65 |
66 | Includes the affective hierarchy. Each item has 2 attributes:
67 |
68 | name = affective category label
69 | isa = category parent in the hierarchy
70 |
71 |
72 | a-synsets.xml:
73 |
74 | Includes synsets associated with the affective hierarchy. Synsets are
75 | classified according to their pos.
76 |
77 | Synsets of pos "noun" have the following attributes:
78 |
79 | id = label identifying current synset
80 | categ = affective category label
81 |
82 | Synsets of other pos ("adjective", "verb", and "adverb") have the
83 | following attributes:
84 |
85 | id = label identifying current synset
86 | noun-id = id of the noun synset from which the current one was derived
87 | causat-stat = causative/stative label
88 |
89 | Non-noun synsets are connected to the affective categories via noun
90 | synsets because this relation allows us to study to what extent the
91 | causative/stative character of adjectives, verbs and adverbs depends
92 | on the morphological variation of nouns. In the next release of
93 | WordNet-Affect, we plan to characterize this semantic function
94 | of morphology.
95 |
96 | ---------------------------------------------------------------------
97 | Plans for the future
98 |
99 | The next version of WordNet-Affect will include all synsets that
100 | in WordNet-Affect 1.0 are annotated with labels different from "emo"
101 | (emotion) and that are not included in the current release. In particular,
102 | we want to distinguish labels representing mental states (e.g. cognitive
103 | states, attitudes), attributes of mental states (e.g. valence, intensity
104 | or level of arousal), and other semantic characteristics (e.g. behaviours,
105 | emotion-eliciting situations, emotional responses). Finally, we want to
106 | select only one label for each synset, taking into account its hypernyms.
107 |
--------------------------------------------------------------------------------
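
For illustration, a hedged sketch of reading the noun synsets from
a-synsets.xml with the 'xml2' package; the node names ("noun-syn-list",
"noun-syn") are assumptions inferred from the attribute description above:

    library(xml2)
    doc  <- read_xml("a-synsets.xml")
    noun <- xml_find_all(doc, "//noun-syn-list/noun-syn")  # assumed node names
    data.frame(id    = xml_attr(noun, "id"),
               categ = xml_attr(noun, "categ"))
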
/data/affect_wordnet/wordnet-1.6/LICENSE:
--------------------------------------------------------------------------------
1 | WordNet Release 1.6
2 |
3 | This software and database is being provided to you, the LICENSEE, by
4 | Princeton University under the following license. By obtaining, using
5 | and/or copying this software and database, you agree that you have
6 | read, understood, and will comply with these terms and conditions.:
7 |
8 | Permission to use, copy, modify and distribute this software and
9 | database and its documentation for any purpose and without fee or
10 | royalty is hereby granted, provided that you agree to comply with
11 | the following copyright notice and statements, including the disclaimer,
12 | and that the same appear on ALL copies of the software, database and
13 | documentation, including modifications that you make for internal
14 | use or for distribution.
15 |
16 | WordNet 1.6 Copyright 1997 by Princeton University. All rights reserved.
17 |
18 | THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON
19 | UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
20 | IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON
21 | UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
22 | ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
23 | OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
24 | INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
25 | OTHER RIGHTS.
26 |
27 | The name of Princeton University or Princeton may not be used in
28 | advertising or publicity pertaining to distribution of the software
29 | and/or database. Title to copyright in this software, database and
30 | any associated documentation shall at all times remain with
31 | Princeton University and LICENSEE agrees to preserve same.
32 |
--------------------------------------------------------------------------------
/data/affect_wordnet/wordnet-1.6/dict/Makefile:
--------------------------------------------------------------------------------
1 | SHELL=/bin/sh
2 |
3 | # Makefile for WordNet 1.6 database directory "dict"
4 |
5 | WN_ROOT = /usr/local/wordnet1.6
6 | WN_INSTALLDIR = $(WN_ROOT)/dict
7 |
8 | INSTALLCMD = cp
9 | INSTALLFLAGS = -p
10 |
11 | WN_FILES = data.noun data.verb data.adj data.adv index.noun index.verb index.adj index.adv noun.exc verb.exc adj.exc adv.exc cousin.exc cousin.tops index.sense cntlist lexnames index.gloss sentidx.vrb sents.vrb stoplist.pl
12 |
13 | all: $(WN_FILES)
14 |
15 | install: $(WN_FILES)
16 | @if [ ! -d $(WN_INSTALLDIR) ] ; then \
17 | echo "Making directory $(WN_INSTALLDIR)" ; \
18 | mkdir -p $(WN_INSTALLDIR) ; \
19 | chmod 755 $(WN_INSTALLDIR) ; \
20 | fi ;
21 | @echo "Installing database files in $(WN_INSTALLDIR)"
22 | @for file in $(WN_FILES) ; \
23 | do \
24 | filename=$(WN_INSTALLDIR)/$$file ; \
25 | if [ -f $$filename ] ; then \
26 | echo "Cannot install $$filename: file exists" ; \
27 | else \
28 | echo "Installing $$filename" ; \
29 | $(INSTALLCMD) $(INSTALLFLAGS) $$file $$filename ; \
30 | fi ; \
31 | done ;
32 | @echo "Done installing database files in $(WN_INSTALLDIR)"
33 |
34 | uninstall:
35 | @echo "Cannot uninstall database files automatically" ; \
36 | echo "You must delete them from $(WN_INSTALLDIR) manually" ; \
37 | echo "This is dangerous if you set INSTALLCMD to 'mv'" ; \
38 | echo "Since this is your only copy of WordNet" ;
39 |
40 | reallyuninstall:
41 | @echo "Uninstalling database files from $(WN_INSTALLDIR)"
42 | @for file in $(WN_FILES) ; \
43 | do \
44 | filename=$(WN_INSTALLDIR)/$$file ; \
45 | if [ ! -f $$filename ] ; then \
46 | echo "Cannot uninstall $$filename: not present" ; \
47 | else \
48 | echo "Uninstalling $$filename" ; \
49 | rm -f $$filename ; \
50 | fi ; \
51 | done ;
52 | @echo "Done uninstalling database files from $(WN_INSTALLDIR)"
53 |
54 | clean:
55 | @rm -f *~ "#"*
56 |
57 | cleandbfiles:
58 | @echo "Removing WordNet 1.6 database files from `pwd`"
59 | @for file in $(WN_FILES) ; \
60 | do \
61 | if [ ! -f $$file ] ; then \
62 | echo "Cannot remove $$file" ; \
63 | else \
64 | echo "Removing $$file" ; \
65 | rm -f $$file ; \
66 | fi ; \
67 | done ;
68 | @echo "Done removing WordNet 1.6 database files"
69 |
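70 | # Editor's note: typical use is `make install`, overriding the default
71 | # install root on the command line if needed, e.g.
72 | #
73 | #     make install WN_ROOT=/opt/wordnet1.6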
--------------------------------------------------------------------------------
/data/affect_wordnet/wordnet-1.6/dict/adv.exc:
--------------------------------------------------------------------------------
1 | best well
2 | better well
3 | deeper deeply
4 | farther far
5 | further far
6 | harder hard
7 | hardest hard
8 |
--------------------------------------------------------------------------------
/data/affect_wordnet/wordnet-1.6/dict/lexnames:
--------------------------------------------------------------------------------
1 | 00 adj.all 3
2 | 01 adj.pert 3
3 | 02 adv.all 4
4 | 03 noun.Tops 1
5 | 04 noun.act 1
6 | 05 noun.animal 1
7 | 06 noun.artifact 1
8 | 07 noun.attribute 1
9 | 08 noun.body 1
10 | 09 noun.cognition 1
11 | 10 noun.communication 1
12 | 11 noun.event 1
13 | 12 noun.feeling 1
14 | 13 noun.food 1
15 | 14 noun.group 1
16 | 15 noun.location 1
17 | 16 noun.motive 1
18 | 17 noun.object 1
19 | 18 noun.person 1
20 | 19 noun.phenomenon 1
21 | 20 noun.plant 1
22 | 21 noun.possession 1
23 | 22 noun.process 1
24 | 23 noun.quantity 1
25 | 24 noun.relation 1
26 | 25 noun.shape 1
27 | 26 noun.state 1
28 | 27 noun.substance 1
29 | 28 noun.time 1
30 | 29 verb.body 2
31 | 30 verb.change 2
32 | 31 verb.cognition 2
33 | 32 verb.communication 2
34 | 33 verb.competition 2
35 | 34 verb.consumption 2
36 | 35 verb.contact 2
37 | 36 verb.creation 2
38 | 37 verb.emotion 2
39 | 38 verb.motion 2
40 | 39 verb.perception 2
41 | 40 verb.possession 2
42 | 41 verb.social 2
43 | 42 verb.stative 2
44 | 43 verb.weather 2
45 | 44 adj.ppl 3
46 |
--------------------------------------------------------------------------------
/data/affect_wordnet/wordnet-1.6/man/html/binsrch.htm:
--------------------------------------------------------------------------------
1 | BINSRCH(3WN) manual page
2 |
3 | NAME
4 |
5 | bin_search, copyfile, replace_line, insert_line
6 |
7 | SYNOPSIS
8 |
9 | char *bin_search(char *key, FILE *fp);
10 | void copyfile(FILE *fromfp, FILE *tofp);
11 | char *replace_line(char *new_line, char *key, FILE *fp);
12 | char *insert_line(char *new_line, char *key, FILE *fp);
13 |
14 | DESCRIPTION
15 |
16 | The WordNet library contains several general purpose functions for
17 | performing a binary search and modifying sorted files.
18 |
19 | bin_search() is the primary binary search algorithm to search for key
20 | as the first item on a line in the file pointed to by fp. The delimiter
21 | between the key and the rest of the fields on the line, if any, must be
22 | a space. A pointer to a static variable containing the entire line is
23 | returned. NULL is returned if a match is not found.
24 |
25 | The remaining functions are not used by WordNet, and are only briefly
26 | described.
27 |
28 | copyfile() copies the contents of one file to another.
29 |
30 | replace_line() replaces a line in a file having searchkey key with the
31 | contents of new_line. It returns the original line or NULL in case of
32 | error.
33 |
34 | insert_line() finds the proper place to insert the contents of
35 | new_line, having searchkey key, in the sorted file pointed to by fp.
36 | It returns NULL if a line with this searchkey is already in the file.
37 |
38 | NOTES
39 |
40 | The maximum length of key is 1024. The maximum line length in a file
41 | is 25K for Unix platforms, and 8K for the PC and Macintosh platforms.
42 |
43 | If there are no additional fields after the search key, the key must be
44 | followed by at least one space before the newline character.
45 |
46 | SEE ALSO
47 |
48 | morph(3WN), wnsearch(3WN), wnutil(3WN), wnintro(5WN).
49 |
50 | WARNINGS
51 |
52 | bin_search() returns a pointer to a static character buffer. The
53 | returned string should be copied by the caller if the results need to
54 | be saved, as a subsequent call will replace the contents of the static
55 | buffer.
56 |
--------------------------------------------------------------------------------
/data/affect_wordnet/wordnet-1.6/man/html/morph.htm:
--------------------------------------------------------------------------------
1 | MORPH(3WN) manual page
2 |
3 | NAME
4 |
5 | morphinit, re_morphinit, morphstr, morphword
6 |
7 | SYNOPSIS
8 |
9 | #include "wn.h"
10 |
11 | int morphinit(void);
12 | int re_morphinit(void);
13 | char *morphstr(char *origstr, int pos);
14 | char *morphword(char *word, int pos);
15 |
16 | DESCRIPTION
17 |
18 | The WordNet morphological processor, Morphy, is accessed through these
19 | functions:
20 |
21 | morphinit() is used to open the exception list files. It returns 0 if
22 | successful, -1 otherwise. The exception list files must be opened
23 | before morphstr() or morphword() are called.
24 |
25 | re_morphinit() is used to close the exception list files and reopen
26 | them, and is used exclusively for WordNet development. Return codes are
27 | as described above.
28 |
29 | morphstr() is the basic user interface to Morphy. It tries to find the
30 | base form (lemma) of the word or collocation origstr in the specified
31 | pos. The first call (with origstr specified) returns a pointer to the
32 | first base form found. Subsequent calls requesting base forms of the
33 | same string must be made with the first argument of NULL. When no more
34 | base forms for origstr can be found, NULL is returned.
35 |
36 | morphword() tries to find the base form of word in the specified pos.
37 | This function is called by morphstr() for each individual word in a
38 | collocation.
39 |
40 | NOTES
41 |
42 | morphinit() is called by wninit() and is not intended to be called
43 | directly by an application. Applications wishing to use WordNet and/or
44 | the morphological functions must call wninit() at the start of the
45 | program. See wnutil(3WN) for more information.
46 |
47 | origstr may be either a word or a collocation formed by joining
48 | individual words with underscore characters (_).
49 |
50 | Usually only morphstr() is called from applications, as it works on
51 | both words and collocations.
52 |
53 | pos must be one of the following:
54 |
55 | 1    NOUN
56 | 2    VERB
57 | 3    ADJECTIVE
58 | 4    ADVERB
59 | 5    ADJECTIVE SATELLITE
60 |
61 | If ADJECTIVE SATELLITE is passed, it is treated by morphstr() as
62 | ADJECTIVE.
63 |
64 | SEE ALSO
65 |
66 | wnsearch(3WN), wndb(5WN), morphy(7WN).
67 |
68 | WARNINGS
69 |
70 | Passing an invalid part of speech will result in a core dump.
71 |
72 | The WordNet database files must be open to use morphstr() or
73 | morphword().
74 |
--------------------------------------------------------------------------------
/data/affect_wordnet/wordnet-1.6/man/html/uniqbeg.htm:
--------------------------------------------------------------------------------
1 | UNIQBEG(7WN) manual page
2 |
3 | NAME
4 |
5 | uniqbeg - unique beginners for noun hierarchies
6 |
7 | DESCRIPTION
8 |
9 | All of the WordNet noun synsets are organized into hierarchies. Each
10 | synset is part of at least one hierarchy, headed by a synset called a
11 | unique beginner. All of these synsets originate in the lexicographer
12 | file noun.Tops. From any noun synset, except a unique beginner, the
13 | hypernym pointers can be traced up to one of the following synsets:
14 |
15 | { entity, something, (anything having existence (living or nonliving)) }
16 |
17 | { psychological_feature, (a feature of the mental life of a living
18 | organism) }
19 |
20 | { abstraction, (a general concept formed by extracting common features
21 | from specific examples) }
22 |
23 | { state, (the way something is with respect to its main attributes;
24 | "the current state of knowledge"; "his state of health"; "in a weak
25 | financial state") }
26 |
27 | { event, (something that happens at a given place and time) }
28 |
29 | { act, human_action, human_activity, (something that people do or cause
30 | to happen) }
31 |
32 | { group, grouping, (any number of entities (members) considered as a
33 | unit) }
34 |
35 | { possession, (anything owned or possessed) }
36 |
37 | { phenomenon, (any state or process known through the senses rather
38 | than by intuition or reasoning) }
39 |
40 | NOTES
41 |
42 | The lexicographer files are not included in the WordNet package.
43 |
44 | FILES
45 |
46 | noun.Tops    unique beginners for nouns
47 |
48 | SEE ALSO
49 |
50 | wndb(5WN), wninput(5WN), wngloss(7WN).
51 |
--------------------------------------------------------------------------------
/data/affect_wordnet/wordnet-1.6/man/html/wnintro1.htm:
--------------------------------------------------------------------------------
1 | WNINTRO(1WN) manual page
2 |
3 | NAME
4 |
5 | wnintro - WordNet user commands
6 |
7 | SYNOPSIS
8 |
9 | escort - search semantic concordances for sentences containing
10 | semantic tags
11 | wn - command line interface to WordNet database
12 | wnb - window based WordNet browser
13 |
14 | DESCRIPTION
15 |
16 | This section of the WordNet Reference Manual contains manual pages that
17 | describe commands available with the various WordNet system packages.
18 |
19 | The WordNet interfaces wn(1WN) and wnb(1WN) allow the user to search
20 | the WordNet database and display the information textually. escort(1WN)
21 | is a window based browser for searching the semantic concordances.
22 |
23 | ENVIRONMENT VARIABLES
24 |
25 | WNHOME
26 |   Base directory for WordNet. Unix default is /usr/local/wordnet1.6,
27 |   PC default is C:\wn16, Macintosh default is : .
28 |
29 | WNSEARCHDIR
30 |   Directory in which the WordNet database has been installed. Unix
31 |   default is WNHOME/dict, PC default is WNHOME\dict, Macintosh default
32 |   is :Database .
33 |
34 | WNDBVERSION
35 |   Indicates which format the WordNet database files in WNSEARCHDIR
36 |   are in. The default is 1.6. Setting WNDBVERSION to 1.5 allows the
37 |   1.6 commands to work with the 1.5 database files.
38 |
39 | SEE ALSO
40 |
41 | wnintro(3WN), wnintro(5WN), wnintro(7WN).
42 |
43 | Miller, G. A. (1990), ed. "WordNet: An On-Line Lexical Database".
44 | International Journal of Lexicography, Volume 3, Number 4.
45 |
46 | Miller, G. A., et al. (1990, 1993). "Five Papers on WordNet". Cognitive
47 | Science Laboratory Report 43. (Available from
48 | ftp://ftp.cogsci.princeton.edu/wordnet/ .)
49 |
50 | Fellbaum, C. (1998), ed. "WordNet: An Electronic Lexical Database".
51 | MIT Press, Cambridge, MA.
52 |
53 | AVAILABILITY
54 |
55 | WordNet has a World Wide Web site at
56 | http://www.cogsci.princeton.edu/~wn/ . From this web site users can
57 | learn about the WordNet project, run several different interfaces to
58 | the WordNet database, and download various WordNet system packages and
59 | "Five Papers on WordNet".
60 |
--------------------------------------------------------------------------------
/data/affect_wordnet/wordnet-1.6/man/html/wnintro5.htm:
--------------------------------------------------------------------------------
1 | WNINTRO(5WN) manual page
2 |
3 | NAME
4 |
5 | wnintro - introduction to descriptions of WordNet file formats
6 |
7 | SYNOPSIS
8 |
9 | cntlist - format of cntlist file
10 | cxtfile - format of semantically tagged file
11 | glossidx - index of words found in synset glosses
12 | lexnames - list of lexicographer file names and numbers
13 | prologdb - description of Prolog database files
14 | senseidx - format of sense index file
15 | sensemap - mapping from senses in WordNet 1.5 to corresponding 1.6
16 | senses
17 | taglist - format of taglist file
18 | wndb - format of WordNet database files
19 | wninput - format of WordNet lexicographer files
20 |
21 | DESCRIPTION
22 |
23 | This section of the WordNet Reference Manual contains manual pages that
24 | describe the formats of the various files included in different WordNet
25 | 1.6 packages.
26 |
27 | NOMENCLATURE
28 |
29 | All files are in ASCII. Fields are generally separated by one space,
30 | unless otherwise noted, and each line is terminated with a newline
31 | character. In the file format descriptions, terms in italics refer to
32 | field names. Characters or strings in boldface represent an actual
33 | character or string as it appears in the file. Items enclosed in
34 | italicized square brackets ([ ]) may not be present. Since several
35 | files contain fields that have the identical meaning, field names are
36 | consistently defined. For example, several WordNet files contain one or
37 | more synset_offset fields. In each case, the definition of
38 | synset_offset is identical.
39 |
40 | SEE ALSO
41 |
42 | wnintro(1WN), wnintro(3WN), wnintro(7WN), wngloss(7WN).
43 |
44 | Miller, G. A. (1990), ed. "WordNet: An On-Line Lexical Database".
45 | International Journal of Lexicography, Volume 3, Number 4.
46 |
47 | Miller, G. A., et al. (1990, 1993). "Five Papers on WordNet". Cognitive
48 | Science Laboratory Report 43. (Available from
49 | ftp://ftp.cogsci.princeton.edu/wordnet/ .)
50 |
51 | Fellbaum, C. (1998), ed. "WordNet: An Electronic Lexical Database".
52 | MIT Press, Cambridge, MA.
53 |
--------------------------------------------------------------------------------
/data/affect_wordnet/wordnet-1.6/man/html/wnintro7.htm:
--------------------------------------------------------------------------------
1 | WNINTRO(7WN) manual page
2 |
3 | NAME
4 |
5 | wnintro - introduction to miscellaneous WordNet information
6 |
7 | SYNOPSIS
8 |
9 | morphy - discussion of WordNet's morphological processing
10 | semcor - discussion of semantic concordances
11 | uniqbeg - unique beginners for noun hierarchies
12 | wngloss - glossary of terms used in WordNet
13 | wngroups - discussion of WordNet search code to group similar senses
14 | wnlicens - text of WordNet license agreement
15 | wnpkgs - information about WordNet packages and distribution
16 | wnstats - database statistics
17 |
18 | DESCRIPTION
19 |
20 | This section of the WordNet Reference Manual contains manual pages that
21 | describe various topics related to WordNet and the semantic
22 | concordances, and a glossary of terms.
23 |
24 | SEE ALSO
25 |
26 | wnintro(1WN), wnintro(3WN), wnintro(5WN), wngloss(7WN).
27 |
28 | Miller, G. A. (1990), ed. "WordNet: An On-Line Lexical Database".
29 | International Journal of Lexicography, Volume 3, Number 4.
30 |
31 | Miller, G. A., et al. (1990, 1993). "Five Papers on WordNet". Cognitive
32 | Science Laboratory Report 43. (Available from
33 | ftp://ftp.cogsci.princeton.edu/wordnet/ .)
34 |
35 | Fellbaum, C. (1998), ed. "WordNet: An Electronic Lexical Database".
36 | MIT Press, Cambridge, MA.
37 |
--------------------------------------------------------------------------------
/data/affect_wordnet/wordnet-1.6/man/html/wnlicens.htm:
--------------------------------------------------------------------------------
1 | WNLICENS(7WN) manual page
2 |
3 | NAME
4 |
5 | wnlicens - text of WordNet license
6 |
7 | DESCRIPTION
8 |
9 | WordNet Release 1.6
10 |
11 | This software and database is being provided to you, the LICENSEE, by
12 | Princeton University under the following license. By obtaining, using
13 | and/or copying this software and database, you agree that you have
14 | read, understood, and will comply with these terms and conditions.:
15 |
16 | Permission to use, copy, modify and distribute this software and
17 | database and its documentation for any purpose and without fee or
18 | royalty is hereby granted, provided that you agree to comply with the
19 | following copyright notice and statements, including the disclaimer,
20 | and that the same appear on ALL copies of the software, database and
21 | documentation, including modifications that you make for internal use
22 | or for distribution.
23 |
24 | WordNet 1.6 Copyright 1997 by Princeton University. All rights
25 | reserved.
26 |
27 | THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON UNIVERSITY
28 | MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF
29 | EXAMPLE, BUT NOT LIMITATION, PRINCETON UNIVERSITY MAKES NO
30 | REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
31 | PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE, DATABASE
32 | OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS,
33 | TRADEMARKS OR OTHER RIGHTS.
34 |
35 | The name of Princeton University or Princeton may not be used in
36 | advertising or publicity pertaining to distribution of the software
37 | and/or database. Title to copyright in this software, database and any
38 | associated documentation shall at all times remain with Princeton
39 | University and LICENSEE agrees to preserve same.
40 |
--------------------------------------------------------------------------------
/data/affect_wordnet/wordnet-1.6/man/html/wnstats.htm:
--------------------------------------------------------------------------------
1 | WNSTATS(7WN) manual page
2 |
3 | NAME
4 |
5 | wnstats - WordNet 1.6 database statistics
6 |
7 | DESCRIPTION
8 |
9 | Number of words, synsets, and senses:
10 |
11 | POS         Unique Strings   Synsets   Total Senses
12 | Noun             94474        66025       116317
13 | Verb             10319        12127        22066
14 | Adjective        20170        17915        29881
15 | Adverb            4546         3575         5677
16 | Totals          121962        99642       173941
17 |
18 | Polysemy information:
19 |
20 | POS         Monosemous Words and Senses   Polysemous Words   Polysemous Senses
21 | Noun                   81909                   12564               34408
22 | Verb                    5751                    4567               16315
23 | Adjective              14795                    5374               15086
24 | Adverb                  3795                     750                1882
25 | Totals                106250                   23255               67691
26 |
27 | POS         Average Polysemy             Average Polysemy
28 |             Including Monosemous Words   Excluding Monosemous Words
29 | Noun             1.23                         2.73
30 | Verb             2.13                         3.57
31 | Adjective        1.48                         2.80
32 | Adverb           1.24                         2.50
33 |
34 | NOTES
35 |
36 | Statistics for all types of adjectives and adjective satellites are
37 | combined.
38 |
39 | The total of all unique noun, verb, adjective, and adverb strings is
40 | greater than 121962. However, many strings are unique within a
41 | syntactic category, but are in more than one syntactic category. The
42 | figure in the table represents the unique strings when all syntactic
43 | categories are combined.
44 |
--------------------------------------------------------------------------------
/data/federalist.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/data/federalist.rda
--------------------------------------------------------------------------------
/data/federalist/.gitignore:
--------------------------------------------------------------------------------
1 | /federalist.json
2 | /pg18.txt
3 |
--------------------------------------------------------------------------------
/data/federalist/01_download_raw.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -e
4 |
5 | FILE="pg18.txt"
6 | FILE_MD5="126eca27b879c078a189ba0783186330"
7 |
8 | if [ ! -f $FILE ];
9 | then
10 | echo "Downloading raw data file '$FILE' from gutenberg.org"
11 | curl 'http://www.gutenberg.org/cache/epub/18/pg18.txt' -o ${FILE}.download
12 | mv ${FILE}.download ${FILE}
13 | fi
14 |
15 | echo "Checking raw data file '${FILE}'"
16 | md5sum -c - <<< "${FILE_MD5} ${FILE}"
17 |
--------------------------------------------------------------------------------
/data/federalist/03_make_rda.R:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/Rscript --vanilla
2 |
3 | library("corpus")
4 |
5 | raw <- read_ndjson("federalist.json", text = NULL, stringsAsFactors = FALSE)
6 |
7 | name <- paste("Federalist No.", raw$paper_id)
8 |
9 | venue <- raw$venue
10 | venue[venue == "For the Independent Fournal"] <- "For the Independent Journal"
11 | venue[grep("^From M[cC]", venue)] <- "From McLean's Edition, New York"
12 |
13 | author <- raw$author
14 | author[author == "HAMILTON"] <- "Hamilton"
15 | author[author == "HAMILTON AND MADISON"] <- NA
16 | author[author == "HAMILTON OR MADISON"] <- NA
17 | author[author == "JAY"] <- "Jay"
18 | author[author == "MADISON"] <- "Madison"
19 | author[raw$paper_id == 58] <- NA # follow Mosteller and Wallace
20 |
21 | date <- raw$date
22 | date <- sub("^(Tuesday|Thursday|Friday),? ", "", date)
23 |
24 | invisible(Sys.setlocale("LC_TIME", "C"))
25 | date <- as.Date(date, "%B %d, %Y")
26 |
27 | federalist <- data.frame(name,
28 | title = raw$title,
29 | venue,
30 | date,
31 | author,
32 | text = raw$text,
33 | stringsAsFactors = FALSE)
34 | class(federalist) <- c("corpus_frame", "data.frame")
35 |
36 | save(federalist, file = "../federalist.rda")
37 | tools::resaveRdaFiles("../federalist.rda")
38 |
--------------------------------------------------------------------------------
/data/sentiment_afinn.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/data/sentiment_afinn.rda
--------------------------------------------------------------------------------
/data/sentiment_afinn/01_download_raw.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -e
4 |
5 | FILE="imm6010.zip"
6 | FILE_MD5="ea6216f43d27188ea2b5bfadf068ff37"
7 |
8 | if [ ! -f $FILE ];
9 | then
10 | echo "Downloading raw data file '$FILE' from www2.imm.dtu.dk"
11 | curl 'http://www2.imm.dtu.dk/pubdb/views/edoc_download.php/6010/zip/imm6010.zip' -o ${FILE}.download
12 | mv ${FILE}.download ${FILE}
13 | fi
14 |
15 | if [ ! -d AFINN ];
16 | then
17 | unzip ${FILE}
18 | fi
19 |
--------------------------------------------------------------------------------
/data/sentiment_afinn/02_make_rda.R:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/Rscript --vanilla
2 |
3 | raw <- read.delim(file.path("AFINN", "AFINN-111.txt"),
4 | encoding = "UTF-8", header = FALSE, stringsAsFactors = FALSE)
5 | names(raw) <- c("term", "score")
6 | raw$term <- trimws(raw$term)
7 |
8 | # exclude multi-type terms
9 | multi <- grepl("[[:space:]]", raw$term)
10 | raw <- raw[!multi,]
11 |
12 | # discard row names
13 | rownames(raw) <- NULL
14 |
15 | sentiment_afinn <- raw
16 | class(sentiment_afinn) <- c("corpus_frame", "data.frame")
17 |
18 | save(sentiment_afinn, file = "../sentiment_afinn.rda")
19 | tools::resaveRdaFiles("../sentiment_afinn.rda")
20 |
--------------------------------------------------------------------------------
/data/sentiment_afinn/AFINN/AFINN-README.txt:
--------------------------------------------------------------------------------
1 | AFINN is a list of English words rated for valence with an integer
2 | between minus five (negative) and plus five (positive). The words have
3 | been manually labeled by Finn Årup Nielsen in 2009-2011. The file
4 | is tab-separated. There are two versions:
5 |
6 | AFINN-111: Newest version with 2477 words and phrases.
7 |
8 | AFINN-96: 1468 unique words and phrases on 1480 lines (some words are
9 | listed twice, hence the extra lines). The word list is not entirely in
10 | alphabetical order.
11 |
12 | An evaluation of the word list is available in:
13 |
14 | Finn Årup Nielsen, "A new ANEW: Evaluation of a word list for
15 | sentiment analysis in microblogs", http://arxiv.org/abs/1103.2903
16 |
17 | The list was used in:
18 |
19 | Lars Kai Hansen, Adam Arvidsson, Finn Årup Nielsen, Elanor Colleoni,
20 | Michael Etter, "Good Friends, Bad News - Affect and Virality in
21 | Twitter", The 2011 International Workshop on Social Computing,
22 | Network, and Services (SocialComNet 2011).
23 |
24 |
25 | This database of words is copyright protected and distributed under
26 | "Open Database License (ODbL) v1.0"
27 | http://www.opendatacommons.org/licenses/odbl/1.0/ or a similar
28 | copyleft license.
29 |
30 | See comments on the word list here:
31 | http://fnielsen.posterous.com/old-anew-a-sentiment-about-sentiment-analysis
32 |
33 |
34 | In Python the file may be read into a dictionary with:
35 |
36 | >>> afinn = dict(map(lambda kv: (kv[0], int(kv[1])),
37 | ...              [line.split('\t') for line in open("AFINN-111.txt")]))
38 | >>> afinn["Good".lower()]
39 | 3
40 | >>> sum(map(lambda word: afinn.get(word, 0), "Rainy day but still in a good mood".lower().split()))
41 | 2
42 |
43 |
44 |
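45 | [Editor's note] In R, the same table can be read the way this package's
46 | own data/sentiment_afinn/02_make_rda.R script does; a minimal sketch:
47 |
48 | > afinn <- read.delim("AFINN-111.txt", encoding = "UTF-8",
49 | +                     header = FALSE, stringsAsFactors = FALSE)
50 | > names(afinn) <- c("term", "score")
51 | > afinn$score[afinn$term == "good"]
52 | [1] 3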
--------------------------------------------------------------------------------
/data/stopwords.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/data/stopwords.rda
--------------------------------------------------------------------------------
/data/stopwords/01_make_stopwords.R:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/Rscript --vanilla
2 |
3 | kinds <- c(da = "danish",
4 | de = "german",
5 | en = "english",
6 | es = "spanish",
7 | fi = "finnish",
8 | fr = "french",
9 | hu = "hungarian",
10 | it = "italian",
11 | nl = "dutch",
12 | no = "norwegian",
13 | pt = "portuguese",
14 | ru = "russian",
15 | sv = "swedish")
16 |
17 | for (lang in names(kinds)) {
18 | words <- suppressWarnings(corpus:::stopwords(kinds[[lang]]))
19 | words <- stringr::str_sort(words, locale = lang)
20 | assign(paste0("stopwords_", lang), words)
21 | }
22 |
23 | filename <- file.path("..", "stopwords.rda")
24 | save(list = paste0("stopwords_", names(kinds)), file = filename)
25 | tools::resaveRdaFiles(filename)
26 |
27 |
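28 | # Editor's sketch, not part of the build: after the package is installed,
29 | # the vectors saved above lazy-load under the same names, e.g.
30 | #
31 | #   library("corpus")
32 | #   head(stopwords_en)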
--------------------------------------------------------------------------------
/docs/CNAME:
--------------------------------------------------------------------------------
1 | corpustext.com
--------------------------------------------------------------------------------
/docs/articles/chinese-wordcloud-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/chinese-wordcloud-1.png
--------------------------------------------------------------------------------
/docs/articles/corpus-emotion-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/corpus-emotion-1.png
--------------------------------------------------------------------------------
/docs/articles/corpus-heapslaw-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/corpus-heapslaw-1.png
--------------------------------------------------------------------------------
/docs/articles/corpus-witch-occurrences-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/corpus-witch-occurrences-1.png
--------------------------------------------------------------------------------
/docs/articles/gender-estimates-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/gender-estimates-1.png
--------------------------------------------------------------------------------
/docs/articles/gender-estimates_se-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/gender-estimates_se-1.png
--------------------------------------------------------------------------------
/docs/articles/gender-signif-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/gender-signif-1.png
--------------------------------------------------------------------------------
/docs/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/favicon.ico
--------------------------------------------------------------------------------
/docs/jquery.sticky-kit.min.js:
--------------------------------------------------------------------------------
1 | /*
2 | Sticky-kit v1.1.2 | WTFPL | Leaf Corcoran 2015 | http://leafo.net
3 | */
4 | (function(){var b,f;b=this.jQuery||window.jQuery;f=b(window);b.fn.stick_in_parent=function(d){var A,w,J,n,B,K,p,q,k,E,t;null==d&&(d={});t=d.sticky_class;B=d.inner_scrolling;E=d.recalc_every;k=d.parent;q=d.offset_top;p=d.spacer;w=d.bottoming;null==q&&(q=0);null==k&&(k=void 0);null==B&&(B=!0);null==t&&(t="is_stuck");A=b(document);null==w&&(w=!0);J=function(a,d,n,C,F,u,r,G){var v,H,m,D,I,c,g,x,y,z,h,l;if(!a.data("sticky_kit")){a.data("sticky_kit",!0);I=A.height();g=a.parent();null!=k&&(g=g.closest(k));
5 | if(!g.length)throw"failed to find stick parent";v=m=!1;(h=null!=p?p&&a.closest(p):b("<div />"))&&h.css("position",a.css("position"));x=function(){var c,f,e;if(!G&&(I=A.height(),c=parseInt(g.css("border-top-width"),10),f=parseInt(g.css("padding-top"),10),d=parseInt(g.css("padding-bottom"),10),n=g.offset().top+c+f,C=g.height(),m&&(v=m=!1,null==p&&(a.insertAfter(h),h.detach()),a.css({position:"",top:"",width:"",bottom:""}).removeClass(t),e=!0),F=a.offset().top-(parseInt(a.css("margin-top"),10)||0)-q,
6 | u=a.outerHeight(!0),r=a.css("float"),h&&h.css({width:a.outerWidth(!0),height:u,display:a.css("display"),"vertical-align":a.css("vertical-align"),"float":r}),e))return l()};x();if(u!==C)return D=void 0,c=q,z=E,l=function(){var b,l,e,k;if(!G&&(e=!1,null!=z&&(--z,0>=z&&(z=E,x(),e=!0)),e||A.height()===I||x(),e=f.scrollTop(),null!=D&&(l=e-D),D=e,m?(w&&(k=e+u+c>C+n,v&&!k&&(v=!1,a.css({position:"fixed",bottom:"",top:c}).trigger("sticky_kit:unbottom"))),eb&&!v&&(c-=l,c=Math.max(b-u,c),c=Math.min(q,c),m&&a.css({top:c+"px"})))):e>F&&(m=!0,b={position:"fixed",top:c},b.width="border-box"===a.css("box-sizing")?a.outerWidth()+"px":a.width()+"px",a.css(b).addClass(t),null==p&&(a.after(h),"left"!==r&&"right"!==r||h.append(a)),a.trigger("sticky_kit:stick")),m&&w&&(null==k&&(k=e+u+c>C+n),!v&&k)))return v=!0,"static"===g.css("position")&&g.css({position:"relative"}),
8 | a.css({position:"absolute",bottom:d,top:"auto"}).trigger("sticky_kit:bottom")},y=function(){x();return l()},H=function(){G=!0;f.off("touchmove",l);f.off("scroll",l);f.off("resize",y);b(document.body).off("sticky_kit:recalc",y);a.off("sticky_kit:detach",H);a.removeData("sticky_kit");a.css({position:"",bottom:"",top:"",width:""});g.position("position","");if(m)return null==p&&("left"!==r&&"right"!==r||a.insertAfter(h),h.remove()),a.removeClass(t)},f.on("touchmove",l),f.on("scroll",l),f.on("resize",
9 | y),b(document.body).on("sticky_kit:recalc",y),a.on("sticky_kit:detach",H),setTimeout(l,0)}};n=0;for(K=this.length;n
--------------------------------------------------------------------------------
/docs/link.svg:
--------------------------------------------------------------------------------
[SVG content not recoverable: the XML markup was stripped during extraction,
leaving only blank numbered lines.]
--------------------------------------------------------------------------------
/docs/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/logo.png
--------------------------------------------------------------------------------
/docs/pkgdown.css:
--------------------------------------------------------------------------------
1 | /* Sticky footer */
2 | body > .container {
3 | display: flex;
4 | padding-top: 60px;
5 | min-height: calc(100vh);
6 | flex-direction: column;
7 | }
8 |
9 | body > .container .row {
10 | flex: 1;
11 | }
12 |
13 | footer {
14 | margin-top: 45px;
15 | padding: 35px 0 36px;
16 | border-top: 1px solid #e5e5e5;
17 | color: #666;
18 | display: flex;
19 | }
20 | footer p {
21 | margin-bottom: 0;
22 | }
23 | footer div {
24 | flex: 1;
25 | }
26 | footer .pkgdown {
27 | text-align: right;
28 | }
32 |
33 | img.icon {
34 | float: right;
35 | }
36 |
37 | img {
38 | max-width: 100%;
39 | }
40 |
41 | /* Section anchors ---------------------------------*/
42 |
43 | a.anchor {
44 | margin-left: -30px;
45 | display:inline-block;
46 | width: 30px;
47 | height: 30px;
48 | visibility: hidden;
49 |
50 | background-image: url(./link.svg);
51 | background-repeat: no-repeat;
52 | background-size: 20px 20px;
53 | background-position: center center;
54 | }
55 |
56 | .hasAnchor:hover a.anchor {
57 | visibility: visible;
58 | }
59 |
60 | @media (max-width: 767px) {
61 | .hasAnchor:hover a.anchor {
62 | visibility: hidden;
63 | }
64 | }
65 |
66 |
67 | /* Fixes for fixed navbar --------------------------*/
68 |
69 | .contents h1, .contents h2, .contents h3, .contents h4 {
70 | padding-top: 60px;
71 | margin-top: -60px;
72 | }
73 |
74 | /* Static header placement on mobile devices */
75 | @media (max-width: 767px) {
76 | .navbar-fixed-top {
77 | position: absolute;
78 | }
79 | .navbar {
80 | padding: 0;
81 | }
82 | }
83 |
84 |
85 | /* Sidebar --------------------------*/
86 |
87 | #sidebar {
88 | margin-top: 30px;
89 | }
90 | #sidebar h2 {
91 | font-size: 1.5em;
92 | margin-top: 1em;
93 | }
94 |
95 | #sidebar h2:first-child {
96 | margin-top: 0;
97 | }
98 |
99 | #sidebar .list-unstyled li {
100 | margin-bottom: 0.5em;
101 | }
102 |
103 | /* Reference index & topics ----------------------------------------------- */
104 |
105 | .ref-index th {font-weight: normal;}
106 | .ref-index h2 {font-size: 20px;}
107 |
108 | .ref-index td {vertical-align: top;}
109 | .ref-index .alias {width: 40%;}
110 | .ref-index .title {width: 60%;}
111 |
115 | .ref-arguments th {text-align: right; padding-right: 10px;}
116 | .ref-arguments th, .ref-arguments td {vertical-align: top;}
117 | .ref-arguments .name {width: 20%;}
118 | .ref-arguments .desc {width: 80%;}
119 |
120 | /* Nice scrolling for wide elements --------------------------------------- */
121 |
122 | table {
123 | display: block;
124 | overflow: auto;
125 | }
126 |
127 | /* Syntax highlighting ---------------------------------------------------- */
128 |
129 | pre {
130 | word-wrap: normal;
131 | word-break: normal;
132 | border: 1px solid #eee;
133 | }
134 |
135 | pre, code {
136 | background-color: #f8f8f8;
137 | color: #333;
138 | }
139 |
140 | pre .img {
141 | margin: 5px 0;
142 | }
143 |
144 | pre .img img {
145 | background-color: #fff;
146 | display: block;
147 | height: auto;
148 | }
149 |
150 | code a, pre a {
151 | color: #375f84;
152 | }
153 |
154 | .fl {color: #1514b5;}
155 | .fu {color: #000000;} /* function */
156 | .ch,.st {color: #036a07;} /* string */
157 | .kw {color: #264D66;} /* keyword */
158 | .co {color: #888888;} /* comment */
159 |
160 | .message { color: black; font-weight: bolder;}
161 | .error { color: orange; font-weight: bolder;}
162 | .warning { color: #6A0366; font-weight: bolder;}
163 |
164 |
--------------------------------------------------------------------------------
/docs/pkgdown.js:
--------------------------------------------------------------------------------
1 | $(function() {
2 | $("#sidebar").stick_in_parent({offset_top: 40});
3 | $('body').scrollspy({
4 | target: '#sidebar',
5 | offset: 60
6 | });
7 |
8 | var cur_path = paths(location.pathname);
9 | $("#navbar ul li a").each(function(index, value) {
10 | if (value.text == "Home")
11 | return;
12 | if (value.getAttribute("href") === "#")
13 | return;
14 |
15 | var path = paths(value.pathname);
16 | if (is_prefix(cur_path, path)) {
17 | // Add class to parent , and enclosing if in dropdown
18 | var menu_anchor = $(value);
19 | menu_anchor.parent().addClass("active");
20 | menu_anchor.closest("li.dropdown").addClass("active");
21 | }
22 | });
23 | });
24 |
25 | function paths(pathname) {
26 | var pieces = pathname.split("/");
27 | pieces.shift(); // always starts with /
28 |
29 | var end = pieces[pieces.length - 1];
30 | if (end === "index.html" || end === "")
31 | pieces.pop();
32 | return(pieces);
33 | }
34 |
35 | function is_prefix(needle, haystack) {
36 | if (needle.length > haystack.length)
37 | return(false);
38 |
39 | for (var i = 0; i < needle.length; i++) {
40 | if (needle[i] != haystack[i])
41 | return(false);
42 | }
43 |
44 | return(true);
45 | }
46 |
--------------------------------------------------------------------------------
/docs/reference/figures/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/reference/figures/banner.png
--------------------------------------------------------------------------------
/docs/reference/figures/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/reference/figures/logo.png
--------------------------------------------------------------------------------
/docs/reference/figures/logo/01_make_logo.R:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/Rscript --vanilla
2 |
3 | library("magick")
4 |
5 | slide <- image_read("logo-slide.tiff")
6 | banner <- image_crop(slide, "370x80+2+2")
7 | image_write(banner, "../banner.png", format = "png")
8 |
9 | ccap <- image_read("c-07.jpg")
10 | logo <- image_scale(ccap, "80x80")
11 | image_write(logo, "../logo.png", format = "png")
12 |
--------------------------------------------------------------------------------
/docs/reference/figures/logo/README:
--------------------------------------------------------------------------------
1 |
2 | The logo was created in the Mac OS Pages application, then exported to
3 | TIFF. The script `01_make_logo.R` crops the TIFF and converts it to PNG.
4 |
5 | --
6 |
7 | C drop capital is public domain, from http://www.reusableart.com/c-07.html
8 |
9 | "This and numerous other stock print foundry images were included in the book
10 | The Proverbs of Scotland from 1868. It was published by Alexander Hislop &
11 | Company."
12 |
13 |
14 | Font is Matthew Butterick's "Equity" (Caps A Regular, 96pt).
15 |
--------------------------------------------------------------------------------
/docs/reference/figures/logo/c-07.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/reference/figures/logo/c-07.jpg
--------------------------------------------------------------------------------
/docs/reference/figures/logo/logo-slide.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/reference/figures/logo/logo-slide.tiff
--------------------------------------------------------------------------------
/docs/reference/figures/logo/logo.key:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/reference/figures/logo/logo.key
--------------------------------------------------------------------------------
/inst/WORDLIST:
--------------------------------------------------------------------------------
1 | Aarup
2 | AFINN
3 | Årup
4 | Baidu
5 | Baum
6 | bigram
7 | bigrams
8 | blog
9 | CEUR
10 | deserialization
11 | deserialize
12 | deserialized
13 | dorothy
14 | doesn
15 | emoji
16 | Emoji
17 | ESWC
18 | Haiyan
19 | ignorables
20 | indices
21 | JSON
22 | kana
23 | knitr
24 | LF
25 | microblogs
26 | Microposts
27 | Mosteller
28 | NDJSON
29 | NFKC
30 | NFKD
31 | ODbL
32 | PDFs
33 | Publius
34 | Quanteda
35 | rmarkdown
36 | RTF
37 | Silge
38 | Spolsky
39 | stopt
40 | stopwords
41 | STSong
42 | tokenization
43 | Tokenization
44 | tokenize
45 | Tokenize
46 | tokenizer
47 | tokenizes
48 | tokenizing
49 | toto
50 | UAX
51 | unigrams
52 | Unported
53 | URLs
54 | Valitutti
55 | VignetteEncoding
56 | VignetteEngine
57 | VignetteIndexEntry
58 | wordcloud
59 | WordNet
60 | xa
61 | xff
62 | xffff
63 | york
64 |
--------------------------------------------------------------------------------
/man/abbreviations.Rd:
--------------------------------------------------------------------------------
1 | \name{abbreviations}
2 | \docType{data}
3 | \alias{abbreviations}
4 | \alias{abbreviations_de}
5 | \alias{abbreviations_en}
6 | \alias{abbreviations_es}
7 | \alias{abbreviations_fr}
8 | \alias{abbreviations_it}
9 | \alias{abbreviations_pt}
10 | \alias{abbreviations_ru}
11 | \title{Abbreviations}
12 | \description{
13 | Lists of common abbreviations.
14 | }
15 | \details{
16 | The \code{abbreviations_} objects are character vectors of abbreviations.
17 | These are words or phrases containing full stops (periods); because the
18 | full stop is an ambiguous sentence terminator, they require special
19 | handling for sentence detection and tokenization.
20 |
21 | The original lists were compiled by the
22 | \href{http://cldr.unicode.org/}{Unicode Common Locale Data Repository}. We
23 | have tailored the English list by adding single-letter abbreviations and
24 | making a few other additions.
25 |
26 | The built-in abbreviation lists are reasonable defaults, but they may require
27 | further tailoring to suit your particular task.
28 | }
29 | \usage{
30 | abbreviations_de
31 | abbreviations_en
32 | abbreviations_es
33 | abbreviations_fr
34 | abbreviations_it
35 | abbreviations_pt
36 | abbreviations_ru
37 | }
38 | \format{A character vector of unique abbreviations.}
39 | \seealso{
40 | \code{\link{text_filter}}.
41 | }
42 | \keyword{datasets}
43 |
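44 | % Editor's sketch: this assumes sent_suppress is the text_filter property
45 | % that takes a character vector of sentence-break suppressions.
46 | \examples{
47 | head(abbreviations_en)
48 |
49 | # suppress sentence breaks after English abbreviations
50 | f <- text_filter(sent_suppress = abbreviations_en)
51 | text_split(as_corpus_text("I saw Dr. Smith. He waved.", f), "sentences")
52 | }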
--------------------------------------------------------------------------------
/man/affect_wordnet.Rd:
--------------------------------------------------------------------------------
1 | \name{affect_wordnet}
2 | \docType{data}
3 | \alias{affect_wordnet}
4 | \title{WordNet-Affect Lexicon}
5 | \description{
6 | The WordNet-Affect Lexicon is a hand-curated collection of
7 | emotion-related words (nouns, verbs, adjectives, and adverbs),
8 | classified as \dQuote{Positive}, \dQuote{Negative},
9 | \dQuote{Neutral}, or \dQuote{Ambiguous} and categorized into
10 | 28 subcategories (\dQuote{Joy}, \dQuote{Love}, \dQuote{Fear},
11 | etc.).
12 |
13 | Terms can and do appear in multiple categories.
14 |
15 | The original lexicon contains multi-word phrases, but they
16 | are excluded here. Also, we removed the term \sQuote{thing}
17 | from the lexicon.
18 |
19 | The original WordNet-Affect lexicon is distributed as part
20 | of the WordNet Domains project, which is licensed under a
21 | \href{https://creativecommons.org/licenses/by/3.0/}{Creative Commons Attribution 3.0 Unported License}.
22 | You are free to share and adapt the lexicon, as long as you
23 | give attribution to the original authors.
24 | }
25 | \usage{affect_wordnet}
26 | \format{A data frame with one row for each term classification.}
27 | \source{\url{http://wndomains.fbk.eu/wnaffect.html}}
28 | \references{
29 | Strapparava, C. and Valitutti, A. (2004).
30 | WordNet-Affect: an affective extension of WordNet.
31 | \emph{Proceedings of the 4th International Conference on Language
32 | Resources and Evaluation}
33 | 1083--1086.
34 | }
35 | \keyword{datasets}
36 |
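37 | % Editor's sketch: inspect the term classifications without assuming
38 | % exact column names.
39 | \examples{
40 | head(affect_wordnet)
41 | summary(affect_wordnet)
42 | }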
--------------------------------------------------------------------------------
/man/corpus-deprecated.Rd:
--------------------------------------------------------------------------------
1 | \name{corpus-deprecated}
2 | \alias{corpus-deprecated}
3 | \title{Deprecated Functions in Package \pkg{corpus}}
4 | \description{
5 | These functions are provided for compatibility with older versions of
6 | \pkg{corpus} only, and may be defunct as soon as the next release.
7 | }
8 | %\usage{
9 | %}
10 | %\arguments{
11 | %}
12 | %\details{
13 | %}
14 | \seealso{
15 | \code{\link{Deprecated}}
16 | }
17 | \keyword{internal}
18 | \keyword{misc}
19 |
--------------------------------------------------------------------------------
/man/corpus-package.Rd:
--------------------------------------------------------------------------------
1 | \name{corpus-package}
2 | \alias{corpus-package}
3 | \alias{corpus}
4 | \docType{package}
5 | \title{
6 | The Corpus Package
7 | }
8 | \description{
9 | Text corpus analysis functions
10 | }
11 | \details{
12 | This package contains functions for text corpus analysis. To create a text
13 | object, use the \code{\link{read_ndjson}} or \code{\link{as_corpus_text}}
14 | function.
15 | To split text into sentences or token blocks, use \code{\link{text_split}}.
16 | To specify preprocessing behavior for transforming a text into a
17 | token sequence, use \code{\link{text_filter}}. To tokenize text
18 | or compute term frequencies, use \code{\link{text_tokens}},
19 | \code{\link{term_stats}} or \code{\link{term_matrix}}.
20 | To search for or count specific terms,
21 | use \code{\link{text_locate}}, \code{\link{text_count}}, or
22 | \code{\link{text_detect}}.
23 |
24 | For a complete list of functions, use \code{library(help = "corpus")}.
25 | }
26 | \author{
27 | Patrick O. Perry
28 | }
29 | \keyword{ package }
30 |
--------------------------------------------------------------------------------
/man/corpus_frame.Rd:
--------------------------------------------------------------------------------
1 | \name{corpus_frame}
2 | \alias{as_corpus_frame}
3 | \alias{as_corpus_frame.character}
4 | \alias{as_corpus_frame.Corpus}
5 | \alias{as_corpus_frame.corpus}
6 | \alias{as_corpus_frame.corpus_json}
7 | \alias{as_corpus_frame.corpus_text}
8 | \alias{as_corpus_frame.data.frame}
9 | \alias{as_corpus_frame.default}
10 | \alias{corpus_frame}
11 | \alias{is_corpus_frame}
12 | \title{Corpus Data Frame}
13 | \description{
14 | Create or test for corpus objects.
15 | }
16 | \usage{
17 | corpus_frame(..., row.names = NULL, filter = NULL)
18 |
19 | as_corpus_frame(x, filter = NULL, ..., row.names = NULL)
20 |
21 | is_corpus_frame(x)
22 | }
23 | \arguments{
24 | \item{\dots}{data frame columns for \code{corpus_frame};
25 | further arguments passed to \code{as_corpus_text} from
26 | \code{as_corpus_frame}.}
27 |
28 | \item{row.names}{character vector of row names for the corpus object.}
29 |
30 | \item{filter}{text filter object for the \code{"text"} column in the
31 | corpus object.}
32 |
33 | \item{x}{object to be coerced or tested.}
34 | }
35 | \details{
36 | These functions create or convert another object to a corpus object.
37 | A corpus object is just a data frame with special functions for
38 | printing, and a column named \code{"text"} of type \code{"corpus_text"}.
39 |
40 | \code{corpus_frame} has similar semantics to the \code{\link{data.frame}}
41 | function, except that string columns do not get converted to factors.
42 |
43 | \code{as_corpus_frame} converts another object to a corpus data frame
44 | object. By default, the method converts \code{x} to a data frame with
45 | a column named \code{"text"} of type \code{"corpus_text"}, and sets the
46 | class attribute of the result to \code{c("corpus_frame", "data.frame")}.
47 |
48 | \code{is_corpus_frame} tests whether \code{x} is a data frame with a column
49 | named \code{"text"} of type \code{"corpus_text"}.
50 |
51 | \code{as_corpus_frame} is generic: you can write methods to
52 | handle specific classes of objects.
53 | }
54 | \value{
55 | \code{corpus_frame} creates a data frame with a column named \code{"text"}
56 | of type \code{"corpus_text"}, and a class attribute set to
57 | \code{c("corpus_frame", "data.frame")}.
58 |
59 | \code{as_corpus_frame} attempts to coerce its argument to a corpus
60 | data frame object, setting the \code{row.names} and calling
61 | \code{\link{as_corpus_text}} on the \code{"text"} column with
62 | the \code{filter} and \code{\dots} arguments.
63 |
64 | \code{is_corpus_frame} returns \code{TRUE} or \code{FALSE} depending on
65 | whether its argument is a valid corpus object or not.
66 | }
67 | \seealso{
68 | \code{\link{corpus-package}}, \code{\link{print.corpus_frame}},
69 | \code{\link{corpus_text}}, \code{\link{read_ndjson}}.
70 | }
71 | \examples{
72 | # convert a data frame:
73 | emoji <- data.frame(text = sapply(0x1f600 + 1:30, intToUtf8),
74 | stringsAsFactors = FALSE)
75 | as_corpus_frame(emoji)
76 |
77 | # construct directly (no need for stringsAsFactors = FALSE):
78 | corpus_frame(text = sapply(0x1f600 + 1:30, intToUtf8))
79 |
80 | # convert a character vector:
81 | as_corpus_frame(c(a = "goodnight", b = "moon")) # keeps names
82 | as_corpus_frame(c(a = "goodnight", b = "moon"), row.names = NULL) # drops names
83 | }
84 | \keyword{classes}
85 |
--------------------------------------------------------------------------------
/man/corpus_text.Rd:
--------------------------------------------------------------------------------
1 | \name{corpus_text}
2 | \alias{as_corpus_text}
3 | \alias{as_corpus_text.character}
4 | \alias{as_corpus_text.Corpus}
5 | \alias{as_corpus_text.corpus}
6 | \alias{as_corpus_text.corpus_json}
7 | \alias{as_corpus_text.corpus_text}
8 | \alias{as_corpus_text.data.frame}
9 | \alias{as_corpus_text.default}
10 | \alias{corpus_text}
11 | \alias{is_corpus_text}
12 | \title{Text Objects}
13 | \description{
14 | Create or test for text objects.
15 | }
16 | \usage{
17 | as_corpus_text(x, filter = NULL, ..., names = NULL)
18 |
19 | is_corpus_text(x)
20 | }
21 | \arguments{
22 | \item{x}{object to be coerced or tested.}
23 |
24 | \item{filter}{if non-\code{NULL}, a text filter for the converted result.}
25 |
26 | \item{\dots}{text filter properties to set on the result.}
27 |
28 | \item{names}{if non-\code{NULL} character vector of names for
29 | the converted result.}
30 | }
31 | \details{
32 | The \code{corpus_text} type is a new data type provided by the \code{corpus}
33 | package suitable for processing international (Unicode) text. Text vectors
34 | behave like character vectors (and can be converted to them with the
35 | \code{as.character} function). They can be created using the
36 | \code{\link{read_ndjson}} function or by converting another object using the
37 | \code{as_corpus_text} function.
38 |
39 | All text objects have a \code{\link{text_filter}} property that specifies
40 | how to transform the text into tokens or segment it into sentences.
41 |
42 | The default behavior for \code{as_corpus_text} is to proceed as follows:
43 | \enumerate{
44 | \item If \code{x} is a \code{character} vector, then we create
45 | a new \code{text} vector from \code{x}.
46 |
47 | \item If \code{x} is a data frame, then we call \code{as_corpus_text}
48 | on \code{x$text} if a column named \code{"text"} exists in
49 | the data frame. If the data frame does not have a column
50 | named \code{"text"}, then we fail with an error message.
51 |
52 | \item If \code{x} is a \code{corpus_text} object, then we drop all
53 | attributes and we set the class to \code{"corpus_text"}.
54 |
55 | \item The default behavior for when none of the above conditions
56 | are true is to call \code{as.character} on the object first,
57 | preserving the names, and then call \code{as_corpus_text} on
58 | the returned character object.
59 | }
60 |
61 | In all cases, when \code{names} is \code{NULL}, we set the result
62 | names to \code{names(x)} (or \code{rownames(x)} for a data frame
63 | argument). When \code{names} is a character vector, we set the result names
64 | to this vector of names.
65 |
66 | Similarly, when \code{filter} is \code{NULL}, we set the result text
67 | filter to \code{text_filter(x)}. When \code{filter} is non-\code{NULL},
68 | we set the result text filter to this value. In either case,
69 | if there are additional named arguments, then we override the filter
70 | properties specified by the names of these arguments with the new values
71 | given.
72 |
73 | Note that the special handling for the names of the object is different
74 | from the other R conversion functions (\code{as.numeric},
75 | \code{as.character}, etc.), which drop the names.
76 |
77 | \code{as_corpus_text} is generic: you can write methods to handle specific
78 | classes of objects.
79 | }
80 | \value{
81 | \code{as_corpus_text} attempts to coerce its argument to \code{text} type and
82 | set its \code{names} and \code{text_filter} properties; it strips
83 | all other attributes.
84 |
85 | \code{is_corpus_text} returns \code{TRUE} or \code{FALSE} depending on
86 | whether its argument is of text type or not.
87 | }
88 | \seealso{
89 | \code{\link{as_utf8}}, \code{\link{text_filter}}, \code{\link{read_ndjson}}.
90 | }
91 | \examples{
92 | as_corpus_text("hello, world!")
93 | as_corpus_text(c(a = "goodnight", b = "moon")) # keeps names
94 |
95 | # set a filter property
96 | as_corpus_text(c(a = "goodnight", b = "moon"), stemmer = "english")
97 |
98 | is_corpus_text("hello") # FALSE, "hello" is character, not text
99 | }
100 | \keyword{classes}
101 |
--------------------------------------------------------------------------------
/man/federalist.Rd:
--------------------------------------------------------------------------------
1 | \name{federalist}
2 | \docType{data}
3 | \alias{federalist}
4 | \title{The Federalist Papers}
5 | \description{
6 | \cite{The Federalist Papers} comprise 85 articles published under the
7 | pseudonym \dQuote{Publius} in New York newspapers between 1787 and
8 | 1788, written to convince residents to ratify the \cite{Constitution}.
9 | John Jay wrote 5 papers, while Alexander Hamilton and James Madison
10 | wrote the remaining 80. Between the last two authors there are
11 | conflicting accounts of which author wrote which paper. Most sources
12 | agree on the authorships of 65 papers (51 by Hamilton and 14 by Madison),
13 | but 15 papers are in dispute.
14 |
15 | In one of the earliest examples of statistical text analysis, F. Mosteller
16 | and D. L. Wallace used a form of Naive Bayes classification to identify
17 | the authorships of the 15 disputed papers, finding strong evidence that
18 | Madison was the author of all of the disputed papers.
19 | }
20 | \usage{federalist}
21 | \format{A data frame with 85 rows, one for each paper.}
22 | \source{\url{http://www.gutenberg.org/ebooks/18}}
23 | \references{
24 | Mosteller, F. and Wallace, D. L. (1963).
25 | Inference in an authorship problem.
26 | \emph{Journal of the American Statistical Association}
27 | \strong{58}, 275--309.
28 | }
29 | \keyword{datasets}
30 |
--------------------------------------------------------------------------------
/man/figures/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/man/figures/banner.png
--------------------------------------------------------------------------------
/man/figures/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/man/figures/logo.png
--------------------------------------------------------------------------------
/man/figures/logo/01_make_logo.R:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/Rscript --vanilla
2 |
3 | library("magick")
4 |
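5 | # read the slide exported from Pages and crop out the banner region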
5 | slide <- image_read("logo-slide.tiff")
6 | banner <- image_crop(slide, "370x80+2+2")
7 | image_write(banner, "../banner.png", format = "png")
8 |
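9 | # scale the public-domain drop capital down to logo size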
9 | ccap <- image_read("c-07.jpg")
10 | logo <- image_scale(ccap, "80x80")
11 | image_write(logo, "../logo.png", format = "png")
12 |
--------------------------------------------------------------------------------
/man/figures/logo/README:
--------------------------------------------------------------------------------
1 |
2 | The logo was created in Mac OS Pages, then exported to TIFF. The script
3 | `01_make_logo.R` crops the TIFF and converts it to PNG.
4 |
5 | --
6 |
7 | The C drop capital is in the public domain, from http://www.reusableart.com/c-07.html
8 |
9 | "This and numerous other stock print foundry images were included in the book
10 | The Proverbs of Scotland from 1868. It was published by Alexander Hislop &
11 | Company."
12 |
13 |
14 | Font is Matthew Butterick's "Equity" (Caps A Regular, 96pt).
15 |
--------------------------------------------------------------------------------
/man/figures/logo/c-07.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/man/figures/logo/c-07.jpg
--------------------------------------------------------------------------------
/man/figures/logo/logo-slide.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/man/figures/logo/logo-slide.tiff
--------------------------------------------------------------------------------
/man/figures/logo/logo.key:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/man/figures/logo/logo.key
--------------------------------------------------------------------------------
/man/gutenberg_corpus.Rd:
--------------------------------------------------------------------------------
1 | \name{gutenberg_corpus}
2 | \alias{gutenberg_corpus}
3 | \title{Project Gutenberg Corpora}
4 | \description{
5 | Get a corpus of texts from Project Gutenberg.
6 | }
7 | \usage{
8 | gutenberg_corpus(ids, filter = NULL, mirror = NULL, verbose = TRUE, ...)
9 | }
10 | \arguments{
11 | \item{ids}{an integer vector of requested Gutenberg text IDs.}
12 |
13 | \item{filter}{a text filter to set on the corpus.}
14 |
15 | \item{mirror}{a character string URL for the Gutenberg mirror to use,
16 | or \code{NULL} to determine the mirror automatically.}
17 |
18 | \item{verbose}{a logical scalar indicating whether to print progress
19 | updates to the console.}
20 |
21 | \item{...}{additional arguments passed to \code{as_corpus}.}
22 | }
23 | \details{
24 | \code{gutenberg_corpus} downloads a set of texts from Project Gutenberg,
25 | creating a corpus with the texts as rows. You specify the texts for inclusion
26 | using their Project Gutenberg IDs, passed to the function in the
27 | \code{ids} argument.
28 |
29 | You can search for Project Gutenberg texts and get their IDs using the
30 | \code{gutenberg_works} function from the \code{gutenbergr} package.
31 | }
32 | \value{
33 | A corpus (data frame) with three columns: \code{"title"}, \code{"author"},
34 | and \code{"text"}.
35 | }
36 | \seealso{
37 | \code{\link{corpus_frame}}.
38 | }
39 | \examples{
40 | # get the texts of George Eliot's novels
41 | \dontrun{eliot <- gutenberg_corpus(c(145, 550, 6688))}
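42 |
43 | # a sketch: look up Gutenberg IDs by author with the gutenbergr package
44 | \dontrun{gutenbergr::gutenberg_works(author == "Eliot, George")}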
42 | }
43 |
--------------------------------------------------------------------------------
/man/new_stemmer.Rd:
--------------------------------------------------------------------------------
1 | \name{new_stemmer}
2 | \alias{new_stemmer}
3 | \title{Stemmer Construction}
4 | \description{
5 | Make a stemmer from a set of (term, stem) pairs.
6 | }
7 | \usage{
8 | new_stemmer(term, stem, default = NULL, duplicates = "first",
9 | vectorize = TRUE)
10 | }
11 | \arguments{
12 | \item{term}{character vector of terms to stem.}
13 |
14 | \item{stem}{character vector the same length as \code{term} with entries
15 | giving the corresponding stems.}
16 |
17 | \item{default}{if non-\code{NULL}, a default value to use for terms
18 | that do not have a stem; \code{NULL} specifies that such terms
19 | should be left unchanged.}
20 |
21 | \item{duplicates}{action to take for duplicates in the \code{term} list;
22 | see \sQuote{Details}.}
23 |
24 | \item{vectorize}{whether to produce a vectorized stemmer that accepts and
25 | returns vector arguments.}
26 | }
27 | \details{
28 | Given a list of terms and a corresponding list of stems, this produces a
29 | function that maps each term to its corresponding stem. If
30 | \code{default = NULL}, then values absent from the \code{term} argument
31 | get left as-is; otherwise, they get replaced by the \code{default} value.
32 |
33 | The \code{duplicates} argument indicates the action to take if
34 | there are duplicate entries in the \code{term} argument:
35 | \itemize{
36 | \item \code{duplicates = "first"} take the first matching entry in the
37 | \code{stem} list.
38 |
39 | \item \code{duplicates = "last"} take the last matching entry in the
40 | \code{stem} list.
41 |
42 | \item \code{duplicates = "omit"} use the \code{default} value for
43 | duplicated terms.
44 |
45 | \item \code{duplicates = "fail"} raise an error if there are duplicated
46 | terms.
47 | }
48 | }
49 | \value{
50 | By default, with \code{vectorize = TRUE}, the resulting stemmer accepts a
51 | character vector as input and returns a character vector of the same length
52 | with entries giving the stems of the corresponding input entries.
53 |
54 | Setting \code{vectorize = FALSE} gives a function that accepts a single input
55 | and returns a single output. This can be more efficient when used as part of
56 | a \code{\link{text_filter}}.
57 | }
58 | \seealso{
59 | \code{\link{stem_snowball}}, \code{\link{text_filter}}, \code{\link{text_tokens}}.
60 | }
61 | \examples{
62 | # map uppercase to lowercase, leave others unchanged
63 | stemmer <- new_stemmer(LETTERS, letters)
64 | stemmer(c("A", "E", "I", "O", "U", "1", "2", "3"))
65 |
66 | # map uppercase to lowercase, drop others
67 | stemmer <- new_stemmer(LETTERS, letters, default = NA)
68 | stemmer(c("A", "E", "I", "O", "U", "1", "2", "3"))
69 | }
70 |
--------------------------------------------------------------------------------
/man/print.corpus_frame.Rd:
--------------------------------------------------------------------------------
1 | \name{print.corpus_frame}
2 | \title{Corpus Data Frame Printing}
3 | \alias{format.corpus_frame}
4 | \alias{print.corpus_frame}
5 | \description{
6 | Printing and formatting corpus data frames.
7 | }
8 | \usage{
9 | \method{print}{corpus_frame}(x, rows = 20L, chars = NULL, digits = NULL,
10 | quote = FALSE, na.print = NULL, print.gap = NULL, right = FALSE,
11 | row.names = TRUE, max = NULL, display = TRUE, ...)
12 |
13 | \method{format}{corpus_frame}(x, chars = NULL, na.encode = TRUE, quote = FALSE,
14 | na.print = NULL, print.gap = NULL, ..., justify = "none")
15 | }
16 | \arguments{
17 | \item{x}{data frame object to print or format.}
18 |
19 | \item{rows}{integer scalar giving the maximum number of rows to print
20 | before truncating the output. A negative or missing value indicates
21 | no upper limit.}
22 |
23 | \item{chars}{maximum number of character units to display; see
24 | \code{\link{utf8_format}}.}
25 |
26 | \item{digits}{minimal number of significant digits; see
27 | \code{\link{print.default}}.}
28 |
29 | \item{quote}{logical scalar indicating whether to put surrounding
30 | double-quotes (\code{'"'}) around character strings and escape
31 | internal double-quotes.}
32 |
33 | \item{na.print}{character string (or \code{NULL}) indicating
34 | the encoding for \code{NA} values. Ignored when
35 | \code{na.encode} is \code{FALSE}.}
36 |
37 | \item{print.gap}{non-negative integer (or \code{NULL}) giving the
38 | number of spaces in gaps between columns; set to \code{NULL}
39 | or \code{1} for a single space.}
40 |
41 | \item{right}{logical indicating whether to right-align columns
42 | (ignored for text, character, and factor columns).}
43 |
44 | \item{row.names}{logical indicating whether to print row names, or
45 | a character vector giving alternate row names to display.}
46 |
47 | \item{max}{maximum number of entries to print; defaults to
48 | \code{getOption("max.print")}.}
49 |
50 | \item{display}{logical scalar indicating whether to optimize the
51 | printing for display, not byte-for-byte data transmission;
52 | see \code{utf8_encode}.}
53 |
54 | \item{justify}{justification; one of \code{"left"}, \code{"right"},
55 | \code{"centre"}, or \code{"none"}. Can be abbreviated.}
56 |
57 | \item{na.encode}{logical scalar indicating whether to encode
58 | \code{NA} values as character strings.}
59 |
60 | \item{...}{further arguments passed to or from other methods.}
61 | }
62 | \details{
63 | The \code{"corpus_frame"} class is a subclass of \code{"data.frame"},
64 | overriding the default print and format methods. To apply this
65 | class to a data frame, set its class to
66 | \code{c("corpus_frame", "data.frame")}.
67 |
68 | Corpus frame printing left-justifies character and text columns,
69 | truncates the output, and displays emoji on Mac OS.
70 | }
71 | \seealso{
72 | \code{\link{corpus_frame}}, \code{\link{print.data.frame}},
73 | \code{\link{utf8_print}}
74 | }
75 | \examples{
76 | # default data frame printing
77 | x <- data.frame(text = c("hello world", intToUtf8(0x1f638 + 0:3), letters))
78 | print(x)
79 |
80 | # corpus frame printing
81 | y <- x
82 | class(y) <- c("corpus_frame", "data.frame")
83 | print(y)
84 |
85 | print(y, 10) # change truncation limit
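86 |
87 | # a sketch of the format method: same truncation rules, but the result
88 | # is returned rather than printed
89 | format(y, chars = 5)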
86 | }
87 |
--------------------------------------------------------------------------------
/man/sentiment_afinn.Rd:
--------------------------------------------------------------------------------
1 | \name{sentiment_afinn}
2 | \docType{data}
3 | \alias{sentiment_afinn}
4 | \title{AFINN Sentiment Lexicon}
5 | \description{
6 | The AFINN lexicon is a list of English terms manually rated
7 | for valence with an integer between -5 (negative) and +5
8 | (positive) by Finn \enc{Årup}{Aarup} Nielsen between
9 | 2009 and 2011.
10 |
11 | The original lexicon contains some multi-word phrases, but they
12 | are excluded here.
13 |
14 | The original lexicon is distributed under the
15 | \href{https://opendatacommons.org/licenses/odbl/1-0/}{Open Database License (ODbL) v1.0}.
16 | You are free to share, create works from, and adapt the lexicon, as long as you
17 | attribute the original lexicon in your work. If you adapt the lexicon, you
18 | must keep the adapted lexicon open and apply a similar license.
19 | }
20 | \usage{sentiment_afinn}
21 | \format{A data frame with one row for each term.}
22 | \source{\url{http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010}}
23 | \references{
24 | Nielsen, F. \enc{Å}{Aa}. (2011).
25 | A new ANEW: Evaluation of a word list for sentiment analysis in microblogs.
26 | \emph{Proceedings of the ESWC2011 Workshop on 'Making Sense of Microposts':
27 | Big things come in small packages}, CEUR Workshop Proceedings 718, 93--98.
28 | \url{http://arxiv.org/abs/1103.2903}.
29 | }
30 | \keyword{datasets}
31 |
--------------------------------------------------------------------------------
/man/stem_snowball.Rd:
--------------------------------------------------------------------------------
1 | \name{stem_snowball}
2 | \alias{stem_snowball}
3 | \title{Snowball Stemmer}
4 | \description{
5 | Stem a set of terms using one of the algorithms provided by the
6 | Snowball stemming library.
7 | }
8 | \usage{
9 | stem_snowball(x, algorithm = "en")
10 | }
11 | \arguments{
12 | \item{x}{character vector of terms to stem.}
13 |
14 | \item{algorithm}{stemming algorithm; see \sQuote{Details} for the valid
15 | choices.}
16 | }
17 | \details{
18 | Apply a Snowball stemming algorithm to a vector of input terms, \code{x},
19 | returning the result in a character vector of the same length with the
20 | same names.
21 |
22 | The \code{algorithm} argument specifies the stemming algorithm. Valid choices
23 | include the following:
24 | \code{"ar"} (\code{"arabic"}),
25 | \code{"da"} (\code{"danish"}),
26 | \code{"de"} (\code{"german"}),
27 | \code{"en"} (\code{"english"}),
28 | \code{"es"} (\code{"spanish"}),
29 | \code{"fi"} (\code{"finnish"}),
30 | \code{"fr"} (\code{"french"}),
31 | \code{"hu"} (\code{"hungarian"}),
32 | \code{"it"} (\code{"italian"}),
33 | \code{"nl"} (\code{"dutch"}),
34 | \code{"no"} (\code{"norwegian"}),
35 | \code{"pt"} (\code{"portuguese"}),
36 | \code{"ro"} (\code{"romanian"}),
37 | \code{"ru"} (\code{"russian"}),
38 | \code{"sv"} (\code{"swedish"}),
39 | \code{"ta"} (\code{"tamil"}),
40 | \code{"tr"} (\code{"turkish"}),
41 | and \code{"porter"}.
42 | Setting \code{algorithm = NULL} gives a stemmer that returns its input
43 | unchanged.
44 |
45 | The function only stems single-word terms of kind "letter"; it leaves
46 | other inputs (multi-word terms, and terms of kind "number", "punct", and
47 | "symbol") unchanged.
48 |
49 | The \href{http://snowballstem.org/algorithms/}{Snowball stemming library}
50 | provides the underlying implementation. The \code{wordStem} function from
51 | the \pkg{SnowballC} package provides a similar interface, but that function
52 | applies the algorithm to all input terms, regardless of the kind of the term.
53 | }
54 | \value{
55 | A character vector the same length and names as the input, \code{x}, with
56 | entries containing the corresponding stems.
57 | }
58 | \seealso{
59 | \code{\link{new_stemmer}}, \code{\link{text_filter}}.
60 | }
61 | \examples{
62 | # apply english stemming algorithm; don't stem non-letter terms
63 | stem_snowball(c("win", "winning", "winner", "#winning"))
64 |
65 | # compare with SnowballC, which stems all kinds, not just letter
66 | \dontrun{SnowballC::wordStem(c("win", "winning", "winner", "#winning"), "en")}
67 | }
68 |
--------------------------------------------------------------------------------
/man/stopwords.Rd:
--------------------------------------------------------------------------------
1 | \name{stopwords}
2 | \docType{data}
3 | \alias{stopwords}
4 | \alias{stopwords_da}
5 | \alias{stopwords_de}
6 | \alias{stopwords_en}
7 | \alias{stopwords_es}
8 | \alias{stopwords_fi}
9 | \alias{stopwords_fr}
10 | \alias{stopwords_hu}
11 | \alias{stopwords_it}
12 | \alias{stopwords_nl}
13 | \alias{stopwords_no}
14 | \alias{stopwords_pt}
15 | \alias{stopwords_ru}
16 | \alias{stopwords_sv}
17 | \title{Stop Words}
18 | \description{
19 | Lists of common function words (\sQuote{stop} words).
20 | }
21 | \details{
22 | The \code{stopwords_} objects are character vectors of case-folded
23 | \sQuote{stop} words. These are common function words that often get discarded
24 | before performing other text analysis tasks.
25 |
26 | There are lists available for the following languages:
27 | Danish (\code{stopwords_da}), Dutch (\code{stopwords_nl}),
28 | English (\code{stopwords_en}), Finnish (\code{stopwords_fi}),
29 | French (\code{stopwords_fr}), German (\code{stopwords_de}),
30 | Hungarian (\code{stopwords_hu}), Italian (\code{stopwords_it}),
31 | Norwegian (\code{stopwords_no}), Portuguese (\code{stopwords_pt}),
32 | Russian (\code{stopwords_ru}), Spanish (\code{stopwords_es}),
33 | and Swedish (\code{stopwords_sv}).
34 |
35 | These built-in word lists are reasonable defaults, but they may require
36 | further tailoring to suit your particular task. The original lists were
37 | compiled by the \href{http://snowballstem.org/}{Snowball stemming project}.
38 | Following the Quanteda text analysis software, we have tailored the original
39 | lists by adding the word "will" to the English list.
40 | }
41 | \usage{
42 | stopwords_da
43 | stopwords_de
44 | stopwords_en
45 | stopwords_es
46 | stopwords_fi
47 | stopwords_fr
48 | stopwords_hu
49 | stopwords_it
50 | stopwords_nl
51 | stopwords_no
52 | stopwords_pt
53 | stopwords_ru
54 | stopwords_sv
55 | }
56 | \format{A character vector of unique stop words.}
57 | \seealso{
58 | \code{\link{text_filter}}
59 | }
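60 | \examples{
61 | # the lists are plain character vectors; a small sketch of tailoring
62 | # the English list to a particular task
63 | head(stopwords_en)
64 | c(stopwords_en, "however")
65 | }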
60 | \keyword{datasets}
61 |
--------------------------------------------------------------------------------
/man/term_matrix.Rd:
--------------------------------------------------------------------------------
1 | \name{term_matrix}
2 | \alias{term_counts}
3 | \alias{term_matrix}
4 | \title{Term Frequency Tabulation}
5 | \description{
6 | Tokenize a set of texts and compute a term frequency matrix.
7 | }
8 | \usage{
9 | term_matrix(x, filter = NULL, ngrams = NULL, select = NULL,
10 | group = NULL, transpose = FALSE, ...)
11 |
12 | term_counts(x, filter = NULL, ngrams = NULL, select = NULL,
13 | group = NULL, ...)
14 | }
15 | \arguments{
16 | \item{x}{a text vector to tokenize.}
17 |
18 | \item{filter}{if non-\code{NULL}, a text filter to use instead of
19 | the default text filter for \code{x}.}
20 |
21 | \item{ngrams}{an integer vector of n-gram lengths to include, or
22 | \code{NULL} to use the \code{select} argument to determine the
23 | n-gram lengths.}
24 |
25 | \item{select}{a character vector of terms to count, or \code{NULL} to
26 | count all terms that appear in \code{x}.}
27 |
28 | \item{group}{if non-\code{NULL}, a factor, character vector, or
29 | integer vector the same length as \code{x} specifying the grouping
30 | behavior.}
31 |
32 | \item{transpose}{a logical value indicating whether to transpose the
33 | result, putting terms as rows instead of columns.}
34 |
35 | \item{\dots}{additional properties to set on the text filter.}
36 | }
37 | \details{
38 | \code{term_matrix} tokenizes a set of texts and computes the occurrence
39 | counts for each term, returning the result as a sparse matrix
40 | (texts-by-terms). \code{term_counts} returns the same information, but
41 | in a data frame.
42 |
43 | If \code{ngrams} is non-\code{NULL}, then multi-type n-grams are
44 | included in the output for all lengths appearing in the \code{ngrams}
45 | argument. If \code{ngrams} is \code{NULL} but \code{select} is
46 | non-\code{NULL}, then all n-grams appearing in the \code{select} set
47 | are included. If both \code{ngrams} and \code{select} are \code{NULL},
48 | then only unigrams (single type terms) are included.
49 |
50 | If \code{group} is \code{NULL}, then the output has one set of term
51 | counts for each input text. Otherwise, we convert \code{group} to
52 | a \code{factor} and compute one set of term counts for each level.
53 | Texts with \code{NA} values for \code{group} get skipped.
54 | }
55 | \value{
56 | \code{term_matrix} with \code{transpose = FALSE} returns a sparse matrix
57 | in \code{"dgCMatrix"} format with one column for each term and one row for
58 | each input text or (if \code{group} is non-\code{NULL}) for each grouping
59 | level. If \code{filter$select} is non-\code{NULL}, then the column names
60 | will be equal to \code{filter$select}. Otherwise, the columns are assigned
61 | in arbitrary order.
62 |
63 | \code{term_matrix} with \code{transpose = TRUE} returns the transpose of
64 | the term matrix, in \code{"dgCMatrix"} format.
65 |
66 | \code{term_counts} with \code{group = NULL} returns a data frame with one
67 | row for each entry of the term matrix, and columns \code{"text"},
68 | \code{"term"}, and \code{"count"} giving the text ID, term, and count.
69 | The \code{"term"} column is a factor with levels equal to the selected
70 | terms. The \code{"text"}
71 | column is a factor with levels equal to \code{names(as_corpus_text(x))};
72 | calling \code{as.integer} on the \code{"text"} column converts from
73 | the factor values to the integer row index in the term matrix.
74 |
75 | \code{term_counts} with \code{group} non-\code{NULL} behaves similarly,
76 | but the result instead has columns named \code{"group"}, \code{"term"},
77 | and \code{"count"}, with \code{"group"} giving the grouping level, as
78 | a factor.
79 | }
80 | \seealso{
81 | \code{\link{text_tokens}}, \code{\link{term_stats}}.
82 | }
83 | \examples{
84 | text <- c("A rose is a rose is a rose.",
85 | "A Rose is red, a violet is blue!",
86 | "A rose by any other name would smell as sweet.")
87 | term_matrix(text)
88 |
89 | # select certain terms
90 | term_matrix(text, select = c("rose", "red", "violet", "sweet"))
91 |
92 | # specify a grouping factor
93 | term_matrix(text, group = c("Good", "Bad", "Good"))
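94 |
95 | # texts with NA group values get skipped (see Details)
96 | term_matrix(text, group = c("Good", NA, "Good"))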
94 |
95 | # include higher-order n-grams
96 | term_matrix(text, ngrams = 1:3)
97 |
98 | # select certain multi-type terms
99 | term_matrix(text, select = c("a rose", "a violet", "sweet", "smell"))
100 |
101 | # transpose the result
102 | term_matrix(text, ngrams = 1:2, transpose = TRUE)[1:10, ] # first 10 rows
103 |
104 | # data frame
105 | head(term_counts(text), n = 10) # first 10 rows
106 |
107 | # with grouping
108 | term_counts(text, group = c("Good", "Bad", "Good"))
109 |
110 | # taking names from the input
111 | term_counts(c(a = "One sentence.", b = "Another", c = "!!"))
112 | }
113 |
--------------------------------------------------------------------------------
/man/term_stats.Rd:
--------------------------------------------------------------------------------
1 | \name{term_stats}
2 | \alias{term_stats}
3 | \title{Term Statistics}
4 | \description{
5 | Tokenize a set of texts and tabulate the term occurrence statistics.
6 | }
7 | \usage{
8 | term_stats(x, filter = NULL, ngrams = NULL,
9 | min_count = NULL, max_count = NULL,
10 | min_support = NULL, max_support = NULL, types = FALSE,
11 | subset, ...)
12 | }
13 | \arguments{
14 | \item{x}{a text vector to tokenize.}
15 |
16 | \item{filter}{if non-\code{NULL}, a text filter to use instead of
17 | the default text filter for \code{x}.}
18 |
19 | \item{ngrams}{an integer vector of n-gram lengths to include, or
20 | \code{NULL} for length-1 n-grams only.}
21 |
22 | \item{min_count}{a numeric scalar giving the minimum term count to include
23 | in the output, or \code{NULL} for no minimum count.}
24 |
25 | \item{max_count}{a numeric scalar giving the maximum term count to include
26 | in the output, or \code{NULL} for no maximum count.}
27 |
28 | \item{min_support}{a numeric scalar giving the minimum term support to
29 | include in the output, or \code{NULL} for no minimum support.}
30 |
31 | \item{max_support}{a numeric scalar giving the maximum term support to
32 | include in the output, or \code{NULL} for no maximum support.}
33 |
34 | \item{types}{a logical value indicating whether to include columns for
35 | the types that make up the terms.}
36 |
37 | \item{subset}{logical expression indicating elements or rows to keep;
38 | missing values are taken as \code{FALSE}.}
39 |
40 | \item{\dots}{additional properties to set on the text filter.}
41 | }
42 | \details{
43 | \code{term_stats} tokenizes a set of texts and computes the occurrence
44 | counts and supports for each term. The \sQuote{count} is the number of
45 | occurrences of the term across all texts; the \sQuote{support} is the
46 | number of texts containing the term. Each appearance of a term
47 | increments its count by one. In contrast, a term's support gets
48 | incremented at most once per text, regardless of how many times
49 | the term occurs in that text.
50 |
51 | To include multi-type terms, specify the desired term lengths using
52 | the \code{ngrams} argument.
53 | }
54 | \value{
55 | A data frame with columns named \code{term}, \code{count}, and
56 | \code{support}, with one row for each appearing term. Rows are sorted
57 | in descending order according to \code{support} and then \code{count},
58 | with ties broken lexicographically by \code{term}, using the
59 | character ordering determined by the current locale
60 | (see \code{\link{Comparison}} for details).
61 |
62 | If \code{types = TRUE}, then the result also includes columns named
63 | \code{type1}, \code{type2}, etc. for the types that make up the
64 | term.
65 | }
66 | \seealso{
67 | \code{\link{text_tokens}}, \code{\link{term_matrix}}.
68 | }
69 | \examples{
70 | term_stats("A rose is a rose is a rose.")
71 |
72 | # remove punctuation and English stop words
73 | term_stats("A rose is a rose is a rose.",
74 | text_filter(drop_symbol = TRUE, drop = stopwords_en))
75 |
76 | # unigrams, bigrams, and trigrams
77 | term_stats("A rose is a rose is a rose.", ngrams = 1:3)
78 |
79 | # also include the type information
80 | term_stats("A rose is a rose is a rose.", ngrams = 1:3, types = TRUE)
81 | }
82 |
--------------------------------------------------------------------------------
/man/text_locate.Rd:
--------------------------------------------------------------------------------
1 | \name{text_locate}
2 | \alias{text_count}
3 | \alias{text_detect}
4 | \alias{text_locate}
5 | \alias{text_match}
6 | \alias{text_sample}
7 | \alias{text_subset}
8 | \title{Searching for Terms}
9 | \description{
10 | Look for instances of one or more terms in a set of texts.
11 | }
12 | \usage{
13 | text_locate(x, terms, filter = NULL, ...)
14 |
15 | text_count(x, terms, filter = NULL, ...)
16 |
17 | text_detect(x, terms, filter = NULL, ...)
18 |
19 | text_match(x, terms, filter = NULL, ...)
20 |
21 | text_sample(x, terms, size = NULL, filter = NULL, ...)
22 |
23 | text_subset(x, terms, filter = NULL, ...)
24 | }
25 | \arguments{
26 | \item{x}{a text or character vector.}
27 |
28 | \item{terms}{a character vector of search terms.}
29 |
30 | \item{filter}{if non-\code{NULL}, a text filter to use instead of
31 | the default text filter for \code{x}.}
32 |
33 | \item{size}{the maximum number of results to return, or \code{NULL}.}
34 |
35 | \item{\dots}{additional properties to set on the text filter.}
36 | }
37 | \details{
38 | \code{text_locate} finds all instances of the search terms in the
39 | input text, along with their contexts.
40 |
41 | \code{text_count} counts the number of search term instances in
42 | each element of the text vector.
43 |
44 | \code{text_detect} indicates whether each text contains at least
45 | one of the search terms.
46 |
47 | \code{text_match} reports the matching instances as a factor variable
48 | with levels equal to the \code{terms} argument.
49 |
50 | \code{text_subset} returns the texts that contain the search terms.
51 |
52 | \code{text_sample} returns a random sample of the results from
53 | \code{text_locate}, in random order. This is useful for
54 | hand-inspecting a subset of the \code{text_locate} matches.
55 | }
56 | \value{
57 | \code{text_count} and \code{text_detect} return a numeric vector and
58 | a logical vector, respectively, with length equal to the number of input
59 | texts and names equal to the text names.
60 |
61 | \code{text_locate} and \code{text_sample} both return a data frame with
62 | one row for each search result and columns named \sQuote{text}, \sQuote{before},
63 | \sQuote{instance}, and \sQuote{after}. The \sQuote{text} column gives
64 | the name of the text containing the instance; \sQuote{before} and
65 | \sQuote{after} are text vectors giving the text before and after the
66 | instance. The \sQuote{instance} column gives the token or tokens matching
67 | the search term.
68 |
69 | \code{text_match} returns a data frame with one row for each search
70 | result, with columns named \sQuote{text} and \sQuote{term}. Both columns
71 | are factors. The \sQuote{text} column has levels equal to the text labels,
72 | and the \sQuote{term} column has levels equal to the \code{terms} argument.
73 |
74 | \code{text_subset} returns the subset of texts that contain the given
75 | search terms. The result has its \code{text_filter} set to the
76 | passed-in \code{filter} argument.
77 | }
78 | \seealso{
79 | \code{\link{term_stats}}, \code{\link{term_matrix}}.
80 | }
81 | \examples{
82 | text <- c("Rose is a rose is a rose is a rose.",
83 | "A rose by any other name would smell as sweet.",
84 | "Snow White and Rose Red")
85 |
86 | text_count(text, "rose")
87 | text_detect(text, "rose")
88 | text_locate(text, "rose")
89 | text_match(text, "rose")
90 | text_sample(text, "rose", 3)
91 | text_subset(text, "a rose")
92 |
93 | # search for multiple terms
94 | text_locate(text, c("rose", "rose red", "snow white"))
95 | }
96 |
--------------------------------------------------------------------------------
/man/text_stats.Rd:
--------------------------------------------------------------------------------
1 | \name{text_stats}
2 | \alias{text_stats}
3 | \title{Text Statistics}
4 | \description{
5 | Report descriptive statistics for a set of texts.
6 | }
7 | \usage{
8 | text_stats(x, filter = NULL, ...)
9 | }
10 | \arguments{
11 | \item{x}{a text corpus.}
12 |
13 | \item{filter}{if non-\code{NULL}, a text filter to use instead of
14 | the default text filter for \code{x}.}
15 |
16 | \item{\dots}{additional properties to set on the text filter.}
17 | }
18 | \details{
19 | \code{text_stats} reports descriptive statistics for a set of texts:
20 | the number of tokens, unique types, and sentences.
21 | }
22 | \value{
23 | A data frame with columns named \code{tokens}, \code{types}, and
24 | \code{sentences}, with one row for each text.
25 | }
26 | \seealso{
27 | \code{\link{text_filter}}, \code{\link{term_stats}}.
28 | }
29 | \examples{
30 | text_stats(c("A rose is a rose is a rose.",
31 | "A Rose is red. A violet is blue!"))
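32 |
33 | # a sketch: the counts respond to filter properties such as drop_punct
34 | text_stats("A rose is a rose.", drop_punct = TRUE)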
32 | }
33 |
--------------------------------------------------------------------------------
/man/text_sub.Rd:
--------------------------------------------------------------------------------
1 | \name{text_sub}
2 | \alias{text_sub}
3 | \title{Text Subsequences}
4 | \description{
5 | Extract token subsequences from a set of texts.
6 | }
7 | \usage{
8 | text_sub(x, start = 1L, end = -1L, filter = NULL, ...)
9 | }
10 | \arguments{
11 | \item{x}{text vector or corpus object.}
12 |
13 | \item{start}{integer vector giving the starting positions of the
14 | subsequences, or a two-column integer matrix giving the starting
15 | and ending positions.}
16 |
17 | \item{end}{integer vector giving the ending positions of the
18 | subsequences; ignored if \code{start} is a two-column matrix.}
19 |
20 | \item{filter}{if non-\code{NULL}, a text filter to use instead of
21 | the default text filter for \code{x}.}
22 |
23 | \item{\dots}{additional properties to set on the text filter.}
24 | }
25 | \details{
26 | \code{text_sub} extracts token subsequences from a set of texts.
27 | The \code{start} and \code{end} arguments specify the
28 | positions of the subsequences within the parent texts, as an inclusive
29 | range. Negative indices are interpreted as counting from the end of
30 | the text, with \code{-1L} referring to the last element.
31 | }
32 | \value{
33 | A text vector with the same length and names as \code{x}, with the
34 | desired subsequences.
35 | }
36 | \seealso{
37 | \code{\link{text_tokens}}, \code{\link{text_ntoken}}.
38 | }
39 | \examples{
40 | x <- as_corpus_text(c("A man, a plan.", "A \"canal\"?", "Panama!"),
41 | drop_punct = TRUE)
42 |
43 | # entire text
44 | text_sub(x, 1, -1)
45 |
46 | # first three elements
47 | text_sub(x, 1, 3)
48 |
49 | # last two elements
50 | text_sub(x, -2, -1)
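51 |
52 | # start can also be a two-column matrix of (start, end) positions
53 | text_sub(x, cbind(1, c(1, 2, 1)))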
51 | }
52 |
--------------------------------------------------------------------------------
/man/text_types.Rd:
--------------------------------------------------------------------------------
1 | \name{text_types}
2 | \alias{text_ntype}
3 | \alias{text_types}
4 | \title{Text Type Sets}
5 | \description{
6 | Get or measure the set of types (unique token values).
7 | }
8 | \usage{
9 | text_types(x, filter = NULL, collapse = FALSE, ...)
10 |
11 | text_ntype(x, filter = NULL, collapse = FALSE, ...)
12 | }
13 | \arguments{
14 | \item{x}{a text or character vector.}
15 |
16 | \item{filter}{if non-\code{NULL}, a text filter to use instead of
17 | the default text filter for \code{x}.}
18 |
19 | \item{collapse}{a logical value indicating whether to collapse the
20 | aggregation over all rows of the input.}
21 |
22 | \item{\dots}{additional properties to set on the text filter.}
23 | }
24 | \details{
25 | \code{text_ntype} counts the number of unique types in each text;
26 | \code{text_types} returns the set of unique types, as a character
27 | vector. Types are determined according to the \code{filter} argument.
28 | }
29 | \value{
30 | If \code{collapse = FALSE}, then \code{text_ntype} produces a numeric
31 | vector with the same length and names as the input text, with the elements
32 | giving the number of unique types in the corresponding texts. For
33 | \code{text_types}, the result is a list of character vectors, with each
34 | vector giving the unique types in the corresponding text, ordered
35 | according to the \code{\link{sort}} function.
36 |
37 | If \code{collapse = TRUE}, then we aggregate over all rows of the input.
38 | In this case, \code{text_ntype} produces a scalar indicating the number
39 | of unique types in \code{x}, and \code{text_types} produces a character
40 | vector with the unique types.
41 | }
42 | \seealso{
43 | \code{\link{text_filter}}, \code{\link{text_tokens}}.
44 | }
45 | \examples{
46 | text <- c("I saw Mr. Jones today.",
47 | "Split across\na line.",
48 | "What. Are. You. Doing????",
49 | "She asked 'do you really mean that?' and I said 'yes.'")
50 |
51 | # count the number of unique types
52 | text_ntype(text)
53 | text_ntype(text, collapse = TRUE)
54 |
55 | # get the type sets
56 | text_types(text)
57 | text_types(text, collapse = TRUE)
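58 |
59 | # a sketch: types depend on the filter, e.g. keeping case distinctions
60 | text_types("Rose is a rose", collapse = TRUE, map_case = FALSE)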
58 | }
59 |
--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
1 | PKG_CFLAGS = -Icorpus/src
2 | PKG_LIBS = -L. -lccorpus
3 |
4 | SNOWBALL = corpus/lib/libstemmer_c
5 | STEMMER_O = $(SNOWBALL)/src_c/stem_UTF_8_arabic.o \
6 | $(SNOWBALL)/src_c/stem_UTF_8_danish.o \
7 | $(SNOWBALL)/src_c/stem_UTF_8_dutch.o \
8 | $(SNOWBALL)/src_c/stem_UTF_8_english.o \
9 | $(SNOWBALL)/src_c/stem_UTF_8_finnish.o \
10 | $(SNOWBALL)/src_c/stem_UTF_8_french.o \
11 | $(SNOWBALL)/src_c/stem_UTF_8_german.o \
12 | $(SNOWBALL)/src_c/stem_UTF_8_hungarian.o \
13 | $(SNOWBALL)/src_c/stem_UTF_8_italian.o \
14 | $(SNOWBALL)/src_c/stem_UTF_8_norwegian.o \
15 | $(SNOWBALL)/src_c/stem_UTF_8_porter.o \
16 | $(SNOWBALL)/src_c/stem_UTF_8_portuguese.o \
17 | $(SNOWBALL)/src_c/stem_UTF_8_romanian.o \
18 | $(SNOWBALL)/src_c/stem_UTF_8_russian.o \
19 | $(SNOWBALL)/src_c/stem_UTF_8_spanish.o \
20 | $(SNOWBALL)/src_c/stem_UTF_8_swedish.o \
21 | $(SNOWBALL)/src_c/stem_UTF_8_tamil.o \
22 | $(SNOWBALL)/src_c/stem_UTF_8_turkish.o \
23 | $(SNOWBALL)/runtime/api.o \
24 | $(SNOWBALL)/runtime/utilities.o \
25 | $(SNOWBALL)/libstemmer/libstemmer_utf8.o
26 |
27 | UTF8LITE = corpus/lib/utf8lite
28 | UTF8LITE_O = $(UTF8LITE)/src/array.o \
29 | $(UTF8LITE)/src/char.o \
30 | $(UTF8LITE)/src/encode.o \
31 | $(UTF8LITE)/src/error.o \
32 | $(UTF8LITE)/src/escape.o \
33 | $(UTF8LITE)/src/graph.o \
34 | $(UTF8LITE)/src/graphscan.o \
35 | $(UTF8LITE)/src/normalize.o \
36 | $(UTF8LITE)/src/render.o \
37 | $(UTF8LITE)/src/text.o \
38 | $(UTF8LITE)/src/textassign.o \
39 | $(UTF8LITE)/src/textiter.o \
40 | $(UTF8LITE)/src/textmap.o
41 |
42 | LIBCORPUS = corpus/lib/strntod.o corpus/lib/strntoimax.o \
43 | corpus/src/array.o corpus/src/census.o corpus/src/data.o \
44 | corpus/src/datatype.o corpus/src/error.o corpus/src/filebuf.o \
45 | corpus/src/filter.o corpus/src/intset.o corpus/src/memory.o \
46 | corpus/src/ngram.o corpus/src/search.o corpus/src/sentfilter.o \
47 | corpus/src/sentscan.o corpus/src/stem.o corpus/src/stopword.o \
48 | corpus/src/symtab.o corpus/src/table.o corpus/src/termset.o \
49 | corpus/src/textset.o corpus/src/tree.o corpus/src/wordscan.o \
50 | $(STEMMER_O) $(UTF8LITE_O)
51 |
52 | $(SHLIB): libccorpus.a
53 |
54 | libccorpus.a: $(LIBCORPUS)
55 | $(AR) rcs $@ $(LIBCORPUS)
56 | $(RANLIB) $@
57 |
58 | clean:
59 | rm -f $(LIBCORPUS) $(SHLIB) $(OBJECTS) libccorpus.a
60 |
--------------------------------------------------------------------------------
/src/context.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2017 Patrick O. Perry.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include <stddef.h>
18 | #include <stdlib.h>
19 | #include <R.h>
20 | #include <Rinternals.h>
21 | #include "rcorpus.h"
22 |
23 | #define CONTEXT_TAG install("corpus::context")
24 |
25 |
26 | struct context {
27 | void *data;
28 | void (*destroy_func)(void *);
29 | };
30 |
31 |
32 | void free_context(SEXP x)
33 | {
34 | struct context *ctx = R_ExternalPtrAddr(x);
35 | R_SetExternalPtrAddr(x, NULL);
36 | if (ctx) {
37 | if (ctx->destroy_func) {
38 | (ctx->destroy_func)(ctx->data);
39 | }
40 | corpus_free(ctx->data);
41 | corpus_free(ctx);
42 | }
43 | }
44 |
45 |
46 | SEXP alloc_context(size_t size, void (*destroy_func)(void *))
47 | {
48 | SEXP ans;
49 | struct context *ctx = NULL;
50 | void *obj = NULL;
51 | int err = 0;
52 |
53 | PROTECT(ans = R_MakeExternalPtr(NULL, CONTEXT_TAG, R_NilValue));
54 | R_RegisterCFinalizerEx(ans, free_context, TRUE);
55 |
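56 | 	/* allocate the payload and its wrapper; on failure, control
57 | 	 * transfers to 'out', where both get freed */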
56 | TRY_ALLOC(obj = corpus_calloc(1, size == 0 ? 1 : size));
57 | TRY_ALLOC(ctx = corpus_calloc(1, sizeof(*ctx)));
58 |
59 | ctx->data = obj;
60 | ctx->destroy_func = destroy_func;
61 | R_SetExternalPtrAddr(ans, ctx);
62 | ctx = NULL;
63 | obj = NULL;
64 | out:
65 | corpus_free(ctx);
66 | corpus_free(obj);
67 | CHECK_ERROR(err);
68 | UNPROTECT(1);
69 | return ans;
70 | }
71 |
72 |
73 | int is_context(SEXP x)
74 | {
75 | return ((TYPEOF(x) == EXTPTRSXP)
76 | && (R_ExternalPtrTag(x) == CONTEXT_TAG));
77 | }
78 |
79 |
80 | void *as_context(SEXP x)
81 | {
82 | struct context *ctx;
83 |
84 | if (!is_context(x)) {
85 | error("invalid context object");
86 | }
87 |
88 | ctx = R_ExternalPtrAddr(x);
89 | return ctx->data;
90 | }
91 |
--------------------------------------------------------------------------------
/src/filebuf.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2017 Patrick O. Perry.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include <errno.h>
18 | #include <string.h>
19 | #include "corpus/src/memory.h"
20 | #include "corpus/src/filebuf.h"
21 | #include <Rinternals.h>
22 | #include "rcorpus.h"
23 |
24 | #define FILEBUF_TAG install("corpus::filebuf")
25 |
26 |
27 | static struct corpus_filebuf *filebuf_new(const char *filename)
28 | {
29 | struct corpus_filebuf *obj = NULL;
30 | struct corpus_filebuf buf;
31 |
32 | errno = 0;
33 |
34 | if (corpus_filebuf_init(&buf, filename) == 0) {
35 | if (!(obj = corpus_malloc(sizeof(*obj)))) {
36 | corpus_filebuf_destroy(&buf);
37 | error("failed allocating memory");
38 | }
39 | *obj = buf;
40 | } else {
41 | if (errno) {
42 | error("cannot open file '%s': %s",
43 | filename, strerror(errno));
44 | } else {
45 | error("cannot open file '%s'", filename);
46 | }
47 | }
48 |
49 | return obj;
50 | }
51 |
52 |
53 | static void filebuf_free(struct corpus_filebuf *buf)
54 | {
55 | if (buf) {
56 | corpus_filebuf_destroy(buf);
57 | corpus_free(buf);
58 | }
59 | }
60 |
61 |
62 | static void free_filebuf(SEXP sbuf)
63 | {
64 | struct corpus_filebuf *buf = R_ExternalPtrAddr(sbuf);
65 | R_SetExternalPtrAddr(sbuf, NULL);
66 | filebuf_free(buf);
67 | }
68 |
69 |
70 | SEXP alloc_filebuf(SEXP sfile)
71 | {
72 | SEXP ans, sclass, shandle, snames;
73 | struct corpus_filebuf *buf;
74 | const char *file;
75 |
76 | if (!(isString(sfile) && LENGTH(sfile) == 1)) {
77 | error("invalid 'file' argument");
78 | }
79 |
80 | file = R_ExpandFileName(CHAR(STRING_ELT(sfile, 0)));
81 |
82 | PROTECT(shandle = R_MakeExternalPtr(NULL, FILEBUF_TAG, R_NilValue));
83 | R_RegisterCFinalizerEx(shandle, free_filebuf, TRUE);
84 |
85 | buf = filebuf_new(file);
86 | R_SetExternalPtrAddr(shandle, buf);
87 |
88 | PROTECT(ans = allocVector(VECSXP, 2));
89 | SET_VECTOR_ELT(ans, 0, shandle);
90 | SET_VECTOR_ELT(ans, 1, sfile);
91 |
92 | PROTECT(snames = allocVector(STRSXP, 2));
93 | SET_STRING_ELT(snames, 0, mkChar("handle"));
94 | SET_STRING_ELT(snames, 1, mkChar("file"));
95 | setAttrib(ans, R_NamesSymbol, snames);
96 |
97 | PROTECT(sclass = allocVector(STRSXP, 1));
98 | SET_STRING_ELT(sclass, 0, mkChar("filebuf"));
99 | setAttrib(ans, R_ClassSymbol, sclass);
100 |
101 | UNPROTECT(4);
102 | return ans;
103 | }
104 |
105 |
106 | int is_filebuf(SEXP sbuf)
107 | {
108 | SEXP handle, file;
109 |
110 | if (!isVectorList(sbuf)) {
111 | return 0;
112 | }
113 |
114 | handle = getListElement(sbuf, "handle");
115 | if (handle == R_NilValue) {
116 | return 0;
117 | }
118 |
119 | file = getListElement(sbuf, "file");
120 | if (file == R_NilValue) {
121 | return 0;
122 | }
123 |
124 | return ((TYPEOF(handle) == EXTPTRSXP)
125 | && (R_ExternalPtrTag(handle) == FILEBUF_TAG));
126 | }
127 |
128 |
129 | struct corpus_filebuf *as_filebuf(SEXP sbuf)
130 | {
131 | SEXP shandle, sfile;
132 | struct corpus_filebuf *buf;
133 | const char *file;
134 |
135 | if (!is_filebuf(sbuf)) {
136 | error("invalid 'filebuf' object");
137 | }
138 |
139 | shandle = getListElement(sbuf, "handle");
140 | buf = R_ExternalPtrAddr(shandle);
141 |
142 | if (buf == NULL) {
143 | R_RegisterCFinalizerEx(shandle, free_filebuf, TRUE);
144 |
145 | sfile = getListElement(sbuf, "file");
146 | file = R_ExpandFileName(CHAR(STRING_ELT(sfile, 0)));
147 | buf = filebuf_new(file);
148 |
149 | if (buf == NULL) {
150 | if (errno) {
151 | error("cannot open file '%s': %s", file,
152 | strerror(errno));
153 | } else {
154 | error("cannot open file '%s'", file);
155 | }
156 | }
157 |
158 | R_SetExternalPtrAddr(shandle, buf);
159 | }
160 |
161 | return buf;
162 | }
163 |
--------------------------------------------------------------------------------
/src/init.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2017 Patrick O. Perry.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include <stddef.h>
18 | #include <R.h>
19 | #include <Rinternals.h>
20 | #include <R_ext/Rdynload.h>
21 | #include "rcorpus.h"
22 |
23 | #define CALLDEF(name, n) {#name, (DL_FUNC) &name, n}
24 |
25 | static const R_CallMethodDef CallEntries[] = {
26 | CALLDEF(abbreviations, 1),
27 | CALLDEF(alloc_text_handle, 0),
28 | CALLDEF(anyNA_text, 1),
29 | CALLDEF(as_character_json, 1),
30 | CALLDEF(as_character_text, 1),
31 | CALLDEF(as_integer_json, 1),
32 | CALLDEF(as_double_json, 1),
33 | CALLDEF(as_list_json, 1),
34 | CALLDEF(as_logical_json, 1),
35 | CALLDEF(as_text_character, 2),
36 | CALLDEF(as_text_filter_connector, 1),
37 | CALLDEF(as_text_json, 2),
38 | CALLDEF(dim_json, 1),
39 | CALLDEF(is_na_text, 1),
40 | CALLDEF(length_json, 1),
41 | CALLDEF(length_text, 1),
42 | CALLDEF(logging_off, 0),
43 | CALLDEF(logging_on, 0),
44 | CALLDEF(mmap_ndjson, 2),
45 | CALLDEF(names_json, 1),
46 | CALLDEF(names_text, 1),
47 | CALLDEF(print_json, 1),
48 | CALLDEF(read_ndjson, 2),
49 | CALLDEF(simplify_json, 1),
50 | CALLDEF(stem_snowball, 2),
51 | CALLDEF(stopwords, 1),
52 | CALLDEF(subscript_json, 2),
53 | CALLDEF(subset_json, 3),
54 | CALLDEF(term_stats, 7),
55 | CALLDEF(term_matrix, 4),
56 | CALLDEF(text_c, 3),
57 | CALLDEF(text_count, 2),
58 | CALLDEF(text_detect, 2),
59 | CALLDEF(text_locate, 2),
60 | CALLDEF(text_match, 2),
61 | CALLDEF(text_nsentence, 1),
62 | CALLDEF(text_ntoken, 1),
63 | CALLDEF(text_ntype, 2),
64 | CALLDEF(text_split_sentences, 2),
65 | CALLDEF(text_split_tokens, 2),
66 | CALLDEF(text_sub, 3),
67 | CALLDEF(text_trunc, 3),
68 | CALLDEF(text_tokens, 1),
69 | CALLDEF(text_types, 2),
70 | CALLDEF(text_valid, 1),
71 | {NULL, NULL, 0}
72 | };
73 |
74 |
75 | void R_init_corpus(DllInfo *dll)
76 | {
77 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
78 | R_useDynamicSymbols(dll, FALSE);
79 | R_forceSymbols(dll, TRUE);
80 | }
81 |
--------------------------------------------------------------------------------
/src/logging.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2017 Patrick O. Perry.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include "rcorpus.h"
18 | #include "corpus/src/error.h"
19 |
20 |
21 | static void ignore_message(int code, const char *message)
22 | {
23 | (void)code;
24 | (void)message;
25 | }
26 |
27 |
28 | SEXP logging_off(void)
29 | {
30 | corpus_log_func = ignore_message;
31 | return R_NilValue;
32 | }
33 |
34 |
35 | SEXP logging_on(void)
36 | {
37 | corpus_log_func = NULL;
38 | return R_NilValue;
39 | }
40 |
--------------------------------------------------------------------------------
/src/mkchar.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2017 Patrick O. Perry.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include <limits.h>
18 | #include "rcorpus.h"
19 |
20 |
21 | static void mkchar_ensure(struct mkchar *mk, int nmin);
22 |
23 |
24 | void mkchar_init(struct mkchar *mk)
25 | {
26 | mk->buf = NULL;
27 | mk->size = 0;
28 | }
29 |
30 |
31 | SEXP mkchar_get(struct mkchar *mk, const struct utf8lite_text *text)
32 | {
33 | SEXP ans;
34 | uint8_t *ptr;
35 | size_t len = UTF8LITE_TEXT_SIZE(text);
36 | struct utf8lite_text_iter it;
37 |
38 | if (len > INT_MAX) {
39 | error("character string length exceeds maximum (%d)", INT_MAX);
40 | }
41 |
42 | if (text->ptr == NULL) {
43 | ans = NA_STRING;
44 | } else {
45 | if (UTF8LITE_TEXT_HAS_ESC(text)) {
46 | mkchar_ensure(mk, (int)len);
47 |
48 | utf8lite_text_iter_make(&it, text);
49 | ptr = mk->buf;
50 | while (utf8lite_text_iter_advance(&it)) {
51 | utf8lite_encode_utf8(it.current, &ptr);
52 | }
53 | len = (size_t)(ptr - mk->buf);
54 | ptr = mk->buf;
55 | } else {
56 | ptr = (uint8_t *)text->ptr;
57 | }
58 |
59 | ans = mkCharLenCE((char *)ptr, (int)len, CE_UTF8);
60 | }
61 |
62 | return ans;
63 | }
64 |
65 |
66 | static void mkchar_ensure(struct mkchar *mk, int nmin)
67 | {
68 | int size = mk->size;
69 |
70 | if (nmin <= size) {
71 | return;
72 | }
73 |
74 | corpus_array_size_add(&size, 1, 0, nmin); // can't overflow
75 | mk->buf = (void *)R_alloc(size, sizeof(uint8_t));
76 | mk->size = size;
77 | }
78 |
--------------------------------------------------------------------------------
/src/ndjson.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2017 Patrick O. Perry.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include <assert.h>
18 | #include "rcorpus.h"
19 |
20 |
21 | SEXP mmap_ndjson(SEXP sfile, SEXP stext)
22 | {
23 | SEXP ans, sbuf;
24 |
25 | PROTECT(sbuf = alloc_filebuf(sfile));
26 | PROTECT(ans = alloc_json(sbuf, R_NilValue, R_NilValue, stext));
27 | as_json(ans); // force data load
28 | UNPROTECT(2);
29 |
30 | return ans;
31 | }
32 |
33 |
34 | SEXP read_ndjson(SEXP sbuffer, SEXP stext)
35 | {
36 | SEXP ans;
37 |
38 | assert(TYPEOF(sbuffer) == RAWSXP);
39 |
40 | PROTECT(ans = alloc_json(sbuffer, R_NilValue, R_NilValue, stext));
41 | as_json(ans); // force data load
42 | UNPROTECT(1);
43 |
44 | return ans;
45 | }
46 |
--------------------------------------------------------------------------------
/src/search.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2017 Patrick O. Perry.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include <Rinternals.h>
18 | #include "rcorpus.h"
19 |
20 | #define SEARCH_TAG install("corpus::search")
21 |
22 |
23 | static struct corpus_search *search_new(void);
24 |
25 | struct corpus_search *search_new(void)
26 | {
27 | struct corpus_search *obj;
28 | int err;
29 |
30 | TRY_ALLOC(obj = corpus_calloc(1, sizeof(*obj)));
31 | TRY(corpus_search_init(obj));
32 |
33 | err = 0;
34 | out:
35 | if (err) {
36 | corpus_free(obj);
37 | Rf_error("memory allocation failure");
38 | }
39 |
40 | return obj;
41 | }
42 |
43 |
44 | void corpus_search_free(struct corpus_search *obj)
45 | {
46 | if (!obj) {
47 | return;
48 | }
49 |
50 | corpus_search_destroy(obj);
51 | corpus_free(obj);
52 | }
53 |
54 |
55 | static void free_search(SEXP obj)
56 | {
57 | struct corpus_search *search = R_ExternalPtrAddr(obj);
58 | corpus_search_free(search);
59 | R_ClearExternalPtr(obj);
60 | }
61 |
62 |
63 | int is_search(SEXP ssearch)
64 | {
65 | return ((TYPEOF(ssearch) == EXTPTRSXP)
66 | && (R_ExternalPtrTag(ssearch) == SEARCH_TAG));
67 | }
68 |
69 |
70 | struct corpus_search *as_search(SEXP ssearch)
71 | {
72 | if (!is_search(ssearch)) {
73 | Rf_error("invalid 'search' object");
74 | }
75 | return R_ExternalPtrAddr(ssearch);
76 | }
77 |
78 |
79 | SEXP alloc_search(SEXP sterms, const char *name, struct corpus_filter *filter)
80 | {
81 | SEXP ans, sset, items;
82 | const struct corpus_termset_term *term;
83 | struct corpus_search *obj;
84 | struct termset *termset;
85 | int i, n;
86 | int err = 0, nprot;
87 |
88 | nprot = 0;
89 |
90 | obj = search_new();
91 | PROTECT(ans = R_MakeExternalPtr(obj, SEARCH_TAG, R_NilValue)); nprot++;
92 | R_RegisterCFinalizerEx(ans, free_search, TRUE);
93 |
94 | PROTECT(sset = alloc_termset(sterms, name, filter, 1)); nprot++;
95 | termset = as_termset(sset);
96 | items = items_termset(sset);
97 | R_SetExternalPtrProtected(ans, items);
98 |
99 | n = termset->nitem;
100 | for (i = 0; i < n; i++) {
101 | RCORPUS_CHECK_INTERRUPT(i);
102 | term = &termset->set.items[i];
103 | TRY(corpus_search_add(obj, term->type_ids,
104 | term->length, NULL));
105 | }
106 |
107 | out:
108 | CHECK_ERROR(err);
109 | UNPROTECT(nprot);
110 | return ans;
111 | }
112 |
113 |
114 | SEXP items_search(SEXP ssearch)
115 | {
116 | return R_ExternalPtrProtected(ssearch);
117 | }
118 |
--------------------------------------------------------------------------------
/src/text_methods.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2017 Patrick O. Perry.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include <stddef.h>
18 | #include <stdint.h>
19 | #include <string.h>
20 | #include <R.h>
21 | #include <Rinternals.h>
22 | #include "rcorpus.h"
23 |
24 |
25 | SEXP names_text(SEXP text)
26 | {
27 | if (!is_text(text)) {
28 | error("invalid text object");
29 | }
30 | return getListElement(text, "names");
31 | }
32 |
33 |
34 | SEXP filter_text(SEXP text)
35 | {
36 | if (!is_text(text)) {
37 | error("invalid text object");
38 | }
39 | return getListElement(text, "filter");
40 | }
41 |
42 |
43 | SEXP length_text(SEXP stext)
44 | {
45 | R_xlen_t len;
46 | as_text(stext, &len);
47 | return ScalarReal((double)len);
48 | }
49 |
50 |
51 | SEXP is_na_text(SEXP stext)
52 | {
53 | SEXP ans;
54 | struct utf8lite_text *text;
55 | R_xlen_t i, n;
56 | int *isna;
57 |
58 | text = as_text(stext, &n);
59 | PROTECT(ans = allocVector(LGLSXP, n));
60 | isna = LOGICAL(ans);
61 |
62 | for (i = 0; i < n; i++) {
63 | RCORPUS_CHECK_INTERRUPT(i);
64 |
65 | if (text[i].ptr) {
66 | isna[i] = FALSE;
67 | } else {
68 | isna[i] = TRUE;
69 | }
70 | }
71 |
72 | UNPROTECT(1);
73 | return ans;
74 | }
75 |
76 |
77 | SEXP anyNA_text(SEXP stext)
78 | {
79 | struct utf8lite_text *text;
80 | R_xlen_t i, n;
81 | int anyNA;
82 |
83 | text = as_text(stext, &n);
84 |
85 | anyNA = FALSE;
86 | for (i = 0; i < n; i++) {
87 | RCORPUS_CHECK_INTERRUPT(i);
88 |
89 | if (!text[i].ptr) {
90 | anyNA = TRUE;
91 | break;
92 | }
93 | }
94 |
95 | return ScalarLogical(anyNA);
96 | }
97 |
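
The two predicates above implement the `is.na` and `anyNA` generics for text objects: an element is missing exactly when its underlying data pointer is NULL, so an empty string is not NA, and `anyNA_text` stops scanning at the first missing element. The R-level semantics, mirroring tests/testthat/test-text_primitive.R below:

    library(corpus)
    x <- as_corpus_text(c("a", NA, "", "b"))
    is.na(x)   # FALSE TRUE FALSE FALSE -- the empty string is not missing
    anyNA(x)   # TRUE
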
--------------------------------------------------------------------------------
/src/text_nunit.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2017 Patrick O. Perry.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include <stddef.h>
18 | #include <stdint.h>
19 | #include <Rdefines.h>
20 | #include "rcorpus.h"
21 |
22 |
23 | SEXP text_nsentence(SEXP sx)
24 | {
25 | SEXP ans, names;
26 | struct corpus_sentfilter *filter;
27 | const struct utf8lite_text *text;
28 | double *count;
29 | R_xlen_t i, n, nunit;
30 | int nprot, err = 0;
31 |
32 | nprot = 0;
33 |
34 | // x
35 | PROTECT(sx = coerce_text(sx)); nprot++;
36 | text = as_text(sx, &n);
37 | filter = text_sentfilter(sx);
38 | names = names_text(sx);
39 |
40 | PROTECT(ans = allocVector(REALSXP, n)); nprot++;
41 | setAttrib(ans, R_NamesSymbol, names);
42 | count = REAL(ans);
43 |
44 | for (i = 0; i < n; i++) {
45 | RCORPUS_CHECK_INTERRUPT(i);
46 |
47 | if (!text[i].ptr) { // missing value
48 | count[i] = NA_REAL;
49 | continue;
50 | }
51 |
52 | if (UTF8LITE_TEXT_SIZE(&text[i]) == 0) { // empty text
53 | count[i] = 0;
54 | continue;
55 | }
56 |
57 | TRY(corpus_sentfilter_start(filter, &text[i]));
58 |
59 | nunit = 0;
60 | while (corpus_sentfilter_advance(filter)) {
61 | nunit++;
62 | }
63 | TRY(filter->error);
64 |
65 | count[i] = (double)nunit;
66 | }
67 |
68 | out:
69 | CHECK_ERROR(err);
70 | UNPROTECT(nprot);
71 | return ans;
72 | }
73 |
74 |
75 | SEXP text_ntoken(SEXP sx)
76 | {
77 | SEXP ans, names;
78 | struct corpus_filter *filter;
79 | const struct utf8lite_text *text;
80 | double *count;
81 | R_xlen_t i, n, nunit;
82 | int nprot, err = 0;
83 |
84 | nprot = 0;
85 |
86 | PROTECT(sx = coerce_text(sx)); nprot++;
87 | text = as_text(sx, &n);
88 | names = names_text(sx);
89 | filter = text_filter(sx);
90 |
91 | PROTECT(ans = allocVector(REALSXP, n)); nprot++;
92 | setAttrib(ans, R_NamesSymbol, names);
93 | count = REAL(ans);
94 |
95 | for (i = 0; i < n; i++) {
96 | RCORPUS_CHECK_INTERRUPT(i);
97 |
98 | if (!text[i].ptr) { // missing text
99 | count[i] = NA_REAL;
100 | continue;
101 | }
102 |
103 | TRY(corpus_filter_start(filter, &text[i]));
104 |
105 | nunit = 0;
106 |
107 | while (corpus_filter_advance(filter)) {
108 | if (filter->type_id < 0) {
109 | continue;
110 | }
111 | nunit++;
112 | }
113 | TRY(filter->error);
114 |
115 | count[i] = (double)nunit;
116 | }
117 |
118 | out:
119 | UNPROTECT(nprot);
120 | CHECK_ERROR(err);
121 | return ans;
122 | }
123 |
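
Both counters return doubles rather than integers so counts are not capped at INT_MAX, and both map a missing text to NA; `text_ntoken` additionally skips dropped token types (negative type ids). Expected behavior, consistent with tests/testthat/test-text_nunit.R below:

    library(corpus)
    text_ntoken(c(NA, "", "one two three"))   # NA 0 3
    text_nsentence(c(NA, "", "Hi. Bye."))     # NA 0 2
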
--------------------------------------------------------------------------------
/src/text_sub.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2017 Patrick O. Perry.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include "rcorpus.h"
18 |
19 |
20 | static int text_len(const struct utf8lite_text *text,
21 | struct corpus_filter *filter)
22 | {
23 | int err = 0, len = 0;
24 |
25 | TRY(corpus_filter_start(filter, text));
26 | while (corpus_filter_advance(filter)) {
27 | if (filter->type_id == CORPUS_TYPE_NONE) {
28 | continue;
29 | }
30 | len++;
31 | }
32 | TRY(filter->error);
33 | out:
34 | CHECK_ERROR(err);
35 | return len;
36 | }
37 |
38 |
39 | SEXP text_sub(SEXP sx, SEXP sstart, SEXP send)
40 | {
41 | SEXP ans, sources, table, tsource, trow, tstart, tstop, names, sfilter;
42 | const struct utf8lite_text *text;
43 | const uint8_t *base, *ptr;
44 | struct corpus_filter *filter;
45 | const int *start, *end;
46 | R_xlen_t i, n, nstart, nend;
47 | int err = 0, nprot = 0, s, e, j, m;
48 |
49 | text = as_text(sx, &n);
50 | filter = text_filter(sx);
51 | sources = getListElement(sx, "sources");
52 | table = getListElement(sx, "table");
53 | tsource = getListElement(table, "source");
54 | trow = getListElement(table, "row");
55 | tstart = getListElement(table, "start");
56 | tstop = getListElement(table, "stop");
57 | names = names_text(sx);
58 | sfilter = filter_text(sx);
59 |
60 | PROTECT(tstart = duplicate(tstart)); nprot++;
61 | PROTECT(tstop = duplicate(tstop)); nprot++;
62 |
63 | start = INTEGER(sstart);
64 | nstart = XLENGTH(sstart);
65 |
66 | end = INTEGER(send);
67 | nend = XLENGTH(send);
68 |
69 | for (i = 0; i < n; i++) {
70 | RCORPUS_CHECK_INTERRUPT(i);
71 |
72 | s = start[i % nstart];
73 | e = end[i % nend];
74 |
75 | // handle missing text, missing endpoints
76 | if (!text[i].ptr || s == NA_INTEGER || e == NA_INTEGER) {
77 | INTEGER(tstart)[i] = NA_INTEGER;
78 | INTEGER(tstop)[i] = NA_INTEGER;
79 | continue;
80 | }
81 |
82 | // convert negative indices to non-negative,
83 | // except for end = -1
84 | if (s < 0 || e < -1) {
85 | m = text_len(&text[i], filter);
86 |
87 | if (s < 0) {
88 | s = s + m + 1;
89 | if (s < 0) {
90 | s = 0;
91 | }
92 | }
93 |
94 | if (e < -1) {
95 | e = e + m + 1;
96 | if (e < 0) {
97 | e = 0;
98 | }
99 | }
100 | }
101 |
102 | // clip start to [1,Inf)
103 | if (s == 0) {
104 | s = 1;
105 | }
106 |
107 | base = text[i].ptr - (INTEGER(tstart)[i] - 1);
108 |
109 | // find start
110 | j = 0;
111 | TRY(corpus_filter_start(filter, &text[i]));
112 | while (j != s && corpus_filter_advance(filter)) {
113 | if (filter->type_id == CORPUS_TYPE_NONE) {
114 | // skip ignored
115 | continue;
116 | }
117 | j++;
118 | }
119 | TRY(filter->error);
120 |
121 | // handle case when start is after end of text
122 | if (j < s) {
123 | INTEGER(tstart)[i] = INTEGER(tstop)[i] + 1;
124 | continue;
125 | }
126 |
127 | // set subsequence start
128 | ptr = filter->current.ptr;
129 | INTEGER(tstart)[i] = (int)(ptr - base) + 1;
130 |
131 | // handle case when end is the last token
132 | if (e == -1) {
133 | continue;
134 | }
135 |
136 | // find end
137 | while (j != e + 1 && corpus_filter_advance(filter)) {
138 | if (filter->type_id == CORPUS_TYPE_NONE) {
139 | // skip ignored
140 | continue;
141 | }
142 | j++;
143 | }
144 | TRY(filter->error);
145 |
146 | // handle case when end is after end of text
147 | if (j < e + 1) {
148 | continue;
149 | }
150 |
151 | // set subsequence end
152 | ptr = filter->current.ptr;
153 | INTEGER(tstop)[i] = (int)(ptr - base);
154 | }
155 |
156 | PROTECT(ans = alloc_text(sources, tsource, trow, tstart, tstop,
157 | names, sfilter));
158 | nprot++;
159 |
160 | out:
161 | UNPROTECT(nprot);
162 | CHECK_ERROR(err);
163 | return ans;
164 | }
165 |
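
`text_sub` never copies text: it duplicates only the `start`/`stop` columns of the source table and rewrites them to the byte offsets of the requested token range, so the result is a view into the original sources. Negative indices count from the end; a start or end `k < -1` becomes `k + m + 1` where `m` is the token count, and `end = -1` is special-cased to mean "through the last token" so the common case avoids the extra `text_len` pass. A usage sketch, assuming the exported `text_sub(x, start, end)` wrapper dispatches to this routine:

    library(corpus)
    x <- "one two three four"
    text_sub(x, 2, 3)     # tokens 2..3: "two three"
    text_sub(x, -2, -1)   # start -2 -> -2 + 4 + 1 = 3; end -1 = last token: "three four"
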
--------------------------------------------------------------------------------
/src/text_trunc.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2017 Patrick O. Perry.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include "rcorpus.h"
18 |
19 | static SEXP trunc_left(struct mkchar *mk, const struct utf8lite_text *text,
20 | int chars)
21 | {
22 | struct utf8lite_graphscan scan;
23 | struct utf8lite_text sub;
24 | int err = 0, width = 0, w;
25 |
26 | sub.ptr = text->ptr;
27 | sub.attr = UTF8LITE_TEXT_BITS(text);
28 |
29 | utf8lite_graphscan_make(&scan, text);
30 | while (utf8lite_graphscan_advance(&scan)) {
31 | TRY(utf8lite_graph_measure(&scan.current, 0, &w));
32 | if (w > 0) {
33 | if (width > chars - w) {
34 | break;
35 | }
36 | width += w;
37 | }
38 | }
39 | sub.attr |= (size_t)(scan.ptr - text->ptr);
40 | out:
41 | CHECK_ERROR(err);
42 | return mkchar_get(mk, &sub);
43 | }
44 |
45 |
46 | static SEXP trunc_right(struct mkchar *mk, const struct utf8lite_text *text,
47 | int chars)
48 | {
49 | struct utf8lite_graphscan scan;
50 | struct utf8lite_text sub;
51 | const uint8_t *end;
52 | int err = 0, width = 0, w;
53 |
54 | sub.ptr = NULL;
55 | sub.attr = UTF8LITE_TEXT_BITS(text);
56 | end = text->ptr + UTF8LITE_TEXT_SIZE(text);
57 |
58 | utf8lite_graphscan_make(&scan, text);
59 | utf8lite_graphscan_skip(&scan);
60 | while (utf8lite_graphscan_retreat(&scan)) {
61 | TRY(utf8lite_graph_measure(&scan.current, 0, &w));
62 | if (w > 0) {
63 | if (width > chars - w) {
64 | break;
65 | }
66 | width += w;
67 | }
68 | }
69 | utf8lite_graphscan_retreat(&scan);
70 | sub.ptr = (uint8_t *)scan.ptr;
71 | sub.attr |= (size_t)(end - sub.ptr);
72 | out:
73 | CHECK_ERROR(err);
74 | return mkchar_get(mk, &sub);
75 | }
76 |
77 |
78 | SEXP text_trunc(SEXP sx, SEXP schars, SEXP sright)
79 | {
80 | SEXP ans, names, elt;
81 | struct mkchar mk;
82 | const struct utf8lite_text *text;
83 | R_xlen_t i, n;
84 | int nprot = 0, chars, right;
85 |
86 | text = as_text(sx, &n);
87 | chars = INTEGER(schars)[0];
88 | right = LOGICAL(sright)[0] == TRUE;
89 | mkchar_init(&mk);
90 |
91 | PROTECT(ans = allocVector(STRSXP, n)); nprot++;
92 | PROTECT(names = names_text(sx)); nprot++;
93 | setAttrib(ans, R_NamesSymbol, names);
94 |
95 | for (i = 0; i < n; i++) {
96 | RCORPUS_CHECK_INTERRUPT(i);
97 |
98 | if (!text[i].ptr) {
99 | elt = NA_STRING;
100 | } else if (right) {
101 | elt = trunc_right(&mk, &text[i], chars);
102 | } else {
103 | elt = trunc_left(&mk, &text[i], chars);
104 | }
105 | SET_STRING_ELT(ans, i, elt);
106 | }
107 |
108 | UNPROTECT(nprot);
109 | return ans;
110 | }
111 |
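
Both truncation helpers advance over grapheme clusters with utf8lite's scanner and accumulate terminal display width, so a truncation boundary can never split a multi-byte character, and double-width (for example CJK) characters count as two columns. A plain-R sketch of the `trunc_left` loop, approximating grapheme clusters by code points via base R's width-aware `nchar`:

    # keep leading characters while the running display width fits the budget
    trunc_left <- function(s, chars) {
      g <- strsplit(s, "")[[1]]                          # code points ~ graphemes
      keep <- cumsum(nchar(g, type = "width")) <= chars
      paste(g[keep], collapse = "")
    }
    trunc_left("hello, world", 5)   # "hello"
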
--------------------------------------------------------------------------------
/src/util.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2017 Patrick O. Perry.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include <inttypes.h>
18 | #include <stdint.h>
19 | #include <string.h>
20 | #include <Rdefines.h>
21 | #include "rcorpus.h"
22 |
23 |
24 | /* based on R-Exts Section 5.9.6 "handling lists" */
25 | int findListElement(SEXP list, const char *str)
26 | {
27 | SEXP names;
28 | int i, n;
29 | int nprot = 0;
30 | int ans = -1;
31 |
32 | if (list == R_NilValue) {
33 | goto out;
34 | }
35 |
36 | PROTECT(names = getAttrib(list, R_NamesSymbol)); nprot++;
37 | if (names == R_NilValue) {
38 | goto out;
39 | }
40 |
41 | n = LENGTH(list);
42 | for (i = 0; i < n; i++) {
43 | if(strcmp(CHAR(STRING_ELT(names, i)), str) == 0) {
44 | ans = i;
45 | goto out;
46 | }
47 | }
48 | out:
49 | UNPROTECT(nprot);
50 | return ans;
51 | }
52 |
53 |
54 | SEXP getListElement(SEXP list, const char *str)
55 | {
56 | int i = findListElement(list, str);
57 | if (i < 0) {
58 | return R_NilValue;
59 | }
60 | return VECTOR_ELT(list, i);
61 | }
62 |
63 |
64 | double *as_weights(SEXP sweights, R_xlen_t n)
65 | {
66 | R_xlen_t n0;
67 |
68 | if (sweights == R_NilValue) {
69 | return NULL;
70 | }
71 |
72 | n0 = XLENGTH(sweights);
73 | if (n0 != n) {
74 | error("invalid 'weights' vector;"
75 | " length is %"PRIu64" but should be %"PRIu64,
76 | (uint64_t)n0, (uint64_t)n);
77 | }
78 |
79 | return REAL(sweights);
80 | }
81 |
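
`findListElement` is the linear name lookup from the cited R-Exts example, and `getListElement` maps a missing name to `R_NilValue` (NULL at the R level); `as_weights` accepts NULL as "no weights" and otherwise insists on an exact length match. An R transliteration of the lookup, for reference:

    # R equivalent of getListElement(): NULL when the name is absent
    get_list_element <- function(list, str) {
      i <- match(str, names(list))
      if (is.na(i)) NULL else list[[i]]
    }
    get_list_element(list(a = 1, b = 2), "b")   # 2
    get_list_element(list(a = 1, b = 2), "z")   # NULL
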
--------------------------------------------------------------------------------
/src/wordlist.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2017 Patrick O. Perry.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include "rcorpus.h"
18 |
19 |
20 | static SEXP wordlist(const uint8_t **(*callback)(const char *, int *),
21 | SEXP skind)
22 | {
23 | SEXP ans;
24 | const char **strs;
25 | const char *kind;
26 | int i, n;
27 |
28 | if (skind == R_NilValue) {
29 | return R_NilValue;
30 | }
31 |
32 | PROTECT(skind = coerceVector(skind, STRSXP));
33 | if (STRING_ELT(skind, 0) == NA_STRING) {
34 | UNPROTECT(1);
35 | return R_NilValue;
36 | }
37 |
38 | // assume utf8 encoding
39 | kind = CHAR(STRING_ELT(skind, 0));
40 | strs = (const char **)callback(kind, &n);
41 |
42 | if (!strs) {
43 | error("unknown kind (\"%s\")", kind);
44 | }
45 |
46 | PROTECT(ans = allocVector(STRSXP, n));
47 | for (i = 0; i < n; i++) {
48 | RCORPUS_CHECK_INTERRUPT(i);
49 | SET_STRING_ELT(ans, i, mkCharCE(strs[i], CE_UTF8));
50 | }
51 |
52 | UNPROTECT(2);
53 | return ans;
54 | }
55 |
56 |
57 | SEXP abbreviations(SEXP skind)
58 | {
59 | return wordlist(corpus_sentsuppress_list, skind);
60 | }
61 |
62 |
63 | SEXP stopwords(SEXP skind)
64 | {
65 | return wordlist(corpus_stopword_list, skind);
66 | }
67 |
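
Both entry points funnel through the static `wordlist` helper, which returns NULL for a NULL or NA kind, raises an error for an unrecognized kind, and otherwise converts the corpus library's C string array into a UTF-8 character vector. The package also exposes the common lists as precomputed data objects, which the wordlist tests below rely on:

    library(corpus)
    "the" %in% stopwords_en        # TRUE
    "Mr." %in% abbreviations_en    # TRUE
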
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(corpus)
3 |
4 | test_check("corpus")
5 |
--------------------------------------------------------------------------------
/tests/testthat/helper-capture_output.R:
--------------------------------------------------------------------------------
1 |
2 |
3 | if (utils::packageVersion("testthat") <= "1.0.2") {
4 | capture_output <- function(code, print = FALSE, width = 80) {
5 | oldwidth <- getOption("width")
6 | if (width != oldwidth) {
7 | options(width = width)
8 | on.exit(options(width = oldwidth), add = TRUE)
9 | }
10 | testthat::capture_output(code, print)
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/tests/testthat/helper-locale.R:
--------------------------------------------------------------------------------
1 | switch_ctype <- function(mode = c("C", "UTF-8"))
2 | {
3 | mode <- match.arg(mode)
4 |
5 | if (mode == "UTF-8") {
6 | sysname <- Sys.info()[["sysname"]]
7 | if (sysname == "Windows") {
8 | ctype <- "English_United States.1252"
9 | } else if (sysname == "Darwin") {
10 | ctype <- "UTF-8"
11 | } else {
12 | ctype <- "en_US.utf8"
13 | }
14 | } else {
15 | ctype <- "C"
16 | }
17 |
18 | ctype0 <- Sys.getlocale("LC_CTYPE")
19 | suppressWarnings({
20 | Sys.setlocale("LC_CTYPE", ctype)
21 | })
22 | if (Sys.getlocale("LC_CTYPE") != ctype) {
23 | skip(paste0("Cannot change locale to '", ctype, "'"))
24 | }
25 | if (mode == "UTF-8" && !utf8::output_utf8()) {
26 | skip("Cannot change to UTF-8 output")
27 | }
28 |
29 | ctype0
30 | }
31 |
--------------------------------------------------------------------------------
/tests/testthat/helper-options.R:
--------------------------------------------------------------------------------
1 |
2 | options(encoding = "UTF-8")
3 |
--------------------------------------------------------------------------------
/tests/testthat/test-frame-stats.R:
--------------------------------------------------------------------------------
1 | context("frame-stats")
2 |
3 |
4 | test_that("'na.fail' works", {
5 | data <- corpus_frame(x = 26:1, text = letters)
6 | expect_equal(na.fail(data), data)
7 |
8 | data <- corpus_frame(x = c(NA, 26:2), text = letters)
9 | expect_error(na.fail(data), "missing values in object")
10 |
11 | data <- corpus_frame(x = 1:26, text = c(NA, letters[-1]))
12 | expect_error(na.fail(data), "missing values in object")
13 | })
14 |
15 |
16 | test_that("'na.omit' works", {
17 | data <- corpus_frame(text = c(NA, "a", "b", NA, "c"), x = 2:6)
18 | actual <- na.omit(data)
19 | expected <- corpus_frame(text = c("a", "b", "c"), x = c(3, 4, 6),
20 | row.names = c(2L, 3L, 5L))
21 | omit <- c("1" = 1L, "4" = 4L)
22 | attr(omit, "class") <- "omit"
23 | attr(expected, "na.action") <- omit
24 | expect_equal(actual, expected)
25 |
26 | expect_equal(na.omit(corpus_frame(x = 1:26, text = letters)),
27 | corpus_frame(x = 1:26, text = letters))
28 | })
29 |
30 |
31 | test_that("'na.exclude' works", {
32 | data <- corpus_frame(text = letters[1:5], x = c(4, 3, NA, 1, -7),
33 | row.names = c("A", "B", "C", "D", "E"))
34 | actual <- na.exclude(data)
35 | expected <- corpus_frame(text = c("a", "b", "d", "e"),
36 | x = c(4, 3, 1, -7),
37 | row.names = c("A", "B", "D", "E"))
38 | exclude <- c("C" = 3L)
39 | attr(exclude, "class") <- "exclude"
40 | attr(expected, "na.action") <- exclude
41 | expect_equal(actual, expected)
42 |
43 | expect_equal(na.exclude(corpus_frame(x = 1:26, text = letters)),
44 | corpus_frame(x = 1:26, text = letters))
45 | })
46 |
--------------------------------------------------------------------------------
/tests/testthat/test-gutenberg_corpus.R:
--------------------------------------------------------------------------------
1 | context("gutenberg_corpus")
2 |
3 | test_that("'gutenberg_corpus' can download Jules Verne in French", {
4 | if (!identical(Sys.getenv("TEST_WEB_RESOURCES"), "true")) {
5 | skip("Not running web resource tests")
6 | }
7 |
8 | data <- gutenberg_corpus(800, verbose = FALSE)
9 | expect_equal(data$title, "Le Tour du Monde en 80 Jours")
10 | expect_equal(data$author, "Jules Verne")
11 | expect_equal(data$language, "French")
12 | expect_equal(nchar(as.character(data$text)), 421335)
13 | })
14 |
15 |
16 | test_that("'gutenberg_corpus' can handle NA", {
17 | data <- gutenberg_corpus(NA)
18 | expect_equal(data, corpus_frame(title = NA_character_,
19 | author = NA_character_,
20 | language = NA_character_,
21 | text = NA_character_))
22 | })
23 |
--------------------------------------------------------------------------------
/tests/testthat/test-json_serialize.R:
--------------------------------------------------------------------------------
1 | context("json_serialize")
2 |
3 |
4 | test_that("serializing json works", {
5 | x <- c("S", "P", "Q", "R")
6 | file <- tempfile()
7 | writeLines(paste0('"', x, '"'), file)
8 | ds <- read_ndjson(file, simplify=FALSE)
9 |
10 | file2 <- tempfile()
11 | saveRDS(ds, file2)
12 | ds2 <- readRDS(file2)
13 |
14 | expect_equal(as.character(ds), as.character(ds2))
15 | })
16 |
17 |
18 | test_that("serializing mmapped json works", {
19 | x <- c("S", "P", "Q", "R")
20 | file <- tempfile()
21 | writeLines(paste0('"', x, '"'), file)
22 | ds <- read_ndjson(file, mmap=TRUE, simplify=FALSE)
23 |
24 | file2 <- tempfile()
25 | saveRDS(ds, file2)
26 | ds2 <- readRDS(file2)
27 |
28 | expect_equal(as.character(ds), as.character(ds2))
29 | })
30 |
31 |
32 | test_that("serializing mmapped json should use relative, not absolute path", {
33 | wd <- getwd()
34 | on.exit(setwd(wd))
35 |
36 | x <- c("S", "P", "Q", "R")
37 |
38 | # create and change directory to dir/a
39 | dir <- tempfile()
40 | dir.create(dir)
41 | setwd(dir)
42 | dir.create("a")
43 | setwd("a")
44 |
45 | # save dir/a/data.json
46 | # save dir/a/obj.rds
47 | writeLines(paste0('{"x": "', x, '"}'), "data.json")
48 | ds <- read_ndjson("data.json", mmap=TRUE, simplify=FALSE)
49 | saveRDS(ds, "obj.rds")
50 |
51 | # move the files to
52 | # dir/data.json
53 | # dir/obj.rds
54 | file.rename(file.path(dir, "a", "data.json"), file.path(dir, "data.json"))
55 | file.rename(file.path(dir, "a", "obj.rds"), file.path(dir, "obj.rds"))
56 |
57 | # set the working directory to dir
58 | setwd(dir)
59 | unlink(file.path(dir, "a"), recursive=TRUE)
60 |
61 | # read obj.rds
62 | ds2 <- readRDS("obj.rds")
63 | expect_equal(as.character(ds2$x), x)
64 | })
65 |
66 |
67 | test_that("serializing json subset works", {
68 | x <- LETTERS
69 | file <- tempfile()
70 | writeLines(paste0('"', x, '"'), file)
71 | ds <- read_ndjson(file, simplify=FALSE)
72 |
73 | i <- seq(2, 26, 2)
74 | ds <- ds[i]
75 |
76 | file2 <- tempfile()
77 | saveRDS(ds, file2)
78 | ds2 <- readRDS(file2)
79 |
80 | expect_equal(as.character(ds), as.character(ds2))
81 | })
82 |
83 |
84 | test_that("serializing json field works", {
85 | x <- LETTERS
86 | y <- 3.14 * seq_along(LETTERS) - 10
87 | file <- tempfile()
88 | writeLines(paste0('{"x": "', x, '", "z": { "y": ', y, "} }"), file)
89 | ds <- read_ndjson(file, simplify=FALSE)
90 |
91 | ds <- ds$z
92 |
93 | file2 <- tempfile()
94 | saveRDS(ds, file2)
95 | ds2 <- readRDS(file2)
96 |
97 | expect_equal(as.numeric(ds$y), as.numeric(ds2$y))
98 | })
99 |
100 |
101 | test_that("serializing json nested fields works", {
102 | x <- 1:10
103 | file <- tempfile()
104 | writeLines(paste0('{"f1": {"f2": {"f3": {"x": ', x, '}}}}'), file)
105 | ds <- read_ndjson(file, simplify=FALSE)
106 |
107 | ds <- ds$f1$f2$f3
108 |
109 | file2 <- tempfile()
110 | saveRDS(ds, file2)
111 | ds2 <- readRDS(file2)
112 |
113 | expect_equal(as.integer(ds$x), as.numeric(ds2$x))
114 | })
115 |
116 |
117 | test_that("serializing json field subset works", {
118 | x <- LETTERS
119 | y <- 3.14 * seq_along(LETTERS) - 10
120 | file <- tempfile()
121 | writeLines(paste0('{"z": {"x": "', x, '"}, "y": ', y, "}"), file)
122 | ds <- read_ndjson(file, simplify = FALSE)
123 |
124 | i <- c(20, 2, 9, 4, 6, 2)
125 | ds <- ds[i, "z"]
126 |
127 | file2 <- tempfile()
128 | saveRDS(ds, file2)
129 | ds2 <- readRDS(file2)
130 |
131 | expect_equal(as.character(ds$x), as.character(ds2$x))
132 | })
133 |
--------------------------------------------------------------------------------
/tests/testthat/test-read_ndjson.R:
--------------------------------------------------------------------------------
1 | context("read_ndjson")
2 |
3 |
4 | test_that("reading a non-existent file should fail", {
5 | corpus:::logging_off()
6 | expect_error(read_ndjson("foobar", mmap=TRUE), "cannot open file 'foobar'")
7 | corpus:::logging_on()
8 | })
9 |
10 |
11 | test_that("passing a connection when mmap is TRUE should fail", {
12 | tmp <- tempfile()
13 | writeLines(character(), tmp)
14 | file <- file(tmp)
15 | on.exit(close(file))
16 | expect_error(read_ndjson(file, mmap = TRUE),
17 | "'file' must be a character string when 'mmap' is TRUE")
18 | })
19 |
20 |
21 | test_that("passing a file name should succeed", {
22 | file <- tempfile()
23 | writeLines('"foo"', file)
24 | expect_equal(read_ndjson(file), "foo")
25 | })
26 |
27 |
28 | test_that("passing a closed connection should succeed", {
29 | tmp <- tempfile()
30 | file <- file(tmp)
31 | writeLines('"foo"', file)
32 | expect_equal(read_ndjson(file), "foo")
33 | })
34 |
35 |
36 | test_that("passing an empty file should succeed", {
37 | file <- tempfile()
38 | writeLines(character(), file)
39 | expect_equal(read_ndjson(file), NULL)
40 | })
41 |
42 |
43 | test_that("passing a nonscalar should fail", {
44 | expect_error(read_ndjson(17),
45 | "'file' must be a character string or connection")
46 | })
47 |
--------------------------------------------------------------------------------
/tests/testthat/test-stemmer.R:
--------------------------------------------------------------------------------
1 | context("stemmer.R")
2 |
3 | test_that("can use a custom function as a stemmer", {
4 | x <- LETTERS
5 | stem <- function(word) "?"
6 | expect_equal(text_tokens(x, stemmer = stem),
7 | as.list(rep("?", length(x))))
8 | })
9 |
10 |
11 | test_that("handles stemmer logical NAs", {
12 | x <- paste(LETTERS, collapse = " ")
13 | stemmer <-
14 | function(x) {
15 | if (x %in% c("a", "e", "i", "o", "u")) {
16 | paste0(toupper(x), "*")
17 | } else {
18 | NA
19 | }
20 | }
21 | actual <- text_tokens(x, stemmer = stemmer)
22 | expected <- list(c("A*", "E*", "I*", "O*", "U*"))
23 | expect_equal(actual, expected)
24 | })
25 |
26 |
27 | test_that("handles stemmer character NAs", {
28 | x <- paste(LETTERS, collapse = " ")
29 | stemmer <- function(x) NA_character_
30 | actual <- text_tokens(x, stemmer = stemmer)
31 | expected <- list(character())
32 | expect_equal(actual, expected)
33 | })
34 |
35 |
36 | test_that("handles stemmer errors", {
37 | x <- LETTERS
38 |
39 | expect_error(text_tokens(x, stemmer = function(w) c("?", "?")),
40 | "'stemmer' returned multiple values for input \"a\"")
41 |
42 | expect_error(text_tokens(x, stemmer = function(w) character()),
43 | "'stemmer' did not return a value for input \"a\"")
44 |
45 | expect_error(text_tokens(x, stemmer = function(w) NULL),
46 | "'stemmer' did not return a value for input \"a\"")
47 |
48 | expect_error(text_tokens(x, stemmer = function(w) 7),
49 | "'stemmer' returned a non-string value for input \"a\"")
50 | })
51 |
52 |
53 | test_that("handles internal stemmer errors", {
54 | expect_error(text_tokens("hello", stemmer = function(x) stop("what?")),
55 | "'stemmer' raised an error for input \"hello\"")
56 | })
57 |
58 |
59 | test_that("'new_stemmer' can detect errors", {
60 | expect_error(new_stemmer(c("a", "b"), c("a")),
61 | "'term' argument length must equal 'stem' argument length")
62 | })
63 |
64 |
65 | test_that("'new_stemmer' can handle empty inputs", {
66 | fn <- new_stemmer(NULL, NULL)
67 | expect_equal(fn("a"), "a")
68 | })
69 |
70 |
71 | test_that("'new_stemmer' can use a default", {
72 | fn <- new_stemmer(LETTERS, letters, default = NA)
73 | expect_equal(fn("A"), "a")
74 | expect_equal(fn("AB"), NA_character_)
75 | })
76 |
77 |
78 | test_that("'new_stemmer' can handle duplicates", {
79 | term <- c("a", "a", "b", "c", "c", "c", "d")
80 | stem <- c("a1", "a2", "b", "c1", "c2", "c3", "d")
81 |
82 | fn <- new_stemmer(term, stem, duplicates = "first", vectorize = FALSE)
83 | expect_equal(sapply(term, fn, USE.NAMES = FALSE),
84 | c("a1", "a1", "b", "c1", "c1", "c1", "d"))
85 |
86 | fn <- new_stemmer(term, stem, duplicates = "last", vectorize = FALSE)
87 | expect_equal(sapply(term, fn, USE.NAMES = FALSE),
88 | c("a2", "a2", "b", "c3", "c3", "c3", "d"))
89 |
90 | fn <- new_stemmer(term, stem, duplicates = "omit", vectorize = FALSE)
91 | expect_equal(sapply(term, fn, USE.NAMES = FALSE),
92 | c("a", "a", "b", "c", "c", "c", "d"))
93 | })
94 |
95 |
96 | test_that("'new_stemmer' can vectorize ", {
97 | term <- c("a", "a", "b", "c", "c", "c", "d")
98 | stem <- c("a1", "a2", "b", "c1", "c2", "c3", "d")
99 |
100 | fn <- new_stemmer(term, stem, duplicates = "first", vectorize = TRUE)
101 | expect_equal(fn(term), c("a1", "a1", "b", "c1", "c1", "c1", "d"))
102 |
103 | fn <- new_stemmer(term, stem, duplicates = "last", vectorize = TRUE)
104 | expect_equal(fn(term), c("a2", "a2", "b", "c3", "c3", "c3", "d"))
105 |
106 | fn <- new_stemmer(term, stem, duplicates = "omit", vectorize = TRUE)
107 | expect_equal(fn(term), c("a", "a", "b", "c", "c", "c", "d"))
108 |
109 | expect_error(new_stemmer(term, stem, duplicates = "fail",
110 | vectorize = TRUE),
111 | "'term' argument entries must be unique")
112 | })
113 |
114 |
115 | test_that("'stem_snowball' can handle NULL algorithm", {
116 | x <- c("win", "winning", "winner", "#winning")
117 | expect_equal(stem_snowball(x, NULL), x)
118 | })
119 |
120 |
121 | test_that("'stem_snowball' can handle NULL input", {
122 | expect_equal(stem_snowball(NULL), NULL)
123 | })
124 |
125 |
126 | test_that("'stem_snowball' can handle stem input", {
127 | x <- c("win", "winning", "winner", "#winning")
128 | expect_equal(stem_snowball(x),
129 | c("win", "win", "winner", "#winning"))
130 | })
131 |
--------------------------------------------------------------------------------
/tests/testthat/test-term_counts.R:
--------------------------------------------------------------------------------
1 | context("term_counts")
2 |
3 | test_that("'term_counts' gives equivalent results to 'term_matrix'", {
4 | text <- c(a="A rose is a rose is a rose.",
5 | b="A Rose is red, a violet is blue!",
6 | c="A rose by any other name would smell as sweet.")
7 | x <- term_matrix(text)
8 | tf <- term_counts(text)
9 | xtf <- Matrix::sparseMatrix(i = as.integer(tf$text),
10 | j = as.integer(tf$term),
11 | x = tf$count,
12 | dimnames = list(levels(tf$text),
13 | colnames(x)))
14 | expect_equal(x, xtf)
15 | })
16 |
17 |
18 | test_that("'term_counts' gives equivalent results to 'term_matrix' no names", {
19 | text <- c("A rose is a rose is a rose.",
20 | "A Rose is red, a violet is blue!",
21 | "A rose by any other name would smell as sweet.")
22 | x <- term_matrix(text)
23 | rownames(x) <- as.character(seq_along(text))
24 | tf <- term_counts(text)
25 | xtf <- Matrix::sparseMatrix(i = as.integer(tf$text),
26 | j = as.integer(tf$term),
27 | x = tf$count,
28 | dimnames = list(levels(tf$text),
29 | colnames(x)))
30 | expect_equal(x, xtf)
31 | })
32 |
33 |
34 | test_that("'term_counts' with group gives equivalent results to 'term_matrix'", {
35 | text <- c(a="A rose is a rose is a rose.",
36 | b="A Rose is red, a violet is blue!",
37 | c="A rose by any other name would smell as sweet.")
38 | g <- factor(c("X", "Y", "X"))
39 | x <- term_matrix(text, group = g)
40 | tf <- term_counts(text, group = g)
41 | xtf <- Matrix::sparseMatrix(i = as.integer(tf$group),
42 | j = as.integer(tf$term),
43 | x = tf$count,
44 | dimnames = list(levels(tf$group),
45 | colnames(x)))
46 | expect_equal(x, xtf)
47 | })
48 |
--------------------------------------------------------------------------------
/tests/testthat/test-text-stats.R:
--------------------------------------------------------------------------------
1 | context("text-stats")
2 |
3 | test_that("'na.fail' works", {
4 | x <- as_corpus_text(letters)
5 | expect_equal(na.fail(x), x)
6 |
7 | expect_error(na.fail(c(x, NA)), "missing values in object")
8 | })
9 |
10 |
11 | test_that("'na.omit' works", {
12 | x <- as_corpus_text(c(NA, "a", "b", NA, "c"))
13 | actual <- na.omit(x)
14 | expected <- as_corpus_text(c("a", "b", "c"))
15 | omit <- c(1L, 4L)
16 | attr(omit, "class") <- "omit"
17 | attr(expected, "na.action") <- omit
18 | expect_equal(actual, expected)
19 |
20 | expect_equal(na.omit(as_corpus_text(letters)),
21 | as_corpus_text(letters))
22 | })
23 |
24 |
25 | test_that("'na.exclude' works", {
26 | x <- as_corpus_text(c(r = NA, s = "a", t = "b", u = NA, v = "c"))
27 | actual <- na.exclude(x)
28 | expected <- as_corpus_text(c(s = "a", t = "b", v = "c"))
29 | exclude <- c(r = 1L, u = 4L)
30 | attr(exclude, "class") <- "exclude"
31 | attr(expected, "na.action") <- exclude
32 | expect_equal(actual, expected)
33 |
34 | expect_equal(na.exclude(as_corpus_text(letters)),
35 | as_corpus_text(letters))
36 | })
37 |
--------------------------------------------------------------------------------
/tests/testthat/test-text_c.R:
--------------------------------------------------------------------------------
1 | context("text_c")
2 |
3 |
4 | test_that("c should not drop text_filter", {
5 | x <- as_corpus_text("hello")
6 | text_filter(x) <- text_filter(map_case = FALSE)
7 | y <- as_corpus_text("world")
8 | z <- c(x, y)
9 | expect_equal(text_filter(z), text_filter(x))
10 | })
11 |
12 |
13 | test_that("c should work with named or unnamed args", {
14 | x <- as_corpus_text("hello")
15 | y <- as_corpus_text("world")
16 | expect_equal(c(x, y), as_corpus_text(c("hello", "world")))
17 | expect_equal(c(a = x, b = y), as_corpus_text(c(a = "hello", b = "world")))
18 | })
19 |
20 |
21 | test_that("c should work with complex args", {
22 | x <- c(a=as_corpus_text(c("hello", NA, "world")), "!", c=7)
23 | expect_equal(x, as_corpus_text(c(a1 = "hello", a2 = NA, a3 = "world",
24 | "4" = "!", c = "7")))
25 | })
26 |
27 |
28 | test_that("c should work with a single unnamed argument", {
29 | x0 <- as_corpus_text(c(a = "hello", b = "goodbye", "!"))
30 | x <- x0
31 | text_filter(x) <- text_filter(map_case = FALSE)
32 | y <- c(x)
33 | expect_equal(y, x)
34 | })
35 |
36 |
37 | test_that("c should work with a single named argument", {
38 | x <- as_corpus_text(c(a = "hello", b = "goodbye", "!"))
39 | y <- c(a = x)
40 | expect_equal(y, as_corpus_text(c(a.a = "hello", a.b = "goodbye", a3 = "!")))
41 | })
42 |
43 |
44 | test_that("c should work with no names", {
45 | x <- as_corpus_text(c("hello", "goodbye", "!"))
46 | z <- c(x, x)
47 | expect_equal(names(z), NULL)
48 | expect_equal(as.character(z), c(as.character(x), as.character(x)))
49 | })
50 |
51 |
52 | test_that("c should work with no arguments", {
53 | z <- c.corpus_text()
54 | expect_equal(z, as_corpus_text(c()))
55 | })
56 |
57 |
58 | test_that("c should support use.names = FALSE", {
59 | z <- c(a=as_corpus_text("x"), y=c(z="z", "w"), use.names = FALSE)
60 | expect_equal(z, as_corpus_text(c("x", "z", "w")))
61 | })
62 |
63 |
64 | test_that("c should support lists with recursive = TRUE", {
65 | z <- c.corpus_text(list(x = as_corpus_text("a"), y = as_corpus_text("b")), z = "c",
66 | recursive = TRUE)
67 | expect_equal(z, as_corpus_text(c(x = "a", y = "b", z = "c")))
68 | })
69 |
70 |
71 | test_that("c should support pairlists with recursive = TRUE", {
72 | z <- c.corpus_text(pairlist(x = as_corpus_text("a"), y = as_corpus_text("b")), z = "c",
73 | recursive = TRUE)
74 | expect_equal(z, as_corpus_text(c(x = "a", y = "b", z = "c")))
75 | })
76 |
77 |
78 | test_that("c can handle NA after named", {
79 | z <- c(as_corpus_text(c(x = "a")), NA)
80 | expect_equal(z, as_corpus_text(c(x = "a", "2" = NA)))
81 | })
82 |
83 |
84 | test_that("c should take filter from first value", {
85 | f <- text_filter(map_case = FALSE)
86 | x <- as_corpus_text(letters, filter = f)
87 | y <- as_corpus_text(LETTERS)
88 | z <- c(x, y)
89 | expect_equal(z, as_corpus_text(c(letters, LETTERS), filter = f))
90 |
91 | z2 <- c(y, x)
92 | expect_equal(z2, as_corpus_text(c(LETTERS, letters)))
93 | })
94 |
95 |
96 | test_that("c should work with duplicate names", {
97 | x <- as_corpus_text(c(a = "hello", b = "goodbye", "!"))
98 | z <- c(x, x)
99 | expect_equal(names(z), c(names(x), paste0(names(x), ".1")))
100 | expect_equal(as.character(z), c(as.character(x), as.character(x)))
101 | })
102 |
--------------------------------------------------------------------------------
/tests/testthat/test-text_names.R:
--------------------------------------------------------------------------------
1 | context("text_names")
2 |
3 |
4 | test_that("`names` should be NULL for new text", {
5 | x <- as_corpus_text(c("A", "B", "C"))
6 | expect_equal(names(x), NULL)
7 |
8 | expect_equal(names(as_corpus_text(character())), NULL)
9 | })
10 |
11 |
12 | test_that("`names<-` should work on text", {
13 | x <- as_corpus_text(LETTERS)
14 | names(x) <- rev(LETTERS)
15 | expect_equal(names(x), rev(LETTERS))
16 | })
17 |
18 |
19 | test_that("setting `names<-` to NULL should restore defaults", {
20 | x <- as_corpus_text(c(a="x", b="y"))
21 | names(x) <- NULL
22 | expect_equal(names(x), NULL)
23 | })
24 |
25 |
26 | test_that("`as_corpus_text` should not drop names", {
27 | x <- as_corpus_text(c(a="1", b="2"))
28 | expect_equal(names(x), c("a", "b"))
29 | })
30 |
31 |
32 | test_that("`all.equal` should test names", {
33 | x <- as_corpus_text(1:3)
34 | y <- x
35 | names(y) <- c("a", "b", "c")
36 | expect_equal(all.equal(x, y), "names for current but not for target")
37 | expect_equal(all.equal(y, x), "names for target but not for current")
38 | })
39 |
40 |
41 | test_that("`as_corpus_text` should not drop names", {
42 | x <- as_corpus_text(c(foo="hello"))
43 | y <- as_corpus_text(x)
44 |
45 | expect_equal(y, as_corpus_text(c(foo="hello")))
46 | })
47 |
48 |
49 | test_that("`as_corpus_text` should drop attributes", {
50 | x <- as_corpus_text("hello")
51 | attr(x, "foo") <- "bar"
52 | y <- as_corpus_text(x)
53 |
54 | expect_equal(y, as_corpus_text("hello"))
55 | })
56 |
57 |
58 | test_that("`as_corpus_text` should drop attributes for JSON objects", {
59 | file <- tempfile()
60 | writeLines('{"text": "hello"}', file)
61 | x <- read_ndjson(file)$text
62 |
63 | attr(x, "foo") <- "bar"
64 | y <- as_corpus_text(x)
65 |
66 | expect_equal(y, as_corpus_text("hello"))
67 | })
68 |
69 |
70 | test_that("`names<-` should not modify copies", {
71 | x <- as_corpus_text(1:3)
72 | y <- x
73 | names(y) <- c("a", "b", "c")
74 | expect_equal(names(x), NULL)
75 | expect_equal(names(y), c("a", "b", "c"))
76 | })
77 |
78 |
79 | test_that("`names<-` should preserve attributes", {
80 | x <- as_corpus_text(1:3)
81 | attr(x, "foo") <- "bar"
82 | names(x) <- c("a", "b", "c")
83 | expect_equal(names(x), c("a", "b", "c"))
84 | expect_equal(attr(x, "foo"), "bar")
85 | })
86 |
87 |
88 | test_that("`names<-` should not allow NA", {
89 | x <- as_corpus_text(1:3)
90 | expect_error(names(x) <- c("a", NA, "b"),
91 | "missing values in 'names' are not allowed")
92 | })
93 |
94 |
95 | test_that("`names<-` should not allow duplicates", {
96 | x <- as_corpus_text(1:3)
97 | expect_error(names(x) <- c("a", "b", "a"),
98 | "duplicate 'names' are not allowed")
99 | })
100 |
101 |
102 | test_that("names should error for non-text", {
103 | expect_error(names.corpus_text("hello"), "invalid text object")
104 | })
105 |
--------------------------------------------------------------------------------
/tests/testthat/test-text_nunit.R:
--------------------------------------------------------------------------------
1 | context("text_nunit")
2 |
3 |
4 | test_that("text_nsentence can works on sentences", {
5 | text <- c(a="He said, 'Are you going?' John Shook his head.",
6 | b="'Are you going?' John asked",
7 | c="This. Is. A. Long. Sentence!!!",
8 | d="Why all the shouting??")
9 | n0 <- text_nsentence(text)
10 | split <- text_split(text, "sentences")
11 | n <- c(with(split, tapply(index, parent, length)))
12 | names(n) <- names(text)
13 | expect_equal(n, n0)
14 | })
15 |
16 |
17 | test_that("text_nsentence handles NA and empty", {
18 | expect_equal(text_nsentence(c(NA, "")), c(NA, 0))
19 | })
20 |
21 |
22 | test_that("text_ntoken can works on tokens", {
23 | text <- c(a="He said, 'Are you going?' John Shook his head.",
24 | b="'Are you going?' John asked",
25 | c="This. Is. A. Long. Sentence!!!",
26 | d="Why all the shouting??")
27 | n0 <- text_ntoken(text)
28 | split <- text_split(text, "tokens")
29 | n <- c(with(split, tapply(index, parent, length)))
30 | names(n) <- names(text)
31 | expect_equal(n, n0)
32 | })
33 |
34 |
35 | test_that("text_ntoken handles NA and empty", {
36 | expect_equal(text_ntoken(c(NA, "")), c(NA, 0))
37 | })
38 |
--------------------------------------------------------------------------------
/tests/testthat/test-text_primitive.R:
--------------------------------------------------------------------------------
1 | context("text_primitive")
2 |
3 |
4 | test_that("anyNA should work", {
5 | x <- as_corpus_text(c("a", NA, "", "b"))
6 | y <- as_corpus_text(c())
7 | z <- as_corpus_text(letters)
8 |
9 | expect_true(anyNA(x))
10 | expect_false(anyNA(y))
11 | expect_false(anyNA(z))
12 | })
13 |
14 |
15 | test_that("converting to character should work", {
16 | x <- c("hello", NA, "world", "")
17 | y <- as_corpus_text(x)
18 | expect_equal(as.character(y), x)
19 | })
20 |
21 |
22 | test_that("conversions should work", {
23 | expect_equal(as.complex(as_corpus_text("1+2i")), 1+2i)
24 | expect_equal(as.double(as_corpus_text("3.14")), 3.14)
25 | expect_equal(as.integer(as_corpus_text("3.14")), 3)
26 | expect_equal(as.logical(as_corpus_text(c("TRUE", "FALSE", "NA"))),
27 | c(TRUE, FALSE, NA))
28 | expect_equal(as.numeric(as_corpus_text("3.14")), 3.14)
29 | expect_equal(as.raw(as_corpus_text("1")), as.raw("1"))
30 |
31 | expect_warning(x <- as.numeric(as_corpus_text("foo")),
32 | "NAs introduced by coercion")
33 | expect_equal(x, NA_real_)
34 | })
35 |
36 |
37 | test_that("is.na should work", {
38 | x <- as_corpus_text(c("a", NA, "", "b"))
39 | expect_equal(is.na(x), c(FALSE, TRUE, FALSE, FALSE))
40 | expect_equal(is.na(as_corpus_text(c())), logical())
41 | })
42 |
43 |
44 | test_that("rep should work", {
45 | x <- as_corpus_text(c("a", "b", "c"))
46 | y <- rep(x, 7)
47 | expect_equal(y, as_corpus_text(rep(c("a", "b", "c"), 7)))
48 | })
49 |
50 |
51 | test_that("rep should work with names", {
52 | x <- as_corpus_text(c(x="a", y="b"))
53 | y <- rep(x, 2)
54 | expect_equal(y, as_corpus_text(c(x="a", y="b", x.1="a", y.1="b")))
55 | })
56 |
57 |
58 | test_that("invalid operations should error", {
59 | x <- as_corpus_text("hello")
60 | expect_error(x$names, "$ operator is invalid for text objects",
61 | fixed = TRUE)
62 | expect_error(x$names <- "foo", "$<- operator is invalid for text objects",
63 | fixed = TRUE)
64 | expect_error(as.environment(x),
65 | "'as.environment' is invalid for text objects")
66 | })
67 |
68 | test_that("setting length on invalid text should fail", {
69 | x <- letters
70 | expect_error(`length<-.corpus_text`(x, 5), "invalid text object")
71 | })
72 |
73 | test_that("setting invalid length should fail", {
74 | x <- as_corpus_text(letters)
75 | expect_error(length(x) <- NULL, "'length' cannot be NULL")
76 | expect_error(length(x) <- "1", "'length' must be numeric")
77 | expect_error(length(x) <- c(1, 1), "'length' must have length 1")
78 | expect_error(length(x) <- NA, "'length' cannot be NA")
79 | expect_error(length(x) <- NaN, "'length' cannot be NaN")
80 | expect_error(length(x) <- -1, "'length' cannot be negative")
81 | expect_error(length(x) <- 2^53 + 2, "'length' cannot be above 2\\^53")
82 | })
83 |
84 |
85 | test_that("setting short length should work", {
86 | x <- as_corpus_text(letters)
87 | length(x) <- 10
88 | expect_equal(x, as_corpus_text(letters[1:10]))
89 | })
90 |
91 |
92 | test_that("setting same length should work", {
93 | x <- as_corpus_text(letters)
94 | length(x) <- 26
95 | expect_equal(x, as_corpus_text(letters))
96 | })
97 |
98 |
99 | test_that("setting long length should work", {
100 | x <- as_corpus_text(letters)
101 | length(x) <- 30
102 | expect_equal(x, as_corpus_text(c(letters, rep(NA, 4))))
103 | })
104 |
--------------------------------------------------------------------------------
/tests/testthat/test-text_print.R:
--------------------------------------------------------------------------------
1 | context("text_print")
2 |
3 | test_that("'print.text' works without names", {
4 | ctype <- switch_ctype("C")
5 | on.exit(Sys.setlocale("LC_CTYPE", ctype))
6 |
7 | x <- as_corpus_text(LETTERS)
8 | expected <- c(
9 | ' [1] "A" "B" "C" "D" "E" "F" "G" "H" "I"',
10 | '[10] "J" "K" "L" "M" "N" "O" "P" "Q" "R"',
11 | '[19] "S" "T"',
12 | '... (26 entries total)')
13 |
14 | expect_equal(strsplit(capture_output(print(x), width = 40), "\n")[[1]],
15 | expected)
16 | })
17 |
18 |
19 | test_that("'print.text' works with names", {
20 | ctype <- switch_ctype("C")
21 | on.exit(Sys.setlocale("LC_CTYPE", ctype))
22 |
23 | x <- as_corpus_text(LETTERS, names = paste0("foo", 1:26))
24 | expected <- c(
25 | 'foo1 foo2 foo3 foo4 foo5 foo6 foo7 foo8 ',
26 | '"A" "B" "C" "D" "E" "F" "G" "H" ',
27 | 'foo9 foo10 foo11 foo12 foo13 foo14 foo15 foo16',
28 | '"I" "J" "K" "L" "M" "N" "O" "P" ',
29 | 'foo17 foo18 foo19 foo20',
30 | '"Q" "R" "S" "T" ',
31 | '... (26 entries total)')
32 |
33 | expect_equal(strsplit(capture_output(print(x), width = 50), "\n")[[1]],
34 | expected)
35 | })
36 |
37 |
38 | test_that("'print.text' works for empty", {
39 | x <- as_corpus_text(character())
40 | expect_equal(capture_output(print(x)),
41 | "text vector with 0 entries")
42 | })
43 |
44 |
45 | test_that("'print.text' works for NULL", {
46 | expect_equal(print.corpus_text(NULL), NULL)
47 | })
48 |
49 |
50 | test_that("'print.text' errors for invalid", {
51 | expect_error(print.corpus_text("hello"), "argument is not a valid text object")
52 | })
53 |
54 |
55 | test_that("'print.text' with negative rows prints entire object", {
56 | x <- as_corpus_text(LETTERS)
57 | expect_equal(capture_output(print(x, -1)),
58 | capture_output(print(as.character(x))))
59 | })
60 |
61 |
62 | test_that("'print.text' errors for invalid inputs", {
63 | x <- as_corpus_text(LETTERS)
64 | expect_error(print(x, chars = -1), "'chars' must be non-negative")
65 | expect_error(print(x, chars = NA), "'chars' cannot be NA")
66 | expect_error(print(x, chars = c(1,1)), "'chars' must have length 1")
67 | expect_error(print(x, na.print = NA), "'na.print' cannot be NA")
68 | expect_error(print(x, print.gap = 1025), "'print.gap' must be less than or equal to 1024")
69 | })
70 |
71 |
72 | test_that("'format.text' works for empty", {
73 | expect_equal(format(as_corpus_text(character())), character())
74 | })
75 |
--------------------------------------------------------------------------------
/tests/testthat/test-text_split_tokens.R:
--------------------------------------------------------------------------------
1 | context("text_split_tokens")
2 |
3 |
4 | test_that("'split_tokens' can split into threes", {
5 | text <- c(paste(LETTERS, collapse = " "),
6 | paste(letters, collapse = " "))
7 |
8 | expect_equal(text_split(text, "tokens", 3),
9 | structure(class = c("corpus_frame", "data.frame"),
10 | data.frame(parent = factor(as.character(c(rep(1, 9), rep(2, 9)))),
11 | index = c(1:9, 1:9),
12 | text = as_corpus_text(c("A B C ", "D E F ", "G H I ", "J K L ",
13 | "M N O ", "P Q R ", "S T U ", "V W X ",
14 | "Y Z",
15 | "a b c ", "d e f ", "g h i ", "j k l ",
16 | "m n o ", "p q r ", "s t u ", "v w x ",
17 | "y z")),
18 | row.names = NULL)))
19 | })
20 |
21 |
22 | test_that("'split_tokens' doesn't count dropped tokens", {
23 | text <- c(paste(LETTERS, collapse = " "),
24 | paste(letters, collapse = " "))
25 | f <- text_filter(drop = c("a", "e", "i", "o", "u"))
26 |
27 | expect_equal(text_split(text, "tokens", 5, filter = f),
28 | structure(class = c("corpus_frame", "data.frame"),
29 | data.frame(parent = factor(as.character(c(rep(1, 5), rep(2, 5)))),
30 | index = c(1:5, 1:5),
31 | text = as_corpus_text(c("A B C D E F G ", "H I J K L ",
32 | "M N O P Q ", "R S T U V ", "W X Y Z",
33 | "a b c d e f g ", "h i j k l ",
34 | "m n o p q ", "r s t u v ",
35 | "w x y z"),
36 | filter = f),
37 | row.names = NULL)))
38 | })
39 |
40 |
41 | test_that("'split_tokens' keeps trailing whitespace", {
42 | expect_equal(text_split("abc ", "tokens", 2),
43 | structure(class = c("corpus_frame", "data.frame"),
44 | data.frame(parent = factor("1"), index = 1,
45 | text = as_corpus_text("abc "), row.names = NULL)))
46 | })
47 |
48 |
49 | test_that("'split_tokens' handles whitespace-only text", {
50 | expect_equal(text_split(" ", "tokens", 1),
51 | structure(class = c("corpus_frame", "data.frame"),
52 | data.frame(parent = factor("1"),
53 | index = 1, text = as_corpus_text(" "), row.names = NULL)))
54 | })
55 |
56 |
57 | test_that("'split_tokens' handles empty and missing text", {
58 | expect_equal(text_split(c("", NA, NA, "", "a"), "tokens", 1),
59 | structure(class = c("corpus_frame", "data.frame"),
60 | data.frame(parent = factor(c("1", "4", "5"),
61 | levels = as.character(1:5)),
62 | index = c(1, 1, 1),
63 | text = as_corpus_text(c("", "", "a")),
64 | row.names = NULL)))
65 | })
66 |
--------------------------------------------------------------------------------
/tests/testthat/test-text_stats.R:
--------------------------------------------------------------------------------
1 | context("text_stats")
2 |
3 |
4 | test_that("'text_stats' works on a simple example", {
5 | x <- c("A rose is a rose is a rose.", "A Rose is red. A violet is blue!")
6 | actual <- text_stats(x)
7 | expected <- data.frame(tokens = text_ntoken(x),
8 | types = text_ntype(x),
9 | sentences = text_nsentence(x))
10 | class(expected) <- c("corpus_frame", "data.frame")
11 | expect_equal(actual, expected)
12 | })
13 |
--------------------------------------------------------------------------------
/tests/testthat/test-text_types.R:
--------------------------------------------------------------------------------
1 | context("text_types")
2 |
3 | test_that("'text_types' works elementwise", {
4 | text <- c("I saw Mr. Jones today.",
5 | NA,
6 | "",
7 | "Split across\na line.",
8 | "What. Are. You. Doing????",
9 | "She asked 'do you really mean that?' and I said 'yes.'")
10 | toks <- text_tokens(text)
11 | typs <- lapply(toks, function(x) unique(sort(x, method = "radix")))
12 | typs_tot <- unique(sort(c(toks, recursive = TRUE), method = "radix"))
13 |
14 | expect_equal(text_types(text), typs)
15 | expect_equal(text_types(text, collapse = TRUE), typs_tot)
16 | })
17 |
18 |
19 | test_that("text_ntype works on types", {
20 | expect_equal(text_ntype(LETTERS, collapse = TRUE), 26)
21 |
22 | expect_equal(text_ntype(paste(LETTERS, letters, LETTERS)),
23 | rep(1, 26))
24 | })
25 |
26 |
27 | test_that("text_ntype handles NA, empty", {
28 | expect_equal(text_ntype(c("", NA, "hello world")),
29 | c(0, NA, 2))
30 |
31 | expect_equal(text_ntype(c("", NA, "hello world"), collapse = TRUE),
32 | NA_real_)
33 | })
34 |
--------------------------------------------------------------------------------
/tests/testthat/test-wordlist.R:
--------------------------------------------------------------------------------
1 |
2 | context("wordlist")
3 |
4 |
5 | test_that("'abbreviations' has common acronyms", {
6 | expect_true(all(c("Mr.", "Mrs.", "Ms.") %in% abbreviations_en))
7 | expect_true(all(c("ap. J.-C.", "av. J.-C.") %in% abbreviations_fr))
8 | })
9 |
10 |
11 | test_that("'stopwords' has common function words", {
12 | expect_true(all(c("the", "and", "is") %in% stopwords_en))
13 | })
14 |
--------------------------------------------------------------------------------
/vignettes/chinese-wordcloud-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/chinese-wordcloud-1.png
--------------------------------------------------------------------------------
/vignettes/corpus-emotion-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/corpus-emotion-1.png
--------------------------------------------------------------------------------
/vignettes/corpus-heapslaw-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/corpus-heapslaw-1.png
--------------------------------------------------------------------------------
/vignettes/corpus-witch-occurrences-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/corpus-witch-occurrences-1.png
--------------------------------------------------------------------------------
/vignettes/gender-estimates-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/gender-estimates-1.png
--------------------------------------------------------------------------------
/vignettes/gender-estimates_se-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/gender-estimates_se-1.png
--------------------------------------------------------------------------------
/vignettes/gender-signif-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/gender-signif-1.png
--------------------------------------------------------------------------------