├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── CONDUCT.md ├── CRAN-SUBMISSION ├── DESCRIPTION ├── Dockerfile ├── LICENSE ├── Makefile ├── NAMESPACE ├── NEWS.md ├── R ├── RcppExports.R ├── basic-tokenizers.R ├── character-shingles-tokenizers.R ├── chunk-text.R ├── coercion.R ├── data-docs.R ├── ngram-tokenizers.R ├── ptb-tokenizer.R ├── stem-tokenizers.R ├── tokenizers-package.r ├── utils.R └── wordcount.R ├── README.Rmd ├── README.md ├── _pkgdown.yml ├── appveyor.yml ├── cran-comments.md ├── data-raw ├── moby-dick.txt └── mobydick.R ├── data └── mobydick.rda ├── docker-compose.yml ├── inst └── CITATION ├── man ├── basic-tokenizers.Rd ├── chunk_text.Rd ├── mobydick.Rd ├── ngram-tokenizers.Rd ├── ptb-tokenizer.Rd ├── shingle-tokenizers.Rd ├── stem-tokenizers.Rd ├── tokenizers.Rd └── word-counting.Rd ├── paper.bib ├── paper.md ├── src ├── RcppExports.cpp ├── shingle_ngrams.cpp └── skip_ngrams.cpp ├── tests ├── testthat.R └── testthat │ ├── helper-data.R │ ├── moby-ch1.txt │ ├── moby-ch2.txt │ ├── moby-ch3.txt │ ├── test-basic.R │ ├── test-chunking.R │ ├── test-encoding.R │ ├── test-ngrams.R │ ├── test-ptb.R │ ├── test-shingles.R │ ├── test-stem.R │ ├── test-tif.R │ ├── test-utils.R │ └── test-wordcount.R ├── tokenizers.Rproj └── vignettes ├── introduction-to-tokenizers.R ├── introduction-to-tokenizers.Rmd ├── introduction-to-tokenizers.html ├── tif-and-tokenizers.R ├── tif-and-tokenizers.Rmd └── tif-and-tokenizers.html /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.travis\.yml$ 4 | ^README\.Rmd$ 5 | ^README-.*\.png$ 6 | ^CONDUCT\.md$ 7 | ^cran-comments\.md$ 8 | ^data-raw$ 9 | ^appveyor\.yml$ 10 | ^docs$ 11 | ^paper.bib$ 12 | ^paper.md$ 13 | ^Makefile$ 14 | ^_pkgdown\.yml$ 15 | ^CRAN-SUBMISSION$ 16 | ^Dockerfile$ 17 | ^docker-compose.yml$ 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | src/*.o 5 | src/*.so 6 | src/*.dll 7 | inst/doc 8 | .DS_Store 9 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: r 2 | r: 3 | - oldrel 4 | - release 5 | - devel 6 | sudo: false 7 | cache: packages 8 | 9 | # Only report coverage for the release version 10 | after_success: 11 | - test $TRAVIS_R_VERSION_STRING = 'release' && Rscript -e 'covr::codecov()' 12 | 13 | notifications: 14 | email: 15 | on_success: never 16 | on_failure: always 17 | -------------------------------------------------------------------------------- /CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, we pledge to respect all people who 4 | contribute through reporting issues, posting feature requests, updating documentation, 5 | submitting pull requests or patches, and other activities. 6 | 7 | We are committed to making participation in this project a harassment-free experience for 8 | everyone, regardless of level of experience, gender, gender identity and expression, 9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. 
10 | 11 | Examples of unacceptable behavior by participants include the use of sexual language or 12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment, 13 | insults, or other unprofessional conduct. 14 | 15 | Project maintainers have the right and responsibility to remove, edit, or reject comments, 16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this 17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 18 | from the project team. 19 | 20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 21 | opening an issue or contacting one or more of the project maintainers. 22 | 23 | This Code of Conduct is adapted from the Contributor Covenant 24 | (http:contributor-covenant.org), version 1.0.0, available at 25 | http://contributor-covenant.org/version/1/0/0/ 26 | -------------------------------------------------------------------------------- /CRAN-SUBMISSION: -------------------------------------------------------------------------------- 1 | Version: 0.3.0 2 | Date: 2022-12-20 21:28:18 UTC 3 | SHA: 2be98d70d9a7d7f052322d2f04394f13f1db3db2 4 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: tokenizers 2 | Type: Package 3 | Title: Fast, Consistent Tokenization of Natural Language Text 4 | Version: 0.3.1 5 | Date: 2024-03-27 6 | Description: Convert natural language text into tokens. Includes tokenizers for 7 | shingled n-grams, skip n-grams, words, word stems, sentences, paragraphs, 8 | characters, shingled characters, lines, Penn Treebank, regular 9 | expressions, as well as functions for counting characters, words, and sentences, 10 | and a function for splitting longer texts into separate documents, each with 11 | the same number of words. The tokenizers have a consistent interface, and 12 | the package is built on the 'stringi' and 'Rcpp' packages for fast 13 | yet correct tokenization in 'UTF-8'. 
14 | License: MIT + file LICENSE 15 | LazyData: yes 16 | Authors@R: c(person("Thomas", "Charlon", role = c("aut", "cre"), 17 | email = "charlon@protonmail.com", 18 | comment = c(ORCID = "0000-0001-7497-0470")), 19 | person("Lincoln", "Mullen", role = c("aut"), 20 | email = "lincoln@lincolnmullen.com", 21 | comment = c(ORCID = "0000-0001-5103-6917")), 22 | person("Os", "Keyes", role = c("ctb"), 23 | email = "ironholds@gmail.com", 24 | comment = c(ORCID = "0000-0001-5196-609X")), 25 | person("Dmitriy", "Selivanov", role = c("ctb"), 26 | email = "selivanov.dmitriy@gmail.com"), 27 | person("Jeffrey", "Arnold", role = c("ctb"), 28 | email = "jeffrey.arnold@gmail.com", 29 | comment = c(ORCID = "0000-0001-9953-3904")), 30 | person("Kenneth", "Benoit", role = c("ctb"), 31 | email = "kbenoit@lse.ac.uk", 32 | comment = c(ORCID = "0000-0002-0797-564X"))) 33 | URL: https://docs.ropensci.org/tokenizers/, https://github.com/ropensci/tokenizers 34 | BugReports: https://github.com/ropensci/tokenizers/issues 35 | RoxygenNote: 7.3.1 36 | Depends: 37 | R (>= 3.1.3) 38 | Imports: 39 | stringi (>= 1.0.1), 40 | Rcpp (>= 0.12.3), 41 | SnowballC (>= 0.5.1) 42 | LinkingTo: Rcpp 43 | Encoding: UTF-8 44 | Suggests: 45 | covr, 46 | knitr, 47 | rmarkdown, 48 | stopwords (>= 0.9.0), 49 | testthat 50 | VignetteBuilder: knitr 51 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | from rocker/shiny-verse:4.3.2 3 | 4 | add ./ /tokenizers 5 | run R -e "devtools::install('tokenizers', dependencies = TRUE)" 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2016 2 | COPYRIGHT HOLDER: Lincoln Mullen 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # prepare the package for release 2 | PKGNAME := $(shell sed -n "s/Package: *\([^ ]*\)/\1/p" DESCRIPTION) 3 | PKGVERS := $(shell sed -n "s/Version: *\([^ ]*\)/\1/p" DESCRIPTION) 4 | PKGSRC := $(shell basename `pwd`) 5 | 6 | all: clean devtools_check 7 | 8 | doc.pdf: 9 | R CMD Rd2pdf -o doc.pdf . 
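# Note (an illustrative comment, not part of the original Makefile): the
# typical local workflow is `make roxygenise` to regenerate the documentation,
# `make devtools_test` to run the test suite, and `make check`, which builds
# the package one directory up and then runs `R CMD check --as-cran` on the
# resulting $(PKGNAME)_$(PKGVERS).tar.gz.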
10 | 11 | build: 12 | cd ..;\ 13 | R CMD build --no-manual $(PKGSRC) 14 | 15 | build-cran: 16 | cd ..;\ 17 | R CMD build $(PKGSRC) 18 | 19 | install: build 20 | cd ..;\ 21 | R CMD INSTALL $(PKGNAME)_$(PKGVERS).tar.gz 22 | 23 | check: build-cran 24 | cd ..;\ 25 | R CMD check $(PKGNAME)_$(PKGVERS).tar.gz --as-cran 26 | 27 | roxygenise: 28 | R -e "roxygen2::roxygenise()" 29 | 30 | devtools_test: 31 | R -e "devtools::test()" 32 | 33 | devtools_check: 34 | R -e "devtools::check()" 35 | 36 | vignette: 37 | cd vignettes;\ 38 | R -e "rmarkdown::render('introduction-to-tokenizers.Rmd')";\ 39 | R -e "rmarkdown::render('tif-and-tokenizers.Rmd')" 40 | 41 | clean: 42 | $(RM) doc.pdf 43 | cd vignettes;\ 44 | $(RM) *.pdf *.aux *.bbl *.blg *.out *.tex *.log 45 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(tokenize_character_shingles,data.frame) 4 | S3method(tokenize_character_shingles,default) 5 | S3method(tokenize_characters,data.frame) 6 | S3method(tokenize_characters,default) 7 | S3method(tokenize_lines,data.frame) 8 | S3method(tokenize_lines,default) 9 | S3method(tokenize_ngrams,data.frame) 10 | S3method(tokenize_ngrams,default) 11 | S3method(tokenize_paragraphs,data.frame) 12 | S3method(tokenize_paragraphs,default) 13 | S3method(tokenize_ptb,data.frame) 14 | S3method(tokenize_ptb,default) 15 | S3method(tokenize_regex,data.frame) 16 | S3method(tokenize_regex,default) 17 | S3method(tokenize_sentences,data.frame) 18 | S3method(tokenize_sentences,default) 19 | S3method(tokenize_skip_ngrams,data.frame) 20 | S3method(tokenize_skip_ngrams,default) 21 | S3method(tokenize_word_stems,data.frame) 22 | S3method(tokenize_word_stems,default) 23 | S3method(tokenize_words,data.frame) 24 | S3method(tokenize_words,default) 25 | export(chunk_text) 26 | export(count_characters) 27 | export(count_sentences) 28 | export(count_words) 29 | export(tokenize_character_shingles) 30 | export(tokenize_characters) 31 | export(tokenize_lines) 32 | export(tokenize_ngrams) 33 | export(tokenize_paragraphs) 34 | export(tokenize_ptb) 35 | export(tokenize_regex) 36 | export(tokenize_sentences) 37 | export(tokenize_skip_ngrams) 38 | export(tokenize_word_stems) 39 | export(tokenize_words) 40 | importFrom(Rcpp,sourceCpp) 41 | importFrom(SnowballC,getStemLanguages) 42 | importFrom(SnowballC,wordStem) 43 | importFrom(stringi,stri_c) 44 | importFrom(stringi,stri_opts_regex) 45 | importFrom(stringi,stri_replace_all_charclass) 46 | importFrom(stringi,stri_replace_all_regex) 47 | importFrom(stringi,stri_split_boundaries) 48 | importFrom(stringi,stri_split_fixed) 49 | importFrom(stringi,stri_split_lines) 50 | importFrom(stringi,stri_split_regex) 51 | importFrom(stringi,stri_subset_charclass) 52 | importFrom(stringi,stri_trans_tolower) 53 | importFrom(stringi,stri_trim_both) 54 | useDynLib(tokenizers, .registration = TRUE) 55 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # tokenizers 0.3.0 2 | 3 | - Remove the `tokenize_tweets()` function, which is no longer supported. 4 | 5 | # tokenizers 0.2.3 6 | 7 | - Bug fixes and performance enhancements. 8 | 9 | # tokenizers 0.2.1 10 | 11 | - Add citation information to JOSS paper. 
12 | 13 | # tokenizers 0.2.0 14 | 15 | ## Features 16 | 17 | - Add the `tokenize_ptb()` function for Penn Treebank tokenizations (@jrnold) (#12). 18 | - Add a function `chunk_text()` to split long documents into pieces (#30). 19 | - New functions to count words, characters, and sentences without tokenization (#36). 20 | - New function `tokenize_tweets()` preserves usernames, hashtags, and URLS (@kbenoit) (#44). 21 | - The `stopwords()` function has been removed in favor of using the **stopwords** package (#46). 22 | - The package now complies with the basic recommendations of the **Text Interchange Format**. All tokenization functions are now methods. This enables them to take corpus inputs as either TIF-compliant named character vectors, named lists, or data frames. All outputs are still named lists of tokens, but these can be easily coerced to data frames of tokens using the `tif` package. (#49) 23 | - Add a new vignette "The Text Interchange Formats and the tokenizers Package" (#49). 24 | 25 | ## Bug fixes and performance improvements 26 | 27 | - `tokenize_skip_ngrams` has been improved to generate unigrams and bigrams, according to the skip definition (#24). 28 | - C++98 has replaced the C++11 code used for n-gram generation, widening the range of compilers `tokenizers` supports (@ironholds) (#26). 29 | - `tokenize_skip_ngrams` now supports stopwords (#31). 30 | - If tokenisers fail to generate tokens for a particular entry, they return `NA` consistently (#33). 31 | - Keyboard interrupt checks have been added to Rcpp-backed functions to enable users to terminate them before completion (#37). 32 | - `tokenize_words()` gains arguments to preserve or strip punctuation and numbers (#48). 33 | - `tokenize_skip_ngrams()` and `tokenize_ngrams()` to return properly marked UTF8 strings on Windows (@patperry) (#58). 34 | - `tokenize_tweets()` now removes stopwords prior to stripping punctuation, making its behavior more consistent with `tokenize_words()` (#76). 35 | 36 | # tokenizers 0.1.4 37 | 38 | - Add the `tokenize_character_shingles()` tokenizer. 39 | - Improvements to documentation. 40 | 41 | # tokenizers 0.1.3 42 | 43 | - Add vignette. 44 | - Improvements to n-gram tokenizers. 45 | 46 | # tokenizers 0.1.2 47 | 48 | - Add stopwords for several languages. 49 | - New stopword options to `tokenize_words()` and `tokenize_word_stems()`. 50 | 51 | # tokenizers 0.1.1 52 | 53 | - Fix failing test in non-UTF-8 locales. 54 | 55 | # tokenizers 0.1.0 56 | 57 | - Initial release with tokenizers for characters, words, word stems, sentences 58 | paragraphs, n-grams, skip n-grams, lines, and regular expressions. 
59 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | generate_ngrams_batch <- function(documents_list, ngram_min, ngram_max, stopwords = character(), ngram_delim = " ") { 5 | .Call(`_tokenizers_generate_ngrams_batch`, documents_list, ngram_min, ngram_max, stopwords, ngram_delim) 6 | } 7 | 8 | skip_ngrams_vectorised <- function(words, skips, stopwords) { 9 | .Call(`_tokenizers_skip_ngrams_vectorised`, words, skips, stopwords) 10 | } 11 | 12 | -------------------------------------------------------------------------------- /R/basic-tokenizers.R: -------------------------------------------------------------------------------- 1 | #' Basic tokenizers 2 | #' 3 | #' These functions perform basic tokenization into words, sentences, paragraphs, 4 | #' lines, and characters. The functions can be piped into one another to create 5 | #' at most two levels of tokenization. For instance, one might split a text into 6 | #' paragraphs and then word tokens, or into sentences and then word tokens. 7 | #' 8 | #' @name basic-tokenizers 9 | #' @param x A character vector or a list of character vectors to be tokenized. 10 | #' If \code{x} is a character vector, it can be of any length, and each element 11 | #' will be tokenized separately. If \code{x} is a list of character vectors, 12 | #' where each element of the list should have a length of 1. 13 | #' @param lowercase Should the tokens be made lower case? The default value 14 | #' varies by tokenizer; it is only \code{TRUE} by default for the tokenizers 15 | #' that you are likely to use last. 16 | #' @param strip_non_alphanum Should punctuation and white space be stripped? 17 | #' @param strip_punct Should punctuation be stripped? 18 | #' @param strip_numeric Should numbers be stripped? 19 | #' @param paragraph_break A string identifying the boundary between two 20 | #' paragraphs. 21 | #' @param stopwords A character vector of stop words to be excluded. 22 | #' @param pattern A regular expression that defines the split. 23 | #' @param simplify \code{FALSE} by default so that a consistent value is 24 | #' returned regardless of length of input. If \code{TRUE}, then an input with 25 | #' a single element will return a character vector of tokens instead of a 26 | #' list. 27 | #' @return A list of character vectors containing the tokens, with one element 28 | #' in the list for each element that was passed as input. If \code{simplify = 29 | #' TRUE} and only a single element was passed as input, then the output is a 30 | #' character vector of tokens. 
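#'
#'   For instance (an illustrative sketch, not part of the original
#'   documentation):
#'   \preformatted{
#'   tokenize_words("How many roads")
#'   # list(c("how", "many", "roads"))
#'   tokenize_words("How many roads", simplify = TRUE)
#'   # c("how", "many", "roads")
#'   }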
31 | #' @importFrom stringi stri_split_boundaries stri_trans_tolower stri_trim_both 32 | #' stri_replace_all_charclass stri_split_fixed stri_split_lines 33 | #' stri_split_regex stri_subset_charclass 34 | #' @examples 35 | #' song <- paste0("How many roads must a man walk down\n", 36 | #' "Before you call him a man?\n", 37 | #' "How many seas must a white dove sail\n", 38 | #' "Before she sleeps in the sand?\n", 39 | #' "\n", 40 | #' "How many times must the cannonballs fly\n", 41 | #' "Before they're forever banned?\n", 42 | #' "The answer, my friend, is blowin' in the wind.\n", 43 | #' "The answer is blowin' in the wind.\n") 44 | #' 45 | #' tokenize_words(song) 46 | #' tokenize_words(song, strip_punct = FALSE) 47 | #' tokenize_sentences(song) 48 | #' tokenize_paragraphs(song) 49 | #' tokenize_lines(song) 50 | #' tokenize_characters(song) 51 | NULL 52 | 53 | #' @export 54 | #' @rdname basic-tokenizers 55 | tokenize_characters <- 56 | function(x, 57 | lowercase = TRUE, 58 | strip_non_alphanum = TRUE, 59 | simplify = FALSE) { 60 | UseMethod("tokenize_characters") 61 | } 62 | 63 | #' @export 64 | tokenize_characters.data.frame <- function(x, 65 | lowercase = TRUE, 66 | strip_non_alphanum = TRUE, 67 | simplify = FALSE) { 68 | x <- corpus_df_as_corpus_vector(x) 69 | tokenize_characters(x, lowercase, strip_non_alphanum, simplify) 70 | } 71 | 72 | #' @export 73 | tokenize_characters.default <- function(x, 74 | lowercase = TRUE, 75 | strip_non_alphanum = TRUE, 76 | simplify = FALSE) { 77 | check_input(x) 78 | named <- names(x) 79 | if (lowercase) 80 | x <- stri_trans_tolower(x) 81 | if (strip_non_alphanum) 82 | x <- 83 | stri_replace_all_charclass(x, "[[:punct:][:whitespace:]]", "") 84 | out <- stri_split_boundaries(x, type = "character") 85 | if (!is.null(named)) 86 | names(out) <- named 87 | simplify_list(out, simplify) 88 | } 89 | 90 | #' @export 91 | #' @rdname basic-tokenizers 92 | tokenize_words <- function(x, lowercase = TRUE, stopwords = NULL, 93 | strip_punct = TRUE, strip_numeric = FALSE, 94 | simplify = FALSE) { 95 | UseMethod("tokenize_words") 96 | } 97 | 98 | #' @export 99 | tokenize_words.data.frame <- function(x, 100 | lowercase = TRUE, 101 | stopwords = NULL, 102 | strip_punct = TRUE, 103 | strip_numeric = FALSE, 104 | simplify = FALSE) { 105 | x <- corpus_df_as_corpus_vector(x) 106 | tokenize_words(x, lowercase, stopwords, strip_punct, strip_numeric, simplify) 107 | } 108 | 109 | #' @export 110 | tokenize_words.default <- function(x, lowercase = TRUE, stopwords = NULL, 111 | strip_punct = TRUE, strip_numeric = FALSE, 112 | simplify = FALSE) { 113 | check_input(x) 114 | named <- names(x) 115 | if (lowercase) x <- stri_trans_tolower(x) 116 | out <- stri_split_boundaries(x, type = "word", 117 | skip_word_none = strip_punct, 118 | skip_word_number = strip_numeric) 119 | if (!strip_punct) { 120 | out <- lapply(out, stri_subset_charclass, "\\p{WHITESPACE}", negate = TRUE) 121 | } 122 | if (!is.null(named)) names(out) <- named 123 | if (!is.null(stopwords)) out <- lapply(out, remove_stopwords, stopwords) 124 | simplify_list(out, simplify) 125 | } 126 | 127 | #' @export 128 | #' @rdname basic-tokenizers 129 | tokenize_sentences <- 130 | function(x, 131 | lowercase = FALSE, 132 | strip_punct = FALSE, 133 | simplify = FALSE) { 134 | UseMethod("tokenize_sentences") 135 | } 136 | 137 | #' @export 138 | tokenize_sentences.data.frame <- 139 | function(x, 140 | lowercase = FALSE, 141 | strip_punct = FALSE, 142 | simplify = FALSE) { 143 | x <- corpus_df_as_corpus_vector(x) 144 | 
tokenize_sentences(x, lowercase, strip_punct, simplify) 145 | } 146 | 147 | #' @export 148 | tokenize_sentences.default <- 149 | function(x, 150 | lowercase = FALSE, 151 | strip_punct = FALSE, 152 | simplify = FALSE) { 153 | check_input(x) 154 | named <- names(x) 155 | x <- stri_replace_all_charclass(x, "[[:whitespace:]]", " ") 156 | out <- 157 | stri_split_boundaries(x, type = "sentence", skip_word_none = FALSE) 158 | out <- lapply(out, stri_trim_both) 159 | if (lowercase) 160 | out <- lapply(out, stri_trans_tolower) 161 | if (strip_punct) 162 | out <- 163 | lapply(out, stri_replace_all_charclass, "[[:punct:]]", "") 164 | if (!is.null(named)) 165 | names(out) <- named 166 | simplify_list(out, simplify) 167 | } 168 | 169 | #' @export 170 | #' @rdname basic-tokenizers 171 | tokenize_lines <- function(x, simplify = FALSE) { 172 | UseMethod("tokenize_lines") 173 | } 174 | 175 | #' @export 176 | tokenize_lines.data.frame <- function(x, simplify = FALSE) { 177 | x <- corpus_df_as_corpus_vector(x) 178 | tokenize_lines(x, simplify) 179 | } 180 | 181 | #' @export 182 | tokenize_lines.default <- function(x, simplify = FALSE) { 183 | check_input(x) 184 | named <- names(x) 185 | out <- stri_split_lines(x, omit_empty = TRUE) 186 | if (!is.null(named)) 187 | names(out) <- named 188 | simplify_list(out, simplify) 189 | } 190 | 191 | #' @export 192 | #' @rdname basic-tokenizers 193 | tokenize_paragraphs <- 194 | function(x, 195 | paragraph_break = "\n\n", 196 | simplify = FALSE) { 197 | UseMethod("tokenize_paragraphs") 198 | } 199 | 200 | #' @export 201 | tokenize_paragraphs.data.frame <- 202 | function(x, 203 | paragraph_break = "\n\n", 204 | simplify = FALSE) { 205 | x <- corpus_df_as_corpus_vector(x) 206 | tokenize_paragraphs(x, paragraph_break, simplify) 207 | } 208 | 209 | #' @export 210 | tokenize_paragraphs.default <- 211 | function(x, 212 | paragraph_break = "\n\n", 213 | simplify = FALSE) { 214 | check_input(x) 215 | named <- names(x) 216 | out <- 217 | stri_split_fixed(x, pattern = paragraph_break, omit_empty = TRUE) 218 | out <- 219 | lapply(out, stri_replace_all_charclass, "[[:whitespace:]]", " ") 220 | if (!is.null(named)) 221 | names(out) <- named 222 | simplify_list(out, simplify) 223 | } 224 | 225 | #' @export 226 | #' @rdname basic-tokenizers 227 | tokenize_regex <- function(x, 228 | pattern = "\\s+", 229 | simplify = FALSE) { 230 | UseMethod("tokenize_regex") 231 | } 232 | 233 | #' @export 234 | tokenize_regex.data.frame <- 235 | function(x, 236 | pattern = "\\s+", 237 | simplify = FALSE) { 238 | x <- corpus_df_as_corpus_vector(x) 239 | tokenize_regex(x, pattern, simplify) 240 | } 241 | 242 | #' @export 243 | tokenize_regex.default <- 244 | function(x, 245 | pattern = "\\s+", 246 | simplify = FALSE) { 247 | check_input(x) 248 | named <- names(x) 249 | out <- stri_split_regex(x, pattern = pattern, omit_empty = TRUE) 250 | if (!is.null(named)) 251 | names(out) <- named 252 | simplify_list(out, simplify) 253 | } 254 | -------------------------------------------------------------------------------- /R/character-shingles-tokenizers.R: -------------------------------------------------------------------------------- 1 | #' Character shingle tokenizers 2 | #' 3 | #' The character shingle tokenizer functions like an n-gram tokenizer, except 4 | #' the units that are shingled are characters instead of words. Options to the 5 | #' function let you determine whether non-alphanumeric characters like 6 | #' punctuation should be retained or discarded. 
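#'
#' For instance (an illustrative note, not from the original documentation):
#' with the default \code{n = 3}, the input \code{"abcd"} is shingled into the
#' two tokens \code{"abc"} and \code{"bcd"}.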
7 | #' 8 | #' @param x A character vector or a list of character vectors to be tokenized 9 | #' into character shingles. If \code{x} is a character vector, it can be of 10 | #' any length, and each element will be tokenized separately. If \code{x} is a 11 | #' list of character vectors, each element of the list should have a length of 12 | #' 1. 13 | #' @param n The number of characters in each shingle. This must be an integer 14 | #' greater than or equal to 1. 15 | #' @param n_min This must be an integer greater than or equal to 1, and less 16 | #' than or equal to \code{n}. 17 | #' @param lowercase Should the characters be made lower case? 18 | #' @param strip_non_alphanum Should punctuation and white space be stripped? 19 | #' @param simplify \code{FALSE} by default so that a consistent value is 20 | #' returned regardless of length of input. If \code{TRUE}, then an input with 21 | #' a single element will return a character vector of tokens instead of a 22 | #' list. 23 | #' 24 | #' @return A list of character vectors containing the tokens, with one element 25 | #' in the list for each element that was passed as input. If \code{simplify = 26 | #' TRUE} and only a single element was passed as input, then the output is a 27 | #' character vector of tokens. 28 | #' 29 | #' @examples 30 | #' x <- c("Now is the hour of our discontent") 31 | #' tokenize_character_shingles(x) 32 | #' tokenize_character_shingles(x, n = 5) 33 | #' tokenize_character_shingles(x, n = 5, strip_non_alphanum = FALSE) 34 | #' tokenize_character_shingles(x, n = 5, n_min = 3, strip_non_alphanum = FALSE) 35 | #' 36 | #' @export 37 | #' @rdname shingle-tokenizers 38 | tokenize_character_shingles <- function(x, 39 | n = 3L, 40 | n_min = n, 41 | lowercase = TRUE, 42 | strip_non_alphanum = TRUE, 43 | simplify = FALSE) { 44 | UseMethod("tokenize_character_shingles") 45 | } 46 | 47 | #' @export 48 | tokenize_character_shingles.data.frame <- 49 | function(x, 50 | n = 3L, 51 | n_min = n, 52 | lowercase = TRUE, 53 | strip_non_alphanum = TRUE, 54 | simplify = FALSE) { 55 | x <- corpus_df_as_corpus_vector(x) 56 | tokenize_character_shingles(x, n, n_min, lowercase, strip_non_alphanum, simplify) 57 | } 58 | 59 | #' @export 60 | tokenize_character_shingles.default <- 61 | function(x, 62 | n = 3L, 63 | n_min = n, 64 | lowercase = TRUE, 65 | strip_non_alphanum = TRUE, 66 | simplify = FALSE) { 67 | check_input(x) 68 | named <- names(x) 69 | if (n < n_min || n_min <= 0) 70 | stop("n and n_min must be integers, and n_min must be less than ", 71 | "n and greater than 1.") 72 | chars <- tokenize_characters(x, lowercase = lowercase, 73 | strip_non_alphanum = strip_non_alphanum) 74 | out <- 75 | generate_ngrams_batch( 76 | chars, 77 | ngram_min = n_min, 78 | ngram_max = n, 79 | stopwords = "", 80 | ngram_delim = "" 81 | ) 82 | if (!is.null(named)) 83 | names(out) <- named 84 | simplify_list(out, simplify) 85 | } 86 | -------------------------------------------------------------------------------- /R/chunk-text.R: -------------------------------------------------------------------------------- 1 | #' Chunk text into smaller segments 2 | #' 3 | #' Given a text or vector/list of texts, break the texts into smaller segments 4 | #' each with the same number of words. This allows you to treat a very long 5 | #' document, such as a novel, as a set of smaller documents. 
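#'
#' For a quick illustration (a hedged sketch, not part of the original
#' documentation):
#' \preformatted{
#' chunk_text("one two three four five six", chunk_size = 2)
#' # a list of three chunks: "one two", "three four", "five six"
#' }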
6 | #' 7 | #' @details Chunking the text passes it through \code{\link{tokenize_words}}, 8 | #' which will strip punctuation and lowercase the text unless you provide 9 | #' arguments to pass along to that function. 10 | #' 11 | #' @param x A character vector or a list of character vectors to be tokenized 12 | #' into n-grams. If \code{x} is a character vector, it can be of any length, 13 | #' and each element will be chunked separately. If \code{x} is a list of 14 | #' character vectors, each element of the list should have a length of 1. 15 | #' @param chunk_size The number of words in each chunk. 16 | #' @param doc_id The document IDs as a character vector. This will be taken from 17 | #' the names of the \code{x} vector if available. \code{NULL} is acceptable. 18 | #' @param ... Arguments passed on to \code{\link{tokenize_words}}. 19 | #' @examples 20 | #' \dontrun{ 21 | #' chunked <- chunk_text(mobydick, chunk_size = 100) 22 | #' length(chunked) 23 | #' chunked[1:3] 24 | #' } 25 | #' @export 26 | chunk_text <- function(x, chunk_size = 100, doc_id = names(x), ...) { 27 | check_input(x) 28 | stopifnot(chunk_size > 1) 29 | if (is.character(x) && length(x) == 1) { 30 | out <- chunk_individual_text(x = x, chunk_size = chunk_size, 31 | doc_id = doc_id, ...) 32 | } else { 33 | out <- lapply(seq_along(x), function(i) { 34 | chunk_individual_text(x = x[[i]], chunk_size = chunk_size, 35 | doc_id = doc_id[[i]], ...) 36 | }) 37 | out <- unlist(out, recursive = FALSE, use.names = TRUE) 38 | } 39 | out 40 | } 41 | 42 | chunk_individual_text <- function(x, chunk_size, doc_id, ...) { 43 | 44 | stopifnot(is.character(x), 45 | length(x) == 1) 46 | words <- tokenize_words(x, simplify = TRUE, ...) 47 | 48 | if (length(words) <= chunk_size) { 49 | chunks <- x 50 | } 51 | 52 | chunks <- split(words, ceiling(seq_along(words) / chunk_size)) 53 | 54 | if (!is.null(doc_id)) { 55 | num_chars <- stringi::stri_length(length(chunks)) 56 | chunk_ids <- stringi::stri_pad_left(seq(length(chunks)), 57 | width = num_chars, pad = "0") 58 | names(chunks) <- stringi::stri_c(doc_id, chunk_ids, sep = "-") 59 | } else { 60 | names(chunks) <- NULL 61 | } 62 | 63 | out <- lapply(chunks, stringi::stri_c, collapse = " ") 64 | 65 | out 66 | 67 | } 68 | -------------------------------------------------------------------------------- /R/coercion.R: -------------------------------------------------------------------------------- 1 | is_corpus_df <- function(corpus) { 2 | stopifnot(inherits(corpus, "data.frame"), 3 | ncol(corpus) >= 2, 4 | all(names(corpus)[1L:2L] == c("doc_id", "text")), 5 | is.character(corpus$doc_id), 6 | is.character(corpus$doc_id), 7 | nrow(corpus) > 0) 8 | TRUE # if it doesn't fail from the tests above then it fits the standard 9 | } 10 | 11 | corpus_df_as_corpus_vector <- function(corpus) { 12 | if (is_corpus_df(corpus)) { 13 | out <- corpus$text 14 | names(out) <- corpus$doc_id 15 | } else { 16 | stop("Not a corpus data.frame") 17 | } 18 | out 19 | } 20 | -------------------------------------------------------------------------------- /R/data-docs.R: -------------------------------------------------------------------------------- 1 | #' The text of Moby Dick 2 | #' 3 | #' The text of Moby Dick, by Herman Melville, taken from Project Gutenberg. 4 | #' 5 | #' @format A named character vector with length 1. 
6 | #' @source \url{http://www.gutenberg.org/} 7 | "mobydick" 8 | -------------------------------------------------------------------------------- /R/ngram-tokenizers.R: -------------------------------------------------------------------------------- 1 | #' N-gram tokenizers 2 | #' 3 | #' These functions tokenize their inputs into different kinds of n-grams. The 4 | #' input can be a character vector of any length, or a list of character vectors 5 | #' where each character vector in the list has a length of 1. See details for an 6 | #' explanation of what each function does. 7 | #' 8 | #' @details 9 | #' 10 | #' \describe{ \item{\code{tokenize_ngrams}:}{ Basic shingled n-grams. A 11 | #' contiguous subsequence of \code{n} words. This will compute shingled n-grams 12 | #' for every value of between \code{n_min} (which must be at least 1) and 13 | #' \code{n}. } \item{\code{tokenize_skip_ngrams}:}{Skip n-grams. A subsequence 14 | #' of \code{n} words which are at most a gap of \code{k} words between them. The 15 | #' skip n-grams will be calculated for all values from \code{0} to \code{k}. } } 16 | #' 17 | #' These functions will strip all punctuation and normalize all whitespace to a 18 | #' single space character. 19 | #' 20 | #' @param x A character vector or a list of character vectors to be tokenized 21 | #' into n-grams. If \code{x} is a character vector, it can be of any length, 22 | #' and each element will be tokenized separately. If \code{x} is a list of 23 | #' character vectors, each element of the list should have a length of 1. 24 | #' @param n The number of words in the n-gram. This must be an integer greater 25 | #' than or equal to 1. 26 | #' @param n_min The minimum number of words in the n-gram. This must be an 27 | #' integer greater than or equal to 1, and less than or equal to \code{n}. 28 | #' @param k For the skip n-gram tokenizer, the maximum skip distance between 29 | #' words. The function will compute all skip n-grams between \code{0} and 30 | #' \code{k}. 31 | #' @param lowercase Should the tokens be made lower case? 32 | #' @param stopwords A character vector of stop words to be excluded from the 33 | #' n-grams. 34 | #' @param ngram_delim The separator between words in an n-gram. 35 | #' @param simplify \code{FALSE} by default so that a consistent value is 36 | #' returned regardless of length of input. If \code{TRUE}, then an input with 37 | #' a single element will return a character vector of tokens instead of a 38 | #' list. 39 | #' 40 | #' @return A list of character vectors containing the tokens, with one element 41 | #' in the list for each element that was passed as input. If \code{simplify = 42 | #' TRUE} and only a single element was passed as input, then the output is a 43 | #' character vector of tokens. 
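#'
#'   As a concrete illustration (not part of the original documentation): for
#'   an input of \code{"one two three four"}, \code{tokenize_ngrams(x, n = 2)}
#'   produces \code{"one two"}, \code{"two three"}, and \code{"three four"};
#'   setting \code{n_min = 1} additionally yields the four unigrams.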
44 | #' 45 | #' @examples 46 | #' song <- paste0("How many roads must a man walk down\n", 47 | #' "Before you call him a man?\n", 48 | #' "How many seas must a white dove sail\n", 49 | #' "Before she sleeps in the sand?\n", 50 | #' "\n", 51 | #' "How many times must the cannonballs fly\n", 52 | #' "Before they're forever banned?\n", 53 | #' "The answer, my friend, is blowin' in the wind.\n", 54 | #' "The answer is blowin' in the wind.\n") 55 | #' 56 | #' tokenize_ngrams(song, n = 4) 57 | #' tokenize_ngrams(song, n = 4, n_min = 1) 58 | #' tokenize_skip_ngrams(song, n = 4, k = 2) 59 | #' @name ngram-tokenizers 60 | 61 | #' @export 62 | #' @rdname ngram-tokenizers 63 | tokenize_ngrams <- function(x, 64 | lowercase = TRUE, 65 | n = 3L, 66 | n_min = n, 67 | stopwords = character(), 68 | ngram_delim = " ", 69 | simplify = FALSE) { 70 | UseMethod("tokenize_ngrams") 71 | } 72 | 73 | #' @export 74 | tokenize_ngrams.data.frame <- 75 | function(x, 76 | lowercase = TRUE, 77 | n = 3L, 78 | n_min = n, 79 | stopwords = character(), 80 | ngram_delim = " ", 81 | simplify = FALSE) { 82 | x <- corpus_df_as_corpus_vector(x) 83 | tokenize_ngrams(x, lowercase, n, n_min, stopwords, ngram_delim, simplify) 84 | } 85 | 86 | #' @export 87 | tokenize_ngrams.default <- 88 | function(x, 89 | lowercase = TRUE, 90 | n = 3L, 91 | n_min = n, 92 | stopwords = character(), 93 | ngram_delim = " ", 94 | simplify = FALSE) { 95 | check_input(x) 96 | named <- names(x) 97 | if (n < n_min || n_min <= 0) 98 | stop("n and n_min must be integers, and n_min must be less than ", 99 | "n and greater than 1.") 100 | words <- tokenize_words(x, lowercase = lowercase) 101 | out <- 102 | generate_ngrams_batch( 103 | words, 104 | ngram_min = n_min, 105 | ngram_max = n, 106 | stopwords = stopwords, 107 | ngram_delim = ngram_delim 108 | ) 109 | if (!is.null(named)) 110 | names(out) <- named 111 | simplify_list(out, simplify) 112 | } 113 | 114 | # Check the skip distance between words, and return FALSE if the skip is bigger 115 | # than k 116 | check_width <- function(v, k) { 117 | v_lead <- c(v[2:length(v)], NA_integer_) 118 | all(v_lead - v - 1 <= k, na.rm = TRUE) 119 | } 120 | 121 | get_valid_skips <- function(n, k) { 122 | max_dist <- k * (n - 1) + (n - 1) 123 | total_combinations <- choose(max_dist, n - 1) 124 | if (total_combinations > 5e3) { 125 | warning("Input n and k will produce a very large number of skip n-grams") 126 | } 127 | 128 | # Generate all possible combinations up to the maximum distance 129 | positions <- utils::combn(max_dist, n - 1, simplify = FALSE) 130 | 131 | # Prepend 0 to represent position of starting word. Use 0 indexed vectors 132 | # because these vectors go to Rcpp. 
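  # For example (an illustrative note, not in the original source): with n = 3
  # and k = 1, max_dist is 4, so after prepending 0 the candidate position
  # vectors are c(0,1,2), c(0,1,3), c(0,1,4), c(0,2,3), c(0,2,4), c(0,3,4);
  # the check_width() filter below then drops c(0,1,4) and c(0,3,4) because
  # they contain a gap of more than k words.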
133 | positions <- lapply(positions, function(v) { c(0, v) }) 134 | 135 | # Keep only the combination of positions with the correct skip between words 136 | keepers <- vapply(positions, check_width, logical(1), k) 137 | positions[keepers] 138 | } 139 | 140 | #' @export 141 | #' @rdname ngram-tokenizers 142 | tokenize_skip_ngrams <- 143 | function(x, 144 | lowercase = TRUE, 145 | n_min = 1, 146 | n = 3, 147 | k = 1, 148 | stopwords = character(), 149 | simplify = FALSE) { 150 | UseMethod("tokenize_skip_ngrams") 151 | } 152 | 153 | #' @export 154 | tokenize_skip_ngrams.data.frame <- 155 | function(x, 156 | lowercase = TRUE, 157 | n_min = 1, 158 | n = 3, 159 | k = 1, 160 | stopwords = character(), 161 | simplify = FALSE) { 162 | x <- corpus_df_as_corpus_vector(x) 163 | tokenize_skip_ngrams(x, lowercase, n_min, n, k, stopwords, simplify) 164 | 165 | } 166 | 167 | #' @export 168 | tokenize_skip_ngrams.default <- 169 | function(x, 170 | lowercase = TRUE, 171 | n_min = 1, 172 | n = 3, 173 | k = 1, 174 | stopwords = character(), 175 | simplify = FALSE) { 176 | check_input(x) 177 | named <- names(x) 178 | words <- tokenize_words(x, lowercase = lowercase) 179 | skips <- unique(unlist( 180 | lapply(n_min:n, get_valid_skips, k), 181 | recursive = FALSE, 182 | use.names = FALSE 183 | )) 184 | out <- skip_ngrams_vectorised(words, skips, stopwords) 185 | if (!is.null(named)) 186 | names(out) <- named 187 | simplify_list(out, simplify) 188 | } 189 | -------------------------------------------------------------------------------- /R/ptb-tokenizer.R: -------------------------------------------------------------------------------- 1 | #' Penn Treebank Tokenizer 2 | #' 3 | #' This function implements the Penn Treebank word tokenizer. 4 | #' 5 | #' @details This tokenizer uses regular expressions to tokenize text similar to 6 | #' the tokenization used in the Penn Treebank. It assumes that text has 7 | #' already been split into sentences. The tokenizer does the following: 8 | #' 9 | #' \itemize{ \item{splits common English contractions, e.g. \verb{don't} is 10 | #' tokenized into \verb{do n't} and \verb{they'll} is tokenized into -> 11 | #' \verb{they 'll},} \item{handles punctuation characters as separate tokens,} 12 | #' \item{splits commas and single quotes off from words, when they are 13 | #' followed by whitespace,} \item{splits off periods that occur at the end of 14 | #' the sentence.} } 15 | #' @details This function is a port of the Python NLTK version of the Penn 16 | #' Treebank Tokenizer. 17 | #' @param x A character vector or a list of character vectors to be tokenized 18 | #' into n-grams. If \code{x} is a character vector, it can be of any length, 19 | #' and each element will be tokenized separately. If \code{x} is a list of 20 | #' character vectors, each element of the list should have a length of 1. 21 | #' @param lowercase Should the tokens be made lower case? 22 | #' @param simplify \code{FALSE} by default so that a consistent value is 23 | #' returned regardless of length of input. If \code{TRUE}, then an input with 24 | #' a single element will return a character vector of tokens instead of a 25 | #' list. 26 | #' @return A list of character vectors containing the tokens, with one element 27 | #' in the list for each element that was passed as input. If \code{simplify = 28 | #' TRUE} and only a single element was passed as input, then the output is a 29 | #' character vector of tokens. 
30 | #' @references 31 | #' \href{https://www.nltk.org/_modules/nltk/tokenize/treebank.html#TreebankWordTokenizer}{NLTK 32 | #' TreebankWordTokenizer} 33 | #' @importFrom stringi stri_c stri_replace_all_regex stri_trim_both 34 | #' stri_split_regex stri_opts_regex 35 | #' @importFrom stringi stri_trans_tolower 36 | #' @examples 37 | #' song <- list(paste0("How many roads must a man walk down\n", 38 | #' "Before you call him a man?"), 39 | #' paste0("How many seas must a white dove sail\n", 40 | #' "Before she sleeps in the sand?\n"), 41 | #' paste0("How many times must the cannonballs fly\n", 42 | #' "Before they're forever banned?\n"), 43 | #' "The answer, my friend, is blowin' in the wind.", 44 | #' "The answer is blowin' in the wind.") 45 | #' tokenize_ptb(song) 46 | #' tokenize_ptb(c("Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.", 47 | #' "They'll save and invest more.", 48 | #' "Hi, I can't say hello.")) 49 | #' @export 50 | #' @rdname ptb-tokenizer 51 | tokenize_ptb <- function(x, 52 | lowercase = FALSE, 53 | simplify = FALSE) { 54 | UseMethod("tokenize_ptb") 55 | } 56 | 57 | #' @export 58 | tokenize_ptb.data.frame <- 59 | function(x, 60 | lowercase = FALSE, 61 | simplify = FALSE) { 62 | x <- corpus_df_as_corpus_vector(x) 63 | tokenize_ptb(x, lowercase, simplify) 64 | } 65 | 66 | #' @export 67 | tokenize_ptb.default <- 68 | function(x, 69 | lowercase = FALSE, 70 | simplify = FALSE) { 71 | check_input(x) 72 | named <- names(x) 73 | 74 | CONTRACTIONS2 <- 75 | c( 76 | "\\b(can)(not)\\b", 77 | "\\b(d)('ye)\\b", 78 | "\\b(gon)(na)\\b", 79 | "\\b(got)(ta)\\b", 80 | "\\b(lem)(me)\\b", 81 | "\\b(mor)('n)\\b", 82 | "\\b(wan)(na) " 83 | ) 84 | CONTRACTIONS3 <- c(" ('t)(is)\\b", " ('t)(was)\\b") 85 | 86 | CONTRACTIONS4 <- c("\\b(whad)(dd)(ya)\\b", "\\b(wha)(t)(cha)\\b") 87 | 88 | # Starting quotes 89 | x <- stri_replace_all_regex(x, '^\\"', '``') 90 | x <- stri_replace_all_regex(x, '(``)', '$1') 91 | x <- stri_replace_all_regex(x, '([ (\\[{<])"', '$1 `` ') 92 | 93 | # Punctuation 94 | x <- stri_replace_all_regex(x, '([:,])([^\\d])', ' $1 $2') 95 | x <- stri_replace_all_regex(x, '\\.{3}', ' ... 
') 96 | x <- stri_replace_all_regex(x, '([,;@#$%&])', ' $1 ') 97 | x <- stri_replace_all_regex(x, 98 | '([^\\.])(\\.)([\\]\\)}>"\\\']*)?\\s*$', 99 | '$1 $2$3 ') 100 | x <- stri_replace_all_regex(x, '([?!])', ' $1 ') 101 | 102 | x <- stri_replace_all_regex(x, "([^'])' ", "$1 ' ") 103 | 104 | # parens, brackets, etc 105 | x <- 106 | stri_replace_all_regex(x, '([\\]\\[\\(\\)\\{\\}\\<\\>])', ' $1 ') 107 | x <- stri_replace_all_regex(x, '--', ' -- ') 108 | 109 | # add extra space 110 | x <- stri_c(" ", x, " ") 111 | 112 | # ending quotes 113 | x <- stri_replace_all_regex(x, '"', " '' ") 114 | x <- stri_replace_all_regex(x, "(\\S)('')", "\\1 \\2 ") 115 | x <- stri_replace_all_regex(x, "([^' ])('[sS]|'[mM]|'[dD]|') ", 116 | "$1 $2 ") 117 | x <- stri_replace_all_regex(x, 118 | "([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", 119 | "$1 $2 ") 120 | 121 | x <- stri_replace_all_regex( 122 | x, 123 | CONTRACTIONS2, 124 | " $1 $2 ", 125 | opts_regex = 126 | stri_opts_regex(case_insensitive = TRUE), 127 | vectorize_all = FALSE 128 | ) 129 | x <- stri_replace_all_regex( 130 | x, 131 | CONTRACTIONS3, 132 | " $1 $2 ", 133 | opts_regex = stri_opts_regex(case_insensitive = TRUE), 134 | vectorize_all = FALSE 135 | ) 136 | x <- stri_replace_all_regex( 137 | x, 138 | CONTRACTIONS4, 139 | " $1 $2 $3 ", 140 | opts_regex = stri_opts_regex(case_insensitive = TRUE), 141 | vectorize_all = FALSE 142 | ) 143 | 144 | # return 145 | x <- stri_split_regex(stri_trim_both(x), '\\s+') 146 | 147 | if (lowercase) { 148 | x <- lapply(x, stri_trans_tolower) 149 | } 150 | 151 | if (!is.null(named)) { 152 | names(x) <- named 153 | } 154 | 155 | simplify_list(x, simplify) 156 | 157 | } 158 | -------------------------------------------------------------------------------- /R/stem-tokenizers.R: -------------------------------------------------------------------------------- 1 | #' Word stem tokenizer 2 | #' 3 | #' This function turns its input into a character vector of word stems. This is 4 | #' just a wrapper around the \code{\link[SnowballC]{wordStem}} function from the 5 | #' SnowballC package which does the heavy lifting, but this function provides a 6 | #' consistent interface with the rest of the tokenizers in this package. The 7 | #' input can be a character vector of any length, or a list of character vectors 8 | #' where each character vector in the list has a length of 1. 9 | #' 10 | #' @details This function will strip all white space and punctuation and make 11 | #' all word stems lowercase. 12 | #' @param x A character vector or a list of character vectors to be tokenized. 13 | #' If \code{x} is a character vector, it can be of any length, and each 14 | #' element will be tokenized separately. If \code{x} is a list of character 15 | #' vectors, where each element of the list should have a length of 1. 16 | #' @param language The language to use for word stemming. This must be one of 17 | #' the languages available in the SnowballC package. A list is provided by 18 | #' \code{\link[SnowballC]{getStemLanguages}}. 19 | #' @param stopwords A character vector of stop words to be excluded 20 | #' @param simplify \code{FALSE} by default so that a consistent value is 21 | #' returned regardless of length of input. If \code{TRUE}, then an input with 22 | #' a single element will return a character vector of tokens instead of a 23 | #' list. 24 | #' @return A list of character vectors containing the tokens, with one element 25 | #' in the list for each element that was passed as input. 
If \code{simplify = 26 | #' TRUE} and only a single element was passed as input, then the output is a 27 | #' character vector of tokens. 28 | #' @importFrom SnowballC wordStem getStemLanguages 29 | #' @seealso \code{\link[SnowballC]{wordStem}} 30 | #' @examples 31 | #' song <- paste0("How many roads must a man walk down\n", 32 | #' "Before you call him a man?\n", 33 | #' "How many seas must a white dove sail\n", 34 | #' "Before she sleeps in the sand?\n", 35 | #' "\n", 36 | #' "How many times must the cannonballs fly\n", 37 | #' "Before they're forever banned?\n", 38 | #' "The answer, my friend, is blowin' in the wind.\n", 39 | #' "The answer is blowin' in the wind.\n") 40 | #' 41 | #' tokenize_word_stems(song) 42 | #' @export 43 | #' @rdname stem-tokenizers 44 | tokenize_word_stems <- 45 | function(x, 46 | language = "english", 47 | stopwords = NULL, 48 | simplify = FALSE) { 49 | UseMethod("tokenize_word_stems") 50 | } 51 | 52 | #' @export 53 | tokenize_word_stems.data.frame <- 54 | function(x, 55 | language = "english", 56 | stopwords = NULL, 57 | simplify = FALSE) { 58 | x <- corpus_df_as_corpus_vector(x) 59 | tokenize_word_stems(x, language, stopwords, simplify) 60 | } 61 | 62 | #' @export 63 | tokenize_word_stems.default <- 64 | function(x, 65 | language = "english", 66 | stopwords = NULL, 67 | simplify = FALSE) { 68 | check_input(x) 69 | named <- names(x) 70 | language <- match.arg(language, getStemLanguages()) 71 | words <- 72 | tokenize_words(x, lowercase = TRUE, stopwords = stopwords) 73 | out <- lapply(words, wordStem, language = language) 74 | if (!is.null(named)) 75 | names(out) <- named 76 | simplify_list(out, simplify) 77 | } 78 | -------------------------------------------------------------------------------- /R/tokenizers-package.r: -------------------------------------------------------------------------------- 1 | #' Tokenizers 2 | #' 3 | #' A collection of functions with a consistent interface to convert natural 4 | #' language text into tokens. 5 | #' 6 | #' The tokenizers in this package have a consistent interface. They all take 7 | #' either a character vector of any length, or a list where each element is a 8 | #' character vector of length one. The idea is that each element comprises a 9 | #' text. Then each function returns a list with the same length as the input 10 | #' vector, where each element in the list are the tokens generated by the 11 | #' function. If the input character vector or list is named, then the names are 12 | #' preserved. 
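#'
#' For instance (an illustrative sketch with made-up document names, not part
#' of the original documentation):
#' \preformatted{
#' tokenize_words(c(doc1 = "Hello world.", doc2 = "A second text."))
#' # a named list of length 2, with elements $doc1 and $doc2
#' }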
13 | #' 14 | #' @name tokenizers 15 | #' @docType package 16 | NULL 17 | 18 | #' @useDynLib tokenizers, .registration = TRUE 19 | #' @importFrom Rcpp sourceCpp 20 | NULL 21 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | simplify_list <- function(x, simplify) { 2 | stopifnot(is.logical(simplify)) 3 | if (simplify && length(x) == 1) x[[1]] else x 4 | } 5 | 6 | check_input <- function(x) { 7 | check_character <- is.character(x) 8 | if (is.list(x)) { 9 | check_list <- all(vapply(x, is.character, logical(1))) & 10 | all(vapply(x, length, integer(1)) == 1L) 11 | } else { 12 | check_list <- FALSE 13 | } 14 | if (!(check_character | check_list)) 15 | stop("Input must be a character vector of any length or a list of character\n", 16 | " vectors, each of which has a length of 1.") 17 | } 18 | 19 | remove_stopwords <- function(x, stopwords) { 20 | out <- x[!x %in% stopwords] 21 | if (!length(out)) { 22 | return(NA_character_) 23 | } 24 | return(out) 25 | } 26 | -------------------------------------------------------------------------------- /R/wordcount.R: -------------------------------------------------------------------------------- 1 | #' Count words, sentences, characters 2 | #' 3 | #' Count words, sentences, and characters in input texts. These functions use 4 | #' the \code{stringi} package, so they handle the counting of Unicode strings 5 | #' (e.g., characters with diacritical marks) in a way that makes sense to people 6 | #' counting characters. 7 | #' 8 | #' @param x A character vector or a list of character vectors. If \code{x} is a 9 | #' character vector, it can be of any length, and each element will be 10 | #' tokenized separately. If \code{x} is a list of character vectors, each 11 | #' element of the list should have a length of 1. 12 | #' @return An integer vector containing the counted elements. If the input 13 | #' vector or list has names, they will be preserved.
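#'
#'   For instance (an illustrative note, not part of the original
#'   documentation): \code{count_words(c(opening = "Call me Ishmael."))}
#'   returns a named integer vector holding the value \code{3} under the
#'   name \code{opening}.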
14 | #' @rdname word-counting 15 | #' @examples 16 | #' count_words(mobydick) 17 | #' count_sentences(mobydick) 18 | #' count_characters(mobydick) 19 | #' @export 20 | count_words <- function(x) { 21 | check_input(x) 22 | named <- names(x) 23 | out <- stringi::stri_count_words(x) 24 | if (!is.null(named)) names(out) <- named 25 | out 26 | } 27 | 28 | #' @export 29 | #' @rdname word-counting 30 | count_characters <- function(x) { 31 | check_input(x) 32 | named <- names(x) 33 | out <- stringi::stri_count_boundaries(x, 34 | opts_brkiter = stringi::stri_opts_brkiter(type = "character") 35 | ) 36 | if (!is.null(named)) names(out) <- named 37 | out 38 | } 39 | 40 | #' @export 41 | #' @rdname word-counting 42 | count_sentences <- function(x) { 43 | check_input(x) 44 | named <- names(x) 45 | out <- stringi::stri_count_boundaries(x, 46 | opts_brkiter = stringi::stri_opts_brkiter(type = "sentence") 47 | ) 48 | if (!is.null(named)) names(out) <- named 49 | out 50 | } 51 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | pagetitle: "tokenizers: Fast, Consistent Tokenization of Natural Language Text" 4 | --- 5 | 6 | 7 | 8 | ```{r, echo = FALSE} 9 | knitr::opts_chunk$set( 10 | collapse = TRUE, 11 | comment = "#>", 12 | fig.path = "README-" 13 | ) 14 | ``` 15 | 16 | # tokenizers 17 | 18 | [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/tokenizers)](https://cran.r-project.org/package=tokenizers) 19 | [![DOI](http://joss.theoj.org/papers/10.21105/joss.00655/status.svg)](https://doi.org/10.21105/joss.00655) 20 | [![rOpenSci peer review](https://badges.ropensci.org/33_status.svg)](https://github.com/ropensci/software-review/issues/33) 21 | [![CRAN_Downloads](http://cranlogs.r-pkg.org/badges/grand-total/tokenizers)](https://cran.r-project.org/package=tokenizers) 22 | [![Travis-CI Build Status](https://travis-ci.org/ropensci/tokenizers.svg?branch=master)](https://travis-ci.org/ropensci/tokenizers) 23 | [![Coverage Status](https://img.shields.io/codecov/c/github/ropensci/tokenizers/master.svg)](https://codecov.io/github/ropensci/tokenizers?branch=master) 24 | 25 | ## Overview 26 | 27 | This R package offers functions with a consistent interface to convert natural language text into tokens. It includes tokenizers for shingled n-grams, skip n-grams, words, word stems, sentences, paragraphs, characters, shingled characters, lines, Penn Treebank, and regular expressions, as well as functions for counting characters, words, and sentences, and a function for splitting longer texts into separate documents, each with the same number of words. The package is built on the [stringi](https://www.gagolewski.com/software/stringi/) and [Rcpp](https://www.rcpp.org/) packages for fast yet correct tokenization in UTF-8. 28 | 29 | See the "[Introduction to the tokenizers Package](https://docs.ropensci.org/tokenizers/articles/introduction-to-tokenizers.html)" vignette for an overview of all the functions in this package. 30 | 31 | This package complies with the standards for input and output recommended by the Text Interchange Formats. The TIF initiative was created at an rOpenSci meeting in 2017, and its recommendations are available as part of the [tif package](https://github.com/ropenscilabs/tif). 
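For example, here is a minimal sketch of passing a TIF-style corpus data frame (the `doc_id` and `text` values are hypothetical):

``` r
corpus <- data.frame(
  doc_id = c("doc1", "doc2"),
  text = c("A first document.", "A second, longer document."),
  stringsAsFactors = FALSE
)
tokenize_words(corpus)
# returns a list named "doc1" and "doc2", one element of word tokens per row
```
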
See the "[The Text Interchange Formats and the tokenizers Package](https://docs.ropensci.org/tokenizers/articles/tif-and-tokenizers.html)" vignette for an explanation of how this package fits into that ecosystem. 32 | 33 | ## Suggested citation 34 | 35 | If you use this package for your research, we would appreciate a citation. 36 | 37 | ```{r} 38 | citation("tokenizers") 39 | ``` 40 | 41 | ## Examples 42 | 43 | The tokenizers in this package have a consistent interface. They all take either a character vector of any length, or a list where each element is a character vector of length one, or a data.frame that adheres to the [tif corpus format](https://github.com/ropenscilabs/tif). The idea is that each element (or row) comprises a text. Then each function returns a list with the same length as the input vector, where each element in the list contains the tokens generated by the function. If the input character vector or list is named, then the names are preserved, so that the names can serve as identifiers. For a tif-formatted data.frame, the `doc_id` field is used as the element names in the returned token list. 44 | 45 | ```{r} 46 | library(magrittr) 47 | library(tokenizers) 48 | 49 | james <- paste0( 50 | "The question thus becomes a verbal one\n", 51 | "again; and our knowledge of all these early stages of thought and feeling\n", 52 | "is in any case so conjectural and imperfect that farther discussion would\n", 53 | "not be worth while.\n", 54 | "\n", 55 | "Religion, therefore, as I now ask you arbitrarily to take it, shall mean\n", 56 | "for us _the feelings, acts, and experiences of individual men in their\n", 57 | "solitude, so far as they apprehend themselves to stand in relation to\n", 58 | "whatever they may consider the divine_. Since the relation may be either\n", 59 | "moral, physical, or ritual, it is evident that out of religion in the\n", 60 | "sense in which we take it, theologies, philosophies, and ecclesiastical\n", 61 | "organizations may secondarily grow.\n" 62 | ) 63 | names(james) <- "varieties" 64 | 65 | tokenize_characters(james)[[1]] %>% head(50) 66 | tokenize_character_shingles(james)[[1]] %>% head(20) 67 | tokenize_words(james)[[1]] %>% head(10) 68 | tokenize_word_stems(james)[[1]] %>% head(10) 69 | tokenize_sentences(james) 70 | tokenize_paragraphs(james) 71 | tokenize_ngrams(james, n = 5, n_min = 2)[[1]] %>% head(10) 72 | tokenize_skip_ngrams(james, n = 5, k = 2)[[1]] %>% head(10) 73 | tokenize_ptb(james)[[1]] %>% head(10) 74 | tokenize_lines(james)[[1]] %>% head(5) 75 | ``` 76 | 77 | The package also contains functions to count words, characters, and sentences, and these functions follow the same consistent interface. 78 | 79 | ```{r} 80 | count_words(james) 81 | count_characters(james) 82 | count_sentences(james) 83 | ``` 84 | 85 | The `chunk_text()` function splits a document into smaller chunks, each with the same number of words. 86 | 87 | ## Contributing 88 | 89 | Contributions to the package are more than welcome. One way that you can help is by using this package in your R package for natural language processing. If you want to contribute a tokenization function to this package, it should follow the same conventions as the rest of the functions whenever it makes sense to do so. 90 | 91 | Please note that this project is released with a Contributor Code of Conduct. By participating in this project you agree to abide by its terms. 
92 | 93 | ------------------------------------------------------------------------ 94 | 95 | [![rOpenSCi logo](https://ropensci.org/public_images/github_footer.png)](https://ropensci.org) 96 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # tokenizers 5 | 6 | [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/tokenizers)](https://cran.r-project.org/package=tokenizers) 7 | [![DOI](http://joss.theoj.org/papers/10.21105/joss.00655/status.svg)](https://doi.org/10.21105/joss.00655) 8 | [![rOpenSci peer 9 | review](https://badges.ropensci.org/33_status.svg)](https://github.com/ropensci/software-review/issues/33) 10 | [![CRAN_Downloads](http://cranlogs.r-pkg.org/badges/grand-total/tokenizers)](https://cran.r-project.org/package=tokenizers) 11 | [![Travis-CI Build 12 | Status](https://travis-ci.org/ropensci/tokenizers.svg?branch=master)](https://travis-ci.org/ropensci/tokenizers) 13 | [![Coverage 14 | Status](https://img.shields.io/codecov/c/github/ropensci/tokenizers/master.svg)](https://codecov.io/github/ropensci/tokenizers?branch=master) 15 | 16 | ## Overview 17 | 18 | This R package offers functions with a consistent interface to convert 19 | natural language text into tokens. It includes tokenizers for shingled 20 | n-grams, skip n-grams, words, word stems, sentences, paragraphs, 21 | characters, shingled characters, lines, Penn Treebank, and regular 22 | expressions, as well as functions for counting characters, words, and 23 | sentences, and a function for splitting longer texts into separate 24 | documents, each with the same number of words. The package is built on 25 | the [stringi](https://www.gagolewski.com/software/stringi/) and 26 | [Rcpp](https://www.rcpp.org/) packages for fast yet correct tokenization 27 | in UTF-8. 28 | 29 | See the “[Introduction to the tokenizers 30 | Package](https://docs.ropensci.org/tokenizers/articles/introduction-to-tokenizers.html)” 31 | vignette for an overview of all the functions in this package. 32 | 33 | This package complies with the standards for input and output 34 | recommended by the Text Interchange Formats. The TIF initiative was 35 | created at an rOpenSci meeting in 2017, and its recommendations are 36 | available as part of the [tif 37 | package](https://github.com/ropenscilabs/tif). See the “[The Text 38 | Interchange Formats and the tokenizers 39 | Package](https://docs.ropensci.org/tokenizers/articles/tif-and-tokenizers.html)” 40 | vignette for an explanation of how this package fits into that 41 | ecosystem. 42 | 43 | ## Suggested citation 44 | 45 | If you use this package for your research, we would appreciate a 46 | citation. 47 | 48 | ``` r 49 | citation("tokenizers") 50 | #> 51 | #> To cite the tokenizers package in publications, please cite the paper 52 | #> in the Journal of Open Source Software: 53 | #> 54 | #> Lincoln A. Mullen et al., "Fast, Consistent Tokenization of Natural 55 | #> Language Text," Journal of Open Source Software 3, no. 23 (2018): 56 | #> 655, https://doi.org/10.21105/joss.00655. 57 | #> 58 | #> A BibTeX entry for LaTeX users is 59 | #> 60 | #> @Article{, 61 | #> title = {Fast, Consistent Tokenization of Natural Language Text}, 62 | #> author = {Lincoln A. 
Mullen and Kenneth Benoit and Os Keyes and Dmitry Selivanov and Jeffrey Arnold}, 63 | #> journal = {Journal of Open Source Software}, 64 | #> year = {2018}, 65 | #> volume = {3}, 66 | #> issue = {23}, 67 | #> pages = {655}, 68 | #> url = {https://doi.org/10.21105/joss.00655}, 69 | #> doi = {10.21105/joss.00655}, 70 | #> } 71 | ``` 72 | 73 | ## Examples 74 | 75 | The tokenizers in this package have a consistent interface. They all 76 | take either a character vector of any length, or a list where each 77 | element is a character vector of length one, or a data.frame that 78 | adheres to the [tif corpus format](https://github.com/ropenscilabs/tif). 79 | The idea is that each element (or row) comprises a text. Then each 80 | function returns a list with the same length as the input vector, where 81 | each element in the list contains the tokens generated by the function. 82 | If the input character vector or list is named, then the names are 83 | preserved, so that the names can serve as identifiers. For a 84 | tif-formatted data.frame, the `doc_id` field is used as the element 85 | names in the returned token list. 86 | 87 | ``` r 88 | library(magrittr) 89 | library(tokenizers) 90 | 91 | james <- paste0( 92 | "The question thus becomes a verbal one\n", 93 | "again; and our knowledge of all these early stages of thought and feeling\n", 94 | "is in any case so conjectural and imperfect that farther discussion would\n", 95 | "not be worth while.\n", 96 | "\n", 97 | "Religion, therefore, as I now ask you arbitrarily to take it, shall mean\n", 98 | "for us _the feelings, acts, and experiences of individual men in their\n", 99 | "solitude, so far as they apprehend themselves to stand in relation to\n", 100 | "whatever they may consider the divine_. Since the relation may be either\n", 101 | "moral, physical, or ritual, it is evident that out of religion in the\n", 102 | "sense in which we take it, theologies, philosophies, and ecclesiastical\n", 103 | "organizations may secondarily grow.\n" 104 | ) 105 | names(james) <- "varieties" 106 | 107 | tokenize_characters(james)[[1]] %>% head(50) 108 | #> [1] "t" "h" "e" "q" "u" "e" "s" "t" "i" "o" "n" "t" "h" "u" "s" "b" "e" "c" "o" 109 | #> [20] "m" "e" "s" "a" "v" "e" "r" "b" "a" "l" "o" "n" "e" "a" "g" "a" "i" "n" "a" 110 | #> [39] "n" "d" "o" "u" "r" "k" "n" "o" "w" "l" "e" "d" 111 | tokenize_character_shingles(james)[[1]] %>% head(20) 112 | #> [1] "the" "heq" "equ" "que" "ues" "est" "sti" "tio" "ion" "ont" "nth" "thu" 113 | #> [13] "hus" "usb" "sbe" "bec" "eco" "com" "ome" "mes" 114 | tokenize_words(james)[[1]] %>% head(10) 115 | #> [1] "the" "question" "thus" "becomes" "a" "verbal" 116 | #> [7] "one" "again" "and" "our" 117 | tokenize_word_stems(james)[[1]] %>% head(10) 118 | #> [1] "the" "question" "thus" "becom" "a" "verbal" 119 | #> [7] "one" "again" "and" "our" 120 | tokenize_sentences(james) 121 | #> $varieties 122 | #> [1] "The question thus becomes a verbal one again; and our knowledge of all these early stages of thought and feeling is in any case so conjectural and imperfect that farther discussion would not be worth while." 123 | #> [2] "Religion, therefore, as I now ask you arbitrarily to take it, shall mean for us _the feelings, acts, and experiences of individual men in their solitude, so far as they apprehend themselves to stand in relation to whatever they may consider the divine_." 
124 | #> [3] "Since the relation may be either moral, physical, or ritual, it is evident that out of religion in the sense in which we take it, theologies, philosophies, and ecclesiastical organizations may secondarily grow." 125 | tokenize_paragraphs(james) 126 | #> $varieties 127 | #> [1] "The question thus becomes a verbal one again; and our knowledge of all these early stages of thought and feeling is in any case so conjectural and imperfect that farther discussion would not be worth while." 128 | #> [2] "Religion, therefore, as I now ask you arbitrarily to take it, shall mean for us _the feelings, acts, and experiences of individual men in their solitude, so far as they apprehend themselves to stand in relation to whatever they may consider the divine_. Since the relation may be either moral, physical, or ritual, it is evident that out of religion in the sense in which we take it, theologies, philosophies, and ecclesiastical organizations may secondarily grow. " 129 | tokenize_ngrams(james, n = 5, n_min = 2)[[1]] %>% head(10) 130 | #> [1] "the question" "the question thus" 131 | #> [3] "the question thus becomes" "the question thus becomes a" 132 | #> [5] "question thus" "question thus becomes" 133 | #> [7] "question thus becomes a" "question thus becomes a verbal" 134 | #> [9] "thus becomes" "thus becomes a" 135 | tokenize_skip_ngrams(james, n = 5, k = 2)[[1]] %>% head(10) 136 | #> [1] "the" "the question" "the thus" 137 | #> [4] "the becomes" "the question thus" "the question becomes" 138 | #> [7] "the question a" "the thus becomes" "the thus a" 139 | #> [10] "the thus verbal" 140 | tokenize_ptb(james)[[1]] %>% head(10) 141 | #> [1] "The" "question" "thus" "becomes" "a" "verbal" 142 | #> [7] "one" "again" ";" "and" 143 | tokenize_lines(james)[[1]] %>% head(5) 144 | #> [1] "The question thus becomes a verbal one" 145 | #> [2] "again; and our knowledge of all these early stages of thought and feeling" 146 | #> [3] "is in any case so conjectural and imperfect that farther discussion would" 147 | #> [4] "not be worth while." 148 | #> [5] "Religion, therefore, as I now ask you arbitrarily to take it, shall mean" 149 | ``` 150 | 151 | The package also contains functions to count words, characters, and 152 | sentences, and these functions follow the same consistent interface. 153 | 154 | ``` r 155 | count_words(james) 156 | #> varieties 157 | #> 112 158 | count_characters(james) 159 | #> varieties 160 | #> 673 161 | count_sentences(james) 162 | #> varieties 163 | #> 13 164 | ``` 165 | 166 | The `chunk_text()` function splits a document into smaller chunks, each 167 | with the same number of words. 168 | 169 | ## Contributing 170 | 171 | Contributions to the package are more than welcome. One way that you can 172 | help is by using this package in your R package for natural language 173 | processing. If you want to contribute a tokenization function to this 174 | package, it should follow the same conventions as the rest of the 175 | functions whenever it makes sense to do so. 176 | 177 | Please note that this project is released with a Contributor Code of 178 | Conduct. By participating in this project you agree to abide by its 179 | terms. 
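To make the chunking behaviour above concrete, here is a minimal sketch using the bundled `mobydick` text (the exact number of chunks depends on the word count, so output is not shown):

``` r
library(tokenizers)

chunked <- chunk_text(mobydick, chunk_size = 100)
length(chunked)           # how many ~100-word chunks the novel was split into
count_words(chunked[1:3]) # each of these should contain about 100 words
```

Each chunk is a single string, so the chunks can be passed straight back into any of the tokenizers or the counting functions.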
180 | 181 | ------------------------------------------------------------------------ 182 | 183 | [![rOpenSCi 184 | logo](https://ropensci.org/public_images/github_footer.png)](https://ropensci.org) 185 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://lincolnmullen.com/software/tokenizers 2 | 3 | template: 4 | params: 5 | bootswatch: united 6 | ganalytics: "UA-25121492-1" 7 | 8 | authors: 9 | Lincoln Mullen: 10 | href: "https://lincolnmullen.com" 11 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # DO NOT CHANGE the "init" and "install" sections below 2 | 3 | # Download script file from GitHub 4 | init: 5 | ps: | 6 | $ErrorActionPreference = "Stop" 7 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" 8 | Import-Module '..\appveyor-tool.ps1' 9 | 10 | install: 11 | ps: Bootstrap 12 | 13 | # Adapt as necessary starting from here 14 | 15 | build_script: 16 | - travis-tool.sh install_deps 17 | 18 | test_script: 19 | - travis-tool.sh run_tests 20 | 21 | on_failure: 22 | - 7z a failure.zip *.Rcheck\* 23 | - appveyor PushArtifact failure.zip 24 | 25 | artifacts: 26 | - path: '*.Rcheck\**\*.log' 27 | name: Logs 28 | 29 | - path: '*.Rcheck\**\*.out' 30 | name: Logs 31 | 32 | - path: '*.Rcheck\**\*.fail' 33 | name: Logs 34 | 35 | - path: '*.Rcheck\**\*.Rout' 36 | name: Logs 37 | 38 | - path: '\*_*.tar.gz' 39 | name: Bits 40 | 41 | - path: '\*_*.zip' 42 | name: Bits 43 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | This is a breaking upgrade to address failing tests noted by CRAN maintainers. 2 | The system dependency ICU has changed core functionality. As a result, 3 | we have removed the single function and related tests affected by this change. 4 | 5 | ## Test environments 6 | 7 | * Local OS X install: R-Release 8 | * R-Hub: R-release, R-devel 9 | * Win-builder: R-devel 10 | 11 | ## R CMD check results 12 | 13 | * One NOTE pertains to non-ASCII strings in test files only, which are 14 | necessary to ensure the package's functionality on Windows. 15 | * There may be a WARNING relating to compiling code with Rcpp. This is an issue 16 | with the Rcpp package which has been addressed but for which a fix has not yet 17 | made its way to CRAN. 18 | 19 | ## revdepcheck results 20 | 21 | We checked 20 reverse dependencies (18 from CRAN + 2 from Bioconductor), comparing R CMD check results across CRAN and dev versions of this package. 22 | 23 | * We have communicated with the quanteda package maintainer (who is also a 24 | contributor to this package) and believe the fix is very simple. 25 | * We have communicated with the tidytext package maintainer, since that package 26 | wraps some functionality. 27 | * The remaining packages are unaffected. 
28 | -------------------------------------------------------------------------------- /data-raw/mobydick.R: -------------------------------------------------------------------------------- 1 | mobydick <- readr::read_file("data-raw/moby-dick.txt") 2 | names(mobydick) <- "mobydick" 3 | devtools::use_data(mobydick, overwrite = TRUE) 4 | -------------------------------------------------------------------------------- /data/mobydick.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci/tokenizers/b80863d088d4b39695b602ca11e061ac34770ec7/data/mobydick.rda -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | 3 | services: 4 | manual: 5 | build: ./ 6 | volumes: 7 | - ./:/tokenizers/ 8 | working_dir: /tokenizers 9 | command: make doc.pdf 10 | vignette: 11 | build: ./ 12 | volumes: 13 | - ./:/tokenizers/ 14 | working_dir: /tokenizers 15 | command: make vignette 16 | build_doc: 17 | build: ./ 18 | volumes: 19 | - ./:/tokenizers/ 20 | working_dir: /tokenizers/ 21 | command: make roxygenise 22 | pkg_test: 23 | build: ./ 24 | volumes: 25 | - ./:/tokenizers/ 26 | working_dir: /tokenizers/ 27 | command: make devtools_test 28 | pkg_check: 29 | build: ./ 30 | volumes: 31 | - ./:/tokenizers/ 32 | working_dir: /tokenizers/ 33 | command: make 34 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader(paste0( 2 | "To cite the tokenizers package in publications, please cite the ", 3 | "paper in the Journal of Open Source Software:" 4 | )) 5 | 6 | citEntry( 7 | entry = "Article", 8 | title = "Fast, Consistent Tokenization of Natural Language Text", 9 | author = personList(as.person("Lincoln A. Mullen"), 10 | as.person("Kenneth Benoit"), 11 | as.person("Os Keyes"), 12 | as.person("Dmitry Selivanov"), 13 | as.person("Jeffrey Arnold")), 14 | journal = "Journal of Open Source Software", 15 | year = "2018", 16 | volume = "3", 17 | issue = "23", 18 | pages = "655", 19 | url = "https://doi.org/10.21105/joss.00655", 20 | doi = "10.21105/joss.00655", 21 | 22 | textVersion = paste('Lincoln A. Mullen et al.,', 23 | '"Fast, Consistent Tokenization of Natural Language', 24 | 'Text," Journal of Open Source Software 3, no. 
23', 25 | '(2018): 655, https://doi.org/10.21105/joss.00655.') 26 | ) 27 | -------------------------------------------------------------------------------- /man/basic-tokenizers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/basic-tokenizers.R 3 | \name{basic-tokenizers} 4 | \alias{basic-tokenizers} 5 | \alias{tokenize_characters} 6 | \alias{tokenize_words} 7 | \alias{tokenize_sentences} 8 | \alias{tokenize_lines} 9 | \alias{tokenize_paragraphs} 10 | \alias{tokenize_regex} 11 | \title{Basic tokenizers} 12 | \usage{ 13 | tokenize_characters( 14 | x, 15 | lowercase = TRUE, 16 | strip_non_alphanum = TRUE, 17 | simplify = FALSE 18 | ) 19 | 20 | tokenize_words( 21 | x, 22 | lowercase = TRUE, 23 | stopwords = NULL, 24 | strip_punct = TRUE, 25 | strip_numeric = FALSE, 26 | simplify = FALSE 27 | ) 28 | 29 | tokenize_sentences(x, lowercase = FALSE, strip_punct = FALSE, simplify = FALSE) 30 | 31 | tokenize_lines(x, simplify = FALSE) 32 | 33 | tokenize_paragraphs(x, paragraph_break = "\\n\\n", simplify = FALSE) 34 | 35 | tokenize_regex(x, pattern = "\\\\s+", simplify = FALSE) 36 | } 37 | \arguments{ 38 | \item{x}{A character vector or a list of character vectors to be tokenized. 39 | If \code{x} is a character vector, it can be of any length, and each element 40 | will be tokenized separately. If \code{x} is a list of character vectors, 41 | where each element of the list should have a length of 1.} 42 | 43 | \item{lowercase}{Should the tokens be made lower case? The default value 44 | varies by tokenizer; it is only \code{TRUE} by default for the tokenizers 45 | that you are likely to use last.} 46 | 47 | \item{strip_non_alphanum}{Should punctuation and white space be stripped?} 48 | 49 | \item{simplify}{\code{FALSE} by default so that a consistent value is 50 | returned regardless of length of input. If \code{TRUE}, then an input with 51 | a single element will return a character vector of tokens instead of a 52 | list.} 53 | 54 | \item{stopwords}{A character vector of stop words to be excluded.} 55 | 56 | \item{strip_punct}{Should punctuation be stripped?} 57 | 58 | \item{strip_numeric}{Should numbers be stripped?} 59 | 60 | \item{paragraph_break}{A string identifying the boundary between two 61 | paragraphs.} 62 | 63 | \item{pattern}{A regular expression that defines the split.} 64 | } 65 | \value{ 66 | A list of character vectors containing the tokens, with one element 67 | in the list for each element that was passed as input. If \code{simplify = 68 | TRUE} and only a single element was passed as input, then the output is a 69 | character vector of tokens. 70 | } 71 | \description{ 72 | These functions perform basic tokenization into words, sentences, paragraphs, 73 | lines, and characters. The functions can be piped into one another to create 74 | at most two levels of tokenization. For instance, one might split a text into 75 | paragraphs and then word tokens, or into sentences and then word tokens. 
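To make the two-level idiom described above concrete, here is a brief sketch (the text is an invented two-sentence snippet) of splitting into sentences first and then tokenizing each sentence into words:

``` r
library(tokenizers)

x <- "Call me Ishmael. Some years ago I went to sea."
sentences <- tokenize_sentences(x)[[1]]
# tokenize_words() then returns one element of word tokens per sentence
tokenize_words(sentences)
```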
76 | } 77 | \examples{ 78 | song <- paste0("How many roads must a man walk down\n", 79 | "Before you call him a man?\n", 80 | "How many seas must a white dove sail\n", 81 | "Before she sleeps in the sand?\n", 82 | "\n", 83 | "How many times must the cannonballs fly\n", 84 | "Before they're forever banned?\n", 85 | "The answer, my friend, is blowin' in the wind.\n", 86 | "The answer is blowin' in the wind.\n") 87 | 88 | tokenize_words(song) 89 | tokenize_words(song, strip_punct = FALSE) 90 | tokenize_sentences(song) 91 | tokenize_paragraphs(song) 92 | tokenize_lines(song) 93 | tokenize_characters(song) 94 | } 95 | -------------------------------------------------------------------------------- /man/chunk_text.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chunk-text.R 3 | \name{chunk_text} 4 | \alias{chunk_text} 5 | \title{Chunk text into smaller segments} 6 | \usage{ 7 | chunk_text(x, chunk_size = 100, doc_id = names(x), ...) 8 | } 9 | \arguments{ 10 | \item{x}{A character vector or a list of character vectors to be tokenized 11 | into n-grams. If \code{x} is a character vector, it can be of any length, 12 | and each element will be chunked separately. If \code{x} is a list of 13 | character vectors, each element of the list should have a length of 1.} 14 | 15 | \item{chunk_size}{The number of words in each chunk.} 16 | 17 | \item{doc_id}{The document IDs as a character vector. This will be taken from 18 | the names of the \code{x} vector if available. \code{NULL} is acceptable.} 19 | 20 | \item{...}{Arguments passed on to \code{\link{tokenize_words}}.} 21 | } 22 | \description{ 23 | Given a text or vector/list of texts, break the texts into smaller segments 24 | each with the same number of words. This allows you to treat a very long 25 | document, such as a novel, as a set of smaller documents. 26 | } 27 | \details{ 28 | Chunking the text passes it through \code{\link{tokenize_words}}, 29 | which will strip punctuation and lowercase the text unless you provide 30 | arguments to pass along to that function. 31 | } 32 | \examples{ 33 | \dontrun{ 34 | chunked <- chunk_text(mobydick, chunk_size = 100) 35 | length(chunked) 36 | chunked[1:3] 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /man/mobydick.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data-docs.R 3 | \docType{data} 4 | \name{mobydick} 5 | \alias{mobydick} 6 | \title{The text of Moby Dick} 7 | \format{ 8 | A named character vector with length 1. 9 | } 10 | \source{ 11 | \url{http://www.gutenberg.org/} 12 | } 13 | \usage{ 14 | mobydick 15 | } 16 | \description{ 17 | The text of Moby Dick, by Herman Melville, taken from Project Gutenberg. 
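As a quick orientation to the shape of this dataset, the following sketch shows the length-one, named structure that the tokenizers and counting functions expect:

``` r
library(tokenizers)

length(mobydick)      # 1: the whole novel is stored as a single element
names(mobydick)       # "mobydick"
count_words(mobydick)
```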
18 | } 19 | \keyword{datasets} 20 | -------------------------------------------------------------------------------- /man/ngram-tokenizers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ngram-tokenizers.R 3 | \name{ngram-tokenizers} 4 | \alias{ngram-tokenizers} 5 | \alias{tokenize_ngrams} 6 | \alias{tokenize_skip_ngrams} 7 | \title{N-gram tokenizers} 8 | \usage{ 9 | tokenize_ngrams( 10 | x, 11 | lowercase = TRUE, 12 | n = 3L, 13 | n_min = n, 14 | stopwords = character(), 15 | ngram_delim = " ", 16 | simplify = FALSE 17 | ) 18 | 19 | tokenize_skip_ngrams( 20 | x, 21 | lowercase = TRUE, 22 | n_min = 1, 23 | n = 3, 24 | k = 1, 25 | stopwords = character(), 26 | simplify = FALSE 27 | ) 28 | } 29 | \arguments{ 30 | \item{x}{A character vector or a list of character vectors to be tokenized 31 | into n-grams. If \code{x} is a character vector, it can be of any length, 32 | and each element will be tokenized separately. If \code{x} is a list of 33 | character vectors, each element of the list should have a length of 1.} 34 | 35 | \item{lowercase}{Should the tokens be made lower case?} 36 | 37 | \item{n}{The number of words in the n-gram. This must be an integer greater 38 | than or equal to 1.} 39 | 40 | \item{n_min}{The minimum number of words in the n-gram. This must be an 41 | integer greater than or equal to 1, and less than or equal to \code{n}.} 42 | 43 | \item{stopwords}{A character vector of stop words to be excluded from the 44 | n-grams.} 45 | 46 | \item{ngram_delim}{The separator between words in an n-gram.} 47 | 48 | \item{simplify}{\code{FALSE} by default so that a consistent value is 49 | returned regardless of length of input. If \code{TRUE}, then an input with 50 | a single element will return a character vector of tokens instead of a 51 | list.} 52 | 53 | \item{k}{For the skip n-gram tokenizer, the maximum skip distance between 54 | words. The function will compute all skip n-grams between \code{0} and 55 | \code{k}.} 56 | } 57 | \value{ 58 | A list of character vectors containing the tokens, with one element 59 | in the list for each element that was passed as input. If \code{simplify = 60 | TRUE} and only a single element was passed as input, then the output is a 61 | character vector of tokens. 62 | } 63 | \description{ 64 | These functions tokenize their inputs into different kinds of n-grams. The 65 | input can be a character vector of any length, or a list of character vectors 66 | where each character vector in the list has a length of 1. See details for an 67 | explanation of what each function does. 68 | } 69 | \details{ 70 | \describe{ \item{\code{tokenize_ngrams}:}{ Basic shingled n-grams. A 71 | contiguous subsequence of \code{n} words. This will compute shingled n-grams 72 | for every value of between \code{n_min} (which must be at least 1) and 73 | \code{n}. } \item{\code{tokenize_skip_ngrams}:}{Skip n-grams. A subsequence 74 | of \code{n} words which are at most a gap of \code{k} words between them. The 75 | skip n-grams will be calculated for all values from \code{0} to \code{k}. } } 76 | 77 | These functions will strip all punctuation and normalize all whitespace to a 78 | single space character. 
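As a worked illustration of the definitions above, the sentence used in the package's own test data (see `tests/testthat/helper-data.R`, which takes it from the paper in which skip n-grams are defined) produces the tokens sketched in the comments below:

``` r
library(tokenizers)

input <- "Insurgents killed in ongoing fighting."

# plain bigrams: "insurgents killed", "killed in", "in ongoing", "ongoing fighting"
tokenize_ngrams(input, n = 2, n_min = 2)

# with k = 2, the skip bigrams additionally include pairs such as
# "insurgents in", "insurgents ongoing", "killed ongoing", "killed fighting",
# and "in fighting"
tokenize_skip_ngrams(input, n = 2, n_min = 2, k = 2)
```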
79 | } 80 | \examples{ 81 | song <- paste0("How many roads must a man walk down\n", 82 | "Before you call him a man?\n", 83 | "How many seas must a white dove sail\n", 84 | "Before she sleeps in the sand?\n", 85 | "\n", 86 | "How many times must the cannonballs fly\n", 87 | "Before they're forever banned?\n", 88 | "The answer, my friend, is blowin' in the wind.\n", 89 | "The answer is blowin' in the wind.\n") 90 | 91 | tokenize_ngrams(song, n = 4) 92 | tokenize_ngrams(song, n = 4, n_min = 1) 93 | tokenize_skip_ngrams(song, n = 4, k = 2) 94 | } 95 | -------------------------------------------------------------------------------- /man/ptb-tokenizer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ptb-tokenizer.R 3 | \name{tokenize_ptb} 4 | \alias{tokenize_ptb} 5 | \title{Penn Treebank Tokenizer} 6 | \usage{ 7 | tokenize_ptb(x, lowercase = FALSE, simplify = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{A character vector or a list of character vectors to be tokenized 11 | into n-grams. If \code{x} is a character vector, it can be of any length, 12 | and each element will be tokenized separately. If \code{x} is a list of 13 | character vectors, each element of the list should have a length of 1.} 14 | 15 | \item{lowercase}{Should the tokens be made lower case?} 16 | 17 | \item{simplify}{\code{FALSE} by default so that a consistent value is 18 | returned regardless of length of input. If \code{TRUE}, then an input with 19 | a single element will return a character vector of tokens instead of a 20 | list.} 21 | } 22 | \value{ 23 | A list of character vectors containing the tokens, with one element 24 | in the list for each element that was passed as input. If \code{simplify = 25 | TRUE} and only a single element was passed as input, then the output is a 26 | character vector of tokens. 27 | } 28 | \description{ 29 | This function implements the Penn Treebank word tokenizer. 30 | } 31 | \details{ 32 | This tokenizer uses regular expressions to tokenize text similar to 33 | the tokenization used in the Penn Treebank. It assumes that text has 34 | already been split into sentences. The tokenizer does the following: 35 | 36 | \itemize{ \item{splits common English contractions, e.g. \verb{don't} is 37 | tokenized into \verb{do n't} and \verb{they'll} is tokenized into -> 38 | \verb{they 'll},} \item{handles punctuation characters as separate tokens,} 39 | \item{splits commas and single quotes off from words, when they are 40 | followed by whitespace,} \item{splits off periods that occur at the end of 41 | the sentence.} } 42 | 43 | This function is a port of the Python NLTK version of the Penn 44 | Treebank Tokenizer. 45 | } 46 | \examples{ 47 | song <- list(paste0("How many roads must a man walk down\n", 48 | "Before you call him a man?"), 49 | paste0("How many seas must a white dove sail\n", 50 | "Before she sleeps in the sand?\n"), 51 | paste0("How many times must the cannonballs fly\n", 52 | "Before they're forever banned?\n"), 53 | "The answer, my friend, is blowin' in the wind.", 54 | "The answer is blowin' in the wind.") 55 | tokenize_ptb(song) 56 | tokenize_ptb(c("Good muffins cost $3.88\nin New York. 
Please buy me\ntwo of them.", 57 | "They'll save and invest more.", 58 | "Hi, I can't say hello.")) 59 | } 60 | \references{ 61 | \href{https://www.nltk.org/_modules/nltk/tokenize/treebank.html#TreebankWordTokenizer}{NLTK 62 | TreebankWordTokenizer} 63 | } 64 | -------------------------------------------------------------------------------- /man/shingle-tokenizers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/character-shingles-tokenizers.R 3 | \name{tokenize_character_shingles} 4 | \alias{tokenize_character_shingles} 5 | \title{Character shingle tokenizers} 6 | \usage{ 7 | tokenize_character_shingles( 8 | x, 9 | n = 3L, 10 | n_min = n, 11 | lowercase = TRUE, 12 | strip_non_alphanum = TRUE, 13 | simplify = FALSE 14 | ) 15 | } 16 | \arguments{ 17 | \item{x}{A character vector or a list of character vectors to be tokenized 18 | into character shingles. If \code{x} is a character vector, it can be of 19 | any length, and each element will be tokenized separately. If \code{x} is a 20 | list of character vectors, each element of the list should have a length of 21 | 1.} 22 | 23 | \item{n}{The number of characters in each shingle. This must be an integer 24 | greater than or equal to 1.} 25 | 26 | \item{n_min}{This must be an integer greater than or equal to 1, and less 27 | than or equal to \code{n}.} 28 | 29 | \item{lowercase}{Should the characters be made lower case?} 30 | 31 | \item{strip_non_alphanum}{Should punctuation and white space be stripped?} 32 | 33 | \item{simplify}{\code{FALSE} by default so that a consistent value is 34 | returned regardless of length of input. If \code{TRUE}, then an input with 35 | a single element will return a character vector of tokens instead of a 36 | list.} 37 | } 38 | \value{ 39 | A list of character vectors containing the tokens, with one element 40 | in the list for each element that was passed as input. If \code{simplify = 41 | TRUE} and only a single element was passed as input, then the output is a 42 | character vector of tokens. 43 | } 44 | \description{ 45 | The character shingle tokenizer functions like an n-gram tokenizer, except 46 | the units that are shingled are characters instead of words. Options to the 47 | function let you determine whether non-alphanumeric characters like 48 | punctuation should be retained or discarded. 49 | } 50 | \examples{ 51 | x <- c("Now is the hour of our discontent") 52 | tokenize_character_shingles(x) 53 | tokenize_character_shingles(x, n = 5) 54 | tokenize_character_shingles(x, n = 5, strip_non_alphanum = FALSE) 55 | tokenize_character_shingles(x, n = 5, n_min = 3, strip_non_alphanum = FALSE) 56 | 57 | } 58 | -------------------------------------------------------------------------------- /man/stem-tokenizers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stem-tokenizers.R 3 | \name{tokenize_word_stems} 4 | \alias{tokenize_word_stems} 5 | \title{Word stem tokenizer} 6 | \usage{ 7 | tokenize_word_stems( 8 | x, 9 | language = "english", 10 | stopwords = NULL, 11 | simplify = FALSE 12 | ) 13 | } 14 | \arguments{ 15 | \item{x}{A character vector or a list of character vectors to be tokenized. 16 | If \code{x} is a character vector, it can be of any length, and each 17 | element will be tokenized separately. 
If \code{x} is a list of character 18 | vectors, where each element of the list should have a length of 1.} 19 | 20 | \item{language}{The language to use for word stemming. This must be one of 21 | the languages available in the SnowballC package. A list is provided by 22 | \code{\link[SnowballC]{getStemLanguages}}.} 23 | 24 | \item{stopwords}{A character vector of stop words to be excluded} 25 | 26 | \item{simplify}{\code{FALSE} by default so that a consistent value is 27 | returned regardless of length of input. If \code{TRUE}, then an input with 28 | a single element will return a character vector of tokens instead of a 29 | list.} 30 | } 31 | \value{ 32 | A list of character vectors containing the tokens, with one element 33 | in the list for each element that was passed as input. If \code{simplify = 34 | TRUE} and only a single element was passed as input, then the output is a 35 | character vector of tokens. 36 | } 37 | \description{ 38 | This function turns its input into a character vector of word stems. This is 39 | just a wrapper around the \code{\link[SnowballC]{wordStem}} function from the 40 | SnowballC package which does the heavy lifting, but this function provides a 41 | consistent interface with the rest of the tokenizers in this package. The 42 | input can be a character vector of any length, or a list of character vectors 43 | where each character vector in the list has a length of 1. 44 | } 45 | \details{ 46 | This function will strip all white space and punctuation and make 47 | all word stems lowercase. 48 | } 49 | \examples{ 50 | song <- paste0("How many roads must a man walk down\n", 51 | "Before you call him a man?\n", 52 | "How many seas must a white dove sail\n", 53 | "Before she sleeps in the sand?\n", 54 | "\n", 55 | "How many times must the cannonballs fly\n", 56 | "Before they're forever banned?\n", 57 | "The answer, my friend, is blowin' in the wind.\n", 58 | "The answer is blowin' in the wind.\n") 59 | 60 | tokenize_word_stems(song) 61 | } 62 | \seealso{ 63 | \code{\link[SnowballC]{wordStem}} 64 | } 65 | -------------------------------------------------------------------------------- /man/tokenizers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenizers-package.r 3 | \docType{package} 4 | \name{tokenizers} 5 | \alias{tokenizers-package} 6 | \alias{tokenizers} 7 | \title{Tokenizers} 8 | \description{ 9 | A collection of functions with a consistent interface to convert natural 10 | language text into tokens. 11 | } 12 | \details{ 13 | The tokenizers in this package have a consistent interface. They all take 14 | either a character vector of any length, or a list where each element is a 15 | character vector of length one. The idea is that each element comprises a 16 | text. Then each function returns a list with the same length as the input 17 | vector, where each element in the list are the tokens generated by the 18 | function. If the input character vector or list is named, then the names are 19 | preserved. 
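A minimal sketch of that contract, using invented input names:

``` r
library(tokenizers)

docs <- c(alpha = "One fish, two fish.", beta = "Red fish, blue fish.")
tokens <- tokenize_words(docs)
length(tokens) # 2, the same length as the input
names(tokens)  # "alpha" "beta", preserved from the input
tokens$alpha   # the word tokens for the first text
```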
20 | } 21 | \seealso{ 22 | Useful links: 23 | \itemize{ 24 | \item \url{https://docs.ropensci.org/tokenizers/} 25 | \item \url{https://github.com/ropensci/tokenizers} 26 | \item Report bugs at \url{https://github.com/ropensci/tokenizers/issues} 27 | } 28 | 29 | } 30 | \author{ 31 | \strong{Maintainer}: Thomas Charlon \email{charlon@protonmail.com} (\href{https://orcid.org/0000-0001-7497-0470}{ORCID}) 32 | 33 | Authors: 34 | \itemize{ 35 | \item Lincoln Mullen \email{lincoln@lincolnmullen.com} (\href{https://orcid.org/0000-0001-5103-6917}{ORCID}) 36 | } 37 | 38 | Other contributors: 39 | \itemize{ 40 | \item Os Keyes \email{ironholds@gmail.com} (\href{https://orcid.org/0000-0001-5196-609X}{ORCID}) [contributor] 41 | \item Dmitriy Selivanov \email{selivanov.dmitriy@gmail.com} [contributor] 42 | \item Jeffrey Arnold \email{jeffrey.arnold@gmail.com} (\href{https://orcid.org/0000-0001-9953-3904}{ORCID}) [contributor] 43 | \item Kenneth Benoit \email{kbenoit@lse.ac.uk} (\href{https://orcid.org/0000-0002-0797-564X}{ORCID}) [contributor] 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /man/word-counting.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wordcount.R 3 | \name{count_words} 4 | \alias{count_words} 5 | \alias{count_characters} 6 | \alias{count_sentences} 7 | \title{Count words, sentences, characters} 8 | \usage{ 9 | count_words(x) 10 | 11 | count_characters(x) 12 | 13 | count_sentences(x) 14 | } 15 | \arguments{ 16 | \item{x}{A character vector or a list of character vectors. If \code{x} is a 17 | character vector, it can be of any length, and each element will be 18 | tokenized separately. If \code{x} is a list of character vectors, each 19 | element of the list should have a length of 1.} 20 | } 21 | \value{ 22 | An integer vector containing the counted elements. If the input 23 | vector or list has names, they will be preserved. 24 | } 25 | \description{ 26 | Count words, sentences, and characters in input texts. These functions use 27 | the \code{stringi} package, so they handle the counting of Unicode strings 28 | (e.g., characters with diacritical marks) in a way that makes sense to people 29 | counting characters. 30 | } 31 | \examples{ 32 | count_words(mobydick) 33 | count_sentences(mobydick) 34 | count_characters(mobydick) 35 | } 36 | -------------------------------------------------------------------------------- /paper.bib: -------------------------------------------------------------------------------- 1 | 2 | @book{silge_text_2017, 3 | title = {Text {{Mining}} with {{R}}: {{A Tidy Approach}}}, 4 | url = {http://tidytextmining.com/}, 5 | publisher = {{O'Reilly}}, 6 | date = {2017}, 7 | author = {Silge, Julia and Robinson, David} 8 | } 9 | 10 | @book{mullen_americas, 11 | title = {America's Public Bible: Biblical Quotations in {{U}}.{{S}}. Newspapers}, 12 | url = {http://americaspublicbible.org}, 13 | publisher = {{Stanford University Press}}, 14 | year = {forthcoming}, 15 | author = {Mullen, Lincoln A.} 16 | } 17 | 18 | @article{funkmullen_spine_2018, 19 | langid = {english}, 20 | title = {The Spine of American Law: Digital Text Analysis and {{U}}.{{S}}. 
Legal Practice}, 21 | volume = {123}, 22 | issn = {0002-8762}, 23 | doi = {10.1093/ahr/123.1.132}, 24 | shorttitle = {Spine of American Law}, 25 | number = {1}, 26 | journaltitle = {American Historical Review}, 27 | date = {2018-02-01}, 28 | pages = {132--164}, 29 | author = {Funk, Kellen and Mullen, Lincoln A.} 30 | } 31 | 32 | @article{welbers_text_2017, 33 | title = {Text Analysis in R}, 34 | volume = {11}, 35 | issn = {1931-2458}, 36 | url = {https:doi.org/10.1080/19312458.2017.1387238}, 37 | doi = {10.1080/19312458.2017.1387238}, 38 | number = {4}, 39 | journaltitle = {Communication Methods and Measures}, 40 | date = {2017-10-02}, 41 | pages = {245--265}, 42 | author = {Welbers, Kasper and Van Atteveldt, Wouter and Benoit, Kenneth} 43 | } 44 | 45 | @article{funkmullen_servile_2016, 46 | title = {A Servile Copy: Text Reuse and Medium Data in American Civil Procedure}, 47 | url = {http://rg.rg.mpg.de/en/article_id/1040}, 48 | doi = {10.12946/rg24/341-343}, 49 | shorttitle = {A {{Servile Copy}}}, 50 | number = {24}, 51 | journaltitle = {Rechtsgeschichte [Legal History]}, 52 | date = {2016}, 53 | pages = {341--343}, 54 | author = {Funk, Kellen and Mullen, Lincoln A.} 55 | } 56 | 57 | @article{sanger_2015_, 58 | title = {The 2015 Canadian Election on Twitter: A Tidy Algorithmic Analysis}, 59 | shorttitle = {The 2015 Canadian Election on Twitter}, 60 | author = {Sanger, William and Warin, Thierry} 61 | } 62 | 63 | @inproceedings{ballier_rbased_2017, 64 | location = {{Berlin, Germany}}, 65 | title = {R-Based Strategies for {{DH}} in English Linguistics: A Case Study}, 66 | volume = {1918}, 67 | url = {https://hal.archives-ouvertes.fr/hal-01587126}, 68 | shorttitle = {R-Based Strategies for {{DH}} in {{English Linguistics}}}, 69 | booktitle = {Teaching {{NLP}} for {{Digital Humanities}}}, 70 | publisher = {{Peggy Bockwinkel}}, 71 | date = {2017-09}, 72 | pages = {1--10}, 73 | keywords = {digital humanities,NLP,R-based curriculum}, 74 | author = {Ballier, Nicolas and Lissón, Paula} 75 | } 76 | 77 | @article{warin_mapping, 78 | title = {Mapping {{Innovations}} in {{Artificial Intelligence Through Patents}}: {{A Social Data Science Perspective}}}, 79 | shorttitle = {Mapping {{Innovations}} in {{Artificial Intelligence Through Patents}}}, 80 | author = {Warin, Thierry and Le Duc, Romain and Sanger, William} 81 | } 82 | 83 | @article{xu_using_2018, 84 | title = {Using {{Text Mining}} to {{Compare Online Pro}}- and {{Anti}}-{{Vaccine Headlines}}: {{Word Usage}}, {{Sentiments}}, and {{Online Popularity}}}, 85 | volume = {69}, 86 | issn = {1051-0974}, 87 | url = {https://doi.org/10.1080/10510974.2017.1414068}, 88 | doi = {10.1080/10510974.2017.1414068}, 89 | shorttitle = {Using {{Text Mining}} to {{Compare Online Pro}}- and {{Anti}}-{{Vaccine Headlines}}}, 90 | number = {1}, 91 | journaltitle = {Communication Studies}, 92 | urldate = {2018-03-13}, 93 | date = {2018-01-01}, 94 | pages = {103--122}, 95 | keywords = {Anti-Vaccine,Misinformation,Sentiment Analysis,Text Mining,Vaccine}, 96 | author = {Xu, Zhan and Guo, Hao} 97 | } 98 | 99 | @Manual{rbase, 100 | title = {R: A Language and Environment for Statistical Computing}, 101 | author = {{R Core Team}}, 102 | organization = {R Foundation for Statistical Computing}, 103 | address = {Vienna, Austria}, 104 | year = {2017}, 105 | url = {https://www.R-project.org/}, 106 | } 107 | 108 | @Manual{gagolewski_2018, 109 | title = {R package {stringi}: Character string processing facilities}, 110 | author = {Marek Gagolewski}, 111 | year = {2018}, 112 | url = 
{http://www.gagolewski.com/software/stringi/}, 113 | } 114 | 115 | @Book{eddelbuettel_2013, 116 | title = {Seamless {R} and {C++} Integration with {Rcpp}}, 117 | author = {Dirk Eddelbuettel}, 118 | publisher = {Springer}, 119 | year = {2013}, 120 | note = {ISBN 978-1-4614-6867-7}, 121 | doi = {10.1007/978-1-4614-6868-4}, 122 | } 123 | 124 | @Article{eddelbuettel_2017, 125 | title = {{Extending {R} with {C++}: A Brief Introduction to {Rcpp}}}, 126 | author = {Dirk Eddelbuettel and James Joseph Balamuta}, 127 | journal = {PeerJ Preprints}, 128 | year = {2017}, 129 | month = {aug}, 130 | volume = {5}, 131 | pages = {e3188v1}, 132 | issn = {2167-9843}, 133 | url = {https://doi.org/10.7287/peerj.preprints.3188v1}, 134 | doi = {10.7287/peerj.preprints.3188v1}, 135 | } 136 | 137 | @Article{silge_2016, 138 | title = {{tidytext}: Text Mining and Analysis Using Tidy Data Principles in R}, 139 | author = {Julia Silge and David Robinson}, 140 | doi = {10.21105/joss.00037}, 141 | url = {https://doi.org/10.21105/joss.00037}, 142 | year = {2016}, 143 | publisher = {Journal of Open Source Software}, 144 | volume = {1}, 145 | number = {3}, 146 | journal = {JOSS}, 147 | } 148 | 149 | @Manual{selivanov_2018, 150 | title = {{text2vec}: Modern Text Mining Framework for R}, 151 | author = {Dmitriy Selivanov and Qing Wang}, 152 | year = {2018}, 153 | note = {R package version 0.5.1}, 154 | url = {https://CRAN.R-project.org/package=text2vec}, 155 | } 156 | 157 | @Manual{mullen_2016, 158 | title = {{textreuse}: Detect Text Reuse and Document Similarity}, 159 | author = {Mullen, Lincoln A.}, 160 | year = {2016}, 161 | note = {R package version 0.1.4}, 162 | url = {https://github.com/ropensci/textreuse}, 163 | } 164 | 165 | @Manual{tif_2017, 166 | title = {{tif}: Text Interchange Format}, 167 | author = {{rOpenSci Text Workshop}}, 168 | note = {R package version 0.2}, 169 | year = {2017}, 170 | url = {https://github.com/ropenscilabs/tif}, 171 | } 172 | 173 | @article{denny_text_forthcoming, 174 | title={Text Preprocessing For Unsupervised Learning: Why It Matters, When It Misleads, And What To Do About It}, 175 | doi={10.1017/pan.2017.44}, 176 | journal={Political Analysis}, 177 | author={Denny, Matthew J. and Spirling, Arthur}, 178 | year={2018}} 179 | 180 | @book{Manningetal2008, 181 | Author = {C. D. Manning and P. Raghavan and H. 
Sch\"{u}tze}, 182 | Publisher = {Cambridge University Press}, 183 | Title = {Introduction to Information Retrieval}, 184 | Year = {2008}} 185 | 186 | @inproceedings{guthrie_closer_2006, 187 | title = {A {{Closer Look}} at {{Skip}}-{{Gram Modelling}}}, 188 | url = {http://www.lrec-conf.org/proceedings/lrec2006/pdf/357_pdf.pdf}, 189 | booktitle = {Proceedings of the 5th {{International Conference}} on {{Language Resources}} and {{Evaluation}}}, 190 | date = {2006}, 191 | author = {Guthrie, David and Allison, Ben and Liu, Wei and Guthrie, Louise and Wilks, Yorick} 192 | } 193 | 194 | @Manual{tokenizers, 195 | title = {{tokenizers}: Fast, Consistent Tokenization of Natural Language Text}, 196 | author = {Mullen, Lincoln A.}, 197 | year = {2018}, 198 | note = {R package version 0.2.0}, 199 | url = {https://docs.ropensci.org/tokenizers/index.html}, 200 | } 201 | -------------------------------------------------------------------------------- /paper.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Fast, Consistent Tokenization of Natural Language Text" 3 | tags: 4 | - text mining 5 | - tokenization 6 | - natural language processing 7 | authors: 8 | - name: Lincoln A. Mullen 9 | orcid: 0000-0001-5103-6917 10 | affiliation: 1 11 | - name: Kenneth Benoit 12 | orcid: 0000-0002-0797-564X 13 | affiliation: 2 14 | - name: Os Keyes 15 | orcid: 0000-0001-5196-609X 16 | affiliation: 3 17 | - name: Dmitry Selivanov 18 | affiliation: 4 19 | - name: Jeffrey Arnold 20 | orcid: 0000-0001-9953-3904 21 | affiliation: 5 22 | affiliations: 23 | - name: "Department of History and Art History, George Mason University" 24 | index: 1 25 | - name: "Department of Methodology, London School of Economics and Political Science" 26 | index: 2 27 | - name: "Department of Human Centered Design and Engineering, University of Washington" 28 | index: 3 29 | - name: "Open Data Science" 30 | index: 4 31 | - name: "Department of Political Science, University of Washington" 32 | index: 5 33 | date: 12 March 2018 34 | bibliography: paper.bib 35 | ... 36 | 37 | Computational text analysis usually proceeds according to a series of 38 | well-defined steps. After importing texts, the usual next step is to turn the 39 | human-readable text into machine-readable tokens. Tokens are defined as 40 | segments of a text identified as meaningful units for the purpose of analyzing 41 | the text. They may consist of individual words or of larger or smaller 42 | segments, such as word sequences, word subsequences, paragraphs, sentences, or 43 | lines [@Manningetal2008, 22]. Tokenization is the process of splitting the text 44 | into these smaller pieces, and it often involves preprocessing the text to 45 | remove punctuation and transform all tokens into lowercase [@welbers_text_2017, 46 | 250-251]. Decisions made during tokenization have a significant effect on 47 | subsequent analysis [@denny_text_forthcoming; @guthrie_closer_2006]. Especially 48 | for large corpora, tokenization can be computationally expensive, and 49 | tokenization is highly language dependent. Efficiency and correctness are 50 | therefore paramount concerns for tokenization. 51 | 52 | The [tokenizers](https://docs.ropensci.org/tokenizers/index.html) package for 53 | R provides fast, consistent tokenization for natural language text 54 | [@tokenizers; @rbase]. (The package is available on 55 | [GitHub](https://github.com/ropensci/tokenizers) and archived on 56 | [Zenodo](https://doi.org/10.5281/zenodo.1205017).) 
Each of the tokenizers 57 | expects a consistent input and returns a consistent output, so that the 58 | tokenizers can be used interchangeably with one another or relied on in other 59 | packages. To ensure the correctness of output, the package depends on the 60 | stringi package, which implements Unicode support for R [@gagolewski_2018]. 61 | To ensure the speed of tokenization, key components such as the _n_-gram and 62 | skip _n_-gram tokenizers are written using the Rcpp package 63 | [@eddelbuettel_2013; @eddelbuettel_2017]. The tokenizers package is part of 64 | the [rOpenSci project](https://ropensci.org/). 65 | 66 | The most important tokenizers in the current version of the package can be 67 | grouped as follows: 68 | 69 | - tokenizers for characters and shingled characters 70 | - tokenizers for words and word stems, as well as for Penn Treebank tokens 71 | - tokenizers _n_-grams and skip _n_-grams 72 | - tokenizers for tweets, which preserve formatting of usernames and hashtags 73 | 74 | In addition the package provides functions for splitting longer documents into 75 | sentences and paragraphs, or for splitting a long text into smaller chunks each 76 | with the same number of words. This allows users to treat parts of very long 77 | texts as documents in their own right. The package also provides functions for 78 | counting words, characters, and sentences. 79 | 80 | The tokenizers in this package can be used on their own, or they can be wrapped 81 | by higher-level R packages. For instance, the tokenizers package is a 82 | dependency for the tidytext [@silge_2016], text2vec [@selivanov_2018], 83 | and textreuse [@mullen_2016] packages. More broadly, the output of the 84 | tokenization functions follows the guidelines set by the text-interchange 85 | format defined at an rOpenSci Text Workshop in 2017 [@tif_2017]. Other 86 | packages which buy into the text-interchange format can thus use the 87 | tokenizers package interchangeably. 88 | 89 | The tokenizers package has research applications in any discipline which 90 | uses computational text analysis. The package was originally created for 91 | historical research into the use of the Bible in American newspapers 92 | [@mullen_americas] and into the borrowing of legal codes of civil procedure in 93 | the nineteenth-century United States [@funkmullen_spine_2018, 94 | @funkmullen_servile_2016]. The tokenizers package underlies the tidytext 95 | package [@silge_text_2017], and via that package tokenizers has been used 96 | in disciplines such as political science [@sanger_2015_], social science 97 | [@warin_mapping], communication studies [@xu_using_2018], English 98 | [@ballier_rbased_2017], and the digital humanities more generally. 
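Because every tokenizer shares the same input and output contract, higher-level code can treat the tokenizer itself as an interchangeable argument. The helper below is an illustrative sketch rather than part of the package, showing that the same call shape works whether words, word stems, or n-grams are the unit of analysis:

``` r
library(tokenizers)

# hypothetical helper: most frequent tokens, for any tokenizer that
# follows the package's consistent interface
top_tokens <- function(x, tokenizer = tokenize_words, n = 5) {
  lapply(tokenizer(x), function(tokens) {
    head(sort(table(tokens), decreasing = TRUE), n)
  })
}

top_tokens(mobydick)                       # word tokens
top_tokens(mobydick, tokenize_word_stems)  # word stems, same call shape
top_tokens(mobydick, tokenize_ngrams)      # shingled n-grams, same call shape
```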
99 | 100 | # References 101 | 102 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | 6 | using namespace Rcpp; 7 | 8 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 9 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 10 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 11 | #endif 12 | 13 | // generate_ngrams_batch 14 | ListOf generate_ngrams_batch(const ListOf documents_list, const int ngram_min, const int ngram_max, CharacterVector stopwords, const String ngram_delim); 15 | RcppExport SEXP _tokenizers_generate_ngrams_batch(SEXP documents_listSEXP, SEXP ngram_minSEXP, SEXP ngram_maxSEXP, SEXP stopwordsSEXP, SEXP ngram_delimSEXP) { 16 | BEGIN_RCPP 17 | Rcpp::RObject rcpp_result_gen; 18 | Rcpp::RNGScope rcpp_rngScope_gen; 19 | Rcpp::traits::input_parameter< const ListOf >::type documents_list(documents_listSEXP); 20 | Rcpp::traits::input_parameter< const int >::type ngram_min(ngram_minSEXP); 21 | Rcpp::traits::input_parameter< const int >::type ngram_max(ngram_maxSEXP); 22 | Rcpp::traits::input_parameter< CharacterVector >::type stopwords(stopwordsSEXP); 23 | Rcpp::traits::input_parameter< const String >::type ngram_delim(ngram_delimSEXP); 24 | rcpp_result_gen = Rcpp::wrap(generate_ngrams_batch(documents_list, ngram_min, ngram_max, stopwords, ngram_delim)); 25 | return rcpp_result_gen; 26 | END_RCPP 27 | } 28 | // skip_ngrams_vectorised 29 | ListOf skip_ngrams_vectorised(ListOf words, ListOf skips, CharacterVector stopwords); 30 | RcppExport SEXP _tokenizers_skip_ngrams_vectorised(SEXP wordsSEXP, SEXP skipsSEXP, SEXP stopwordsSEXP) { 31 | BEGIN_RCPP 32 | Rcpp::RObject rcpp_result_gen; 33 | Rcpp::RNGScope rcpp_rngScope_gen; 34 | Rcpp::traits::input_parameter< ListOf >::type words(wordsSEXP); 35 | Rcpp::traits::input_parameter< ListOf >::type skips(skipsSEXP); 36 | Rcpp::traits::input_parameter< CharacterVector >::type stopwords(stopwordsSEXP); 37 | rcpp_result_gen = Rcpp::wrap(skip_ngrams_vectorised(words, skips, stopwords)); 38 | return rcpp_result_gen; 39 | END_RCPP 40 | } 41 | 42 | static const R_CallMethodDef CallEntries[] = { 43 | {"_tokenizers_generate_ngrams_batch", (DL_FUNC) &_tokenizers_generate_ngrams_batch, 5}, 44 | {"_tokenizers_skip_ngrams_vectorised", (DL_FUNC) &_tokenizers_skip_ngrams_vectorised, 3}, 45 | {NULL, NULL, 0} 46 | }; 47 | 48 | RcppExport void R_init_tokenizers(DllInfo *dll) { 49 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 50 | R_useDynamicSymbols(dll, FALSE); 51 | } 52 | -------------------------------------------------------------------------------- /src/shingle_ngrams.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | // calculates size of the ngram vector 5 | inline size_t get_ngram_seq_len(int input_len, int ngram_min, int ngram_max) { 6 | 7 | int out_ngram_len_adjust = 0; 8 | for (size_t i = ngram_min - 1; i < ngram_max; i++) 9 | out_ngram_len_adjust += i; 10 | if(input_len < ngram_min) 11 | return 0; 12 | else 13 | return input_len * (ngram_max - ngram_min + 1) - out_ngram_len_adjust; 14 | } 15 | 16 | CharacterVector generate_ngrams_internal(const CharacterVector terms_raw, 17 | const int ngram_min, 18 | const int ngram_max, 19 | std::set &stopwords, 20 | // pass buffer by reference 
to avoid memory allocation 21 | // on each iteration 22 | std::deque &terms_filtered_buffer, 23 | const std::string ngram_delim) { 24 | // clear buffer from previous iteration result 25 | terms_filtered_buffer.clear(); 26 | std::string term; 27 | // filter out stopwords 28 | for (size_t i = 0; i < terms_raw.size(); i++) { 29 | term = as(terms_raw[i]); 30 | if(stopwords.find(term) == stopwords.end()) 31 | terms_filtered_buffer.push_back(term); 32 | } 33 | 34 | int len = terms_filtered_buffer.size(); 35 | size_t ngram_out_len = get_ngram_seq_len(len, ngram_min, std::min(ngram_max, len)); 36 | 37 | CharacterVector result(ngram_out_len); 38 | 39 | std::string k_gram; 40 | size_t k, i = 0, j_max_observed; 41 | // iterates through input vector by window of size = n_max and build n-grams 42 | // for terms ["a", "b", "c", "d"] and n_min = 1, n_max = 3 43 | // will build 1:3-grams in following order 44 | //"a" "a_b" "a_b_c" "b" "b_c" "b_c_d" "c" "c_d" "d" 45 | for(size_t j = 0; j < len; j++ ) { 46 | k = 1; 47 | j_max_observed = j; 48 | while (k <= ngram_max && j_max_observed < len) { 49 | 50 | if( k == 1) { 51 | k_gram = terms_filtered_buffer[j_max_observed]; 52 | } else { 53 | k_gram = k_gram + ngram_delim + terms_filtered_buffer[j_max_observed]; 54 | } 55 | 56 | if(k >= ngram_min) { 57 | result[i] = String(k_gram, CE_UTF8); 58 | i++; 59 | } 60 | j_max_observed = j + k; 61 | k = k + 1; 62 | } 63 | } 64 | 65 | if(!result.size()){ 66 | result.push_back(NA_STRING); 67 | } 68 | return result; 69 | } 70 | 71 | // [[Rcpp::export]] 72 | ListOf generate_ngrams_batch(const ListOf documents_list, 73 | const int ngram_min, 74 | const int ngram_max, 75 | CharacterVector stopwords = CharacterVector(), 76 | const String ngram_delim = " ") { 77 | 78 | std::deque terms_filtered_buffer; 79 | const std::string std_string_delim = ngram_delim.get_cstring(); 80 | size_t n_docs = documents_list.size(); 81 | List result(n_docs); 82 | 83 | std::set stopwords_set; 84 | for(size_t i = 0; i < stopwords.size(); i++){ 85 | if(stopwords[i] != NA_STRING){ 86 | stopwords_set.insert(as(stopwords[i])); 87 | } 88 | } 89 | 90 | for (size_t i_document = 0; i_document < n_docs; i_document++) { 91 | if(i_document % 10000 == 0){ 92 | Rcpp::checkUserInterrupt(); 93 | } 94 | result[i_document] = generate_ngrams_internal(documents_list[i_document], 95 | ngram_min, ngram_max, 96 | stopwords_set, 97 | terms_filtered_buffer, 98 | std_string_delim); 99 | } 100 | return result; 101 | } 102 | -------------------------------------------------------------------------------- /src/skip_ngrams.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | CharacterVector skip_ngrams(CharacterVector words, 5 | ListOf& skips, 6 | std::set& stopwords) { 7 | 8 | std::deque < std::string > checked_words; 9 | std::string str_holding; 10 | 11 | // Eliminate stopwords 12 | for(unsigned int i = 0; i < words.size(); i++){ 13 | if(words[i] != NA_STRING){ 14 | str_holding = as(words[i]); 15 | if(stopwords.find(str_holding) == stopwords.end()){ 16 | checked_words.push_back(str_holding); 17 | } 18 | } 19 | } 20 | 21 | str_holding.clear(); 22 | std::deque < std::string > holding; 23 | unsigned int checked_size = checked_words.size(); 24 | 25 | for(unsigned int w = 0; w < checked_size; w++) { 26 | for(unsigned int i = 0; i < skips.size(); i++){ 27 | unsigned int in_size = skips[i].size(); 28 | if(skips[i][in_size-1] + w < checked_size){ 29 | for(unsigned int j = 0; j < skips[i].size(); j++){ 30 | 
str_holding += " " + checked_words[skips[i][j] + w]; 31 | } 32 | if(str_holding.size()){ 33 | str_holding.erase(0,1); 34 | } 35 | holding.push_back(str_holding); 36 | str_holding.clear(); 37 | } 38 | } 39 | } 40 | 41 | if(!holding.size()){ 42 | return CharacterVector(1,NA_STRING); 43 | } 44 | 45 | CharacterVector output(holding.size()); 46 | 47 | for(unsigned int i = 0; i < holding.size(); i++){ 48 | if(holding[i].size()){ 49 | output[i] = String(holding[i], CE_UTF8); 50 | } else { 51 | output[i] = NA_STRING; 52 | } 53 | } 54 | return output; 55 | } 56 | 57 | //[[Rcpp::export]] 58 | ListOf skip_ngrams_vectorised(ListOf words, 59 | ListOf skips, 60 | CharacterVector stopwords){ 61 | 62 | // Create output object and set up for further work 63 | unsigned int input_size = words.size(); 64 | List output(input_size); 65 | 66 | // Create stopwords set 67 | std::set < std::string > checked_stopwords; 68 | for(unsigned int i = 0; i < stopwords.size(); i++){ 69 | if(stopwords[i] != NA_STRING){ 70 | checked_stopwords.insert(as(stopwords[i])); 71 | } 72 | } 73 | 74 | for(unsigned int i = 0; i < input_size; i++){ 75 | if(i % 10000 == 0){ 76 | Rcpp::checkUserInterrupt(); 77 | } 78 | output[i] = skip_ngrams(words[i], skips, checked_stopwords); 79 | } 80 | 81 | return output; 82 | } 83 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(tokenizers) 3 | 4 | test_check("tokenizers") 5 | -------------------------------------------------------------------------------- /tests/testthat/helper-data.R: -------------------------------------------------------------------------------- 1 | paths <- list.files(".", pattern = "\\.txt$", full.names = TRUE) 2 | docs_full <- lapply(paths, readLines, encoding = "UTF-8") 3 | docs_l <- lapply(docs_full, paste, collapse = "\n") 4 | # docs_l <- lapply(docs_full, enc2utf8) 5 | docs_c <- unlist(docs_l) 6 | names(docs_l) <- basename(paths) 7 | names(docs_c) <- basename(paths) 8 | docs_df <- data.frame(doc_id = names(docs_c), 9 | text = unname(docs_c), 10 | stringsAsFactors = FALSE) 11 | 12 | bad_list <- list(a = paste(letters, collapse = " "), b = letters) 13 | 14 | # Using this sample sentence only because it comes from the paper where 15 | # skip n-grams are defined. Not my favorite sentence. 16 | input <- "Insurgents killed in ongoing fighting." 17 | 18 | bigrams <- c("insurgents killed", "killed in", "in ongoing", "ongoing fighting") 19 | 20 | skip2_bigrams <- c("insurgents killed", "insurgents in", "insurgents ongoing", 21 | "killed in", "killed ongoing", "killed fighting", 22 | "in ongoing", "in fighting", "ongoing fighting") 23 | 24 | trigrams <- c("insurgents killed in", "killed in ongoing", "in ongoing fighting") 25 | 26 | skip2_trigrams <- c("insurgents killed in", "insurgents killed ongoing", 27 | "insurgents killed fighting", "insurgents in ongoing", 28 | "insurgents in fighting", "insurgents ongoing fighting", 29 | "killed in ongoing", "killed in fighting", 30 | "killed ongoing fighting", "in ongoing fighting") 31 | -------------------------------------------------------------------------------- /tests/testthat/moby-ch1.txt: -------------------------------------------------------------------------------- 1 | CHAPTER 1. Loomings. 2 | 3 | Call me Ishmael. 
Some years ago--never mind how long precisely--having 4 | little or no money in my purse, and nothing particular to interest me on 5 | shore, I thought I would sail about a little and see the watery part of 6 | the world. It is a way I have of driving off the spleen and regulating 7 | the circulation. Whenever I find myself growing grim about the mouth; 8 | whenever it is a damp, drizzly November in my soul; whenever I find 9 | myself involuntarily pausing before coffin warehouses, and bringing up 10 | the rear of every funeral I meet; and especially whenever my hypos get 11 | such an upper hand of me, that it requires a strong moral principle to 12 | prevent me from deliberately stepping into the street, and methodically 13 | knocking people's hats off--then, I account it high time to get to sea 14 | as soon as I can. This is my substitute for pistol and ball. With a 15 | philosophical flourish Cato throws himself upon his sword; I quietly 16 | take to the ship. There is nothing surprising in this. If they but knew 17 | it, almost all men in their degree, some time or other, cherish very 18 | nearly the same feelings towards the ocean with me. 19 | 20 | There now is your insular city of the Manhattoes, belted round by 21 | wharves as Indian isles by coral reefs--commerce surrounds it with 22 | her surf. Right and left, the streets take you waterward. Its extreme 23 | downtown is the battery, where that noble mole is washed by waves, and 24 | cooled by breezes, which a few hours previous were out of sight of land. 25 | Look at the crowds of water-gazers there. 26 | 27 | Circumambulate the city of a dreamy Sabbath afternoon. Go from Corlears 28 | Hook to Coenties Slip, and from thence, by Whitehall, northward. What 29 | do you see?--Posted like silent sentinels all around the town, stand 30 | thousands upon thousands of mortal men fixed in ocean reveries. Some 31 | leaning against the spiles; some seated upon the pier-heads; some 32 | looking over the bulwarks of ships from China; some high aloft in the 33 | rigging, as if striving to get a still better seaward peep. But these 34 | are all landsmen; of week days pent up in lath and plaster--tied to 35 | counters, nailed to benches, clinched to desks. How then is this? Are 36 | the green fields gone? What do they here? 37 | 38 | But look! here come more crowds, pacing straight for the water, and 39 | seemingly bound for a dive. Strange! Nothing will content them but the 40 | extremest limit of the land; loitering under the shady lee of yonder 41 | warehouses will not suffice. No. They must get just as nigh the water 42 | as they possibly can without falling in. And there they stand--miles of 43 | them--leagues. Inlanders all, they come from lanes and alleys, streets 44 | and avenues--north, east, south, and west. Yet here they all unite. 45 | Tell me, does the magnetic virtue of the needles of the compasses of all 46 | those ships attract them thither? 47 | 48 | Once more. Say you are in the country; in some high land of lakes. Take 49 | almost any path you please, and ten to one it carries you down in a 50 | dale, and leaves you there by a pool in the stream. There is magic 51 | in it. Let the most absent-minded of men be plunged in his deepest 52 | reveries--stand that man on his legs, set his feet a-going, and he will 53 | infallibly lead you to water, if water there be in all that region. 
54 | Should you ever be athirst in the great American desert, try this 55 | experiment, if your caravan happen to be supplied with a metaphysical 56 | professor. Yes, as every one knows, meditation and water are wedded for 57 | ever. 58 | 59 | But here is an artist. He desires to paint you the dreamiest, shadiest, 60 | quietest, most enchanting bit of romantic landscape in all the valley of 61 | the Saco. What is the chief element he employs? There stand his trees, 62 | each with a hollow trunk, as if a hermit and a crucifix were within; and 63 | here sleeps his meadow, and there sleep his cattle; and up from yonder 64 | cottage goes a sleepy smoke. Deep into distant woodlands winds a 65 | mazy way, reaching to overlapping spurs of mountains bathed in their 66 | hill-side blue. But though the picture lies thus tranced, and though 67 | this pine-tree shakes down its sighs like leaves upon this shepherd's 68 | head, yet all were vain, unless the shepherd's eye were fixed upon the 69 | magic stream before him. Go visit the Prairies in June, when for scores 70 | on scores of miles you wade knee-deep among Tiger-lilies--what is the 71 | one charm wanting?--Water--there is not a drop of water there! Were 72 | Niagara but a cataract of sand, would you travel your thousand miles to 73 | see it? Why did the poor poet of Tennessee, upon suddenly receiving two 74 | handfuls of silver, deliberate whether to buy him a coat, which he sadly 75 | needed, or invest his money in a pedestrian trip to Rockaway Beach? Why 76 | is almost every robust healthy boy with a robust healthy soul in him, at 77 | some time or other crazy to go to sea? Why upon your first voyage as a 78 | passenger, did you yourself feel such a mystical vibration, when first 79 | told that you and your ship were now out of sight of land? Why did the 80 | old Persians hold the sea holy? Why did the Greeks give it a separate 81 | deity, and own brother of Jove? Surely all this is not without meaning. 82 | And still deeper the meaning of that story of Narcissus, who because 83 | he could not grasp the tormenting, mild image he saw in the fountain, 84 | plunged into it and was drowned. But that same image, we ourselves see 85 | in all rivers and oceans. It is the image of the ungraspable phantom of 86 | life; and this is the key to it all. 87 | 88 | Now, when I say that I am in the habit of going to sea whenever I begin 89 | to grow hazy about the eyes, and begin to be over conscious of my lungs, 90 | I do not mean to have it inferred that I ever go to sea as a passenger. 91 | For to go as a passenger you must needs have a purse, and a purse is 92 | but a rag unless you have something in it. Besides, passengers get 93 | sea-sick--grow quarrelsome--don't sleep of nights--do not enjoy 94 | themselves much, as a general thing;--no, I never go as a passenger; 95 | nor, though I am something of a salt, do I ever go to sea as a 96 | Commodore, or a Captain, or a Cook. I abandon the glory and distinction 97 | of such offices to those who like them. For my part, I abominate all 98 | honourable respectable toils, trials, and tribulations of every kind 99 | whatsoever. It is quite as much as I can do to take care of myself, 100 | without taking care of ships, barques, brigs, schooners, and what not. 
101 | And as for going as cook,--though I confess there is considerable glory 102 | in that, a cook being a sort of officer on ship-board--yet, somehow, 103 | I never fancied broiling fowls;--though once broiled, judiciously 104 | buttered, and judgmatically salted and peppered, there is no one who 105 | will speak more respectfully, not to say reverentially, of a broiled 106 | fowl than I will. It is out of the idolatrous dotings of the old 107 | Egyptians upon broiled ibis and roasted river horse, that you see the 108 | mummies of those creatures in their huge bake-houses the pyramids. 109 | 110 | No, when I go to sea, I go as a simple sailor, right before the mast, 111 | plumb down into the forecastle, aloft there to the royal mast-head. 112 | True, they rather order me about some, and make me jump from spar to 113 | spar, like a grasshopper in a May meadow. And at first, this sort 114 | of thing is unpleasant enough. It touches one's sense of honour, 115 | particularly if you come of an old established family in the land, the 116 | Van Rensselaers, or Randolphs, or Hardicanutes. And more than all, 117 | if just previous to putting your hand into the tar-pot, you have been 118 | lording it as a country schoolmaster, making the tallest boys stand 119 | in awe of you. The transition is a keen one, I assure you, from a 120 | schoolmaster to a sailor, and requires a strong decoction of Seneca and 121 | the Stoics to enable you to grin and bear it. But even this wears off in 122 | time. 123 | 124 | What of it, if some old hunks of a sea-captain orders me to get a broom 125 | and sweep down the decks? What does that indignity amount to, weighed, 126 | I mean, in the scales of the New Testament? Do you think the archangel 127 | Gabriel thinks anything the less of me, because I promptly and 128 | respectfully obey that old hunks in that particular instance? Who ain't 129 | a slave? Tell me that. Well, then, however the old sea-captains may 130 | order me about--however they may thump and punch me about, I have the 131 | satisfaction of knowing that it is all right; that everybody else is 132 | one way or other served in much the same way--either in a physical 133 | or metaphysical point of view, that is; and so the universal thump is 134 | passed round, and all hands should rub each other's shoulder-blades, and 135 | be content. 136 | 137 | Again, I always go to sea as a sailor, because they make a point of 138 | paying me for my trouble, whereas they never pay passengers a single 139 | penny that I ever heard of. On the contrary, passengers themselves must 140 | pay. And there is all the difference in the world between paying 141 | and being paid. The act of paying is perhaps the most uncomfortable 142 | infliction that the two orchard thieves entailed upon us. But BEING 143 | PAID,--what will compare with it? The urbane activity with which a man 144 | receives money is really marvellous, considering that we so earnestly 145 | believe money to be the root of all earthly ills, and that on no account 146 | can a monied man enter heaven. Ah! how cheerfully we consign ourselves 147 | to perdition! 148 | 149 | Finally, I always go to sea as a sailor, because of the wholesome 150 | exercise and pure air of the fore-castle deck. 
For as in this world, 151 | head winds are far more prevalent than winds from astern (that is, 152 | if you never violate the Pythagorean maxim), so for the most part the 153 | Commodore on the quarter-deck gets his atmosphere at second hand from 154 | the sailors on the forecastle. He thinks he breathes it first; but not 155 | so. In much the same way do the commonalty lead their leaders in many 156 | other things, at the same time that the leaders little suspect it. 157 | But wherefore it was that after having repeatedly smelt the sea as a 158 | merchant sailor, I should now take it into my head to go on a whaling 159 | voyage; this the invisible police officer of the Fates, who has the 160 | constant surveillance of me, and secretly dogs me, and influences me 161 | in some unaccountable way--he can better answer than any one else. And, 162 | doubtless, my going on this whaling voyage, formed part of the grand 163 | programme of Providence that was drawn up a long time ago. It came in as 164 | a sort of brief interlude and solo between more extensive performances. 165 | I take it that this part of the bill must have run something like this: 166 | 167 | 168 | "GRAND CONTESTED ELECTION FOR THE PRESIDENCY OF THE UNITED STATES. 169 | 170 | "WHALING VOYAGE BY ONE ISHMAEL. 171 | 172 | "BLOODY BATTLE IN AFFGHANISTAN." 173 | 174 | 175 | Though I cannot tell why it was exactly that those stage managers, the 176 | Fates, put me down for this shabby part of a whaling voyage, when others 177 | were set down for magnificent parts in high tragedies, and short and 178 | easy parts in genteel comedies, and jolly parts in farces--though 179 | I cannot tell why this was exactly; yet, now that I recall all the 180 | circumstances, I think I can see a little into the springs and motives 181 | which being cunningly presented to me under various disguises, induced 182 | me to set about performing the part I did, besides cajoling me into the 183 | delusion that it was a choice resulting from my own unbiased freewill 184 | and discriminating judgment. 185 | 186 | Chief among these motives was the overwhelming idea of the great 187 | whale himself. Such a portentous and mysterious monster roused all my 188 | curiosity. Then the wild and distant seas where he rolled his island 189 | bulk; the undeliverable, nameless perils of the whale; these, with all 190 | the attending marvels of a thousand Patagonian sights and sounds, helped 191 | to sway me to my wish. With other men, perhaps, such things would not 192 | have been inducements; but as for me, I am tormented with an everlasting 193 | itch for things remote. I love to sail forbidden seas, and land on 194 | barbarous coasts. Not ignoring what is good, I am quick to perceive a 195 | horror, and could still be social with it--would they let me--since it 196 | is but well to be on friendly terms with all the inmates of the place 197 | one lodges in. 198 | 199 | By reason of these things, then, the whaling voyage was welcome; the 200 | great flood-gates of the wonder-world swung open, and in the wild 201 | conceits that swayed me to my purpose, two and two there floated into 202 | my inmost soul, endless processions of the whale, and, mid most of them 203 | all, one grand hooded phantom, like a snow hill in the air. 204 | -------------------------------------------------------------------------------- /tests/testthat/moby-ch2.txt: -------------------------------------------------------------------------------- 1 | CHAPTER 2. The Carpet-Bag. 
2 | 3 | I stuffed a shirt or two into my old carpet-bag, tucked it under my arm, 4 | and started for Cape Horn and the Pacific. Quitting the good city of 5 | old Manhatto, I duly arrived in New Bedford. It was a Saturday night in 6 | December. Much was I disappointed upon learning that the little packet 7 | for Nantucket had already sailed, and that no way of reaching that place 8 | would offer, till the following Monday. 9 | 10 | As most young candidates for the pains and penalties of whaling stop at 11 | this same New Bedford, thence to embark on their voyage, it may as well 12 | be related that I, for one, had no idea of so doing. For my mind was 13 | made up to sail in no other than a Nantucket craft, because there was a 14 | fine, boisterous something about everything connected with that famous 15 | old island, which amazingly pleased me. Besides though New Bedford has 16 | of late been gradually monopolising the business of whaling, and though 17 | in this matter poor old Nantucket is now much behind her, yet Nantucket 18 | was her great original--the Tyre of this Carthage;--the place where the 19 | first dead American whale was stranded. Where else but from Nantucket 20 | did those aboriginal whalemen, the Red-Men, first sally out in canoes to 21 | give chase to the Leviathan? And where but from Nantucket, too, did that 22 | first adventurous little sloop put forth, partly laden with imported 23 | cobblestones--so goes the story--to throw at the whales, in order to 24 | discover when they were nigh enough to risk a harpoon from the bowsprit? 25 | 26 | Now having a night, a day, and still another night following before me 27 | in New Bedford, ere I could embark for my destined port, it became a 28 | matter of concernment where I was to eat and sleep meanwhile. It was a 29 | very dubious-looking, nay, a very dark and dismal night, bitingly cold 30 | and cheerless. I knew no one in the place. With anxious grapnels I had 31 | sounded my pocket, and only brought up a few pieces of silver,--So, 32 | wherever you go, Ishmael, said I to myself, as I stood in the middle of 33 | a dreary street shouldering my bag, and comparing the gloom towards the 34 | north with the darkness towards the south--wherever in your wisdom you 35 | may conclude to lodge for the night, my dear Ishmael, be sure to inquire 36 | the price, and don't be too particular. 37 | 38 | With halting steps I paced the streets, and passed the sign of "The 39 | Crossed Harpoons"--but it looked too expensive and jolly there. Further 40 | on, from the bright red windows of the "Sword-Fish Inn," there came such 41 | fervent rays, that it seemed to have melted the packed snow and ice from 42 | before the house, for everywhere else the congealed frost lay ten inches 43 | thick in a hard, asphaltic pavement,--rather weary for me, when I struck 44 | my foot against the flinty projections, because from hard, remorseless 45 | service the soles of my boots were in a most miserable plight. Too 46 | expensive and jolly, again thought I, pausing one moment to watch the 47 | broad glare in the street, and hear the sounds of the tinkling glasses 48 | within. But go on, Ishmael, said I at last; don't you hear? get away 49 | from before the door; your patched boots are stopping the way. So on I 50 | went. I now by instinct followed the streets that took me waterward, for 51 | there, doubtless, were the cheapest, if not the cheeriest inns. 52 | 53 | Such dreary streets! 
blocks of blackness, not houses, on either hand, 54 | and here and there a candle, like a candle moving about in a tomb. At 55 | this hour of the night, of the last day of the week, that quarter of 56 | the town proved all but deserted. But presently I came to a smoky light 57 | proceeding from a low, wide building, the door of which stood invitingly 58 | open. It had a careless look, as if it were meant for the uses of the 59 | public; so, entering, the first thing I did was to stumble over an 60 | ash-box in the porch. Ha! thought I, ha, as the flying particles almost 61 | choked me, are these ashes from that destroyed city, Gomorrah? But "The 62 | Crossed Harpoons," and "The Sword-Fish?"--this, then must needs be the 63 | sign of "The Trap." However, I picked myself up and hearing a loud voice 64 | within, pushed on and opened a second, interior door. 65 | 66 | It seemed the great Black Parliament sitting in Tophet. A hundred black 67 | faces turned round in their rows to peer; and beyond, a black Angel 68 | of Doom was beating a book in a pulpit. It was a negro church; and the 69 | preacher's text was about the blackness of darkness, and the weeping and 70 | wailing and teeth-gnashing there. Ha, Ishmael, muttered I, backing out, 71 | Wretched entertainment at the sign of 'The Trap!' 72 | 73 | Moving on, I at last came to a dim sort of light not far from the docks, 74 | and heard a forlorn creaking in the air; and looking up, saw a swinging 75 | sign over the door with a white painting upon it, faintly representing 76 | a tall straight jet of misty spray, and these words underneath--"The 77 | Spouter Inn:--Peter Coffin." 78 | 79 | Coffin?--Spouter?--Rather ominous in that particular connexion, thought 80 | I. But it is a common name in Nantucket, they say, and I suppose this 81 | Peter here is an emigrant from there. As the light looked so dim, and 82 | the place, for the time, looked quiet enough, and the dilapidated little 83 | wooden house itself looked as if it might have been carted here from 84 | the ruins of some burnt district, and as the swinging sign had a 85 | poverty-stricken sort of creak to it, I thought that here was the very 86 | spot for cheap lodgings, and the best of pea coffee. 87 | 88 | It was a queer sort of place--a gable-ended old house, one side palsied 89 | as it were, and leaning over sadly. It stood on a sharp bleak corner, 90 | where that tempestuous wind Euroclydon kept up a worse howling than ever 91 | it did about poor Paul's tossed craft. Euroclydon, nevertheless, is a 92 | mighty pleasant zephyr to any one in-doors, with his feet on the hob 93 | quietly toasting for bed. "In judging of that tempestuous wind called 94 | Euroclydon," says an old writer--of whose works I possess the only copy 95 | extant--"it maketh a marvellous difference, whether thou lookest out at 96 | it from a glass window where the frost is all on the outside, or whether 97 | thou observest it from that sashless window, where the frost is on both 98 | sides, and of which the wight Death is the only glazier." True enough, 99 | thought I, as this passage occurred to my mind--old black-letter, thou 100 | reasonest well. Yes, these eyes are windows, and this body of mine is 101 | the house. What a pity they didn't stop up the chinks and the crannies 102 | though, and thrust in a little lint here and there. But it's too late 103 | to make any improvements now. The universe is finished; the copestone 104 | is on, and the chips were carted off a million years ago. 
Poor Lazarus 105 | there, chattering his teeth against the curbstone for his pillow, and 106 | shaking off his tatters with his shiverings, he might plug up both ears 107 | with rags, and put a corn-cob into his mouth, and yet that would not 108 | keep out the tempestuous Euroclydon. Euroclydon! says old Dives, in his 109 | red silken wrapper--(he had a redder one afterwards) pooh, pooh! What 110 | a fine frosty night; how Orion glitters; what northern lights! Let them 111 | talk of their oriental summer climes of everlasting conservatories; give 112 | me the privilege of making my own summer with my own coals. 113 | 114 | But what thinks Lazarus? Can he warm his blue hands by holding them up 115 | to the grand northern lights? Would not Lazarus rather be in Sumatra 116 | than here? Would he not far rather lay him down lengthwise along the 117 | line of the equator; yea, ye gods! go down to the fiery pit itself, in 118 | order to keep out this frost? 119 | 120 | Now, that Lazarus should lie stranded there on the curbstone before the 121 | door of Dives, this is more wonderful than that an iceberg should be 122 | moored to one of the Moluccas. Yet Dives himself, he too lives like a 123 | Czar in an ice palace made of frozen sighs, and being a president of a 124 | temperance society, he only drinks the tepid tears of orphans. 125 | 126 | But no more of this blubbering now, we are going a-whaling, and there is 127 | plenty of that yet to come. Let us scrape the ice from our frosted feet, 128 | and see what sort of a place this "Spouter" may be. 129 | 130 | -------------------------------------------------------------------------------- /tests/testthat/moby-ch3.txt: -------------------------------------------------------------------------------- 1 | CHAPTER 3 2 | 3 | The Spouter-Inn 4 | 5 | Entering that gable-ended Spouter-Inn, you found yourself 6 | in a wide, low, straggling entry with old-fashioned wainscots, 7 | reminding one of the bulwarks of some condemned old craft. 8 | On one side hung a very large oil painting so thoroughly besmoked, 9 | and every way defaced, that in the unequal crosslights by which 10 | you viewed it, it was only by diligent study and a series of 11 | systematic visits to it, and careful inquiry of the neighbors, 12 | that you could any way arrive at an understanding of its purpose. 13 | Such unaccountable masses of shades and shadows, that at 14 | first you almost thought some ambitious young artist, 15 | in the time of the New England hags, had endeavored to delineate 16 | chaos bewitched. But by dint of much and earnest contemplation, 17 | and oft repeated ponderings, and especially by throwing open 18 | the little window towards the back of the entry, you at last 19 | come to the conclusion that such an idea, however wild, 20 | might not be altogether unwarranted. 21 | 22 | But what most puzzled and confounded you was a long, limber, portentous, 23 | black mass of something hovering in the centre of the picture over 24 | three blue, dim, perpendicular lines floating in a nameless yeast. 25 | A boggy, soggy, squitchy picture truly, enough to drive 26 | a nervous man distracted. Yet was there a sort of indefinite, 27 | half-attained, unimaginable sublimity about it that fairly froze 28 | you to it, till you involuntarily took an oath with yourself 29 | to find out what that marvellous painting meant. 
Ever and anon 30 | a bright, but, alas, deceptive idea would dart you through.-- 31 | It's the Black Sea in a midnight gale.--It's the unnatural 32 | combat of the four primal elements.--It's a blasted heath.-- 33 | It's a Hyperborean winter scene.--It's the breaking-up of 34 | the icebound stream of Time. But at last all these fancies 35 | yielded to that one portentous something in the picture's midst. 36 | That once found out, and all the rest were plain. But stop; 37 | does it not bear a faint resemblance to a gigantic fish? even 38 | the great leviathan himself? 39 | 40 | In fact, the artist's design seemed this: a final theory of my own, 41 | partly based upon the aggregated opinions of many aged persons 42 | with whom I conversed upon the subject. The picture represents 43 | a Cape-Horner in a great hurricane; the half-foundered ship 44 | weltering there with its three dismantled masts alone visible; 45 | and an exasperated whale, purposing to spring clean over the craft, 46 | is in the enormous act of impaling himself upon the three mast-heads. 47 | 48 | The opposite wall of this entry was hung all over with a heathenish array 49 | of monstrous clubs and spears. Some were thickly set with glittering 50 | teeth resembling ivory saws; others were tufted with knots of human hair; 51 | and one was sickle-shaped, with a vast handle sweeping round 52 | like the segment made in the new-mown grass by a long-armed mower. 53 | You shuddered as you gazed, and wondered what monstrous cannibal 54 | and savage could ever have gone a death-harvesting with such a hacking, 55 | horrifying implement. Mixed with these were rusty old whaling lances 56 | and harpoons all broken and deformed. Some were storied weapons. 57 | With this once long lance, now wildly elbowed, fifty years ago did 58 | Nathan Swain kill fifteen whales between a sunrise and a sunset. 59 | And that harpoon--so like a corkscrew now--was flung in Javan seas, 60 | and run away with by a whale, years afterwards slain off the Cape 61 | of Blanco. The original iron entered nigh the tail, and, like a restless 62 | needle sojourning in the body of a man, travelled full forty feet, 63 | and at last was found imbedded in the hump. 64 | 65 | Crossing this dusky entry, and on through yon low-arched way-- 66 | cut through what in old times must have been a great central 67 | chimney with fireplaces all round--you enter the public room. 68 | A still duskier place is this, with such low ponderous 69 | beams above, and such old wrinkled planks beneath, that you 70 | would almost fancy you trod some old craft's cockpits, 71 | especially of such a howling night, when this corner-anchored 72 | old ark rocked so furiously. On one side stood a long, low, 73 | shelf-like table covered with cracked glass cases, filled with 74 | dusty rarities gathered from this wide world's remotest nooks. 75 | Projecting from the further angle of the room stands a 76 | dark-looking den--the bar--a rude attempt at a right whale's head. 77 | Be that how it may, there stands the vast arched bone of the 78 | whale's jaw, so wide, a coach might almost drive beneath it. 79 | Within are shabby shelves, ranged round with old decanters, 80 | bottles, flasks; and in those jaws of swift destruction, 81 | like another cursed Jonah (by which name indeed they called 82 | him), bustles a little withered old man, who, for their money, 83 | dearly sells the sailors deliriums and death. 84 | 85 | Abominable are the tumblers into which he pours his poison. 
86 | Though true cylinders without--within, the villanous green goggling 87 | glasses deceitfully tapered downwards to a cheating bottom. 88 | Parallel meridians rudely pecked into the glass, surround 89 | these footpads' goblets. Fill to this mark, and your charge is 90 | but a penny; to this a penny more; and so on to the full glass-- 91 | the Cape Horn measure, which you may gulp down for a shilling. 92 | 93 | Upon entering the place I found a number of young seamen gathered about 94 | a table, examining by a dim light divers specimens of skrimshander. 95 | I sought the landlord, and telling him I desired to be accommodated 96 | with a room, received for answer that his house was full-- 97 | not a bed unoccupied. "But avast," he added, tapping his forehead, 98 | "you haint no objections to sharing a harpooneer's blanket, have ye? 99 | I s'pose you are goin' a-whalin', so you'd better get used to that 100 | sort of thing." 101 | 102 | I told him that I never liked to sleep two in a bed; that if I 103 | should ever do so, it would depend upon who the harpooneer might be, 104 | and that if he (the landlord) really had no other place for me, 105 | and the harpooneer was not decidedly objectionable, why rather 106 | than wander further about a strange town on so bitter a night, 107 | I would put up with the half of any decent man's blanket. 108 | 109 | "I thought so. All right; take a seat. Supper?--you want supper? 110 | Supper'll be ready directly." 111 | 112 | I sat down on an old wooden settle, carved all over like a 113 | bench on the Battery. At one end a ruminating tar was still 114 | further adorning it with his jack-knife, stooping over 115 | and diligently working away at the space between his legs. 116 | He was trying his hand at a ship under full sail, but he didn't 117 | make much headway, I thought. 118 | 119 | At last some four or five of us were summoned to our 120 | meal in an adjoining room. It was cold as Iceland-- 121 | no fire at all--the landlord said he couldn't afford it. 122 | Nothing but two dismal tallow candles, each in a winding sheet. 123 | We were fain to button up our monkey jackets, and hold to our 124 | lips cups of scalding tea with our half frozen fingers. 125 | But the fare was of the most substantial kind--not only meat 126 | and potatoes, but dumplings; good heavens! dumplings for supper! 127 | One young fellow in a green box coat, addressed himself 128 | to these dumplings in a most direful manner. 129 | 130 | "My boy," said the landlord, "you'll have the nightmare 131 | to a dead sartainty." 132 | 133 | "Landlord," I whispered, "that aint the harpooneer is it?" 134 | 135 | "Oh, no," said he, looking a sort of diabolically funny, "the harpooneer 136 | is a dark complexioned chap. He never eats dumplings, he don't-- 137 | he eats nothing but steaks, and he likes 'em rare." 138 | 139 | "The devil he does," says I. "Where is that harpooneer? 140 | Is he here?" 141 | 142 | "He'll be here afore long," was the answer. 143 | 144 | I could not help it, but I began to feel suspicious of this 145 | "dark complexioned" harpooneer. At any rate, I made up my 146 | mind that if it so turned out that we should sleep together, 147 | he must undress and get into bed before I did. 148 | 149 | Supper over, the company went back to the bar-room, when, 150 | knowing not what else to do with myself, I resolved to spend 151 | the rest of the evening as a looker on. 152 | 153 | Presently a rioting noise was heard without. 
Starting up, 154 | the landlord cried, "That's the Grampus's crew. I seed her reported 155 | in the offing this morning; a three years' voyage, and a full ship. 156 | Hurrah, boys; now we'll have the latest news from the Feegees." 157 | 158 | A tramping of sea boots was heard in the entry; the door was flung open, 159 | and in rolled a wild set of mariners enough. Enveloped in their shaggy 160 | watch coats, and with their heads muffled in woollen comforters, 161 | all bedarned and ragged, and their beards stiff with icicles, 162 | they seemed an eruption of bears from Labrador. They had just 163 | landed from their boat, and this was the first house they entered. 164 | No wonder, then, that they made a straight wake for the whale's mouth-- 165 | the bar--when the wrinkled little old Jonah, there officiating, 166 | soon poured them out brimmers all round. One complained of a bad 167 | cold in his head, upon which Jonah mixed him a pitch-like potion 168 | of gin and molasses, which he swore was a sovereign cure for all 169 | colds and catarrhs whatsoever, never mind of how long standing, 170 | or whether caught off the coast of Labrador, or on the weather side 171 | of an ice-island. 172 | 173 | The liquor soon mounted into their heads, as it generally 174 | does even with the arrantest topers newly landed from sea, 175 | and they began capering about most obstreperously. 176 | 177 | I observed, however, that one of them held somewhat aloof, 178 | and though he seemed desirous not to spoil the hilarity of his 179 | shipmates by his own sober face, yet upon the whole he refrained from 180 | making as much noise as the rest. This man interested me at once; 181 | and since the sea-gods had ordained that he should soon become my shipmate 182 | (though but a sleeping partner one, so far as this narrative is 183 | concerned), I will here venture upon a little description of him. 184 | He stood full six feet in height, with noble shoulders, and a chest 185 | like a coffer-dam. I have seldom seen such brawn in a man. 186 | His face was deeply brown and burnt, making his white teeth 187 | dazzling by the contrast; while in the deep shadows of his eyes 188 | floated some reminiscences that did not seem to give him much joy. 189 | His voice at once announced that he was a Southerner, and from his 190 | fine stature, I thought he must be one of those tall mountaineers 191 | from the Alleghanian Ridge in Virginia. When the revelry of his 192 | companions had mounted to its height, this man slipped away unobserved, 193 | and I saw no more of him till he became my comrade on the sea. 194 | In a few minutes, however, he was missed by his shipmates, 195 | and being, it seems, for some reason a huge favorite with them, 196 | they raised a cry of "Bulkington! Bulkington! where's Bulkington?" 197 | and darted out of the house in pursuit of him. 198 | 199 | It was now about nine o'clock, and the room seeming almost 200 | supernaturally quiet after these orgies, I began to congratulate 201 | myself upon a little plan that had occurred to me just previous 202 | to the entrance of the seamen. 203 | 204 | No man prefers to sleep two in a bed. In fact, you would 205 | a good deal rather not sleep with your own brother. I don't know 206 | how it is, but people like to be private when they are sleeping. 207 | And when it comes to sleeping with an unknown stranger, 208 | in a strange inn, in a strange town, and that stranger 209 | a harpooneer, then your objections indefinitely multiply. 
210 | Nor was there any earthly reason why I as a sailor should sleep 211 | two in a bed, more than anybody else; for sailors no more 212 | sleep two in a bed at sea, than bachelor Kings do ashore. 213 | To be sure they all sleep together in one apartment, but you 214 | have your own hammock, and cover yourself with your own blanket, 215 | and sleep in your own skin. 216 | 217 | The more I pondered over this harpooneer, the more I abominated 218 | the thought of sleeping with him. It was fair to presume that 219 | being a harpooneer, his linen or woollen, as the case might be, 220 | would not be of the tidiest, certainly none of the finest. 221 | I began to twitch all over. Besides, it was getting late, 222 | and my decent harpooneer ought to be home and going bedwards. 223 | Suppose now, he should tumble in upon me at midnight-- 224 | how could I tell from what vile hole he had been coming? 225 | 226 | "Landlord! I've changed my mind about that harpooneer.-- 227 | I shan't sleep with him. I'll try the bench here." 228 | 229 | "Just as you please; I'm sorry I cant spare ye a tablecloth for 230 | a mattress, and it's a plaguy rough board here"--feeling of the knots 231 | and notches. "But wait a bit, Skrimshander; I've got a carpenter's 232 | plane there in the bar--wait, I say, and I'll make ye snug enough." 233 | So saying he procured the plane; and with his old silk handkerchief 234 | first dusting the bench, vigorously set to planing away at my bed, 235 | the while grinning like an ape. The shavings flew right and left; 236 | till at last the plane-iron came bump against an indestructible knot. 237 | The landlord was near spraining his wrist, and I told him for heaven's 238 | sake to quit--the bed was soft enough to suit me, and I did not know 239 | how all the planing in the world could make eider down of a pine plank. 240 | So gathering up the shavings with another grin, and throwing them into 241 | the great stove in the middle of the room, he went about his business, 242 | and left me in a brown study. 243 | 244 | I now took the measure of the bench, and found that it was 245 | a foot too short; but that could be mended with a chair. 246 | But it was a foot too narrow, and the other bench in 247 | the room was about four inches higher than the planed one-- 248 | so there was no yoking them. I then placed the first bench 249 | lengthwise along the only clear space against the wall, 250 | leaving a little interval between, for my back to settle down in. 251 | But I soon found that there came such a draught of cold air 252 | over me from under the sill of the window, that this plan would 253 | never do at all, especially as another current from the rickety 254 | door met the one from the window, and both together formed 255 | a series of small whirlwinds in the immediate vicinity of the spot 256 | where I had thought to spend the night. 257 | 258 | The devil fetch that harpooneer, thought I, but stop, 259 | couldn't I steal a march on him--bolt his door inside, and jump 260 | into his bed, not to be wakened by the most violent knockings? 261 | It seemed no bad idea but upon second thoughts I dismissed it. 262 | For who could tell but what the next morning, so soon as I popped 263 | out of the room, the harpooneer might be standing in the entry, 264 | all ready to knock me down! 
265 | 266 | Still looking around me again, and seeing no possible chance 267 | of spending a sufferable night unless in some other person's bed, 268 | I began to think that after all I might be cherishing 269 | unwarrantable prejudices against this unknown harpooneer. 270 | Thinks I, I'll wait awhile; he must be dropping in before long. 271 | I'll have a good look at him then, and perhaps we may become 272 | jolly good bedfellows after all--there's no telling. 273 | 274 | But though the other boarders kept coming in by ones, twos, and threes, 275 | and going to bed, yet no sign of my harpooneer. 276 | 277 | "Landlord! said I, "what sort of a chap is he--does he always 278 | keep such late hours?" It was now hard upon twelve o'clock. 279 | 280 | The landlord chuckled again with his lean chuckle, and seemed 281 | to be mightily tickled at something beyond my comprehension. 282 | "No," he answered, "generally he's an early bird--airley to bed 283 | and airley to rise--yea, he's the bird what catches the worm. 284 | But to-night he went out a peddling, you see, and I don't see 285 | what on airth keeps him so late, unless, may be, he can't 286 | sell his head." 287 | 288 | "Can't sell his head?--What sort of a bamboozingly story 289 | is this you are telling me?" getting into a towering rage. 290 | "Do you pretend to say, landlord, that this harpooneer is actually 291 | engaged this blessed Saturday night, or rather Sunday morning, 292 | in peddling his head around this town?" 293 | 294 | "That's precisely it," said the landlord, "and I told him he couldn't 295 | sell it here, the market's overstocked." 296 | 297 | "With what?" shouted I. 298 | 299 | "With heads to be sure; ain't there too many heads in the world?" 300 | 301 | "I tell you what it is, landlord," said I quite calmly, 302 | "you'd better stop spinning that yarn to me--I'm not green." 303 | 304 | "May be not," taking out a stick and whittling a toothpick, 305 | "but I rayther guess you'll be done brown if that ere harpooneer 306 | hears you a slanderin' his head." 307 | 308 | "I'll break it for him," said I, now flying into a passion again 309 | at this unaccountable farrago of the landlord's. 310 | 311 | "It's broke a'ready," said he. 312 | 313 | "Broke," said I--"broke, do you mean?" 314 | 315 | "Sartain, and that's the very reason he can't sell it, I guess." 316 | 317 | "Landlord," said I, going up to him as cool as Mt. Hecla in a 318 | snowstorm--"landlord, stop whittling. You and I must understand 319 | one another, and that too without delay. I come to your house 320 | and want a bed; you tell me you can only give me half a one; 321 | that the other half belongs to a certain harpooneer. 322 | And about this harpooneer, whom I have not yet seen, you persist 323 | in telling me the most mystifying and exasperating stories tending 324 | to beget in me an uncomfortable feeling towards the man whom you 325 | design for my bedfellow--a sort of connexion, landlord, which is 326 | an intimate and confidential one in the highest degree. 327 | I now demand of you to speak out and tell me who and what this 328 | harpooneer is, and whether I shall be in all respects safe 329 | to spend the night with him. 
And in the first place, you will 330 | be so good as to unsay that story about selling his head, 331 | which if true I take to be good evidence that this harpooneer 332 | is stark mad, and I've no idea of sleeping with a madman; 333 | and you, sir, you I mean, landlord, you, sir, by trying to induce 334 | me to do so knowingly would thereby render yourself liable 335 | to a criminal prosecution." 336 | 337 | "Wall," said the landlord, fetching a long breath, "that's a 338 | purty long sarmon for a chap that rips a little now and then. 339 | But be easy, be easy, this here harpooneer I have been tellin' 340 | you of has just arrived from the south seas, where he bought up 341 | a lot of 'balmed New Zealand heads (great curios, you know), 342 | and he's sold all on 'em but one, and that one he's trying to sell 343 | to-night, cause to-morrow's Sunday, and it would not do to be sellin' 344 | human heads about the streets when folks is goin' to churches. 345 | He wanted to last Sunday, but I stopped him just as he was goin' 346 | out of the door with four heads strung on a string, for all 347 | the airth like a string of inions." 348 | 349 | This account cleared up the otherwise unaccountable mystery, 350 | and showed that the landlord, after all, had had no idea of fooling me-- 351 | but at the same time what could I think of a harpooneer who stayed 352 | out of a Saturday night clean into the holy Sabbath, engaged in such 353 | a cannibal business as selling the heads of dead idolators? 354 | 355 | "Depend upon it, landlord, that harpooneer is a dangerous man." 356 | 357 | "He pays reg'lar," was the rejoinder. "But come, it's getting 358 | dreadful late, you had better be turning flukes--it's a nice bed: 359 | Sal and me slept in that ere bed the night we were spliced. 360 | There's plenty of room for two to kick about in that bed; 361 | it's an almighty big bed that. Why, afore we give it up, 362 | Sal used to put our Sam and little Johnny in the foot of it. 363 | But I got a dreaming and sprawling about one night, and somehow, 364 | Sam got pitched on the floor, and came near breaking his arm. 365 | After that, Sal said it wouldn't do. Come along here, 366 | I'll give ye a glim in a jiffy;" and so saying he lighted 367 | a candle and held it towards me, offering to lead the way. 368 | But I stood irresolute; when looking at a clock in the corner, 369 | he exclaimed "I vum it's Sunday--you won't see that harpooneer to-night; 370 | he's come to anchor somewhere--come along then; do come; 371 | won't ye come?" 372 | 373 | I considered the matter a moment, and then up stairs we went, 374 | and I was ushered into a small room, cold as a clam, and furnished, 375 | sure enough, with a prodigious bed, almost big enough indeed 376 | for any four harpooneers to sleep abreast. 377 | 378 | "There," said the landlord, placing the candle on a crazy old 379 | sea chest that did double duty as a wash-stand and centre table; 380 | "there, make yourself comfortable now; and good night to ye." 381 | I turned round from eyeing the bed, but he had disappeared. 382 | 383 | Folding back the counterpane, I stooped over the bed. 384 | Though none of the most elegant, it yet stood the scrutiny 385 | tolerably well. I then glanced round the room; and besides 386 | the bedstead and centre table, could see no other furniture 387 | belonging to the place, but a rude shelf, the four walls, 388 | and a papered fireboard representing a man striking a whale. 
389 | Of things not properly belonging to the room, there was a 390 | hammock lashed up, and thrown upon the floor in one corner; 391 | also a large seaman's bag, containing the harpooneer's wardrobe, 392 | no doubt in lieu of a land trunk. Likewise, there was a parcel 393 | of outlandish bone fish hooks on the shelf over the fire-place, 394 | and a tall harpoon standing at the head of the bed. 395 | 396 | But what is this on the chest? I took it up, and held it close 397 | to the light, and felt it, and smelt it, and tried every way 398 | possible to arrive at some satisfactory conclusion concerning it. 399 | I can compare it to nothing but a large door mat, 400 | ornamented at the edges with little tinkling tags something 401 | like the stained porcupine quills round an Indian moccasin. 402 | There was a hole or slit in the middle of this mat, as you see 403 | the same in South American ponchos. But could it be possible 404 | that any sober harpooneer would get into a door mat, and parade 405 | the streets of any Christian town in that sort of guise? 406 | I put it on, to try it, and it weighed me down like a hamper, 407 | being uncommonly shaggy and thick, and I thought a little damp, 408 | as though this mysterious harpooneer had been wearing it 409 | of a rainy day. I went up in it to a bit of glass stuck 410 | against the wall, and I never saw such a sight in my life. 411 | I tore myself out of it in such a hurry that I gave myself 412 | a kink in the neck. 413 | 414 | I sat down on the side of the bed, and commenced thinking 415 | about this head-peddling harpooneer, and his door mat. 416 | After thinking some time on the bed-side, I got up and took off my 417 | monkey jacket, and then stood in the middle of the room thinking. 418 | I then took off my coat, and thought a little more in my shirt sleeves. 419 | But beginning to feel very cold now, half undressed as I was, 420 | and remembering what the landlord said about the harpooneer's 421 | not coming home at all that night, it being so very late, 422 | I made no more ado, but jumped out of my pantaloons and boots, 423 | and then blowing out the light tumbled into bed, and commended 424 | myself to the care of heaven. 425 | 426 | Whether that mattress was stuffed with corncobs or broken crockery, 427 | there is no telling, but I rolled about a good deal, and could 428 | not sleep for a long time. At last I slid off into a light doze, 429 | and had pretty nearly made a good offing towards the land of Nod, 430 | when I heard a heavy footfall in the passage, and saw a glimmer 431 | of light come into the room from under the door. 432 | 433 | Lord save me, thinks I, that must be the harpooneer, 434 | the infernal head-peddler. But I lay perfectly still, 435 | and resolved not to say a word till spoken to. Holding a light 436 | in one hand, and that identical New Zealand head in the other, 437 | the stranger entered the room, and without looking towards 438 | the bed, placed his candle a good way off from me on the floor 439 | in one corner, and then began working away at the knotted cords 440 | of the large bag I before spoke of as being in the room. 441 | I was all eagerness to see his face, but he kept it averted 442 | for some time while employed in unlacing the bag's mouth. 443 | This accomplished, however, he turned round--when, good heavens; 444 | what a sight! Such a face! It was of a dark, purplish, yellow color, 445 | here and there stuck over with large blackish looking squares. 
446 | Yes, it's just as I thought, he's a terrible bedfellow; 447 | he's been in a fight, got dreadfully cut, and here he is, 448 | just from the surgeon. But at that moment he chanced to turn 449 | his face so towards the light, that I plainly saw they could not 450 | be sticking-plasters at all, those black squares on his cheeks. 451 | They were stains of some sort or other. At first I knew not what 452 | to make of this; but soon an inkling of the truth occurred to me. 453 | I remembered a story of a white man--a whaleman too-- 454 | who, falling among the cannibals, had been tattooed by them. 455 | I concluded that this harpooneer, in the course of his 456 | distant voyages, must have met with a similar adventure. 457 | And what is it, thought I, after all! It's only his outside; 458 | a man can be honest in any sort of skin. But then, what to make of 459 | his unearthly complexion, that part of it, I mean, lying round about, 460 | and completely independent of the squares of tattooing. 461 | To be sure, it might be nothing but a good coat of tropical tanning; 462 | but I never heard of a hot sun's tanning a white man into a 463 | purplish yellow one. However, I had never been in the South Seas; 464 | and perhaps the sun there produced these extraordinary effects 465 | upon the skin. Now, while all these ideas were passing 466 | through me like lightning, this harpooneer never noticed me 467 | at all. But, after some difficulty having opened his bag, 468 | he commenced fumbling in it, and presently pulled out a sort 469 | of tomahawk, and a seal-skin wallet with the hair on. 470 | Placing these on the old chest in the middle of the room, 471 | he then took the New Zealand head--a ghastly thing enough-- 472 | and crammed it down into the bag. He now took off his hat-- 473 | a new beaver hat--when I came nigh singing out with fresh surprise. 474 | There was no hair on his head--none to speak of at least-- 475 | nothing but a small scalp-knot twisted up on his forehead. His bald 476 | purplish head now looked for all the world like a mildewed skull. 477 | Had not the stranger stood between me and the door, I would 478 | have bolted out of it quicker than ever I bolted a dinner. 479 | 480 | Even as it was, I thought something of slipping out of 481 | the window, but it was the second floor back. I am no coward, 482 | but what to make of this headpeddling purple rascal altogether 483 | passed my comprehension. Ignorance is the parent of fear, 484 | and being completely nonplussed and confounded about the stranger, 485 | I confess I was now as much afraid of him as if it was the devil 486 | himself who had thus broken into my room at the dead of night. 487 | In fact, I was so afraid of him that I was not game enough 488 | just then to address him, and demand a satisfactory answer 489 | concerning what seemed inexplicable in him. 490 | 491 | Meanwhile, he continued the business of undressing, and at 492 | last showed his chest and arms. As I live, these covered 493 | parts of him were checkered with the same squares as his face, 494 | his back, too, was all over the same dark squares; 495 | he seemed to have been in a Thirty Years' War, and just 496 | escaped from it with a sticking-plaster shirt. 497 | Still more, his very legs were marked, as if a parcel of dark 498 | green frogs were running up the trunks of young palms. 
499 | It was now quite plain that he must be some abominable savage 500 | or other shipped aboard of a whaleman in the South Seas, 501 | and so landed in this Christian country. I quaked to think of it. 502 | A peddler of heads too--perhaps the heads of his own brothers. 503 | He might take a fancy to mine--heavens! look at that tomahawk! 504 | 505 | But there was no time for shuddering, for now the savage went 506 | about something that completely fascinated my attention, 507 | and convinced me that he must indeed be a heathen. 508 | Going to his heavy grego, or wrapall, or dreadnaught, 509 | which he had previously hung on a chair, he fumbled in the pockets, 510 | and produced at length a curious little deformed image with a hunch 511 | on its back, and exactly the color of a three days' old Congo baby. 512 | Remembering the embalmed head, at first I almost thought that this 513 | black manikin was a real baby preserved in some similar manner. 514 | But seeing that it was not at all limber, and that it glistened 515 | a good deal like polished ebony, I concluded that it must 516 | be nothing but a wooden idol, which indeed it proved to be. 517 | For now the savage goes up to the empty fire-place, 518 | and removing the papered fire-board, sets up this little 519 | hunch-backed image, like a tenpin, between the andirons. 520 | The chimney jambs and all the bricks inside were very sooty, 521 | so that I thought this fire-place made a very appropriate little 522 | shrine or chapel for his Congo idol. 523 | 524 | I now screwed my eyes hard towards the half hidden image, 525 | feeling but ill at ease meantime--to see what was next to follow. 526 | First he takes about a double handful of shavings out of his grego pocket, 527 | and places them carefully before the idol; then laying a bit of ship 528 | biscuit on top and applying the flame from the lamp, he kindled 529 | the shavings into a sacrificial blaze. Presently, after many hasty 530 | snatches into the fire, and still hastier withdrawals of his fingers 531 | (whereby he seemed to be scorching them badly), he at last succeeded 532 | in drawing out the biscuit; then blowing off the heat and ashes 533 | a little, he made a polite offer of it to the little negro. 534 | But the little devil did not seem to fancy such dry sort of fare at all; 535 | he never moved his lips. All these strange antics were accompanied 536 | by still stranger guttural noises from the devotee, who seemed to be 537 | praying in a sing-song or else singing some pagan psalmody or other, 538 | during which his face twitched about in the most unnatural manner. 539 | At last extinguishing the fire, he took the idol up very unceremoniously, 540 | and bagged it again in his grego pocket as carelessly as if he were 541 | a sportsman bagging a dead woodcock. 542 | 543 | All these queer proceedings increased my uncomfortableness, 544 | and seeing him now exhibiting strong symptoms of concluding 545 | his business operations, and jumping into bed with me, I thought 546 | it was high time, now or never, before the light was put out, 547 | to break the spell in which I had so long been bound. 548 | 549 | But the interval I spent in deliberating what to say, was a fatal one. 550 | Taking up his tomahawk from the table, he examined the head of it 551 | for an instant, and then holding it to the light, with his mouth 552 | at the handle, he puffed out great clouds of tobacco smoke. 
553 | The next moment the light was extinguished, and this wild cannibal, 554 | tomahawk between his teeth, sprang into bed with me. I sang out, 555 | I could not help it now; and giving a sudden grunt of astonishment 556 | he began feeling me. 557 | 558 | Stammering out something, I knew not what, I rolled away from him 559 | against the wall, and then conjured him, whoever or whatever he might be, 560 | to keep quiet, and let me get up and light the lamp again. 561 | But his guttural responses satisfied me at once that he but ill 562 | comprehended my meaning. 563 | 564 | "Who-e debel you?"--he at last said--"you no speak-e, dam-me, I kill-e." 565 | And so saying the lighted tomahawk began flourishing about me in the dark. 566 | 567 | "Landlord, for God's sake, Peter Coffin!" shouted 568 | I. "Landlord! Watch! Coffin! Angels! save me!" 569 | 570 | "Speak-e! tell-ee me who-ee be, or dam-me, I kill-e!" again growled 571 | the cannibal, while his horrid flourishings of the tomahawk scattered 572 | the hot tobacco ashes about me till I thought my linen would get on fire. 573 | But thank heaven, at that moment the landlord came into the room light 574 | in hand, and leaping from the bed I ran up to him. 575 | 576 | "Don't be afraid now," said he, grinning again, "Queequeg here wouldn't 577 | harm a hair of your head." 578 | 579 | "Stop your grinning," shouted I, "and why didn't you tell me 580 | that that infernal harpooneer was a cannibal?" 581 | 582 | "I thought ye know'd it;--didn't I tell ye, he was a peddlin' 583 | heads around town?--but turn flukes again and go to sleep. 584 | Queequeg, look here--you sabbee me, I sabbee--you this man 585 | sleepe you--you sabbee?" 586 | 587 | "Me sabbee plenty"--grunted Queequeg, puffing away at his pipe 588 | and sitting up in bed. 589 | 590 | "You gettee in," he added, motioning to me with his tomahawk, 591 | and throwing the clothes to one side. He really did this 592 | in not only a civil but a really kind and charitable way. 593 | I stood looking at him a moment. For all his tattooings 594 | he was on the whole a clean, comely looking cannibal. 595 | What's all this fuss I have been making about, thought I 596 | to myself--the man's a human being just as I am: he has just 597 | as much reason to fear me, as I have to be afraid of him. 598 | Better sleep with a sober cannibal than a drunken Christian. 599 | 600 | "Landlord," said I, "tell him to stash his tomahawk there, or pipe, 601 | or whatever you call it; tell him to stop smoking, in short, and I will 602 | turn in with him. But I don't fancy having a man smoking in bed with me. 603 | It's dangerous. Besides, I ain't insured." 604 | 605 | This being told to Queequeg, he at once complied, and again politely 606 | motioned me to get into bed--rolling over to one side as much as to say-- 607 | I won't touch a leg of ye." 608 | 609 | "Good night, landlord," said I, "you may go." 610 | 611 | I turned in, and never slept better in my life. 
612 | 613 | -------------------------------------------------------------------------------- /tests/testthat/test-basic.R: -------------------------------------------------------------------------------- 1 | context("Basic tokenizers") 2 | 3 | test_that("Character tokenizer works as expected", { 4 | out_l <- tokenize_characters(docs_l) 5 | out_c <- tokenize_characters(docs_c) 6 | out_1 <- tokenize_characters(docs_c[1], simplify = TRUE) 7 | 8 | expect_is(out_l, "list") 9 | expect_is(out_l[[1]], "character") 10 | expect_is(out_c, "list") 11 | expect_is(out_c[[1]], "character") 12 | expect_is(out_1, "character") 13 | 14 | expect_identical(out_l, out_c) 15 | expect_identical(out_l[[1]], out_1) 16 | expect_identical(out_c[[1]], out_1) 17 | 18 | expect_named(out_l, names(docs_l)) 19 | expect_named(out_c, names(docs_c)) 20 | 21 | expect_error(tokenize_characters(bad_list)) 22 | }) 23 | 24 | test_that("Character tokenizer produces correct output", { 25 | # skip_on_os("windows") 26 | out_1 <- tokenize_characters(docs_c[1], simplify = TRUE) 27 | expected <- c("c", "h", "a", "p", "t") 28 | expect_identical(head(out_1, 5), expected) 29 | }) 30 | 31 | 32 | test_that("Word tokenizer works as expected", { 33 | out_l <- tokenize_words(docs_l) 34 | out_c <- tokenize_words(docs_c) 35 | out_1 <- tokenize_words(docs_c[1], simplify = TRUE) 36 | 37 | expect_is(out_l, "list") 38 | expect_is(out_l[[1]], "character") 39 | expect_is(out_c, "list") 40 | expect_is(out_c[[1]], "character") 41 | expect_is(out_1, "character") 42 | 43 | expect_identical(out_l, out_c) 44 | expect_identical(out_l[[1]], out_1) 45 | expect_identical(out_c[[1]], out_1) 46 | 47 | expect_named(out_l, names(docs_l)) 48 | expect_named(out_c, names(docs_c)) 49 | 50 | expect_error(tokenize_words(bad_list)) 51 | }) 52 | 53 | test_that("Word tokenizer produces correct output", { 54 | # skip_on_os("windows") 55 | out_1 <- tokenize_words(docs_c[1], simplify = TRUE) 56 | expected <- c("chapter", "1", "loomings", "call", "me") 57 | expect_identical(head(out_1, 5), expected) 58 | }) 59 | 60 | test_that("Word tokenizer removes stop words", { 61 | test <- "Now is the time for every good person" 62 | test_l <- list(test, test) 63 | stopwords <- c("is", "the", "for") 64 | expected <- c("now", "time", "every", "good", "person") 65 | expected_l <- list(expected, expected) 66 | expect_equal(tokenize_words(test, simplify = TRUE, stopwords = stopwords), 67 | expected) 68 | expect_equal(tokenize_words(test_l, stopwords = stopwords), expected_l) 69 | }) 70 | 71 | test_that("Word tokenizer can remove punctuation or numbers", { 72 | test_punct <- "This sentence ... has punctuation, doesn't it?" 73 | out_punct <- c("this", "sentence", ".", ".", ".", "has", "punctuation", 74 | ",", "doesn't", "it", "?") 75 | test_num <- "In 1968 the GDP was 1.2 trillion." 
76 | out_num_f <- c("in", "1968", "the", "gdp", "was", "1.2", "trillion") 77 | out_num_t <- c("in", "the", "gdp", "was", "trillion") 78 | expect_equal(tokenize_words(test_punct, simplify = TRUE, strip_punct = FALSE), 79 | out_punct) 80 | expect_equal(tokenize_words(test_num, simplify = TRUE, strip_numeric = FALSE), 81 | out_num_f) 82 | expect_equal(tokenize_words(test_num, simplify = TRUE, strip_numeric = TRUE), 83 | out_num_t) 84 | }) 85 | 86 | test_that("Sentence tokenizer works as expected", { 87 | out_l <- tokenize_sentences(docs_l) 88 | out_c <- tokenize_sentences(docs_c) 89 | out_1 <- tokenize_sentences(docs_c[1], simplify = TRUE) 90 | out_1_lc <- tokenize_sentences(docs_c[1], lowercase = TRUE, simplify = TRUE) 91 | out_1_pc <- tokenize_sentences(docs_c[1], strip_punct = TRUE, simplify = TRUE) 92 | 93 | expect_is(out_l, "list") 94 | expect_is(out_l[[1]], "character") 95 | expect_is(out_c, "list") 96 | expect_is(out_c[[1]], "character") 97 | expect_is(out_1, "character") 98 | 99 | expect_identical(out_l, out_c) 100 | expect_identical(out_l[[1]], out_1) 101 | expect_identical(out_c[[1]], out_1) 102 | 103 | expect_named(out_l, names(docs_l)) 104 | expect_named(out_c, names(docs_c)) 105 | 106 | expect_error(tokenize_sentences(bad_list)) 107 | }) 108 | 109 | test_that("Sentence tokenizer produces correct output", { 110 | # skip_on_os("windows") 111 | out_1 <- tokenize_sentences(docs_c[1], simplify = TRUE) 112 | out_1_lc <- tokenize_sentences(docs_c[1], lowercase = TRUE, simplify = TRUE) 113 | out_1_pc <- tokenize_sentences(docs_c[1], strip_punct = TRUE, simplify = TRUE) 114 | expected <- c("CHAPTER 1.", "Loomings.", "Call me Ishmael.") 115 | expected_pc <- c("CHAPTER 1", "Loomings", "Call me Ishmael") 116 | expect_identical(head(out_1, 3), expected) 117 | expect_identical(head(out_1_lc, 3), tolower(expected)) 118 | expect_identical(head(out_1_pc, 3), expected_pc) 119 | }) 120 | 121 | test_that("Line tokenizer works as expected", { 122 | out_l <- tokenize_lines(docs_l) 123 | out_c <- tokenize_lines(docs_c) 124 | out_1 <- tokenize_lines(docs_c[1], simplify = TRUE) 125 | 126 | expect_is(out_l, "list") 127 | expect_is(out_l[[1]], "character") 128 | expect_is(out_c, "list") 129 | expect_is(out_c[[1]], "character") 130 | expect_is(out_1, "character") 131 | 132 | expect_identical(out_l, out_c) 133 | expect_identical(out_l[[1]], out_1) 134 | expect_identical(out_c[[1]], out_1) 135 | 136 | expect_named(out_l, names(docs_l)) 137 | expect_named(out_c, names(docs_c)) 138 | 139 | expect_error(tokenize_lines(bad_list)) 140 | }) 141 | 142 | test_that("Sentence tokenizer produces correct output", { 143 | # skip_on_os("windows") 144 | out_1 <- tokenize_lines(docs_c[1], simplify = TRUE) 145 | expected <- c("CHAPTER 1. Loomings.", 146 | "Call me Ishmael. 
Some years ago--never mind how long precisely--having") 147 | expect_identical(head(out_1, 2), expected) 148 | }) 149 | 150 | 151 | test_that("Paragraph tokenizer works as expected", { 152 | out_l <- tokenize_paragraphs(docs_l) 153 | out_c <- tokenize_paragraphs(docs_c) 154 | out_1 <- tokenize_paragraphs(docs_c[1], simplify = TRUE) 155 | 156 | expect_is(out_l, "list") 157 | expect_is(out_l[[1]], "character") 158 | expect_is(out_c, "list") 159 | expect_is(out_c[[1]], "character") 160 | expect_is(out_1, "character") 161 | 162 | expect_identical(out_l, out_c) 163 | expect_identical(out_l[[1]], out_1) 164 | expect_identical(out_c[[1]], out_1) 165 | 166 | expect_named(out_l, names(docs_l)) 167 | expect_named(out_c, names(docs_c)) 168 | 169 | expect_error(tokenize_paragraphs(bad_list)) 170 | }) 171 | 172 | test_that("Paragraph tokenizer produces correct output", { 173 | # skip_on_os("windows") 174 | out_1 <- tokenize_paragraphs(docs_c[1], simplify = TRUE) 175 | expected <- c("There now is your insular city of the Manhattoes") 176 | expect_true(grepl(expected, out_1[3])) 177 | }) 178 | 179 | test_that("Regex tokenizer works as expected", { 180 | out_l <- tokenize_regex(docs_l, pattern = "[[:punct:]\n]") 181 | out_c <- tokenize_regex(docs_c, pattern = "[[:punct:]\n]") 182 | out_1 <- tokenize_regex(docs_c[1], pattern = "[[:punct:]\n]", simplify = TRUE) 183 | 184 | expect_is(out_l, "list") 185 | expect_is(out_l[[1]], "character") 186 | expect_is(out_c, "list") 187 | expect_is(out_c[[1]], "character") 188 | expect_is(out_1, "character") 189 | 190 | expect_identical(out_l, out_c) 191 | expect_identical(out_l[[1]], out_1) 192 | expect_identical(out_c[[1]], out_1) 193 | 194 | expect_named(out_l, names(docs_l)) 195 | expect_named(out_c, names(docs_c)) 196 | 197 | expect_error(tokenize_paragraphs(bad_list)) 198 | }) 199 | 200 | test_that("Regex tokenizer produces correct output", { 201 | # skip_on_os("windows") 202 | out_1 <- tokenize_regex(docs_c[1], pattern = "[[:punct:]\n]", simplify = TRUE) 203 | expected <- c("CHAPTER 1", " Loomings", "Call me Ishmael", " Some years ago", 204 | "never mind how long precisely") 205 | expect_identical(head(out_1, 5), expected) 206 | }) -------------------------------------------------------------------------------- /tests/testthat/test-chunking.R: -------------------------------------------------------------------------------- 1 | context("Document chunking") 2 | 3 | test_that("Document chunking work on lists and character vectors", { 4 | chunk_size <- 10 5 | out_l <- chunk_text(docs_l, chunk_size = chunk_size) 6 | out_c <- chunk_text(docs_c, chunk_size = chunk_size) 7 | 8 | expect_is(out_l, "list") 9 | expect_is(out_l[[1]], "character") 10 | expect_is(out_c, "list") 11 | expect_is(out_c[[1]], "character") 12 | 13 | expect_identical(out_l, out_c) 14 | expect_identical(out_l[[1]], out_c[[1]]) 15 | expect_identical(out_c[[1]], out_c[[1]]) 16 | 17 | expect_named(out_l, names(out_c)) 18 | expect_named(out_c, names(out_l)) 19 | 20 | expect_error(chunk_text(bad_list)) 21 | }) 22 | 23 | test_that("Document chunking splits documents apart correctly", { 24 | test_doc <- "This is a sentence with exactly eight words. Here's two. And now here are ten words in a great sentence. And five or six left over." 
25 | out <- chunk_text(test_doc, chunk_size = 10, doc_id = "test") 26 | out_wc <- count_words(out) 27 | test_wc <- c(10L, 10L, 6L) 28 | names(test_wc) <- c("test-1", "test-2", "test-3") 29 | expect_named(out, names(test_wc)) 30 | expect_identical(out_wc, test_wc) 31 | 32 | out_short <- chunk_text("This is a short text") 33 | expect_equal(count_words(out_short[[1]]), 5) 34 | expect_named(out_short, NULL) 35 | }) 36 | -------------------------------------------------------------------------------- /tests/testthat/test-encoding.R: -------------------------------------------------------------------------------- 1 | context("Encodings") 2 | 3 | test_that("Encodings work on Windows", { 4 | input <- "César Moreira Nuñez" 5 | reference <- c("césar", "moreira", "nuñez") 6 | reference_enc <- c("UTF-8", "unknown", "UTF-8") 7 | output_n1 <- tokenize_ngrams(input, n = 1, simplify = TRUE) 8 | output_words <- tokenize_words(input, simplify = TRUE) 9 | output_skip <- tokenize_skip_ngrams(input, n = 1, k = 0, simplify = TRUE) 10 | expect_equal(output_n1, reference) 11 | expect_equal(output_words, reference) 12 | expect_equal(output_skip, reference) 13 | expect_equal(Encoding(output_n1), reference_enc) 14 | expect_equal(Encoding(output_words), reference_enc) 15 | expect_equal(Encoding(output_skip), reference_enc) 16 | }) -------------------------------------------------------------------------------- /tests/testthat/test-ngrams.R: -------------------------------------------------------------------------------- 1 | context("N-gram tokenizers") 2 | 3 | test_that("Shingled n-gram tokenizer works as expected", { 4 | stopwords <- c("chapter", "me") 5 | out_l <- tokenize_ngrams(docs_l, n = 3, n_min = 2, stopwords = stopwords) 6 | out_c <- tokenize_ngrams(docs_c, n = 3, n_min = 2, stopwords = stopwords) 7 | out_1 <- tokenize_ngrams(docs_c[1], n = 3, n_min = 2, stopwords = stopwords, 8 | simplify = TRUE) 9 | 10 | expect_is(out_l, "list") 11 | expect_is(out_l[[1]], "character") 12 | expect_is(out_c, "list") 13 | expect_is(out_c[[1]], "character") 14 | expect_is(out_1, "character") 15 | 16 | expect_identical(out_l, out_c) 17 | expect_identical(out_l[[1]], out_1) 18 | expect_identical(out_c[[1]], out_1) 19 | 20 | # test for https://github.com/lmullen/tokenizers/issues/14 21 | expect_identical(tokenize_ngrams("one two three", n = 3, n_min = 2), 22 | tokenize_ngrams("one two three", n = 5, n_min = 2)) 23 | 24 | expect_named(out_l, names(docs_l)) 25 | expect_named(out_c, names(docs_c)) 26 | 27 | expect_error(tokenize_ngrams(bad_list)) 28 | }) 29 | 30 | test_that("Shingled n-gram tokenizer produces correct output", { 31 | # skip_on_os("windows") 32 | stopwords <- c("chapter", "me") 33 | out_1 <- tokenize_ngrams(docs_c[1], n = 3, n_min = 2, stopwords = stopwords, 34 | simplify = TRUE) 35 | expected <- c("1 loomings", "1 loomings call", "loomings call", 36 | "loomings call ishmael", "call ishmael", "call ishmael some") 37 | expect_identical(head(out_1, 6), expected) 38 | 39 | }) 40 | 41 | test_that("Shingled n-gram tokenizer consistently produces NAs where appropriate", { 42 | test <- c("This is a text", NA, "So is this") 43 | names(test) <- letters[1:3] 44 | out <- tokenize_ngrams(test) 45 | expect_true(is.na(out$b)) 46 | }) 47 | 48 | test_that("Skip n-gram tokenizer consistently produces NAs where appropriate", { 49 | test <- c("This is a text", NA, "So is this") 50 | names(test) <- letters[1:3] 51 | out <- tokenize_skip_ngrams(test) 52 | expect_true(is.na(out$b)) 53 | }) 54 | 55 | test_that("Skip n-gram tokenizer can use 
stopwords", { 56 | test <- c("This is a text", "So is this") 57 | names(test) <- letters[1:2] 58 | out <- tokenize_skip_ngrams(test, stopwords = "is", n = 2, n_min = 2) 59 | expect_equal(length(out$a), 3) 60 | expect_identical(out$a[1], "this a") 61 | }) 62 | 63 | test_that("Skips with values greater than k are refused", { 64 | expect_false(check_width(c(0, 4, 5), k = 2)) 65 | expect_true(check_width(c(0, 3, 5), k = 2)) 66 | expect_false(check_width(c(0, 1, 3), k = 0)) 67 | expect_true(check_width(c(0, 1, 2), k = 0)) 68 | expect_false(check_width(c(0, 10, 11, 12), k = 5)) 69 | expect_true(check_width(c(0, 6, 11, 16, 18), k = 5)) 70 | }) 71 | 72 | test_that("Combinations for skip grams are correct", { 73 | skip_pos <- get_valid_skips(2, 2) 74 | expect_is(skip_pos, "list") 75 | expect_length(skip_pos, 3) 76 | expect_identical(skip_pos, list(c(0, 1), c(0, 2), c(0, 3))) 77 | 78 | skip_pos2 <- get_valid_skips(3, 2) 79 | expect_identical(skip_pos2, list( 80 | c(0, 1, 2), 81 | c(0, 1, 3), 82 | c(0, 1, 4), 83 | c(0, 2, 3), 84 | c(0, 2, 4), 85 | c(0, 2, 5), 86 | c(0, 3, 4), 87 | c(0, 3, 5), 88 | c(0, 3, 6))) 89 | }) 90 | 91 | test_that("Skip n-gram tokenizer works as expected", { 92 | stopwords <- c("chapter", "me") 93 | out_l <- tokenize_skip_ngrams(docs_l, n = 3, k = 2) 94 | out_c <- tokenize_skip_ngrams(docs_c, n = 3, k = 2) 95 | out_1 <- tokenize_skip_ngrams(docs_c[1], n = 3, k = 2, simplify = TRUE) 96 | 97 | expect_is(out_l, "list") 98 | expect_is(out_l[[1]], "character") 99 | expect_is(out_c, "list") 100 | expect_is(out_c[[1]], "character") 101 | expect_is(out_1, "character") 102 | 103 | 104 | expect_identical(out_l, out_c) 105 | expect_identical(out_l[[1]], out_1) 106 | expect_identical(out_c[[1]], out_1) 107 | 108 | expect_named(out_l, names(docs_l)) 109 | expect_named(out_c, names(docs_c)) 110 | 111 | expect_error(tokenize_skip_ngrams(bad_list)) 112 | }) 113 | 114 | test_that("Skip n-gram tokenizer produces correct output", { 115 | out_n2_k2 <- tokenize_skip_ngrams(input, n = 2, n_min = 2, k = 2, simplify = TRUE) 116 | expect_equal(sort(skip2_bigrams), sort(out_n2_k2)) 117 | out_n3_k2 <- tokenize_skip_ngrams(input, n = 3, n_min = 3, k = 2, simplify = TRUE) 118 | expect_equal(sort(skip2_trigrams), sort(out_n3_k2)) 119 | }) 120 | 121 | test_that("Skip n-gram tokenizers respects stopwords", { 122 | out_1 <- tokenize_skip_ngrams("This is a sentence that is for the test.", 123 | n = 3, k = 2, stopwords = c("a", "the"), 124 | simplify = TRUE) 125 | expect_equal(length(grep("the", out_1)), 0) 126 | }) 127 | 128 | test_that("Skip n-gram tokenizer warns about large combinations", { 129 | expect_warning(get_valid_skips(n = 7, k = 2), "Input n and k will") 130 | }) 131 | -------------------------------------------------------------------------------- /tests/testthat/test-ptb.R: -------------------------------------------------------------------------------- 1 | context("PTB tokenizer") 2 | 3 | test_that("PTB tokenizer works as expected", { 4 | out_l <- tokenize_ptb(docs_l) 5 | out_c <- tokenize_ptb(docs_c) 6 | out_1 <- tokenize_ptb(docs_c[1], simplify = TRUE) 7 | 8 | expect_is(out_l, "list") 9 | expect_is(out_l[[1]], "character") 10 | expect_is(out_c, "list") 11 | expect_is(out_c[[1]], "character") 12 | expect_is(out_1, "character") 13 | 14 | expect_identical(out_l, out_c) 15 | expect_identical(out_l[[1]], out_1) 16 | expect_identical(out_c[[1]], out_1) 17 | 18 | expect_named(out_l, names(docs_l)) 19 | expect_named(out_c, names(docs_c)) 20 | 21 | expect_error(tokenize_ptb(bad_list)) 22 | }) 23 | 24 | 
test_that("Word tokenizer produces correct output", { 25 | sents <- 26 | c(paste0("Good muffins cost $3.88\nin New York. ", 27 | "Please buy me\\ntwo of them.\\nThanks."), 28 | "They'll save and invest more.", 29 | "hi, my name can't hello,") 30 | expected <- 31 | list(c("Good", "muffins", "cost", "$", "3.88", "in", "New", "York.", 32 | "Please", "buy", "me\\ntwo", "of", "them.\\nThanks", "."), 33 | c("They", "'ll", "save", "and", "invest", "more", "."), 34 | c("hi", ",", "my", "name", "ca", "n't", "hello", ",")) 35 | expect_identical(tokenize_ptb(sents), expected) 36 | 37 | expect_identical(tokenize_ptb("This can't work.", lowercase = TRUE, simplify = TRUE), 38 | c("this", "ca", "n't", "work", ".")) 39 | }) 40 | -------------------------------------------------------------------------------- /tests/testthat/test-shingles.R: -------------------------------------------------------------------------------- 1 | context("Shingle tokenizers") 2 | 3 | test_that("Character shingle tokenizer works as expected", { 4 | out_l <- tokenize_character_shingles(docs_l, n = 3, n_min = 2) 5 | out_c <- tokenize_character_shingles(docs_c, n = 3, n_min = 2) 6 | out_1 <- tokenize_character_shingles(docs_c[1], n = 3, n_min = 2, 7 | simplify = TRUE) 8 | 9 | expect_is(out_l, "list") 10 | expect_is(out_l[[1]], "character") 11 | expect_is(out_c, "list") 12 | expect_is(out_c[[1]], "character") 13 | expect_is(out_1, "character") 14 | 15 | expect_identical(out_l, out_c) 16 | expect_identical(out_l[[1]], out_1) 17 | expect_identical(out_c[[1]], out_1) 18 | 19 | expect_named(out_l, names(docs_l)) 20 | expect_named(out_c, names(docs_c)) 21 | 22 | expect_error(tokenize_ngrams(bad_list)) 23 | 24 | }) 25 | 26 | test_that("Character shingle tokenizer produces correct output", { 27 | phrase <- c("Remember who commended thy yellow stockings", 28 | "And wished to see thee cross-gartered.") 29 | names(phrase) <- c("Malvolio 1", "Malvolio 2") 30 | 31 | out_d <- tokenize_character_shingles(phrase) 32 | out_asis <- tokenize_character_shingles(phrase, lowercase = FALSE, 33 | strip_non_alphanum = FALSE) 34 | 35 | expect_identical(out_d[[1]][1:12], c("rem", "eme", "mem", "emb", "mbe", "ber", 36 | "erw", "rwh", "who", "hoc", "oco", "com")) 37 | 38 | expect_identical(out_asis[[2]][1:15], c("And", "nd ", "d w", " wi", "wis", 39 | "ish", "she", "hed", "ed ", "d t", 40 | " to", "to ", "o s", " se", "see")) 41 | 42 | }) 43 | 44 | test_that("Character shingle tokenizer consistently produces NAs where appropriate", { 45 | test <- c("This is a text", NA, "So is this") 46 | names(test) <- letters[1:3] 47 | out <- tokenize_character_shingles(test) 48 | expect_true(is.na(out$b)) 49 | }) -------------------------------------------------------------------------------- /tests/testthat/test-stem.R: -------------------------------------------------------------------------------- 1 | context("Stem tokenizers") 2 | 3 | test_that("Word stem tokenizer works as expected", { 4 | out_l <- tokenize_word_stems(docs_l) 5 | out_c <- tokenize_word_stems(docs_c) 6 | out_1 <- tokenize_word_stems(docs_c[1], simplify = TRUE) 7 | 8 | expect_is(out_l, "list") 9 | expect_is(out_l[[1]], "character") 10 | expect_is(out_c, "list") 11 | expect_is(out_c[[1]], "character") 12 | expect_is(out_1, "character") 13 | 14 | expect_identical(out_l, out_c) 15 | expect_identical(out_l[[1]], out_1) 16 | expect_identical(out_c[[1]], out_1) 17 | 18 | expect_named(out_l, names(docs_l)) 19 | expect_named(out_c, names(docs_c)) 20 | 21 | expect_error(tokenize_word_stems(bad_list)) 22 | }) 23 | 
24 | test_that("Stem tokenizer produces correct output", { 25 | # skip_on_os("windows") 26 | out_1 <- tokenize_word_stems(docs_c[1], simplify = TRUE) 27 | expected <- c("in", "my", "purs", "and", "noth") 28 | expect_identical(out_1[20:24], expected) 29 | }) 30 | -------------------------------------------------------------------------------- /tests/testthat/test-tif.R: -------------------------------------------------------------------------------- 1 | context("Text Interchange Format") 2 | 3 | test_that("Can detect a TIF compliant data.frame", { 4 | expect_true(is_corpus_df(docs_df)) 5 | bad_df <- docs_df 6 | bad_df$doc_id <- NULL 7 | expect_error(is_corpus_df(bad_df)) 8 | }) 9 | 10 | test_that("Can coerce a TIF compliant data.frame to a character vector", { 11 | output <- docs_df$text 12 | names(output) <- docs_df$doc_id 13 | expect_identical(corpus_df_as_corpus_vector(docs_df), output) 14 | }) 15 | 16 | test_that("Different methods produce identical output", { 17 | expect_identical(tokenize_words(docs_c), tokenize_words(docs_df)) 18 | expect_identical(tokenize_words(docs_l), tokenize_words(docs_df)) 19 | 20 | expect_identical(tokenize_characters(docs_c), tokenize_characters(docs_df)) 21 | expect_identical(tokenize_characters(docs_l), tokenize_characters(docs_df)) 22 | 23 | expect_identical(tokenize_sentences(docs_c), tokenize_sentences(docs_df)) 24 | expect_identical(tokenize_sentences(docs_l), tokenize_sentences(docs_df)) 25 | 26 | expect_identical(tokenize_lines(docs_c), tokenize_lines(docs_df)) 27 | expect_identical(tokenize_lines(docs_l), tokenize_lines(docs_df)) 28 | 29 | expect_identical(tokenize_paragraphs(docs_c), tokenize_paragraphs(docs_df)) 30 | expect_identical(tokenize_paragraphs(docs_l), tokenize_paragraphs(docs_df)) 31 | 32 | expect_identical(tokenize_regex(docs_c), tokenize_regex(docs_df)) 33 | expect_identical(tokenize_regex(docs_l), tokenize_regex(docs_df)) 34 | 35 | expect_identical(tokenize_ngrams(docs_c), tokenize_ngrams(docs_df)) 36 | expect_identical(tokenize_ngrams(docs_l), tokenize_ngrams(docs_df)) 37 | 38 | expect_identical(tokenize_skip_ngrams(docs_c), tokenize_skip_ngrams(docs_df)) 39 | expect_identical(tokenize_skip_ngrams(docs_l), tokenize_skip_ngrams(docs_df)) 40 | 41 | expect_identical(tokenize_ptb(docs_c), tokenize_ptb(docs_df)) 42 | expect_identical(tokenize_ptb(docs_l), tokenize_ptb(docs_df)) 43 | 44 | expect_identical(tokenize_character_shingles(docs_c), 45 | tokenize_character_shingles(docs_df)) 46 | expect_identical(tokenize_character_shingles(docs_l), 47 | tokenize_character_shingles(docs_df)) 48 | 49 | expect_identical(tokenize_word_stems(docs_c), tokenize_word_stems(docs_df)) 50 | expect_identical(tokenize_word_stems(docs_l), tokenize_word_stems(docs_df)) 51 | }) 52 | -------------------------------------------------------------------------------- /tests/testthat/test-utils.R: -------------------------------------------------------------------------------- 1 | context("Utils") 2 | 3 | test_that("Inputs are verified correct", { 4 | expect_silent(check_input(letters)) 5 | expect_silent(check_input(list(a = "a", b = "b"))) 6 | expect_error(check_input(1:10)) 7 | expect_error(check_input(list(a = "a", b = letters))) 8 | expect_error(check_input(list(a = "a", b = 2))) 9 | }) 10 | 11 | test_that("Stopwords are removed", { 12 | expect_equal(remove_stopwords(letters[1:5], stopwords = c("d", "e")), 13 | letters[1:3]) 14 | }) -------------------------------------------------------------------------------- /tests/testthat/test-wordcount.R: 
-------------------------------------------------------------------------------- 1 | context("Word counts") 2 | 3 | test_that("Word counts work on lists and character vectors", { 4 | out_l <- count_sentences(docs_l) 5 | out_c <- count_sentences(docs_c) 6 | expect_identical(out_l, out_c) 7 | out_l <- count_words(docs_l) 8 | out_c <- count_words(docs_c) 9 | expect_identical(out_l, out_c) 10 | out_l <- count_characters(docs_l) 11 | out_c <- count_characters(docs_c) 12 | expect_identical(out_l, out_c) 13 | expect_named(out_l, names(docs_l)) 14 | expect_named(out_c, names(docs_c)) 15 | }) 16 | 17 | test_that("Word counts give correct results", { 18 | input <- "This input has 10 words; doesn't it? Well---sure does." 19 | expect_equal(10, count_words(input)) 20 | expect_equal(2, count_sentences(input)) 21 | expect_equal(nchar(input), count_characters(input)) 22 | }) 23 | -------------------------------------------------------------------------------- /tokenizers.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace,vignette 22 | -------------------------------------------------------------------------------- /vignettes/introduction-to-tokenizers.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include = FALSE--------------------------------------------------- 2 | knitr::opts_chunk$set( 3 | collapse = TRUE, 4 | comment = "#>" 5 | ) 6 | 7 | ## ----------------------------------------------------------------------------- 8 | library(tokenizers) 9 | options(max.print = 25) 10 | 11 | james <- paste0( 12 | "The question thus becomes a verbal one\n", 13 | "again; and our knowledge of all these early stages of thought and feeling\n", 14 | "is in any case so conjectural and imperfect that farther discussion would\n", 15 | "not be worth while.\n", 16 | "\n", 17 | "Religion, therefore, as I now ask you arbitrarily to take it, shall mean\n", 18 | "for us _the feelings, acts, and experiences of individual men in their\n", 19 | "solitude, so far as they apprehend themselves to stand in relation to\n", 20 | "whatever they may consider the divine_. 
Since the relation may be either\n", 21 | "moral, physical, or ritual, it is evident that out of religion in the\n", 22 | "sense in which we take it, theologies, philosophies, and ecclesiastical\n", 23 | "organizations may secondarily grow.\n" 24 | ) 25 | 26 | ## ----------------------------------------------------------------------------- 27 | tokenize_characters(james)[[1]] 28 | 29 | ## ----------------------------------------------------------------------------- 30 | tokenize_character_shingles(james, n = 3, n_min = 3, 31 | strip_non_alphanum = FALSE)[[1]][1:20] 32 | 33 | ## ----------------------------------------------------------------------------- 34 | tokenize_words(james) 35 | 36 | ## ----------------------------------------------------------------------------- 37 | tokenize_word_stems(james) 38 | 39 | ## ----------------------------------------------------------------------------- 40 | library(stopwords) 41 | tokenize_words(james, stopwords = stopwords::stopwords("en")) 42 | 43 | ## ----------------------------------------------------------------------------- 44 | tokenize_ptb(james) 45 | 46 | ## ----------------------------------------------------------------------------- 47 | tokenize_ngrams(james, n = 5, n_min = 2, 48 | stopwords = stopwords::stopwords("en")) 49 | 50 | ## ----------------------------------------------------------------------------- 51 | tokenize_skip_ngrams(james, n = 5, n_min = 2, k = 2, 52 | stopwords = stopwords::stopwords("en")) 53 | 54 | ## ---- collapse=FALSE---------------------------------------------------------- 55 | tokenize_sentences(james) 56 | tokenize_paragraphs(james) 57 | 58 | ## ----------------------------------------------------------------------------- 59 | chunks <- chunk_text(mobydick, chunk_size = 100, doc_id = "mobydick") 60 | length(chunks) 61 | chunks[5:6] 62 | tokenize_words(chunks[5:6]) 63 | 64 | ## ----------------------------------------------------------------------------- 65 | count_words(mobydick) 66 | count_characters(mobydick) 67 | count_sentences(mobydick) 68 | 69 | -------------------------------------------------------------------------------- /vignettes/introduction-to-tokenizers.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introduction to the tokenizers Package" 3 | author: "Lincoln Mullen" 4 | output: rmarkdown::html_vignette 5 | vignette: > 6 | %\VignetteIndexEntry{Introduction to the tokenizers Package} 7 | %\VignetteEngine{knitr::rmarkdown} 8 | %\VignetteEncoding{UTF-8} 9 | --- 10 | 11 | ```{r setup, include = FALSE} 12 | knitr::opts_chunk$set( 13 | collapse = TRUE, 14 | comment = "#>" 15 | ) 16 | ``` 17 | 18 | ## Package overview 19 | 20 | In natural language processing, tokenization is the process of breaking human-readable text into machine readable components. The most obvious way to tokenize a text is to split the text into words. But there are many other ways to tokenize a text, the most useful of which are provided by this package. 21 | 22 | The tokenizers in this package have a consistent interface. They all take either a character vector of any length, or a list where each element is a character vector of length one. The idea is that each element comprises a text. Then each function returns a list with the same length as the input vector, where each element in the list contains the tokens generated by the function. If the input character vector or list is named, then the names are preserved, so that the names can serve as identifiers. 
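As a minimal sketch of that consistent interface (the two short documents and their names below are invented for illustration), the output is a list with one element per input text, and the input's names carry over as identifiers:

```r
library(tokenizers)

docs <- c(first_doc  = "Call me Ishmael.",
          second_doc = "It was a dark and stormy night.")

tokens <- tokenize_words(docs)
length(tokens)    # 2: one element per input document
names(tokens)     # "first_doc" "second_doc": the input names are preserved
tokens$first_doc  # "call" "me" "ishmael": lowercased, punctuation stripped
```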
23 | 24 | Using the following sample text, the rest of this vignette demonstrates the different kinds of tokenizers in this package. 25 | 26 | ```{r} 27 | library(tokenizers) 28 | options(max.print = 25) 29 | 30 | james <- paste0( 31 | "The question thus becomes a verbal one\n", 32 | "again; and our knowledge of all these early stages of thought and feeling\n", 33 | "is in any case so conjectural and imperfect that farther discussion would\n", 34 | "not be worth while.\n", 35 | "\n", 36 | "Religion, therefore, as I now ask you arbitrarily to take it, shall mean\n", 37 | "for us _the feelings, acts, and experiences of individual men in their\n", 38 | "solitude, so far as they apprehend themselves to stand in relation to\n", 39 | "whatever they may consider the divine_. Since the relation may be either\n", 40 | "moral, physical, or ritual, it is evident that out of religion in the\n", 41 | "sense in which we take it, theologies, philosophies, and ecclesiastical\n", 42 | "organizations may secondarily grow.\n" 43 | ) 44 | ``` 45 | 46 | ## Character and character-shingle tokenizers 47 | 48 | The character tokenizer splits texts into individual characters. 49 | 50 | ```{r} 51 | tokenize_characters(james)[[1]] 52 | ``` 53 | 54 | You can also tokenize into character-based shingles. 55 | 56 | ```{r} 57 | tokenize_character_shingles(james, n = 3, n_min = 3, 58 | strip_non_alphanum = FALSE)[[1]][1:20] 59 | ``` 60 | 61 | ## Word and word-stem tokenizers 62 | 63 | The word tokenizer splits texts into words. 64 | 65 | ```{r} 66 | tokenize_words(james) 67 | ``` 68 | 69 | Word stemming is provided by the [SnowballC](https://cran.r-project.org/package=SnowballC) package. 70 | 71 | ```{r} 72 | tokenize_word_stems(james) 73 | ``` 74 | 75 | You can also provide a vector of stopwords which will be omitted. The [stopwords package](https://github.com/quanteda/stopwords), which contains stopwords for many languages from several sources, is recommended. This argument also works with the n-gram and skip n-gram tokenizers. 76 | 77 | ```{r} 78 | library(stopwords) 79 | tokenize_words(james, stopwords = stopwords::stopwords("en")) 80 | ``` 81 | 82 | An alternative word tokenizer often used in NLP is the Penn Treebank tokenizer, which preserves punctuation and separates common English contractions into their parts. 83 | 84 | ```{r} 85 | tokenize_ptb(james) 86 | ``` 87 | 88 | ## N-gram and skip n-gram tokenizers 89 | 90 | An n-gram is a contiguous sequence of words containing at least `n_min` words and at most `n` words. This function will generate all such combinations of n-grams, omitting stopwords if desired. 91 | 92 | ```{r} 93 | tokenize_ngrams(james, n = 5, n_min = 2, 94 | stopwords = stopwords::stopwords("en")) 95 | ``` 96 | 97 | A skip n-gram is like an n-gram in that it takes the `n` and `n_min` parameters. But rather than returning contiguous sequences of words, it will also return sequences of n-grams skipping words with gaps between `0` and the value of `k`. This function generates all such sequences, again omitting stopwords if desired. Note that the number of tokens returned can be very large. 98 | 99 | ```{r} 100 | tokenize_skip_ngrams(james, n = 5, n_min = 2, k = 2, 101 | stopwords = stopwords::stopwords("en")) 102 | ``` 103 | 104 | ## Sentence and paragraph tokenizers 105 | 106 | Sometimes it is desirable to split texts into sentences or paragraphs prior to tokenizing into other forms.
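Because the sentence and paragraph tokenizers return plain character vectors inside the usual list structure, their output can be fed straight back into the other tokenizers. A small sketch of that two-step workflow (the sample text here is made up):

```r
library(tokenizers)

text <- "Call me Ishmael. Some years ago I went to sea."

# Split into sentences first, then tokenize each sentence into words
sentences <- tokenize_sentences(text)[[1]]
tokenize_words(sentences)  # one character vector of words per sentence
```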
107 | 108 | ```{r, collapse=FALSE} 109 | tokenize_sentences(james) 110 | tokenize_paragraphs(james) 111 | ``` 112 | 113 | ## Text chunking 114 | 115 | When one has a very long document, sometimes it is desirable to split the document into smaller chunks, each with the same number of words. This function chunks a document and gives each of the chunks an ID to show their order. These chunks can then be further tokenized. 116 | 117 | ```{r} 118 | chunks <- chunk_text(mobydick, chunk_size = 100, doc_id = "mobydick") 119 | length(chunks) 120 | chunks[5:6] 121 | tokenize_words(chunks[5:6]) 122 | ``` 123 | 124 | ## Counting words, characters, sentences 125 | 126 | The package also offers functions for counting words, characters, and sentences in a format which works nicely with the rest of the functions. 127 | 128 | ```{r} 129 | count_words(mobydick) 130 | count_characters(mobydick) 131 | count_sentences(mobydick) 132 | ``` 133 | 134 | -------------------------------------------------------------------------------- /vignettes/tif-and-tokenizers.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include = FALSE--------------------------------------------------- 2 | knitr::opts_chunk$set( 3 | collapse = TRUE, 4 | comment = "#>" 5 | ) 6 | 7 | ## ----------------------------------------------------------------------------- 8 | # Named list 9 | (corpus_l <- list(man_comes_around = "There's a man goin' 'round takin' names", 10 | wont_back_down = "Well I won't back down, no I won't back down", 11 | bird_on_a_wire = "Like a bird on a wire")) 12 | 13 | # Named character vector 14 | (corpus_c <- unlist(corpus_l)) 15 | 16 | # Data frame 17 | (corpus_d <- data.frame(doc_id = names(corpus_c), text = unname(corpus_c), 18 | stringsAsFactors = FALSE)) 19 | 20 | ## ----------------------------------------------------------------------------- 21 | library(tokenizers) 22 | 23 | tokens_l <- tokenize_ngrams(corpus_l, n = 2) 24 | tokens_c <- tokenize_ngrams(corpus_c, n = 2) 25 | tokens_d <- tokenize_ngrams(corpus_d, n = 2) 26 | 27 | # Are all these identical?
28 | all(identical(tokens_l, tokens_c), 29 | identical(tokens_c, tokens_d), 30 | identical(tokens_l, tokens_d)) 31 | 32 | ## ----------------------------------------------------------------------------- 33 | tokens_l 34 | 35 | ## ---- echo=FALSE-------------------------------------------------------------- 36 | sample_tokens_df <- structure(list(doc_id = c("man_comes_around", "man_comes_around", 37 | "man_comes_around", "man_comes_around", "man_comes_around", "man_comes_around", 38 | "wont_back_down", "wont_back_down", "wont_back_down", "wont_back_down", 39 | "wont_back_down", "wont_back_down", "wont_back_down", "wont_back_down", 40 | "wont_back_down", "bird_on_a_wire", "bird_on_a_wire", "bird_on_a_wire", 41 | "bird_on_a_wire", "bird_on_a_wire"), token = c("there's a", "a man", 42 | "man goin", "goin round", "round takin", "takin names", "well i", 43 | "i won't", "won't back", "back down", "down no", "no i", "i won't", 44 | "won't back", "back down", "like a", "a bird", "bird on", "on a", 45 | "a wire")), .Names = c("doc_id", "token"), row.names = c(NA, 46 | -20L), class = "data.frame") 47 | head(sample_tokens_df, 10) 48 | 49 | -------------------------------------------------------------------------------- /vignettes/tif-and-tokenizers.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The Text Interchange Formats and the tokenizers Package" 3 | author: "Lincoln Mullen" 4 | output: rmarkdown::html_vignette 5 | vignette: > 6 | %\VignetteIndexEntry{The Text Interchange Formats and the tokenizers Package} 7 | %\VignetteEngine{knitr::rmarkdown} 8 | %\VignetteEncoding{UTF-8} 9 | --- 10 | 11 | ```{r setup, include = FALSE} 12 | knitr::opts_chunk$set( 13 | collapse = TRUE, 14 | comment = "#>" 15 | ) 16 | ``` 17 | 18 | The [Text Interchange Formats](https://github.com/ropenscilabs/tif) are a set of standards defined at an [rOpenSci](https://ropensci.org/) sponsored [meeting in London](https://textworkshop17.ropensci.org/) in 2017. The formats allow R text analysis packages to target defined inputs and outputs for corpora, tokens, and document-term matrices. By adhering to these recommendations, R packages can buy into an interoperable ecosystem. 19 | 20 | The TIF recommendations are still a draft, but the tokenizers package implements its recommendation to accept both of the corpora formats and to output one of its recommended tokens formats. 21 | 22 | Consider these two recommended forms of a corpus. One (`corpus_c`) is a named character vector; the other (`corpus_d`) is a data frame. They both include a document ID and the full text for each item. The data frame format obviously allows for the use of other metadata fields besides the document ID, whereas the other format does not. Using the coercion functions in the tif package, one could switch back and forth between these formats. Tokenizers also supports a corpus formatted as a named list where each element is a character vector of length one (`corpus_l`), though this is not a part of the draft TIF standards. 
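For readers who want to see the switch between the two corpus forms spelled out without the tif package, a base R sketch follows (the tiny corpus is invented; the tif coercion functions remain the recommended route):

```r
# A corpus as a named character vector ...
corpus_vec <- c(doc_a = "one fish two fish", doc_b = "red fish blue fish")

# ... as a TIF-style data frame with doc_id and text columns ...
corpus_df <- data.frame(doc_id = names(corpus_vec), text = unname(corpus_vec),
                        stringsAsFactors = FALSE)

# ... and back to a named character vector again
round_trip <- corpus_df$text
names(round_trip) <- corpus_df$doc_id
identical(round_trip, corpus_vec)  # TRUE
```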
23 | 24 | ```{r} 25 | # Named list 26 | (corpus_l <- list(man_comes_around = "There's a man goin' 'round takin' names", 27 | wont_back_down = "Well I won't back down, no I won't back down", 28 | bird_on_a_wire = "Like a bird on a wire")) 29 | 30 | # Named character vector 31 | (corpus_c <- unlist(corpus_l)) 32 | 33 | # Data frame 34 | (corpus_d <- data.frame(doc_id = names(corpus_c), text = unname(corpus_c), 35 | stringsAsFactors = FALSE)) 36 | ``` 37 | 38 | All of the tokenizers in this package can accept any of those formats and will return an identical output for each. 39 | 40 | ```{r} 41 | library(tokenizers) 42 | 43 | tokens_l <- tokenize_ngrams(corpus_l, n = 2) 44 | tokens_c <- tokenize_ngrams(corpus_c, n = 2) 45 | tokens_d <- tokenize_ngrams(corpus_d, n = 2) 46 | 47 | # Are all these identical? 48 | all(identical(tokens_l, tokens_c), 49 | identical(tokens_c, tokens_d), 50 | identical(tokens_l, tokens_d)) 51 | ``` 52 | 53 | The output of all of the tokenizers is a named list, where each element of the list corresponds to a document in the corpus. The names of the list are the document IDs, and the elements are character vectors containing the tokens. 54 | 55 | ```{r} 56 | tokens_l 57 | ``` 58 | 59 | This format can be coerced to a data frame of document IDs and tokens, one row per token, using the coercion functions in the tif package. That tokens data frame would look like this. 60 | 61 | ```{r, echo=FALSE} 62 | sample_tokens_df <- structure(list(doc_id = c("man_comes_around", "man_comes_around", 63 | "man_comes_around", "man_comes_around", "man_comes_around", "man_comes_around", 64 | "wont_back_down", "wont_back_down", "wont_back_down", "wont_back_down", 65 | "wont_back_down", "wont_back_down", "wont_back_down", "wont_back_down", 66 | "wont_back_down", "bird_on_a_wire", "bird_on_a_wire", "bird_on_a_wire", 67 | "bird_on_a_wire", "bird_on_a_wire"), token = c("there's a", "a man", 68 | "man goin", "goin round", "round takin", "takin names", "well i", 69 | "i won't", "won't back", "back down", "down no", "no i", "i won't", 70 | "won't back", "back down", "like a", "a bird", "bird on", "on a", 71 | "a wire")), .Names = c("doc_id", "token"), row.names = c(NA, 72 | -20L), class = "data.frame") 73 | head(sample_tokens_df, 10) 74 | ``` 75 | 76 | --------------------------------------------------------------------------------
