├── man ├── .Rapp.history ├── tif-package.Rd ├── tif_is_corpus_character.Rd ├── tif_is_tokens_list.Rd ├── tif_is_dtm.Rd ├── tif_is_corpus_df.Rd ├── tif_is_tokens_df.Rd └── tif_as.Rd ├── .github ├── .gitignore └── workflows │ └── R-CMD-check.yaml ├── .gitignore ├── .Rbuildignore ├── tests ├── testthat.R └── testthat │ └── test-validators.R ├── inst └── examples │ ├── tif_is_dtm.R │ ├── tif_is_corpus_character.R │ ├── tif_is_tokens_list.R │ ├── tif_is_corpus_df.R │ ├── tif_as.R │ └── tif_is_tokens_df.R ├── R ├── pkg.R ├── coercion.R └── validators.R ├── NAMESPACE ├── DESCRIPTION ├── CONDUCT.md ├── NEWS.md └── README.md /man/.Rapp.history: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .Rproj.user 3 | tif.Rproj 4 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | CONDUCT.md 4 | ^\.github$ 5 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(tif) 3 | 4 | test_check("tif") 5 | -------------------------------------------------------------------------------- /inst/examples/tif_is_dtm.R: -------------------------------------------------------------------------------- 1 | #' @importFrom Matrix Matrix 2 | dtm <- Matrix::Matrix(0, ncol = 26, nrow = 5, sparse = TRUE) 3 | colnames(dtm) <- LETTERS 4 | rownames(dtm) <- 
sprintf("doc%d", 1:5) 5 | 6 | tif_is_dtm(dtm) 7 | -------------------------------------------------------------------------------- /inst/examples/tif_is_corpus_character.R: -------------------------------------------------------------------------------- 1 | corpus <- c("Aujourd'hui, maman est morte.", 2 | "It was a pleasure to burn.", 3 | "All this happened, more or less.") 4 | 5 | tif_is_corpus_character(corpus) 6 | 7 | names(corpus) <- c("Camus", "Bradbury", "Vonnegut") 8 | tif_is_corpus_character(corpus) 9 | -------------------------------------------------------------------------------- /inst/examples/tif_is_tokens_list.R: -------------------------------------------------------------------------------- 1 | tokens <- list(doc1 = c("aujourd'hui", "maman", "est", "morte"), 2 | doc2 = c("it", "was", "a", "pleasure", "to", "burn"), 3 | doc3 = c("all", "this", "happened", "more", "or", "less")) 4 | tif_is_tokens_list(tokens) 5 | 6 | names(tokens) <- c("doc1", "doc2", "doc3") 7 | tif_is_tokens_list(tokens) 8 | -------------------------------------------------------------------------------- /inst/examples/tif_is_corpus_df.R: -------------------------------------------------------------------------------- 1 | corpus <- data.frame(doc_id = c("doc1", "doc2", "doc3"), 2 | text = c("Aujourd'hui, maman est morte.", 3 | "It was a pleasure to burn.", 4 | "All this happened, more or less."), 5 | stringsAsFactors = FALSE) 6 | 7 | tif_is_corpus_df(corpus) 8 | 9 | corpus$author <- c("Camus", "Bradbury", "Vonnegut") 10 | tif_is_corpus_df(corpus) 11 | -------------------------------------------------------------------------------- /inst/examples/tif_as.R: -------------------------------------------------------------------------------- 1 | # coerce corpus object 2 | corpus <- c("Aujourd'hui, maman est morte.", 3 | "It was a pleasure to burn.", 4 | "All this happened, more or less.") 5 | names(corpus) <- c("Camus", "Bradbury", "Vonnegut") 6 | 7 | new <- tif_as_corpus_df(corpus) 8 | 
new 9 | tif_as_corpus_character(new) 10 | 11 | # coerce tokens object 12 | tokens <- list(doc1 = c("aujourd'hui", "maman", "est", "morte"), 13 | doc2 = c("it", "was", "a", "pleasure", "to", "burn"), 14 | doc3 = c("all", "this", "happened", "more", "or", "less")) 15 | 16 | new <- tif_as_tokens_df(tokens) 17 | new 18 | tif_as_tokens_list(new) 19 | 20 | -------------------------------------------------------------------------------- /inst/examples/tif_is_tokens_df.R: -------------------------------------------------------------------------------- 1 | tokens <- data.frame(doc_id = c("doc1", "doc1", "doc1", "doc1", 2 | "doc2", "doc2", "doc2", "doc2", 3 | "doc2", "doc2", "doc3", "doc3", 4 | "doc3", "doc3", "doc3", "doc3"), 5 | token = c("aujourd'hui", "maman", "est", 6 | "morte", "it", "was", "a", "pleasure", 7 | "to", "burn", "all", "this", "happened", 8 | "more", "or", "less"), 9 | stringsAsFactors = FALSE) 10 | 11 | tif_is_tokens_df(tokens) 12 | 13 | tokens$pos <- "NOUN" 14 | tokens$NER <- "" 15 | tokens$sentiment <- runif(16L) 16 | tif_is_tokens_df(tokens) 17 | -------------------------------------------------------------------------------- /R/pkg.R: -------------------------------------------------------------------------------- 1 | #' tif: Text Interchange Formats 2 | #' 3 | #' This package describes and validates formats for storing 4 | #' common objects arising in text analysis as native R objects. 5 | #' Representations of a text corpus, document term matrix, and 6 | #' tokenized text are included. The corpus and tokens objects 7 | #' have multiple valid formats. Packages compliant with the 8 | #' tif proposal should accept all valid formats and should 9 | #' directly return, or provide conversion functions, for 10 | #' converting outputs into at least one of the formats (when 11 | #' applicable). The tokenized text format is extensible to 12 | #' include other annotations such as part of speech tags and 13 | #' named entities.
14 | #' 15 | #' @import Matrix 16 | #' 17 | #' @docType package 18 | "_PACKAGE" 19 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(tif_as_corpus_character,character) 4 | S3method(tif_as_corpus_character,data.frame) 5 | S3method(tif_as_corpus_character,default) 6 | S3method(tif_as_corpus_df,character) 7 | S3method(tif_as_corpus_df,data.frame) 8 | S3method(tif_as_corpus_df,default) 9 | S3method(tif_as_tokens_df,data.frame) 10 | S3method(tif_as_tokens_df,default) 11 | S3method(tif_as_tokens_df,list) 12 | S3method(tif_as_tokens_list,data.frame) 13 | S3method(tif_as_tokens_list,default) 14 | S3method(tif_as_tokens_list,list) 15 | export(tif_as_corpus_character) 16 | export(tif_as_corpus_df) 17 | export(tif_as_tokens_df) 18 | export(tif_as_tokens_list) 19 | export(tif_is_corpus_character) 20 | export(tif_is_corpus_df) 21 | export(tif_is_dtm) 22 | export(tif_is_tokens_df) 23 | export(tif_is_tokens_list) 24 | import(Matrix) 25 | importFrom(Matrix,Matrix) 26 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: tif 2 | Type: Package 3 | Title: Text Interchange Format 4 | Version: 0.4 5 | Authors@R: c(person("Taylor", "Arnold", role = c("aut", "cre"), 6 | email = "tarnold2@richmond.edu"), 7 | person("Ken", "Benoit", role = "aut", 8 | email = "k.r.benoit@lse.ac.uk"), 9 | person("Lincoln", "Mullen", role = "aut", 10 | email = "lmullen@gmu.edu "), 11 | person("Adam", "Obeng", role = "aut", 12 | email = "contact@adamobeng.com"), 13 | person("rOpenSci Text Workshop Participants (2017)", 14 | role = "aut")) 15 | Maintainer: Taylor B. Arnold 16 | Description: Provides validation functions for common 17 | interchange formats for representing text data in R. 
18 | Includes formats for corpus objects, document term 19 | matrices, and tokens. Other annotations can be stored 20 | by overloading the tokens structure. 21 | Imports: Matrix 22 | License: GPL-2 23 | Encoding: UTF-8 24 | URL: https://docs.ropensci.org/tif, https://github.com/ropensci/tif 25 | BugReports: http://github.com/ropensci/tif/issues 26 | RoxygenNote: 7.2.1 27 | Suggests: testthat 28 | -------------------------------------------------------------------------------- /man/tif-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pkg.R 3 | \docType{package} 4 | \name{tif-package} 5 | \alias{tif} 6 | \alias{tif-package} 7 | \title{tif: Text Interchange Formats} 8 | \description{ 9 | This package describes and validates formats for storing 10 | common object arising in text analysis as native R objects. 11 | Representations of a text corpus, document term matrix, and 12 | tokenized text are included. The corpus and tokens objects 13 | have multiple valid formats. Packages compliant with the 14 | tif proposal should accept all valid formats and should 15 | directly return, or provide conversion functions, for 16 | converting outputs into at least one of the formats (when 17 | applicable). The tokenized text format is extensible to 18 | include other annotations such as part of speech tags and 19 | named entities. 
20 | } 21 | \seealso{ 22 | Useful links: 23 | \itemize{ 24 | \item \url{https://docs.ropensci.org/tif} 25 | \item \url{https://github.com/ropensci/tif} 26 | \item Report bugs at \url{http://github.com/ropensci/tif/issues} 27 | } 28 | 29 | } 30 | \author{ 31 | \strong{Maintainer}: Taylor Arnold \email{taylor.arnold@acm.org} 32 | 33 | Authors: 34 | \itemize{ 35 | \item Ken Benoit \email{k.r.benoit@lse.ac.uk} 36 | \item Lincoln Mullen \email{lmullen@gmu.edu } 37 | \item Adam Obeng \email{contact@adamobeng.com} 38 | \item rOpenSci Text Workshop Participants (2017) 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, we pledge to respect all people who 4 | contribute through reporting issues, posting feature requests, updating documentation, 5 | submitting pull requests or patches, and other activities. 6 | 7 | We are committed to making participation in this project a harassment-free experience for 8 | everyone, regardless of level of experience, gender, gender identity and expression, 9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. 10 | 11 | Examples of unacceptable behavior by participants include the use of sexual language or 12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment, 13 | insults, or other unprofessional conduct. 14 | 15 | Project maintainers have the right and responsibility to remove, edit, or reject comments, 16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this 17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 18 | from the project team. 
19 | 20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 21 | opening an issue or contacting one or more of the project maintainers. 22 | 23 | This Code of Conduct is adapted from the Contributor Covenant 24 | (http://contributor-covenant.org), version 1.0.0, available at 25 | http://contributor-covenant.org/version/1/0/0/ 26 | -------------------------------------------------------------------------------- /man/tif_is_corpus_character.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validators.R 3 | \name{tif_is_corpus_character} 4 | \alias{tif_is_corpus_character} 5 | \title{Validate Corpus Character Vector Object} 6 | \usage{ 7 | tif_is_corpus_character(corpus, warn = FALSE) 8 | } 9 | \arguments{ 10 | \item{corpus}{a corpus object to test for validity} 11 | 12 | \item{warn}{logical. Should the function produce a 13 | verbose warning for the condition for which 14 | the validation fails. Useful for testing.} 15 | } 16 | \value{ 17 | a logical vector of length one indicating 18 | whether the input is a valid corpus 19 | } 20 | \description{ 21 | A valid character vector corpus object is a character 22 | vector with UTF-8 encoding. If it has names, this should 23 | be a unique character also in UTF-8 encoding. No other 24 | attributes should be present. 25 | } 26 | \details{ 27 | The tests are run sequentially and the function returns, 28 | with a warning if the warn flag is set, on the first test 29 | that fails. We use this implementation because some tests 30 | may fail entirely or be meaningless if the prior ones are 31 | not passed.
32 | } 33 | \examples{ 34 | corpus <- c("Aujourd'hui, maman est morte.", 35 | "It was a pleasure to burn.", 36 | "All this happened, more or less.") 37 | 38 | tif_is_corpus_character(corpus) 39 | 40 | names(corpus) <- c("Camus", "Bradbury", "Vonnegut") 41 | tif_is_corpus_character(corpus) 42 | } 43 | -------------------------------------------------------------------------------- /man/tif_is_tokens_list.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validators.R 3 | \name{tif_is_tokens_list} 4 | \alias{tif_is_tokens_list} 5 | \title{Validate Tokens List Object} 6 | \usage{ 7 | tif_is_tokens_list(tokens, warn = FALSE) 8 | } 9 | \arguments{ 10 | \item{tokens}{a tokens object to test for validity} 11 | 12 | \item{warn}{logical. Should the function produce a 13 | verbose warning for the condition for which 14 | the validation fails. Useful for testing.} 15 | } 16 | \value{ 17 | a logical vector of length one indicating 18 | whether the input is a valid tokens object 19 | } 20 | \description{ 21 | A valid corpus tokens object is a (possibly named) list of 22 | character vectors. The character vectors, as well as 23 | names, should be in UTF-8 encoding. No other attributes 24 | should be present in either the list or any of its elements. 25 | } 26 | \details{ 27 | The tests are run sequentially and the function returns, 28 | with a warning if the warn flag is set, on the first test 29 | that fails. We use this implementation because some tests 30 | may fail entirely or be meaningless if the prior ones are 31 | not passed.
32 | } 33 | \examples{ 34 | tokens <- list(doc1 = c("aujourd'hui", "maman", "est", "morte"), 35 | doc2 = c("it", "was", "a", "pleasure", "to", "burn"), 36 | doc3 = c("all", "this", "happened", "more", "or", "less")) 37 | tif_is_tokens_list(tokens) 38 | 39 | names(tokens) <- c("doc1", "doc2", "doc3") 40 | tif_is_tokens_list(tokens) 41 | } 42 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macOS-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v2 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: any::rcmdcheck 45 | needs: check 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | -------------------------------------------------------------------------------- /man/tif_is_dtm.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validators.R 3 | \name{tif_is_dtm} 4 | \alias{tif_is_dtm} 5 | \title{Validate Document Term Matrix Object} 6 | \usage{ 7 | tif_is_dtm(dtm, warn = FALSE) 8 | } 9 | \arguments{ 10 | \item{dtm}{a document term matrix object to test 11 | the validity of} 12 | 13 | \item{warn}{logical. Should the function produce a 14 | verbose warning for the condition for which 15 | the validation fails. Useful for testing.} 16 | } 17 | \value{ 18 | a logical vector of length one indicating 19 | whether the input is a valid document term 20 | matrix 21 | } 22 | \description{ 23 | A valid document term matrix is a sparse matrix with 24 | the rows representing documents and columns representing 25 | terms. The row names is a character vector giving the 26 | document ids with no duplicated entries. The column 27 | names is a character vector giving the terms of the 28 | matrix with no duplicated entries. The sparse matrix 29 | should inherit from the Matrix class dgCMatrix. 30 | } 31 | \details{ 32 | The tests are run sequentially and the function returns, 33 | with a warning if the warn flag is set, on the first test 34 | that fails. We use this implementation because some tests 35 | may fail entirely or be meaningless if the prior ones are 36 | not passed. For example, if the dtm object is not a matrix 37 | it may not contain row or column names.
38 | } 39 | \examples{ 40 | #' @importFrom Matrix Matrix 41 | dtm <- Matrix::Matrix(0, ncol = 26, nrow = 5, sparse = TRUE) 42 | colnames(dtm) <- LETTERS 43 | rownames(dtm) <- sprintf("doc\%d", 1:5) 44 | 45 | tif_is_dtm(dtm) 46 | } 47 | -------------------------------------------------------------------------------- /man/tif_is_corpus_df.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validators.R 3 | \name{tif_is_corpus_df} 4 | \alias{tif_is_corpus_df} 5 | \title{Validate Corpus Data Frame Object} 6 | \usage{ 7 | tif_is_corpus_df(corpus, warn = FALSE) 8 | } 9 | \arguments{ 10 | \item{corpus}{a corpus object to test for validity} 11 | 12 | \item{warn}{logical. Should the function produce a 13 | verbose warning for the condition for which 14 | the validation fails. Useful for testing.} 15 | } 16 | \value{ 17 | a logical vector of length one indicating 18 | whether the input is a valid corpus 19 | } 20 | \description{ 21 | A valid data frame corpus object is an object that has at 22 | least two columns. One column must be called doc_id 23 | and be a character vector with UTF-8 encoding. Document 24 | ids must be unique. There must also be a column called text 25 | that must also be a character vector in UTF-8 encoding. Each 26 | individual document is represented by a single row in 27 | the data frame. Additional document-level metadata columns 28 | and corpus level attributes are allowed but not required. 29 | } 30 | \details{ 31 | The tests are run sequentially and the function returns, 32 | with a warning if the warn flag is set, on the first test 33 | that fails. We use this implementation because some tests 34 | may fail entirely or be meaningless if the prior ones are 35 | not passed. For example, if the corpus object does not 36 | have a variable named "text" it does not make sense to 37 | check whether this column is a character vector.
38 | } 39 | \examples{ 40 | corpus <- data.frame(doc_id = c("doc1", "doc2", "doc3"), 41 | text = c("Aujourd'hui, maman est morte.", 42 | "It was a pleasure to burn.", 43 | "All this happened, more or less."), 44 | stringsAsFactors = FALSE) 45 | 46 | tif_is_corpus_df(corpus) 47 | 48 | corpus$author <- c("Camus", "Bradbury", "Vonnegut") 49 | tif_is_corpus_df(corpus) 50 | } 51 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # tif 0.3.0 2 | 3 | * Further discussion has led us to simplify the corpus and token data frame 4 | formats. The doc_id, text, and token columns can be in any position within the 5 | data frame. 6 | 7 | # tif 0.2.0 8 | 9 | * After a round of input for the initial version of the specification, 10 | we decided to allow two formats for corpus and tokens objects. In addition 11 | to the original data frame variants there is a character vector corpus 12 | object and a list-based tokens object. Converters between the various types 13 | are now included in the package. 14 | 15 | ### New Functions 16 | 17 | * `tif_is_corpus_character` returns TRUE or FALSE for whether the input 18 | is a valid character vector corpus object. 19 | 20 | * `tif_is_tokens_list` returns TRUE or FALSE for whether the input 21 | is a valid list-based tokens object. 22 | 23 | * `tif_as_corpus_character` takes a valid tif corpus object and returns 24 | a character vector corpus object. 25 | 26 | * `tif_as_corpus_df` takes a valid tif corpus object and returns 27 | a data frame corpus object. 28 | 29 | * `tif_as_tokens_list` takes a valid tif tokens object and returns 30 | a list-based tokens object. 31 | 32 | * `tif_as_tokens_df` takes a valid tif tokens object and returns 33 | a data frame tokens object.
34 | 35 | ### Renamed Functions 36 | 37 | * The old validate functions have been renamed `tif_is_corpus_df`, 38 | `tif_is_dtm` and `tif_is_tokens_df`. This is more in line with base-R 39 | functions and separates the "df" version of the corpus and tokens from 40 | the alternative new forms. 41 | 42 | # tif 0.1.0 43 | 44 | * This is the initial implementation of the ideas discussed at 45 | the rOpenSci Text Workshop from 21-22 April 2017. 46 | 47 | ### New Functions 48 | 49 | * `tif_corpus_validate` returns TRUE or FALSE for whether the input 50 | is a valid corpus object. 51 | 52 | * `tif_dtm_validate` returns TRUE or FALSE for whether the input is 53 | a valid document term matrix object. 54 | 55 | * `tif_tokens_validate` returns TRUE or FALSE for whether the input is 56 | a valid tokens object. 57 | 58 | ### Known issues 59 | 60 | * do not yet have a test suite for the package 61 | 62 | * encoding checking is not yet working 63 | 64 | -------------------------------------------------------------------------------- /man/tif_is_tokens_df.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validators.R 3 | \name{tif_is_tokens_df} 4 | \alias{tif_is_tokens_df} 5 | \title{Validate Tokens Data Frame Object} 6 | \usage{ 7 | tif_is_tokens_df(tokens, warn = FALSE) 8 | } 9 | \arguments{ 10 | \item{tokens}{a tokens object to test for validity} 11 | 12 | \item{warn}{logical. Should the function produce a 13 | verbose warning for the condition for which 14 | the validation fails. Useful for testing.} 15 | } 16 | \value{ 17 | a logical vector of length one indicating 18 | whether the input is a valid tokens object 19 | } 20 | \description{ 21 | A valid tokens data frame object is a data frame or an 22 | object that inherits from a data frame. It has no row names 23 | and has at least two columns.
It must contain a column called 24 | doc_id that is a character vector with UTF-8 encoding. 25 | Document ids must be unique. It must also contain a column called 26 | token that must also be a character vector in UTF-8 encoding. 27 | Each individual token is represented by a single row in 28 | the data frame. Additional token-level metadata columns 29 | are allowed but not required. 30 | } 31 | \details{ 32 | The tests are run sequentially and the function returns, 33 | with a warning if the warn flag is set, on the first test 34 | that fails. We use this implementation because some tests 35 | may fail entirely or be meaningless if the prior ones are 36 | not passed. For example, if the tokens object does not 37 | have a variable named "doc_id" it does not make sense to 38 | check whether this column is a character vector. 39 | } 40 | \examples{ 41 | tokens <- data.frame(doc_id = c("doc1", "doc1", "doc1", "doc1", 42 | "doc2", "doc2", "doc2", "doc2", 43 | "doc2", "doc2", "doc3", "doc3", 44 | "doc3", "doc3", "doc3", "doc3"), 45 | token = c("aujourd'hui", "maman", "est", 46 | "morte", "it", "was", "a", "pleasure", 47 | "to", "burn", "all", "this", "happened", 48 | "more", "or", "less"), 49 | stringsAsFactors = FALSE) 50 | 51 | tif_is_tokens_df(tokens) 52 | 53 | tokens$pos <- "NOUN" 54 | tokens$NER <- "" 55 | tokens$sentiment <- runif(16L) 56 | tif_is_tokens_df(tokens) 57 | } 58 | -------------------------------------------------------------------------------- /man/tif_as.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/coercion.R 3 | \name{tif_as} 4 | \alias{tif_as} 5 | \alias{tif_as_corpus_character} 6 | \alias{tif_as_corpus_character.default} 7 | \alias{tif_as_corpus_character.character} 8 | \alias{tif_as_corpus_character.data.frame} 9 | \alias{tif_as_corpus_df} 10 | \alias{tif_as_corpus_df.default} 11 | \alias{tif_as_corpus_df.character} 12 | 
\alias{tif_as_corpus_df.data.frame} 13 | \alias{tif_as_tokens_df} 14 | \alias{tif_as_tokens_df.default} 15 | \alias{tif_as_tokens_df.list} 16 | \alias{tif_as_tokens_df.data.frame} 17 | \alias{tif_as_tokens_list} 18 | \alias{tif_as_tokens_list.default} 19 | \alias{tif_as_tokens_list.list} 20 | \alias{tif_as_tokens_list.data.frame} 21 | \title{Coerce Between tif Object Specifications} 22 | \usage{ 23 | tif_as_corpus_character(corpus) 24 | 25 | \method{tif_as_corpus_character}{default}(corpus) 26 | 27 | \method{tif_as_corpus_character}{character}(corpus) 28 | 29 | \method{tif_as_corpus_character}{data.frame}(corpus) 30 | 31 | tif_as_corpus_df(corpus) 32 | 33 | \method{tif_as_corpus_df}{default}(corpus) 34 | 35 | \method{tif_as_corpus_df}{character}(corpus) 36 | 37 | \method{tif_as_corpus_df}{data.frame}(corpus) 38 | 39 | tif_as_tokens_df(tokens) 40 | 41 | \method{tif_as_tokens_df}{default}(tokens) 42 | 43 | \method{tif_as_tokens_df}{list}(tokens) 44 | 45 | \method{tif_as_tokens_df}{data.frame}(tokens) 46 | 47 | tif_as_tokens_list(tokens) 48 | 49 | \method{tif_as_tokens_list}{default}(tokens) 50 | 51 | \method{tif_as_tokens_list}{list}(tokens) 52 | 53 | \method{tif_as_tokens_list}{data.frame}(tokens) 54 | } 55 | \arguments{ 56 | \item{corpus}{valid tif corpus object to coerce} 57 | 58 | \item{tokens}{valid tif tokens object to coerce} 59 | } 60 | \description{ 61 | These functions convert between the various valid 62 | formats for corpus and tokens objects. By using these 63 | in other packages, maintainers need to only handle 64 | whichever specific format they would like to work 65 | with, but gain the freedom to output (or convert 66 | into) the one most suited to their package's paradigm. 67 | } 68 | \details{ 69 | No explicit checking is done on the input; the output 70 | is guaranteed to be valid only if the input is a valid 71 | format. 
In fact, we make an effort to not modify an 72 | object that appears to be in the required format already 73 | due to R's copy on modify semantics. 74 | } 75 | \examples{ 76 | # coerce corpus object 77 | corpus <- c("Aujourd'hui, maman est morte.", 78 | "It was a pleasure to burn.", 79 | "All this happened, more or less.") 80 | names(corpus) <- c("Camus", "Bradbury", "Vonnegut") 81 | 82 | new <- tif_as_corpus_df(corpus) 83 | new 84 | tif_as_corpus_character(new) 85 | 86 | # coerce tokens object 87 | tokens <- list(doc1 = c("aujourd'hui", "maman", "est", "morte"), 88 | doc2 = c("it", "was", "a", "pleasure", "to", "burn"), 89 | doc3 = c("all", "this", "happened", "more", "or", "less")) 90 | 91 | new <- tif_as_tokens_df(tokens) 92 | new 93 | tif_as_tokens_list(new) 94 | 95 | } 96 | -------------------------------------------------------------------------------- /tests/testthat/test-validators.R: -------------------------------------------------------------------------------- 1 | test_that("tiFALSE_is_corpus_df", { 2 | # A minimal valid corpus 3 | tc <- data.frame(doc_id = "1", text = "foobar", stringsAsFactors = FALSE) 4 | expect_true(tif_is_corpus_df(tc)) 5 | 6 | # Corpus with an additional class 7 | tc <- data.frame(doc_id = "1", text = "foobar", stringsAsFactors = FALSE) 8 | class(tc) <- c("data.table", "data.frame") 9 | expect_true(tif_is_corpus_df(tc)) 10 | 11 | # Corpora with incorrect classes 12 | tc <- data.frame(doc_id = "1", text = "foobar", stringsAsFactors = FALSE) 13 | expect_false(tif_is_corpus_df(as.matrix(tc))) 14 | expect_warning( 15 | tif_is_corpus_df(as.matrix(tc), warn = TRUE), 16 | "corpus object must inherit the data.frame class" 17 | ) 18 | expect_false(tif_is_corpus_df(unclass(tc))) 19 | expect_warning( 20 | tif_is_corpus_df(unclass(tc), warn = TRUE), 21 | "corpus object must inherit the data.frame class" 22 | ) 23 | 24 | # Corpora with only one column 25 | tc <- data.frame(doc_id = "1", stringsAsFactors = FALSE) 26 | 
expect_false(tif_is_corpus_df(tc)) 27 | expect_warning( 28 | tif_is_corpus_df(tc, warn = TRUE), 29 | "corpus object must contain at least two columns" 30 | ) 31 | tc <- data.frame(text = c("foobar"), stringsAsFactors = FALSE) 32 | expect_false(tif_is_corpus_df(tc)) 33 | expect_warning( 34 | tif_is_corpus_df(tc, warn = TRUE), 35 | "corpus object must contain at least two columns" 36 | ) 37 | 38 | # A corpus with rownames 39 | tc <- data.frame(doc_id = "1", text = "foobar", stringsAsFactors = FALSE) 40 | rownames(tc) <- "baz" 41 | expect_false(tif_is_corpus_df(tc)) 42 | expect_warning( 43 | tif_is_corpus_df(tc, warn = TRUE), 44 | "corpus object should not contain row names" 45 | ) 46 | 47 | # Corpora with incorrect column types 48 | tc <- data.frame(doc_id = 1, text = "foobar", stringsAsFactors = FALSE) 49 | expect_false(tif_is_corpus_df(tc)) 50 | expect_warning( 51 | tif_is_corpus_df(tc, warn = TRUE), 52 | "doc_id must be a character vector" 53 | ) 54 | tc <- data.frame(doc_id = as.factor("1"), text = "foobar", 55 | stringsAsFactors = FALSE) 56 | expect_false(tif_is_corpus_df(tc)) 57 | expect_warning( 58 | tif_is_corpus_df(tc, warn = TRUE), 59 | "doc_id must be a character vector" 60 | ) 61 | 62 | tc <- data.frame(doc_id = "1", text = 1, stringsAsFactors = FALSE) 63 | expect_false(tif_is_corpus_df(tc)) 64 | expect_warning( 65 | tif_is_corpus_df(tc, warn = TRUE), 66 | "text must be a character vector" 67 | ) 68 | tc <- data.frame(doc_id = "1", text = as.factor("foobar"), 69 | stringsAsFactors = FALSE) 70 | expect_false(tif_is_corpus_df(tc)) 71 | expect_warning( 72 | tif_is_corpus_df(tc, warn = TRUE), 73 | "text must be a character vector" 74 | ) 75 | 76 | # If both are incorrect, warning should be for doc_id 77 | tc <- data.frame(doc_id = 1, text = "foobar") 78 | expect_false(tif_is_corpus_df(tc)) 79 | expect_warning( 80 | tif_is_corpus_df(tc, warn = TRUE), 81 | "doc_id must be a character vector" 82 | ) 83 | # If both are incorrect, warning should be for doc_id 84 | 
tc <- data.frame(doc_id = 1, text = 1) 85 | expect_false(tif_is_corpus_df(tc)) 86 | expect_warning( 87 | tif_is_corpus_df(tc, warn = TRUE), 88 | "doc_id must be a character vector" 89 | ) 90 | 91 | # A corpus with duplicated doc_id s 92 | tc <- data.frame(doc_id = c("1", "2", "1"), text = rep("foobar", 3), 93 | stringsAsFactors = FALSE) 94 | expect_false(tif_is_corpus_df(tc)) 95 | expect_warning( 96 | tif_is_corpus_df(tc, warn = TRUE), 97 | "there are duplicated document ids in the corpus" 98 | ) 99 | 100 | }) 101 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## tif: Text Interchange Formats 2 | 3 | 4 | [![R-CMD-check](https://github.com/ropensci/tif/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/ropensci/tif/actions/workflows/R-CMD-check.yaml) 5 | 6 | 7 | This package describes and validates formats for storing 8 | common objects arising in text analysis as native R objects. 9 | Representations of a text corpus, document term matrix, and 10 | tokenized text are included. The tokenized text format is 11 | extensible to include other annotations. There are two versions 12 | of the corpus and tokens objects; packages should accept 13 | both and return or coerce to at least one of these. 14 | 15 | ## Installation 16 | 17 | You can install the development version using devtools: 18 | 19 | ```{r} 20 | devtools::install_github("ropensci/tif") 21 | ``` 22 | 23 | ## Usage 24 | 25 | The package can be used to check that a particular object is in a valid 26 | format.
For example, here we see that the object `corpus` is a valid corpus 27 | data frame: 28 | 29 | ```{r} 30 | library(tif) 31 | corpus <- data.frame(doc_id = c("doc1", "doc2", "doc3"), 32 | text = c("Aujourd'hui, maman est morte.", 33 | "It was a pleasure to burn.", 34 | "All this happened, more or less."), 35 | stringsAsFactors = FALSE) 36 | 37 | tif_is_corpus_df(corpus) 38 | ``` 39 | ``` 40 | TRUE 41 | ``` 42 | 43 | The package also has functions to convert between the list and data frame 44 | formats for corpus and token objects. For example: 45 | 46 | ```{r} 47 | tif_as_corpus_character(corpus) 48 | ``` 49 | ``` 50 | doc1 doc2 51 | "Aujourd'hui, maman est morte." "It was a pleasure to burn." 52 | doc3 53 | "All this happened, more or less." 54 | ``` 55 | 56 | Note that extra metadata columns will be lost in the conversion from a data 57 | frame to a named character vector. 58 | 59 | ## Details 60 | 61 | This package describes and validates formats for storing 62 | common objects arising in text analysis as native R objects. 63 | Representations of a text corpus, document term matrix, and 64 | tokenized text are included. The tokenized text format is 65 | extensible to include other annotations. There are two versions 66 | of the corpus and tokens objects; packages should accept and return 67 | at least one of these. 68 | 69 | **corpus** (data frame) - A valid corpus data frame object 70 | is a data frame with at least two columns. The first column 71 | is called doc_id and is a character vector with UTF-8 encoding. Document 72 | ids must be unique. The second column is called text and 73 | must also be a character vector in UTF-8 encoding. Each 74 | individual document is represented by a single row in 75 | the data frame. Additional document-level metadata columns 76 | and corpus-level attributes are allowed but not required. 77 | 78 | **corpus** (character vector) - A valid character vector corpus 79 | object is a character vector with UTF-8 encoding.
If it has 80 | names, these should be unique character strings, also in UTF-8 81 | encoding. No other attributes should be present. 82 | 83 | **dtm** - A valid document term matrix is a sparse matrix with 84 | the rows representing documents and columns representing 85 | terms. The row names are a character vector giving the 86 | document ids with no duplicated entries. The column 87 | names are a character vector giving the terms of the 88 | matrix with no duplicated entries. The sparse matrix 89 | should inherit from the Matrix class dgCMatrix. 90 | 91 | **tokens** (data frame) - A valid data frame tokens 92 | object is a data frame with at least two columns. There must be 93 | a column called doc_id that is a character vector 94 | with UTF-8 encoding. Document ids may repeat across rows. 95 | There must also be a column called token that must also be a 96 | character vector in UTF-8 encoding. 97 | Each individual token is represented by a single row in 98 | the data frame. Additional token-level metadata columns 99 | are allowed but not required. 100 | 101 | **tokens** (list) - A valid corpus tokens object is a (possibly 102 | named) list of character vectors. The character vectors, as 103 | well as names, should be in UTF-8 encoding. No other 104 | attributes should be present in either the list or any of its 105 | elements. 106 | -------------------------------------------------------------------------------- /R/coercion.R: -------------------------------------------------------------------------------- 1 | #' Coerce Between tif Object Specifications 2 | #' 3 | #' These functions convert between the various valid 4 | #' formats for corpus and tokens objects. By using these 5 | #' in other packages, maintainers need to only handle 6 | #' whichever specific format they would like to work 7 | #' with, but gain the freedom to output (or convert 8 | #' into) the one most suited to their package's paradigm.
#'
#' @param corpus    valid tif corpus object to coerce
#' @param tokens    valid tif tokens object to coerce
#'
#' @return For the corpus functions: \code{tif_as_corpus_character}
#'   gives a character vector (named by \code{doc_id} when it is
#'   converted from a data frame) and \code{tif_as_corpus_df} gives a
#'   data frame (with \code{doc_id} and \code{text} columns when it is
#'   built from a character vector). An input that is already in the
#'   requested format is returned unchanged.
#'
#' @details
#' No explicit checking is done on the input; the output
#' is guaranteed to be valid only if the input is a valid
#' format. In fact, we make an effort to not modify an
#' object that appears to be in the required format already
#' due to R's copy on modify semantics.
#'
#' @example inst/examples/tif_as.R
#' @name tif_as
NULL

#' @export
#' @rdname tif_as
tif_as_corpus_character <- function(corpus) {
  UseMethod("tif_as_corpus_character")
}

#' @rdname tif_as
#' @export
tif_as_corpus_character.default <- function(corpus) {

  nd <- length(dim(corpus))
  if (nd <= 1L) {
    # Vector-like input: coerce the elements themselves to text
    out <- as.character(corpus)
  } else if (nd == 2L) {
    # Table-like input: route it through the data frame method so the
    # result is a character vector rather than a data frame
    out <- tif_as_corpus_character(as.data.frame(corpus))
  } else {
    # class() may have several entries; collapse them so that stop()
    # receives one message instead of one per class
    stop(sprintf("Cannot convert object of class %s to tif corpus",
                 paste(class(corpus), collapse = "/")))
  }

  return(out)
}

#' @rdname tif_as
#' @export
tif_as_corpus_character.character <- function(corpus) {
  # Already a character vector: hand it back untouched
  return(corpus)
}


#' @rdname tif_as
#' @export
tif_as_corpus_character.data.frame <- function(corpus) {

  # The text column becomes the vector, the doc_id column its names;
  # any other columns are dropped
  out <- as.character(corpus$text)
  names(out) <- corpus$doc_id

  return(out)
}

#' @export
#' @rdname tif_as
tif_as_corpus_df <- function(corpus) {
  UseMethod("tif_as_corpus_df")
}

#' @rdname tif_as
#' @export
tif_as_corpus_df.default <- function(corpus) {

  nd <- length(dim(corpus))
  if (nd <= 1L) {
    # Vector-like input: coerce to character and let the character
    # method build the data frame; its result must be kept, otherwise
    # a bare character vector would be returned
    out <- tif_as_corpus_df(as.character(corpus))
  } else if (nd == 2L) {
    out <- as.data.frame(corpus)
  } else {
    # Collapse a multi-element class() into a single message
    stop(sprintf("Cannot convert object of class %s to tif corpus",
                 paste(class(corpus), collapse = "/")))
  }

  return(out)
}

#' @rdname tif_as
#' @export
tif_as_corpus_df.character <- function(corpus) {

  # Use the vector's names as document ids, or generate doc1, doc2, ...
  # when it has none
  if (is.null(names(corpus))) {
    doc_id <- sprintf("doc%d", seq_along(corpus))
  } else {
    doc_id <- names(corpus)
  }

  # as.character() strips the names so that data.frame() does not turn
  # them into row names
  out <- data.frame(doc_id = doc_id, text = as.character(corpus),
                    stringsAsFactors = FALSE)
  return(out)
}

#' @rdname tif_as
#' @export
tif_as_corpus_df.data.frame <- function(corpus) {
  # Already a data frame: hand it back untouched
  return(corpus)
}

#' @export
#' @rdname tif_as
tif_as_tokens_df <- function(tokens) {
  UseMethod("tif_as_tokens_df")
}

#' @rdname tif_as
#' @export
tif_as_tokens_df.default <- function(tokens) {

  nd <- length(dim(tokens))
  if (nd == 2L) {
    out <- tif_as_tokens_df(as.data.frame(tokens))
  } else {
    # Collapse a multi-element class() so the names are not glued
    # together in the message
    stop("Cannot convert object of class ",
         paste(class(tokens), collapse = "/"),
         " to tif tokens")
  }

  return(out)
}

#' @rdname tif_as
#' @export
tif_as_tokens_df.list <- function(tokens) {

  if (is.null(names(tokens))) {
    doc_id <- sprintf("doc%d", seq_along(tokens))
  } else {
    doc_id <- names(tokens)
  }

  # One row per token: repeat each document id once for every token in
  # that document
  doc_id <- rep(doc_id, lengths(tokens))

  token <- unlist(tokens, use.names = FALSE)
  if (is.null(token)) {
    # unlist() gives NULL for an empty list, and data.frame() would
    # silently drop a NULL column
    token <- character(0)
  }

  out <- data.frame(doc_id = doc_id, token = token,
                    stringsAsFactors = FALSE)

  return(out)
}


#' @rdname tif_as
#' @export
tif_as_tokens_df.data.frame <- function(tokens) {
  # Already a data frame: hand it back untouched
  return(tokens)
}


#' @export
#' @rdname tif_as
tif_as_tokens_list <- function(tokens) {
  UseMethod("tif_as_tokens_list")
}

#' @rdname tif_as
#' @export
tif_as_tokens_list.default <- function(tokens) {

  nd <- length(dim(tokens))
  if (nd == 2L) {
    # Route table-like input through the data frame method so that a
    # list, not a data frame, is returned
    out <- tif_as_tokens_list(as.data.frame(tokens))
  } else {
    # Collapse a multi-element class() so the names are not glued
    # together in the message
    stop("Cannot convert object of class ",
         paste(class(tokens), collapse = "/"),
         " to tif tokens")
  }

  return(out)
}

#' @rdname tif_as
#' @export
tif_as_tokens_list.list <- function(tokens) {
  # Already a list: hand it back untouched
  return(tokens)
}


#' @rdname tif_as
#' @export
tif_as_tokens_list.data.frame <- function(tokens) {
  # split() on a bare character vector orders the groups alphabetically
  # (so "doc10" would come before "doc2"); fixing the factor levels to
  # the order of first appearance keeps documents in their input order
  doc_id <- factor(tokens$doc_id, levels = unique(tokens$doc_id))
  out <- split(tokens$token, doc_id)
  return(out)
}

# --------------------------------------------------------------------------------
# /R/validators.R:
# --------------------------------------------------------------------------------

#' Validate Corpus Data Frame Object
#'
#' A valid data frame corpus object is an object that
#' inherits from the data frame class. It has no row names and has at
#' least two columns. One column must be called doc_id
#' and be a character vector with UTF-8 encoding. Document
#' ids must be unique. There must also be a column called text
#' and must also be a character vector in UTF-8 encoding. Each
#' individual document is represented by a single row in
#' the data frame. Additional document-level metadata columns
#' and corpus level attributes are allowed but not required.
#'
#' @param corpus  a corpus object to test for validity
#' @param warn    logical. Should the function produce a
#'                verbose warning for the condition for which
#'                the validation fails. Useful for testing.
#' @return a logical vector of length one indicating
#'         whether the input is a valid corpus
#'
#' @details
#' The tests are run sequentially and the function returns,
#' with a warning if the warn flag is set, on the first test
#' that fails. We use this implementation because some tests
#' may fail entirely or be meaningless if the prior ones are
#' not passed. For example, if the corpus object does not
#' have a variable named "text" it does not make sense to
#' check whether this column is a character vector.
#'
#' @example inst/examples/tif_is_corpus_df.R
#' @export
tif_is_corpus_df <- function(corpus, warn = FALSE) {

  # The checks form one ordered chain: `problem` is the message of the
  # first check that fails, or NULL when all of them pass. A later
  # check is never evaluated once an earlier one has failed.
  #
  # UTF-8 encoding of doc_id and text belongs to the format description
  # but is not verified here.
  problem <- if (!inherits(corpus, "data.frame")) {
    "corpus object must inherit the data.frame class"
  } else if (ncol(corpus) <= 1L) {
    "corpus object must contain at least two columns"
  } else if (!all(c("doc_id", "text") %in% names(corpus))) {
    "corpus object must contain columns named 'doc_id' and 'text'"
  } else if (.row_names_info(corpus, type = 1) > 0) {
    # a positive count is taken to mean explicitly set row names
    "corpus object should not contain row names"
  } else if (!is.character(corpus$doc_id)) {
    "doc_id must be a character vector"
  } else if (!is.character(corpus$text)) {
    "text must be a character vector"
  } else if (anyDuplicated(corpus$doc_id) > 0L) {
    "there are duplicated document ids in the corpus"
  } else {
    NULL
  }

  valid <- is.null(problem)
  if (!valid && warn) warning(problem)

  return(valid)
}

#' Validate Corpus Character Vector Object
#'
#' A valid character vector corpus object is a character
#' vector with UTF-8 encoding. If it has names, these should
#' be unique character strings, also in UTF-8 encoding. No other
#' attributes should be present.
#'
#' @param corpus  a corpus object to test for validity
#' @param warn    logical. Should the function produce a
#'                verbose warning for the condition for which
#'                the validation fails. Useful for testing.
#' @return a logical vector of length one indicating
#'         whether the input is a valid corpus
#'
#' @details
#' The tests are run sequentially and the function returns,
#' with a warning if the warn flag is set, on the first test
#' that fails. We use this implementation because some tests
#' may fail entirely or be meaningless if the prior ones are
#' not passed.
#'
#' @example inst/examples/tif_is_corpus_character.R
#' @export
tif_is_corpus_character <- function(corpus, warn = FALSE) {

  # `problem` is the message of the first failing check in the chain,
  # or NULL when the object passes all of them.
  #
  # UTF-8 encoding of the text and of the names belongs to the format
  # description but is not verified here.
  problem <- if (!is.character(corpus)) {
    "corpus object must be a character vector"
  } else if (!is.null(names(corpus)) && anyDuplicated(names(corpus)) > 0L) {
    "names of corpus object must not be duplicated"
  } else if (length(setdiff(names(attributes(corpus)), "names")) > 0L) {
    # any attribute other than "names" disqualifies the object
    "corpus object should only have 'names' attribute"
  } else if (!is.null(names(corpus)) && !is.character(names(corpus))) {
    "corpus object names should be a character vector"
  } else {
    NULL
  }

  valid <- is.null(problem)
  if (!valid && warn) warning(problem)

  return(valid)
}


#' Validate Document Term Matrix Object
#'
#' A valid document term matrix is a sparse matrix with
#' the rows representing documents and columns representing
#' terms. The row names is a character vector giving the
#' document ids with no duplicated entries.
#' The column names is a character vector giving the terms of the
#' matrix with no duplicated entries. The sparse matrix
#' should inherit from the Matrix class dgCMatrix.
#'
#' @param dtm     a document term matrix object to test
#'                the validity of
#' @param warn    logical. Should the function produce a
#'                verbose warning for the condition for which
#'                the validation fails. Useful for testing.
#' @return a logical vector of length one indicating
#'         whether the input is a valid document term
#'         matrix
#'
#' @details
#' The tests are run sequentially and the function returns,
#' with a warning if the warn flag is set, on the first test
#' that fails. We use this implementation because some tests
#' may fail entirely or be meaningless if the prior ones are
#' not passed. For example, if the dtm object is not a matrix
#' it may not contain row or column names.
#'
#' @example inst/examples/tif_is_dtm.R
#' @importFrom Matrix Matrix
#' @export
tif_is_dtm <- function(dtm, warn = FALSE) {

  # warning() pastes its arguments together without a separator, so a
  # message split across two strings needs its own space at the join.

  if (!inherits(dtm, "dgCMatrix")) {
    if (warn) warning("document term matrix object must inherit ",
                      "the dgCMatrix class")
    return(FALSE)
  }

  if (is.null(colnames(dtm))) {
    if (warn) warning("document term matrix object must have column names")
    return(FALSE)
  }

  if (is.null(rownames(dtm))) {
    if (warn) warning("document term matrix object must have row names")
    return(FALSE)
  }

  if (!is.character(rownames(dtm))) {
    if (warn) warning("document term matrix object must have character ",
                      "row names")
    return(FALSE)
  }

  if (!is.character(colnames(dtm))) {
    if (warn) warning("document term matrix object must have character ",
                      "column names")
    return(FALSE)
  }

  if (any(duplicated(rownames(dtm)))) {
    if (warn) warning("document term matrix object has duplicated row names")
    return(FALSE)
  }

  if (any(duplicated(colnames(dtm)))) {
    if (warn) warning("document term matrix object has duplicated column ",
                      "names")
    return(FALSE)
  }

  return(TRUE)
}

#' Validate Tokens Data Frame Object
#'
#' A valid tokens data frame object is a data frame or an
#' object that inherits a data frame. It has no row names
#' and has at least two columns. It must contain a column called
#' doc_id that is a character vector with UTF-8 encoding.
#' Document ids may repeat across rows. It must also contain a column
#' called token that must also be a character vector in UTF-8 encoding.
#' Each individual token is represented by a single row in
#' the data frame. Additional token-level metadata columns
#' are allowed but not required.
#'
#' @param tokens  a tokens object to test for validity
#' @param warn    logical. Should the function produce a
#'                verbose warning for the condition for which
#'                the validation fails. Useful for testing.
#' @return a logical vector of length one indicating
#'         whether the input is a valid tokens object
#'
#' @details
#' The tests are run sequentially and the function returns,
#' with a warning if the warn flag is set, on the first test
#' that fails. We use this implementation because some tests
#' may fail entirely or be meaningless if the prior ones are
#' not passed. For example, if the tokens object does not
#' have a variable named "doc_id" it does not make sense to
#' check whether this column is a character vector.
#'
#' @example inst/examples/tif_is_tokens_df.R
#' @export
tif_is_tokens_df <- function(tokens, warn = FALSE) {

  # UTF-8 encoding of doc_id and token belongs to the format
  # description but is not verified here.

  if (!inherits(tokens, "data.frame")) {
    if (warn) warning("tokens object must inherit the data.frame class")
    return(FALSE)
  }

  if (ncol(tokens) <= 1L) {
    if (warn) warning("tokens object must contain at least two columns")
    return(FALSE)
  }

  if (!all(c("doc_id", "token") %in% names(tokens))) {
    # warning() pastes its arguments without a separator, so the first
    # part carries the joining space
    if (warn) warning("data frame must contain columns named ",
                      "'doc_id' and 'token'")
    return(FALSE)
  }

  if (.row_names_info(tokens, type = 1) > 0) {
    if (warn) warning("tokens object should not contain row names")
    return(FALSE)
  }

  if (!is.character(tokens$doc_id)) {
    if (warn) warning("doc_id must be a character vector")
    return(FALSE)
  }

  if (!is.character(tokens$token)) {
    # name the column that was actually tested
    if (warn) warning("token must be a character vector")
    return(FALSE)
  }

  return(TRUE)
}

#' Validate Tokens List Object
#'
#' A valid corpus tokens object is a (possibly named) list of
#' character vectors. The character vectors, as well as
#' names, should be in UTF-8 encoding. No other attributes
#' should be present in either the list or any of its elements.
#'
#' @param tokens  a tokens object to test for validity
#' @param warn    logical. Should the function produce a
#'                verbose warning for the condition for which
#'                the validation fails. Useful for testing.
#' @return a logical vector of length one indicating
#'         whether the input is a valid tokens
#'
#' @details
#' The tests are run sequentially and the function returns,
#' with a warning if the warn flag is set, on the first test
#' that fails. We use this implementation because some tests
#' may fail entirely or be meaningless if the prior ones are
#' not passed.
#'
#' @example inst/examples/tif_is_tokens_list.R
#' @export
tif_is_tokens_list <- function(tokens, warn = FALSE) {

  # `problem` is the message of the first failing check in the chain,
  # or NULL when the object passes all of them. The list-level checks
  # come first, followed by the per-element checks.
  #
  # UTF-8 encoding of the tokens and of the names belongs to the format
  # description but is not verified here.
  problem <- if (!is.list(tokens)) {
    "tokens object must be a list"
  } else if (!is.null(names(tokens)) && anyDuplicated(names(tokens)) > 0L) {
    "names of tokens object must not be duplicated"
  } else if (length(setdiff(names(attributes(tokens)), "names")) > 0L) {
    # any attribute other than "names" disqualifies the list
    "tokens object should only have 'names' attribute"
  } else if (!is.null(names(tokens)) && !is.character(names(tokens))) {
    "tokens object names should be a character vector"
  } else if (any(vapply(tokens, is.null, logical(1)))) {
    "no elements of tokens should be 'NULL'"
  } else if (!all(vapply(tokens, is.character, logical(1)))) {
    "elements of tokens should all be a character vectors"
  } else if (!all(vapply(tokens, function(el) is.null(attributes(el)),
                         logical(1)))) {
    "elements of tokens should have no additional attributes"
  } else {
    NULL
  }

  valid <- is.null(problem)
  if (!valid && warn) warning(problem)

  return(valid)
}

# --------------------------------------------------------------------------------