├── .Rbuildignore
├── .gitignore
├── .travis.yml
├── CODE_OF_CONDUCT.md
├── DESCRIPTION
├── NAMESPACE
├── R
├── bind_list.R
├── bind_table.R
├── bind_vector.R
├── change_index.R
├── cluster_matrix.R
├── column_to_rownames.R
├── combine.R
├── duration.R
├── flatten.R
├── from_to.R
├── grab_index.R
├── grab_match.R
├── mtabulate.R
├── split_index.R
├── split_match.R
├── split_match_regex_to_transcript.R
├── split_portion.R
├── split_run.R
├── split_sentence.R
├── split_sentence_token.R
├── split_speaker.R
├── split_token.R
├── split_transcript.R
├── split_word.R
├── textshape-package.R
├── tidy_colo_dtm.R
├── tidy_dtm.R
├── tidy_list.R
├── tidy_matrix.R
├── tidy_table.R
├── tidy_vector.R
├── unique_pairs.R
├── unnest_text.R
└── utils.R
├── README.Rmd
├── README.md
├── data
├── DATA.rda
├── golden_rules.rda
├── hamlet.rda
└── simple_dtm.rda
├── inst
├── build.R
├── docs
│ └── Simpsons_Roasting_on_an_Open_Fire_Script.pdf
├── extra_statdoc
│ └── readme.R
├── make_data
│ └── golden_rules.R
├── staticdocs
│ └── index.R
└── supporting_docs
│ └── LaverGarry.zip
├── man
├── DATA.Rd
├── bind_list.Rd
├── bind_table.Rd
├── bind_vector.Rd
├── change_index.Rd
├── cluster_matrix.Rd
├── column_to_rownames.Rd
├── combine.Rd
├── duration.Rd
├── flatten.Rd
├── from_to.Rd
├── golden_rules.Rd
├── grab_index.Rd
├── grab_match.Rd
├── hamlet.Rd
├── mtabulate.Rd
├── simple_dtm.Rd
├── split_index.Rd
├── split_match.Rd
├── split_portion.Rd
├── split_run.Rd
├── split_sentence.Rd
├── split_sentence_token.Rd
├── split_speaker.Rd
├── split_token.Rd
├── split_transcript.Rd
├── split_word.Rd
├── textshape.Rd
├── tidy_colo_dtm.Rd
├── tidy_dtm.Rd
├── tidy_list.Rd
├── tidy_matrix.Rd
├── tidy_table.Rd
├── tidy_vector.Rd
├── unique_pairs.Rd
└── unnest_text.Rd
├── tests
├── testthat.R
└── testthat
│ ├── test-change_index.R
│ └── test-mtabulate.R
└── tools
├── figure
├── unnamed-chunk-10-1.png
├── unnamed-chunk-16-1.png
├── unnamed-chunk-17-1.png
├── unnamed-chunk-18-1.png
├── unnamed-chunk-19-1.png
├── unnamed-chunk-21-1.png
├── unnamed-chunk-22-1.png
├── unnamed-chunk-23-1.png
├── unnamed-chunk-27-1.png
├── unnamed-chunk-32-1.png
├── unnamed-chunk-44-1.png
├── unnamed-chunk-45-1.png
├── unnamed-chunk-46-1.png
├── unnamed-chunk-55-1.png
├── unnamed-chunk-8-1.png
└── unnamed-chunk-9-1.png
└── textshape_logo
├── r_textshape.png
├── r_textshape.pptx
├── r_textshapea.png
└── resize_icon.txt
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^\.gitignore
4 | NEWS.md
5 | FAQ.md
6 | NEWS.html
7 | FAQ.html
8 | ^\.travis\.yml$
9 | travis-tool.sh
10 | inst/web
11 | contributors.geojson
12 | inst/build.R
13 | ^.*\.Rprofile$
14 | README.Rmd
15 | README.R
16 | travis.yml
17 | tools/textshape_logo/r_textshapea.png
18 | tools/textshape_logo/r_textshape.pptx
19 | tools/textshape_logo/resize_icon.txt
20 | inst/staticdocs
21 | inst/extra_statdoc
22 | inst/maintenance.R
23 | Thumbs.db
24 | inst/make_data
25 | inst/supporting_docs
26 | ^CODE_OF_CONDUCT\.md$
27 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # History files
2 | .Rhistory
3 | # Example code in package build process
4 | *-Ex.R
5 | .Rprofile
6 | .Rproj.user
7 | textshape.Rproj
8 | inst/maintenance.R
9 | Thumbs.db
10 | .DS_Store
11 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: r
2 | r:
3 | - release
4 | - devel
5 | sudo: false
6 | cache: packages
7 |
8 |
9 | r_github_packages: jimhester/covr
10 |
11 | before_install:
12 | - sh -e /etc/init.d/xvfb start
13 |
14 | after_success:
15 | - Rscript -e 'covr::coveralls()'
16 |
17 | r_build_args: "--resave-data=best"
18 | r_check_args: "--as-cran"
19 |
20 | env:
21 | global:
22 | - DISPLAY=:99.0
23 | - BOOTSTRAP_LATEX=1
24 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Code of Conduct
2 |
3 | As contributors and maintainers of this project, we pledge to respect all people who
4 | contribute through reporting issues, posting feature requests, updating documentation,
5 | submitting pull requests or patches, and other activities.
6 |
7 | We are committed to making participation in this project a harassment-free experience for
8 | everyone, regardless of level of experience, gender, gender identity and expression,
9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion.
10 |
11 | Examples of unacceptable behavior by participants include the use of sexual language or
12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment,
13 | insults, or other unprofessional conduct.
14 |
15 | Project maintainers have the right and responsibility to remove, edit, or reject comments,
16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this
17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed
18 | from the project team.
19 |
20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by
21 | opening an issue or contacting one or more of the project maintainers.
22 |
23 | This Code of Conduct is adapted from the Contributor Covenant
24 | (http://contributor-covenant.org), version 1.0.0, available at
25 | http://contributor-covenant.org/version/1/0/0/
26 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: textshape
2 | Title: Tools for Reshaping Text
3 | Version: 1.7.6
4 | Authors@R: c(
5 | person("Tyler", "Rinker", email = "tyler.rinker@gmail.com", role = c("aut", "cre")),
6 | person("Joran", "Elias", role = "ctb"),
7 | person("Matthew", "Flickinger", role = "ctb"),
8 | person('Paul', 'Foster', role = "ctb")
9 | )
10 | Maintainer: Tyler Rinker <tyler.rinker@gmail.com>
11 | Description: Tools that can be used to reshape and restructure text data.
12 | Depends: R (>= 3.4.0)
13 | Imports: data.table, slam, stats, stringi, utils
14 | Suggests: testthat
15 | License: GPL-2
16 | LazyData: TRUE
17 | RoxygenNote: 7.2.3
18 | Encoding: UTF-8
19 | URL: https://github.com/trinker/textshape
20 | BugReports: https://github.com/trinker/textshape/issues
21 | Collate:
22 | 'bind_list.R'
23 | 'bind_table.R'
24 | 'bind_vector.R'
25 | 'change_index.R'
26 | 'cluster_matrix.R'
27 | 'column_to_rownames.R'
28 | 'combine.R'
29 | 'duration.R'
30 | 'flatten.R'
31 | 'from_to.R'
32 | 'grab_index.R'
33 | 'grab_match.R'
34 | 'mtabulate.R'
35 | 'split_index.R'
36 | 'split_match.R'
37 | 'split_match_regex_to_transcript.R'
38 | 'split_portion.R'
39 | 'split_run.R'
40 | 'split_sentence.R'
41 | 'split_sentence_token.R'
42 | 'split_speaker.R'
43 | 'split_token.R'
44 | 'split_transcript.R'
45 | 'split_word.R'
46 | 'textshape-package.R'
47 | 'tidy_colo_dtm.R'
48 | 'utils.R'
49 | 'tidy_dtm.R'
50 | 'tidy_list.R'
51 | 'tidy_matrix.R'
52 | 'tidy_table.R'
53 | 'tidy_vector.R'
54 | 'unique_pairs.R'
55 | 'unnest_text.R'
56 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | S3method(combine,data.frame)
4 | S3method(combine,default)
5 | S3method(duration,data.frame)
6 | S3method(duration,default)
7 | S3method(duration,numeric)
8 | S3method(from_to,character)
9 | S3method(from_to,data.frame)
10 | S3method(from_to,default)
11 | S3method(from_to,factor)
12 | S3method(from_to,numeric)
13 | S3method(grab_index,character)
14 | S3method(grab_index,data.frame)
15 | S3method(grab_index,default)
16 | S3method(grab_index,list)
17 | S3method(grab_index,matrix)
18 | S3method(grab_match,character)
19 | S3method(grab_match,data.frame)
20 | S3method(grab_match,list)
21 | S3method(split_index,character)
22 | S3method(split_index,data.frame)
23 | S3method(split_index,default)
24 | S3method(split_index,factor)
25 | S3method(split_index,list)
26 | S3method(split_index,matrix)
27 | S3method(split_index,numeric)
28 | S3method(split_run,data.frame)
29 | S3method(split_run,default)
30 | S3method(split_sentence,data.frame)
31 | S3method(split_sentence,default)
32 | S3method(split_sentence_token,data.frame)
33 | S3method(split_sentence_token,default)
34 | S3method(split_token,data.frame)
35 | S3method(split_token,default)
36 | S3method(split_word,data.frame)
37 | S3method(split_word,default)
38 | S3method(unique_pairs,data.table)
39 | S3method(unique_pairs,default)
40 | export(as_list)
41 | export(bind_list)
42 | export(bind_table)
43 | export(bind_vector)
44 | export(change_index)
45 | export(cluster_matrix)
46 | export(column_to_rownames)
47 | export(combine)
48 | export(duration)
49 | export(ends)
50 | export(flatten)
51 | export(from_to)
52 | export(from_to_summarize)
53 | export(grab_index)
54 | export(grab_match)
55 | export(mtabulate)
56 | export(split_index)
57 | export(split_match)
58 | export(split_match_regex)
59 | export(split_portion)
60 | export(split_run)
61 | export(split_sentence)
62 | export(split_sentence_token)
63 | export(split_speaker)
64 | export(split_token)
65 | export(split_transcript)
66 | export(split_word)
67 | export(starts)
68 | export(tidy_adjacency_matrix)
69 | export(tidy_colo_dtm)
70 | export(tidy_colo_tdm)
71 | export(tidy_dtm)
72 | export(tidy_list)
73 | export(tidy_matrix)
74 | export(tidy_table)
75 | export(tidy_tdm)
76 | export(tidy_vector)
77 | export(unique_pairs)
78 | export(unnest_text)
79 | importFrom(data.table,":=")
80 | importFrom(data.table,.N)
81 |
--------------------------------------------------------------------------------
/R/bind_list.R:
--------------------------------------------------------------------------------
1 | #' Row Bind a List of Named Dataframes or Vectors
2 | #'
3 | #' Deprecated, use \code{\link[textshape]{tidy_list}} instead.
4 | #'
5 | #' @param x A named \code{\link[base]{list}} of
6 | #' \code{\link[base]{data.frame}}s or \code{\link[base]{vector}}.
7 | #' @param id.name The name to use for the column created from the \code{\link[base]{list}}.
8 | #' @param content.name The name to use for the column created from the \code{\link[base]{list}}
9 | #' of \code{\link[base]{vector}}s (only used if \code{x} is \code{\link[base]{vector}}).
10 | #' @param \ldots ignored.
11 | #' @return Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}}
12 | #' from the \code{\link[base]{list}} as an \code{id} column.
13 | #' @export
14 | #' @examples
15 | #' \dontrun{
16 | #' bind_list(list(p=1:500, r=letters))
17 | #' bind_list(list(p=mtcars, r=mtcars, z=mtcars, d=mtcars))
18 | #'
19 | #' ## 2015 Vice-Presidential Debates Example
20 | #' if (!require("pacman")) install.packages("pacman")
21 | #' pacman::p_load(rvest, magrittr, xml2)
22 | #'
23 | #' debates <- c(
24 | #' wisconsin = "110908",
25 | #' boulder = "110906",
26 | #' california = "110756",
27 | #' ohio = "110489"
28 | #' )
29 | #'
30 | #' lapply(debates, function(x){
31 | #' xml2::read_html(paste0("http://www.presidency.ucsb.edu/ws/index.php?pid=", x)) %>%
32 | #' rvest::html_nodes("p") %>%
33 | #' rvest::html_text() %>%
34 | #' textshape::split_index(grep("^[A-Z]+:", .)) %>%
35 | #' textshape::combine() %>%
36 | #' textshape::split_transcript() %>%
37 | #' textshape::split_sentence()
38 | #' }) %>%
39 | #' textshape::bind_list("location")
40 | #' }
41 | bind_list <- function(x, id.name= "id", content.name = "content", ...){
42 |
   | # Deprecation notice on every call; the function still works and simply
   | # forwards users to textshape::tidy_list().
43 | warning(
44 | paste0(
45 | "Deprecated, use textshape::tidy_list() instead.\n`bind_list()` ",
46 | "will be removed in the next version."
47 | ),
48 | call. = FALSE
49 | )
50 |
51 |
   | # Dispatch on the class of the FIRST element only; a list mixing
   | # data.frames and vectors is not detected here and surfaces downstream.
52 | if (is.data.frame(x[[1]])){
53 | bind_list_df(x = x, id.name = id.name)
54 | } else {
55 |
56 | if (is.vector(x[[1]])){
57 | bind_list_vector(x = x, id.name = id.name, content.name = content.name)
58 | } else {
59 | stop("`x` must be a list of `data.frame`s or `vector`s")
60 | }
61 | }
62 | }
63 |
64 |
65 |
   | # Row-bind a named list of data.frames, adding the list names as an id
   | # column. Helper for bind_list().
66 | bind_list_df <- function (x, id.name = "id"){
   | # Unnamed lists get positional ids (1, 2, ...).
67 | if (is.null(names(x))) {
68 | names(x) <- seq_along(x)
69 | }
   | # Compute the id column BEFORE the `fill` element is appended below:
   | # each list name is repeated once per row of its data.frame.
70 | list.names <- rep(names(x), sapply2(x, nrow))
71 | x <- lapply(x, data.table::as.data.table)
   | # Appending a named `fill` element means do.call() passes fill = TRUE as
   | # a named argument to data.table's rbind, padding missing columns.
72 | x[['fill']] <- TRUE
73 | out <- data.frame(list.names, do.call(rbind, x),
74 | row.names = NULL, check.names = FALSE, stringsAsFactors = FALSE)
75 | colnames(out)[1] <- id.name
76 | data.table::data.table(out)
77 | }
78 |
   | # Stack a named list of vectors into a two-column table (id, content).
   | # Helper for bind_list().
79 | bind_list_vector <- function(x, id.name= "id", content.name = "content"){
   | # Unnamed lists get positional ids (1, 2, ...).
80 | if (is.null(names(x))) {
81 | names(x) <- seq_along(x)
82 | }
   | # One output row per vector element: each list name repeated for the
   | # length of its vector, paired with the flattened values.
83 | dat <- data.frame(
84 | rep(names(x), sapply2(x, length)),
85 | unlist(x, use.names = FALSE),
86 | stringsAsFactors = FALSE,
87 | check.names = FALSE,
88 | row.names = NULL
89 | )
90 | colnames(dat) <- c(id.name, content.name)
91 | data.table::data.table(dat)
92 | }
93 |
94 |
95 |
96 |
--------------------------------------------------------------------------------
/R/bind_table.R:
--------------------------------------------------------------------------------
1 | #' Column Bind a Table's Values with Its Names
2 | #'
3 | #' Deprecated, use \code{\link[textshape]{tidy_table}} instead.
4 | #'
5 | #' @param x A \code{\link[base]{table}}.
6 | #' @param id.name The name to use for the column created from the \code{\link[base]{table}}
7 | #' \code{\link[base]{names}}.
8 | #' @param content.name The name to use for the column created from the \code{\link[base]{table}}
9 | #' values.
10 | #' @param \ldots ignored.
11 | #' @return Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}}
12 | #' from the \code{\link[base]{table}} as an \code{id} column.
13 | #' @export
14 | #' @examples
15 | #' \dontrun{
16 | #' x <- table(sample(LETTERS[1:6], 1000, TRUE))
17 | #' bind_table(x)
18 | #' }
19 | bind_table <- function(x, id.name= "id", content.name = "content", ...){
20 |
   | # Deprecation notice; the function still works.
21 | warning(
22 | paste0(
23 | "Deprecated, use textshape::tidy_table() instead.\n`bind_table()` ",
24 | "will be removed in the next version."
25 | ),
26 | call. = FALSE
27 | )
28 |
29 | stopifnot(is.table(x))
30 |
   | # c(x) strips the table class/dims down to a plain vector; unname() drops
   | # its names, which are instead supplied as the id column.
31 | out <- data.table::data.table(x = names(x), y = unname(c(x)))
32 | data.table::setnames(out, c(id.name, content.name))
33 | out
34 |
35 | }
36 |
37 |
--------------------------------------------------------------------------------
/R/bind_vector.R:
--------------------------------------------------------------------------------
1 | #' Column Bind an Atomic Vector's Values with Its Names
2 | #'
3 | #' Deprecated, use \code{\link[textshape]{tidy_vector}} instead.
4 | #'
5 | #' @param x A named atomic \code{\link[base]{vector}}.
6 | #' @param id.name The name to use for the column created from the \code{\link[base]{vector}}
7 | #' \code{\link[base]{names}}.
8 | #' @param content.name The name to use for the column created from the \code{\link[base]{vector}}
9 | #' values.
10 | #' @param \ldots ignored.
11 | #' @return Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}}
12 | #' from the \code{\link[base]{vector}} as an \code{id} column.
13 | #' @export
14 | #' @examples
15 | #' \dontrun{
16 | #' x <- setNames(sample(LETTERS[1:6], 1000, TRUE), sample(state.name[1:5], 1000, TRUE))
17 | #' bind_vector(x)
18 | #' }
bind_vector <- function(x, id.name= "id", content.name = "content", ...){

    ## Deprecation notice; the function still works.
    warning(
        paste0(
            "Deprecated, use textshape::tidy_vector() instead.\n`bind_vector()` ",
            "will be removed in the next version."
        ),
        call. = FALSE
    )

    stopifnot(is.atomic(x))

    ## BUG FIX: the original tested `is.null(names)` -- `names` is the base R
    ## function, which is never NULL, so the unnamed-vector branch was
    ## unreachable and unnamed input fell through to the two-column branch
    ## with a NULL id column. Test the names of `x` itself.
    if (is.null(names(x))) {
        ## Unnamed vector: return a one-column table of the values.
        out <- data.table::as.data.table(x)
        data.table::setnames(out, id.name)
    } else {
        ## Named vector: names become the id column, values the content column.
        out <- data.table::data.table(x = names(x), y = unname(x))
        data.table::setnames(out, c(id.name, content.name))
    }
    out
}
40 |
41 |
42 |
--------------------------------------------------------------------------------
/R/change_index.R:
--------------------------------------------------------------------------------
1 | #' Indexing of Changes in Runs
2 | #'
3 | #' Find the indices of changes in runs in a vector. This function pairs well
4 | #' with \code{split_index} and is the default for the \code{indices} in all
5 | #' \code{split_index} functions that act on atomic vectors.
6 | #'
7 | #' @param x A vector.
8 | #' @param \ldots ignored.
9 | #' @return Returns a vector of integer indices of where a vector initially
10 | #' changes.
11 | #' @export
12 | #' @seealso \code{\link[textshape]{split_index}}
13 | #' @examples
14 | #' set.seed(10)
15 | #' (x <- sample(0:1, 20, TRUE))
16 | #' change_index(x)
17 | #' split_index(x, change_index(x))
18 | #'
19 | #'
20 | #' (p_chng <- change_index(CO2[["Plant"]]))
21 | #' split_index(CO2[["Plant"]], p_chng)
change_index <- function (x, ...) {
    ## Run-length encode the vector (coerced to character so all atomic
    ## types are compared by their printed values), then take the cumulative
    ## run ends: one past each run's end is where a new run begins. The
    ## final boundary (one past the last element) is dropped.
    run.lengths <- rle(as.character(x))$lengths
    boundaries <- cumsum(run.lengths) + 1
    boundaries[-length(boundaries)]
}
25 |
26 |
--------------------------------------------------------------------------------
/R/cluster_matrix.R:
--------------------------------------------------------------------------------
1 | #' Reorder a Matrix Based on Hierarchical Clustering
2 | #'
3 | #' Reorder matrix rows, columns, or both via hierarchical clustering.
4 | #'
5 | #' @param x A matrix.
6 | #' @param dim The dimension to reorder (cluster); must be set to "row", "col",
7 | #' or "both".
8 | #' @param method The agglomeration method to be used (see
9 | #' \code{\link[stats]{hclust}}).
10 | #' @param \ldots ignored.
11 | #' @return Returns a reordered matrix.
12 | #' @export
13 | #' @seealso \code{\link[stats]{hclust}}
14 | #' @examples
15 | #' cluster_matrix(mtcars)
16 | #' cluster_matrix(mtcars, dim = 'row')
17 | #' cluster_matrix(mtcars, dim = 'col')
18 | #'
19 | #' \dontrun{
20 | #' if (!require("pacman")) install.packages("pacman")
21 | #' pacman::p_load(tidyverse, viridis, gridExtra)
22 | #'
23 | #' ## plot heatmap w/o clustering
24 | #' wo <- mtcars %>%
25 | #' cor() %>%
26 | #' tidy_matrix('car', 'var') %>%
27 | #' ggplot(aes(var, car, fill = value)) +
28 | #' geom_tile() +
29 | #' scale_fill_viridis(name = expression(r[xy])) +
30 | #' theme(
31 | #' axis.text.y = element_text(size = 8) ,
32 | #' axis.text.x = element_text(
33 | #' size = 8,
34 | #' hjust = 1,
35 | #' vjust = 1,
36 | #' angle = 45
37 | #' ),
38 | #' legend.position = 'bottom',
39 | #' legend.key.height = grid::unit(.1, 'cm'),
40 | #' legend.key.width = grid::unit(.5, 'cm')
41 | #' ) +
42 | #' labs(subtitle = "With Out Clustering")
43 | #'
44 | #' ## plot heatmap w clustering
45 | #' w <- mtcars %>%
46 | #' cor() %>%
47 | #' cluster_matrix() %>%
48 | #' tidy_matrix('car', 'var') %>%
49 | #' mutate(
50 | #' var = factor(var, levels = unique(var)),
51 | #' car = factor(car, levels = unique(car))
52 | #' ) %>%
53 | #' group_by(var) %>%
54 | #' ggplot(aes(var, car, fill = value)) +
55 | #' geom_tile() +
56 | #' scale_fill_viridis(name = expression(r[xy])) +
57 | #' theme(
58 | #' axis.text.y = element_text(size = 8) ,
59 | #' axis.text.x = element_text(
60 | #' size = 8,
61 | #' hjust = 1,
62 | #' vjust = 1,
63 | #' angle = 45
64 | #' ),
65 | #' legend.position = 'bottom',
66 | #' legend.key.height = grid::unit(.1, 'cm'),
67 | #' legend.key.width = grid::unit(.5, 'cm')
68 | #' ) +
69 | #' labs(subtitle = "With Clustering")
70 | #'
71 | #' gridExtra::grid.arrange(wo, w, ncol = 2)
72 | #' }
73 | cluster_matrix <- function(x, dim = 'both', method = "ward.D2", ...){
74 |
   | # Accept a matrix or a data.frame (stats::dist handles both).
75 | stopifnot(is.matrix(x) | is.data.frame(x))
76 |
   | # Rows are clustered on dist(x); columns on dist(t(x)) -- Euclidean
   | # distance, the stats::dist default. The hclust leaf ordering ($order)
   | # is then used to reorder the requested dimension(s).
77 | switch(dim,
78 | row = {
79 | hc1 <- stats::hclust(stats::dist(x), method = method)
80 | x[hc1$order, ]
81 | },
82 | col = {
83 | hc2 <- stats::hclust(stats::dist(t(x)), method = method)
84 | x[, hc2$order]
85 | },
86 | both = {
87 | hc1 <- stats::hclust(stats::dist(x), method = method)
88 | hc2 <- stats::hclust(stats::dist(t(x)), method = method)
89 | x[hc1$order, hc2$order]
90 | },
   | # switch() fall-through default: any other `dim` value errors.
91 | stop('`dim` must be set to "row", "col", or "both"')
92 | )
93 |
94 | }
95 |
96 |
97 |
98 |
--------------------------------------------------------------------------------
/R/column_to_rownames.R:
--------------------------------------------------------------------------------
1 | #' Add a Column as Rownames
2 | #'
3 | #' Takes an existing column and uses it as rownames instead. This is useful
4 | #' when turning a \code{\link[base]{data.frame}} into a \code{\link[base]{matrix}}.
5 | #' Inspired by the \pkg{tibble} package's \code{column_to_row} which is now
6 | #' deprecated if done on a \pkg{tibble} object. By coercing to a
7 | #' \code{\link[base]{data.frame}} this problem is avoided.
8 | #'
9 | #' @param x An object that can be coerced to a \code{\link[base]{data.frame}}.
10 | #' @param loc The column location as either an integer or string index location.
11 | #' Must be unique row names.
12 | #' @return Returns a \code{\link[base]{data.frame}} with the specified column
13 | #' moved to rownames.
14 | #' @export
15 | #' @examples
16 | #' state_dat <- data.frame(state.name, state.area, state.center, state.division)
17 | #' column_to_rownames(state_dat)
18 | #' column_to_rownames(state_dat, 'state.name')
column_to_rownames <- function(x, loc = 1){

    ## Coerce first so tibbles/data.tables (which do not support rownames)
    ## become a plain data.frame that does.
    x <- as.data.frame(x, check.names = FALSE, stringsAsFactors = FALSE)

    ## Resolve a character index to a column position. Fail loudly when the
    ## name does not match rather than letting `x[[NA]]` raise an obscure
    ## error downstream.
    if (!is.numeric(loc)) {
        loc <- which(names(x) %in% loc)[1]
        if (is.na(loc)) stop("`loc` does not match any column name of `x`")
    }

    ## Rownames must be unique; give a clearer message than base R's.
    if (anyDuplicated(x[[loc]]) > 0) {
        stop("the column given by `loc` must contain unique values")
    }

    rownames(x) <- x[[loc]]
    x[[loc]] <- NULL
    x

}
28 |
--------------------------------------------------------------------------------
/R/combine.R:
--------------------------------------------------------------------------------
1 | #' Combine Elements
2 | #'
3 | #' Combine (\code{\link[base]{paste}}) elements (\code{\link[base]{vector}}s,
4 | #' \code{\link[base]{list}}s, or \code{\link[base]{data.frame}}s) together
5 | #' with \code{collapse = TRUE}.
6 | #'
7 | #' @param x A \code{\link[base]{data.frame}} or character vector with runs.
8 | #' @param text.var The name of the text variable.
9 | #' @param fix.punctuation logical If \code{TRUE} spaces before/after punctuation
10 | #' that should not be are a removed (regex used:
11 | #' \code{"(\\s+(?=[,.?!;:\%-]))|((?<=[$-])\\s+)"}).
12 | #' @param \ldots Ignored.
13 | #' @export
14 | #' @rdname combine
15 | #' @return Returns a vector (if given a list/vector) or an expanded
16 | #' \code{\link[data.table]{data.table}} with elements pasted together.
17 | #' @examples
18 | #' (x <- split_token(DATA[["state"]][1], FALSE))
19 | #' combine(x)
20 | #'
21 | #' (x2 <- split_token(DATA[["state"]], FALSE))
22 | #' combine(x2)
23 | #'
24 | #' (x3 <- split_sentence(DATA))
25 | #'
26 | #' ## without dropping the non-group variable column
27 | #' combine(x3)
28 | #'
29 | #' ## Dropping the non-group variable column
30 | #' combine(x3[, 1:5, with=FALSE])
31 | combine <- function(x, ...) {
   | # S3 generic; methods: combine.default (vectors/lists) and
   | # combine.data.frame (see below).
32 | UseMethod("combine")
33 | }
34 |
35 | #' @export
36 | #' @rdname combine
37 | #' @method combine default
combine.default <- function(x, fix.punctuation = TRUE, ...) {

    ## Treat a bare atomic vector as a one-element list so a single
    ## collapsed string is returned for it.
    elements <- if (is.list(x)) x else list(x)

    ## Collapse each element into one space-separated string (unnamed).
    collapsed <- vapply(
        elements,
        function(el) paste(el, collapse = " "),
        character(1),
        USE.NAMES = FALSE
    )

    ## Optionally remove the spaces that tokenization introduced around
    ## punctuation (before , . ? ! ; : % - and after $ or -).
    if (isTRUE(fix.punctuation)) {
        collapsed <- gsub(
            "(\\s+(?=[,.?!;:%-]))|((?<=[$-])\\s+)", "", collapsed,
            perl = TRUE
        )
    }

    collapsed
}
47 |
48 | #' @export
49 | #' @rdname combine
50 | #' @method combine data.frame
51 | combine.data.frame <- function(x, text.var = TRUE, ...) {
52 |
53 | nms <- colnames(x)
54 | z <- data.table::data.table(data.frame(x, stringsAsFactors = FALSE))
55 |
   | # detect_text_column() (package helper) resolves text.var = TRUE to the
   | # name of the text column.
56 | text.var <- detect_text_column(x, text.var)
57 |
   | # Every non-text column acts as a grouping key.
58 | group.vars <- nms[!nms %in% text.var]
59 |
   | # Build the expression `list(<text> = paste(<text>, collapse = " "))`
   | # as text so the runtime-chosen column name works inside data.table.
60 | express1 <- parse(text=
61 | paste0(
62 | "list(",
63 | text.var,
64 | " = paste(",
65 | text.var,
66 | ", collapse = \" \"))"
67 | )
68 | )
   | # Collapse the text within each group, then restore original column order.
69 | z <- z[, eval(express1), by = group.vars]
70 | data.table::setcolorder(z, nms)
71 | z
72 | }
73 |
74 |
--------------------------------------------------------------------------------
/R/duration.R:
--------------------------------------------------------------------------------
1 | #' Duration of Turns of Talk
2 | #'
3 | #' \code{duration} - Calculate duration (start and end times) for duration of
4 | #' turns of talk measured in words.
5 | #'
6 | #' @param x A \code{\link[base]{data.frame}} or character vector with a text
7 | #' variable or a numeric vector.
8 | #' @param text.var The name of the text variable. If \code{TRUE}
9 | #' \code{duration} tries to detect the text column.
10 | #' @param grouping.var The grouping variables. Default \code{NULL} generates
11 | #' one word list for all text. Also takes a single grouping variable or a list
12 | #' of 1 or more grouping variables.
13 | #' @param \ldots Ignored.
14 | #' @export
15 | #' @rdname duration
16 | #' @importFrom data.table .N :=
17 | #' @return Returns a vector or data frame of starts and/or ends.
18 | #' @examples
19 | #' (x <- c(
20 | #' "Mr. Brown comes! He says hello. i give him coffee.",
21 | #' "I'll go at 5 p. m. eastern time. Or somewhere in between!",
22 | #' "go there"
23 | #' ))
24 | #' duration(x)
25 | #' group <- c("A", "B", "A")
26 | #' duration(x, group)
27 | #'
28 | #' groups <- list(group1 = c("A", "B", "A"), group2 = c("red", "red", "green"))
29 | #' duration(x, groups)
30 | #'
31 | #' data(DATA)
32 | #' duration(DATA)
33 | #'
34 | #' ## Larger data set
35 | #' duration(hamlet)
36 | #'
37 | #' ## Integer values
38 | #' x <- sample(1:10, 10)
39 | #' duration(x)
40 | #' starts(x)
41 | #' ends(x)
42 | duration <- function(x, ...) {
   | # S3 generic; methods: duration.default, duration.data.frame,
   | # duration.numeric (see below).
43 | UseMethod("duration")
44 | }
45 |
46 | #' @export
47 | #' @rdname duration
48 | #' @method duration default
49 | duration.default <- function(x, grouping.var = NULL, ...) {
50 |
   | # Non-standard evaluation: substitute() captures the *expression* passed
   | # as grouping.var so its name(s) can become column name(s). Because of
   | # this NSE the logic below is order-sensitive; do not refactor lightly.
   | # `G` holds the "&"-joined grouping column name(s).
51 | if(is.null(grouping.var)) {
52 | G <- "all"
53 | ilen <- 1
54 | } else {
55 | if (is.list(grouping.var)) {
   | # For a list of grouping vectors, recover each element's name from the
   | # captured call, keeping only the part after `$` (dat$person -> "person").
56 | m <- unlist(as.character(substitute(grouping.var))[-1])
57 | m <- sapply2(strsplit(m, "$", fixed=TRUE), function(x) {
58 | x[length(x)]
59 | }
60 | )
61 | ilen <- length(grouping.var)
62 | G <- paste(m, collapse="&")
63 | } else {
64 | G <- as.character(substitute(grouping.var))
65 | ilen <- length(G)
66 | G <- G[length(G)]
67 | }
68 | }
   | # Build the actual grouping values (as opposed to their captured names).
69 | if(is.null(grouping.var)){
70 | grouping <- rep("all", length(x))
71 | } else {
72 | if (is.list(grouping.var) & length(grouping.var)>1) {
73 | grouping <- grouping.var
74 | } else {
75 | grouping <- unlist(grouping.var)
76 | }
77 | }
   | # Fall back to the list's own names when no name could be captured.
78 | if (G == "") G <- paste(names(grouping.var), collapse="&")
79 |
   | # Assemble a data.frame of grouping column(s) + text, then delegate to
   | # the data.frame method.
80 | dat <- stats::setNames(
81 | data.frame(as.data.frame(grouping), x),
82 | c(strsplit(G, "&")[[1]], "text.var")
83 | )
84 |
85 | duration.data.frame(dat, "text.var")
86 |
87 | }
88 |
89 | #' @export
90 | #' @rdname duration
91 | #' @method duration data.frame
92 | duration.data.frame <- function(x, text.var = TRUE, ...) {
93 |
   | # NULL-declare the data.table column name to avoid the R CMD check
   | # "no visible binding for global variable" note.
94 | word.count <- NULL
95 | nms <- colnames(x)
96 | z <- data.table::data.table(data.frame(x, stringsAsFactors = FALSE))
97 |
   | # detect_text_column() (package helper) resolves text.var = TRUE to the
   | # name of the text column.
98 | text.var <- detect_text_column(x, text.var)
99 |
   | # Build `word.count := stringi::stri_count_words(<text.var>)` as text so
   | # the runtime-chosen column name can be used inside data.table.
100 | express1 <- parse(
101 | text=paste0("word.count := stringi::stri_count_words(", text.var, ")")
102 | )
103 |
   | # Chained by-reference updates: count words, replace NA counts with 0,
   | # then set `end` to the running total of words.
104 | z[, eval(express1)][,
105 | 'word.count' := ifelse(is.na(word.count), 0, word.count)][,
106 | 'end' := cumsum(word.count)]
107 |
   | # Each turn starts one word after the previous turn's end (first at 1).
108 | z[["start"]] <- c(1, utils::head(z[["end"]] + 1, -1))
109 |
110 | colord <- c(nms[!nms %in% text.var], "word.count", "start", "end", text.var)
111 | data.table:: setcolorder(z, colord)
112 |
   | # `z[]` so the returned data.table prints after the `:=` updates above.
113 | z[]
114 |
115 | }
116 |
117 |
118 | #' @export
119 | #' @rdname duration
120 | #' @method duration numeric
duration.numeric <- function(x, ...){
    ## End of each turn is the running total of word counts; each turn
    ## starts one past the previous end (the first starts at 1).
    finish <- cumsum(x)
    begin <- c(1, utils::head(finish + 1, -1))
    data.frame(x = x, start = begin, end = finish)
}
126 |
127 |
128 | #' Duration of Turns of Talk
129 | #'
130 | #' \code{starts} - Calculate start times from a numeric vector.
131 | #'
132 | #' @rdname duration
133 | #' @export
134 | starts <- function(x, ...) c(1, utils::head(ends(x) + 1 , -1))  # each start is one past the previous end; first start is 1
135 |
136 |
137 |
138 | #' Duration of Turns of Talk
139 | #'
140 | #' \code{ends} - Calculate end times from a numeric vector.
141 | #'
142 | #' @rdname duration
143 | #' @export
144 | ends <- function(x, ...) cumsum(x)  # running total = end position of each turn
145 |
146 |
--------------------------------------------------------------------------------
/R/flatten.R:
--------------------------------------------------------------------------------
1 | #' Flatten a Nested List of Vectors Into a Single Tier List of Vectors
2 | #'
3 | #' Flatten a named, nested list of atomic vectors to a single level using the
4 | #' concatenated list/atomic vector names as the names of the single tiered
5 | #' list.
6 | #'
7 | #' @param x A nested, named list of vectors.
8 | #' @param sep A separator to use for the concatenation of the names from the
9 | #' nested list.
10 | #' @param \ldots ignored.
11 | #' @return Returns a flattened list.
12 | #' @author StackOverflow user @@Michael and Paul Foster and Tyler
13 | #' Rinker <tyler.rinker@gmail.com>.
14 | #' @export
15 | #' @note The order of the list is sorted alphabetically. Pull requests for the
16 | #' option to return the original order would be appreciated.
17 | #' @references \url{https://stackoverflow.com/a/41882883/1000343} \cr
18 | #' \url{https://stackoverflow.com/a/48357114/1000343}
19 | #' @examples
20 | #' x <- list(
21 | #' urban = list(
22 | #' cars = c('volvo', 'ford'),
23 | #' food.dining = list(
24 | #' local.business = c('carls'),
25 | #' chain.business = c('dennys', 'panera')
26 | #' )
27 | #' ),
28 | #' rural = list(
29 | #' land.use = list(
30 | #' farming =list(
31 | #' dairy = c('cows'),
32 | #' vegie.plan = c('carrots')
33 | #' )
34 | #' ),
35 | #' social.rec = list(
36 | #' community.center = c('town.square')
37 | #' ),
38 | #' people.type = c('good', 'bad', 'in.between')
39 | #' ),
40 | #' other.locales = c('suburban'),
41 | #' missing = list(
42 | #' unknown = c(),
43 | #' known = c()
44 | #' ),
45 | #' end = c('wow')
46 | #' )
47 | #'
48 | #' x
49 | #'
50 | #' flatten(x)
51 | #' flatten(x, ' -> ')
52 | flatten <- function(x , sep = '_', ...){
53 |
54 | stopifnot(is.list(x))
55 |
   | # Mask literal dots in the original names with a sentinel string so they
   | # survive unlist(), which itself joins nested names with dots.
56 | x <- fix_names(x)
57 |
58 | out<- flatten_h(x)
59 |
   | # The dots introduced by unlist() are the tier separators: replace them
   | # with `sep`, then restore the original dots from the sentinel.
60 | names(out) <- gsub('\\.', sep, names(out))
61 |
62 | names(out) <- gsub('unlikelystringtodupe', '.', names(out), fixed = TRUE)
63 |
   | # Result is sorted alphabetically by name (see @note in the docs).
64 | out[order(names(out))]
65 |
66 | }
67 |
   | # Recursive worker for flatten(): splice one nesting level per pass.
68 | flatten_h <- function(x){
69 |
   | # Flag which elements are themselves lists (strictly class "list", so
   | # data.frames are treated as leaves).
70 | z <- unlist(lapply(x, function(y) class(y)[1] == "list"))
71 |
   | # Keep the leaves; splice one nesting level out of the list elements.
72 | out <- c(x[!z], unlist(x[z], recursive=FALSE))
73 |
   | # Repeat (Recall = recursive self-call) until no list elements remain.
74 | if (sum(z)){
75 | Recall(out)
76 | } else {
77 | out
78 | }
79 | }
80 |
   | # Recursively replace literal dots in list names with a sentinel token;
   | # flatten() later converts unlist()'s dot separators to `sep` and then
   | # restores these masked dots.
81 | fix_names <- function(x) {
82 |
83 | if (is.list(x)) {
84 | names(x) <- gsub('\\.', 'unlikelystringtodupe', names(x))
85 | lapply(x, fix_names)
86 | } else {
87 |
   | # Leaf (non-list element): returned unchanged.
88 | x
89 | }
90 | }
91 |
92 |
93 |
94 |
95 |
96 |
--------------------------------------------------------------------------------
/R/from_to.R:
--------------------------------------------------------------------------------
1 | #' Prepare Discourse Data for Network Plotting
2 | #'
3 | #' \code{from_to} - Add the next speaker as the from variable in a to/from
4 | #' network data structure. Assumes that the flow of discourse is coming from
5 | #' person A to person B, or at the very least the talk is taken up by person B.
6 | #' Works by taking the vector of speakers and shifting everything down one and
7 | #' then adding a closing element.
8 | #'
9 | #' @param x A data form \code{vector} or \code{data.frame}.
10 | #' @param final The name of the closing element or node.
11 | #' @param \ldots Ignored.
12 | #' @param from.var A character string naming the column to be considered the
13 | #' origin of the talk.
14 | #' @param id.vars The variables that correspond to the speaker or are attributes
15 | #' of the speaker (from variable).
16 | #' @param text.var The name of the text variable. If \code{TRUE}
17 | #' \code{duration} tries to detect the text column.
18 | #' @return Returns a vector (if given a vector) or an augmented
19 | #' \code{\link[data.table]{data.table}}.
20 | #' @rdname from_to
21 | #' @export
22 | #' @examples
23 | #' from_to(DATA, 'person')
24 | #' from_to_summarize(DATA, 'person')
25 | #' from_to_summarize(DATA, 'person', c('sex', 'adult'))
26 | #' \dontrun{
27 | #' if (!require("pacman")) install.packages("pacman"); library(pacman)
28 | #' p_load(dplyr, geomnet, qdap, stringi, scales)
#' p_load_current_gh('trinker/textshape')
30 | #'
31 | #' dat <- from_to_summarize(DATA, 'person', c('sex', 'adult')) %>%
32 | #' mutate(words = rescale(word.count, c(.5, 1.5)))
33 | #'
34 | #' dat %>%
35 | #' ggplot(aes(from_id = from, to_id = to)) +
36 | #' geom_net(
37 | #' aes(linewidth = words),
38 | #' layout.alg = "fruchtermanreingold",
39 | #' directed = TRUE,
40 | #' labelon = TRUE,
41 | #' size = 1,
42 | #' labelcolour = 'black',
43 | #' ecolour = "grey70",
44 | #' arrowsize = 1,
45 | #' curvature = .1
46 | #' ) +
47 | #' theme_net() +
48 | #' xlim(c(-0.05, 1.05))
49 | #' }
from_to <- function(x, ...){
    ## S3 generic: dispatches to the class-specific methods below
    ## (character, factor, numeric, data.frame, default).
    UseMethod("from_to")
}
53 |
54 | #' @export
55 | #' @method from_to default
56 | #' @rdname from_to
from_to.default <- function(x, final = 'End', ...){
    ## Next-speaker vector: drop the first element and close with `final`.
    append(x[-1], final)
}
60 |
61 | #' @export
62 | #' @method from_to character
63 | #' @rdname from_to
from_to.character <- function(x, final = 'End', ...){
    ## Shift speakers down one position; the last turn goes to `final`.
    append(x[-1], final)
}
67 |
68 | #' @export
69 | #' @method from_to factor
70 | #' @rdname from_to
from_to.factor <- function(x, final = 'End', ...){
    ## Shift down one and close with `final`, preserving the original
    ## level order and adding `final` as the last level.
    shifted <- append(as.character(x)[-1], final)
    factor(shifted, levels = c(levels(x), final))
}
74 |
75 | #' @export
76 | #' @method from_to numeric
77 | #' @rdname from_to
from_to.numeric <- function(x, final = 'End', ...){
    ## Returns character (not numeric): `final` is a label, so the whole
    ## shifted vector is coerced to character.
    append(as.character(x)[-1], final)
}
81 |
82 | #' @export
83 | #' @method from_to data.frame
84 | #' @rdname from_to
from_to.data.frame <- function(x, from.var, final = 'End', ...){

    ## Edge table for network plotting: `from` is the speaker column and
    ## `to` is the same column shifted down one, closed out with `final`.
    speakers <- x[[from.var]]

    data.table::data.table(data.frame(
        from = speakers,
        ## Bug fix: forward `final` -- it was previously ignored and the
        ## default 'End' was always used regardless of the argument.
        to = from_to(speakers, final = final),
        x,
        stringsAsFactors = FALSE
    ))

}
95 |
96 | #' Prepare Discourse Data for Network Plotting
97 | #'
98 | #' \code{from_to_summarize} - A wrapper for \code{from_to.data.frame} that
99 | #' adds a \code{word.count} column and then combines duplicate rows.
100 | #'
101 | #' @rdname from_to
102 | #' @export
from_to_summarize <- function(x, from.var, id.vars = NULL, text.var = TRUE,
    ...){

    ## Declared to silence the "no visible binding" note triggered by the
    ## data.table non-standard evaluation below.
    word.count <- NULL

    z <- data.table::data.table(data.frame(x, stringsAsFactors = FALSE))
    ## Speaker-level attributes to re-attach after aggregation.
    if (!is.null(id.vars)) {
        w <- unique(z[, c(from.var, id.vars), with=FALSE])
    }

    ## `detect_text_column` (package helper) presumably resolves TRUE to
    ## the name of the text column -- see utils.R.
    text.var <- detect_text_column(x, text.var)

    ## Build the `word.count := ...` expression with the detected column
    ## name spliced in, since the column is only known at run time.
    express1 <- parse(
        text=paste0("word.count := stringi::stri_count_words(", text.var, ")")
    )

    ## Count words per row; rows with NA text count as 0 words.
    z <- z[, eval(express1)][,
        'word.count' := ifelse(is.na(word.count), 0, word.count)][]

    ## Add the to/from edge columns, then total words per (from, to) pair.
    out <- from_to(z, from.var)[,
        list(word.count = sum(word.count)), c('from', 'to')]

    ## Re-attach the speaker attributes keyed on the `from` node.
    if (!is.null(id.vars)) {
        out <- merge(out, w, all.x=TRUE, by.x = 'from', by.y = from.var)
    }

    out
}
131 |
--------------------------------------------------------------------------------
/R/grab_index.R:
--------------------------------------------------------------------------------
1 | #' Get Elements Matching Between 2 Points
2 | #'
#' Use integer indices to get all the elements between two points.
4 | #'
5 | #' @param x A character vector, \code{\link[base]{data.frame}}, or list.
6 | #' @param from An integer to start from (if \code{NULL} defaults to the first
7 | #' element/row).
#' @param to An integer to get up to (if \code{NULL} defaults to the last
9 | #' element/row).
10 | #' @param \ldots ignored.
11 | #' @return Returns a subset of the original data set.
12 | #' @export
13 | #' @examples
14 | #' grab_index(DATA, from = 2, to = 4)
15 | #' grab_index(DATA$state, from = 2, to = 4)
16 | #' grab_index(DATA$state, from = 2)
17 | #' grab_index(DATA$state, to = 4)
18 | #' grab_index(matrix(1:100, nrow = 10), 2, 4)
grab_index <- function(x, from = NULL, to = NULL, ...){

    ## S3 generic: subset `x` from element/row `from` through `to`
    ## (see the class-specific methods below).
    UseMethod('grab_index')

}
24 |
25 | #' @export
26 | #' @rdname grab_index
27 | #' @method grab_index character
grab_index.character <- function(x, from = NULL, to = NULL, ...){
    ## Character vectors use the plain-vector implementation.
    grab_index.default(x = x, from = from, to = to, ...)
}
33 |
34 | #' @export
35 | #' @rdname grab_index
36 | #' @method grab_index default
grab_index.default <- function(x, from = NULL, to = NULL, ...){

    ## NULL bounds default to the whole vector.
    if (is.null(from)) from <- 1
    if (is.null(to)) to <- length(x)

    ## Bug fix: the messages previously claimed strict bounds
    ## ("> 1 & < length(x)") while the checks allow 1 and length(x);
    ## also use scalar `||` for the scalar conditions.
    if (from < 1 || from > length(x)) stop('`from` must be >= 1 & <= length(x)')
    if (to < 1 || to > length(x)) stop('`to` must be >= 1 & <= length(x)')

    x[from:to]

}
47 |
48 | #' @export
49 | #' @rdname grab_index
50 | #' @method grab_index list
grab_index.list <- function(x, from = NULL, to = NULL, ...){
    ## Lists are subset the same way as plain vectors.
    grab_index.default(x = x, from = from, to = to, ...)
}
56 |
57 | #' @export
58 | #' @rdname grab_index
59 | #' @method grab_index data.frame
grab_index.data.frame <- function(x, from = NULL, to = NULL, ...){

    ## NULL bounds default to the full row range (matches the documented
    ## behavior; previously NULL caused a zero-length-condition error).
    if (is.null(from)) from <- 1
    if (is.null(to)) to <- nrow(x)

    ## Bug fix: the bounds are ROW indices, so validate against nrow(x);
    ## length(x) on a data.frame is the number of COLUMNS.
    if (from < 1 || from > nrow(x)) stop('`from` must be >= 1 & <= nrow(x)')
    if (to < 1 || to > nrow(x)) stop('`to` must be >= 1 & <= nrow(x)')

    x[from:to, , drop = FALSE]

}
67 |
68 |
69 | ## Helper function(s)
70 | #' @export
71 | #' @rdname grab_index
72 | #' @method grab_index matrix
grab_index.matrix <- function(x, from = NULL, to = NULL, ...){

    ## NULL bounds default to the full row range (matches the documented
    ## behavior; previously NULL caused a zero-length-condition error).
    if (is.null(from)) from <- 1
    if (is.null(to)) to <- nrow(x)

    ## Bug fix: the bounds are ROW indices, so validate against nrow(x);
    ## length(x) on a matrix is the TOTAL cell count.
    if (from < 1 || from > nrow(x)) stop('`from` must be >= 1 & <= nrow(x)')
    if (to < 1 || to > nrow(x)) stop('`to` must be >= 1 & <= nrow(x)')

    x[from:to, , drop = FALSE]

}
80 |
81 |
--------------------------------------------------------------------------------
/R/grab_match.R:
--------------------------------------------------------------------------------
1 | #' Get Elements Matching Between 2 Points
2 | #'
3 | #' Use regexes to get all the elements between two points.
4 | #'
5 | #' @param x A character vector, \code{\link[base]{data.frame}}, or list.
6 | #' @param from A regex to start getting from (if \code{NULL} defaults to the
7 | #' first element/row).
8 | #' @param to A regex to get up to (if \code{NULL} defaults to the last element/row).
9 | #' @param from.n If more than one element matches \code{from} this dictates
10 | #' which one should be used. Must be an integer up to the number of possible
11 | #' matches, \code{'first'} (equal to \code{1}), \code{'last'} (the last match
12 | #' possible), or \code{'n'} (the same as \code{'last'}).
13 | #' @param to.n If more than one element matches \code{to} this dictates
14 | #' which one should be used. Must be an integer up to the number of possible
15 | #' matches, \code{'first'} (equal to \code{1}), \code{'last'} (the last match
16 | #' possible), or \code{'n'} (the same as \code{'last'}).
17 | #' @param \ldots Other arguments passed to \code{\link[base]{grep}}, most notable
18 | #' is \code{ignore.case}.
19 | #' @param text.var The name of the text variable with matches. If \code{TRUE}
20 | #' \code{grab_match} tries to detect the text column.
21 | #' @return Returns a subset of the original data set.
22 | #' @export
23 | #' @examples
24 | #' grab_match(DATA$state, from = 'dumb', to = 'liar')
25 | #' grab_match(DATA$state, from = 'dumb')
26 | #' grab_match(DATA$state, to = 'liar')
27 | #' grab_match(DATA$state, from = 'no', to = 'the', ignore.case = TRUE)
28 | #' grab_match(DATA$state, from = 'no', to = 'the', ignore.case = TRUE,
29 | #' from.n = 'first', to.n = 'last')
30 | #' grab_match(as.list(DATA$state), from = 'dumb', to = 'liar')
31 | #'
32 | #' ## Data.frame: attempts to find text.var
33 | #' grab_match(DATA, from = 'dumb', to = 'liar')
grab_match <- function(x, from = NULL, to = NULL, from.n = 1, to.n = 1, ...){

    ## S3 generic: subset `x` between the elements/rows whose text matches
    ## the `from` and `to` regexes (see the class-specific methods below).
    UseMethod('grab_match')

}
39 |
40 | #' @export
41 | #' @rdname grab_match
42 | #' @method grab_match character
grab_match.character <- function(x, from = NULL, to = NULL, from.n = 1, to.n = 1, ...){

    ## Bug fix: forward `...` so grep() options such as `ignore.case`
    ## (documented and used in the examples) actually reach the matcher.
    locs <- grab_match_helper(
        x = x, from = from, to = to, from.n = from.n, to.n = to.n, ...
    )

    grab_index(x, locs[['from.ind']], locs[['to.ind']])
}
49 |
50 | #' @export
51 | #' @rdname grab_match
52 | #' @method grab_match list
grab_match.list <- function(x, from = NULL, to = NULL, from.n = 1, to.n = 1, ...){

    ## Bug fix: forward `...` so grep() options such as `ignore.case`
    ## (documented and used in the examples) actually reach the matcher.
    locs <- grab_match_helper(
        x = lapply(x, unlist), from = from, to = to,
        from.n = from.n, to.n = to.n, ...
    )

    grab_index(x, locs[['from.ind']], locs[['to.ind']])
}
59 |
60 | #' @export
61 | #' @rdname grab_match
62 | #' @method grab_match data.frame
grab_match.data.frame <- function(x, from = NULL, to = NULL, from.n = 1, to.n = 1,
    text.var = TRUE, ...){

    ## Resolve (or auto-detect) the text column to match against.
    text.var <- detect_text_column(x, text.var)

    ## Bug fix: forward `...` so grep() options such as `ignore.case`
    ## (documented and used in the examples) actually reach the matcher.
    locs <- grab_match_helper(
        x = x[[text.var]], from = from, to = to,
        from.n = from.n, to.n = to.n, ...
    )

    grab_index(x, locs[['from.ind']], locs[['to.ind']])
}
72 |
73 |
74 |
75 |
76 |
grab_match_helper <- function(x, from, to, from.n, to.n, ...){

    ## `nth` (package helper, defined elsewhere) presumably maps
    ## 'first'/'last'/'n' to an integer or Inf position -- confirm.
    from.n <- nth(from.n)
    to.n <- nth(to.n)

    ## NULL `from` means start at the first element; otherwise pick the
    ## from.n-th regex match (`...` reaches grep via match_to_index).
    if (is.null(from)) {
        fi <- 1
    } else {
        fi <- get_index(from.n, match_to_index(x, from, use = 'from', ...), use = 'from.n')
    }


    ## NULL `to` means run through the last element; otherwise pick the
    ## to.n-th regex match.
    if (is.null(to)) {
        ti <- length(x)
    } else {
        ti <- get_index(to.n, match_to_index(x, to, use = 'to', ...), use = 'to.n')
    }

    ## Named pair of integer bounds consumed by grab_index().
    c(
        from.ind = fi,
        to.ind = ti
    )

}
101 |
102 |
get_index <- function(desired.i, possible.i, use, ...){

    ## 'last'/'n' requests arrive as Inf: take the final match.
    if (is.infinite(desired.i)) {
        return(possible.i[length(possible.i)])
    }

    ## A request beyond the number of matches falls back to the first one.
    if (desired.i > length(possible.i)) {
        warning(sprintf('desired `%s` exceeds number of matches; using first match instead', use))
        return(possible.i[1])
    }

    possible.i[desired.i]
}
114 |
115 |
match_to_index <- function(x, regex, use, ...){

    ## A NULL pattern means "no constraint" upstream.
    if (is.null(regex)) return(NULL)

    ## Positions of all matches; failing to match at all is an error so
    ## callers never silently subset with an empty index.
    hits <- grep(regex, x, perl = TRUE, ...)
    if (!length(hits)) stop(sprintf('`%s` did not have any matches', use))
    hits
}
124 |
125 |
126 |
127 |
128 |
129 |
--------------------------------------------------------------------------------
/R/mtabulate.R:
--------------------------------------------------------------------------------
1 | #' Tabulate Frequency Counts for Multiple Vectors
2 | #'
3 | #' \code{mtabulate} - Similar to \code{\link[base]{tabulate}} that works on
4 | #' multiple vectors.
5 | #'
6 | #' @param vects A \code{\link[base]{vector}}, \code{\link[base]{list}}, or
7 | #' \code{\link[base]{data.frame}} of named/unnamed vectors.
8 | #' @keywords tabulate frequency
9 | #' @export
10 | #' @seealso \code{\link[base]{tabulate}}
11 | #' @return \code{mtabulate} - Returns a \code{\link[base]{data.frame}} with
12 | #' columns equal to number of unique elements and the number of rows equal to
#' the original length of the \code{\link[base]{vector}},
14 | #' \code{\link[base]{list}}, or \code{\link[base]{data.frame}} (length equals
15 | #' number of columns in \code{\link[base]{data.frame}}). If list of vectors is
16 | #' named these will be the rownames of the dataframe.
17 | #' @author Joran Elias and Tyler Rinker .
18 | #' @rdname mtabulate
19 | #' @references \url{https://stackoverflow.com/a/9961324/1000343}
20 | #' @examples
21 | #' mtabulate(list(w=letters[1:10], x=letters[1:5], z=letters))
22 | #' mtabulate(list(mtcars$cyl[1:10]))
23 | #'
24 | #' ## Dummy coding
25 | #' mtabulate(mtcars$cyl[1:10])
26 | #' mtabulate(CO2[, "Plant"])
27 | #'
28 | #' dat <- data.frame(matrix(sample(c("A", "B"), 30, TRUE), ncol=3))
29 | #' mtabulate(dat)
30 | #' as_list(mtabulate(dat))
31 | #' t(mtabulate(dat))
32 | #' as_list(t(mtabulate(dat)))
mtabulate <- function(vects) {

    ## All unique elements across the inputs become the columns.
    lev <- sort(unique(unlist(vects)))

    ## One row of counts per input vector/element.
    counts <- lapply(vects, function(v) {
        tabulate(factor(v, levels = lev, ordered = TRUE), nbins = length(lev))
    })

    dat <- do.call(rbind, counts)
    colnames(dat) <- sort(lev)
    data.frame(dat, check.names = FALSE)

}
43 |
44 |
45 |
46 | #' Tabulate Frequency Counts for Multiple Vectors
47 | #'
48 | #' \code{as_list} - Convert a count matrix to a named list of elements. The
49 | #' semantic inverse of \code{mtabulate}.
50 | #'
51 | #' @param mat A matrix of counts.
52 | #' @param nm A character vector of names to assign to the list.
53 | #' @rdname mtabulate
54 | #' @return \code{as_list} - Returns a list of elements.
55 | #' @export
as_list <- function(mat, nm = rownames(mat)) {

    nms <- colnames(mat)

    ## Rebuild each row's elements by repeating every column name by its
    ## count in that row.
    lst <- lapply(seq_len(nrow(mat)), function(i) rep(nms, mat[i, , drop = FALSE]))

    ## NOTE: lapply() always returns a list, so the legacy guards that
    ## re-coerced a non-list result were unreachable and have been removed.
    stats::setNames(lst, nm = nm)

}
68 |
69 |
--------------------------------------------------------------------------------
/R/split_index.R:
--------------------------------------------------------------------------------
1 | #' Split Data Forms at Specified Indices
2 | #'
3 | #' Split data forms at specified integer indices.
4 | #'
5 | #' @param x A data form (\code{list}, \code{vector}, \code{data.frame},
6 | #' \code{matrix}).
7 | #' @param indices A vector of integer indices to split at. If \code{indices}
8 | #' contains the index 1, it will be silently dropped. The default value when
9 | #' \code{x} evaluates to \code{TRUE} for \code{\link[base]{is.atomic}} is to use
10 | #' \code{\link[textshape]{change_index}(x)}.
11 | #' @param names Optional vector of names to give to the list elements.
12 | #' @param \ldots Ignored.
13 | #' @return Returns of list of data forms broken at the \code{indices}.
14 | #' @note Two dimensional object will retain dimension (i.e., \code{drop = FALSE}
15 | #' is used).
16 | #' @seealso \code{\link[textshape]{change_index}}
17 | #' @export
18 | #' @examples
19 | #' ## character
20 | #' split_index(LETTERS, c(4, 10, 16))
21 | #' split_index(LETTERS, c(4, 10, 16), c("dog", "cat", "chicken", "rabbit"))
22 | #'
23 | #' ## numeric
24 | #' split_index(1:100, c(33, 66))
25 | #'
26 | #' ## factor
27 | #' (p_chng <- change_index(CO2[["Plant"]]))
28 | #' split_index(CO2[["Plant"]], p_chng)
29 | #' #`change_index` was unnecessary as it is the default of atomic vectors
30 | #' split_index(CO2[["Plant"]])
31 | #'
32 | #' ## list
33 | #' split_index(as.list(LETTERS), c(4, 10, 16))
34 | #'
35 | #' ## data.frame
36 | #' (vs_change <- change_index(mtcars[["vs"]]))
37 | #' split_index(mtcars, vs_change)
38 | #'
39 | #' ## matrix
40 | #' (mat <- matrix(1:50, nrow=10))
41 | #' split_index(mat, c(3, 6, 10))
split_index <- function(x,
    indices = if (is.atomic(x)) {NULL} else {change_index(x)}, names = NULL,
    ...) {

    ## Evaluate the `indices` and `names` promises (including the
    ## `change_index(x)` default) here, before S3 dispatch -- presumably
    ## so errors in those expressions surface in the generic; confirm.
    indices
    names
    UseMethod("split_index")

}
51 |
52 | #' @export
53 | #' @method split_index list
54 | #' @rdname split_index
split_index.list <- function(x, indices, names = NULL, ...) {

    ## Drop `names` (with a warning) if its length doesn't fit the pieces.
    names <- name_len_check(indices, names)

    pieces <- split_index_vector(x, indices, ...)
    if (!is.null(names)) names(pieces) <- names
    pieces
}
62 |
63 | #' @export
64 | #' @method split_index data.frame
65 | #' @rdname split_index
split_index.data.frame <- function(x, indices, names = NULL, ...) {

    ## Drop `names` (with a warning) if its length doesn't fit the pieces.
    names <- name_len_check(indices, names)

    pieces <- split_index_mat(x, indices, ...)
    if (!is.null(names)) names(pieces) <- names
    pieces
}
73 |
74 | #' @export
75 | #' @method split_index matrix
76 | #' @rdname split_index
split_index.matrix <- function(x, indices, names = NULL, ...) {

    ## Drop `names` (with a warning) if its length doesn't fit the pieces.
    names <- name_len_check(indices, names)

    pieces <- split_index_mat(x, indices, ...)
    if (!is.null(names)) names(pieces) <- names
    pieces
}
84 |
85 | #' @export
86 | #' @method split_index numeric
87 | #' @rdname split_index
split_index.numeric <- function(x, indices = change_index(x), names = NULL,
    ...) {

    ## Drop `names` (with a warning) if its length doesn't fit the pieces.
    names <- name_len_check(indices, names)

    pieces <- split_index_vector(x, indices, ...)
    if (!is.null(names)) names(pieces) <- names
    pieces
}
96 |
97 | #' @export
98 | #' @method split_index factor
99 | #' @rdname split_index
split_index.factor <- function(x, indices = change_index(x), names = NULL,
    ...) {

    ## Drop `names` (with a warning) if its length doesn't fit the pieces.
    names <- name_len_check(indices, names)

    pieces <- split_index_vector(x, indices, ...)
    if (!is.null(names)) names(pieces) <- names
    pieces
}
108 |
109 | #' @export
110 | #' @method split_index character
111 | #' @rdname split_index
split_index.character <- function(x, indices = change_index(x),
    names = NULL, ...) {

    ## Drop `names` (with a warning) if its length doesn't fit the pieces.
    names <- name_len_check(indices, names)

    pieces <- split_index_vector(x, indices, ...)
    if (!is.null(names)) names(pieces) <- names
    pieces
}
120 |
121 | #' @export
122 | #' @method split_index default
123 | #' @rdname split_index
split_index.default <- function(x, indices = change_index(x),
    names = NULL, ...) {

    ## Drop `names` (with a warning) if its length doesn't fit the pieces.
    names <- name_len_check(indices, names)

    pieces <- split_index_vector(x, indices, ...)
    if (!is.null(names)) names(pieces) <- names
    pieces
}
132 |
133 |
134 |
135 |
split_index_vector <- function(x, indices, ...){
    ## Bug fix: accept (and ignore) `...` -- every split_index method
    ## forwards `...` here, so any extra argument previously raised an
    ## "unused argument" error.

    ## An index of 1 would create an empty leading piece; drop it.
    indices <- indices[!indices %in% "1"]

    ## Each piece runs from one split point up to just before the next.
    starts <- c(1, indices)
    Map(function(s, e) {x[s:e]}, starts, c(indices - 1, length(x)))
}
141 |
142 |
split_index_mat <- function(x, indices, names = NULL, ...) {

    ## An index of 1 would create an empty leading piece; drop it.
    indices <- indices[!indices %in% "1"]

    ## Validate before subsetting.  Guard the empty case: max() of an
    ## empty vector warns and returns -Inf.
    if (length(indices) && nrow(x) < max(indices)) {
        stop(
            "One or more `indices` elements exceeds nrow of `x`",
            call. = FALSE
        )
    }

    ## Pieces are row ranges; drop = FALSE preserves dimensions.
    starts <- c(1, indices)
    Map(function(s, e) {x[s:e, , drop = FALSE]}, starts, c(indices - 1, nrow(x)))

}
158 |
159 |
160 |
name_len_check <- function(indices, names) {

    ## NULL names are always acceptable.
    if (is.null(names)) return(names)

    ## There is one more piece than there are split points.
    check <- length(indices) + 1 == length(names)

    if (!check) {
        ## Bug fix: the old paste() call ended with a trailing comma
        ## (an "argument is empty" runtime error on this path) and the
        ## message misspelled "must" as "muse".
        warning(
            paste(
                "length of `names` must be equal to length",
                "of `indices` + 1; ignoring `names`"
            ),
            call. = FALSE
        )
    }

    if (!check) NULL else names
}
176 |
177 |
178 |
179 |
--------------------------------------------------------------------------------
/R/split_match.R:
--------------------------------------------------------------------------------
1 | #' Split a Vector By Split Points
2 | #'
3 | #' \code{split_match} - Splits a \code{vector} into a list of vectors based on
4 | #' split points.
5 | #'
6 | #' @param x A vector with split points.
7 | #' @param split A vector of places (elements) to split on or a regular
8 | #' expression if \code{regex} argument is \code{TRUE}.
#' @param include An integer of \code{0} (\code{split} character(s) are not
#' included in the output), \code{1} (\code{split} character(s) are included at
#' the beginning of the output), or \code{2} (\code{split} character(s) are
#' included at the end of the output).
13 | #' @param regex logical. If \code{TRUE} regular expressions will be enabled for
14 | #' \code{split} argument.
15 | #' @param \ldots other arguments passed to \code{\link[base]{grep}} and
16 | #' \code{\link[base]{grepl}}.
17 | #' @return Returns a list of vectors.
18 | #' @author Matthew Flickinger and Tyler Rinker .
19 | #' @references \url{https://stackoverflow.com/a/24319217/1000343}
20 | #' @export
21 | #' @rdname split_match
22 | #' @examples
23 | #' set.seed(15)
24 | #' x <- sample(c("", LETTERS[1:10]), 25, TRUE, prob=c(.2, rep(.08, 10)))
25 | #'
26 | #' split_match(x)
27 | #' split_match(x, "C")
28 | #' split_match(x, c("", "C"))
29 | #'
30 | #' split_match(x, include = 0)
31 | #' split_match(x, include = 1)
32 | #' split_match(x, include = 2)
33 | #'
34 | #' set.seed(15)
35 | #' x <- sample(1:11, 25, TRUE, prob=c(.2, rep(.08, 10)))
36 | #' split_match(x, 1)
split_match <- function(x, split = "", include = FALSE, regex = FALSE, ...) {

    include <- as.numeric(include)

    if (length(include) != 1 || !include %in% 0:2) {
        stop("Supply 0, 1, or 2 to `include`")
    }

    if (include == 2) {

        ## Split points close out their group (kept at the end).
        locs <- if (regex) grep(split, x, ...) else which(x %in% split)

        begins <- c(1, locs + 1)
        ends <- c(locs, length(x))

        lapply(Map(":", begins, ends), function(i) x[i])

    } else {

        ## Split points either open a new group (include == 1) or are
        ## dropped entirely (include == 0).
        breaks <- if (regex) grepl(split, x, ...) else x %in% split

        keep <- if (include == 1) rep(TRUE, length(breaks)) else !breaks

        groups <- split(x[keep], cumsum(breaks)[keep])
        names(groups) <- seq_along(groups)
        groups
    }
}
76 |
77 |
78 | #' Split a Vector By Split Points
79 | #'
80 | #' \code{split_match_regex} - \code{split_match} with \code{regex = TRUE}.
81 | #'
82 | #' @export
83 | #' @rdname split_match
split_match_regex <- function(x, split = "", include = FALSE, ...){
    ## Convenience wrapper: split_match() with regex matching enabled.
    split_match(x = x, split = split, include = include, regex = TRUE, ...)
}
87 |
88 |
89 |
90 |
--------------------------------------------------------------------------------
/R/split_match_regex_to_transcript.R:
--------------------------------------------------------------------------------
1 | #' #' Split Text by Regex Into a Transcript
2 | #' #'
3 | #' #' A wrapper for \code{\link[textshape]{split_match_regex}} and
4 | #' #' \pkg{textreadr}'s \code{as_transript} to detect person variable, split the
5 | #' #' text into turns of talk, and convert to a data.frame with \code{person} and
6 | #' #' \code{dialogue} variables. There is a bit of cleansing that is closer to
7 | #' #' \code{as_transript} than \code{\link[textshape]{split_transcript}}.
8 | #' #'
9 | #' #' @param x A vector with split points.
10 | #' #' @param person.regex A vector of places (elements) to split on or a regular
11 | #' #' expression if \code{regex} argument is \code{TRUE}.
12 | #' #' @param col.names A character vector specifying the column names of the
13 | #' #' transcript columns.
14 | #' #' @param dash A character string to replace the en and em dashes special
15 | #' #' characters (default is to remove).
16 | #' #' @param ellipsis A character string to replace the ellipsis special
17 | #' #' characters.
18 | #' #' @param quote2bracket logical. If \code{TRUE} replaces curly quotes with
19 | #' #' curly braces (default is \code{FALSE}). If \code{FALSE} curly quotes are
20 | #' #' removed.
21 | #' #' @param rm.empty.rows logical. If \code{TRUE}
22 | #' #' \code{\link[textreadr]{read_transcript}} attempts to remove empty rows.
23 | #' #' @param skip Integer; the number of lines of the data file to skip before
24 | #' #' beginning to read data.
25 | #' #' @param \ldots ignored.
26 | #' #' @return Returns a data.frame of dialogue and people.
27 | #' #' @export
28 | #' split_match_regex_to_transcript <- function (x, person.regex = "^[A-Z]{3,}",
29 | #' col.names = c("Person", "Dialogue"), dash = "", ellipsis = "...",
30 | #' quote2bracket = FALSE, rm.empty.rows = TRUE, skip = 0, ...) {
31 | #'
32 | #' text2transcript(
33 | #' combine_list(
34 | #' split_match(x, split = person.regex, include = TRUE, regex = TRUE)
35 | #' ),
36 | #' person.regex = person.regex,
37 | #' col.names = col.names,
38 | #' dash = dash,
39 | #' ellipsis = ellipsis,
40 | #' quote2bracket = quote2bracket,
41 | #' rm.empty.rows = rm.empty.rows,
42 | #' skip = skip,
43 | #' ...
44 | #' )
45 | #'
46 | #' }
47 | #'
48 | #'
49 | #' combine_list <- function (x, fix.punctuation = TRUE, ...) {
50 | #'
51 | #' if (!is.list(x)) x <- list(x)
52 | #' x <- unlist(lapply(x, paste, collapse = " "))
53 | #' if (isTRUE(fix.punctuation)) {
54 | #' x <- gsub("(\\s+(?=[,.?!;:%-]))|((?<=[$-])\\s+)", "", x, perl = TRUE)
55 | #' }
56 | #' unname(x)
57 | #'
58 | #' }
59 | #'
60 | #'
61 | #'
62 | #' text2transcript <- function(text, person.regex = NULL,
63 | #' col.names = c("Person", "Dialogue"), text.var = NULL,
64 | #' merge.broke.tot = TRUE, header = FALSE, dash = "", ellipsis = "...",
65 | #' quote2bracket = FALSE, rm.empty.rows = TRUE, na = "", skip = 0, ...) {
66 | #'
67 | #' sep <- ":"
68 | #' text <- unlist(strsplit(text, "\n"))
69 | #' text <- paste(
70 | #' gsub(
71 | #' paste0('(', person.regex, ')'),
72 | #' "\\1SEP_PLACE_HOLDER",
73 | #' text,
74 | #' perl = TRUE
75 | #' ),
76 | #' collapse = "\n"
77 | #' )
78 | #'
79 | #' text <- gsub(":", "SYMBOL_PLACE_HOLDER", text)
80 | #' text <- gsub("SEP_PLACE_HOLDER", ":", text, fixed = TRUE)
81 | #'
82 | #' ## Use read.table to split read the text as a table
83 | #' x <- utils::read.table(
84 | #' text=text,
85 | #' header = header,
86 | #' sep = sep,
87 | #' skip=skip,
88 | #' quote = ""
89 | #' )
90 | #'
91 | #' x[[2]] <- gsub("SYMBOL_PLACE_HOLDER", ":", x[[2]], fixed = TRUE)
92 | #'
93 | #' if (!is.null(text.var) & !is.numeric(text.var)) {
94 | #' text.var <- which(colnames(x) == text.var)
95 | #' } else {
96 | #' text.col <- function(dataframe) {
97 | #' dial <- function(x) {
98 | #' if(is.factor(x) | is.character(x)) {
99 | #' n <- max(nchar(as.character(x)), na.rm = TRUE)
100 | #' } else {
101 | #' n <- NA
102 | #' }
103 | #' }
104 | #' which.max(unlist(lapply(dataframe, dial)))
105 | #' }
106 | #' text.var <- text.col(x)
107 | #' }
108 | #'
109 | #' x[[text.var]] <- trimws(
110 | #' iconv(as.character(x[[text.var]]), "", "ASCII", "byte")
111 | #' )
112 | #'
113 | #' if (is.logical(quote2bracket)) {
114 | #' if (quote2bracket) {
115 | #' rbrac <- "}"
116 | #' lbrac <- "{"
117 | #' } else {
118 | #' lbrac <- rbrac <- ""
119 | #' }
120 | #' } else {
121 | #' rbrac <- quote2bracket[2]
122 | #' lbrac <- quote2bracket[1]
123 | #' }
124 | #'
125 | #' ser <- c("<80><9c>", "<80><9d>", "<80><98>", "<80><99>",
126 | #' "<80><9b>", "<87>", "<80>", "<80><93>",
127 | #' "<80><94>", "", "", "")
128 | #'
129 | #' reps <- c(lbrac, rbrac, "'", "'", "'", "'", ellipsis, dash, dash, "a", "e",
130 | #' "half")
131 | #'
132 | #' Encoding(x[[text.var]]) <-"latin1"
133 | #' x[[text.var]] <- clean(.mgsub(ser, reps, x[[text.var]]))
134 | #' if(rm.empty.rows) {
135 | #' x <- rm_empty_row(x)
136 | #' }
137 | #' if (!is.null(col.names)) {
138 | #' colnames(x) <- col.names
139 | #' }
140 | #'
141 | #' x <- as.data.frame(x, stringsAsFactors = FALSE)
142 | #'
143 | #' if (merge.broke.tot) {
144 | #' x <- combine_tot(x)
145 | #' }
146 | #' x <- rm_na_row(x, rm.empty.rows)
147 | #' class(x) <- c("textreadr", "data.frame")
148 | #' x
149 | #' }
150 | #'
151 | #' clean <- function (text.var) {
152 | #' gsub("\\s+", " ", gsub("\\\\r|\\\\n|\\n|\\\\t", " ", text.var))
153 | #' }
154 | #'
155 | #' rm_na_row <- function(x, remove = TRUE) {
156 | #' if (!remove) return(x)
157 | #' x[rowSums(is.na(x)) != ncol(x), ]
158 | #' }
159 | #'
160 | #' rm_empty_row <- function(dataframe) {
161 | #' x <- paste2(dataframe, sep = "")
162 | #' x <- gsub("\\s+", "", x)
163 | #' ind <- x != ""
164 | #' return(dataframe[ind, , drop = FALSE])
165 | #' }
166 | #'
167 | #' #Helper function used in read.transcript
168 | #' #' @importFrom data.table :=
169 | #' combine_tot <- function(x){
170 | #' person <- NULL
171 | #' nms <- colnames(x)
172 | #' colnames(x) <- c('person', 'z')
173 | #' x <- data.table::data.table(x)
174 | #'
175 | #' exp <- parse(text='list(text = paste(z, collapse = " "))')[[1]]
176 | #' out <- x[, eval(exp),
177 | #' by = list(person, 'new' = data.table::rleid(person))][,
178 | #' 'new' := NULL][]
179 | #' data.table::setnames(out, nms)
180 | #' out
181 | #' }
182 | #'
183 |
--------------------------------------------------------------------------------
/R/split_portion.R:
--------------------------------------------------------------------------------
1 | #' Break Text Into Ordered Word Chunks
2 | #'
3 | #' Some visualizations and algorithms require text to be broken into chunks of
4 | #' ordered words. \code{split_portion} breaks text, optionally by grouping
5 | #' variables, into equal chunks. The chunk size can be specified by giving
6 | #' number of words to be in each chunk or the number of chunks.
7 | #'
8 | #' @param text.var The text variable
9 | #' @param grouping.var The grouping variables. Default \code{NULL} generates
10 | #' one word list for all text. Also takes a single grouping variable or a list
11 | #' of 1 or more grouping variables.
12 | #' @param n.words An integer specifying the number of words in each chunk (must
13 | #' specify n.chunks or n.words).
14 | #' @param n.chunks An integer specifying the number of chunks (must specify
15 | #' n.chunks or n.words).
16 | #' @param as.string logical. If \code{TRUE} the chunks are returned as a single
17 | #' string. If \code{FALSE} the chunks are returned as a vector of single words.
18 | #' @param rm.unequal logical. If \code{TRUE} final chunks that are unequal in
19 | #' length to the other chunks are removed.
20 | #' @param as.table logical. If \code{TRUE} the list output is coerced to
21 | #' \code{\link[data.table]{data.table}} or \pkg{tibble}.
22 | #' @param \ldots Ignored.
23 | #' @return Returns a list or \code{\link[data.table]{data.table}} of text chunks.
24 | #' @keywords chunks group text
25 | #' @export
26 | #' @examples
27 | #' with(DATA, split_portion(state, n.chunks = 10))
28 | #' with(DATA, split_portion(state, n.words = 10))
29 | #' with(DATA, split_portion(state, n.chunks = 10, as.string=FALSE))
30 | #' with(DATA, split_portion(state, n.chunks = 10, rm.unequal=TRUE))
31 | #' with(DATA, split_portion(state, person, n.chunks = 10))
32 | #' with(DATA, split_portion(state, list(sex, adult), n.words = 10))
33 | #' with(DATA, split_portion(state, person, n.words = 10, rm.unequal=TRUE))
34 | #'
35 | #' ## Bigger data
36 | #' with(hamlet, split_portion(dialogue, person, n.chunks = 10))
37 | #' with(hamlet, split_portion(dialogue, list(act, scene, person), n.chunks = 10))
38 | #' with(hamlet, split_portion(dialogue, person, n.words = 300))
39 | #' with(hamlet, split_portion(dialogue, list(act, scene, person), n.words = 300))
split_portion <- function(text.var, grouping.var = NULL, n.words, n.chunks,
    as.string = TRUE, rm.unequal = FALSE, as.table = TRUE, ...){

    ## Exactly one of the two sizing arguments must be supplied.
    if (missing(n.chunks) && missing(n.words)) {
        stop("Must supply either `n.chunks` or `n.words`")
    }

    ## Build G, the '&'-joined label(s) for the grouping column name(s),
    ## via non-standard evaluation of the `grouping.var` expression.
    if(is.null(grouping.var)) {
        G <- "all"
        ilen <- 1
    } else {
        if (is.list(grouping.var)) {
            m <- unlist(as.character(substitute(grouping.var))[-1])
            ## Keep only the final `$` component (e.g. DATA$person -> person).
            ## `sapply2` is a package helper defined elsewhere.
            m <- sapply2(strsplit(m, "$", fixed=TRUE), function(x) {
                    x[length(x)]
                }
            )
            ilen <- length(grouping.var)
            G <- paste(m, collapse="&")
        } else {
            G <- as.character(substitute(grouping.var))
            ilen <- length(G)
            G <- G[length(G)]
        }
    }
    ## Build the actual grouping vector that pairs with `text.var`;
    ## `paste2` (package helper) presumably pastes parallel vectors
    ## together for multi-variable grouping.
    if(is.null(grouping.var)){
        grouping <- rep("all", length(text.var))
    } else {
        if (is.list(grouping.var) & length(grouping.var)>1) {
            grouping <- paste2(grouping.var)
        } else {
            grouping <- unlist(grouping.var)
        }
    }

    ## split into ordered words by grouping variable
    dat <- lapply(split(as.character(text.var), grouping), function(x) {
        unlist(stringi::stri_split_regex(x, "\\s+"))
    })


    if (!missing(n.chunks)){

        ## Check that n.chunks is integer
        if (!is.Integer(n.chunks)){
            stop("`n.chunks` must be an integer")
        }
        ## Fixed NUMBER of chunks per group.
        out <- lapply(
            dat,
            split_portion_help_groups,
            N = n.chunks,
            ub = as.string,
            rme = rm.unequal
        )

    } else {

        ## Check that n.words is integer
        if (!is.Integer(n.words)){
            stop("`n.words` must be an integer")
        }
        ## Fixed number of WORDS per chunk.
        out <- lapply(
            dat,
            split_portion_help_words,
            N = n.words,
            ub = as.string,
            rme = rm.unequal
        )
    }

    ## Recover the grouping columns plus a chunk index from the dotted
    ## names that unlist() produced (group.index).
    grpvar <- stats::setNames(
        as.data.frame(
            do.call(
                rbind,
                strsplit(names(unlist(out)), "\\.")
            ),
            stringsAsFactors = FALSE
        ), c(strsplit(G, "&")[[1]], "index")
    )

    ## Only collapse to a table when the chunks are single strings.
    if (isTRUE(as.table) & isTRUE(as.string)){
        out <- data.frame(
            grpvar,
            text.var = unlist(out),
            stringsAsFactors = FALSE,
            row.names=NULL
        )
        data.table::setDT(out)
        out <- out
    }
    out
}
132 |
133 |
split_portion_help_groups <- function(x, N, ub, rme){

    ## Assign each word to one of N equal-sized chunks; any remainder words
    ## form an extra, smaller (N + 1)-th chunk.
    n_words <- length(x)
    chunk_size <- floor(n_words / N)

    bins <- rep(seq_len(N), each = chunk_size)
    n_left <- n_words - N * chunk_size
    if (n_left > 0) {
        bins <- c(bins, rep(N + 1, n_left))
    }

    chunks <- suppressWarnings(split(x, bins))

    ## Optionally drop a trailing chunk whose size differs from the chunk
    ## before it; a single chunk is always returned as-is.
    if (rme && length(chunks) > 1) {
        last_two <- utils::tail(lengths(chunks), 2)
        if (last_two[[1]] != last_two[[2]]) {
            chunks <- chunks[-length(chunks)]
        }
    }

    ## Optionally collapse each chunk back into one space-joined string.
    if (ub) chunks <- lapply(chunks, paste, collapse = " ")
    chunks
}
158 |
split_portion_help_words <- function(x, N, ub, rme){

    ## Chunk the word vector into pieces of N words; the final piece may
    ## contain fewer than N words.
    n_chunks <- ceiling(length(x) / N)
    ids <- rep(seq_len(n_chunks), each = N)
    chunks <- suppressWarnings(split(x, ids))

    if (rme) {
        n <- length(chunks)
        ## a lone chunk is discarded entirely when unequal chunks are removed
        if (n == 1) return(NULL)
        ## drop a short trailing chunk
        if (length(chunks[[n]]) != N) chunks <- chunks[seq_len(n - 1)]
    }

    ## Optionally collapse each chunk back into one space-joined string.
    if (ub) chunks <- lapply(chunks, paste, collapse = " ")
    chunks
}
172 |
173 |
174 |
175 |
--------------------------------------------------------------------------------
/R/split_run.R:
--------------------------------------------------------------------------------
1 | #' Split Runs
2 | #'
3 | #' Split runs of consecutive characters.
4 | #'
5 | #' @param x A \code{\link[base]{data.frame}} or character vector with runs.
6 | #' @param text.var The name of the text variable with runs. If \code{TRUE}
7 | #' \code{split_word} tries to detect the text column with runs.
8 | #' @param \ldots Ignored.
9 | #' @export
10 | #' @rdname split_run
11 | #' @importFrom data.table .N :=
12 | #' @return Returns a list of vectors of runs or an expanded
13 | #' \code{\link[data.table]{data.table}} with runs split apart.
14 | #' @examples
15 | #' x1 <- c(
16 | #' "122333444455555666666",
17 | #' NA,
18 | #' "abbcccddddeeeeeffffff",
19 | #' "sddfg",
20 | #' "11112222333"
21 | #' )
22 | #'
23 | #' x <- c(rep(x1, 2), ">>???,,,,....::::;[[")
24 | #'
25 | #' split_run(x)
26 | #'
27 | #'
28 | #' DATA[["run.col"]] <- x
29 | #' split_run(DATA, "run.col")
split_run <- function(x, ...) {
    ## S3 generic; dispatches on the class of `x` — see `split_run.default`
    ## (character vector) and `split_run.data.frame`.
    UseMethod("split_run")
}
33 |
34 | #' @export
35 | #' @rdname split_run
36 | #' @method split_run default
split_run.default <- function(x, ...) {
    ## The lookbehind captures the previous non-space character and the
    ## negative lookahead breaks the string wherever the next character
    ## differs, producing one element per run of identical characters.
    run_boundary <- "(?<=([\\S^]))(?!\\1)"
    strsplit(x, run_boundary, perl = TRUE)
}
40 |
41 | #' @export
42 | #' @rdname split_run
43 | #' @method split_run data.frame
split_run.data.frame <- function(x, text.var = TRUE, ...) {

    ## Expand a data.frame so each run of repeated characters in the text
    ## column gets its own row; adds `element_id` (original row number) and
    ## `sentence_id` (run position within that row).
    element_id <- NULL   # visible binding for data.table NSE (R CMD check)
    nms <- colnames(x)
    z <- data.table::data.table(data.frame(x, stringsAsFactors = FALSE))

    ## resolve/auto-detect the name of the text column
    text.var <- detect_text_column(x, text.var)

    z[, element_id := 1:.N]
    ## build `<text.var> := list(split_run.default(<text.var>))` dynamically
    express1 <- parse(
        text=paste0(text.var, " := list(split_run.default(", text.var, "))")
    )
    z[, eval(express1)]

    ## unnest the list column back into one row per run
    express2 <- parse(text=paste0(".(", text.var, "=unlist(", text.var, "))"))
    z <- z[, eval(express2), by = c(colnames(z)[!colnames(z) %in% text.var])][,
        c(nms, "element_id"), with = FALSE]
    z[, 'sentence_id' := 1:.N, by = list(element_id)][]

}
64 |
--------------------------------------------------------------------------------
/R/split_sentence.R:
--------------------------------------------------------------------------------
1 | #' Split Sentences
2 | #'
3 | #' Split sentences.
4 | #'
5 | #' @param x A \code{\link[base]{data.frame}} or character vector with sentences.
6 | #' @param text.var The name of the text variable. If \code{TRUE}
7 | #' \code{split_sentence} tries to detect the column with sentences.
8 | #' @param \ldots Ignored.
9 | #' @export
10 | #' @rdname split_sentence
11 | #' @importFrom data.table .N :=
#' @return Returns a list of vectors of sentences or an expanded
13 | #' \code{\link[base]{data.frame}} with sentences split apart.
14 | #' @examples
15 | #' (x <- c(paste0(
16 | #' "Mr. Brown comes! He says hello. i give him coffee. i will ",
17 | #' "go at 5 p. m. eastern time. Or somewhere in between!go there"
18 | #' ),
19 | #' paste0(
20 | #' "Marvin K. Mooney Will You Please Go Now!", "The time has come.",
21 | #' "The time has come. The time is now. Just go. Go. GO!",
22 | #' "I don't care how."
23 | #' )))
24 | #' split_sentence(x)
25 | #'
26 | #' data(DATA)
27 | #' split_sentence(DATA)
28 | #'
29 | #' \dontrun{
30 | #' ## Kevin S. Dias' sentence boundary disambiguation test set
31 | #' data(golden_rules)
32 | #' library(magrittr)
33 | #'
34 | #' golden_rules %$%
35 | #' split_sentence(Text)
36 | #' }
split_sentence <- function(x, ...) {
    ## S3 generic; dispatches on the class of `x` — see
    ## `split_sentence.default` (character) and `split_sentence.data.frame`.
    UseMethod("split_sentence")
}
40 |
41 | #' @export
42 | #' @rdname split_sentence
43 | #' @method split_sentence default
split_sentence.default <- function(x, ...) {
    ## delegate to the internal sentence-boundary splitter
    get_sents2(x)
}
47 |
48 | #' @export
49 | #' @rdname split_sentence
50 | #' @method split_sentence data.frame
split_sentence.data.frame <- function(x, text.var = TRUE, ...) {

    ## Expand a data.frame so each sentence in the text column gets its own
    ## row; adds `element_id` (original row number) and `sentence_id`
    ## (sentence position within that row).
    element_id <- NULL   # visible binding for data.table NSE (R CMD check)
    nms <- colnames(x)
    z <- data.table::data.table(data.frame(x, stringsAsFactors = FALSE))

    ## resolve/auto-detect the name of the text column
    text.var <- detect_text_column(x, text.var)

    z[, element_id := 1:.N]
    ## build `<text.var> := list(get_sents2(<text.var>))` dynamically
    express1 <- parse(
        text=paste0(text.var, " := list(get_sents2(", text.var, "))")
    )
    z[, eval(express1)]

    ## unnest the list column back into one row per sentence
    express2 <- parse(text=paste0(".(", text.var, "=unlist(", text.var, "))"))
    z <- z[, eval(express2), by = c(colnames(z)[!colnames(z) %in% text.var])][,
        c(nms, "element_id"), with = FALSE]
    z[, 'sentence_id' := 1:.N, by = list(element_id)][]

}
71 |
72 |
73 |
74 |
75 |
76 | #get_sents <- function(x) {
77 | # x <- stringi::stri_replace_all_regex(stringi::stri_trans_tolower(x), sent_regex, "")
78 | # stringi::stri_split_regex(x, "(?>>$3<<>>'
154 | )
155 | y <- stringi::stri_replace_all_regex(y, sent_regex, "<<>>")
156 | y <- stringi::stri_replace_all_regex(
157 | y,
158 | '(\\b[Nn]o)(\\.)(\\s+\\d)',
159 | '$1<<>>$3'
160 | )
161 | y <- stringi::stri_replace_all_regex(
162 | y,
163 | '(\\b\\d+\\s+in)(\\.)(\\s[a-z])',
164 | '$1<<>>$3'
165 | )
166 | y <- stringi::stri_replace_all_regex(
167 | y,
168 | '([?.!]+)([\'])([^,])',
169 | '<<>>$1 $3'
170 | )
171 | y <- stringi::stri_replace_all_regex(
172 | y,
173 | '([?.!]+)(["])([^,])',
174 | '<<>>$1 $3'
175 | )
## middle name handling
177 | y <- stringi::stri_replace_all_regex(y,
178 | '(\\b[A-Z][a-z]+\\s[A-Z])(\\.)(\\s[A-Z][a-z]+\\b)',
179 | '$1<<>>$3'
180 | )
181 |
182 | #2 middle names
183 | y <- stringi::stri_replace_all_regex(y,
184 | '(\\b[A-Z][a-z]+\\s[A-Z])(\\.)(\\s[A-Z])(\\.)(\\s[A-Z][a-z]+\\b)',
185 | '$1<<>>$3<<>>$5'
186 | )
187 | y <- stringi::stri_split_regex(
188 | y,
189 | paste0(
190 | "((?>>", ".")
203 | y <- stringi::stri_replace_all_regex(y, "(<<>>)([?.!]+)", "$2\"")
204 | y <- stringi::stri_replace_all_regex(y, "(<<>>)([?.!]+)", "$2'")
205 |
206 | split_index(y, locs)
207 | }
208 |
get_sentences2 <- function(x, ...) {
    ## Split into sentences via get_sents2(), restore periods that were
    ## protected with a placeholder marker during splitting, then strip
    ## leading/trailing whitespace from each sentence.
    lapply(
        lapply(
            get_sents2(trimws(x)),
            function(x) {
                ## swap the placeholder back to a literal period
                gsub("<<>>", ".", x)
            }
        ),
        function(x) {
            gsub("^\\s+|\\s+$", "", x)
        }
    )
}
222 |
223 |
224 |
--------------------------------------------------------------------------------
/R/split_sentence_token.R:
--------------------------------------------------------------------------------
1 | #' Split Sentences & Tokens
2 | #'
3 | #' Split sentences and tokens.
4 | #'
5 | #' @param x A \code{\link[base]{data.frame}} or character vector with sentences.
6 | #' @param text.var The name of the text variable. If \code{TRUE}
7 | #' \code{split_sentence_token} tries to detect the column with sentences.
8 | #' @param lower logical. If \code{TRUE} the words are converted to lower case.
9 | #' @param \ldots Ignored.
10 | #' @export
11 | #' @rdname split_sentence_token
12 | #' @importFrom data.table .N :=
#' @return Returns a list of vectors of sentences or an expanded
14 | #' \code{\link[base]{data.frame}} with sentences split apart.
15 | #' @examples
16 | #' (x <- c(paste0(
17 | #' "Mr. Brown comes! He says hello. i give him coffee. i will ",
18 | #' "go at 5 p. m. eastern time. Or somewhere in between!go there"
19 | #' ),
20 | #' paste0(
21 | #' "Marvin K. Mooney Will You Please Go Now!", "The time has come.",
22 | #' "The time has come. The time is now. Just go. Go. GO!",
23 | #' "I don't care how."
24 | #' )))
25 | #' split_sentence_token(x)
26 | #'
27 | #' data(DATA)
28 | #' split_sentence_token(DATA)
29 | #'
30 | #' \dontrun{
31 | #' ## Kevin S. Dias' sentence boundary disambiguation test set
32 | #' data(golden_rules)
33 | #' library(magrittr)
34 | #'
35 | #' golden_rules %$%
36 | #' split_sentence_token(Text)
37 | #' }
split_sentence_token <- function(x, ...) {
    ## S3 generic; dispatches on the class of `x` — see
    ## `split_sentence_token.default` and `split_sentence_token.data.frame`.
    UseMethod("split_sentence_token")
}
41 |
42 | #' @export
43 | #' @rdname split_sentence_token
44 | #' @method split_sentence_token default
split_sentence_token.default <- function(x, lower = TRUE, ...) {

    ## Wrap the vector in a one-column data.frame and reuse the data.frame
    ## method.  Bug fix: `lower` was previously dropped rather than forwarded,
    ## so `split_sentence_token(x, lower = FALSE)` still lower-cased tokens.
    split_sentence_token.data.frame(
        data.frame(text = x, stringsAsFactors = FALSE),
        lower = lower,
        ...
    )
}
51 |
52 | #' @export
53 | #' @rdname split_sentence_token
54 | #' @method split_sentence_token data.frame
split_sentence_token.data.frame <- function(x, text.var = TRUE,
    lower = TRUE, ...) {

    ## Split the text column into sentences first, then tokenize each
    ## sentence; adds a `token_id` counter within each original element.
    element_id <- NULL   # visible binding for data.table NSE (R CMD check)
    z <- split_sentence(x, text.var = text.var, ...)

    nms <- colnames(z)

    ## resolve/auto-detect the name of the text column
    text.var <- detect_text_column(z, text.var)

    ## build `<text.var> := list(split_token.default(<text.var>, lower = ...))`
    express1 <- parse(
        text=paste0(
            text.var,
            " := list(split_token.default(",
            text.var,
            ", lower = ",
            lower,
            "))"
        )
    )

    z[, eval(express1)]

    ## unnest the list column back into one row per token
    express2 <- parse(text=paste0(".(", text.var, "=unlist(", text.var, "))"))
    z <- z[, eval(express2), by = c(colnames(z)[!colnames(z) %in% text.var])][,
        c(nms), with = FALSE]
    z[, 'token_id' := 1:.N, by = list(element_id)][]

}
84 |
85 |
86 |
87 |
88 |
89 |
90 |
--------------------------------------------------------------------------------
/R/split_speaker.R:
--------------------------------------------------------------------------------
1 | #' Break and Stretch if Multiple Persons per Cell
2 | #'
3 | #' Look for cells with multiple people and create separate rows for each person.
4 | #'
5 | #' @param dataframe A dataframe that contains the person variable.
6 | #' @param speaker.var The person variable to be stretched.
7 | #' @param sep The separator(s) to search for and break on. Default is:
8 | #' c("and", "&", ",")
9 | #' @param \ldots Ignored.
10 | #' @return Returns an expanded dataframe with person variable stretched and
11 | #' accompanying rows repeated.
12 | #' @export
13 | #' @examples
14 | #' \dontrun{
15 | #' DATA$person <- as.character(DATA$person)
16 | #' DATA$person[c(1, 4, 6)] <- c("greg, sally, & sam",
17 | #' "greg, sally", "sam and sally")
18 | #'
19 | #' split_speaker(DATA)
20 | #'
21 | #' DATA$person[c(1, 4, 6)] <- c("greg_sally_sam",
22 | #' "greg.sally", "sam; sally")
23 | #'
24 | #' split_speaker(DATA, sep = c(".", "_", ";"))
25 | #'
26 | #' DATA <- textshape::DATA #reset DATA
27 | #' }
split_speaker <- function (dataframe, speaker.var = 1, sep = c("and", "&", ","),
    ...){

    ## Stretch rows whose speaker cell lists several people into one row per
    ## person; adds `element_id` (original row number) and `split_id` (person
    ## position within that row).
    element_id <- NULL   # visible binding for data.table NSE (R CMD check)
    nms <- colnames(dataframe)
    ## resolve the speaker column to its name (accepts an index or a name)
    speaker.var <- colnames(dataframe[,speaker.var, drop=FALSE])
    z <- data.table::data.table(data.frame(dataframe, stringsAsFactors = FALSE))

    z[, element_id := 1:.N]
    ## build `<speaker.var> := list(splittify(<speaker.var>, c(<seps>)))`
    express1 <- parse(text=
        paste0(
            speaker.var,
            " := list(splittify(",
            speaker.var,
            ", c(",
            paste(paste0("\"", sep, "\""), collapse=", "),
            ")))"
        )
    )

    z[, eval(express1)]

    ## unnest the list column back into one row per speaker
    express2 <- parse(
        text=paste0(".(", speaker.var, "=unlist(", speaker.var, "))")
    )

    z <- z[,
        eval(express2),
        by = c(colnames(z)[!colnames(z) %in% speaker.var])
    ][,
        c(nms, "element_id"), with = FALSE]
    z[, 'split_id' := 1:.N, by = list(element_id)][]

}
62 |
63 |
splittify <- function(x, y) {

    ## Escape regex metacharacters in the separators, then break each cell on
    ## any separator, dropping blank fragments and surrounding whitespace.
    y <- .mgsub(esc, paste0('\\', esc), y, perl = FALSE)
    pattern <- paste(y, collapse = "|")

    lapply(x, function(cell) {
        pieces <- strsplit(as.character(cell), pattern)[[1]]
        kept <- grep("^\\s*$", pieces, value = TRUE, invert = TRUE)
        trimws(kept)
    })
}
78 |
79 | esc <- c(".", "|", "(", ")", "[", "{", "^", "$", "*", "+", "?")
80 |
--------------------------------------------------------------------------------
/R/split_token.R:
--------------------------------------------------------------------------------
1 | #' Split Tokens
2 | #'
3 | #' Split tokens.
4 | #'
5 | #' @param x A \code{\link[base]{data.frame}} or character vector with tokens.
6 | #' @param text.var The name of the text variable. If \code{TRUE}
7 | #' \code{split_token} tries to detect the text column with tokens.
8 | #' @param lower logical. If \code{TRUE} the words are converted to lower case.
9 | #' @param \ldots Ignored.
10 | #' @export
11 | #' @rdname split_token
12 | #' @importFrom data.table .N :=
13 | #' @return Returns a list of vectors of tokens or an expanded
14 | #' \code{\link[data.table]{data.table}} with tokens split apart.
15 | #' @examples
16 | #' (x <- c(
17 | #' "Mr. Brown comes! He says hello. i give him coffee.",
18 | #' "I'll go at 5 p. m. eastern time. Or somewhere in between!",
19 | #' "go there"
20 | #' ))
21 | #' split_token(x)
22 | #' split_token(x, lower=FALSE)
23 | #'
24 | #' data(DATA)
25 | #' split_token(DATA)
26 | #' split_token(DATA, lower=FALSE)
27 | #'
28 | #' ## Larger data set
29 | #' split_token(hamlet)
split_token <- function(x, ...) {
    ## S3 generic; dispatches on the class of `x` — see `split_token.default`
    ## (character) and `split_token.data.frame`.
    UseMethod("split_token")
}
33 |
34 | #' @export
35 | #' @rdname split_token
36 | #' @method split_token default
split_token.default <- function(x, lower = TRUE, ...) {

    ## optionally normalize case before tokenizing
    if (lower) {
        x <- stringi::stri_trans_tolower(x)
    }

    ## detach single quotes that wrap a whole word so they become tokens
    quoted_word <- "(^|\\s)(')(\\w.+?)(')(\\s|$)"
    x <- stringi::stri_replace_all_regex(x, quoted_word, "$1$2 $3 $4$5")

    ## pad every punctuation mark except apostrophes with spaces ...
    non_apostrophe_punct <- "([^'[:^punct:]])"
    x <- stringi::stri_replace_all_regex(x, non_apostrophe_punct, " $1 ")

    ## ... then split on runs of whitespace
    stringi::stri_split_regex(trimws(x), "\\s+")
}
46 |
47 |
48 | #' @export
49 | #' @rdname split_token
50 | #' @method split_token data.frame
split_token.data.frame <- function(x, text.var = TRUE, lower = TRUE, ...) {

    ## Expand a data.frame so each token in the text column gets its own row;
    ## adds `element_id` (original row number) and `token_id` (token position
    ## within that row).
    element_id <- NULL   # visible binding for data.table NSE (R CMD check)
    nms <- colnames(x)
    z <- data.table::data.table(data.frame(x, stringsAsFactors = FALSE))

    ## resolve/auto-detect the name of the text column
    text.var <- detect_text_column(x, text.var)

    z[, element_id := 1:.N]
    ## build `<text.var> := list(split_token.default(<text.var>, lower = ...))`
    express1 <- parse(
        text=paste0(
            text.var,
            " := list(split_token.default(",
            text.var,
            ", lower = ",
            lower,
            "))"
        )
    )
    z[, eval(express1)]

    ## unnest the list column back into one row per token
    express2 <- parse(text=paste0(".(", text.var, "=unlist(", text.var, "))"))
    z <- z[, eval(express2), by = c(colnames(z)[!colnames(z) %in% text.var])][,
        c(nms, "element_id"), with = FALSE]
    z[, 'token_id' := 1:.N, by = list(element_id)][]

}
78 |
--------------------------------------------------------------------------------
/R/split_transcript.R:
--------------------------------------------------------------------------------
1 | #' Split a Transcript Style Vector on Delimiter & Coerce to Dataframe
2 | #'
3 | #' Split a transcript style vector (e.g., \code{c("greg: Who me", "sarah: yes you!")}
4 | #' into a name and dialogue vector that is coerced to a \code{\link[data.table]{data.table}}.
5 | #' Leading/trailing white space in the columns is stripped out.
6 | #'
7 | #' @param x A transcript style vector (e.g., \code{c("greg: Who me", "sarah: yes you!")}.
8 | #' @param delim The delimiter to split on.
9 | #' @param colnames The column names to use for the \code{\link[data.table]{data.table}}
10 | #' output.
11 | #' @param max.delim An integer stating how many characters may come before a
12 | #' delimiter is found. This is useful for the case when a colon is the delimiter
13 | #' but time stamps are also found in the text.
14 | #' @param \ldots Ignored.
15 | #' @return Returns a 2 column \code{\link[data.table]{data.table}}.
16 | #' @export
17 | #' @examples
18 | #' split_transcript(c("greg: Who me", "sarah: yes you!"))
19 | #'
20 | #' \dontrun{
21 | #' ## 2015 Vice-Presidential Debates Example
22 | #' if (!require("pacman")) install.packages("pacman")
23 | #' pacman::p_load(rvest, magrittr, xml2)
24 | #'
25 | #' debates <- c(
26 | #' wisconsin = "110908",
27 | #' boulder = "110906",
28 | #' california = "110756",
29 | #' ohio = "110489"
30 | #' )
31 | #'
32 | #' lapply(debates, function(x){
33 | #' xml2::read_html(paste0("http://www.presidency.ucsb.edu/ws/index.php?pid=", x)) %>%
34 | #' rvest::html_nodes("p") %>%
35 | #' rvest::html_text() %>%
36 | #' textshape::split_index(grep("^[A-Z]+:", .)) %>%
37 | #' textshape::combine() %>%
38 | #' textshape::split_transcript() %>%
39 | #' textshape::split_sentence()
40 | #' })
41 | #' }
split_transcript <- function(x, delim = ":", colnames = c("person", "dialogue"),
    max.delim = 15, ...){

    ## visible bindings for data.table NSE (R CMD check)
    V1 <- V2 <- NULL

    ## Replace a single delimiter per element with a sentinel string so that
    ## later delimiters (e.g. colons inside time stamps) stay in the dialogue.
    if (!is.null(max.delim)) {
        ## only match a delimiter occurring within the first `max.delim`
        ## characters of the element
        x <- gsub(
            paste0('(^[^',delim, ']{0,', max.delim, '})([', delim, '])'),
            "\\1textshapesplithere",
            x,
            perl = TRUE
        )
    } else {
        ## no limit: split on the first delimiter wherever it appears
        x <- sub(delim, "textshapesplithere", x)
    }

    ## split on the sentinel, bind into 2 columns, and strip surrounding
    ## whitespace from both columns
    dat <- data.table::data.table(
        do.call(rbind, strsplit(x , "textshapesplithere"))
    )[,
        'V1' := trimws(V1)][,
        'V2' := trimws(V2)][]

    data.table::setnames(dat, c("V1", "V2"), colnames)
    dat
}
67 |
--------------------------------------------------------------------------------
/R/split_word.R:
--------------------------------------------------------------------------------
1 | #' Split Words
2 | #'
3 | #' Split words.
4 | #'
5 | #' @param x A \code{\link[base]{data.frame}} or character vector with words.
6 | #' @param text.var The name of the text variable. If \code{TRUE}
7 | #' \code{split_word} tries to detect the text column with words.
8 | #' @param lower logical. If \code{TRUE} the words are converted to lower case.
9 | #' @param \ldots Ignored.
10 | #' @export
11 | #' @rdname split_word
12 | #' @importFrom data.table .N :=
13 | #' @return Returns a list of vectors of words or an expanded
14 | #' \code{\link[data.table]{data.table}} with words split apart.
15 | #' @examples
16 | #' (x <- c(
17 | #' "Mr. Brown comes! He says hello. i give him coffee.",
18 | #' "I'll go at 5 p. m. eastern time. Or somewhere in between!",
19 | #' "go there"
20 | #' ))
21 | #' split_word(x)
22 | #' split_word(x, lower=FALSE)
23 | #'
24 | #' data(DATA)
25 | #' split_word(DATA)
26 | #' split_word(DATA, lower=FALSE)
27 | #'
28 | #' ## Larger data set
29 | #' split_word(hamlet)
split_word <- function(x, ...) {
    ## S3 generic; dispatches on the class of `x` — see `split_word.default`
    ## (character) and `split_word.data.frame`.
    UseMethod("split_word")
}
33 |
34 | #' @export
35 | #' @rdname split_word
36 | #' @method split_word default
split_word.default <- function(x, lower = TRUE, ...) {
    ## Extract word tokens from each element of `x`, optionally lower-casing
    ## first; returns a list with one character vector per input element.
    txt <- if (lower) stringi::stri_trans_tolower(x) else x
    stringi::stri_extract_all_words(txt)
}
43 |
44 | #' @export
45 | #' @rdname split_word
46 | #' @method split_word data.frame
split_word.data.frame <- function(x, text.var = TRUE, lower = TRUE, ...) {

    ## Expand a data.frame so each word in the text column gets its own row;
    ## adds `element_id` (original row number) and `word_id` (word position
    ## within that row).
    element_id <- NULL   # visible binding for data.table NSE (R CMD check)
    nms <- colnames(x)
    z <- data.table::data.table(data.frame(x, stringsAsFactors = FALSE))

    ## resolve/auto-detect the name of the text column
    text.var <- detect_text_column(x, text.var)

    z[, element_id := 1:.N]
    ## build `<text.var> := list(split_word.default(<text.var>, lower = ...))`
    express1 <- parse(
        text=paste0(
            text.var,
            " := list(split_word.default(",
            text.var,
            ", lower = ",
            lower,
            "))"
        )
    )
    z[, eval(express1)]

    ## unnest the list column back into one row per word
    express2 <- parse(text=paste0(".(", text.var, "=unlist(", text.var, "))"))
    z <- z[, eval(express2), by = c(colnames(z)[!colnames(z) %in% text.var])][,
        c(nms, "element_id"), with = FALSE]
    z[, 'word_id' := 1:.N, by = list(element_id)][]

}
74 |
--------------------------------------------------------------------------------
/R/textshape-package.R:
--------------------------------------------------------------------------------
1 | #' Tools for Reshaping Text
2 | #'
3 | #' Tools that can be used to reshape and restructure text data.
4 | #' @docType package
5 | #' @name textshape
6 | #' @aliases textshape package-textshape
7 | NULL
8 |
9 | #' Fictitious Classroom Dialogue
10 | #'
11 | #' A fictitious dataset useful for small demonstrations.
12 | #'
13 | #' @details
14 | #' \itemize{
15 | #' \item person. Speaker
16 | #' \item sex. Gender
17 | #' \item adult. Dummy coded adult (0-no; 1-yes)
18 | #' \item state. Statement (dialogue)
19 | #' \item code. Dialogue coding scheme
20 | #' }
21 | #'
22 | #' @docType data
23 | #' @keywords datasets
24 | #' @name DATA
25 | #' @usage data(DATA)
26 | #' @format A data frame with 11 rows and 5 variables
27 | NULL
28 |
29 | #' Hamlet (Complete & Split by Sentence)
30 | #'
31 | #' A dataset containing the complete dialogue of Hamlet with turns of talk split
32 | #' into sentences.
33 | #'
34 | #' @details
35 | #' \itemize{
36 | #' \item act. The act (akin to repeated measures)
37 | #' \item tot. The turn of talk
38 | #' \item scene. The scene (nested within an act)
39 | #' \item location. Location of the scene
40 | #' \item person. Character in the play
41 | #' \item died. Logical coded death variable if yes the character dies in the
42 | #' play
43 | #' \item dialogue. The spoken dialogue
44 | #' }
45 | #'
46 | #' @docType data
47 | #' @keywords datasets
48 | #' @name hamlet
49 | #' @usage data(hamlet)
50 | #' @format A data frame with 2007 rows and 7 variables
51 | #' @references
52 | #' http://www.gutenberg.org
53 | NULL
54 |
55 |
56 |
57 | #' Simple \code{\link[tm]{DocumentTermMatrix}}
58 | #'
59 | #' A dataset containing a simple \code{\link[tm]{DocumentTermMatrix}}.
60 | #'
61 | #' @details
62 | #' \describe{
63 | #' \item{i}{The document locations}
64 | #' \item{j}{The term locations}
65 | #' \item{v}{The count of terms for that particular element position}
66 | #' \item{nrow}{The number of rows}
67 | #' \item{ncol}{The number of columns}
68 | #' \item{dimnames}{document and terms}
69 | #' }
70 | #'
71 | #' @docType data
72 | #' @keywords datasets
73 | #' @name simple_dtm
74 | #' @usage data(simple_dtm)
75 | #' @format A list with 6 elements
76 | NULL
77 |
78 |
79 | #' Sentence Boundary Disambiguation Edge Cases
80 | #'
81 | #' A slightly filtered dataset containing Dias's sentence boundary
82 | #' disambiguation edge cases. This is a nested data set with the outcome
83 | #' column as a nested list of desired splits. The non-ASCII cases and spaced
84 | #' ellipsis examples have been removed.
85 | #'
86 | #' @details
87 | #' \itemize{
88 | #' \item Rule. The name of the rule to test
89 | #' \item Text. The testing text
90 | #' \item Outcome. The desired outcome of the sentence disambiguation
91 | #' }
92 | #'
93 | #' @docType data
94 | #' @keywords datasets
95 | #' @name golden_rules
96 | #' @usage data(golden_rules)
97 | #' @format A data frame with 45 rows and 3 variables
98 | #' @references Dias, Kevin S. 2015. Golden Rules (English).
99 | #' Retrieved: https://s3.amazonaws.com/tm-town-nlp-resources/golden_rules.txt
100 | NULL
101 |
--------------------------------------------------------------------------------
/R/tidy_colo_dtm.R:
--------------------------------------------------------------------------------
1 | #' Convert a
2 | #' \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}
3 | #' into Collocating Words in Tidy Form
4 | #'
5 | #' Converts non-zero elements of a
6 | #' \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}
7 | #' into a tidy data set made of collocating words.
8 | #'
9 | #' @param x A
10 | #' \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}.
11 | #' @param \ldots Ignored.
12 | #' @return Returns a tidied data.frame.
13 | #' @rdname tidy_colo_dtm
14 | #' @export
15 | #' @seealso \code{\link[textshape]{unique_pairs}}
16 | #' @examples
17 | #' data(simple_dtm)
18 | #'
19 | #' tidied <- tidy_colo_dtm(simple_dtm)
20 | #' tidied
21 | #' unique_pairs(tidied)
22 | #'
23 | #' \dontrun{
24 | #' if (!require("pacman")) install.packages("pacman")
25 | #' pacman::p_load_current_gh('trinker/gofastr', 'trinker/lexicon')
26 | #' pacman::p_load(tidyverse, magrittr, ggstance)
27 | #'
28 | #' my_dtm <- with(
29 | #' presidential_debates_2012,
30 | #' q_dtm(dialogue, paste(time, tot, sep = "_"))
31 | #' )
32 | #'
33 | #' tidy_colo_dtm(my_dtm) %>%
34 | #' tbl_df() %>%
35 | #' filter(!term_1 %in% c('i', lexicon::sw_onix) &
36 | #' !term_2 %in% lexicon::sw_onix
37 | #' ) %>%
38 | #' filter(term_1 != term_2) %>%
39 | #' unique_pairs() %>%
40 | #' filter(n > 15) %>%
41 | #' complete(term_1, term_2, fill = list(n = 0)) %>%
42 | #' ggplot(aes(x = term_1, y = term_2, fill = n)) +
43 | #' geom_tile() +
44 | #' scale_fill_gradient(low= 'white', high = 'red') +
45 | #' theme(axis.text.x = element_text(angle = 45, hjust = 1))
46 | #' }
tidy_colo_tdm <- function(x, ...){

    ## Term-by-term collocation counts from a TermDocumentMatrix: terms are
    ## rows, so tcrossprod (x %*% t(x)) collocates terms across documents.
    term_1 <- NULL  # visible binding for data.table NSE (R CMD check)

    colo <- slam::as.simple_triplet_matrix(
        slam::tcrossprod_simple_triplet_matrix(x, y = NULL)
    )

    terms <- colo[['dimnames']][['Terms']]
    out <- data.table::data.table(
        term_1 = terms[colo[['i']]],
        term_2 = terms[colo[['j']]],
        n = colo[['v']]
    )
    out[order(term_1), ]
}
61 |
62 |
63 |
64 | #' @rdname tidy_colo_dtm
65 | #' @export
tidy_colo_dtm <- function(x, ...){

    ## Term-by-term collocation counts from a DocumentTermMatrix: terms are
    ## columns, so crossprod (t(x) %*% x) collocates terms across documents.
    term_1 <- NULL  # visible binding for data.table NSE (R CMD check)

    colo <- slam::as.simple_triplet_matrix(
        slam::crossprod_simple_triplet_matrix(x, y = NULL)
    )

    terms <- colo[['dimnames']][['Terms']]
    out <- data.table::data.table(
        term_1 = terms[colo[['i']]],
        term_2 = terms[colo[['j']]],
        n = colo[['v']]
    )
    out[order(term_1), ]
}
80 |
81 |
82 |
--------------------------------------------------------------------------------
/R/tidy_dtm.R:
--------------------------------------------------------------------------------
1 | #' Convert a
2 | #' \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}
3 | #' into Tidy Form
4 | #'
5 | #' Converts non-zero elements of a
6 | #' \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}
7 | #' into a tidy data set.
8 | #'
9 | #'
10 | #' @param x A
11 | #' \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}.
12 | #' @param \ldots ignored.
13 | #' @return Returns a tidied data.frame.
14 | #' @rdname tidy_dtm
15 | #' @include utils.R
16 | #' @export
17 | #' @examples
18 | #' data(simple_dtm)
19 | #'
20 | #' tidy_dtm(simple_dtm)
21 | #'
22 | #' \dontrun{
23 | #' if (!require("pacman")) install.packages("pacman")
24 | #' pacman::p_load_current_gh('trinker/gofastr')
25 | #' pacman::p_load(tidyverse, magrittr, ggstance)
26 | #'
27 | #' my_dtm <- with(
28 | #' presidential_debates_2012,
29 | #' q_dtm(dialogue, paste(time, tot, sep = "_"))
30 | #' )
31 | #'
32 | #' tidy_dtm(my_dtm) %>%
33 | #' tidyr::extract(
34 | #' col = doc,
35 | #' into = c("time", "turn", "sentence"),
36 | #' regex = "(\\d)_(\\d+)\\.(\\d+)"
37 | #' ) %>%
38 | #' mutate(
39 | #' time = as.numeric(time),
40 | #' turn = as.numeric(turn),
41 | #' sentence = as.numeric(sentence)
42 | #' ) %>%
43 | #' tbl_df() %T>%
44 | #' print() %>%
45 | #' group_by(time, term) %>%
46 | #' summarize(n = sum(n)) %>%
47 | #' group_by(time) %>%
48 | #' arrange(desc(n)) %>%
49 | #' slice(1:10) %>%
50 | #' ungroup() %>%
51 | #' mutate(
52 | #' term = factor(paste(term, time, sep = "__"),
53 | #' levels = rev(paste(term, time, sep = "__")))
54 | #' ) %>%
55 | #' ggplot(aes(x = n, y = term)) +
56 | #' geom_barh(stat='identity') +
57 | #' facet_wrap(~time, ncol=2, scales = 'free_y') +
58 | #' scale_y_discrete(labels = function(x) gsub("__.+$", "", x))
59 | #' }
tidy_dtm <- function(x, ...){

    ## One row per non-zero cell of the DocumentTermMatrix: doc, term, count,
    ## plus the original i/j triplet positions.
    doc <- NULL  # visible binding for data.table NSE (R CMD check)

    ## coerce doc names to integer when all numeric so that ordering is
    ## numeric rather than lexicographic
    docfun <- function(docs) {
        if (is_numeric_doc_names(x)) as.integer(docs) else docs
    }

    out <- data.table::data.table(
        doc = x[['dimnames']][['Docs']][x[['i']]],
        term = x[['dimnames']][['Terms']][x[['j']]],
        n = x[['v']],
        i = x[['i']],
        j = x[['j']]
    )
    out[, doc := docfun(doc)][order(doc), ][]
}
76 |
77 |
78 |
79 | #' @rdname tidy_dtm
80 | #' @export
tidy_tdm <- function(x, ...){

    ## One row per non-zero cell of the TermDocumentMatrix; i/j are swapped
    ## relative to tidy_dtm so that `doc` always maps to documents.
    doc <- NULL  # visible binding for data.table NSE (R CMD check)

    ## coerce doc names to integer when all numeric so that ordering is
    ## numeric rather than lexicographic
    docfun <- function(docs) {
        if (is_numeric_doc_names(x)) as.integer(docs) else docs
    }

    out <- data.table::data.table(
        doc = x[['dimnames']][['Docs']][x[['j']]],
        term = x[['dimnames']][['Terms']][x[['i']]],
        n = x[['v']],
        i = x[['j']],
        j = x[['i']]
    )
    out[, doc := docfun(doc)][order(doc), ][]
}
97 |
--------------------------------------------------------------------------------
/R/tidy_list.R:
--------------------------------------------------------------------------------
1 | #' Tidy a List of Named Dataframes or Named Vectors or Vectors
2 | #'
3 | #' \code{\link[base]{rbind}} a named \code{\link[base]{list}} of
4 | #' \code{\link[base]{data.frame}}s or \code{\link[base]{vector}}s to
5 | #' output a single \code{\link[base]{data.frame}} with the
6 | #' \code{\link[base]{names}} from the \code{\link[base]{list}} as an \code{id}
7 | #' column.
8 | #'
9 | #' @param x A named \code{\link[base]{list}} of
10 | #' \code{\link[base]{data.frame}}s or \code{\link[base]{vector}}.
11 | #' @param id.name The name to use for the column created from the
12 | #' \code{\link[base]{list}}.
13 | #' @param content.name The name to use for the column created from the
14 | #' \code{\link[base]{list}} of \code{\link[base]{vector}}s (only used if
15 | #' \code{x} is \code{\link[base]{vector}}).
16 | #' @param content.attribute.name The name to use for the column created from the
17 | #' \code{\link[base]{list}} of names given to the \code{\link[base]{vector}}s
18 | #' (only used if \code{x} is named \code{\link[base]{vector}}).
19 | #' @param \ldots Ignored.
20 | #' @return Returns a \code{\link[data.table]{data.table}} with the
21 | #' \code{\link[base]{names}} from the \code{\link[base]{list}} as an \code{id}
22 | #' column.
23 | #' @export
24 | #' @examples
25 | #' tidy_list(list(p=1:500, r=letters))
26 | #' tidy_list(list(p=mtcars, r=mtcars, z=mtcars, d=mtcars))
27 | #'
28 | #' x <- list(
29 | #' a = setNames(c(1:4), LETTERS[1:4]),
30 | #' b = setNames(c(7:9), LETTERS[7:9]),
31 | #' c = setNames(c(10:15), LETTERS[10:15]),
32 | #' d = c(x=4, y=6, 4),
33 | #' e = setNames(1:10, sample(state.abb, 10, TRUE)),
34 | #' f = setNames(1:10, sample(month.abb, 10, TRUE))
35 | #' )
36 | #'
37 | #' tidy_list(x)
38 | #'
39 | #' \dontrun{
40 | #' ## 2015 Vice-Presidential Debates Example
41 | #' if (!require("pacman")) install.packages("pacman")
42 | #' pacman::p_load(rvest, magrittr, xml2)
43 | #'
44 | #' debates <- c(
45 | #' wisconsin = "110908",
46 | #' boulder = "110906",
47 | #' california = "110756",
48 | #' ohio = "110489"
49 | #' )
50 | #'
51 | #' lapply(debates, function(x){
52 | #' paste0("http://www.presidency.ucsb.edu/ws/index.php?pid=", x) %>%
53 | #' xml2::read_html() %>%
54 | #' rvest::html_nodes("p") %>%
55 | #' rvest::html_text() %>%
56 | #' textshape::split_index(grep("^[A-Z]+:", .)) %>%
57 | #' textshape::combine() %>%
58 | #' textshape::split_transcript() %>%
59 | #' textshape::split_sentence()
60 | #' }) %>%
61 | #' textshape::tidy_list("location")
62 | #' }
tidy_list <- function(x, id.name = "id", content.name = "content",
    content.attribute.name = 'attribute', ...){

    ## Fail fast on input the dispatch below cannot handle (e.g. an empty
    ## list, where `x[[1]]` would error with an unhelpful message).
    if (!is.list(x) || length(x) == 0) {
        stop("`x` must be a non-empty list of `data.frame`s or atomic `vector`s")
    }

    ## Dispatch on the type of the first element: a list of data.frames is
    ## row bound; a list of atomic vectors is stacked with an id column.
    if (is.data.frame(x[[1]])){

        tidy_list_df(x = x, id.name = id.name)

    } else if (is.atomic(x[[1]])){

        tidy_list_vector(x = x, id.name = id.name,
            content.name = content.name,
            content.attribute.name = content.attribute.name
        )

    } else {
        stop("`x` must be a list of `data.frame`s or atomic `vector`s")
    }
}
82 |
83 |
84 |
tidy_list_df <- function (x, id.name = "id"){
    ## Unnamed lists get positional names so the id column is still usable.
    if (is.null(names(x))) {
        names(x) <- seq_along(x)
    }
    ## The id column: each list name repeated once per row of its data.frame.
    list.names <- rep(names(x), sapply2(x, nrow))
    x <- lapply(x, data.table::as.data.table)
    ## Append `fill = TRUE` so do.call passes it to data.table's rbind,
    ## allowing data.frames with mismatched columns to bind (missing cells
    ## become NA).  Using c() rather than `x[['fill']] <- TRUE` avoids
    ## clobbering a legitimate list element that happens to be named "fill".
    out <- data.frame(list.names, do.call(rbind, c(x, list(fill = TRUE))),
        row.names = NULL, check.names = FALSE, stringsAsFactors = FALSE)
    colnames(out)[1] <- id.name
    data.table::data.table(out)
}
97 |
tidy_list_vector <- function(x, id.name = "id",
    content.attribute.name = 'attribute', content.name = "content"){

    ## Unnamed lists get positional names for the id column.
    if (is.null(names(x))) {
        names(x) <- seq_along(x)
    }

    ## TRUE only when every element of `x` carries its own names.
    every.element.named <- !any(sapply2(x, function(y) is.null(names(y))))

    if (every.element.named) {

        ## Tidy each named vector individually (names -> attribute column,
        ## values -> content column), then row bind with the list names as id.
        tidied <- lapply(x, tidy_vector, content.attribute.name, content.name)
        tidy_list(tidied, id.name)

    } else {

        ## At least one element is unnamed: stack values only, with the list
        ## names repeated once per element as the id column.
        out <- data.frame(
            rep(names(x), sapply2(x, length)),
            unlist(x, use.names = FALSE),
            stringsAsFactors = FALSE,
            check.names = FALSE,
            row.names = NULL
        )
        colnames(out) <- c(id.name, content.name)
        data.table::data.table(out)

    }
}
126 |
127 |
128 |
129 |
--------------------------------------------------------------------------------
/R/tidy_matrix.R:
--------------------------------------------------------------------------------
1 | #' Convert a Matrix into Tidy Form
2 | #'
3 | #' \code{tidy_matrix} - Converts matrices into a tidy data set. Essentially, a
4 | #' stacking of the matrix columns and repeating row/column names as necessary.
5 | #'
6 | #' @param x A matrix.
7 | #' @param row.name A string to use for the row names that are now a column.
8 | #' @param col.name A string to use for the column names that are now a column.
9 | #' @param value.name A string to use for the values that are now a column.
10 | #' @param \ldots ignored.
11 | #' @return Returns a tidied \code{data.frame}.
12 | #' @export
13 | #' @rdname tidy_matrix
14 | #' @examples
15 | #' mat <- matrix(1:16, nrow = 4,
16 | #' dimnames = list(LETTERS[1:4], LETTERS[23:26])
17 | #' )
18 | #'
19 | #' mat
20 | #' tidy_matrix(mat)
21 | #'
22 | #'
23 | #' data(simple_dtm)
24 | #' tidy_matrix(as.matrix(simple_dtm), 'doc', 'term', 'n')
25 | #'
26 | #' X <- as.matrix(simple_dtm[1:10, 1:10])
27 | #' tidy_adjacency_matrix(crossprod(X))
28 | #' tidy_adjacency_matrix(crossprod(t(X)))
tidy_matrix <- function(x, row.name = 'row', col.name = 'col',
    value.name = 'value', ...){

    stopifnot(is.matrix(x))
    ## Without both dimnames the id columns cannot be built: rep(NULL, n)
    ## yields NULL and data.table() would silently drop those columns,
    ## producing a malformed result.  Fail with a clear message instead.
    if (is.null(rownames(x)) || is.null(colnames(x))) {
        stop("`x` must have both rownames and colnames", call. = FALSE)
    }

    ## Stack column-wise: c(x) unrolls the matrix by column, so row names
    ## cycle fastest and each column name repeats nrow(x) times.
    out <- data.table::data.table(x = rep(rownames(x), ncol(x)),
        y = rep(colnames(x), each = nrow(x)),
        z = c(x))
    data.table::setnames(out, c(row.name, col.name, value.name))

    out

}
42 |
43 | #' Convert a Matrix into Tidy Form
44 | #'
45 | #' \code{tidy_adjacency_matrix} - A wrapper for \code{tidy_matrix} with the
46 | #' \code{row.name}, \code{col.name}, & \code{value.name} all set to
47 | #' \code{"from"},\code{"to"}, & \code{"n"}, assuming preparation for network
48 | #' analysis.
49 | #'
50 | #' @export
51 | #' @rdname tidy_matrix
## Convenience wrapper around `tidy_matrix` that fixes the output column
## names to "from"/"to"/"n" -- the conventional edge-list headers expected by
## network analysis tools.
tidy_adjacency_matrix <- function(x, ...){

    ## Any extra arguments are forwarded to tidy_matrix untouched.
    tidy_matrix(x, row.name = 'from', col.name = 'to', value.name = 'n', ...)

}
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/R/tidy_table.R:
--------------------------------------------------------------------------------
1 | #' Tidy a Table: Bind Its Values with Its Names
2 | #'
3 | #' \code{\link[base]{cbind}} a \code{\link[base]{table}}'s values with its
4 | #' \code{\link[base]{names}} to form \code{id} (from the names) and
5 | #' \code{content} columns.
6 | #'
7 | #' @param x A \code{\link[base]{table}}.
8 | #' @param id.name The name to use for the column created from the \code{\link[base]{table}}
9 | #' \code{\link[base]{names}}.
10 | #' @param content.name The name to use for the column created from the \code{\link[base]{table}}
11 | #' values.
12 | #' @param \ldots ignored.
13 | #' @return Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}}
14 | #' from the \code{\link[base]{table}} as an \code{id} column.
15 | #' @export
16 | #' @examples
17 | #' x <- table(sample(LETTERS[1:6], 1000, TRUE))
18 | #' tidy_table(x)
tidy_table <- function(x, id.name= "id", content.name = "content", ...){

    stopifnot(is.table(x))

    ## Separate the table's labels from its counts; unname/c strips the
    ## table class and dimensions so plain vectors go into the columns.
    labels <- names(x)
    counts <- unname(c(x))

    out <- data.table::data.table(x = labels, y = counts)
    data.table::setnames(out, c(id.name, content.name))
    out

}
27 |
28 |
--------------------------------------------------------------------------------
/R/tidy_vector.R:
--------------------------------------------------------------------------------
1 | #' Tidy a Named Atomic Vector: Bind Its Values with Its Names
2 | #'
3 | #' \code{\link[base]{cbind}} a named atomic \code{\link[base]{vector}}'s values
4 | #' with its \code{\link[base]{names}} to form \code{id} (from the names) and
5 | #' \code{content} columns.
6 | #'
7 | #' @param x A named atomic \code{\link[base]{vector}}.
8 | #' @param id.name The name to use for the column created from the \code{\link[base]{vector}}
9 | #' \code{\link[base]{names}}.
10 | #' @param content.name The name to use for the column created from the \code{\link[base]{vector}}
11 | #' values.
12 | #' @param \ldots ignored.
13 | #' @return Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}}
14 | #' from the \code{\link[base]{vector}} as an \code{id} column.
15 | #' @export
16 | #' @examples
17 | #' x <- setNames(sample(LETTERS[1:6], 1000, TRUE), sample(state.name[1:5], 1000, TRUE))
18 | #' tidy_vector(x)
tidy_vector <- function(x, id.name= "id", content.name = "content", ...){

    stopifnot(is.atomic(x))
    ## Bug fix: the original tested `is.null(names)` -- `names` is the base
    ## function (never NULL) -- so this branch was unreachable and an unnamed
    ## vector fell through to the two-column branch, where setnames() errored.
    ## Testing `names(x)` makes unnamed input return a single-column table.
    if (is.null(names(x))) {
        out <- data.table::as.data.table(x)
        data.table::setnames(out, id.name)
    } else {
        out <- data.table::data.table(x = names(x), y = unname(x))
        data.table::setnames(out, c(id.name, content.name))
    }
    out
}
31 |
32 |
33 |
--------------------------------------------------------------------------------
/R/unique_pairs.R:
--------------------------------------------------------------------------------
1 | #' Extract Only Unique Pairs of Collocating Words in
2 | #' \code{\link[textshape]{tidy_colo_dtm}}
3 | #'
4 | #' \code{\link[textshape]{tidy_colo_dtm}} utilizes the entire matrix to generate
5 | #' the tidied data.frame. This means that the upper and lower triangles are
6 | #' used redundantly. This function eliminates this redundancy by dropping one
7 | #' set of the pairs from a tidied data.frame.
8 | #'
9 | #' @param x A \code{\link[base]{data.frame}} with two columns that contain
10 | #' redundant pairs.
11 | #' @param col1 A string naming column 1.
12 | #' @param col2 A string naming column 2.
13 | #' @param \ldots ignored.
14 | #' @return Returns a filtered \code{\link[base]{data.frame}}.
15 | #' @export
16 | #' @seealso \code{\link[textshape]{tidy_colo_dtm}}
17 | #' @examples
18 | #' dat <- data.frame(
19 | #' term_1 = LETTERS[1:10],
20 | #' term_2 = LETTERS[10:1],
21 | #' stringsAsFactors = FALSE
22 | #' )
23 | #'
24 | #' unique_pairs(dat)
## S3 generic: drop rows whose (col1, col2) pair duplicates an earlier row's
## pair in either order.  See `unique_pairs.default` / `unique_pairs.data.table`.
unique_pairs <- function(x, col1 = 'term_1', col2 = 'term_2', ...) {

    UseMethod('unique_pairs')
}
29 |
30 | #' @export
31 | #' @rdname unique_pairs
32 | #' @method unique_pairs default
unique_pairs.default <- function(x, col1 = 'term_1', col2 = 'term_2', ...) {

    ## Build a canonical key per row: sorting within the pair makes
    ## (a, b) and (b, a) identical, so the later occurrence is flagged
    ## as a duplicate and filtered out.
    pair.keys <- apply(
        data.table::data.table(x[, c(col1, col2)]),
        1,
        sorter
    )
    x[!duplicated(pair.keys), ]
}
41 |
42 | #' @export
43 | #' @rdname unique_pairs
44 | #' @method unique_pairs data.table
unique_pairs.data.table <- function(x, col1 = 'term_1', col2 = 'term_2', ...) {

    ## Same canonical-key strategy as the default method; data.table needs
    ## `with = FALSE` so the character vector selects columns by name.
    pair.keys <- apply(
        data.table::data.table(x[, c(col1, col2), with = FALSE]),
        1,
        sorter
    )
    x[!duplicated(pair.keys), ]
}
53 |
## Order-insensitive key for a pair: c("b", "a") and c("a", "b") both
## collapse to "ab".
sorter <- function(x) {
    paste0(sort(x), collapse = "")
}
55 |
56 |
57 |
58 |
--------------------------------------------------------------------------------
/R/unnest_text.R:
--------------------------------------------------------------------------------
1 | #' Un-nest Nested Text Columns
2 | #'
3 | #' Un-nest nested text columns in a data.frame. Attempts to locate the nested
4 | #' text column without specifying.
5 | #'
6 | #' @param dataframe A dataframe object.
7 | #' @param column Column name to search for markers/terms.
8 | #' @param integer.rownames logical. If \code{TRUE} then the rownames are
9 | #' numbered 1 through number of rows, otherwise the original row number is
10 | #' retained followed by a period and the element number from the list.
11 | #' @param \ldots ignored.
12 | #' @return Returns an un-nested data.frame.
13 | #' @export
14 | #' @examples
15 | #' dat <- DATA
16 | #'
17 | #' ## Add a nested/list text column
18 | #' dat$split <- lapply(dat$state, function(x) {
19 | #' unlist(strsplit(x, '(?<=[?!.])\\s+', perl = TRUE))
20 | #' })
21 | #'
22 | #' unnest_text(dat)
23 | #' unnest_text(dat, integer.rownames = FALSE)
24 | #'
25 | #' ## Add a second nested integer column
26 | #' dat$d <- lapply(dat$split, nchar)
27 | #' \dontrun{
28 | #' unnest_text(dat) # causes error, must supply column explicitly
29 | #' }
30 | #' unnest_text(dat, 'split')
31 | #'
32 | #' ## As a data.table
33 | #' library(data.table)
34 | #' dt_dat <- data.table::as.data.table(data.table::copy(dat))
35 | #' unnest_text(dt_dat, 'split')
36 | #' \dontrun{
37 | #' unnest_text(dt_dat, 'd')
38 | #' }
39 | #'
40 | #' \dontrun{
41 | #' ## As a tibble
42 | #' library(tibble)
43 | #' t_dat <- tibble:::as_tibble(dat)
44 | #' unnest_text(t_dat, 'split')
45 | #' }
unnest_text <- function(dataframe, column, integer.rownames = TRUE, ...){

    ## Auto-detect the nested column when `column` is not supplied: the
    ## non-atomic (list) columns are the candidates.
    if (missing(column)) {

        column <- names(dataframe)[!unlist(lapply(as.data.frame(dataframe),
            is.atomic))]
        ## No list column at all -> nothing to un-nest.
        if (length(column) == 0) {
            stop(paste(
                "There appears to be no nested columns.",
                "Please supply `column` explicitly."
            ), call. = FALSE)
        }

        ## More than one list column -> ambiguous; require an explicit choice.
        if (length(column) > 1) {
            stop(paste(
                "There appears to be multiple nested columns.",
                "Please supply `column` explicitly."
            ), call. = FALSE)
        }
        message(sprintf('Nested column detected, un-nesting: %s', column))

    }

    ## NOTE(review): `nms` is assigned but never used below.
    nms <- colnames(dataframe)

    ## Per-row element counts drive how many times each row is repeated.
    lens <- lengths(dataframe[[column]])
    col <- unlist(dataframe[[column]])

    ## Un-nesting here is intended for text; warn (but proceed) for other
    ## types, pointing at tidyr::unnest as the better tool.
    if (!is.character(col)) {
        warning(
            sprintf(
                paste0(
                    'Un-nesting: `%s`\nThis is not a character column.\n\n',
                    'Perhaps you want to use `tidyr::unnest` instead?'
                ),
                column
            ),
            call. = FALSE
        )
    }

    ## Blank the list column first so the row replication below copies a
    ## scalar NA rather than duplicating list elements.
    dataframe[[column]] <- NA

    ## Repeat each row once per element of its (former) nested value.
    dataframe <- dataframe[rep(seq_len(nrow(dataframe)), lens),]

    ## Drop the flattened values back in, one element per repeated row.
    dataframe[[column]] <- col
    if (isTRUE(integer.rownames)) {
        rownames(dataframe) <- NULL
    } else {
        ## Preserve "<original row>.<element>" rownames.  Base R's row
        ## replication yields "5", "5.1", "5.2", ... -- normalize the first
        ## copy to ".0" and then shift every suffix up by one so the result
        ## reads "5.1", "5.2", "5.3", ...
        rnms <- rownames(dataframe)
        rnms <- ifelse(grepl('\\.', rnms), rnms, paste0(rnms, '.0'))

        rownames(dataframe) <- paste0(
            gsub('\\.+$', '', rnms),
            '.',
            as.integer(gsub('^\\d+\\.', '', rnms)) + 1
        )
    }

    dataframe

}
108 |
109 |
--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
## Multiple gsub: apply a vector of pattern -> replacement substitutions
## sequentially to `text.var`.  A single replacement is recycled across all
## patterns.  With fixed matching, patterns are applied longest first so a
## short pattern never consumes the prefix of a longer one.
.mgsub <- function (pattern, replacement, text.var, fixed = TRUE,
    order.pattern = fixed, perl = !fixed, ...) {

    if (fixed && order.pattern) {
        by.length <- rev(order(nchar(pattern)))
        pattern <- pattern[by.length]
        if (length(replacement) != 1) replacement <- replacement[by.length]
    }

    if (length(replacement) == 1) {
        replacement <- rep(replacement, length(pattern))
    }

    ## Substitutions are sequential: later patterns see earlier edits.
    for (k in seq_along(pattern)){
        text.var <- gsub(
            pattern[k], replacement[k], text.var,
            fixed = fixed, perl = perl, ...
        )
    }

    text.var
}
21 |
22 |
## Normalize a position argument (`from.n`/`to.n`): NULL -> 1, numerics are
## truncated to integer, and the strings "first"/"last"/"n" map to 1/Inf/Inf.
nth <- function(x, ...){
    if (is.null(x)) return(1)
    if (is.numeric(x)) x <- as.integer(x)
    ## Positions are 1-based, so 1 itself is legal; the original message
    ## ("must be > 1") contradicted the `< 1` test -- fixed to ">= 1".
    if (is.integer(x) && x < 1) stop('`from.n` and `to.n` must be >= 1')
    if (is.integer(x)) return(x)
    if (is.character(x)) {
        switch(x,
            first = 1,
            last =,
            n = Inf,
            stop('If supplying a string to `from.n` or `to.n` must be one of: c("first", "last", or "n")')
        )
    }
}
37 |
## HTML highlight helpers.  The source this was extracted from had the HTML
## literals stripped: `sprintf("", class)` ignored `class` entirely and always
## returned "".  Reconstructed as an opening <mark> tag carrying the CSS class,
## with `mark_end` as the matching closing tag.
mark_start <- function(class){
    sprintf('<mark class="%s">', class)
}

mark_end <- '</mark>'
43 |
44 |
45 |
46 |
47 |
## One id per list element, repeated once for each item the element holds:
## list(1:3, "x") -> c(1L, 1L, 1L, 2L).
add_row_id <- function(x){
    rep(seq_along(x), lengths(x))
}
52 |
## For each row of `mat`, return the name of the first column holding a
## positive value (NA when no value in the row is positive).
title_tag_table <- function (mat) {
    unlist(lapply(seq_len(nrow(mat)), function(row_idx) {
        row_vals <- unlist(mat[row_idx, , drop = FALSE])
        positive <- row_vals[row_vals > 0]
        names(positive[1])
    }))
}
59 |
60 |
61 |
## Row-wise paste across the columns of a matrix, data.frame, or list of
## equal-length vectors.  `trim` strips surrounding whitespace first; with
## `handle.na`, any NA in a row makes the whole pasted row NA.
paste2 <- function (multi.columns, sep = ".", handle.na = TRUE, trim = TRUE) {

    if (is.matrix(multi.columns)) {
        multi.columns <- data.frame(multi.columns, stringsAsFactors = FALSE)
    }

    ## Whitespace-trim column by column (this also reduces a data.frame to a
    ## plain list, which the next step rebinds into a matrix).
    if (trim) {
        multi.columns <- lapply(multi.columns, function(col) {
            gsub("^\\s+|\\s+$", "", col)
        })
    }

    if (!is.data.frame(multi.columns) & is.list(multi.columns)) {
        multi.columns <- do.call("cbind", multi.columns)
    }

    if (handle.na) {
        pasted <- apply(multi.columns, 1, function(row) {
            if (any(is.na(row))) {
                NA
            } else {
                paste(row, collapse = sep)
            }
        })
    } else {
        pasted <- apply(multi.columns, 1, paste, collapse = sep)
    }

    unname(pasted)
}
87 |
## Whole-number check for doubles: TRUE where `x` is within `tol` of an
## integer value (vectorized; unlike is.integer, this accepts 3.0).
is.Integer <- function(x, tol = .Machine$double.eps^0.5) {
    deviation <- abs(x - round(x))
    deviation < tol
}
89 |
## Left-pad the elements of `x` to a common width (`padding`).  `type` picks
## the sprintf conversion: "numeric"/"d" zero-pads, "character"/"s" space-pads,
## and "detect" infers from the data.  Output is sorted unless `sort = FALSE`.
pad <- function (x, padding = max(nchar(as.character(x))), sort = TRUE,
    type = "detect") {

    valid.types <- c("detect", "numeric", "character", "d", "s")

    if (!type %in% valid.types){
        stop(
            paste(
                "type must be: \"detect\",",
                "\"numeric\"\\\"d\" or \"character\"\\\"s\""
            ),
            call. = FALSE
        )
    }

    ## Map the user-facing type names onto sprintf conversion letters;
    ## "detect" maps to NA and is resolved from the data below.
    conversions <- c(NA, "d", "s", "d", "s")
    type <- conversions[valid.types %in% type]
    if (is.na(type)) {
        type <- ifelse(is.numeric(x), "d", "s")
    }

    out <- sprintf_ish(x, padding, type)
    if (sort) {
        out <- sort(out)
    }
    out
}
116 |
## Pad `x` to `padding` characters: via sprintf ("%0<padding><type>") on
## Windows/Linux, or by hand-building the pad string elsewhere (e.g. Darwin),
## using spaces for strings and zeros for numbers.
sprintf_ish <- function(x, padding, type){
    OS <- Sys.info()[['sysname']]

    if (OS %in% c("Windows", "Linux")) {
        ## e.g. "%05d" / "%05s" -- relies on the platform sprintf behavior.
        sprintf(paste0("%0", padding, type), x)
    } else {
        ## Manual fallback: pad character chosen by conversion type.
        type <- ifelse(type == "s", " ", "0")
        ## NOTE(review): assumes padding >= nchar(x) for every element --
        ## a negative count would make rep() error here.  TODO confirm callers
        ## always pass padding = max(nchar(...)).
        pads <- sapply2(padding - nchar(x), function(i) {
            if (i == 0) return("")
            paste(rep(type, i), collapse = "")
        })
        paste0(pads, x)

    }
}
132 |
133 |
## S3 generic; the TermDocumentMatrix/DocumentTermMatrix methods report
## whether the document names are evenly spaced integers (e.g. "1","2","3").
is_numeric_doc_names <- function(x, ...){
    UseMethod('is_numeric_doc_names')
}
137 |
138 |
## TRUE when every column (document) name parses as an integer and the
## parsed ids are evenly spaced (sd of successive differences is 0).
is_numeric_doc_names.TermDocumentMatrix <- function(x, ...){
    ids <- suppressWarnings(as.integer(colnames(x)))
    if (anyNA(ids)) return(FALSE)
    isTRUE(all.equal(stats::sd(diff(ids)), 0))
}
144 |
145 |
## TRUE when every row (document) name parses as an integer and the parsed
## ids are evenly spaced (sd of successive differences is 0).
is_numeric_doc_names.DocumentTermMatrix <- function(x, ...){
    ids <- suppressWarnings(as.integer(rownames(x)))
    if (anyNA(ids)) return(FALSE)
    isTRUE(all.equal(stats::sd(diff(ids)), 0))
}
151 |
152 | ## function to detect text columns
## When `text.var` is TRUE, detect the most-likely text column of `dat` (the
## character/factor column with the longest mean character count) and return
## its name; any other `text.var` value is returned unchanged.
detect_text_column <- function(dat, text.var){

    if (isTRUE(text.var)) {

        dat <- as.data.frame(dat, stringsAsFactors = FALSE)

        ## Score each column by mean character length; non-text columns
        ## score 0 and thus never win.
        mean_lens <- unlist(lapply(dat, function(y) {

            if(!is.character(y) && !is.factor(y)) return(0)
            mean(nchar(as.character(y)), na.rm = TRUE)

        }))

        ## No columns, or no column with any text content -> cannot detect.
        ## (Message fixed: had a stray space inside the backticks.)
        if (length(mean_lens) == 0 || sum(as.integer(mean_lens)) == 0) {
            stop(
                paste(
                    "Could not detect `text.var`.",
                    "Please supply `text.var` explicitly."
                ),
                call. = FALSE
            )
        }

        ## Bug fix: the original used which.max(), which always returns a
        ## single index, so the tie warning below was unreachable dead code.
        ## which() over the equality keeps every column tied at the max.
        max_cols <- which(mean_lens == max(mean_lens))

        text.var <- colnames(dat)[max_cols[1]]

        if (length(max_cols) > 1) {
            warning(
                sprintf(
                    'More than one text column detected...using `%s`',
                    text.var
                ),
                call. = FALSE
            )
        }
    }

    text.var

}
194 |
195 |
## A predictable sapply(): always unlist(lapply(...)), so the result is a
## flat vector -- never a matrix and never an un-simplified list.
sapply2 <- function (X, FUN, ...) {
    unlist(lapply(X, FUN, ...))
}
199 |
200 |
201 |
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "textshape"
3 | date: "`r format(Sys.time(), '%d %B, %Y')`"
4 | output:
5 | md_document:
6 | toc: true
7 | ---
8 |
9 | ```{r, echo=FALSE, message=FALSE, warning=FALSE}
10 | library(knitr)
11 | knitr::opts_chunk$set(fig.path = "tools/figure/")
12 | combine <- textshape::combine
13 |
14 | library(tidyverse)
15 | library(magrittr)
16 | library(ggstance)
17 | library(textshape)
18 | library(gridExtra)
19 | library(viridis)
20 | library(quanteda)
21 | library(gofastr)
22 |
23 | ## desc <- suppressWarnings(readLines("DESCRIPTION"))
24 | ## regex <- "(^Version:\\s+)(\\d+\\.\\d+\\.\\d+)"
25 | ## loc <- grep(regex, desc)
26 | ## ver <- gsub(regex, "\\2", desc[loc])
27 | ## verbadge <- sprintf('
', ver, ver)
28 | ## verbadge <- ''
29 | ```
30 |
31 |
32 | ```{r, echo=FALSE}
33 | knit_hooks$set(htmlcap = function(before, options, envir) {
34 | if(!before) {
35 | paste('',options$htmlcap,"
",sep="")
36 | }
37 | })
38 | knitr::opts_knit$set(self.contained = TRUE, cache = FALSE)
39 | ```
40 |
41 | [](https://www.repostatus.org/)
42 | [](https://cran.r-project.org/package=textshape)
43 |
44 |
45 | 
46 |
**textshape** is a small suite of text reshaping and restructuring functions. Many of these functions are descended from tools in the [**qdapTools**](https://github.com/trinker/qdapTools) package. This brings reshaping tools under one roof with specific functionality of the package limited to text reshaping.
48 |
49 | Other R packages provide some of the same functionality. **textshape** differs from these packages in that it is designed to help the user take unstructured data (or implicitly structured), extract it into a structured format, and then restructure into common text analysis formats for use in the next stage of the text analysis pipeline. The implicit structure of seemingly unstructured data is often detectable/expressible by the researcher. **textshape** provides tools (e.g., `split_match`) to enable the researcher to convert this tacit knowledge into a form that can be used to reformat data into more structured formats. This package is meant to be used jointly with the [**textclean**](https://github.com/trinker/textclean) package, which provides cleaning and text normalization functionality.
50 |
51 | # Functions
52 |
53 | Most of the functions split, expand, grab, or tidy a `vector`, `list`, `data.frame`, or `DocumentTermMatrix`. The `combine`, `duration`, `mtabulate`, & `flatten` functions are notable exceptions. The table below describes the functions and their use:
54 |
55 | | Function | Used On | Description |
56 | |------------------|--------------------------------|--------------------------------------------------------------|
57 | | `combine` | `vector`, `list`, `data.frame` | Combine and collapse elements |
58 | | `tidy_list` | `list` of `vector`s or `data.frame`s | Row bind a list and repeat list names as id column |
59 | | `tidy_vector` | `vector` | Column bind a named atomic `vector`'s names and values |
60 | | `tidy_table` | `table` | Column bind a `table`'s names and values |
61 | | `tidy_matrix` | `matrix` | Stack values, repeat column row names accordingly |
62 | | `tidy_dtm`/`tidy_tdm` | `DocumentTermMatrix` | Tidy format `DocumentTermMatrix`/`TermDocumentMatrix` |
63 | | `tidy_colo_dtm`/`tidy_colo_tdm` | `DocumentTermMatrix` | Tidy format of collocating words from a `DocumentTermMatrix`/`TermDocumentMatrix` |
64 | | `duration` | `vector`, `data.frame` | Get duration (start-end times) for turns of talk in n words |
65 | | `from_to` | `vector`, `data.frame` | Prepare speaker data for a flow network |
66 | | `mtabulate` | `vector`, `list`, `data.frame` | Dataframe/list version of `tabulate` to produce count matrix |
67 | | `flatten` | `list` | Flatten nested, named list to single tier |
68 | | `unnest_text` | `data.frame` | Unnest a nested text column |
69 | | `split_index` | `vector`, `list`, `data.frame` | Split at specified indices |
70 | | `split_match` | `vector` | Split vector at specified character/regex match |
71 | | `split_portion` | `vector`\* | Split data into portioned chunks |
72 | | `split_run` | `vector`, `data.frame` | Split runs (e.g., "aaabbbbcdddd") |
73 | | `split_sentence` | `vector`, `data.frame` | Split sentences |
74 | | `split_speaker` | `data.frame` | Split combined speakers (e.g., "Josh, Jake, Jim") |
75 | | `split_token` | `vector`, `data.frame` | Split words and punctuation |
76 | | `split_transcript` | `vector` | Split speaker and dialogue (e.g., "greg: Who me") |
77 | | `split_word` | `vector`, `data.frame` | Split words |
78 | | `grab_index` | `vector`, `data.frame`, `list` | Grab from an index up to a second index |
79 | | `grab_match` | `vector`, `data.frame`, `list` | Grab from a regex match up to a second regex match |
80 | | `column_to_rownames` | `data.frame` | Add a column as rownames |
81 | | `cluster_matrix` | `matrix` | Reorder column/rows of a matrix via hierarchical clustering |
82 |
83 |
84 | \****Note:*** *Text vector accompanied by aggregating `grouping.var` argument, which can be in the form of a `vector`, `list`, or `data.frame`*
85 |
86 |
87 |
88 | # Installation
89 |
90 | To download the development version of **textshape**:
91 |
92 | Download the [zip ball](https://github.com/trinker/textshape/zipball/master) or [tar ball](https://github.com/trinker/textshape/tarball/master), decompress and run `R CMD INSTALL` on it, or use the **pacman** package to install the development version:
93 |
94 | ```r
95 | if (!require("pacman")) install.packages("pacman")
96 | pacman::p_load_gh("trinker/textshape")
97 | ```
98 |
99 | # Contact
100 |
101 | You are welcome to:
102 |
103 | * submit suggestions and bug-reports at:
104 |
105 |
106 | # Contributing
107 |
108 | Contributions are welcome from anyone subject to the following rules:
109 |
110 | - Abide by the [code of conduct](https://github.com/trinker/textshape/blob/master/CODE_OF_CONDUCT.md).
111 | - Follow the style conventions of the package (indentation, function & argument naming, commenting, etc.)
112 | - All contributions must be consistent with the package license (GPL-2)
113 | - Submit contributions as a pull request. Clearly state what the changes are and try to keep the number of changes per pull request as low as possible.
114 | - If you make big changes, add your name to the 'Author' field.
115 |
116 | # Examples
117 |
The main shaping functions can be broken into the categories of (a) binding, (b) combining, (c) tabulating, (d) spanning, (e) splitting, (f) grabbing & (g) tidying. The majority of functions in **textshape** fall into the category of splitting and expanding (the semantic opposite of combining). These sections will provide example uses of the functions from **textshape** within these categories.
119 |
120 | # Loading Dependencies
121 |
122 | ```r
123 | if (!require("pacman")) install.packages("pacman")
124 | pacman::p_load(tidyverse, magrittr, ggstance, viridis, gridExtra, quanteda)
125 | pacman::p_load_current_gh('trinker/gofastr', 'trinker/textshape')
126 | ```
127 |
128 | ## Tidying
129 |
130 | The `tidy_xxx` functions convert untidy structures into [tidy format](https://www.jstatsoft.org/article/view/v059i10). Tidy formatted text data structures are particularly useful for interfacing with **ggplot2**, which expects this form.
131 |
132 | The `tidy_list` function is used in the style of `do.call(rbind, list(x1, x2))` as a convenient way to bind together multiple named `data.frame`s or `vectors`s into a single `data.frame` with the `list` `names` acting as an id column. The `data.frame` bind is particularly useful for binding transcripts from different observations. Additionally, `tidy_vector` and `tidy_table` are provided for `cbinding` a `table`'s or named atomic `vector`'s values and names as separate columns in a `data.frame`. Lastly, `tidy_dtm`/`tidy_tdm` provide convenient ways to tidy a `DocumentTermMatrix` or `TermDocumentMatrix`.
133 |
134 | #### A Vector
135 |
136 | ```{r}
137 | x <- list(p=1:500, r=letters)
138 | tidy_list(x)
139 | ```
140 |
141 | #### A Dataframe
142 |
143 | ```{r}
144 | x <- list(p=mtcars, r=mtcars, z=mtcars, d=mtcars)
145 | tidy_list(x)
146 | ```
147 |
148 | #### A Named Vector
149 |
150 | ```{r}
151 | x <- setNames(
152 | sample(LETTERS[1:6], 1000, TRUE),
153 | sample(state.name[1:5], 1000, TRUE)
154 | )
155 | tidy_vector(x)
156 | ```
157 |
158 | #### A Table
159 |
160 | ```{r}
161 | x <- table(sample(LETTERS[1:6], 1000, TRUE))
162 | tidy_table(x)
163 | ```
164 |
165 | #### A Matrix
166 |
167 | ```{r}
168 | mat <- matrix(1:16, nrow = 4,
169 | dimnames = list(LETTERS[1:4], LETTERS[23:26])
170 | )
171 |
172 | mat
173 |
174 | tidy_matrix(mat)
175 | ```
176 |
177 | With clustering (column and row reordering) via the `cluster_matrix` function.
178 |
179 | ```{r}
180 | ## plot heatmap w/o clustering
181 | wo <- mtcars %>%
182 | cor() %>%
183 | tidy_matrix('car', 'var') %>%
184 | ggplot(aes(var, car, fill = value)) +
185 | geom_tile() +
186 | scale_fill_viridis(name = expression(r[xy])) +
187 | theme(
188 | axis.text.y = element_text(size = 8) ,
189 | axis.text.x = element_text(size = 8, hjust = 1, vjust = 1, angle = 45),
190 | legend.position = 'bottom',
191 | legend.key.height = grid::unit(.1, 'cm'),
192 | legend.key.width = grid::unit(.5, 'cm')
193 | ) +
194 | labs(subtitle = "With Out Clustering")
195 |
196 | ## plot heatmap w clustering
197 | w <- mtcars %>%
198 | cor() %>%
199 | cluster_matrix() %>%
200 | tidy_matrix('car', 'var') %>%
201 | mutate(
202 | var = factor(var, levels = unique(var)),
203 | car = factor(car, levels = unique(car))
204 | ) %>%
205 | group_by(var) %>%
206 | ggplot(aes(var, car, fill = value)) +
207 | geom_tile() +
208 | scale_fill_viridis(name = expression(r[xy])) +
209 | theme(
210 | axis.text.y = element_text(size = 8) ,
211 | axis.text.x = element_text(size = 8, hjust = 1, vjust = 1, angle = 45),
212 | legend.position = 'bottom',
213 | legend.key.height = grid::unit(.1, 'cm'),
214 | legend.key.width = grid::unit(.5, 'cm')
215 | ) +
216 | labs(subtitle = "With Clustering")
217 |
218 | grid.arrange(wo, w, ncol = 2)
219 |
220 | ```
221 |
222 |
223 | #### A DocumentTermMatrix
224 |
225 | The `tidy_dtm` and `tidy_tdm` functions convert a `DocumentTermMatrix` or `TermDocumentMatrix` into a tidied data set.
226 |
227 | ```{r, warning=FALSE}
228 | my_dtm <- with(presidential_debates_2012, q_dtm(dialogue, paste(time, tot, sep = "_")))
229 |
230 | tidy_dtm(my_dtm) %>%
231 | tidyr::extract(doc, c("time", "turn", "sentence"), "(\\d)_(\\d+)\\.(\\d+)") %>%
232 | mutate(
233 | time = as.numeric(time),
234 | turn = as.numeric(turn),
235 | sentence = as.numeric(sentence)
236 | ) %>%
237 | tbl_df() %T>%
238 | print() %>%
239 | group_by(time, term) %>%
240 | summarize(n = sum(n)) %>%
241 | group_by(time) %>%
242 | arrange(desc(n)) %>%
243 | slice(1:10) %>%
244 | mutate(term = factor(paste(term, time, sep = "__"), levels = rev(paste(term, time, sep = "__")))) %>%
245 | ggplot(aes(x = n, y = term)) +
246 | geom_barh(stat='identity') +
247 | facet_wrap(~time, ncol=2, scales = 'free_y') +
248 | scale_y_discrete(labels = function(x) gsub("__.+$", "", x))
249 | ```
250 |
251 |
252 | #### A DocumentTermMatrix of Collocations
253 |
254 | The `tidy_colo_dtm` and `tidy_colo_tdm` functions convert a `DocumentTermMatrix` or `TermDocumentMatrix` into a collocation matrix and then a tidied data set.
255 |
256 | ```{r}
257 | my_dtm <- with(presidential_debates_2012, q_dtm(dialogue, paste(time, tot, sep = "_")))
258 | sw <- unique(c(
259 | lexicon::sw_jockers,
260 | lexicon::sw_loughran_mcdonald_long,
261 | lexicon::sw_fry_1000
262 | ))
263 |
264 | tidy_colo_dtm(my_dtm) %>%
265 | tbl_df() %>%
266 | filter(!term_1 %in% c('i', sw) & !term_2 %in% sw) %>%
267 | filter(term_1 != term_2) %>%
268 | unique_pairs() %>%
269 | filter(n > 15) %>%
270 | complete(term_1, term_2, fill = list(n = 0)) %>%
271 | ggplot(aes(x = term_1, y = term_2, fill = n)) +
272 | geom_tile() +
273 | scale_fill_gradient(low= 'white', high = 'red') +
274 | theme(axis.text.x = element_text(angle = 45, hjust = 1))
275 | ```
276 |
277 |
278 | ## Combining
279 |
280 | The `combine` function acts like `paste(x, collapse=" ")` on vectors and lists of vectors. On dataframes multiple text cells are pasted together within grouping variables.
281 |
282 | #### A Vector
283 |
284 | ```{r}
285 | x <- c("Computer", "is", "fun", ".", "Not", "too", "fun", ".")
286 | combine(x)
287 | ```
288 |
289 | #### A Dataframe
290 |
291 | ```{r}
292 | (dat <- split_sentence(DATA))
293 | combine(dat[, 1:5, with=FALSE])
294 | ```
295 |
296 | ## Tabulating
297 |
298 | `mtabulate` allows the user to transform data types into a dataframe of counts.
299 |
300 | #### A Vector
301 |
302 | ```{r}
303 | (x <- list(w=letters[1:10], x=letters[1:5], z=letters))
304 | mtabulate(x)
305 |
306 | ## Dummy coding
307 | mtabulate(mtcars$cyl[1:10])
308 | ```
309 |
310 |
311 | #### A Dataframe
312 |
313 | ```{r}
314 | (dat <- data.frame(matrix(sample(c("A", "B"), 30, TRUE), ncol=3)))
315 | mtabulate(dat)
316 | t(mtabulate(dat))
317 | ```
318 |
319 | ## Spanning
320 |
321 | Often it is useful to know the duration (start-end) of turns of talk. The `duration` function calculates start-end durations as n words.
322 |
323 |
324 | #### A Vector
325 |
326 | ```{r}
327 | (x <- c(
328 | "Mr. Brown comes! He says hello. i give him coffee.",
329 | "I'll go at 5 p. m. eastern time. Or somewhere in between!",
330 | "go there"
331 | ))
332 | duration(x)
333 | # With grouping variables
334 | groups <- list(group1 = c("A", "B", "A"), group2 = c("red", "red", "green"))
335 | duration(x, groups)
336 | ```
337 |
338 |
339 | #### A Dataframe
340 |
341 | ```{r}
342 | duration(DATA)
343 | ```
344 |
345 | #### Gantt Plot
346 |
347 | ```{r}
348 | library(ggplot2)
349 | ggplot(duration(DATA), aes(x = start, xend = end, y = person, yend = person, color = sex)) +
350 | geom_segment(size=4) +
351 | xlab("Duration (Words)") +
352 | ylab("Person")
353 | ```
354 |
355 | ## Splitting
356 |
357 | The following section provides examples of available splitting functions.
358 |
359 | ### Indices
360 |
361 | `split_index` allows the user to supply the integer indices of where to split a data type.
362 |
363 | #### A Vector
364 |
365 | ```{r}
366 | split_index(
367 | LETTERS,
368 | indices = c(4, 10, 16),
369 | names = c("dog", "cat", "chicken", "rabbit")
370 | )
371 | ```
372 |
373 | #### A Dataframe
374 |
375 | Here I calculate the indices of every time the `vs` variable in the `mtcars` data set changes and then split the dataframe on those indices. The `change_index` function is handy for extracting the indices of changes in runs within an atomic vector.
376 |
377 | ```{r}
378 | (vs_change <- change_index(mtcars[["vs"]]))
379 | split_index(mtcars, indices = vs_change)
380 | ```
381 |
382 | ### Matches
383 |
384 | `split_match` splits on elements that match exactly or via a regular expression match.
385 |
386 | #### Exact Match
387 |
388 | ```{r}
389 | set.seed(15)
390 | (x <- sample(c("", LETTERS[1:10]), 25, TRUE, prob=c(.2, rep(.08, 10))))
391 |
392 | split_match(x)
393 | split_match(x, split = "C")
394 | split_match(x, split = c("", "C"))
395 | ## Don't include
396 | split_match(x, include = 0)
397 | ## Include at beginning
398 | split_match(x, include = 1)
399 | ## Include at end
400 | split_match(x, include = 2)
401 | ```
402 |
403 | #### Regex Match
404 |
405 | Here I use the regex `"^I"` to break on any vectors containing the capital letter I as the first character.
406 |
407 | ```{r}
408 | split_match(DATA[["state"]], split = "^I", regex=TRUE, include = 1)
409 | ```
410 |
411 |
412 | ### Portions
413 |
414 | At times it is useful to split texts into portioned chunks, operate on the chunks and aggregate the results. `split_portion` allows the user to do this sort of text shaping. We can split into n chunks per grouping variable (via `n.chunks`) or into chunks of n length (via `n.words`).
415 |
416 | #### A Vector
417 |
418 |
419 | ```{r}
420 | with(DATA, split_portion(state, n.chunks = 10))
421 | with(DATA, split_portion(state, n.words = 10))
422 | ```
423 |
424 | #### A Dataframe
425 |
426 | ```{r}
427 | with(DATA, split_portion(state, list(sex, adult), n.words = 10))
428 | ```
429 |
430 |
431 | ### Runs
432 |
433 | `split_run` allows the user to split up runs of identical characters.
434 |
435 | ```{r}
436 | x1 <- c(
437 | "122333444455555666666",
438 | NA,
439 | "abbcccddddeeeeeffffff",
440 | "sddfg",
441 | "11112222333"
442 | )
443 |
444 | x <- c(rep(x1, 2), ">>???,,,,....::::;[[")
445 |
446 | split_run(x)
447 | ```
448 |
449 | #### Dataframe
450 |
451 | ```{r}
452 | DATA[["run.col"]] <- x
453 | split_run(DATA)
454 | ## Reset the DATA dataset
455 | DATA <- textshape::DATA
456 | ```
457 |
458 | ### Sentences
459 |
460 | `split_sentence` provides a mapping + regex approach to splitting sentences. It is less accurate than the Stanford parser but more accurate than a simple regular expression approach alone.
461 |
462 | #### A Vector
463 |
464 | ```{r}
465 | (x <- paste0(
466 | "Mr. Brown comes! He says hello. i give him coffee. i will ",
467 | "go at 5 p. m. eastern time. Or somewhere in between!go there"
468 | ))
469 | split_sentence(x)
470 | ```
471 |
472 | #### A Dataframe
473 |
474 | ```{r}
475 | split_sentence(DATA)
476 | ```
477 |
478 | ### Speakers
479 |
480 | Often speakers may talk in unison. This is often displayed in a single cell as a comma separated string of speakers. Some analysis may require this information to be parsed out and replicated as one turn per speaker. The `split_speaker` function accomplishes this.
481 |
482 | ```{r}
483 | DATA$person <- as.character(DATA$person)
484 | DATA$person[c(1, 4, 6)] <- c("greg, sally, & sam",
485 | "greg, sally", "sam and sally")
486 | DATA
487 |
488 | split_speaker(DATA)
489 | ## Reset the DATA dataset
490 | DATA <- textshape::DATA
491 | ```
492 |
493 | ### Tokens
494 |
495 | The `split_token` function splits data into words and punctuation.
496 |
497 | #### A Vector
498 |
499 | ```{r}
500 | (x <- c(
501 | "Mr. Brown comes! He says hello. i give him coffee.",
502 | "I'll go at 5 p. m. eastern time. Or somewhere in between!",
503 | "go there"
504 | ))
505 | split_token(x)
506 | ```
507 |
508 | #### A Dataframe
509 |
510 | ```{r}
511 | split_token(DATA)
512 | ```
513 |
514 |
515 | ### Transcript
516 |
517 | The `split_transcript` function splits `vector`s with speaker prefixes (e.g., `c("greg: Who me", "sarah: yes you!")`) into a two column `data.frame`.
518 |
519 | #### A Vector
520 |
521 | ```{r}
522 | (x <- c(
523 | "greg: Who me",
524 | "sarah: yes you!",
525 | "greg: well why didn't you say so?",
526 | "sarah: I did but you weren't listening.",
527 | "greg: oh :-/ I see...",
528 | "dan: Ok let's meet at 4:30 pm for drinks"
529 | ))
530 |
531 | split_transcript(x)
532 | ```
533 |
534 | ### Words
535 |
536 | The `split_word` function splits data into words.
537 |
538 | #### A Vector
539 |
540 | ```{r}
541 | (x <- c(
542 | "Mr. Brown comes! He says hello. i give him coffee.",
543 | "I'll go at 5 p. m. eastern time. Or somewhere in between!",
544 | "go there"
545 | ))
546 | split_word(x)
547 | ```
548 |
549 | #### A Dataframe
550 |
551 | ```{r}
552 | split_word(DATA)
553 | ```
554 |
555 | ## Grabbing
556 |
557 | The following section provides examples of available grabbing (from a starting point up to an ending point) functions.
558 |
559 | ### Indices
560 |
561 | `grab_index` allows the user to supply the integer indices of where to grab (from - up to) a data type.
562 |
563 | #### A Vector
564 |
565 | ```{r}
566 | grab_index(DATA$state, from = 2, to = 4)
567 | grab_index(DATA$state, from = 9)
568 | grab_index(DATA$state, to = 3)
569 | ```
570 |
571 | #### A Dataframe
572 |
573 | ```{r}
574 | grab_index(DATA, from = 2, to = 4)
575 | ```
576 |
577 | #### A List
578 |
579 | ```{r}
580 | grab_index(as.list(DATA$state), from = 2, to = 4)
581 | ```
582 |
583 | ### Matches
584 |
585 | `grab_match` grabs (from - up to) elements that match a regular expression.
586 |
587 |
588 | #### A Vector
589 |
590 | ```{r}
591 | grab_match(DATA$state, from = 'dumb', to = 'liar')
592 | grab_match(DATA$state, from = '^What are')
593 | grab_match(DATA$state, to = 'we do[?]')
594 | grab_match(DATA$state, from = 'no', to = 'the', ignore.case = TRUE,
595 | from.n = 'last', to.n = 'first')
596 | ```
597 |
598 | #### A Dataframe
599 |
600 | ```{r}
601 | grab_match(DATA, from = 'dumb', to = 'liar')
602 | ```
603 |
604 | #### A List
605 |
606 | ```{r}
607 | grab_match(as.list(DATA$state), from = 'dumb', to = 'liar')
608 | ```
609 |
610 | ## Putting It Together
611 |
612 | Eduardo Flores blogged about [What the candidates say, analyzing republican debates using R](https://www.r-bloggers.com/2015/11/what-the-candidates-say-analyzing-republican-debates-using-r/) where he demonstrated some scraping and analysis techniques. Here I highlight a combination usage of **textshape** tools to scrape and structure the text from 4 of the 2015 Republican debates within a [**magrittr**](https://github.com/tidyverse/magrittr) pipeline. The result is a single [**data.table**](https://github.com/Rdatatable/data.table) containing the dialogue from all 4 debates. The code highlights the conciseness and readability of **textshape** by restructuring Flores' scraping with **textshape** replacements.
613 |
614 | ```{r}
615 | if (!require("pacman")) install.packages("pacman")
616 | pacman::p_load(rvest, magrittr, xml2)
617 |
618 | debates <- c(
619 | wisconsin = "110908",
620 | boulder = "110906",
621 | california = "110756",
622 | ohio = "110489"
623 | )
624 |
625 | lapply(debates, function(x){
626 | xml2::read_html(paste0("http://www.presidency.ucsb.edu/ws/index.php?pid=", x)) %>%
627 | rvest::html_nodes("p") %>%
628 | rvest::html_text() %>%
629 | textshape::split_index(., grep("^[A-Z]+:", .)) %>%
630 | #textshape::split_match("^[A-Z]+:", TRUE, TRUE) %>% #equal to line above
631 | textshape::combine() %>%
632 | textshape::split_transcript() %>%
633 | textshape::split_sentence()
634 | }) %>%
635 | textshape::tidy_list("location")
636 | ```
637 |
638 |
639 |
640 |
--------------------------------------------------------------------------------
/data/DATA.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/data/DATA.rda
--------------------------------------------------------------------------------
/data/golden_rules.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/data/golden_rules.rda
--------------------------------------------------------------------------------
/data/hamlet.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/data/hamlet.rda
--------------------------------------------------------------------------------
/data/simple_dtm.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/data/simple_dtm.rda
--------------------------------------------------------------------------------
/inst/build.R:
--------------------------------------------------------------------------------
# Local build helper for the package: re-document, reinstall, and optionally
# render the Rd manual to PDF.  Run manually from the package root; not part
# of R CMD build.

root <- Sys.getenv("USERPROFILE")   # user home dir (USERPROFILE is set on Windows; empty elsewhere)
pack <- basename(getwd())           # package name is assumed to equal the folder name

quick <- TRUE   # forwarded to devtools::install(); TRUE skips slow steps
pdf <- FALSE    # set TRUE to also build the Rd2pdf manual below

# Remove any stale manual, regenerate docs, then install the package.
unlink(paste0(pack, ".pdf"), recursive = TRUE, force = TRUE)
devtools::document()
devtools::install(quick = quick, build_vignettes = FALSE, dependencies = TRUE)

if(pdf){
path <- find.package(pack)
# Build the PDF manual with "R CMD Rd2pdf" and copy it to the Desktop.
system(paste(shQuote(file.path(R.home("bin"), "R")), "CMD", "Rd2pdf", shQuote(path)))
file.copy(paste0(pack, '.pdf'), file.path(root,"Desktop", paste0(pack, '.pdf')))
# Delete the local copy; looped in case the file is briefly locked by a viewer.
while (file.exists(paste0(pack, ".pdf"))) {unlink(paste0(pack, ".pdf"), recursive = TRUE, force = TRUE)}
# Clean up leftover ".Rd..." temp directories created by Rd2pdf.
empts <- grep("^\\.Rd", dir(all.files = TRUE), value = TRUE)
unlink(empts, recursive = TRUE, force = TRUE)
}

message("Done!")
21 |
22 |
23 |
--------------------------------------------------------------------------------
/inst/docs/Simpsons_Roasting_on_an_Open_Fire_Script.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/inst/docs/Simpsons_Roasting_on_an_Open_Fire_Script.pdf
--------------------------------------------------------------------------------
/inst/extra_statdoc/readme.R:
--------------------------------------------------------------------------------
1 | 
2 |
textshape is a...
3 | Download the development version of textshape here
4 |
--------------------------------------------------------------------------------
/inst/make_data/golden_rules.R:
--------------------------------------------------------------------------------
# Build the `golden_rules` dataset shipped with textshape.
#
# Downloads Kevin Dias's "Golden Rules" sentence boundary disambiguation test
# cases, parses each rule block into (Rule, Text, Outcome), and nests the
# Outcome splits per rule.  Run manually; requires a network connection.
pacman::p_load(textshape, textclean, tidyverse, stringi, magrittr)

golden_rules <- 'https://s3.amazonaws.com/tm-town-nlp-resources/golden_rules.txt' %>%
readLines() %>%
`[`(3:197) %>%   # keep only the rule entries (drop surrounding lines)
# drop_element('^\\s*$') %>%
split_match('^\\s*$', FALSE, TRUE) %>%   # one list element per blank-line-separated block
map(function(x){

# Each block is 3 lines: "<n>) <rule name>", the test text, and a
# "[...]" list of expected sentence splits.
tibble::tibble(
Rule = x %>%
`[`(1) %>%
stringi::stri_replace_all_regex('^\\d+\\)\\s*', '') %>%   # strip the "<n>) " prefix
trimws(),
Text = x[2],
Outcome = x %>%
`[`(3) %>%
# Rewrite the bracketed "[a, b]" outcome into "c(a, b)" and evaluate
# it so Outcome becomes a character vector of expected splits.
stringi::stri_replace_all_regex('^\\[', 'c(') %>%
stringi::stri_replace_all_regex('\\]$', ')') %>%
{parse(text = .)} %>%
eval()
)
}) %>%
bind_rows() %>%
# NOTE(review): older tidyr nest() API (bare -c(...) plus .key); newer tidyr
# deprecates this form — confirm tidyr version before rerunning.
nest(-c(Rule, Text), .key = Outcome) %>%
filter(!grepl("[^ -~]", Text))   # drop rows whose Text contains non-ASCII characters

# golden_rules %$%
# split_sentence(Text)

# pax::new_data(golden_rules)
#golden_rules$Text
33 |
--------------------------------------------------------------------------------
/inst/staticdocs/index.R:
--------------------------------------------------------------------------------
# staticdocs site index: declares a single untitled section listing the
# functions to surface on the generated documentation site.
library(staticdocs)

section_title       <- ""
section_description <- "Function for..."
section_functions   <- c("myfun")

sd_section(section_title, section_description, section_functions)
--------------------------------------------------------------------------------
/inst/supporting_docs/LaverGarry.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/inst/supporting_docs/LaverGarry.zip
--------------------------------------------------------------------------------
/man/DATA.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/textshape-package.R
3 | \docType{data}
4 | \name{DATA}
5 | \alias{DATA}
6 | \title{Fictitious Classroom Dialogue}
7 | \format{
8 | A data frame with 11 rows and 5 variables
9 | }
10 | \usage{
11 | data(DATA)
12 | }
13 | \description{
14 | A fictitious dataset useful for small demonstrations.
15 | }
16 | \details{
17 | \itemize{
18 | \item person. Speaker
19 | \item sex. Gender
20 | \item adult. Dummy coded adult (0-no; 1-yes)
21 | \item state. Statement (dialogue)
22 | \item code. Dialogue coding scheme
23 | }
24 | }
25 | \keyword{datasets}
26 |
--------------------------------------------------------------------------------
/man/bind_list.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/bind_list.R
3 | \name{bind_list}
4 | \alias{bind_list}
5 | \title{Row Bind a List of Named Dataframes or Vectors}
6 | \usage{
7 | bind_list(x, id.name = "id", content.name = "content", ...)
8 | }
9 | \arguments{
10 | \item{x}{A named \code{\link[base]{list}} of
11 | \code{\link[base]{data.frame}}s or \code{\link[base]{vector}}.}
12 |
13 | \item{id.name}{The name to use for the column created from the \code{\link[base]{list}}.}
14 |
15 | \item{content.name}{The name to use for the column created from the \code{\link[base]{list}}
16 | of \code{\link[base]{vector}}s (only used if \code{x} is \code{\link[base]{vector}}).}
17 |
18 | \item{\ldots}{ignored.}
19 | }
20 | \value{
21 | Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}}
22 | from the \code{\link[base]{list}} as an \code{id} column.
23 | }
24 | \description{
25 | Deprecated, use \code{\link[textshape]{tidy_list}} instead.
26 | }
27 | \examples{
28 | \dontrun{
29 | bind_list(list(p=1:500, r=letters))
30 | bind_list(list(p=mtcars, r=mtcars, z=mtcars, d=mtcars))
31 |
32 | ## 2015 Vice-Presidential Debates Example
33 | if (!require("pacman")) install.packages("pacman")
34 | pacman::p_load(rvest, magrittr, xml2)
35 |
36 | debates <- c(
37 | wisconsin = "110908",
38 | boulder = "110906",
39 | california = "110756",
40 | ohio = "110489"
41 | )
42 |
43 | lapply(debates, function(x){
44 | xml2::read_html(paste0("http://www.presidency.ucsb.edu/ws/index.php?pid=", x)) \%>\%
45 | rvest::html_nodes("p") \%>\%
46 | rvest::html_text() \%>\%
47 | textshape::split_index(grep("^[A-Z]+:", .)) \%>\%
48 | textshape::combine() \%>\%
49 | textshape::split_transcript() \%>\%
50 | textshape::split_sentence()
51 | }) \%>\%
52 | textshape::bind_list("location")
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/man/bind_table.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/bind_table.R
3 | \name{bind_table}
4 | \alias{bind_table}
5 | \title{Column Bind a Table's Values with Its Names}
6 | \usage{
7 | bind_table(x, id.name = "id", content.name = "content", ...)
8 | }
9 | \arguments{
10 | \item{x}{A \code{\link[base]{table}}.}
11 |
12 | \item{id.name}{The name to use for the column created from the \code{\link[base]{table}}
13 | \code{\link[base]{names}}.}
14 |
15 | \item{content.name}{The name to use for the column created from the \code{\link[base]{table}}
16 | values.}
17 |
18 | \item{\ldots}{ignored.}
19 | }
20 | \value{
21 | Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}}
22 | from the \code{\link[base]{table}} as an \code{id} column.
23 | }
24 | \description{
25 | Deprecated, use \code{\link[textshape]{tidy_table}} instead.
26 | }
27 | \examples{
28 | \dontrun{
29 | x <- table(sample(LETTERS[1:6], 1000, TRUE))
30 | bind_table(x)
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/man/bind_vector.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/bind_vector.R
3 | \name{bind_vector}
4 | \alias{bind_vector}
5 | \title{Column Bind an Atomic Vector's Values with Its Names}
6 | \usage{
7 | bind_vector(x, id.name = "id", content.name = "content", ...)
8 | }
9 | \arguments{
10 | \item{x}{A named atomic \code{\link[base]{vector}}.}
11 |
12 | \item{id.name}{The name to use for the column created from the \code{\link[base]{vector}}
13 | \code{\link[base]{names}}.}
14 |
15 | \item{content.name}{The name to use for the column created from the \code{\link[base]{vector}}
16 | values.}
17 |
18 | \item{\ldots}{ignored.}
19 | }
20 | \value{
21 | Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}}
22 | from the \code{\link[base]{vector}} as an \code{id} column.
23 | }
24 | \description{
25 | Deprecated, use \code{\link[textshape]{tidy_vector}} instead.
26 | }
27 | \examples{
28 | \dontrun{
29 | x <- setNames(sample(LETTERS[1:6], 1000, TRUE), sample(state.name[1:5], 1000, TRUE))
30 | bind_vector(x)
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/man/change_index.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/change_index.R
3 | \name{change_index}
4 | \alias{change_index}
5 | \title{Indexing of Changes in Runs}
6 | \usage{
7 | change_index(x, ...)
8 | }
9 | \arguments{
10 | \item{x}{A vector.}
11 |
12 | \item{\ldots}{ignored.}
13 | }
14 | \value{
15 | Returns a vector of integer indices of where a vector initially
16 | changes.
17 | }
18 | \description{
19 | Find the indices of changes in runs in a vector. This function pairs well
20 | with \code{split_index} and is the default for the \code{indices} in all
21 | \code{split_index} functions that act on atomic vectors.
22 | }
23 | \examples{
24 | set.seed(10)
25 | (x <- sample(0:1, 20, TRUE))
26 | change_index(x)
27 | split_index(x, change_index(x))
28 |
29 |
30 | (p_chng <- change_index(CO2[["Plant"]]))
31 | split_index(CO2[["Plant"]], p_chng)
32 | }
33 | \seealso{
34 | \code{\link[textshape]{split_index}}
35 | }
36 |
--------------------------------------------------------------------------------
/man/cluster_matrix.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/cluster_matrix.R
3 | \name{cluster_matrix}
4 | \alias{cluster_matrix}
5 | \title{Reorder a Matrix Based on Hierarchical Clustering}
6 | \usage{
7 | cluster_matrix(x, dim = "both", method = "ward.D2", ...)
8 | }
9 | \arguments{
10 | \item{x}{A matrix.}
11 |
12 | \item{dim}{The dimension to reorder (cluster); must be set to "row", "col",
13 | or "both".}
14 |
15 | \item{method}{The agglomeration method to be used (see
16 | \code{\link[stats]{hclust}}).}
17 |
18 | \item{\ldots}{ignored.}
19 | }
20 | \value{
21 | Returns a reordered matrix.
22 | }
23 | \description{
24 | Reorder matrix rows, columns, or both via hierarchical clustering.
25 | }
26 | \examples{
27 | cluster_matrix(mtcars)
28 | cluster_matrix(mtcars, dim = 'row')
29 | cluster_matrix(mtcars, dim = 'col')
30 |
31 | \dontrun{
32 | if (!require("pacman")) install.packages("pacman")
33 | pacman::p_load(tidyverse, viridis, gridExtra)
34 |
35 | ## plot heatmap w/o clustering
36 | wo <- mtcars \%>\%
37 | cor() \%>\%
38 | tidy_matrix('car', 'var') \%>\%
39 | ggplot(aes(var, car, fill = value)) +
40 | geom_tile() +
41 | scale_fill_viridis(name = expression(r[xy])) +
42 | theme(
43 | axis.text.y = element_text(size = 8) ,
44 | axis.text.x = element_text(
45 | size = 8,
46 | hjust = 1,
47 | vjust = 1,
48 | angle = 45
49 | ),
50 | legend.position = 'bottom',
51 | legend.key.height = grid::unit(.1, 'cm'),
52 | legend.key.width = grid::unit(.5, 'cm')
53 | ) +
54 | labs(subtitle = "With Out Clustering")
55 |
56 | ## plot heatmap w clustering
57 | w <- mtcars \%>\%
58 | cor() \%>\%
59 | cluster_matrix() \%>\%
60 | tidy_matrix('car', 'var') \%>\%
61 | mutate(
62 | var = factor(var, levels = unique(var)),
63 | car = factor(car, levels = unique(car))
64 | ) \%>\%
65 | group_by(var) \%>\%
66 | ggplot(aes(var, car, fill = value)) +
67 | geom_tile() +
68 | scale_fill_viridis(name = expression(r[xy])) +
69 | theme(
70 | axis.text.y = element_text(size = 8) ,
71 | axis.text.x = element_text(
72 | size = 8,
73 | hjust = 1,
74 | vjust = 1,
75 | angle = 45
76 | ),
77 | legend.position = 'bottom',
78 | legend.key.height = grid::unit(.1, 'cm'),
79 | legend.key.width = grid::unit(.5, 'cm')
80 | ) +
81 | labs(subtitle = "With Clustering")
82 |
83 | gridExtra::grid.arrange(wo, w, ncol = 2)
84 | }
85 | }
86 | \seealso{
87 | \code{\link[stats]{hclust}}
88 | }
89 |
--------------------------------------------------------------------------------
/man/column_to_rownames.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/column_to_rownames.R
3 | \name{column_to_rownames}
4 | \alias{column_to_rownames}
5 | \title{Add a Column as Rownames}
6 | \usage{
7 | column_to_rownames(x, loc = 1)
8 | }
9 | \arguments{
10 | \item{x}{An object that can be coerced to a \code{\link[base]{data.frame}}.}
11 |
12 | \item{loc}{The column location as either an integer or string index location.
13 | Must be unique row names.}
14 | }
15 | \value{
16 | Returns a \code{\link[base]{data.frame}} with the specified column
17 | moved to rownames.
18 | }
19 | \description{
20 | Takes an existing column and uses it as rownames instead. This is useful
21 | when turning a \code{\link[base]{data.frame}} into a \code{\link[base]{matrix}}.
22 | Inspired by the \pkg{tibble} package's \code{column_to_row} which is now
23 | deprecated if done on a \pkg{tibble} object. By coercing to a
24 | \code{\link[base]{data.frame}} this problem is avoided.
25 | }
26 | \examples{
27 | state_dat <- data.frame(state.name, state.area, state.center, state.division)
28 | column_to_rownames(state_dat)
29 | column_to_rownames(state_dat, 'state.name')
30 | }
31 |
--------------------------------------------------------------------------------
/man/combine.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/combine.R
3 | \name{combine}
4 | \alias{combine}
5 | \alias{combine.default}
6 | \alias{combine.data.frame}
7 | \title{Combine Elements}
8 | \usage{
9 | combine(x, ...)
10 |
11 | \method{combine}{default}(x, fix.punctuation = TRUE, ...)
12 |
13 | \method{combine}{data.frame}(x, text.var = TRUE, ...)
14 | }
15 | \arguments{
16 | \item{x}{A \code{\link[base]{data.frame}} or character vector with runs.}
17 |
18 | \item{fix.punctuation}{logical. If \code{TRUE}, spaces before/after punctuation
19 | that should not be there are removed (regex used:
20 | \code{"(\\s+(?=[,.?!;:\%-]))|((?<=[$-])\\s+)"}).}
21 |
22 | \item{text.var}{The name of the text variable.}
23 |
24 | \item{\ldots}{Ignored.}
25 | }
26 | \value{
27 | Returns a vector (if given a list/vector) or an expanded
28 | \code{\link[data.table]{data.table}} with elements pasted together.
29 | }
30 | \description{
31 | Combine (\code{\link[base]{paste}}) elements (\code{\link[base]{vector}}s,
32 | \code{\link[base]{list}}s, or \code{\link[base]{data.frame}}s) together
33 | with \code{collapse = TRUE}.
34 | }
35 | \examples{
36 | (x <- split_token(DATA[["state"]][1], FALSE))
37 | combine(x)
38 |
39 | (x2 <- split_token(DATA[["state"]], FALSE))
40 | combine(x2)
41 |
42 | (x3 <- split_sentence(DATA))
43 |
44 | ## without dropping the non-group variable column
45 | combine(x3)
46 |
47 | ## Dropping the non-group variable column
48 | combine(x3[, 1:5, with=FALSE])
49 | }
50 |
--------------------------------------------------------------------------------
/man/duration.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/duration.R
3 | \name{duration}
4 | \alias{duration}
5 | \alias{duration.default}
6 | \alias{duration.data.frame}
7 | \alias{duration.numeric}
8 | \alias{starts}
9 | \alias{ends}
10 | \title{Duration of Turns of Talk}
11 | \usage{
12 | duration(x, ...)
13 |
14 | \method{duration}{default}(x, grouping.var = NULL, ...)
15 |
16 | \method{duration}{data.frame}(x, text.var = TRUE, ...)
17 |
18 | \method{duration}{numeric}(x, ...)
19 |
20 | starts(x, ...)
21 |
22 | ends(x, ...)
23 | }
24 | \arguments{
25 | \item{x}{A \code{\link[base]{data.frame}} or character vector with a text
26 | variable or a numeric vector.}
27 |
28 | \item{grouping.var}{The grouping variables. Default \code{NULL} generates
29 | one word list for all text. Also takes a single grouping variable or a list
30 | of 1 or more grouping variables.}
31 |
32 | \item{text.var}{The name of the text variable. If \code{TRUE}
33 | \code{duration} tries to detect the text column.}
34 |
35 | \item{\ldots}{Ignored.}
36 | }
37 | \value{
38 | Returns a vector or data frame of starts and/or ends.
39 | }
40 | \description{
41 | \code{duration} - Calculate duration (start and end times) for duration of
42 | turns of talk measured in words.
43 |
44 | \code{starts} - Calculate start times from a numeric vector.
45 |
46 | \code{ends} - Calculate end times from a numeric vector.
47 | }
48 | \examples{
49 | (x <- c(
50 | "Mr. Brown comes! He says hello. i give him coffee.",
51 | "I'll go at 5 p. m. eastern time. Or somewhere in between!",
52 | "go there"
53 | ))
54 | duration(x)
55 | group <- c("A", "B", "A")
56 | duration(x, group)
57 |
58 | groups <- list(group1 = c("A", "B", "A"), group2 = c("red", "red", "green"))
59 | duration(x, groups)
60 |
61 | data(DATA)
62 | duration(DATA)
63 |
64 | ## Larger data set
65 | duration(hamlet)
66 |
67 | ## Integer values
68 | x <- sample(1:10, 10)
69 | duration(x)
70 | starts(x)
71 | ends(x)
72 | }
73 |
--------------------------------------------------------------------------------
/man/flatten.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/flatten.R
3 | \name{flatten}
4 | \alias{flatten}
5 | \title{Flatten a Nested List of Vectors Into a Single Tier List of Vectors}
6 | \usage{
7 | flatten(x, sep = "_", ...)
8 | }
9 | \arguments{
10 | \item{x}{A nested, named list of vectors.}
11 |
12 | \item{sep}{A separator to use for the concatenation of the names from the
13 | nested list.}
14 |
15 | \item{\ldots}{ignored.}
16 | }
17 | \value{
18 | Returns a flattened list.
19 | }
20 | \description{
21 | Flatten a named, nested list of atomic vectors to a single level using the
22 | concatenated list/atomic vector names as the names of the single tiered
23 | list.
24 | }
25 | \note{
26 | The order of the list is sorted alphabetically. Pull requests for the
27 | option to return the original order would be appreciated.
28 | }
29 | \examples{
30 | x <- list(
31 | urban = list(
32 | cars = c('volvo', 'ford'),
33 | food.dining = list(
34 | local.business = c('carls'),
35 | chain.business = c('dennys', 'panera')
36 | )
37 | ),
38 | rural = list(
39 | land.use = list(
40 | farming =list(
41 | dairy = c('cows'),
42 | vegie.plan = c('carrots')
43 | )
44 | ),
45 | social.rec = list(
46 | community.center = c('town.square')
47 | ),
48 | people.type = c('good', 'bad', 'in.between')
49 | ),
50 | other.locales = c('suburban'),
51 | missing = list(
52 | unknown = c(),
53 | known = c()
54 | ),
55 | end = c('wow')
56 | )
57 |
58 | x
59 |
60 | flatten(x)
61 | flatten(x, ' -> ')
62 | }
63 | \references{
64 | \url{https://stackoverflow.com/a/41882883/1000343} \cr
65 | \url{https://stackoverflow.com/a/48357114/1000343}
66 | }
67 | \author{
68 | StackOverflow user @Michael and Paul Foster and Tyler
69 | Rinker .
70 | }
71 |
--------------------------------------------------------------------------------
/man/from_to.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/from_to.R
3 | \name{from_to}
4 | \alias{from_to}
5 | \alias{from_to.default}
6 | \alias{from_to.character}
7 | \alias{from_to.factor}
8 | \alias{from_to.numeric}
9 | \alias{from_to.data.frame}
10 | \alias{from_to_summarize}
11 | \title{Prepare Discourse Data for Network Plotting}
12 | \usage{
13 | from_to(x, ...)
14 |
15 | \method{from_to}{default}(x, final = "End", ...)
16 |
17 | \method{from_to}{character}(x, final = "End", ...)
18 |
19 | \method{from_to}{factor}(x, final = "End", ...)
20 |
21 | \method{from_to}{numeric}(x, final = "End", ...)
22 |
23 | \method{from_to}{data.frame}(x, from.var, final = "End", ...)
24 |
25 | from_to_summarize(x, from.var, id.vars = NULL, text.var = TRUE, ...)
26 | }
27 | \arguments{
28 | \item{x}{A data form \code{vector} or \code{data.frame}.}
29 |
30 | \item{final}{The name of the closing element or node.}
31 |
32 | \item{from.var}{A character string naming the column to be considered the
33 | origin of the talk.}
34 |
35 | \item{id.vars}{The variables that correspond to the speaker or are attributes
36 | of the speaker (from variable).}
37 |
38 | \item{text.var}{The name of the text variable. If \code{TRUE}
39 | \code{duration} tries to detect the text column.}
40 |
41 | \item{\ldots}{Ignored.}
42 | }
43 | \value{
44 | Returns a vector (if given a vector) or an augmented
45 | \code{\link[data.table]{data.table}}.
46 | }
47 | \description{
48 | \code{from_to} - Add the next speaker as the from variable in a to/from
49 | network data structure. Assumes that the flow of discourse is coming from
50 | person A to person B, or at the very least the talk is taken up by person B.
51 | Works by taking the vector of speakers and shifting everything down one and
52 | then adding a closing element.
53 |
54 | \code{from_to_summarize} - A wrapper for \code{from_to.data.frame} that
55 | adds a \code{word.count} column and then combines duplicate rows.
56 | }
57 | \examples{
58 | from_to(DATA, 'person')
59 | from_to_summarize(DATA, 'person')
60 | from_to_summarize(DATA, 'person', c('sex', 'adult'))
61 | \dontrun{
62 | if (!require("pacman")) install.packages("pacman"); library(pacman)
63 | p_load(dplyr, geomnet, qdap, stringi, scales)
64 | p_load_current_gh('trinker/textshape')
65 |
66 | dat <- from_to_summarize(DATA, 'person', c('sex', 'adult')) \%>\%
67 | mutate(words = rescale(word.count, c(.5, 1.5)))
68 |
69 | dat \%>\%
70 | ggplot(aes(from_id = from, to_id = to)) +
71 | geom_net(
72 | aes(linewidth = words),
73 | layout.alg = "fruchtermanreingold",
74 | directed = TRUE,
75 | labelon = TRUE,
76 | size = 1,
77 | labelcolour = 'black',
78 | ecolour = "grey70",
79 | arrowsize = 1,
80 | curvature = .1
81 | ) +
82 | theme_net() +
83 | xlim(c(-0.05, 1.05))
84 | }
85 | }
86 |
--------------------------------------------------------------------------------
/man/golden_rules.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/textshape-package.R
3 | \docType{data}
4 | \name{golden_rules}
5 | \alias{golden_rules}
6 | \title{Sentence Boundary Disambiguation Edge Cases}
7 | \format{
8 | A data frame with 45 rows and 3 variables
9 | }
10 | \usage{
11 | data(golden_rules)
12 | }
13 | \description{
14 | A slightly filtered dataset containing Dias's sentence boundary
15 | disambiguation edge cases. This is a nested data set with the outcome
16 | column as a nested list of desired splits. The non-ASCII cases and spaced
17 | ellipsis examples have been removed.
18 | }
19 | \details{
20 | \itemize{
21 | \item Rule. The name of the rule to test
22 | \item Text. The testing text
23 | \item Outcome. The desired outcome of the sentence disambiguation
24 | }
25 | }
26 | \references{
27 | Dias, Kevin S. 2015. Golden Rules (English).
28 | Retrieved: https://s3.amazonaws.com/tm-town-nlp-resources/golden_rules.txt
29 | }
30 | \keyword{datasets}
31 |
--------------------------------------------------------------------------------
/man/grab_index.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/grab_index.R
3 | \name{grab_index}
4 | \alias{grab_index}
5 | \alias{grab_index.character}
6 | \alias{grab_index.default}
7 | \alias{grab_index.list}
8 | \alias{grab_index.data.frame}
9 | \alias{grab_index.matrix}
10 | \title{Get Elements Matching Between 2 Points}
11 | \usage{
12 | grab_index(x, from = NULL, to = NULL, ...)
13 |
14 | \method{grab_index}{character}(x, from = NULL, to = NULL, ...)
15 |
16 | \method{grab_index}{default}(x, from = NULL, to = NULL, ...)
17 |
18 | \method{grab_index}{list}(x, from = NULL, to = NULL, ...)
19 |
20 | \method{grab_index}{data.frame}(x, from = NULL, to = NULL, ...)
21 |
22 | \method{grab_index}{matrix}(x, from = NULL, to = NULL, ...)
23 | }
24 | \arguments{
25 | \item{x}{A character vector, \code{\link[base]{data.frame}}, or list.}
26 |
27 | \item{from}{An integer to start from (if \code{NULL} defaults to the first
28 | element/row).}
29 |
30 | \item{to}{An integer to get up to (if \code{NULL} defaults to the last
31 | element/row).}
32 |
33 | \item{\ldots}{ignored.}
34 | }
35 | \value{
36 | Returns a subset of the original data set.
37 | }
38 | \description{
39 | Use regexes to get all the elements between two points.
40 | }
41 | \examples{
42 | grab_index(DATA, from = 2, to = 4)
43 | grab_index(DATA$state, from = 2, to = 4)
44 | grab_index(DATA$state, from = 2)
45 | grab_index(DATA$state, to = 4)
46 | grab_index(matrix(1:100, nrow = 10), 2, 4)
47 | }
48 |
--------------------------------------------------------------------------------
/man/grab_match.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/grab_match.R
3 | \name{grab_match}
4 | \alias{grab_match}
5 | \alias{grab_match.character}
6 | \alias{grab_match.list}
7 | \alias{grab_match.data.frame}
8 | \title{Get Elements Matching Between 2 Points}
9 | \usage{
10 | grab_match(x, from = NULL, to = NULL, from.n = 1, to.n = 1, ...)
11 |
12 | \method{grab_match}{character}(x, from = NULL, to = NULL, from.n = 1, to.n = 1, ...)
13 |
14 | \method{grab_match}{list}(x, from = NULL, to = NULL, from.n = 1, to.n = 1, ...)
15 |
16 | \method{grab_match}{data.frame}(
17 | x,
18 | from = NULL,
19 | to = NULL,
20 | from.n = 1,
21 | to.n = 1,
22 | text.var = TRUE,
23 | ...
24 | )
25 | }
26 | \arguments{
27 | \item{x}{A character vector, \code{\link[base]{data.frame}}, or list.}
28 |
29 | \item{from}{A regex to start getting from (if \code{NULL} defaults to the
30 | first element/row).}
31 |
32 | \item{to}{A regex to get up to (if \code{NULL} defaults to the last element/row).}
33 |
34 | \item{from.n}{If more than one element matches \code{from} this dictates
35 | which one should be used. Must be an integer up to the number of possible
36 | matches, \code{'first'} (equal to \code{1}), \code{'last'} (the last match
37 | possible), or \code{'n'} (the same as \code{'last'}).}
38 |
39 | \item{to.n}{If more than one element matches \code{to} this dictates
40 | which one should be used. Must be an integer up to the number of possible
41 | matches, \code{'first'} (equal to \code{1}), \code{'last'} (the last match
42 | possible), or \code{'n'} (the same as \code{'last'}).}
43 |
44 | \item{text.var}{The name of the text variable with matches. If \code{TRUE}
45 | \code{grab_match} tries to detect the text column.}
46 |
47 | \item{\ldots}{Other arguments passed to \code{\link[base]{grep}}, most notable
48 | is \code{ignore.case}.}
49 | }
50 | \value{
51 | Returns a subset of the original data set.
52 | }
53 | \description{
54 | Use regexes to get all the elements between two points.
55 | }
56 | \examples{
57 | grab_match(DATA$state, from = 'dumb', to = 'liar')
58 | grab_match(DATA$state, from = 'dumb')
59 | grab_match(DATA$state, to = 'liar')
60 | grab_match(DATA$state, from = 'no', to = 'the', ignore.case = TRUE)
61 | grab_match(DATA$state, from = 'no', to = 'the', ignore.case = TRUE,
62 | from.n = 'first', to.n = 'last')
63 | grab_match(as.list(DATA$state), from = 'dumb', to = 'liar')
64 |
65 | ## Data.frame: attempts to find text.var
66 | grab_match(DATA, from = 'dumb', to = 'liar')
67 | }
68 |
--------------------------------------------------------------------------------
/man/hamlet.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/textshape-package.R
3 | \docType{data}
4 | \name{hamlet}
5 | \alias{hamlet}
6 | \title{Hamlet (Complete & Split by Sentence)}
7 | \format{
8 | A data frame with 2007 rows and 7 variables
9 | }
10 | \usage{
11 | data(hamlet)
12 | }
13 | \description{
14 | A dataset containing the complete dialogue of Hamlet with turns of talk split
15 | into sentences.
16 | }
17 | \details{
18 | \itemize{
19 | \item act. The act (akin to repeated measures)
20 | \item tot. The turn of talk
21 | \item scene. The scene (nested within an act)
22 | \item location. Location of the scene
23 | \item person. Character in the play
24 | \item died. Logical coded death variable; if \code{TRUE} the character dies
25 | in the play
26 | \item dialogue. The spoken dialogue
27 | }
28 | }
29 | \references{
30 | http://www.gutenberg.org
31 | }
32 | \keyword{datasets}
33 |
--------------------------------------------------------------------------------
/man/mtabulate.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/mtabulate.R
3 | \name{mtabulate}
4 | \alias{mtabulate}
5 | \alias{as_list}
6 | \title{Tabulate Frequency Counts for Multiple Vectors}
7 | \usage{
8 | mtabulate(vects)
9 |
10 | as_list(mat, nm = rownames(mat))
11 | }
12 | \arguments{
13 | \item{vects}{A \code{\link[base]{vector}}, \code{\link[base]{list}}, or
14 | \code{\link[base]{data.frame}} of named/unnamed vectors.}
15 |
16 | \item{mat}{A matrix of counts.}
17 |
18 | \item{nm}{A character vector of names to assign to the list.}
19 | }
20 | \value{
21 | \code{mtabulate} - Returns a \code{\link[base]{data.frame}} with
22 | columns equal to number of unique elements and the number of rows equal to
23 | the original length of the \code{\link[base]{vector}},
24 | \code{\link[base]{list}}, or \code{\link[base]{data.frame}} (length equals
25 | number of columns in \code{\link[base]{data.frame}}). If list of vectors is
26 | named these will be the rownames of the dataframe.
27 |
28 | \code{as_list} - Returns a list of elements.
29 | }
30 | \description{
31 | \code{mtabulate} - Similar to \code{\link[base]{tabulate}} that works on
32 | multiple vectors.
33 |
34 | \code{as_list} - Convert a count matrix to a named list of elements. The
35 | semantic inverse of \code{mtabulate}.
36 | }
37 | \examples{
38 | mtabulate(list(w=letters[1:10], x=letters[1:5], z=letters))
39 | mtabulate(list(mtcars$cyl[1:10]))
40 |
41 | ## Dummy coding
42 | mtabulate(mtcars$cyl[1:10])
43 | mtabulate(CO2[, "Plant"])
44 |
45 | dat <- data.frame(matrix(sample(c("A", "B"), 30, TRUE), ncol=3))
46 | mtabulate(dat)
47 | as_list(mtabulate(dat))
48 | t(mtabulate(dat))
49 | as_list(t(mtabulate(dat)))
50 | }
51 | \references{
52 | \url{https://stackoverflow.com/a/9961324/1000343}
53 | }
54 | \seealso{
55 | \code{\link[base]{tabulate}}
56 | }
57 | \author{
58 | Joran Elias and Tyler Rinker.
59 | }
60 | \keyword{frequency}
61 | \keyword{tabulate}
62 |
--------------------------------------------------------------------------------
/man/simple_dtm.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/textshape-package.R
3 | \docType{data}
4 | \name{simple_dtm}
5 | \alias{simple_dtm}
6 | \title{Simple \code{\link[tm]{DocumentTermMatrix}}}
7 | \format{
8 | A list with 6 elements
9 | }
10 | \usage{
11 | data(simple_dtm)
12 | }
13 | \description{
14 | A dataset containing a simple \code{\link[tm]{DocumentTermMatrix}}.
15 | }
16 | \details{
17 | \describe{
18 | \item{i}{The document locations}
19 | \item{j}{The term locations}
20 | \item{v}{The count of terms for that particular element position}
21 | \item{nrow}{The number of rows}
22 | \item{ncol}{The number of columns}
23 | \item{dimnames}{document and terms}
24 | }
25 | }
26 | \keyword{datasets}
27 |
--------------------------------------------------------------------------------
/man/split_index.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/split_index.R
3 | \name{split_index}
4 | \alias{split_index}
5 | \alias{split_index.list}
6 | \alias{split_index.data.frame}
7 | \alias{split_index.matrix}
8 | \alias{split_index.numeric}
9 | \alias{split_index.factor}
10 | \alias{split_index.character}
11 | \alias{split_index.default}
12 | \title{Split Data Forms at Specified Indices}
13 | \usage{
14 | split_index(
15 | x,
16 | indices = if (is.atomic(x)) {
17 | NULL
18 | } else {
19 | change_index(x)
20 | },
21 | names = NULL,
22 | ...
23 | )
24 |
25 | \method{split_index}{list}(x, indices, names = NULL, ...)
26 |
27 | \method{split_index}{data.frame}(x, indices, names = NULL, ...)
28 |
29 | \method{split_index}{matrix}(x, indices, names = NULL, ...)
30 |
31 | \method{split_index}{numeric}(x, indices = change_index(x), names = NULL, ...)
32 |
33 | \method{split_index}{factor}(x, indices = change_index(x), names = NULL, ...)
34 |
35 | \method{split_index}{character}(x, indices = change_index(x), names = NULL, ...)
36 |
37 | \method{split_index}{default}(x, indices = change_index(x), names = NULL, ...)
38 | }
39 | \arguments{
40 | \item{x}{A data form (\code{list}, \code{vector}, \code{data.frame},
41 | \code{matrix}).}
42 |
43 | \item{indices}{A vector of integer indices to split at. If \code{indices}
44 | contains the index 1, it will be silently dropped. The default value when
45 | \code{x} evaluates to \code{TRUE} for \code{\link[base]{is.atomic}} is to use
46 | \code{\link[textshape]{change_index}(x)}.}
47 |
48 | \item{names}{Optional vector of names to give to the list elements.}
49 |
50 | \item{\ldots}{Ignored.}
51 | }
52 | \value{
53 | Returns a list of data forms broken at the \code{indices}.
54 | }
55 | \description{
56 | Split data forms at specified integer indices.
57 | }
58 | \note{
59 | Two dimensional objects will retain dimension (i.e., \code{drop = FALSE}
60 | is used).
61 | }
62 | \examples{
63 | ## character
64 | split_index(LETTERS, c(4, 10, 16))
65 | split_index(LETTERS, c(4, 10, 16), c("dog", "cat", "chicken", "rabbit"))
66 |
67 | ## numeric
68 | split_index(1:100, c(33, 66))
69 |
70 | ## factor
71 | (p_chng <- change_index(CO2[["Plant"]]))
72 | split_index(CO2[["Plant"]], p_chng)
73 | #`change_index` was unnecessary as it is the default of atomic vectors
74 | split_index(CO2[["Plant"]])
75 |
76 | ## list
77 | split_index(as.list(LETTERS), c(4, 10, 16))
78 |
79 | ## data.frame
80 | (vs_change <- change_index(mtcars[["vs"]]))
81 | split_index(mtcars, vs_change)
82 |
83 | ## matrix
84 | (mat <- matrix(1:50, nrow=10))
85 | split_index(mat, c(3, 6, 10))
86 | }
87 | \seealso{
88 | \code{\link[textshape]{change_index}}
89 | }
90 |
--------------------------------------------------------------------------------
/man/split_match.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/split_match.R
3 | \name{split_match}
4 | \alias{split_match}
5 | \alias{split_match_regex}
6 | \title{Split a Vector By Split Points}
7 | \usage{
8 | split_match(x, split = "", include = FALSE, regex = FALSE, ...)
9 |
10 | split_match_regex(x, split = "", include = FALSE, ...)
11 | }
12 | \arguments{
13 | \item{x}{A vector with split points.}
14 |
15 | \item{split}{A vector of places (elements) to split on or a regular
16 | expression if \code{regex} argument is \code{TRUE}.}
17 |
18 | \item{include}{An integer of \code{1} (\code{split} character(s) are not
19 | included in the output), \code{2} (\code{split} character(s) are included at
20 | the beginning of the output), or \code{3} (\code{split} character(s) are
21 | included at the end of the output).}
22 |
23 | \item{regex}{logical. If \code{TRUE} regular expressions will be enabled for
24 | \code{split} argument.}
25 |
26 | \item{\ldots}{other arguments passed to \code{\link[base]{grep}} and
27 | \code{\link[base]{grepl}}.}
28 | }
29 | \value{
30 | Returns a list of vectors.
31 | }
32 | \description{
33 | \code{split_match} - Splits a \code{vector} into a list of vectors based on
34 | split points.
35 |
36 | \code{split_match_regex} - \code{split_match} with \code{regex = TRUE}.
37 | }
38 | \examples{
39 | set.seed(15)
40 | x <- sample(c("", LETTERS[1:10]), 25, TRUE, prob=c(.2, rep(.08, 10)))
41 |
42 | split_match(x)
43 | split_match(x, "C")
44 | split_match(x, c("", "C"))
45 |
46 | split_match(x, include = 0)
47 | split_match(x, include = 1)
48 | split_match(x, include = 2)
49 |
50 | set.seed(15)
51 | x <- sample(1:11, 25, TRUE, prob=c(.2, rep(.08, 10)))
52 | split_match(x, 1)
53 | }
54 | \references{
55 | \url{https://stackoverflow.com/a/24319217/1000343}
56 | }
57 | \author{
58 | Matthew Flickinger and Tyler Rinker.
59 | }
60 |
--------------------------------------------------------------------------------
/man/split_portion.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/split_portion.R
3 | \name{split_portion}
4 | \alias{split_portion}
5 | \title{Break Text Into Ordered Word Chunks}
6 | \usage{
7 | split_portion(
8 | text.var,
9 | grouping.var = NULL,
10 | n.words,
11 | n.chunks,
12 | as.string = TRUE,
13 | rm.unequal = FALSE,
14 | as.table = TRUE,
15 | ...
16 | )
17 | }
18 | \arguments{
19 | \item{text.var}{The text variable}
20 |
21 | \item{grouping.var}{The grouping variables. Default \code{NULL} generates
22 | one word list for all text. Also takes a single grouping variable or a list
23 | of 1 or more grouping variables.}
24 |
25 | \item{n.words}{An integer specifying the number of words in each chunk (must
26 | specify n.chunks or n.words).}
27 |
28 | \item{n.chunks}{An integer specifying the number of chunks (must specify
29 | n.chunks or n.words).}
30 |
31 | \item{as.string}{logical. If \code{TRUE} the chunks are returned as a single
32 | string. If \code{FALSE} the chunks are returned as a vector of single words.}
33 |
34 | \item{rm.unequal}{logical. If \code{TRUE} final chunks that are unequal in
35 | length to the other chunks are removed.}
36 |
37 | \item{as.table}{logical. If \code{TRUE} the list output is coerced to
38 | \code{\link[data.table]{data.table}} or \pkg{tibble}.}
39 |
40 | \item{\ldots}{Ignored.}
41 | }
42 | \value{
43 | Returns a list or \code{\link[data.table]{data.table}} of text chunks.
44 | }
45 | \description{
46 | Some visualizations and algorithms require text to be broken into chunks of
47 | ordered words. \code{split_portion} breaks text, optionally by grouping
48 | variables, into equal chunks. The chunk size can be specified by giving
49 | number of words to be in each chunk or the number of chunks.
50 | }
51 | \examples{
52 | with(DATA, split_portion(state, n.chunks = 10))
53 | with(DATA, split_portion(state, n.words = 10))
54 | with(DATA, split_portion(state, n.chunks = 10, as.string=FALSE))
55 | with(DATA, split_portion(state, n.chunks = 10, rm.unequal=TRUE))
56 | with(DATA, split_portion(state, person, n.chunks = 10))
57 | with(DATA, split_portion(state, list(sex, adult), n.words = 10))
58 | with(DATA, split_portion(state, person, n.words = 10, rm.unequal=TRUE))
59 |
60 | ## Bigger data
61 | with(hamlet, split_portion(dialogue, person, n.chunks = 10))
62 | with(hamlet, split_portion(dialogue, list(act, scene, person), n.chunks = 10))
63 | with(hamlet, split_portion(dialogue, person, n.words = 300))
64 | with(hamlet, split_portion(dialogue, list(act, scene, person), n.words = 300))
65 | }
66 | \keyword{chunks}
67 | \keyword{group}
68 | \keyword{text}
69 |
--------------------------------------------------------------------------------
/man/split_run.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/split_run.R
3 | \name{split_run}
4 | \alias{split_run}
5 | \alias{split_run.default}
6 | \alias{split_run.data.frame}
7 | \title{Split Runs}
8 | \usage{
9 | split_run(x, ...)
10 |
11 | \method{split_run}{default}(x, ...)
12 |
13 | \method{split_run}{data.frame}(x, text.var = TRUE, ...)
14 | }
15 | \arguments{
16 | \item{x}{A \code{\link[base]{data.frame}} or character vector with runs.}
17 |
18 | \item{text.var}{The name of the text variable with runs. If \code{TRUE}
19 | \code{split_run} tries to detect the text column with runs.}
20 |
21 | \item{\ldots}{Ignored.}
22 | }
23 | \value{
24 | Returns a list of vectors of runs or an expanded
25 | \code{\link[data.table]{data.table}} with runs split apart.
26 | }
27 | \description{
28 | Split runs of consecutive characters.
29 | }
30 | \examples{
31 | x1 <- c(
32 | "122333444455555666666",
33 | NA,
34 | "abbcccddddeeeeeffffff",
35 | "sddfg",
36 | "11112222333"
37 | )
38 |
39 | x <- c(rep(x1, 2), ">>???,,,,....::::;[[")
40 |
41 | split_run(x)
42 |
43 |
44 | DATA[["run.col"]] <- x
45 | split_run(DATA, "run.col")
46 | }
47 |
--------------------------------------------------------------------------------
/man/split_sentence.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/split_sentence.R
3 | \name{split_sentence}
4 | \alias{split_sentence}
5 | \alias{split_sentence.default}
6 | \alias{split_sentence.data.frame}
7 | \title{Split Sentences}
8 | \usage{
9 | split_sentence(x, ...)
10 |
11 | \method{split_sentence}{default}(x, ...)
12 |
13 | \method{split_sentence}{data.frame}(x, text.var = TRUE, ...)
14 | }
15 | \arguments{
16 | \item{x}{A \code{\link[base]{data.frame}} or character vector with sentences.}
17 |
18 | \item{text.var}{The name of the text variable. If \code{TRUE}
19 | \code{split_sentence} tries to detect the column with sentences.}
20 |
21 | \item{\ldots}{Ignored.}
22 | }
23 | \value{
24 | Returns a list of vectors of sentences or an expanded
25 | \code{\link[base]{data.frame}} with sentences split apart.
26 | }
27 | \description{
28 | Split sentences.
29 | }
30 | \examples{
31 | (x <- c(paste0(
32 | "Mr. Brown comes! He says hello. i give him coffee. i will ",
33 | "go at 5 p. m. eastern time. Or somewhere in between!go there"
34 | ),
35 | paste0(
36 | "Marvin K. Mooney Will You Please Go Now!", "The time has come.",
37 | "The time has come. The time is now. Just go. Go. GO!",
38 | "I don't care how."
39 | )))
40 | split_sentence(x)
41 |
42 | data(DATA)
43 | split_sentence(DATA)
44 |
45 | \dontrun{
46 | ## Kevin S. Dias' sentence boundary disambiguation test set
47 | data(golden_rules)
48 | library(magrittr)
49 |
50 | golden_rules \%$\%
51 | split_sentence(Text)
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/man/split_sentence_token.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/split_sentence_token.R
3 | \name{split_sentence_token}
4 | \alias{split_sentence_token}
5 | \alias{split_sentence_token.default}
6 | \alias{split_sentence_token.data.frame}
7 | \title{Split Sentences & Tokens}
8 | \usage{
9 | split_sentence_token(x, ...)
10 |
11 | \method{split_sentence_token}{default}(x, lower = TRUE, ...)
12 |
13 | \method{split_sentence_token}{data.frame}(x, text.var = TRUE, lower = TRUE, ...)
14 | }
15 | \arguments{
16 | \item{x}{A \code{\link[base]{data.frame}} or character vector with sentences.}
17 |
18 | \item{lower}{logical. If \code{TRUE} the words are converted to lower case.}
19 |
20 | \item{text.var}{The name of the text variable. If \code{TRUE}
21 | \code{split_sentence_token} tries to detect the column with sentences.}
22 |
23 | \item{\ldots}{Ignored.}
24 | }
25 | \value{
26 | Returns a list of vectors of sentences or an expanded
27 | \code{\link[base]{data.frame}} with sentences split apart.
28 | }
29 | \description{
30 | Split sentences and tokens.
31 | }
32 | \examples{
33 | (x <- c(paste0(
34 | "Mr. Brown comes! He says hello. i give him coffee. i will ",
35 | "go at 5 p. m. eastern time. Or somewhere in between!go there"
36 | ),
37 | paste0(
38 | "Marvin K. Mooney Will You Please Go Now!", "The time has come.",
39 | "The time has come. The time is now. Just go. Go. GO!",
40 | "I don't care how."
41 | )))
42 | split_sentence_token(x)
43 |
44 | data(DATA)
45 | split_sentence_token(DATA)
46 |
47 | \dontrun{
48 | ## Kevin S. Dias' sentence boundary disambiguation test set
49 | data(golden_rules)
50 | library(magrittr)
51 |
52 | golden_rules \%$\%
53 | split_sentence_token(Text)
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/man/split_speaker.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/split_speaker.R
3 | \name{split_speaker}
4 | \alias{split_speaker}
5 | \title{Break and Stretch if Multiple Persons per Cell}
6 | \usage{
7 | split_speaker(dataframe, speaker.var = 1, sep = c("and", "&", ","), ...)
8 | }
9 | \arguments{
10 | \item{dataframe}{A dataframe that contains the person variable.}
11 |
12 | \item{speaker.var}{The person variable to be stretched.}
13 |
14 | \item{sep}{The separator(s) to search for and break on. Default is:
15 | c("and", "&", ",")}
16 |
17 | \item{\ldots}{Ignored.}
18 | }
19 | \value{
20 | Returns an expanded dataframe with person variable stretched and
21 | accompanying rows repeated.
22 | }
23 | \description{
24 | Look for cells with multiple people and create separate rows for each person.
25 | }
26 | \examples{
27 | \dontrun{
28 | DATA$person <- as.character(DATA$person)
29 | DATA$person[c(1, 4, 6)] <- c("greg, sally, & sam",
30 | "greg, sally", "sam and sally")
31 |
32 | split_speaker(DATA)
33 |
34 | DATA$person[c(1, 4, 6)] <- c("greg_sally_sam",
35 | "greg.sally", "sam; sally")
36 |
37 | split_speaker(DATA, sep = c(".", "_", ";"))
38 |
39 | DATA <- textshape::DATA #reset DATA
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/man/split_token.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/split_token.R
3 | \name{split_token}
4 | \alias{split_token}
5 | \alias{split_token.default}
6 | \alias{split_token.data.frame}
7 | \title{Split Tokens}
8 | \usage{
9 | split_token(x, ...)
10 |
11 | \method{split_token}{default}(x, lower = TRUE, ...)
12 |
13 | \method{split_token}{data.frame}(x, text.var = TRUE, lower = TRUE, ...)
14 | }
15 | \arguments{
16 | \item{x}{A \code{\link[base]{data.frame}} or character vector with tokens.}
17 |
18 | \item{lower}{logical. If \code{TRUE} the words are converted to lower case.}
19 |
20 | \item{text.var}{The name of the text variable. If \code{TRUE}
21 | \code{split_token} tries to detect the text column with tokens.}
22 |
23 | \item{\ldots}{Ignored.}
24 | }
25 | \value{
26 | Returns a list of vectors of tokens or an expanded
27 | \code{\link[data.table]{data.table}} with tokens split apart.
28 | }
29 | \description{
30 | Split tokens.
31 | }
32 | \examples{
33 | (x <- c(
34 | "Mr. Brown comes! He says hello. i give him coffee.",
35 | "I'll go at 5 p. m. eastern time. Or somewhere in between!",
36 | "go there"
37 | ))
38 | split_token(x)
39 | split_token(x, lower=FALSE)
40 |
41 | data(DATA)
42 | split_token(DATA)
43 | split_token(DATA, lower=FALSE)
44 |
45 | ## Larger data set
46 | split_token(hamlet)
47 | }
48 |
--------------------------------------------------------------------------------
/man/split_transcript.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/split_transcript.R
3 | \name{split_transcript}
4 | \alias{split_transcript}
5 | \title{Split a Transcript Style Vector on Delimiter & Coerce to Dataframe}
6 | \usage{
7 | split_transcript(
8 | x,
9 | delim = ":",
10 | colnames = c("person", "dialogue"),
11 | max.delim = 15,
12 | ...
13 | )
14 | }
15 | \arguments{
16 | \item{x}{A transcript style vector (e.g., \code{c("greg: Who me", "sarah: yes you!")}).}
17 |
18 | \item{delim}{The delimiter to split on.}
19 |
20 | \item{colnames}{The column names to use for the \code{\link[data.table]{data.table}}
21 | output.}
22 |
23 | \item{max.delim}{An integer stating how many characters may come before a
24 | delimiter is found. This is useful for the case when a colon is the delimiter
25 | but time stamps are also found in the text.}
26 |
27 | \item{\ldots}{Ignored.}
28 | }
29 | \value{
30 | Returns a 2 column \code{\link[data.table]{data.table}}.
31 | }
32 | \description{
33 | Split a transcript style vector (e.g., \code{c("greg: Who me", "sarah: yes you!")})
34 | into a name and dialogue vector that is coerced to a \code{\link[data.table]{data.table}}.
35 | Leading/trailing white space in the columns is stripped out.
36 | }
37 | \examples{
38 | split_transcript(c("greg: Who me", "sarah: yes you!"))
39 |
40 | \dontrun{
41 | ## 2015 Vice-Presidential Debates Example
42 | if (!require("pacman")) install.packages("pacman")
43 | pacman::p_load(rvest, magrittr, xml2)
44 |
45 | debates <- c(
46 | wisconsin = "110908",
47 | boulder = "110906",
48 | california = "110756",
49 | ohio = "110489"
50 | )
51 |
52 | lapply(debates, function(x){
53 | xml2::read_html(paste0("http://www.presidency.ucsb.edu/ws/index.php?pid=", x)) \%>\%
54 | rvest::html_nodes("p") \%>\%
55 | rvest::html_text() \%>\%
56 | textshape::split_index(grep("^[A-Z]+:", .)) \%>\%
57 | textshape::combine() \%>\%
58 | textshape::split_transcript() \%>\%
59 | textshape::split_sentence()
60 | })
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/man/split_word.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/split_word.R
3 | \name{split_word}
4 | \alias{split_word}
5 | \alias{split_word.default}
6 | \alias{split_word.data.frame}
7 | \title{Split Words}
8 | \usage{
9 | split_word(x, ...)
10 |
11 | \method{split_word}{default}(x, lower = TRUE, ...)
12 |
13 | \method{split_word}{data.frame}(x, text.var = TRUE, lower = TRUE, ...)
14 | }
15 | \arguments{
16 | \item{x}{A \code{\link[base]{data.frame}} or character vector with words.}
17 |
18 | \item{lower}{logical. If \code{TRUE} the words are converted to lower case.}
19 |
20 | \item{text.var}{The name of the text variable. If \code{TRUE}
21 | \code{split_word} tries to detect the text column with words.}
22 |
23 | \item{\ldots}{Ignored.}
24 | }
25 | \value{
26 | Returns a list of vectors of words or an expanded
27 | \code{\link[data.table]{data.table}} with words split apart.
28 | }
29 | \description{
30 | Split words.
31 | }
32 | \examples{
33 | (x <- c(
34 | "Mr. Brown comes! He says hello. i give him coffee.",
35 | "I'll go at 5 p. m. eastern time. Or somewhere in between!",
36 | "go there"
37 | ))
38 | split_word(x)
39 | split_word(x, lower=FALSE)
40 |
41 | data(DATA)
42 | split_word(DATA)
43 | split_word(DATA, lower=FALSE)
44 |
45 | ## Larger data set
46 | split_word(hamlet)
47 | }
48 |
--------------------------------------------------------------------------------
/man/textshape.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/textshape-package.R
3 | \docType{package}
4 | \name{textshape}
5 | \alias{textshape}
6 | \alias{package-textshape}
7 | \title{Tools for Reshaping Text}
8 | \description{
9 | Tools that can be used to reshape and restructure text data.
10 | }
11 |
--------------------------------------------------------------------------------
/man/tidy_colo_dtm.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/tidy_colo_dtm.R
3 | \name{tidy_colo_tdm}
4 | \alias{tidy_colo_tdm}
5 | \alias{tidy_colo_dtm}
6 | \title{Convert a
7 | \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}
8 | into Collocating Words in Tidy Form}
9 | \usage{
10 | tidy_colo_tdm(x, ...)
11 |
12 | tidy_colo_dtm(x, ...)
13 | }
14 | \arguments{
15 | \item{x}{A
16 | \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}.}
17 |
18 | \item{\ldots}{Ignored.}
19 | }
20 | \value{
21 | Returns a tidied data.frame.
22 | }
23 | \description{
24 | Converts non-zero elements of a
25 | \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}
26 | into a tidy data set made of collocating words.
27 | }
28 | \examples{
29 | data(simple_dtm)
30 |
31 | tidied <- tidy_colo_dtm(simple_dtm)
32 | tidied
33 | unique_pairs(tidied)
34 |
35 | \dontrun{
36 | if (!require("pacman")) install.packages("pacman")
37 | pacman::p_load_current_gh('trinker/gofastr', 'trinker/lexicon')
38 | pacman::p_load(tidyverse, magrittr, ggstance)
39 |
40 | my_dtm <- with(
41 | presidential_debates_2012,
42 | q_dtm(dialogue, paste(time, tot, sep = "_"))
43 | )
44 |
45 | tidy_colo_dtm(my_dtm) \%>\%
46 | tbl_df() \%>\%
47 | filter(!term_1 \%in\% c('i', lexicon::sw_onix) &
48 | !term_2 \%in\% lexicon::sw_onix
49 | ) \%>\%
50 | filter(term_1 != term_2) \%>\%
51 | unique_pairs() \%>\%
52 | filter(n > 15) \%>\%
53 | complete(term_1, term_2, fill = list(n = 0)) \%>\%
54 | ggplot(aes(x = term_1, y = term_2, fill = n)) +
55 | geom_tile() +
56 | scale_fill_gradient(low= 'white', high = 'red') +
57 | theme(axis.text.x = element_text(angle = 45, hjust = 1))
58 | }
59 | }
60 | \seealso{
61 | \code{\link[textshape]{unique_pairs}}
62 | }
63 |
--------------------------------------------------------------------------------
/man/tidy_dtm.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/tidy_dtm.R
3 | \name{tidy_dtm}
4 | \alias{tidy_dtm}
5 | \alias{tidy_tdm}
6 | \title{Convert a
7 | \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}
8 | into Tidy Form}
9 | \usage{
10 | tidy_dtm(x, ...)
11 |
12 | tidy_tdm(x, ...)
13 | }
14 | \arguments{
15 | \item{x}{A
16 | \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}.}
17 |
18 | \item{\ldots}{ignored.}
19 | }
20 | \value{
21 | Returns a tidied data.frame.
22 | }
23 | \description{
24 | Converts non-zero elements of a
25 | \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}
26 | into a tidy data set.
27 | }
28 | \examples{
29 | data(simple_dtm)
30 |
31 | tidy_dtm(simple_dtm)
32 |
33 | \dontrun{
34 | if (!require("pacman")) install.packages("pacman")
35 | pacman::p_load_current_gh('trinker/gofastr')
36 | pacman::p_load(tidyverse, magrittr, ggstance)
37 |
38 | my_dtm <- with(
39 | presidential_debates_2012,
40 | q_dtm(dialogue, paste(time, tot, sep = "_"))
41 | )
42 |
43 | tidy_dtm(my_dtm) \%>\%
44 | tidyr::extract(
45 | col = doc,
46 | into = c("time", "turn", "sentence"),
47 | regex = "(\\\\d)_(\\\\d+)\\\\.(\\\\d+)"
48 | ) \%>\%
49 | mutate(
50 | time = as.numeric(time),
51 | turn = as.numeric(turn),
52 | sentence = as.numeric(sentence)
53 | ) \%>\%
54 |     tibble::as_tibble() \%T>\%
55 | print() \%>\%
56 | group_by(time, term) \%>\%
57 | summarize(n = sum(n)) \%>\%
58 | group_by(time) \%>\%
59 | arrange(desc(n)) \%>\%
60 | slice(1:10) \%>\%
61 | ungroup() \%>\%
62 | mutate(
63 | term = factor(paste(term, time, sep = "__"),
64 | levels = rev(paste(term, time, sep = "__")))
65 | ) \%>\%
66 | ggplot(aes(x = n, y = term)) +
67 | geom_barh(stat='identity') +
68 | facet_wrap(~time, ncol=2, scales = 'free_y') +
69 | scale_y_discrete(labels = function(x) gsub("__.+$", "", x))
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/man/tidy_list.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/tidy_list.R
3 | \name{tidy_list}
4 | \alias{tidy_list}
5 | \title{Tidy a List of Named Dataframes or Named Vectors or Vectors}
6 | \usage{
7 | tidy_list(
8 | x,
9 | id.name = "id",
10 | content.name = "content",
11 | content.attribute.name = "attribute",
12 | ...
13 | )
14 | }
15 | \arguments{
16 | \item{x}{A named \code{\link[base]{list}} of
17 | \code{\link[base]{data.frame}}s or \code{\link[base]{vector}}.}
18 |
19 | \item{id.name}{The name to use for the column created from the
20 | \code{\link[base]{list}}.}
21 |
22 | \item{content.name}{The name to use for the column created from the
23 | \code{\link[base]{list}} of \code{\link[base]{vector}}s (only used if
24 | \code{x} is \code{\link[base]{vector}}).}
25 |
26 | \item{content.attribute.name}{The name to use for the column created from the
27 | \code{\link[base]{list}} of names given to the \code{\link[base]{vector}}s
28 | (only used if \code{x} is named \code{\link[base]{vector}}).}
29 |
30 | \item{\ldots}{Ignored.}
31 | }
32 | \value{
33 | Returns a \code{\link[data.table]{data.table}} with the
34 | \code{\link[base]{names}} from the \code{\link[base]{list}} as an \code{id}
35 | column.
36 | }
37 | \description{
38 | \code{\link[base]{rbind}} a named \code{\link[base]{list}} of
39 | \code{\link[base]{data.frame}}s or \code{\link[base]{vector}}s to
40 | output a single \code{\link[base]{data.frame}} with the
41 | \code{\link[base]{names}} from the \code{\link[base]{list}} as an \code{id}
42 | column.
43 | }
44 | \examples{
45 | tidy_list(list(p=1:500, r=letters))
46 | tidy_list(list(p=mtcars, r=mtcars, z=mtcars, d=mtcars))
47 |
48 | x <- list(
49 | a = setNames(c(1:4), LETTERS[1:4]),
50 | b = setNames(c(7:9), LETTERS[7:9]),
51 | c = setNames(c(10:15), LETTERS[10:15]),
52 | d = c(x=4, y=6, 4),
53 | e = setNames(1:10, sample(state.abb, 10, TRUE)),
54 | f = setNames(1:10, sample(month.abb, 10, TRUE))
55 | )
56 |
57 | tidy_list(x)
58 |
59 | \dontrun{
60 | ## 2015 Vice-Presidential Debates Example
61 | if (!require("pacman")) install.packages("pacman")
62 | pacman::p_load(rvest, magrittr, xml2)
63 |
64 | debates <- c(
65 | wisconsin = "110908",
66 | boulder = "110906",
67 | california = "110756",
68 | ohio = "110489"
69 | )
70 |
71 | lapply(debates, function(x){
72 | paste0("http://www.presidency.ucsb.edu/ws/index.php?pid=", x) \%>\%
73 | xml2::read_html() \%>\%
74 | rvest::html_nodes("p") \%>\%
75 | rvest::html_text() \%>\%
76 | textshape::split_index(grep("^[A-Z]+:", .)) \%>\%
77 | textshape::combine() \%>\%
78 | textshape::split_transcript() \%>\%
79 | textshape::split_sentence()
80 | }) \%>\%
81 | textshape::tidy_list("location")
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/man/tidy_matrix.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/tidy_matrix.R
3 | \name{tidy_matrix}
4 | \alias{tidy_matrix}
5 | \alias{tidy_adjacency_matrix}
6 | \title{Convert a Matrix into Tidy Form}
7 | \usage{
8 | tidy_matrix(x, row.name = "row", col.name = "col", value.name = "value", ...)
9 |
10 | tidy_adjacency_matrix(x, ...)
11 | }
12 | \arguments{
13 | \item{x}{A matrix.}
14 |
15 | \item{row.name}{A string to use for the row names that are now a column.}
16 |
17 | \item{col.name}{A string to use for the column names that are now a column.}
18 |
19 | \item{value.name}{A string to use for the values that are now a column.}
20 |
21 | \item{\ldots}{ignored.}
22 | }
23 | \value{
24 | Returns a tidied \code{data.frame}.
25 | }
26 | \description{
27 | \code{tidy_matrix} - Converts matrices into a tidy data set. Essentially, a
28 | stacking of the matrix columns and repeating row/column names as necessary.
29 |
30 | \code{tidy_adjacency_matrix} - A wrapper for \code{tidy_matrix} with the
31 | \code{row.name}, \code{col.name}, & \code{value.name} all set to
32 | \code{"from"}, \code{"to"}, & \code{"n"}, assuming preparation for network
33 | analysis.
34 | }
35 | \examples{
36 | mat <- matrix(1:16, nrow = 4,
37 | dimnames = list(LETTERS[1:4], LETTERS[23:26])
38 | )
39 |
40 | mat
41 | tidy_matrix(mat)
42 |
43 |
44 | data(simple_dtm)
45 | tidy_matrix(as.matrix(simple_dtm), 'doc', 'term', 'n')
46 |
47 | X <- as.matrix(simple_dtm[1:10, 1:10])
48 | tidy_adjacency_matrix(crossprod(X))
49 | tidy_adjacency_matrix(crossprod(t(X)))
50 | }
51 |
--------------------------------------------------------------------------------
/man/tidy_table.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/tidy_table.R
3 | \name{tidy_table}
4 | \alias{tidy_table}
5 | \title{Tidy a Table: Bind Its Values with Its Names}
6 | \usage{
7 | tidy_table(x, id.name = "id", content.name = "content", ...)
8 | }
9 | \arguments{
10 | \item{x}{A \code{\link[base]{table}}.}
11 |
12 | \item{id.name}{The name to use for the column created from the \code{\link[base]{table}}
13 | \code{\link[base]{names}}.}
14 |
15 | \item{content.name}{The name to use for the column created from the \code{\link[base]{table}}
16 | values.}
17 |
18 | \item{\ldots}{ignored.}
19 | }
20 | \value{
21 | Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}}
22 | from the \code{\link[base]{table}} as an \code{id} column.
23 | }
24 | \description{
25 | \code{\link[base]{cbind}} a \code{\link[base]{table}}'s values with its
26 | \code{\link[base]{names}} to form \code{id} (from the names) and
27 | \code{content} columns.
28 | }
29 | \examples{
30 | x <- table(sample(LETTERS[1:6], 1000, TRUE))
31 | tidy_table(x)
32 | }
33 |
--------------------------------------------------------------------------------
/man/tidy_vector.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/tidy_vector.R
3 | \name{tidy_vector}
4 | \alias{tidy_vector}
5 | \title{Tidy a Named Atomic Vector: Bind Its Values with Its Names}
6 | \usage{
7 | tidy_vector(x, id.name = "id", content.name = "content", ...)
8 | }
9 | \arguments{
10 | \item{x}{A named atomic \code{\link[base]{vector}}.}
11 |
12 | \item{id.name}{The name to use for the column created from the \code{\link[base]{vector}}
13 | \code{\link[base]{names}}.}
14 |
15 | \item{content.name}{The name to use for the column created from the \code{\link[base]{vector}}
16 | values.}
17 |
18 | \item{\ldots}{ignored.}
19 | }
20 | \value{
21 | Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}}
22 | from the \code{\link[base]{vector}} as an \code{id} column.
23 | }
24 | \description{
25 | \code{\link[base]{cbind}} a named atomic \code{\link[base]{vector}}'s values
26 | with its \code{\link[base]{names}} to form \code{id} (from the names) and
27 | \code{content} columns.
28 | }
29 | \examples{
30 | x <- setNames(sample(LETTERS[1:6], 1000, TRUE), sample(state.name[1:5], 1000, TRUE))
31 | tidy_vector(x)
32 | }
33 |
--------------------------------------------------------------------------------
/man/unique_pairs.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/unique_pairs.R
3 | \name{unique_pairs}
4 | \alias{unique_pairs}
5 | \alias{unique_pairs.default}
6 | \alias{unique_pairs.data.table}
7 | \title{Extract Only Unique Pairs of Collocating Words in
8 | \code{\link[textshape]{tidy_colo_dtm}}}
9 | \usage{
10 | unique_pairs(x, col1 = "term_1", col2 = "term_2", ...)
11 |
12 | \method{unique_pairs}{default}(x, col1 = "term_1", col2 = "term_2", ...)
13 |
14 | \method{unique_pairs}{data.table}(x, col1 = "term_1", col2 = "term_2", ...)
15 | }
16 | \arguments{
17 | \item{x}{A \code{\link[base]{data.frame}} with two columns that contain
18 | redundant pairs.}
19 |
20 | \item{col1}{A string naming column 1.}
21 |
22 | \item{col2}{A string naming column 2.}
23 |
24 | \item{\ldots}{ignored.}
25 | }
26 | \value{
27 | Returns a filtered \code{\link[base]{data.frame}}.
28 | }
29 | \description{
30 | \code{\link[textshape]{tidy_colo_dtm}} utilizes the entire matrix to generate
31 | the tidied data.frame. This means that the upper and lower triangles are
32 | used redundantly. This function eliminates this redundancy by dropping one
33 | set of the pairs from a tidied data.frame.
34 | }
35 | \examples{
36 | dat <- data.frame(
37 | term_1 = LETTERS[1:10],
38 | term_2 = LETTERS[10:1],
39 | stringsAsFactors = FALSE
40 | )
41 |
42 | unique_pairs(dat)
43 | }
44 | \seealso{
45 | \code{\link[textshape]{tidy_colo_dtm}}
46 | }
47 |
--------------------------------------------------------------------------------
/man/unnest_text.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/unnest_text.R
3 | \name{unnest_text}
4 | \alias{unnest_text}
5 | \title{Un-nest Nested Text Columns}
6 | \usage{
7 | unnest_text(dataframe, column, integer.rownames = TRUE, ...)
8 | }
9 | \arguments{
10 | \item{dataframe}{A dataframe object.}
11 |
12 | \item{column}{Column name to search for markers/terms.}
13 |
14 | \item{integer.rownames}{logical. If \code{TRUE} then the rownames are
15 | numbered 1 through number of rows, otherwise the original row number is
16 | retained followed by a period and the element number from the list.}
17 |
18 | \item{\ldots}{ignored.}
19 | }
20 | \value{
21 | Returns an un-nested data.frame.
22 | }
23 | \description{
24 | Un-nest nested text columns in a data.frame. Attempts to locate the nested
25 | text column without specifying.
26 | }
27 | \examples{
28 | dat <- DATA
29 |
30 | ## Add a nested/list text column
31 | dat$split <- lapply(dat$state, function(x) {
32 | unlist(strsplit(x, '(?<=[?!.])\\\\s+', perl = TRUE))
33 | })
34 |
35 | unnest_text(dat)
36 | unnest_text(dat, integer.rownames = FALSE)
37 |
38 | ## Add a second nested integer column
39 | dat$d <- lapply(dat$split, nchar)
40 | \dontrun{
41 | unnest_text(dat) # causes error, must supply column explicitly
42 | }
43 | unnest_text(dat, 'split')
44 |
45 | ## As a data.table
46 | library(data.table)
47 | dt_dat <- data.table::as.data.table(data.table::copy(dat))
48 | unnest_text(dt_dat, 'split')
49 | \dontrun{
50 | unnest_text(dt_dat, 'd')
51 | }
52 |
53 | \dontrun{
54 | ## As a tibble
55 | library(tibble)
56 | t_dat <- tibble::as_tibble(dat)
57 | unnest_text(t_dat, 'split')
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library("testthat")
2 | library("textshape")
3 |
4 | test_check("textshape")
--------------------------------------------------------------------------------
/tests/testthat/test-change_index.R:
--------------------------------------------------------------------------------
1 | context("Checking change_index")
2 |
3 | test_that("change_index ...",{
4 |   # TODO(review): add real expectations for change_index()
5 |   skip("change_index tests not yet implemented")
6 | })
7 |
8 |
--------------------------------------------------------------------------------
/tests/testthat/test-mtabulate.R:
--------------------------------------------------------------------------------
1 | context("Checking mtabulate")
2 |
3 | test_that("mtabulate ...",{
4 |   # TODO(review): add real expectations for mtabulate()
5 |   skip("mtabulate tests not yet implemented")
6 | })
7 |
8 |
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-10-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-16-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-16-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-17-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-17-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-18-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-18-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-19-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-19-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-21-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-21-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-22-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-22-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-23-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-23-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-27-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-27-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-32-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-32-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-44-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-44-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-45-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-45-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-46-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-46-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-55-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-55-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-8-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-9-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-9-1.png
--------------------------------------------------------------------------------
/tools/textshape_logo/r_textshape.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/textshape_logo/r_textshape.png
--------------------------------------------------------------------------------
/tools/textshape_logo/r_textshape.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/textshape_logo/r_textshape.pptx
--------------------------------------------------------------------------------
/tools/textshape_logo/r_textshapea.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/textshape_logo/r_textshapea.png
--------------------------------------------------------------------------------
/tools/textshape_logo/resize_icon.txt:
--------------------------------------------------------------------------------
1 | cd C:\Users\Tyler\GitHub\textshape\tools\textshape_logo
2 | ffmpeg -i r_textshapea.png -vf scale=150:-1 r_textshape.png
--------------------------------------------------------------------------------