├── .github ├── .gitignore ├── workflows │ ├── test-coverage.yaml │ ├── pkgdown.yaml │ └── R-CMD-check.yaml └── CODE_OF_CONDUCT.md ├── revdep ├── README.md ├── failures.md ├── problems.md ├── .gitignore ├── email.yml └── cran.md ├── LICENSE ├── tests ├── testthat.R └── testthat │ ├── test-cor-sparse.R │ ├── test-pairwise-dist.R │ ├── test-squarely.R │ ├── test-pairwise-similarity.R │ ├── test-pairwise-cor.R │ ├── test-widely.R │ └── test-pairwise-count.R ├── R ├── globals.R ├── utils.R ├── cor_sparse.R ├── widely_kmeans.R ├── pairwise_similarity.R ├── pairwise_dist.R ├── pairwise_count.R ├── pairwise_pmi.R ├── widely_hclust.R ├── pairwise_cor.R ├── pairwise_delta.R ├── squarely.R ├── widely_svd.R └── widely.R ├── .gitignore ├── codecov.yml ├── .Rbuildignore ├── _pkgdown.yml ├── widyr.Rproj ├── cran-comments.md ├── NAMESPACE ├── man ├── cor_sparse.Rd ├── widely_kmeans.Rd ├── widely_hclust.Rd ├── pairwise_similarity.Rd ├── pairwise_pmi.Rd ├── pairwise_count.Rd ├── squarely.Rd ├── pairwise_dist.Rd ├── pairwise_cor.Rd ├── widely.Rd ├── widely_svd.Rd └── pairwise_delta.Rd ├── NEWS.md ├── DESCRIPTION ├── vignettes ├── intro.Rmd └── united_nations.Rmd ├── README.Rmd └── README.md /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /revdep/README.md: -------------------------------------------------------------------------------- 1 | # Revdeps 2 | 3 | -------------------------------------------------------------------------------- /revdep/failures.md: -------------------------------------------------------------------------------- 1 | *Wow, no problems at all. :)* -------------------------------------------------------------------------------- /revdep/problems.md: -------------------------------------------------------------------------------- 1 | *Wow, no problems at all. 
# Declare names that appear as bare column names inside the package's
# tidy-eval (dplyr/tidyr) code — e.g. item1/item2/value in the pairwise_*
# outputs and item/cluster in the widely_* clustering results — so that
# `R CMD check` does not raise "no visible binding for global variable"
# NOTEs for them.
globalVariables(c("item1", "item2", "value", "..data", "data",
                  "item", "cluster"))
#' Convert a column specification to a column name
#'
#' Vendored from tidyr. Accepts a column given as a character string, a
#' bare name/symbol (typically captured via `substitute()`), or `NULL`,
#' and returns the column name as a character scalar.
#'
#' @param x A column specification: a character string, a name/symbol,
#'   the missing argument, or `NULL`.
#' @param default Value returned when `x` is the missing argument. The
#'   default is an error, but it is evaluated lazily — only when actually
#'   returned — so callers may supply a non-error fallback.
#' @return A character scalar naming the column, `NULL` when `x` is
#'   `NULL`, or `default` when `x` is the missing argument.
#' @noRd
col_name <- function(x, default = stop("Please supply column name", call. = FALSE)) {
  if (is.character(x)) {
    return(x)
  }
  if (identical(x, quote(expr = ))) {
    # Missing argument: fall back to `default` (forced lazily here).
    return(default)
  }
  if (is.name(x)) {
    return(as.character(x))
  }
  if (is.null(x)) {
    return(NULL)
  }
  stop("Invalid column specification", call. = FALSE)
}
15 | 16 | * We saw 0 new problems 17 | * We failed to check 0 packages 18 | -------------------------------------------------------------------------------- /tests/testthat/test-cor-sparse.R: -------------------------------------------------------------------------------- 1 | context("cor_sparse") 2 | 3 | library(Matrix) 4 | 5 | test_that("cor_sparse returns the same results as cor(as.matrix(m))", { 6 | m <- Matrix(0, nrow = 100000, ncol = 6) 7 | 8 | ind <- cbind(sample(100000, 1000, replace = TRUE), 9 | sample(6, 1000, replace = TRUE)) 10 | m[ind] <- rnorm(1000) 11 | 12 | co1 <- cor(as.matrix(m)) 13 | co2 <- cor_sparse(m) 14 | 15 | expect_is(co2, "matrix") 16 | expect_is(c(co2), "numeric") 17 | expect_equal(dim(co2), c(6, 6)) 18 | expect_equal(c(co1), c(co2)) 19 | }) 20 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(pairwise_cor) 4 | export(pairwise_cor_) 5 | export(pairwise_count) 6 | export(pairwise_count_) 7 | export(pairwise_delta) 8 | export(pairwise_delta_) 9 | export(pairwise_dist) 10 | export(pairwise_dist_) 11 | export(pairwise_pmi) 12 | export(pairwise_pmi_) 13 | export(pairwise_similarity) 14 | export(pairwise_similarity_) 15 | export(squarely) 16 | export(squarely_) 17 | export(widely) 18 | export(widely_) 19 | export(widely_hclust) 20 | export(widely_kmeans) 21 | export(widely_svd) 22 | export(widely_svd_) 23 | import(Matrix) 24 | import(dplyr) 25 | importFrom(broom,tidy) 26 | importFrom(rlang,":=") 27 | -------------------------------------------------------------------------------- /tests/testthat/test-pairwise-dist.R: -------------------------------------------------------------------------------- 1 | context("pairwise_dist") 2 | 3 | suppressPackageStartupMessages(library(dplyr)) 4 | 5 | test_that("pairwise_dist computes a distance matrix", { 6 | d <- 
#' Find the Pearson correlation of a sparse matrix efficiently
#'
#' Find the Pearson correlation of a sparse matrix.
#' For a large sparse matrix this is more efficient in time and memory than
#' `cor(as.matrix(x))`. Note that it does not currently work on
#' simple_triplet_matrix objects.
#'
#' @param x A matrix, potentially a sparse matrix such as a "dgTMatrix"
#'   object. Must have at least two rows.
#'
#' @return A dense base-R matrix of pairwise Pearson correlations between
#'   the columns of `x`, with dimensions `ncol(x)` by `ncol(x)`.
#'
#' @source This code comes from mike on this Stack Overflow answer:
#' <https://stackoverflow.com/a/9626089/712603>.
cor_sparse <- function(x) {
  n <- nrow(x)
  # Correlation is undefined with fewer than two observations; fail loudly
  # instead of silently returning a matrix of NaN/Inf.
  if (n < 2) {
    stop("`x` must have at least two rows to compute correlations.",
         call. = FALSE)
  }
  # Covariance via the identity cov(X) = (X'X - n * m %o% m) / (n - 1),
  # where m = colMeans(X); crossprod() keeps the computation sparse until
  # the final (small) ncol-by-ncol matrix.
  covmat <- (as.matrix(crossprod(x)) - n * tcrossprod(colMeans(x))) / (n - 1)
  # Normalize by the outer product of the column standard deviations.
  cormat <- covmat / tcrossprod(sqrt(diag(covmat)))
  cormat
}
9 | } 10 | \usage{ 11 | cor_sparse(x) 12 | } 13 | \arguments{ 14 | \item{x}{A matrix, potentially a sparse matrix such as a "dgTMatrix" object} 15 | } 16 | \description{ 17 | Find the Pearson correlation of a sparse matrix. 18 | For large sparse matrix this is more efficient in time and memory than 19 | \code{cor(as.matrix(x))}. Note that it does not currently work on 20 | simple_triplet_matrix objects. 21 | } 22 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::covr 27 | needs: coverage 28 | 29 | - name: Test coverage 30 | run: covr::codecov(quiet = FALSE) 31 | shell: Rscript {0} 32 | -------------------------------------------------------------------------------- /tests/testthat/test-squarely.R: -------------------------------------------------------------------------------- 1 | context("squarely") 2 | 3 | suppressPackageStartupMessages(library(dplyr)) 4 | 5 | test_that("Can perform 'squarely' operations on pairs of items", { 6 | if (require("gapminder", quietly = TRUE)) { 7 | ncountries <- length(unique(gapminder$country)) 8 | 9 | closest <- gapminder %>% 10 | squarely(dist)(country, year, lifeExp) 11 | 12 | expect_equal(colnames(closest), c("item1", "item2", "value")) 13 | 14 | 
# Reference implementation of cosine similarity between two numeric vectors,
# used to validate pairwise_similarity() output in the tests below.
cosine_similarity <- function(x, y) {
  dot_product <- sum(x * y)
  norm_product <- sqrt(sum(x ^ 2)) * sqrt(sum(y ^ 2))
  dot_product / norm_product
}
-------------------------------------------------------------------------------- /man/widely_kmeans.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/widely_kmeans.R 3 | \name{widely_kmeans} 4 | \alias{widely_kmeans} 5 | \title{Cluster items based on k-means across features} 6 | \usage{ 7 | widely_kmeans(tbl, item, feature, value, k, fill = 0, ...) 8 | } 9 | \arguments{ 10 | \item{tbl}{Table} 11 | 12 | \item{item}{Item to cluster (as a bare column name)} 13 | 14 | \item{feature}{Feature column (dimension in clustering)} 15 | 16 | \item{value}{Value column} 17 | 18 | \item{k}{Number of clusters} 19 | 20 | \item{fill}{What to fill in for missing values} 21 | 22 | \item{...}{Other arguments passed on to \code{\link[=kmeans]{kmeans()}}} 23 | } 24 | \description{ 25 | Given a tidy table of features describing each item, perform k-means 26 | clustering using \code{\link[=kmeans]{kmeans()}} and retidy the data into 27 | one-row-per-cluster. 
28 | } 29 | \examples{ 30 | 31 | library(gapminder) 32 | library(dplyr) 33 | 34 | clusters <- gapminder \%>\% 35 | widely_kmeans(country, year, lifeExp, k = 5) 36 | 37 | clusters 38 | 39 | clusters \%>\% 40 | count(cluster) 41 | 42 | # Examine a few clusters 43 | clusters \%>\% filter(cluster == 1) 44 | clusters \%>\% filter(cluster == 2) 45 | 46 | } 47 | \seealso{ 48 | \code{\link[=widely_hclust]{widely_hclust()}} 49 | } 50 | -------------------------------------------------------------------------------- /man/widely_hclust.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/widely_hclust.R 3 | \name{widely_hclust} 4 | \alias{widely_hclust} 5 | \title{Cluster pairs of items into groups using hierarchical clustering} 6 | \usage{ 7 | widely_hclust(tbl, item1, item2, distance, k = NULL, h = NULL) 8 | } 9 | \arguments{ 10 | \item{tbl}{Table} 11 | 12 | \item{item1}{First item} 13 | 14 | \item{item2}{Second item} 15 | 16 | \item{distance}{Distance column} 17 | 18 | \item{k}{The desired number of groups} 19 | 20 | \item{h}{Height at which to cut the hierarchically clustered tree} 21 | } 22 | \description{ 23 | Reshape a table that represents pairwise distances into hierarchical clusters, 24 | returning a table with \code{item} and \code{cluster} columns. 
25 | } 26 | \examples{ 27 | 28 | library(gapminder) 29 | library(dplyr) 30 | 31 | # Construct Euclidean distances between countries based on life 32 | # expectancy over time 33 | country_distances <- gapminder \%>\% 34 | pairwise_dist(country, year, lifeExp) 35 | 36 | country_distances 37 | 38 | # Turn this into 5 hierarchical clusters 39 | clusters <- country_distances \%>\% 40 | widely_hclust(item1, item2, distance, k = 8) 41 | 42 | # Examine a few such clusters 43 | clusters \%>\% filter(cluster == 1) 44 | clusters \%>\% filter(cluster == 2) 45 | 46 | } 47 | \seealso{ 48 | \link{cutree} 49 | } 50 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # widyr (development version) 2 | 3 | * Now force evaluation for `widely()` function factory (thanks to @lhdjung, #44) 4 | 5 | # widyr 0.1.5 6 | 7 | * Change maintainer to Julia Silge 8 | * Updates for new Matrix package version (@simonpcouch, #41) 9 | * Update use of `distinct()` 10 | 11 | # widyr 0.1.4 12 | 13 | * Fix bug in United Nations vignette (caused by unvotes update). 14 | * Also changes the vignettes to render conditionally on package installation. 15 | 16 | # widyr 0.1.3 17 | 18 | * Update to work with the latest version of tidytext's cast_sparse. Adds rlang to IMPORTs. (@juliasilge, #30) 19 | * Update from data_frame() to tibble() in examples 20 | * Removed topicmodels from SUGGESTS (hasn't been required for several versions) 21 | * Fixed spelling mistakes of occurence->occurrence 22 | 23 | # widyr 0.1.2 24 | 25 | * Fixes to be compatible with tidyr v1.0.0, while also being reverse-compatible with previous versions of tidyr. 
26 | * Fix intro vignette index entry 27 | 28 | # widyr 0.1.1 29 | 30 | * Added `pairwise_delta` function for Burrows' delta 31 | * Added `pairwise_pmi` for pairwise mutual information 32 | * Added `widely_svd` for performing singular value decomposition then re-tidying 33 | * Removed methods from DESCRIPTION 34 | 35 | # widyr 0.1.0 36 | 37 | * Initial release of package 38 | * Only functions are the pairwise_ collection of functions, as well as the widely and squarely adverbs. 39 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | steps: 23 | - uses: actions/checkout@v2 24 | 25 | - uses: r-lib/actions/setup-pandoc@v2 26 | 27 | - uses: r-lib/actions/setup-r@v2 28 | with: 29 | use-public-rspm: true 30 | 31 | - uses: r-lib/actions/setup-r-dependencies@v2 32 | with: 33 | extra-packages: any::pkgdown, local::. 
34 | needs: website 35 | 36 | - name: Build site 37 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 38 | shell: Rscript {0} 39 | 40 | - name: Deploy to GitHub pages 🚀 41 | if: github.event_name != 'pull_request' 42 | uses: JamesIves/github-pages-deploy-action@4.1.4 43 | with: 44 | clean: false 45 | branch: gh-pages 46 | folder: docs 47 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Type: Package 2 | Package: widyr 3 | Title: Widen, Process, then Re-Tidy Data 4 | Version: 0.1.5.9000 5 | Authors@R: c( 6 | person("David", "Robinson", , "admiral.david@gmail.com", role = "aut"), 7 | person("Kanishka", "Misra", role = "ctb"), 8 | person("Julia", "Silge", , "julia.silge@gmail.com", role = c("aut", "cre"), 9 | comment = c(ORCID = "0000-0002-3671-836X")) 10 | ) 11 | Description: Encapsulates the pattern of untidying data into a wide 12 | matrix, performing some processing, then turning it back into a tidy 13 | form. This is useful for several operations such as co-occurrence 14 | counts, correlations, or clustering that are mathematically convenient 15 | on wide matrices. 
16 | License: MIT + file LICENSE 17 | URL: https://github.com/juliasilge/widyr, 18 | https://juliasilge.github.io/widyr/ 19 | BugReports: https://github.com/juliasilge/widyr/issues 20 | Imports: 21 | broom, 22 | dplyr, 23 | Matrix, 24 | purrr, 25 | reshape2, 26 | rlang, 27 | tibble, 28 | tidyr, 29 | tidytext 30 | Suggests: 31 | countrycode, 32 | covr, 33 | fuzzyjoin, 34 | gapminder, 35 | ggplot2, 36 | ggraph, 37 | igraph, 38 | irlba, 39 | janeaustenr, 40 | knitr, 41 | maps, 42 | rmarkdown, 43 | testthat, 44 | unvotes (>= 0.3.0) 45 | VignetteBuilder: 46 | knitr 47 | Encoding: UTF-8 48 | LazyData: TRUE 49 | Roxygen: list(markdown = TRUE) 50 | RoxygenNote: 7.2.1 51 | -------------------------------------------------------------------------------- /man/pairwise_similarity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pairwise_similarity.R 3 | \name{pairwise_similarity} 4 | \alias{pairwise_similarity} 5 | \alias{pairwise_similarity_} 6 | \title{Cosine similarity of pairs of items} 7 | \usage{ 8 | pairwise_similarity(tbl, item, feature, value, ...) 9 | 10 | pairwise_similarity_(tbl, item, feature, value, ...) 11 | } 12 | \arguments{ 13 | \item{tbl}{Table} 14 | 15 | \item{item}{Item to compare; will end up in \code{item1} and 16 | \code{item2} columns} 17 | 18 | \item{feature}{Column describing the feature that links one item to others} 19 | 20 | \item{value}{Value} 21 | 22 | \item{...}{Extra arguments passed on to \code{\link[=squarely]{squarely()}}, 23 | such as \code{diag} and \code{upper}} 24 | } 25 | \description{ 26 | Compute cosine similarity of all pairs of items in a tidy table. 
27 | } 28 | \examples{ 29 | 30 | library(janeaustenr) 31 | library(dplyr) 32 | library(tidytext) 33 | 34 | # Comparing Jane Austen novels 35 | austen_words <- austen_books() \%>\% 36 | unnest_tokens(word, text) \%>\% 37 | anti_join(stop_words, by = "word") \%>\% 38 | count(book, word) \%>\% 39 | ungroup() 40 | 41 | # closest books to each other 42 | closest <- austen_words \%>\% 43 | pairwise_similarity(book, word, n) \%>\% 44 | arrange(desc(similarity)) 45 | 46 | closest 47 | 48 | closest \%>\% 49 | filter(item1 == "Emma") 50 | 51 | } 52 | \seealso{ 53 | \code{\link[=squarely]{squarely()}} 54 | } 55 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macOS-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v2 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: 
any::rcmdcheck 45 | needs: check 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | -------------------------------------------------------------------------------- /man/pairwise_pmi.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pairwise_pmi.R 3 | \name{pairwise_pmi} 4 | \alias{pairwise_pmi} 5 | \alias{pairwise_pmi_} 6 | \title{Pointwise mutual information of pairs of items} 7 | \usage{ 8 | pairwise_pmi(tbl, item, feature, sort = FALSE, ...) 9 | 10 | pairwise_pmi_(tbl, item, feature, sort = FALSE, ...) 11 | } 12 | \arguments{ 13 | \item{tbl}{Table} 14 | 15 | \item{item}{Item to compare; will end up in \code{item1} and 16 | \code{item2} columns} 17 | 18 | \item{feature}{Column describing the feature that links one item to others} 19 | 20 | \item{sort}{Whether to sort in descending order of the pointwise mutual 21 | information} 22 | 23 | \item{...}{Extra arguments passed on to \code{squarely}, 24 | such as \code{diag} and \code{upper}} 25 | } 26 | \value{ 27 | A tbl_df with three columns, \code{item1}, \code{item2}, and 28 | \code{pmi}. 29 | } 30 | \description{ 31 | Find pointwise mutual information of pairs of items in a column, based on 32 | a "feature" column that links them together. 33 | This is an example of the spread-operate-retidy pattern. 
34 | } 35 | \examples{ 36 | 37 | library(dplyr) 38 | 39 | dat <- tibble(group = rep(1:5, each = 2), 40 | letter = c("a", "b", 41 | "a", "c", 42 | "a", "c", 43 | "b", "e", 44 | "b", "f")) 45 | 46 | # how informative is each letter about each other letter 47 | pairwise_pmi(dat, letter, group) 48 | pairwise_pmi(dat, letter, group, sort = TRUE) 49 | 50 | } 51 | -------------------------------------------------------------------------------- /man/pairwise_count.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pairwise_count.R 3 | \name{pairwise_count} 4 | \alias{pairwise_count} 5 | \alias{pairwise_count_} 6 | \title{Count pairs of items within a group} 7 | \usage{ 8 | pairwise_count(tbl, item, feature, wt = NULL, ...) 9 | 10 | pairwise_count_(tbl, item, feature, wt = NULL, ...) 11 | } 12 | \arguments{ 13 | \item{tbl}{Table} 14 | 15 | \item{item}{Item to count pairs of; will end up in \code{item1} and 16 | \code{item2} columns} 17 | 18 | \item{feature}{Column within which to count pairs 19 | \code{item2} columns} 20 | 21 | \item{wt}{Optionally a weight column, which should have a consistent weight 22 | for each feature} 23 | 24 | \item{...}{Extra arguments passed on to \code{squarely}, 25 | such as \code{diag}, \code{upper}, and \code{sort}} 26 | } 27 | \description{ 28 | Count the number of times each pair of items appear together within a group 29 | defined by "feature." For example, this could count the number of times 30 | two words appear within documents). 
31 | } 32 | \examples{ 33 | 34 | library(dplyr) 35 | dat <- tibble(group = rep(1:5, each = 2), 36 | letter = c("a", "b", 37 | "a", "c", 38 | "a", "c", 39 | "b", "e", 40 | "b", "f")) 41 | 42 | # count the number of times two letters appear together 43 | pairwise_count(dat, letter, group) 44 | pairwise_count(dat, letter, group, sort = TRUE) 45 | pairwise_count(dat, letter, group, sort = TRUE, diag = TRUE) 46 | 47 | } 48 | \seealso{ 49 | \code{\link[=squarely]{squarely()}} 50 | } 51 | -------------------------------------------------------------------------------- /man/squarely.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/squarely.R 3 | \name{squarely} 4 | \alias{squarely} 5 | \alias{squarely_} 6 | \title{A special case of the widely adverb for creating tidy 7 | square matrices} 8 | \usage{ 9 | squarely(.f, diag = FALSE, upper = TRUE, ...) 10 | 11 | squarely_(.f, diag = FALSE, upper = TRUE, ...) 12 | } 13 | \arguments{ 14 | \item{.f}{Function to wrap} 15 | 16 | \item{diag}{Whether to include diagonal (i = j) in output} 17 | 18 | \item{upper}{Whether to include upper triangle, which may be 19 | duplicated} 20 | 21 | \item{...}{Extra arguments passed on to \code{widely}} 22 | } 23 | \value{ 24 | Returns a function that takes at least four arguments: 25 | \item{tbl}{A table} 26 | \item{item}{Name of column to use as rows in wide matrix} 27 | \item{feature}{Name of column to use as columns in wide matrix} 28 | \item{feature}{Name of column to use as values in wide matrix} 29 | \item{...}{Arguments passed on to inner function} 30 | } 31 | \description{ 32 | A special case of \code{\link[=widely]{widely()}}. Used to pre-prepare and 33 | post-tidy functions that take an m x n (m items, n features) 34 | matrix and return an m x m (item x item) matrix, such as a 35 | distance or correlation matrix. 
36 | } 37 | \examples{ 38 | 39 | library(dplyr) 40 | library(gapminder) 41 | 42 | closest_continent <- gapminder \%>\% 43 | group_by(continent) \%>\% 44 | squarely(dist)(country, year, lifeExp) 45 | 46 | } 47 | \seealso{ 48 | \code{\link[=widely]{widely()}}, \code{\link[=pairwise_count]{pairwise_count()}}, 49 | \code{\link[=pairwise_cor]{pairwise_cor()}}, \code{\link[=pairwise_dist]{pairwise_dist()}} 50 | } 51 | -------------------------------------------------------------------------------- /man/pairwise_dist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pairwise_dist.R 3 | \name{pairwise_dist} 4 | \alias{pairwise_dist} 5 | \alias{pairwise_dist_} 6 | \title{Distances of pairs of items} 7 | \usage{ 8 | pairwise_dist(tbl, item, feature, value, method = "euclidean", ...) 9 | 10 | pairwise_dist_(tbl, item, feature, value, method = "euclidean", ...) 11 | } 12 | \arguments{ 13 | \item{tbl}{Table} 14 | 15 | \item{item}{Item to compare; will end up in \code{item1} and 16 | \code{item2} columns} 17 | 18 | \item{feature}{Column describing the feature that links one item to others} 19 | 20 | \item{value}{Value} 21 | 22 | \item{method}{Distance measure to be used; see \code{\link[=dist]{dist()}}} 23 | 24 | \item{...}{Extra arguments passed on to \code{\link[=squarely]{squarely()}}, 25 | such as \code{diag} and \code{upper}} 26 | } 27 | \description{ 28 | Compute distances of all pairs of items in a tidy table. 
29 | } 30 | \examples{ 31 | 32 | library(gapminder) 33 | library(dplyr) 34 | 35 | # closest countries in terms of life expectancy over time 36 | closest <- gapminder \%>\% 37 | pairwise_dist(country, year, lifeExp) \%>\% 38 | arrange(distance) 39 | 40 | closest 41 | 42 | closest \%>\% 43 | filter(item1 == "United States") 44 | 45 | # to remove duplicates, use upper = FALSE 46 | gapminder \%>\% 47 | pairwise_dist(country, year, lifeExp, upper = FALSE) \%>\% 48 | arrange(distance) 49 | 50 | # Can also use Manhattan distance 51 | gapminder \%>\% 52 | pairwise_dist(country, year, lifeExp, method = "manhattan", upper = FALSE) \%>\% 53 | arrange(distance) 54 | 55 | } 56 | \seealso{ 57 | \code{\link[=squarely]{squarely()}} 58 | } 59 | -------------------------------------------------------------------------------- /tests/testthat/test-pairwise-cor.R: -------------------------------------------------------------------------------- 1 | context("pairwise_cor") 2 | 3 | suppressPackageStartupMessages(library(dplyr)) 4 | 5 | d <- tibble(col = rep(c("a", "b", "c"), each = 3), 6 | row = rep(c("d", "e", "f"), 3), 7 | value = c(1, 2, 3, 6, 5, 4, 7, 9, 8)) 8 | 9 | test_that("pairwise_cor computes pairwise correlations", { 10 | ret <- d %>% 11 | pairwise_cor(col, row, value) 12 | 13 | ret1 <- ret$correlation[ret$item1 == "a" & ret$item2 == "b"] 14 | expect_equal(ret1, cor(1:3, 6:4)) 15 | 16 | ret2 <- ret$correlation[ret$item1 == "b" & ret$item2 == "c"] 17 | expect_equal(ret2, cor(6:4, c(7, 9, 8))) 18 | 19 | expect_equal(sum(ret$item1 == ret$item2), 0) 20 | }) 21 | 22 | test_that("pairwise_cor can compute Spearman correlations", { 23 | ret <- d %>% 24 | pairwise_cor(col, row, value, method = "spearman") 25 | 26 | ret1 <- ret$correlation[ret$item1 == "a" & ret$item2 == "b"] 27 | expect_equal(ret1, -1) 28 | }) 29 | 30 | test_that("pairwise_cor works on binary matrices", { 31 | cors <- data.frame(x = c("a", "a", "a", "b", "b", "b", "c", "c", "c"), 32 | y = c(1, 2, 3, 1, 2, 3, 1, 3, 4)) 
#' Cluster items based on k-means across features
#'
#' Given a tidy table of features describing each item, perform k-means
#' clustering using [kmeans()] and retidy the data into
#' one-row-per-cluster.
#'
#' @param tbl Table
#' @param item Item to cluster (as a bare column name)
#' @param feature Feature column (dimension in clustering)
#' @param value Value column
#' @param k Number of clusters
#' @param fill What to fill in for missing values
#' @param ... Other arguments passed on to [kmeans()]
#'
#' @seealso [widely_hclust()]
#'
#' @importFrom rlang :=
#'
#' @examples
#'
#' library(gapminder)
#' library(dplyr)
#'
#' clusters <- gapminder %>%
#'   widely_kmeans(country, year, lifeExp, k = 5)
#'
#' clusters
#'
#' clusters %>%
#'   count(cluster)
#'
#' # Examine a few clusters
#' clusters %>% filter(cluster == 1)
#' clusters %>% filter(cluster == 2)
#'
#' @export
widely_kmeans <- function(tbl, item, feature, value, k, fill = 0, ...) {
  # Capture the bare column names as strings so they can be used in
  # reshape2::acast() and base subsetting
  item_col <- as.character(substitute(item))
  feature_col <- as.character(substitute(feature))
  value_col <- as.character(substitute(value))

  # Cast the tidy table into an item x feature matrix, filling missing cells
  cast_formula <- stats::as.formula(paste(item_col, "~", feature_col))
  wide_mat <- reshape2::acast(tbl, cast_formula,
                              value.var = value_col, fill = fill)

  fit <- stats::kmeans(wide_mat, k, ...)

  # Map matrix row names back to the original item values (so the item
  # column keeps its original type, e.g. factor levels), then return one
  # row per item with its assigned cluster
  row_idx <- match(rownames(wide_mat), as.character(tbl[[item_col]]))
  out <- tibble::tibble(!!sym(item_col) := tbl[[item_col]][row_idx],
                        cluster = factor(fit$cluster))
  dplyr::arrange(out, cluster)
}
#' Cosine similarity of pairs of items
#'
#' Compute cosine similarity of all pairs of items in a tidy table.
#'
#' @param tbl Table
#' @param item Item to compare; will end up in `item1` and
#' `item2` columns
#' @param feature Column describing the feature that links one item to others
#' @param value Value
#' @param ... Extra arguments passed on to [squarely()],
#' such as `diag` and `upper`
#'
#' @seealso [squarely()]
#'
#' @export
pairwise_similarity <- function(tbl, item, feature, value, ...) {
  pairwise_similarity_(tbl,
                       col_name(substitute(item)),
                       col_name(substitute(feature)),
                       col_name(substitute(value)), ...)
}


#' @rdname pairwise_similarity
#' @export
pairwise_similarity_ <- function(tbl, item, feature, value, ...) {
  # (Removed a leftover debugging line, `m <- matrix(1:9, ncol = 3)`,
  # which created an unused local matrix.)
  #
  # Cosine similarity: scale each item's feature vector to unit length;
  # the item-by-item similarity matrix is then normed %*% t(normed).
  d_func <- squarely_(function(m) {
    normed <- m / sqrt(rowSums(m ^ 2))
    normed %*% t(normed)
  }, sparse = TRUE, ...)

  tbl %>%
    d_func(item, feature, value) %>%
    rename(similarity = value)
}
If not given, defaults to all values being 1 (thus 37 | a binary correlation)} 38 | 39 | \item{method}{Correlation method} 40 | 41 | \item{use}{Character string specifying the behavior of correlations 42 | with missing values; passed on to \code{cor}} 43 | 44 | \item{...}{Extra arguments passed on to \code{squarely}, 45 | such as \code{diag} and \code{upper}} 46 | } 47 | \description{ 48 | Find correlations of pairs of items in a column, based on a "feature" column 49 | that links them together. This is an example of the spread-operate-retidy pattern. 50 | } 51 | \examples{ 52 | 53 | library(dplyr) 54 | library(gapminder) 55 | 56 | gapminder \%>\% 57 | pairwise_cor(country, year, lifeExp) 58 | 59 | gapminder \%>\% 60 | pairwise_cor(country, year, lifeExp, sort = TRUE) 61 | 62 | # United Nations voting data 63 | if (require("unvotes", quietly = TRUE)) { 64 | country_cors <- un_votes \%>\% 65 | mutate(vote = as.numeric(vote)) \%>\% 66 | pairwise_cor(country, rcid, vote, sort = TRUE) 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /man/widely.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/widely.R 3 | \name{widely} 4 | \alias{widely} 5 | \alias{widely_} 6 | \title{Adverb for functions that operate on matrices in "wide" 7 | format} 8 | \usage{ 9 | widely(.f, sort = FALSE, sparse = FALSE, maximum_size = 1e+07) 10 | 11 | widely_(.f, sort = FALSE, sparse = FALSE, maximum_size = 1e+07) 12 | } 13 | \arguments{ 14 | \item{.f}{Function being wrapped} 15 | 16 | \item{sort}{Whether to sort in descending order of \code{value}} 17 | 18 | \item{sparse}{Whether to cast to a sparse matrix} 19 | 20 | \item{maximum_size}{To prevent crashing, a maximum size of a 21 | non-sparse matrix to be created. 
Set to NULL to allow any size 22 | matrix.} 23 | } 24 | \value{ 25 | Returns a function that takes at least four arguments: 26 | \item{tbl}{A table} 27 | \item{row}{Name of column to use as rows in wide matrix} 28 | \item{column}{Name of column to use as columns in wide matrix} 29 | \item{value}{Name of column to use as values in wide matrix} 30 | \item{...}{Arguments passed on to inner function} 31 | 32 | \code{widely} creates a function that takes those columns as 33 | bare names, \code{widely_} a function that takes them as strings. 34 | } 35 | \description{ 36 | Modify a function in order to pre-cast the input into a wide 37 | matrix format, perform the function, and then 38 | re-tidy (e.g. melt) the output into a tidy table. 39 | } 40 | \examples{ 41 | 42 | library(dplyr) 43 | library(gapminder) 44 | 45 | gapminder 46 | 47 | gapminder \%>\% 48 | widely(dist)(country, year, lifeExp) 49 | 50 | # can perform within groups 51 | closest_continent <- gapminder \%>\% 52 | group_by(continent) \%>\% 53 | widely(dist)(country, year, lifeExp) 54 | closest_continent 55 | 56 | # for example, find the closest pair in each 57 | closest_continent \%>\% 58 | top_n(1, -value) 59 | 60 | } 61 | -------------------------------------------------------------------------------- /R/pairwise_dist.R: -------------------------------------------------------------------------------- 1 | #' Distances of pairs of items 2 | #' 3 | #' Compute distances of all pairs of items in a tidy table. 4 | #' 5 | #' @param tbl Table 6 | #' @param item Item to compare; will end up in `item1` and 7 | #' `item2` columns 8 | #' @param feature Column describing the feature that links one item to others 9 | #' @param value Value 10 | #' @param method Distance measure to be used; see [dist()] 11 | #' @param ... 
#' Distances of pairs of items
#'
#' Compute distances of all pairs of items in a tidy table.
#'
#' @param tbl Table
#' @param item Item to compare; will end up in `item1` and
#' `item2` columns
#' @param feature Column describing the feature that links one item to others
#' @param value Value
#' @param method Distance measure to be used; see [dist()]
#' @param ... Extra arguments passed on to [squarely()],
#' such as `diag` and `upper`
#'
#' @seealso [squarely()]
#'
#' @export
pairwise_dist <- function(tbl, item, feature, value,
                          method = "euclidean", ...) {
  # Resolve bare column names to strings, then delegate to the
  # standard-evaluation version
  pairwise_dist_(tbl,
                 col_name(substitute(item)),
                 col_name(substitute(feature)),
                 col_name(substitute(value)),
                 method = method, ...)
}


#' @rdname pairwise_dist
#' @export
pairwise_dist_ <- function(tbl, item, feature, value, method = "euclidean", ...) {
  # Wrap stats::dist() so it consumes and produces a square
  # item-by-item matrix, tidied back into long form by squarely_()
  compute_dist <- function(m) {
    as.matrix(stats::dist(m, method = method))
  }
  tidied_dist <- squarely_(compute_dist, ...)

  ret <- tidied_dist(tbl, item, feature, value)
  rename(ret, distance = value)
}
9 | 10 | widely_svd_(tbl, item, feature, value, nv = NULL, weight_d = FALSE, ...) 11 | } 12 | \arguments{ 13 | \item{tbl}{Table} 14 | 15 | \item{item}{Item to perform dimensionality reduction on; will end up in \code{item} column} 16 | 17 | \item{feature}{Column describing the feature that links one item to others.} 18 | 19 | \item{value}{Value} 20 | 21 | \item{nv}{Optional; the number of principal components to estimate. Recommended for matrices 22 | with many features.} 23 | 24 | \item{weight_d}{Whether to multiply each value by the \code{d} principal component.} 25 | 26 | \item{...}{Extra arguments passed to \code{svd} (if \code{nv} is \code{NULL}) 27 | or \code{irlba} (if \code{nv} is given)} 28 | } 29 | \value{ 30 | A tbl_df with three columns. The first is retained from the \code{item} input, 31 | then \code{dimension} and \code{value}. Each row represents one principal component 32 | value. 33 | } 34 | \description{ 35 | This is useful for dimensionality reduction of items, especially when setting a 36 | lower nv. 
37 | } 38 | \examples{ 39 | 40 | library(dplyr) 41 | library(gapminder) 42 | 43 | # principal components driving change 44 | gapminder_svd <- gapminder \%>\% 45 | widely_svd(country, year, lifeExp) 46 | 47 | gapminder_svd 48 | 49 | # compare SVDs, join with other data 50 | library(ggplot2) 51 | library(tidyr) 52 | 53 | gapminder_svd \%>\% 54 | spread(dimension, value) \%>\% 55 | inner_join(distinct(gapminder, country, continent), by = "country") \%>\% 56 | ggplot(aes(`1`, `2`, label = country)) + 57 | geom_point(aes(color = continent)) + 58 | geom_text(vjust = 1, hjust = 1) 59 | 60 | } 61 | -------------------------------------------------------------------------------- /man/pairwise_delta.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pairwise_delta.R 3 | \name{pairwise_delta} 4 | \alias{pairwise_delta} 5 | \alias{pairwise_delta_} 6 | \title{Delta measure of pairs of documents} 7 | \usage{ 8 | pairwise_delta(tbl, item, feature, value, method = "burrows", ...) 9 | 10 | pairwise_delta_(tbl, item, feature, value, method = "burrows", ...) 11 | } 12 | \arguments{ 13 | \item{tbl}{Table} 14 | 15 | \item{item}{Item to compare; will end up in \code{item1} and 16 | \code{item2} columns} 17 | 18 | \item{feature}{Column describing the feature that links one item to others} 19 | 20 | \item{value}{Value} 21 | 22 | \item{method}{Distance measure to be used; see \code{\link[=dist]{dist()}}} 23 | 24 | \item{...}{Extra arguments passed on to \code{\link[=squarely]{squarely()}}, 25 | such as \code{diag} and \code{upper}} 26 | } 27 | \description{ 28 | Compute the delta distances (from its two variants) of all pairs of documents in a tidy table. 
#' Count pairs of items within a group
#'
#' Count the number of times each pair of items appear together within a group
#' defined by "feature" (for example, this could count the number of times
#' two words appear within documents).
#'
#' @param tbl Table
#' @param item Item to count pairs of; will end up in `item1` and
#' `item2` columns
#' @param feature Column within which to count pairs
#' @param wt Optionally a weight column, which should have a consistent weight
#' for each feature
#' @param ... Extra arguments passed on to `squarely`,
#' such as `diag`, `upper`, and `sort`
#'
#' @seealso [squarely()]
#'
#' @examples
#'
#' library(dplyr)
#' dat <- tibble(group = rep(1:5, each = 2),
#'               letter = c("a", "b",
#'                          "a", "c",
#'                          "a", "c",
#'                          "b", "e",
#'                          "b", "f"))
#'
#' # count the number of times two letters appear together
#' pairwise_count(dat, letter, group)
#' pairwise_count(dat, letter, group, sort = TRUE)
#' pairwise_count(dat, letter, group, sort = TRUE, diag = TRUE)
#'
#' @export
pairwise_count <- function(tbl, item, feature, wt = NULL, ...) {
  pairwise_count_(tbl,
                  col_name(substitute(item)),
                  col_name(substitute(feature)),
                  wt = col_name(substitute(wt)),
                  ...)
}


#' @rdname pairwise_count
#' @export
pairwise_count_ <- function(tbl, item, feature, wt = NULL, ...) {
  # Without weights, co-occurrences are counted directly on the binary
  # item x feature matrix (m %*% t(m)); with a weight column, each
  # item's weight is summed wherever the other item is present
  # (m %*% t(m > 0)).
  if (is.null(wt)) {
    func <- squarely_(function(m) m %*% t(m), sparse = TRUE, ...)
    wt <- "..value"
  } else {
    func <- squarely_(function(m) m %*% t(m > 0), sparse = TRUE, ...)
  }

  # Deduplicate item/feature pairs so each co-occurrence counts once
  tbl %>%
    distinct(.data[[item]], .data[[feature]], .keep_all = TRUE) %>%
    mutate(..value = 1) %>%
    func(item, feature, wt) %>%
    rename(n = value)
}
#'
#' @param tbl Table
#' @param item Item to compare; will end up in `item1` and
#' `item2` columns
#' @param feature Column describing the feature that links one item to others
#' @param sort Whether to sort in descending order of the pointwise mutual
#' information
#' @param ... Extra arguments passed on to `squarely`,
#' such as `diag` and `upper`
#'
#' @name pairwise_pmi
#'
#' @return A tbl_df with three columns, `item1`, `item2`, and
#' `pmi`.
#'
#' @examples
#'
#' library(dplyr)
#'
#' dat <- tibble(group = rep(1:5, each = 2),
#'               letter = c("a", "b",
#'                          "a", "c",
#'                          "a", "c",
#'                          "b", "e",
#'                          "b", "f"))
#'
#' # how informative is each letter about each other letter
#' pairwise_pmi(dat, letter, group)
#' pairwise_pmi(dat, letter, group, sort = TRUE)
#'
#' @export
pairwise_pmi <- function(tbl, item, feature, sort = FALSE, ...) {
  # Resolve bare column names to strings and delegate to the
  # standard-evaluation version
  pairwise_pmi_(tbl,
                col_name(substitute(item)),
                col_name(substitute(feature)),
                sort = sort, ...)
}


#' @rdname pairwise_pmi
#' @export
pairwise_pmi_ <- function(tbl, item, feature, sort = FALSE, ...) {
  # On the item x feature occurrence matrix, compute
  # p(x, y) / (p(x) * p(y)); the log is taken after retidying, giving
  # PMI = log(p(x, y) / (p(x) p(y))).
  f <- function(m) {
    # marginal probability p(x) of each item
    row_sums <- rowSums(m) / sum(m)

    ret <- m %*% t(m)          # co-occurrence counts
    ret <- ret / sum(ret)      # joint probability p(x, y)
    ret <- ret / row_sums      # divide each row by p(x)
    ret <- t(t(ret) / (row_sums))  # divide each column by p(y)
    ret
  }
  pmi_func <- squarely_(f, sparse = TRUE, sort = sort, ...)

  tbl %>%
    ungroup() %>%
    mutate(..value = 1) %>%
    pmi_func(item, feature, "..value") %>%
    mutate(value = log(value)) %>%
    rename(pmi = value)
}
#' Cluster pairs of items into groups using hierarchical clustering
#'
#' Reshape a table that represents pairwise distances into hierarchical clusters,
#' returning a table with `item` and `cluster` columns.
#'
#' @param tbl Table
#' @param item1 First item
#' @param item2 Second item
#' @param distance Distance column
#' @param k The desired number of groups
#' @param h Height at which to cut the hierarchically clustered tree
#'
#' @examples
#'
#' library(gapminder)
#' library(dplyr)
#'
#' # Construct Euclidean distances between countries based on life
#' # expectancy over time
#' country_distances <- gapminder %>%
#'   pairwise_dist(country, year, lifeExp)
#'
#' country_distances
#'
#' # Turn this into 8 hierarchical clusters
#' clusters <- country_distances %>%
#'   widely_hclust(item1, item2, distance, k = 8)
#'
#' # Examine a few such clusters
#' clusters %>% filter(cluster == 1)
#' clusters %>% filter(cluster == 2)
#'
#' @seealso [cutree]
#'
#' @export
widely_hclust <- function(tbl, item1, item2, distance, k = NULL, h = NULL) {
  # Capture bare column names as strings
  col1_str <- as.character(substitute(item1))
  col2_str <- as.character(substitute(item2))
  dist_str <- as.character(substitute(distance))

  # All items appearing on either side of a pair
  unique_items <- unique(c(as.character(tbl[[col1_str]]), as.character(tbl[[col2_str]])))

  # Pairs absent from the input are filled with the maximum observed
  # distance so they are treated as maximally far apart
  max_distance <- max(tbl[[dist_str]])

  # Recode items as integer indices, cast to a square distance matrix,
  # cluster hierarchically, cut the tree, and map the indices back to
  # the original item values
  tibble(item1 = match(tbl[[col1_str]], unique_items),
         item2 = match(tbl[[col2_str]], unique_items),
         distance = tbl[[dist_str]]) %>%
    reshape2::acast(item1 ~ item2, value.var = "distance", fill = max_distance) %>%
    stats::as.dist() %>%
    stats::hclust() %>%
    stats::cutree(k = k, h = h) %>%
    tibble::enframe("item", "cluster") %>%
    dplyr::mutate(item = unique_items[as.integer(item)],
                  cluster = factor(cluster)) %>%
    dplyr::arrange(cluster)
}
ret$item2)) 41 | expect_false(any("Afghanistan" == ret$item1 & "United States" == ret$item2)) 42 | 43 | expect_true(all(ret$value <= 1)) 44 | expect_true(all(ret$value >= -1)) 45 | }) 46 | 47 | test_that("widely's maximum size argument works", { 48 | skip_if_not_installed("gapminder") 49 | library(gapminder) 50 | 51 | f <- function() { 52 | widely(cor, maximum_size = 1000)(gapminder, year, country, lifeExp) 53 | } 54 | expect_error(f(), "1704.*large") 55 | }) 56 | 57 | test_that("widely's arguments are evaluated when widely itself is called", { 58 | skip_if_not_installed("gapminder") 59 | library(gapminder) 60 | 61 | sort <- FALSE 62 | dist_widely <- widely(dist, sort = sort) 63 | sort <- TRUE 64 | ret1 <- dist_widely(gapminder, country, year, lifeExp) 65 | 66 | sort <- FALSE 67 | dist_widely <- widely(dist, sort = sort) 68 | ret2 <- dist_widely(gapminder, country, year, lifeExp) 69 | 70 | expect_equal(ret1, ret2) 71 | }) 72 | -------------------------------------------------------------------------------- /R/pairwise_cor.R: -------------------------------------------------------------------------------- 1 | #' Correlations of pairs of items 2 | #' 3 | #' Find correlations of pairs of items in a column, based on a "feature" column 4 | #' that links them together. This is an example of the spread-operate-retidy pattern. 5 | #' 6 | #' @param tbl Table 7 | #' @param item Item to compare; will end up in `item1` and 8 | #' `item2` columns 9 | #' @param feature Column describing the feature that links one item to others 10 | #' @param value Value column. If not given, defaults to all values being 1 (thus 11 | #' a binary correlation) 12 | #' @param method Correlation method 13 | #' @param use Character string specifying the behavior of correlations 14 | #' with missing values; passed on to `cor` 15 | #' @param ... 
#' @export
pairwise_cor <- function(tbl, item, feature, value,
                         method = c("pearson", "kendall", "spearman"),
                         use = "everything", ...) {
  # When no value column is given, use a constant 1 so the correlation
  # is computed on binary presence/absence of each feature
  if (missing(value)) {
    tbl$..value <- 1
    val <- "..value"
  } else {
    val <- col_name(substitute(value))
  }

  pairwise_cor_(tbl,
                col_name(substitute(item)),
                col_name(substitute(feature)),
                val,
                method = method, use = use, ...)
}


#' @rdname pairwise_cor
#' @export
pairwise_cor_ <- function(tbl, item, feature, value,
                          method = c("pearson", "kendall", "spearman"),
                          use = "everything",
                          ...) {
  method <- match.arg(method)

  # The fast sparse path (cor_sparse) only implements plain Pearson
  # correlation with default missing-value handling. Both operands are
  # scalars, so use short-circuit && rather than elementwise &.
  sparse <- (method == "pearson" && use == "everything")
  f <- if (sparse) {
    function(x) cor_sparse(t(x))
  } else {
    function(x) stats::cor(t(x), method = method, use = use)
  }
  cor_func <- squarely_(f, sparse = sparse, ...)

  tbl %>%
    ungroup() %>%
    cor_func(item, feature, value) %>%
    rename(correlation = value)
}
#'
#' @param tbl Table
#' @param item Item to compare; will end up in `item1` and
#' `item2` columns
#' @param feature Column describing the feature that links one item to others
#' @param value Value
#' @param method Distance measure to be used; see [dist()]
#' @param ... Extra arguments passed on to [squarely()],
#' such as `diag` and `upper`
#'
#' @seealso [squarely()]
#'
#' @examples
#'
#' library(janeaustenr)
#' library(dplyr)
#' library(tidytext)
#'
#' # closest documents in terms of 1000 most frequent words
#' closest <- austen_books() %>%
#'   unnest_tokens(word, text) %>%
#'   count(book, word) %>%
#'   top_n(1000, n) %>%
#'   pairwise_delta(book, word, n, method = "burrows") %>%
#'   arrange(delta)
#'
#' closest
#'
#' closest %>%
#'   filter(item1 == "Pride & Prejudice")
#'
#' # to remove duplicates, use upper = FALSE
#' closest <- austen_books() %>%
#'   unnest_tokens(word, text) %>%
#'   count(book, word) %>%
#'   top_n(1000, n) %>%
#'   pairwise_delta(book, word, n, method = "burrows", upper = FALSE) %>%
#'   arrange(delta)
#'
#' # Can also use Argamon's Linear Delta
#' closest <- austen_books() %>%
#'   unnest_tokens(word, text) %>%
#'   count(book, word) %>%
#'   top_n(1000, n) %>%
#'   pairwise_delta(book, word, n, method = "argamon", upper = FALSE) %>%
#'   arrange(delta)
#'
#' @export
pairwise_delta <- function(tbl, item, feature, value,
                           method = "burrows", ...) {
  pairwise_delta_(tbl,
                  col_name(substitute(item)),
                  col_name(substitute(feature)),
                  col_name(substitute(value)),
                  method = method, ...)
}


#' @rdname pairwise_delta
#' @export
pairwise_delta_ <- function(tbl, item, feature, value, method = "burrows", ...) {
  # Burrows' Delta is the mean absolute difference of feature z-scores
  # (Manhattan distance over scaled columns, divided by the number of
  # features); Argamon's Linear Delta uses Euclidean distance instead.
  delta_func <- function(m) {
    dist_method <- switch(method,
      burrows = "manhattan",
      argamon = "euclidean",
      stop("Wrong method! Only method = burrows or method = argamon have been implemented!")
    )

    as.matrix(stats::dist(scale(m), method = dist_method) / ncol(m))
  }

  d_func <- squarely_(delta_func, ...)

  tbl %>%
    d_func(item, feature, value) %>%
    rename(delta = value)
}
#' @param ... Extra arguments passed on to `widely`
#'
#' @return Returns a function that takes at least four arguments:
#'   \item{tbl}{A table}
#'   \item{item}{Name of column to use as rows in wide matrix}
#'   \item{feature}{Name of column to use as columns in wide matrix}
#'   \item{value}{Name of column to use as values in wide matrix}
#'   \item{...}{Arguments passed on to inner function}
#'
#' @seealso [widely()], [pairwise_count()],
#' [pairwise_cor()], [pairwise_dist()]
#'
#' @examples
#'
#' library(dplyr)
#' library(gapminder)
#'
#' closest_continent <- gapminder %>%
#'   group_by(continent) %>%
#'   squarely(dist)(country, year, lifeExp)
#'
#' @export
squarely <- function(.f, diag = FALSE, upper = TRUE, ...) {
  # Build the string-based version once, then return a wrapper that
  # resolves bare column names (NSE) before delegating to it
  inner_func <- squarely_(.f, diag = diag, upper = upper, ...)
  function(tbl, item, feature, value, ...) {
    inner_func(tbl,
               col_name(substitute(item)),
               col_name(substitute(feature)),
               col_name(substitute(value)),
               ...)
  }
}


#' @rdname squarely
#' @export
squarely_ <- function(.f, diag = FALSE,
                      upper = TRUE,
                      ...) {
  # Capture extra arguments now so they are forwarded to widely_ each
  # time the returned function is called
  extra_args <- list(...)

  f <- function(tbl, item, feature, value, ...) {
    if (inherits(tbl, "grouped_df")) {
      # perform within each group, then restore groups
      ret <- tbl %>%
        tidyr::nest() %>%
        mutate(data = purrr::map(data, f, item, feature, value)) %>%
        filter(purrr::map_lgl(data, ~ nrow(.) > 0)) %>%
        tidyr::unnest(data) %>%
        dplyr::group_by_at(dplyr::group_vars(tbl))

      return(ret)
    }

    # Temporarily recode items as integer indices into their unique
    # values; this lets the triangle/diagonal filters below compare
    # items by position, and preserves the original type (e.g. factor
    # levels) when mapped back at the end
    item_vals <- tbl[[item]]
    item_u <- unique(item_vals)

    tbl[[item]] <- match(item_vals, item_u)

    new_f <- do.call(widely_, c(list(.f), extra_args))
    ret <- new_f(tbl, item, feature, value, ...)

    ret$item1 <- as.integer(ret$item1)
    ret$item2 <- as.integer(ret$item2)

    # Drop the upper triangle and/or diagonal of the square result as
    # requested
    if (!upper) {
      ret <- dplyr::filter(ret, item1 <= item2)
    }
    if (!diag) {
      ret <- dplyr::filter(ret, item1 != item2)
    }

    # Map the integer codes back to the original item values
    ret$item1 <- item_u[ret$item1]
    ret$item2 <- item_u[ret$item2]

    ret
  }
  f
}
#'
#' @examples
#'
#' library(dplyr)
#' library(gapminder)
#'
#' # principal components driving change
#' gapminder_svd <- gapminder %>%
#'   widely_svd(country, year, lifeExp)
#'
#' gapminder_svd
#'
#' # compare SVDs, join with other data
#' library(ggplot2)
#' library(tidyr)
#'
#' gapminder_svd %>%
#'   spread(dimension, value) %>%
#'   inner_join(distinct(gapminder, country, continent), by = "country") %>%
#'   ggplot(aes(`1`, `2`, label = country)) +
#'   geom_point(aes(color = continent)) +
#'   geom_text(vjust = 1, hjust = 1)
#'
#' @export
widely_svd <- function(tbl, item, feature, value, nv = NULL, weight_d = FALSE, ...) {
  # NSE wrapper: convert the bare column names to strings, then
  # delegate to the standard-evaluation version.
  widely_svd_(tbl,
              col_name(substitute(item)),
              col_name(substitute(feature)),
              col_name(substitute(value)),
              nv = nv,
              weight_d = weight_d,
              ...)
}


#' @rdname widely_svd
#' @export
widely_svd_ <- function(tbl, item, feature, value, nv = NULL, weight_d = FALSE, ...) {
  if (is.null(nv)) {
    # Exact SVD on a dense matrix
    perform_svd <- function(m) {
      s <- svd(m, ...)

      if (weight_d) {
        # scale each column of u by its singular value
        ret <- t(s$d * t(s$u))
      } else {
        ret <- s$u
      }

      rownames(ret) <- rownames(m)
      ret
    }
    sparse <- FALSE
  } else {
    # Truncated SVD (nv components) on a sparse matrix via irlba
    if (!requireNamespace("irlba", quietly = TRUE)) {
      stop("Requires the irlba package")
    }
    perform_svd <- function(m) {
      s <- irlba::irlba(m, nv = nv, ...)
      if (weight_d) {
        ret <- t(s$d * t(s$u))
      } else {
        ret <- s$u
      }

      rownames(ret) <- rownames(m)
      ret
    }
    sparse <- TRUE
  }

  # Temporarily replace item values with integer indices (restored
  # below) so rows can be matched back after casting to a matrix.
  item_vals <- tbl[[item]]
  item_u <- unique(item_vals)
  tbl[[item]] <- match(item_vals, item_u)

  ret <- widely_(perform_svd, sparse = sparse)(tbl, item, feature, value)

  # item2 from the melted output is the principal-component index
  ret <- ret %>%
    transmute(item = item_u[as.integer(item1)],
              dimension = item2,
              value)

  colnames(ret)[1] <- item

  ret
}
--------------------------------------------------------------------------------
/vignettes/intro.Rmd:
--------------------------------------------------------------------------------
---
title: "widyr: Widen, process, and re-tidy a dataset"
author: "David Robinson"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{widyr: Widen, process, and re-tidy a dataset}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---


This package wraps the pattern of un-tidying data into a wide matrix, performing some processing, then turning it back into a tidy form. This is useful for several mathematical operations such as co-occurrence counts, correlations, or clustering that are best done on a wide matrix.

## Towards a precise definition of "wide" data

The term "wide data" has gone out of fashion as being "imprecise" [(Wickham 2014)](http://vita.had.co.nz/papers/tidy-data.pdf), but I think with a proper definition the term could be entirely meaningful and useful.

A **wide** dataset is one or more matrices where:

* Each row is one **item**
* Each column is one **feature**
* Each value is one **observation**
* Each matrix is one **variable**

When would you want data to be wide rather than tidy?
Notable examples include classification, clustering, correlation, factorization, or other operations that can take advantage of a matrix structure. In general, when you want to **compare between items** rather than compare between variables, this is a useful structure. 27 | 28 | The widyr package is based on the observation that during a tidy data analysis, you often want data to be wide only *temporarily*, before returning to a tidy structure for visualization and further analysis. widyr makes this easy through a set of `pairwise_` functions. 29 | 30 | ## Example: gapminder 31 | 32 | Consider the gapminder dataset in the [gapminder package](https://CRAN.R-project.org/package=gapminder). 33 | 34 | ```{r} 35 | library(dplyr) 36 | library(gapminder) 37 | 38 | gapminder 39 | ``` 40 | 41 | This tidy format (one-row-per-country-per-year) is very useful for grouping, summarizing, and filtering operations. But if we want to *compare pairs of countries* (for example, to find countries that are similar to each other), we would have to reshape this dataset. Note that here, country is the **item**, while year is the **feature** column. 42 | 43 | #### Pairwise operations 44 | 45 | The widyr package offers `pairwise_` functions that operate on pairs of items within data. An example is `pairwise_dist`: 46 | 47 | ```{r} 48 | library(widyr) 49 | 50 | gapminder %>% 51 | pairwise_dist(country, year, lifeExp) 52 | ``` 53 | 54 | In a single step, this finds the Euclidean distance between the `lifeExp` value in each pair of countries, matching pairs based on year. We could find the closest pairs of countries overall with `arrange()`: 55 | 56 | ```{r} 57 | gapminder %>% 58 | pairwise_dist(country, year, lifeExp) %>% 59 | arrange(distance) 60 | ``` 61 | 62 | Notice that this includes duplicates (Germany/Belgium and Belgium/Germany). 
To avoid those (the upper triangle of the distance matrix), use `upper = FALSE`: 63 | 64 | ```{r} 65 | gapminder %>% 66 | pairwise_dist(country, year, lifeExp, upper = FALSE) %>% 67 | arrange(distance) 68 | ``` 69 | 70 | In some analyses, we may be interested in correlation rather than distance of pairs. For this we would use `pairwise_cor`: 71 | 72 | ```{r} 73 | gapminder %>% 74 | pairwise_cor(country, year, lifeExp, upper = FALSE, sort = TRUE) 75 | ``` 76 | -------------------------------------------------------------------------------- /tests/testthat/test-pairwise-count.R: -------------------------------------------------------------------------------- 1 | # tests for pairwise_count function 2 | 3 | context("pairwise_count") 4 | 5 | suppressPackageStartupMessages(library(dplyr)) 6 | suppressPackageStartupMessages(library(tidytext)) 7 | 8 | original <- tibble(txt = c("I felt a funeral in my brain,", 9 | "And mourners, to and fro,", 10 | "Kept treading, treading, till it seemed", 11 | "That sense was breaking through.")) %>% 12 | mutate(line = row_number()) %>% 13 | unnest_tokens(char, txt, token = "characters") 14 | 15 | test_that("pairing and counting works", { 16 | d <- original %>% 17 | pairwise_count(char, line, sort = TRUE, upper = FALSE, diag = FALSE) 18 | 19 | expect_equal(nrow(d), 164) 20 | expect_equal(ncol(d), 3) 21 | expect_equal(d$item1[1], "e") 22 | expect_equal(d$item2[10], "r") 23 | expect_equal(d$n[20], 3) 24 | 25 | expect_false(any(d$item1 == d$item2)) 26 | expect_false(is.unsorted(rev(d$n))) 27 | 28 | # test additional arguments 29 | 30 | # for self-pairs, the number of occurrences should be the number of distinct 31 | # lines 32 | d2 <- original %>% 33 | pairwise_count(char, line, sort = TRUE, upper = FALSE, diag = TRUE) 34 | 35 | expect_equal(nrow(d2), nrow(d) + 20) 36 | 37 | self_pairs <- d2 %>% 38 | filter(item1 == item2) %>% 39 | arrange(item1) 40 | 41 | char_counts <- original %>% 42 | distinct(line, char) %>% 43 | count(char) %>% 44 | 
arrange(char) 45 | 46 | expect_true(all(self_pairs$item1 == char_counts$char)) 47 | expect_true(all(self_pairs$n == char_counts$n)) 48 | 49 | # when upper is TRUE, should include twice as many items as original 50 | d3 <- original %>% 51 | pairwise_count(char, line, sort = TRUE, upper = TRUE) 52 | 53 | expect_equal(nrow(d) * 2, nrow(d3)) 54 | expect_true(all(sort(d3$item1) == sort(d3$item2))) 55 | }) 56 | 57 | 58 | test_that("We can count with a weight column", { 59 | d <- tibble(col1 = c("a", "a", "a", "b", "b", "b"), 60 | col2 = c("x", "y", "z", "x", "x", "z"), 61 | weight = c(1, 1, 1, 5, 5, 5)) 62 | 63 | ret1 <- pairwise_count(d, col2, col1) 64 | expect_equal(ret1$n[ret1$item1 == "z" & ret1$item2 == "y"], 1) 65 | expect_equal(ret1$n[ret1$item1 == "z" & ret1$item2 == "x"], 2) 66 | 67 | ret2 <- pairwise_count(d, col2, col1, wt = weight) 68 | expect_equal(ret2$n[ret1$item1 == "z" & ret1$item2 == "y"], 1) 69 | expect_equal(ret2$n[ret1$item1 == "z" & ret1$item2 == "x"], 6) 70 | }) 71 | 72 | 73 | test_that("Counts co-occurrences of words in Pride & Prejudice", { 74 | if (require("janeaustenr", quietly = TRUE)) { 75 | words <- tibble(text = prideprejudice) %>% 76 | mutate(line = row_number()) %>% 77 | unnest_tokens(word, text) 78 | 79 | pairs <- words %>% 80 | pairwise_count(word, line, upper = TRUE, diag = TRUE, sort = TRUE) 81 | 82 | # check it is sorted in descending order 83 | expect_false(is.unsorted(rev(pairs$n))) 84 | 85 | # check occurrences of words that appear with "elizabeth" 86 | words_with_elizabeth <- words %>% 87 | filter(word == "elizabeth") %>% 88 | select(line) %>% 89 | inner_join(words, by = "line") %>% 90 | distinct(word, line) %>% 91 | count(word) %>% 92 | arrange(n, word) 93 | 94 | pairs_with_elizabeth <- pairs %>% 95 | filter(item1 == "elizabeth") %>% 96 | arrange(n, item2) 97 | 98 | expect_true(all(words_with_elizabeth$word == pairs_with_elizabeth$item2)) 99 | expect_true(all(words_with_elizabeth$n == pairs_with_elizabeth$n)) 100 | } 101 | }) 
102 | 103 | test_that("Can count within groups", { 104 | grouped_result <- mtcars %>% 105 | group_by(cyl) %>% 106 | pairwise_count(vs, am) 107 | 108 | expect_equal(as.character(groups(grouped_result)), c("cyl")) 109 | expect_equal(nrow(grouped_result), 2) 110 | expect_equal(colnames(grouped_result), c("cyl", "item1", "item2", "n")) 111 | }) 112 | -------------------------------------------------------------------------------- /vignettes/united_nations.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "United Nations Voting Correlations" 3 | author: "David Robinson" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{United Nations Voting Correlations} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r setup, echo = FALSE} 13 | library(knitr) 14 | 15 | options(width = 102) 16 | knitr::opts_chunk$set(message = FALSE, warning = FALSE) 17 | 18 | library(ggplot2) 19 | theme_set(theme_bw()) 20 | ``` 21 | 22 | Here we'll examine an example application of the widyr package, particularly the `pairwise_cor` and `pairwise_dist` functions. We'll use the data on United Nations General Assembly voting from the `unvotes` package: 23 | 24 | ```{r echo = FALSE} 25 | if (!requireNamespace("unvotes", quietly = TRUE)) { 26 | print("This vignette requires the unvotes package to be installed. Exiting...") 27 | knitr::knit_exit() 28 | } 29 | ``` 30 | 31 | ```{r} 32 | library(dplyr) 33 | library(unvotes) 34 | 35 | un_votes 36 | ``` 37 | 38 | This dataset has one row for each country for each roll call vote. We're interested in finding pairs of countries that tended to vote similarly. 
39 | 40 | ### Pairwise correlations 41 | 42 | Notice that the `vote` column is a factor, with levels (in order) "yes", "abstain", and "no": 43 | 44 | ```{r} 45 | levels(un_votes$vote) 46 | ``` 47 | 48 | We may then be interested in obtaining a measure of country-to-country agreement for each vote, using the `pairwise_cor` function. 49 | 50 | ```{r cors} 51 | library(widyr) 52 | 53 | cors <- un_votes %>% 54 | mutate(vote = as.numeric(vote)) %>% 55 | pairwise_cor(country, rcid, vote, use = "pairwise.complete.obs", sort = TRUE) 56 | 57 | cors 58 | ``` 59 | 60 | We could, for example, find the countries that the US is most and least in agreement with: 61 | 62 | ```{r US_cors} 63 | US_cors <- cors %>% 64 | filter(item1 == "United States") 65 | 66 | # Most in agreement 67 | US_cors 68 | 69 | # Least in agreement 70 | US_cors %>% 71 | arrange(correlation) 72 | ``` 73 | 74 | This can be particularly useful when visualized on a map. 75 | 76 | ```{r US_cors_map, fig.width = 10, fig.height = 6} 77 | if (require("maps", quietly = TRUE) && 78 | require("fuzzyjoin", quietly = TRUE) && 79 | require("countrycode", quietly = TRUE) && 80 | require("ggplot2", quietly = TRUE)) { 81 | world_data <- map_data("world") %>% 82 | regex_full_join(iso3166, by = c("region" = "mapname")) %>% 83 | filter(region != "Antarctica") 84 | 85 | US_cors %>% 86 | mutate(a2 = countrycode(item2, "country.name", "iso2c")) %>% 87 | full_join(world_data, by = "a2") %>% 88 | ggplot(aes(long, lat, group = group, fill = correlation)) + 89 | geom_polygon(color = "gray", size = .1) + 90 | scale_fill_gradient2() + 91 | coord_quickmap() + 92 | theme_void() + 93 | labs(title = "Correlation of each country's UN votes with the United States", 94 | subtitle = "Blue indicates agreement, red indicates disagreement", 95 | fill = "Correlation w/ US") 96 | } 97 | ``` 98 | 99 | ### Visualizing clusters in a network 100 | 101 | Another useful kind of visualization is a network plot, which can be created with Thomas Pedersen's 
[ggraph package](https://github.com/thomasp85/ggraph). We can filter for pairs of countries with correlations above a particular threshold. 102 | 103 | ```{r country_network, fig.width = 10, fig.height = 6} 104 | if (require("ggraph", quietly = TRUE) && 105 | require("igraph", quietly = TRUE) && 106 | require("countrycode", quietly = TRUE)) { 107 | cors_filtered <- cors %>% 108 | filter(correlation > .6) 109 | 110 | continents <- tibble(country = unique(un_votes$country)) %>% 111 | filter(country %in% cors_filtered$item1 | 112 | country %in% cors_filtered$item2) %>% 113 | mutate(continent = countrycode(country, "country.name", "continent")) 114 | 115 | set.seed(2017) 116 | 117 | cors_filtered %>% 118 | graph_from_data_frame(vertices = continents) %>% 119 | ggraph() + 120 | geom_edge_link(aes(edge_alpha = correlation)) + 121 | geom_node_point(aes(color = continent), size = 3) + 122 | geom_node_text(aes(label = name), check_overlap = TRUE, vjust = 1, hjust = 1) + 123 | theme_void() + 124 | labs(title = "Network of countries with correlated United Nations votes") 125 | } 126 | ``` 127 | 128 | Choosing the threshold for filtering correlations (or other measures of similarity) typically requires some trial and error. Setting too high a threshold will make a graph too sparse, while too low a threshold will make a graph too crowded. 
129 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | 8 | ```{r, include = FALSE} 9 | knitr::opts_chunk$set( 10 | collapse = TRUE, 11 | comment = "#>", 12 | fig.path = "man/figures/README-", 13 | out.width = "100%", 14 | message = FALSE 15 | ) 16 | suppressPackageStartupMessages(library(dplyr)) 17 | ``` 18 | 19 | # widyr: Widen, process, and re-tidy a dataset 20 | 21 | **Authors:** [Julia Silge](https://juliasilge.com/), [David Robinson](http://varianceexplained.org/)
22 | **License:** [MIT](https://opensource.org/licenses/MIT) 23 | 24 | 25 | [![R-CMD-check](https://github.com/juliasilge/widyr/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/juliasilge/widyr/actions/workflows/R-CMD-check.yaml) 26 | [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/widyr)](https://cran.r-project.org/package=widyr) 27 | [![Codecov test coverage](https://codecov.io/gh/juliasilge/widyr/branch/main/graph/badge.svg)](https://app.codecov.io/gh/juliasilge/widyr?branch=main) 28 | 29 | 30 | This package wraps the pattern of un-tidying data into a wide matrix, performing some processing, then turning it back into a tidy form. This is useful for several mathematical operations such as co-occurrence counts, correlations, or clustering that are best done on a wide matrix. 31 | 32 | ## Installation 33 | 34 | You can install the released version of widyr from [CRAN](https://CRAN.R-project.org) with: 35 | 36 | ``` r 37 | install.packages("widyr") 38 | ``` 39 | 40 | And the development version from [GitHub](https://github.com/) with: 41 | 42 | ``` r 43 | # install.packages("devtools") 44 | devtools::install_github("juliasilge/widyr") 45 | ``` 46 | 47 | ## Towards a precise definition of "wide" data 48 | 49 | The term "wide data" has gone out of fashion as being "imprecise" [(Wickham 2014)](http://vita.had.co.nz/papers/tidy-data.pdf), but I think with a proper definition the term could be entirely meaningful and useful. 50 | 51 | A **wide** dataset is one or more matrices where: 52 | 53 | * Each row is one **item** 54 | * Each column is one **feature** 55 | * Each value is one **observation** 56 | * Each matrix is one **variable** 57 | 58 | When would you want data to be wide rather than tidy? Notable examples include classification, clustering, correlation, factorization, or other operations that can take advantage of a matrix structure. 
In general, when you want to **compare between pairs of items** rather than compare between variables or between groups of observations, this is a useful structure. 59 | 60 | The widyr package is based on the observation that during a tidy data analysis, you often want data to be wide only *temporarily*, before returning to a tidy structure for visualization and further analysis. widyr makes this easy through a set of `pairwise_` functions. 61 | 62 | ## Example: gapminder 63 | 64 | Consider the gapminder dataset in the [gapminder package](https://CRAN.R-project.org/package=gapminder). 65 | 66 | ```{r} 67 | library(dplyr) 68 | library(gapminder) 69 | 70 | gapminder 71 | ``` 72 | 73 | This tidy format (one-row-per-country-per-year) is very useful for grouping, summarizing, and filtering operations. But if we want to *compare* countries (for example, to find countries that are similar to each other), we would have to reshape this dataset. Note that here, each country is an **item**, while each year is the **feature**. 74 | 75 | #### Pairwise operations 76 | 77 | The widyr package offers `pairwise_` functions that operate on pairs of items within data. An example is `pairwise_dist`: 78 | 79 | ```{r} 80 | library(widyr) 81 | 82 | gapminder %>% 83 | pairwise_dist(country, year, lifeExp) 84 | ``` 85 | 86 | This finds the Euclidean distance between the `lifeExp` value in each pair of countries. It knows which values to compare between countries with `year`, which is the feature column. 87 | 88 | We could find the closest pairs of countries overall with `arrange()`: 89 | 90 | ```{r} 91 | gapminder %>% 92 | pairwise_dist(country, year, lifeExp) %>% 93 | arrange(distance) 94 | ``` 95 | 96 | Notice that this includes duplicates (Germany/Belgium and Belgium/Germany). 
To avoid those (the upper triangle of the distance matrix), use `upper = FALSE`: 97 | 98 | ```{r} 99 | gapminder %>% 100 | pairwise_dist(country, year, lifeExp, upper = FALSE) %>% 101 | arrange(distance) 102 | ``` 103 | 104 | In some analyses, we may be interested in correlation rather than distance of pairs. For this we would use `pairwise_cor`: 105 | 106 | ```{r} 107 | gapminder %>% 108 | pairwise_cor(country, year, lifeExp, upper = FALSE) 109 | ``` 110 | 111 | ### Code of Conduct 112 | 113 | This project is released with a [Contributor Code of Conduct](https://www.contributor-covenant.org/version/2/1/CODE_OF_CONDUCT.html). By contributing to this project, you agree to abide by its terms. 114 | -------------------------------------------------------------------------------- /R/widely.R: -------------------------------------------------------------------------------- 1 | #' Adverb for functions that operate on matrices in "wide" 2 | #' format 3 | #' 4 | #' Modify a function in order to pre-cast the input into a wide 5 | #' matrix format, perform the function, and then 6 | #' re-tidy (e.g. melt) the output into a tidy table. 7 | #' 8 | #' @param .f Function being wrapped 9 | #' @param sort Whether to sort in descending order of `value` 10 | #' @param maximum_size To prevent crashing, a maximum size of a 11 | #' non-sparse matrix to be created. Set to NULL to allow any size 12 | #' matrix. 13 | #' @param sparse Whether to cast to a sparse matrix 14 | #' 15 | #' @return Returns a function that takes at least four arguments: 16 | #' \item{tbl}{A table} 17 | #' \item{row}{Name of column to use as rows in wide matrix} 18 | #' \item{column}{Name of column to use as columns in wide matrix} 19 | #' \item{value}{Name of column to use as values in wide matrix} 20 | #' \item{...}{Arguments passed on to inner function} 21 | #' 22 | #' `widely` creates a function that takes those columns as 23 | #' bare names, `widely_` a function that takes them as strings. 
24 | #' 25 | #' @import dplyr 26 | #' @import Matrix 27 | #' @importFrom broom tidy 28 | #' 29 | #' @examples 30 | #' 31 | #' library(dplyr) 32 | #' library(gapminder) 33 | #' 34 | #' gapminder 35 | #' 36 | #' gapminder %>% 37 | #' widely(dist)(country, year, lifeExp) 38 | #' 39 | #' # can perform within groups 40 | #' closest_continent <- gapminder %>% 41 | #' group_by(continent) %>% 42 | #' widely(dist)(country, year, lifeExp) 43 | #' closest_continent 44 | #' 45 | #' # for example, find the closest pair in each 46 | #' closest_continent %>% 47 | #' top_n(1, -value) 48 | #' 49 | #' @export 50 | widely <- function(.f, 51 | sort = FALSE, 52 | sparse = FALSE, 53 | maximum_size = 1e7) { 54 | force(.f) 55 | force(sort) 56 | force(sparse) 57 | force(maximum_size) 58 | function(tbl, row, column, value, ...) { 59 | inner_func <- widely_(.f, 60 | sort = sort, 61 | sparse = sparse, 62 | maximum_size = maximum_size) 63 | 64 | inner_func(tbl, 65 | col_name(substitute(row)), 66 | col_name(substitute(column)), 67 | col_name(substitute(value)), 68 | ...) 69 | } 70 | } 71 | 72 | 73 | #' @rdname widely 74 | #' @export 75 | widely_ <- function(.f, 76 | sort = FALSE, 77 | sparse = FALSE, 78 | maximum_size = 1e7) { 79 | f <- function(tbl, row, column, value, ...) { 80 | if (inherits(tbl, "grouped_df")) { 81 | # perform within each group 82 | # (group_by_at isn't necessary since 1.0.0, but is in earlier versions) 83 | ret <- tbl %>% 84 | tidyr::nest() %>% 85 | mutate(data = purrr::map(data, f, row, column, value)) %>% 86 | tidyr::unnest(data) %>% 87 | dplyr::group_by_at(dplyr::group_vars(tbl)) 88 | 89 | return(ret) 90 | } 91 | 92 | if (!sparse) { 93 | if (!is.null(maximum_size)) { 94 | matrix_size <- (length(unique(tbl[[row]])) * 95 | length(unique(tbl[[column]]))) 96 | if (matrix_size > maximum_size) { 97 | rlang::abort( 98 | paste0("Size of acast matrix, ", matrix_size, 99 | " will be too large. 
Set maximum_size = NULL to avoid ", 100 | "this error (make sure your memory is sufficient), ", 101 | "or consider using sparse = TRUE.") 102 | ) 103 | } 104 | } 105 | 106 | form <- stats::as.formula(paste(row, column, sep = " ~ ")) 107 | 108 | input <- reshape2::acast(tbl, form, value.var = value, fill = 0) 109 | } else { 110 | input <- tidytext::cast_sparse(tbl, !!row, !!column, !!value) 111 | } 112 | output <- purrr::as_mapper(.f)(input, ...) 113 | 114 | ret <- output %>% 115 | custom_melt() %>% 116 | as_tibble() 117 | 118 | if (sort) { 119 | ret <- arrange(ret, desc(value)) 120 | } 121 | ret 122 | } 123 | 124 | f 125 | } 126 | 127 | 128 | #' Tidy a function output based on some guesses 129 | #' @noRd 130 | custom_melt <- function(m) { 131 | if (inherits(m, "data.frame")) { 132 | rlang::abort("Output is a data frame: don't know how to fix") 133 | } else if (inherits(m, "matrix")) { 134 | ret <- reshape2::melt(m, varnames = c("item1", "item2"), as.is = TRUE) 135 | return(ret) 136 | } else if (inherits(m, "Matrix")) { 137 | ret <- sparse_matrix_to_df(m) 138 | } else { 139 | ret <- tidy(m) 140 | } 141 | 142 | colnames(ret) <- c("item1", "item2", "value") 143 | ret 144 | } 145 | 146 | sparse_matrix_to_df <- function(x) { 147 | s <- Matrix::summary(x) 148 | 149 | row <- s$i 150 | if (!is.null(rownames(x))) { 151 | row <- rownames(x)[row] 152 | } 153 | col <- s$j 154 | if (!is.null(colnames(x))) { 155 | col <- colnames(x)[col] 156 | } 157 | 158 | ret <- data.frame( 159 | row = row, column = col, value = s$x, 160 | stringsAsFactors = FALSE 161 | ) 162 | 163 | ret 164 | } 165 | 166 | -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for 
everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, caste, color, religion, or sexual 10 | identity and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the overall 26 | community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or advances of 31 | any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email address, 35 | without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 
45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at codeofconduct@rstudio.com. 63 | All complaints will be reviewed and investigated promptly and fairly. 64 | 65 | All community leaders are obligated to respect the privacy and security of the 66 | reporter of any incident. 67 | 68 | ## Enforcement Guidelines 69 | 70 | Community leaders will follow these Community Impact Guidelines in determining 71 | the consequences for any action they deem in violation of this Code of Conduct: 72 | 73 | ### 1. Correction 74 | 75 | **Community Impact**: Use of inappropriate language or other behavior deemed 76 | unprofessional or unwelcome in the community. 77 | 78 | **Consequence**: A private, written warning from community leaders, providing 79 | clarity around the nature of the violation and an explanation of why the 80 | behavior was inappropriate. A public apology may be requested. 81 | 82 | ### 2. Warning 83 | 84 | **Community Impact**: A violation through a single incident or series of 85 | actions. 86 | 87 | **Consequence**: A warning with consequences for continued behavior. 
No 88 | interaction with the people involved, including unsolicited interaction with 89 | those enforcing the Code of Conduct, for a specified period of time. This 90 | includes avoiding interactions in community spaces as well as external channels 91 | like social media. Violating these terms may lead to a temporary or permanent 92 | ban. 93 | 94 | ### 3. Temporary Ban 95 | 96 | **Community Impact**: A serious violation of community standards, including 97 | sustained inappropriate behavior. 98 | 99 | **Consequence**: A temporary ban from any sort of interaction or public 100 | communication with the community for a specified period of time. No public or 101 | private interaction with the people involved, including unsolicited interaction 102 | with those enforcing the Code of Conduct, is allowed during this period. 103 | Violating these terms may lead to a permanent ban. 104 | 105 | ### 4. Permanent Ban 106 | 107 | **Community Impact**: Demonstrating a pattern of violation of community 108 | standards, including sustained inappropriate behavior, harassment of an 109 | individual, or aggression toward or disparagement of classes of individuals. 110 | 111 | **Consequence**: A permanent ban from any sort of public interaction within the 112 | community. 113 | 114 | ## Attribution 115 | 116 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 117 | version 2.1, available at 118 | . 119 | 120 | Community Impact Guidelines were inspired by 121 | [Mozilla's code of conduct enforcement ladder][https://github.com/mozilla/inclusion]. 122 | 123 | For answers to common questions about this code of conduct, see the FAQ at 124 | . Translations are available at . 
125 | 126 | [homepage]: https://www.contributor-covenant.org 127 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # widyr: Widen, process, and re-tidy a dataset 5 | 6 | **Authors:** [Julia Silge](https://juliasilge.com/), [David 7 | Robinson](http://varianceexplained.org/)
**License:** 8 | [MIT](https://opensource.org/licenses/MIT) 9 | 10 | 11 | 12 | [![R-CMD-check](https://github.com/juliasilge/widyr/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/juliasilge/widyr/actions/workflows/R-CMD-check.yaml) 13 | [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/widyr)](https://cran.r-project.org/package=widyr) 14 | [![Codecov test 15 | coverage](https://codecov.io/gh/juliasilge/widyr/branch/main/graph/badge.svg)](https://app.codecov.io/gh/juliasilge/widyr?branch=main) 16 | 17 | 18 | This package wraps the pattern of un-tidying data into a wide matrix, 19 | performing some processing, then turning it back into a tidy form. This 20 | is useful for several mathematical operations such as co-occurrence 21 | counts, correlations, or clustering that are best done on a wide matrix. 22 | 23 | ## Installation 24 | 25 | You can install the released version of widyr from 26 | [CRAN](https://CRAN.R-project.org) with: 27 | 28 | ``` r 29 | install.packages("widyr") 30 | ``` 31 | 32 | And the development version from [GitHub](https://github.com/) with: 33 | 34 | ``` r 35 | # install.packages("devtools") 36 | devtools::install_github("juliasilge/widyr") 37 | ``` 38 | 39 | ## Towards a precise definition of “wide” data 40 | 41 | The term “wide data” has gone out of fashion as being “imprecise” 42 | [(Wickham 2014)](http://vita.had.co.nz/papers/tidy-data.pdf), but I 43 | think with a proper definition the term could be entirely meaningful and 44 | useful. 45 | 46 | A **wide** dataset is one or more matrices where: 47 | 48 | - Each row is one **item** 49 | - Each column is one **feature** 50 | - Each value is one **observation** 51 | - Each matrix is one **variable** 52 | 53 | When would you want data to be wide rather than tidy? Notable examples 54 | include classification, clustering, correlation, factorization, or other 55 | operations that can take advantage of a matrix structure. 
In general, 56 | when you want to **compare between pairs of items** rather than compare 57 | between variables or between groups of observations, this is a useful 58 | structure. 59 | 60 | The widyr package is based on the observation that during a tidy data 61 | analysis, you often want data to be wide only *temporarily*, before 62 | returning to a tidy structure for visualization and further analysis. 63 | widyr makes this easy through a set of `pairwise_` functions. 64 | 65 | ## Example: gapminder 66 | 67 | Consider the gapminder dataset in the [gapminder 68 | package](https://CRAN.R-project.org/package=gapminder). 69 | 70 | ``` r 71 | library(dplyr) 72 | library(gapminder) 73 | 74 | gapminder 75 | #> # A tibble: 1,704 × 6 76 | #> country continent year lifeExp pop gdpPercap 77 | #> <fct> <fct> <int> <dbl> <int> <dbl> 78 | #> 1 Afghanistan Asia 1952 28.8 8425333 779. 79 | #> 2 Afghanistan Asia 1957 30.3 9240934 821. 80 | #> 3 Afghanistan Asia 1962 32.0 10267083 853. 81 | #> 4 Afghanistan Asia 1967 34.0 11537966 836. 82 | #> 5 Afghanistan Asia 1972 36.1 13079460 740. 83 | #> 6 Afghanistan Asia 1977 38.4 14880372 786. 84 | #> 7 Afghanistan Asia 1982 39.9 12881816 978. 85 | #> 8 Afghanistan Asia 1987 40.8 13867957 852. 86 | #> 9 Afghanistan Asia 1992 41.7 16317921 649. 87 | #> 10 Afghanistan Asia 1997 41.8 22227415 635. 88 | #> # … with 1,694 more rows 89 | #> # ℹ Use `print(n = ...)` to see more rows 90 | ``` 91 | 92 | This tidy format (one-row-per-country-per-year) is very useful for 93 | grouping, summarizing, and filtering operations. But if we want to 94 | *compare* countries (for example, to find countries that are similar to 95 | each other), we would have to reshape this dataset. Note that here, each 96 | country is an **item**, while each year is the **feature**. 97 | 98 | #### Pairwise operations 99 | 100 | The widyr package offers `pairwise_` functions that operate on pairs of 101 | items within data.
An example is `pairwise_dist`: 102 | 103 | ``` r 104 | library(widyr) 105 | 106 | gapminder %>% 107 | pairwise_dist(country, year, lifeExp) 108 | #> # A tibble: 20,022 × 3 109 | #> item1 item2 distance 110 | #> <chr> <chr> <dbl> 111 | #> 1 Albania Afghanistan 107. 112 | #> 2 Algeria Afghanistan 76.8 113 | #> 3 Angola Afghanistan 4.65 114 | #> 4 Argentina Afghanistan 110. 115 | #> 5 Australia Afghanistan 129. 116 | #> 6 Austria Afghanistan 124. 117 | #> 7 Bahrain Afghanistan 98.1 118 | #> 8 Bangladesh Afghanistan 45.3 119 | #> 9 Belgium Afghanistan 125. 120 | #> 10 Benin Afghanistan 39.3 121 | #> # … with 20,012 more rows 122 | #> # ℹ Use `print(n = ...)` to see more rows 123 | ``` 124 | 125 | This finds the Euclidean distance between the `lifeExp` value in each 126 | pair of countries. It knows which values to compare between countries 127 | with `year`, which is the feature column. 128 | 129 | We could find the closest pairs of countries overall with `arrange()`: 130 | 131 | ``` r 132 | gapminder %>% 133 | pairwise_dist(country, year, lifeExp) %>% 134 | arrange(distance) 135 | #> # A tibble: 20,022 × 3 136 | #> item1 item2 distance 137 | #> <chr> <chr> <dbl> 138 | #> 1 Germany Belgium 1.08 139 | #> 2 Belgium Germany 1.08 140 | #> 3 United Kingdom New Zealand 1.51 141 | #> 4 New Zealand United Kingdom 1.51 142 | #> 5 Norway Netherlands 1.56 143 | #> 6 Netherlands Norway 1.56 144 | #> 7 Italy Israel 1.66 145 | #> 8 Israel Italy 1.66 146 | #> 9 Finland Austria 1.94 147 | #> 10 Austria Finland 1.94 148 | #> # … with 20,012 more rows 149 | #> # ℹ Use `print(n = ...)` to see more rows 150 | ``` 151 | 152 | Notice that this includes duplicates (Germany/Belgium and 153 | Belgium/Germany).
To avoid those (the upper triangle of the distance 154 | matrix), use `upper = FALSE`: 155 | 156 | ``` r 157 | gapminder %>% 158 | pairwise_dist(country, year, lifeExp, upper = FALSE) %>% 159 | arrange(distance) 160 | #> # A tibble: 10,011 × 3 161 | #> item1 item2 distance 162 | #> <chr> <chr> <dbl> 163 | #> 1 Belgium Germany 1.08 164 | #> 2 New Zealand United Kingdom 1.51 165 | #> 3 Netherlands Norway 1.56 166 | #> 4 Israel Italy 1.66 167 | #> 5 Austria Finland 1.94 168 | #> 6 Belgium United Kingdom 1.95 169 | #> 7 Iceland Sweden 2.01 170 | #> 8 Comoros Mauritania 2.01 171 | #> 9 Belgium United States 2.09 172 | #> 10 Germany Ireland 2.10 173 | #> # … with 10,001 more rows 174 | #> # ℹ Use `print(n = ...)` to see more rows 175 | ``` 176 | 177 | In some analyses, we may be interested in correlation rather than 178 | distance of pairs. For this we would use `pairwise_cor`: 179 | 180 | ``` r 181 | gapminder %>% 182 | pairwise_cor(country, year, lifeExp, upper = FALSE) 183 | #> # A tibble: 10,011 × 3 184 | #> item1 item2 correlation 185 | #> <chr> <chr> <dbl> 186 | #> 1 Afghanistan Albania 0.966 187 | #> 2 Afghanistan Algeria 0.987 188 | #> 3 Albania Algeria 0.953 189 | #> 4 Afghanistan Angola 0.986 190 | #> 5 Albania Angola 0.976 191 | #> 6 Algeria Angola 0.952 192 | #> 7 Afghanistan Argentina 0.971 193 | #> 8 Albania Argentina 0.949 194 | #> 9 Algeria Argentina 0.991 195 | #> 10 Angola Argentina 0.936 196 | #> # … with 10,001 more rows 197 | #> # ℹ Use `print(n = ...)` to see more rows 198 | ``` 199 | 200 | ### Code of Conduct 201 | 202 | This project is released with a [Contributor Code of 203 | Conduct](https://www.contributor-covenant.org/version/2/1/CODE_OF_CONDUCT.html). 204 | By contributing to this project, you agree to abide by its terms. 205 | --------------------------------------------------------------------------------