├── .github ├── .gitignore ├── workflows │ ├── test-coverage.yaml │ ├── pkgdown.yaml │ └── R-CMD-check.yaml └── CODE_OF_CONDUCT.md ├── revdep ├── README.md ├── failures.md ├── problems.md ├── .gitignore ├── email.yml └── cran.md ├── LICENSE ├── tests ├── testthat.R └── testthat │ ├── test-cor-sparse.R │ ├── test-pairwise-dist.R │ ├── test-squarely.R │ ├── test-pairwise-similarity.R │ ├── test-pairwise-cor.R │ ├── test-widely.R │ └── test-pairwise-count.R ├── R ├── globals.R ├── utils.R ├── cor_sparse.R ├── widely_kmeans.R ├── pairwise_similarity.R ├── pairwise_dist.R ├── pairwise_count.R ├── pairwise_pmi.R ├── widely_hclust.R ├── pairwise_cor.R ├── pairwise_delta.R ├── squarely.R ├── widely_svd.R └── widely.R ├── .gitignore ├── codecov.yml ├── .Rbuildignore ├── _pkgdown.yml ├── widyr.Rproj ├── cran-comments.md ├── NAMESPACE ├── man ├── cor_sparse.Rd ├── widely_kmeans.Rd ├── widely_hclust.Rd ├── pairwise_similarity.Rd ├── pairwise_pmi.Rd ├── pairwise_count.Rd ├── squarely.Rd ├── pairwise_dist.Rd ├── pairwise_cor.Rd ├── widely.Rd ├── widely_svd.Rd └── pairwise_delta.Rd ├── NEWS.md ├── DESCRIPTION ├── vignettes ├── intro.Rmd └── united_nations.Rmd ├── README.Rmd └── README.md /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /revdep/README.md: -------------------------------------------------------------------------------- 1 | # Revdeps 2 | 3 | -------------------------------------------------------------------------------- /revdep/failures.md: -------------------------------------------------------------------------------- 1 | *Wow, no problems at all. :)* -------------------------------------------------------------------------------- /revdep/problems.md: -------------------------------------------------------------------------------- 1 | *Wow, no problems at all. 
# Declare names that appear as bare column names inside the package's
# tidy-eval (dplyr/tidyr) code — e.g. item1/item2/value in the pairwise_*
# outputs and item/cluster in the widely_* clustering results — so that
# `R CMD check` does not raise "no visible binding for global variable"
# NOTEs for them.
globalVariables(c("item1", "item2", "value", "..data", "data",
                  "item", "cluster"))
#' Convert a column specification to a column name
#'
#' Vendored from tidyr. Accepts a column given as a character string, a
#' bare name/symbol (typically captured via `substitute()`), or `NULL`,
#' and returns the column name as a character scalar.
#'
#' @param x A column specification: a character string, a name/symbol,
#'   the missing argument, or `NULL`.
#' @param default Value returned when `x` is the missing argument. The
#'   default is an error, but it is evaluated lazily — only when actually
#'   returned — so callers may supply a non-error fallback.
#' @return A character scalar naming the column, `NULL` when `x` is
#'   `NULL`, or `default` when `x` is the missing argument.
#' @noRd
col_name <- function(x, default = stop("Please supply column name", call. = FALSE)) {
  if (is.character(x)) {
    return(x)
  }
  if (identical(x, quote(expr = ))) {
    # Missing argument: fall back to `default` (forced lazily here).
    return(default)
  }
  if (is.name(x)) {
    return(as.character(x))
  }
  if (is.null(x)) {
    return(NULL)
  }
  stop("Invalid column specification", call. = FALSE)
}
15 | 16 | * We saw 0 new problems 17 | * We failed to check 0 packages 18 | -------------------------------------------------------------------------------- /tests/testthat/test-cor-sparse.R: -------------------------------------------------------------------------------- 1 | context("cor_sparse") 2 | 3 | library(Matrix) 4 | 5 | test_that("cor_sparse returns the same results as cor(as.matrix(m))", { 6 | m <- Matrix(0, nrow = 100000, ncol = 6) 7 | 8 | ind <- cbind(sample(100000, 1000, replace = TRUE), 9 | sample(6, 1000, replace = TRUE)) 10 | m[ind] <- rnorm(1000) 11 | 12 | co1 <- cor(as.matrix(m)) 13 | co2 <- cor_sparse(m) 14 | 15 | expect_is(co2, "matrix") 16 | expect_is(c(co2), "numeric") 17 | expect_equal(dim(co2), c(6, 6)) 18 | expect_equal(c(co1), c(co2)) 19 | }) 20 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(pairwise_cor) 4 | export(pairwise_cor_) 5 | export(pairwise_count) 6 | export(pairwise_count_) 7 | export(pairwise_delta) 8 | export(pairwise_delta_) 9 | export(pairwise_dist) 10 | export(pairwise_dist_) 11 | export(pairwise_pmi) 12 | export(pairwise_pmi_) 13 | export(pairwise_similarity) 14 | export(pairwise_similarity_) 15 | export(squarely) 16 | export(squarely_) 17 | export(widely) 18 | export(widely_) 19 | export(widely_hclust) 20 | export(widely_kmeans) 21 | export(widely_svd) 22 | export(widely_svd_) 23 | import(Matrix) 24 | import(dplyr) 25 | importFrom(broom,tidy) 26 | importFrom(rlang,":=") 27 | -------------------------------------------------------------------------------- /tests/testthat/test-pairwise-dist.R: -------------------------------------------------------------------------------- 1 | context("pairwise_dist") 2 | 3 | suppressPackageStartupMessages(library(dplyr)) 4 | 5 | test_that("pairwise_dist computes a distance matrix", { 6 | d <- 
#' Find the Pearson correlation of a sparse matrix efficiently
#'
#' Find the Pearson correlation of a sparse matrix.
#' For a large sparse matrix this is more efficient in time and memory than
#' `cor(as.matrix(x))`. Note that it does not currently work on
#' simple_triplet_matrix objects.
#'
#' @param x A matrix, potentially a sparse matrix such as a "dgTMatrix"
#'   object. Must have at least two rows.
#'
#' @return A dense base-R matrix of pairwise Pearson correlations between
#'   the columns of `x`, with dimensions `ncol(x)` by `ncol(x)`.
#'
#' @source This code comes from mike on this Stack Overflow answer:
#' <https://stackoverflow.com/a/9626089/712603>.
cor_sparse <- function(x) {
  n <- nrow(x)
  # Correlation is undefined with fewer than two observations; fail loudly
  # instead of silently returning a matrix of NaN/Inf.
  if (n < 2) {
    stop("`x` must have at least two rows to compute correlations.",
         call. = FALSE)
  }
  # Covariance via the identity cov(X) = (X'X - n * m %o% m) / (n - 1),
  # where m = colMeans(X); crossprod() keeps the computation sparse until
  # the final (small) ncol-by-ncol matrix.
  covmat <- (as.matrix(crossprod(x)) - n * tcrossprod(colMeans(x))) / (n - 1)
  # Normalize by the outer product of the column standard deviations.
  cormat <- covmat / tcrossprod(sqrt(diag(covmat)))
  cormat
}
9 | } 10 | \usage{ 11 | cor_sparse(x) 12 | } 13 | \arguments{ 14 | \item{x}{A matrix, potentially a sparse matrix such as a "dgTMatrix" object} 15 | } 16 | \description{ 17 | Find the Pearson correlation of a sparse matrix. 18 | For large sparse matrix this is more efficient in time and memory than 19 | \code{cor(as.matrix(x))}. Note that it does not currently work on 20 | simple_triplet_matrix objects. 21 | } 22 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::covr 27 | needs: coverage 28 | 29 | - name: Test coverage 30 | run: covr::codecov(quiet = FALSE) 31 | shell: Rscript {0} 32 | -------------------------------------------------------------------------------- /tests/testthat/test-squarely.R: -------------------------------------------------------------------------------- 1 | context("squarely") 2 | 3 | suppressPackageStartupMessages(library(dplyr)) 4 | 5 | test_that("Can perform 'squarely' operations on pairs of items", { 6 | if (require("gapminder", quietly = TRUE)) { 7 | ncountries <- length(unique(gapminder$country)) 8 | 9 | closest <- gapminder %>% 10 | squarely(dist)(country, year, lifeExp) 11 | 12 | expect_equal(colnames(closest), c("item1", "item2", "value")) 13 | 14 | 
# Reference implementation of cosine similarity between two numeric vectors,
# used to validate pairwise_similarity() output in the tests below.
cosine_similarity <- function(x, y) {
  dot_product <- sum(x * y)
  norm_product <- sqrt(sum(x ^ 2)) * sqrt(sum(y ^ 2))
  dot_product / norm_product
}
-------------------------------------------------------------------------------- /man/widely_kmeans.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/widely_kmeans.R 3 | \name{widely_kmeans} 4 | \alias{widely_kmeans} 5 | \title{Cluster items based on k-means across features} 6 | \usage{ 7 | widely_kmeans(tbl, item, feature, value, k, fill = 0, ...) 8 | } 9 | \arguments{ 10 | \item{tbl}{Table} 11 | 12 | \item{item}{Item to cluster (as a bare column name)} 13 | 14 | \item{feature}{Feature column (dimension in clustering)} 15 | 16 | \item{value}{Value column} 17 | 18 | \item{k}{Number of clusters} 19 | 20 | \item{fill}{What to fill in for missing values} 21 | 22 | \item{...}{Other arguments passed on to \code{\link[=kmeans]{kmeans()}}} 23 | } 24 | \description{ 25 | Given a tidy table of features describing each item, perform k-means 26 | clustering using \code{\link[=kmeans]{kmeans()}} and retidy the data into 27 | one-row-per-cluster. 
28 | } 29 | \examples{ 30 | 31 | library(gapminder) 32 | library(dplyr) 33 | 34 | clusters <- gapminder \%>\% 35 | widely_kmeans(country, year, lifeExp, k = 5) 36 | 37 | clusters 38 | 39 | clusters \%>\% 40 | count(cluster) 41 | 42 | # Examine a few clusters 43 | clusters \%>\% filter(cluster == 1) 44 | clusters \%>\% filter(cluster == 2) 45 | 46 | } 47 | \seealso{ 48 | \code{\link[=widely_hclust]{widely_hclust()}} 49 | } 50 | -------------------------------------------------------------------------------- /man/widely_hclust.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/widely_hclust.R 3 | \name{widely_hclust} 4 | \alias{widely_hclust} 5 | \title{Cluster pairs of items into groups using hierarchical clustering} 6 | \usage{ 7 | widely_hclust(tbl, item1, item2, distance, k = NULL, h = NULL) 8 | } 9 | \arguments{ 10 | \item{tbl}{Table} 11 | 12 | \item{item1}{First item} 13 | 14 | \item{item2}{Second item} 15 | 16 | \item{distance}{Distance column} 17 | 18 | \item{k}{The desired number of groups} 19 | 20 | \item{h}{Height at which to cut the hierarchically clustered tree} 21 | } 22 | \description{ 23 | Reshape a table that represents pairwise distances into hierarchical clusters, 24 | returning a table with \code{item} and \code{cluster} columns. 
25 | } 26 | \examples{ 27 | 28 | library(gapminder) 29 | library(dplyr) 30 | 31 | # Construct Euclidean distances between countries based on life 32 | # expectancy over time 33 | country_distances <- gapminder \%>\% 34 | pairwise_dist(country, year, lifeExp) 35 | 36 | country_distances 37 | 38 | # Turn this into 5 hierarchical clusters 39 | clusters <- country_distances \%>\% 40 | widely_hclust(item1, item2, distance, k = 8) 41 | 42 | # Examine a few such clusters 43 | clusters \%>\% filter(cluster == 1) 44 | clusters \%>\% filter(cluster == 2) 45 | 46 | } 47 | \seealso{ 48 | \link{cutree} 49 | } 50 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # widyr (development version) 2 | 3 | * Now force evaluation for `widely()` function factory (thanks to @lhdjung, #44) 4 | 5 | # widyr 0.1.5 6 | 7 | * Change maintainer to Julia Silge 8 | * Updates for new Matrix package version (@simonpcouch, #41) 9 | * Update use of `distinct()` 10 | 11 | # widyr 0.1.4 12 | 13 | * Fix bug in United Nations vignette (caused by unvotes update). 14 | * Also changes the vignettes to render conditionally on package installation. 15 | 16 | # widyr 0.1.3 17 | 18 | * Update to work with the latest version of tidytext's cast_sparse. Adds rlang to IMPORTs. (@juliasilge, #30) 19 | * Update from data_frame() to tibble() in examples 20 | * Removed topicmodels from SUGGESTS (hasn't been required for several versions) 21 | * Fixed spelling mistakes of occurence->occurrence 22 | 23 | # widyr 0.1.2 24 | 25 | * Fixes to be compatible with tidyr v1.0.0, while also being reverse-compatible with previous versions of tidyr. 
26 | * Fix intro vignette index entry 27 | 28 | # widyr 0.1.1 29 | 30 | * Added `pairwise_delta` function for Burrows' delta 31 | * Added `pairwise_pmi` for pairwise mutual information 32 | * Added `widely_svd` for performing singular value decomposition then re-tidying 33 | * Removed methods from DESCRIPTION 34 | 35 | # widyr 0.1.0 36 | 37 | * Initial release of package 38 | * Only functions are the pairwise_ collection of functions, as well as the widely and squarely adverbs. 39 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | steps: 23 | - uses: actions/checkout@v2 24 | 25 | - uses: r-lib/actions/setup-pandoc@v2 26 | 27 | - uses: r-lib/actions/setup-r@v2 28 | with: 29 | use-public-rspm: true 30 | 31 | - uses: r-lib/actions/setup-r-dependencies@v2 32 | with: 33 | extra-packages: any::pkgdown, local::. 
34 | needs: website 35 | 36 | - name: Build site 37 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 38 | shell: Rscript {0} 39 | 40 | - name: Deploy to GitHub pages 🚀 41 | if: github.event_name != 'pull_request' 42 | uses: JamesIves/github-pages-deploy-action@4.1.4 43 | with: 44 | clean: false 45 | branch: gh-pages 46 | folder: docs 47 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Type: Package 2 | Package: widyr 3 | Title: Widen, Process, then Re-Tidy Data 4 | Version: 0.1.5.9000 5 | Authors@R: c( 6 | person("David", "Robinson", , "admiral.david@gmail.com", role = "aut"), 7 | person("Kanishka", "Misra", role = "ctb"), 8 | person("Julia", "Silge", , "julia.silge@gmail.com", role = c("aut", "cre"), 9 | comment = c(ORCID = "0000-0002-3671-836X")) 10 | ) 11 | Description: Encapsulates the pattern of untidying data into a wide 12 | matrix, performing some processing, then turning it back into a tidy 13 | form. This is useful for several operations such as co-occurrence 14 | counts, correlations, or clustering that are mathematically convenient 15 | on wide matrices. 
16 | License: MIT + file LICENSE 17 | URL: https://github.com/juliasilge/widyr, 18 | https://juliasilge.github.io/widyr/ 19 | BugReports: https://github.com/juliasilge/widyr/issues 20 | Imports: 21 | broom, 22 | dplyr, 23 | Matrix, 24 | purrr, 25 | reshape2, 26 | rlang, 27 | tibble, 28 | tidyr, 29 | tidytext 30 | Suggests: 31 | countrycode, 32 | covr, 33 | fuzzyjoin, 34 | gapminder, 35 | ggplot2, 36 | ggraph, 37 | igraph, 38 | irlba, 39 | janeaustenr, 40 | knitr, 41 | maps, 42 | rmarkdown, 43 | testthat, 44 | unvotes (>= 0.3.0) 45 | VignetteBuilder: 46 | knitr 47 | Encoding: UTF-8 48 | LazyData: TRUE 49 | Roxygen: list(markdown = TRUE) 50 | RoxygenNote: 7.2.1 51 | -------------------------------------------------------------------------------- /man/pairwise_similarity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pairwise_similarity.R 3 | \name{pairwise_similarity} 4 | \alias{pairwise_similarity} 5 | \alias{pairwise_similarity_} 6 | \title{Cosine similarity of pairs of items} 7 | \usage{ 8 | pairwise_similarity(tbl, item, feature, value, ...) 9 | 10 | pairwise_similarity_(tbl, item, feature, value, ...) 11 | } 12 | \arguments{ 13 | \item{tbl}{Table} 14 | 15 | \item{item}{Item to compare; will end up in \code{item1} and 16 | \code{item2} columns} 17 | 18 | \item{feature}{Column describing the feature that links one item to others} 19 | 20 | \item{value}{Value} 21 | 22 | \item{...}{Extra arguments passed on to \code{\link[=squarely]{squarely()}}, 23 | such as \code{diag} and \code{upper}} 24 | } 25 | \description{ 26 | Compute cosine similarity of all pairs of items in a tidy table. 
27 | } 28 | \examples{ 29 | 30 | library(janeaustenr) 31 | library(dplyr) 32 | library(tidytext) 33 | 34 | # Comparing Jane Austen novels 35 | austen_words <- austen_books() \%>\% 36 | unnest_tokens(word, text) \%>\% 37 | anti_join(stop_words, by = "word") \%>\% 38 | count(book, word) \%>\% 39 | ungroup() 40 | 41 | # closest books to each other 42 | closest <- austen_words \%>\% 43 | pairwise_similarity(book, word, n) \%>\% 44 | arrange(desc(similarity)) 45 | 46 | closest 47 | 48 | closest \%>\% 49 | filter(item1 == "Emma") 50 | 51 | } 52 | \seealso{ 53 | \code{\link[=squarely]{squarely()}} 54 | } 55 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macOS-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v2 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: 
any::rcmdcheck 45 | needs: check 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | -------------------------------------------------------------------------------- /man/pairwise_pmi.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pairwise_pmi.R 3 | \name{pairwise_pmi} 4 | \alias{pairwise_pmi} 5 | \alias{pairwise_pmi_} 6 | \title{Pointwise mutual information of pairs of items} 7 | \usage{ 8 | pairwise_pmi(tbl, item, feature, sort = FALSE, ...) 9 | 10 | pairwise_pmi_(tbl, item, feature, sort = FALSE, ...) 11 | } 12 | \arguments{ 13 | \item{tbl}{Table} 14 | 15 | \item{item}{Item to compare; will end up in \code{item1} and 16 | \code{item2} columns} 17 | 18 | \item{feature}{Column describing the feature that links one item to others} 19 | 20 | \item{sort}{Whether to sort in descending order of the pointwise mutual 21 | information} 22 | 23 | \item{...}{Extra arguments passed on to \code{squarely}, 24 | such as \code{diag} and \code{upper}} 25 | } 26 | \value{ 27 | A tbl_df with three columns, \code{item1}, \code{item2}, and 28 | \code{pmi}. 29 | } 30 | \description{ 31 | Find pointwise mutual information of pairs of items in a column, based on 32 | a "feature" column that links them together. 33 | This is an example of the spread-operate-retidy pattern. 
34 | } 35 | \examples{ 36 | 37 | library(dplyr) 38 | 39 | dat <- tibble(group = rep(1:5, each = 2), 40 | letter = c("a", "b", 41 | "a", "c", 42 | "a", "c", 43 | "b", "e", 44 | "b", "f")) 45 | 46 | # how informative is each letter about each other letter 47 | pairwise_pmi(dat, letter, group) 48 | pairwise_pmi(dat, letter, group, sort = TRUE) 49 | 50 | } 51 | -------------------------------------------------------------------------------- /man/pairwise_count.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pairwise_count.R 3 | \name{pairwise_count} 4 | \alias{pairwise_count} 5 | \alias{pairwise_count_} 6 | \title{Count pairs of items within a group} 7 | \usage{ 8 | pairwise_count(tbl, item, feature, wt = NULL, ...) 9 | 10 | pairwise_count_(tbl, item, feature, wt = NULL, ...) 11 | } 12 | \arguments{ 13 | \item{tbl}{Table} 14 | 15 | \item{item}{Item to count pairs of; will end up in \code{item1} and 16 | \code{item2} columns} 17 | 18 | \item{feature}{Column within which to count pairs 19 | \code{item2} columns} 20 | 21 | \item{wt}{Optionally a weight column, which should have a consistent weight 22 | for each feature} 23 | 24 | \item{...}{Extra arguments passed on to \code{squarely}, 25 | such as \code{diag}, \code{upper}, and \code{sort}} 26 | } 27 | \description{ 28 | Count the number of times each pair of items appear together within a group 29 | defined by "feature." For example, this could count the number of times 30 | two words appear within documents). 
31 | } 32 | \examples{ 33 | 34 | library(dplyr) 35 | dat <- tibble(group = rep(1:5, each = 2), 36 | letter = c("a", "b", 37 | "a", "c", 38 | "a", "c", 39 | "b", "e", 40 | "b", "f")) 41 | 42 | # count the number of times two letters appear together 43 | pairwise_count(dat, letter, group) 44 | pairwise_count(dat, letter, group, sort = TRUE) 45 | pairwise_count(dat, letter, group, sort = TRUE, diag = TRUE) 46 | 47 | } 48 | \seealso{ 49 | \code{\link[=squarely]{squarely()}} 50 | } 51 | -------------------------------------------------------------------------------- /man/squarely.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/squarely.R 3 | \name{squarely} 4 | \alias{squarely} 5 | \alias{squarely_} 6 | \title{A special case of the widely adverb for creating tidy 7 | square matrices} 8 | \usage{ 9 | squarely(.f, diag = FALSE, upper = TRUE, ...) 10 | 11 | squarely_(.f, diag = FALSE, upper = TRUE, ...) 12 | } 13 | \arguments{ 14 | \item{.f}{Function to wrap} 15 | 16 | \item{diag}{Whether to include diagonal (i = j) in output} 17 | 18 | \item{upper}{Whether to include upper triangle, which may be 19 | duplicated} 20 | 21 | \item{...}{Extra arguments passed on to \code{widely}} 22 | } 23 | \value{ 24 | Returns a function that takes at least four arguments: 25 | \item{tbl}{A table} 26 | \item{item}{Name of column to use as rows in wide matrix} 27 | \item{feature}{Name of column to use as columns in wide matrix} 28 | \item{feature}{Name of column to use as values in wide matrix} 29 | \item{...}{Arguments passed on to inner function} 30 | } 31 | \description{ 32 | A special case of \code{\link[=widely]{widely()}}. Used to pre-prepare and 33 | post-tidy functions that take an m x n (m items, n features) 34 | matrix and return an m x m (item x item) matrix, such as a 35 | distance or correlation matrix. 
36 | } 37 | \examples{ 38 | 39 | library(dplyr) 40 | library(gapminder) 41 | 42 | closest_continent <- gapminder \%>\% 43 | group_by(continent) \%>\% 44 | squarely(dist)(country, year, lifeExp) 45 | 46 | } 47 | \seealso{ 48 | \code{\link[=widely]{widely()}}, \code{\link[=pairwise_count]{pairwise_count()}}, 49 | \code{\link[=pairwise_cor]{pairwise_cor()}}, \code{\link[=pairwise_dist]{pairwise_dist()}} 50 | } 51 | -------------------------------------------------------------------------------- /man/pairwise_dist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pairwise_dist.R 3 | \name{pairwise_dist} 4 | \alias{pairwise_dist} 5 | \alias{pairwise_dist_} 6 | \title{Distances of pairs of items} 7 | \usage{ 8 | pairwise_dist(tbl, item, feature, value, method = "euclidean", ...) 9 | 10 | pairwise_dist_(tbl, item, feature, value, method = "euclidean", ...) 11 | } 12 | \arguments{ 13 | \item{tbl}{Table} 14 | 15 | \item{item}{Item to compare; will end up in \code{item1} and 16 | \code{item2} columns} 17 | 18 | \item{feature}{Column describing the feature that links one item to others} 19 | 20 | \item{value}{Value} 21 | 22 | \item{method}{Distance measure to be used; see \code{\link[=dist]{dist()}}} 23 | 24 | \item{...}{Extra arguments passed on to \code{\link[=squarely]{squarely()}}, 25 | such as \code{diag} and \code{upper}} 26 | } 27 | \description{ 28 | Compute distances of all pairs of items in a tidy table. 
29 | } 30 | \examples{ 31 | 32 | library(gapminder) 33 | library(dplyr) 34 | 35 | # closest countries in terms of life expectancy over time 36 | closest <- gapminder \%>\% 37 | pairwise_dist(country, year, lifeExp) \%>\% 38 | arrange(distance) 39 | 40 | closest 41 | 42 | closest \%>\% 43 | filter(item1 == "United States") 44 | 45 | # to remove duplicates, use upper = FALSE 46 | gapminder \%>\% 47 | pairwise_dist(country, year, lifeExp, upper = FALSE) \%>\% 48 | arrange(distance) 49 | 50 | # Can also use Manhattan distance 51 | gapminder \%>\% 52 | pairwise_dist(country, year, lifeExp, method = "manhattan", upper = FALSE) \%>\% 53 | arrange(distance) 54 | 55 | } 56 | \seealso{ 57 | \code{\link[=squarely]{squarely()}} 58 | } 59 | -------------------------------------------------------------------------------- /tests/testthat/test-pairwise-cor.R: -------------------------------------------------------------------------------- 1 | context("pairwise_cor") 2 | 3 | suppressPackageStartupMessages(library(dplyr)) 4 | 5 | d <- tibble(col = rep(c("a", "b", "c"), each = 3), 6 | row = rep(c("d", "e", "f"), 3), 7 | value = c(1, 2, 3, 6, 5, 4, 7, 9, 8)) 8 | 9 | test_that("pairwise_cor computes pairwise correlations", { 10 | ret <- d %>% 11 | pairwise_cor(col, row, value) 12 | 13 | ret1 <- ret$correlation[ret$item1 == "a" & ret$item2 == "b"] 14 | expect_equal(ret1, cor(1:3, 6:4)) 15 | 16 | ret2 <- ret$correlation[ret$item1 == "b" & ret$item2 == "c"] 17 | expect_equal(ret2, cor(6:4, c(7, 9, 8))) 18 | 19 | expect_equal(sum(ret$item1 == ret$item2), 0) 20 | }) 21 | 22 | test_that("pairwise_cor can compute Spearman correlations", { 23 | ret <- d %>% 24 | pairwise_cor(col, row, value, method = "spearman") 25 | 26 | ret1 <- ret$correlation[ret$item1 == "a" & ret$item2 == "b"] 27 | expect_equal(ret1, -1) 28 | }) 29 | 30 | test_that("pairwise_cor works on binary matrices", { 31 | cors <- data.frame(x = c("a", "a", "a", "b", "b", "b", "c", "c", "c"), 32 | y = c(1, 2, 3, 1, 2, 3, 1, 3, 4)) 
#' Cluster items based on k-means across features
#'
#' Given a tidy table of features describing each item, perform k-means
#' clustering using [kmeans()] and retidy the data into
#' one-row-per-cluster.
#'
#' @param tbl Table
#' @param item Item to cluster (as a bare column name)
#' @param feature Feature column (dimension in clustering)
#' @param value Value column
#' @param k Number of clusters
#' @param fill What to fill in for missing values
#' @param ... Other arguments passed on to [kmeans()]
#'
#' @seealso [widely_hclust()]
#'
#' @importFrom rlang :=
#'
#' @examples
#'
#' library(gapminder)
#' library(dplyr)
#'
#' clusters <- gapminder %>%
#'   widely_kmeans(country, year, lifeExp, k = 5)
#'
#' clusters
#'
#' clusters %>%
#'   count(cluster)
#'
#' # Examine a few clusters
#' clusters %>% filter(cluster == 1)
#' clusters %>% filter(cluster == 2)
#'
#' @export
widely_kmeans <- function(tbl, item, feature, value, k, fill = 0, ...) {
  # Capture the bare column names as strings so they can be used in
  # reshape2::acast() and base subsetting
  item_col <- as.character(substitute(item))
  feature_col <- as.character(substitute(feature))
  value_col <- as.character(substitute(value))

  # Cast the tidy table into an item x feature matrix, filling missing cells
  cast_formula <- stats::as.formula(paste(item_col, "~", feature_col))
  wide_mat <- reshape2::acast(tbl, cast_formula,
                              value.var = value_col, fill = fill)

  fit <- stats::kmeans(wide_mat, k, ...)

  # Map matrix row names back to the original item values (so the item
  # column keeps its original type, e.g. factor levels), then return one
  # row per item with its assigned cluster
  row_idx <- match(rownames(wide_mat), as.character(tbl[[item_col]]))
  out <- tibble::tibble(!!sym(item_col) := tbl[[item_col]][row_idx],
                        cluster = factor(fit$cluster))
  dplyr::arrange(out, cluster)
}
#' Cosine similarity of pairs of items
#'
#' Compute cosine similarity of all pairs of items in a tidy table.
#'
#' @param tbl Table
#' @param item Item to compare; will end up in `item1` and
#' `item2` columns
#' @param feature Column describing the feature that links one item to others
#' @param value Value
#' @param ... Extra arguments passed on to [squarely()],
#' such as `diag` and `upper`
#'
#' @seealso [squarely()]
#'
#' @export
pairwise_similarity <- function(tbl, item, feature, value, ...) {
  pairwise_similarity_(tbl,
                       col_name(substitute(item)),
                       col_name(substitute(feature)),
                       col_name(substitute(value)), ...)
}


#' @rdname pairwise_similarity
#' @export
pairwise_similarity_ <- function(tbl, item, feature, value, ...) {
  # (Removed a leftover debugging line, `m <- matrix(1:9, ncol = 3)`,
  # which created an unused local matrix.)
  #
  # Cosine similarity: scale each item's feature vector to unit length;
  # the item-by-item similarity matrix is then normed %*% t(normed).
  d_func <- squarely_(function(m) {
    normed <- m / sqrt(rowSums(m ^ 2))
    normed %*% t(normed)
  }, sparse = TRUE, ...)

  tbl %>%
    d_func(item, feature, value) %>%
    rename(similarity = value)
}
If not given, defaults to all values being 1 (thus 37 | a binary correlation)} 38 | 39 | \item{method}{Correlation method} 40 | 41 | \item{use}{Character string specifying the behavior of correlations 42 | with missing values; passed on to \code{cor}} 43 | 44 | \item{...}{Extra arguments passed on to \code{squarely}, 45 | such as \code{diag} and \code{upper}} 46 | } 47 | \description{ 48 | Find correlations of pairs of items in a column, based on a "feature" column 49 | that links them together. This is an example of the spread-operate-retidy pattern. 50 | } 51 | \examples{ 52 | 53 | library(dplyr) 54 | library(gapminder) 55 | 56 | gapminder \%>\% 57 | pairwise_cor(country, year, lifeExp) 58 | 59 | gapminder \%>\% 60 | pairwise_cor(country, year, lifeExp, sort = TRUE) 61 | 62 | # United Nations voting data 63 | if (require("unvotes", quietly = TRUE)) { 64 | country_cors <- un_votes \%>\% 65 | mutate(vote = as.numeric(vote)) \%>\% 66 | pairwise_cor(country, rcid, vote, sort = TRUE) 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /man/widely.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/widely.R 3 | \name{widely} 4 | \alias{widely} 5 | \alias{widely_} 6 | \title{Adverb for functions that operate on matrices in "wide" 7 | format} 8 | \usage{ 9 | widely(.f, sort = FALSE, sparse = FALSE, maximum_size = 1e+07) 10 | 11 | widely_(.f, sort = FALSE, sparse = FALSE, maximum_size = 1e+07) 12 | } 13 | \arguments{ 14 | \item{.f}{Function being wrapped} 15 | 16 | \item{sort}{Whether to sort in descending order of \code{value}} 17 | 18 | \item{sparse}{Whether to cast to a sparse matrix} 19 | 20 | \item{maximum_size}{To prevent crashing, a maximum size of a 21 | non-sparse matrix to be created. 
Set to NULL to allow any size 22 | matrix.} 23 | } 24 | \value{ 25 | Returns a function that takes at least four arguments: 26 | \item{tbl}{A table} 27 | \item{row}{Name of column to use as rows in wide matrix} 28 | \item{column}{Name of column to use as columns in wide matrix} 29 | \item{value}{Name of column to use as values in wide matrix} 30 | \item{...}{Arguments passed on to inner function} 31 | 32 | \code{widely} creates a function that takes those columns as 33 | bare names, \code{widely_} a function that takes them as strings. 34 | } 35 | \description{ 36 | Modify a function in order to pre-cast the input into a wide 37 | matrix format, perform the function, and then 38 | re-tidy (e.g. melt) the output into a tidy table. 39 | } 40 | \examples{ 41 | 42 | library(dplyr) 43 | library(gapminder) 44 | 45 | gapminder 46 | 47 | gapminder \%>\% 48 | widely(dist)(country, year, lifeExp) 49 | 50 | # can perform within groups 51 | closest_continent <- gapminder \%>\% 52 | group_by(continent) \%>\% 53 | widely(dist)(country, year, lifeExp) 54 | closest_continent 55 | 56 | # for example, find the closest pair in each 57 | closest_continent \%>\% 58 | top_n(1, -value) 59 | 60 | } 61 | -------------------------------------------------------------------------------- /R/pairwise_dist.R: -------------------------------------------------------------------------------- 1 | #' Distances of pairs of items 2 | #' 3 | #' Compute distances of all pairs of items in a tidy table. 4 | #' 5 | #' @param tbl Table 6 | #' @param item Item to compare; will end up in `item1` and 7 | #' `item2` columns 8 | #' @param feature Column describing the feature that links one item to others 9 | #' @param value Value 10 | #' @param method Distance measure to be used; see [dist()] 11 | #' @param ... 
#' Distances of pairs of items
#'
#' Compute distances of all pairs of items in a tidy table.
#'
#' @param tbl Table
#' @param item Item to compare; will end up in `item1` and
#' `item2` columns
#' @param feature Column describing the feature that links one item to others
#' @param value Value
#' @param method Distance measure to be used; see [dist()]
#' @param ... Extra arguments passed on to [squarely()],
#' such as `diag` and `upper`
#'
#' @seealso [squarely()]
#'
#' @export
pairwise_dist <- function(tbl, item, feature, value,
                          method = "euclidean", ...) {
  # Resolve bare column names to strings, then delegate to the
  # standard-evaluation version
  pairwise_dist_(tbl,
                 col_name(substitute(item)),
                 col_name(substitute(feature)),
                 col_name(substitute(value)),
                 method = method, ...)
}


#' @rdname pairwise_dist
#' @export
pairwise_dist_ <- function(tbl, item, feature, value, method = "euclidean", ...) {
  # Wrap stats::dist() so it consumes and produces a square
  # item-by-item matrix, tidied back into long form by squarely_()
  compute_dist <- function(m) {
    as.matrix(stats::dist(m, method = method))
  }
  tidied_dist <- squarely_(compute_dist, ...)

  ret <- tidied_dist(tbl, item, feature, value)
  rename(ret, distance = value)
}
9 | 10 | widely_svd_(tbl, item, feature, value, nv = NULL, weight_d = FALSE, ...) 11 | } 12 | \arguments{ 13 | \item{tbl}{Table} 14 | 15 | \item{item}{Item to perform dimensionality reduction on; will end up in \code{item} column} 16 | 17 | \item{feature}{Column describing the feature that links one item to others.} 18 | 19 | \item{value}{Value} 20 | 21 | \item{nv}{Optional; the number of principal components to estimate. Recommended for matrices 22 | with many features.} 23 | 24 | \item{weight_d}{Whether to multiply each value by the \code{d} principal component.} 25 | 26 | \item{...}{Extra arguments passed to \code{svd} (if \code{nv} is \code{NULL}) 27 | or \code{irlba} (if \code{nv} is given)} 28 | } 29 | \value{ 30 | A tbl_df with three columns. The first is retained from the \code{item} input, 31 | then \code{dimension} and \code{value}. Each row represents one principal component 32 | value. 33 | } 34 | \description{ 35 | This is useful for dimensionality reduction of items, especially when setting a 36 | lower nv. 
37 | } 38 | \examples{ 39 | 40 | library(dplyr) 41 | library(gapminder) 42 | 43 | # principal components driving change 44 | gapminder_svd <- gapminder \%>\% 45 | widely_svd(country, year, lifeExp) 46 | 47 | gapminder_svd 48 | 49 | # compare SVDs, join with other data 50 | library(ggplot2) 51 | library(tidyr) 52 | 53 | gapminder_svd \%>\% 54 | spread(dimension, value) \%>\% 55 | inner_join(distinct(gapminder, country, continent), by = "country") \%>\% 56 | ggplot(aes(`1`, `2`, label = country)) + 57 | geom_point(aes(color = continent)) + 58 | geom_text(vjust = 1, hjust = 1) 59 | 60 | } 61 | -------------------------------------------------------------------------------- /man/pairwise_delta.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pairwise_delta.R 3 | \name{pairwise_delta} 4 | \alias{pairwise_delta} 5 | \alias{pairwise_delta_} 6 | \title{Delta measure of pairs of documents} 7 | \usage{ 8 | pairwise_delta(tbl, item, feature, value, method = "burrows", ...) 9 | 10 | pairwise_delta_(tbl, item, feature, value, method = "burrows", ...) 11 | } 12 | \arguments{ 13 | \item{tbl}{Table} 14 | 15 | \item{item}{Item to compare; will end up in \code{item1} and 16 | \code{item2} columns} 17 | 18 | \item{feature}{Column describing the feature that links one item to others} 19 | 20 | \item{value}{Value} 21 | 22 | \item{method}{Distance measure to be used; see \code{\link[=dist]{dist()}}} 23 | 24 | \item{...}{Extra arguments passed on to \code{\link[=squarely]{squarely()}}, 25 | such as \code{diag} and \code{upper}} 26 | } 27 | \description{ 28 | Compute the delta distances (from its two variants) of all pairs of documents in a tidy table. 
#' Count pairs of items within a group
#'
#' Count the number of times each pair of items appear together within a group
#' defined by "feature" (for example, this could count the number of times
#' two words appear within documents).
#'
#' @param tbl Table
#' @param item Item to count pairs of; will end up in `item1` and
#' `item2` columns
#' @param feature Column within which to count pairs
#' @param wt Optionally a weight column, which should have a consistent weight
#' for each feature
#' @param ... Extra arguments passed on to `squarely`,
#' such as `diag`, `upper`, and `sort`
#'
#' @seealso [squarely()]
#'
#' @examples
#'
#' library(dplyr)
#' dat <- tibble(group = rep(1:5, each = 2),
#'               letter = c("a", "b",
#'                          "a", "c",
#'                          "a", "c",
#'                          "b", "e",
#'                          "b", "f"))
#'
#' # count the number of times two letters appear together
#' pairwise_count(dat, letter, group)
#' pairwise_count(dat, letter, group, sort = TRUE)
#' pairwise_count(dat, letter, group, sort = TRUE, diag = TRUE)
#'
#' @export
pairwise_count <- function(tbl, item, feature, wt = NULL, ...) {
  pairwise_count_(tbl,
                  col_name(substitute(item)),
                  col_name(substitute(feature)),
                  wt = col_name(substitute(wt)),
                  ...)
}


#' @rdname pairwise_count
#' @export
pairwise_count_ <- function(tbl, item, feature, wt = NULL, ...) {
  # Without weights, co-occurrences are counted directly on the binary
  # item x feature matrix (m %*% t(m)); with a weight column, each
  # item's weight is summed wherever the other item is present
  # (m %*% t(m > 0)).
  if (is.null(wt)) {
    func <- squarely_(function(m) m %*% t(m), sparse = TRUE, ...)
    wt <- "..value"
  } else {
    func <- squarely_(function(m) m %*% t(m > 0), sparse = TRUE, ...)
  }

  # Deduplicate item/feature pairs so each co-occurrence counts once
  tbl %>%
    distinct(.data[[item]], .data[[feature]], .keep_all = TRUE) %>%
    mutate(..value = 1) %>%
    func(item, feature, wt) %>%
    rename(n = value)
}
#'
#' @param tbl Table
#' @param item Item to compare; will end up in `item1` and
#' `item2` columns
#' @param feature Column describing the feature that links one item to others
#' @param sort Whether to sort in descending order of the pointwise mutual
#' information
#' @param ... Extra arguments passed on to `squarely`,
#' such as `diag` and `upper`
#'
#' @name pairwise_pmi
#'
#' @return A tbl_df with three columns, `item1`, `item2`, and
#' `pmi`.
#'
#' @examples
#'
#' library(dplyr)
#'
#' dat <- tibble(group = rep(1:5, each = 2),
#'               letter = c("a", "b",
#'                          "a", "c",
#'                          "a", "c",
#'                          "b", "e",
#'                          "b", "f"))
#'
#' # how informative is each letter about each other letter
#' pairwise_pmi(dat, letter, group)
#' pairwise_pmi(dat, letter, group, sort = TRUE)
#'
#' @export
pairwise_pmi <- function(tbl, item, feature, sort = FALSE, ...) {
  # Resolve bare column names to strings and delegate to the
  # standard-evaluation version
  pairwise_pmi_(tbl,
                col_name(substitute(item)),
                col_name(substitute(feature)),
                sort = sort, ...)
}


#' @rdname pairwise_pmi
#' @export
pairwise_pmi_ <- function(tbl, item, feature, sort = FALSE, ...) {
  # On the item x feature occurrence matrix, compute
  # p(x, y) / (p(x) * p(y)); the log is taken after retidying, giving
  # PMI = log(p(x, y) / (p(x) p(y))).
  f <- function(m) {
    # marginal probability p(x) of each item
    row_sums <- rowSums(m) / sum(m)

    ret <- m %*% t(m)          # co-occurrence counts
    ret <- ret / sum(ret)      # joint probability p(x, y)
    ret <- ret / row_sums      # divide each row by p(x)
    ret <- t(t(ret) / (row_sums))  # divide each column by p(y)
    ret
  }
  pmi_func <- squarely_(f, sparse = TRUE, sort = sort, ...)

  tbl %>%
    ungroup() %>%
    mutate(..value = 1) %>%
    pmi_func(item, feature, "..value") %>%
    mutate(value = log(value)) %>%
    rename(pmi = value)
}
#' Cluster pairs of items into groups using hierarchical clustering
#'
#' Reshape a table that represents pairwise distances into hierarchical clusters,
#' returning a table with `item` and `cluster` columns.
#'
#' @param tbl Table
#' @param item1 First item
#' @param item2 Second item
#' @param distance Distance column
#' @param k The desired number of groups
#' @param h Height at which to cut the hierarchically clustered tree
#'
#' @examples
#'
#' library(gapminder)
#' library(dplyr)
#'
#' # Construct Euclidean distances between countries based on life
#' # expectancy over time
#' country_distances <- gapminder %>%
#'   pairwise_dist(country, year, lifeExp)
#'
#' country_distances
#'
#' # Turn this into 8 hierarchical clusters
#' clusters <- country_distances %>%
#'   widely_hclust(item1, item2, distance, k = 8)
#'
#' # Examine a few such clusters
#' clusters %>% filter(cluster == 1)
#' clusters %>% filter(cluster == 2)
#'
#' @seealso [cutree]
#'
#' @export
widely_hclust <- function(tbl, item1, item2, distance, k = NULL, h = NULL) {
  # Capture bare column names as strings
  col1_str <- as.character(substitute(item1))
  col2_str <- as.character(substitute(item2))
  dist_str <- as.character(substitute(distance))

  # All items appearing on either side of a pair
  unique_items <- unique(c(as.character(tbl[[col1_str]]), as.character(tbl[[col2_str]])))

  # Pairs absent from the input are filled with the maximum observed
  # distance so they are treated as maximally far apart
  max_distance <- max(tbl[[dist_str]])

  # Recode items as integer indices, cast to a square distance matrix,
  # cluster hierarchically, cut the tree, and map the indices back to
  # the original item values
  tibble(item1 = match(tbl[[col1_str]], unique_items),
         item2 = match(tbl[[col2_str]], unique_items),
         distance = tbl[[dist_str]]) %>%
    reshape2::acast(item1 ~ item2, value.var = "distance", fill = max_distance) %>%
    stats::as.dist() %>%
    stats::hclust() %>%
    stats::cutree(k = k, h = h) %>%
    tibble::enframe("item", "cluster") %>%
    dplyr::mutate(item = unique_items[as.integer(item)],
                  cluster = factor(cluster)) %>%
    dplyr::arrange(cluster)
}
ret$item2)) 41 | expect_false(any("Afghanistan" == ret$item1 & "United States" == ret$item2)) 42 | 43 | expect_true(all(ret$value <= 1)) 44 | expect_true(all(ret$value >= -1)) 45 | }) 46 | 47 | test_that("widely's maximum size argument works", { 48 | skip_if_not_installed("gapminder") 49 | library(gapminder) 50 | 51 | f <- function() { 52 | widely(cor, maximum_size = 1000)(gapminder, year, country, lifeExp) 53 | } 54 | expect_error(f(), "1704.*large") 55 | }) 56 | 57 | test_that("widely's arguments are evaluated when widely itself is called", { 58 | skip_if_not_installed("gapminder") 59 | library(gapminder) 60 | 61 | sort <- FALSE 62 | dist_widely <- widely(dist, sort = sort) 63 | sort <- TRUE 64 | ret1 <- dist_widely(gapminder, country, year, lifeExp) 65 | 66 | sort <- FALSE 67 | dist_widely <- widely(dist, sort = sort) 68 | ret2 <- dist_widely(gapminder, country, year, lifeExp) 69 | 70 | expect_equal(ret1, ret2) 71 | }) 72 | -------------------------------------------------------------------------------- /R/pairwise_cor.R: -------------------------------------------------------------------------------- 1 | #' Correlations of pairs of items 2 | #' 3 | #' Find correlations of pairs of items in a column, based on a "feature" column 4 | #' that links them together. This is an example of the spread-operate-retidy pattern. 5 | #' 6 | #' @param tbl Table 7 | #' @param item Item to compare; will end up in `item1` and 8 | #' `item2` columns 9 | #' @param feature Column describing the feature that links one item to others 10 | #' @param value Value column. If not given, defaults to all values being 1 (thus 11 | #' a binary correlation) 12 | #' @param method Correlation method 13 | #' @param use Character string specifying the behavior of correlations 14 | #' with missing values; passed on to `cor` 15 | #' @param ... 
#' @export
pairwise_cor <- function(tbl, item, feature, value,
                         method = c("pearson", "kendall", "spearman"),
                         use = "everything", ...) {
  # When no value column is given, use a constant 1 so the correlation
  # is computed on binary presence/absence of each feature
  if (missing(value)) {
    tbl$..value <- 1
    val <- "..value"
  } else {
    val <- col_name(substitute(value))
  }

  pairwise_cor_(tbl,
                col_name(substitute(item)),
                col_name(substitute(feature)),
                val,
                method = method, use = use, ...)
}


#' @rdname pairwise_cor
#' @export
pairwise_cor_ <- function(tbl, item, feature, value,
                          method = c("pearson", "kendall", "spearman"),
                          use = "everything",
                          ...) {
  method <- match.arg(method)

  # The fast sparse path (cor_sparse) only implements plain Pearson
  # correlation with default missing-value handling. Both operands are
  # scalars, so use short-circuit && rather than elementwise &.
  sparse <- (method == "pearson" && use == "everything")
  f <- if (sparse) {
    function(x) cor_sparse(t(x))
  } else {
    function(x) stats::cor(t(x), method = method, use = use)
  }
  cor_func <- squarely_(f, sparse = sparse, ...)

  tbl %>%
    ungroup() %>%
    cor_func(item, feature, value) %>%
    rename(correlation = value)
}
#'
#' @param tbl Table
#' @param item Item to compare; will end up in `item1` and
#' `item2` columns
#' @param feature Column describing the feature that links one item to others
#' @param value Value
#' @param method Distance measure to be used; see [dist()]
#' @param ... Extra arguments passed on to [squarely()],
#' such as `diag` and `upper`
#'
#' @seealso [squarely()]
#'
#' @examples
#'
#' library(janeaustenr)
#' library(dplyr)
#' library(tidytext)
#'
#' # closest documents in terms of 1000 most frequent words
#' closest <- austen_books() %>%
#'   unnest_tokens(word, text) %>%
#'   count(book, word) %>%
#'   top_n(1000, n) %>%
#'   pairwise_delta(book, word, n, method = "burrows") %>%
#'   arrange(delta)
#'
#' closest
#'
#' closest %>%
#'   filter(item1 == "Pride & Prejudice")
#'
#' # to remove duplicates, use upper = FALSE
#' closest <- austen_books() %>%
#'   unnest_tokens(word, text) %>%
#'   count(book, word) %>%
#'   top_n(1000, n) %>%
#'   pairwise_delta(book, word, n, method = "burrows", upper = FALSE) %>%
#'   arrange(delta)
#'
#' # Can also use Argamon's Linear Delta
#' closest <- austen_books() %>%
#'   unnest_tokens(word, text) %>%
#'   count(book, word) %>%
#'   top_n(1000, n) %>%
#'   pairwise_delta(book, word, n, method = "argamon", upper = FALSE) %>%
#'   arrange(delta)
#'
#' @export
pairwise_delta <- function(tbl, item, feature, value,
                           method = "burrows", ...) {
  pairwise_delta_(tbl,
                  col_name(substitute(item)),
                  col_name(substitute(feature)),
                  col_name(substitute(value)),
                  method = method, ...)
}


#' @rdname pairwise_delta
#' @export
pairwise_delta_ <- function(tbl, item, feature, value, method = "burrows", ...) {
  # Burrows' Delta is the mean absolute difference of feature z-scores
  # (Manhattan distance over scaled columns, divided by the number of
  # features); Argamon's Linear Delta uses Euclidean distance instead.
  delta_func <- function(m) {
    dist_method <- switch(method,
      burrows = "manhattan",
      argamon = "euclidean",
      stop("Wrong method! Only method = burrows or method = argamon have been implemented!")
    )

    as.matrix(stats::dist(scale(m), method = dist_method) / ncol(m))
  }

  d_func <- squarely_(delta_func, ...)

  tbl %>%
    d_func(item, feature, value) %>%
    rename(delta = value)
}
#' @param ... Extra arguments passed on to `widely`
#'
#' @return Returns a function that takes at least four arguments:
#'   \item{tbl}{A table}
#'   \item{item}{Name of column to use as rows in wide matrix}
#'   \item{feature}{Name of column to use as columns in wide matrix}
#'   \item{value}{Name of column to use as values in wide matrix}
#'   \item{...}{Arguments passed on to inner function}
#'
#' @seealso [widely()], [pairwise_count()],
#' [pairwise_cor()], [pairwise_dist()]
#'
#' @examples
#'
#' library(dplyr)
#' library(gapminder)
#'
#' closest_continent <- gapminder %>%
#'   group_by(continent) %>%
#'   squarely(dist)(country, year, lifeExp)
#'
#' @export
squarely <- function(.f, diag = FALSE, upper = TRUE, ...) {
  # Build the string-based version once, then return a wrapper that
  # resolves bare column names (NSE) before delegating to it
  inner_func <- squarely_(.f, diag = diag, upper = upper, ...)
  function(tbl, item, feature, value, ...) {
    inner_func(tbl,
               col_name(substitute(item)),
               col_name(substitute(feature)),
               col_name(substitute(value)),
               ...)
  }
}


#' @rdname squarely
#' @export
squarely_ <- function(.f, diag = FALSE,
                      upper = TRUE,
                      ...) {
  # Capture extra arguments now so they are forwarded to widely_ each
  # time the returned function is called
  extra_args <- list(...)

  f <- function(tbl, item, feature, value, ...) {
    if (inherits(tbl, "grouped_df")) {
      # perform within each group, then restore groups
      ret <- tbl %>%
        tidyr::nest() %>%
        mutate(data = purrr::map(data, f, item, feature, value)) %>%
        filter(purrr::map_lgl(data, ~ nrow(.) > 0)) %>%
        tidyr::unnest(data) %>%
        dplyr::group_by_at(dplyr::group_vars(tbl))

      return(ret)
    }

    # Temporarily recode items as integer indices into their unique
    # values; this lets the triangle/diagonal filters below compare
    # items by position, and preserves the original type (e.g. factor
    # levels) when mapped back at the end
    item_vals <- tbl[[item]]
    item_u <- unique(item_vals)

    tbl[[item]] <- match(item_vals, item_u)

    new_f <- do.call(widely_, c(list(.f), extra_args))
    ret <- new_f(tbl, item, feature, value, ...)

    ret$item1 <- as.integer(ret$item1)
    ret$item2 <- as.integer(ret$item2)

    # Drop the upper triangle and/or diagonal of the square result as
    # requested
    if (!upper) {
      ret <- dplyr::filter(ret, item1 <= item2)
    }
    if (!diag) {
      ret <- dplyr::filter(ret, item1 != item2)
    }

    # Map the integer codes back to the original item values
    ret$item1 <- item_u[ret$item1]
    ret$item2 <- item_u[ret$item2]

    ret
  }
  f
}
#'
#' @examples
#'
#' library(dplyr)
#' library(gapminder)
#'
#' # principal components driving change
#' gapminder_svd <- gapminder %>%
#'   widely_svd(country, year, lifeExp)
#'
#' gapminder_svd
#'
#' # compare SVDs, join with other data
#' library(ggplot2)
#' library(tidyr)
#'
#' gapminder_svd %>%
#'   spread(dimension, value) %>%
#'   inner_join(distinct(gapminder, country, continent), by = "country") %>%
#'   ggplot(aes(`1`, `2`, label = country)) +
#'   geom_point(aes(color = continent)) +
#'   geom_text(vjust = 1, hjust = 1)
#'
#' @export
widely_svd <- function(tbl, item, feature, value, nv = NULL, weight_d = FALSE, ...) {
  # NSE wrapper: convert the bare column names to strings, then
  # delegate to the standard-evaluation version.
  widely_svd_(tbl,
              col_name(substitute(item)),
              col_name(substitute(feature)),
              col_name(substitute(value)),
              nv = nv,
              weight_d = weight_d,
              ...)
}


#' @rdname widely_svd
#' @export
widely_svd_ <- function(tbl, item, feature, value, nv = NULL, weight_d = FALSE, ...) {
  if (is.null(nv)) {
    # Exact SVD on a dense matrix
    perform_svd <- function(m) {
      s <- svd(m, ...)

      if (weight_d) {
        # scale each column of u by its singular value
        ret <- t(s$d * t(s$u))
      } else {
        ret <- s$u
      }

      rownames(ret) <- rownames(m)
      ret
    }
    sparse <- FALSE
  } else {
    # Truncated SVD (nv components) on a sparse matrix via irlba
    if (!requireNamespace("irlba", quietly = TRUE)) {
      stop("Requires the irlba package")
    }
    perform_svd <- function(m) {
      s <- irlba::irlba(m, nv = nv, ...)
      if (weight_d) {
        ret <- t(s$d * t(s$u))
      } else {
        ret <- s$u
      }

      rownames(ret) <- rownames(m)
      ret
    }
    sparse <- TRUE
  }

  # Temporarily replace item values with integer indices (restored
  # below) so rows can be matched back after casting to a matrix.
  item_vals <- tbl[[item]]
  item_u <- unique(item_vals)
  tbl[[item]] <- match(item_vals, item_u)

  ret <- widely_(perform_svd, sparse = sparse)(tbl, item, feature, value)

  # item2 from the melted output is the principal-component index
  ret <- ret %>%
    transmute(item = item_u[as.integer(item1)],
              dimension = item2,
              value)

  colnames(ret)[1] <- item

  ret
}
--------------------------------------------------------------------------------
/vignettes/intro.Rmd:
--------------------------------------------------------------------------------
---
title: "widyr: Widen, process, and re-tidy a dataset"
author: "David Robinson"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{widyr: Widen, process, and re-tidy a dataset}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---


This package wraps the pattern of un-tidying data into a wide matrix, performing some processing, then turning it back into a tidy form. This is useful for several mathematical operations such as co-occurrence counts, correlations, or clustering that are best done on a wide matrix.

## Towards a precise definition of "wide" data

The term "wide data" has gone out of fashion as being "imprecise" [(Wickham 2014)](http://vita.had.co.nz/papers/tidy-data.pdf), but I think with a proper definition the term could be entirely meaningful and useful.

A **wide** dataset is one or more matrices where:

* Each row is one **item**
* Each column is one **feature**
* Each value is one **observation**
* Each matrix is one **variable**

When would you want data to be wide rather than tidy?
Notable examples include classification, clustering, correlation, factorization, or other operations that can take advantage of a matrix structure. In general, when you want to **compare between items** rather than compare between variables, this is a useful structure. 27 | 28 | The widyr package is based on the observation that during a tidy data analysis, you often want data to be wide only *temporarily*, before returning to a tidy structure for visualization and further analysis. widyr makes this easy through a set of `pairwise_` functions. 29 | 30 | ## Example: gapminder 31 | 32 | Consider the gapminder dataset in the [gapminder package](https://CRAN.R-project.org/package=gapminder). 33 | 34 | ```{r} 35 | library(dplyr) 36 | library(gapminder) 37 | 38 | gapminder 39 | ``` 40 | 41 | This tidy format (one-row-per-country-per-year) is very useful for grouping, summarizing, and filtering operations. But if we want to *compare pairs of countries* (for example, to find countries that are similar to each other), we would have to reshape this dataset. Note that here, country is the **item**, while year is the **feature** column. 42 | 43 | #### Pairwise operations 44 | 45 | The widyr package offers `pairwise_` functions that operate on pairs of items within data. An example is `pairwise_dist`: 46 | 47 | ```{r} 48 | library(widyr) 49 | 50 | gapminder %>% 51 | pairwise_dist(country, year, lifeExp) 52 | ``` 53 | 54 | In a single step, this finds the Euclidean distance between the `lifeExp` value in each pair of countries, matching pairs based on year. We could find the closest pairs of countries overall with `arrange()`: 55 | 56 | ```{r} 57 | gapminder %>% 58 | pairwise_dist(country, year, lifeExp) %>% 59 | arrange(distance) 60 | ``` 61 | 62 | Notice that this includes duplicates (Germany/Belgium and Belgium/Germany). 
To avoid those (the upper triangle of the distance matrix), use `upper = FALSE`: 63 | 64 | ```{r} 65 | gapminder %>% 66 | pairwise_dist(country, year, lifeExp, upper = FALSE) %>% 67 | arrange(distance) 68 | ``` 69 | 70 | In some analyses, we may be interested in correlation rather than distance of pairs. For this we would use `pairwise_cor`: 71 | 72 | ```{r} 73 | gapminder %>% 74 | pairwise_cor(country, year, lifeExp, upper = FALSE, sort = TRUE) 75 | ``` 76 | -------------------------------------------------------------------------------- /tests/testthat/test-pairwise-count.R: -------------------------------------------------------------------------------- 1 | # tests for pairwise_count function 2 | 3 | context("pairwise_count") 4 | 5 | suppressPackageStartupMessages(library(dplyr)) 6 | suppressPackageStartupMessages(library(tidytext)) 7 | 8 | original <- tibble(txt = c("I felt a funeral in my brain,", 9 | "And mourners, to and fro,", 10 | "Kept treading, treading, till it seemed", 11 | "That sense was breaking through.")) %>% 12 | mutate(line = row_number()) %>% 13 | unnest_tokens(char, txt, token = "characters") 14 | 15 | test_that("pairing and counting works", { 16 | d <- original %>% 17 | pairwise_count(char, line, sort = TRUE, upper = FALSE, diag = FALSE) 18 | 19 | expect_equal(nrow(d), 164) 20 | expect_equal(ncol(d), 3) 21 | expect_equal(d$item1[1], "e") 22 | expect_equal(d$item2[10], "r") 23 | expect_equal(d$n[20], 3) 24 | 25 | expect_false(any(d$item1 == d$item2)) 26 | expect_false(is.unsorted(rev(d$n))) 27 | 28 | # test additional arguments 29 | 30 | # for self-pairs, the number of occurrences should be the number of distinct 31 | # lines 32 | d2 <- original %>% 33 | pairwise_count(char, line, sort = TRUE, upper = FALSE, diag = TRUE) 34 | 35 | expect_equal(nrow(d2), nrow(d) + 20) 36 | 37 | self_pairs <- d2 %>% 38 | filter(item1 == item2) %>% 39 | arrange(item1) 40 | 41 | char_counts <- original %>% 42 | distinct(line, char) %>% 43 | count(char) %>% 44 | 
arrange(char) 45 | 46 | expect_true(all(self_pairs$item1 == char_counts$char)) 47 | expect_true(all(self_pairs$n == char_counts$n)) 48 | 49 | # when upper is TRUE, should include twice as many items as original 50 | d3 <- original %>% 51 | pairwise_count(char, line, sort = TRUE, upper = TRUE) 52 | 53 | expect_equal(nrow(d) * 2, nrow(d3)) 54 | expect_true(all(sort(d3$item1) == sort(d3$item2))) 55 | }) 56 | 57 | 58 | test_that("We can count with a weight column", { 59 | d <- tibble(col1 = c("a", "a", "a", "b", "b", "b"), 60 | col2 = c("x", "y", "z", "x", "x", "z"), 61 | weight = c(1, 1, 1, 5, 5, 5)) 62 | 63 | ret1 <- pairwise_count(d, col2, col1) 64 | expect_equal(ret1$n[ret1$item1 == "z" & ret1$item2 == "y"], 1) 65 | expect_equal(ret1$n[ret1$item1 == "z" & ret1$item2 == "x"], 2) 66 | 67 | ret2 <- pairwise_count(d, col2, col1, wt = weight) 68 | expect_equal(ret2$n[ret1$item1 == "z" & ret1$item2 == "y"], 1) 69 | expect_equal(ret2$n[ret1$item1 == "z" & ret1$item2 == "x"], 6) 70 | }) 71 | 72 | 73 | test_that("Counts co-occurrences of words in Pride & Prejudice", { 74 | if (require("janeaustenr", quietly = TRUE)) { 75 | words <- tibble(text = prideprejudice) %>% 76 | mutate(line = row_number()) %>% 77 | unnest_tokens(word, text) 78 | 79 | pairs <- words %>% 80 | pairwise_count(word, line, upper = TRUE, diag = TRUE, sort = TRUE) 81 | 82 | # check it is sorted in descending order 83 | expect_false(is.unsorted(rev(pairs$n))) 84 | 85 | # check occurrences of words that appear with "elizabeth" 86 | words_with_elizabeth <- words %>% 87 | filter(word == "elizabeth") %>% 88 | select(line) %>% 89 | inner_join(words, by = "line") %>% 90 | distinct(word, line) %>% 91 | count(word) %>% 92 | arrange(n, word) 93 | 94 | pairs_with_elizabeth <- pairs %>% 95 | filter(item1 == "elizabeth") %>% 96 | arrange(n, item2) 97 | 98 | expect_true(all(words_with_elizabeth$word == pairs_with_elizabeth$item2)) 99 | expect_true(all(words_with_elizabeth$n == pairs_with_elizabeth$n)) 100 | } 101 | }) 
102 | 103 | test_that("Can count within groups", { 104 | grouped_result <- mtcars %>% 105 | group_by(cyl) %>% 106 | pairwise_count(vs, am) 107 | 108 | expect_equal(as.character(groups(grouped_result)), c("cyl")) 109 | expect_equal(nrow(grouped_result), 2) 110 | expect_equal(colnames(grouped_result), c("cyl", "item1", "item2", "n")) 111 | }) 112 | -------------------------------------------------------------------------------- /vignettes/united_nations.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "United Nations Voting Correlations" 3 | author: "David Robinson" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{United Nations Voting Correlations} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r setup, echo = FALSE} 13 | library(knitr) 14 | 15 | options(width = 102) 16 | knitr::opts_chunk$set(message = FALSE, warning = FALSE) 17 | 18 | library(ggplot2) 19 | theme_set(theme_bw()) 20 | ``` 21 | 22 | Here we'll examine an example application of the widyr package, particularly the `pairwise_cor` and `pairwise_dist` functions. We'll use the data on United Nations General Assembly voting from the `unvotes` package: 23 | 24 | ```{r echo = FALSE} 25 | if (!requireNamespace("unvotes", quietly = TRUE)) { 26 | print("This vignette requires the unvotes package to be installed. Exiting...") 27 | knitr::knit_exit() 28 | } 29 | ``` 30 | 31 | ```{r} 32 | library(dplyr) 33 | library(unvotes) 34 | 35 | un_votes 36 | ``` 37 | 38 | This dataset has one row for each country for each roll call vote. We're interested in finding pairs of countries that tended to vote similarly. 
39 | 40 | ### Pairwise correlations 41 | 42 | Notice that the `vote` column is a factor, with levels (in order) "yes", "abstain", and "no": 43 | 44 | ```{r} 45 | levels(un_votes$vote) 46 | ``` 47 | 48 | We may then be interested in obtaining a measure of country-to-country agreement for each vote, using the `pairwise_cor` function. 49 | 50 | ```{r cors} 51 | library(widyr) 52 | 53 | cors <- un_votes %>% 54 | mutate(vote = as.numeric(vote)) %>% 55 | pairwise_cor(country, rcid, vote, use = "pairwise.complete.obs", sort = TRUE) 56 | 57 | cors 58 | ``` 59 | 60 | We could, for example, find the countries that the US is most and least in agreement with: 61 | 62 | ```{r US_cors} 63 | US_cors <- cors %>% 64 | filter(item1 == "United States") 65 | 66 | # Most in agreement 67 | US_cors 68 | 69 | # Least in agreement 70 | US_cors %>% 71 | arrange(correlation) 72 | ``` 73 | 74 | This can be particularly useful when visualized on a map. 75 | 76 | ```{r US_cors_map, fig.width = 10, fig.height = 6} 77 | if (require("maps", quietly = TRUE) && 78 | require("fuzzyjoin", quietly = TRUE) && 79 | require("countrycode", quietly = TRUE) && 80 | require("ggplot2", quietly = TRUE)) { 81 | world_data <- map_data("world") %>% 82 | regex_full_join(iso3166, by = c("region" = "mapname")) %>% 83 | filter(region != "Antarctica") 84 | 85 | US_cors %>% 86 | mutate(a2 = countrycode(item2, "country.name", "iso2c")) %>% 87 | full_join(world_data, by = "a2") %>% 88 | ggplot(aes(long, lat, group = group, fill = correlation)) + 89 | geom_polygon(color = "gray", size = .1) + 90 | scale_fill_gradient2() + 91 | coord_quickmap() + 92 | theme_void() + 93 | labs(title = "Correlation of each country's UN votes with the United States", 94 | subtitle = "Blue indicates agreement, red indicates disagreement", 95 | fill = "Correlation w/ US") 96 | } 97 | ``` 98 | 99 | ### Visualizing clusters in a network 100 | 101 | Another useful kind of visualization is a network plot, which can be created with Thomas Pedersen's 
[ggraph package](https://github.com/thomasp85/ggraph). We can filter for pairs of countries with correlations above a particular threshold. 102 | 103 | ```{r country_network, fig.width = 10, fig.height = 6} 104 | if (require("ggraph", quietly = TRUE) && 105 | require("igraph", quietly = TRUE) && 106 | require("countrycode", quietly = TRUE)) { 107 | cors_filtered <- cors %>% 108 | filter(correlation > .6) 109 | 110 | continents <- tibble(country = unique(un_votes$country)) %>% 111 | filter(country %in% cors_filtered$item1 | 112 | country %in% cors_filtered$item2) %>% 113 | mutate(continent = countrycode(country, "country.name", "continent")) 114 | 115 | set.seed(2017) 116 | 117 | cors_filtered %>% 118 | graph_from_data_frame(vertices = continents) %>% 119 | ggraph() + 120 | geom_edge_link(aes(edge_alpha = correlation)) + 121 | geom_node_point(aes(color = continent), size = 3) + 122 | geom_node_text(aes(label = name), check_overlap = TRUE, vjust = 1, hjust = 1) + 123 | theme_void() + 124 | labs(title = "Network of countries with correlated United Nations votes") 125 | } 126 | ``` 127 | 128 | Choosing the threshold for filtering correlations (or other measures of similarity) typically requires some trial and error. Setting too high a threshold will make a graph too sparse, while too low a threshold will make a graph too crowded. 
129 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | 8 | ```{r, include = FALSE} 9 | knitr::opts_chunk$set( 10 | collapse = TRUE, 11 | comment = "#>", 12 | fig.path = "man/figures/README-", 13 | out.width = "100%", 14 | message = FALSE 15 | ) 16 | suppressPackageStartupMessages(library(dplyr)) 17 | ``` 18 | 19 | # widyr: Widen, process, and re-tidy a dataset 20 | 21 | **Authors:** [Julia Silge](https://juliasilge.com/), [David Robinson](http://varianceexplained.org/)
22 | **License:** [MIT](https://opensource.org/licenses/MIT) 23 | 24 | 25 | [![R-CMD-check](https://github.com/juliasilge/widyr/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/juliasilge/widyr/actions/workflows/R-CMD-check.yaml) 26 | [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/widyr)](https://cran.r-project.org/package=widyr) 27 | [![Codecov test coverage](https://codecov.io/gh/juliasilge/widyr/branch/main/graph/badge.svg)](https://app.codecov.io/gh/juliasilge/widyr?branch=main) 28 | 29 | 30 | This package wraps the pattern of un-tidying data into a wide matrix, performing some processing, then turning it back into a tidy form. This is useful for several mathematical operations such as co-occurrence counts, correlations, or clustering that are best done on a wide matrix. 31 | 32 | ## Installation 33 | 34 | You can install the released version of widyr from [CRAN](https://CRAN.R-project.org) with: 35 | 36 | ``` r 37 | install.packages("widyr") 38 | ``` 39 | 40 | And the development version from [GitHub](https://github.com/) with: 41 | 42 | ``` r 43 | # install.packages("devtools") 44 | devtools::install_github("juliasilge/widyr") 45 | ``` 46 | 47 | ## Towards a precise definition of "wide" data 48 | 49 | The term "wide data" has gone out of fashion as being "imprecise" [(Wickham 2014)](http://vita.had.co.nz/papers/tidy-data.pdf), but I think with a proper definition the term could be entirely meaningful and useful. 50 | 51 | A **wide** dataset is one or more matrices where: 52 | 53 | * Each row is one **item** 54 | * Each column is one **feature** 55 | * Each value is one **observation** 56 | * Each matrix is one **variable** 57 | 58 | When would you want data to be wide rather than tidy? Notable examples include classification, clustering, correlation, factorization, or other operations that can take advantage of a matrix structure. 
In general, when you want to **compare between pairs of items** rather than compare between variables or between groups of observations, this is a useful structure. 59 | 60 | The widyr package is based on the observation that during a tidy data analysis, you often want data to be wide only *temporarily*, before returning to a tidy structure for visualization and further analysis. widyr makes this easy through a set of `pairwise_` functions. 61 | 62 | ## Example: gapminder 63 | 64 | Consider the gapminder dataset in the [gapminder package](https://CRAN.R-project.org/package=gapminder). 65 | 66 | ```{r} 67 | library(dplyr) 68 | library(gapminder) 69 | 70 | gapminder 71 | ``` 72 | 73 | This tidy format (one-row-per-country-per-year) is very useful for grouping, summarizing, and filtering operations. But if we want to *compare* countries (for example, to find countries that are similar to each other), we would have to reshape this dataset. Note that here, each country is an **item**, while each year is the **feature**. 74 | 75 | #### Pairwise operations 76 | 77 | The widyr package offers `pairwise_` functions that operate on pairs of items within data. An example is `pairwise_dist`: 78 | 79 | ```{r} 80 | library(widyr) 81 | 82 | gapminder %>% 83 | pairwise_dist(country, year, lifeExp) 84 | ``` 85 | 86 | This finds the Euclidean distance between the `lifeExp` value in each pair of countries. It knows which values to compare between countries with `year`, which is the feature column. 87 | 88 | We could find the closest pairs of countries overall with `arrange()`: 89 | 90 | ```{r} 91 | gapminder %>% 92 | pairwise_dist(country, year, lifeExp) %>% 93 | arrange(distance) 94 | ``` 95 | 96 | Notice that this includes duplicates (Germany/Belgium and Belgium/Germany). 
To avoid those (the upper triangle of the distance matrix), use `upper = FALSE`: 97 | 98 | ```{r} 99 | gapminder %>% 100 | pairwise_dist(country, year, lifeExp, upper = FALSE) %>% 101 | arrange(distance) 102 | ``` 103 | 104 | In some analyses, we may be interested in correlation rather than distance of pairs. For this we would use `pairwise_cor`: 105 | 106 | ```{r} 107 | gapminder %>% 108 | pairwise_cor(country, year, lifeExp, upper = FALSE) 109 | ``` 110 | 111 | ### Code of Conduct 112 | 113 | This project is released with a [Contributor Code of Conduct](https://www.contributor-covenant.org/version/2/1/CODE_OF_CONDUCT.html). By contributing to this project, you agree to abide by its terms. 114 | -------------------------------------------------------------------------------- /R/widely.R: -------------------------------------------------------------------------------- 1 | #' Adverb for functions that operate on matrices in "wide" 2 | #' format 3 | #' 4 | #' Modify a function in order to pre-cast the input into a wide 5 | #' matrix format, perform the function, and then 6 | #' re-tidy (e.g. melt) the output into a tidy table. 7 | #' 8 | #' @param .f Function being wrapped 9 | #' @param sort Whether to sort in descending order of `value` 10 | #' @param maximum_size To prevent crashing, a maximum size of a 11 | #' non-sparse matrix to be created. Set to NULL to allow any size 12 | #' matrix. 13 | #' @param sparse Whether to cast to a sparse matrix 14 | #' 15 | #' @return Returns a function that takes at least four arguments: 16 | #' \item{tbl}{A table} 17 | #' \item{row}{Name of column to use as rows in wide matrix} 18 | #' \item{column}{Name of column to use as columns in wide matrix} 19 | #' \item{value}{Name of column to use as values in wide matrix} 20 | #' \item{...}{Arguments passed on to inner function} 21 | #' 22 | #' `widely` creates a function that takes those columns as 23 | #' bare names, `widely_` a function that takes them as strings. 
24 | #' 25 | #' @import dplyr 26 | #' @import Matrix 27 | #' @importFrom broom tidy 28 | #' 29 | #' @examples 30 | #' 31 | #' library(dplyr) 32 | #' library(gapminder) 33 | #' 34 | #' gapminder 35 | #' 36 | #' gapminder %>% 37 | #' widely(dist)(country, year, lifeExp) 38 | #' 39 | #' # can perform within groups 40 | #' closest_continent <- gapminder %>% 41 | #' group_by(continent) %>% 42 | #' widely(dist)(country, year, lifeExp) 43 | #' closest_continent 44 | #' 45 | #' # for example, find the closest pair in each 46 | #' closest_continent %>% 47 | #' top_n(1, -value) 48 | #' 49 | #' @export 50 | widely <- function(.f, 51 | sort = FALSE, 52 | sparse = FALSE, 53 | maximum_size = 1e7) { 54 | force(.f) 55 | force(sort) 56 | force(sparse) 57 | force(maximum_size) 58 | function(tbl, row, column, value, ...) { 59 | inner_func <- widely_(.f, 60 | sort = sort, 61 | sparse = sparse, 62 | maximum_size = maximum_size) 63 | 64 | inner_func(tbl, 65 | col_name(substitute(row)), 66 | col_name(substitute(column)), 67 | col_name(substitute(value)), 68 | ...) 69 | } 70 | } 71 | 72 | 73 | #' @rdname widely 74 | #' @export 75 | widely_ <- function(.f, 76 | sort = FALSE, 77 | sparse = FALSE, 78 | maximum_size = 1e7) { 79 | f <- function(tbl, row, column, value, ...) { 80 | if (inherits(tbl, "grouped_df")) { 81 | # perform within each group 82 | # (group_by_at isn't necessary since 1.0.0, but is in earlier versions) 83 | ret <- tbl %>% 84 | tidyr::nest() %>% 85 | mutate(data = purrr::map(data, f, row, column, value)) %>% 86 | tidyr::unnest(data) %>% 87 | dplyr::group_by_at(dplyr::group_vars(tbl)) 88 | 89 | return(ret) 90 | } 91 | 92 | if (!sparse) { 93 | if (!is.null(maximum_size)) { 94 | matrix_size <- (length(unique(tbl[[row]])) * 95 | length(unique(tbl[[column]]))) 96 | if (matrix_size > maximum_size) { 97 | rlang::abort( 98 | paste0("Size of acast matrix, ", matrix_size, 99 | " will be too large. 
Set maximum_size = NULL to avoid ", 100 | "this error (make sure your memory is sufficient), ", 101 | "or consider using sparse = TRUE.") 102 | ) 103 | } 104 | } 105 | 106 | form <- stats::as.formula(paste(row, column, sep = " ~ ")) 107 | 108 | input <- reshape2::acast(tbl, form, value.var = value, fill = 0) 109 | } else { 110 | input <- tidytext::cast_sparse(tbl, !!row, !!column, !!value) 111 | } 112 | output <- purrr::as_mapper(.f)(input, ...) 113 | 114 | ret <- output %>% 115 | custom_melt() %>% 116 | as_tibble() 117 | 118 | if (sort) { 119 | ret <- arrange(ret, desc(value)) 120 | } 121 | ret 122 | } 123 | 124 | f 125 | } 126 | 127 | 128 | #' Tidy a function output based on some guesses 129 | #' @noRd 130 | custom_melt <- function(m) { 131 | if (inherits(m, "data.frame")) { 132 | rlang::abort("Output is a data frame: don't know how to fix") 133 | } else if (inherits(m, "matrix")) { 134 | ret <- reshape2::melt(m, varnames = c("item1", "item2"), as.is = TRUE) 135 | return(ret) 136 | } else if (inherits(m, "Matrix")) { 137 | ret <- sparse_matrix_to_df(m) 138 | } else { 139 | ret <- tidy(m) 140 | } 141 | 142 | colnames(ret) <- c("item1", "item2", "value") 143 | ret 144 | } 145 | 146 | sparse_matrix_to_df <- function(x) { 147 | s <- Matrix::summary(x) 148 | 149 | row <- s$i 150 | if (!is.null(rownames(x))) { 151 | row <- rownames(x)[row] 152 | } 153 | col <- s$j 154 | if (!is.null(colnames(x))) { 155 | col <- colnames(x)[col] 156 | } 157 | 158 | ret <- data.frame( 159 | row = row, column = col, value = s$x, 160 | stringsAsFactors = FALSE 161 | ) 162 | 163 | ret 164 | } 165 | 166 | -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for 
everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, caste, color, religion, or sexual 10 | identity and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the overall 26 | community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or advances of 31 | any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email address, 35 | without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 
45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at codeofconduct@rstudio.com. 63 | All complaints will be reviewed and investigated promptly and fairly. 64 | 65 | All community leaders are obligated to respect the privacy and security of the 66 | reporter of any incident. 67 | 68 | ## Enforcement Guidelines 69 | 70 | Community leaders will follow these Community Impact Guidelines in determining 71 | the consequences for any action they deem in violation of this Code of Conduct: 72 | 73 | ### 1. Correction 74 | 75 | **Community Impact**: Use of inappropriate language or other behavior deemed 76 | unprofessional or unwelcome in the community. 77 | 78 | **Consequence**: A private, written warning from community leaders, providing 79 | clarity around the nature of the violation and an explanation of why the 80 | behavior was inappropriate. A public apology may be requested. 81 | 82 | ### 2. Warning 83 | 84 | **Community Impact**: A violation through a single incident or series of 85 | actions. 86 | 87 | **Consequence**: A warning with consequences for continued behavior. 
No 88 | interaction with the people involved, including unsolicited interaction with 89 | those enforcing the Code of Conduct, for a specified period of time. This 90 | includes avoiding interactions in community spaces as well as external channels 91 | like social media. Violating these terms may lead to a temporary or permanent 92 | ban. 93 | 94 | ### 3. Temporary Ban 95 | 96 | **Community Impact**: A serious violation of community standards, including 97 | sustained inappropriate behavior. 98 | 99 | **Consequence**: A temporary ban from any sort of interaction or public 100 | communication with the community for a specified period of time. No public or 101 | private interaction with the people involved, including unsolicited interaction 102 | with those enforcing the Code of Conduct, is allowed during this period. 103 | Violating these terms may lead to a permanent ban. 104 | 105 | ### 4. Permanent Ban 106 | 107 | **Community Impact**: Demonstrating a pattern of violation of community 108 | standards, including sustained inappropriate behavior, harassment of an 109 | individual, or aggression toward or disparagement of classes of individuals. 110 | 111 | **Consequence**: A permanent ban from any sort of public interaction within the 112 | community. 113 | 114 | ## Attribution 115 | 116 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 117 | version 2.1, available at 118 | . 119 | 120 | Community Impact Guidelines were inspired by 121 | [Mozilla's code of conduct enforcement ladder][https://github.com/mozilla/inclusion]. 122 | 123 | For answers to common questions about this code of conduct, see the FAQ at 124 | . Translations are available at . 
125 | 126 | [homepage]: https://www.contributor-covenant.org 127 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # widyr: Widen, process, and re-tidy a dataset 5 | 6 | **Authors:** [Julia Silge](https://juliasilge.com/), [David 7 | Robinson](http://varianceexplained.org/)
**License:** 8 | [MIT](https://opensource.org/licenses/MIT) 9 | 10 | 11 | 12 | [![R-CMD-check](https://github.com/juliasilge/widyr/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/juliasilge/widyr/actions/workflows/R-CMD-check.yaml) 13 | [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/widyr)](https://cran.r-project.org/package=widyr) 14 | [![Codecov test 15 | coverage](https://codecov.io/gh/juliasilge/widyr/branch/main/graph/badge.svg)](https://app.codecov.io/gh/juliasilge/widyr?branch=main) 16 | 17 | 18 | This package wraps the pattern of un-tidying data into a wide matrix, 19 | performing some processing, then turning it back into a tidy form. This 20 | is useful for several mathematical operations such as co-occurrence 21 | counts, correlations, or clustering that are best done on a wide matrix. 22 | 23 | ## Installation 24 | 25 | You can install the released version of widyr from 26 | [CRAN](https://CRAN.R-project.org) with: 27 | 28 | ``` r 29 | install.packages("widyr") 30 | ``` 31 | 32 | And the development version from [GitHub](https://github.com/) with: 33 | 34 | ``` r 35 | # install.packages("devtools") 36 | devtools::install_github("juliasilge/widyr") 37 | ``` 38 | 39 | ## Towards a precise definition of “wide” data 40 | 41 | The term “wide data” has gone out of fashion as being “imprecise” 42 | [(Wickham 2014)](http://vita.had.co.nz/papers/tidy-data.pdf), but I 43 | think with a proper definition the term could be entirely meaningful and 44 | useful. 45 | 46 | A **wide** dataset is one or more matrices where: 47 | 48 | - Each row is one **item** 49 | - Each column is one **feature** 50 | - Each value is one **observation** 51 | - Each matrix is one **variable** 52 | 53 | When would you want data to be wide rather than tidy? Notable examples 54 | include classification, clustering, correlation, factorization, or other 55 | operations that can take advantage of a matrix structure. 
In general, 56 | when you want to **compare between pairs of items** rather than compare 57 | between variables or between groups of observations, this is a useful 58 | structure. 59 | 60 | The widyr package is based on the observation that during a tidy data 61 | analysis, you often want data to be wide only *temporarily*, before 62 | returning to a tidy structure for visualization and further analysis. 63 | widyr makes this easy through a set of `pairwise_` functions. 64 | 65 | ## Example: gapminder 66 | 67 | Consider the gapminder dataset in the [gapminder 68 | package](https://CRAN.R-project.org/package=gapminder). 69 | 70 | ``` r 71 | library(dplyr) 72 | library(gapminder) 73 | 74 | gapminder 75 | #> # A tibble: 1,704 × 6 76 | #> country continent year lifeExp pop gdpPercap 77 | #> <fct> <fct> <int> <dbl> <int> <dbl> 78 | #> 1 Afghanistan Asia 1952 28.8 8425333 779. 79 | #> 2 Afghanistan Asia 1957 30.3 9240934 821. 80 | #> 3 Afghanistan Asia 1962 32.0 10267083 853. 81 | #> 4 Afghanistan Asia 1967 34.0 11537966 836. 82 | #> 5 Afghanistan Asia 1972 36.1 13079460 740. 83 | #> 6 Afghanistan Asia 1977 38.4 14880372 786. 84 | #> 7 Afghanistan Asia 1982 39.9 12881816 978. 85 | #> 8 Afghanistan Asia 1987 40.8 13867957 852. 86 | #> 9 Afghanistan Asia 1992 41.7 16317921 649. 87 | #> 10 Afghanistan Asia 1997 41.8 22227415 635. 88 | #> # … with 1,694 more rows 89 | #> # ℹ Use `print(n = ...)` to see more rows 90 | ``` 91 | 92 | This tidy format (one-row-per-country-per-year) is very useful for 93 | grouping, summarizing, and filtering operations. But if we want to 94 | *compare* countries (for example, to find countries that are similar to 95 | each other), we would have to reshape this dataset. Note that here, each 96 | country is an **item**, while each year is the **feature**. 97 | 98 | #### Pairwise operations 99 | 100 | The widyr package offers `pairwise_` functions that operate on pairs of 101 | items within data.
An example is `pairwise_dist`: 102 | 103 | ``` r 104 | library(widyr) 105 | 106 | gapminder %>% 107 | pairwise_dist(country, year, lifeExp) 108 | #> # A tibble: 20,022 × 3 109 | #> item1 item2 distance 110 | #> <chr> <chr> <dbl> 111 | #> 1 Albania Afghanistan 107. 112 | #> 2 Algeria Afghanistan 76.8 113 | #> 3 Angola Afghanistan 4.65 114 | #> 4 Argentina Afghanistan 110. 115 | #> 5 Australia Afghanistan 129. 116 | #> 6 Austria Afghanistan 124. 117 | #> 7 Bahrain Afghanistan 98.1 118 | #> 8 Bangladesh Afghanistan 45.3 119 | #> 9 Belgium Afghanistan 125. 120 | #> 10 Benin Afghanistan 39.3 121 | #> # … with 20,012 more rows 122 | #> # ℹ Use `print(n = ...)` to see more rows 123 | ``` 124 | 125 | This finds the Euclidean distance between the `lifeExp` value in each 126 | pair of countries. It knows which values to compare between countries 127 | with `year`, which is the feature column. 128 | 129 | We could find the closest pairs of countries overall with `arrange()`: 130 | 131 | ``` r 132 | gapminder %>% 133 | pairwise_dist(country, year, lifeExp) %>% 134 | arrange(distance) 135 | #> # A tibble: 20,022 × 3 136 | #> item1 item2 distance 137 | #> <chr> <chr> <dbl> 138 | #> 1 Germany Belgium 1.08 139 | #> 2 Belgium Germany 1.08 140 | #> 3 United Kingdom New Zealand 1.51 141 | #> 4 New Zealand United Kingdom 1.51 142 | #> 5 Norway Netherlands 1.56 143 | #> 6 Netherlands Norway 1.56 144 | #> 7 Italy Israel 1.66 145 | #> 8 Israel Italy 1.66 146 | #> 9 Finland Austria 1.94 147 | #> 10 Austria Finland 1.94 148 | #> # … with 20,012 more rows 149 | #> # ℹ Use `print(n = ...)` to see more rows 150 | ``` 151 | 152 | Notice that this includes duplicates (Germany/Belgium and 153 | Belgium/Germany).
To avoid those (the upper triangle of the distance 154 | matrix), use `upper = FALSE`: 155 | 156 | ``` r 157 | gapminder %>% 158 | pairwise_dist(country, year, lifeExp, upper = FALSE) %>% 159 | arrange(distance) 160 | #> # A tibble: 10,011 × 3 161 | #> item1 item2 distance 162 | #> <chr> <chr> <dbl> 163 | #> 1 Belgium Germany 1.08 164 | #> 2 New Zealand United Kingdom 1.51 165 | #> 3 Netherlands Norway 1.56 166 | #> 4 Israel Italy 1.66 167 | #> 5 Austria Finland 1.94 168 | #> 6 Belgium United Kingdom 1.95 169 | #> 7 Iceland Sweden 2.01 170 | #> 8 Comoros Mauritania 2.01 171 | #> 9 Belgium United States 2.09 172 | #> 10 Germany Ireland 2.10 173 | #> # … with 10,001 more rows 174 | #> # ℹ Use `print(n = ...)` to see more rows 175 | ``` 176 | 177 | In some analyses, we may be interested in correlation rather than 178 | distance of pairs. For this we would use `pairwise_cor`: 179 | 180 | ``` r 181 | gapminder %>% 182 | pairwise_cor(country, year, lifeExp, upper = FALSE) 183 | #> # A tibble: 10,011 × 3 184 | #> item1 item2 correlation 185 | #> <chr> <chr> <dbl> 186 | #> 1 Afghanistan Albania 0.966 187 | #> 2 Afghanistan Algeria 0.987 188 | #> 3 Albania Algeria 0.953 189 | #> 4 Afghanistan Angola 0.986 190 | #> 5 Albania Angola 0.976 191 | #> 6 Algeria Angola 0.952 192 | #> 7 Afghanistan Argentina 0.971 193 | #> 8 Albania Argentina 0.949 194 | #> 9 Algeria Argentina 0.991 195 | #> 10 Angola Argentina 0.936 196 | #> # … with 10,001 more rows 197 | #> # ℹ Use `print(n = ...)` to see more rows 198 | ``` 199 | 200 | ### Code of Conduct 201 | 202 | This project is released with a [Contributor Code of 203 | Conduct](https://www.contributor-covenant.org/version/2/1/CODE_OF_CONDUCT.html). 204 | By contributing to this project, you agree to abide by its terms. 205 | --------------------------------------------------------------------------------