├── src ├── .gitignore ├── Makevars ├── Summary2d.cpp ├── BinnedVector.cpp ├── lowerBound.cpp ├── stats.h ├── double-diff-sum.cpp ├── summary.cpp ├── BigVis.cpp ├── condense-gen.r ├── Summary2d.h ├── BinnedVectors.cpp ├── group-hex.h ├── frange.cpp ├── group.cpp ├── summary.h ├── stats.cpp ├── group.h ├── smooth-nd.cpp ├── condense.cpp └── RcppExports.cpp ├── .gitignore ├── data └── movies.rdata ├── .Rbuildignore ├── .travis.yml ├── inst ├── tests │ ├── test-ranged.r │ ├── test-group-2d.r │ ├── test-frange.r │ ├── test-breaks.r │ ├── test-group-1d.r │ ├── test-origin.r │ ├── test-summary-moments.r │ ├── test-weighted-stats.r │ ├── test-stat.r │ ├── test-smooth.r │ ├── test-binned-vectors.r │ └── test-condense.r └── include │ └── bigvis.h ├── man ├── is.ranged.Rd ├── bigvis.Rd ├── dgrid.Rd ├── round_any.condensed.Rd ├── find_width.Rd ├── weighted.median.Rd ├── breaks.Rd ├── weighted.IQR.Rd ├── find_origin.Rd ├── h_grid.Rd ├── frange.Rd ├── weighted.quantile.Rd ├── dchallenge.Rd ├── condense.Rd ├── bin.Rd ├── mt.Rd ├── weighted.ecdf.Rd ├── autoplot.condensed.Rd ├── transform.condensed.Rd ├── ranged.Rd ├── weighted.var.Rd ├── condensed.Rd ├── movies.Rd ├── standardise.Rd ├── peel.Rd ├── rmse_cvs.Rd ├── smooth.Rd └── best_h.Rd ├── bigvis.Rproj ├── R ├── adjust.r ├── utils.r ├── bigvis.r ├── breaks.r ├── width.r ├── origin.r ├── movies.r ├── dgrid.r ├── standardise.r ├── mt.r ├── id.r ├── bin.r ├── condense.r ├── rmse.r ├── challenge.r ├── rebin.r ├── ranged.r ├── peel.r ├── smooth.r ├── condensed.r ├── RcppExports.R ├── h.r ├── autoplot.r └── weighted-stats.r ├── DESCRIPTION ├── bench ├── kernel.cpp ├── group-tempvar.cpp ├── count.cpp ├── median.cpp ├── mean.cpp ├── bin-structure.cpp ├── bin.cpp └── smooth-1d.cpp ├── NAMESPACE ├── notes.md └── README.md /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKG_CPPFLAGS=-I../inst/include 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | src/*.o 5 | src/*.so 6 | src/*.dll 7 | -------------------------------------------------------------------------------- /data/movies.rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataXujing/bigvis/master/data/movies.rdata -------------------------------------------------------------------------------- /src/Summary2d.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "Summary2d.h" 3 | using namespace Rcpp; 4 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | bench 2 | notes.md 3 | ^.*\.Rproj$ 4 | ^\.Rproj\.user$ 5 | ^\.travis\.yml$ 6 | ^src/condense-gen\.r$ 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Sample .travis.yml for R projects 2 | 3 | language: r 4 | warnings_are_errors: true 5 | sudo: required 6 | 7 | r_github_packages: 8 | - jimhester/covr 9 | after_success: 10 | - Rscript -e 'covr::codecov()' 11 | -------------------------------------------------------------------------------- /inst/tests/test-ranged.r: -------------------------------------------------------------------------------- 1 | context("Ranged") 2 | 3 | test_that("range attribute lost when modified", { 4 | x <- ranged(10:1) 5 | expect_equal(max(x), 10) 6 | 7 | 8 | x[1] <- 1 9 | expect_equal(max(x), 9) 10 | expect_equal(attr(x, "range"), NULL) 11 | }) 12 | -------------------------------------------------------------------------------- /src/BinnedVector.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | NumericVector frange(const NumericVector& x, const bool finite = true); 5 | 6 | int BinnedVector::nbins() const { 7 | double max = frange(x_)[1]; 8 | return bin(max) + 1; 9 | // +1 bin for missing values 10 | } 11 | -------------------------------------------------------------------------------- /inst/tests/test-group-2d.r: -------------------------------------------------------------------------------- 1 | context("Grouping: 2d") 2 | 3 | test_that("Two NAs gets bin 0", { 4 | expect_equal(group_rect(NA, NA, 1, 1, 0, 0), 0) 5 | }) 6 | 7 | test_that("Sequential locations get sequential groups", { 8 | grid <- expand.grid(x = c(NA, 1:2), y = c(NA, 1:2)) 9 | expect_equal(group_rect(grid$x, grid$y, 1, 1, 0.5, 0.5), 0:8) 10 | }) 11 | -------------------------------------------------------------------------------- /man/is.ranged.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/ranged.r 3 | \name{is.ranged} 4 | \alias{is.ranged} 5 | \title{Test if an object is of class ranged.} 6 | \usage{ 7 | is.ranged(x) 8 | } 9 | \arguments{ 10 | \item{x}{object to test} 11 | } 12 | \description{ 13 | Test if an object is of class ranged. 14 | } 15 | \keyword{internal} 16 | 17 | -------------------------------------------------------------------------------- /bigvis.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | 17 | BuildType: Package 18 | PackageUseDevtools: Yes 19 | PackageInstallArgs: --no-multiarch --with-keep.source 20 | PackageRoxygenize: rd,collate,namespace 21 | -------------------------------------------------------------------------------- /inst/tests/test-frange.r: -------------------------------------------------------------------------------- 1 | context("frange") 2 | 3 | test_that("frange agrees with range", { 4 | x <- rnorm(1e4) 5 | expect_equal(frange(x), range(x)) 6 | }) 7 | 8 | test_that("frange uses cache if present", { 9 | x <- rnorm(1e4) 10 | attr(x, "range") <- c(0, 10) 11 | expect_equal(frange(x), c(0, 10)) 12 | }) 13 | 14 | test_that("frange ignores NA and infinities by default", { 15 | x <- c(1, NA, Inf, -Inf) 16 | expect_equal(frange(x), c(1, 1)) 17 | }) 18 | -------------------------------------------------------------------------------- /R/adjust.r: -------------------------------------------------------------------------------- 1 | # Protect against floating point areas by slightly adjusting breaks. 2 | # Adapted from graphics::hist.default. 3 | adjust_breaks <- function(breaks, open = "right") { 4 | open <- match.arg(open, c("left", "right")) 5 | 6 | breaks <- sort(breaks) 7 | diddle <- 1e-07 * median(diff(breaks)) 8 | if (open == "left") { 9 | fuzz <- c(-diddle, rep.int(diddle, length(breaks) - 1)) 10 | } else { 11 | fuzz <- c(rep.int(-diddle, length(breaks) - 1), diddle) 12 | } 13 | breaks + fuzz 14 | } 15 | -------------------------------------------------------------------------------- /inst/tests/test-breaks.r: -------------------------------------------------------------------------------- 1 | context("Breaks") 2 | 3 | last <- function(x) x[length(x)] 4 | 5 | test_that("breaks includes max value, only if on border", { 6 | expect_equal(last(breaks(10, origin = 0, binwidth = 1)), 10) 7 | expect_equal(last(breaks(9.99, origin = 0, binwidth = 1)), 9) 8 | }) 9 | 10 | test_that("breaks includes max value even when origin != 0", { 11 | expect_equal(last(breaks(10.5, origin = 0.5, binwidth = 1)), 10.5) 12 | expect_equal(last(breaks(10.49, origin = 0.5, binwidth = 1)), 9.5) 13 | }) 14 | -------------------------------------------------------------------------------- /man/bigvis.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/bigvis.r 3 | \docType{package} 4 | \name{bigvis} 5 | \alias{as.integer,Rcpp_BinnedVector-method} 6 | \alias{bigvis} 7 | \alias{bigvis-package} 8 | \alias{show,Rcpp_BinnedVector-method} 9 | \title{The big vis package.} 10 | \usage{ 11 | \S4method{show}{Rcpp_BinnedVector}(object) 12 | 13 | \S4method{as.integer}{Rcpp_BinnedVector}(x, ...) 14 | } 15 | \arguments{ 16 | \item{x,object,...}{Generic args} 17 | } 18 | \description{ 19 | The big vis package. 20 | } 21 | 22 | -------------------------------------------------------------------------------- /R/utils.r: -------------------------------------------------------------------------------- 1 | "%||%" <- function(x, y) if (is.null(x)) y else x 2 | 3 | last <- function(x) x[length(x)] 4 | 5 | "%contains%" <- function(df, var) { 6 | var %in% names(df) 7 | } 8 | 9 | find_fun <- function(name, env = globalenv()) { 10 | if (is.function(name)) return(name) 11 | 12 | ns_env <- asNamespace("bigvis") 13 | if (exists(name, ns_env, mode = "function")) { 14 | return(get(name, ns_env)) 15 | } 16 | 17 | if (exists(name, env, mode = "function")) { 18 | return(get(name, env)) 19 | } 20 | 21 | stop("Could not find function ", name, call. = FALSE) 22 | } 23 | -------------------------------------------------------------------------------- /src/lowerBound.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | using namespace Rcpp; 4 | 5 | // Quick and dirty implementation of lowerBound, the complement to R's 6 | // findInterval 7 | // [[Rcpp::export]] 8 | IntegerVector lowerBound(const NumericVector& x, const NumericVector& breaks) { 9 | int n = x.size(); 10 | IntegerVector out(n); 11 | 12 | for (int i = 0; i < n; i++) { 13 | NumericVector::const_iterator it = 14 | std::lower_bound(breaks.begin(), breaks.end(), x[i]); 15 | if (it == breaks.end()) --it; 16 | out[i] = it - breaks.begin() + 1; 17 | } 18 | return out; 19 | } 20 | -------------------------------------------------------------------------------- /man/dgrid.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/dgrid.r 3 | \name{dgrid} 4 | \alias{dgrid} 5 | \alias{is.dgrid} 6 | \title{dgrid: an S3 class for data grids} 7 | \usage{ 8 | dgrid(x, width, origin = 0, nbins = NULL) 9 | 10 | is.dgrid(x) 11 | } 12 | \arguments{ 13 | \item{x}{a numeric vector to test or coerce.} 14 | 15 | \item{width}{bin width} 16 | 17 | \item{origin}{bin origins} 18 | 19 | \item{nbins}{number of bins} 20 | } 21 | \description{ 22 | dgrid: an S3 class for data grids 23 | } 24 | \examples{ 25 | g <- dgrid(0:10 + 0.5, width = 1) 26 | range(g) 27 | as.integer(g) 28 | } 29 | 30 | -------------------------------------------------------------------------------- /src/stats.h: -------------------------------------------------------------------------------- 1 | struct Regression { 2 | double alpha, beta; 3 | }; 4 | 5 | double bisquare(double u, double b); 6 | 7 | Regression simpleLinearRegression(const std::vector& x, 8 | const std::vector& y, 9 | const std::vector& w); 10 | 11 | Regression simpleRobustRegression(const std::vector& x, 12 | const std::vector& y, 13 | const std::vector& w, 14 | int iterations = 3); 15 | 16 | double median(const std::vector& x); 17 | double median(std::vector* x); 18 | -------------------------------------------------------------------------------- /inst/tests/test-group-1d.r: -------------------------------------------------------------------------------- 1 | context("Grouping: 1d") 2 | 3 | group <- function(x, width, origin = NULL) { 4 | g <- bin(x, width, origin) 5 | vapply(seq_along(x) - 1, g$bin_i, integer(1)) 6 | } 7 | 8 | test_that("NAs belong to group 0", { 9 | x <- NA_real_ 10 | expect_equal(group(x, 1, 0), 0L) 11 | }) 12 | 13 | test_that("Inf and -Inf belong to group 0", { 14 | x <- c(-Inf, Inf) 15 | expect_equal(group(x, 1, 0), c(0, 0)) 16 | }) 17 | 18 | test_that("Out of range values belong to group 0", { 19 | expect_equal(group(-10, 1, 0), 0) 20 | }) 21 | 22 | test_that("Positive integers unchanged if origin is 1", { 23 | expect_equal(group(1:10, 1, 1), 1:10) 24 | }) 25 | -------------------------------------------------------------------------------- /man/round_any.condensed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/condensed.r 3 | \name{round_any.condensed} 4 | \alias{round_any.condensed} 5 | \title{Round any method for condensed objects} 6 | \usage{ 7 | round_any.condensed(x, accuracy, f = round) 8 | } 9 | \arguments{ 10 | \item{x}{numeric or date-time (POSIXct) vector to round} 11 | 12 | \item{accuracy}{number to round to; for POSIXct objects, a number of seconds} 13 | 14 | \item{f}{rounding function: \code{\link{floor}}, \code{\link{ceiling}} or 15 | \code{\link{round}}} 16 | } 17 | \description{ 18 | Round any method for condensed objects 19 | } 20 | \keyword{internal} 21 | 22 | -------------------------------------------------------------------------------- /inst/tests/test-origin.r: -------------------------------------------------------------------------------- 1 | context("Origin") 2 | 3 | test_that("origins close to zero rounded to zero" ,{ 4 | expect_equal(find_origin(c(0.01, 1000)), 0) 5 | expect_equal(find_origin(c(10, 1e6)), 0) 6 | }) 7 | 8 | test_that("origins rounded down by binwidth", { 9 | expect_equal(find_origin(c(1, 10), 1), 1) 10 | expect_equal(find_origin(c(1, 10), 2), 0) 11 | 12 | expect_equal(find_origin(c(5, 10), 2), 4) 13 | expect_equal(find_origin(c(5, 10), 5), 5) 14 | }) 15 | 16 | test_that("integers have origin offset by 0.5", { 17 | expect_equal(find_origin(c(1L, 10L), 1), 0.5) 18 | 19 | expect_equal(find_origin(c(5L, 10L), 2), 3.5) 20 | expect_equal(find_origin(c(5L, 10L), 5), 4.5) 21 | }) 22 | -------------------------------------------------------------------------------- /man/find_width.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/width.r 3 | \name{find_width} 4 | \alias{find_width} 5 | \title{Compute a reasonable default binwidth.} 6 | \usage{ 7 | find_width(x, nbins = 10000) 8 | } 9 | \arguments{ 10 | \item{x}{a numeric vector. If a numeric vector of length one is supplied, 11 | it's assumed that} 12 | 13 | \item{nbins}{desired number of bins (approximate)} 14 | } 15 | \description{ 16 | Compute a reasonable default binwidth. 17 | } 18 | \examples{ 19 | find_width(c(0, 5)) 20 | find_width(c(0, 5.023432)) 21 | find_width(c(0, 5.9)) 22 | } 23 | \seealso{ 24 | Other reasonable defaults: \code{\link{find_origin}} 25 | } 26 | \keyword{internal} 27 | 28 | -------------------------------------------------------------------------------- /R/bigvis.r: -------------------------------------------------------------------------------- 1 | #' The big vis package. 2 | #' 3 | #' @useDynLib bigvis 4 | #' @docType package 5 | #' @name bigvis 6 | NULL 7 | 8 | if (!exists("BigVis")) { 9 | BigVis <- Rcpp::Module("BigVis") 10 | } 11 | 12 | 13 | #' @param x,object,... Generic args 14 | #' @rdname bigvis 15 | #' @export 16 | setMethod("show", "Rcpp_BinnedVector", function(object) { 17 | cat("Binned [", object$size(), "]. ", 18 | "Width: ", object$width(), " Origin: ", object$origin(), "\n", sep = "") 19 | }) 20 | 21 | #' @rdname bigvis 22 | #' @export 23 | setMethod("as.integer", "Rcpp_BinnedVector", function(x, ...) { 24 | vapply(seq_len(x$size()), x$bin_i, integer(1)) 25 | }) 26 | 27 | 28 | 29 | # Silence R CMD check note 30 | #' @importFrom methods new 31 | #' @importFrom Rcpp compileAttributes cpp_object_initializer 32 | NULL 33 | 34 | -------------------------------------------------------------------------------- /src/double-diff-sum.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | using namespace Rcpp; 5 | 6 | // Efficiently compute \sum \sum abs(x_i - x_j) for binned data 7 | // 8 | // It's effectively equivalent to this R code on the ungrouped observations 9 | // bin <- trunc(x / bw) 10 | // diffs <- abs(outer(bin, bin, "-")) 11 | // tabulate(diffs + 1) 12 | // 13 | // [[Rcpp::export]] 14 | std::vector double_diff_sum(IntegerVector bin, IntegerVector count) { 15 | int n = bin.size(); 16 | std::vector out; 17 | 18 | for (int i = 0; i < n; i++) { 19 | for (int j = 0; j < n; j++) { 20 | int pos = abs(bin[i] - bin[j]); 21 | 22 | if (pos + 1 > out.size()) { 23 | out.resize(pos + 1); 24 | } 25 | out[pos] += count[i] * count[j]; 26 | } 27 | } 28 | 29 | return out; 30 | } 31 | -------------------------------------------------------------------------------- /man/weighted.median.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/weighted-stats.r 3 | \name{weighted.median} 4 | \alias{weighted.median} 5 | \title{Compute the median of weighted data.} 6 | \usage{ 7 | weighted.median(x, w, na.rm = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{numeric vector of observations} 11 | 12 | \item{w}{integer vector of weights, representing the number of 13 | time each \code{x} was observed} 14 | 15 | \item{na.rm}{If \code{TRUE} will automatically remove missing values 16 | in \code{x} or \code{w}.} 17 | } 18 | \description{ 19 | Compute the median of weighted data. 20 | } 21 | \details{ 22 | This is a simple wrapper around \code{\link{weighted.quantile}} 23 | } 24 | \examples{ 25 | x <- runif(200) 26 | w <- rpois(200, 5) + 1 27 | 28 | median(x) 29 | weighted.median(x, w) 30 | } 31 | 32 | -------------------------------------------------------------------------------- /man/breaks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/breaks.r 3 | \name{breaks} 4 | \alias{breaks} 5 | \title{Compute breaks given origin and width.} 6 | \usage{ 7 | breaks(x, binwidth, origin = min(x)) 8 | } 9 | \arguments{ 10 | \item{x}{numeric vector} 11 | 12 | \item{binwidth}{bin width} 13 | 14 | \item{origin}{bin origin} 15 | } 16 | \description{ 17 | Breaks are right-open, left-closed [x, y), so if \code{max(x)} is an integer 18 | multiple of binwidth, then we need one more break. This function only returns 19 | the left-side of the breaks. 20 | } 21 | \details{ 22 | The first break is special, because it always contains missing values. 23 | } 24 | \examples{ 25 | breaks(10, origin = 0, binwidth = 1) 26 | breaks(9.9, origin = 0, binwidth = 1) 27 | 28 | breaks(1:10, origin = 0, binwidth = 2) 29 | } 30 | \keyword{internal} 31 | 32 | -------------------------------------------------------------------------------- /man/weighted.IQR.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/weighted-stats.r 3 | \name{weighted.IQR} 4 | \alias{weighted.IQR} 5 | \title{Compute the interquartile range of weighted data.} 6 | \usage{ 7 | weighted.IQR(x, w, na.rm = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{numeric vector of observations} 11 | 12 | \item{w}{integer vector of weights, representing the number of 13 | time each \code{x} was observed} 14 | 15 | \item{na.rm}{If \code{TRUE} will automatically remove missing values 16 | in \code{x} or \code{w}.} 17 | } 18 | \description{ 19 | Compute the interquartile range of weighted data. 20 | } 21 | \details{ 22 | This is a simple wrapper around \code{\link{weighted.quantile}} 23 | } 24 | \examples{ 25 | x <- sort(runif(200)) 26 | w <- rpois(200, seq(1, 10, length = 200)) + 1 27 | 28 | IQR(x) 29 | weighted.IQR(x, w) 30 | } 31 | 32 | -------------------------------------------------------------------------------- /src/summary.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "summary.h" 3 | using namespace Rcpp; 4 | 5 | template 6 | NumericVector summary_compute(const NumericVector& x, Summary summary) { 7 | int n = x.size(); 8 | for(int i = 0; i < n; ++i) { 9 | summary.push(x[i], 1); 10 | } 11 | 12 | int m = summary.size(); 13 | NumericVector out(m); 14 | for(int i = 0; i < m; ++i) { 15 | out[i] = summary.compute(i); 16 | } 17 | 18 | return out; 19 | } 20 | 21 | // [[Rcpp::export]] 22 | NumericVector compute_moments(const NumericVector& x) { 23 | return summary_compute(x, SummaryMoments(2)); 24 | } 25 | 26 | // [[Rcpp::export]] 27 | NumericVector compute_sum(const NumericVector& x) { 28 | return summary_compute(x, SummarySum(1)); 29 | } 30 | 31 | // [[Rcpp::export]] 32 | NumericVector compute_median(const NumericVector& x) { 33 | return summary_compute(x, SummaryMedian()); 34 | } 35 | -------------------------------------------------------------------------------- /R/breaks.r: -------------------------------------------------------------------------------- 1 | #' Compute breaks given origin and width. 2 | #' 3 | #' Breaks are right-open, left-closed [x, y), so if \code{max(x)} is an integer 4 | #' multiple of binwidth, then we need one more break. This function only returns 5 | #' the left-side of the breaks. 6 | #' 7 | #' The first break is special, because it always contains missing values. 8 | #' 9 | #' @param x numeric vector 10 | #' @param origin bin origin 11 | #' @param binwidth bin width 12 | #' @export 13 | #' @keywords internal 14 | #' @examples 15 | #' breaks(10, origin = 0, binwidth = 1) 16 | #' breaks(9.9, origin = 0, binwidth = 1) 17 | #' 18 | #' breaks(1:10, origin = 0, binwidth = 2) 19 | breaks <- function(x, binwidth, origin = min(x)) { 20 | if (!is.binned(x)) { 21 | x <- bin(x, binwidth, origin) 22 | } 23 | 24 | # -1 for NA bin, -1 since R is 1 indexed 25 | nbins <- x$nbins() - 2 26 | c(NA, x$origin() + seq.int(1, nbins) * x$width()) 27 | } 28 | -------------------------------------------------------------------------------- /man/find_origin.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/origin.r 3 | \name{find_origin} 4 | \alias{find_origin} 5 | \title{Find the origin.} 6 | \usage{ 7 | find_origin(x, binwidth) 8 | } 9 | \arguments{ 10 | \item{x}{numeric or integer vector} 11 | 12 | \item{binwidth}{binwidth} 13 | } 14 | \description{ 15 | Find the origin. 16 | } 17 | \details{ 18 | This algorithm implements simple heuristics for determining the origin of 19 | a histogram when only the binwidth is specified. It: 20 | 21 | \itemize{ 22 | \item rounds to zero, if relatively close 23 | \item subtracts 0.5 offset, if an x is integer 24 | \item ensures the origin is a multiple of the binwidth 25 | } 26 | } 27 | \examples{ 28 | find_origin(1:10, 1) 29 | find_origin(1:10, 2) 30 | find_origin(c(1, 1e6), 1) 31 | } 32 | \seealso{ 33 | Other reasonable defaults: \code{\link{find_width}} 34 | } 35 | \keyword{internal} 36 | 37 | -------------------------------------------------------------------------------- /man/h_grid.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/h.r 3 | \name{h_grid} 4 | \alias{h_grid} 5 | \title{Generate grid of plausible bandwidths for condensed summary.} 6 | \usage{ 7 | h_grid(x, n = 50, max = 20) 8 | } 9 | \arguments{ 10 | \item{x}{a condensed summary} 11 | 12 | \item{n}{number of bandwidths to generate (in each dimension)} 13 | 14 | \item{max}{maximum bandwidth to generate, as multiple of binwidth.} 15 | } 16 | \description{ 17 | By default, the bandwidths start at the bin width, and then continue 18 | up 50 (\code{n}) steps until 20 (\code{max}) times the bin width. 19 | } 20 | \examples{ 21 | x <- rchallenge(1e4) 22 | xsum <- condense(bin(x, 1 / 10)) 23 | h_grid(xsum) 24 | 25 | y <- runif(1e4) 26 | xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100)) 27 | h_grid(xysum, n = 10) 28 | } 29 | \seealso{ 30 | Other bandwidth estimation functions: \code{\link{best_h}}; 31 | \code{\link{rmse_cv}}, \code{\link{rmse_cvs}} 32 | } 33 | 34 | -------------------------------------------------------------------------------- /inst/tests/test-summary-moments.r: -------------------------------------------------------------------------------- 1 | context("Summary: moments") 2 | 3 | count2 <- function(x) compute_moments(x)[1] 4 | mean2 <- function(x) compute_moments(x)[2] 5 | sd2 <- function(x) compute_moments(x)[3] 6 | 7 | test_that("count agrees with length", { 8 | expect_equal(count2(1:10), 10) 9 | expect_equal(count2(5), 1) 10 | expect_equal(count2(numeric()), 0) 11 | }) 12 | 13 | test_that("mean agree with base::mean", { 14 | expect_equal(mean2(1:10), mean(1:10)) 15 | 16 | x <- runif(1e6) 17 | expect_equal(mean2(x), mean(x)) 18 | }) 19 | 20 | test_that("missing values are ignored", { 21 | x <- c(NA, 5, 5) 22 | expect_equal(count2(x), 2) 23 | expect_equal(mean2(x), 5) 24 | }) 25 | 26 | test_that("standard deviation agrees with sd", { 27 | expect_equal(sd2(1:10), sd(1:10)) 28 | 29 | x <- runif(1e6) 30 | expect_equal(sd2(x), sd(x)) 31 | }) 32 | 33 | test_that("summary statistics of zero length input are NaN", { 34 | expect_equal(compute_moments(numeric()), c(0, NaN, NaN)) 35 | }) 36 | -------------------------------------------------------------------------------- /R/width.r: -------------------------------------------------------------------------------- 1 | #' Compute a reasonable default binwidth. 2 | #' 3 | #' @param x a numeric vector. If a numeric vector of length one is supplied, 4 | #' it's assumed that 5 | #' @param nbins desired number of bins (approximate) 6 | #' @export 7 | #' @keywords internal 8 | #' @family reasonable defaults 9 | #' @examples 10 | #' find_width(c(0, 5)) 11 | #' find_width(c(0, 5.023432)) 12 | #' find_width(c(0, 5.9)) 13 | find_width <- function(x, nbins = 1e4) { 14 | stopifnot(is.numeric(x)) 15 | stopifnot(is.numeric(nbins), length(nbins) == 1, nbins > 0) 16 | 17 | x <- diff(frange(x)) 18 | size <- x / nbins 19 | 20 | # divide into order of magnitude and multiplier 21 | om <- 10 ^ ceiling(log10(size)) 22 | mult <- size / om 23 | 24 | # ensure number per unit is multiple of 1, 2, 3, 4, or 5 25 | per_unit <- 1 / mult 26 | rounders <- c(1, 2, 3, 4, 5) 27 | poss <- round(per_unit / rounders) * rounders 28 | poss <- poss[poss != 0] 29 | width <- om / poss[which.min(abs(poss - per_unit))] 30 | 31 | structure(width, n = ceiling(x / width), per_unit = 1 / width) 32 | } 33 | -------------------------------------------------------------------------------- /inst/tests/test-weighted-stats.r: -------------------------------------------------------------------------------- 1 | context("Weighted statistics") 2 | 3 | test_that("weighted.var agrees with var when weights = 1", { 4 | samples <- replicate(20, runif(100), simplify = FALSE) 5 | 6 | var <- sapply(samples, var) 7 | wvar <- sapply(samples, weighted.var, w = rep(1, 100)) 8 | 9 | expect_equal(wvar, var) 10 | }) 11 | 12 | test_that("weighted.var agrees with var on repeated vector", { 13 | samples <- replicate(20, runif(100), simplify = FALSE) 14 | w <- rep(1:2, 50) 15 | samples_ex <- lapply(samples, rep, times = w) 16 | 17 | var <- sapply(samples_ex, var) 18 | wvar <- sapply(samples, weighted.var, w = w) 19 | 20 | expect_equal(wvar, var) 21 | }) 22 | 23 | test_that("weighed.quantile agrees with quantile on repeated vector", { 24 | samples <- replicate(20, runif(100), simplify = FALSE) 25 | w <- rep(1:2, 50) 26 | samples_ex <- lapply(samples, rep, times = w) 27 | 28 | quant <- sapply(samples_ex, quantile, probs = 0.325, names = FALSE) 29 | wquant <- sapply(samples, weighted.quantile, w = w, probs = 0.325) 30 | 31 | expect_equal(quant, wquant) 32 | }) 33 | -------------------------------------------------------------------------------- /src/BigVis.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | RCPP_MODULE(BigVis) { 5 | class_("BinnedVector") 6 | .constructor() 7 | .const_method("bin_i", &BinnedVectorReference::bin_i) 8 | .const_method("bin", &BinnedVectorReference::bin) 9 | .const_method("unbin", &BinnedVectorReference::unbin) 10 | .const_method("nbins", &BinnedVectorReference::nbins) 11 | .const_method("size", &BinnedVectorReference::size) 12 | .const_method("origin", &BinnedVectorReference::origin) 13 | .const_method("width", &BinnedVectorReference::width) 14 | .const_method("name", &BinnedVectorReference::name) 15 | ; 16 | class_("BinnedVectors") 17 | .constructor() 18 | .method("add_vector", &BinnedVectors::add_vector) 19 | .field("bins", &BinnedVectors::bins_) 20 | .const_method("bin_i", &BinnedVectors::bin_i) 21 | .const_method("bin", &BinnedVectors::bin) 22 | .const_method("unbin", &BinnedVectors::unbin) 23 | .const_method("nbins", &BinnedVectors::nbins) 24 | ; 25 | } 26 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: bigvis 2 | Version: 0.1.0.9000 3 | Title: Tools for visualisation of big data sets 4 | Description: Tools for visualising large datasets. 5 | Authors@R: c( 6 | person("Hadley", "Wickham", role = c("aut", "cre"), , "hadley@rstudio.com"), 7 | person("Yue", "Hue", role = "aut"), 8 | person("R Core team", role = "ctb", comment = "guess_bandwidth adapted from stats::bw.SJ") 9 | ) 10 | Depends: 11 | Rcpp 12 | Imports: 13 | methods 14 | Suggests: 15 | plyr, 16 | ggplot2, 17 | scales 18 | LazyData: true 19 | LinkingTo: 20 | Rcpp, 21 | BH 22 | License: GPL (>= 2) 23 | Collate: 24 | 'standardise.r' 25 | 'movies.r' 26 | 'RcppExports.R' 27 | 'adjust.r' 28 | 'ranged.r' 29 | 'bigvis.r' 30 | 'rebin.r' 31 | 'autoplot.r' 32 | 'origin.r' 33 | 'utils.r' 34 | 'breaks.r' 35 | 'weighted-stats.r' 36 | 'condense.r' 37 | 'condensed.r' 38 | 'bin.r' 39 | 'smooth.r' 40 | 'challenge.r' 41 | 'peel.r' 42 | 'id.r' 43 | 'rmse.r' 44 | 'width.r' 45 | 'h.r' 46 | 'mt.r' 47 | 'dgrid.r' 48 | -------------------------------------------------------------------------------- /inst/tests/test-stat.r: -------------------------------------------------------------------------------- 1 | context("Stats") 2 | 3 | test_that("linear regression recovers slope & intercept if no errors", { 4 | x <- 1:10 5 | w <- rep(1, 10) 6 | 7 | expect_equal(regress(x, x * 2, w), c(0, 2)) 8 | expect_equal(regress(x, x * -2, w), c(0, -2)) 9 | expect_equal(regress(x, x * -2 + 5, w), c(5, -2)) 10 | expect_equal(regress(x, x * -2 + -5, w), c(-5, -2)) 11 | }) 12 | 13 | simpleLm <- function(x, y, w) { 14 | unname(coef(lm(y ~ x, weights = w))) 15 | } 16 | 17 | test_that("linear regression matches lm", { 18 | x <- 1:10 19 | y <- 10 + x * 2 + rnorm(10) 20 | w <- rep(1, 10) 21 | 22 | expect_equal(regress(x, y, w), simpleLm(x, y, w)) 23 | }) 24 | 25 | test_that("linear regression matches lm with weights", { 26 | x <- 1:10 27 | y <- 10 + x * 2 + rnorm(10) 28 | w <- runif(10) 29 | 30 | expect_equal(regress(x, y, w), simpleLm(x, y, w)) 31 | }) 32 | 33 | test_that("robust regression effectively removes outlier", { 34 | x <- 1:10 35 | y <- 10 + x * 2 + c(rep(0, 9), 10) 36 | w <- rep(1, 10) 37 | 38 | expect_equal(regress_robust(x, y, w, 10), c(10, 2)) 39 | }) 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /man/frange.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{frange} 4 | \alias{frange} 5 | \title{Efficient implementation of range.} 6 | \usage{ 7 | frange(x, finite = TRUE) 8 | } 9 | \arguments{ 10 | \item{x}{a numeric vector, or a \code{\link{ranged}} object} 11 | 12 | \item{finite}{If \code{TRUE} ignores missing values and infinities. Note 13 | that if the vector is empty, or only contains missing values, 14 | \code{frange} will return \code{c(Inf, -Inf)} because those are the 15 | identity values for \code{\link{min}} and \code{\link{max}} respectively.} 16 | } 17 | \description{ 18 | This is an efficient C++ implementation of range for numeric vectors: 19 | it avoids S3 dispatch, and computes both min and max in a single pass 20 | through the input. 21 | } 22 | \details{ 23 | If \code{x} has a \code{range} attribute (e.g. it's a \code{\link{ranged}} 24 | object), it will be used instead of computing the range from scratch. 25 | } 26 | \examples{ 27 | x <- runif(1e6) 28 | system.time(range(x)) 29 | system.time(frange(x)) 30 | 31 | rx <- ranged(x) 32 | system.time(frange(rx)) 33 | } 34 | 35 | -------------------------------------------------------------------------------- /man/weighted.quantile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/weighted-stats.r 3 | \name{weighted.quantile} 4 | \alias{weighted.quantile} 5 | \title{Compute quantiles of weighted data.} 6 | \usage{ 7 | weighted.quantile(x, w, probs = seq(0, 1, 0.25), na.rm = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{numeric vector of observations} 11 | 12 | \item{w}{integer vector of weights, representing the number of 13 | time each \code{x} was observed} 14 | 15 | \item{probs}{numeric vector of probabilities between 0 and 1} 16 | 17 | \item{na.rm}{If \code{TRUE} will automatically remove missing values 18 | in \code{x} or \code{w}.} 19 | } 20 | \description{ 21 | Compute quantiles of weighted data. 22 | } 23 | \details{ 24 | Currently only implements the type 7 algorithm, as described in 25 | \code{\link{quantile}}. Based on \code{\link{quantile}} written by R-core. 26 | } 27 | \examples{ 28 | x <- runif(200) 29 | w <- rpois(200, 5) + 1 30 | weighted.quantile(x, w) 31 | } 32 | \seealso{ 33 | Other weighted statistics: \code{\link{weighted.ecdf}}; 34 | \code{\link{weighted.sd}}, \code{\link{weighted.var}} 35 | } 36 | 37 | -------------------------------------------------------------------------------- /man/dchallenge.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/challenge.r 3 | \name{dchallenge} 4 | \alias{dchallenge} 5 | \alias{rchallenge} 6 | \title{Density and random number generation functions for a challenging 7 | distribution.} 8 | \usage{ 9 | dchallenge(x) 10 | 11 | rchallenge(n) 12 | } 13 | \arguments{ 14 | \item{x}{values to evaluate pdf at} 15 | 16 | \item{n}{number of random samples to generate} 17 | } 18 | \description{ 19 | This is a 1/3-2/3 mixture of a t-distribution with 2 degrees of freedom 20 | centered at 15 and scaled by 2, and a gamma distribution with shape 2 21 | and rate 1/3. (The t-distribution is windsorised at 0, but this 22 | has negligible effect.) This distribution is challenging because it 23 | mixes heavy tailed and asymmetric distributions. 24 | } 25 | \examples{ 26 | plot(dchallenge, xlim = c(-5, 60), n = 500) 27 | 28 | x <- rchallenge(1e4) 29 | hist(x, breaks = 1000) 30 | xsum <- condense(bin(x, 0.1)) 31 | plot(xsum$x, xsum$.count, type = "l") 32 | xsmu <- smooth(xsum, 0.3) 33 | plot(xsmu$x, xsmu$.count, type = "l") 34 | plot(xsmu$x, xsmu$.count, type = "l", xlim = c(0, 30)) 35 | } 36 | 37 | -------------------------------------------------------------------------------- /R/origin.r: -------------------------------------------------------------------------------- 1 | #' Find the origin. 2 | #' 3 | #' @details 4 | #' This algorithm implements simple heuristics for determining the origin of 5 | #' a histogram when only the binwidth is specified. It: 6 | #' 7 | #' \itemize{ 8 | #' \item rounds to zero, if relatively close 9 | #' \item subtracts 0.5 offset, if an x is integer 10 | #' \item ensures the origin is a multiple of the binwidth 11 | #' } 12 | #' @param x numeric or integer vector 13 | #' @param binwidth binwidth 14 | #' @export 15 | #' @keywords internal 16 | #' @family reasonable defaults 17 | #' @examples 18 | #' find_origin(1:10, 1) 19 | #' find_origin(1:10, 2) 20 | #' find_origin(c(1, 1e6), 1) 21 | find_origin <- function(x, binwidth) { 22 | rng <- frange(x, finite = TRUE) 23 | if (!all(is.finite(rng))) stop("No valid values in x", call. = FALSE) 24 | 25 | offset <- is.integer(x) * 0.5 26 | 27 | if (close_to_zero(rng[1], rng)) { 28 | 0 - offset 29 | } else { 30 | floor_any(rng[1], binwidth) - offset 31 | } 32 | } 33 | 34 | close_to_zero <- function(x, rng) { 35 | (abs(x) / abs(rng[2] - rng[1])) < 1e-3 36 | } 37 | 38 | floor_any <- function(x, accuracy) { 39 | floor(x / accuracy) * accuracy 40 | } 41 | -------------------------------------------------------------------------------- /man/condense.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/condense.r 3 | \name{condense} 4 | \alias{condense} 5 | \title{Efficient binned summaries.} 6 | \usage{ 7 | condense(..., z = NULL, summary = NULL, w = NULL, drop = NULL) 8 | } 9 | \arguments{ 10 | \item{...}{group objects created by \code{\link{bin}}} 11 | 12 | \item{z}{a numeric vector to summary for each group. Optional for some 13 | summary statistics.} 14 | 15 | \item{summary}{the summary statistic to use. Currently must be one of 16 | count, sum, mean, median or sd. If \code{NULL}, defaults to mean if 17 | y is present, count if not.} 18 | 19 | \item{w}{a vector of weights. Not currently supported by all summary 20 | functions.} 21 | 22 | \item{drop}{if \code{TRUE} only locations with data will be returned. This 23 | is more efficient if the data is very sparse (<1\% of cells filled), and 24 | is slightly less efficient. Defaults to \code{TRUE} if you are condensing 25 | over two or more dimensions, \code{FALSE} for 1d.} 26 | } 27 | \description{ 28 | Efficient binned summaries. 29 | } 30 | \examples{ 31 | x <- runif(1e5) 32 | gx <- bin(x, 0.1) 33 | condense(gx) 34 | } 35 | 36 | -------------------------------------------------------------------------------- /man/bin.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/bin.r 3 | \name{bin} 4 | \alias{bin} 5 | \title{Create a binned variable.} 6 | \usage{ 7 | bin(x, width = find_width(x), origin = find_origin(x, width), name = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{numeric or integer vector} 11 | 12 | \item{width}{bin width. If not specified, about 10,000 bins will be chosen 13 | using the algorithim in \code{\link{find_width}}.} 14 | 15 | \item{origin}{origin. If not specified, guessed by \code{\link{find_origin}}.} 16 | 17 | \item{name}{name of original variable. This will be guessed from the input to 18 | \code{group} if not supplied. Used in the output of 19 | \code{\link{condense}} etc.} 20 | } 21 | \description{ 22 | Create a binned variable. 23 | } 24 | \details{ 25 | This function produces an R reference class that wraps around a C++ function. 26 | Generally, you should just treat this as an opaque object with reference 27 | semantics, and you shouldn't call the methods on it - pass it to 28 | \code{\link{condense}} and friends. 29 | } 30 | \examples{ 31 | x <- runif(1e6) 32 | bin(x) 33 | bin(x, 0.01) 34 | bin(x, 0.01, origin = 0.5) 35 | } 36 | 37 | -------------------------------------------------------------------------------- /man/mt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/mt.r 3 | \name{mt} 4 | \alias{inv_mt} 5 | \alias{mt} 6 | \alias{mt_trans} 7 | \title{Modulus transformation (and its inverse).} 8 | \usage{ 9 | mt(x, lambda) 10 | 11 | inv_mt(x, lambda) 12 | 13 | mt_trans(lambda) 14 | } 15 | \arguments{ 16 | \item{x}{values to transform} 17 | 18 | \item{lambda}{degree of transformation} 19 | } 20 | \description{ 21 | A generalisation of the box-cox transformation that works for 22 | values with both positive and negative values. 23 | } 24 | \details{ 25 | This is useful for compressing the tails of long-tailed distributions, 26 | often encountered with very large datasets. 27 | } 28 | \examples{ 29 | x <- seq(-10, 10, length = 100) 30 | plot(x, mt(x, 0), type = "l") 31 | plot(x, mt(x, 0.25), type = "l") 32 | plot(x, mt(x, 0.5), type = "l") 33 | plot(x, mt(x, 1), type = "l") 34 | plot(x, mt(x, 2), type = "l") 35 | plot(x, mt(x, -1), type = "l") 36 | plot(x, mt(x, -2), type = "l") 37 | } 38 | \references{ 39 | J. John and N. Draper. "An alternative family of 40 | transformations." Applied Statistics, pages 190-197, 1980. 41 | \url{http://www.jstor.org/stable/2986305} 42 | } 43 | 44 | -------------------------------------------------------------------------------- /man/weighted.ecdf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/weighted-stats.r 3 | \name{weighted.ecdf} 4 | \alias{weighted.ecdf} 5 | \title{A weighted ecdf function.} 6 | \usage{ 7 | weighted.ecdf(x, w) 8 | } 9 | \arguments{ 10 | \item{x}{numeric vector of observations} 11 | 12 | \item{w}{integer vector of weights, representing the number of 13 | time each \code{x} was observed} 14 | } 15 | \description{ 16 | An extension of the base \code{\link[stats]{ecdf}} function which works 17 | with weighted data. 18 | } 19 | \section{S3 methods}{ 20 | 21 | The \code{ecdf} class has methods for \code{\link{plot}}, 22 | \code{\link{lines}}, \code{\link{summary}} and \code{\link{quantile}}. 23 | \code{\link{quantile}} does not currently correctly compute values for 24 | weighted ecdfs. 25 | } 26 | \examples{ 27 | x <- runif(200) 28 | w <- rpois(200, 5) + 1 29 | 30 | e <- weighted.ecdf(x, w) 31 | plot(e) 32 | summary(e) 33 | 34 | y <- x[rep(seq_along(x), w)] 35 | plot(ecdf(y)) 36 | } 37 | \seealso{ 38 | \code{\link[stats]{weighted.mean}} 39 | 40 | Other weighted statistics: \code{\link{weighted.quantile}}; 41 | \code{\link{weighted.sd}}, \code{\link{weighted.var}} 42 | } 43 | 44 | -------------------------------------------------------------------------------- /man/autoplot.condensed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/autoplot.r 3 | \name{autoplot.condensed} 4 | \alias{autoplot.condensed} 5 | \title{Autoplot condensed summaries.} 6 | \usage{ 7 | \method{autoplot}{condensed}(x, var = last(summary_vars(x)), ...) 8 | } 9 | \arguments{ 10 | \item{x}{a condensed summary} 11 | 12 | \item{var}{which summary variable to display} 13 | 14 | \item{...}{other arguments passed on to individual methods} 15 | } 16 | \description{ 17 | Autoplot condensed summaries. 18 | } 19 | \examples{ 20 | if (require("ggplot2")) { 21 | 22 | # 1d summaries ----------------------------- 23 | x <- rchallenge(1e4) 24 | z <- x + rt(length(x), df = 2) 25 | xsum <- condense(bin(x, 0.1)) 26 | zsum <- condense(bin(x, 0.1), z = z) 27 | 28 | autoplot(xsum) 29 | autoplot(peel(xsum)) 30 | 31 | autoplot(zsum) 32 | autoplot(peel(zsum, keep = 1)) 33 | autoplot(peel(zsum)) 34 | 35 | # 2d summaries ----------------------------- 36 | y <- runif(length(x)) 37 | xysum <- condense(bin(x, 0.1), bin(y, 0.1)) 38 | xyzsum <- condense(bin(x, 0.1), bin(y, 0.1), z = z) 39 | 40 | autoplot(xysum) 41 | autoplot(peel(xysum)) 42 | autoplot(xyzsum) 43 | autoplot(peel(xyzsum)) 44 | } 45 | } 46 | 47 | -------------------------------------------------------------------------------- /src/condense-gen.r: -------------------------------------------------------------------------------- 1 | library(whisker) 2 | 3 | # Generate template specialisations for groupwise - these are the functions 4 | # that are called from R. 5 | 6 | summaries <- c( 7 | count = "Sum(0)", 8 | sum = "Sum(1)", 9 | mean = "Moments(1)", 10 | sd = "Moments(2)", 11 | median = "Median()" 12 | ) 13 | 14 | template <- " 15 | // [[Rcpp::export]] 16 | List condense_{{name}}(const List& x, const NumericVector& z, 17 | const NumericVector& weight, bool drop = false) { 18 | if (drop) { 19 | return sparse_condense(BinnedVectors(x), z, weight, Summary{{summary}}); 20 | } else { 21 | return condense(BinnedVectors(x), z, weight, Summary{{summary}}); 22 | } 23 | } 24 | " 25 | 26 | cpp_fun <- function(summary) { 27 | whisker.render(template, list( 28 | name = tolower(summary), 29 | summary = summaries[[summary]] 30 | )) 31 | } 32 | 33 | 34 | groupwise <- readLines("condense.cpp") 35 | split <- which(grepl("// -{40,}", groupwise))[1] 36 | original <- groupwise[1:split] 37 | 38 | writeLines(original, "condense.cpp") 39 | 40 | cat("// Autogenerated by condense-gen.r\n", file = "condense.cpp", append = TRUE) 41 | funs <- unlist(lapply(names(summaries), cpp_fun)) 42 | cat(funs, file = "condense.cpp", append = TRUE, sep = "") 43 | -------------------------------------------------------------------------------- /man/transform.condensed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/rebin.r 3 | \name{transform.condensed} 4 | \alias{rebin} 5 | \alias{transform.condensed} 6 | \title{Transform condensed objects, collapsing unique bins.} 7 | \usage{ 8 | \\method{transform}{condensed}(`_data`, ...) 9 | 10 | rebin(data) 11 | } 12 | \arguments{ 13 | \item{...}{named arguments evaluated in the context of the data} 14 | 15 | \item{data,`_data`}{a condensed summary} 16 | } 17 | \description{ 18 | Transform condensed objects, collapsing unique bins. 19 | } 20 | \details{ 21 | You don't need to use \code{rebin} if you use transform: it will 22 | automatically rebin for you. You will need to use it if you manually 23 | transform any grouping variables. 24 | } 25 | \examples{ 26 | x <- runif(1e4, -1, 1) 27 | xsum <- condense(bin(x, 1 / 50)) 28 | 29 | # Transforming by hand: must use rebin 30 | xsum$x <- abs(xsum$x) 31 | rebin(xsum) 32 | if (require("ggplot2")) { 33 | autoplot(xsum) + geom_point() 34 | autoplot(rebin(xsum)) + geom_point() 35 | } 36 | 37 | #' Transforming with transform 38 | y <- x ^ 2 + runif(length(x), -0.1, 0.1) 39 | xysum <- condense(bin(x, 1 / 50), z = y) 40 | xysum <- transform(xysum, x = abs(x)) 41 | if (require("ggplot2")) { 42 | autoplot(xysum) 43 | } 44 | } 45 | \keyword{internal} 46 | 47 | -------------------------------------------------------------------------------- /man/ranged.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/ranged.r 3 | \name{ranged} 4 | \alias{ranged} 5 | \title{A S3 class for caching the range of a vector} 6 | \usage{ 7 | ranged(x, range = frange(x, finite = TRUE)) 8 | } 9 | \arguments{ 10 | \item{x}{a numeric vector} 11 | 12 | \item{range}{the range of the vector (excluding missing values), if known. 13 | If unknown, it will be computed with \code{\link{frange}}, a fast C++ 14 | implementation of \code{\link{range}}.} 15 | } 16 | \description{ 17 | This class is designed for dealing with large vectors, where the cost of 18 | recomputing the range multiple times is prohibitive. It provides methods 19 | for \code{\link{print}} and \code{\link{str}} that display only the range, 20 | not the contents. 21 | } 22 | \section{Performance}{ 23 | 24 | For best performance, you may want to run copy and paste the contents of 25 | this function into your function, to avoid making any copies of \code{x}. 26 | This is probably only necessary if you're dealing with extremely large 27 | vectors, > 100 million obs. 28 | } 29 | \examples{ 30 | x <- runif(1e6) 31 | y <- ranged(x) 32 | range(y) 33 | y 34 | str(y) 35 | 36 | # Modifications to the class currently destroy the cache 37 | y[1] <- 10 38 | max(y) 39 | class(y) 40 | z <- y + 10 41 | max(z) 42 | class(z) 43 | } 44 | 45 | -------------------------------------------------------------------------------- /man/weighted.var.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/weighted-stats.r 3 | \name{weighted.var} 4 | \alias{weighted.sd} 5 | \alias{weighted.var} 6 | \title{Compute a weighted variance or standard deviation of a vector.} 7 | \usage{ 8 | weighted.var(x, w = NULL, na.rm = FALSE) 9 | 10 | weighted.sd(x, w, na.rm = TRUE) 11 | } 12 | \arguments{ 13 | \item{x}{numeric vector of observations} 14 | 15 | \item{w}{integer vector of weights, representing the number of 16 | time each \code{x} was observed} 17 | 18 | \item{na.rm}{if \code{TRUE}, missing values in both \code{w} and \code{x} 19 | will be removed prior computation. Otherwise if there are missing values 20 | the result will always be missing.} 21 | } 22 | \description{ 23 | Compute a weighted variance or standard deviation of a vector. 24 | } 25 | \details{ 26 | Note that unlike the base R \code{\link{var}} function, these functions only 27 | work with individual vectors not matrices or data frames. 28 | } 29 | \examples{ 30 | x <- c(1:5) 31 | w <- rpois(5, 5) + 1 32 | y <- x[rep(seq_along(x), w)] 33 | weighted.var(x, w) 34 | var(y) 35 | 36 | stopifnot(all.equal(weighted.var(x, w), var(y))) 37 | } 38 | \seealso{ 39 | \code{\link[stats]{weighted.mean}} 40 | 41 | Other weighted statistics: \code{\link{weighted.ecdf}}; 42 | \code{\link{weighted.quantile}} 43 | } 44 | 45 | -------------------------------------------------------------------------------- /R/movies.r: -------------------------------------------------------------------------------- 1 | #' Movie information and user ratings from IMDB.com. 2 | #' 3 | #' The internet movie database, \url{http://imdb.com/}, is a website devoted 4 | #' to collecting movie data supplied by studios and fans. It claims to be the 5 | #' biggest movie database on the web and is run by amazon. More about 6 | #' information imdb.com can be found online, 7 | #' \url{http://imdb.com/help/show_leaf?about}, including information about 8 | #' the data collection process, 9 | #' \url{http://imdb.com/help/show_leaf?infosource}. 10 | #' 11 | #' Movies were selected for inclusion if they had a known length and had been rated by at least one imdb user. The data set contains the following fields: 12 | #' 13 | #' \itemize{ 14 | #' \item title. Title of the movie. 15 | #' \item year. Year of release. 16 | #' \item budget. Total budget (if known) in US dollars 17 | #' \item length. Length in minutes. 18 | #' \item rating. Average IMDB user rating. 19 | #' \item votes. Number of IMDB users who rated this movie. 20 | #' \item mpaa. MPAA rating. 21 | #' \item action, animation, comedy, drama, documentary, romance, short: 22 | #' \code{TRUE} if movie belongs to that genre. 23 | #' } 24 | #' 25 | #' @docType data 26 | #' @usage data(movies) 27 | #' @name movies 28 | #' @format A data frame with 130,456 rows and 14 variables 29 | #' @references \url{http://had.co.nz/data/movies/} 30 | NULL 31 | -------------------------------------------------------------------------------- /bench/kernel.cpp: -------------------------------------------------------------------------------- 1 | // Differences in kernel performance 2 | 3 | #include 4 | using namespace Rcpp; 5 | 6 | // [[Rcpp::export]] 7 | NumericVector normal_kernel(NumericVector x) { 8 | int n = x.size(); 9 | NumericVector out(n); 10 | 11 | for (int i = 0; i < n; ++i) { 12 | out[i] = R::dnorm(x[i], 0, 1, 0); 13 | } 14 | 15 | return out; 16 | } 17 | 18 | // [[Rcpp::export]] 19 | double tricube2(double x) { 20 | x = fabs(x); 21 | if (x > 1) return 0; 22 | 23 | return pow(1 - pow(x, 3), 3); 24 | } 25 | 26 | // [[Rcpp::export]] 27 | double tricube(double x) { 28 | x = fabs(x); 29 | if (x > 1) return 0; 30 | 31 | double y = 1 - x * x * x; 32 | return y * y * y; 33 | } 34 | 35 | // [[Rcpp::export]] 36 | NumericVector tricube_kernel(NumericVector x) { 37 | int n = x.size(); 38 | NumericVector out(n); 39 | 40 | for (int i = 0; i < n; ++i) { 41 | out[i] = tricube(x[i]); 42 | } 43 | 44 | return out; 45 | } 46 | 47 | // [[Rcpp::export]] 48 | NumericVector copy(NumericVector x) { 49 | int n = x.size(); 50 | NumericVector out(n); 51 | 52 | for (int i = 0; i < n; ++i) { 53 | out[i] = x[i]; 54 | } 55 | 56 | return out; 57 | } 58 | 59 | /*** R 60 | options(digits = 3) 61 | library(microbenchmark) 62 | 63 | x <- runif(1e4) 64 | 65 | mean(sapply(x, tricube) - sapply(x, tricube2)) 66 | 67 | microbenchmark( 68 | copy(x), 69 | tricube_kernel(x), 70 | normal_kernel(x) 71 | ) 72 | 73 | */ -------------------------------------------------------------------------------- /man/condensed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/condensed.r 3 | \name{condensed} 4 | \alias{as.condensed} 5 | \alias{condensed} 6 | \alias{is.condensed} 7 | \title{Condensed: an S3 class for condensed summaries.} 8 | \usage{ 9 | condensed(groups, grouped, summary) 10 | 11 | is.condensed(x) 12 | 13 | as.condensed(x) 14 | } 15 | \arguments{ 16 | \item{groups}{list of \code{\link{bin}}ed objects} 17 | 18 | \item{grouped,summary}{output from C++ condense function} 19 | 20 | \item{x}{object to test or coerce} 21 | } 22 | \description{ 23 | This object managed the properties of condensed (summarised) data frames. 24 | } 25 | \section{S3 methods}{ 26 | 27 | 28 | Mathematical functions with methods for \code{binsum} object will modify 29 | the x column of the data frame and \code{\link{rebin}} the data, calculating 30 | updated summary statistics. 31 | 32 | Currently methods are provided for the \code{Math} group generic, 33 | logical comparison and arithmetic operators, and 34 | \code{\link[plyr]{round_any}}. 35 | } 36 | \examples{ 37 | if (require("ggplot2")) { 38 | 39 | x <- rchallenge(1e4) 40 | xsum <- condense(bin(x, 1 / 10)) 41 | 42 | # Basic math operations just modify the first column 43 | autoplot(xsum) 44 | autoplot(xsum * 10) 45 | autoplot(xsum - 30) 46 | autoplot(abs(xsum - 30)) 47 | 48 | # Similarly, logical operations work on the first col 49 | autoplot(xsum[xsum > 10, ]) 50 | } 51 | } 52 | \keyword{internal} 53 | 54 | -------------------------------------------------------------------------------- /man/movies.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/movies.r 3 | \docType{data} 4 | \name{movies} 5 | \alias{movies} 6 | \title{Movie information and user ratings from IMDB.com.} 7 | \format{A data frame with 130,456 rows and 14 variables} 8 | \usage{ 9 | data(movies) 10 | } 11 | \description{ 12 | The internet movie database, \url{http://imdb.com/}, is a website devoted 13 | to collecting movie data supplied by studios and fans. It claims to be the 14 | biggest movie database on the web and is run by amazon. More about 15 | information imdb.com can be found online, 16 | \url{http://imdb.com/help/show_leaf?about}, including information about 17 | the data collection process, 18 | \url{http://imdb.com/help/show_leaf?infosource}. 19 | } 20 | \details{ 21 | Movies were selected for inclusion if they had a known length and had been rated by at least one imdb user. The data set contains the following fields: 22 | 23 | \itemize{ 24 | \item title. Title of the movie. 25 | \item year. Year of release. 26 | \item budget. Total budget (if known) in US dollars 27 | \item length. Length in minutes. 28 | \item rating. Average IMDB user rating. 29 | \item votes. Number of IMDB users who rated this movie. 30 | \item mpaa. MPAA rating. 31 | \item action, animation, comedy, drama, documentary, romance, short: 32 | \code{TRUE} if movie belongs to that genre. 33 | } 34 | } 35 | \references{ 36 | \url{http://had.co.nz/data/movies/} 37 | } 38 | 39 | -------------------------------------------------------------------------------- /man/standardise.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/standardise.r 3 | \name{standardise} 4 | \alias{standardise} 5 | \title{Standardise a summary to sum to one.} 6 | \usage{ 7 | standardise(x, margin = integer()) 8 | } 9 | \arguments{ 10 | \item{x}{a condensed summary. Must have \code{.count} variable.} 11 | 12 | \item{margin}{margins to standardise along. If \code{NULL}, the default, 13 | standardises the whole array.} 14 | } 15 | \description{ 16 | Standardise a summary to sum to one. 17 | } 18 | \examples{ 19 | b1 <- condense(bin(movies$year, 1)) 20 | d1 <- smooth(b1, 2, type = "reg") 21 | 22 | if (require("ggplot2")) { 23 | 24 | autoplot(b1) 25 | autoplot(d1) 26 | 27 | # Note change in x-axis limits 28 | autoplot(standardise(d1)) 29 | } 30 | 31 | # Can also standardise a dimension at a time 32 | b2 <- with(movies, condense(bin(year, 2), bin(length, 10))) 33 | b2 <- peel(b2, central = TRUE) 34 | 35 | if (require("ggplot2")) { 36 | 37 | autoplot(b2) 38 | autoplot(standardise(b2)) # note legend 39 | autoplot(standardise(b2, "year")) # each row sums to 1 40 | autoplot(standardise(b2, "length")) # each col sums to 1 41 | 42 | base <- ggplot(b2, aes(length, .count)) + 43 | geom_line(aes(group = year, colour = year)) 44 | base 45 | base \%+\% standardise(b2) # Just affects y axis labels 46 | base \%+\% standardise(b2, "year") # Makes year comparable 47 | base \%+\% standardise(b2, "length") # Meaningless for this display 48 | 49 | } 50 | } 51 | 52 | -------------------------------------------------------------------------------- /inst/tests/test-smooth.r: -------------------------------------------------------------------------------- 1 | context("Smooth") 2 | 3 | tricube <- function(x) { 4 | x <- abs(x) 5 | ifelse(x > 1, 0, (1 - x ^ 3) ^ 3) 6 | } 7 | # plot(tricube, xlim = c(-1.5, 1.5)) 8 | 9 | test_that("factorised smooth equal to manual smooth", { 10 | grid <- as.matrix(expand.grid(x = 1:10, y = 1:10, KEEP.OUT.ATTRS = FALSE)) 11 | z <- rep(0, nrow(grid)) 12 | z[c(5, 23, 84)] <- 1 13 | 14 | z_x <- smooth_nd_1(grid, z, numeric(), grid, 0, 3) 15 | z_y <- smooth_nd_1(grid, z, numeric(), grid, 1, 3) 16 | 17 | z_xy <- smooth_nd_1(grid, z_x, numeric(), grid, 1, 3) 18 | z_yx <- smooth_nd_1(grid, z_y, numeric(), grid, 0, 3) 19 | z2 <- smooth_nd(grid, z, numeric(), grid, c(3, 3)) 20 | 21 | expect_equal(z_xy, z2) 22 | expect_equal(z_yx, z2) 23 | }) 24 | 25 | # library(ggplot2) 26 | # qplot(grid[, 1], grid[, 2], fill = z, geom = "raster") 27 | # qplot(grid[, 1], grid[, 2], fill = z_xy, geom = "raster") 28 | # qplot(grid[, 1], grid[, 2], fill = z_yx, geom = "raster") 29 | # qplot(grid[, 1], grid[, 2], fill = z2, geom = "raster") 30 | 31 | test_that("factorised smooth equal to manual smooth", { 32 | grid <- as.matrix(expand.grid(x = 1:10, y = 1:10, KEEP.OUT.ATTRS = FALSE)) 33 | z <- rep(0, nrow(grid)) 34 | z[c(5, 23, 84)] <- 1 35 | 36 | grid <- as.data.frame(grid) 37 | grid$.count <- z 38 | class(grid) <- c("condensed", class(grid)) 39 | 40 | z1 <- smooth(grid, c(3, 3), ".count", factor = FALSE) 41 | z2 <- smooth(grid, c(3, 3), ".count", factor = TRUE) 42 | 43 | expect_equal(z1, z2) 44 | }) 45 | -------------------------------------------------------------------------------- /man/peel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/peel.r 3 | \name{peel} 4 | \alias{peel} 5 | \title{Peel off low density regions of the data.} 6 | \usage{ 7 | peel(x, keep = 0.99, central = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{condensed summary} 11 | 12 | \item{keep}{(approximate) proportion of data to keep. If \code{1}, will 13 | remove all cells with counts. All missing values will be preserved.} 14 | 15 | \item{central}{if \code{TRUE} peels off regions of lowest density only from 16 | the outside of the data. In 2d this works by progressively peeling off 17 | convex hull of the data: the current algorithm is quite slow. 18 | If \code{FALSE}, just removes the lowest density regions wherever they are 19 | found. Regions with 0 density are removed regardless of location. 20 | Defaults to TRUE if there are two or fewer grouping variables is less.} 21 | } 22 | \description{ 23 | Keeps specified proportion of data by removing the lowest density regions, 24 | either anywhere on the plot, or for 2d, just around the edges. 25 | } 26 | \details{ 27 | This is useful for visualisation, as an easy way of focussing on the regions 28 | where the majority of the data lies. 29 | } 30 | \examples{ 31 | x <- rt(1e5, df = 2) 32 | y <- rt(1e5, df = 2) 33 | xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 10)) 34 | plot(xysum$x, xysum$y) 35 | 36 | plot(peel(xysum, 0.95, central = TRUE)[1:2]) 37 | plot(peel(xysum, 0.90, central = TRUE)[1:2]) 38 | plot(peel(xysum, 0.50, central = TRUE)[1:2]) 39 | } 40 | 41 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2 (4.1.1): do not edit by hand 2 | 3 | S3method("[",dgrid) 4 | S3method("[<-",ranged) 5 | S3method(Math,condensed) 6 | S3method(Ops,condensed) 7 | S3method(Ops,ranged) 8 | S3method(as.condensed,condensed) 9 | S3method(as.condensed,data.frame) 10 | S3method(as.data.frame,dgrid) 11 | S3method(as.data.frame,ranged) 12 | S3method(as.integer,dgrid) 13 | S3method(max,dgrid) 14 | S3method(max,ranged) 15 | S3method(min,dgrid) 16 | S3method(min,ranged) 17 | S3method(print,ranged) 18 | S3method(range,dgrid) 19 | S3method(range,ranged) 20 | S3method(str,ranged) 21 | S3method(transform,condensed) 22 | export(as.condensed) 23 | export(autoplot.condensed) 24 | export(best_h) 25 | export(bin) 26 | export(breaks) 27 | export(condense) 28 | export(dchallenge) 29 | export(dgrid) 30 | export(find_origin) 31 | export(find_width) 32 | export(frange) 33 | export(h_grid) 34 | export(inv_mt) 35 | export(is.condensed) 36 | export(is.dgrid) 37 | export(is.ranged) 38 | export(mt) 39 | export(mt_trans) 40 | export(peel) 41 | export(ranged) 42 | export(rchallenge) 43 | export(rebin) 44 | export(rmse_cv) 45 | export(rmse_cvs) 46 | export(round_any.condensed) 47 | export(smooth) 48 | export(standardise) 49 | export(weighted.IQR) 50 | export(weighted.ecdf) 51 | export(weighted.median) 52 | export(weighted.quantile) 53 | export(weighted.sd) 54 | export(weighted.var) 55 | exportMethods(as.integer) 56 | exportMethods(show) 57 | importFrom(Rcpp,compileAttributes) 58 | importFrom(Rcpp,cpp_object_initializer) 59 | importFrom(methods,new) 60 | useDynLib(bigvis) 61 | -------------------------------------------------------------------------------- /R/dgrid.r: -------------------------------------------------------------------------------- 1 | #' dgrid: an S3 class for data grids 2 | #' 3 | #' @param x a numeric vector to test or coerce. 4 | #' @param width bin width 5 | #' @param origin bin origins 6 | #' @param nbins number of bins 7 | #' @export 8 | #' @examples 9 | #' g <- dgrid(0:10 + 0.5, width = 1) 10 | #' range(g) 11 | #' as.integer(g) 12 | dgrid <- function(x, width, origin = 0, nbins = NULL) { 13 | stopifnot(is.numeric(x)) 14 | stopifnot(is.numeric(width), length(width) == 1, width > 0) 15 | stopifnot(is.numeric(origin), length(origin) == 1) 16 | 17 | if (is.null(nbins)) { 18 | nbins <- floor((max(x) - origin) / width) 19 | } 20 | 21 | structure(x, class = c("dgrid", "numeric"), 22 | width = width, origin = origin, nbins = nbins) 23 | } 24 | 25 | #' @export 26 | #' @rdname dgrid 27 | is.dgrid <- function(x) inherits(x, "dgrid") 28 | 29 | #' @export 30 | "[.dgrid" <- function(x, ...) { 31 | dgrid(NextMethod(), width = attr(x, "width"), 32 | origin = attr(x, "origin"), nbins = attr(x, "nbins")) 33 | } 34 | 35 | #' @export 36 | min.dgrid <- function(x, ...) attr(x, "origin") 37 | #' @export 38 | max.dgrid <- function(x, ...) { 39 | min(x) + attr(x, "nbins") * attr(x, "width") 40 | } 41 | #' @export 42 | range.dgrid <- function(x, ...) c(min(x), max(x)) 43 | 44 | #' @export 45 | as.integer.dgrid <- function(x, ...) { 46 | as.integer((unclass(x) - attr(x, "origin")) / attr(x, "width") + 1L) 47 | } 48 | 49 | #' @export 50 | as.data.frame.dgrid <- function(x, ...) { 51 | n <- length(x) 52 | list <- list(x) 53 | class(list) <- "data.frame" 54 | attr(list, "row.names") <- c(NA_integer_, -n) 55 | list 56 | } 57 | -------------------------------------------------------------------------------- /man/rmse_cvs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/rmse.r 3 | \name{rmse_cvs} 4 | \alias{rmse_cv} 5 | \alias{rmse_cvs} 6 | \title{Estimate smoothing RMSE using leave-one-out cross-validation.} 7 | \usage{ 8 | rmse_cvs(x, hs = h_grid(x), ...) 9 | 10 | rmse_cv(x, h, var = summary_vars(x)[1], ...) 11 | } 12 | \arguments{ 13 | \item{x}{condensed summary table} 14 | 15 | \item{...}{other variables passed on to \code{\link{smooth}}} 16 | 17 | \item{h,hs}{for \code{rmse_cv}, a vector of bandwidths; for \code{rmse_cv} 18 | a data frame of bandwidths, as generated by \code{\link{h_grid}}.} 19 | 20 | \item{var}{variable to smooth} 21 | } 22 | \description{ 23 | \code{rmse_cv} computes the leave-one-out RMSE for a single vector of 24 | bandwidths, \code{rmse_cvs} computes for a multiple vectors of bandwidths, 25 | stored as a data frame. 26 | } 27 | \examples{ 28 | \donttest{ 29 | set.seed(1014) 30 | # 1d ----------------------------- 31 | x <- rchallenge(1e4) 32 | xsum <- condense(bin(x, 1 / 10)) 33 | cvs <- rmse_cvs(xsum) 34 | 35 | if (require("ggplot2")) { 36 | autoplot(xsum) 37 | qplot(x, err, data = cvs, geom = "line") 38 | xsmu <- smooth(xsum, 1.3) 39 | autoplot(xsmu) 40 | autoplot(peel(xsmu)) 41 | } 42 | 43 | # 2d ----------------------------- 44 | y <- runif(1e4) 45 | xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100)) 46 | cvs <- rmse_cvs(xysum, h_grid(xysum, 10)) 47 | if (require("ggplot2")) { 48 | qplot(x, y, data = cvs, size = err) 49 | } 50 | } 51 | } 52 | \seealso{ 53 | Other bandwidth estimation functions: \code{\link{best_h}}; 54 | \code{\link{h_grid}} 55 | } 56 | 57 | -------------------------------------------------------------------------------- /src/Summary2d.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include "stats.h" 3 | using namespace Rcpp; 4 | 5 | class Summary2d { 6 | public: 7 | virtual void push(double x, double z, double w) =0; 8 | virtual double compute() =0; 9 | virtual ~Summary2d() {} 10 | }; 11 | 12 | class Summary2dMean: public Summary2d { 13 | double w_, z_; 14 | 15 | public: 16 | Summary2dMean() : w_(0), z_(0) {} 17 | 18 | void push(double x, double z, double w) { 19 | // Rcout << " x: " << x << " z: " << z << " w: " << w << "\n"; 20 | w_ += w; 21 | z_ += z * w; 22 | } 23 | 24 | double compute() { 25 | // Rcout << "Result: " << z_ / w_ << "\n"; 26 | return z_ / w_; 27 | } 28 | }; 29 | 30 | class Summary2dRegression: public Summary2d { 31 | std::vector x_, z_, w_; 32 | 33 | public: 34 | Summary2dRegression() {} 35 | 36 | void push(double x, double z, double w) { 37 | x_.push_back(x); 38 | z_.push_back(z); 39 | w_.push_back(w); 40 | } 41 | 42 | double compute() { 43 | return simpleLinearRegression(x_, z_, w_).alpha; 44 | } 45 | }; 46 | 47 | class Summary2dRobustRegression: public Summary2d { 48 | int iterations_; 49 | std::vector x_, z_, w_; 50 | 51 | public: 52 | Summary2dRobustRegression() : iterations_(3) {} 53 | Summary2dRobustRegression(int iterations) : iterations_(iterations) {} 54 | 55 | void push(double x, double z, double w) { 56 | x_.push_back(x); 57 | z_.push_back(z); 58 | w_.push_back(w); 59 | } 60 | 61 | double compute() { 62 | return simpleRobustRegression(x_, z_, w_, iterations_).alpha; 63 | } 64 | }; 65 | -------------------------------------------------------------------------------- /R/standardise.r: -------------------------------------------------------------------------------- 1 | #' Standardise a summary to sum to one. 2 | #' 3 | #' @param x a condensed summary. Must have \code{.count} variable. 4 | #' @param margin margins to standardise along. If \code{NULL}, the default, 5 | #' standardises the whole array. 6 | #' @export 7 | #' @examples 8 | #' b1 <- condense(bin(movies$year, 1)) 9 | #' d1 <- smooth(b1, 2, type = "reg") 10 | #' 11 | #' if (require("ggplot2")) { 12 | #' 13 | #' autoplot(b1) 14 | #' autoplot(d1) 15 | #' 16 | #' # Note change in x-axis limits 17 | #' autoplot(standardise(d1)) 18 | #' } 19 | #' 20 | #' # Can also standardise a dimension at a time 21 | #' b2 <- with(movies, condense(bin(year, 2), bin(length, 10))) 22 | #' b2 <- peel(b2, central = TRUE) 23 | #' 24 | #' if (require("ggplot2")) { 25 | #' 26 | #' autoplot(b2) 27 | #' autoplot(standardise(b2)) # note legend 28 | #' autoplot(standardise(b2, "year")) # each row sums to 1 29 | #' autoplot(standardise(b2, "length")) # each col sums to 1 30 | #' 31 | #' base <- ggplot(b2, aes(length, .count)) + 32 | #' geom_line(aes(group = year, colour = year)) 33 | #' base 34 | #' base %+% standardise(b2) # Just affects y axis labels 35 | #' base %+% standardise(b2, "year") # Makes year comparable 36 | #' base %+% standardise(b2, "length") # Meaningless for this display 37 | #' 38 | #' } 39 | standardise <- function(x, margin = integer()) { 40 | stopifnot(is.condensed(x), !is.null(x$.count)) 41 | 42 | if (length(margin) == 0) { 43 | x$.count <- prop(x$.count) 44 | } else { 45 | x$.count <- ave(x$.count, id(x[margin]), FUN = prop) 46 | x$.count[is.na(x$.count)] <- 0 47 | } 48 | 49 | x 50 | } 51 | 52 | prop <- function(x) x / sum(x, na.rm = TRUE) 53 | -------------------------------------------------------------------------------- /R/mt.r: -------------------------------------------------------------------------------- 1 | #' Modulus transformation (and its inverse). 2 | #' 3 | #' A generalisation of the box-cox transformation that works for 4 | #' values with both positive and negative values. 5 | #' 6 | #' This is useful for compressing the tails of long-tailed distributions, 7 | #' often encountered with very large datasets. 8 | #' 9 | #' @param x values to transform 10 | #' @param lambda degree of transformation 11 | #' @export 12 | #' @references J. John and N. Draper. "An alternative family of 13 | #' transformations." Applied Statistics, pages 190-197, 1980. 14 | #' \url{http://www.jstor.org/stable/2986305} 15 | #' @examples 16 | #' x <- seq(-10, 10, length = 100) 17 | #' plot(x, mt(x, 0), type = "l") 18 | #' plot(x, mt(x, 0.25), type = "l") 19 | #' plot(x, mt(x, 0.5), type = "l") 20 | #' plot(x, mt(x, 1), type = "l") 21 | #' plot(x, mt(x, 2), type = "l") 22 | #' plot(x, mt(x, -1), type = "l") 23 | #' plot(x, mt(x, -2), type = "l") 24 | mt <- function(x, lambda) { 25 | stopifnot(is.numeric(x)) 26 | stopifnot(is.numeric(lambda), length(lambda) == 1) 27 | 28 | if (lambda == 0) { 29 | sign(x) * log(abs(x) + 1) 30 | } else { 31 | sign(x) * ((abs(x) + 1) ^ lambda - 1) / lambda 32 | } 33 | } 34 | 35 | #' @rdname mt 36 | #' @export 37 | inv_mt <- function(x, lambda) { 38 | stopifnot(is.numeric(x)) 39 | stopifnot(is.numeric(lambda), length(lambda) == 1) 40 | 41 | if (lambda == 0) { 42 | sign(x) * (exp(abs(x)) - 1) 43 | } else { 44 | sign(x) * ((abs(x) * lambda + 1) ^ (1 / lambda) - 1) 45 | } 46 | } 47 | 48 | #' @rdname mt 49 | #' @export 50 | mt_trans <- function(lambda) { 51 | scales::trans_new("modulo", 52 | function(x) mt(x, lambda), 53 | function(x) inv_mt(x, lambda) 54 | ) 55 | } 56 | -------------------------------------------------------------------------------- /src/BinnedVectors.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | int BinnedVectors::bin_i(int i) const { 5 | int bin = 0; 6 | int ngroups = groups_.size(); 7 | 8 | for (int j = 0; j < ngroups; ++j) { 9 | bin += groups_[j].bin_i(i) * bins_[(ngroups - 1) - j]; 10 | } 11 | 12 | return bin; 13 | } 14 | 15 | int BinnedVectors::bin(std::vector x) const { 16 | int ngroups = groups_.size(); 17 | if (x.size() != ngroups) stop("x must be same length as groups"); 18 | int bin = 0; 19 | 20 | for (int j = 0; j < ngroups; ++j) { 21 | int bin_j = groups_[j].bin(x[j]); 22 | bin += bin_j * bins_[(ngroups - 1) - j]; 23 | // Rcout << "group: " << j << " bin: " << bin << " bin_j: " << bin_j << "\n"; 24 | } 25 | 26 | return bin; 27 | } 28 | 29 | std::vector BinnedVectors::unbin(int bin) const { 30 | int ngroups = groups_.size(); 31 | std::vector bins(ngroups); 32 | 33 | // if ngroups = 3, then: 34 | // bin = groups[0].bin(x[0]) * bins[2] (biggest) + 35 | // groups[1].bin(x[1]) * bins[1] + 36 | // groups[2].bin(x[2]) * bins[0] (smallest) 37 | // peel off largest first 38 | // bin_j = bin %/% bin[2] 39 | // groups[0].unbin(bin_j) 40 | // and that goes in last output position 41 | 42 | for (int i = 0, j = ngroups - 1; i < ngroups - 1; ++i, --j) { 43 | int bin_j = bin % bins_[j]; 44 | // Rcout << "group: " << j << " bin: " << bin << " bin_j: " << bin_j << "\n"; 45 | bins[j] = groups_[j].unbin(bin_j); 46 | 47 | bin = (bin - bin_j) / bins_[j]; 48 | } 49 | // Rcout << "group: " << 0 << " bin: " << bin << " bin_j: " << bin << "\n"; 50 | // Special case for last group because x %% 1 = 0 51 | bins[0] = groups_[0].unbin(bin); 52 | 53 | return bins; 54 | } 55 | 56 | -------------------------------------------------------------------------------- /src/group-hex.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Translated from 3 | * https://github.com/d3/d3-plugins/blob/master/hexbin/hexbin.js 4 | * 5 | * Copyright (C) 2013 Hadley Wickham 6 | * Copyright (C) 2012 Mike Bostock (mbostock at gmail dot com) 7 | */ 8 | class GroupHex { 9 | const NumericVector x_; 10 | const NumericVector y_; 11 | double x_width_; 12 | double x_origin_; 13 | double y_width_; 14 | double y_origin_; 15 | double x_bins; 16 | 17 | public: 18 | GroupHex (const NumericVector& x, const NumericVector& y, 19 | double x_width, double y_width, 20 | double x_origin, double y_origin, 21 | double x_max) 22 | : x_(x), y_(y), x_width_(x_width), x_origin_(x_origin), 23 | y_width_(y_width), y_origin_(y_origin) { 24 | if (x.size() != y.size()) stop("x & y are not the same size"); 25 | x_bins = x_max / x_width_ + 1; 26 | } 27 | 28 | int bin_i(int i) const { 29 | double py = ISNAN(y_[i]) ? 0 : (y_[i] - y_origin_) / y_width_ + 1; 30 | int pj = py; 31 | double py1 = py - pj; 32 | 33 | double px = ISNAN(x_[i]) ? 0 : (x_[i] - x_origin_) / x_width_ + 1 - 34 | (pj % 2 ? 0.5 : 0); 35 | int pi = px; 36 | 37 | if (fabs(py1) * 3 > 1) { 38 | double px1 = px - pi, 39 | pi2 = pi + (px < pi ? -1 : 1) / 2, 40 | pj2 = pj + (py < pj ? -1 : 1), 41 | px2 = px - pi2, 42 | py2 = py - pj2; 43 | if (px1 * px1 + py1 * py1 > px2 * px2 + py2 * py2) { 44 | pi = pi2 + (pj % 2 ? 1 : -1) / 2; 45 | pj = pj2; 46 | } 47 | } 48 | 49 | return pj * x_bins + pj; 50 | } 51 | 52 | int size() const { 53 | return x_.size(); 54 | } 55 | }; 56 | -------------------------------------------------------------------------------- /src/frange.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | //' Efficient implementation of range. 5 | //' 6 | //' This is an efficient C++ implementation of range for numeric vectors: 7 | //' it avoids S3 dispatch, and computes both min and max in a single pass 8 | //' through the input. 9 | //' 10 | //' If \code{x} has a \code{range} attribute (e.g. it's a \code{\link{ranged}} 11 | //' object), it will be used instead of computing the range from scratch. 12 | //' 13 | //' @param x a numeric vector, or a \code{\link{ranged}} object 14 | //' @param finite If \code{TRUE} ignores missing values and infinities. Note 15 | //' that if the vector is empty, or only contains missing values, 16 | //' \code{frange} will return \code{c(Inf, -Inf)} because those are the 17 | //' identity values for \code{\link{min}} and \code{\link{max}} respectively. 18 | //' @export 19 | //' @examples 20 | //' x <- runif(1e6) 21 | //' system.time(range(x)) 22 | //' system.time(frange(x)) 23 | //' 24 | //' rx <- ranged(x) 25 | //' system.time(frange(rx)) 26 | // [[Rcpp::export]] 27 | NumericVector frange(const NumericVector& x, const bool finite = true) { 28 | RObject cache = x.attr("range"); 29 | if (cache.sexp_type() == REALSXP) return as(cache); 30 | 31 | NumericVector out(2); 32 | out[0] = INFINITY; 33 | out[1] = -INFINITY; 34 | 35 | int n = x.length(); 36 | for(int i = 0; i < n; ++i) { 37 | if (!finite && R_IsNA(x[i])) { 38 | out[0] = NA_REAL; 39 | out[1] = NA_REAL; 40 | return out; 41 | } 42 | 43 | // If finite, skip infinite values 44 | if (finite && (x[i] == INFINITY || x[i] == -INFINITY)) continue; 45 | 46 | if (x[i] < out[0]) out[0] = x[i]; 47 | if (x[i] > out[1]) out[1] = x[i]; 48 | } 49 | 50 | return out; 51 | } 52 | -------------------------------------------------------------------------------- /R/id.r: -------------------------------------------------------------------------------- 1 | # Copied and pasted from plyr to avoid dependency 2 | 3 | id <- function(.variables, drop = FALSE) { 4 | # Drop all zero length inputs 5 | lengths <- vapply(.variables, length, integer(1)) 6 | .variables <- .variables[lengths != 0] 7 | 8 | if (length(.variables) == 0) { 9 | n <- nrow(.variables) %||% 0L 10 | return(structure(seq_len(n), n = n)) 11 | } 12 | 13 | # Special case for single variable 14 | if (length(.variables) == 1) { 15 | return(id_var(.variables[[1]], drop = drop)) 16 | } 17 | 18 | # Calculate individual ids 19 | ids <- rev(lapply(.variables, id_var, drop = drop)) 20 | p <- length(ids) 21 | 22 | # Calculate dimensions 23 | ndistinct <- vapply(ids, attr, "n", FUN.VALUE = numeric(1), 24 | USE.NAMES = FALSE) 25 | n <- prod(ndistinct) 26 | if (n > 2 ^ 31) { 27 | # Too big for integers, have to use strings, which will be much slower :( 28 | 29 | char_id <- do.call("paste", c(ids, sep = "\r")) 30 | res <- match(char_id, unique(char_id)) 31 | } else { 32 | combs <- c(1, cumprod(ndistinct[-p])) 33 | 34 | mat <- do.call("cbind", ids) 35 | res <- c((mat - 1L) %*% combs + 1L) 36 | } 37 | attr(res, "n") <- n 38 | 39 | 40 | if (drop) { 41 | id_var(res, drop = TRUE) 42 | } else { 43 | structure(as.integer(res), n = attr(res, "n")) 44 | } 45 | } 46 | 47 | id_var <- function(x, drop = FALSE) { 48 | if (length(x) == 0) return(structure(integer(), n = 0L)) 49 | if (!is.null(attr(x, "n")) && !drop) return(x) 50 | 51 | if (is.factor(x) && !drop) { 52 | id <- as.integer(addNA(x, ifany = TRUE)) 53 | n <- length(levels(x)) 54 | } else { 55 | levels <- sort(unique(x), na.last = TRUE) 56 | id <- match(x, levels) 57 | n <- max(id) 58 | } 59 | structure(id, n = n) 60 | } 61 | -------------------------------------------------------------------------------- /R/bin.r: -------------------------------------------------------------------------------- 1 | 2 | #' Create a binned variable. 3 | #' 4 | #' @details 5 | #' This function produces an R reference class that wraps around a C++ function. 6 | #' Generally, you should just treat this as an opaque object with reference 7 | #' semantics, and you shouldn't call the methods on it - pass it to 8 | #' \code{\link{condense}} and friends. 9 | #' 10 | #' @param x numeric or integer vector 11 | #' @param width bin width. If not specified, about 10,000 bins will be chosen 12 | #' using the algorithim in \code{\link{find_width}}. 13 | #' @param origin origin. If not specified, guessed by \code{\link{find_origin}}. 14 | #' @param name name of original variable. This will be guessed from the input to 15 | #' \code{group} if not supplied. Used in the output of 16 | #' \code{\link{condense}} etc. 17 | #' @export 18 | #' @examples 19 | #' x <- runif(1e6) 20 | #' bin(x) 21 | #' bin(x, 0.01) 22 | #' bin(x, 0.01, origin = 0.5) 23 | bin <- function(x, width = find_width(x), origin = find_origin(x, width), 24 | name = NULL) { 25 | stopifnot(is.numeric(x)) 26 | stopifnot(is.numeric(width), length(width) == 1, width > 0) 27 | stopifnot(is.numeric(origin), length(origin) == 1) 28 | 29 | if (is.null(name)) { 30 | name <- deparse(substitute(x)) 31 | } 32 | stopifnot(is.character(name), length(name) == 1) 33 | 34 | if (!is.ranged(x)) { 35 | attr(x, "range") <- frange(x) 36 | class(x) <- "ranged" 37 | } 38 | if (origin > min(x)) { 39 | warning("Origin larger than min(x): some values will be truncated", 40 | call. = FALSE) 41 | } 42 | 43 | BigVis$BinnedVector$new(x, name, width, origin) 44 | } 45 | 46 | 47 | is.binned <- function(x) { 48 | is(x, "Rcpp_BinnedVector") 49 | } 50 | 51 | bins <- function(...) { 52 | BigVis$BinnedVectors$new(list(...)) 53 | } 54 | 55 | -------------------------------------------------------------------------------- /src/group.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "group.h" 3 | #include "group-hex.h" 4 | using namespace Rcpp; 5 | 6 | template 7 | IntegerVector group_out(const Group& group) { 8 | int n = group.size(); 9 | IntegerVector out(n); 10 | for(int i = 0; i < n; ++i) { 11 | out[i] = group.bin_i(i); 12 | } 13 | 14 | return out; 15 | } 16 | 17 | RCPP_MODULE(Group) { 18 | class_("GroupFixed") 19 | .constructor() 20 | .const_method("bin_i", &GroupFixed::bin_i) 21 | .const_method("bin", &GroupFixed::bin) 22 | .const_method("unbin", &GroupFixed::unbin) 23 | 24 | .const_method("size", &GroupFixed::size) 25 | .const_method("nbins", &GroupFixed::nbins) 26 | 27 | .const_method("origin", &GroupFixed::origin) 28 | .const_method("width", &GroupFixed::width) 29 | ; 30 | } 31 | RCPP_EXPOSED_AS(GroupFixed) 32 | RCPP_EXPOSED_WRAP(GroupFixed) 33 | 34 | 35 | // [[Rcpp::export]] 36 | IntegerVector group_fixed(const NumericVector& x, double width, double origin = 0) { 37 | return group_out(GroupFixed(x, width, origin)); 38 | } 39 | 40 | // [[Rcpp::export]] 41 | IntegerVector group_rect(const NumericVector& x, const NumericVector& y, 42 | double x_width, double y_width, 43 | double x_origin, double y_origin) { 44 | return group_out(Group2d( 45 | GroupFixed(x, x_width, x_origin), 46 | GroupFixed(y, y_width, y_origin))); 47 | } 48 | 49 | 50 | // [[Rcpp::export]] 51 | IntegerVector group_hex(const NumericVector& x, const NumericVector& y, 52 | double x_width, double y_width, 53 | double x_origin, double y_origin, 54 | double x_max) { 55 | return group_out(GroupHex(x, y, x_width, y_width, x_origin, y_origin, x_max)); 56 | } 57 | -------------------------------------------------------------------------------- /notes.md: -------------------------------------------------------------------------------- 1 | # Group 2 | 3 | * 1d, nd 4 | 5 | Future work: linear binning 6 | 7 | # Summarise 8 | 9 | * 1d 10 | 11 | * count, sum 12 | * count, mean, sd 13 | * median 14 | 15 | * 2d 16 | * mean 17 | * regression 18 | * robust regression 19 | 20 | * nd 21 | * mean 22 | * regression (with eigen or armadillo) 23 | * robust regression (with eigen or armadillo) 24 | 25 | Future work: 26 | 27 | * skew?, kurt? 28 | * boxplot 29 | * weighted quantiles (C++ version of R code) 30 | * compute standard errors / bootstrap standard errors? 31 | 32 | * infrastructure for passing multiple z 33 | * 2d: cor, lm 34 | 35 | 36 | # Smooth 37 | 38 | Kernel smoothing plus binned summary leads to many common statistics: density =~ bin + smooth, loess =~ mean + smooth, rqss =~ quantile + smooth 39 | 40 | * weights 41 | * smoothing type 42 | * constant 43 | * linear 44 | * robust linear (lowess) 45 | * (linear poisson?) 46 | * leave-one-out cross-validation 47 | * optimisations 48 | * convert to integer grid & use pre-computed grid of kernel values 49 | * hash in smooth_nd_1 and compute more efficiently along 1d 50 | * deal with missing values 51 | 52 | * smooth needs to create complete grid when factor = TRUE 53 | 54 | Think about input data structure: sparse grid, represented as a coordinate list. Binned grid class = integer vector + width/origin/nbins (0 = NA). Most transformations break the grid, in which case all you case preserve is min, max and number of bins. All smoothing methods adapted to work in terms of these integers. Need to extract out bin/unbin into own class (initialised with std::vector of bin sizes) 55 | 56 | Possible that more performance is available by switching to a sparse tensor library. 57 | 58 | # Visualise 59 | 60 | * Product plots 61 | * Standard errors + cut offs 62 | 63 | * Peel: implement nd version using depth -------------------------------------------------------------------------------- /R/condense.r: -------------------------------------------------------------------------------- 1 | #' Efficient binned summaries. 2 | #' 3 | #' @param ... group objects created by \code{\link{bin}} 4 | #' @param z a numeric vector to summary for each group. Optional for some 5 | #' summary statistics. 6 | #' @param summary the summary statistic to use. Currently must be one of 7 | #' count, sum, mean, median or sd. If \code{NULL}, defaults to mean if 8 | #' y is present, count if not. 9 | #' @param w a vector of weights. Not currently supported by all summary 10 | #' functions. 11 | #' @param drop if \code{TRUE} only locations with data will be returned. This 12 | #' is more efficient if the data is very sparse (<1\% of cells filled), and 13 | #' is slightly less efficient. Defaults to \code{TRUE} if you are condensing 14 | #' over two or more dimensions, \code{FALSE} for 1d. 15 | #' @export 16 | #' @examples 17 | #' x <- runif(1e5) 18 | #' gx <- bin(x, 0.1) 19 | #' condense(gx) 20 | condense <- function(..., z = NULL, summary = NULL, w = NULL, drop = NULL) { 21 | gs <- list(...) 22 | if (length(gs) == 1 && is.list(gs[[1]])) gs <- gs[[1]] 23 | 24 | is_binned <- vapply(gs, is.binned, logical(1)) 25 | if (!all(is_binned)) { 26 | stop("All objects passed to ... must be binned.", call. = FALSE) 27 | } 28 | 29 | drop <- drop %||% (length(gs) > 1) 30 | 31 | if (is.null(summary)) { 32 | summary <- if (is.null(z)) "count" else "mean" 33 | message("Summarising with ", summary) 34 | } 35 | 36 | # C++ code can deal with NULL inputs more efficiently than R code 37 | z <- z %||% numeric() 38 | w <- w %||% numeric() 39 | 40 | # Check lengths consistent 41 | n <- gs[[1]]$size() 42 | stopifnot(length(z) == 0 || length(z) == n) 43 | stopifnot(length(w) == 0 || length(w) == n) 44 | 45 | f <- find_fun(paste("condense", summary, sep = "_")) 46 | out <- f(gs, z, w, drop = drop) 47 | 48 | condensed(gs, out[[1]], out[[2]]) 49 | } 50 | -------------------------------------------------------------------------------- /man/smooth.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/smooth.r 3 | \name{smooth} 4 | \alias{smooth} 5 | \title{Smooth a condensed data frame.} 6 | \usage{ 7 | smooth(x, h, var = summary_vars(x)[1], grid = NULL, type = "mean", 8 | factor = TRUE) 9 | } 10 | \arguments{ 11 | \item{x}{a condensed summary} 12 | 13 | \item{h}{numeric vector of bandwidths, one for each grouping variable in 14 | \code{x}} 15 | 16 | \item{var}{variable to smooth} 17 | 18 | \item{grid}{a data frame with the grouping colums as x. In order for the 19 | factored version of \code{smooth_nd} to work, this grid must be a superset 20 | of \code{x}.} 21 | 22 | \item{type}{type of smoothing to use. Current options are \code{"mean"}, 23 | a kernel weighted mean; \code{"regression"}, a kernel weighted local 24 | regression; and \code{"robust_regression"}, robust kernel weighted local 25 | regression in the style of \code{\link{loess}}. Unique prefixes are also 26 | acceptable.} 27 | 28 | \item{factor}{if \code{TRUE} compute the n-dimensional smooth by a sequence 29 | of 1d smoothes. For \code{type = "mean"} the results are always the same 30 | grid values are uncorrelated (e.g. the grid is complete at every location); 31 | and is very approximate for \code{type = "robust"}.} 32 | } 33 | \description{ 34 | Smooth a condensed data frame. 35 | } 36 | \examples{ 37 | x <- runif(1e5) 38 | xsum <- condense(bin(x, 1 / 100)) 39 | xsmu1 <- smooth(xsum, 5 / 100) 40 | xsmu2 <- smooth(xsum, 5 / 100, factor = FALSE) 41 | 42 | # More challenging distribution 43 | x <- rchallenge(1e4) 44 | xsum <- condense(bin(x, 0.1)) 45 | xsmu <- smooth(xsum, 1) 46 | 47 | plot(xsum$x, xsum$.count, type = "l") 48 | lines(xsmu$x, xsmu$.count, col = "red") 49 | 50 | xsmu2 <- smooth(xsum, 1, type = "regress") 51 | plot(xsmu$x, xsmu$.count, type = "l", xlim = c(0, 50)) 52 | lines(xsmu2$x, xsmu2$.count, col = "red") 53 | # Note difference in tails 54 | } 55 | 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bigvis 2 | 3 | [![Travis-CI Build Status](https://travis-ci.org/hadley/bigvis.svg?branch=master)](https://travis-ci.org/hadley/bigvis) 4 | [![Coverage Status](https://img.shields.io/codecov/c/github/hadley/bigvis/master.svg)](https://codecov.io/github/hadley/bigvis?branch=master) 5 | 6 | The bigvis package provides tools for exploratory data analysis of __large datasets__ (10-100 million obs). The aim is to have most operations take less than 5 seconds on commodity hardware, even for 100,000,000 data points. 7 | 8 | Since bigvis is not currently available on CRAN, the easiest way to try it out is to: 9 | 10 | ```R 11 | # install.packages("devtools") 12 | devtools::install_github("hadley/bigvis") 13 | ``` 14 | 15 | ## Workflow 16 | 17 | The bigvis package is structured around the following workflow: 18 | 19 | * `bin()` and `condense()` to get a compact summary of the data 20 | 21 | * if the estimates are rough, you might want to `smooth()`. See `best_h()` and `rmse_cvs()` to figure out a good starting bandwidth 22 | 23 | * if you're working with counts, you might want to `standardise()` 24 | 25 | * visualise the results with `autoplot()` (you'll need to load `ggplot2` to use this) 26 | 27 | ## Weighted statistics 28 | 29 | Bigvis also provides a number of standard statistics efficiently implemented on weighted/binned data: `weighted.median`, `weighted.IQR`, `weighted.var`, `weighted.sd`, `weighted.ecdf` and `weighted.quantile`. 30 | 31 | ## Acknowledgements 32 | 33 | This package wouldn't be possible without: 34 | 35 | * the fantastic [Rcpp](http://dirk.eddelbuettel.com/code/rcpp.html) package, which makes it amazingly easy to integrate R and C++ 36 | 37 | * JJ Allaire and Carlos Scheidegger who have indefatigably answered my many C++ questions 38 | 39 | * the generous support of Revolution Analytics who supported the early development. 40 | 41 | * Yue Hu, who implemented a proof of concepts that showed that it might be possible to work with this much data in R. 42 | -------------------------------------------------------------------------------- /R/rmse.r: -------------------------------------------------------------------------------- 1 | #' Estimate smoothing RMSE using leave-one-out cross-validation. 2 | #' 3 | #' \code{rmse_cv} computes the leave-one-out RMSE for a single vector of 4 | #' bandwidths, \code{rmse_cvs} computes for a multiple vectors of bandwidths, 5 | #' stored as a data frame. 6 | #' 7 | #' @param x condensed summary table 8 | #' @param h,hs for \code{rmse_cv}, a vector of bandwidths; for \code{rmse_cv} 9 | #' a data frame of bandwidths, as generated by \code{\link{h_grid}}. 10 | #' @param var variable to smooth 11 | #' @param ... other variables passed on to \code{\link{smooth}} 12 | #' @family bandwidth estimation functions 13 | #' @export 14 | #' @examples 15 | #' \donttest{ 16 | #' set.seed(1014) 17 | #' # 1d ----------------------------- 18 | #' x <- rchallenge(1e4) 19 | #' xsum <- condense(bin(x, 1 / 10)) 20 | #' cvs <- rmse_cvs(xsum) 21 | #' 22 | #' if (require("ggplot2")) { 23 | #' autoplot(xsum) 24 | #' qplot(x, err, data = cvs, geom = "line") 25 | #' xsmu <- smooth(xsum, 1.3) 26 | #' autoplot(xsmu) 27 | #' autoplot(peel(xsmu)) 28 | #' } 29 | #' 30 | #' # 2d ----------------------------- 31 | #' y <- runif(1e4) 32 | #' xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100)) 33 | #' cvs <- rmse_cvs(xysum, h_grid(xysum, 10)) 34 | #' if (require("ggplot2")) { 35 | #' qplot(x, y, data = cvs, size = err) 36 | #' } 37 | #' } 38 | rmse_cvs <- function(x, hs = h_grid(x), ...) { 39 | rmse_1 <- function(i) { 40 | rmse_cv(x, as.numeric(hs[i, ]), ...) 41 | } 42 | err <- vapply(seq_len(nrow(hs)), rmse_1, numeric(1)) 43 | data.frame(hs, err) 44 | } 45 | 46 | #' @rdname rmse_cvs 47 | #' @export 48 | rmse_cv <- function(x, h, var = summary_vars(x)[1], ...) { 49 | # can't smooth missing values, so drop. 50 | x <- x[complete.cases(x), , drop = FALSE] 51 | gvars <- group_vars(x) 52 | 53 | pred_error <- function(i) { 54 | out <- as.matrix(x[i, gvars, drop = FALSE]) 55 | smu <- smooth(x[-i, , drop = FALSE], grid = out, h = h, var = var, ...) 56 | smu[[var]] - x[[var]][i] 57 | } 58 | err <- vapply(seq_len(nrow(x)), pred_error, numeric(1)) 59 | sqrt(mean(err ^ 2, na.rm = TRUE)) 60 | } 61 | 62 | -------------------------------------------------------------------------------- /R/challenge.r: -------------------------------------------------------------------------------- 1 | #' Density and random number generation functions for a challenging 2 | #' distribution. 3 | #' 4 | #' This is a 1/3-2/3 mixture of a t-distribution with 2 degrees of freedom 5 | #' centered at 15 and scaled by 2, and a gamma distribution with shape 2 6 | #' and rate 1/3. (The t-distribution is windsorised at 0, but this 7 | #' has negligible effect.) This distribution is challenging because it 8 | #' mixes heavy tailed and asymmetric distributions. 9 | #' 10 | #' @param x values to evaluate pdf at 11 | #' @param n number of random samples to generate 12 | #' @export 13 | #' @examples 14 | #' plot(dchallenge, xlim = c(-5, 60), n = 500) 15 | #' 16 | #' x <- rchallenge(1e4) 17 | #' hist(x, breaks = 1000) 18 | #' xsum <- condense(bin(x, 0.1)) 19 | #' plot(xsum$x, xsum$.count, type = "l") 20 | #' xsmu <- smooth(xsum, 0.3) 21 | #' plot(xsmu$x, xsmu$.count, type = "l") 22 | #' plot(xsmu$x, xsmu$.count, type = "l", xlim = c(0, 30)) 23 | dchallenge <- function(x) { 24 | # Windorised t-distribution 25 | scale <- function(x) (x - 30) / 2 26 | spike <- ifelse(x < 0, 0, dt(scale(x), df = 2)) + 27 | pt(scale(0), df = 2) * (x == 0) 28 | 29 | slope <- dgamma(x, 2, 1/3) 30 | 31 | (spike + 2 * slope) / 3 32 | } 33 | 34 | # plot(pchallenge, xlim = c(-5, 60), n = 500) 35 | pchallenge <- function(x) { 36 | # H(y) = 37 | # = int_0^y h(x) dx 38 | # = int_0^y 1/3 f(x) + 2/3 g(x) dx 39 | # = 1/3 int_0^y f(x) dx + 2/3 int_0^y g(x) dx = 40 | # = 1/3 F(y) + 2/3 G(y) 41 | 42 | # h(x) = g((x - 30) / 2) 43 | # H(y) = int_0^y g((x - 30) / 2) dx 44 | # complete transformation 45 | 46 | scale <- function(x) (x - 30) / 2 47 | spike <- ifelse(x < 0, 0, pt(scale(x), df = 2)) 48 | 49 | slope <- pgamma(x, 2, 1/3) 50 | 51 | (spike + 2 * slope) / 3 52 | } 53 | 54 | qchallenge <- function(x) { 55 | # approximate pchallenge with 1000 points, and linearise 56 | # use to implement fast option to rchallenge that does inverse pdf 57 | # transformation + runif() 58 | } 59 | 60 | 61 | #' @rdname dchallenge 62 | #' @export 63 | rchallenge <- function(n) { 64 | nt <- rbinom(1, n, 1 / 3) 65 | ngamma <- n - nt 66 | 67 | spike <- 2 * rt(nt, df = 2) + 15 68 | spike[spike < 0] <- 0 69 | 70 | slope <- rgamma(ngamma, 2, 1/3) 71 | 72 | c(spike, slope) 73 | } 74 | 75 | -------------------------------------------------------------------------------- /man/best_h.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/h.r 3 | \name{best_h} 4 | \alias{best_h} 5 | \title{Find "best" smoothing parameter using leave-one-out cross validation.} 6 | \usage{ 7 | best_h(x, h_init = NULL, ..., tol = 0.01, control = list()) 8 | } 9 | \arguments{ 10 | \item{x}{condensed summary to smooth} 11 | 12 | \item{h_init}{initial values of bandwidths to start search out. If not 13 | specified defaults to 5 times the binwidth of each variable.} 14 | 15 | \item{...}{other arguments (like \code{var}) passed on to 16 | \code{\link{rmse_cv}}} 17 | 18 | \item{tol}{numerical tolerance, defaults to 1\%.} 19 | 20 | \item{control}{additional control parameters passed on to \code{\link{optim}} 21 | The most useful argument is probably trace, which makes it possible to 22 | follow the progress of the optimisation.} 23 | } 24 | \value{ 25 | a single numeric value representing the bandwidth that minimises 26 | the leave-one-out estimate of rmse. Vector has attributes 27 | \code{evaluations} giving the number of times the objective function 28 | was evaluated. If the optimisation does not converge, or smoothing is not 29 | needed (i.e. the estimate is on the lower bounds), a warning is thrown. 30 | } 31 | \description{ 32 | Minimises the leave-one-out estimate of root mean-squared error to find 33 | find the "optimal" bandwidth for smoothing. 34 | } 35 | \details{ 36 | L-BFGS-B optimisation is used to constrain the bandwidths to be greater 37 | than the binwidths: if the bandwidth is smaller than the binwidth it's 38 | impossible to compute the rmse because no smoothing occurs. The tolerance 39 | is set relatively high for numerical optimisation since the precise choice 40 | of bandwidth makes little difference visually, and we're unlikely to have 41 | sufficient data to make a statistically significant choice anyway. 42 | } 43 | \examples{ 44 | \donttest{ 45 | x <- rchallenge(1e4) 46 | xsum <- condense(bin(x, 1 / 10)) 47 | h <- best_h(xsum, control = list(trace = 3, REPORT = 1)) 48 | 49 | if (require("ggplot2")) { 50 | autoplot(xsum) 51 | autoplot(smooth(xsum, h)) 52 | } 53 | } 54 | } 55 | \seealso{ 56 | Other bandwidth estimation functions: \code{\link{h_grid}}; 57 | \code{\link{rmse_cv}}, \code{\link{rmse_cvs}} 58 | } 59 | 60 | -------------------------------------------------------------------------------- /inst/tests/test-binned-vectors.r: -------------------------------------------------------------------------------- 1 | context("Binned vectors") 2 | 3 | if (require("plyr")) { 4 | test_that("bins agree with plyr::id", { 5 | grid <- expand.grid(x = c(NA, seq(0, 0.5, by = 0.1)), y = c(NA, seq(0, 0.7, by = 0.1))) 6 | x <- grid$x 7 | y <- grid$y 8 | 9 | gx <- bin(x, 0.1) 10 | gy <- bin(y, 0.1) 11 | 12 | bv <- bins(gx, gy) 13 | bigvis <- sapply(seq_along(x) - 1, bv$bin_i) 14 | 15 | bin_x <- sapply(seq_along(x) - 1, gx$bin_i) 16 | bin_y <- sapply(seq_along(x) - 1, gy$bin_i) 17 | plyr <- as.vector(id(list(bin_x, bin_y))) 18 | 19 | expect_equal(bigvis + 1, plyr) 20 | }) 21 | } 22 | 23 | test_that("square nbins correct", { 24 | g <- bin(1:10, 1) 25 | expect_equal(bins(g)$nbins(), 11) 26 | expect_equal(bins(g, g)$nbins(), 11 ^ 2) 27 | expect_equal(bins(g, g, g)$nbins(), 11 ^ 3) 28 | }) 29 | 30 | test_that("rectangular nbins correct", { 31 | g11 <- bin(1:10, 1) 32 | g2 <- bin(rep(1, 10), 1) 33 | 34 | expect_equal(bins(g2, g11)$nbins(), 22) 35 | expect_equal(bins(g11, g2)$nbins(), 22) 36 | }) 37 | 38 | test_that("diagonal nbins correct", { 39 | x <- runif(1e3) 40 | y <- x + runif(1e3, -0.2, 0.2) 41 | z <- rnorm(1e3, x) 42 | 43 | gx <- bin(x, 0.1) 44 | gy <- bin(y, 0.1) 45 | 46 | expect_equal(gx$nbins(), 11) 47 | expect_equal(gy$nbins(), 15) 48 | 49 | bvs <- bins(gx, gy) 50 | expect_equal(bvs$nbins(), 165) 51 | 52 | bins <- vapply(seq_along(x) - 1, bvs$bin_i, integer(1)) 53 | expect_true(all(bins <= 165)) 54 | }) 55 | 56 | test_that("bin and unbin are symmetric", { 57 | g <- bin(-10:10, 1) 58 | bvs <- bins(g, g) 59 | 60 | grid <- expand.grid(x = -10:10, y = -10:10) 61 | bins <- unlist(Map(function(x, y) bvs$bin(c(x, y)), grid$x, grid$y)) 62 | unbin <- t(vapply(bins, bvs$unbin, numeric(2))) 63 | colnames(unbin) <- c("x", "y") 64 | 65 | expect_equal(unbin, as.matrix(grid)) 66 | }) 67 | 68 | test_that("bin and unbin are symmetric with diff binning", { 69 | x <- c(-1, 5) 70 | y <- c(0.1, 1) 71 | 72 | bx <- bin(x, 1) 73 | by <- bin(y, 0.1) 74 | bvs <- bins(bx, by) 75 | 76 | grid <- expand.grid( 77 | x = breaks(bx)[-1] + 1 / 2, 78 | y = breaks(by)[-1] + 0.1 / 2) 79 | 80 | bins <- unlist(Map(function(x, y) bvs$bin(c(x, y)), grid$x, grid$y)) 81 | unbin <- t(vapply(bins, bvs$unbin, numeric(2))) 82 | colnames(unbin) <- c("x", "y") 83 | 84 | expect_equal(unbin, as.matrix(grid)) 85 | }) 86 | -------------------------------------------------------------------------------- /inst/tests/test-condense.r: -------------------------------------------------------------------------------- 1 | context("Condense") 2 | 3 | test_that("condense counts small vectors accurately", { 4 | x <- c(NA, 0:10) 5 | s1 <- condense(bin(x, 1, -0.5), summary = "count") 6 | # Pathological origin: need to add extra bin on end, because they're 7 | # right open, left closed 8 | s2 <- condense(bin(x, 1, 0), summary = "count") 9 | 10 | expect_equivalent(s1$x, c(NA, 0:10)) 11 | expect_equivalent(s2$x, c(NA, 0:10 + 0.5)) 12 | 13 | expect_equal(s1$.count, rep(1, length(x))) 14 | expect_equal(s2$.count, rep(1, length(x))) 15 | }) 16 | 17 | test_that("weights modify counts", { 18 | x <- c(NA, 0:10) 19 | w <- rep(2, length(x)) 20 | s <- condense(bin(x, 1), w = w, summary = "count") 21 | 22 | expect_equivalent(s$x, c(NA, 0:10)) 23 | expect_equal(s$.count, rep(2, length(x))) 24 | }) 25 | 26 | test_that("z affects sums, but not counts", { 27 | x <- c(NA, 0:10) 28 | z <- 0:11 29 | s <- condense(bin(x, 1), z = z, summary = "sum") 30 | 31 | expect_equal(s$.count, rep(1, length(x))) 32 | expect_equal(s$.sum, z) 33 | }) 34 | 35 | test_that("drop = FALSE and drop = TRUE results agree", { 36 | x <- runif(1e3) 37 | y <- x + runif(1e3, -0.2, 0.2) 38 | z <- rnorm(1e3, x) 39 | 40 | gx <- bin(x, 0.1) 41 | gy <- bin(y, 0.1) 42 | 43 | count1 <- condense(gx, gy, summary = "count", drop = TRUE) 44 | expect_equal(sum(count1$.count == 0), 0) 45 | 46 | count2 <- condense(gx, gy, summary = "count", drop = FALSE) 47 | expect_equivalent(count1, count2[count2$.count != 0, ]) 48 | }) 49 | 50 | # 2d tests --------------------------------------------------------------------- 51 | 52 | test_that("grid counted accurately", { 53 | # expand.grid orders in opposite way to bigvis 54 | grid <- expand.grid(y = c(NA, 1:2), x = c(NA, 1:2)) 55 | s <- condense(bin(grid$x, 1), bin(grid$y, 1)) 56 | 57 | expect_equal(s$.count, rep(1, nrow(grid))) 58 | expect_equivalent(s$grid.x, grid$x) 59 | expect_equivalent(s$grid.y, grid$y) 60 | }) 61 | 62 | test_that("diagonal counted correctly", { 63 | df <- data.frame(x = c(NA, 1:2), y = c(NA, 1:2)) 64 | s <- condense(bin(df$x, 1), bin(df$y, 1)) 65 | 66 | expect_equal(nrow(s), nrow(df)) 67 | expect_equal(s$df.x, s$df.y) 68 | }) 69 | 70 | test_that("random data doesn't crash", { 71 | x <- runif(1e3, 8, 4963) 72 | y <- runif(1e3, 1e-2, 1e3) 73 | 74 | gx <- bin(x, 10) 75 | gy <- bin(y, 10) 76 | 77 | condense(gx, gy) 78 | }) 79 | -------------------------------------------------------------------------------- /bench/group-tempvar.cpp: -------------------------------------------------------------------------------- 1 | // In a function like 2 | // 3 | // unsigned int bin(unsigned int i) const { 4 | // if (ISNAN(x_[i])) return 0; 5 | // if (x_[i] < origin_) return 0; 6 | // 7 | // return (x_[i] - origin_) / width_ + 1; 8 | // } 9 | // 10 | // should I create my own temporary double val = x_[i] ? 11 | // 12 | // It looks like it saves ~0.2 ns per invocation, so probably not worth it for 13 | // performance reasons. 14 | 15 | #include 16 | using namespace Rcpp; 17 | 18 | class Group1 { 19 | const Fast x_; 20 | double width_; 21 | double origin_; 22 | public: 23 | Group1 (const NumericVector& x, double width, double origin = 0) 24 | : x_(x), width_(width), origin_(origin) { 25 | } 26 | 27 | unsigned int bin(unsigned int i) const { 28 | if (ISNAN(x_[i])) return 0; 29 | if (x_[i] < origin_) return 0; 30 | 31 | return (x_[i] - origin_) / width_ + 1; 32 | } 33 | 34 | int size() const { 35 | return x_.size(); 36 | } 37 | }; 38 | 39 | class Group2 { 40 | const Fast x_; 41 | double width_; 42 | double origin_; 43 | public: 44 | Group2 (const NumericVector& x, double width, double origin = 0) 45 | : x_(x), width_(width), origin_(origin) { 46 | } 47 | 48 | unsigned int bin(unsigned int i) const { 49 | double val = x_[i]; 50 | if (ISNAN(val)) return 0; 51 | if (val < origin_) return 0; 52 | 53 | return (val - origin_) / width_ + 1; 54 | } 55 | 56 | int size() const { 57 | return x_.size(); 58 | } 59 | }; 60 | 61 | template 62 | IntegerVector group_out(const Group& group) { 63 | int n = group.size(); 64 | IntegerVector out(n); 65 | for(int i = 0; i < n; ++i) { 66 | out[i] = group.bin(i); 67 | } 68 | 69 | return out; 70 | } 71 | 72 | // [[Rcpp::export]] 73 | IntegerVector group1(const NumericVector& x, double width, double origin = 0) { 74 | return group_out(Group1(x, width, origin)); 75 | } 76 | 77 | // [[Rcpp::export]] 78 | IntegerVector group2(const NumericVector& x, double width, double origin = 0) { 79 | return group_out(Group2(x, width, origin)); 80 | } 81 | 82 | 83 | /*** R 84 | x <- runif(1e6) 85 | library(microbenchmark) 86 | stopifnot(all.equal(group1(x, 1/1000), group2(x, 1/1000))) 87 | 88 | (m <- microbenchmark( 89 | group1(x, 1/1000), 90 | group2(x, 1/1000) 91 | )) 92 | diff(summary(m)$median) / length(x) * 1e9 / 1e3 93 | */ -------------------------------------------------------------------------------- /R/rebin.r: -------------------------------------------------------------------------------- 1 | #' Transform condensed objects, collapsing unique bins. 2 | #' 3 | #' @details 4 | #' You don't need to use \code{rebin} if you use transform: it will 5 | #' automatically rebin for you. You will need to use it if you manually 6 | #' transform any grouping variables. 7 | #' 8 | #' @param data,`_data` a condensed summary 9 | #' @param ... named arguments evaluated in the context of the data 10 | #' @usage \\method{transform}{condensed}(`_data`, ...) 11 | #' @keywords internal 12 | #' @examples 13 | #' x <- runif(1e4, -1, 1) 14 | #' xsum <- condense(bin(x, 1 / 50)) 15 | #' 16 | #' # Transforming by hand: must use rebin 17 | #' xsum$x <- abs(xsum$x) 18 | #' rebin(xsum) 19 | #' if (require("ggplot2")) { 20 | #' autoplot(xsum) + geom_point() 21 | #' autoplot(rebin(xsum)) + geom_point() 22 | #' } 23 | #' 24 | #' #' Transforming with transform 25 | #' y <- x ^ 2 + runif(length(x), -0.1, 0.1) 26 | #' xysum <- condense(bin(x, 1 / 50), z = y) 27 | #' xysum <- transform(xysum, x = abs(x)) 28 | #' if (require("ggplot2")) { 29 | #' autoplot(xysum) 30 | #' } 31 | #' @export 32 | #' @method transform condensed 33 | transform.condensed <- function(`_data`, ...) { 34 | df <- transform.data.frame(`_data`, ...) 35 | rebin(as.condensed(df)) 36 | } 37 | 38 | #' @export 39 | #' @rdname transform.condensed 40 | rebin <- function(data) { 41 | stopifnot(is.condensed(data)) 42 | 43 | old_g <- data[group_vars(data)] 44 | old_g[] <- lapply(old_g, zapsmall, digits = 3) 45 | ids <- id(old_g, drop = TRUE) 46 | if (!anyDuplicated(ids)) return(data) 47 | 48 | old_s <- data[summary_vars(data)] 49 | new_s <- lapply(names(old_s), function(var) rebin_var(old_s, ids, var)) 50 | names(new_s) <- names(old_s) 51 | 52 | uids <- !duplicated(ids) 53 | new_g <- old_g[uids, , drop = FALSE] 54 | ord <- order(ids[uids], na.last = FALSE) 55 | 56 | as.condensed(data.frame(new_g[ord, , drop = FALSE], new_s)) 57 | } 58 | 59 | rebin_var <- function(df, ids, var) { 60 | stopifnot(is.data.frame(df)) 61 | stopifnot(is.integer(ids), length(ids) == nrow(df)) 62 | stopifnot(is.character(var), length(var) == 1, var %in% names(rebinners)) 63 | 64 | rows <- split(seq_len(nrow(df)), ids) 65 | f <- rebinners[[var]] 66 | 67 | vapply(rows, function(i) f(df[i, , drop = FALSE]), numeric(1), 68 | USE.NAMES = FALSE) 69 | } 70 | 71 | rebinners <- list( 72 | .median = function(df) mean(df$.median, na.rm = TRUE), 73 | .sum = function(df) sum(df$.sum, na.rm = TRUE), 74 | .count = function(df) sum(df$.count, na.rm = TRUE), 75 | .mean = function(df) { 76 | if (is.null(df$.count)) { 77 | mean(df$.mean, na.rm = TRUE) 78 | } else { 79 | weighted.mean(df$.mean, df$.count) 80 | } 81 | } 82 | ) 83 | -------------------------------------------------------------------------------- /bench/count.cpp: -------------------------------------------------------------------------------- 1 | // Experiment with making the binner more generic, so that the binner 2 | // class also stores the variable being binned over - this is important 3 | // for separating the grouping from the numeric operation. 4 | 5 | #include 6 | #include 7 | #include 8 | using namespace Rcpp; 9 | 10 | template 11 | std::vector count_x(const NumericVector& x, Binner binner) { 12 | std::vector out; 13 | 14 | int n = x.size(); 15 | 16 | for(int i = 0; i < n; ++i) { 17 | int bin = binner(x[i]); 18 | if (bin < 0) continue; 19 | 20 | // Make sure there's enough space 21 | if (bin >= out.size()) { 22 | out.resize(bin + 1); 23 | } 24 | ++out[bin]; 25 | } 26 | 27 | return out; 28 | } 29 | 30 | template 31 | std::vector count(Binner binner) { 32 | std::vector out; 33 | 34 | int n = binner.size(); 35 | for(int i = 0; i < n; ++i) { 36 | int bin = binner.bin(i); 37 | if (bin < 0) continue; 38 | 39 | // Make sure there's enough space 40 | if (bin >= out.size()) { 41 | out.resize(bin + 1); 42 | } 43 | ++out[bin]; 44 | } 45 | 46 | return out; 47 | } 48 | 49 | 50 | class BinFixed { 51 | double width_; 52 | double origin_; 53 | public: 54 | BinFixed (double width, double origin = 0) { 55 | width_ = width; 56 | origin_ = origin; 57 | } 58 | 59 | int inline operator() (double val) const { 60 | if (ISNAN(val)) return 0; 61 | 62 | return (val - origin_) / width_ + 1; 63 | } 64 | }; 65 | 66 | class BinFixed2 { 67 | const NumericVector& x_; 68 | double width_; 69 | double origin_; 70 | public: 71 | BinFixed2 (const NumericVector& x, double width, double origin = 0) 72 | : x_(x), width_(width), origin_(origin) { 73 | } 74 | 75 | int bin(int i) const { 76 | if (ISNAN(x_[i])) return 0; 77 | return (x_[i] - origin_) / width_ + 1; 78 | } 79 | 80 | int size() const { 81 | return x_.size(); 82 | } 83 | }; 84 | 85 | 86 | // [[Rcpp::export]] 87 | std::vector count_x2(NumericVector x, double width, double origin = 0) { 88 | return count_x(x, BinFixed(width, origin)); 89 | } 90 | 91 | // [[Rcpp::export]] 92 | std::vector count2(NumericVector x, double width, double origin = 0) { 93 | return count(BinFixed2(x, width, origin)); 94 | } 95 | 96 | 97 | /*** R 98 | options(digits = 3) 99 | library(microbenchmark) 100 | x <- runif(1e5) 101 | 102 | # Breaks 103 | microbenchmark( 104 | count_x2(x, 1/100), 105 | count2(x, 1/100) 106 | ) 107 | 108 | */ -------------------------------------------------------------------------------- /bench/median.cpp: -------------------------------------------------------------------------------- 1 | // Instead of counting, compute a more complicated statistic: a median 2 | 3 | #include 4 | #include 5 | #include 6 | using namespace Rcpp; 7 | 8 | class BinFixed { 9 | const Fast x_; 10 | double width_; 11 | double origin_; 12 | public: 13 | BinFixed (const NumericVector& x, double width, double origin = 0) 14 | : x_(x), width_(width), origin_(origin) { 15 | } 16 | 17 | int bin(int i) const { 18 | if (ISNAN(x_[i])) return 0; 19 | return (x_[i] - origin_) / width_ + 1; 20 | } 21 | 22 | int size() const { 23 | return x_.size(); 24 | } 25 | }; 26 | class StatMedian { 27 | std::vector ys; 28 | 29 | public: 30 | void push(double x) { 31 | ys.push_back(x); 32 | } 33 | 34 | // Adapted from http://stackoverflow.com/questions/1719070/ 35 | double compute() { 36 | if (ys.empty()) return NAN; 37 | 38 | int size = ys.size(); 39 | std::vector::iterator upper = ys.begin() + (int) (size / 2); 40 | std::nth_element(ys.begin(), upper, ys.end()); 41 | 42 | if (size % 2 == 1) { 43 | return *upper; 44 | } else { 45 | std::vector::iterator lower = upper - 1; 46 | std::nth_element(ys.begin(), lower, upper); 47 | return (*upper + *lower) / 2.0; 48 | } 49 | 50 | } 51 | }; 52 | 53 | template 54 | NumericVector group_median(NumericVector& y, Binner binner) { 55 | std::vector stat; 56 | 57 | int n = binner.size(); 58 | for(int i = 0; i < n; ++i) { 59 | int bin = binner.bin(i); 60 | if (bin < 0) continue; 61 | 62 | if (bin >= stat.size()) { 63 | stat.resize(bin + 1); 64 | } 65 | 66 | stat[bin].push(y[i]); 67 | } 68 | 69 | int m = stat.size(); 70 | NumericVector res(m); 71 | for (int i = 0; i < m; ++i) { 72 | res[i] = stat[i].compute(); 73 | } 74 | return res; 75 | } 76 | 77 | 78 | // [[Rcpp::export]] 79 | NumericVector group_median_(NumericVector x, NumericVector y, 80 | double width, double origin = 0) { 81 | return group_median(y, BinFixed(x, width, origin)); 82 | } 83 | 84 | 85 | /*** R 86 | options(digits = 3) 87 | library(microbenchmark) 88 | x <- runif(1e5) 89 | y <- runif(1e5) 90 | 91 | group_median_tapply <- function(x, y, width, origin = 0) { 92 | bins <- trunc((x - origin) / width) 93 | c(NaN, unname(tapply(y, bins, median))) 94 | } 95 | med1 <- group_median_tapply(x, y, width = 1/1000) 96 | med2 <- group_median_(x, y, width = 1/1000) 97 | stopifnot(all.equal(med1, med2)) 98 | 99 | # Breaks 100 | microbenchmark( 101 | # group_median_tapply(x, y, width = 1/1000), 102 | group_median_(x, y, width = 1/1000) 103 | ) 104 | 105 | */ 106 | -------------------------------------------------------------------------------- /R/ranged.r: -------------------------------------------------------------------------------- 1 | #' A S3 class for caching the range of a vector 2 | #' 3 | #' This class is designed for dealing with large vectors, where the cost of 4 | #' recomputing the range multiple times is prohibitive. It provides methods 5 | #' for \code{\link{print}} and \code{\link{str}} that display only the range, 6 | #' not the contents. 7 | #' 8 | #' @section Performance: 9 | #' For best performance, you may want to run copy and paste the contents of 10 | #' this function into your function, to avoid making any copies of \code{x}. 11 | #' This is probably only necessary if you're dealing with extremely large 12 | #' vectors, > 100 million obs. 13 | #' 14 | #' @param x a numeric vector 15 | #' @param range the range of the vector (excluding missing values), if known. 16 | #' If unknown, it will be computed with \code{\link{frange}}, a fast C++ 17 | #' implementation of \code{\link{range}}. 18 | #' @export 19 | #' @examples 20 | #' x <- runif(1e6) 21 | #' y <- ranged(x) 22 | #' range(y) 23 | #' y 24 | #' str(y) 25 | #' 26 | #' # Modifications to the class currently destroy the cache 27 | #' y[1] <- 10 28 | #' max(y) 29 | #' class(y) 30 | #' z <- y + 10 31 | #' max(z) 32 | #' class(z) 33 | ranged <- function(x, range = frange(x, finite = TRUE)) { 34 | stopifnot(is.numeric(x)) 35 | 36 | # Reset range attribute so that lazy evaluation of range 37 | # always recomputes from scratch 38 | attr(x, "range") <- NULL 39 | 40 | attr(x, "range") <- range 41 | class(x) <- "ranged" 42 | x 43 | } 44 | 45 | #' Test if an object is of class ranged. 46 | #' 47 | #' @export 48 | #' @param x object to test 49 | #' @keywords internal 50 | is.ranged <- function(x) inherits(x, "ranged") 51 | 52 | #' @export 53 | min.ranged <- function(x, ...) attr(x, "range")[1] 54 | #' @export 55 | max.ranged <- function(x, ...) attr(x, "range")[2] 56 | #' @export 57 | range.ranged <- function(x, ...) attr(x, "range") 58 | 59 | #' @export 60 | print.ranged <- function(x, ...) { 61 | rng <- attr(x, "range") 62 | # attr(x, "range") <- NULL 63 | # attr(x, "class") <- NULL 64 | # print.default(x) 65 | cat("Ranged 1:", length(x), " [", format(rng[1]), ", ", format(rng[2]), "]\n", 66 | sep = "") 67 | } 68 | 69 | #' @export 70 | str.ranged <- function(object, ...) { 71 | rng <- attr(object, "range") 72 | cat(" Ranged [1:", length(object), "] ", format(rng[1]), "--", format(rng[2]), 73 | "\n", sep = "") 74 | } 75 | 76 | #' @export 77 | Ops.ranged <- function(e1, e2) { 78 | attr(e1, "range") <- NULL 79 | class(e1) <- NULL 80 | 81 | NextMethod(e1, e2) 82 | } 83 | 84 | #' @export 85 | "[<-.ranged" <- function(x, ..., value) { 86 | attr(x, "range") <- NULL 87 | attr(x, "class") <- NULL 88 | NextMethod(x, ..., value = value) 89 | } 90 | 91 | #' @export 92 | as.data.frame.ranged <- function(x, ...) { 93 | n <- length(x) 94 | x <- list(x) 95 | class(x) <- "data.frame" 96 | attr(x, "row.names") <- c(NA_integer_, -n) 97 | 98 | x 99 | } 100 | -------------------------------------------------------------------------------- /R/peel.r: -------------------------------------------------------------------------------- 1 | #' Peel off low density regions of the data. 2 | #' 3 | #' Keeps specified proportion of data by removing the lowest density regions, 4 | #' either anywhere on the plot, or for 2d, just around the edges. 5 | #' 6 | #' This is useful for visualisation, as an easy way of focussing on the regions 7 | #' where the majority of the data lies. 8 | #' 9 | #' @param x condensed summary 10 | #' @param keep (approximate) proportion of data to keep. If \code{1}, will 11 | #' remove all cells with counts. All missing values will be preserved. 12 | #' @param central if \code{TRUE} peels off regions of lowest density only from 13 | #' the outside of the data. In 2d this works by progressively peeling off 14 | #' convex hull of the data: the current algorithm is quite slow. 15 | #' If \code{FALSE}, just removes the lowest density regions wherever they are 16 | #' found. Regions with 0 density are removed regardless of location. 17 | #' Defaults to TRUE if there are two or fewer grouping variables is less. 18 | #' @export 19 | #' @examples 20 | #' x <- rt(1e5, df = 2) 21 | #' y <- rt(1e5, df = 2) 22 | #' xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 10)) 23 | #' plot(xysum$x, xysum$y) 24 | #' 25 | #' plot(peel(xysum, 0.95, central = TRUE)[1:2]) 26 | #' plot(peel(xysum, 0.90, central = TRUE)[1:2]) 27 | #' plot(peel(xysum, 0.50, central = TRUE)[1:2]) 28 | peel <- function(x, keep = 0.99, central = NULL) { 29 | stopifnot(is.condensed(x)) 30 | stopifnot(is.numeric(keep), length(keep) == 1, keep > 0, keep <= 1) 31 | central <- central %||% (gcol(x) <= 2) 32 | stopifnot(is.logical(central), length(central) == 1) 33 | 34 | if (is.null(x$.count)) { 35 | stop("Can only peel objects with .count variable", call. = FALSE) 36 | } 37 | 38 | x <- x[x$.count > 0, , drop = FALSE] 39 | if (keep == 1) return(x) 40 | 41 | complete <- complete.cases(x[group_vars(x)]) 42 | x_complete <- x[complete, , drop = FALSE] 43 | 44 | if (central) { 45 | peeled <- peel_outer(x_complete, keep) 46 | } else { 47 | peeled <- peel_anywhere(x_complete, keep) 48 | } 49 | 50 | rbind(peeled, x[!complete, , drop = FALSE]) 51 | } 52 | 53 | peel_anywhere <- function(x, keep) { 54 | ord <- order(x$.count, decreasing = TRUE) 55 | prop <- cumsum(x$.count[ord]) / sum(x$.count) 56 | 57 | ind <- which(prop >= keep)[1] 58 | x[ord[seq_len(ind)], , drop = FALSE] 59 | } 60 | 61 | peel_outer <- function(x, keep) { 62 | d <- gcol(x) 63 | if (d > 2) { 64 | stop("Outer peeling only works with 1d or 2d data", call. = FALSE) 65 | } 66 | 67 | n <- sum(x$.count) 68 | x <- x[order(x$.count, decreasing = TRUE), ] 69 | prop <- cumsum(x$.count) / n 70 | 71 | # Peel off smallest values on chull 72 | candidate <- which(prop >= keep) 73 | on_hull <- intersect(candidate, chull(x[1:d])) 74 | left <- sum(x$.count[-on_hull]) / n 75 | 76 | while(left >= keep && length(on_hull) > 0) { 77 | x <- x[-on_hull, , drop = FALSE] 78 | prop <- prop[-on_hull] 79 | candidate <- which(prop >= keep) 80 | on_hull <- intersect(candidate, chull(x[1:d])) 81 | left <- sum(x$.count[-on_hull]) / n 82 | } 83 | 84 | x 85 | } 86 | -------------------------------------------------------------------------------- /bench/mean.cpp: -------------------------------------------------------------------------------- 1 | // Instead of counting, compute a more complicated statistic: a weighted mean 2 | 3 | #include 4 | #include 5 | #include 6 | using namespace Rcpp; 7 | 8 | class BinFixed { 9 | const Fast x_; 10 | double width_; 11 | double origin_; 12 | public: 13 | BinFixed (const NumericVector& x, double width, double origin = 0) 14 | : x_(x), width_(width), origin_(origin) { 15 | } 16 | 17 | int bin(int i) const { 18 | if (ISNAN(x_[i])) return 0; 19 | return (x_[i] - origin_) / width_ + 1; 20 | } 21 | 22 | int size() const { 23 | return x_.size(); 24 | } 25 | }; 26 | 27 | template 28 | NumericVector group_mean(NumericVector& y, NumericVector& weight, Binner binner) { 29 | std::vector count; 30 | std::vector sum; 31 | 32 | int n = binner.size(); 33 | for(int i = 0; i < n; ++i) { 34 | int bin = binner.bin(i); 35 | if (bin < 0) continue; 36 | 37 | // Make sure there's enough space 38 | if (bin >= sum.size()) { 39 | sum.resize(bin + 1); 40 | count.resize(bin + 1); 41 | } 42 | 43 | count[bin] += weight[i]; 44 | sum[bin] += y[i]; 45 | } 46 | 47 | int m = count.size(); 48 | NumericVector res(m); 49 | for (int i = 0; i < m; ++i) { 50 | res[i] = sum[i] / count[i]; 51 | } 52 | return res; 53 | } 54 | 55 | 56 | class StatMean { 57 | double count; 58 | double sum; 59 | 60 | public: 61 | StatMean () : count(0), sum(0) { 62 | } 63 | void push(double x, double weight) { 64 | count += weight; 65 | sum += x; 66 | } 67 | 68 | double compute() { 69 | return sum / count; 70 | } 71 | }; 72 | 73 | template 74 | NumericVector group_mean2(NumericVector& y, NumericVector& weight, Binner binner) { 75 | std::vector stat; 76 | 77 | int n = binner.size(); 78 | for(int i = 0; i < n; ++i) { 79 | int bin = binner.bin(i); 80 | if (bin < 0) continue; 81 | 82 | if (bin >= stat.size()) { 83 | stat.resize(bin + 1); 84 | } 85 | 86 | stat[bin].push(y[i], weight[i]); 87 | } 88 | 89 | int m = stat.size(); 90 | NumericVector res(m); 91 | for (int i = 0; i < m; ++i) { 92 | res[i] = stat[i].compute(); 93 | } 94 | return res; 95 | } 96 | 97 | 98 | // [[Rcpp::export]] 99 | NumericVector group_mean_(NumericVector x, NumericVector y, NumericVector weight, 100 | double width, double origin = 0) { 101 | return group_mean(y, weight, BinFixed(x, width, origin)); 102 | } 103 | // [[Rcpp::export]] 104 | NumericVector group_mean2_(NumericVector x, NumericVector y, NumericVector weight, 105 | double width, double origin = 0) { 106 | return group_mean2(y, weight, BinFixed(x, width, origin)); 107 | } 108 | 109 | 110 | /*** R 111 | options(digits = 3) 112 | library(microbenchmark) 113 | x <- runif(1e6) 114 | y <- runif(1e6) 115 | weight <- rep(1, 1e6) 116 | 117 | # Breaks 118 | microbenchmark( 119 | group_mean_(x, y, weight, width = 1/100), 120 | group_mean2_(x, y, weight, width = 1/100) 121 | ) 122 | 123 | */ -------------------------------------------------------------------------------- /R/smooth.r: -------------------------------------------------------------------------------- 1 | #' Smooth a condensed data frame. 2 | #' 3 | #' @param x a condensed summary 4 | #' @param h numeric vector of bandwidths, one for each grouping variable in 5 | #' \code{x} 6 | #' @param var variable to smooth 7 | #' @param grid a data frame with the grouping colums as x. In order for the 8 | #' factored version of \code{smooth_nd} to work, this grid must be a superset 9 | #' of \code{x}. 10 | #' @param type type of smoothing to use. Current options are \code{"mean"}, 11 | #' a kernel weighted mean; \code{"regression"}, a kernel weighted local 12 | #' regression; and \code{"robust_regression"}, robust kernel weighted local 13 | #' regression in the style of \code{\link{loess}}. Unique prefixes are also 14 | #' acceptable. 15 | #' @param factor if \code{TRUE} compute the n-dimensional smooth by a sequence 16 | #' of 1d smoothes. For \code{type = "mean"} the results are always the same 17 | # as \code{FALSE}; for \code{type = "regress"} they will be equal if the 18 | #' grid values are uncorrelated (e.g. the grid is complete at every location); 19 | #' and is very approximate for \code{type = "robust"}. 20 | #' @export 21 | #' @examples 22 | #' x <- runif(1e5) 23 | #' xsum <- condense(bin(x, 1 / 100)) 24 | #' xsmu1 <- smooth(xsum, 5 / 100) 25 | #' xsmu2 <- smooth(xsum, 5 / 100, factor = FALSE) 26 | #' 27 | #' # More challenging distribution 28 | #' x <- rchallenge(1e4) 29 | #' xsum <- condense(bin(x, 0.1)) 30 | #' xsmu <- smooth(xsum, 1) 31 | #' 32 | #' plot(xsum$x, xsum$.count, type = "l") 33 | #' lines(xsmu$x, xsmu$.count, col = "red") 34 | #' 35 | #' xsmu2 <- smooth(xsum, 1, type = "regress") 36 | #' plot(xsmu$x, xsmu$.count, type = "l", xlim = c(0, 50)) 37 | #' lines(xsmu2$x, xsmu2$.count, col = "red") 38 | #' # Note difference in tails 39 | smooth <- function(x, h, var = summary_vars(x)[1], grid = NULL, type = "mean", 40 | factor = TRUE) { 41 | stopifnot(is.condensed(x)) 42 | stopifnot(is.numeric(h), all(h > 0)) 43 | type <- match.arg(type, c("mean", "regression", "robust_regression")) 44 | 45 | if (type != "mean" && !factor) { 46 | stop("Only factored approximations available for types other than mean", 47 | call. = FALSE) 48 | } 49 | 50 | grid_in <- as.matrix(x[group_vars(x)]) 51 | grid_out <- grid %||% grid_in 52 | stopifnot(is.matrix(grid_out), is.numeric(grid_out), 53 | ncol(grid_out) == ncol(grid_in), nrow(grid_out) > 0) 54 | 55 | z <- x[[var]] 56 | w <- if (var != ".count" && x %contains% ".count") x$.count else numeric() 57 | 58 | if (factor) { 59 | for(i in 1:ncol(grid_in)) { 60 | # smooth_nd_1 is a C++ function, so var is 0 indexed 61 | z <- smooth_nd_1(grid_in, z, w, grid_out, var = i - 1, h = h[i], 62 | type = type) 63 | } 64 | } else { 65 | z <- smooth_nd(grid_in, z, w, grid_out, h) 66 | } 67 | 68 | out <- data.frame(grid_out) 69 | out[[var]] <- z 70 | structure(out, class = c("condensed", class(out))) 71 | } 72 | 73 | complete_grid <- function(df) { 74 | stopifnot(is.data.frame(df)) 75 | 76 | breaks <- function(width, origin, nbins) { 77 | origin + width * seq.int(nbins) 78 | } 79 | 80 | cols <- lapply(df, function(x) do.call(breaks, attributes(x))) 81 | expand.grid(cols, KEEP.OUT.ATTRS = FALSE) 82 | } 83 | -------------------------------------------------------------------------------- /src/summary.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include "stats.h" 3 | using namespace Rcpp; 4 | 5 | class SummaryMoments { 6 | int i_; 7 | double weight; 8 | double mean; 9 | double m2; 10 | 11 | public: 12 | SummaryMoments (int i) : i_(i), weight(0), mean(0), m2(0) { 13 | if (i > 2) stop("Invalid moment"); 14 | } 15 | 16 | // Algorithm adapted from 17 | // http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Weighted_incremental_algorithm 18 | void push(double y, double w) { 19 | if (NumericVector::is_na(y)) return; 20 | 21 | // counts and weights 22 | weight += w; 23 | 24 | // mean 25 | if (i_ < 1) return; 26 | double delta = y - mean; 27 | mean += delta * w / weight; 28 | 29 | // variance 30 | if (i_ < 2) return; 31 | m2 += delta * delta * w * (1 - w / weight); 32 | 33 | return; 34 | } 35 | 36 | const int size() const { 37 | return i_ + 1; 38 | } 39 | 40 | double compute(int i) const { 41 | switch (i) { 42 | case 0: return weight; 43 | case 1: return (weight == 0) ? NAN : mean; 44 | case 2: return (weight == 0) ? NAN : pow(m2 / (weight - 1), 0.5); 45 | default: 46 | stop("Invalid output requested"); 47 | return NAN; 48 | } 49 | } 50 | 51 | std::string name(int i) const { 52 | switch (i) { 53 | case 0: return "count"; 54 | case 1: return "mean"; 55 | case 2: return "sd"; 56 | default: 57 | stop("Invalid output requested"); 58 | return ""; 59 | } 60 | } 61 | }; 62 | 63 | class SummarySum { 64 | int i_; 65 | int weight; 66 | double sum; 67 | 68 | public: 69 | SummarySum (int i) : i_(i), weight(0), sum(0) { 70 | if (i > 1 || i < 0) stop("Invalid moment"); 71 | } 72 | 73 | void push(double y, double w) { 74 | if (NumericVector::is_na(y)) return; 75 | 76 | weight += w; 77 | if (i_ < 1) return; 78 | 79 | sum += y * w; 80 | } 81 | 82 | const int size() const { 83 | return i_ + 1; 84 | } 85 | 86 | double compute(int i) const { 87 | switch (i) { 88 | case 0: return weight; 89 | case 1: return sum; 90 | default: 91 | stop("Invalid output requested"); 92 | return NAN; 93 | } 94 | } 95 | 96 | std::string name(int i) const { 97 | switch (i) { 98 | case 0: return "count"; 99 | case 1: return "sum"; 100 | default: 101 | stop("Invalid output requested"); 102 | return ""; 103 | } 104 | } 105 | 106 | }; 107 | 108 | class SummaryMedian { 109 | std::vector ys; 110 | 111 | public: 112 | void push(double y, double w) { 113 | if (NumericVector::is_na(y)) return; 114 | 115 | ys.push_back(y); 116 | } 117 | 118 | int size() const { 119 | return 1; 120 | } 121 | 122 | // Adapted from http://stackoverflow.com/questions/1719070/ 123 | double compute(int i) { 124 | return median(&ys); 125 | } 126 | 127 | std::string name(int i) const { 128 | return "median"; 129 | } 130 | }; 131 | -------------------------------------------------------------------------------- /src/stats.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | struct Regression { 5 | double alpha, beta; 6 | }; 7 | 8 | // [[Rcpp::export]] 9 | double bisquare(double u, double b) { 10 | u = fabs(u); 11 | return (u < b) ? pow(1 - pow(u / b, 2), 2) : 0; 12 | } 13 | 14 | Regression simpleLinearRegression(const std::vector& x, 15 | const std::vector& y, 16 | const std::vector& w) { 17 | int n = x.size(); 18 | 19 | double x_wsum = 0, y_wsum = 0, w_sum = 0; 20 | for (int i = 0; i < n; ++i) { 21 | x_wsum += x[i] * w[i]; 22 | y_wsum += y[i] * w[i]; 23 | w_sum += w[i]; 24 | }; 25 | double x_mean = x_wsum / w_sum, y_mean = y_wsum / w_sum; 26 | 27 | double var_xy = 0, var_x = 0; 28 | for (int i = 0; i < n; ++i) { 29 | var_xy += w[i] * (x[i] - x_mean) * (y[i] - y_mean); 30 | var_x += w[i] * pow((x[i] - x_mean), 2); 31 | } 32 | 33 | Regression results; 34 | results.beta = (var_xy / var_x); 35 | results.alpha = y_mean - results.beta * x_mean; 36 | return results; 37 | } 38 | 39 | // [[Rcpp::export]] 40 | NumericVector regress(const std::vector& x, 41 | const std::vector& y, 42 | const std::vector& w) { 43 | Regression regression = simpleLinearRegression(x, y, w); 44 | return NumericVector::create(regression.alpha, regression.beta); 45 | } 46 | 47 | double median(std::vector* x) { 48 | if (x->empty()) return NAN; 49 | 50 | int size = x->size(); 51 | std::vector::iterator upper = x->begin() + (int) (size / 2); 52 | std::nth_element(x->begin(), upper, x->end()); 53 | 54 | if (size % 2 == 1) { 55 | return *upper; 56 | } else { 57 | std::vector::iterator lower = upper - 1; 58 | std::nth_element(x->begin(), lower, upper); 59 | return (*upper + *lower) / 2.0; 60 | } 61 | } 62 | 63 | // [[Rcpp::export("medianC")]] 64 | double median(const std::vector& x) { 65 | std::vector x_(x); 66 | return median(&x_); 67 | } 68 | 69 | 70 | Regression simpleRobustRegression(const std::vector& x, 71 | const std::vector& y, 72 | const std::vector& w, 73 | int iterations = 3) { 74 | int n = x.size(); 75 | Regression prev = simpleLinearRegression(x, y, w); 76 | 77 | for (int k = 0; k < iterations; ++k) { 78 | std::vector resid(n); 79 | for (int i = 0; i < n; ++i) { 80 | resid[i] = fabs(y[i] - (prev.alpha + prev.beta * x[i])); 81 | } 82 | 83 | std::vector w_(w); 84 | double b = 6 * median(resid); 85 | if (b < 1e-20) break; 86 | for (int i = 0; i < n; ++i) { 87 | w_[i] *= bisquare(resid[i], b); 88 | } 89 | 90 | prev = simpleLinearRegression(x, y, w_); 91 | } 92 | 93 | return prev; 94 | } 95 | 96 | // [[Rcpp::export]] 97 | NumericVector regress_robust(const std::vector& x, 98 | const std::vector& y, 99 | const std::vector& w, 100 | int iterations = 3) { 101 | Regression regression = simpleRobustRegression(x, y, w, iterations); 102 | return NumericVector::create(regression.alpha, regression.beta); 103 | } 104 | -------------------------------------------------------------------------------- /R/condensed.r: -------------------------------------------------------------------------------- 1 | #' Condensed: an S3 class for condensed summaries. 2 | #' 3 | #' This object managed the properties of condensed (summarised) data frames. 4 | #' 5 | #' @section S3 methods: 6 | #' 7 | #' Mathematical functions with methods for \code{binsum} object will modify 8 | #' the x column of the data frame and \code{\link{rebin}} the data, calculating 9 | #' updated summary statistics. 10 | #' 11 | #' Currently methods are provided for the \code{Math} group generic, 12 | #' logical comparison and arithmetic operators, and 13 | #' \code{\link[plyr]{round_any}}. 14 | #' 15 | #' @param groups list of \code{\link{bin}}ed objects 16 | #' @param grouped,summary output from C++ condense function 17 | #' @keywords internal 18 | #' @examples 19 | #' if (require("ggplot2")) { 20 | #' 21 | #' x <- rchallenge(1e4) 22 | #' xsum <- condense(bin(x, 1 / 10)) 23 | #' 24 | #' # Basic math operations just modify the first column 25 | #' autoplot(xsum) 26 | #' autoplot(xsum * 10) 27 | #' autoplot(xsum - 30) 28 | #' autoplot(abs(xsum - 30)) 29 | #' 30 | #' # Similarly, logical operations work on the first col 31 | #' autoplot(xsum[xsum > 10, ]) 32 | #'} 33 | condensed <- function(groups, grouped, summary) { 34 | grouped <- as.data.frame(grouped) 35 | summary <- as.data.frame(summary) 36 | 37 | for (i in seq_along(groups)) { 38 | grouped[[i]] <- dgrid(grouped[[i]], 39 | groups[[i]]$width(), groups[[i]]$origin(), groups[[i]]$nbins()) 40 | } 41 | 42 | names(summary) <- paste0(".", names(summary)) 43 | 44 | df <- data.frame(grouped, summary) 45 | class(df) <- c("condensed", class(df)) 46 | df 47 | } 48 | 49 | #' @export 50 | #' @rdname condensed 51 | #' @param x object to test or coerce 52 | is.condensed <- function(x) inherits(x, "condensed") 53 | 54 | #' @export 55 | #' @rdname condensed 56 | as.condensed <- function(x) UseMethod("as.condensed") 57 | #' @export 58 | as.condensed.condensed <- function(x) x 59 | #' @export 60 | as.condensed.data.frame <- function(x) { 61 | structure(x, class = c("condensed", class(x))) 62 | } 63 | 64 | summary_vars <- function(x) { 65 | stopifnot(is.condensed(x)) 66 | nm <- names(x) 67 | names(x)[grepl("^\\.", names(x))] 68 | } 69 | 70 | group_vars <- function(x) { 71 | setdiff(names(x), summary_vars(x)) 72 | } 73 | 74 | gcol <- function(x) length(group_vars(x)) 75 | 76 | 77 | #' @export 78 | Math.condensed <- function(x, ...) { 79 | generic <- match.fun(.Generic) 80 | x[[1]] <- generic(x[[1]], ...) 81 | rebin(x) 82 | } 83 | 84 | #' @export 85 | Ops.condensed <- function(e1, e2) { 86 | logical_ops <- c("==", "!=", "<", "<=", ">=", ">") 87 | math_ops <- c("+", "-", "*", "/", "^", "%%", "%/%") 88 | 89 | generic <- match.fun(.Generic) 90 | if (.Generic %in% logical_ops) { 91 | l <- generic(e1[[1]], e2) 92 | l[1] <- TRUE # always preserve missings 93 | l & !is.na(l) 94 | } else if (.Generic %in% math_ops) { 95 | e1[[1]] <- generic(e1[[1]], e2) 96 | rebin(e1) 97 | } else { 98 | stop(.Generic, " not supported for condensed objects", call. = FALSE) 99 | } 100 | } 101 | 102 | #' Round any method for condensed objects 103 | #' 104 | #' @inheritParams plyr::round_any 105 | #' @export 106 | #' @keywords internal 107 | round_any.condensed <- function(x, accuracy, f = round) { 108 | gvars <- group_vars(x) 109 | x[gvars] <- lapply(x[gvars], plyr::round_any, accuracy = accuracy, f = f) 110 | rebin(x) 111 | } 112 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # This file was generated by Rcpp::compileAttributes 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | condense_count <- function(x, z, weight, drop = FALSE) { 5 | .Call('bigvis_condense_count', PACKAGE = 'bigvis', x, z, weight, drop) 6 | } 7 | 8 | condense_sum <- function(x, z, weight, drop = FALSE) { 9 | .Call('bigvis_condense_sum', PACKAGE = 'bigvis', x, z, weight, drop) 10 | } 11 | 12 | condense_mean <- function(x, z, weight, drop = FALSE) { 13 | .Call('bigvis_condense_mean', PACKAGE = 'bigvis', x, z, weight, drop) 14 | } 15 | 16 | condense_sd <- function(x, z, weight, drop = FALSE) { 17 | .Call('bigvis_condense_sd', PACKAGE = 'bigvis', x, z, weight, drop) 18 | } 19 | 20 | condense_median <- function(x, z, weight, drop = FALSE) { 21 | .Call('bigvis_condense_median', PACKAGE = 'bigvis', x, z, weight, drop) 22 | } 23 | 24 | double_diff_sum <- function(bin, count) { 25 | .Call('bigvis_double_diff_sum', PACKAGE = 'bigvis', bin, count) 26 | } 27 | 28 | #' Efficient implementation of range. 29 | #' 30 | #' This is an efficient C++ implementation of range for numeric vectors: 31 | #' it avoids S3 dispatch, and computes both min and max in a single pass 32 | #' through the input. 33 | #' 34 | #' If \code{x} has a \code{range} attribute (e.g. it's a \code{\link{ranged}} 35 | #' object), it will be used instead of computing the range from scratch. 36 | #' 37 | #' @param x a numeric vector, or a \code{\link{ranged}} object 38 | #' @param finite If \code{TRUE} ignores missing values and infinities. Note 39 | #' that if the vector is empty, or only contains missing values, 40 | #' \code{frange} will return \code{c(Inf, -Inf)} because those are the 41 | #' identity values for \code{\link{min}} and \code{\link{max}} respectively. 42 | #' @export 43 | #' @examples 44 | #' x <- runif(1e6) 45 | #' system.time(range(x)) 46 | #' system.time(frange(x)) 47 | #' 48 | #' rx <- ranged(x) 49 | #' system.time(frange(rx)) 50 | frange <- function(x, finite = TRUE) { 51 | .Call('bigvis_frange', PACKAGE = 'bigvis', x, finite) 52 | } 53 | 54 | group_fixed <- function(x, width, origin = 0) { 55 | .Call('bigvis_group_fixed', PACKAGE = 'bigvis', x, width, origin) 56 | } 57 | 58 | group_rect <- function(x, y, x_width, y_width, x_origin, y_origin) { 59 | .Call('bigvis_group_rect', PACKAGE = 'bigvis', x, y, x_width, y_width, x_origin, y_origin) 60 | } 61 | 62 | group_hex <- function(x, y, x_width, y_width, x_origin, y_origin, x_max) { 63 | .Call('bigvis_group_hex', PACKAGE = 'bigvis', x, y, x_width, y_width, x_origin, y_origin, x_max) 64 | } 65 | 66 | lowerBound <- function(x, breaks) { 67 | .Call('bigvis_lowerBound', PACKAGE = 'bigvis', x, breaks) 68 | } 69 | 70 | smooth_nd_1 <- function(grid_in, z_in, w_in_, grid_out, var, h, type = "mean") { 71 | .Call('bigvis_smooth_nd_1', PACKAGE = 'bigvis', grid_in, z_in, w_in_, grid_out, var, h, type) 72 | } 73 | 74 | smooth_nd <- function(grid_in, z_in, w_in_, grid_out, h) { 75 | .Call('bigvis_smooth_nd', PACKAGE = 'bigvis', grid_in, z_in, w_in_, grid_out, h) 76 | } 77 | 78 | bisquare <- function(u, b) { 79 | .Call('bigvis_bisquare', PACKAGE = 'bigvis', u, b) 80 | } 81 | 82 | regress <- function(x, y, w) { 83 | .Call('bigvis_regress', PACKAGE = 'bigvis', x, y, w) 84 | } 85 | 86 | medianC <- function(x) { 87 | .Call('bigvis_median', PACKAGE = 'bigvis', x) 88 | } 89 | 90 | regress_robust <- function(x, y, w, iterations = 3L) { 91 | .Call('bigvis_regress_robust', PACKAGE = 'bigvis', x, y, w, iterations) 92 | } 93 | 94 | compute_moments <- function(x) { 95 | .Call('bigvis_compute_moments', PACKAGE = 'bigvis', x) 96 | } 97 | 98 | compute_sum <- function(x) { 99 | .Call('bigvis_compute_sum', PACKAGE = 'bigvis', x) 100 | } 101 | 102 | compute_median <- function(x) { 103 | .Call('bigvis_compute_median', PACKAGE = 'bigvis', x) 104 | } 105 | 106 | -------------------------------------------------------------------------------- /src/group.h: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | NumericVector frange(const NumericVector& x, const bool finite = true); 5 | 6 | class GroupFixed { 7 | const NumericVector x_; 8 | double width_; 9 | double origin_; 10 | public: 11 | GroupFixed (NumericVector x, double width, double origin = 0) 12 | : x_(x), width_(width), origin_(origin) { 13 | } 14 | 15 | int bin_i(int i) const { 16 | if (ISNAN(x_[i]) || x_[i] == INFINITY || x_[i] == -INFINITY) return 0; 17 | if (x_[i] < origin_) return 0; 18 | 19 | return bin(x_[i]); 20 | } 21 | 22 | int bin(double x) const { 23 | return (x - origin_) / width_ + 1; 24 | } 25 | 26 | double unbin(int bin) const { 27 | if (bin == 0) return(NAN); 28 | return (bin - 1) * width_ + origin_; 29 | } 30 | 31 | double origin() const { 32 | return origin_; 33 | } 34 | double width() const { 35 | return width_; 36 | } 37 | 38 | 39 | int size() const { 40 | return x_.size(); 41 | } 42 | 43 | int nbins() const { 44 | double max = frange(x_)(1); 45 | double dest = floor((max - origin_) / width_) * width_ + origin_; 46 | 47 | // + 1 for missing values 48 | // + 1 if highest value is on right-open boundary 49 | return (dest - origin_) / width_ + 1 + ((max >= dest) ? 1 : 0); 50 | } 51 | 52 | }; 53 | 54 | template 55 | class Group2d { 56 | const Group& x_; 57 | const Group& y_; 58 | int x_bins_; 59 | int y_bins_; 60 | 61 | public: 62 | Group2d (const Group& x, const Group& y) : x_(x), y_(y) { 63 | if (x_.size() != y_.size()) { 64 | stop("x and y are not equal sizes"); 65 | } 66 | x_bins_ = x_.nbins(); 67 | y_bins_ = y_.nbins(); 68 | 69 | // Rcout << "x_bins: " << x_bins_ << " y_bins: " << y_bins_ << "\n"; 70 | } 71 | 72 | int bin_i(int i) const { 73 | int x_bin = x_.bin_i(i), y_bin = y_.bin_i(i); 74 | int bin = y_bin * x_bins_ + x_bin; 75 | // Rcout << i << ": (" << x_bin << "," << y_bin << ") -> " << bin << "\n"; 76 | return bin; 77 | } 78 | 79 | int size() const { 80 | return x_.size(); 81 | } 82 | 83 | int nbins() const { 84 | return x_bins_ * y_bins_; 85 | } 86 | }; 87 | 88 | 89 | template 90 | class GroupNd { 91 | const std::vector groups_; 92 | const int ngroups_; 93 | 94 | int size_; 95 | std::vector bins_; 96 | 97 | public: 98 | GroupNd (const std::vector groups) 99 | : groups_(groups), ngroups_(groups.size()) { 100 | if (groups.size() == 0) { 101 | stop("Empty groups vector passed to GroupCompound"); 102 | } 103 | 104 | size_ = groups[0].size(); 105 | 106 | bins_[0] = 1; 107 | for (int i = 0; i < ngroups_ - 1; ++i) { 108 | if (groups_[i].size() != size_) stop("Groups not equal sizes"); 109 | 110 | bins_[i + 1] = bins_[i] * groups_[i].nbins(); 111 | } 112 | } 113 | 114 | int bin_i(int i) const { 115 | int bin = 0; 116 | 117 | for (int j = 0; j < ngroups_; ++j) { 118 | bin += groups_[j].bin(i) * bins_[j]; 119 | } 120 | 121 | return bin; 122 | } 123 | 124 | // int nbins() const { 125 | // return bins_[ngroups_ - 1]; 126 | // } 127 | 128 | int ngroups() const { 129 | return groups_.size(); 130 | } 131 | 132 | int size() const { 133 | return size_; 134 | } 135 | 136 | std::vector unbin(int bin) const { 137 | std::vector bins(ngroups_); 138 | 139 | for (int j = 0; j < ngroups_; ++j) { 140 | int bin_j = bin % bins_[j]; 141 | bins[j] = groups_[j].unbin(bin_j); 142 | 143 | bin = bin - bin * bins_[j]; 144 | } 145 | 146 | return bins; 147 | } 148 | 149 | }; 150 | -------------------------------------------------------------------------------- /R/h.r: -------------------------------------------------------------------------------- 1 | #' Find "best" smoothing parameter using leave-one-out cross validation. 2 | #' 3 | #' Minimises the leave-one-out estimate of root mean-squared error to find 4 | #' find the "optimal" bandwidth for smoothing. 5 | #' 6 | #' L-BFGS-B optimisation is used to constrain the bandwidths to be greater 7 | #' than the binwidths: if the bandwidth is smaller than the binwidth it's 8 | #' impossible to compute the rmse because no smoothing occurs. The tolerance 9 | #' is set relatively high for numerical optimisation since the precise choice 10 | #' of bandwidth makes little difference visually, and we're unlikely to have 11 | #' sufficient data to make a statistically significant choice anyway. 12 | #' 13 | #' @param x condensed summary to smooth 14 | #' @param h_init initial values of bandwidths to start search out. If not 15 | #' specified defaults to 5 times the binwidth of each variable. 16 | #' @param ... other arguments (like \code{var}) passed on to 17 | #' \code{\link{rmse_cv}} 18 | #' @param tol numerical tolerance, defaults to 1\%. 19 | #' @param control additional control parameters passed on to \code{\link{optim}} 20 | #' The most useful argument is probably trace, which makes it possible to 21 | #' follow the progress of the optimisation. 22 | #' @family bandwidth estimation functions 23 | #' @return a single numeric value representing the bandwidth that minimises 24 | #' the leave-one-out estimate of rmse. Vector has attributes 25 | #' \code{evaluations} giving the number of times the objective function 26 | #' was evaluated. If the optimisation does not converge, or smoothing is not 27 | #' needed (i.e. the estimate is on the lower bounds), a warning is thrown. 28 | #' @export 29 | #' @examples 30 | #' \donttest{ 31 | #' x <- rchallenge(1e4) 32 | #' xsum <- condense(bin(x, 1 / 10)) 33 | #' h <- best_h(xsum, control = list(trace = 3, REPORT = 1)) 34 | #' 35 | #' if (require("ggplot2")) { 36 | #' autoplot(xsum) 37 | #' autoplot(smooth(xsum, h)) 38 | #' } 39 | #' } 40 | best_h <- function(x, h_init = NULL, ..., tol = 1e-2, control = list()) { 41 | stopifnot(is.condensed(x)) 42 | 43 | gvars <- group_vars(x) 44 | widths <- vapply(x[gvars], attr, "width", FUN.VALUE = numeric(1)) 45 | h_init <- h_init %||% widths * 5 46 | stopifnot(is.numeric(h_init), length(h_init) == length(gvars)) 47 | 48 | stopifnot(is.list(control)) 49 | control <- modifyList(list(factr = tol / .Machine$double.eps), control) 50 | 51 | # Optimise 52 | rmse <- function(h) { 53 | rmse_cv(x, h, ...) 54 | } 55 | res <- optim(h_init, rmse, method = "L-BFGS-B", lower = widths * 1.01, 56 | control = control) 57 | h <- unname(res$par) 58 | 59 | # Feedback 60 | if (res$convergence != 0) { 61 | warning("Failed to converge: ", res$message, call. = FALSE) 62 | } else if (rel_dist(h, widths) < 1e-3) { 63 | warning("h close to lower bound: smoothing not needed", call. = FALSE) 64 | } 65 | structure(h, evaluations = res$counts[1], conv = res$convergence) 66 | } 67 | 68 | rel_dist <- function(x, y) { 69 | mean(abs(x - y) / abs(x + y)) 70 | } 71 | 72 | #' Generate grid of plausible bandwidths for condensed summary. 73 | #' 74 | #' By default, the bandwidths start at the bin width, and then continue 75 | #' up 50 (\code{n}) steps until 20 (\code{max}) times the bin width. 76 | #' 77 | #' @param x a condensed summary 78 | #' @param n number of bandwidths to generate (in each dimension) 79 | #' @param max maximum bandwidth to generate, as multiple of binwidth. 80 | #' @family bandwidth estimation functions 81 | #' @export 82 | #' @examples 83 | #' x <- rchallenge(1e4) 84 | #' xsum <- condense(bin(x, 1 / 10)) 85 | #' h_grid(xsum) 86 | #' 87 | #' y <- runif(1e4) 88 | #' xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100)) 89 | #' h_grid(xysum, n = 10) 90 | h_grid <- function(x, n = 50, max = 20) { 91 | stopifnot(is.condensed(x)) 92 | stopifnot(is.numeric(n), length(n) == 1, n > 0) 93 | stopifnot(is.numeric(max), length(max) == 1, max > 0) 94 | 95 | gs <- x[group_vars(x)] 96 | widths <- vapply(gs, attr, "width", FUN.VALUE = numeric(1)) 97 | 98 | hs <- lapply(widths, function(w) w * seq(2, max, length = n)) 99 | expand.grid(hs, KEEP.OUT.ATTRS = FALSE) 100 | } 101 | -------------------------------------------------------------------------------- /inst/include/bigvis.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | using namespace Rcpp; 5 | 6 | // Wrapper for numeric vector that makes it easy figure to out which 7 | // bin each observation belongs to. 8 | class BinnedVector { 9 | // This should probably be a const NumericVector&, but that doesn't work 10 | // with modules currently 11 | NumericVector x_; 12 | String name_; 13 | double width_; 14 | double origin_; 15 | public: 16 | BinnedVector(NumericVector x, String name, double width, double origin = 0) 17 | : x_(x), name_(name), width_(width), origin_(origin) { 18 | } 19 | 20 | int bin_i(int i) const { 21 | return bin(x_[i]); 22 | } 23 | 24 | int bin(double x) const { 25 | if (ISNAN(x) || x == INFINITY || x == -INFINITY) return 0; 26 | if (x < origin_) return 0; 27 | 28 | return (x - origin_) / width_ + 1; 29 | } 30 | 31 | double unbin(int bin) const { 32 | if (bin == 0) return(NA_REAL); 33 | return (bin - 1) * width_ + origin_ + width_ / 2; 34 | } 35 | 36 | int nbins() const; 37 | 38 | int size() const { 39 | return x_.size(); 40 | } 41 | 42 | double origin() const { 43 | return origin_; 44 | } 45 | 46 | double width() const { 47 | return width_; 48 | } 49 | 50 | String name() const { 51 | return name_; 52 | } 53 | 54 | }; 55 | 56 | // This class is just boilerplate. There might be rcpp magic that does the right thing here 57 | // but I don't know it. 58 | class BinnedVectorReference { 59 | boost::shared_ptr ref; 60 | 61 | const BinnedVector *get() const { 62 | return ref.get(); 63 | }; 64 | BinnedVector *get() { 65 | return ref.get(); 66 | }; 67 | 68 | public: 69 | BinnedVectorReference() {}; 70 | 71 | BinnedVectorReference(const BinnedVectorReference &o): 72 | ref(o.ref) {}; 73 | 74 | explicit BinnedVectorReference(BinnedVector *ptr) { 75 | // Watch out, this takes ownership of the pointer! 76 | ref = boost::shared_ptr(ptr); 77 | } 78 | 79 | BinnedVectorReference(NumericVector x, String name, double width, double origin = 0) { 80 | BinnedVector *vec = new BinnedVector(x, name, width, origin); 81 | ref = boost::shared_ptr(vec); 82 | } 83 | 84 | int bin_i(int i) const { return get()->bin_i(i); } 85 | int bin(double x) const { return get()->bin(x); } 86 | double unbin(int bin) const { return get()->unbin(bin); } 87 | int nbins() const { return get()->nbins(); } 88 | int size() const { return get()->size();} 89 | double origin() const { return get()->origin();} 90 | double width() const { return get()->width();} 91 | String name() const { return get()->name();} 92 | }; 93 | 94 | // A data structure to store multiple binned vectors 95 | class BinnedVectors { 96 | int size_; 97 | std::vector groups_; 98 | 99 | public: 100 | std::vector bins_; 101 | BinnedVectors () : groups_(0), bins_(0) { 102 | } 103 | 104 | BinnedVectors (List gs) : groups_(0), bins_(0) { 105 | int n = gs.size(); 106 | for (int i = 0; i < n; ++i) { 107 | add_vector(as(gs[i])); 108 | } 109 | } 110 | 111 | void add_vector(BinnedVectorReference g) { 112 | if (groups_.empty()) { 113 | bins_.push_back(1); 114 | size_ = g.size(); 115 | } else { 116 | if (g.size() != size_) stop("Inconsistent sizes"); 117 | bins_.push_back(bins_.back() * g.nbins()); 118 | } 119 | groups_.push_back(g); 120 | 121 | } 122 | 123 | int bin_i(int i) const; 124 | int bin(std::vector x) const; 125 | std::vector unbin(int bin) const; 126 | 127 | int nbins() const { 128 | return bins_.back() * groups_.front().nbins(); 129 | } 130 | 131 | int ngroups() const { 132 | return bins_.size(); 133 | } 134 | 135 | int size() const { 136 | return size_; 137 | } 138 | 139 | String name(int j) const { 140 | return groups_[j].name(); 141 | } 142 | 143 | }; 144 | 145 | 146 | RCPP_EXPOSED_AS(BinnedVectorReference) 147 | RCPP_EXPOSED_WRAP(BinnedVectorReference) 148 | RCPP_EXPOSED_AS(BinnedVectors) 149 | RCPP_EXPOSED_WRAP(BinnedVectors) 150 | -------------------------------------------------------------------------------- /src/smooth-nd.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "group.h" 4 | #include "Summary2d.h" 5 | #include 6 | using namespace Rcpp; 7 | 8 | boost::shared_ptr createSummary(std::string type) { 9 | if (type == "mean") { 10 | return boost::shared_ptr(new Summary2dMean()); 11 | } else if (type == "regression") { 12 | return boost::shared_ptr(new Summary2dRegression()); 13 | } else if (type == "robust_regression") { 14 | return boost::shared_ptr(new Summary2dRobustRegression()); 15 | } else { 16 | stop("Unknown type"); 17 | // Quiet warning 18 | return boost::shared_ptr(new Summary2dMean()); 19 | } 20 | } 21 | 22 | double tricube(double x) { 23 | if (NumericVector::is_na(x)) return 0; 24 | x = fabs(x); 25 | if (x > 1) return 0; 26 | 27 | double y = 1 - x * x * x; 28 | return y * y * y; 29 | } 30 | 31 | bool both_na(double x, double y) { 32 | return (NumericVector::is_na(x) && NumericVector::is_na(y)); 33 | } 34 | 35 | // [[Rcpp::export]] 36 | NumericVector smooth_nd_1(const NumericMatrix& grid_in, 37 | const NumericVector& z_in, 38 | const NumericVector& w_in_, 39 | const NumericMatrix& grid_out, 40 | const int var, const double h, 41 | const std::string type = "mean") { 42 | 43 | if (var < 0) stop("var < 0"); 44 | if (var >= grid_in.ncol()) stop("var too large"); 45 | if (h <= 0) stop("h <= 0"); 46 | if (grid_in.ncol() != grid_out.ncol()) stop("Incompatible grid sizes"); 47 | 48 | int n_in = grid_in.nrow(), n_out = grid_out.nrow(), d = grid_in.ncol(); 49 | NumericVector w_in = (w_in_.size() > 0) ? w_in_ : 50 | rep(NumericVector::create(1), n_in); 51 | NumericVector z_out(n_out), w_out(n_out); 52 | 53 | // Will be much more efficient to slice up by input dimension: 54 | // and most efficient way of doing that will be to bin with / bw 55 | // My data structure: sparse grids 56 | // 57 | // And once we're smoothing in one direction, with guaranteed e2venly spaced 58 | // grid can avoid many kernel evaluations and can also compute more 59 | // efficient running sum 60 | 61 | for(int j = 0; j < n_out; ++j) { 62 | boost::shared_ptr summary = createSummary(type); 63 | for (int i = 0; i < n_in; ++i) { 64 | // Check that all variables (apart from var) are equal 65 | bool equiv = true; 66 | for (int k = 0; k < d; ++k) { 67 | if (k == var) continue; 68 | 69 | double in = grid_in(i, k), out = grid_out(j, k); 70 | if (in != out && !both_na(in, out)) { 71 | equiv = false; 72 | break; 73 | } 74 | }; 75 | if (!equiv) continue; 76 | 77 | double in = grid_in(i, var), out = grid_out(j, var); 78 | double dist = both_na(in, out) ? 0 : in - out; 79 | double w = tricube(dist / h) * w_in[i]; 80 | if (w == 0) continue; 81 | 82 | summary->push(dist, z_in[i], w); 83 | } 84 | z_out[j] = summary->compute(); 85 | } 86 | 87 | return z_out; 88 | } 89 | 90 | // [[Rcpp::export]] 91 | NumericVector smooth_nd(const NumericMatrix& grid_in, 92 | const NumericVector& z_in, 93 | const NumericVector& w_in_, 94 | const NumericMatrix& grid_out, 95 | const NumericVector h) { 96 | 97 | if (grid_in.nrow() != z_in.size()) stop("Incompatible input lengths"); 98 | if (grid_in.ncol() != grid_out.ncol()) stop("Incompatible grid sizes"); 99 | if (h.size() != grid_in.ncol()) stop("Incorrect h length"); 100 | 101 | int n_in = grid_in.nrow(), n_out = grid_out.nrow(), d = grid_in.ncol(); 102 | NumericVector w_in = (w_in_.size() > 0) ? w_in_ : 103 | rep(NumericVector::create(1), n_in); 104 | NumericVector z_out(n_out), w_out(n_out); 105 | 106 | for (int i = 0; i < n_in; ++i) { 107 | for(int j = 0; j < n_out; ++j) { 108 | double w = 1; 109 | for (int k = 0; k < d; ++k) { 110 | double dist = (grid_in(i, k) - grid_out(j, k)) / h[k]; 111 | w *= tricube(dist); 112 | } 113 | w *= w_in[i]; 114 | 115 | w_out[j] += w; 116 | z_out[j] += z_in[i] * w; 117 | } 118 | } 119 | 120 | for(int j = 0; j < n_out; ++j) { 121 | z_out[j] /= w_out[j]; 122 | } 123 | 124 | return z_out; 125 | } 126 | -------------------------------------------------------------------------------- /R/autoplot.r: -------------------------------------------------------------------------------- 1 | #' Autoplot condensed summaries. 2 | #' 3 | #' @param x a condensed summary 4 | #' @param var which summary variable to display 5 | #' @param ... other arguments passed on to individual methods 6 | #' @method autoplot condensed 7 | #' @export autoplot.condensed 8 | #' @examples 9 | #' if (require("ggplot2")) { 10 | #' 11 | #' # 1d summaries ----------------------------- 12 | #' x <- rchallenge(1e4) 13 | #' z <- x + rt(length(x), df = 2) 14 | #' xsum <- condense(bin(x, 0.1)) 15 | #' zsum <- condense(bin(x, 0.1), z = z) 16 | #' 17 | #' autoplot(xsum) 18 | #' autoplot(peel(xsum)) 19 | #' 20 | #' autoplot(zsum) 21 | #' autoplot(peel(zsum, keep = 1)) 22 | #' autoplot(peel(zsum)) 23 | #' 24 | #' # 2d summaries ----------------------------- 25 | #' y <- runif(length(x)) 26 | #' xysum <- condense(bin(x, 0.1), bin(y, 0.1)) 27 | #' xyzsum <- condense(bin(x, 0.1), bin(y, 0.1), z = z) 28 | #' 29 | #' autoplot(xysum) 30 | #' autoplot(peel(xysum)) 31 | #' autoplot(xyzsum) 32 | #' autoplot(peel(xyzsum)) 33 | #' } 34 | autoplot.condensed <- function(x, var = last(summary_vars(x)), ...) { 35 | stopifnot(is.condensed(x)) 36 | stopifnot(is.character(var), length(var) == 1) 37 | summaries <- c( 38 | .count = "total", 39 | .sum = "total", 40 | .mean = "summary", 41 | .sd = "summary", 42 | .median = "summary" 43 | ) 44 | if (!(var %in% names(summaries))) { 45 | stop("Unknown varible", call. = FALSE) 46 | } 47 | d <- gcol(x) 48 | if (d > 2) { 49 | stop("No autoplot methods available for more than two d") 50 | } 51 | 52 | f <- paste0("plot_", summaries[var], "_", d) 53 | find_fun(f)(x, var = var, ...) 54 | } 55 | 56 | 57 | plot_total_1 <- function(x, var = ".count", show_na = TRUE, log = "") { 58 | xvar <- names(x)[[1]] 59 | 60 | plot <- ggplot2::ggplot(x[-1, ], ggplot2::aes_string(x = xvar, y = var)) + 61 | ggplot2::geom_line(na.rm = TRUE) 62 | 63 | if (show_na) { 64 | plot <- plot + na_layer(x, var) 65 | } 66 | 67 | if (logv(log, "y")) { 68 | plot <- plot + ggplot2::scale_y_continuous(trans = "log1p") 69 | } 70 | if (logv(log, "x")) { 71 | plot <- plot + ggplot2::scale_x_log10() 72 | } 73 | 74 | plot 75 | } 76 | 77 | plot_total_2 <- function(x, var = ".count", show_na = TRUE, log = "") { 78 | x <- peel(x, keep = 1) 79 | xvar <- names(x)[[1]] 80 | yvar <- names(x)[[2]] 81 | miss <- is.na(x[[1]]) + 2 * is.na(x[[2]]) 82 | 83 | fill_trans <- if (logv(log, "z")) "log1p" else "identity" 84 | 85 | plot <- ggplot2::ggplot(x[miss == 0, ], ggplot2::aes_string(x = xvar, y = yvar)) + 86 | ggplot2::geom_raster(ggplot2::aes_string(fill = var)) + 87 | ggplot2::scale_fill_gradient(low = "grey90", high = "black", trans = fill_trans) + 88 | ggplot2::expand_limits(fill = 0) 89 | 90 | if (show_na) { 91 | } 92 | 93 | plot <- plot + if (logv(log, "x")) ggplot2::scale_x_log10() 94 | plot <- plot + if (logv(log, "y")) ggplot2::scale_y_log10() 95 | 96 | plot 97 | } 98 | 99 | plot_summary_1 <- function(x, var = ".mean", show_na = TRUE, 100 | show_n = x %contains% ".count", log = NULL) { 101 | xvar <- names(x)[[1]] 102 | 103 | plot <- ggplot2::ggplot(x[-1, ], ggplot2::aes_string(x = xvar, y = var)) + 104 | ggplot2::geom_line(na.rm = TRUE) + 105 | ggplot2::scale_size_area() 106 | 107 | if (show_n) { 108 | plot <- plot + 109 | ggplot2::geom_point(ggplot2::aes_string(color = ".count"), na.rm = TRUE) + 110 | ggplot2::scale_colour_gradient(trans = "log10") 111 | } 112 | 113 | if (show_na) { 114 | plot <- plot + na_layer(x, var) 115 | } 116 | 117 | plot 118 | } 119 | 120 | plot_summary_2 <- function(x, var = ".mean", show_na = TRUE, log = "") { 121 | x <- peel(x, keep = 1) 122 | xvar <- names(x)[[1]] 123 | yvar <- names(x)[[2]] 124 | 125 | miss <- is.na(x[[1]]) + 2 * is.na(x[[2]]) 126 | 127 | plot <- ggplot2::ggplot(x[miss == 0, ], ggplot2::aes_string(x = xvar, y = yvar)) + 128 | ggplot2::geom_tile(ggplot2::aes_string(fill = var)) + 129 | ggplot2::scale_fill_gradient2() 130 | 131 | if (show_na) { 132 | } 133 | 134 | plot <- plot + if (logv(log, "x")) ggplot2::scale_x_log10() 135 | plot <- plot + if (logv(log, "y")) ggplot2::scale_y_log10() 136 | 137 | plot 138 | } 139 | 140 | na_layer <- function(x, var) { 141 | val <- x[[var]][is.na(x[[1]])] 142 | if (length(val) == 0 || is.na(val) || val == 0) return() 143 | 144 | xloc <- miss_poss(x[[1]]) 145 | ggplot2::annotate("text", x = xloc, y = val, colour = "red", label = "NA", 146 | size = 3) 147 | } 148 | 149 | logv <- function(log, var) var %in% strsplit(log, "")[[1]] 150 | 151 | miss_poss <- function(x) { 152 | rng <- frange(x) 153 | rng[1] - (rng[2] - rng[1]) * 0.05 154 | } 155 | -------------------------------------------------------------------------------- /R/weighted-stats.r: -------------------------------------------------------------------------------- 1 | #' Compute a weighted variance or standard deviation of a vector. 2 | #' 3 | #' @details 4 | #' Note that unlike the base R \code{\link{var}} function, these functions only 5 | #' work with individual vectors not matrices or data frames. 6 | #' 7 | #' @family weighted statistics 8 | #' @seealso \code{\link[stats]{weighted.mean}} 9 | #' @param x numeric vector of observations 10 | #' @param w integer vector of weights, representing the number of 11 | #' time each \code{x} was observed 12 | #' @param na.rm if \code{TRUE}, missing values in both \code{w} and \code{x} 13 | #' will be removed prior computation. Otherwise if there are missing values 14 | #' the result will always be missing. 15 | #' @export 16 | #' @examples 17 | #' x <- c(1:5) 18 | #' w <- rpois(5, 5) + 1 19 | #' y <- x[rep(seq_along(x), w)] 20 | #' weighted.var(x, w) 21 | #' var(y) 22 | #' 23 | #' stopifnot(all.equal(weighted.var(x, w), var(y))) 24 | weighted.var <- function(x, w = NULL, na.rm = FALSE) { 25 | if (na.rm) { 26 | na <- is.na(x) | is.na(w) 27 | x <- x[!na] 28 | w <- w[!na] 29 | } 30 | 31 | sum(w * (x - weighted.mean(x, w)) ^ 2) / (sum(w) - 1) 32 | } 33 | 34 | #' @export 35 | #' @rdname weighted.var 36 | weighted.sd <- function(x, w, na.rm = TRUE) sqrt(weighted.var(x, w, na.rm = TRUE)) 37 | 38 | #' A weighted ecdf function. 39 | #' 40 | #' An extension of the base \code{\link[stats]{ecdf}} function which works 41 | #' with weighted data. 42 | #' 43 | #' @section S3 methods: 44 | #' The \code{ecdf} class has methods for \code{\link{plot}}, 45 | #' \code{\link{lines}}, \code{\link{summary}} and \code{\link{quantile}}. 46 | #' \code{\link{quantile}} does not currently correctly compute values for 47 | #' weighted ecdfs. 48 | #' 49 | #' @inheritParams weighted.var 50 | #' @family weighted statistics 51 | #' @seealso \code{\link[stats]{weighted.mean}} 52 | #' @export 53 | #' @examples 54 | #' x <- runif(200) 55 | #' w <- rpois(200, 5) + 1 56 | #' 57 | #' e <- weighted.ecdf(x, w) 58 | #' plot(e) 59 | #' summary(e) 60 | #' 61 | #' y <- x[rep(seq_along(x), w)] 62 | #' plot(ecdf(y)) 63 | weighted.ecdf <- function(x, w) { 64 | stopifnot(length(x) == length(w)) 65 | stopifnot(anyDuplicated(x) == 0) 66 | 67 | ord <- order(x) 68 | x <- x[ord] 69 | w <- w[ord] 70 | 71 | n <- sum(w) 72 | wts <- cumsum(w / n) 73 | 74 | f <- approxfun(x, wts, method = "constant", yleft = 0, yright = 1, f = 0) 75 | class(f) <- c("wecdf", "ecdf", "stepfun", class(f)) 76 | attr(f, "call") <- sys.call() 77 | environment(f)$nobs <- n 78 | f 79 | } 80 | 81 | #' Compute quantiles of weighted data. 82 | #' 83 | #' @details 84 | #' Currently only implements the type 7 algorithm, as described in 85 | #' \code{\link{quantile}}. Based on \code{\link{quantile}} written by R-core. 86 | #' 87 | #' @inheritParams weighted.var 88 | #' @param probs numeric vector of probabilities between 0 and 1 89 | #' @param na.rm If \code{TRUE} will automatically remove missing values 90 | #' in \code{x} or \code{w}. 91 | #' @family weighted statistics 92 | #' @export 93 | #' @examples 94 | #' x <- runif(200) 95 | #' w <- rpois(200, 5) + 1 96 | #' weighted.quantile(x, w) 97 | weighted.quantile <- function (x, w, probs = seq(0, 1, 0.25), na.rm = FALSE) { 98 | stopifnot(length(x) == length(w)) 99 | na <- is.na(x) | is.na(w) 100 | if (any(na)) { 101 | if (na.rm) { 102 | x <- x[!na] 103 | w <- w[!na] 104 | } else { 105 | stop("Missing values not allowed when na.rm is FALSE", call. = FALSE) 106 | } 107 | } 108 | 109 | # Ensure x and w in ascending order of x 110 | ord <- order(x) 111 | x <- x[ord] 112 | w <- w[ord] 113 | 114 | # Find closest x just below and above index 115 | n <- sum(w) 116 | index <- 1 + (n - 1) * probs 117 | j <- floor(index) 118 | 119 | wts <- cumsum(w) 120 | lo <- x[lowerBound(j, wts)] # X_j 121 | hi <- x[lowerBound(j + 1, wts)] 122 | 123 | g <- index - j 124 | ifelse(lo == hi, lo, (1 - g) * lo + g * hi) 125 | } 126 | # Q[i](p) = (1 - g) x[j] + g x[j+1] 127 | # j = floor(np + m) 128 | # g = np + m - j 129 | # 130 | # For type 7: 131 | # m = 1 - p => 132 | # j = floor(1 + (n - 1) * p) 133 | # g = (np + 1 - p) - floor(1 + (n - 1) * p) 134 | 135 | #' Compute the median of weighted data. 136 | #' 137 | #' @details This is a simple wrapper around \code{\link{weighted.quantile}} 138 | #' @inheritParams weighted.quantile 139 | #' @export 140 | #' @examples 141 | #' x <- runif(200) 142 | #' w <- rpois(200, 5) + 1 143 | #' 144 | #' median(x) 145 | #' weighted.median(x, w) 146 | weighted.median <- function(x, w, na.rm = FALSE) { 147 | weighted.quantile(x, w, probs = 0.5, na.rm = na.rm) 148 | } 149 | 150 | #' Compute the interquartile range of weighted data. 151 | #' 152 | #' @details This is a simple wrapper around \code{\link{weighted.quantile}} 153 | #' @inheritParams weighted.quantile 154 | #' @export 155 | #' @examples 156 | #' x <- sort(runif(200)) 157 | #' w <- rpois(200, seq(1, 10, length = 200)) + 1 158 | #' 159 | #' IQR(x) 160 | #' weighted.IQR(x, w) 161 | weighted.IQR <- function(x, w, na.rm = FALSE) { 162 | diff(weighted.quantile(x, w, probs = c(0.25, 0.75), na.rm = na.rm)) 163 | } 164 | -------------------------------------------------------------------------------- /src/condense.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "group.h" 4 | #include "summary.h" 5 | 6 | template 7 | List condense(const BinnedVectors& group, const NumericVector& z, 8 | const NumericVector& weight, const Stat& stat) { 9 | int n_obs = group.size(); 10 | int n_bins = group.nbins(); 11 | 12 | const NumericVector& weight_ = (weight.size() > 0) ? weight : 13 | rep(NumericVector::create(1), n_obs); 14 | const NumericVector& z_ = (z.size() > 0) ? z : 15 | rep(NumericVector::create(1), n_obs); 16 | 17 | // Push values into stats 18 | std::vector stats(n_bins, stat); 19 | for(int i = 0; i < n_obs; ++i) { 20 | int bin = group.bin_i(i); 21 | // Rcout << "i: " << i << " bin: " << bin << "\n"; 22 | stats.at(bin).push(z_[i], weight_[i]); 23 | } 24 | 25 | // Compute values from stats and determine bins 26 | int n_stats = stat.size(); 27 | int n_groups = group.ngroups(); 28 | NumericMatrix out(n_bins, n_stats), bin(n_bins, n_groups); 29 | 30 | for (int i = 0; i < n_bins; ++i) { 31 | for (int j = 0; j < n_stats; ++j) { 32 | out(i, j) = stats[i].compute(j); 33 | } 34 | 35 | std::vector bins = group.unbin(i); 36 | for (int j = 0; j < n_groups; ++j) { 37 | bin(i, j) = bins[j]; 38 | } 39 | } 40 | 41 | // Name 42 | CharacterVector out_cols(n_stats), bin_cols(n_groups); 43 | for (int j = 0; j < n_stats; ++j) { 44 | out_cols[j] = stat.name(j); 45 | } 46 | for (int j = 0; j < n_groups; ++j) { 47 | bin_cols[j] = group.name(j); 48 | } 49 | out.attr("dimnames") = List::create(CharacterVector::create(), out_cols); 50 | bin.attr("dimnames") = List::create(CharacterVector::create(), bin_cols); 51 | 52 | return List::create(bin, out); 53 | } 54 | 55 | template 56 | List sparse_condense(const BinnedVectors& group, const NumericVector& z, 57 | const NumericVector& weight, const Stat& stat) { 58 | int n_obs = group.size(); 59 | 60 | const NumericVector& weight_ = (weight.size() > 0) ? weight : 61 | rep(NumericVector::create(1), n_obs); 62 | const NumericVector& z_ = (z.size() > 0) ? z : 63 | rep(NumericVector::create(1), n_obs); 64 | 65 | // Push values into stats 66 | typename std::map stats; 67 | for(int i = 0; i < n_obs; ++i) { 68 | int bin = group.bin_i(i); 69 | 70 | typename std::map::iterator loc = stats.find(bin); 71 | if (loc == stats.end()) { 72 | Stat new_stat(stat); 73 | new_stat.push(z_[i], weight_[i]); 74 | stats.insert(std::pair(bin, new_stat)); 75 | } else { 76 | (loc->second).push(z_[i], weight_[i]); 77 | } 78 | } 79 | 80 | // Compute values from stats and determine bins 81 | int n_bins = stats.size(); 82 | int n_stats = stat.size(); 83 | int n_groups = group.ngroups(); 84 | NumericMatrix out(n_bins, n_stats), bin(n_bins, n_groups); 85 | 86 | typename std::map::iterator stats_it = stats.begin(), 87 | stats_end = stats.end(); 88 | 89 | 90 | for (int i = 0; stats_it != stats_end; ++stats_it, ++i) { 91 | for (int j = 0; j < n_stats; ++j) { 92 | out(i, j) = (stats_it->second).compute(j); 93 | } 94 | 95 | std::vector bins = group.unbin(stats_it->first); 96 | for (int j = 0; j < n_groups; ++j) { 97 | bin(i, j) = bins[j]; 98 | } 99 | } 100 | 101 | // Name 102 | CharacterVector out_cols(n_stats), bin_cols(n_groups); 103 | for (int j = 0; j < n_stats; ++j) { 104 | out_cols[j] = stat.name(j); 105 | } 106 | for (int j = 0; j < n_groups; ++j) { 107 | bin_cols[j] = group.name(j); 108 | } 109 | out.attr("dimnames") = List::create(CharacterVector::create(), out_cols); 110 | bin.attr("dimnames") = List::create(CharacterVector::create(), bin_cols); 111 | 112 | return List::create(bin, out); 113 | } 114 | 115 | // ----------------------------------------------------------------------------- 116 | // Autogenerated by condense-gen.r 117 | 118 | // [[Rcpp::export]] 119 | List condense_count(const List& x, const NumericVector& z, 120 | const NumericVector& weight, bool drop = false) { 121 | if (drop) { 122 | return sparse_condense(BinnedVectors(x), z, weight, SummarySum(0)); 123 | } else { 124 | return condense(BinnedVectors(x), z, weight, SummarySum(0)); 125 | } 126 | } 127 | 128 | // [[Rcpp::export]] 129 | List condense_sum(const List& x, const NumericVector& z, 130 | const NumericVector& weight, bool drop = false) { 131 | if (drop) { 132 | return sparse_condense(BinnedVectors(x), z, weight, SummarySum(1)); 133 | } else { 134 | return condense(BinnedVectors(x), z, weight, SummarySum(1)); 135 | } 136 | } 137 | 138 | // [[Rcpp::export]] 139 | List condense_mean(const List& x, const NumericVector& z, 140 | const NumericVector& weight, bool drop = false) { 141 | if (drop) { 142 | return sparse_condense(BinnedVectors(x), z, weight, SummaryMoments(1)); 143 | } else { 144 | return condense(BinnedVectors(x), z, weight, SummaryMoments(1)); 145 | } 146 | } 147 | 148 | // [[Rcpp::export]] 149 | List condense_sd(const List& x, const NumericVector& z, 150 | const NumericVector& weight, bool drop = false) { 151 | if (drop) { 152 | return sparse_condense(BinnedVectors(x), z, weight, SummaryMoments(2)); 153 | } else { 154 | return condense(BinnedVectors(x), z, weight, SummaryMoments(2)); 155 | } 156 | } 157 | 158 | // [[Rcpp::export]] 159 | List condense_median(const List& x, const NumericVector& z, 160 | const NumericVector& weight, bool drop = false) { 161 | if (drop) { 162 | return sparse_condense(BinnedVectors(x), z, weight, SummaryMedian()); 163 | } else { 164 | return condense(BinnedVectors(x), z, weight, SummaryMedian()); 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /bench/bin-structure.cpp: -------------------------------------------------------------------------------- 1 | // How does the data structure implementing the bin affect performance. 2 | 3 | #include 4 | using namespace Rcpp; 5 | 6 | class Grouper { 7 | const Fast x_; 8 | double width_; 9 | double origin_; 10 | public: 11 | Grouper (const NumericVector& x, double width, double origin = 0) 12 | : x_(x), width_(width), origin_(origin) { 13 | } 14 | 15 | int bin(int i) const { 16 | if (ISNAN(x_[i])) return 0; 17 | 18 | return (x_[i] - origin_) / width_ + 1; 19 | } 20 | 21 | int size() const { 22 | return x_.size(); 23 | } 24 | }; 25 | 26 | // [[Rcpp::export]] 27 | std::vector count_vector(const NumericVector& x, double width, double origin = 0) { 28 | Grouper grouper = Grouper(x, width, origin); 29 | std::vector count; 30 | 31 | int n = grouper.size(); 32 | for(int i = 0; i < n; ++i) { 33 | int bin = grouper.bin(i); 34 | if (bin >= count.size()) { 35 | count.resize(bin + 1); 36 | } 37 | 38 | ++count[bin]; 39 | } 40 | 41 | return count; 42 | } 43 | 44 | // [[Rcpp::export]] 45 | List count_map(const NumericVector& x, double width, double origin = 0) { 46 | Grouper grouper = Grouper(x, width, origin); 47 | std::map count; 48 | 49 | int n = grouper.size(); 50 | for(int i = 0; i < n; ++i) { 51 | int bin = grouper.bin(i); 52 | ++count[bin]; 53 | } 54 | 55 | IntegerVector out_x(count.size()), out_y(count.size()); 56 | std::map::const_iterator count_it = count.begin(), 57 | count_end = count.begin(); 58 | for (int i = 0; count_it != count_end; ++count_it, ++i) { 59 | out_x[i] = count_it->first; 60 | out_y[i] = count_it->second; 61 | } 62 | return List::create(_["x"] = out_x, _["count"] = out_y); 63 | } 64 | 65 | // [[Rcpp::export]] 66 | List count_umap(const NumericVector& x, double width, double origin = 0) { 67 | Grouper grouper = Grouper(x, width, origin); 68 | std::tr1::unordered_map count; 69 | 70 | int n = grouper.size(); 71 | for(int i = 0; i < n; ++i) { 72 | int bin = grouper.bin(i); 73 | 74 | ++count[bin]; 75 | } 76 | 77 | IntegerVector out_x(count.size()), out_y(count.size()); 78 | std::tr1::unordered_map::iterator count_it = count.begin(), 79 | count_end = count.end(); 80 | for (int i = 0; count_it != count_end; ++count_it, ++i) { 81 | out_x[i] = count_it->first; 82 | out_y[i] = count_it->second; 83 | } 84 | return List::create(_["x"] = out_x, _["count"] = out_y); 85 | } 86 | 87 | 88 | template 89 | inline void hash_combine(std::size_t & seed, const T & v) { 90 | std::tr1::hash hasher; 91 | seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); 92 | } 93 | 94 | namespace std { 95 | namespace tr1 { 96 | template struct hash > { 97 | inline size_t operator()(const pair & v) const { 98 | size_t seed = 0; 99 | ::hash_combine(seed, v.first); 100 | ::hash_combine(seed, v.second); 101 | return seed; 102 | } 103 | }; 104 | } 105 | } 106 | 107 | // [[Rcpp::export]] 108 | List count_umap2(const NumericVector& x, double width, double origin = 0) { 109 | Grouper grouper = Grouper(x, width, origin); 110 | std::tr1::unordered_map, int> count; 111 | 112 | int n = grouper.size(); 113 | for(int i = 0; i < n; ++i) { 114 | int bin = grouper.bin(i); 115 | 116 | ++count[std::make_pair(bin, bin)]; 117 | } 118 | 119 | IntegerVector out_x(count.size()), out_y(count.size()); 120 | std::tr1::unordered_map, int>::iterator count_it = count.begin(), 121 | count_end = count.end(); 122 | for (int i = 0; count_it != count_end; ++count_it, ++i) { 123 | out_x[i] = count_it->first.first; 124 | out_y[i] = count_it->second; 125 | } 126 | return List::create(_["x"] = out_x, _["count"] = out_y); 127 | } 128 | 129 | // [[Rcpp::export]] 130 | List count_umap2_man(const NumericVector& x, double width, double origin = 0) { 131 | Grouper grouper = Grouper(x, width, origin); 132 | std::tr1::unordered_map count; 133 | 134 | int n = grouper.size(); 135 | for(int i = 0; i < n; ++i) { 136 | int bin = grouper.bin(i); 137 | bin = bin * 100 + bin; 138 | ++count[bin]; 139 | } 140 | 141 | IntegerVector out_x(count.size()), out_y(count.size()); 142 | std::tr1::unordered_map::iterator count_it = count.begin(), 143 | count_end = count.end(); 144 | for (int i = 0; count_it != count_end; ++count_it, ++i) { 145 | out_x[i] = count_it->first; 146 | out_y[i] = count_it->second; 147 | } 148 | return List::create(_["x"] = out_x, _["count"] = out_y); 149 | } 150 | 151 | 152 | /*** R 153 | options(digits = 3) 154 | library(microbenchmark) 155 | x <- runif(1e5) 156 | 157 | # As expected, for small contiguous inputs, vector is fastest, followed by 158 | # unordered maps (about half as fast), with maps in a distant last place. 159 | microbenchmark( 160 | count_vector(x, 1 / 1000), 161 | count_map(x, 1 / 1000), 162 | count_umap(x, 1 / 1000) 163 | ) 164 | 165 | y <- c(x, x) 166 | y1 <- c(x, x + 10) 167 | y2 <- c(x, x + 100) 168 | y3 <- c(x, x + 1000) 169 | y4 <- c(x, x + 1000) 170 | 171 | # While using std::vector is somewhat faster, the asymptotic behaviour is 172 | # much worse - count_umap is basically constant, regardless of the number 173 | # of bins 174 | microbenchmark( 175 | count_vector(y, 1 / 1000), 176 | count_vector(y1, 1 / 1000), 177 | count_vector(y2, 1 / 1000), 178 | count_vector(y3, 1 / 1000), 179 | count_vector(y4, 1 / 1000), 180 | count_umap(y, 1 / 1000), 181 | count_umap(y1, 1 / 1000), 182 | count_umap(y2, 1 / 1000), 183 | count_umap(y3, 1 / 1000), 184 | count_umap(y4, 1 / 1000), 185 | times = 10 186 | ) 187 | 188 | # Using umap with a pair is about twice as slow as with an int: this probably 189 | # implies that I should do the hashing myself. 190 | microbenchmark( 191 | count_umap(x, 1 / 1000), 192 | count_umap2(x, 1 / 1000), 193 | count_umap2_man(x, 1 / 1000) 194 | ) 195 | 196 | */ -------------------------------------------------------------------------------- /bench/bin.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | using namespace Rcpp; 5 | 6 | //' @param breaks must be ordered and span the complete range of x. 7 | // [[Rcpp::export]] 8 | IntegerVector bin(NumericVector x, NumericVector breaks) { 9 | // Put missing values in the last position 10 | int n = breaks.size(); 11 | IntegerVector out(n + 1); 12 | 13 | for(NumericVector::iterator it = x.begin(); it != x.end(); it++) { 14 | double val = *it; 15 | if (ISNAN(val)) { 16 | out[n]++; 17 | } else { 18 | NumericVector::iterator bin_it = 19 | std::upper_bound(breaks.begin(), breaks.end(), val); 20 | 21 | int bin = std::distance(breaks.begin(), bin_it); 22 | out[bin]++; 23 | } 24 | } 25 | 26 | return out; 27 | } 28 | 29 | // [[Rcpp::export]] 30 | IntegerVector bin2(NumericVector x, NumericVector breaks) { 31 | // Put missing values in the last position 32 | int n = breaks.size(), bin; 33 | IntegerVector out(n + 1); 34 | 35 | NumericVector::iterator x_it = x.begin(), x_end, bin_it, 36 | breaks_it = breaks.begin(), breaks_end = breaks.end(); 37 | 38 | for(; x_it != x.end(); ++x_it) { 39 | double val = *x_it; 40 | if (ISNAN(val)) { 41 | ++out[n]; 42 | } else { 43 | bin_it = std::upper_bound(breaks_it, breaks_end, val); 44 | bin = std::distance(breaks_it, bin_it); 45 | ++out[bin]; 46 | } 47 | } 48 | 49 | return out; 50 | } 51 | 52 | // [[Rcpp::export]] 53 | std::vector bin3(NumericVector x, double width, double origin = 0) { 54 | int bin, nmissing = 0; 55 | std::vector out; 56 | 57 | NumericVector::iterator x_it = x.begin(), x_end; 58 | for(; x_it != x.end(); ++x_it) { 59 | double val = *x_it; 60 | if (ISNAN(val)) { 61 | ++nmissing; 62 | } else { 63 | bin = (val - origin) / width; 64 | if (bin < 0) continue; 65 | 66 | // Make sure there's enough space 67 | if (bin >= out.size()) { 68 | out.resize(bin + 1); 69 | } 70 | ++out[bin]; 71 | } 72 | } 73 | 74 | // Put missing values in the last position 75 | out.push_back(nmissing); 76 | return out; 77 | } 78 | 79 | // Create class to encapsulate binning operations ------------------------------ 80 | 81 | 82 | class BinFixed { 83 | double width_; 84 | double origin_; 85 | public: 86 | BinFixed (double width, double origin = 0) { 87 | width_ = width; 88 | origin_ = origin; 89 | } 90 | int inline operator() (double val) const { 91 | return (val - origin_) / width_; 92 | } 93 | }; 94 | class BinBreaks { 95 | NumericVector breaks_; 96 | NumericVector::iterator breaks_it_, breaks_end_; 97 | 98 | public: 99 | BinBreaks (NumericVector& breaks) { 100 | breaks_ = breaks; 101 | breaks_it_ = breaks.begin(); 102 | breaks_end_ = breaks.end(); 103 | } 104 | int inline operator() (double val) const { 105 | NumericVector::iterator 106 | bin_it = std::upper_bound(breaks_it_, breaks_end_, val); 107 | 108 | return std::distance(breaks_it_, bin_it); 109 | } 110 | }; 111 | 112 | 113 | template 114 | std::vector bin_bin(NumericVector x, Binner binner) { 115 | int bin, nmissing = 0; 116 | std::vector out; 117 | 118 | NumericVector::iterator x_it = x.begin(), x_end; 119 | for(; x_it != x.end(); ++x_it) { 120 | double val = *x_it; 121 | if (ISNAN(val)) { 122 | ++nmissing; 123 | } else { 124 | bin = binner(val); 125 | if (bin < 0) continue; 126 | 127 | // Make sure there's enough space 128 | if (bin >= out.size()) { 129 | out.resize(bin + 1); 130 | } 131 | ++out[bin]; 132 | } 133 | } 134 | 135 | // Put missing values in the last position 136 | out.push_back(nmissing); 137 | return out; 138 | } 139 | 140 | // [[Rcpp::export]] 141 | std::vector bin_bin_fixed(NumericVector x, double width, double origin = 0) { 142 | return bin_bin(x, BinFixed(width, origin)); 143 | } 144 | 145 | // [[Rcpp::export]] 146 | std::vector bin_bin_breaks(NumericVector x, NumericVector breaks) { 147 | return bin_bin(x, BinBreaks(breaks)); 148 | } 149 | 150 | // Try using a Fast ------------------------------ 151 | // Considerable speed improvement for simple binning function 152 | template 153 | std::vector fbin_bin(NumericVector x, Binner binner) { 154 | int bin, nmissing = 0; 155 | std::vector out; 156 | 157 | Fast fx(x); 158 | int n = x.size(); 159 | 160 | for(int i = 0; i < n; ++i) { 161 | double val = fx[i]; 162 | if (ISNAN(val)) { 163 | ++nmissing; 164 | } else { 165 | bin = binner(val); 166 | if (bin < 0) continue; 167 | 168 | // Make sure there's enough space 169 | if (bin >= out.size()) { 170 | out.resize(bin + 1); 171 | } 172 | ++out[bin]; 173 | } 174 | } 175 | 176 | // Put missing values in the last position 177 | out.push_back(nmissing); 178 | return out; 179 | } 180 | 181 | // [[Rcpp::export]] 182 | std::vector fbin_bin_fixed(NumericVector x, double width, double origin = 0) { 183 | return fbin_bin(x, BinFixed(width, origin)); 184 | } 185 | 186 | // [[Rcpp::export]] 187 | std::vector fbin_bin_breaks(NumericVector x, NumericVector breaks) { 188 | return fbin_bin(x, BinBreaks(breaks)); 189 | } 190 | 191 | /*** R 192 | options(digits = 3) 193 | library(microbenchmark) 194 | x <- runif(1e5) 195 | breaks <- seq(0, 1, length = 100) 196 | 197 | # Breaks 198 | microbenchmark( 199 | hist(x, breaks, plot = F), 200 | bin(x, breaks), 201 | bin2(x, breaks), 202 | bin_bin_breaks(x, breaks), 203 | fbin_bin_breaks(x, breaks) 204 | ) 205 | 206 | # Fixed bins 207 | microbenchmark( 208 | bin3(x, 1/100, 0), 209 | bin_bin_fixed(x, 1/100, 0), 210 | fbin_bin_fixed(x, 1/100, 0) 211 | ) 212 | 213 | x6 <- runif(1e6) 214 | x7 <- runif(1e7) 215 | x8 <- runif(1e8) 216 | 217 | microbenchmark( 218 | bin_bin_fixed(x6, 1/100, 0), 219 | fbin_bin_fixed(x6, 1/100, 0), 220 | bin_bin_fixed(x7, 1/100, 0), 221 | fbin_bin_fixed(x7, 1/100, 0), 222 | bin_bin_fixed(x8, 1/100, 0), 223 | fbin_bin_fixed(x8, 1/100, 0), 224 | times = 10) 225 | 226 | */ -------------------------------------------------------------------------------- /bench/smooth-1d.cpp: -------------------------------------------------------------------------------- 1 | // Explore opportunities for making smooth_1d faster 2 | // 3 | // Bounding to a given range is really important, and memoisation helps offset 4 | // the cost of making a call back to R, but the biggest win is using a pure 5 | // C/C++ kernel function. 6 | // 7 | // 8 | #include 9 | using namespace Rcpp; 10 | 11 | // Base implementation 12 | // [[Rcpp::export]] 13 | NumericVector smooth_1d(const NumericVector& x, const NumericVector& z, 14 | const NumericVector& x_out, const Function& kernel) { 15 | 16 | int n_in = x.size(), n_out = x_out.size(); 17 | NumericVector z_out(n_out); 18 | 19 | for (int i = 0; i < n_out; i++) { 20 | for (int j = 0; j < n_in; j++) { 21 | double dist = x[j] - x_out[i]; 22 | double k = as(kernel(dist))[0]; 23 | z_out[i] += z[j] * k; 24 | } 25 | } 26 | 27 | return z_out; 28 | } 29 | 30 | // Memoise distance calculations 31 | // [[Rcpp::export]] 32 | NumericVector smooth_1d_memo(const NumericVector& x, const NumericVector& z, 33 | const NumericVector& x_out, const Function& kernel) { 34 | int n_in = x.size(), n_out = x_out.size(); 35 | NumericVector z_out(n_out); 36 | 37 | std::unordered_map k_memo; 38 | 39 | for (int i = 0; i < n_out; i++) { 40 | for (int j = 0; j < n_in; j++) { 41 | double dist = x[j] - x_out[i]; 42 | 43 | std::unordered_map::const_iterator it = k_memo.find(dist); 44 | double k; 45 | if (it == k_memo.end()) { 46 | k = as(kernel(dist))[0]; 47 | k_memo[dist] = k; 48 | } else { 49 | k = it->second; 50 | } 51 | 52 | z_out[i] += z[j] * k; 53 | } 54 | } 55 | 56 | return z_out; 57 | } 58 | 59 | // Use range of kernel 60 | // [[Rcpp::export]] 61 | NumericVector smooth_1d_range(const NumericVector& x, const NumericVector& z, 62 | const NumericVector& x_out, const Function& kernel, 63 | double kmin, double kmax) { 64 | 65 | int n_in = x.size(), n_out = x_out.size(); 66 | NumericVector z_out(n_out); 67 | 68 | for (int i = 0; i < n_out; i++) { 69 | for (int j = 0; j < n_in; j++) { 70 | double dist = x[j] - x_out[i]; 71 | if (dist < kmin || dist > kmax) continue; 72 | 73 | double k = as(kernel(dist))[0]; 74 | z_out[i] += z[j] * k; 75 | } 76 | } 77 | 78 | return z_out; 79 | } 80 | 81 | 82 | // Memoise and use range 83 | // [[Rcpp::export]] 84 | NumericVector smooth_1d_memo_range(const NumericVector& x, const NumericVector& z, 85 | const NumericVector& x_out, const Function& kernel, 86 | double kmin, double kmax) { 87 | int n_in = x.size(), n_out = x_out.size(); 88 | NumericVector z_out(n_out); 89 | 90 | std::unordered_map k_memo; 91 | 92 | for (int i = 0; i < n_out; i++) { 93 | for (int j = 0; j < n_in; j++) { 94 | double dist = x[j] - x_out[i]; 95 | if (dist < kmin || dist > kmax) continue; 96 | 97 | std::unordered_map::const_iterator it = k_memo.find(dist); 98 | double k; 99 | if (it == k_memo.end()) { 100 | k = as(kernel(dist))[0]; 101 | k_memo[dist] = k; 102 | } else { 103 | k = it->second; 104 | } 105 | 106 | z_out[i] += z[j] * k; 107 | } 108 | } 109 | 110 | return z_out; 111 | } 112 | 113 | // Memoise and use range 114 | // [[Rcpp::export]] 115 | NumericVector smooth_1d_memo_range_map(const NumericVector& x, const NumericVector& z, 116 | const NumericVector& x_out, const Function& kernel, 117 | double kmin, double kmax) { 118 | int n_in = x.size(), n_out = x_out.size(); 119 | NumericVector z_out(n_out); 120 | 121 | std::map k_memo; 122 | 123 | for (int i = 0; i < n_out; i++) { 124 | for (int j = 0; j < n_in; j++) { 125 | double dist = x[j] - x_out[i]; 126 | if (dist < kmin || dist > kmax) continue; 127 | 128 | std::map::const_iterator it = k_memo.find(dist); 129 | double k; 130 | if (it == k_memo.end()) { 131 | k = as(kernel(dist))[0]; 132 | k_memo[dist] = k; 133 | } else { 134 | k = it->second; 135 | } 136 | 137 | z_out[i] += z[j] * k; 138 | } 139 | } 140 | 141 | return z_out; 142 | } 143 | 144 | // Memoise, use range & use C++ function for kernel 145 | // [[Rcpp::export]] 146 | NumericVector smooth_1d_memo_range_kcpp(const NumericVector& x, const NumericVector& z, 147 | const NumericVector& x_out, double kmin, double kmax) { 148 | int n_in = x.size(), n_out = x_out.size(); 149 | NumericVector z_out(n_out); 150 | 151 | std::unordered_map k_memo; 152 | 153 | for (int i = 0; i < n_out; i++) { 154 | for (int j = 0; j < n_in; j++) { 155 | double dist = x[j] - x_out[i]; 156 | if (dist < kmin || dist > kmax) continue; 157 | 158 | std::unordered_map::const_iterator it = k_memo.find(dist); 159 | double k; 160 | if (it == k_memo.end()) { 161 | k = R::dnorm(dist, 0.0, 0.1, 0); 162 | k_memo[dist] = k; 163 | } else { 164 | k = it->second; 165 | } 166 | 167 | z_out[i] += z[j] * k; 168 | } 169 | } 170 | 171 | return z_out; 172 | } 173 | 174 | // Use cpp kernel function without memoisation 175 | // [[Rcpp::export]] 176 | NumericVector smooth_1d_range_kcpp(const NumericVector& x, const NumericVector& z, 177 | const NumericVector& x_out, double kmin, double kmax) { 178 | 179 | int n_in = x.size(), n_out = x_out.size(); 180 | NumericVector z_out(n_out); 181 | 182 | for (int i = 0; i < n_out; i++) { 183 | for (int j = 0; j < n_in; j++) { 184 | double dist = x[j] - x_out[i]; 185 | if (dist < kmin || dist > kmax) continue; 186 | 187 | double k = R::dnorm(dist, 0.0, 0.1, 0); 188 | z_out[i] += z[j] * k; 189 | } 190 | } 191 | 192 | return z_out; 193 | } 194 | 195 | 196 | /*** R 197 | options(digits = 2) 198 | x <- 1:10 199 | z <- rep(c(1, 2), length = length(x)) 200 | k <- kernel("norm", sd = 0.1) 201 | krng <- range(k) 202 | grid <- seq(0, 11, length = 100) 203 | 204 | stopifnot(all.equal( 205 | smooth_1d_memo_range(x, z, grid, k, krng[1], krng[2]), 206 | smooth_1d_range_kcpp(x, z, grid, krng[1], krng[2]) 207 | )) 208 | 209 | library(microbenchmark) 210 | microbenchmark( 211 | base = smooth_1d(x, z, grid, k), 212 | memo = smooth_1d_memo(x, z, grid, k), 213 | range = smooth_1d_range(x, z, grid, k, krng[1], krng[2]), 214 | "range + kcpp" = smooth_1d_range_kcpp(x, z, grid, krng[1], krng[2]), 215 | "range + memo" = smooth_1d_memo_range(x, z, grid, k, krng[1], krng[2]), 216 | "range + memo + kcpp" = smooth_1d_memo_range_kcpp(x, z, grid, krng[1], krng[2]), 217 | "range + memo + map" = smooth_1d_memo_range_map(x, z, grid, k, krng[1], krng[2]) 218 | ) 219 | 220 | # More realistic sample sizes 221 | x <- 1:3e3 222 | z <- rep(c(1, 2), length = length(x)) 223 | 224 | grid3 <- seq(0, 11, length = 3e3) 225 | grid4 <- seq(0, 11, length = 3e4) 226 | 227 | microbenchmark( 228 | grid3_c = smooth_1d_range_kcpp(x, z, grid3, krng[1], krng[2]), 229 | grid3_r = smooth_1d_memo_range_map(x, z, grid3, k, krng[1], krng[2]), 230 | grid4_c = smooth_1d_range_kcpp(x, z, grid4, krng[1], krng[2]), 231 | grid4_r = smooth_1d_memo_range_map(x, z, grid4, k, krng[1], krng[2]), 232 | times = 10) 233 | 234 | */ -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // This file was generated by Rcpp::compileAttributes 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include "../inst/include/bigvis.h" 5 | #include 6 | 7 | using namespace Rcpp; 8 | 9 | // condense_count 10 | List condense_count(const List& x, const NumericVector& z, const NumericVector& weight, bool drop); 11 | RcppExport SEXP bigvis_condense_count(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) { 12 | BEGIN_RCPP 13 | Rcpp::RObject __result; 14 | Rcpp::RNGScope __rngScope; 15 | Rcpp::traits::input_parameter< const List& >::type x(xSEXP); 16 | Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP); 17 | Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP); 18 | Rcpp::traits::input_parameter< bool >::type drop(dropSEXP); 19 | __result = Rcpp::wrap(condense_count(x, z, weight, drop)); 20 | return __result; 21 | END_RCPP 22 | } 23 | // condense_sum 24 | List condense_sum(const List& x, const NumericVector& z, const NumericVector& weight, bool drop); 25 | RcppExport SEXP bigvis_condense_sum(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) { 26 | BEGIN_RCPP 27 | Rcpp::RObject __result; 28 | Rcpp::RNGScope __rngScope; 29 | Rcpp::traits::input_parameter< const List& >::type x(xSEXP); 30 | Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP); 31 | Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP); 32 | Rcpp::traits::input_parameter< bool >::type drop(dropSEXP); 33 | __result = Rcpp::wrap(condense_sum(x, z, weight, drop)); 34 | return __result; 35 | END_RCPP 36 | } 37 | // condense_mean 38 | List condense_mean(const List& x, const NumericVector& z, const NumericVector& weight, bool drop); 39 | RcppExport SEXP bigvis_condense_mean(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) { 40 | BEGIN_RCPP 41 | Rcpp::RObject __result; 42 | Rcpp::RNGScope __rngScope; 43 | Rcpp::traits::input_parameter< const List& >::type x(xSEXP); 44 | Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP); 45 | Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP); 46 | Rcpp::traits::input_parameter< bool >::type drop(dropSEXP); 47 | __result = Rcpp::wrap(condense_mean(x, z, weight, drop)); 48 | return __result; 49 | END_RCPP 50 | } 51 | // condense_sd 52 | List condense_sd(const List& x, const NumericVector& z, const NumericVector& weight, bool drop); 53 | RcppExport SEXP bigvis_condense_sd(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) { 54 | BEGIN_RCPP 55 | Rcpp::RObject __result; 56 | Rcpp::RNGScope __rngScope; 57 | Rcpp::traits::input_parameter< const List& >::type x(xSEXP); 58 | Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP); 59 | Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP); 60 | Rcpp::traits::input_parameter< bool >::type drop(dropSEXP); 61 | __result = Rcpp::wrap(condense_sd(x, z, weight, drop)); 62 | return __result; 63 | END_RCPP 64 | } 65 | // condense_median 66 | List condense_median(const List& x, const NumericVector& z, const NumericVector& weight, bool drop); 67 | RcppExport SEXP bigvis_condense_median(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) { 68 | BEGIN_RCPP 69 | Rcpp::RObject __result; 70 | Rcpp::RNGScope __rngScope; 71 | Rcpp::traits::input_parameter< const List& >::type x(xSEXP); 72 | Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP); 73 | Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP); 74 | Rcpp::traits::input_parameter< bool >::type drop(dropSEXP); 75 | __result = Rcpp::wrap(condense_median(x, z, weight, drop)); 76 | return __result; 77 | END_RCPP 78 | } 79 | // double_diff_sum 80 | std::vector double_diff_sum(IntegerVector bin, IntegerVector count); 81 | RcppExport SEXP bigvis_double_diff_sum(SEXP binSEXP, SEXP countSEXP) { 82 | BEGIN_RCPP 83 | Rcpp::RObject __result; 84 | Rcpp::RNGScope __rngScope; 85 | Rcpp::traits::input_parameter< IntegerVector >::type bin(binSEXP); 86 | Rcpp::traits::input_parameter< IntegerVector >::type count(countSEXP); 87 | __result = Rcpp::wrap(double_diff_sum(bin, count)); 88 | return __result; 89 | END_RCPP 90 | } 91 | // frange 92 | NumericVector frange(const NumericVector& x, const bool finite); 93 | RcppExport SEXP bigvis_frange(SEXP xSEXP, SEXP finiteSEXP) { 94 | BEGIN_RCPP 95 | Rcpp::RObject __result; 96 | Rcpp::RNGScope __rngScope; 97 | Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP); 98 | Rcpp::traits::input_parameter< const bool >::type finite(finiteSEXP); 99 | __result = Rcpp::wrap(frange(x, finite)); 100 | return __result; 101 | END_RCPP 102 | } 103 | // group_fixed 104 | IntegerVector group_fixed(const NumericVector& x, double width, double origin); 105 | RcppExport SEXP bigvis_group_fixed(SEXP xSEXP, SEXP widthSEXP, SEXP originSEXP) { 106 | BEGIN_RCPP 107 | Rcpp::RObject __result; 108 | Rcpp::RNGScope __rngScope; 109 | Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP); 110 | Rcpp::traits::input_parameter< double >::type width(widthSEXP); 111 | Rcpp::traits::input_parameter< double >::type origin(originSEXP); 112 | __result = Rcpp::wrap(group_fixed(x, width, origin)); 113 | return __result; 114 | END_RCPP 115 | } 116 | // group_rect 117 | IntegerVector group_rect(const NumericVector& x, const NumericVector& y, double x_width, double y_width, double x_origin, double y_origin); 118 | RcppExport SEXP bigvis_group_rect(SEXP xSEXP, SEXP ySEXP, SEXP x_widthSEXP, SEXP y_widthSEXP, SEXP x_originSEXP, SEXP y_originSEXP) { 119 | BEGIN_RCPP 120 | Rcpp::RObject __result; 121 | Rcpp::RNGScope __rngScope; 122 | Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP); 123 | Rcpp::traits::input_parameter< const NumericVector& >::type y(ySEXP); 124 | Rcpp::traits::input_parameter< double >::type x_width(x_widthSEXP); 125 | Rcpp::traits::input_parameter< double >::type y_width(y_widthSEXP); 126 | Rcpp::traits::input_parameter< double >::type x_origin(x_originSEXP); 127 | Rcpp::traits::input_parameter< double >::type y_origin(y_originSEXP); 128 | __result = Rcpp::wrap(group_rect(x, y, x_width, y_width, x_origin, y_origin)); 129 | return __result; 130 | END_RCPP 131 | } 132 | // group_hex 133 | IntegerVector group_hex(const NumericVector& x, const NumericVector& y, double x_width, double y_width, double x_origin, double y_origin, double x_max); 134 | RcppExport SEXP bigvis_group_hex(SEXP xSEXP, SEXP ySEXP, SEXP x_widthSEXP, SEXP y_widthSEXP, SEXP x_originSEXP, SEXP y_originSEXP, SEXP x_maxSEXP) { 135 | BEGIN_RCPP 136 | Rcpp::RObject __result; 137 | Rcpp::RNGScope __rngScope; 138 | Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP); 139 | Rcpp::traits::input_parameter< const NumericVector& >::type y(ySEXP); 140 | Rcpp::traits::input_parameter< double >::type x_width(x_widthSEXP); 141 | Rcpp::traits::input_parameter< double >::type y_width(y_widthSEXP); 142 | Rcpp::traits::input_parameter< double >::type x_origin(x_originSEXP); 143 | Rcpp::traits::input_parameter< double >::type y_origin(y_originSEXP); 144 | Rcpp::traits::input_parameter< double >::type x_max(x_maxSEXP); 145 | __result = Rcpp::wrap(group_hex(x, y, x_width, y_width, x_origin, y_origin, x_max)); 146 | return __result; 147 | END_RCPP 148 | } 149 | // lowerBound 150 | IntegerVector lowerBound(const NumericVector& x, const NumericVector& breaks); 151 | RcppExport SEXP bigvis_lowerBound(SEXP xSEXP, SEXP breaksSEXP) { 152 | BEGIN_RCPP 153 | Rcpp::RObject __result; 154 | Rcpp::RNGScope __rngScope; 155 | Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP); 156 | Rcpp::traits::input_parameter< const NumericVector& >::type breaks(breaksSEXP); 157 | __result = Rcpp::wrap(lowerBound(x, breaks)); 158 | return __result; 159 | END_RCPP 160 | } 161 | // smooth_nd_1 162 | NumericVector smooth_nd_1(const NumericMatrix& grid_in, const NumericVector& z_in, const NumericVector& w_in_, const NumericMatrix& grid_out, const int var, const double h, const std::string type); 163 | RcppExport SEXP bigvis_smooth_nd_1(SEXP grid_inSEXP, SEXP z_inSEXP, SEXP w_in_SEXP, SEXP grid_outSEXP, SEXP varSEXP, SEXP hSEXP, SEXP typeSEXP) { 164 | BEGIN_RCPP 165 | Rcpp::RObject __result; 166 | Rcpp::RNGScope __rngScope; 167 | Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_in(grid_inSEXP); 168 | Rcpp::traits::input_parameter< const NumericVector& >::type z_in(z_inSEXP); 169 | Rcpp::traits::input_parameter< const NumericVector& >::type w_in_(w_in_SEXP); 170 | Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_out(grid_outSEXP); 171 | Rcpp::traits::input_parameter< const int >::type var(varSEXP); 172 | Rcpp::traits::input_parameter< const double >::type h(hSEXP); 173 | Rcpp::traits::input_parameter< const std::string >::type type(typeSEXP); 174 | __result = Rcpp::wrap(smooth_nd_1(grid_in, z_in, w_in_, grid_out, var, h, type)); 175 | return __result; 176 | END_RCPP 177 | } 178 | // smooth_nd 179 | NumericVector smooth_nd(const NumericMatrix& grid_in, const NumericVector& z_in, const NumericVector& w_in_, const NumericMatrix& grid_out, const NumericVector h); 180 | RcppExport SEXP bigvis_smooth_nd(SEXP grid_inSEXP, SEXP z_inSEXP, SEXP w_in_SEXP, SEXP grid_outSEXP, SEXP hSEXP) { 181 | BEGIN_RCPP 182 | Rcpp::RObject __result; 183 | Rcpp::RNGScope __rngScope; 184 | Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_in(grid_inSEXP); 185 | Rcpp::traits::input_parameter< const NumericVector& >::type z_in(z_inSEXP); 186 | Rcpp::traits::input_parameter< const NumericVector& >::type w_in_(w_in_SEXP); 187 | Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_out(grid_outSEXP); 188 | Rcpp::traits::input_parameter< const NumericVector >::type h(hSEXP); 189 | __result = Rcpp::wrap(smooth_nd(grid_in, z_in, w_in_, grid_out, h)); 190 | return __result; 191 | END_RCPP 192 | } 193 | // bisquare 194 | double bisquare(double u, double b); 195 | RcppExport SEXP bigvis_bisquare(SEXP uSEXP, SEXP bSEXP) { 196 | BEGIN_RCPP 197 | Rcpp::RObject __result; 198 | Rcpp::RNGScope __rngScope; 199 | Rcpp::traits::input_parameter< double >::type u(uSEXP); 200 | Rcpp::traits::input_parameter< double >::type b(bSEXP); 201 | __result = Rcpp::wrap(bisquare(u, b)); 202 | return __result; 203 | END_RCPP 204 | } 205 | // regress 206 | NumericVector regress(const std::vector& x, const std::vector& y, const std::vector& w); 207 | RcppExport SEXP bigvis_regress(SEXP xSEXP, SEXP ySEXP, SEXP wSEXP) { 208 | BEGIN_RCPP 209 | Rcpp::RObject __result; 210 | Rcpp::RNGScope __rngScope; 211 | Rcpp::traits::input_parameter< const std::vector& >::type x(xSEXP); 212 | Rcpp::traits::input_parameter< const std::vector& >::type y(ySEXP); 213 | Rcpp::traits::input_parameter< const std::vector& >::type w(wSEXP); 214 | __result = Rcpp::wrap(regress(x, y, w)); 215 | return __result; 216 | END_RCPP 217 | } 218 | // median 219 | double median(const std::vector& x); 220 | RcppExport SEXP bigvis_median(SEXP xSEXP) { 221 | BEGIN_RCPP 222 | Rcpp::RObject __result; 223 | Rcpp::RNGScope __rngScope; 224 | Rcpp::traits::input_parameter< const std::vector& >::type x(xSEXP); 225 | __result = Rcpp::wrap(median(x)); 226 | return __result; 227 | END_RCPP 228 | } 229 | // regress_robust 230 | NumericVector regress_robust(const std::vector& x, const std::vector& y, const std::vector& w, int iterations); 231 | RcppExport SEXP bigvis_regress_robust(SEXP xSEXP, SEXP ySEXP, SEXP wSEXP, SEXP iterationsSEXP) { 232 | BEGIN_RCPP 233 | Rcpp::RObject __result; 234 | Rcpp::RNGScope __rngScope; 235 | Rcpp::traits::input_parameter< const std::vector& >::type x(xSEXP); 236 | Rcpp::traits::input_parameter< const std::vector& >::type y(ySEXP); 237 | Rcpp::traits::input_parameter< const std::vector& >::type w(wSEXP); 238 | Rcpp::traits::input_parameter< int >::type iterations(iterationsSEXP); 239 | __result = Rcpp::wrap(regress_robust(x, y, w, iterations)); 240 | return __result; 241 | END_RCPP 242 | } 243 | // compute_moments 244 | NumericVector compute_moments(const NumericVector& x); 245 | RcppExport SEXP bigvis_compute_moments(SEXP xSEXP) { 246 | BEGIN_RCPP 247 | Rcpp::RObject __result; 248 | Rcpp::RNGScope __rngScope; 249 | Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP); 250 | __result = Rcpp::wrap(compute_moments(x)); 251 | return __result; 252 | END_RCPP 253 | } 254 | // compute_sum 255 | NumericVector compute_sum(const NumericVector& x); 256 | RcppExport SEXP bigvis_compute_sum(SEXP xSEXP) { 257 | BEGIN_RCPP 258 | Rcpp::RObject __result; 259 | Rcpp::RNGScope __rngScope; 260 | Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP); 261 | __result = Rcpp::wrap(compute_sum(x)); 262 | return __result; 263 | END_RCPP 264 | } 265 | // compute_median 266 | NumericVector compute_median(const NumericVector& x); 267 | RcppExport SEXP bigvis_compute_median(SEXP xSEXP) { 268 | BEGIN_RCPP 269 | Rcpp::RObject __result; 270 | Rcpp::RNGScope __rngScope; 271 | Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP); 272 | __result = Rcpp::wrap(compute_median(x)); 273 | return __result; 274 | END_RCPP 275 | } 276 | --------------------------------------------------------------------------------