├── src
    ├── .gitignore
    ├── Makevars
    ├── Summary2d.cpp
    ├── BinnedVector.cpp
    ├── lowerBound.cpp
    ├── stats.h
    ├── double-diff-sum.cpp
    ├── summary.cpp
    ├── BigVis.cpp
    ├── condense-gen.r
    ├── Summary2d.h
    ├── BinnedVectors.cpp
    ├── group-hex.h
    ├── frange.cpp
    ├── group.cpp
    ├── summary.h
    ├── stats.cpp
    ├── group.h
    ├── smooth-nd.cpp
    ├── condense.cpp
    └── RcppExports.cpp
├── .gitignore
├── data
    └── movies.rdata
├── .Rbuildignore
├── .travis.yml
├── inst
    ├── tests
    │   ├── test-ranged.r
    │   ├── test-group-2d.r
    │   ├── test-frange.r
    │   ├── test-breaks.r
    │   ├── test-group-1d.r
    │   ├── test-origin.r
    │   ├── test-summary-moments.r
    │   ├── test-weighted-stats.r
    │   ├── test-stat.r
    │   ├── test-smooth.r
    │   ├── test-binned-vectors.r
    │   └── test-condense.r
    └── include
    │   └── bigvis.h
├── man
    ├── is.ranged.Rd
    ├── bigvis.Rd
    ├── dgrid.Rd
    ├── round_any.condensed.Rd
    ├── find_width.Rd
    ├── weighted.median.Rd
    ├── breaks.Rd
    ├── weighted.IQR.Rd
    ├── find_origin.Rd
    ├── h_grid.Rd
    ├── frange.Rd
    ├── weighted.quantile.Rd
    ├── dchallenge.Rd
    ├── condense.Rd
    ├── bin.Rd
    ├── mt.Rd
    ├── weighted.ecdf.Rd
    ├── autoplot.condensed.Rd
    ├── transform.condensed.Rd
    ├── ranged.Rd
    ├── weighted.var.Rd
    ├── condensed.Rd
    ├── movies.Rd
    ├── standardise.Rd
    ├── peel.Rd
    ├── rmse_cvs.Rd
    ├── smooth.Rd
    └── best_h.Rd
├── bigvis.Rproj
├── R
    ├── adjust.r
    ├── utils.r
    ├── bigvis.r
    ├── breaks.r
    ├── width.r
    ├── origin.r
    ├── movies.r
    ├── dgrid.r
    ├── standardise.r
    ├── mt.r
    ├── id.r
    ├── bin.r
    ├── condense.r
    ├── rmse.r
    ├── challenge.r
    ├── rebin.r
    ├── ranged.r
    ├── peel.r
    ├── smooth.r
    ├── condensed.r
    ├── RcppExports.R
    ├── h.r
    ├── autoplot.r
    └── weighted-stats.r
├── DESCRIPTION
├── bench
    ├── kernel.cpp
    ├── group-tempvar.cpp
    ├── count.cpp
    ├── median.cpp
    ├── mean.cpp
    ├── bin-structure.cpp
    ├── bin.cpp
    └── smooth-1d.cpp
├── NAMESPACE
├── notes.md
└── README.md


/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.so


--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
1 | PKG_CPPFLAGS=-I../inst/include
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | src/*.o
5 | src/*.so
6 | src/*.dll
7 | 


--------------------------------------------------------------------------------
/data/movies.rdata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataXujing/bigvis/master/data/movies.rdata


--------------------------------------------------------------------------------
/src/Summary2d.cpp:
--------------------------------------------------------------------------------
1 | #include <Rcpp.h>
2 | #include "Summary2d.h"
3 | using namespace Rcpp;
4 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | bench
2 | notes.md
3 | ^.*\.Rproj$
4 | ^\.Rproj\.user$
5 | ^\.travis\.yml$
6 | ^src/condense-gen\.r$
7 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # Sample .travis.yml for R projects
 2 | 
 3 | language: r
 4 | warnings_are_errors: true
 5 | sudo: required
 6 | 
 7 | r_github_packages:
 8 |   - jimhester/covr
 9 | after_success:
10 |   - Rscript -e 'covr::codecov()'
11 | 


--------------------------------------------------------------------------------
/inst/tests/test-ranged.r:
--------------------------------------------------------------------------------
 1 | context("Ranged")
 2 | 
 3 | test_that("range attribute lost when modified", {
 4 |   x <- ranged(10:1)
 5 |   expect_equal(max(x), 10)
 6 | 
 7 | 
 8 |   x[1] <- 1
 9 |   expect_equal(max(x), 9)
10 |   expect_equal(attr(x, "range"), NULL)
11 | })
12 | 


--------------------------------------------------------------------------------
/src/BinnedVector.cpp:
--------------------------------------------------------------------------------
 1 | #include <bigvis.h>
 2 | using namespace Rcpp;
 3 | 
 4 | NumericVector frange(const NumericVector& x, const bool finite = true);
 5 | 
 6 | int BinnedVector::nbins() const {
 7 |   double max = frange(x_)[1];
 8 |   return bin(max) + 1; 
 9 |   // +1 bin for missing values
10 | }
11 | 


--------------------------------------------------------------------------------
/inst/tests/test-group-2d.r:
--------------------------------------------------------------------------------
 1 | context("Grouping: 2d")
 2 | 
 3 | test_that("Two NAs gets bin 0", {
 4 |   expect_equal(group_rect(NA, NA, 1, 1, 0, 0), 0)
 5 | })
 6 | 
 7 | test_that("Sequential locations get sequential groups", {
 8 |   grid <- expand.grid(x = c(NA, 1:2), y = c(NA, 1:2))
 9 |   expect_equal(group_rect(grid$x, grid$y, 1, 1, 0.5, 0.5), 0:8)
10 | })
11 | 


--------------------------------------------------------------------------------
/man/is.ranged.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/ranged.r
 3 | \name{is.ranged}
 4 | \alias{is.ranged}
 5 | \title{Test if an object is of class ranged.}
 6 | \usage{
 7 | is.ranged(x)
 8 | }
 9 | \arguments{
10 | \item{x}{object to test}
11 | }
12 | \description{
13 | Test if an object is of class ranged.
14 | }
15 | \keyword{internal}
16 | 
17 | 


--------------------------------------------------------------------------------
/bigvis.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | 
17 | BuildType: Package
18 | PackageUseDevtools: Yes
19 | PackageInstallArgs: --no-multiarch --with-keep.source
20 | PackageRoxygenize: rd,collate,namespace
21 | 


--------------------------------------------------------------------------------
/inst/tests/test-frange.r:
--------------------------------------------------------------------------------
 1 | context("frange")
 2 | 
 3 | test_that("frange agrees with range", {
 4 |   x <- rnorm(1e4)
 5 |   expect_equal(frange(x), range(x))
 6 | })
 7 | 
 8 | test_that("frange uses cache if present", {
 9 |   x <- rnorm(1e4)
10 |   attr(x, "range") <- c(0, 10)
11 |   expect_equal(frange(x), c(0, 10))
12 | })
13 | 
14 | test_that("frange ignores NA and infinities by default", {
15 |   x <- c(1, NA, Inf, -Inf)
16 |   expect_equal(frange(x), c(1, 1))
17 | })
18 | 


--------------------------------------------------------------------------------
/R/adjust.r:
--------------------------------------------------------------------------------
 1 | # Protect against floating point areas by slightly adjusting breaks.
 2 | # Adapted from graphics::hist.default.
 3 | adjust_breaks <- function(breaks, open = "right") {
 4 |   open <- match.arg(open, c("left", "right"))
 5 | 
 6 |   breaks <- sort(breaks)
 7 |   diddle <- 1e-07 * median(diff(breaks))
 8 |   if (open == "left") {
 9 |     fuzz <- c(-diddle, rep.int(diddle, length(breaks) - 1))
10 |   } else {
11 |     fuzz <- c(rep.int(-diddle, length(breaks) - 1), diddle)
12 |   }
13 |   breaks + fuzz
14 | }
15 | 


--------------------------------------------------------------------------------
/inst/tests/test-breaks.r:
--------------------------------------------------------------------------------
 1 | context("Breaks")
 2 | 
 3 | last <- function(x) x[length(x)]
 4 | 
 5 | test_that("breaks includes max value, only if on border", {
 6 |   expect_equal(last(breaks(10, origin = 0, binwidth = 1)), 10)
 7 |   expect_equal(last(breaks(9.99, origin = 0, binwidth = 1)), 9)
 8 | })
 9 | 
10 | test_that("breaks includes max value even when origin != 0", {
11 |   expect_equal(last(breaks(10.5, origin = 0.5, binwidth = 1)), 10.5)
12 |   expect_equal(last(breaks(10.49, origin = 0.5, binwidth = 1)), 9.5)
13 | })
14 | 


--------------------------------------------------------------------------------
/man/bigvis.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/bigvis.r
 3 | \docType{package}
 4 | \name{bigvis}
 5 | \alias{as.integer,Rcpp_BinnedVector-method}
 6 | \alias{bigvis}
 7 | \alias{bigvis-package}
 8 | \alias{show,Rcpp_BinnedVector-method}
 9 | \title{The big vis package.}
10 | \usage{
11 | \S4method{show}{Rcpp_BinnedVector}(object)
12 | 
13 | \S4method{as.integer}{Rcpp_BinnedVector}(x, ...)
14 | }
15 | \arguments{
16 | \item{x,object,...}{Generic args}
17 | }
18 | \description{
19 | The big vis package.
20 | }
21 | 
22 | 


--------------------------------------------------------------------------------
/R/utils.r:
--------------------------------------------------------------------------------
 1 | "%||%" <- function(x, y) if (is.null(x)) y else x
 2 | 
 3 | last <- function(x) x[length(x)]
 4 | 
 5 | "%contains%" <- function(df, var) {
 6 |   var %in% names(df)
 7 | }
 8 | 
 9 | find_fun <- function(name, env = globalenv()) {
10 |   if (is.function(name)) return(name)
11 | 
12 |   ns_env <- asNamespace("bigvis")
13 |   if (exists(name, ns_env, mode = "function")) {
14 |     return(get(name, ns_env))
15 |   }
16 | 
17 |   if (exists(name, env, mode = "function")) {
18 |     return(get(name, env))
19 |   }
20 | 
21 |   stop("Could not find function ", name, call. = FALSE)
22 | }
23 | 


--------------------------------------------------------------------------------
/src/lowerBound.cpp:
--------------------------------------------------------------------------------
 1 | #include <algorithm>
 2 | #include <Rcpp.h>
 3 | using namespace Rcpp;
 4 | 
 5 | // Quick and dirty implementation of lowerBound, the complement to R's
 6 | // findInterval
 7 | // [[Rcpp::export]]
 8 | IntegerVector lowerBound(const NumericVector& x, const NumericVector& breaks) {
 9 |   int n = x.size();
10 |   IntegerVector out(n);
11 | 
12 |   for (int i = 0; i < n; i++) {
13 |     NumericVector::const_iterator it =
14 |       std::lower_bound(breaks.begin(), breaks.end(), x[i]);
15 |     if (it == breaks.end()) --it;
16 |     out[i] = it - breaks.begin() + 1;
17 |   }
18 |   return out;
19 | }
20 | 


--------------------------------------------------------------------------------
/man/dgrid.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/dgrid.r
 3 | \name{dgrid}
 4 | \alias{dgrid}
 5 | \alias{is.dgrid}
 6 | \title{dgrid: an S3 class for data grids}
 7 | \usage{
 8 | dgrid(x, width, origin = 0, nbins = NULL)
 9 | 
10 | is.dgrid(x)
11 | }
12 | \arguments{
13 | \item{x}{a numeric vector to test or coerce.}
14 | 
15 | \item{width}{bin width}
16 | 
17 | \item{origin}{bin origins}
18 | 
19 | \item{nbins}{number of bins}
20 | }
21 | \description{
22 | dgrid: an S3 class for data grids
23 | }
24 | \examples{
25 | g <- dgrid(0:10 + 0.5, width = 1)
26 | range(g)
27 | as.integer(g)
28 | }
29 | 
30 | 


--------------------------------------------------------------------------------
/src/stats.h:
--------------------------------------------------------------------------------
 1 | struct Regression {
 2 |   double alpha, beta;
 3 | };
 4 | 
 5 | double bisquare(double u, double b);
 6 | 
 7 | Regression simpleLinearRegression(const std::vector<double>& x, 
 8 |                                   const std::vector<double>& y,
 9 |                                   const std::vector<double>& w);
10 | 
11 | Regression simpleRobustRegression(const std::vector<double>& x, 
12 |                        const std::vector<double>& y,
13 |                        const std::vector<double>& w,
14 |                        int iterations = 3);
15 | 
16 | double median(const std::vector<double>& x);
17 | double median(std::vector<double>* x);
18 | 


--------------------------------------------------------------------------------
/inst/tests/test-group-1d.r:
--------------------------------------------------------------------------------
 1 | context("Grouping: 1d")
 2 | 
 3 | group <- function(x, width, origin = NULL) {
 4 |   g <- bin(x, width, origin)
 5 |   vapply(seq_along(x) - 1, g$bin_i, integer(1))
 6 | }
 7 | 
 8 | test_that("NAs belong to group 0", {
 9 |   x <- NA_real_
10 |   expect_equal(group(x, 1, 0), 0L)
11 | })
12 | 
13 | test_that("Inf and -Inf belong to group 0", {
14 |   x <- c(-Inf, Inf)
15 |   expect_equal(group(x, 1, 0), c(0, 0))
16 | })
17 | 
18 | test_that("Out of range values belong to group 0", {
19 |   expect_equal(group(-10, 1, 0), 0)
20 | })
21 | 
22 | test_that("Positive integers unchanged if origin is 1", {
23 |   expect_equal(group(1:10, 1, 1), 1:10)
24 | })
25 | 


--------------------------------------------------------------------------------
/man/round_any.condensed.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/condensed.r
 3 | \name{round_any.condensed}
 4 | \alias{round_any.condensed}
 5 | \title{Round any method for condensed objects}
 6 | \usage{
 7 | round_any.condensed(x, accuracy, f = round)
 8 | }
 9 | \arguments{
10 | \item{x}{numeric or date-time (POSIXct) vector to round}
11 | 
12 | \item{accuracy}{number to round to; for POSIXct objects, a number of seconds}
13 | 
14 | \item{f}{rounding function: \code{\link{floor}}, \code{\link{ceiling}} or
15 | \code{\link{round}}}
16 | }
17 | \description{
18 | Round any method for condensed objects
19 | }
20 | \keyword{internal}
21 | 
22 | 


--------------------------------------------------------------------------------
/inst/tests/test-origin.r:
--------------------------------------------------------------------------------
 1 | context("Origin")
 2 | 
 3 | test_that("origins close to zero rounded to zero" ,{
 4 |   expect_equal(find_origin(c(0.01, 1000)), 0)
 5 |   expect_equal(find_origin(c(10, 1e6)), 0)
 6 | })
 7 | 
 8 | test_that("origins rounded down by binwidth", {
 9 |   expect_equal(find_origin(c(1, 10), 1), 1)
10 |   expect_equal(find_origin(c(1, 10), 2), 0)
11 | 
12 |   expect_equal(find_origin(c(5, 10), 2), 4)
13 |   expect_equal(find_origin(c(5, 10), 5), 5)
14 | })
15 | 
16 | test_that("integers have origin offset by 0.5", {
17 |   expect_equal(find_origin(c(1L, 10L), 1), 0.5)
18 | 
19 |   expect_equal(find_origin(c(5L, 10L), 2), 3.5)
20 |   expect_equal(find_origin(c(5L, 10L), 5), 4.5)
21 | })
22 | 


--------------------------------------------------------------------------------
/man/find_width.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/width.r
 3 | \name{find_width}
 4 | \alias{find_width}
 5 | \title{Compute a reasonable default binwidth.}
 6 | \usage{
 7 | find_width(x, nbins = 10000)
 8 | }
 9 | \arguments{
10 | \item{x}{a numeric vector. If a numeric vector of length one is supplied,
11 | it's assumed that}
12 | 
13 | \item{nbins}{desired number of bins (approximate)}
14 | }
15 | \description{
16 | Compute a reasonable default binwidth.
17 | }
18 | \examples{
19 | find_width(c(0, 5))
20 | find_width(c(0, 5.023432))
21 | find_width(c(0, 5.9))
22 | }
23 | \seealso{
24 | Other reasonable defaults: \code{\link{find_origin}}
25 | }
26 | \keyword{internal}
27 | 
28 | 


--------------------------------------------------------------------------------
/R/bigvis.r:
--------------------------------------------------------------------------------
 1 | #' The big vis package.
 2 | #'
 3 | #' @useDynLib bigvis
 4 | #' @docType package
 5 | #' @name bigvis
 6 | NULL
 7 | 
 8 | if (!exists("BigVis")) {
 9 |   BigVis <- Rcpp::Module("BigVis")
10 | }
11 | 
12 | 
13 | #' @param x,object,... Generic args
14 | #' @rdname bigvis
15 | #' @export
16 | setMethod("show", "Rcpp_BinnedVector", function(object) {
17 |   cat("Binned [", object$size(), "]. ",
18 |     "Width: ", object$width(), " Origin: ", object$origin(), "\n", sep = "")
19 | })
20 | 
21 | #' @rdname bigvis
22 | #' @export
23 | setMethod("as.integer", "Rcpp_BinnedVector", function(x, ...) {
24 |   vapply(seq_len(x$size()), x$bin_i, integer(1))
25 | })
26 | 
27 | 
28 | 
29 | # Silence R CMD check note
30 | #' @importFrom methods new
31 | #' @importFrom Rcpp compileAttributes cpp_object_initializer
32 | NULL
33 | 
34 | 


--------------------------------------------------------------------------------
/src/double-diff-sum.cpp:
--------------------------------------------------------------------------------
 1 | #include <Rcpp.h>
 2 | #include <iostream>
 3 | #include <algorithm>
 4 | using namespace Rcpp;
 5 | 
 6 | // Efficiently compute \sum \sum abs(x_i - x_j) for binned data
 7 | // 
 8 | // It's effectively equivalent to this R code on the ungrouped observations
 9 | // bin <- trunc(x / bw)
10 | // diffs <- abs(outer(bin, bin, "-"))
11 | // tabulate(diffs + 1)
12 | // 
13 | // [[Rcpp::export]]
14 | std::vector<int> double_diff_sum(IntegerVector bin, IntegerVector count) {
15 |   int n = bin.size();
16 |   std::vector<int> out;
17 | 
18 |   for (int i = 0; i < n; i++) {
19 |     for (int j = 0; j < n; j++) {
20 |       int pos = abs(bin[i] - bin[j]);
21 | 
22 |       if (pos + 1 > out.size()) {
23 |         out.resize(pos + 1);
24 |       }
25 |       out[pos] += count[i] * count[j];
26 |     }
27 |   }
28 | 
29 |   return out;
30 | }
31 | 


--------------------------------------------------------------------------------
/man/weighted.median.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/weighted-stats.r
 3 | \name{weighted.median}
 4 | \alias{weighted.median}
 5 | \title{Compute the median of weighted data.}
 6 | \usage{
 7 | weighted.median(x, w, na.rm = FALSE)
 8 | }
 9 | \arguments{
10 | \item{x}{numeric vector of observations}
11 | 
12 | \item{w}{integer vector of weights, representing the number of
13 | time each \code{x} was observed}
14 | 
15 | \item{na.rm}{If \code{TRUE} will automatically remove missing values
16 | in \code{x} or \code{w}.}
17 | }
18 | \description{
19 | Compute the median of weighted data.
20 | }
21 | \details{
22 | This is a simple wrapper around \code{\link{weighted.quantile}}
23 | }
24 | \examples{
25 | x <- runif(200)
26 | w <- rpois(200, 5) + 1
27 | 
28 | median(x)
29 | weighted.median(x, w)
30 | }
31 | 
32 | 


--------------------------------------------------------------------------------
/man/breaks.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/breaks.r
 3 | \name{breaks}
 4 | \alias{breaks}
 5 | \title{Compute breaks given origin and width.}
 6 | \usage{
 7 | breaks(x, binwidth, origin = min(x))
 8 | }
 9 | \arguments{
10 | \item{x}{numeric vector}
11 | 
12 | \item{binwidth}{bin width}
13 | 
14 | \item{origin}{bin origin}
15 | }
16 | \description{
17 | Breaks are right-open, left-closed [x, y), so if \code{max(x)} is an integer
18 | multiple of binwidth, then we need one more break. This function only returns
19 | the left-side of the breaks.
20 | }
21 | \details{
22 | The first break is special, because it always contains missing values.
23 | }
24 | \examples{
25 | breaks(10, origin = 0, binwidth = 1)
26 | breaks(9.9, origin = 0, binwidth = 1)
27 | 
28 | breaks(1:10, origin = 0, binwidth = 2)
29 | }
30 | \keyword{internal}
31 | 
32 | 


--------------------------------------------------------------------------------
/man/weighted.IQR.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/weighted-stats.r
 3 | \name{weighted.IQR}
 4 | \alias{weighted.IQR}
 5 | \title{Compute the interquartile range of weighted data.}
 6 | \usage{
 7 | weighted.IQR(x, w, na.rm = FALSE)
 8 | }
 9 | \arguments{
10 | \item{x}{numeric vector of observations}
11 | 
12 | \item{w}{integer vector of weights, representing the number of
13 | time each \code{x} was observed}
14 | 
15 | \item{na.rm}{If \code{TRUE} will automatically remove missing values
16 | in \code{x} or \code{w}.}
17 | }
18 | \description{
19 | Compute the interquartile range of weighted data.
20 | }
21 | \details{
22 | This is a simple wrapper around \code{\link{weighted.quantile}}
23 | }
24 | \examples{
25 | x <- sort(runif(200))
26 | w <- rpois(200, seq(1, 10, length = 200)) + 1
27 | 
28 | IQR(x)
29 | weighted.IQR(x, w)
30 | }
31 | 
32 | 


--------------------------------------------------------------------------------
/src/summary.cpp:
--------------------------------------------------------------------------------
 1 | #include <Rcpp.h>
 2 | #include "summary.h"
 3 | using namespace Rcpp;
 4 | 
 5 | template<typename Summary>
 6 | NumericVector summary_compute(const NumericVector& x, Summary summary) {
 7 |   int n = x.size();
 8 |   for(int i = 0; i < n; ++i) {
 9 |     summary.push(x[i], 1);
10 |   }
11 | 
12 |   int m = summary.size();
13 |   NumericVector out(m);
14 |   for(int i = 0; i < m; ++i) {
15 |     out[i] = summary.compute(i);
16 |   }
17 | 
18 |   return out;
19 | }
20 | 
21 | // [[Rcpp::export]]
22 | NumericVector compute_moments(const NumericVector& x) {
23 |   return summary_compute(x, SummaryMoments(2));
24 | }
25 | 
26 | // [[Rcpp::export]]
27 | NumericVector compute_sum(const NumericVector& x) {
28 |   return summary_compute(x, SummarySum(1));
29 | }
30 | 
31 | // [[Rcpp::export]]
32 | NumericVector compute_median(const NumericVector& x) {
33 |   return summary_compute(x, SummaryMedian());
34 | }
35 | 


--------------------------------------------------------------------------------
/R/breaks.r:
--------------------------------------------------------------------------------
 1 | #' Compute breaks given origin and width.
 2 | #'
 3 | #' Breaks are right-open, left-closed [x, y), so if \code{max(x)} is an integer
 4 | #' multiple of binwidth, then we need one more break. This function only returns
 5 | #' the left-side of the breaks.
 6 | #'
 7 | #' The first break is special, because it always contains missing values.
 8 | #'
 9 | #' @param x numeric vector
10 | #' @param origin bin origin
11 | #' @param binwidth bin width
12 | #' @export
13 | #' @keywords internal
14 | #' @examples
15 | #' breaks(10, origin = 0, binwidth = 1)
16 | #' breaks(9.9, origin = 0, binwidth = 1)
17 | #'
18 | #' breaks(1:10, origin = 0, binwidth = 2)
19 | breaks <- function(x, binwidth, origin = min(x)) {
20 |   if (!is.binned(x)) {
21 |     x <- bin(x, binwidth, origin)
22 |   }
23 | 
24 |   # -1 for NA bin, -1 since R is 1 indexed
25 |   nbins <- x$nbins() - 2
26 |   c(NA, x$origin() + seq.int(1, nbins) * x$width())
27 | }
28 | 


--------------------------------------------------------------------------------
/man/find_origin.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/origin.r
 3 | \name{find_origin}
 4 | \alias{find_origin}
 5 | \title{Find the origin.}
 6 | \usage{
 7 | find_origin(x, binwidth)
 8 | }
 9 | \arguments{
10 | \item{x}{numeric or integer vector}
11 | 
12 | \item{binwidth}{binwidth}
13 | }
14 | \description{
15 | Find the origin.
16 | }
17 | \details{
18 | This algorithm implements simple heuristics for determining the origin of
19 | a histogram when only the binwidth is specified. It:
20 | 
21 | \itemize{
22 |    \item rounds to zero, if relatively close
23 |    \item subtracts 0.5 offset, if an x is integer
24 |    \item ensures the origin is a multiple of the binwidth
25 | }
26 | }
27 | \examples{
28 | find_origin(1:10, 1)
29 | find_origin(1:10, 2)
30 | find_origin(c(1, 1e6), 1)
31 | }
32 | \seealso{
33 | Other reasonable defaults: \code{\link{find_width}}
34 | }
35 | \keyword{internal}
36 | 
37 | 


--------------------------------------------------------------------------------
/man/h_grid.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/h.r
 3 | \name{h_grid}
 4 | \alias{h_grid}
 5 | \title{Generate grid of plausible bandwidths for condensed summary.}
 6 | \usage{
 7 | h_grid(x, n = 50, max = 20)
 8 | }
 9 | \arguments{
10 | \item{x}{a condensed summary}
11 | 
12 | \item{n}{number of bandwidths to generate (in each dimension)}
13 | 
14 | \item{max}{maximum bandwidth to generate, as multiple of binwidth.}
15 | }
16 | \description{
17 | By default, the bandwidths start at the bin width, and then continue
18 | up 50 (\code{n}) steps until 20 (\code{max}) times the bin width.
19 | }
20 | \examples{
21 | x <- rchallenge(1e4)
22 | xsum <- condense(bin(x, 1 / 10))
23 | h_grid(xsum)
24 | 
25 | y <- runif(1e4)
26 | xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100))
27 | h_grid(xysum, n = 10)
28 | }
29 | \seealso{
30 | Other bandwidth estimation functions: \code{\link{best_h}};
31 |   \code{\link{rmse_cv}}, \code{\link{rmse_cvs}}
32 | }
33 | 
34 | 


--------------------------------------------------------------------------------
/inst/tests/test-summary-moments.r:
--------------------------------------------------------------------------------
 1 | context("Summary: moments")
 2 | 
 3 | count2 <- function(x) compute_moments(x)[1]
 4 | mean2 <- function(x) compute_moments(x)[2]
 5 | sd2 <- function(x) compute_moments(x)[3]
 6 | 
 7 | test_that("count agrees with length", {
 8 |   expect_equal(count2(1:10), 10)
 9 |   expect_equal(count2(5), 1)
10 |   expect_equal(count2(numeric()), 0)
11 | })
12 | 
13 | test_that("mean agree with base::mean", {
14 |   expect_equal(mean2(1:10), mean(1:10))
15 | 
16 |   x <- runif(1e6)
17 |   expect_equal(mean2(x), mean(x))
18 | })
19 | 
20 | test_that("missing values are ignored", {
21 |   x <- c(NA, 5, 5)
22 |   expect_equal(count2(x), 2)
23 |   expect_equal(mean2(x), 5)
24 | })
25 | 
26 | test_that("standard deviation agrees with sd", {
27 |   expect_equal(sd2(1:10), sd(1:10))
28 | 
29 |   x <- runif(1e6)
30 |   expect_equal(sd2(x), sd(x))
31 | })
32 | 
33 | test_that("summary statistics of zero length input are NaN", {
34 |   expect_equal(compute_moments(numeric()), c(0, NaN, NaN))
35 | })
36 | 


--------------------------------------------------------------------------------
/R/width.r:
--------------------------------------------------------------------------------
 1 | #' Compute a reasonable default binwidth.
 2 | #'
 3 | #' @param x a numeric vector. If a numeric vector of length one is supplied,
 4 | #'   it's assumed that
 5 | #' @param nbins desired number of bins (approximate)
 6 | #' @export
 7 | #' @keywords internal
 8 | #' @family reasonable defaults
 9 | #' @examples
10 | #' find_width(c(0, 5))
11 | #' find_width(c(0, 5.023432))
12 | #' find_width(c(0, 5.9))
13 | find_width <- function(x, nbins = 1e4) {
14 |   stopifnot(is.numeric(x))
15 |   stopifnot(is.numeric(nbins), length(nbins) == 1, nbins > 0)
16 | 
17 |   x <- diff(frange(x))
18 |   size <- x / nbins
19 | 
20 |   # divide into order of magnitude and multiplier
21 |   om <- 10 ^ ceiling(log10(size))
22 |   mult <- size / om
23 | 
24 |   # ensure number per unit is multiple of 1, 2, 3, 4, or 5
25 |   per_unit <- 1 / mult
26 |   rounders <- c(1, 2, 3, 4, 5)
27 |   poss <- round(per_unit / rounders) * rounders
28 |   poss <- poss[poss != 0]
29 |   width <- om / poss[which.min(abs(poss - per_unit))]
30 | 
31 |   structure(width, n = ceiling(x / width), per_unit = 1 / width)
32 | }
33 | 


--------------------------------------------------------------------------------
/inst/tests/test-weighted-stats.r:
--------------------------------------------------------------------------------
 1 | context("Weighted statistics")
 2 | 
 3 | test_that("weighted.var agrees with var when weights = 1", {
 4 |   samples <- replicate(20, runif(100), simplify = FALSE)
 5 | 
 6 |   var <- sapply(samples, var)
 7 |   wvar <- sapply(samples, weighted.var, w = rep(1, 100))
 8 | 
 9 |   expect_equal(wvar, var)
10 | })
11 | 
12 | test_that("weighted.var agrees with var on repeated vector", {
13 |   samples <- replicate(20, runif(100), simplify = FALSE)
14 |   w <- rep(1:2, 50)
15 |   samples_ex <- lapply(samples, rep, times = w)
16 | 
17 |   var <- sapply(samples_ex, var)
18 |   wvar <- sapply(samples, weighted.var, w = w)
19 | 
20 |   expect_equal(wvar, var)
21 | })
22 | 
23 | test_that("weighed.quantile agrees with quantile on repeated vector", {
24 |   samples <- replicate(20, runif(100), simplify = FALSE)
25 |   w <- rep(1:2, 50)
26 |   samples_ex <- lapply(samples, rep, times = w)
27 | 
28 |   quant <- sapply(samples_ex, quantile, probs = 0.325, names = FALSE)
29 |   wquant <- sapply(samples, weighted.quantile, w = w, probs = 0.325)
30 | 
31 |   expect_equal(quant, wquant)
32 | })
33 | 


--------------------------------------------------------------------------------
/src/BigVis.cpp:
--------------------------------------------------------------------------------
 1 | #include <bigvis.h>
 2 | using namespace Rcpp;
 3 | 
 4 | RCPP_MODULE(BigVis) {
 5 |   class_<BinnedVectorReference>("BinnedVector")
 6 |     .constructor<NumericVector, String, double, double>()
 7 |     .const_method("bin_i", &BinnedVectorReference::bin_i)
 8 |     .const_method("bin", &BinnedVectorReference::bin)
 9 |     .const_method("unbin", &BinnedVectorReference::unbin)
10 |     .const_method("nbins", &BinnedVectorReference::nbins)
11 |     .const_method("size", &BinnedVectorReference::size)
12 |     .const_method("origin", &BinnedVectorReference::origin)
13 |     .const_method("width", &BinnedVectorReference::width)
14 |     .const_method("name", &BinnedVectorReference::name)
15 |   ;
16 |   class_<BinnedVectors>("BinnedVectors")
17 |     .constructor<List>()
18 |     .method("add_vector", &BinnedVectors::add_vector)
19 |     .field("bins", &BinnedVectors::bins_)
20 |     .const_method("bin_i", &BinnedVectors::bin_i)
21 |     .const_method("bin", &BinnedVectors::bin)
22 |     .const_method("unbin", &BinnedVectors::unbin)
23 |     .const_method("nbins", &BinnedVectors::nbins)
24 |   ;
25 | }
26 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: bigvis
 2 | Version: 0.1.0.9000
 3 | Title: Tools for visualisation of big data sets
 4 | Description: Tools for visualising large datasets.
 5 | Authors@R: c(
 6 |     person("Hadley", "Wickham", role = c("aut", "cre"), , "hadley@rstudio.com"),
 7 |     person("Yue", "Hue", role = "aut"),
 8 |     person("R Core team", role = "ctb", comment = "guess_bandwidth adapted from stats::bw.SJ")
 9 |     )
10 | Depends:
11 |     Rcpp
12 | Imports:
13 |     methods
14 | Suggests:
15 |     plyr,
16 |     ggplot2,
17 |     scales
18 | LazyData: true
19 | LinkingTo: 
20 |     Rcpp, 
21 |     BH
22 | License: GPL (>= 2)
23 | Collate:
24 |     'standardise.r'
25 |     'movies.r'
26 |     'RcppExports.R'
27 |     'adjust.r'
28 |     'ranged.r'
29 |     'bigvis.r'
30 |     'rebin.r'
31 |     'autoplot.r'
32 |     'origin.r'
33 |     'utils.r'
34 |     'breaks.r'
35 |     'weighted-stats.r'
36 |     'condense.r'
37 |     'condensed.r'
38 |     'bin.r'
39 |     'smooth.r'
40 |     'challenge.r'
41 |     'peel.r'
42 |     'id.r'
43 |     'rmse.r'
44 |     'width.r'
45 |     'h.r'
46 |     'mt.r'
47 |     'dgrid.r'
48 | 


--------------------------------------------------------------------------------
/inst/tests/test-stat.r:
--------------------------------------------------------------------------------
 1 | context("Stats")
 2 | 
 3 | test_that("linear regression recovers slope & intercept if no errors", {
 4 |   x <- 1:10
 5 |   w <- rep(1, 10)
 6 | 
 7 |   expect_equal(regress(x, x * 2, w), c(0, 2))
 8 |   expect_equal(regress(x, x * -2, w), c(0, -2))
 9 |   expect_equal(regress(x, x * -2 + 5, w), c(5, -2))
10 |   expect_equal(regress(x, x * -2 + -5, w), c(-5, -2))
11 | })
12 | 
13 | simpleLm <- function(x, y, w) {
14 |   unname(coef(lm(y ~ x, weights = w)))
15 | }
16 | 
17 | test_that("linear regression matches lm", {
18 |   x <- 1:10
19 |   y <- 10 + x * 2 + rnorm(10)
20 |   w <- rep(1, 10)
21 | 
22 |   expect_equal(regress(x, y, w), simpleLm(x, y, w))
23 | })
24 | 
25 | test_that("linear regression matches lm with weights", {
26 |   x <- 1:10
27 |   y <- 10 + x * 2 + rnorm(10)
28 |   w <- runif(10)
29 | 
30 |   expect_equal(regress(x, y, w), simpleLm(x, y, w))
31 | })
32 | 
33 | test_that("robust regression effectively removes outlier", {
34 |   x <- 1:10
35 |   y <- 10 + x * 2 + c(rep(0, 9), 10)
36 |   w <- rep(1, 10)
37 | 
38 |   expect_equal(regress_robust(x, y, w, 10), c(10, 2))
39 | })
40 | 
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/man/frange.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/RcppExports.R
 3 | \name{frange}
 4 | \alias{frange}
 5 | \title{Efficient implementation of range.}
 6 | \usage{
 7 | frange(x, finite = TRUE)
 8 | }
 9 | \arguments{
10 | \item{x}{a numeric vector, or a \code{\link{ranged}} object}
11 | 
12 | \item{finite}{If \code{TRUE} ignores missing values and infinities. Note
13 | that if the vector is empty, or only contains missing values,
14 | \code{frange} will return \code{c(Inf, -Inf)} because those are the
15 | identity values for \code{\link{min}} and \code{\link{max}} respectively.}
16 | }
17 | \description{
18 | This is an efficient C++ implementation of range for numeric vectors:
19 | it avoids S3 dispatch, and computes both min and max in a single pass
20 | through the input.
21 | }
22 | \details{
23 | If \code{x} has a \code{range} attribute (e.g. it's a \code{\link{ranged}}
24 | object), it will be used instead of computing the range from scratch.
25 | }
26 | \examples{
27 | x <- runif(1e6)
28 | system.time(range(x))
29 | system.time(frange(x))
30 | 
31 | rx <- ranged(x)
32 | system.time(frange(rx))
33 | }
34 | 
35 | 


--------------------------------------------------------------------------------
/man/weighted.quantile.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/weighted-stats.r
 3 | \name{weighted.quantile}
 4 | \alias{weighted.quantile}
 5 | \title{Compute quantiles of weighted data.}
 6 | \usage{
 7 | weighted.quantile(x, w, probs = seq(0, 1, 0.25), na.rm = FALSE)
 8 | }
 9 | \arguments{
10 | \item{x}{numeric vector of observations}
11 | 
12 | \item{w}{integer vector of weights, representing the number of
13 | time each \code{x} was observed}
14 | 
15 | \item{probs}{numeric vector of probabilities between 0 and 1}
16 | 
17 | \item{na.rm}{If \code{TRUE} will automatically remove missing values
18 | in \code{x} or \code{w}.}
19 | }
20 | \description{
21 | Compute quantiles of weighted data.
22 | }
23 | \details{
24 | Currently only implements the type 7 algorithm, as described in
25 | \code{\link{quantile}}. Based on \code{\link{quantile}} written by R-core.
26 | }
27 | \examples{
28 | x <- runif(200)
29 | w <- rpois(200, 5) + 1
30 | weighted.quantile(x, w)
31 | }
32 | \seealso{
33 | Other weighted statistics: \code{\link{weighted.ecdf}};
34 |   \code{\link{weighted.sd}}, \code{\link{weighted.var}}
35 | }
36 | 
37 | 


--------------------------------------------------------------------------------
/man/dchallenge.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/challenge.r
 3 | \name{dchallenge}
 4 | \alias{dchallenge}
 5 | \alias{rchallenge}
 6 | \title{Density and random number generation functions for a challenging
 7 | distribution.}
 8 | \usage{
 9 | dchallenge(x)
10 | 
11 | rchallenge(n)
12 | }
13 | \arguments{
14 | \item{x}{values to evaluate pdf at}
15 | 
16 | \item{n}{number of random samples to generate}
17 | }
18 | \description{
19 | This is a 1/3-2/3 mixture of a t-distribution with 2 degrees of freedom
20 | centered at 15 and scaled by 2, and a gamma distribution with shape 2
21 | and rate 1/3. (The t-distribution is windsorised at 0, but this
22 | has negligible effect.) This distribution is challenging because it
23 | mixes heavy tailed and asymmetric distributions.
24 | }
25 | \examples{
26 | plot(dchallenge, xlim = c(-5, 60), n = 500)
27 | 
28 | x <- rchallenge(1e4)
29 | hist(x, breaks = 1000)
30 | xsum <- condense(bin(x, 0.1))
31 | plot(xsum$x, xsum$.count, type = "l")
32 | xsmu <- smooth(xsum, 0.3)
33 | plot(xsmu$x, xsmu$.count, type = "l")
34 | plot(xsmu$x, xsmu$.count, type = "l", xlim = c(0, 30))
35 | }
36 | 
37 | 


--------------------------------------------------------------------------------
/R/origin.r:
--------------------------------------------------------------------------------
 1 | #' Find the origin.
 2 | #'
 3 | #' @details
 4 | #' This algorithm implements simple heuristics for determining the origin of
 5 | #' a histogram when only the binwidth is specified. It:
 6 | #'
 7 | #' \itemize{
 8 | #'    \item rounds to zero, if relatively close
 9 | #'    \item subtracts 0.5 offset, if an x is integer
10 | #'    \item ensures the origin is a multiple of the binwidth
11 | #' }
12 | #' @param x numeric or integer vector
13 | #' @param binwidth binwidth
14 | #' @export
15 | #' @keywords internal
16 | #' @family reasonable defaults
17 | #' @examples
18 | #' find_origin(1:10, 1)
19 | #' find_origin(1:10, 2)
20 | #' find_origin(c(1, 1e6), 1)
21 | find_origin <- function(x, binwidth) {
22 |   rng <- frange(x, finite = TRUE)
23 |   if (!all(is.finite(rng))) stop("No valid values in x", call. = FALSE)
24 | 
25 |   offset <- is.integer(x) * 0.5
26 | 
27 |   if (close_to_zero(rng[1], rng)) {
28 |     0 - offset
29 |   } else {
30 |     floor_any(rng[1], binwidth) - offset
31 |   }
32 | }
33 | 
34 | close_to_zero <- function(x, rng) {
35 |   (abs(x) / abs(rng[2] - rng[1])) < 1e-3
36 | }
37 | 
38 | floor_any <- function(x, accuracy) {
39 |   floor(x / accuracy) * accuracy
40 | }
41 | 


--------------------------------------------------------------------------------
/man/condense.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/condense.r
 3 | \name{condense}
 4 | \alias{condense}
 5 | \title{Efficient binned summaries.}
 6 | \usage{
 7 | condense(..., z = NULL, summary = NULL, w = NULL, drop = NULL)
 8 | }
 9 | \arguments{
10 | \item{...}{group objects created by \code{\link{bin}}}
11 | 
12 | \item{z}{a numeric vector to summary for each group. Optional for some
13 | summary statistics.}
14 | 
15 | \item{summary}{the summary statistic to use. Currently must be one of
16 | count, sum, mean, median or sd. If \code{NULL}, defaults to mean if
17 | y is present, count if not.}
18 | 
19 | \item{w}{a vector of weights. Not currently supported by all summary
20 | functions.}
21 | 
22 | \item{drop}{if \code{TRUE} only locations with data will be returned.  This
23 | is more efficient if the data is very sparse (<1\% of cells filled), and
24 | is slightly less efficient. Defaults to \code{TRUE} if you are condensing
25 | over two or more dimensions, \code{FALSE} for 1d.}
26 | }
27 | \description{
28 | Efficient binned summaries.
29 | }
30 | \examples{
31 | x <- runif(1e5)
32 | gx <- bin(x, 0.1)
33 | condense(gx)
34 | }
35 | 
36 | 


--------------------------------------------------------------------------------
/man/bin.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/bin.r
 3 | \name{bin}
 4 | \alias{bin}
 5 | \title{Create a binned variable.}
 6 | \usage{
 7 | bin(x, width = find_width(x), origin = find_origin(x, width), name = NULL)
 8 | }
 9 | \arguments{
10 | \item{x}{numeric or integer vector}
11 | 
12 | \item{width}{bin width. If not specified, about 10,000 bins will be chosen
13 | using the algorithim in \code{\link{find_width}}.}
14 | 
15 | \item{origin}{origin. If not specified, guessed by \code{\link{find_origin}}.}
16 | 
17 | \item{name}{name of original variable. This will be guessed from the input to
18 | \code{group} if not supplied. Used in the output of
19 | \code{\link{condense}} etc.}
20 | }
21 | \description{
22 | Create a binned variable.
23 | }
24 | \details{
25 | This function produces an R reference class that wraps around a C++ function.
26 | Generally, you should just treat this as an opaque object with reference
27 | semantics, and you shouldn't call the methods on it - pass it to
28 | \code{\link{condense}} and friends.
29 | }
30 | \examples{
31 | x <- runif(1e6)
32 | bin(x)
33 | bin(x, 0.01)
34 | bin(x, 0.01, origin = 0.5)
35 | }
36 | 
37 | 


--------------------------------------------------------------------------------
/man/mt.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/mt.r
 3 | \name{mt}
 4 | \alias{inv_mt}
 5 | \alias{mt}
 6 | \alias{mt_trans}
 7 | \title{Modulus transformation (and its inverse).}
 8 | \usage{
 9 | mt(x, lambda)
10 | 
11 | inv_mt(x, lambda)
12 | 
13 | mt_trans(lambda)
14 | }
15 | \arguments{
16 | \item{x}{values to transform}
17 | 
18 | \item{lambda}{degree of transformation}
19 | }
20 | \description{
21 | A generalisation of the box-cox transformation that works for
22 | values with both positive and negative values.
23 | }
24 | \details{
25 | This is useful for compressing the tails of long-tailed distributions,
26 | often encountered with very large datasets.
27 | }
28 | \examples{
29 | x <- seq(-10, 10, length = 100)
30 | plot(x, mt(x, 0), type = "l")
31 | plot(x, mt(x, 0.25), type = "l")
32 | plot(x, mt(x, 0.5), type = "l")
33 | plot(x, mt(x, 1), type = "l")
34 | plot(x, mt(x, 2), type = "l")
35 | plot(x, mt(x, -1), type = "l")
36 | plot(x, mt(x, -2), type = "l")
37 | }
38 | \references{
39 | J. John and N. Draper. "An alternative family of
40 |  transformations." Applied Statistics, pages 190-197, 1980.
41 |  \url{http://www.jstor.org/stable/2986305}
42 | }
43 | 
44 | 


--------------------------------------------------------------------------------
/man/weighted.ecdf.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/weighted-stats.r
 3 | \name{weighted.ecdf}
 4 | \alias{weighted.ecdf}
 5 | \title{A weighted ecdf function.}
 6 | \usage{
 7 | weighted.ecdf(x, w)
 8 | }
 9 | \arguments{
10 | \item{x}{numeric vector of observations}
11 | 
12 | \item{w}{integer vector of weights, representing the number of
13 | time each \code{x} was observed}
14 | }
15 | \description{
16 | An extension of the base \code{\link[stats]{ecdf}} function which works
17 | with weighted data.
18 | }
19 | \section{S3 methods}{
20 | 
21 | The \code{ecdf} class has methods for \code{\link{plot}},
22 | \code{\link{lines}}, \code{\link{summary}} and \code{\link{quantile}}.
23 | \code{\link{quantile}} does not currently correctly compute values for
24 | weighted ecdfs.
25 | }
26 | \examples{
27 | x <- runif(200)
28 | w <- rpois(200, 5) + 1
29 | 
30 | e <- weighted.ecdf(x, w)
31 | plot(e)
32 | summary(e)
33 | 
34 | y <- x[rep(seq_along(x), w)]
35 | plot(ecdf(y))
36 | }
37 | \seealso{
38 | \code{\link[stats]{weighted.mean}}
39 | 
40 | Other weighted statistics: \code{\link{weighted.quantile}};
41 |   \code{\link{weighted.sd}}, \code{\link{weighted.var}}
42 | }
43 | 
44 | 


--------------------------------------------------------------------------------
/man/autoplot.condensed.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/autoplot.r
 3 | \name{autoplot.condensed}
 4 | \alias{autoplot.condensed}
 5 | \title{Autoplot condensed summaries.}
 6 | \usage{
 7 | \method{autoplot}{condensed}(x, var = last(summary_vars(x)), ...)
 8 | }
 9 | \arguments{
10 | \item{x}{a condensed summary}
11 | 
12 | \item{var}{which summary variable to display}
13 | 
14 | \item{...}{other arguments passed on to individual methods}
15 | }
16 | \description{
17 | Autoplot condensed summaries.
18 | }
19 | \examples{
20 | if (require("ggplot2")) {
21 | 
22 | # 1d summaries -----------------------------
23 | x <- rchallenge(1e4)
24 | z <- x + rt(length(x), df = 2)
25 | xsum <- condense(bin(x, 0.1))
26 | zsum <- condense(bin(x, 0.1), z = z)
27 | 
28 | autoplot(xsum)
29 | autoplot(peel(xsum))
30 | 
31 | autoplot(zsum)
32 | autoplot(peel(zsum, keep = 1))
33 | autoplot(peel(zsum))
34 | 
35 | # 2d summaries -----------------------------
36 | y <- runif(length(x))
37 | xysum <- condense(bin(x, 0.1), bin(y, 0.1))
38 | xyzsum <- condense(bin(x, 0.1), bin(y, 0.1), z = z)
39 | 
40 | autoplot(xysum)
41 | autoplot(peel(xysum))
42 | autoplot(xyzsum)
43 | autoplot(peel(xyzsum))
44 | }
45 | }
46 | 
47 | 


--------------------------------------------------------------------------------
/src/condense-gen.r:
--------------------------------------------------------------------------------
 1 | library(whisker)
 2 | 
 3 | # Generate template specialisations for groupwise - these are the functions
 4 | # that are called from R.
 5 | 
 6 | summaries <- c(
 7 |   count = "Sum(0)",
 8 |   sum = "Sum(1)",
 9 |   mean = "Moments(1)",
10 |   sd = "Moments(2)",
11 |   median = "Median()"
12 | )
13 | 
14 | template <- "
15 | // [[Rcpp::export]]
16 | List condense_{{name}}(const List& x, const NumericVector& z,
17 |                        const NumericVector& weight, bool drop = false) {
18 |   if (drop) {
19 |     return sparse_condense(BinnedVectors(x), z, weight, Summary{{summary}});
20 |   } else {
21 |     return condense(BinnedVectors(x), z, weight, Summary{{summary}});
22 |   }
23 | }
24 | "
25 | 
26 | cpp_fun <- function(summary) {
27 |   whisker.render(template, list(
28 |     name = tolower(summary),
29 |     summary = summaries[[summary]]
30 |   ))
31 | }
32 | 
33 | 
34 | groupwise <- readLines("condense.cpp")
35 | split <- which(grepl("// -{40,}", groupwise))[1]
36 | original <- groupwise[1:split]
37 | 
38 | writeLines(original, "condense.cpp")
39 | 
40 | cat("// Autogenerated by condense-gen.r\n", file = "condense.cpp", append = TRUE)
41 | funs <- unlist(lapply(names(summaries), cpp_fun))
42 | cat(funs, file = "condense.cpp", append = TRUE, sep = "")
43 | 


--------------------------------------------------------------------------------
/man/transform.condensed.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/rebin.r
 3 | \name{transform.condensed}
 4 | \alias{rebin}
 5 | \alias{transform.condensed}
 6 | \title{Transform condensed objects, collapsing unique bins.}
 7 | \usage{
 8 | \\method{transform}{condensed}(`_data`, ...)
 9 | 
10 | rebin(data)
11 | }
12 | \arguments{
13 | \item{...}{named arguments evaluated in the context of the data}
14 | 
15 | \item{data,`_data`}{a condensed summary}
16 | }
17 | \description{
18 | Transform condensed objects, collapsing unique bins.
19 | }
20 | \details{
21 | You don't need to use \code{rebin} if you use transform: it will
22 | automatically rebin for you.  You will need to use it if you manually
23 | transform any grouping variables.
24 | }
25 | \examples{
26 | x <- runif(1e4, -1, 1)
27 | xsum <- condense(bin(x, 1 / 50))
28 | 
29 | # Transforming by hand: must use rebin
30 | xsum$x <- abs(xsum$x)
31 | rebin(xsum)
32 | if (require("ggplot2")) {
33 |   autoplot(xsum) + geom_point()
34 |   autoplot(rebin(xsum)) + geom_point()
35 | }
36 | 
37 | #' Transforming with transform
38 | y <- x ^ 2 + runif(length(x), -0.1, 0.1)
39 | xysum <- condense(bin(x, 1 / 50), z = y)
40 | xysum <- transform(xysum, x = abs(x))
41 | if (require("ggplot2")) {
42 |   autoplot(xysum)
43 | }
44 | }
45 | \keyword{internal}
46 | 
47 | 


--------------------------------------------------------------------------------
/man/ranged.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/ranged.r
 3 | \name{ranged}
 4 | \alias{ranged}
 5 | \title{A S3 class for caching the range of a vector}
 6 | \usage{
 7 | ranged(x, range = frange(x, finite = TRUE))
 8 | }
 9 | \arguments{
10 | \item{x}{a numeric vector}
11 | 
12 | \item{range}{the range of the vector (excluding missing values), if known.
13 | If unknown, it will be computed with \code{\link{frange}}, a fast C++
14 | implementation of \code{\link{range}}.}
15 | }
16 | \description{
17 | This class is designed for dealing with large vectors, where the cost of
18 | recomputing the range multiple times is prohibitive. It provides methods
19 | for \code{\link{print}} and \code{\link{str}} that display only the range,
20 | not the contents.
21 | }
22 | \section{Performance}{
23 | 
24 | For best performance, you may want to run copy and paste the contents of
25 | this function into your function, to avoid making any copies of \code{x}.
26 | This is probably only necessary if you're dealing with extremely large
27 | vectors, > 100 million obs.
28 | }
29 | \examples{
30 | x <- runif(1e6)
31 | y <- ranged(x)
32 | range(y)
33 | y
34 | str(y)
35 | 
36 | # Modifications to the class currently destroy the cache
37 | y[1] <- 10
38 | max(y)
39 | class(y)
40 | z <- y + 10
41 | max(z)
42 | class(z)
43 | }
44 | 
45 | 


--------------------------------------------------------------------------------
/man/weighted.var.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/weighted-stats.r
 3 | \name{weighted.var}
 4 | \alias{weighted.sd}
 5 | \alias{weighted.var}
 6 | \title{Compute a weighted variance or standard deviation of a vector.}
 7 | \usage{
 8 | weighted.var(x, w = NULL, na.rm = FALSE)
 9 | 
10 | weighted.sd(x, w, na.rm = TRUE)
11 | }
12 | \arguments{
13 | \item{x}{numeric vector of observations}
14 | 
15 | \item{w}{integer vector of weights, representing the number of
16 | time each \code{x} was observed}
17 | 
18 | \item{na.rm}{if \code{TRUE}, missing values in both \code{w} and \code{x}
19 | will be removed prior computation. Otherwise if there are missing values
20 | the result will always be missing.}
21 | }
22 | \description{
23 | Compute a weighted variance or standard deviation of a vector.
24 | }
25 | \details{
26 | Note that unlike the base R \code{\link{var}} function, these functions only
27 | work with individual vectors not matrices or data frames.
28 | }
29 | \examples{
30 | x <- c(1:5)
31 | w <- rpois(5, 5) + 1
32 | y <- x[rep(seq_along(x), w)]
33 | weighted.var(x, w)
34 | var(y)
35 | 
36 | stopifnot(all.equal(weighted.var(x, w), var(y)))
37 | }
38 | \seealso{
39 | \code{\link[stats]{weighted.mean}}
40 | 
41 | Other weighted statistics: \code{\link{weighted.ecdf}};
42 |   \code{\link{weighted.quantile}}
43 | }
44 | 
45 | 


--------------------------------------------------------------------------------
/R/movies.r:
--------------------------------------------------------------------------------
 1 | #' Movie information and user ratings from IMDB.com.
 2 | #'
 3 | #' The internet movie database, \url{http://imdb.com/}, is a website devoted
 4 | #' to collecting movie data supplied by studios and fans.  It claims to be the
 5 | #' biggest movie database on the web and is run by amazon.  More about
 6 | #' information imdb.com can be found online, 
 7 | #' \url{http://imdb.com/help/show_leaf?about}, including information about 
 8 | #' the data collection process,
 9 | #' \url{http://imdb.com/help/show_leaf?infosource}.
10 | #' 
11 | #' Movies were selected for inclusion if they had a known length and had been rated by at least one imdb user.  The data set contains the following fields:
12 | #' 
13 | #' \itemize{
14 | #'   \item title.  Title of the movie.
15 | #'   \item year.  Year of release.
16 | #'   \item budget.  Total budget (if known) in US dollars
17 | #'   \item length.  Length in minutes.
18 | #'   \item rating.  Average IMDB user rating.
19 | #'   \item votes.  Number of IMDB users who rated this movie.
20 | #'   \item mpaa.  MPAA rating.
21 | #'   \item action, animation, comedy, drama, documentary, romance, short:
22 | #'     \code{TRUE} if movie belongs to that genre.
23 | #' }
24 | #' 
25 | #' @docType data
26 | #' @usage data(movies)
27 | #' @name movies
28 | #' @format A data frame with 130,456 rows and 14 variables
29 | #' @references \url{http://had.co.nz/data/movies/}
30 | NULL
31 | 


--------------------------------------------------------------------------------
/bench/kernel.cpp:
--------------------------------------------------------------------------------
 1 | // Differences in kernel performance
 2 | 
 3 | #include <Rcpp.h>
 4 | using namespace Rcpp;
 5 | 
 6 | // [[Rcpp::export]]
 7 | NumericVector normal_kernel(NumericVector x) {
 8 |   int n = x.size();
 9 |   NumericVector out(n);
10 | 
11 |   for (int i = 0; i < n; ++i) {
12 |     out[i] = R::dnorm(x[i], 0, 1, 0);
13 |   }
14 | 
15 |   return out;
16 | }
17 | 
18 | // [[Rcpp::export]]
19 | double tricube2(double x) {
20 |   x = fabs(x);
21 |   if (x > 1) return 0;
22 | 
23 |   return pow(1 - pow(x, 3), 3);
24 | }
25 | 
26 | // [[Rcpp::export]]
27 | double tricube(double x) {
28 |   x = fabs(x);
29 |   if (x > 1) return 0;
30 | 
31 |   double y = 1 - x * x * x;
32 |   return y * y * y;
33 | }
34 | 
35 | // [[Rcpp::export]]
36 | NumericVector tricube_kernel(NumericVector x) {
37 |   int n = x.size();
38 |   NumericVector out(n);
39 | 
40 |   for (int i = 0; i < n; ++i) {
41 |     out[i] = tricube(x[i]);
42 |   }
43 | 
44 |   return out;
45 | }
46 | 
47 | // [[Rcpp::export]]
48 | NumericVector copy(NumericVector x) {
49 |   int n = x.size();
50 |   NumericVector out(n);
51 | 
52 |   for (int i = 0; i < n; ++i) {
53 |     out[i] = x[i];
54 |   }
55 | 
56 |   return out;
57 | }
58 | 
59 | /*** R
60 | options(digits = 3)
61 | library(microbenchmark)
62 | 
63 | x <- runif(1e4)
64 | 
65 | mean(sapply(x, tricube) - sapply(x, tricube2))
66 | 
67 | microbenchmark(
68 |   copy(x),
69 |   tricube_kernel(x),
70 |   normal_kernel(x)
71 | )
72 | 
73 | */


--------------------------------------------------------------------------------
/man/condensed.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/condensed.r
 3 | \name{condensed}
 4 | \alias{as.condensed}
 5 | \alias{condensed}
 6 | \alias{is.condensed}
 7 | \title{Condensed: an S3 class for condensed summaries.}
 8 | \usage{
 9 | condensed(groups, grouped, summary)
10 | 
11 | is.condensed(x)
12 | 
13 | as.condensed(x)
14 | }
15 | \arguments{
16 | \item{groups}{list of \code{\link{bin}}ed objects}
17 | 
18 | \item{grouped,summary}{output from C++ condense function}
19 | 
20 | \item{x}{object to test or coerce}
21 | }
22 | \description{
23 | This object managed the properties of condensed (summarised) data frames.
24 | }
25 | \section{S3 methods}{
26 | 
27 | 
28 | Mathematical functions with methods for \code{binsum} object will modify
29 | the x column of the data frame and \code{\link{rebin}} the data, calculating
30 | updated summary statistics.
31 | 
32 | Currently methods are provided for the \code{Math} group generic,
33 | logical comparison and arithmetic operators, and
34 | \code{\link[plyr]{round_any}}.
35 | }
36 | \examples{
37 | if (require("ggplot2")) {
38 | 
39 | x <- rchallenge(1e4)
40 | xsum <- condense(bin(x, 1 / 10))
41 | 
42 | # Basic math operations just modify the first column
43 | autoplot(xsum)
44 | autoplot(xsum * 10)
45 | autoplot(xsum - 30)
46 | autoplot(abs(xsum - 30))
47 | 
48 | # Similarly, logical operations work on the first col
49 | autoplot(xsum[xsum > 10, ])
50 | }
51 | }
52 | \keyword{internal}
53 | 
54 | 


--------------------------------------------------------------------------------
/man/movies.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/movies.r
 3 | \docType{data}
 4 | \name{movies}
 5 | \alias{movies}
 6 | \title{Movie information and user ratings from IMDB.com.}
 7 | \format{A data frame with 130,456 rows and 14 variables}
 8 | \usage{
 9 | data(movies)
10 | }
11 | \description{
12 | The internet movie database, \url{http://imdb.com/}, is a website devoted
13 | to collecting movie data supplied by studios and fans.  It claims to be the
14 | biggest movie database on the web and is run by amazon.  More about
15 | information imdb.com can be found online,
16 | \url{http://imdb.com/help/show_leaf?about}, including information about
17 | the data collection process,
18 | \url{http://imdb.com/help/show_leaf?infosource}.
19 | }
20 | \details{
21 | Movies were selected for inclusion if they had a known length and had been rated by at least one imdb user.  The data set contains the following fields:
22 | 
23 | \itemize{
24 |   \item title.  Title of the movie.
25 |   \item year.  Year of release.
26 |   \item budget.  Total budget (if known) in US dollars
27 |   \item length.  Length in minutes.
28 |   \item rating.  Average IMDB user rating.
29 |   \item votes.  Number of IMDB users who rated this movie.
30 |   \item mpaa.  MPAA rating.
31 |   \item action, animation, comedy, drama, documentary, romance, short:
32 |     \code{TRUE} if movie belongs to that genre.
33 | }
34 | }
35 | \references{
36 | \url{http://had.co.nz/data/movies/}
37 | }
38 | 
39 | 


--------------------------------------------------------------------------------
/man/standardise.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/standardise.r
 3 | \name{standardise}
 4 | \alias{standardise}
 5 | \title{Standardise a summary to sum to one.}
 6 | \usage{
 7 | standardise(x, margin = integer())
 8 | }
 9 | \arguments{
10 | \item{x}{a condensed summary. Must have \code{.count} variable.}
11 | 
12 | \item{margin}{margins to standardise along.  If \code{NULL}, the default,
13 | standardises the whole array.}
14 | }
15 | \description{
16 | Standardise a summary to sum to one.
17 | }
18 | \examples{
19 | b1 <- condense(bin(movies$year, 1))
20 | d1 <- smooth(b1, 2, type = "reg")
21 | 
22 | if (require("ggplot2")) {
23 | 
24 | autoplot(b1)
25 | autoplot(d1)
26 | 
27 | # Note change in x-axis limits
28 | autoplot(standardise(d1))
29 | }
30 | 
31 | # Can also standardise a dimension at a time
32 | b2 <- with(movies, condense(bin(year, 2), bin(length, 10)))
33 | b2 <- peel(b2, central = TRUE)
34 | 
35 | if (require("ggplot2")) {
36 | 
37 | autoplot(b2)
38 | autoplot(standardise(b2))    # note legend
39 | autoplot(standardise(b2, "year"))   # each row sums to 1
40 | autoplot(standardise(b2, "length")) # each col sums to 1
41 | 
42 | base <- ggplot(b2, aes(length, .count)) +
43 |   geom_line(aes(group = year, colour = year))
44 | base
45 | base \%+\% standardise(b2)  # Just affects y axis labels
46 | base \%+\% standardise(b2, "year") # Makes year comparable
47 | base \%+\% standardise(b2, "length") # Meaningless for this display
48 | 
49 | }
50 | }
51 | 
52 | 


--------------------------------------------------------------------------------
/inst/tests/test-smooth.r:
--------------------------------------------------------------------------------
 1 | context("Smooth")
 2 | 
 3 | tricube <- function(x) {
 4 |   x <- abs(x)
 5 |   ifelse(x > 1, 0, (1 - x ^ 3) ^ 3)
 6 | }
 7 | # plot(tricube, xlim = c(-1.5, 1.5))
 8 | 
 9 | test_that("factorised smooth equal to manual smooth", {
10 |   grid <- as.matrix(expand.grid(x = 1:10, y = 1:10, KEEP.OUT.ATTRS = FALSE))
11 |   z <- rep(0, nrow(grid))
12 |   z[c(5, 23, 84)] <- 1
13 | 
14 |   z_x <- smooth_nd_1(grid, z, numeric(), grid, 0, 3)
15 |   z_y <- smooth_nd_1(grid, z, numeric(), grid, 1, 3)
16 | 
17 |   z_xy <- smooth_nd_1(grid, z_x, numeric(), grid, 1, 3)
18 |   z_yx <- smooth_nd_1(grid, z_y, numeric(), grid, 0, 3)
19 |   z2 <- smooth_nd(grid, z, numeric(), grid, c(3, 3))
20 | 
21 |   expect_equal(z_xy, z2)
22 |   expect_equal(z_yx, z2)
23 | })
24 | 
25 | # library(ggplot2)
26 | # qplot(grid[, 1], grid[, 2], fill = z, geom = "raster")
27 | # qplot(grid[, 1], grid[, 2], fill = z_xy, geom = "raster")
28 | # qplot(grid[, 1], grid[, 2], fill = z_yx, geom = "raster")
29 | # qplot(grid[, 1], grid[, 2], fill = z2, geom = "raster")
30 | 
31 | test_that("factorised smooth equal to manual smooth", {
32 |   grid <- as.matrix(expand.grid(x = 1:10, y = 1:10, KEEP.OUT.ATTRS = FALSE))
33 |   z <- rep(0, nrow(grid))
34 |   z[c(5, 23, 84)] <- 1
35 | 
36 |   grid <- as.data.frame(grid)
37 |   grid$.count <- z
38 |   class(grid) <- c("condensed", class(grid))
39 | 
40 |   z1 <- smooth(grid, c(3, 3), ".count", factor = FALSE)
41 |   z2 <- smooth(grid, c(3, 3), ".count", factor = TRUE)
42 | 
43 |   expect_equal(z1, z2)
44 | })
45 | 


--------------------------------------------------------------------------------
/man/peel.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/peel.r
 3 | \name{peel}
 4 | \alias{peel}
 5 | \title{Peel off low density regions of the data.}
 6 | \usage{
 7 | peel(x, keep = 0.99, central = NULL)
 8 | }
 9 | \arguments{
10 | \item{x}{condensed summary}
11 | 
12 | \item{keep}{(approximate) proportion of data to keep. If \code{1}, will
13 | remove all cells with counts.  All missing values will be preserved.}
14 | 
15 | \item{central}{if \code{TRUE} peels off regions of lowest density only from
16 | the outside of the data. In 2d this works by progressively peeling off
17 | convex hull of the data: the current algorithm is quite slow.
18 | If \code{FALSE}, just removes the lowest density regions wherever they are
19 | found. Regions with 0 density are removed regardless of location.
20 | Defaults to TRUE if there are two or fewer grouping variables is less.}
21 | }
22 | \description{
23 | Keeps specified proportion of data by removing the lowest density regions,
24 | either anywhere on the plot, or for 2d, just around the edges.
25 | }
26 | \details{
27 | This is useful for visualisation, as an easy way of focussing on the regions
28 | where the majority of the data lies.
29 | }
30 | \examples{
31 | x <- rt(1e5, df = 2)
32 | y <- rt(1e5, df = 2)
33 | xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 10))
34 | plot(xysum$x, xysum$y)
35 | 
36 | plot(peel(xysum, 0.95, central = TRUE)[1:2])
37 | plot(peel(xysum, 0.90, central = TRUE)[1:2])
38 | plot(peel(xysum, 0.50, central = TRUE)[1:2])
39 | }
40 | 
41 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2 (4.1.1): do not edit by hand
 2 | 
 3 | S3method("[",dgrid)
 4 | S3method("[<-",ranged)
 5 | S3method(Math,condensed)
 6 | S3method(Ops,condensed)
 7 | S3method(Ops,ranged)
 8 | S3method(as.condensed,condensed)
 9 | S3method(as.condensed,data.frame)
10 | S3method(as.data.frame,dgrid)
11 | S3method(as.data.frame,ranged)
12 | S3method(as.integer,dgrid)
13 | S3method(max,dgrid)
14 | S3method(max,ranged)
15 | S3method(min,dgrid)
16 | S3method(min,ranged)
17 | S3method(print,ranged)
18 | S3method(range,dgrid)
19 | S3method(range,ranged)
20 | S3method(str,ranged)
21 | S3method(transform,condensed)
22 | export(as.condensed)
23 | export(autoplot.condensed)
24 | export(best_h)
25 | export(bin)
26 | export(breaks)
27 | export(condense)
28 | export(dchallenge)
29 | export(dgrid)
30 | export(find_origin)
31 | export(find_width)
32 | export(frange)
33 | export(h_grid)
34 | export(inv_mt)
35 | export(is.condensed)
36 | export(is.dgrid)
37 | export(is.ranged)
38 | export(mt)
39 | export(mt_trans)
40 | export(peel)
41 | export(ranged)
42 | export(rchallenge)
43 | export(rebin)
44 | export(rmse_cv)
45 | export(rmse_cvs)
46 | export(round_any.condensed)
47 | export(smooth)
48 | export(standardise)
49 | export(weighted.IQR)
50 | export(weighted.ecdf)
51 | export(weighted.median)
52 | export(weighted.quantile)
53 | export(weighted.sd)
54 | export(weighted.var)
55 | exportMethods(as.integer)
56 | exportMethods(show)
57 | importFrom(Rcpp,compileAttributes)
58 | importFrom(Rcpp,cpp_object_initializer)
59 | importFrom(methods,new)
60 | useDynLib(bigvis)
61 | 


--------------------------------------------------------------------------------
/R/dgrid.r:
--------------------------------------------------------------------------------
 1 | #' dgrid: an S3 class for data grids
 2 | #'
 3 | #' @param x a numeric vector to test or coerce.
 4 | #' @param width bin width
 5 | #' @param origin bin origins
 6 | #' @param nbins number of bins
 7 | #' @export
 8 | #' @examples
 9 | #' g <- dgrid(0:10 + 0.5, width = 1)
10 | #' range(g)
11 | #' as.integer(g)
12 | dgrid <- function(x, width, origin = 0, nbins = NULL) {
13 |   stopifnot(is.numeric(x))
14 |   stopifnot(is.numeric(width), length(width) == 1, width > 0)
15 |   stopifnot(is.numeric(origin), length(origin) == 1)
16 | 
17 |   if (is.null(nbins)) {
18 |     nbins <- floor((max(x) - origin) / width)
19 |   }
20 | 
21 |   structure(x, class = c("dgrid", "numeric"),
22 |     width = width, origin = origin, nbins = nbins)
23 | }
24 | 
25 | #' @export
26 | #' @rdname dgrid
27 | is.dgrid <- function(x) inherits(x, "dgrid")
28 | 
29 | #' @export
30 | "[.dgrid" <- function(x, ...) {
31 |   dgrid(NextMethod(), width = attr(x, "width"),
32 |     origin = attr(x, "origin"), nbins = attr(x, "nbins"))
33 | }
34 | 
35 | #' @export
36 | min.dgrid <- function(x, ...) attr(x, "origin")
37 | #' @export
38 | max.dgrid <- function(x, ...) {
39 |   min(x) + attr(x, "nbins") * attr(x, "width")
40 | }
41 | #' @export
42 | range.dgrid <- function(x, ...) c(min(x), max(x))
43 | 
44 | #' @export
45 | as.integer.dgrid <- function(x, ...) {
46 |   as.integer((unclass(x) - attr(x, "origin")) / attr(x, "width") + 1L)
47 | }
48 | 
49 | #' @export
50 | as.data.frame.dgrid <- function(x, ...) {
51 |   n <- length(x)
52 |   list <- list(x)
53 |   class(list) <- "data.frame"
54 |   attr(list, "row.names") <- c(NA_integer_, -n)
55 |   list
56 | }
57 | 


--------------------------------------------------------------------------------
/man/rmse_cvs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/rmse.r
 3 | \name{rmse_cvs}
 4 | \alias{rmse_cv}
 5 | \alias{rmse_cvs}
 6 | \title{Estimate smoothing RMSE using leave-one-out cross-validation.}
 7 | \usage{
 8 | rmse_cvs(x, hs = h_grid(x), ...)
 9 | 
10 | rmse_cv(x, h, var = summary_vars(x)[1], ...)
11 | }
12 | \arguments{
13 | \item{x}{condensed summary table}
14 | 
15 | \item{...}{other variables passed on to \code{\link{smooth}}}
16 | 
17 | \item{h,hs}{for \code{rmse_cv}, a vector of bandwidths; for \code{rmse_cv}
18 | a data frame of bandwidths, as generated by \code{\link{h_grid}}.}
19 | 
20 | \item{var}{variable to smooth}
21 | }
22 | \description{
23 | \code{rmse_cv} computes the leave-one-out RMSE for a single vector of
24 | bandwidths, \code{rmse_cvs} computes for a multiple vectors of bandwidths,
25 | stored as a data frame.
26 | }
27 | \examples{
28 | \donttest{
29 | set.seed(1014)
30 | # 1d -----------------------------
31 | x <- rchallenge(1e4)
32 | xsum <- condense(bin(x, 1 / 10))
33 | cvs <- rmse_cvs(xsum)
34 | 
35 | if (require("ggplot2")) {
36 | autoplot(xsum)
37 | qplot(x, err, data = cvs, geom = "line")
38 | xsmu <- smooth(xsum, 1.3)
39 | autoplot(xsmu)
40 | autoplot(peel(xsmu))
41 | }
42 | 
43 | # 2d -----------------------------
44 | y <- runif(1e4)
45 | xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100))
46 | cvs <- rmse_cvs(xysum, h_grid(xysum, 10))
47 | if (require("ggplot2")) {
48 | qplot(x, y, data = cvs, size = err)
49 | }
50 | }
51 | }
52 | \seealso{
53 | Other bandwidth estimation functions: \code{\link{best_h}};
54 |   \code{\link{h_grid}}
55 | }
56 | 
57 | 


--------------------------------------------------------------------------------
/src/Summary2d.h:
--------------------------------------------------------------------------------
 1 | #include <Rcpp.h>
 2 | #include "stats.h"
 3 | using namespace Rcpp;
 4 | 
 5 | class Summary2d {
 6 |   public:
 7 |     virtual void push(double x, double z, double w) =0;
 8 |     virtual double compute() =0;
 9 |     virtual ~Summary2d() {}
10 | };
11 | 
12 | class Summary2dMean: public Summary2d {
13 |     double w_, z_;
14 | 
15 |   public:
16 |     Summary2dMean() : w_(0), z_(0) {}
17 | 
18 |     void push(double x, double z, double w) {
19 |       // Rcout << "  x: " << x << " z: " << z << " w: " << w << "\n";
20 |       w_ += w;
21 |       z_ += z * w;
22 |     }
23 | 
24 |     double compute() {
25 |       // Rcout << "Result: " << z_ / w_ << "\n";
26 |       return z_ / w_;
27 |     }
28 | };
29 | 
30 | class Summary2dRegression: public Summary2d {
31 |     std::vector<double> x_, z_, w_;
32 | 
33 |   public:
34 |     Summary2dRegression() {}
35 | 
36 |     void push(double x, double z, double w) {
37 |       x_.push_back(x);
38 |       z_.push_back(z);
39 |       w_.push_back(w);
40 |     }
41 | 
42 |     double compute() {
43 |       return simpleLinearRegression(x_, z_, w_).alpha;
44 |     }
45 | };
46 | 
47 | class Summary2dRobustRegression: public Summary2d {
48 |     int iterations_;
49 |     std::vector<double> x_, z_, w_;
50 | 
51 |   public:
52 |     Summary2dRobustRegression() : iterations_(3) {}
53 |     Summary2dRobustRegression(int iterations) : iterations_(iterations) {}
54 | 
55 |     void push(double x, double z, double w) {
56 |       x_.push_back(x);
57 |       z_.push_back(z);
58 |       w_.push_back(w);
59 |     }
60 | 
61 |     double compute() {
62 |       return simpleRobustRegression(x_, z_, w_, iterations_).alpha;
63 |     }
64 | };
65 | 


--------------------------------------------------------------------------------
/R/standardise.r:
--------------------------------------------------------------------------------
 1 | #' Standardise a summary to sum to one.
 2 | #'
 3 | #' @param x a condensed summary. Must have \code{.count} variable.
 4 | #' @param margin margins to standardise along.  If \code{NULL}, the default,
 5 | #'  standardises the whole array.
 6 | #' @export
 7 | #' @examples
 8 | #' b1 <- condense(bin(movies$year, 1))
 9 | #' d1 <- smooth(b1, 2, type = "reg")
10 | #'
11 | #' if (require("ggplot2")) {
12 | #'
13 | #' autoplot(b1)
14 | #' autoplot(d1)
15 | #'
16 | #' # Note change in x-axis limits
17 | #' autoplot(standardise(d1))
18 | #' }
19 | #'
20 | #' # Can also standardise a dimension at a time
21 | #' b2 <- with(movies, condense(bin(year, 2), bin(length, 10)))
22 | #' b2 <- peel(b2, central = TRUE)
23 | #'
24 | #' if (require("ggplot2")) {
25 | #'
26 | #' autoplot(b2)
27 | #' autoplot(standardise(b2))    # note legend
28 | #' autoplot(standardise(b2, "year"))   # each row sums to 1
29 | #' autoplot(standardise(b2, "length")) # each col sums to 1
30 | #'
31 | #' base <- ggplot(b2, aes(length, .count)) +
32 | #'   geom_line(aes(group = year, colour = year))
33 | #' base
34 | #' base %+% standardise(b2)  # Just affects y axis labels
35 | #' base %+% standardise(b2, "year") # Makes year comparable
36 | #' base %+% standardise(b2, "length") # Meaningless for this display
37 | #'
38 | #' }
39 | standardise <- function(x, margin = integer()) {
40 |   stopifnot(is.condensed(x), !is.null(x$.count))
41 | 
42 |   if (length(margin) == 0) {
43 |     x$.count <- prop(x$.count)
44 |   } else {
45 |     x$.count <- ave(x$.count, id(x[margin]), FUN = prop)
46 |     x$.count[is.na(x$.count)] <- 0
47 |   }
48 | 
49 |   x
50 | }
51 | 
52 | prop <- function(x) x / sum(x, na.rm = TRUE)
53 | 


--------------------------------------------------------------------------------
/R/mt.r:
--------------------------------------------------------------------------------
 1 | #' Modulus transformation (and its inverse).
 2 | #'
 3 | #' A generalisation of the box-cox transformation that works for
 4 | #' values with both positive and negative values.
 5 | #'
 6 | #' This is useful for compressing the tails of long-tailed distributions,
 7 | #' often encountered with very large datasets.
 8 | #'
 9 | #' @param x values to transform
10 | #' @param lambda degree of transformation
11 | #' @export
12 | #' @references J. John and N. Draper. "An alternative family of
13 | #'  transformations." Applied Statistics, pages 190-197, 1980.
14 | #'  \url{http://www.jstor.org/stable/2986305}
15 | #' @examples
16 | #' x <- seq(-10, 10, length = 100)
17 | #' plot(x, mt(x, 0), type = "l")
18 | #' plot(x, mt(x, 0.25), type = "l")
19 | #' plot(x, mt(x, 0.5), type = "l")
20 | #' plot(x, mt(x, 1), type = "l")
21 | #' plot(x, mt(x, 2), type = "l")
22 | #' plot(x, mt(x, -1), type = "l")
23 | #' plot(x, mt(x, -2), type = "l")
24 | mt <- function(x, lambda) {
25 |   stopifnot(is.numeric(x))
26 |   stopifnot(is.numeric(lambda), length(lambda) == 1)
27 | 
28 |   if (lambda == 0) {
29 |     sign(x) * log(abs(x) + 1)
30 |   } else {
31 |     sign(x) * ((abs(x) + 1) ^ lambda - 1) / lambda
32 |    }
33 | }
34 | 
35 | #' @rdname mt
36 | #' @export
37 | inv_mt <- function(x, lambda) {
38 |   stopifnot(is.numeric(x))
39 |   stopifnot(is.numeric(lambda), length(lambda) == 1)
40 | 
41 |   if (lambda == 0) {
42 |     sign(x) * (exp(abs(x)) - 1)
43 |   } else {
44 |     sign(x) * ((abs(x) * lambda + 1) ^ (1 / lambda) - 1)
45 |    }
46 | }
47 | 
48 | #' @rdname mt
49 | #' @export
50 | mt_trans <- function(lambda) {
51 |   scales::trans_new("modulo",
52 |     function(x) mt(x, lambda),
53 |     function(x) inv_mt(x, lambda)
54 |   )
55 | }
56 | 


--------------------------------------------------------------------------------
/src/BinnedVectors.cpp:
--------------------------------------------------------------------------------
 1 | #include <bigvis.h>
 2 | using namespace Rcpp;
 3 | 
 4 | int BinnedVectors::bin_i(int i) const {
 5 |   int bin = 0;
 6 |   int ngroups = groups_.size();
 7 | 
 8 |   for (int j = 0; j < ngroups; ++j) {
 9 |     bin += groups_[j].bin_i(i) * bins_[(ngroups - 1) - j];
10 |   }
11 | 
12 |   return bin;
13 | }
14 | 
15 | int BinnedVectors::bin(std::vector<double> x) const {
16 |   int ngroups = groups_.size();
17 |   if (x.size() != ngroups) stop("x must be same length as groups");
18 |   int bin = 0;
19 | 
20 |   for (int j = 0; j < ngroups; ++j) {
21 |     int bin_j = groups_[j].bin(x[j]);
22 |     bin += bin_j * bins_[(ngroups - 1) - j];
23 |     // Rcout << "group: " << j << " bin: " << bin << " bin_j: " << bin_j << "\n";
24 |   }
25 | 
26 |   return bin;
27 | }
28 | 
29 | std::vector<double> BinnedVectors::unbin(int bin) const {
30 |   int ngroups = groups_.size();
31 |   std::vector<double> bins(ngroups);
32 | 
33 |   // if ngroups = 3, then: 
34 |   // bin = groups[0].bin(x[0]) * bins[2] (biggest) +
35 |   //       groups[1].bin(x[1]) * bins[1] +
36 |   //       groups[2].bin(x[2]) * bins[0] (smallest)
37 |   // peel off largest first
38 |   //   bin_j = bin %/% bin[2]
39 |   //   groups[0].unbin(bin_j)
40 |   // and that goes in last output position
41 | 
42 |   for (int i = 0, j = ngroups - 1; i < ngroups - 1; ++i, --j) {
43 |     int bin_j = bin % bins_[j];
44 |     // Rcout << "group: " << j << " bin: " << bin << " bin_j: " << bin_j << "\n";
45 |     bins[j] = groups_[j].unbin(bin_j);
46 | 
47 |     bin = (bin - bin_j) / bins_[j];
48 |   }
49 |   // Rcout << "group: " << 0 << " bin: " << bin << " bin_j: " << bin << "\n";
50 |   // Special case for last group because x %% 1 = 0
51 |   bins[0] = groups_[0].unbin(bin);
52 | 
53 |   return bins;
54 | }
55 | 
56 | 


--------------------------------------------------------------------------------
/src/group-hex.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Translated from
 3 |  * https://github.com/d3/d3-plugins/blob/master/hexbin/hexbin.js
 4 |  *
 5 |  * Copyright (C) 2013 Hadley Wickham
 6 |  * Copyright (C) 2012 Mike Bostock (mbostock at gmail dot com)
 7 |  */
 8 | class GroupHex {
 9 |     const NumericVector x_;
10 |     const NumericVector y_;
11 |     double x_width_;
12 |     double x_origin_;
13 |     double y_width_;
14 |     double y_origin_;
15 |     double x_bins;
16 | 
17 |   public:
18 |     GroupHex (const NumericVector& x, const NumericVector& y, 
19 |                 double x_width, double y_width, 
20 |                 double x_origin, double y_origin, 
21 |                 double x_max)
22 |        : x_(x), y_(y), x_width_(x_width), x_origin_(x_origin), 
23 |           y_width_(y_width), y_origin_(y_origin) {
24 |       if (x.size() != y.size()) stop("x & y are not the same size");
25 |       x_bins = x_max / x_width_ + 1;
26 |     }
27 | 
28 |     int bin_i(int i) const {
29 |       double py = ISNAN(y_[i]) ? 0 : (y_[i] - y_origin_) / y_width_ + 1;
30 |       int pj = py;
31 |       double py1 = py - pj;
32 |       
33 |       double px = ISNAN(x_[i]) ? 0 : (x_[i] - x_origin_) / x_width_ + 1 - 
34 |         (pj % 2 ? 0.5 : 0);
35 |       int pi = px;
36 | 
37 |        if (fabs(py1) * 3 > 1) {
38 |         double px1 = px - pi,
39 |                pi2 = pi + (px < pi ? -1 : 1) / 2,
40 |                pj2 = pj + (py < pj ? -1 : 1),
41 |                px2 = px - pi2,
42 |                py2 = py - pj2;
43 |         if (px1 * px1 + py1 * py1 > px2 * px2 + py2 * py2) {
44 |           pi = pi2 + (pj % 2 ? 1 : -1) / 2;
45 |           pj = pj2;
46 |         }
47 |       }
48 | 
49 |       return pj * x_bins + pj;
50 |     }
51 | 
52 |     int size() const {
53 |       return x_.size();
54 |     }
55 | };
56 | 


--------------------------------------------------------------------------------
/src/frange.cpp:
--------------------------------------------------------------------------------
 1 | #include <Rcpp.h>
 2 | using namespace Rcpp;
 3 | 
 4 | //' Efficient implementation of range.
 5 | //'
 6 | //' This is an efficient C++ implementation of range for numeric vectors:
 7 | //' it avoids S3 dispatch, and computes both min and max in a single pass
 8 | //' through the input.
 9 | //'
10 | //' If \code{x} has a \code{range} attribute (e.g. it's a \code{\link{ranged}}
11 | //' object), it will be used instead of computing the range from scratch.
12 | //' 
13 | //' @param x a numeric vector, or a \code{\link{ranged}} object
14 | //' @param finite If \code{TRUE} ignores missing values and infinities. Note
15 | //'   that if the vector is empty, or only contains missing values, 
16 | //'   \code{frange} will return \code{c(Inf, -Inf)} because those are the
17 | //'   identity values for \code{\link{min}} and \code{\link{max}} respectively.
18 | //' @export
19 | //' @examples
20 | //' x <- runif(1e6)
21 | //' system.time(range(x))
22 | //' system.time(frange(x))
23 | //'
24 | //' rx <- ranged(x)
25 | //' system.time(frange(rx))
26 | // [[Rcpp::export]]
27 | NumericVector frange(const NumericVector& x, const bool finite = true) {
28 |   RObject cache = x.attr("range");
29 |   if (cache.sexp_type() == REALSXP) return as<NumericVector>(cache);
30 | 
31 |   NumericVector out(2);
32 |   out[0] = INFINITY;
33 |   out[1] = -INFINITY;
34 | 
35 |   int n = x.length();
36 |   for(int i = 0; i < n; ++i) {
37 |     if (!finite && R_IsNA(x[i])) {
38 |       out[0] = NA_REAL;
39 |       out[1] = NA_REAL;
40 |       return out;
41 |     }
42 | 
43 |     // If finite, skip infinite values
44 |     if (finite && (x[i] == INFINITY || x[i] == -INFINITY)) continue;
45 | 
46 |     if (x[i] < out[0]) out[0] = x[i];
47 |     if (x[i] > out[1]) out[1] = x[i];
48 |   }
49 | 
50 |   return out;
51 | }
52 | 


--------------------------------------------------------------------------------
/R/id.r:
--------------------------------------------------------------------------------
 1 | # Copied and pasted from plyr to avoid dependency
 2 | 
 3 | id <- function(.variables, drop = FALSE) {
 4 |   # Drop all zero length inputs
 5 |   lengths <- vapply(.variables, length, integer(1))
 6 |   .variables <- .variables[lengths != 0]
 7 | 
 8 |   if (length(.variables) == 0) {
 9 |     n <- nrow(.variables) %||% 0L
10 |     return(structure(seq_len(n), n = n))
11 |   }
12 | 
13 |   # Special case for single variable
14 |   if (length(.variables) == 1) {
15 |     return(id_var(.variables[[1]], drop = drop))
16 |   }
17 | 
18 |   # Calculate individual ids
19 |   ids <- rev(lapply(.variables, id_var, drop = drop))
20 |   p <- length(ids)
21 | 
22 |   # Calculate dimensions
23 |   ndistinct <- vapply(ids, attr, "n", FUN.VALUE = numeric(1),
24 |     USE.NAMES = FALSE)
25 |   n <- prod(ndistinct)
26 |   if (n > 2 ^ 31) {
27 |     # Too big for integers, have to use strings, which will be much slower :(
28 | 
29 |     char_id <- do.call("paste", c(ids, sep = "\r"))
30 |     res <- match(char_id, unique(char_id))
31 |   } else {
32 |     combs <- c(1, cumprod(ndistinct[-p]))
33 | 
34 |     mat <- do.call("cbind", ids)
35 |     res <- c((mat - 1L) %*% combs + 1L)
36 |   }
37 |   attr(res, "n") <- n
38 | 
39 | 
40 |   if (drop) {
41 |     id_var(res, drop = TRUE)
42 |   } else {
43 |     structure(as.integer(res), n = attr(res, "n"))
44 |   }
45 | }
46 | 
47 | id_var <- function(x, drop = FALSE) {
48 |   if (length(x) == 0) return(structure(integer(), n = 0L))
49 |   if (!is.null(attr(x, "n")) && !drop) return(x)
50 | 
51 |   if (is.factor(x) && !drop) {
52 |     id <- as.integer(addNA(x, ifany = TRUE))
53 |     n <- length(levels(x))
54 |   } else {
55 |     levels <- sort(unique(x), na.last = TRUE)
56 |     id <- match(x, levels)
57 |     n <- max(id)
58 |   }
59 |   structure(id, n = n)
60 | }
61 | 


--------------------------------------------------------------------------------
/R/bin.r:
--------------------------------------------------------------------------------
 1 | 
 2 | #' Create a binned variable.
 3 | #'
 4 | #' @details
 5 | #' This function produces an R reference class that wraps around a C++ function.
 6 | #' Generally, you should just treat this as an opaque object with reference
 7 | #' semantics, and you shouldn't call the methods on it - pass it to
 8 | #' \code{\link{condense}} and friends.
 9 | #'
10 | #' @param x numeric or integer vector
11 | #' @param width bin width. If not specified, about 10,000 bins will be chosen
12 | #'   using the algorithim in \code{\link{find_width}}.
13 | #' @param origin origin. If not specified, guessed by \code{\link{find_origin}}.
14 | #' @param name name of original variable. This will be guessed from the input to
15 | #'   \code{group} if not supplied. Used in the output of
16 | #'   \code{\link{condense}} etc.
17 | #' @export
18 | #' @examples
19 | #' x <- runif(1e6)
20 | #' bin(x)
21 | #' bin(x, 0.01)
22 | #' bin(x, 0.01, origin = 0.5)
23 | bin <- function(x, width = find_width(x), origin = find_origin(x, width),
24 |                 name = NULL) {
25 |   stopifnot(is.numeric(x))
26 |   stopifnot(is.numeric(width), length(width) == 1, width > 0)
27 |   stopifnot(is.numeric(origin), length(origin) == 1)
28 | 
29 |   if (is.null(name)) {
30 |     name <- deparse(substitute(x))
31 |   }
32 |   stopifnot(is.character(name), length(name) == 1)
33 | 
34 |   if (!is.ranged(x)) {
35 |     attr(x, "range") <- frange(x)
36 |     class(x) <- "ranged"
37 |   }
38 |   if (origin > min(x)) {
39 |     warning("Origin larger than min(x): some values will be truncated",
40 |       call. = FALSE)
41 |   }
42 | 
43 |   BigVis$BinnedVector$new(x, name, width, origin)
44 | }
45 | 
46 | 
47 | is.binned <- function(x) {
48 |   is(x, "Rcpp_BinnedVector")
49 | }
50 | 
51 | bins <- function(...) {
52 |   BigVis$BinnedVectors$new(list(...))
53 | }
54 | 
55 | 


--------------------------------------------------------------------------------
/src/group.cpp:
--------------------------------------------------------------------------------
 1 | #include <Rcpp.h>
 2 | #include "group.h"
 3 | #include "group-hex.h"
 4 | using namespace Rcpp;
 5 | 
 6 | template<typename Group>
 7 | IntegerVector group_out(const Group& group) {
 8 |   int n = group.size();
 9 |   IntegerVector out(n);
10 |   for(int i = 0; i < n; ++i) {
11 |     out[i] = group.bin_i(i);
12 |   }
13 | 
14 |   return out;
15 | }
16 | 
17 | RCPP_MODULE(Group) {
18 |   class_<GroupFixed>("GroupFixed")
19 |   .constructor<NumericVector, double, double>()
20 |   .const_method("bin_i", &GroupFixed::bin_i)
21 |   .const_method("bin", &GroupFixed::bin)
22 |   .const_method("unbin", &GroupFixed::unbin)
23 | 
24 |   .const_method("size", &GroupFixed::size)
25 |   .const_method("nbins", &GroupFixed::nbins)
26 | 
27 |   .const_method("origin", &GroupFixed::origin)
28 |   .const_method("width", &GroupFixed::width)
29 |   ;
30 | }
31 | RCPP_EXPOSED_AS(GroupFixed)
32 | RCPP_EXPOSED_WRAP(GroupFixed)
33 | 
34 | 
35 | // [[Rcpp::export]]
36 | IntegerVector group_fixed(const NumericVector& x, double width, double origin = 0) {
37 |   return group_out(GroupFixed(x, width, origin));
38 | }
39 | 
40 | // [[Rcpp::export]]
41 | IntegerVector group_rect(const NumericVector& x, const NumericVector& y, 
42 |                          double x_width, double y_width,
43 |                          double x_origin, double y_origin) {
44 |   return group_out(Group2d<GroupFixed>(
45 |     GroupFixed(x, x_width, x_origin), 
46 |     GroupFixed(y, y_width, y_origin)));
47 | }
48 | 
49 | 
50 | // [[Rcpp::export]]
51 | IntegerVector group_hex(const NumericVector& x, const NumericVector& y, 
52 |                          double x_width, double y_width,
53 |                          double x_origin, double y_origin,
54 |                          double x_max) {
55 |   return group_out(GroupHex(x, y, x_width, y_width, x_origin, y_origin, x_max));
56 | }
57 | 


--------------------------------------------------------------------------------
/notes.md:
--------------------------------------------------------------------------------
 1 | # Group
 2 | 
 3 | * 1d, nd
 4 | 
 5 | Future work: linear binning
 6 | 
 7 | # Summarise
 8 | 
 9 | * 1d
10 | 
11 |   * count, sum
12 |   * count, mean, sd
13 |   * median
14 | 
15 | * 2d
16 |   * mean
17 |   * regression
18 |   * robust regression
19 | 
20 | * nd 
21 |   * mean
22 |   * regression (with eigen or armadillo)
23 |   * robust regression (with eigen or armadillo)
24 | 
25 | Future work:
26 | 
27 | * skew?, kurt?
28 | * boxplot
29 | * weighted quantiles (C++ version of R code)
30 | * compute standard errors / bootstrap standard errors?
31 | 
32 | * infrastructure for passing multiple z
33 |   * 2d: cor, lm
34 |   
35 | 
36 | # Smooth
37 | 
38 | Kernel smoothing plus binned summary leads to many common statistics: density =~ bin + smooth, loess =~ mean + smooth, rqss =~ quantile + smooth
39 | 
40 | * weights
41 | * smoothing type
42 |   * constant
43 |   * linear
44 |   * robust linear (lowess)
45 |   * (linear poisson?)
46 | * leave-one-out cross-validation
47 | * optimisations
48 |   * convert to integer grid & use pre-computed grid of kernel values
49 |   * hash in smooth_nd_1 and compute more efficiently along 1d
50 | * deal with missing values
51 | 
52 | * smooth needs to create complete grid when factor = TRUE
53 | 
54 | Think about input data structure: sparse grid, represented as a coordinate list. Binned grid class = integer vector + width/origin/nbins (0 = NA). Most transformations break the grid, in which case all you case preserve is min, max and number of bins. All smoothing methods adapted to work in terms of these integers. Need to extract out bin/unbin into own class (initialised with std::vector of bin sizes)
55 | 
56 | Possible that more performance is available by switching to a sparse tensor library.
57 | 
58 | # Visualise
59 | 
60 | * Product plots
61 | * Standard errors + cut offs
62 | 
63 | * Peel: implement nd version using depth


--------------------------------------------------------------------------------
/R/condense.r:
--------------------------------------------------------------------------------
 1 | #' Efficient binned summaries.
 2 | #'
 3 | #' @param ... group objects created by \code{\link{bin}}
 4 | #' @param z a numeric vector to summary for each group. Optional for some
 5 | #'   summary statistics.
 6 | #' @param summary the summary statistic to use. Currently must be one of
 7 | #'   count, sum, mean, median or sd. If \code{NULL}, defaults to mean if
 8 | #'   y is present, count if not.
 9 | #' @param w a vector of weights. Not currently supported by all summary
10 | #'   functions.
11 | #' @param drop if \code{TRUE} only locations with data will be returned.  This
12 | #'   is more efficient if the data is very sparse (<1\% of cells filled), and
13 | #'   is slightly less efficient. Defaults to \code{TRUE} if you are condensing
14 | #'   over two or more dimensions, \code{FALSE} for 1d.
15 | #' @export
16 | #' @examples
17 | #' x <- runif(1e5)
18 | #' gx <- bin(x, 0.1)
19 | #' condense(gx)
20 | condense <- function(..., z = NULL, summary = NULL, w = NULL, drop = NULL) {
21 |   gs <- list(...)
22 |   if (length(gs) == 1 && is.list(gs[[1]])) gs <- gs[[1]]
23 | 
24 |   is_binned <- vapply(gs, is.binned, logical(1))
25 |   if (!all(is_binned)) {
26 |     stop("All objects passed to ... must be binned.", call. = FALSE)
27 |   }
28 | 
29 |   drop <- drop %||% (length(gs) > 1)
30 | 
31 |   if (is.null(summary)) {
32 |     summary <- if (is.null(z)) "count" else "mean"
33 |     message("Summarising with ", summary)
34 |   }
35 | 
36 |   # C++ code can deal with NULL inputs more efficiently than R code
37 |   z <- z %||% numeric()
38 |   w <- w %||% numeric()
39 | 
40 |   # Check lengths consistent
41 |   n <- gs[[1]]$size()
42 |   stopifnot(length(z) == 0 || length(z) == n)
43 |   stopifnot(length(w) == 0 || length(w) == n)
44 | 
45 |   f <- find_fun(paste("condense", summary, sep = "_"))
46 |   out <- f(gs, z, w, drop = drop)
47 | 
48 |   condensed(gs, out[[1]], out[[2]])
49 | }
50 | 


--------------------------------------------------------------------------------
/man/smooth.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/smooth.r
 3 | \name{smooth}
 4 | \alias{smooth}
 5 | \title{Smooth a condensed data frame.}
 6 | \usage{
 7 | smooth(x, h, var = summary_vars(x)[1], grid = NULL, type = "mean",
 8 |   factor = TRUE)
 9 | }
10 | \arguments{
11 | \item{x}{a condensed summary}
12 | 
13 | \item{h}{numeric vector of bandwidths, one for each grouping variable in
14 | \code{x}}
15 | 
16 | \item{var}{variable to smooth}
17 | 
18 | \item{grid}{a data frame with the grouping colums as x.  In order for the
19 | factored version of \code{smooth_nd} to work, this grid must be a superset
20 | of \code{x}.}
21 | 
22 | \item{type}{type of smoothing to use.  Current options are \code{"mean"},
23 | a kernel weighted mean; \code{"regression"}, a kernel weighted local
24 | regression; and \code{"robust_regression"}, robust kernel weighted local
25 | regression in the style of \code{\link{loess}}.  Unique prefixes are also
26 | acceptable.}
27 | 
28 | \item{factor}{if \code{TRUE} compute the n-dimensional smooth by a sequence
29 | of 1d smoothes. For \code{type = "mean"} the results are always the same
30 | grid values are uncorrelated (e.g. the grid is complete at every location);
31 | and is very approximate for \code{type = "robust"}.}
32 | }
33 | \description{
34 | Smooth a condensed data frame.
35 | }
36 | \examples{
37 | x <- runif(1e5)
38 | xsum <- condense(bin(x, 1 / 100))
39 | xsmu1 <- smooth(xsum, 5 / 100)
40 | xsmu2 <- smooth(xsum, 5 / 100, factor = FALSE)
41 | 
42 | # More challenging distribution
43 | x <- rchallenge(1e4)
44 | xsum <- condense(bin(x, 0.1))
45 | xsmu <- smooth(xsum, 1)
46 | 
47 | plot(xsum$x, xsum$.count, type = "l")
48 | lines(xsmu$x, xsmu$.count, col = "red")
49 | 
50 | xsmu2 <- smooth(xsum, 1, type = "regress")
51 | plot(xsmu$x, xsmu$.count, type = "l", xlim = c(0, 50))
52 | lines(xsmu2$x, xsmu2$.count, col = "red")
53 | # Note difference in tails
54 | }
55 | 
56 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # bigvis
 2 | 
 3 | [![Travis-CI Build Status](https://travis-ci.org/hadley/bigvis.svg?branch=master)](https://travis-ci.org/hadley/bigvis)
 4 | [![Coverage Status](https://img.shields.io/codecov/c/github/hadley/bigvis/master.svg)](https://codecov.io/github/hadley/bigvis?branch=master)
 5 | 
 6 | The bigvis package provides tools for exploratory data analysis of __large datasets__ (10-100 million obs). The aim is to have most operations take less than 5 seconds on commodity hardware, even for 100,000,000 data points.
 7 | 
 8 | Since bigvis is not currently available on CRAN, the easiest way to try it out is to:
 9 | 
10 | ```R
11 | # install.packages("devtools")
12 | devtools::install_github("hadley/bigvis")
13 | ```
14 | 
15 | ## Workflow
16 | 
17 | The bigvis package is structured around the following workflow:
18 | 
19 | * `bin()` and `condense()` to get a compact summary of the data
20 | 
21 | * if the estimates are rough, you might want to `smooth()`. See `best_h()` and `rmse_cvs()` to figure out a good starting bandwidth
22 | 
23 | * if you're working with counts, you might want to `standardise()`
24 | 
25 | * visualise the results with `autoplot()` (you'll need to load `ggplot2` to use this)
26 | 
27 | ## Weighted statistics
28 | 
29 | Bigvis also provides a number of standard statistics efficiently implemented on weighted/binned data: `weighted.median`, `weighted.IQR`, `weighted.var`, `weighted.sd`, `weighted.ecdf` and `weighted.quantile`. 
30 | 
31 | ## Acknowledgements
32 | 
33 | This package wouldn't be possible without:
34 | 
35 | * the fantastic [Rcpp](http://dirk.eddelbuettel.com/code/rcpp.html) package, which makes it amazingly easy to integrate R and C++
36 | 
37 | * JJ Allaire and Carlos Scheidegger who have indefatigably answered my many C++ questions
38 | 
39 | * the generous support of Revolution Analytics who supported the early development.
40 | 
41 | * Yue Hu, who implemented a proof of concepts that showed that it might be possible to work with this much data in R.
42 | 


--------------------------------------------------------------------------------
/R/rmse.r:
--------------------------------------------------------------------------------
 1 | #' Estimate smoothing RMSE using leave-one-out cross-validation.
 2 | #'
 3 | #' \code{rmse_cv} computes the leave-one-out RMSE for a single vector of
 4 | #' bandwidths, \code{rmse_cvs} computes for a multiple vectors of bandwidths,
 5 | #' stored as a data frame.
 6 | #'
 7 | #' @param x condensed summary table
 8 | #' @param h,hs for \code{rmse_cv}, a vector of bandwidths; for \code{rmse_cv}
 9 | #'    a data frame of bandwidths, as generated by \code{\link{h_grid}}.
10 | #' @param var variable to smooth
11 | #' @param ... other variables passed on to \code{\link{smooth}}
12 | #' @family bandwidth estimation functions
13 | #' @export
14 | #' @examples
15 | #' \donttest{
16 | #' set.seed(1014)
17 | #' # 1d -----------------------------
18 | #' x <- rchallenge(1e4)
19 | #' xsum <- condense(bin(x, 1 / 10))
20 | #' cvs <- rmse_cvs(xsum)
21 | #'
22 | #' if (require("ggplot2")) {
23 | #' autoplot(xsum)
24 | #' qplot(x, err, data = cvs, geom = "line")
25 | #' xsmu <- smooth(xsum, 1.3)
26 | #' autoplot(xsmu)
27 | #' autoplot(peel(xsmu))
28 | #' }
29 | #'
30 | #' # 2d -----------------------------
31 | #' y <- runif(1e4)
32 | #' xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100))
33 | #' cvs <- rmse_cvs(xysum, h_grid(xysum, 10))
34 | #' if (require("ggplot2")) {
35 | #' qplot(x, y, data = cvs, size = err)
36 | #' }
37 | #' }
38 | rmse_cvs <- function(x, hs = h_grid(x), ...) {
39 |   rmse_1 <- function(i) {
40 |     rmse_cv(x, as.numeric(hs[i, ]), ...)
41 |   }
42 |   err <- vapply(seq_len(nrow(hs)), rmse_1, numeric(1))
43 |   data.frame(hs, err)
44 | }
45 | 
46 | #' @rdname rmse_cvs
47 | #' @export
48 | rmse_cv <- function(x, h, var = summary_vars(x)[1], ...) {
49 |   # can't smooth missing values, so drop.
50 |   x <- x[complete.cases(x), , drop = FALSE]
51 |   gvars <- group_vars(x)
52 | 
53 |   pred_error <- function(i) {
54 |     out <- as.matrix(x[i, gvars, drop = FALSE])
55 |     smu <- smooth(x[-i, , drop = FALSE], grid = out, h = h, var = var, ...)
56 |     smu[[var]] - x[[var]][i]
57 |   }
58 |   err <- vapply(seq_len(nrow(x)), pred_error, numeric(1))
59 |   sqrt(mean(err ^ 2, na.rm = TRUE))
60 | }
61 | 
62 | 


--------------------------------------------------------------------------------
/R/challenge.r:
--------------------------------------------------------------------------------
 1 | #' Density and random number generation functions for a challenging
 2 | #' distribution.
 3 | #'
 4 | #' This is a 1/3-2/3 mixture of a t-distribution with 2 degrees of freedom
 5 | #' centered at 15 and scaled by 2, and a gamma distribution with shape 2
 6 | #' and rate 1/3. (The t-distribution is windsorised at 0, but this
 7 | #' has negligible effect.) This distribution is challenging because it
 8 | #' mixes heavy tailed and asymmetric distributions.
 9 | #'
10 | #' @param x values to evaluate pdf at
11 | #' @param n number of random samples to generate
12 | #' @export
13 | #' @examples
14 | #' plot(dchallenge, xlim = c(-5, 60), n = 500)
15 | #'
16 | #' x <- rchallenge(1e4)
17 | #' hist(x, breaks = 1000)
18 | #' xsum <- condense(bin(x, 0.1))
19 | #' plot(xsum$x, xsum$.count, type = "l")
20 | #' xsmu <- smooth(xsum, 0.3)
21 | #' plot(xsmu$x, xsmu$.count, type = "l")
22 | #' plot(xsmu$x, xsmu$.count, type = "l", xlim = c(0, 30))
23 | dchallenge <- function(x) {
24 |   # Windorised t-distribution
25 |   scale <- function(x) (x - 30) / 2
26 |   spike <- ifelse(x < 0, 0, dt(scale(x), df = 2)) +
27 |     pt(scale(0), df = 2) * (x == 0)
28 | 
29 |   slope <- dgamma(x, 2, 1/3)
30 | 
31 |   (spike + 2 * slope) / 3
32 | }
33 | 
34 | # plot(pchallenge, xlim = c(-5, 60), n = 500)
35 | pchallenge <- function(x) {
36 |   #  H(y) =
37 |   #  = int_0^y h(x) dx
38 |   #  = int_0^y 1/3 f(x) + 2/3 g(x) dx
39 |   #  = 1/3 int_0^y f(x) dx + 2/3 int_0^y g(x) dx =
40 |   #  = 1/3 F(y) + 2/3 G(y)
41 | 
42 |   # h(x) = g((x - 30) / 2)
43 |   # H(y) = int_0^y g((x - 30) / 2) dx
44 |   # complete transformation
45 | 
46 |   scale <- function(x) (x - 30) / 2
47 |   spike <- ifelse(x < 0, 0, pt(scale(x), df = 2))
48 | 
49 |   slope <- pgamma(x, 2, 1/3)
50 | 
51 |   (spike + 2 * slope) / 3
52 | }
53 | 
54 | qchallenge <- function(x) {
55 |   # approximate pchallenge with 1000 points, and linearise
56 |   # use to implement fast option to rchallenge that does inverse pdf
57 |   # transformation + runif()
58 | }
59 | 
60 | 
61 | #' @rdname dchallenge
62 | #' @export
63 | rchallenge <- function(n) {
64 |   nt <- rbinom(1, n, 1 / 3)
65 |   ngamma <- n - nt
66 | 
67 |   spike <- 2 * rt(nt, df = 2) + 15
68 |   spike[spike < 0] <- 0
69 | 
70 |   slope <- rgamma(ngamma, 2, 1/3)
71 | 
72 |   c(spike, slope)
73 | }
74 | 
75 | 


--------------------------------------------------------------------------------
/man/best_h.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/h.r
 3 | \name{best_h}
 4 | \alias{best_h}
 5 | \title{Find "best" smoothing parameter using leave-one-out cross validation.}
 6 | \usage{
 7 | best_h(x, h_init = NULL, ..., tol = 0.01, control = list())
 8 | }
 9 | \arguments{
10 | \item{x}{condensed summary to smooth}
11 | 
12 | \item{h_init}{initial values of bandwidths to start search out. If not
13 | specified defaults to 5 times the binwidth of each variable.}
14 | 
15 | \item{...}{other arguments (like \code{var}) passed on to
16 | \code{\link{rmse_cv}}}
17 | 
18 | \item{tol}{numerical tolerance, defaults to 1\%.}
19 | 
20 | \item{control}{additional control parameters passed on to \code{\link{optim}}
21 | The most useful argument is probably trace, which makes it possible to
22 | follow the progress of the optimisation.}
23 | }
24 | \value{
25 | a single numeric value representing the bandwidth that minimises
26 |   the leave-one-out estimate of rmse. Vector has attributes
27 |   \code{evaluations} giving the number of times the objective function
28 |   was evaluated. If the optimisation does not converge, or smoothing is not
29 |   needed (i.e. the estimate is on the lower bounds), a warning is thrown.
30 | }
31 | \description{
32 | Minimises the leave-one-out estimate of root mean-squared error to find
33 | find the "optimal" bandwidth for smoothing.
34 | }
35 | \details{
36 | L-BFGS-B optimisation is used to constrain the bandwidths to be greater
37 | than the binwidths: if the bandwidth is smaller than the binwidth it's
38 | impossible to compute the rmse because no smoothing occurs. The tolerance
39 | is set relatively high for numerical optimisation since the precise choice
40 | of bandwidth makes little difference visually, and we're unlikely to have
41 | sufficient data to make a statistically significant choice anyway.
42 | }
43 | \examples{
44 | \donttest{
45 | x <- rchallenge(1e4)
46 | xsum <- condense(bin(x, 1 / 10))
47 | h <- best_h(xsum, control = list(trace = 3, REPORT = 1))
48 | 
49 | if (require("ggplot2")) {
50 | autoplot(xsum)
51 | autoplot(smooth(xsum, h))
52 | }
53 | }
54 | }
55 | \seealso{
56 | Other bandwidth estimation functions: \code{\link{h_grid}};
57 |   \code{\link{rmse_cv}}, \code{\link{rmse_cvs}}
58 | }
59 | 
60 | 


--------------------------------------------------------------------------------
/inst/tests/test-binned-vectors.r:
--------------------------------------------------------------------------------
 1 | context("Binned vectors")
 2 | 
 3 | if (require("plyr")) {
 4 |   test_that("bins agree with plyr::id", {
 5 |     grid <- expand.grid(x = c(NA, seq(0, 0.5, by = 0.1)), y = c(NA, seq(0, 0.7, by = 0.1)))
 6 |     x <- grid$x
 7 |     y <- grid$y
 8 | 
 9 |     gx <- bin(x, 0.1)
10 |     gy <- bin(y, 0.1)
11 | 
12 |     bv <- bins(gx, gy)
13 |     bigvis <- sapply(seq_along(x) - 1, bv$bin_i)
14 | 
15 |     bin_x <- sapply(seq_along(x) - 1, gx$bin_i)
16 |     bin_y <- sapply(seq_along(x) - 1, gy$bin_i)
17 |     plyr <- as.vector(id(list(bin_x, bin_y)))
18 | 
19 |     expect_equal(bigvis + 1, plyr)
20 |   })
21 | }
22 | 
23 | test_that("square nbins correct", {
24 |   g <- bin(1:10, 1)
25 |   expect_equal(bins(g)$nbins(), 11)
26 |   expect_equal(bins(g, g)$nbins(), 11 ^ 2)
27 |   expect_equal(bins(g, g, g)$nbins(), 11 ^ 3)
28 | })
29 | 
30 | test_that("rectangular nbins correct", {
31 |   g11 <- bin(1:10, 1)
32 |   g2 <- bin(rep(1, 10), 1)
33 | 
34 |   expect_equal(bins(g2, g11)$nbins(), 22)
35 |   expect_equal(bins(g11, g2)$nbins(), 22)
36 | })
37 | 
38 | test_that("diagonal nbins correct", {
39 |   x <- runif(1e3)
40 |   y <- x + runif(1e3, -0.2, 0.2)
41 |   z <- rnorm(1e3, x)
42 | 
43 |   gx <- bin(x, 0.1)
44 |   gy <- bin(y, 0.1)
45 | 
46 |   expect_equal(gx$nbins(), 11)
47 |   expect_equal(gy$nbins(), 15)
48 | 
49 |   bvs <- bins(gx, gy)
50 |   expect_equal(bvs$nbins(), 165)
51 | 
52 |   bins <- vapply(seq_along(x) - 1, bvs$bin_i, integer(1))
53 |   expect_true(all(bins <= 165))
54 | })
55 | 
56 | test_that("bin and unbin are symmetric", {
57 |   g <- bin(-10:10, 1)
58 |   bvs <- bins(g, g)
59 | 
60 |   grid <- expand.grid(x = -10:10, y = -10:10)
61 |   bins <- unlist(Map(function(x, y) bvs$bin(c(x, y)), grid$x, grid$y))
62 |   unbin <- t(vapply(bins, bvs$unbin, numeric(2)))
63 |   colnames(unbin) <- c("x", "y")
64 | 
65 |   expect_equal(unbin, as.matrix(grid))
66 | })
67 | 
68 | test_that("bin and unbin are symmetric with diff binning", {
69 |   x <- c(-1, 5)
70 |   y <- c(0.1, 1)
71 | 
72 |   bx <- bin(x, 1)
73 |   by <- bin(y, 0.1)
74 |   bvs <- bins(bx, by)
75 | 
76 |   grid <- expand.grid(
77 |     x = breaks(bx)[-1] + 1 / 2,
78 |     y = breaks(by)[-1] + 0.1 / 2)
79 | 
80 |   bins <- unlist(Map(function(x, y) bvs$bin(c(x, y)), grid$x, grid$y))
81 |   unbin <- t(vapply(bins, bvs$unbin, numeric(2)))
82 |   colnames(unbin) <- c("x", "y")
83 | 
84 |   expect_equal(unbin, as.matrix(grid))
85 | })
86 | 


--------------------------------------------------------------------------------
/inst/tests/test-condense.r:
--------------------------------------------------------------------------------
 1 | context("Condense")
 2 | 
 3 | test_that("condense counts small vectors accurately", {
 4 |   x <- c(NA, 0:10)
 5 |   s1 <- condense(bin(x, 1, -0.5), summary = "count")
 6 |   # Pathological origin: need to add extra bin on end, because they're
 7 |   # right open, left closed
 8 |   s2 <- condense(bin(x, 1, 0), summary = "count")
 9 | 
10 |   expect_equivalent(s1$x, c(NA, 0:10))
11 |   expect_equivalent(s2$x, c(NA, 0:10 + 0.5))
12 | 
13 |   expect_equal(s1$.count, rep(1, length(x)))
14 |   expect_equal(s2$.count, rep(1, length(x)))
15 | })
16 | 
17 | test_that("weights modify counts", {
18 |   x <- c(NA, 0:10)
19 |   w <- rep(2, length(x))
20 |   s <- condense(bin(x, 1), w = w, summary = "count")
21 | 
22 |   expect_equivalent(s$x, c(NA, 0:10))
23 |   expect_equal(s$.count, rep(2, length(x)))
24 | })
25 | 
26 | test_that("z affects sums, but not counts", {
27 |   x <- c(NA, 0:10)
28 |   z <- 0:11
29 |   s <- condense(bin(x, 1), z = z, summary = "sum")
30 | 
31 |   expect_equal(s$.count, rep(1, length(x)))
32 |   expect_equal(s$.sum, z)
33 | })
34 | 
35 | test_that("drop = FALSE and drop = TRUE results agree", {
36 |   x <- runif(1e3)
37 |   y <- x + runif(1e3, -0.2, 0.2)
38 |   z <- rnorm(1e3, x)
39 | 
40 |   gx <- bin(x, 0.1)
41 |   gy <- bin(y, 0.1)
42 | 
43 |   count1 <- condense(gx, gy, summary = "count", drop = TRUE)
44 |   expect_equal(sum(count1$.count == 0), 0)
45 | 
46 |   count2 <- condense(gx, gy, summary = "count", drop = FALSE)
47 |   expect_equivalent(count1, count2[count2$.count != 0, ])
48 | })
49 | 
50 | # 2d tests ---------------------------------------------------------------------
51 | 
52 | test_that("grid counted accurately", {
53 |   # expand.grid orders in opposite way to bigvis
54 |   grid <- expand.grid(y = c(NA, 1:2), x = c(NA, 1:2))
55 |   s <- condense(bin(grid$x, 1), bin(grid$y, 1))
56 | 
57 |   expect_equal(s$.count, rep(1, nrow(grid)))
58 |   expect_equivalent(s$grid.x, grid$x)
59 |   expect_equivalent(s$grid.y, grid$y)
60 | })
61 | 
62 | test_that("diagonal counted correctly", {
63 |   df <- data.frame(x = c(NA, 1:2), y = c(NA, 1:2))
64 |   s <- condense(bin(df$x, 1), bin(df$y, 1))
65 | 
66 |   expect_equal(nrow(s), nrow(df))
67 |   expect_equal(s$df.x, s$df.y)
68 | })
69 | 
70 | test_that("random data doesn't crash", {
71 |   x <- runif(1e3, 8, 4963)
72 |   y <- runif(1e3, 1e-2, 1e3)
73 | 
74 |   gx <- bin(x, 10)
75 |   gy <- bin(y, 10)
76 | 
77 |   condense(gx, gy)
78 | })
79 | 


--------------------------------------------------------------------------------
/bench/group-tempvar.cpp:
--------------------------------------------------------------------------------
 1 | // In a function like 
 2 | // 
 3 | // unsigned int bin(unsigned int i) const {
 4 | //   if (ISNAN(x_[i])) return 0;
 5 | //   if (x_[i] < origin_) return 0;
 6 | // 
 7 | //   return (x_[i] - origin_) / width_ + 1;
 8 | // }
 9 | // 
10 | // should I create my own temporary double val = x_[i] ?
11 | // 
12 | // It looks like it saves ~0.2 ns per invocation, so probably not worth it for
13 | // performance reasons.
14 | 
15 | #include <Rcpp.h>
16 | using namespace Rcpp;
17 | 
18 | class Group1 {
19 |     const Fast<NumericVector> x_;
20 |     double width_;
21 |     double origin_;
22 |   public:
23 |     Group1 (const NumericVector& x, double width, double origin = 0)
24 |        : x_(x), width_(width), origin_(origin) {
25 |     }
26 | 
27 |     unsigned int bin(unsigned int i) const {
28 |       if (ISNAN(x_[i])) return 0;
29 |       if (x_[i] < origin_) return 0;
30 |       
31 |       return (x_[i] - origin_) / width_ + 1;
32 |     }
33 | 
34 |     int size() const {
35 |       return x_.size();
36 |     }
37 | };
38 | 
39 | class Group2 {
40 |     const Fast<NumericVector> x_;
41 |     double width_;
42 |     double origin_;
43 |   public:
44 |     Group2 (const NumericVector& x, double width, double origin = 0)
45 |        : x_(x), width_(width), origin_(origin) {
46 |     }
47 | 
48 |     unsigned int bin(unsigned int i) const {
49 |       double val = x_[i];
50 |       if (ISNAN(val)) return 0;
51 |       if (val < origin_) return 0;
52 |       
53 |       return (val - origin_) / width_ + 1;
54 |     }
55 | 
56 |     int size() const {
57 |       return x_.size();
58 |     }
59 | };
60 | 
61 | template<typename Group>
62 | IntegerVector group_out(const Group& group) {
63 |   int n = group.size();
64 |   IntegerVector out(n);
65 |   for(int i = 0; i < n; ++i) {
66 |     out[i] = group.bin(i);
67 |   }
68 | 
69 |   return out;
70 | }
71 | 
72 | // [[Rcpp::export]]
73 | IntegerVector group1(const NumericVector& x, double width, double origin = 0) {
74 |   return group_out(Group1(x, width, origin));
75 | }
76 | 
77 | // [[Rcpp::export]]
78 | IntegerVector group2(const NumericVector& x, double width, double origin = 0) {
79 |   return group_out(Group2(x, width, origin));
80 | }
81 | 
82 | 
83 | /*** R
84 | x <- runif(1e6)
85 | library(microbenchmark)
86 | stopifnot(all.equal(group1(x, 1/1000), group2(x, 1/1000)))
87 | 
88 | (m <- microbenchmark(
89 |   group1(x, 1/1000),
90 |   group2(x, 1/1000)
91 | ))
92 | diff(summary(m)$median) / length(x) * 1e9 / 1e3
93 | */


--------------------------------------------------------------------------------
/R/rebin.r:
--------------------------------------------------------------------------------
 1 | #' Transform condensed objects, collapsing unique bins.
 2 | #'
 3 | #' @details
 4 | #' You don't need to use \code{rebin} if you use transform: it will
 5 | #' automatically rebin for you.  You will need to use it if you manually
 6 | #' transform any grouping variables.
 7 | #'
 8 | #' @param data,`_data` a condensed summary
 9 | #' @param ... named arguments evaluated in the context of the data
10 | #' @usage \\method{transform}{condensed}(`_data`, ...)
11 | #' @keywords internal
12 | #' @examples
13 | #' x <- runif(1e4, -1, 1)
14 | #' xsum <- condense(bin(x, 1 / 50))
15 | #'
16 | #' # Transforming by hand: must use rebin
17 | #' xsum$x <- abs(xsum$x)
18 | #' rebin(xsum)
19 | #' if (require("ggplot2")) {
20 | #'   autoplot(xsum) + geom_point()
21 | #'   autoplot(rebin(xsum)) + geom_point()
22 | #' }
23 | #'
24 | #' #' Transforming with transform
25 | #' y <- x ^ 2 + runif(length(x), -0.1, 0.1)
26 | #' xysum <- condense(bin(x, 1 / 50), z = y)
27 | #' xysum <- transform(xysum, x = abs(x))
28 | #' if (require("ggplot2")) {
29 | #'   autoplot(xysum)
30 | #' }
31 | #' @export
32 | #' @method transform condensed
33 | transform.condensed <- function(`_data`, ...) {
34 |   df <- transform.data.frame(`_data`, ...)
35 |   rebin(as.condensed(df))
36 | }
37 | 
38 | #' @export
39 | #' @rdname transform.condensed
40 | rebin <- function(data) {
41 |   stopifnot(is.condensed(data))
42 | 
43 |   old_g <- data[group_vars(data)]
44 |   old_g[] <- lapply(old_g, zapsmall, digits = 3)
45 |   ids <- id(old_g, drop = TRUE)
46 |   if (!anyDuplicated(ids)) return(data)
47 | 
48 |   old_s <- data[summary_vars(data)]
49 |   new_s <- lapply(names(old_s), function(var) rebin_var(old_s, ids, var))
50 |   names(new_s) <- names(old_s)
51 | 
52 |   uids <- !duplicated(ids)
53 |   new_g <- old_g[uids, , drop = FALSE]
54 |   ord <- order(ids[uids], na.last = FALSE)
55 | 
56 |   as.condensed(data.frame(new_g[ord, , drop = FALSE], new_s))
57 | }
58 | 
59 | rebin_var <- function(df, ids, var) {
60 |   stopifnot(is.data.frame(df))
61 |   stopifnot(is.integer(ids), length(ids) == nrow(df))
62 |   stopifnot(is.character(var), length(var) == 1, var %in% names(rebinners))
63 | 
64 |   rows <- split(seq_len(nrow(df)), ids)
65 |   f <- rebinners[[var]]
66 | 
67 |   vapply(rows, function(i) f(df[i, , drop = FALSE]), numeric(1),
68 |     USE.NAMES = FALSE)
69 | }
70 | 
71 | rebinners <- list(
72 |   .median = function(df) mean(df$.median, na.rm = TRUE),
73 |   .sum = function(df) sum(df$.sum, na.rm = TRUE),
74 |   .count = function(df) sum(df$.count, na.rm = TRUE),
75 |   .mean = function(df) {
76 |     if (is.null(df$.count)) {
77 |       mean(df$.mean, na.rm = TRUE)
78 |     } else {
79 |       weighted.mean(df$.mean, df$.count)
80 |     }
81 |   }
82 | )
83 | 


--------------------------------------------------------------------------------
/bench/count.cpp:
--------------------------------------------------------------------------------
  1 | // Experiment with making the binner more generic, so that the binner
  2 | // class also stores the variable being binned over - this is important
  3 | // for separating the grouping from the numeric operation.
  4 | 
  5 | #include <Rcpp.h>
  6 | #include <iostream>
  7 | #include <algorithm>
  8 | using namespace Rcpp;
  9 | 
 10 | template<typename Binner>
 11 | std::vector<int> count_x(const NumericVector& x, Binner binner) {
 12 |   std::vector<int> out;
 13 | 
 14 |   int n = x.size();
 15 | 
 16 |   for(int i = 0; i < n; ++i) {
 17 |     int bin = binner(x[i]);
 18 |     if (bin < 0) continue;
 19 |   
 20 |     // Make sure there's enough space
 21 |     if (bin >= out.size()) {
 22 |       out.resize(bin + 1);
 23 |     }
 24 |     ++out[bin];
 25 |   }
 26 | 
 27 |   return out;
 28 | }
 29 | 
 30 | template<typename Binner>
 31 | std::vector<int> count(Binner binner) {
 32 |   std::vector<int> out;
 33 | 
 34 |   int n = binner.size();
 35 |   for(int i = 0; i < n; ++i) {
 36 |     int bin = binner.bin(i);
 37 |     if (bin < 0) continue;
 38 |   
 39 |     // Make sure there's enough space
 40 |     if (bin >= out.size()) {
 41 |       out.resize(bin + 1);
 42 |     }
 43 |     ++out[bin];
 44 |   }
 45 | 
 46 |   return out;
 47 | }
 48 | 
 49 | 
 50 | class BinFixed {
 51 |     double width_;
 52 |     double origin_;
 53 |   public:
 54 |     BinFixed (double width, double origin = 0) {
 55 |       width_ = width;
 56 |       origin_ = origin;
 57 |     }
 58 | 
 59 |     int inline operator() (double val) const { 
 60 |       if (ISNAN(val)) return 0;
 61 | 
 62 |       return (val - origin_) / width_ + 1;
 63 |     }
 64 | };
 65 | 
 66 | class BinFixed2 {
 67 |     const NumericVector& x_;
 68 |     double width_;
 69 |     double origin_;
 70 |   public:
 71 |     BinFixed2 (const NumericVector& x, double width, double origin = 0)
 72 |        : x_(x), width_(width), origin_(origin) {
 73 |     }
 74 | 
 75 |     int bin(int i) const { 
 76 |       if (ISNAN(x_[i])) return 0;
 77 |       return (x_[i] - origin_) / width_ + 1;
 78 |     }
 79 | 
 80 |     int size() const {
 81 |       return x_.size();
 82 |     }
 83 | };
 84 | 
 85 | 
 86 | // [[Rcpp::export]]
 87 | std::vector<int> count_x2(NumericVector x, double width, double origin = 0) {
 88 |   return count_x(x, BinFixed(width, origin));
 89 | }
 90 | 
 91 | // [[Rcpp::export]]
 92 | std::vector<int> count2(NumericVector x, double width, double origin = 0) {
 93 |   return count(BinFixed2(x, width, origin));
 94 | }
 95 | 
 96 | 
 97 | /*** R 
 98 | options(digits = 3)
 99 | library(microbenchmark)
100 | x <- runif(1e5)
101 | 
102 | # Breaks
103 | microbenchmark(
104 |   count_x2(x, 1/100),  
105 |   count2(x, 1/100)
106 | )
107 | 
108 | */


--------------------------------------------------------------------------------
/bench/median.cpp:
--------------------------------------------------------------------------------
  1 | // Instead of counting, compute a more complicated statistic: a median
  2 | 
  3 | #include <Rcpp.h>
  4 | #include <iostream>
  5 | #include <algorithm>
  6 | using namespace Rcpp;
  7 | 
  8 | class BinFixed {
  9 |     const Fast<NumericVector> x_;
 10 |     double width_;
 11 |     double origin_;
 12 |   public:
 13 |     BinFixed (const NumericVector& x, double width, double origin = 0)
 14 |        : x_(x), width_(width), origin_(origin) {
 15 |     }
 16 | 
 17 |     int bin(int i) const {
 18 |       if (ISNAN(x_[i])) return 0;
 19 |       return (x_[i] - origin_) / width_ + 1;
 20 |     }
 21 | 
 22 |     int size() const {
 23 |       return x_.size();
 24 |     }
 25 | };
 26 | class StatMedian {
 27 |     std::vector<double> ys;
 28 | 
 29 |   public:
 30 |     void push(double x) {
 31 |       ys.push_back(x);
 32 |     }
 33 | 
 34 |     // Adapted from http://stackoverflow.com/questions/1719070/
 35 |     double compute() {
 36 |       if (ys.empty()) return NAN;
 37 | 
 38 |       int size = ys.size();
 39 |       std::vector<double>::iterator upper = ys.begin() + (int) (size / 2);
 40 |       std::nth_element(ys.begin(), upper, ys.end());
 41 | 
 42 |       if (size % 2 == 1) {
 43 |         return *upper;
 44 |       } else {
 45 |         std::vector<double>::iterator lower = upper - 1;
 46 |         std::nth_element(ys.begin(), lower, upper);
 47 |         return (*upper + *lower) / 2.0;
 48 |       }
 49 | 
 50 |     }
 51 | };
 52 | 
 53 | template<typename Binner>
 54 | NumericVector group_median(NumericVector& y, Binner binner) {
 55 |   std::vector<StatMedian> stat;
 56 | 
 57 |   int n = binner.size();
 58 |   for(int i = 0; i < n; ++i) {
 59 |     int bin = binner.bin(i);
 60 |     if (bin < 0) continue;
 61 | 
 62 |     if (bin >= stat.size()) {
 63 |       stat.resize(bin + 1);
 64 |     }
 65 | 
 66 |     stat[bin].push(y[i]);
 67 |   }
 68 | 
 69 |   int m = stat.size();
 70 |   NumericVector res(m);
 71 |   for (int i = 0; i < m; ++i) {
 72 |     res[i] = stat[i].compute();
 73 |   }
 74 |   return res;
 75 | }
 76 | 
 77 | 
 78 | // [[Rcpp::export]]
 79 | NumericVector group_median_(NumericVector x, NumericVector y,
 80 |                             double width, double origin = 0) {
 81 |   return group_median(y, BinFixed(x, width, origin));
 82 | }
 83 | 
 84 | 
 85 | /*** R
 86 | options(digits = 3)
 87 | library(microbenchmark)
 88 | x <- runif(1e5)
 89 | y <- runif(1e5)
 90 | 
 91 | group_median_tapply <- function(x, y, width, origin = 0) {
 92 |   bins <- trunc((x - origin) / width)
 93 |   c(NaN, unname(tapply(y, bins, median)))
 94 | }
 95 | med1 <- group_median_tapply(x, y, width = 1/1000)
 96 | med2 <- group_median_(x, y, width = 1/1000)
 97 | stopifnot(all.equal(med1, med2))
 98 | 
 99 | # Breaks
100 | microbenchmark(
101 | # group_median_tapply(x, y, width = 1/1000),
102 |   group_median_(x, y, width = 1/1000)
103 | )
104 | 
105 | */
106 | 


--------------------------------------------------------------------------------
/R/ranged.r:
--------------------------------------------------------------------------------
  1 | #' A S3 class for caching the range of a vector
  2 | #'
  3 | #' This class is designed for dealing with large vectors, where the cost of
  4 | #' recomputing the range multiple times is prohibitive. It provides methods
  5 | #' for \code{\link{print}} and \code{\link{str}} that display only the range,
  6 | #' not the contents.
  7 | #'
  8 | #' @section Performance:
  9 | #' For best performance, you may want to run copy and paste the contents of
 10 | #' this function into your function, to avoid making any copies of \code{x}.
 11 | #' This is probably only necessary if you're dealing with extremely large
 12 | #' vectors, > 100 million obs.
 13 | #'
 14 | #' @param x a numeric vector
 15 | #' @param range the range of the vector (excluding missing values), if known.
 16 | #'   If unknown, it will be computed with \code{\link{frange}}, a fast C++
 17 | #'   implementation of \code{\link{range}}.
 18 | #' @export
 19 | #' @examples
 20 | #' x <- runif(1e6)
 21 | #' y <- ranged(x)
 22 | #' range(y)
 23 | #' y
 24 | #' str(y)
 25 | #'
 26 | #' # Modifications to the class currently destroy the cache
 27 | #' y[1] <- 10
 28 | #' max(y)
 29 | #' class(y)
 30 | #' z <- y + 10
 31 | #' max(z)
 32 | #' class(z)
 33 | ranged <- function(x, range = frange(x, finite = TRUE)) {
 34 |   stopifnot(is.numeric(x))
 35 | 
 36 |   # Reset range attribute so that lazy evaluation of range
 37 |   # always recomputes from scratch
 38 |   attr(x, "range") <- NULL
 39 | 
 40 |   attr(x, "range") <- range
 41 |   class(x) <- "ranged"
 42 |   x
 43 | }
 44 | 
 45 | #' Test if an object is of class ranged.
 46 | #'
 47 | #' @export
 48 | #' @param x object to test
 49 | #' @keywords internal
 50 | is.ranged <- function(x) inherits(x, "ranged")
 51 | 
 52 | #' @export
 53 | min.ranged <- function(x, ...) attr(x, "range")[1]
 54 | #' @export
 55 | max.ranged <- function(x, ...) attr(x, "range")[2]
 56 | #' @export
 57 | range.ranged <- function(x, ...) attr(x, "range")
 58 | 
 59 | #' @export
 60 | print.ranged <- function(x, ...) {
 61 |   rng <- attr(x, "range")
 62 |   # attr(x, "range") <- NULL
 63 |   # attr(x, "class") <- NULL
 64 |   # print.default(x)
 65 |   cat("Ranged 1:", length(x), " [", format(rng[1]), ", ", format(rng[2]), "]\n",
 66 |     sep = "")
 67 | }
 68 | 
 69 | #' @export
 70 | str.ranged <- function(object, ...) {
 71 |   rng <- attr(object, "range")
 72 |   cat(" Ranged [1:", length(object), "] ", format(rng[1]), "--", format(rng[2]),
 73 |     "\n", sep = "")
 74 | }
 75 | 
 76 | #' @export
 77 | Ops.ranged <- function(e1, e2) {
 78 |   attr(e1, "range") <- NULL
 79 |   class(e1) <- NULL
 80 | 
 81 |   NextMethod(e1, e2)
 82 | }
 83 | 
 84 | #' @export
 85 | "[<-.ranged" <- function(x, ..., value) {
 86 |   attr(x, "range") <- NULL
 87 |   attr(x, "class") <- NULL
 88 |   NextMethod(x, ..., value = value)
 89 | }
 90 | 
 91 | #' @export
 92 | as.data.frame.ranged <- function(x, ...) {
 93 |   n <- length(x)
 94 |   x <- list(x)
 95 |   class(x) <- "data.frame"
 96 |   attr(x, "row.names") <- c(NA_integer_, -n)
 97 | 
 98 |   x
 99 | }
100 | 


--------------------------------------------------------------------------------
/R/peel.r:
--------------------------------------------------------------------------------
 1 | #' Peel off low density regions of the data.
 2 | #'
 3 | #' Keeps specified proportion of data by removing the lowest density regions,
 4 | #' either anywhere on the plot, or for 2d, just around the edges.
 5 | #'
 6 | #' This is useful for visualisation, as an easy way of focussing on the regions
 7 | #' where the majority of the data lies.
 8 | #'
 9 | #' @param x condensed summary
10 | #' @param keep (approximate) proportion of data to keep. If \code{1}, will
11 | #'   remove all cells with counts.  All missing values will be preserved.
12 | #' @param central if \code{TRUE} peels off regions of lowest density only from
13 | #'   the outside of the data. In 2d this works by progressively peeling off
14 | #'   convex hull of the data: the current algorithm is quite slow.
15 | #'   If \code{FALSE}, just removes the lowest density regions wherever they are
16 | #'   found. Regions with 0 density are removed regardless of location.
17 | #'   Defaults to TRUE if there are two or fewer grouping variables is less.
18 | #' @export
19 | #' @examples
20 | #' x <- rt(1e5, df = 2)
21 | #' y <- rt(1e5, df = 2)
22 | #' xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 10))
23 | #' plot(xysum$x, xysum$y)
24 | #'
25 | #' plot(peel(xysum, 0.95, central = TRUE)[1:2])
26 | #' plot(peel(xysum, 0.90, central = TRUE)[1:2])
27 | #' plot(peel(xysum, 0.50, central = TRUE)[1:2])
28 | peel <- function(x, keep = 0.99, central = NULL) {
29 |   stopifnot(is.condensed(x))
30 |   stopifnot(is.numeric(keep), length(keep) == 1, keep > 0, keep <= 1)
31 |   central <- central %||% (gcol(x) <= 2)
32 |   stopifnot(is.logical(central), length(central) == 1)
33 | 
34 |   if (is.null(x$.count)) {
35 |     stop("Can only peel objects with .count variable", call. = FALSE)
36 |   }
37 | 
38 |   x <- x[x$.count > 0, , drop = FALSE]
39 |   if (keep == 1) return(x)
40 | 
41 |   complete <- complete.cases(x[group_vars(x)])
42 |   x_complete <- x[complete, , drop = FALSE]
43 | 
44 |   if (central) {
45 |     peeled <- peel_outer(x_complete, keep)
46 |   } else {
47 |     peeled <- peel_anywhere(x_complete, keep)
48 |   }
49 | 
50 |   rbind(peeled, x[!complete, , drop = FALSE])
51 | }
52 | 
53 | peel_anywhere <- function(x, keep) {
54 |   ord <- order(x$.count, decreasing = TRUE)
55 |   prop <- cumsum(x$.count[ord]) / sum(x$.count)
56 | 
57 |   ind <- which(prop >= keep)[1]
58 |   x[ord[seq_len(ind)], , drop = FALSE]
59 | }
60 | 
61 | peel_outer <- function(x, keep) {
62 |   d <- gcol(x)
63 |   if (d > 2) {
64 |     stop("Outer peeling only works with 1d or 2d data", call. = FALSE)
65 |   }
66 | 
67 |   n <- sum(x$.count)
68 |   x <- x[order(x$.count, decreasing = TRUE), ]
69 |   prop <- cumsum(x$.count) / n
70 | 
71 |   # Peel off smallest values on chull
72 |   candidate <- which(prop >= keep)
73 |   on_hull <- intersect(candidate, chull(x[1:d]))
74 |   left <- sum(x$.count[-on_hull]) / n
75 | 
76 |   while(left >= keep && length(on_hull) > 0) {
77 |     x <- x[-on_hull, , drop = FALSE]
78 |     prop <- prop[-on_hull]
79 |     candidate <- which(prop >= keep)
80 |     on_hull <- intersect(candidate, chull(x[1:d]))
81 |     left <- sum(x$.count[-on_hull]) / n
82 |   }
83 | 
84 |   x
85 | }
86 | 


--------------------------------------------------------------------------------
/bench/mean.cpp:
--------------------------------------------------------------------------------
  1 | // Instead of counting, compute a more complicated statistic: a weighted mean
  2 | 
  3 | #include <Rcpp.h>
  4 | #include <iostream>
  5 | #include <algorithm>
  6 | using namespace Rcpp;
  7 | 
  8 | class BinFixed {
  9 |     const Fast<NumericVector> x_;
 10 |     double width_;
 11 |     double origin_;
 12 |   public:
 13 |     BinFixed (const NumericVector& x, double width, double origin = 0)
 14 |        : x_(x), width_(width), origin_(origin) {
 15 |     }
 16 | 
 17 |     int bin(int i) const { 
 18 |       if (ISNAN(x_[i])) return 0;
 19 |       return (x_[i] - origin_) / width_ + 1;
 20 |     }
 21 | 
 22 |     int size() const {
 23 |       return x_.size();
 24 |     }
 25 | };
 26 | 
 27 | template<typename Binner>
 28 | NumericVector group_mean(NumericVector& y, NumericVector& weight, Binner binner) {
 29 |   std::vector<double> count;
 30 |   std::vector<double> sum;
 31 | 
 32 |   int n = binner.size();
 33 |   for(int i = 0; i < n; ++i) {
 34 |     int bin = binner.bin(i);
 35 |     if (bin < 0) continue;
 36 |   
 37 |     // Make sure there's enough space
 38 |     if (bin >= sum.size()) {
 39 |       sum.resize(bin + 1);
 40 |       count.resize(bin + 1);
 41 |     }
 42 | 
 43 |     count[bin] += weight[i];
 44 |     sum[bin] += y[i];
 45 |   }
 46 | 
 47 |   int m = count.size();
 48 |   NumericVector res(m);
 49 |   for (int i = 0; i < m; ++i) {
 50 |     res[i] = sum[i] / count[i];
 51 |   }
 52 |   return res;
 53 | }
 54 | 
 55 | 
 56 | class StatMean {
 57 |     double count;
 58 |     double sum;
 59 | 
 60 |   public:
 61 |     StatMean () : count(0), sum(0) {
 62 |     }
 63 |     void push(double x, double weight) {
 64 |       count += weight;
 65 |       sum += x;
 66 |     }
 67 | 
 68 |     double compute() {
 69 |       return sum / count;
 70 |     }
 71 | };
 72 | 
 73 | template<typename Binner>
 74 | NumericVector group_mean2(NumericVector& y, NumericVector& weight, Binner binner) {
 75 |   std::vector<StatMean> stat;
 76 | 
 77 |   int n = binner.size();
 78 |   for(int i = 0; i < n; ++i) {
 79 |     int bin = binner.bin(i);
 80 |     if (bin < 0) continue;
 81 |   
 82 |     if (bin >= stat.size()) {
 83 |       stat.resize(bin + 1);
 84 |     }
 85 | 
 86 |     stat[bin].push(y[i], weight[i]);
 87 |   }
 88 | 
 89 |   int m = stat.size();
 90 |   NumericVector res(m);
 91 |   for (int i = 0; i < m; ++i) {
 92 |     res[i] = stat[i].compute();
 93 |   }
 94 |   return res;
 95 | }
 96 | 
 97 | 
 98 | // [[Rcpp::export]]
 99 | NumericVector group_mean_(NumericVector x, NumericVector y, NumericVector weight, 
100 |                        double width, double origin = 0) {
101 |   return group_mean(y, weight, BinFixed(x, width, origin));
102 | }
103 | // [[Rcpp::export]]
104 | NumericVector group_mean2_(NumericVector x, NumericVector y, NumericVector weight, 
105 |                        double width, double origin = 0) {
106 |   return group_mean2(y, weight, BinFixed(x, width, origin));
107 | }
108 | 
109 | 
110 | /*** R 
111 | options(digits = 3)
112 | library(microbenchmark)
113 | x <- runif(1e6)
114 | y <- runif(1e6)
115 | weight <- rep(1, 1e6)
116 | 
117 | # Breaks
118 | microbenchmark(
119 |   group_mean_(x, y, weight, width = 1/100),
120 |   group_mean2_(x, y, weight, width = 1/100)
121 | )
122 | 
123 | */


--------------------------------------------------------------------------------
/R/smooth.r:
--------------------------------------------------------------------------------
 1 | #' Smooth a condensed data frame.
 2 | #'
 3 | #' @param x a condensed summary
 4 | #' @param h numeric vector of bandwidths, one for each grouping variable in
 5 | #'   \code{x}
 6 | #' @param var variable to smooth
 7 | #' @param grid a data frame with the grouping colums as x.  In order for the
 8 | #'   factored version of \code{smooth_nd} to work, this grid must be a superset
 9 | #'   of \code{x}.
10 | #' @param type type of smoothing to use.  Current options are \code{"mean"},
11 | #'   a kernel weighted mean; \code{"regression"}, a kernel weighted local
12 | #'   regression; and \code{"robust_regression"}, robust kernel weighted local
13 | #'   regression in the style of \code{\link{loess}}.  Unique prefixes are also
14 | #'   acceptable.
15 | #' @param factor if \code{TRUE} compute the n-dimensional smooth by a sequence
16 | #'   of 1d smoothes. For \code{type = "mean"} the results are always the same
17 | #    as \code{FALSE}; for \code{type = "regress"} they will be equal if the
18 | #'   grid values are uncorrelated (e.g. the grid is complete at every location);
19 | #'   and is very approximate for \code{type = "robust"}.
20 | #' @export
21 | #' @examples
22 | #' x <- runif(1e5)
23 | #' xsum <- condense(bin(x, 1 / 100))
24 | #' xsmu1 <- smooth(xsum, 5 / 100)
25 | #' xsmu2 <- smooth(xsum, 5 / 100, factor = FALSE)
26 | #'
27 | #' # More challenging distribution
28 | #' x <- rchallenge(1e4)
29 | #' xsum <- condense(bin(x, 0.1))
30 | #' xsmu <- smooth(xsum, 1)
31 | #'
32 | #' plot(xsum$x, xsum$.count, type = "l")
33 | #' lines(xsmu$x, xsmu$.count, col = "red")
34 | #'
35 | #' xsmu2 <- smooth(xsum, 1, type = "regress")
36 | #' plot(xsmu$x, xsmu$.count, type = "l", xlim = c(0, 50))
37 | #' lines(xsmu2$x, xsmu2$.count, col = "red")
38 | #' # Note difference in tails
39 | smooth <- function(x, h, var = summary_vars(x)[1], grid = NULL, type = "mean",
40 |                    factor = TRUE) {
41 |   stopifnot(is.condensed(x))
42 |   stopifnot(is.numeric(h), all(h > 0))
43 |   type <- match.arg(type, c("mean", "regression", "robust_regression"))
44 | 
45 |   if (type != "mean" && !factor) {
46 |     stop("Only factored approximations available for types other than mean",
47 |       call. = FALSE)
48 |   }
49 | 
50 |   grid_in <- as.matrix(x[group_vars(x)])
51 |   grid_out <- grid %||% grid_in
52 |   stopifnot(is.matrix(grid_out), is.numeric(grid_out),
53 |     ncol(grid_out) == ncol(grid_in), nrow(grid_out) > 0)
54 | 
55 |   z <- x[[var]]
56 |   w <- if (var != ".count" && x %contains% ".count") x$.count else numeric()
57 | 
58 |   if (factor) {
59 |     for(i in 1:ncol(grid_in)) {
60 |       # smooth_nd_1 is a C++ function, so var is 0 indexed
61 |       z <- smooth_nd_1(grid_in, z, w, grid_out, var = i - 1, h = h[i],
62 |         type = type)
63 |     }
64 |   } else {
65 |     z <- smooth_nd(grid_in, z, w, grid_out, h)
66 |   }
67 | 
68 |   out <- data.frame(grid_out)
69 |   out[[var]] <- z
70 |   structure(out, class = c("condensed", class(out)))
71 | }
72 | 
73 | complete_grid <- function(df) {
74 |   stopifnot(is.data.frame(df))
75 | 
76 |   breaks <- function(width, origin, nbins) {
77 |     origin + width * seq.int(nbins)
78 |   }
79 | 
80 |   cols <- lapply(df, function(x) do.call(breaks, attributes(x)))
81 |   expand.grid(cols, KEEP.OUT.ATTRS = FALSE)
82 | }
83 | 


--------------------------------------------------------------------------------
/src/summary.h:
--------------------------------------------------------------------------------
  1 | #include <Rcpp.h>
  2 | #include "stats.h"
  3 | using namespace Rcpp;
  4 | 
  5 | class SummaryMoments {
  6 |     int i_;
  7 |     double weight;
  8 |     double mean;
  9 |     double m2;
 10 | 
 11 |   public:
 12 |     SummaryMoments (int i) : i_(i), weight(0), mean(0), m2(0) {
 13 |       if (i > 2) stop("Invalid moment");
 14 |     }
 15 | 
 16 |     // Algorithm adapted from 
 17 |     // http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Weighted_incremental_algorithm
 18 |     void push(double y, double w) {
 19 |       if (NumericVector::is_na(y)) return;
 20 | 
 21 |       // counts and weights
 22 |       weight += w;
 23 | 
 24 |       // mean
 25 |       if (i_ < 1) return;
 26 |       double delta = y - mean;
 27 |       mean += delta * w / weight;
 28 | 
 29 |       // variance
 30 |       if (i_ < 2) return;      
 31 |       m2 += delta * delta * w * (1 - w / weight);
 32 | 
 33 |       return;
 34 |     }
 35 | 
 36 |     const int size() const {
 37 |       return i_ + 1;
 38 |     }
 39 | 
 40 |     double compute(int i) const {
 41 |       switch (i) {
 42 |         case 0: return weight;
 43 |         case 1: return (weight == 0) ? NAN : mean;
 44 |         case 2: return (weight == 0) ? NAN : pow(m2 / (weight - 1), 0.5);
 45 |         default: 
 46 |           stop("Invalid output requested");
 47 |           return NAN;
 48 |       }
 49 |     }
 50 | 
 51 |     std::string name(int i) const {
 52 |       switch (i) {
 53 |         case 0: return "count";
 54 |         case 1: return "mean";
 55 |         case 2: return "sd";
 56 |         default: 
 57 |           stop("Invalid output requested");
 58 |           return "";
 59 |       }
 60 |     }
 61 | };
 62 | 
 63 | class SummarySum {
 64 |     int i_;
 65 |     int weight;
 66 |     double sum;
 67 | 
 68 |   public:
 69 |     SummarySum (int i) : i_(i), weight(0), sum(0) {
 70 |       if (i > 1 || i < 0) stop("Invalid moment");
 71 |     }
 72 | 
 73 |     void push(double y, double w) {
 74 |       if (NumericVector::is_na(y)) return;
 75 | 
 76 |       weight += w;
 77 |       if (i_ < 1) return;
 78 | 
 79 |       sum += y * w;
 80 |     }
 81 | 
 82 |     const int size() const {
 83 |       return i_ + 1;
 84 |     }
 85 | 
 86 |     double compute(int i) const  {
 87 |       switch (i) {
 88 |         case 0: return weight;
 89 |         case 1: return sum;
 90 |         default: 
 91 |           stop("Invalid output requested");
 92 |           return NAN;
 93 |       }
 94 |     }
 95 | 
 96 |     std::string name(int i) const {
 97 |       switch (i) {
 98 |         case 0: return "count";
 99 |         case 1: return "sum";
100 |         default: 
101 |           stop("Invalid output requested");
102 |           return "";
103 |       }
104 |     }
105 | 
106 | };
107 | 
108 | class SummaryMedian {
109 |     std::vector<double> ys;
110 | 
111 |   public:
112 |     void push(double y, double w) {
113 |       if (NumericVector::is_na(y)) return;
114 | 
115 |       ys.push_back(y);
116 |     }
117 | 
118 |     int size() const {
119 |       return 1;
120 |     }
121 | 
122 |     // Adapted from http://stackoverflow.com/questions/1719070/
123 |     double compute(int i) {
124 |       return median(&ys);
125 |     }
126 | 
127 |     std::string name(int i) const {
128 |       return "median";
129 |     }
130 | };
131 | 


--------------------------------------------------------------------------------
/src/stats.cpp:
--------------------------------------------------------------------------------
  1 | #include <Rcpp.h>
  2 | using namespace Rcpp;
  3 | 
  4 | struct Regression {
  5 |   double alpha, beta;
  6 | };
  7 | 
  8 | // [[Rcpp::export]]
  9 | double bisquare(double u, double b) {
 10 |   u = fabs(u);
 11 |   return (u < b) ? pow(1 - pow(u / b, 2), 2) : 0;
 12 | }
 13 | 
 14 | Regression simpleLinearRegression(const std::vector<double>& x, 
 15 |                                   const std::vector<double>& y,
 16 |                                   const std::vector<double>& w) {
 17 |   int n = x.size();
 18 | 
 19 |   double x_wsum = 0, y_wsum = 0, w_sum = 0;
 20 |   for (int i = 0; i < n; ++i) {
 21 |     x_wsum += x[i] * w[i];
 22 |     y_wsum += y[i] * w[i];
 23 |     w_sum += w[i];
 24 |   };
 25 |   double x_mean = x_wsum / w_sum, y_mean = y_wsum / w_sum;
 26 | 
 27 |   double var_xy = 0, var_x = 0;
 28 |   for (int i = 0; i < n; ++i) {
 29 |     var_xy += w[i] * (x[i] - x_mean) * (y[i] - y_mean);
 30 |     var_x += w[i] * pow((x[i] - x_mean), 2);
 31 |   }
 32 | 
 33 |   Regression results;
 34 |   results.beta = (var_xy / var_x);
 35 |   results.alpha = y_mean - results.beta * x_mean;
 36 |   return results;
 37 | }
 38 | 
 39 | // [[Rcpp::export]]
 40 | NumericVector regress(const std::vector<double>& x, 
 41 |                       const std::vector<double>& y,
 42 |                       const std::vector<double>& w) {
 43 |   Regression regression = simpleLinearRegression(x, y, w);
 44 |   return NumericVector::create(regression.alpha, regression.beta);
 45 | }
 46 | 
 47 | double median(std::vector<double>* x) {
 48 |   if (x->empty()) return NAN;
 49 | 
 50 |   int size = x->size();
 51 |   std::vector<double>::iterator upper = x->begin() + (int) (size / 2);
 52 |   std::nth_element(x->begin(), upper, x->end());
 53 | 
 54 |   if (size % 2 == 1) {
 55 |     return *upper;
 56 |   } else {
 57 |     std::vector<double>::iterator lower = upper - 1;
 58 |     std::nth_element(x->begin(), lower, upper);
 59 |     return (*upper + *lower) / 2.0;
 60 |   }  
 61 | }
 62 | 
 63 | // [[Rcpp::export("medianC")]]
 64 | double median(const std::vector<double>& x) {
 65 |   std::vector<double> x_(x);
 66 |   return median(&x_);
 67 | }
 68 | 
 69 | 
 70 | Regression simpleRobustRegression(const std::vector<double>& x, 
 71 |                        const std::vector<double>& y,
 72 |                        const std::vector<double>& w,
 73 |                        int iterations = 3) {
 74 |   int n = x.size();
 75 |   Regression prev = simpleLinearRegression(x, y, w);
 76 | 
 77 |   for (int k = 0; k < iterations; ++k) {
 78 |     std::vector<double> resid(n);
 79 |     for (int i = 0; i < n; ++i) {
 80 |       resid[i] = fabs(y[i] - (prev.alpha + prev.beta * x[i]));
 81 |     }
 82 | 
 83 |     std::vector<double> w_(w);
 84 |     double b = 6 * median(resid);
 85 |     if (b < 1e-20) break;
 86 |     for (int i = 0; i < n; ++i) {
 87 |       w_[i] *= bisquare(resid[i], b);
 88 |     }
 89 | 
 90 |     prev = simpleLinearRegression(x, y, w_);
 91 |   }
 92 | 
 93 |   return prev;
 94 | }
 95 | 
 96 | // [[Rcpp::export]]
 97 | NumericVector regress_robust(const std::vector<double>& x, 
 98 |                              const std::vector<double>& y,
 99 |                              const std::vector<double>& w,
100 |                              int iterations = 3) {
101 |   Regression regression = simpleRobustRegression(x, y, w, iterations);
102 |   return NumericVector::create(regression.alpha, regression.beta);
103 | }
104 | 


--------------------------------------------------------------------------------
/R/condensed.r:
--------------------------------------------------------------------------------
  1 | #' Condensed: an S3 class for condensed summaries.
  2 | #'
  3 | #' This object managed the properties of condensed (summarised) data frames.
  4 | #'
  5 | #' @section S3 methods:
  6 | #'
  7 | #' Mathematical functions with methods for \code{binsum} object will modify
  8 | #' the x column of the data frame and \code{\link{rebin}} the data, calculating
  9 | #' updated summary statistics.
 10 | #'
 11 | #' Currently methods are provided for the \code{Math} group generic,
 12 | #' logical comparison and arithmetic operators, and
 13 | #' \code{\link[plyr]{round_any}}.
 14 | #'
 15 | #' @param groups list of \code{\link{bin}}ed objects
 16 | #' @param grouped,summary output from C++ condense function
 17 | #' @keywords internal
 18 | #' @examples
 19 | #' if (require("ggplot2")) {
 20 | #'
 21 | #' x <- rchallenge(1e4)
 22 | #' xsum <- condense(bin(x, 1 / 10))
 23 | #'
 24 | #' # Basic math operations just modify the first column
 25 | #' autoplot(xsum)
 26 | #' autoplot(xsum * 10)
 27 | #' autoplot(xsum - 30)
 28 | #' autoplot(abs(xsum - 30))
 29 | #'
 30 | #' # Similarly, logical operations work on the first col
 31 | #' autoplot(xsum[xsum > 10, ])
 32 | #'}
 33 | condensed <- function(groups, grouped, summary) {
 34 |   grouped <- as.data.frame(grouped)
 35 |   summary <- as.data.frame(summary)
 36 | 
 37 |   for (i in seq_along(groups)) {
 38 |     grouped[[i]] <- dgrid(grouped[[i]],
 39 |       groups[[i]]$width(), groups[[i]]$origin(), groups[[i]]$nbins())
 40 |   }
 41 | 
 42 |   names(summary) <- paste0(".", names(summary))
 43 | 
 44 |   df <- data.frame(grouped, summary)
 45 |   class(df) <- c("condensed", class(df))
 46 |   df
 47 | }
 48 | 
 49 | #' @export
 50 | #' @rdname condensed
 51 | #' @param x object to test or coerce
 52 | is.condensed <- function(x) inherits(x, "condensed")
 53 | 
 54 | #' @export
 55 | #' @rdname condensed
 56 | as.condensed <- function(x) UseMethod("as.condensed")
 57 | #' @export
 58 | as.condensed.condensed <- function(x) x
 59 | #' @export
 60 | as.condensed.data.frame <- function(x) {
 61 |   structure(x, class = c("condensed", class(x)))
 62 | }
 63 | 
 64 | summary_vars <- function(x) {
 65 |   stopifnot(is.condensed(x))
 66 |   nm <- names(x)
 67 |   names(x)[grepl("^\\.", names(x))]
 68 | }
 69 | 
 70 | group_vars <- function(x) {
 71 |   setdiff(names(x), summary_vars(x))
 72 | }
 73 | 
 74 | gcol <- function(x) length(group_vars(x))
 75 | 
 76 | 
 77 | #' @export
 78 | Math.condensed <- function(x, ...) {
 79 |   generic <- match.fun(.Generic)
 80 |   x[[1]] <- generic(x[[1]], ...)
 81 |   rebin(x)
 82 | }
 83 | 
 84 | #' @export
 85 | Ops.condensed <- function(e1, e2) {
 86 |   logical_ops <- c("==", "!=", "<", "<=", ">=", ">")
 87 |   math_ops <- c("+", "-", "*", "/", "^", "%%", "%/%")
 88 | 
 89 |   generic <- match.fun(.Generic)
 90 |   if (.Generic %in% logical_ops)  {
 91 |     l <- generic(e1[[1]], e2)
 92 |     l[1] <- TRUE # always preserve missings
 93 |     l & !is.na(l)
 94 |   } else if (.Generic %in% math_ops) {
 95 |     e1[[1]] <- generic(e1[[1]], e2)
 96 |     rebin(e1)
 97 |   } else {
 98 |     stop(.Generic, " not supported for condensed objects", call. = FALSE)
 99 |   }
100 | }
101 | 
102 | #' Round any method for condensed objects
103 | #'
104 | #' @inheritParams plyr::round_any
105 | #' @export
106 | #' @keywords internal
107 | round_any.condensed <- function(x, accuracy, f = round) {
108 |   gvars <- group_vars(x)
109 |   x[gvars] <- lapply(x[gvars], plyr::round_any, accuracy = accuracy, f = f)
110 |   rebin(x)
111 | }
112 | 


--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
  1 | # This file was generated by Rcpp::compileAttributes
  2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
  3 | 
  4 | condense_count <- function(x, z, weight, drop = FALSE) {
  5 |     .Call('bigvis_condense_count', PACKAGE = 'bigvis', x, z, weight, drop)
  6 | }
  7 | 
  8 | condense_sum <- function(x, z, weight, drop = FALSE) {
  9 |     .Call('bigvis_condense_sum', PACKAGE = 'bigvis', x, z, weight, drop)
 10 | }
 11 | 
 12 | condense_mean <- function(x, z, weight, drop = FALSE) {
 13 |     .Call('bigvis_condense_mean', PACKAGE = 'bigvis', x, z, weight, drop)
 14 | }
 15 | 
 16 | condense_sd <- function(x, z, weight, drop = FALSE) {
 17 |     .Call('bigvis_condense_sd', PACKAGE = 'bigvis', x, z, weight, drop)
 18 | }
 19 | 
 20 | condense_median <- function(x, z, weight, drop = FALSE) {
 21 |     .Call('bigvis_condense_median', PACKAGE = 'bigvis', x, z, weight, drop)
 22 | }
 23 | 
 24 | double_diff_sum <- function(bin, count) {
 25 |     .Call('bigvis_double_diff_sum', PACKAGE = 'bigvis', bin, count)
 26 | }
 27 | 
 28 | #' Efficient implementation of range.
 29 | #'
 30 | #' This is an efficient C++ implementation of range for numeric vectors:
 31 | #' it avoids S3 dispatch, and computes both min and max in a single pass
 32 | #' through the input.
 33 | #'
 34 | #' If \code{x} has a \code{range} attribute (e.g. it's a \code{\link{ranged}}
 35 | #' object), it will be used instead of computing the range from scratch.
 36 | #' 
 37 | #' @param x a numeric vector, or a \code{\link{ranged}} object
 38 | #' @param finite If \code{TRUE} ignores missing values and infinities. Note
 39 | #'   that if the vector is empty, or only contains missing values, 
 40 | #'   \code{frange} will return \code{c(Inf, -Inf)} because those are the
 41 | #'   identity values for \code{\link{min}} and \code{\link{max}} respectively.
 42 | #' @export
 43 | #' @examples
 44 | #' x <- runif(1e6)
 45 | #' system.time(range(x))
 46 | #' system.time(frange(x))
 47 | #'
 48 | #' rx <- ranged(x)
 49 | #' system.time(frange(rx))
 50 | frange <- function(x, finite = TRUE) {
 51 |     .Call('bigvis_frange', PACKAGE = 'bigvis', x, finite)
 52 | }
 53 | 
 54 | group_fixed <- function(x, width, origin = 0) {
 55 |     .Call('bigvis_group_fixed', PACKAGE = 'bigvis', x, width, origin)
 56 | }
 57 | 
 58 | group_rect <- function(x, y, x_width, y_width, x_origin, y_origin) {
 59 |     .Call('bigvis_group_rect', PACKAGE = 'bigvis', x, y, x_width, y_width, x_origin, y_origin)
 60 | }
 61 | 
 62 | group_hex <- function(x, y, x_width, y_width, x_origin, y_origin, x_max) {
 63 |     .Call('bigvis_group_hex', PACKAGE = 'bigvis', x, y, x_width, y_width, x_origin, y_origin, x_max)
 64 | }
 65 | 
 66 | lowerBound <- function(x, breaks) {
 67 |     .Call('bigvis_lowerBound', PACKAGE = 'bigvis', x, breaks)
 68 | }
 69 | 
 70 | smooth_nd_1 <- function(grid_in, z_in, w_in_, grid_out, var, h, type = "mean") {
 71 |     .Call('bigvis_smooth_nd_1', PACKAGE = 'bigvis', grid_in, z_in, w_in_, grid_out, var, h, type)
 72 | }
 73 | 
 74 | smooth_nd <- function(grid_in, z_in, w_in_, grid_out, h) {
 75 |     .Call('bigvis_smooth_nd', PACKAGE = 'bigvis', grid_in, z_in, w_in_, grid_out, h)
 76 | }
 77 | 
 78 | bisquare <- function(u, b) {
 79 |     .Call('bigvis_bisquare', PACKAGE = 'bigvis', u, b)
 80 | }
 81 | 
 82 | regress <- function(x, y, w) {
 83 |     .Call('bigvis_regress', PACKAGE = 'bigvis', x, y, w)
 84 | }
 85 | 
 86 | medianC <- function(x) {
 87 |     .Call('bigvis_median', PACKAGE = 'bigvis', x)
 88 | }
 89 | 
 90 | regress_robust <- function(x, y, w, iterations = 3L) {
 91 |     .Call('bigvis_regress_robust', PACKAGE = 'bigvis', x, y, w, iterations)
 92 | }
 93 | 
 94 | compute_moments <- function(x) {
 95 |     .Call('bigvis_compute_moments', PACKAGE = 'bigvis', x)
 96 | }
 97 | 
 98 | compute_sum <- function(x) {
 99 |     .Call('bigvis_compute_sum', PACKAGE = 'bigvis', x)
100 | }
101 | 
102 | compute_median <- function(x) {
103 |     .Call('bigvis_compute_median', PACKAGE = 'bigvis', x)
104 | }
105 | 
106 | 


--------------------------------------------------------------------------------
/src/group.h:
--------------------------------------------------------------------------------
  1 | #include <Rcpp.h>
  2 | using namespace Rcpp;
  3 | 
  4 | NumericVector frange(const NumericVector& x, const bool finite = true);
  5 | 
  6 | class GroupFixed {
  7 |     const NumericVector x_;
  8 |     double width_;
  9 |     double origin_;
 10 |   public:
 11 |     GroupFixed (NumericVector x, double width, double origin = 0)
 12 |        : x_(x), width_(width), origin_(origin) {
 13 |     }
 14 | 
 15 |     int bin_i(int i) const {
 16 |       if (ISNAN(x_[i]) || x_[i] == INFINITY || x_[i] == -INFINITY) return 0;
 17 |       if (x_[i] < origin_) return 0;
 18 |       
 19 |       return bin(x_[i]);
 20 |     }
 21 | 
 22 |     int bin(double x) const {
 23 |       return (x - origin_) / width_ + 1;
 24 |     }
 25 | 
 26 |     double unbin(int bin) const {
 27 |       if (bin == 0) return(NAN);
 28 |       return (bin - 1) * width_ + origin_;
 29 |     }
 30 | 
 31 |     double origin() const {
 32 |       return origin_;
 33 |     }
 34 |     double width() const {
 35 |       return width_;
 36 |     }
 37 | 
 38 | 
 39 |     int size() const {
 40 |       return x_.size();
 41 |     }
 42 | 
 43 |     int nbins() const {
 44 |       double max = frange(x_)(1);
 45 |       double dest = floor((max - origin_) / width_) * width_ + origin_;
 46 | 
 47 |       // + 1 for missing values
 48 |       // + 1 if highest value is on right-open boundary
 49 |       return (dest - origin_) / width_ + 1 + ((max >= dest) ? 1 : 0);
 50 |     }
 51 | 
 52 | };
 53 | 
 54 | template<typename Group>
 55 | class Group2d {
 56 |     const Group& x_;
 57 |     const Group& y_;
 58 |     int x_bins_;
 59 |     int y_bins_;
 60 | 
 61 |   public:
 62 |     Group2d (const Group& x, const Group& y) : x_(x), y_(y) {
 63 |       if (x_.size() != y_.size()) {
 64 |         stop("x and y are not equal sizes");
 65 |       }
 66 |       x_bins_ = x_.nbins();
 67 |       y_bins_ = y_.nbins();
 68 | 
 69 |       // Rcout << "x_bins: " << x_bins_ << " y_bins: " << y_bins_ << "\n";
 70 |     }
 71 | 
 72 |     int bin_i(int i) const {
 73 |       int x_bin = x_.bin_i(i), y_bin = y_.bin_i(i);
 74 |       int bin = y_bin * x_bins_ + x_bin;
 75 |       // Rcout << i << ": (" << x_bin << "," << y_bin << ") -> " << bin << "\n";
 76 |       return bin;
 77 |     }
 78 | 
 79 |     int size() const {
 80 |       return x_.size();
 81 |     }
 82 | 
 83 |     int nbins() const {
 84 |       return x_bins_ * y_bins_;
 85 |     }
 86 | };
 87 | 
 88 | 
 89 | template<typename Group>
 90 | class GroupNd {
 91 |     const std::vector<Group> groups_;
 92 |     const int ngroups_;
 93 | 
 94 |     int size_;
 95 |     std::vector<int> bins_;
 96 | 
 97 |   public:
 98 |     GroupNd (const std::vector<Group> groups) 
 99 |         : groups_(groups), ngroups_(groups.size()) {
100 |       if (groups.size() == 0) {
101 |         stop("Empty groups vector passed to GroupCompound");
102 |       }
103 | 
104 |       size_ = groups[0].size();
105 | 
106 |       bins_[0] = 1;
107 |       for (int i = 0; i < ngroups_ - 1; ++i) {
108 |         if (groups_[i].size() != size_) stop("Groups not equal sizes");
109 | 
110 |         bins_[i + 1] = bins_[i] * groups_[i].nbins(); 
111 |       }
112 |     }
113 | 
114 |     int bin_i(int i) const {
115 |       int bin = 0;
116 | 
117 |       for (int j = 0; j < ngroups_; ++j) {
118 |         bin += groups_[j].bin(i) * bins_[j];
119 |       }
120 | 
121 |       return bin;
122 |     }
123 | 
124 |     // int nbins() const {
125 |     //   return bins_[ngroups_ - 1];
126 |     // }
127 | 
128 |     int ngroups() const {
129 |       return groups_.size();
130 |     }
131 | 
132 |     int size() const {
133 |       return size_;
134 |     }
135 | 
136 |     std::vector<double> unbin(int bin) const {
137 |       std::vector<double> bins(ngroups_);
138 | 
139 |       for (int j = 0; j < ngroups_; ++j) {
140 |         int bin_j = bin % bins_[j];
141 |         bins[j] = groups_[j].unbin(bin_j);
142 | 
143 |         bin = bin - bin * bins_[j];
144 |       }
145 | 
146 |       return bins;
147 |     }
148 | 
149 | };
150 | 


--------------------------------------------------------------------------------
/R/h.r:
--------------------------------------------------------------------------------
  1 | #' Find "best" smoothing parameter using leave-one-out cross validation.
  2 | #'
  3 | #' Minimises the leave-one-out estimate of root mean-squared error to find
  4 | #' find the "optimal" bandwidth for smoothing.
  5 | #'
  6 | #' L-BFGS-B optimisation is used to constrain the bandwidths to be greater
  7 | #' than the binwidths: if the bandwidth is smaller than the binwidth it's
  8 | #' impossible to compute the rmse because no smoothing occurs. The tolerance
  9 | #' is set relatively high for numerical optimisation since the precise choice
 10 | #' of bandwidth makes little difference visually, and we're unlikely to have
 11 | #' sufficient data to make a statistically significant choice anyway.
 12 | #'
 13 | #' @param x condensed summary to smooth
 14 | #' @param h_init initial values of bandwidths to start search out. If not
 15 | #'  specified defaults to 5 times the binwidth of each variable.
 16 | #' @param ... other arguments (like \code{var}) passed on to
 17 | #'  \code{\link{rmse_cv}}
 18 | #' @param tol numerical tolerance, defaults to 1\%.
 19 | #' @param control additional control parameters passed on to \code{\link{optim}}
 20 | #'   The most useful argument is probably trace, which makes it possible to
 21 | #'   follow the progress of the optimisation.
 22 | #' @family bandwidth estimation functions
 23 | #' @return a single numeric value representing the bandwidth that minimises
 24 | #'   the leave-one-out estimate of rmse. Vector has attributes
 25 | #'   \code{evaluations} giving the number of times the objective function
 26 | #'   was evaluated. If the optimisation does not converge, or smoothing is not
 27 | #'   needed (i.e. the estimate is on the lower bounds), a warning is thrown.
 28 | #' @export
 29 | #' @examples
 30 | #' \donttest{
 31 | #' x <- rchallenge(1e4)
 32 | #' xsum <- condense(bin(x, 1 / 10))
 33 | #' h <- best_h(xsum, control = list(trace = 3, REPORT = 1))
 34 | #'
 35 | #' if (require("ggplot2")) {
 36 | #' autoplot(xsum)
 37 | #' autoplot(smooth(xsum, h))
 38 | #' }
 39 | #' }
 40 | best_h <- function(x, h_init = NULL, ..., tol = 1e-2, control = list()) {
 41 |   stopifnot(is.condensed(x))
 42 | 
 43 |   gvars <- group_vars(x)
 44 |   widths <- vapply(x[gvars], attr, "width", FUN.VALUE = numeric(1))
 45 |   h_init <- h_init %||% widths * 5
 46 |   stopifnot(is.numeric(h_init), length(h_init) == length(gvars))
 47 | 
 48 |   stopifnot(is.list(control))
 49 |   control <- modifyList(list(factr = tol / .Machine$double.eps), control)
 50 | 
 51 |   # Optimise
 52 |   rmse <- function(h) {
 53 |     rmse_cv(x, h, ...)
 54 |   }
 55 |   res <- optim(h_init, rmse, method = "L-BFGS-B", lower = widths * 1.01,
 56 |     control = control)
 57 |   h <- unname(res$par)
 58 | 
 59 |   # Feedback
 60 |   if (res$convergence != 0) {
 61 |     warning("Failed to converge: ", res$message, call. = FALSE)
 62 |   } else if (rel_dist(h, widths) < 1e-3) {
 63 |     warning("h close to lower bound: smoothing not needed", call. = FALSE)
 64 |   }
 65 |   structure(h, evaluations = res$counts[1], conv = res$convergence)
 66 | }
 67 | 
 68 | rel_dist <- function(x, y) {
 69 |   mean(abs(x - y) / abs(x + y))
 70 | }
 71 | 
 72 | #' Generate grid of plausible bandwidths for condensed summary.
 73 | #'
 74 | #' By default, the bandwidths start at the bin width, and then continue
 75 | #' up 50 (\code{n}) steps until 20 (\code{max}) times the bin width.
 76 | #'
 77 | #' @param x a condensed summary
 78 | #' @param n number of bandwidths to generate (in each dimension)
 79 | #' @param max maximum bandwidth to generate, as multiple of binwidth.
 80 | #' @family bandwidth estimation functions
 81 | #' @export
 82 | #' @examples
 83 | #' x <- rchallenge(1e4)
 84 | #' xsum <- condense(bin(x, 1 / 10))
 85 | #' h_grid(xsum)
 86 | #'
 87 | #' y <- runif(1e4)
 88 | #' xysum <- condense(bin(x, 1 / 10), bin(y, 1 / 100))
 89 | #' h_grid(xysum, n = 10)
 90 | h_grid <- function(x, n = 50, max = 20) {
 91 |   stopifnot(is.condensed(x))
 92 |   stopifnot(is.numeric(n), length(n) == 1, n > 0)
 93 |   stopifnot(is.numeric(max), length(max) == 1, max > 0)
 94 | 
 95 |   gs <- x[group_vars(x)]
 96 |   widths <- vapply(gs, attr, "width", FUN.VALUE = numeric(1))
 97 | 
 98 |   hs <- lapply(widths, function(w) w * seq(2, max, length = n))
 99 |   expand.grid(hs, KEEP.OUT.ATTRS = FALSE)
100 | }
101 | 


--------------------------------------------------------------------------------
/inst/include/bigvis.h:
--------------------------------------------------------------------------------
  1 | #include <Rcpp.h>
  2 | #include <boost/shared_ptr.hpp>
  3 | 
  4 | using namespace Rcpp;
  5 | 
  6 | // Wrapper for numeric vector that makes it easy figure to out which 
  7 | // bin each observation belongs to.
  8 | class BinnedVector {
  9 |     // This should probably be a const NumericVector&, but that doesn't work
 10 |     // with modules currently
 11 |     NumericVector x_;
 12 |     String name_;
 13 |     double width_;
 14 |     double origin_;
 15 |   public:
 16 |     BinnedVector(NumericVector x, String name, double width, double origin = 0)
 17 |        : x_(x), name_(name), width_(width), origin_(origin) {
 18 |     }
 19 | 
 20 |     int bin_i(int i) const {
 21 |       return bin(x_[i]);
 22 |     }
 23 | 
 24 |     int bin(double x) const {
 25 |       if (ISNAN(x) || x == INFINITY || x == -INFINITY) return 0;
 26 |       if (x < origin_) return 0;
 27 | 
 28 |       return (x - origin_) / width_ + 1;
 29 |     }
 30 | 
 31 |     double unbin(int bin) const {
 32 |       if (bin == 0) return(NA_REAL);
 33 |       return (bin - 1) * width_ + origin_ + width_ / 2;
 34 |     }
 35 | 
 36 |     int nbins() const;
 37 | 
 38 |     int size() const {
 39 |       return x_.size();
 40 |     }
 41 | 
 42 |     double origin() const {
 43 |       return origin_;
 44 |     } 
 45 | 
 46 |     double width() const {
 47 |       return width_;
 48 |     }
 49 | 
 50 |     String name() const {
 51 |       return name_;
 52 |     }
 53 | 
 54 | };
 55 | 
 56 | // This class is just boilerplate. There might be rcpp magic that does the right thing here
 57 | // but I don't know it.
 58 | class BinnedVectorReference {
 59 |     boost::shared_ptr<BinnedVector> ref;
 60 | 
 61 |     const BinnedVector *get() const {
 62 |         return ref.get();
 63 |     };
 64 |     BinnedVector *get() {
 65 |         return ref.get();
 66 |     };
 67 | 
 68 | public:
 69 |     BinnedVectorReference() {};
 70 | 
 71 |     BinnedVectorReference(const BinnedVectorReference &o):
 72 |         ref(o.ref) {};
 73 | 
 74 |     explicit BinnedVectorReference(BinnedVector *ptr) {
 75 |         // Watch out, this takes ownership of the pointer!
 76 |         ref = boost::shared_ptr<BinnedVector>(ptr);
 77 |     }
 78 | 
 79 |     BinnedVectorReference(NumericVector x, String name, double width, double origin = 0) {
 80 |         BinnedVector *vec = new BinnedVector(x, name, width, origin);
 81 |         ref = boost::shared_ptr<BinnedVector>(vec);
 82 |     }
 83 | 
 84 |     int bin_i(int i) const { return get()->bin_i(i); }
 85 |     int bin(double x) const { return get()->bin(x); }
 86 |     double unbin(int bin) const { return get()->unbin(bin); }
 87 |     int nbins() const { return get()->nbins(); }
 88 |     int size() const { return get()->size();}
 89 |     double origin() const { return get()->origin();}
 90 |     double width() const { return get()->width();}
 91 |     String name() const { return get()->name();}
 92 | };
 93 | 
 94 | // A data structure to store multiple binned vectors
 95 | class BinnedVectors {
 96 |     int size_;
 97 |     std::vector<BinnedVectorReference> groups_;
 98 | 
 99 |   public:
100 |     std::vector<int> bins_;
101 |     BinnedVectors () : groups_(0), bins_(0) {
102 |     }
103 | 
104 |     BinnedVectors (List gs) : groups_(0), bins_(0) {
105 |       int n = gs.size();
106 |       for (int i = 0; i < n; ++i) {
107 |         add_vector(as<BinnedVectorReference>(gs[i]));
108 |       }
109 |     }
110 | 
111 |     void add_vector(BinnedVectorReference g) {
112 |       if (groups_.empty()) {
113 |         bins_.push_back(1);
114 |         size_ = g.size();
115 |       } else {
116 |         if (g.size() != size_) stop("Inconsistent sizes");
117 |         bins_.push_back(bins_.back() * g.nbins());
118 |       }
119 |       groups_.push_back(g);
120 | 
121 |     }
122 | 
123 |     int bin_i(int i) const;
124 |     int bin(std::vector<double> x) const;
125 |     std::vector<double> unbin(int bin) const;
126 |     
127 |     int nbins() const {
128 |       return bins_.back() * groups_.front().nbins();
129 |     }
130 | 
131 |     int ngroups() const {
132 |       return bins_.size();
133 |     }
134 | 
135 |     int size() const {
136 |       return size_;
137 |     }
138 | 
139 |     String name(int j) const {
140 |       return groups_[j].name();
141 |     }
142 | 
143 | };
144 | 
145 | 
146 | RCPP_EXPOSED_AS(BinnedVectorReference)
147 | RCPP_EXPOSED_WRAP(BinnedVectorReference)
148 | RCPP_EXPOSED_AS(BinnedVectors)
149 | RCPP_EXPOSED_WRAP(BinnedVectors)
150 | 


--------------------------------------------------------------------------------
/src/smooth-nd.cpp:
--------------------------------------------------------------------------------
  1 | #include <algorithm>
  2 | #include <Rcpp.h>
  3 | #include "group.h"
  4 | #include "Summary2d.h"
  5 | #include <boost/shared_ptr.hpp>
  6 | using namespace Rcpp;
  7 | 
  8 | boost::shared_ptr<Summary2d> createSummary(std::string type) {
  9 |   if (type == "mean") {
 10 |     return boost::shared_ptr<Summary2d>(new Summary2dMean());
 11 |   } else if (type == "regression") {
 12 |     return boost::shared_ptr<Summary2d>(new Summary2dRegression());
 13 |   } else if (type == "robust_regression") {
 14 |     return boost::shared_ptr<Summary2d>(new Summary2dRobustRegression());
 15 |   } else {
 16 |     stop("Unknown type");
 17 |     // Quiet warning
 18 |     return boost::shared_ptr<Summary2d>(new Summary2dMean());
 19 |   }
 20 | }
 21 | 
 22 | double tricube(double x) {
 23 |   if (NumericVector::is_na(x)) return 0;
 24 |   x = fabs(x);
 25 |   if (x > 1) return 0;
 26 | 
 27 |   double y = 1 - x * x * x;
 28 |   return y * y * y;
 29 | }
 30 | 
 31 | bool both_na(double x, double y) {
 32 |   return (NumericVector::is_na(x) && NumericVector::is_na(y));
 33 | }
 34 | 
 35 | // [[Rcpp::export]]
 36 | NumericVector smooth_nd_1(const NumericMatrix& grid_in, 
 37 |                           const NumericVector& z_in, 
 38 |                           const NumericVector& w_in_,
 39 |                           const NumericMatrix& grid_out, 
 40 |                           const int var, const double h,
 41 |                           const std::string type = "mean") {
 42 | 
 43 |   if (var < 0) stop("var < 0");
 44 |   if (var >= grid_in.ncol()) stop("var too large");
 45 |   if (h <= 0) stop("h <= 0");
 46 |   if (grid_in.ncol() != grid_out.ncol()) stop("Incompatible grid sizes");
 47 | 
 48 |   int n_in = grid_in.nrow(), n_out = grid_out.nrow(), d = grid_in.ncol();
 49 |   NumericVector w_in = (w_in_.size() > 0) ? w_in_ : 
 50 |     rep(NumericVector::create(1), n_in);
 51 |   NumericVector z_out(n_out), w_out(n_out);
 52 | 
 53 |   // Will be much more efficient to slice up by input dimension:
 54 |   // and most efficient way of doing that will be to bin with / bw
 55 |   // My data structure: sparse grids
 56 |   // 
 57 |   // And once we're smoothing in one direction, with guaranteed e2venly spaced
 58 |   // grid can avoid many kernel evaluations and can also compute more
 59 |   // efficient running sum
 60 | 
 61 |   for(int j = 0; j < n_out; ++j) {
 62 |     boost::shared_ptr<Summary2d> summary = createSummary(type);
 63 |     for (int i = 0; i < n_in; ++i) {
 64 |       // Check that all variables (apart from var) are equal
 65 |       bool equiv = true;
 66 |       for (int k = 0; k < d; ++k) {
 67 |         if (k == var) continue;
 68 | 
 69 |         double in = grid_in(i, k), out = grid_out(j, k);
 70 |         if (in != out && !both_na(in, out)) {
 71 |           equiv = false;
 72 |           break;
 73 |         }
 74 |       };
 75 |       if (!equiv) continue;
 76 | 
 77 |       double in = grid_in(i, var), out = grid_out(j, var);
 78 |       double dist = both_na(in, out) ? 0 : in - out;
 79 |       double w = tricube(dist / h) * w_in[i];
 80 |       if (w == 0) continue;
 81 | 
 82 |       summary->push(dist, z_in[i], w);
 83 |     }
 84 |     z_out[j] = summary->compute();
 85 |   }
 86 | 
 87 |   return z_out;
 88 | }
 89 | 
 90 | // [[Rcpp::export]]
 91 | NumericVector smooth_nd(const NumericMatrix& grid_in, 
 92 |                         const NumericVector& z_in, 
 93 |                         const NumericVector& w_in_,
 94 |                         const NumericMatrix& grid_out, 
 95 |                         const NumericVector h) {
 96 | 
 97 |   if (grid_in.nrow() != z_in.size()) stop("Incompatible input lengths");
 98 |   if (grid_in.ncol() != grid_out.ncol()) stop("Incompatible grid sizes");
 99 |   if (h.size() != grid_in.ncol()) stop("Incorrect h length");
100 | 
101 |   int n_in = grid_in.nrow(), n_out = grid_out.nrow(), d = grid_in.ncol();
102 |   NumericVector w_in = (w_in_.size() > 0) ? w_in_ : 
103 |     rep(NumericVector::create(1), n_in);
104 |   NumericVector z_out(n_out), w_out(n_out);
105 | 
106 |   for (int i = 0; i < n_in; ++i) {
107 |     for(int j = 0; j < n_out; ++j) {
108 |       double w = 1;
109 |       for (int k = 0; k < d; ++k) {
110 |         double dist = (grid_in(i, k) - grid_out(j, k)) / h[k];
111 |         w *= tricube(dist);
112 |       }
113 |       w *= w_in[i];
114 | 
115 |       w_out[j] += w;
116 |       z_out[j] += z_in[i] * w;
117 |     }
118 |   }
119 | 
120 |   for(int j = 0; j < n_out; ++j) {
121 |     z_out[j] /= w_out[j];
122 |   }
123 | 
124 |   return z_out;
125 | }
126 | 


--------------------------------------------------------------------------------
/R/autoplot.r:
--------------------------------------------------------------------------------
  1 | #' Autoplot condensed summaries.
  2 | #'
  3 | #' @param x a condensed summary
  4 | #' @param var which summary variable to display
  5 | #' @param ... other arguments passed on to individual methods
  6 | #' @method autoplot condensed
  7 | #' @export autoplot.condensed
  8 | #' @examples
  9 | #' if (require("ggplot2")) {
 10 | #'
 11 | #' # 1d summaries -----------------------------
 12 | #' x <- rchallenge(1e4)
 13 | #' z <- x + rt(length(x), df = 2)
 14 | #' xsum <- condense(bin(x, 0.1))
 15 | #' zsum <- condense(bin(x, 0.1), z = z)
 16 | #'
 17 | #' autoplot(xsum)
 18 | #' autoplot(peel(xsum))
 19 | #'
 20 | #' autoplot(zsum)
 21 | #' autoplot(peel(zsum, keep = 1))
 22 | #' autoplot(peel(zsum))
 23 | #'
 24 | #' # 2d summaries -----------------------------
 25 | #' y <- runif(length(x))
 26 | #' xysum <- condense(bin(x, 0.1), bin(y, 0.1))
 27 | #' xyzsum <- condense(bin(x, 0.1), bin(y, 0.1), z = z)
 28 | #'
 29 | #' autoplot(xysum)
 30 | #' autoplot(peel(xysum))
 31 | #' autoplot(xyzsum)
 32 | #' autoplot(peel(xyzsum))
 33 | #' }
 34 | autoplot.condensed <- function(x, var = last(summary_vars(x)), ...) {
 35 |   stopifnot(is.condensed(x))
 36 |   stopifnot(is.character(var), length(var) == 1)
 37 |   summaries <- c(
 38 |     .count = "total",
 39 |     .sum = "total",
 40 |     .mean = "summary",
 41 |     .sd = "summary",
 42 |     .median = "summary"
 43 |   )
 44 |   if (!(var %in% names(summaries))) {
 45 |     stop("Unknown varible", call. = FALSE)
 46 |   }
 47 |   d <- gcol(x)
 48 |   if (d > 2) {
 49 |     stop("No autoplot methods available for more than two d")
 50 |   }
 51 | 
 52 |   f <- paste0("plot_", summaries[var], "_", d)
 53 |   find_fun(f)(x, var = var, ...)
 54 | }
 55 | 
 56 | 
 57 | plot_total_1 <- function(x, var = ".count", show_na = TRUE, log = "") {
 58 |   xvar <- names(x)[[1]]
 59 | 
 60 |   plot <- ggplot2::ggplot(x[-1, ], ggplot2::aes_string(x = xvar, y = var)) +
 61 |     ggplot2::geom_line(na.rm = TRUE)
 62 | 
 63 |   if (show_na) {
 64 |     plot <- plot + na_layer(x, var)
 65 |   }
 66 | 
 67 |   if (logv(log, "y")) {
 68 |     plot <- plot + ggplot2::scale_y_continuous(trans = "log1p")
 69 |   }
 70 |   if (logv(log, "x")) {
 71 |     plot <- plot + ggplot2::scale_x_log10()
 72 |   }
 73 | 
 74 |   plot
 75 | }
 76 | 
 77 | plot_total_2 <- function(x, var = ".count", show_na = TRUE, log = "") {
 78 |   x <- peel(x, keep = 1)
 79 |   xvar <- names(x)[[1]]
 80 |   yvar <- names(x)[[2]]
 81 |   miss <- is.na(x[[1]]) + 2 * is.na(x[[2]])
 82 | 
 83 |   fill_trans <- if (logv(log, "z")) "log1p" else "identity"
 84 | 
 85 |   plot <- ggplot2::ggplot(x[miss == 0, ], ggplot2::aes_string(x = xvar, y = yvar)) +
 86 |     ggplot2::geom_raster(ggplot2::aes_string(fill = var)) +
 87 |     ggplot2::scale_fill_gradient(low = "grey90", high = "black", trans = fill_trans) +
 88 |     ggplot2::expand_limits(fill = 0)
 89 | 
 90 |   if (show_na) {
 91 |   }
 92 | 
 93 |   plot <- plot + if (logv(log, "x")) ggplot2::scale_x_log10()
 94 |   plot <- plot + if (logv(log, "y")) ggplot2::scale_y_log10()
 95 | 
 96 |   plot
 97 | }
 98 | 
 99 | plot_summary_1 <- function(x, var = ".mean", show_na = TRUE,
100 |                                     show_n = x %contains% ".count", log = NULL) {
101 |   xvar <- names(x)[[1]]
102 | 
103 |   plot <- ggplot2::ggplot(x[-1, ], ggplot2::aes_string(x = xvar, y = var)) +
104 |     ggplot2::geom_line(na.rm = TRUE) +
105 |     ggplot2::scale_size_area()
106 | 
107 |   if (show_n) {
108 |     plot <- plot +
109 |       ggplot2::geom_point(ggplot2::aes_string(color = ".count"), na.rm = TRUE) +
110 |       ggplot2::scale_colour_gradient(trans = "log10")
111 |   }
112 | 
113 |   if (show_na) {
114 |     plot <- plot + na_layer(x, var)
115 |   }
116 | 
117 |   plot
118 | }
119 | 
120 | plot_summary_2 <- function(x, var = ".mean", show_na = TRUE, log = "") {
121 |   x <- peel(x, keep = 1)
122 |   xvar <- names(x)[[1]]
123 |   yvar <- names(x)[[2]]
124 | 
125 |   miss <- is.na(x[[1]]) + 2 * is.na(x[[2]])
126 | 
127 |   plot <- ggplot2::ggplot(x[miss == 0, ], ggplot2::aes_string(x = xvar, y = yvar)) +
128 |     ggplot2::geom_tile(ggplot2::aes_string(fill = var)) +
129 |     ggplot2::scale_fill_gradient2()
130 | 
131 |   if (show_na) {
132 |   }
133 | 
134 |   plot <- plot + if (logv(log, "x")) ggplot2::scale_x_log10()
135 |   plot <- plot + if (logv(log, "y")) ggplot2::scale_y_log10()
136 | 
137 |   plot
138 | }
139 | 
140 | na_layer <- function(x, var) {
141 |   val <- x[[var]][is.na(x[[1]])]
142 |   if (length(val) == 0 || is.na(val) || val == 0) return()
143 | 
144 |   xloc <- miss_poss(x[[1]])
145 |   ggplot2::annotate("text", x = xloc, y = val, colour = "red", label = "NA",
146 |     size = 3)
147 | }
148 | 
149 | logv <- function(log, var) var %in% strsplit(log, "")[[1]]
150 | 
151 | miss_poss <- function(x) {
152 |   rng <- frange(x)
153 |   rng[1] - (rng[2] - rng[1]) * 0.05
154 | }
155 | 


--------------------------------------------------------------------------------
/R/weighted-stats.r:
--------------------------------------------------------------------------------
  1 | #' Compute a weighted variance or standard deviation of a vector.
  2 | #'
  3 | #' @details
  4 | #' Note that unlike the base R \code{\link{var}} function, these functions only
  5 | #' work with individual vectors not matrices or data frames.
  6 | #'
  7 | #' @family weighted statistics
  8 | #' @seealso \code{\link[stats]{weighted.mean}}
  9 | #' @param x numeric vector of observations
 10 | #' @param w integer vector of weights, representing the number of
 11 | #'  time each \code{x} was observed
 12 | #' @param na.rm if \code{TRUE}, missing values in both \code{w} and \code{x}
 13 | #'   will be removed prior computation. Otherwise if there are missing values
 14 | #'   the result will always be missing.
 15 | #' @export
 16 | #' @examples
 17 | #' x <- c(1:5)
 18 | #' w <- rpois(5, 5) + 1
 19 | #' y <- x[rep(seq_along(x), w)]
 20 | #' weighted.var(x, w)
 21 | #' var(y)
 22 | #'
 23 | #' stopifnot(all.equal(weighted.var(x, w), var(y)))
 24 | weighted.var <- function(x, w = NULL, na.rm = FALSE) {
 25 |   if (na.rm) {
 26 |     na <- is.na(x) | is.na(w)
 27 |     x <- x[!na]
 28 |     w <- w[!na]
 29 |   }
 30 | 
 31 |   sum(w * (x - weighted.mean(x, w)) ^ 2) / (sum(w) - 1)
 32 | }
 33 | 
 34 | #' @export
 35 | #' @rdname weighted.var
 36 | weighted.sd <- function(x, w, na.rm = TRUE) sqrt(weighted.var(x, w, na.rm = TRUE))
 37 | 
 38 | #' A weighted ecdf function.
 39 | #'
 40 | #' An extension of the base \code{\link[stats]{ecdf}} function which works
 41 | #' with weighted data.
 42 | #'
 43 | #' @section S3 methods:
 44 | #' The \code{ecdf} class has methods for \code{\link{plot}},
 45 | #' \code{\link{lines}}, \code{\link{summary}} and \code{\link{quantile}}.
 46 | #' \code{\link{quantile}} does not currently correctly compute values for
 47 | #' weighted ecdfs.
 48 | #'
 49 | #' @inheritParams weighted.var
 50 | #' @family weighted statistics
 51 | #' @seealso \code{\link[stats]{weighted.mean}}
 52 | #' @export
 53 | #' @examples
 54 | #' x <- runif(200)
 55 | #' w <- rpois(200, 5) + 1
 56 | #'
 57 | #' e <- weighted.ecdf(x, w)
 58 | #' plot(e)
 59 | #' summary(e)
 60 | #'
 61 | #' y <- x[rep(seq_along(x), w)]
 62 | #' plot(ecdf(y))
 63 | weighted.ecdf <- function(x, w) {
 64 |   stopifnot(length(x) == length(w))
 65 |   stopifnot(anyDuplicated(x) == 0)
 66 | 
 67 |   ord <- order(x)
 68 |   x <- x[ord]
 69 |   w <- w[ord]
 70 | 
 71 |   n <- sum(w)
 72 |   wts <- cumsum(w / n)
 73 | 
 74 |   f <- approxfun(x, wts, method = "constant", yleft = 0, yright = 1, f = 0)
 75 |   class(f) <- c("wecdf", "ecdf", "stepfun", class(f))
 76 |   attr(f, "call") <- sys.call()
 77 |   environment(f)$nobs <- n
 78 |   f
 79 | }
 80 | 
 81 | #' Compute quantiles of weighted data.
 82 | #'
 83 | #' @details
 84 | #' Currently only implements the type 7 algorithm, as described in
 85 | #' \code{\link{quantile}}. Based on \code{\link{quantile}} written by R-core.
 86 | #'
 87 | #' @inheritParams weighted.var
 88 | #' @param probs numeric vector of probabilities between 0 and 1
 89 | #' @param na.rm If \code{TRUE} will automatically remove missing values
 90 | #'   in \code{x} or \code{w}.
 91 | #' @family weighted statistics
 92 | #' @export
 93 | #' @examples
 94 | #' x <- runif(200)
 95 | #' w <- rpois(200, 5) + 1
 96 | #' weighted.quantile(x, w)
 97 | weighted.quantile <- function (x, w, probs = seq(0, 1, 0.25), na.rm = FALSE) {
 98 |   stopifnot(length(x) == length(w))
 99 |   na <- is.na(x) | is.na(w)
100 |   if (any(na)) {
101 |     if (na.rm) {
102 |       x <- x[!na]
103 |       w <- w[!na]
104 |     } else {
105 |       stop("Missing values not allowed when na.rm is FALSE", call. = FALSE)
106 |     }
107 |   }
108 | 
109 |   # Ensure x and w in ascending order of x
110 |   ord <- order(x)
111 |   x <- x[ord]
112 |   w <- w[ord]
113 | 
114 |   # Find closest x just below and above index
115 |   n <- sum(w)
116 |   index <- 1 + (n - 1) * probs
117 |   j <- floor(index)
118 | 
119 |   wts <- cumsum(w)
120 |   lo <- x[lowerBound(j, wts)]     # X_j
121 |   hi <- x[lowerBound(j + 1, wts)]
122 | 
123 |   g <- index - j
124 |   ifelse(lo == hi, lo, (1 - g) * lo + g * hi)
125 | }
126 | # Q[i](p) = (1 - g) x[j] + g x[j+1]
127 | # j = floor(np + m)
128 | # g = np + m - j
129 | #
130 | # For type 7:
131 | #   m = 1 - p =>
132 | #   j = floor(1 + (n - 1) * p)
133 | #   g = (np + 1 - p) - floor(1 + (n - 1) * p)
134 | 
135 | #' Compute the median of weighted data.
136 | #'
137 | #' @details This is a simple wrapper around \code{\link{weighted.quantile}}
138 | #' @inheritParams weighted.quantile
139 | #' @export
140 | #' @examples
141 | #' x <- runif(200)
142 | #' w <- rpois(200, 5) + 1
143 | #'
144 | #' median(x)
145 | #' weighted.median(x, w)
146 | weighted.median <- function(x, w, na.rm = FALSE) {
147 |   weighted.quantile(x, w, probs = 0.5, na.rm = na.rm)
148 | }
149 | 
150 | #' Compute the interquartile range of weighted data.
151 | #'
152 | #' @details This is a simple wrapper around \code{\link{weighted.quantile}}
153 | #' @inheritParams weighted.quantile
154 | #' @export
155 | #' @examples
156 | #' x <- sort(runif(200))
157 | #' w <- rpois(200, seq(1, 10, length = 200)) + 1
158 | #'
159 | #' IQR(x)
160 | #' weighted.IQR(x, w)
161 | weighted.IQR <- function(x, w, na.rm = FALSE) {
162 |   diff(weighted.quantile(x, w, probs = c(0.25, 0.75), na.rm = na.rm))
163 | }
164 | 


--------------------------------------------------------------------------------
/src/condense.cpp:
--------------------------------------------------------------------------------
  1 | #include <Rcpp.h>
  2 | #include <bigvis.h>
  3 | #include "group.h"
  4 | #include "summary.h"
  5 | 
  6 | template<typename Stat>
  7 | List condense(const BinnedVectors& group, const NumericVector& z, 
  8 |                         const NumericVector& weight, const Stat& stat) {
  9 |   int n_obs = group.size();
 10 |   int n_bins = group.nbins();
 11 | 
 12 |   const NumericVector& weight_ = (weight.size() > 0) ? weight : 
 13 |     rep(NumericVector::create(1), n_obs);
 14 |   const NumericVector& z_ = (z.size() > 0) ? z : 
 15 |     rep(NumericVector::create(1), n_obs);
 16 | 
 17 |   // Push values into stats
 18 |   std::vector<Stat> stats(n_bins, stat);
 19 |   for(int i = 0; i < n_obs; ++i) {
 20 |     int bin = group.bin_i(i);
 21 |     // Rcout << "i: " << i << " bin: " << bin << "\n"; 
 22 |     stats.at(bin).push(z_[i], weight_[i]);      
 23 |   }
 24 | 
 25 |   // Compute values from stats and determine bins
 26 |   int n_stats = stat.size();
 27 |   int n_groups = group.ngroups();
 28 |   NumericMatrix out(n_bins, n_stats), bin(n_bins, n_groups);
 29 | 
 30 |   for (int i = 0; i < n_bins; ++i) {
 31 |     for (int j = 0; j < n_stats; ++j) {
 32 |       out(i, j) = stats[i].compute(j);
 33 |     }
 34 | 
 35 |     std::vector<double> bins = group.unbin(i);
 36 |     for (int j = 0; j < n_groups; ++j) {
 37 |       bin(i, j) = bins[j];
 38 |     }
 39 |   }
 40 | 
 41 |   // Name 
 42 |   CharacterVector out_cols(n_stats), bin_cols(n_groups);
 43 |   for (int j = 0; j < n_stats; ++j) {
 44 |     out_cols[j] = stat.name(j);
 45 |   }
 46 |   for (int j = 0; j < n_groups; ++j) {
 47 |     bin_cols[j] = group.name(j);
 48 |   }
 49 |   out.attr("dimnames") = List::create(CharacterVector::create(), out_cols);
 50 |   bin.attr("dimnames") = List::create(CharacterVector::create(), bin_cols);
 51 | 
 52 |   return List::create(bin, out);
 53 | }
 54 | 
 55 | template<typename Stat>
 56 | List sparse_condense(const BinnedVectors& group, const NumericVector& z, 
 57 |                         const NumericVector& weight, const Stat& stat) {
 58 |   int n_obs = group.size();
 59 | 
 60 |   const NumericVector& weight_ = (weight.size() > 0) ? weight : 
 61 |     rep(NumericVector::create(1), n_obs);
 62 |   const NumericVector& z_ = (z.size() > 0) ? z : 
 63 |     rep(NumericVector::create(1), n_obs);
 64 | 
 65 |   // Push values into stats
 66 |   typename std::map<int, Stat> stats;
 67 |   for(int i = 0; i < n_obs; ++i) {
 68 |     int bin = group.bin_i(i);
 69 |     
 70 |     typename std::map<int, Stat>::iterator loc = stats.find(bin);
 71 |     if (loc == stats.end()) {
 72 |       Stat new_stat(stat);
 73 |       new_stat.push(z_[i], weight_[i]);
 74 |       stats.insert(std::pair<int, Stat>(bin, new_stat));
 75 |     } else {
 76 |       (loc->second).push(z_[i], weight_[i]);
 77 |     }
 78 |   }
 79 | 
 80 |   // Compute values from stats and determine bins
 81 |   int n_bins = stats.size();
 82 |   int n_stats = stat.size();
 83 |   int n_groups = group.ngroups();
 84 |   NumericMatrix out(n_bins, n_stats), bin(n_bins, n_groups);
 85 | 
 86 |   typename std::map<int, Stat>::iterator stats_it = stats.begin(),
 87 |     stats_end = stats.end();
 88 | 
 89 | 
 90 |   for (int i = 0; stats_it != stats_end; ++stats_it, ++i) {
 91 |     for (int j = 0; j < n_stats; ++j) {
 92 |       out(i, j) = (stats_it->second).compute(j);
 93 |     }
 94 | 
 95 |     std::vector<double> bins = group.unbin(stats_it->first);
 96 |     for (int j = 0; j < n_groups; ++j) {
 97 |       bin(i, j) = bins[j];
 98 |     }
 99 |   }
100 | 
101 |   // Name 
102 |   CharacterVector out_cols(n_stats), bin_cols(n_groups);
103 |   for (int j = 0; j < n_stats; ++j) {
104 |     out_cols[j] = stat.name(j);
105 |   }
106 |   for (int j = 0; j < n_groups; ++j) {
107 |     bin_cols[j] = group.name(j);
108 |   }
109 |   out.attr("dimnames") = List::create(CharacterVector::create(), out_cols);
110 |   bin.attr("dimnames") = List::create(CharacterVector::create(), bin_cols);
111 | 
112 |   return List::create(bin, out);
113 | }
114 | 
115 | // -----------------------------------------------------------------------------
116 | // Autogenerated by condense-gen.r
117 | 
118 | // [[Rcpp::export]]
119 | List condense_count(const List& x, const NumericVector& z,
120 |                        const NumericVector& weight, bool drop = false) {
121 |   if (drop) {
122 |     return sparse_condense(BinnedVectors(x), z, weight, SummarySum(0));
123 |   } else {
124 |     return condense(BinnedVectors(x), z, weight, SummarySum(0));
125 |   }
126 | }
127 | 
128 | // [[Rcpp::export]]
129 | List condense_sum(const List& x, const NumericVector& z,
130 |                        const NumericVector& weight, bool drop = false) {
131 |   if (drop) {
132 |     return sparse_condense(BinnedVectors(x), z, weight, SummarySum(1));
133 |   } else {
134 |     return condense(BinnedVectors(x), z, weight, SummarySum(1));
135 |   }
136 | }
137 | 
138 | // [[Rcpp::export]]
139 | List condense_mean(const List& x, const NumericVector& z,
140 |                        const NumericVector& weight, bool drop = false) {
141 |   if (drop) {
142 |     return sparse_condense(BinnedVectors(x), z, weight, SummaryMoments(1));
143 |   } else {
144 |     return condense(BinnedVectors(x), z, weight, SummaryMoments(1));
145 |   }
146 | }
147 | 
148 | // [[Rcpp::export]]
149 | List condense_sd(const List& x, const NumericVector& z,
150 |                        const NumericVector& weight, bool drop = false) {
151 |   if (drop) {
152 |     return sparse_condense(BinnedVectors(x), z, weight, SummaryMoments(2));
153 |   } else {
154 |     return condense(BinnedVectors(x), z, weight, SummaryMoments(2));
155 |   }
156 | }
157 | 
158 | // [[Rcpp::export]]
159 | List condense_median(const List& x, const NumericVector& z,
160 |                        const NumericVector& weight, bool drop = false) {
161 |   if (drop) {
162 |     return sparse_condense(BinnedVectors(x), z, weight, SummaryMedian());
163 |   } else {
164 |     return condense(BinnedVectors(x), z, weight, SummaryMedian());
165 |   }
166 | }
167 | 


--------------------------------------------------------------------------------
/bench/bin-structure.cpp:
--------------------------------------------------------------------------------
  1 | // How does the data structure implementing the bin affect performance.
  2 | 
  3 | #include <Rcpp.h>
  4 | using namespace Rcpp;
  5 | 
  6 | class Grouper {
  7 |     const Fast<NumericVector> x_;
  8 |     double width_;
  9 |     double origin_;
 10 |   public:
 11 |     Grouper (const NumericVector& x, double width, double origin = 0)
 12 |        : x_(x), width_(width), origin_(origin) {
 13 |     }
 14 | 
 15 |     int bin(int i) const {
 16 |       if (ISNAN(x_[i])) return 0;
 17 |       
 18 |       return (x_[i] - origin_) / width_ + 1;
 19 |     }
 20 | 
 21 |     int size() const {
 22 |       return x_.size();
 23 |     }
 24 | };
 25 | 
 26 | // [[Rcpp::export]]
 27 | std::vector<int> count_vector(const NumericVector& x, double width, double origin = 0) {
 28 |   Grouper grouper = Grouper(x, width, origin);
 29 |   std::vector<int> count;
 30 | 
 31 |   int n = grouper.size();
 32 |   for(int i = 0; i < n; ++i) {
 33 |     int bin = grouper.bin(i);
 34 |     if (bin >= count.size()) {
 35 |       count.resize(bin + 1);
 36 |     }
 37 | 
 38 |     ++count[bin];
 39 |   }
 40 | 
 41 |   return count;
 42 | }
 43 | 
 44 | // [[Rcpp::export]]
 45 | List count_map(const NumericVector& x, double width, double origin = 0) {
 46 |   Grouper grouper = Grouper(x, width, origin);
 47 |   std::map<int, int> count;
 48 | 
 49 |   int n = grouper.size();
 50 |   for(int i = 0; i < n; ++i) {
 51 |     int bin = grouper.bin(i);
 52 |     ++count[bin];
 53 |   }
 54 | 
 55 |   IntegerVector out_x(count.size()), out_y(count.size());
 56 |   std::map<int, int>::const_iterator count_it = count.begin(), 
 57 |     count_end = count.begin();
 58 |   for (int i = 0; count_it != count_end; ++count_it, ++i) {
 59 |     out_x[i] = count_it->first;
 60 |     out_y[i] = count_it->second;
 61 |   }
 62 |   return List::create(_["x"] = out_x, _["count"] = out_y);
 63 | }
 64 | 
 65 | // [[Rcpp::export]]
 66 | List count_umap(const NumericVector& x, double width, double origin = 0) {
 67 |   Grouper grouper = Grouper(x, width, origin);
 68 |   std::tr1::unordered_map<int, int> count;
 69 | 
 70 |   int n = grouper.size();
 71 |   for(int i = 0; i < n; ++i) {
 72 |     int bin = grouper.bin(i);
 73 | 
 74 |     ++count[bin];
 75 |   }
 76 | 
 77 |   IntegerVector out_x(count.size()), out_y(count.size());
 78 |   std::tr1::unordered_map<int, int>::iterator count_it = count.begin(), 
 79 |     count_end = count.end();
 80 |   for (int i = 0; count_it != count_end; ++count_it, ++i) {
 81 |     out_x[i] = count_it->first;
 82 |     out_y[i] = count_it->second;
 83 |   }
 84 |   return List::create(_["x"] = out_x, _["count"] = out_y);
 85 | }
 86 | 
 87 | 
 88 | template <class T>
 89 | inline void hash_combine(std::size_t & seed, const T & v) {
 90 |   std::tr1::hash<T> hasher;
 91 |   seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
 92 | }
 93 | 
 94 | namespace std {
 95 |   namespace tr1 {  
 96 |     template<typename S, typename T> struct hash<pair<S, T> > {
 97 |       inline size_t operator()(const pair<S, T> & v) const {
 98 |         size_t seed = 0;
 99 |         ::hash_combine(seed, v.first);
100 |         ::hash_combine(seed, v.second);
101 |         return seed;
102 |       }
103 |     };
104 |   }
105 | }
106 | 
107 | // [[Rcpp::export]]
108 | List count_umap2(const NumericVector& x, double width, double origin = 0) {
109 |   Grouper grouper = Grouper(x, width, origin);
110 |   std::tr1::unordered_map<std::pair<int, int>, int> count;
111 | 
112 |   int n = grouper.size();
113 |   for(int i = 0; i < n; ++i) {
114 |     int bin = grouper.bin(i);
115 | 
116 |     ++count[std::make_pair(bin, bin)];
117 |   }
118 | 
119 |   IntegerVector out_x(count.size()), out_y(count.size());
120 |   std::tr1::unordered_map<std::pair<int, int>, int>::iterator count_it = count.begin(), 
121 |     count_end = count.end();
122 |   for (int i = 0; count_it != count_end; ++count_it, ++i) {
123 |     out_x[i] = count_it->first.first;
124 |     out_y[i] = count_it->second;
125 |   }
126 |   return List::create(_["x"] = out_x, _["count"] = out_y);
127 | }
128 | 
129 | // [[Rcpp::export]]
130 | List count_umap2_man(const NumericVector& x, double width, double origin = 0) {
131 |   Grouper grouper = Grouper(x, width, origin);
132 |   std::tr1::unordered_map<int, int> count;
133 | 
134 |   int n = grouper.size();
135 |   for(int i = 0; i < n; ++i) {
136 |     int bin = grouper.bin(i);
137 |     bin = bin * 100 + bin;
138 |     ++count[bin];
139 |   }
140 | 
141 |   IntegerVector out_x(count.size()), out_y(count.size());
142 |   std::tr1::unordered_map<int, int>::iterator count_it = count.begin(), 
143 |     count_end = count.end();
144 |   for (int i = 0; count_it != count_end; ++count_it, ++i) {
145 |     out_x[i] = count_it->first;
146 |     out_y[i] = count_it->second;
147 |   }
148 |   return List::create(_["x"] = out_x, _["count"] = out_y);
149 | }
150 | 
151 | 
152 | /*** R 
153 |   options(digits = 3)
154 |   library(microbenchmark)
155 |   x <- runif(1e5)
156 | 
157 |   # As expected, for small contiguous inputs, vector is fastest, followed by
158 |   # unordered maps (about half as fast), with maps in a distant last place.
159 |   microbenchmark(
160 |     count_vector(x, 1 / 1000),
161 |     count_map(x, 1 / 1000),
162 |     count_umap(x, 1 / 1000)
163 |   )
164 | 
165 |   y <- c(x, x)
166 |   y1 <- c(x, x + 10)
167 |   y2 <- c(x, x + 100)
168 |   y3 <- c(x, x + 1000)
169 |   y4 <- c(x, x + 1000)
170 | 
171 |   # While using std::vector is somewhat faster, the asymptotic behaviour is
172 |   # much worse - count_umap is basically constant, regardless of the number
173 |   # of bins
174 |   microbenchmark(
175 |     count_vector(y, 1 / 1000),
176 |     count_vector(y1, 1 / 1000),
177 |     count_vector(y2, 1 / 1000),
178 |     count_vector(y3, 1 / 1000),
179 |     count_vector(y4, 1 / 1000),
180 |     count_umap(y, 1 / 1000),
181 |     count_umap(y1, 1 / 1000),
182 |     count_umap(y2, 1 / 1000),
183 |     count_umap(y3, 1 / 1000),
184 |     count_umap(y4, 1 / 1000),
185 |     times = 10
186 |   )
187 | 
188 |   # Using umap with a pair is about twice as slow as with an int: this probably
189 |   # implies that I should do the hashing myself.
190 |   microbenchmark(
191 |     count_umap(x, 1 / 1000),
192 |     count_umap2(x, 1 / 1000),
193 |     count_umap2_man(x, 1 / 1000)
194 |   )
195 | 
196 | */


--------------------------------------------------------------------------------
/bench/bin.cpp:
--------------------------------------------------------------------------------
  1 | #include <Rcpp.h>
  2 | #include <iostream>
  3 | #include <algorithm>
  4 | using namespace Rcpp;
  5 | 
  6 | //' @param breaks must be ordered and span the complete range of x. 
  7 | // [[Rcpp::export]]
  8 | IntegerVector bin(NumericVector x, NumericVector breaks) {
  9 |   // Put missing values in the last position
 10 |   int n = breaks.size();
 11 |   IntegerVector out(n + 1);
 12 | 
 13 |   for(NumericVector::iterator it = x.begin(); it != x.end(); it++) {
 14 |     double val = *it;
 15 |     if (ISNAN(val)) {
 16 |       out[n]++;
 17 |     } else {
 18 |       NumericVector::iterator bin_it = 
 19 |         std::upper_bound(breaks.begin(), breaks.end(), val);
 20 | 
 21 |       int bin = std::distance(breaks.begin(), bin_it);
 22 |       out[bin]++;
 23 |     }
 24 |   }
 25 | 
 26 |   return out;
 27 | }
 28 | 
 29 | // [[Rcpp::export]]
 30 | IntegerVector bin2(NumericVector x, NumericVector breaks) {
 31 |   // Put missing values in the last position
 32 |   int n = breaks.size(), bin;
 33 |   IntegerVector out(n + 1);
 34 | 
 35 |   NumericVector::iterator x_it = x.begin(), x_end, bin_it,
 36 |     breaks_it = breaks.begin(), breaks_end = breaks.end();
 37 | 
 38 |   for(; x_it != x.end(); ++x_it) {
 39 |     double val = *x_it;
 40 |     if (ISNAN(val)) {
 41 |       ++out[n];
 42 |     } else {
 43 |       bin_it = std::upper_bound(breaks_it, breaks_end, val);
 44 |       bin = std::distance(breaks_it, bin_it);
 45 |       ++out[bin];
 46 |     }
 47 |   }
 48 | 
 49 |   return out;
 50 | }
 51 | 
 52 | // [[Rcpp::export]]
 53 | std::vector<int> bin3(NumericVector x, double width, double origin = 0) {
 54 |   int bin, nmissing = 0;
 55 |   std::vector<int> out;
 56 | 
 57 |   NumericVector::iterator x_it = x.begin(), x_end;
 58 |   for(; x_it != x.end(); ++x_it) {
 59 |     double val = *x_it;
 60 |     if (ISNAN(val)) {
 61 |       ++nmissing;
 62 |     } else {
 63 |       bin = (val - origin) / width;
 64 |       if (bin < 0) continue;
 65 |     
 66 |       // Make sure there's enough space
 67 |       if (bin >= out.size()) {
 68 |         out.resize(bin + 1);
 69 |       }
 70 |       ++out[bin];
 71 |     }
 72 |   }
 73 | 
 74 |   // Put missing values in the last position
 75 |   out.push_back(nmissing);
 76 |   return out;
 77 | }
 78 | 
 79 | // Create class to encapsulate binning operations ------------------------------
 80 | 
 81 | 
 82 | class BinFixed {
 83 |     double width_;
 84 |     double origin_;
 85 |   public:
 86 |     BinFixed (double width, double origin = 0) {
 87 |       width_ = width;
 88 |       origin_ = origin;
 89 |     }
 90 |     int inline operator() (double val) const { 
 91 |       return (val - origin_) / width_;
 92 |     }
 93 | };
 94 | class BinBreaks {
 95 |     NumericVector breaks_;
 96 |     NumericVector::iterator breaks_it_, breaks_end_;
 97 | 
 98 |   public:
 99 |     BinBreaks (NumericVector& breaks) {
100 |       breaks_ = breaks;
101 |       breaks_it_ = breaks.begin();
102 |       breaks_end_ = breaks.end();
103 |     }
104 |     int inline operator() (double val) const { 
105 |       NumericVector::iterator 
106 |         bin_it = std::upper_bound(breaks_it_, breaks_end_, val);
107 | 
108 |       return std::distance(breaks_it_, bin_it);
109 |     }
110 | };
111 | 
112 | 
113 | template<typename Binner>
114 | std::vector<int> bin_bin(NumericVector x, Binner binner) {
115 |   int bin, nmissing = 0;
116 |   std::vector<int> out;
117 | 
118 |   NumericVector::iterator x_it = x.begin(), x_end;
119 |   for(; x_it != x.end(); ++x_it) {
120 |     double val = *x_it;
121 |     if (ISNAN(val)) {
122 |       ++nmissing;
123 |     } else {
124 |       bin = binner(val);
125 |       if (bin < 0) continue;
126 |     
127 |       // Make sure there's enough space
128 |       if (bin >= out.size()) {
129 |         out.resize(bin + 1);
130 |       }
131 |       ++out[bin];
132 |     }
133 |   }
134 | 
135 |   // Put missing values in the last position
136 |   out.push_back(nmissing);
137 |   return out;
138 | }
139 | 
140 | // [[Rcpp::export]]
141 | std::vector<int> bin_bin_fixed(NumericVector x, double width, double origin = 0) {
142 |   return bin_bin(x, BinFixed(width, origin));
143 | }
144 | 
145 | // [[Rcpp::export]]
146 | std::vector<int> bin_bin_breaks(NumericVector x, NumericVector breaks) {
147 |   return bin_bin(x, BinBreaks(breaks));
148 | }
149 | 
150 | // Try using a Fast<NumericVector> ------------------------------
151 | // Considerable speed improvement for simple binning function
152 | template<typename Binner>
153 | std::vector<int> fbin_bin(NumericVector x, Binner binner) {
154 |   int bin, nmissing = 0;
155 |   std::vector<int> out;
156 | 
157 |   Fast<NumericVector> fx(x);
158 |   int n = x.size();
159 | 
160 |   for(int i = 0; i < n; ++i) {
161 |     double val = fx[i];
162 |     if (ISNAN(val)) {
163 |       ++nmissing;
164 |     } else {
165 |       bin = binner(val);
166 |       if (bin < 0) continue;
167 |     
168 |       // Make sure there's enough space
169 |       if (bin >= out.size()) {
170 |         out.resize(bin + 1);
171 |       }
172 |       ++out[bin];
173 |     }
174 |   }
175 | 
176 |   // Put missing values in the last position
177 |   out.push_back(nmissing);
178 |   return out;
179 | }
180 | 
181 | // [[Rcpp::export]]
182 | std::vector<int> fbin_bin_fixed(NumericVector x, double width, double origin = 0) {
183 |   return fbin_bin(x, BinFixed(width, origin));
184 | }
185 | 
186 | // [[Rcpp::export]]
187 | std::vector<int> fbin_bin_breaks(NumericVector x, NumericVector breaks) {
188 |   return fbin_bin(x, BinBreaks(breaks));
189 | }
190 | 
191 | /*** R 
192 | options(digits = 3)
193 | library(microbenchmark)
194 | x <- runif(1e5)
195 | breaks <- seq(0, 1, length = 100)
196 | 
197 | # Breaks
198 | microbenchmark(
199 |   hist(x, breaks, plot = F),
200 |   bin(x, breaks),
201 |   bin2(x, breaks),
202 |   bin_bin_breaks(x, breaks),
203 |   fbin_bin_breaks(x, breaks)
204 | )
205 | 
206 | # Fixed bins
207 | microbenchmark(
208 |   bin3(x, 1/100, 0),
209 |   bin_bin_fixed(x, 1/100, 0),
210 |   fbin_bin_fixed(x, 1/100, 0)
211 | )
212 | 
213 | x6 <- runif(1e6)
214 | x7 <- runif(1e7)
215 | x8 <- runif(1e8)
216 | 
217 | microbenchmark(
218 |   bin_bin_fixed(x6, 1/100, 0),
219 |   fbin_bin_fixed(x6, 1/100, 0),
220 |   bin_bin_fixed(x7, 1/100, 0),
221 |   fbin_bin_fixed(x7, 1/100, 0),
222 |   bin_bin_fixed(x8, 1/100, 0),
223 |   fbin_bin_fixed(x8, 1/100, 0),
224 |   times = 10)
225 | 
226 | */


--------------------------------------------------------------------------------
/bench/smooth-1d.cpp:
--------------------------------------------------------------------------------
  1 | // Explore opportunities for making smooth_1d faster
  2 | // 
  3 | // Bounding to a given range is really important, and memoisation helps offset
  4 | // the cost of making a call back to R, but the biggest win is using a pure
  5 | // C/C++ kernel function.
  6 | // 
  7 | // 
  8 | #include <Rcpp.h>
  9 | using namespace Rcpp;
 10 | 
 11 | // Base implementation
 12 | // [[Rcpp::export]]
 13 | NumericVector smooth_1d(const NumericVector& x, const NumericVector& z, 
 14 |                         const NumericVector& x_out, const Function& kernel) {
 15 | 
 16 |   int n_in = x.size(), n_out = x_out.size();
 17 |   NumericVector z_out(n_out);
 18 | 
 19 |   for (int i = 0; i < n_out; i++) {
 20 |     for (int j = 0; j < n_in; j++) {
 21 |       double dist = x[j] - x_out[i];
 22 |       double k = as<NumericVector>(kernel(dist))[0];
 23 |       z_out[i] += z[j] * k;
 24 |     }
 25 |   }
 26 | 
 27 |   return z_out;
 28 | }
 29 | 
 30 | // Memoise distance calculations
 31 | // [[Rcpp::export]]
 32 | NumericVector smooth_1d_memo(const NumericVector& x, const NumericVector& z, 
 33 |                         const NumericVector& x_out, const Function& kernel) {
 34 |   int n_in = x.size(), n_out = x_out.size();
 35 |   NumericVector z_out(n_out);
 36 | 
 37 |   std::unordered_map<double, double> k_memo;
 38 | 
 39 |   for (int i = 0; i < n_out; i++) {
 40 |     for (int j = 0; j < n_in; j++) {
 41 |       double dist = x[j] - x_out[i];
 42 | 
 43 |       std::unordered_map<double, double>::const_iterator it = k_memo.find(dist);
 44 |       double k;
 45 |       if (it == k_memo.end()) {
 46 |         k = as<NumericVector>(kernel(dist))[0];
 47 |         k_memo[dist] = k;
 48 |       } else {
 49 |         k = it->second; 
 50 |       }
 51 | 
 52 |       z_out[i] += z[j] * k;
 53 |     }
 54 |   }
 55 | 
 56 |   return z_out;
 57 | }
 58 | 
 59 | // Use range of kernel
 60 | // [[Rcpp::export]]
 61 | NumericVector smooth_1d_range(const NumericVector& x, const NumericVector& z, 
 62 |                         const NumericVector& x_out, const Function& kernel,
 63 |                         double kmin, double kmax) {
 64 | 
 65 |   int n_in = x.size(), n_out = x_out.size();
 66 |   NumericVector z_out(n_out);
 67 | 
 68 |   for (int i = 0; i < n_out; i++) {
 69 |     for (int j = 0; j < n_in; j++) {
 70 |       double dist = x[j] - x_out[i];
 71 |       if (dist < kmin || dist > kmax) continue;
 72 | 
 73 |       double k = as<NumericVector>(kernel(dist))[0];
 74 |       z_out[i] += z[j] * k;
 75 |     }
 76 |   }
 77 | 
 78 |   return z_out;
 79 | }
 80 | 
 81 | 
 82 | // Memoise and use range
 83 | // [[Rcpp::export]]
 84 | NumericVector smooth_1d_memo_range(const NumericVector& x, const NumericVector& z, 
 85 |                         const NumericVector& x_out, const Function& kernel,
 86 |                         double kmin, double kmax) {
 87 |   int n_in = x.size(), n_out = x_out.size();
 88 |   NumericVector z_out(n_out);
 89 | 
 90 |   std::unordered_map<double, double> k_memo;
 91 | 
 92 |   for (int i = 0; i < n_out; i++) {
 93 |     for (int j = 0; j < n_in; j++) {
 94 |       double dist = x[j] - x_out[i];
 95 |       if (dist < kmin || dist > kmax) continue;
 96 | 
 97 |       std::unordered_map<double, double>::const_iterator it = k_memo.find(dist);
 98 |       double k;
 99 |       if (it == k_memo.end()) {
100 |         k = as<NumericVector>(kernel(dist))[0];
101 |         k_memo[dist] = k;
102 |       } else {
103 |         k = it->second; 
104 |       }
105 | 
106 |       z_out[i] += z[j] * k;
107 |     }
108 |   }
109 | 
110 |   return z_out;
111 | }
112 | 
113 | // Memoise and use range
114 | // [[Rcpp::export]]
115 | NumericVector smooth_1d_memo_range_map(const NumericVector& x, const NumericVector& z, 
116 |                         const NumericVector& x_out, const Function& kernel,
117 |                         double kmin, double kmax) {
118 |   int n_in = x.size(), n_out = x_out.size();
119 |   NumericVector z_out(n_out);
120 | 
121 |   std::map<double, double> k_memo;
122 | 
123 |   for (int i = 0; i < n_out; i++) {
124 |     for (int j = 0; j < n_in; j++) {
125 |       double dist = x[j] - x_out[i];
126 |       if (dist < kmin || dist > kmax) continue;
127 | 
128 |       std::map<double, double>::const_iterator it = k_memo.find(dist);
129 |       double k;
130 |       if (it == k_memo.end()) {
131 |         k = as<NumericVector>(kernel(dist))[0];
132 |         k_memo[dist] = k;
133 |       } else {
134 |         k = it->second; 
135 |       }
136 | 
137 |       z_out[i] += z[j] * k;
138 |     }
139 |   }
140 | 
141 |   return z_out;
142 | }
143 | 
144 | // Memoise, use range & use C++ function for kernel
145 | // [[Rcpp::export]]
146 | NumericVector smooth_1d_memo_range_kcpp(const NumericVector& x, const NumericVector& z, 
147 |                         const NumericVector& x_out, double kmin, double kmax) {
148 |   int n_in = x.size(), n_out = x_out.size();
149 |   NumericVector z_out(n_out);
150 | 
151 |   std::unordered_map<double, double> k_memo;
152 | 
153 |   for (int i = 0; i < n_out; i++) {
154 |     for (int j = 0; j < n_in; j++) {
155 |       double dist = x[j] - x_out[i];
156 |       if (dist < kmin || dist > kmax) continue;
157 | 
158 |       std::unordered_map<double, double>::const_iterator it = k_memo.find(dist);
159 |       double k;
160 |       if (it == k_memo.end()) {
161 |         k = R::dnorm(dist, 0.0, 0.1, 0);
162 |         k_memo[dist] = k;
163 |       } else {
164 |         k = it->second; 
165 |       }
166 | 
167 |       z_out[i] += z[j] * k;
168 |     }
169 |   }
170 | 
171 |   return z_out;
172 | }
173 | 
174 | // Use cpp kernel function without memoisation
175 | // [[Rcpp::export]]
176 | NumericVector smooth_1d_range_kcpp(const NumericVector& x, const NumericVector& z, 
177 |                         const NumericVector& x_out, double kmin, double kmax) {
178 | 
179 |   int n_in = x.size(), n_out = x_out.size();
180 |   NumericVector z_out(n_out);
181 | 
182 |   for (int i = 0; i < n_out; i++) {
183 |     for (int j = 0; j < n_in; j++) {
184 |       double dist = x[j] - x_out[i];
185 |       if (dist < kmin || dist > kmax) continue;
186 | 
187 |       double k = R::dnorm(dist, 0.0, 0.1, 0);
188 |       z_out[i] += z[j] * k;
189 |     }
190 |   }
191 | 
192 |   return z_out;
193 | }
194 | 
195 | 
196 | /*** R
197 |   options(digits = 2)
198 |   x <- 1:10
199 |   z <- rep(c(1, 2), length = length(x))
200 |   k <- kernel("norm", sd = 0.1)
201 |   krng <- range(k)
202 |   grid <- seq(0, 11, length = 100)
203 |   
204 |   stopifnot(all.equal(
205 |     smooth_1d_memo_range(x, z, grid, k, krng[1], krng[2]),
206 |     smooth_1d_range_kcpp(x, z, grid, krng[1], krng[2])
207 |   ))
208 | 
209 |   library(microbenchmark)
210 |   microbenchmark(
211 |     base = smooth_1d(x, z, grid, k),
212 |     memo = smooth_1d_memo(x, z, grid, k),
213 |     range = smooth_1d_range(x, z, grid, k, krng[1], krng[2]),
214 |     "range + kcpp" = smooth_1d_range_kcpp(x, z, grid, krng[1], krng[2]),
215 |     "range + memo" = smooth_1d_memo_range(x, z, grid, k, krng[1], krng[2]),
216 |     "range + memo + kcpp" = smooth_1d_memo_range_kcpp(x, z, grid, krng[1], krng[2]),
217 |     "range + memo + map" = smooth_1d_memo_range_map(x, z, grid, k, krng[1], krng[2])
218 |   )
219 |   
220 |   # More realistic sample sizes
221 |   x <- 1:3e3
222 |   z <- rep(c(1, 2), length = length(x))
223 |   
224 |   grid3 <- seq(0, 11, length = 3e3)
225 |   grid4 <- seq(0, 11, length = 3e4)
226 | 
227 |   microbenchmark(
228 |     grid3_c = smooth_1d_range_kcpp(x, z, grid3, krng[1], krng[2]),
229 |     grid3_r = smooth_1d_memo_range_map(x, z, grid3, k, krng[1], krng[2]),
230 |     grid4_c = smooth_1d_range_kcpp(x, z, grid4, krng[1], krng[2]),
231 |     grid4_r = smooth_1d_memo_range_map(x, z, grid4, k, krng[1], krng[2]),
232 |     times = 10)
233 | 
234 | */


--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
  1 | // This file was generated by Rcpp::compileAttributes
  2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
  3 | 
  4 | #include "../inst/include/bigvis.h"
  5 | #include <Rcpp.h>
  6 | 
  7 | using namespace Rcpp;
  8 | 
  9 | // condense_count
 10 | List condense_count(const List& x, const NumericVector& z, const NumericVector& weight, bool drop);
 11 | RcppExport SEXP bigvis_condense_count(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) {
 12 | BEGIN_RCPP
 13 |     Rcpp::RObject __result;
 14 |     Rcpp::RNGScope __rngScope;
 15 |     Rcpp::traits::input_parameter< const List& >::type x(xSEXP);
 16 |     Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP);
 17 |     Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP);
 18 |     Rcpp::traits::input_parameter< bool >::type drop(dropSEXP);
 19 |     __result = Rcpp::wrap(condense_count(x, z, weight, drop));
 20 |     return __result;
 21 | END_RCPP
 22 | }
 23 | // condense_sum
 24 | List condense_sum(const List& x, const NumericVector& z, const NumericVector& weight, bool drop);
 25 | RcppExport SEXP bigvis_condense_sum(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) {
 26 | BEGIN_RCPP
 27 |     Rcpp::RObject __result;
 28 |     Rcpp::RNGScope __rngScope;
 29 |     Rcpp::traits::input_parameter< const List& >::type x(xSEXP);
 30 |     Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP);
 31 |     Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP);
 32 |     Rcpp::traits::input_parameter< bool >::type drop(dropSEXP);
 33 |     __result = Rcpp::wrap(condense_sum(x, z, weight, drop));
 34 |     return __result;
 35 | END_RCPP
 36 | }
 37 | // condense_mean
 38 | List condense_mean(const List& x, const NumericVector& z, const NumericVector& weight, bool drop);
 39 | RcppExport SEXP bigvis_condense_mean(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) {
 40 | BEGIN_RCPP
 41 |     Rcpp::RObject __result;
 42 |     Rcpp::RNGScope __rngScope;
 43 |     Rcpp::traits::input_parameter< const List& >::type x(xSEXP);
 44 |     Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP);
 45 |     Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP);
 46 |     Rcpp::traits::input_parameter< bool >::type drop(dropSEXP);
 47 |     __result = Rcpp::wrap(condense_mean(x, z, weight, drop));
 48 |     return __result;
 49 | END_RCPP
 50 | }
 51 | // condense_sd
 52 | List condense_sd(const List& x, const NumericVector& z, const NumericVector& weight, bool drop);
 53 | RcppExport SEXP bigvis_condense_sd(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) {
 54 | BEGIN_RCPP
 55 |     Rcpp::RObject __result;
 56 |     Rcpp::RNGScope __rngScope;
 57 |     Rcpp::traits::input_parameter< const List& >::type x(xSEXP);
 58 |     Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP);
 59 |     Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP);
 60 |     Rcpp::traits::input_parameter< bool >::type drop(dropSEXP);
 61 |     __result = Rcpp::wrap(condense_sd(x, z, weight, drop));
 62 |     return __result;
 63 | END_RCPP
 64 | }
 65 | // condense_median
 66 | List condense_median(const List& x, const NumericVector& z, const NumericVector& weight, bool drop);
 67 | RcppExport SEXP bigvis_condense_median(SEXP xSEXP, SEXP zSEXP, SEXP weightSEXP, SEXP dropSEXP) {
 68 | BEGIN_RCPP
 69 |     Rcpp::RObject __result;
 70 |     Rcpp::RNGScope __rngScope;
 71 |     Rcpp::traits::input_parameter< const List& >::type x(xSEXP);
 72 |     Rcpp::traits::input_parameter< const NumericVector& >::type z(zSEXP);
 73 |     Rcpp::traits::input_parameter< const NumericVector& >::type weight(weightSEXP);
 74 |     Rcpp::traits::input_parameter< bool >::type drop(dropSEXP);
 75 |     __result = Rcpp::wrap(condense_median(x, z, weight, drop));
 76 |     return __result;
 77 | END_RCPP
 78 | }
 79 | // double_diff_sum
 80 | std::vector<int> double_diff_sum(IntegerVector bin, IntegerVector count);
 81 | RcppExport SEXP bigvis_double_diff_sum(SEXP binSEXP, SEXP countSEXP) {
 82 | BEGIN_RCPP
 83 |     Rcpp::RObject __result;
 84 |     Rcpp::RNGScope __rngScope;
 85 |     Rcpp::traits::input_parameter< IntegerVector >::type bin(binSEXP);
 86 |     Rcpp::traits::input_parameter< IntegerVector >::type count(countSEXP);
 87 |     __result = Rcpp::wrap(double_diff_sum(bin, count));
 88 |     return __result;
 89 | END_RCPP
 90 | }
 91 | // frange
 92 | NumericVector frange(const NumericVector& x, const bool finite);
 93 | RcppExport SEXP bigvis_frange(SEXP xSEXP, SEXP finiteSEXP) {
 94 | BEGIN_RCPP
 95 |     Rcpp::RObject __result;
 96 |     Rcpp::RNGScope __rngScope;
 97 |     Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);
 98 |     Rcpp::traits::input_parameter< const bool >::type finite(finiteSEXP);
 99 |     __result = Rcpp::wrap(frange(x, finite));
100 |     return __result;
101 | END_RCPP
102 | }
103 | // group_fixed
104 | IntegerVector group_fixed(const NumericVector& x, double width, double origin);
105 | RcppExport SEXP bigvis_group_fixed(SEXP xSEXP, SEXP widthSEXP, SEXP originSEXP) {
106 | BEGIN_RCPP
107 |     Rcpp::RObject __result;
108 |     Rcpp::RNGScope __rngScope;
109 |     Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);
110 |     Rcpp::traits::input_parameter< double >::type width(widthSEXP);
111 |     Rcpp::traits::input_parameter< double >::type origin(originSEXP);
112 |     __result = Rcpp::wrap(group_fixed(x, width, origin));
113 |     return __result;
114 | END_RCPP
115 | }
116 | // group_rect
117 | IntegerVector group_rect(const NumericVector& x, const NumericVector& y, double x_width, double y_width, double x_origin, double y_origin);
118 | RcppExport SEXP bigvis_group_rect(SEXP xSEXP, SEXP ySEXP, SEXP x_widthSEXP, SEXP y_widthSEXP, SEXP x_originSEXP, SEXP y_originSEXP) {
119 | BEGIN_RCPP
120 |     Rcpp::RObject __result;
121 |     Rcpp::RNGScope __rngScope;
122 |     Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);
123 |     Rcpp::traits::input_parameter< const NumericVector& >::type y(ySEXP);
124 |     Rcpp::traits::input_parameter< double >::type x_width(x_widthSEXP);
125 |     Rcpp::traits::input_parameter< double >::type y_width(y_widthSEXP);
126 |     Rcpp::traits::input_parameter< double >::type x_origin(x_originSEXP);
127 |     Rcpp::traits::input_parameter< double >::type y_origin(y_originSEXP);
128 |     __result = Rcpp::wrap(group_rect(x, y, x_width, y_width, x_origin, y_origin));
129 |     return __result;
130 | END_RCPP
131 | }
132 | // group_hex
133 | IntegerVector group_hex(const NumericVector& x, const NumericVector& y, double x_width, double y_width, double x_origin, double y_origin, double x_max);
134 | RcppExport SEXP bigvis_group_hex(SEXP xSEXP, SEXP ySEXP, SEXP x_widthSEXP, SEXP y_widthSEXP, SEXP x_originSEXP, SEXP y_originSEXP, SEXP x_maxSEXP) {
135 | BEGIN_RCPP
136 |     Rcpp::RObject __result;
137 |     Rcpp::RNGScope __rngScope;
138 |     Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);
139 |     Rcpp::traits::input_parameter< const NumericVector& >::type y(ySEXP);
140 |     Rcpp::traits::input_parameter< double >::type x_width(x_widthSEXP);
141 |     Rcpp::traits::input_parameter< double >::type y_width(y_widthSEXP);
142 |     Rcpp::traits::input_parameter< double >::type x_origin(x_originSEXP);
143 |     Rcpp::traits::input_parameter< double >::type y_origin(y_originSEXP);
144 |     Rcpp::traits::input_parameter< double >::type x_max(x_maxSEXP);
145 |     __result = Rcpp::wrap(group_hex(x, y, x_width, y_width, x_origin, y_origin, x_max));
146 |     return __result;
147 | END_RCPP
148 | }
149 | // lowerBound
150 | IntegerVector lowerBound(const NumericVector& x, const NumericVector& breaks);
151 | RcppExport SEXP bigvis_lowerBound(SEXP xSEXP, SEXP breaksSEXP) {
152 | BEGIN_RCPP
153 |     Rcpp::RObject __result;
154 |     Rcpp::RNGScope __rngScope;
155 |     Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);
156 |     Rcpp::traits::input_parameter< const NumericVector& >::type breaks(breaksSEXP);
157 |     __result = Rcpp::wrap(lowerBound(x, breaks));
158 |     return __result;
159 | END_RCPP
160 | }
161 | // smooth_nd_1
162 | NumericVector smooth_nd_1(const NumericMatrix& grid_in, const NumericVector& z_in, const NumericVector& w_in_, const NumericMatrix& grid_out, const int var, const double h, const std::string type);
163 | RcppExport SEXP bigvis_smooth_nd_1(SEXP grid_inSEXP, SEXP z_inSEXP, SEXP w_in_SEXP, SEXP grid_outSEXP, SEXP varSEXP, SEXP hSEXP, SEXP typeSEXP) {
164 | BEGIN_RCPP
165 |     Rcpp::RObject __result;
166 |     Rcpp::RNGScope __rngScope;
167 |     Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_in(grid_inSEXP);
168 |     Rcpp::traits::input_parameter< const NumericVector& >::type z_in(z_inSEXP);
169 |     Rcpp::traits::input_parameter< const NumericVector& >::type w_in_(w_in_SEXP);
170 |     Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_out(grid_outSEXP);
171 |     Rcpp::traits::input_parameter< const int >::type var(varSEXP);
172 |     Rcpp::traits::input_parameter< const double >::type h(hSEXP);
173 |     Rcpp::traits::input_parameter< const std::string >::type type(typeSEXP);
174 |     __result = Rcpp::wrap(smooth_nd_1(grid_in, z_in, w_in_, grid_out, var, h, type));
175 |     return __result;
176 | END_RCPP
177 | }
178 | // smooth_nd
179 | NumericVector smooth_nd(const NumericMatrix& grid_in, const NumericVector& z_in, const NumericVector& w_in_, const NumericMatrix& grid_out, const NumericVector h);
180 | RcppExport SEXP bigvis_smooth_nd(SEXP grid_inSEXP, SEXP z_inSEXP, SEXP w_in_SEXP, SEXP grid_outSEXP, SEXP hSEXP) {
181 | BEGIN_RCPP
182 |     Rcpp::RObject __result;
183 |     Rcpp::RNGScope __rngScope;
184 |     Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_in(grid_inSEXP);
185 |     Rcpp::traits::input_parameter< const NumericVector& >::type z_in(z_inSEXP);
186 |     Rcpp::traits::input_parameter< const NumericVector& >::type w_in_(w_in_SEXP);
187 |     Rcpp::traits::input_parameter< const NumericMatrix& >::type grid_out(grid_outSEXP);
188 |     Rcpp::traits::input_parameter< const NumericVector >::type h(hSEXP);
189 |     __result = Rcpp::wrap(smooth_nd(grid_in, z_in, w_in_, grid_out, h));
190 |     return __result;
191 | END_RCPP
192 | }
193 | // bisquare
194 | double bisquare(double u, double b);
195 | RcppExport SEXP bigvis_bisquare(SEXP uSEXP, SEXP bSEXP) {
196 | BEGIN_RCPP
197 |     Rcpp::RObject __result;
198 |     Rcpp::RNGScope __rngScope;
199 |     Rcpp::traits::input_parameter< double >::type u(uSEXP);
200 |     Rcpp::traits::input_parameter< double >::type b(bSEXP);
201 |     __result = Rcpp::wrap(bisquare(u, b));
202 |     return __result;
203 | END_RCPP
204 | }
205 | // regress
206 | NumericVector regress(const std::vector<double>& x, const std::vector<double>& y, const std::vector<double>& w);
207 | RcppExport SEXP bigvis_regress(SEXP xSEXP, SEXP ySEXP, SEXP wSEXP) {
208 | BEGIN_RCPP
209 |     Rcpp::RObject __result;
210 |     Rcpp::RNGScope __rngScope;
211 |     Rcpp::traits::input_parameter< const std::vector<double>& >::type x(xSEXP);
212 |     Rcpp::traits::input_parameter< const std::vector<double>& >::type y(ySEXP);
213 |     Rcpp::traits::input_parameter< const std::vector<double>& >::type w(wSEXP);
214 |     __result = Rcpp::wrap(regress(x, y, w));
215 |     return __result;
216 | END_RCPP
217 | }
218 | // median
219 | double median(const std::vector<double>& x);
220 | RcppExport SEXP bigvis_median(SEXP xSEXP) {
221 | BEGIN_RCPP
222 |     Rcpp::RObject __result;
223 |     Rcpp::RNGScope __rngScope;
224 |     Rcpp::traits::input_parameter< const std::vector<double>& >::type x(xSEXP);
225 |     __result = Rcpp::wrap(median(x));
226 |     return __result;
227 | END_RCPP
228 | }
229 | // regress_robust
230 | NumericVector regress_robust(const std::vector<double>& x, const std::vector<double>& y, const std::vector<double>& w, int iterations);
231 | RcppExport SEXP bigvis_regress_robust(SEXP xSEXP, SEXP ySEXP, SEXP wSEXP, SEXP iterationsSEXP) {
232 | BEGIN_RCPP
233 |     Rcpp::RObject __result;
234 |     Rcpp::RNGScope __rngScope;
235 |     Rcpp::traits::input_parameter< const std::vector<double>& >::type x(xSEXP);
236 |     Rcpp::traits::input_parameter< const std::vector<double>& >::type y(ySEXP);
237 |     Rcpp::traits::input_parameter< const std::vector<double>& >::type w(wSEXP);
238 |     Rcpp::traits::input_parameter< int >::type iterations(iterationsSEXP);
239 |     __result = Rcpp::wrap(regress_robust(x, y, w, iterations));
240 |     return __result;
241 | END_RCPP
242 | }
243 | // compute_moments
244 | NumericVector compute_moments(const NumericVector& x);
245 | RcppExport SEXP bigvis_compute_moments(SEXP xSEXP) {
246 | BEGIN_RCPP
247 |     Rcpp::RObject __result;
248 |     Rcpp::RNGScope __rngScope;
249 |     Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);
250 |     __result = Rcpp::wrap(compute_moments(x));
251 |     return __result;
252 | END_RCPP
253 | }
254 | // compute_sum
255 | NumericVector compute_sum(const NumericVector& x);
256 | RcppExport SEXP bigvis_compute_sum(SEXP xSEXP) {
257 | BEGIN_RCPP
258 |     Rcpp::RObject __result;
259 |     Rcpp::RNGScope __rngScope;
260 |     Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);
261 |     __result = Rcpp::wrap(compute_sum(x));
262 |     return __result;
263 | END_RCPP
264 | }
265 | // compute_median
266 | NumericVector compute_median(const NumericVector& x);
267 | RcppExport SEXP bigvis_compute_median(SEXP xSEXP) {
268 | BEGIN_RCPP
269 |     Rcpp::RObject __result;
270 |     Rcpp::RNGScope __rngScope;
271 |     Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP);
272 |     __result = Rcpp::wrap(compute_median(x));
273 |     return __result;
274 | END_RCPP
275 | }
276 | 


--------------------------------------------------------------------------------