├── .Rbuildignore ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── NEWS.md ├── R ├── addCol.R ├── addIds.R ├── bind_rows.R ├── checkRank.R ├── coalesce.R ├── colClasses.R ├── copyToFrom.R ├── dim.R ├── expandColumn.R ├── filter.R ├── groupedApply.R ├── inTest.R ├── joinController.R ├── land.R ├── nrow.R ├── quantile.R ├── renameRestrictCols.R ├── replyr.R ├── serviceName.R ├── summary.R ├── underscoreReplacements.R └── uniqueValues.R ├── README.Rmd ├── README.md ├── _pkgdown.yml ├── checks ├── .gitignore ├── CheckFns.R ├── README.Rmd ├── README.md ├── SmallTimings.Rmd ├── SmallTimings.html ├── SmallTimings.md ├── SmallTimings_files │ └── figure-markdown_github │ │ └── pressure-1.png ├── SparkCrasher.Rmd ├── SplitApplyCombineSpark.Rmd ├── SplitApplyCombineSpark.md ├── basicChecks.Rmd ├── basicChecks.md ├── gapply.Rmd └── gapply.md ├── cleanFluff.bash ├── cran-comments.md ├── docs ├── 404.html ├── LICENSE-text.html ├── _config.yml ├── articles │ ├── DependencySorting.html │ ├── DependencySorting_d.png │ ├── DependencySorting_files │ │ └── accessible-code-block-0.0.1 │ │ │ └── empty-anchor.js │ ├── DependencySorting_ig.png │ ├── ParametricExample.html │ ├── ParametricExample_files │ │ └── accessible-code-block-0.0.1 │ │ │ └── empty-anchor.js │ ├── coalesce.html │ ├── coalesce_files │ │ └── accessible-code-block-0.0.1 │ │ │ └── empty-anchor.js │ ├── index.html │ ├── joinController.html │ ├── joinController1.png │ ├── joinController_files │ │ └── accessible-code-block-0.0.1 │ │ │ └── empty-anchor.js │ ├── letExample.html │ ├── letExample_files │ │ ├── accessible-code-block-0.0.1 │ │ │ └── empty-anchor.js │ │ └── figure-html │ │ │ ├── ggplot1-1.png │ │ │ └── ggplot2-1.png │ ├── replyr.html │ ├── replyr_files │ │ └── accessible-code-block-0.0.1 │ │ │ └── empty-anchor.js │ ├── replyrs.png │ ├── summary.html │ └── summary_files │ │ └── accessible-code-block-0.0.1 │ │ └── empty-anchor.js ├── authors.html ├── bootstrap-toc.css ├── bootstrap-toc.js ├── docsearch.css ├── docsearch.js ├── index.html ├── jquery.sticky-kit.min.js ├── link.svg ├── news │ └── index.html ├── pkgdown.css ├── pkgdown.js ├── pkgdown.yml ├── reference │ ├── Rplot001.png │ ├── addConstantColumn.html │ ├── buildJoinPlan.html │ ├── dplyr_src_to_db_handle.html │ ├── example_employeeAndDate.html │ ├── executeLeftJoinPlan.html │ ├── expandColumn.html │ ├── gapply.html │ ├── grapes-land-grapes.html │ ├── index.html │ ├── inspectDescrAndJoinPlan.html │ ├── key_inspector_all_cols.html │ ├── key_inspector_postgresql.html │ ├── key_inspector_sqlite.html │ ├── keysAreUnique.html │ ├── makeJoinDiagramSpec.html │ ├── replyr.html │ ├── replyr_add_ids.html │ ├── replyr_apply_f_mapped.html │ ├── replyr_arrange.html │ ├── replyr_bind_rows.html │ ├── replyr_check_ranks.html │ ├── replyr_coalesce.html │ ├── replyr_colClasses.html │ ├── replyr_copy_from.html │ ├── replyr_copy_to.html │ ├── replyr_dim.html │ ├── replyr_filter.html │ ├── replyr_get_src.html │ ├── replyr_group_by.html │ ├── replyr_has_table.html │ ├── replyr_hasrows.html │ ├── replyr_inTest.html │ ├── replyr_is_MySQL_data.html │ ├── replyr_is_Spark_data.html │ ├── replyr_is_local_data.html │ ├── replyr_list_tables.html │ ├── replyr_mapRestrictCols.html │ ├── replyr_ncol.html │ ├── replyr_nrow.html │ ├── replyr_quantile.html │ ├── replyr_quantilec.html │ ├── replyr_rename.html │ ├── replyr_reverseMap.html │ ├── replyr_select.html │ ├── replyr_split.html │ ├── replyr_summary.html │ ├── replyr_testCols.html │ ├── replyr_union_all.html │ ├── replyr_uniqueValues.html │ ├── 
tableDescription.html │ └── topoSortTables.html └── sitemap.xml ├── extras ├── BigDataTransforms.Rmd ├── BigDataTransforms.md ├── KnitrParameters.Rmd ├── KnitrParameters.md ├── KnitrParameters_files │ └── figure-markdown_github │ │ ├── bindings-1.png │ │ └── values-1.png └── summary_next.R ├── issues ├── BindIssue.Rmd ├── BindIssue.md ├── ComplexJoins.Rmd ├── ComplexJoins.md ├── DplyrDevnrow.Rmd ├── DplyrDevnrow.md ├── DplyrSparklyr.Rmd ├── DplyrSparklyr.md ├── DplyrSparklyr_CRANdplyr_CRANsparklyr.md ├── DplyrSparklyr_DEVdplyr_CRANsparklyr.md ├── DplyrSparklyr_DEVdplyr_DEVsparklyr.md ├── HeadIssue.Rmd ├── HeadIssue.md ├── JoinNamesDups.Rmd ├── JoinNamesDups.md ├── MySQLSelfJoin.Rmd ├── MySQLSelfJoin.md ├── MySQL_mutate.Rmd ├── MySQL_mutate.md ├── MySQLcast.Rmd ├── MySQLcast.md ├── NAvalues.Rmd ├── NAvalues.md ├── README.Rmd ├── README.md ├── SQLiteColtypes.Rmd ├── SQLiteColtypes.md ├── SQLitesd.Rmd ├── SQLitesd.md ├── SparkNAIssue.Rmd ├── SparkNAIssue.md ├── SparklyrRename.Rmd ├── SparklyrRename.md ├── TrailingRefIssue.Rmd ├── TrailingRefIssue.md ├── UnionIssue.Rmd ├── UnionIssue.md ├── arrangecompute.Rmd ├── arrangecompute.md ├── copyIssueMySQL.Rmd ├── copyIssueMySQL.md ├── copyissue162.Rmd ├── copyissue162.md ├── copyissue200.Rmd ├── copyissue200.md ├── factorissue.Rmd ├── factorissue.md ├── union_all_issue.Rmd └── union_all_issue.md ├── man ├── addConstantColumn.Rd ├── buildJoinPlan.Rd ├── dplyr_src_to_db_handle.Rd ├── example_employeeAndDate.Rd ├── executeLeftJoinPlan.Rd ├── expandColumn.Rd ├── gapply.Rd ├── grapes-land-grapes.Rd ├── inspectDescrAndJoinPlan.Rd ├── key_inspector_all_cols.Rd ├── key_inspector_postgresql.Rd ├── key_inspector_sqlite.Rd ├── keysAreUnique.Rd ├── makeJoinDiagramSpec.Rd ├── replyr.Rd ├── replyr_add_ids.Rd ├── replyr_apply_f_mapped.Rd ├── replyr_arrange.Rd ├── replyr_bind_rows.Rd ├── replyr_check_ranks.Rd ├── replyr_coalesce.Rd ├── replyr_colClasses.Rd ├── replyr_copy_from.Rd ├── replyr_copy_to.Rd ├── replyr_dim.Rd ├── replyr_filter.Rd ├── replyr_get_src.Rd ├── replyr_group_by.Rd ├── replyr_has_table.Rd ├── replyr_hasrows.Rd ├── replyr_inTest.Rd ├── replyr_is_MySQL_data.Rd ├── replyr_is_Spark_data.Rd ├── replyr_is_local_data.Rd ├── replyr_list_tables.Rd ├── replyr_mapRestrictCols.Rd ├── replyr_ncol.Rd ├── replyr_nrow.Rd ├── replyr_quantile.Rd ├── replyr_quantilec.Rd ├── replyr_rename.Rd ├── replyr_reverseMap.Rd ├── replyr_select.Rd ├── replyr_split.Rd ├── replyr_summary.Rd ├── replyr_testCols.Rd ├── replyr_union_all.Rd ├── replyr_uniqueValues.Rd ├── tableDescription.Rd └── topoSortTables.Rd ├── replyr.Rproj ├── tests ├── testthat.R └── testthat │ ├── testOne.R │ ├── test_gapply.R │ ├── test_grapes-land-grapes.R │ ├── test_let.R │ ├── test_replyr_bind_rows.R │ ├── test_replyr_check_ranks.R │ ├── test_replyr_colClasses.R │ ├── test_replyr_copy_from.R │ ├── test_replyr_copy_to.R │ ├── test_replyr_dim.R │ ├── test_replyr_filter.R │ ├── test_replyr_inTest.R │ ├── test_replyr_mapRestrictCols.R │ ├── test_replyr_nrow.R │ ├── test_replyr_quantile.R │ ├── test_replyr_quantilec.R │ ├── test_replyr_split.R │ ├── test_replyr_summary.R │ ├── test_replyr_testCols.R │ └── test_replyr_uniqueValues.R ├── tools └── replyrs.png └── vignettes ├── .gitignore ├── DependencySorting.Rmd ├── DependencySorting_d.png ├── DependencySorting_ig.png ├── ParametricExample.Rmd ├── coalesce.Rmd ├── joinController.Rmd ├── joinController1.png ├── letExample.Rmd ├── replyr.Rmd ├── replyrs.png └── summary.Rmd /.Rbuildignore: 
-------------------------------------------------------------------------------- 1 | ^CRAN-RELEASE$ 2 | ^.*\.Rproj$ 3 | ^\.Rproj\.user$ 4 | ^18029435653_4d64c656c8_z.jpg$ 5 | ^replyrs.png$ 6 | ^IrwinViseGrip.jpg$ 7 | ^README.Rmd$ 8 | ^README.html$ 9 | ^issues/.*$ 10 | ^checks/.*$ 11 | ^extras/.*$ 12 | ^cleanFluff.bash$ 13 | ^derby.log$ 14 | ^log4j.spark.log$ 15 | ^cran-comments.md$ 16 | ^docs$ 17 | ^_pkgdown\.yml$ 18 | ^revdep$ 19 | ^logs$ 20 | ^articles$ 21 | ^LICENSE$ 22 | ^doc$ 23 | ^Meta$ 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | *~ 6 | .DS_Store 7 | inst/doc 8 | vignettes/replyr_sqliteEx.sqlite3 9 | replyr_sqliteEx.sqlite3 10 | *.log 11 | *.log.* 12 | *.sqlite3* 13 | revdep 14 | spark-warehouse 15 | logs 16 | CRAN-RELEASE 17 | doc 18 | Meta 19 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: replyr 2 | Type: Package 3 | Title: Patches to Use 'dplyr' on Remote Data Sources 4 | Version: 1.0.5 5 | Date: 2019-11-01 6 | Authors@R: c( 7 | person("John", "Mount", email = "jmount@win-vector.com", role = c("aut", "cre")), 8 | person("Nina", "Zumel", email = "nzumel@win-vector.com", role = c("aut")), 9 | person(family = "Win-Vector LLC", role = c("cph")) 10 | ) 11 | Maintainer: John Mount 12 | URL: https://github.com/WinVector/replyr/, https://winvector.github.io/replyr/ 13 | BugReports: https://github.com/WinVector/replyr/issues 14 | Description: Patches to use 'dplyr' on remote data sources ('SQL' databases, 15 | 'Spark' 2.0.0 and above) in a reliable "generic" fashion (generic meaning 16 | user code works similarly on all such sources, without needing per-source 17 | adaption). Due to the fluctuating nature of 'dplyr'/'dbplyr'/'rlang' 'APIs' this package 18 | is going into maintenance mode. Most of the 'replyr' functions are now 19 | done better by one of the non-monolithic replacement packages: 'wrapr', 'seplyr', 'rquery', 20 | or 'cdata'. 
21 | License: GPL-2 | GPL-3 22 | LazyData: TRUE 23 | Depends: 24 | R (>= 3.4.0), 25 | wrapr (>= 1.9.2) 26 | Imports: 27 | dplyr (>= 0.7.0), 28 | rlang (>= 0.2.0), 29 | dbplyr, 30 | DBI 31 | RoxygenNote: 7.1.1 32 | Suggests: testthat, 33 | knitr, 34 | rmarkdown, 35 | sparklyr, 36 | igraph, 37 | DiagrammeR, 38 | RSQLite 39 | VignetteBuilder: knitr 40 | ByteCompile: true 41 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export("%->%") 4 | export("%->_%") 5 | export("%land%") 6 | export("%land_%") 7 | export(addConstantColumn) 8 | export(buildJoinPlan) 9 | export(dplyr_src_to_db_handle) 10 | export(example_employeeAndDate) 11 | export(executeLeftJoinPlan) 12 | export(expandColumn) 13 | export(gapply) 14 | export(inspectDescrAndJoinPlan) 15 | export(key_inspector_all_cols) 16 | export(key_inspector_postgresql) 17 | export(key_inspector_sqlite) 18 | export(keysAreUnique) 19 | export(makeJoinDiagramSpec) 20 | export(replyr_add_ids) 21 | export(replyr_apply_f_mapped) 22 | export(replyr_arrange) 23 | export(replyr_bind_rows) 24 | export(replyr_check_ranks) 25 | export(replyr_coalesce) 26 | export(replyr_colClasses) 27 | export(replyr_copy_from) 28 | export(replyr_copy_to) 29 | export(replyr_dim) 30 | export(replyr_filter) 31 | export(replyr_get_src) 32 | export(replyr_group_by) 33 | export(replyr_has_table) 34 | export(replyr_hasrows) 35 | export(replyr_inTest) 36 | export(replyr_is_MySQL_data) 37 | export(replyr_is_Spark_data) 38 | export(replyr_is_local_data) 39 | export(replyr_list_tables) 40 | export(replyr_mapRestrictCols) 41 | export(replyr_ncol) 42 | export(replyr_nrow) 43 | export(replyr_quantile) 44 | export(replyr_quantilec) 45 | export(replyr_rename) 46 | export(replyr_reverseMap) 47 | export(replyr_select) 48 | export(replyr_split) 49 | export(replyr_summary) 50 | export(replyr_testCols) 51 | export(replyr_union_all) 52 | export(replyr_uniqueValues) 53 | export(tableDescription) 54 | export(topoSortTables) 55 | importFrom(DBI,dbConnect) 56 | importFrom(dbplyr,db_copy_to) 57 | importFrom(dplyr,arrange) 58 | importFrom(dplyr,as.tbl) 59 | importFrom(dplyr,collect) 60 | importFrom(dplyr,compute) 61 | importFrom(dplyr,funs) 62 | importFrom(dplyr,inner_join) 63 | importFrom(dplyr,left_join) 64 | importFrom(dplyr,mutate) 65 | importFrom(dplyr,n) 66 | importFrom(dplyr,summarise_all) 67 | importFrom(dplyr,summarize) 68 | importFrom(dplyr,tbl) 69 | importFrom(dplyr,transmute) 70 | importFrom(dplyr,ungroup) 71 | importFrom(rlang,sym) 72 | importFrom(rlang,syms) 73 | importFrom(stats,sd) 74 | importFrom(stats,setNames) 75 | importFrom(utils,capture.output) 76 | importFrom(utils,head) 77 | importFrom(wrapr,"%.>%") 78 | importFrom(wrapr,":=") 79 | importFrom(wrapr,let) 80 | importFrom(wrapr,mk_tmp_name_source) 81 | -------------------------------------------------------------------------------- /R/addCol.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #' Add constant to a table. 5 | #' 6 | #' Work around different treatment of character types across remote 7 | #' data sources when adding a 8 | #' constant column to a table. Deals with issues such as Postgresql 9 | #' requiring a character-cast and MySQL not allowing such. 10 | #' 11 | #' @param d data.frame like object to add column to. 12 | #' @param colName character, name of column to add. 
13 | #' @param val scalar, value to add. 14 | #' @param ... force later arguments to be bound by name. 15 | #' @param tempNameGenerator temp name generator produced by wrapr::mk_tmp_name_source, used to record dplyr::compute() effects. 16 | #' @return table with new column added. 17 | #' 18 | #' @examples 19 | #' 20 | #' d <- data.frame(x= c(1:3)) 21 | #' addConstantColumn(d, 'newCol', 'newVal') 22 | #' 23 | #' @export 24 | addConstantColumn <- function(d, 25 | colName, val, 26 | ..., 27 | tempNameGenerator= mk_tmp_name_source("replyr_addConstantColumn")) { 28 | # PostgreSQL and Spark 1.6.2 don't like blank character values 29 | # hope dplyr lazyeval carries the cast over to the database 30 | # And MySQL can't accept the SQL dplyr emits with character cast 31 | if((length(colName)!=1)||(!is.character(colName))) { 32 | stop("replyr::addConstantColumn colName must be a string") 33 | } 34 | if((length(val)!=1)||(is.list(val))) { 35 | stop("replyr::addConstantColumn val must be a non-null length 1 vector") 36 | } 37 | isMySQL <- replyr_is_MySQL_data(d) 38 | useCharCast <- is.character(val) && (!isMySQL) 39 | if(useCharCast) { 40 | let(list(REPLYRCOLNAME=colName), 41 | dm <- dplyr::mutate(d, REPLYRCOLNAME=as.character(val)) 42 | ) 43 | } else { 44 | let(list(REPLYRCOLNAME=colName), 45 | dm <- dplyr::mutate(d, REPLYRCOLNAME=val) 46 | ) 47 | } 48 | # force calculation, as changing of replyr_private_name_vi was changing previously assigned columns! 49 | # needed to work around this: https://github.com/WinVector/replyr/blob/master/issues/TrailingRefIssue.md 50 | dm <- dplyr::compute(dm, name= tempNameGenerator()) 51 | throwAway <- collect(head(dm)) # make sure calc is forced 52 | dm 53 | } 54 | -------------------------------------------------------------------------------- /R/addIds.R: -------------------------------------------------------------------------------- 1 | 2 | #' Add unique ids to rows. Note: re-arranges rows in many cases. 3 | #' 4 | #' 5 | #' @param df data.frame object to work with 6 | #' @param idColName name of column to add 7 | #' @param env environment to evaluate in (not used). 8 | #' @param local_short_cut logical, if TRUE use base R on local data. 
9 | #' 10 | #' @examples 11 | #' 12 | #' replyr_add_ids(data.frame(x=c('a','b')), 'id', local_short_cut = FALSE) 13 | #' 14 | #' @export 15 | replyr_add_ids <- function(df, idColName, 16 | env = parent.frame(), 17 | local_short_cut = TRUE) { 18 | force(env) 19 | if(local_short_cut) { 20 | if(replyr_is_local_data(df)) { 21 | # some source of local frame 22 | df[[idColName]] <- seq_len(replyr_nrow(df)) 23 | return(df) 24 | } 25 | } 26 | if(replyr_is_Spark_data(df)) { 27 | if(requireNamespace('sparklyr', quietly = TRUE)) { 28 | return(sparklyr::sdf_with_unique_id(df, id = idColName)) 29 | } 30 | } 31 | # dplyr style, throws if not ordered 32 | REPLYRIDCOLNAME <- NULL # indicate not an unbound variable 33 | row_number <- dplyr::row_number # declare not unbound function 34 | # using dplyr::row_number() throws: Error in UseMethod("escape") : 35 | # no applicable method for 'escape' applied to an object of class "function" 36 | # Also https://github.com/tidyverse/dplyr/issues/3008 37 | wrapr::let( 38 | c(REPLYRIDCOLNAME= idColName), 39 | df <- 40 | mutate(df, REPLYRIDCOLNAME = row_number()) 41 | ) 42 | # # SQL-style try, only warns if not ordered 43 | # REPLYRIDCOLNAME <- NULL # indicate not an unbound variable 44 | # wrapr::let( 45 | # c(REPLYRIDCOLNAME= idColName), 46 | # df %.>% 47 | # mutate(., REPLYRIDCOLNAME= 1) %.>% 48 | # mutate(., REPLYRIDCOLNAME= cumsum(REPLYRIDCOLNAME)) -> df 49 | # ) 50 | df 51 | } -------------------------------------------------------------------------------- /R/colClasses.R: -------------------------------------------------------------------------------- 1 | 2 | # Contributed by John Mount jmount@win-vector.com , ownership assigned to Win-Vector LLC. 3 | # Win-Vector LLC currently distributes this code without intellectual property indemnification, warranty, claim of fitness of purpose, or any other guarantee under a GPL3 license. 4 | 5 | #' Get column classes. 6 | #' 7 | #' @param x tbl or item that can be coerced into such. 8 | #' @return list of column classes. 9 | #' 10 | #' @examples 11 | #' 12 | #' d <- data.frame(x=c(1,2)) 13 | #' replyr_colClasses(d) 14 | #' 15 | #' @export 16 | replyr_colClasses <- function(x) { 17 | x %.>% 18 | dplyr::ungroup(.) %.>% 19 | head(.) %.>% 20 | dplyr::collect(.) %.>% 21 | as.data.frame(.) -> topx 22 | classes <- lapply(topx,class) 23 | names(classes) <- colnames(topx) 24 | classes 25 | } 26 | 27 | #' Run test on columns. 28 | #' 29 | #' Applies user function to head of each column. Good for determing things 30 | #' such as column class. 31 | #' 32 | #' @param x tbl or item that can be coerced into such. 33 | #' @param f test function (returning logical, not depending on data length). 34 | #' @param n number of rows to use in calculation. 35 | #' @return logical vector of results. 36 | #' 37 | #' @examples 38 | #' 39 | #' d <- data.frame(x=c(1,2),y=c('a','b')) 40 | #' replyr_testCols(d,is.numeric) 41 | #' 42 | #' @export 43 | replyr_testCols <- function(x, f, n = 6L) { 44 | x %.>% 45 | head(., n=n) %.>% 46 | dplyr::collect(.) %.>% 47 | as.data.frame(.) -> topx 48 | vapply(topx,f,logical(1)) 49 | } 50 | -------------------------------------------------------------------------------- /R/copyToFrom.R: -------------------------------------------------------------------------------- 1 | 2 | # Contributed by John Mount jmount@win-vector.com , ownership assigned to Win-Vector LLC. 
3 | # Win-Vector LLC currently distributes this code without intellectual property indemnification, warranty, claim of fitness of purpose, or any other guarantee under a GPL3 license. 4 | 5 | #' @importFrom dplyr collect 6 | #' @importFrom dbplyr db_copy_to 7 | #' @importFrom DBI dbConnect 8 | #' @importFrom rlang sym 9 | NULL 10 | 11 | 12 | 13 | 14 | 15 | #' Copy data to remote service. 16 | #' 17 | #' @param dest remote data source 18 | #' @param df local data frame 19 | #' @param name name for new remote table 20 | #' @param ... force later values to be bound by name 21 | #' @param rowNumberColumn if not null name to add row numbers to 22 | #' @param temporary logical, if TRUE try to create a temporary table 23 | #' @param overwrite logical, if TRUE try to overwrite 24 | #' @param maxrow max rows to allow in a remote to remote copy. 25 | #' @return remote handle 26 | #' 27 | #' @examples 28 | #' 29 | #' 30 | #' if (requireNamespace("RSQLite", quietly = TRUE)) { 31 | #' my_db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") 32 | #' RSQLite::initExtension(my_db) 33 | #' d <- replyr_copy_to(my_db, data.frame(x=c(1,2)), 'd') 34 | #' print(d) 35 | #' DBI::dbDisconnect(my_db) 36 | #' } 37 | #' 38 | #' @export 39 | replyr_copy_to <- function(dest, 40 | df, name = paste(deparse(substitute(df)), collapse= ' '), 41 | ..., 42 | rowNumberColumn= NULL, 43 | temporary= FALSE, 44 | overwrite= TRUE, 45 | maxrow= 1000000) { 46 | # try to force any errors early, and try to fail prior to side-effects 47 | if(length(list(...))>0) { 48 | stop('replyr::replyr_copy_to unexpected arguments') 49 | } 50 | force(dest) 51 | force(df) 52 | force(name) 53 | if(!replyr_is_local_data(df)) { 54 | warning("replyr::replyr_copy_to called on non-local table") 55 | df <- replyr_copy_from(df, maxrow = maxrow) 56 | } 57 | if(is.null(dest)) { 58 | # special "no destination" case 59 | return(df) 60 | } 61 | if(is.null(df)) { 62 | stop("NULL df to replyr::replyr_copy_to") 63 | } 64 | if((!is.character(name))||(length(name)!=1)||(nchar(name)<1)) { 65 | stop('replyr::replyr_copy_to name must be a single non-empty string') 66 | } 67 | if(!is.null(rowNumberColumn)) { 68 | df[[rowNumberColumn]] <- seq_len(replyr_nrow(df)) 69 | } 70 | dplyr::copy_to(dest, df, name, 71 | temporary=temporary, 72 | overwrite=overwrite) 73 | } 74 | 75 | #' Bring remote data back as a local data frame tbl. 76 | #' 77 | #' @param d remote dplyr data item 78 | #' @param maxrow max rows to allow (stop otherwise, set to NULL to allow any size). 79 | #' @return local tbl. 80 | #' 81 | #' @examples 82 | #' 83 | #' 84 | #' if (requireNamespace("RSQLite", quietly = TRUE)) { 85 | #' my_db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") 86 | #' RSQLite::initExtension(my_db) 87 | #' d <- replyr_copy_to(my_db,data.frame(x=c(1,2)),'d') 88 | #' d2 <- replyr_copy_from(d) 89 | #' print(d2) 90 | #' DBI::dbDisconnect(my_db) 91 | #' } 92 | #' 93 | #' @export 94 | replyr_copy_from <- function(d, maxrow= 1000000) { 95 | if(!is.null(maxrow)) { 96 | n <- replyr_nrow(d) 97 | if(n>maxrow) { 98 | stop("replyr_copy_from maximum rows exceeded") 99 | } 100 | } 101 | dplyr::collect(d) 102 | } 103 | -------------------------------------------------------------------------------- /R/dim.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' Compute number of columns of a data.frame (work around https://github.com/rstudio/sparklyr/issues/976 ). 4 | #' 5 | #' 6 | #' @param x tbl or item that can be coerced into such. 
7 | #' @return number of columns 8 | #' 9 | #' @examples 10 | #' 11 | #' d <- data.frame(x=c(1,2)) 12 | #' replyr_ncol(d) 13 | #' 14 | #' @export 15 | replyr_ncol <- function(x) { 16 | length(colnames(x)) 17 | } 18 | 19 | 20 | 21 | 22 | #' Compute dimensions of a data.frame (work around https://github.com/rstudio/sparklyr/issues/976 ). 23 | #' 24 | #' @param x tbl or item that can be coerced into such. 25 | #' @return dimensions (including rows) 26 | #' 27 | #' @examples 28 | #' 29 | #' d <- data.frame(x=c(1,2)) 30 | #' replyr_dim(d) 31 | #' 32 | #' @export 33 | replyr_dim <- function(x) { 34 | nrows <- replyr_nrow(x) 35 | ncol <- replyr_ncol(x) 36 | c(nrows, ncol) 37 | } 38 | -------------------------------------------------------------------------------- /R/filter.R: -------------------------------------------------------------------------------- 1 | 2 | # Contributed by John Mount jmount@win-vector.com , ownership assigned to Win-Vector LLC. 3 | # Win-Vector LLC currently distributes this code without intellectual property indemnification, warranty, claim of fitness of purpose, or any other guarantee under a GPL3 license. 4 | 5 | #' @importFrom dplyr ungroup mutate summarize tbl as.tbl compute inner_join 6 | NULL 7 | 8 | #' Filter a tbl on a column having values in a given set. 9 | #' 10 | #' 11 | #' @param x tbl or item that can be coerced into such. 12 | #' @param cname name of the column to test values of. 13 | #' @param values set of values to check set membership of. 14 | #' @param ... force later arguments to bind by name. 15 | #' @param verbose logical if TRUE echo warnings 16 | #' @param tempNameGenerator temp name generator produced by wrapr::mk_tmp_name_source, used to record dplyr::compute() effects. 17 | #' @return new tbl with only rows where cname value is in values set. 18 | #' 19 | #' @examples 20 | #' 21 | #' values <- c('a','c') 22 | #' d <- data.frame(x=c('a','a','b','b','c','c'),y=1:6, 23 | #' stringsAsFactors=FALSE) 24 | #' replyr_filter(d,'x',values) 25 | #' 26 | #' @export 27 | replyr_filter <- function(x,cname,values, 28 | ..., 29 | verbose=TRUE, 30 | tempNameGenerator= mk_tmp_name_source("replyr_filter")) { 31 | if(length(list(...))>0) { 32 | stop("replyr::replyr_filter unexpected arguments.") 33 | } 34 | if((!is.character(cname))||(length(cname)!=1)||(cname[[1]]=='n')) { 35 | stop('replyr_filter cname must be a single string not equal to "n"') 36 | } 37 | vtbl <- data.frame(x=unique(values),stringsAsFactors=FALSE) 38 | # Spark 1.6.2 doesn't like same column names accross joins, even 39 | # in the by clause from dplyr. So build a new column name. 40 | # "by" notation from http://stackoverflow.com/questions/21888910/how-to-specify-names-of-columns-for-x-and-y-when-joining-in-dplyr 41 | newname <- make.names(c(colnames(x),paste('y',cname,sep='_')),unique = TRUE) 42 | newname <- newname[length(newname)] 43 | byClause <- newname 44 | names(byClause) <- cname 45 | colnames(vtbl) <- newname 46 | jtab <- dplyr::as.tbl(vtbl) 47 | if(!replyr_is_local_data(x)) { 48 | cn <- replyr_get_src(x) 49 | jtab <- replyr_copy_to(cn, jtab, tempNameGenerator(), 50 | temporary = TRUE) 51 | } 52 | # dplyr::*_join(jtab,by=cname,copy=TRUE) has been bombing out with: 53 | # "CREATE TEMPORARY TABLE is not supported" (spark 2.0.0, hadoop 2.7) 54 | # spark 1.6.2 can't join tables with matching names (even as the join condition). 
55 | # which is why we copy first 56 | res <- NULL 57 | x %.>% 58 | dplyr::inner_join(.,jtab, by=byClause) %.>% 59 | dplyr::compute(., name= tempNameGenerator()) -> res 60 | res 61 | } 62 | -------------------------------------------------------------------------------- /R/inTest.R: -------------------------------------------------------------------------------- 1 | 2 | # Contributed by John Mount jmount@win-vector.com , ownership assigned to Win-Vector LLC. 3 | # Win-Vector LLC currently distributes this code without intellectual property indemnification, warranty, claim of fitness of purpose, or any other guarantee under a GPL3 license. 4 | 5 | #' @importFrom dplyr ungroup mutate summarize tbl as.tbl compute left_join 6 | NULL 7 | 8 | #' Produce a column noting if another column's values are in a given set. 9 | #' 10 | #' 11 | #' @param x tbl or item that can be coerced into such. 12 | #' @param cname name of the column to test values of. 13 | #' @param values set of values to check set membership of. 14 | #' @param nname name for new column 15 | #' @param ... force later parameters to bind by name 16 | #' @param tempNameGenerator temp name generator produced by wrapr::mk_tmp_name_source, used to record dplyr::compute() effects. 17 | #' @param verbose logical if TRUE echo warnings 18 | #' @return table with membership indications. 19 | #' 20 | #' @examples 21 | #' 22 | #' values <- c('a','c') 23 | #' d <- data.frame(x=c('a','a','b',NA,'c','c'),y=1:6, 24 | #' stringsAsFactors=FALSE) 25 | #' replyr_inTest(d,'x',values,'match') 26 | #' 27 | #' @export 28 | replyr_inTest <- function(x,cname,values,nname, 29 | ..., 30 | tempNameGenerator= mk_tmp_name_source("replyr_inTest"), 31 | verbose=TRUE) { 32 | if(length(list(...))>0) { 33 | stop("replyr::replyr_inTest unexpected arguments.") 34 | } 35 | if((!is.character(cname))||(length(cname)!=1)||(cname[[1]]=='n')) { 36 | stop('replyr_inTest cname must be a single string not equal to "n"') 37 | } 38 | if((!is.character(nname))||(length(nname)!=1)||(nname[[1]]=='n')) { 39 | stop('replyr_inTest nname must be a single string not equal to "n"') 40 | } 41 | vtbl <- data.frame(x=unique(values),stringsAsFactors=FALSE) 42 | # Spark 1.6.2 doesn't like same column names across joins, even 43 | # in the by clause from dplyr. So build a new column name. 44 | # "by" notation from http://stackoverflow.com/questions/21888910/how-to-specify-names-of-columns-for-x-and-y-when-joining-in-dplyr 45 | newname <- make.names(c(colnames(x),paste('y',cname,sep='_')),unique = TRUE) 46 | newname <- newname[length(newname)] 47 | byClause <- newname 48 | names(byClause) <- cname 49 | colnames(vtbl) <- newname 50 | vtbl[[nname]] <- TRUE 51 | jtab <- dplyr::as.tbl(vtbl) 52 | if(!replyr_is_local_data(x)) { 53 | cn <- replyr_get_src(x) 54 | jtab <- replyr_copy_to(cn, jtab, 55 | tempNameGenerator(), 56 | temporary = TRUE) 57 | } 58 | # dplyr::*_join(jtab,by=cname,copy=TRUE) has been bombing out with: 59 | # "CREATE TEMPORARY TABLE is not supported" (spark 2.0.0, hadoop 2.7) 60 | # spark 1.6.2 can't join tables with matching names (even as the join condition). 61 | # dplyr 0.5.0, sparklyr 0.4, so need to work around. 62 | # Try it the right way first (this way works well on good stacks). 
63 | res <- NULL 64 | good <- FALSE 65 | x %.>% 66 | dplyr::left_join(., jtab, by=byClause) %.>% 67 | dplyr::compute(., name= tempNameGenerator()) -> res 68 | # replace NA with false 69 | RCOL <- NULL # declare no external binding 70 | let( 71 | list(RCOL=nname), 72 | res <- dplyr::mutate(res, RCOL=!is.na(RCOL)) 73 | ) 74 | res 75 | } 76 | -------------------------------------------------------------------------------- /R/land.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | isValidAndUnreservedName <- function(string) { 4 | (is.character(string)) && 5 | (length(string)==1) && 6 | (make.names(string,unique = FALSE, allow_ = TRUE) == string) 7 | } 8 | 9 | 10 | #' Land a value to variable from a pipeline. 11 | #' 12 | #' \%land\% and \%->\% ("writearrow") copy a pipeline value to a variable on the 13 | #' right hand side. 14 | #' \%land_\% and \%->_\% copy a pipeline value to 15 | #' a variable named by the value referenced by its right hand side argument. 16 | #' 17 | #' 18 | #' 19 | #' Technically these operators are 20 | #' not "-> assignment", so they might not be specifically prohibited in an 21 | #' oppugnant reading of some style guides. 22 | #' 23 | #' @param value value to write 24 | #' @param name variable to write to 25 | #' @return value 26 | #' 27 | #' @examples 28 | #' 29 | #' sin(7) %->% z1 30 | #' sin(7) %->_% 'z2' 31 | #' varname <- 'z3' 32 | #' sin(7) %->_% varname 33 | #' 34 | #' @export 35 | `%land%` <- function(value, name) { 36 | name <- as.character(substitute(name)) 37 | if((length(name)!=1)||(!is.character(name))|| 38 | (!isValidAndUnreservedName(name))) { 39 | stop("replyr::`%land%` name argument must be a valid potential variable name") 40 | } 41 | envir <- parent.frame(1) 42 | assign(name, value, 43 | pos = envir, 44 | envir = envir) 45 | invisible(value) 46 | } 47 | 48 | #' @export 49 | #' @rdname grapes-land-grapes 50 | `%->%` <- function(value, name) { 51 | name <- as.character(substitute(name)) 52 | if((length(name)!=1)||(!is.character(name))|| 53 | (!isValidAndUnreservedName(name))) { 54 | stop("replyr::`%->%` name argument must be a valid potential variable name") 55 | } 56 | envir <- parent.frame(1) 57 | assign(name, value, 58 | pos = envir, 59 | envir = envir) 60 | invisible(value) 61 | } 62 | 63 | #' @export 64 | #' @rdname grapes-land-grapes 65 | `%->_%` <- function(value, name) { 66 | if(is.name(name)) { 67 | name <- as.character(name) 68 | } 69 | if((length(name)!=1)||(!is.character(name))|| 70 | (!isValidAndUnreservedName(name))) { 71 | stop("replyr::`%->_%` name argument must be a valid potential variable name") 72 | } 73 | envir <- parent.frame(1) 74 | assign(name, value, 75 | pos = envir, 76 | envir = envir) 77 | invisible(value) 78 | } 79 | 80 | #' @export 81 | #' @rdname grapes-land-grapes 82 | `%land_%` <- function(value, name) { 83 | if(is.name(name)) { 84 | name <- as.character(name) 85 | } 86 | if((length(name)!=1)||(!is.character(name))|| 87 | (!isValidAndUnreservedName(name))) { 88 | stop("replyr::`%land_%` name argument must be a valid potential variable name") 89 | } 90 | envir <- parent.frame(1) 91 | assign(name, value, 92 | pos = envir, 93 | envir = envir) 94 | invisible(value) 95 | } 96 | 97 | 98 | -------------------------------------------------------------------------------- /R/nrow.R: -------------------------------------------------------------------------------- 1 | 2 | # Contributed by John Mount jmount@win-vector.com , ownership assigned to Win-Vector LLC. 
3 | # Win-Vector LLC currently distributes this code without intellectual property indemnification, warranty, claim of fitness of purpose, or any other guarantee under a GPL3 license. 4 | 5 | #' @importFrom dplyr ungroup summarize transmute 6 | NULL 7 | 8 | #' Check if a table has rows. 9 | #' 10 | #' @param d tbl or item that can be coerced into such. 11 | #' @return number of rows 12 | #' 13 | #' @examples 14 | #' 15 | #' d <- data.frame(x=c(1,2)) 16 | #' replyr_hasrows(d) 17 | #' 18 | #' @export 19 | replyr_hasrows <- function(d) { 20 | if(is.null(d)) { 21 | return(FALSE) 22 | } 23 | # get empty corner case correct (counting returned NA on PostgreSQL for this) 24 | # had problems with head(n=1) on sparklyr 25 | # https://github.com/WinVector/replyr/blob/master/issues/HeadIssue.md 26 | suppressWarnings( 27 | dSample <- d %.>% 28 | dplyr::ungroup(.) %.>% 29 | head(.) %.>% 30 | dplyr::collect(.) %.>% 31 | as.data.frame(.)) 32 | if(is.null(dSample)) { 33 | return(FALSE) 34 | } 35 | n <- nrow(dSample) 36 | if(is.null(n) || is.na(n) || is.nan(n) || (n<1)) { 37 | return(FALSE) 38 | } 39 | return(TRUE) 40 | } 41 | 42 | #' Compute number of rows of a tbl. 43 | #' 44 | #' Number of row in a table. This function is not "group aware" it returns the total number of rows, not rows per dplyr group. 45 | #' Also \code{replyr_nrow} depends on data being returned to count, so some corner cases (such as zero columns) will count as zero rows. 46 | #' In particular work around dplyr issue 2871 \url{https://github.com/tidyverse/dplyr/issues/2871}. 47 | #' 48 | #' @param x tbl or item that can be coerced into such. 49 | #' @return number of rows 50 | #' 51 | #' @examples 52 | #' 53 | #' d <- data.frame(x=c(1,2)) 54 | #' replyr_nrow(d) 55 | #' 56 | #' @export 57 | replyr_nrow <- function(x) { 58 | if(is.null(x)) { 59 | return(FALSE) 60 | } 61 | # try for easy case 62 | n <- nrow(x) 63 | if((!is.null(n)) && (!is.na(n)) && (!is.nan(n))) { 64 | # defend against dplyr issue 2871 https://github.com/tidyverse/dplyr/issues/2871 65 | return(n) 66 | } 67 | # get rid of raw columns 68 | # nrow() not supported in dbplyr/sparklyr world: http://www.win-vector.com/blog/2017/08/why-to-use-the-replyr-r-package/ 69 | # previous mutate impl was erroring out: https://github.com/tidyverse/dplyr/issues/3069 70 | # and using tally directly is bad: https://github.com/tidyverse/dplyr/issues/3070 71 | # and this issue is a problem: https://github.com/tidyverse/dplyr/issues/3071 72 | constant <- NULL # make obvious this is not an unbound reference 73 | ctab <- x %.>% 74 | dplyr::ungroup(.) %.>% 75 | dplyr::transmute(., constant = 1.0) %.>% # collumn we can count, not named n 76 | dplyr::summarize(., count = sum(constant, na.rm = TRUE)) %.>% 77 | dplyr::collect(.) %.>% 78 | as.data.frame(.) 79 | n <- as.numeric(ctab[1,1,drop=TRUE]) 80 | if(is.null(n) || is.na(n) || is.nan(n) || (n<1)) { 81 | return(0) 82 | } 83 | n 84 | } 85 | 86 | -------------------------------------------------------------------------------- /R/replyr.R: -------------------------------------------------------------------------------- 1 | #' replyr: Patches to Use dplyr on Remote Data Sources 2 | #' 3 | #' Methods to reliably use \code{dplyr} on remote data sources in \code{R} (\code{SQL} databases, 4 | #' \code{Spark} \code{2.0.0} and above) in a generic fashion. 5 | #' 6 | #' \code{replyr} is going into maintenance mode. It has been hard to track 7 | #' shifting \code{dplyr}/\code{dbplyr}/\code{rlang} APIs and data structures post \code{dplyr} \code{0.5}. 
8 | #' Most of what it does is now done better in one of the newer non-monolithic packages: 9 | #' 10 | #' \itemize{ 11 | #' \item Programming and meta-programming tools: \code{wrapr} \url{https://CRAN.R-project.org/package=wrapr}. 12 | #' \item Adapting \code{dplyr} to standard evaluation interfaces: \code{seplyr} \url{https://CRAN.R-project.org/package=seplyr}. 13 | #' \item Big data data manipulation: \code{rquery} \url{https://CRAN.R-project.org/package=rquery} and \code{cdata} \url{https://CRAN.R-project.org/package=cdata}. 14 | #' } 15 | #' 16 | #' 17 | #' \code{replyr} helps with the following: 18 | #' 19 | #' \itemize{ 20 | #' \item Summarizing remote data (via \code{replyr_summarize}). 21 | #' \item Facilitating writing "source generic" code that works similarly on multiple 'dplyr' data sources. 22 | #' \item Providing big data versions of functions for splitting data, binding rows, pivoting, adding row-ids, ranking, and completing experimental designs. 23 | #' \item Packaging common data manipulation tasks into operators such as the \code{\link{gapply}} function. 24 | #' \item Providing support code for common \code{SparklyR} tasks, such as tracking temporary handle IDs. 25 | #' } 26 | #' 27 | #' \code{replyr} is in maintenance mode. Better versions of the functionality have been ported to the following packages: 28 | #' \code{wrapr}, \code{cdata}, \code{rquery}, and \code{seplyr}. 29 | #' 30 | #' 31 | #' To learn more about replyr, please start with the vignette: 32 | #' \code{vignette('replyr','replyr')} 33 | #' 34 | #' @docType package 35 | #' @name replyr 36 | NULL 37 | 38 | 39 | # re-export so old code and demos work (from when functions were here) 40 | 41 | #' @importFrom wrapr let %.>% := mk_tmp_name_source 42 | NULL 43 | 44 | 45 | # so it does not look like an unbound reference in pipes 46 | . <- NULL 47 | 48 | -------------------------------------------------------------------------------- /R/underscoreReplacements.R: -------------------------------------------------------------------------------- 1 | 2 | # replacements for a few of the underbar/underscore forms from dplyr 0.5 and earlier 3 | 4 | 5 | #' Rename a column 6 | #' 7 | #' @param .data data object to work on 8 | #' @param ... 
force later arguments to bind by name 9 | #' @param newName character new column name 10 | #' @param oldName character old column name 11 | #' 12 | #' @examples 13 | #' 14 | #' d <- data.frame(Sepal_Length= c(5.8,5.7), 15 | #' Sepal_Width= c(4.0,4.4), 16 | #' Species= 'setosa', rank=c(1,2)) 17 | #' replyr_rename(d, newName = 'family', oldName = 'Species') 18 | #' 19 | #' @export 20 | #' 21 | replyr_rename <- function(.data, 22 | ..., 23 | newName, oldName) { 24 | if(length(list(...))>0) { 25 | stop("replyr::replyr_rename unexpected arguments") 26 | } 27 | newName <- as.character(newName) 28 | oldName <- as.character(oldName) 29 | if((length(newName)!=1)||(length(oldName)!=1)) { 30 | stop("replyr::replyr_rename newName and oldName must be length 1 character vectors") 31 | } 32 | if(newName!=oldName) { 33 | REPLYR_PRIVATE_NEWNAME <- NULL # declare not an unbound name 34 | REPLYR_PRIVATE_OLDNAME <- NULL # declare not an unbound name 35 | wrapr::let( 36 | c(REPLYR_PRIVATE_NEWNAME=newName, 37 | REPLYR_PRIVATE_OLDNAME=oldName), 38 | strict = FALSE, 39 | .data <- dplyr::rename(.data, 40 | REPLYR_PRIVATE_NEWNAME = REPLYR_PRIVATE_OLDNAME) 41 | ) 42 | } 43 | .data 44 | } 45 | 46 | 47 | 48 | #' arrange by a single column 49 | #' 50 | #' @param .data data object to work on 51 | #' @param colname character column name 52 | #' @param descending logical if true sort descending (else sort ascending) 53 | #' 54 | #' @examples 55 | #' 56 | #' d <- data.frame(Sepal_Length= c(5.8,5.7), 57 | #' Sepal_Width= c(4.0,4.4)) 58 | #' replyr_arrange(d, 'Sepal_Length', descending= TRUE) 59 | #' 60 | #' @export 61 | #' 62 | replyr_arrange <- function(.data, colname, descending = FALSE) { 63 | colname <- as.character(colname) # remove any names 64 | REPLYR_PRIVATE_NEWNAME <- NULL # declare not an unbound name 65 | desc <- dplyr::desc # declare not an unbound name 66 | if(descending) { 67 | wrapr::let( 68 | c(REPLYR_PRIVATE_NEWNAME=colname), 69 | .data <- dplyr::arrange(.data, 70 | desc(REPLYR_PRIVATE_NEWNAME)) 71 | ) 72 | } else { 73 | wrapr::let( 74 | c(REPLYR_PRIVATE_NEWNAME=colname), 75 | .data <- dplyr::arrange(.data, 76 | REPLYR_PRIVATE_NEWNAME) 77 | ) 78 | } 79 | .data 80 | } 81 | 82 | #' @importFrom rlang syms 83 | NULL 84 | 85 | #' group_by columns 86 | #' 87 | #' See also: \url{https://gist.github.com/skranz/9681509} 88 | #' 89 | #' @param .data data.frame 90 | #' @param colnames character vector of column names to group by. 
91 | #' @return .data grouped by columns named in colnames 92 | #' 93 | #' @examples 94 | #' 95 | #' d <- data.frame(Sepal_Length= c(5.8,5.7), 96 | #' Sepal_Width= c(4.0,4.4), 97 | #' Species= 'setosa') 98 | #' replyr_group_by(d, 'Species') 99 | #' 100 | #' @seealso \code{\link[dplyr]{group_by}}, \code{\link[dplyr]{group_by_at}} 101 | #' 102 | #' @export 103 | #' 104 | replyr_group_by <- function(.data, colnames) { 105 | if(!(is.data.frame(.data) || dplyr::is.tbl(.data))) { 106 | stop("replyr::replyr_group_by first argument must be a data.frame or tbl") 107 | } 108 | # convert char vector into spliceable vector 109 | groupingSyms <- rlang::syms(as.character(colnames)) 110 | dplyr::group_by(dplyr::ungroup(.data), !!!groupingSyms) 111 | } 112 | 113 | 114 | 115 | #' select columns 116 | #' 117 | #' @param .data data object to work on 118 | #' @param colnames character column names 119 | #' 120 | #' @examples 121 | #' 122 | #' d <- data.frame(Sepal_Length= c(5.8,5.7), 123 | #' Sepal_Width= c(4.0,4.4), 124 | #' Species= 'setosa', rank=c(1,2)) 125 | #' replyr_select(d, c('Sepal_Length', 'Species')) 126 | #' 127 | #' @export 128 | #' 129 | replyr_select <- function(.data, colnames) { 130 | dplyr::select(.data, dplyr::one_of(colnames)) 131 | } 132 | -------------------------------------------------------------------------------- /R/uniqueValues.R: -------------------------------------------------------------------------------- 1 | 2 | # Contributed by John Mount jmount@win-vector.com , ownership assigned to Win-Vector LLC. 3 | # Win-Vector LLC currently distributes this code without intellectual property indemnification, warranty, claim of fitness of purpose, or any other guarantee under a GPL3 license. 4 | 5 | #' @importFrom dplyr ungroup mutate summarize 6 | NULL 7 | 8 | 9 | 10 | #' Compute number of unique values for each level in a column. 11 | #' 12 | #' @param x tbl or item that can be coerced into such. 13 | #' @param cname name of columns to examine, must not be equal to 'replyr_private_value_n'. 14 | #' @return unique values for the column. 15 | #' 16 | #' @examples 17 | #' 18 | #' d <- data.frame(x=c(1,2,3,3)) 19 | #' replyr_uniqueValues(d,'x') 20 | #' 21 | #' @export 22 | replyr_uniqueValues <- function(x, cname) { 23 | if((!is.character(cname))||(length(cname)!=1)||(cname[[1]]=='replyr_private_value_n')) { 24 | stop('replyr_uniqueValues cname must be a single string not equal to "replyr_private_value_n"') 25 | } 26 | replyr_private_value_n <- NULL # false binding for 'replyr_private_value_n' so name does not look unbound to CRAN check 27 | REPLYRGROUPINGCOL <- NULL # declare not an unbound variable 28 | wrapr::let( 29 | c(REPLYRGROUPINGCOL=cname), 30 | x %.>% 31 | dplyr::ungroup(.) 
%.>% 31 | replyr_select(., cname) %.>% 32 | dplyr::mutate(., replyr_private_value_n=1.0) %.>% 33 | dplyr::group_by(., REPLYRGROUPINGCOL) %.>% 34 | dplyr::summarize(., replyr_private_value_n=sum(replyr_private_value_n)) -> res 35 | ) 36 | # Can't get rid of the warning on MySQL, even suppressWarnings() doesn't shut it up 37 | res 38 | } 39 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | 2 | url: https://winvector.github.io/replyr/ 3 | 4 | navbar: 5 | right: 6 | - text: "Sponsor: Win-Vector LLC" 7 | href: http://www.win-vector.com/ 8 | 9 | 10 | -------------------------------------------------------------------------------- /checks/.gitignore: -------------------------------------------------------------------------------- 1 | spark-warehouse 2 | -------------------------------------------------------------------------------- /checks/README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | 8 | 9 | 10 | ```{r, echo = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = " # " 14 | ) 15 | options(width =100) 16 | ``` 17 | 18 | 19 | Not every package is going to work every time, for every user, across every installation, with all data. The examples in this directory attempt to mitigate this for `replyr` by showing `replyr` in use with multiple data sources: 20 | 21 | * `data.frame` 22 | * `tbl` 23 | * `SQLite` 24 | * `MySQL` (not currently in our test suite) 25 | * `PostgreSQL` 26 | * `Spark` 2.0.0 27 | 28 | Because a lot of the above systems depend on configuration and systems out of the R ecosystem we expect a lot of variation and a lot to go wrong. So the examples here also have a touch of "here it is working at least once" to them. Or put another way: these examples are the guarantee that new users are never the first to test common cases. 29 | 30 | -------------------------------------------------------------------------------- /checks/README.md: -------------------------------------------------------------------------------- 1 | 2 | Not every package is going to work every time, for every user, across every installation, with all data. The examples in this directory attempt to mitigate this for `replyr` by showing `replyr` in use with multiple data sources: 3 | 4 | - `data.frame` 5 | - `tbl` 6 | - `SQLite` 7 | - `MySQL` (not currently in our test suite) 8 | - `PostgreSQL` 9 | - `Spark` 2.0.0 10 | 11 | Because a lot of the above systems depend on configuration and systems out of the R ecosystem we expect a lot of variation and a lot to go wrong. So the examples here also have a touch of "here it is working at least once" to them. Or put another way: these examples are the guarantee that new users are never the first to test common cases. 
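For a concrete taste of the pattern these checks exercise, here is a minimal sketch against an in-memory `SQLite` source, adapted from the package's own examples (it assumes `replyr` and `RSQLite` are installed; the full checks are the `.Rmd` files in this directory):

``` r
if (requireNamespace("RSQLite", quietly = TRUE)) {
  library("replyr")
  # throw-away in-memory SQLite database
  my_db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
  RSQLite::initExtension(my_db)
  # copy a small local frame to the remote source
  d <- replyr_copy_to(my_db, data.frame(x = c(1, 2)), 'd')
  # generic operations that behave the same on local and remote data
  print(replyr_dim(d))      # row and column counts
  print(replyr_summary(d))  # per-column summary
  DBI::dbDisconnect(my_db)
}
```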
12 | -------------------------------------------------------------------------------- /checks/SmallTimings.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Small Timings" 3 | author: "John Mount" 4 | date: "`r Sys.Date()`" 5 | output: 6 | md_document: 7 | variant: markdown_github 8 | --- 9 | 10 | ```{r, echo = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = " # " 14 | ) 15 | options(width =100) 16 | ``` 17 | 18 | ```{r pressure} 19 | # small timings 20 | library("data.table") 21 | library("dplyr") 22 | library("microbenchmark") 23 | library("replyr") 24 | library("ggplot2") 25 | 26 | set.seed(32535) 27 | 28 | data("iris",package = "datasets") 29 | iris.dt <- data.table(iris) 30 | 31 | iris.dt[, mean(Sepal.Length), by=Species] 32 | 33 | let(list(GROUPCOL='Species', DATACOL='Sepal.Length'), 34 | iris.dt[, mean(DATACOL), by=GROUPCOL]) 35 | 36 | iris %>% group_by(Species) %>% summarize(mean(Sepal.Length)) 37 | 38 | let(list(GROUPCOL='Species', DATACOL='Sepal.Length'), 39 | iris %>% group_by(GROUPCOL) %>% summarize(mean(DATACOL))) 40 | 41 | # gapply is very bad at this task both notationally, and in terms of speed 42 | gapply(iris, 'Species', 43 | function(di) { 44 | mean(di[['Sepal.Length']]) 45 | }, 46 | partitionMethod = 'split', 47 | bindrows=FALSE) 48 | 49 | ks <- sort(unique(floor(10^(0.25*(0:16))))) 50 | 51 | meass <- lapply(ks, 52 | function(k) { 53 | irisk <- iris[rep(seq_len(nrow(iris)),k),,drop=FALSE] 54 | irisk.dt <- data.table(irisk) 55 | meas <- microbenchmark(data.table= irisk.dt[, mean(Sepal.Length), by=Species], 56 | dplyr= irisk %>% group_by(Species) %>% summarize(mean(Sepal.Length)), 57 | let.data.table= let(list(GROUPCOL='Species', DATACOL='Sepal.Length'), 58 | irisk.dt[, mean(DATACOL), by=GROUPCOL]), 59 | let.dplyr= let(list(GROUPCOL='Species', DATACOL='Sepal.Length'), 60 | irisk %>% group_by(GROUPCOL) %>% summarize(mean(DATACOL)))) 61 | meas %>% as.data.frame() %>% mutate(k=k) 62 | }) 63 | 64 | 65 | res <- lapply(meass, 66 | function(meas) { 67 | meas %>% 68 | as.data.frame() %>% 69 | group_by(expr) %>% 70 | summarize(mean= mean(time), 71 | median= median(time), 72 | k= mean(k), # pseudo-aggregator 73 | q1= as.numeric(quantile(time,0.25)), 74 | q3= as.numeric(quantile(time,0.75))) 75 | }) 76 | 77 | 78 | res <- dplyr::bind_rows(res) 79 | res$expr <- reorder(res$expr, -res$median) 80 | 81 | ggplot(data=res, mapping=aes(x=k,y=median,color=expr)) + 82 | geom_ribbon(mapping=aes(ymin=q1,ymax=q3,fill=expr),alpha=0.2,color=NA) + 83 | geom_point() + geom_line() + 84 | scale_x_log10() + scale_y_log10() + 85 | ylab('time') + 86 | ggtitle("median run times (nanosecond)", 87 | subtitle = "as a function of method and replications") 88 | ``` 89 | -------------------------------------------------------------------------------- /checks/SmallTimings.md: -------------------------------------------------------------------------------- 1 | ``` r 2 | # small timings 3 | library("data.table") 4 | library("dplyr") 5 | # -------------------------------------------------------------------------------------------------- 6 | # data.table + dplyr code now lives in dtplyr. 7 | # Please library(dtplyr)! 
8 | # -------------------------------------------------------------------------------------------------- 9 | # 10 | # Attaching package: 'dplyr' 11 | # The following objects are masked from 'package:data.table': 12 | # 13 | # between, first, last 14 | # The following objects are masked from 'package:stats': 15 | # 16 | # filter, lag 17 | # The following objects are masked from 'package:base': 18 | # 19 | # intersect, setdiff, setequal, union 20 | library("microbenchmark") 21 | library("replyr") 22 | library("ggplot2") 23 | 24 | set.seed(32535) 25 | 26 | data("iris",package = "datasets") 27 | iris.dt <- data.table(iris) 28 | 29 | iris.dt[, mean(Sepal.Length), by=Species] 30 | # Species V1 31 | # 1: setosa 5.006 32 | # 2: versicolor 5.936 33 | # 3: virginica 6.588 34 | 35 | let(list(GROUPCOL='Species', DATACOL='Sepal.Length'), 36 | iris.dt[, mean(DATACOL), by=GROUPCOL]) 37 | # Species V1 38 | # 1: setosa 5.006 39 | # 2: versicolor 5.936 40 | # 3: virginica 6.588 41 | 42 | iris %>% group_by(Species) %>% summarize(mean(Sepal.Length)) 43 | # # A tibble: 3 × 2 44 | # Species `mean(Sepal.Length)` 45 | # 46 | # 1 setosa 5.006 47 | # 2 versicolor 5.936 48 | # 3 virginica 6.588 49 | 50 | let(list(GROUPCOL='Species', DATACOL='Sepal.Length'), 51 | iris %>% group_by(GROUPCOL) %>% summarize(mean(DATACOL))) 52 | # # A tibble: 3 × 2 53 | # Species `mean(Sepal.Length)` 54 | # 55 | # 1 setosa 5.006 56 | # 2 versicolor 5.936 57 | # 3 virginica 6.588 58 | 59 | # gapply is very bad at this task both notationally, and in terms of speed 60 | gapply(iris, 'Species', 61 | function(di) { 62 | mean(di[['Sepal.Length']]) 63 | }, 64 | partitionMethod = 'split', 65 | bindrows=FALSE) 66 | # $setosa 67 | # [1] 5.006 68 | # 69 | # $versicolor 70 | # [1] 5.936 71 | # 72 | # $virginica 73 | # [1] 6.588 74 | 75 | ks <- sort(unique(floor(10^(0.25*(0:16))))) 76 | 77 | meass <- lapply(ks, 78 | function(k) { 79 | irisk <- iris[rep(seq_len(nrow(iris)),k),,drop=FALSE] 80 | irisk.dt <- data.table(irisk) 81 | meas <- microbenchmark(data.table= irisk.dt[, mean(Sepal.Length), by=Species], 82 | dplyr= irisk %>% group_by(Species) %>% summarize(mean(Sepal.Length)), 83 | let.data.table= let(list(GROUPCOL='Species', DATACOL='Sepal.Length'), 84 | irisk.dt[, mean(DATACOL), by=GROUPCOL]), 85 | let.dplyr= let(list(GROUPCOL='Species', DATACOL='Sepal.Length'), 86 | irisk %>% group_by(GROUPCOL) %>% summarize(mean(DATACOL)))) 87 | meas %>% as.data.frame() %>% mutate(k=k) 88 | }) 89 | 90 | 91 | res <- lapply(meass, 92 | function(meas) { 93 | meas %>% 94 | as.data.frame() %>% 95 | group_by(expr) %>% 96 | summarize(mean= mean(time), 97 | median= median(time), 98 | k= mean(k), # pseudo-aggregator 99 | q1= as.numeric(quantile(time,0.25)), 100 | q3= as.numeric(quantile(time,0.75))) 101 | }) 102 | 103 | 104 | res <- dplyr::bind_rows(res) 105 | res$expr <- reorder(res$expr, -res$median) 106 | 107 | ggplot(data=res, mapping=aes(x=k,y=median,color=expr)) + 108 | geom_ribbon(mapping=aes(ymin=q1,ymax=q3,fill=expr),alpha=0.2,color=NA) + 109 | geom_point() + geom_line() + 110 | scale_x_log10() + scale_y_log10() + 111 | ylab('time') + 112 | ggtitle("median run times (nanosecond)", 113 | subtitle = "as a function of method and replications") 114 | ``` 115 | 116 | ![](SmallTimings_files/figure-markdown_github/pressure-1.png) 117 | -------------------------------------------------------------------------------- /checks/SmallTimings_files/figure-markdown_github/pressure-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/WinVector/replyr/681693f875bcf490d6651c20dadad1db0fa75d9e/checks/SmallTimings_files/figure-markdown_github/pressure-1.png -------------------------------------------------------------------------------- /checks/SplitApplyCombineSpark.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "split/apply/combine on Spark" 3 | output: github_document 4 | --- 5 | 6 | ```{r} 7 | library('dplyr') 8 | library('sparklyr') 9 | library('replyr') 10 | 11 | sc <- sparklyr::spark_connect(version='2.2.0', 12 | master = "local") 13 | 14 | d <- copy_to(sc, 15 | data.frame(x=1:7, group=floor((1:7)/3)), 16 | name= 'd') 17 | 18 | print(d) 19 | 20 | pieces <- replyr_split(d, 'group', partitionMethod = 'extract') 21 | print(pieces) 22 | 23 | f <- function(pi) { 24 | ni <- replyr_nrow(pi) 25 | mutate(pi, n=ni) 26 | } 27 | 28 | pieces <- lapply(pieces, f) 29 | print(pieces) 30 | 31 | recovered <- replyr_bind_rows(pieces) %>% 32 | arrange(x) 33 | print(recovered) 34 | 35 | r2 <- d %>% 36 | gapply('group', f, partitionMethod = 'extract') %>% 37 | arrange(x) 38 | print(r2) 39 | 40 | spark_disconnect(sc) 41 | rm(list=ls()); gc() # disconnect 42 | ``` 43 | 44 | -------------------------------------------------------------------------------- /cleanFluff.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # get rid of side effect files from practice runs 4 | 5 | find . \( -name \*~ -or -name \*\.log -or -name \*.log\.\* -or -name \*.sqlite3\* \) -exec rm {} \; 6 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | Fix doc size. 4 | 5 | ## Test environments 6 | 7 | ### OSX build/check 8 | 9 | R CMD check --as-cran replyr_1.0.5.tar.gz 10 | * using R version 3.6.0 (2019-04-26) 11 | * using platform: x86_64-apple-darwin15.6.0 (64-bit) 12 | * using session charset: UTF-8 13 | * using option ‘--as-cran’ 14 | * checking for file ‘replyr/DESCRIPTION’ ... OK 15 | * checking extension type ... Package 16 | * this is package ‘replyr’ version ‘1.0.5’ 17 | * checking CRAN incoming feasibility ... Note_to_CRAN_maintainers 18 | Maintainer: ‘John Mount ’ 19 | Status: OK 20 | 21 | 22 | ### Windows 23 | 24 | rhub::check_for_cran() 25 | 867#> * using R Under development (unstable) (2019-10-19 r77318) 26 | 868#> * using platform: x86_64-w64-mingw32 (64-bit) 27 | 869#> * using session charset: ISO8859-1 28 | 870#> * using option '--as-cran' 29 | 871#> * checking for file 'replyr/DESCRIPTION' ... OK 30 | 872#> * checking extension type ... Package 31 | 873#> * this is package 'replyr' version '1.0.5' 32 | 874#> * checking CRAN incoming feasibility ... Note_to_CRAN_maintainers 33 | 875#> Maintainer: 'John Mount ' 34 | 911#> * checking Rd cross-references ... NOTE 35 | 912#> Package unavailable to check Rd xrefs: 'rquery' 36 | 932#> Status: 1 NOTE 37 | rquery is a documented alternative to replyr, but not a dependency. 38 | 39 | ## Downstream dependencies 40 | 41 | No declared dependencies. 
42 | 43 | devtools::revdep() 44 | character(0) 45 | 46 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /docs/articles/DependencySorting_d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WinVector/replyr/681693f875bcf490d6651c20dadad1db0fa75d9e/docs/articles/DependencySorting_d.png -------------------------------------------------------------------------------- /docs/articles/DependencySorting_files/accessible-code-block-0.0.1/empty-anchor.js: -------------------------------------------------------------------------------- 1 | // Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> 2 | // v0.0.1 3 | // Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. 4 | 5 | document.addEventListener('DOMContentLoaded', function() { 6 | const codeList = document.getElementsByClassName("sourceCode"); 7 | for (var i = 0; i < codeList.length; i++) { 8 | var linkList = codeList[i].getElementsByTagName('a'); 9 | for (var j = 0; j < linkList.length; j++) { 10 | if (linkList[j].innerHTML === "") { 11 | linkList[j].setAttribute('aria-hidden', 'true'); 12 | } 13 | } 14 | } 15 | }); 16 | -------------------------------------------------------------------------------- /docs/articles/DependencySorting_ig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WinVector/replyr/681693f875bcf490d6651c20dadad1db0fa75d9e/docs/articles/DependencySorting_ig.png -------------------------------------------------------------------------------- /docs/articles/ParametricExample_files/accessible-code-block-0.0.1/empty-anchor.js: -------------------------------------------------------------------------------- 1 | // Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> 2 | // v0.0.1 3 | // Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. 4 | 5 | document.addEventListener('DOMContentLoaded', function() { 6 | const codeList = document.getElementsByClassName("sourceCode"); 7 | for (var i = 0; i < codeList.length; i++) { 8 | var linkList = codeList[i].getElementsByTagName('a'); 9 | for (var j = 0; j < linkList.length; j++) { 10 | if (linkList[j].innerHTML === "") { 11 | linkList[j].setAttribute('aria-hidden', 'true'); 12 | } 13 | } 14 | } 15 | }); 16 | -------------------------------------------------------------------------------- /docs/articles/coalesce_files/accessible-code-block-0.0.1/empty-anchor.js: -------------------------------------------------------------------------------- 1 | // Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> 2 | // v0.0.1 3 | // Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. 
4 | 5 | document.addEventListener('DOMContentLoaded', function() { 6 | const codeList = document.getElementsByClassName("sourceCode"); 7 | for (var i = 0; i < codeList.length; i++) { 8 | var linkList = codeList[i].getElementsByTagName('a'); 9 | for (var j = 0; j < linkList.length; j++) { 10 | if (linkList[j].innerHTML === "") { 11 | linkList[j].setAttribute('aria-hidden', 'true'); 12 | } 13 | } 14 | } 15 | }); 16 | -------------------------------------------------------------------------------- /docs/articles/joinController1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WinVector/replyr/681693f875bcf490d6651c20dadad1db0fa75d9e/docs/articles/joinController1.png -------------------------------------------------------------------------------- /docs/articles/joinController_files/accessible-code-block-0.0.1/empty-anchor.js: -------------------------------------------------------------------------------- 1 | // Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> 2 | // v0.0.1 3 | // Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. 4 | 5 | document.addEventListener('DOMContentLoaded', function() { 6 | const codeList = document.getElementsByClassName("sourceCode"); 7 | for (var i = 0; i < codeList.length; i++) { 8 | var linkList = codeList[i].getElementsByTagName('a'); 9 | for (var j = 0; j < linkList.length; j++) { 10 | if (linkList[j].innerHTML === "") { 11 | linkList[j].setAttribute('aria-hidden', 'true'); 12 | } 13 | } 14 | } 15 | }); 16 | -------------------------------------------------------------------------------- /docs/articles/letExample_files/accessible-code-block-0.0.1/empty-anchor.js: -------------------------------------------------------------------------------- 1 | // Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> 2 | // v0.0.1 3 | // Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. 
4 | 5 | document.addEventListener('DOMContentLoaded', function() { 6 | const codeList = document.getElementsByClassName("sourceCode"); 7 | for (var i = 0; i < codeList.length; i++) { 8 | var linkList = codeList[i].getElementsByTagName('a'); 9 | for (var j = 0; j < linkList.length; j++) { 10 | if (linkList[j].innerHTML === "") { 11 | linkList[j].setAttribute('aria-hidden', 'true'); 12 | } 13 | } 14 | } 15 | }); 16 | -------------------------------------------------------------------------------- /docs/articles/letExample_files/figure-html/ggplot1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WinVector/replyr/681693f875bcf490d6651c20dadad1db0fa75d9e/docs/articles/letExample_files/figure-html/ggplot1-1.png -------------------------------------------------------------------------------- /docs/articles/letExample_files/figure-html/ggplot2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WinVector/replyr/681693f875bcf490d6651c20dadad1db0fa75d9e/docs/articles/letExample_files/figure-html/ggplot2-1.png -------------------------------------------------------------------------------- /docs/articles/replyr_files/accessible-code-block-0.0.1/empty-anchor.js: -------------------------------------------------------------------------------- 1 | // Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> 2 | // v0.0.1 3 | // Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. 4 | 5 | document.addEventListener('DOMContentLoaded', function() { 6 | const codeList = document.getElementsByClassName("sourceCode"); 7 | for (var i = 0; i < codeList.length; i++) { 8 | var linkList = codeList[i].getElementsByTagName('a'); 9 | for (var j = 0; j < linkList.length; j++) { 10 | if (linkList[j].innerHTML === "") { 11 | linkList[j].setAttribute('aria-hidden', 'true'); 12 | } 13 | } 14 | } 15 | }); 16 | -------------------------------------------------------------------------------- /docs/articles/replyrs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WinVector/replyr/681693f875bcf490d6651c20dadad1db0fa75d9e/docs/articles/replyrs.png -------------------------------------------------------------------------------- /docs/articles/summary_files/accessible-code-block-0.0.1/empty-anchor.js: -------------------------------------------------------------------------------- 1 | // Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> 2 | // v0.0.1 3 | // Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. 4 | 5 | document.addEventListener('DOMContentLoaded', function() { 6 | const codeList = document.getElementsByClassName("sourceCode"); 7 | for (var i = 0; i < codeList.length; i++) { 8 | var linkList = codeList[i].getElementsByTagName('a'); 9 | for (var j = 0; j < linkList.length; j++) { 10 | if (linkList[j].innerHTML === "") { 11 | linkList[j].setAttribute('aria-hidden', 'true'); 12 | } 13 | } 14 | } 15 | }); 16 | -------------------------------------------------------------------------------- /docs/bootstrap-toc.css: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) 3 | * Copyright 2015 Aidan Feldman 4 | * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ 5 | 6 | /* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */ 7 | 8 | /* All levels of nav */ 9 | nav[data-toggle='toc'] .nav > li > a { 10 | display: block; 11 | padding: 4px 20px; 12 | font-size: 13px; 13 | font-weight: 500; 14 | color: #767676; 15 | } 16 | nav[data-toggle='toc'] .nav > li > a:hover, 17 | nav[data-toggle='toc'] .nav > li > a:focus { 18 | padding-left: 19px; 19 | color: #563d7c; 20 | text-decoration: none; 21 | background-color: transparent; 22 | border-left: 1px solid #563d7c; 23 | } 24 | nav[data-toggle='toc'] .nav > .active > a, 25 | nav[data-toggle='toc'] .nav > .active:hover > a, 26 | nav[data-toggle='toc'] .nav > .active:focus > a { 27 | padding-left: 18px; 28 | font-weight: bold; 29 | color: #563d7c; 30 | background-color: transparent; 31 | border-left: 2px solid #563d7c; 32 | } 33 | 34 | /* Nav: second level (shown on .active) */ 35 | nav[data-toggle='toc'] .nav .nav { 36 | display: none; /* Hide by default, but at >768px, show it */ 37 | padding-bottom: 10px; 38 | } 39 | nav[data-toggle='toc'] .nav .nav > li > a { 40 | padding-top: 1px; 41 | padding-bottom: 1px; 42 | padding-left: 30px; 43 | font-size: 12px; 44 | font-weight: normal; 45 | } 46 | nav[data-toggle='toc'] .nav .nav > li > a:hover, 47 | nav[data-toggle='toc'] .nav .nav > li > a:focus { 48 | padding-left: 29px; 49 | } 50 | nav[data-toggle='toc'] .nav .nav > .active > a, 51 | nav[data-toggle='toc'] .nav .nav > .active:hover > a, 52 | nav[data-toggle='toc'] .nav .nav > .active:focus > a { 53 | padding-left: 28px; 54 | font-weight: 500; 55 | } 56 | 57 | /* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */ 58 | nav[data-toggle='toc'] .nav > .active > ul { 59 | display: block; 60 | } 61 | -------------------------------------------------------------------------------- /docs/docsearch.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | 3 | // register a handler to move the focus to the search bar 4 | // upon pressing shift + "/" (i.e. 
"?") 5 | $(document).on('keydown', function(e) { 6 | if (e.shiftKey && e.keyCode == 191) { 7 | e.preventDefault(); 8 | $("#search-input").focus(); 9 | } 10 | }); 11 | 12 | $(document).ready(function() { 13 | // do keyword highlighting 14 | /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ 15 | var mark = function() { 16 | 17 | var referrer = document.URL ; 18 | var paramKey = "q" ; 19 | 20 | if (referrer.indexOf("?") !== -1) { 21 | var qs = referrer.substr(referrer.indexOf('?') + 1); 22 | var qs_noanchor = qs.split('#')[0]; 23 | var qsa = qs_noanchor.split('&'); 24 | var keyword = ""; 25 | 26 | for (var i = 0; i < qsa.length; i++) { 27 | var currentParam = qsa[i].split('='); 28 | 29 | if (currentParam.length !== 2) { 30 | continue; 31 | } 32 | 33 | if (currentParam[0] == paramKey) { 34 | keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); 35 | } 36 | } 37 | 38 | if (keyword !== "") { 39 | $(".contents").unmark({ 40 | done: function() { 41 | $(".contents").mark(keyword); 42 | } 43 | }); 44 | } 45 | } 46 | }; 47 | 48 | mark(); 49 | }); 50 | }); 51 | 52 | /* Search term highlighting ------------------------------*/ 53 | 54 | function matchedWords(hit) { 55 | var words = []; 56 | 57 | var hierarchy = hit._highlightResult.hierarchy; 58 | // loop to fetch from lvl0, lvl1, etc. 59 | for (var idx in hierarchy) { 60 | words = words.concat(hierarchy[idx].matchedWords); 61 | } 62 | 63 | var content = hit._highlightResult.content; 64 | if (content) { 65 | words = words.concat(content.matchedWords); 66 | } 67 | 68 | // return unique words 69 | var words_uniq = [...new Set(words)]; 70 | return words_uniq; 71 | } 72 | 73 | function updateHitURL(hit) { 74 | 75 | var words = matchedWords(hit); 76 | var url = ""; 77 | 78 | if (hit.anchor) { 79 | url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; 80 | } else { 81 | url = hit.url + '?q=' + escape(words.join(" ")); 82 | } 83 | 84 | return url; 85 | } 86 | -------------------------------------------------------------------------------- /docs/jquery.sticky-kit.min.js: -------------------------------------------------------------------------------- 1 | /* Sticky-kit v1.1.2 | WTFPL | Leaf Corcoran 2015 | */ 2 | /* 3 | Source: https://github.com/leafo/sticky-kit 4 | License: MIT 5 | */ 6 | (function(){var b,f;b=this.jQuery||window.jQuery;f=b(window);b.fn.stick_in_parent=function(d){var A,w,J,n,B,K,p,q,k,E,t;null==d&&(d={});t=d.sticky_class;B=d.inner_scrolling;E=d.recalc_every;k=d.parent;q=d.offset_top;p=d.spacer;w=d.bottoming;null==q&&(q=0);null==k&&(k=void 0);null==B&&(B=!0);null==t&&(t="is_stuck");A=b(document);null==w&&(w=!0);J=function(a,d,n,C,F,u,r,G){var v,H,m,D,I,c,g,x,y,z,h,l;if(!a.data("sticky_kit")){a.data("sticky_kit",!0);I=A.height();g=a.parent();null!=k&&(g=g.closest(k)); 7 | if(!g.length)throw"failed to find stick parent";v=m=!1;(h=null!=p?p&&a.closest(p):b("
"))&&h.css("position",a.css("position"));x=function(){var c,f,e;if(!G&&(I=A.height(),c=parseInt(g.css("border-top-width"),10),f=parseInt(g.css("padding-top"),10),d=parseInt(g.css("padding-bottom"),10),n=g.offset().top+c+f,C=g.height(),m&&(v=m=!1,null==p&&(a.insertAfter(h),h.detach()),a.css({position:"",top:"",width:"",bottom:""}).removeClass(t),e=!0),F=a.offset().top-(parseInt(a.css("margin-top"),10)||0)-q, 8 | u=a.outerHeight(!0),r=a.css("float"),h&&h.css({width:a.outerWidth(!0),height:u,display:a.css("display"),"vertical-align":a.css("vertical-align"),"float":r}),e))return l()};x();if(u!==C)return D=void 0,c=q,z=E,l=function(){var b,l,e,k;if(!G&&(e=!1,null!=z&&(--z,0>=z&&(z=E,x(),e=!0)),e||A.height()===I||x(),e=f.scrollTop(),null!=D&&(l=e-D),D=e,m?(w&&(k=e+u+c>C+n,v&&!k&&(v=!1,a.css({position:"fixed",bottom:"",top:c}).trigger("sticky_kit:unbottom"))),eb&&!v&&(c-=l,c=Math.max(b-u,c),c=Math.min(q,c),m&&a.css({top:c+"px"})))):e>F&&(m=!0,b={position:"fixed",top:c},b.width="border-box"===a.css("box-sizing")?a.outerWidth()+"px":a.width()+"px",a.css(b).addClass(t),null==p&&(a.after(h),"left"!==r&&"right"!==r||h.append(a)),a.trigger("sticky_kit:stick")),m&&w&&(null==k&&(k=e+u+c>C+n),!v&&k)))return v=!0,"static"===g.css("position")&&g.css({position:"relative"}), 10 | a.css({position:"absolute",bottom:d,top:"auto"}).trigger("sticky_kit:bottom")},y=function(){x();return l()},H=function(){G=!0;f.off("touchmove",l);f.off("scroll",l);f.off("resize",y);b(document.body).off("sticky_kit:recalc",y);a.off("sticky_kit:detach",H);a.removeData("sticky_kit");a.css({position:"",bottom:"",top:"",width:""});g.position("position","");if(m)return null==p&&("left"!==r&&"right"!==r||a.insertAfter(h),h.remove()),a.removeClass(t)},f.on("touchmove",l),f.on("scroll",l),f.on("resize", 11 | y),b(document.body).on("sticky_kit:recalc",y),a.on("sticky_kit:detach",H),setTimeout(l,0)}};n=0;for(K=this.length;n 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /docs/pkgdown.js: -------------------------------------------------------------------------------- 1 | /* http://gregfranko.com/blog/jquery-best-practices/ */ 2 | (function($) { 3 | $(function() { 4 | 5 | $('.navbar-fixed-top').headroom(); 6 | 7 | $('body').css('padding-top', $('.navbar').height() + 10); 8 | $(window).resize(function(){ 9 | $('body').css('padding-top', $('.navbar').height() + 10); 10 | }); 11 | 12 | $('[data-toggle="tooltip"]').tooltip(); 13 | 14 | var cur_path = paths(location.pathname); 15 | var links = $("#navbar ul li a"); 16 | var max_length = -1; 17 | var pos = -1; 18 | for (var i = 0; i < links.length; i++) { 19 | if (links[i].getAttribute("href") === "#") 20 | continue; 21 | // Ignore external links 22 | if (links[i].host !== location.host) 23 | continue; 24 | 25 | var nav_path = paths(links[i].pathname); 26 | 27 | var length = prefix_length(nav_path, cur_path); 28 | if (length > max_length) { 29 | max_length = length; 30 | pos = i; 31 | } 32 | } 33 | 34 | // Add class to parent
<li>, and enclosing <li>
  • if in dropdown 35 | if (pos >= 0) { 36 | var menu_anchor = $(links[pos]); 37 | menu_anchor.parent().addClass("active"); 38 | menu_anchor.closest("li.dropdown").addClass("active"); 39 | } 40 | }); 41 | 42 | function paths(pathname) { 43 | var pieces = pathname.split("/"); 44 | pieces.shift(); // always starts with / 45 | 46 | var end = pieces[pieces.length - 1]; 47 | if (end === "index.html" || end === "") 48 | pieces.pop(); 49 | return(pieces); 50 | } 51 | 52 | // Returns -1 if not found 53 | function prefix_length(needle, haystack) { 54 | if (needle.length > haystack.length) 55 | return(-1); 56 | 57 | // Special case for length-0 haystack, since for loop won't run 58 | if (haystack.length === 0) { 59 | return(needle.length === 0 ? 0 : -1); 60 | } 61 | 62 | for (var i = 0; i < haystack.length; i++) { 63 | if (needle[i] != haystack[i]) 64 | return(i); 65 | } 66 | 67 | return(haystack.length); 68 | } 69 | 70 | /* Clipboard --------------------------*/ 71 | 72 | function changeTooltipMessage(element, msg) { 73 | var tooltipOriginalTitle=element.getAttribute('data-original-title'); 74 | element.setAttribute('data-original-title', msg); 75 | $(element).tooltip('show'); 76 | element.setAttribute('data-original-title', tooltipOriginalTitle); 77 | } 78 | 79 | if(ClipboardJS.isSupported()) { 80 | $(document).ready(function() { 81 | var copyButton = ""; 82 | 83 | $(".examples, div.sourceCode").addClass("hasCopyButton"); 84 | 85 | // Insert copy buttons: 86 | $(copyButton).prependTo(".hasCopyButton"); 87 | 88 | // Initialize tooltips: 89 | $('.btn-copy-ex').tooltip({container: 'body'}); 90 | 91 | // Initialize clipboard: 92 | var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { 93 | text: function(trigger) { 94 | return trigger.parentNode.textContent; 95 | } 96 | }); 97 | 98 | clipboardBtnCopies.on('success', function(e) { 99 | changeTooltipMessage(e.trigger, 'Copied!'); 100 | e.clearSelection(); 101 | }); 102 | 103 | clipboardBtnCopies.on('error', function() { 104 | changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); 105 | }); 106 | }); 107 | } 108 | })(window.jQuery || window.$) 109 | -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | pandoc: 2.7.3 2 | pkgdown: 1.5.1.9000 3 | pkgdown_sha: c6b35532515a7ae433eff4b547d77eeae4278664 4 | articles: [] 5 | last_built: 2020-09-06T21:09Z 6 | urls: 7 | reference: https://winvector.github.io/replyr//reference 8 | article: https://winvector.github.io/replyr//articles 9 | 10 | -------------------------------------------------------------------------------- /docs/reference/Rplot001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WinVector/replyr/681693f875bcf490d6651c20dadad1db0fa75d9e/docs/reference/Rplot001.png -------------------------------------------------------------------------------- /extras/KnitrParameters.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Knitr Parameters" 3 | author: "John Mount" 4 | date: "`r Sys.Date()`" 5 | output: 6 | md_document: 7 | variant: markdown_github 8 | params: 9 | FN: sin 10 | VAR: east 11 | TITLE: "User chosen function plot" 12 | --- 13 | 14 | 15 | [Jason Becker](http://jsonbecker.com) recently noticed that [`wrapr::let`](https://github.com/WinVector/wrapr) works well in binding [`R`](https://cran.r-project.org) 
[`knitr`](https://CRAN.R-project.org/package=knitr) worksheet parameters ([link](https://twitter.com/jsonbecker/status/815953228642811905)) (and the same likely holds for [`shiny`](http://shiny.rstudio.com)). This isn't something you want to expose to end-users, but is very powerful in managing your own reproducible research. 16 | 17 | The idea is: `knitr` already takes a `params` block which is an arbitrary `yaml` object and `wrapr::let` is willing to treat string to string associations as name rebindings. If we just add the convention that all uppercase names are to be read as re-bindings (and not values) we can accept user specified function names and variable name directly from the RMarkdown controls. 18 | 19 | For examine in the RMarkdown document that produced this note we included in the header the 20 | following params: 21 | 22 | ```{yaml} 23 | --- 24 | params: 25 | FN: sin 26 | VAR: east 27 | TITLE: "User chosen function plot" 28 | --- 29 | ``` 30 | 31 | ## Bindings 32 | 33 | The user assignments are available as variable name and function substitutions. We are using the convention that name bindings are specified in all caps (an option of `replyr::restrictToNameAssignments` available in version `0.2.01` and newer). 34 | 35 | ```{r bindings} 36 | library("wrapr") 37 | print(params) 38 | 39 | east = 7 40 | let( 41 | alias=restrictToNameAssignments(params), 42 | expr={ 43 | print(paste(quote(VAR),VAR)) 44 | plot(FN(0.1*(1:20))) 45 | title(params$TITLE) 46 | }) 47 | ``` 48 | 49 | ## Values 50 | 51 | Or, assuming everything you do is only standard evaluation and you don't care about capturing variable names: you can capture references to values once and have all subsequent blocks use those values. 52 | 53 | 54 | ```{r values} 55 | FN <- let(restrictToNameAssignments(params), FN) 56 | 57 | plot(FN(0.1*(1:20))) 58 | title(params$title) 59 | ``` 60 | 61 | Notice the plot uses the user specified function, but does not know its original name (so can no longer print it in the y-axis). 62 | 63 | ## Conclusion 64 | 65 | `wrapr::let` takes `knitr` parameters one step further. 66 | The source for this note can be found [here](https://github.com/WinVector/replyr/blob/master/extras/KnitrParameters.Rmd) 67 | and the rendered output [here](https://github.com/WinVector/replyr/blob/master/extras/KnitrParameters.md). 68 | 69 | -------------------------------------------------------------------------------- /extras/KnitrParameters.md: -------------------------------------------------------------------------------- 1 | [Jason Becker](http://jsonbecker.com) recently noticed that [`wrapr::let`](https://github.com/WinVector/wrapr) works well in binding [`R`](https://cran.r-project.org) [`knitr`](https://CRAN.R-project.org/package=knitr) worksheet parameters ([link](https://twitter.com/jsonbecker/status/815953228642811905)) (and the same likely holds for [`shiny`](http://shiny.rstudio.com)). This isn't something you want to expose to end-users, but is very powerful in managing your own reproducible research. 2 | 3 | The idea is: `knitr` already takes a `params` block which is an arbitrary `yaml` object and `wrapr::let` is willing to treat string to string associations as name rebindings. If we just add the convention that all uppercase names are to be read as re-bindings (and not values) we can accept user specified function names and variable name directly from the RMarkdown controls. 
4 | 5 | For examine in the RMarkdown document that produced this note we included in the header the following params: 6 | 7 | ``` yaml 8 | --- 9 | params: 10 | FN: sin 11 | VAR: east 12 | TITLE: "User chosen function plot" 13 | --- 14 | ``` 15 | 16 | Bindings 17 | -------- 18 | 19 | The user assignments are available as variable name and function substitutions. We are using the convention that name bindings are specified in all caps (an option of `replyr::restrictToNameAssignments` available in version `0.2.01` and newer). 20 | 21 | ``` r 22 | library("wrapr") 23 | print(params) 24 | ``` 25 | 26 | ## $FN 27 | ## [1] "sin" 28 | ## 29 | ## $VAR 30 | ## [1] "east" 31 | ## 32 | ## $TITLE 33 | ## [1] "User chosen function plot" 34 | 35 | ``` r 36 | east = 7 37 | let( 38 | alias=restrictToNameAssignments(params), 39 | expr={ 40 | print(paste(quote(VAR),VAR)) 41 | plot(FN(0.1*(1:20))) 42 | title(params$TITLE) 43 | }) 44 | ``` 45 | 46 | ## [1] "east 7" 47 | 48 | ![](KnitrParameters_files/figure-markdown_github/bindings-1.png) 49 | 50 | Values 51 | ------ 52 | 53 | Or, assuming everything you do is only standard evaluation and you don't care about capturing variable names: you can capture references to values once and have all subsequent blocks use those values. 54 | 55 | ``` r 56 | FN <- let(restrictToNameAssignments(params), FN) 57 | 58 | plot(FN(0.1*(1:20))) 59 | title(params$title) 60 | ``` 61 | 62 | ![](KnitrParameters_files/figure-markdown_github/values-1.png) 63 | 64 | Notice the plot uses the user specified function, but does not know its original name (so can no longer print it in the y-axis). 65 | 66 | Conclusion 67 | ---------- 68 | 69 | `wrapr::let` takes `knitr` parameters one step further. The source for this note can be found [here](https://github.com/WinVector/replyr/blob/master/extras/KnitrParameters.Rmd) and the rendered output [here](https://github.com/WinVector/replyr/blob/master/extras/KnitrParameters.md). 70 | -------------------------------------------------------------------------------- /extras/KnitrParameters_files/figure-markdown_github/bindings-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WinVector/replyr/681693f875bcf490d6651c20dadad1db0fa75d9e/extras/KnitrParameters_files/figure-markdown_github/bindings-1.png -------------------------------------------------------------------------------- /extras/KnitrParameters_files/figure-markdown_github/values-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WinVector/replyr/681693f875bcf490d6651c20dadad1db0fa75d9e/extras/KnitrParameters_files/figure-markdown_github/values-1.png -------------------------------------------------------------------------------- /issues/BindIssue.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | ### binding rows on spark 8 | 9 | It would be nice if `dplyr::bind_rows` could be a used on `Sparklyr` data handles. 10 | 11 | 12 | 13 | ```{r, echo = FALSE} 14 | knitr::opts_chunk$set( 15 | collapse = TRUE, 16 | comment = " # " 17 | ) 18 | options(width =100) 19 | ``` 20 | 21 | 22 | OSX 10.11.6. 
23 | Spark installed as described at http://spark.rstudio.com 24 | 25 | ``` 26 | library('sparklyr') 27 | spark_install(version = "2.0.0") 28 | ``` 29 | 30 | ```{r setup} 31 | library('dplyr') 32 | library('sparklyr') 33 | R.Version()$version.string 34 | packageVersion('dplyr') 35 | packageVersion('sparklyr') 36 | my_db <- sparklyr::spark_connect(version='2.0.0', master = "local") 37 | class(my_db) 38 | my_db$spark_home 39 | print(my_db) 40 | ``` 41 | 42 | * Expected outcome: dplyr::bind_rows to work with `Sparklyr` data reference. 43 | * Observed outcome: can't bind. 44 | 45 | ```{r issue, error=TRUE} 46 | support <- copy_to(my_db, 47 | data.frame(year=2005:2010), 48 | 'support') 49 | 50 | # This form doesn't work. 51 | dplyr::bind_rows(support, support) 52 | 53 | # This form doesn't work. 54 | dplyr::bind_rows(list(support, support)) 55 | ``` 56 | 57 | Submitted as [sparklyr issue 505](https://github.com/rstudio/sparklyr/issues/505). 58 | 59 | ```{r printversion} 60 | version 61 | ``` 62 | 63 | 64 | -------------------------------------------------------------------------------- /issues/ComplexJoins.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | 8 | 9 | Check complex join results. 10 | 11 | ```{r setup} 12 | suppressPackageStartupMessages(library('dplyr')) 13 | 14 | runJoinExperiment <- function(prefix, sc, eagerCompute, uniqueColumns) { 15 | names <- paste('t', prefix, 1:10, sep= '_') 16 | joined <- NULL 17 | for(ni in names) { 18 | di <- data.frame(k= 1:3, 19 | v= paste(ni, 1:3, sep= '_')) 20 | if(uniqueColumns) { 21 | colnames(di)[[2]] <- paste('y', ni, sep= '_') 22 | } 23 | if(!is.null(sc)) { 24 | ti <- copy_to(sc, di, ni) 25 | } else { 26 | ti <- di 27 | } 28 | if('NULL' %in% class(joined)) { 29 | joined <- ti 30 | } else { 31 | joined <- left_join(joined, ti, by= 'k') 32 | if(eagerCompute) { 33 | joined <- compute(joined) 34 | } 35 | } 36 | } 37 | compute(joined) 38 | } 39 | 40 | # works as expected 41 | runJoinExperiment('inmem', NULL, FALSE, FALSE) 42 | ``` 43 | 44 | Using `RSQLite` through `dplyr` loses columns. This has been submitted as [RSQLite issue 214](https://github.com/rstats-db/RSQLite/issues/214) and [dplyr issue 2823](https://github.com/tidyverse/dplyr/issues/2823). 45 | 46 | ```{r sqlite} 47 | sc <- src_sqlite(":memory:", create = TRUE) 48 | 49 | # throws 50 | tryCatch( 51 | runJoinExperiment('sqlitea', sc, FALSE, FALSE), 52 | error = function(e) print(e) 53 | ) 54 | 55 | # incorrect result (missing columns) 56 | runJoinExperiment('sqliteb', sc, TRUE, FALSE) 57 | ``` 58 | 59 | Using `Spark` through `sparklyr`/`dplyr` doesn't disambiguate columns as the local process does. 60 | 61 | ```{r sparksetup} 62 | sc <- sparklyr::spark_connect(version='2.0.2', 63 | master = "local") 64 | ``` 65 | 66 | ```{r spark1} 67 | # throws 68 | tryCatch( 69 | runJoinExperiment('sparka', sc, FALSE, FALSE), 70 | error = function(e) print(e) 71 | ) 72 | 73 | # throws 74 | tryCatch( 75 | runJoinExperiment('sparkb', sc, TRUE, FALSE), 76 | error = function(e) print(e) 77 | ) 78 | ``` 79 | 80 | We can try this again with unambiguous columns, which works. I am assuming that this is [dplyr issue 2773](https://github.com/tidyverse/dplyr/issues/2774), [sparklyr issue 677 ](https://github.com/rstudio/sparklyr/issues/677). 
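One way to get unambiguous columns without rebuilding the source data is sketched below (not evaluated in this reproduction): rename the value columns through `dplyr::select()`, which performs the rename inside the generated `SELECT` and so tends to behave the same locally and on remote handles. The table and column names in the sketch are illustrative only.

``` r
# Sketch (not run): give value columns table-specific names just before joining.
# Assumes remote handles t1 and t2, each with key column 'k' and value column 'v'.
t1r <- select(t1, k, v_t1 = v)
t2r <- select(t2, k, v_t2 = v)
joined <- left_join(t1r, t2r, by = 'k')
```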
81 | 82 | ```{r spark2} 83 | # throws 84 | runJoinExperiment('spark2a', sc, FALSE, TRUE) 85 | 86 | runJoinExperiment('spark2b', sc, TRUE, TRUE) 87 | ``` 88 | 89 | ```{r sparkcleanup} 90 | sparklyr::spark_disconnect(sc) 91 | ``` 92 | 93 | 94 | ```{r versioninfo} 95 | packageVersion("dplyr") 96 | packageVersion("sparklyr") 97 | if(requireNamespace("dbplyr", quietly = TRUE)) { 98 | packageVersion("dbplyr") 99 | } 100 | if(requireNamespace("RSQLite", quietly = TRUE)) { 101 | packageVersion("RSQLite") 102 | } 103 | R.Version()$version.string 104 | ``` 105 | 106 | ```{r cleanup} 107 | rm(list=ls()) 108 | gc() 109 | ``` 110 | 111 | 112 | -------------------------------------------------------------------------------- /issues/DplyrDevnrow.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | `mutate` issue while using `sparklyr`, `Spark2`, and the dev version of `dplyr` (‘0.5.0.9000’, https://github.com/hadley/dplyr commit f39db50921110c3d23612cc81a7b3e027c0b3d1c ). 8 | 9 | 10 | 11 | ```{r opts, echo = FALSE} 12 | knitr::opts_chunk$set( 13 | collapse = TRUE, 14 | comment = " # " 15 | ) 16 | options(width =100) 17 | ``` 18 | 19 | ```{r setup} 20 | library(sparklyr) 21 | library(dplyr) 22 | library(nycflights13) 23 | sc <- spark_connect(version='2.0.0', master = "local") 24 | flts <- replyr::replyr_copy_to(sc, flights) 25 | ``` 26 | 27 | Ok: 28 | 29 | ```{r ok} 30 | flights %>% mutate(zzz=1) 31 | ``` 32 | 33 | Throws: 34 | 35 | ```{r issue, error=TRUE} 36 | flts %>% mutate(zzz=1) 37 | ``` 38 | 39 | 40 | ```{r info} 41 | R.Version()$version.string 42 | packageVersion('dplyr') 43 | packageVersion('lazyeval') 44 | packageVersion('sparklyr') 45 | class(sc) 46 | sc$spark_home 47 | print(sc) 48 | version 49 | ``` 50 | 51 | Submitted as [dplyr issue 2495](https://github.com/hadley/dplyr/issues/2495). 52 | 53 | -------------------------------------------------------------------------------- /issues/DplyrSparklyr.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | 8 | 9 | ## `dplyr` and `sparklyr` 10 | 11 | Run DEV version of `dplyr` (appears to be CRAN release candidate) and DEV `sparklyr` as of 5-19-2017. 
12 | 13 | ```{r CRANdplyr} 14 | # devtools::install_github('tidyverse/dbplyr') 15 | # devtools::install_github('rstudio/sparklyr') 16 | suppressPackageStartupMessages(library('dplyr')) 17 | packageVersion("dplyr") 18 | library('sparklyr') 19 | packageVersion("sparklyr") 20 | if(requireNamespace("dbplyr", quietly = TRUE)) { 21 | packageVersion("dbplyr") 22 | } 23 | R.Version()$version.string 24 | base::date() 25 | 26 | sc <- sparklyr::spark_connect(version='2.0.2', 27 | master = "local") 28 | ``` 29 | 30 | ```{r joindups, error=TRUE} 31 | d1 <- copy_to(sc, data.frame(x=1:3, y=4:6), 'd1', 32 | overwrite = TRUE) 33 | d2 <- copy_to(sc, data.frame(x=1:3, y=7:9), 'd2', 34 | overwrite = TRUE) 35 | 36 | left_join(d1, d2, by='x') 37 | ``` 38 | 39 | ```{r rename, error=TRUE} 40 | dLocal <- data.frame(x = 1:2, 41 | origCol = c('a', 'b'), 42 | stringsAsFactors = FALSE) 43 | 44 | d <- copy_to(sc, dLocal, 'd', 45 | overwrite = TRUE) 46 | 47 | # local 48 | rename(dLocal, x2 = x, origCol2 = origCol) 49 | 50 | # Spark 51 | rename(d, x2 = x, origCol2 = origCol) 52 | ``` 53 | 54 | 55 | ```{r cleanup} 56 | spark_disconnect(sc) 57 | rm(list=ls()) 58 | gc(verbose = FALSE) 59 | ``` 60 | -------------------------------------------------------------------------------- /issues/DplyrSparklyr.md: -------------------------------------------------------------------------------- 1 | 2 | `dplyr` and `sparklyr` 3 | ---------------------- 4 | 5 | Run DEV version of `dplyr` (appears to be CRAN release candidate) and DEV `sparklyr` as of 5-19-2017. 6 | 7 | ``` r 8 | # devtools::install_github('tidyverse/dbplyr') 9 | # devtools::install_github('rstudio/sparklyr') 10 | suppressPackageStartupMessages(library('dplyr')) 11 | packageVersion("dplyr") 12 | ``` 13 | 14 | ## [1] '0.6.0' 15 | 16 | ``` r 17 | library('sparklyr') 18 | packageVersion("sparklyr") 19 | ``` 20 | 21 | ## [1] '0.5.4.9003' 22 | 23 | ``` r 24 | if(requireNamespace("dbplyr", quietly = TRUE)) { 25 | packageVersion("dbplyr") 26 | } 27 | ``` 28 | 29 | ## [1] '0.0.0.9001' 30 | 31 | ``` r 32 | R.Version()$version.string 33 | ``` 34 | 35 | ## [1] "R version 3.4.0 (2017-04-21)" 36 | 37 | ``` r 38 | base::date() 39 | ``` 40 | 41 | ## [1] "Fri May 19 13:04:39 2017" 42 | 43 | ``` r 44 | sc <- sparklyr::spark_connect(version='2.0.2', 45 | master = "local") 46 | ``` 47 | 48 | ``` r 49 | d1 <- copy_to(sc, data.frame(x=1:3, y=4:6), 'd1', 50 | overwrite = TRUE) 51 | d2 <- copy_to(sc, data.frame(x=1:3, y=7:9), 'd2', 52 | overwrite = TRUE) 53 | 54 | left_join(d1, d2, by='x') 55 | ``` 56 | 57 | ## # Source: lazy query [?? 
x 3] 58 | ## # Database: spark_connection 59 | ## x y.x y.y 60 | ## 61 | ## 1 1 4 7 62 | ## 2 2 5 8 63 | ## 3 3 6 9 64 | 65 | ``` r 66 | dLocal <- data.frame(x = 1:2, 67 | origCol = c('a', 'b'), 68 | stringsAsFactors = FALSE) 69 | 70 | d <- copy_to(sc, dLocal, 'd', 71 | overwrite = TRUE) 72 | 73 | # local 74 | rename(dLocal, x2 = x, origCol2 = origCol) 75 | ``` 76 | 77 | ## x2 origCol2 78 | ## 1 1 a 79 | ## 2 2 b 80 | 81 | ``` r 82 | # Spark 83 | rename(d, x2 = x, origCol2 = origCol) 84 | ``` 85 | 86 | ## Error in names(select)[match(old_vars, vars)] <- new_vars: NAs are not allowed in subscripted assignments 87 | 88 | ``` r 89 | spark_disconnect(sc) 90 | rm(list=ls()) 91 | gc(verbose = FALSE) 92 | ``` 93 | 94 | ## used (Mb) gc trigger (Mb) max used (Mb) 95 | ## Ncells 710275 38.0 1168576 62.5 1168576 62.5 96 | ## Vcells 1208556 9.3 2060183 15.8 1511861 11.6 97 | -------------------------------------------------------------------------------- /issues/DplyrSparklyr_CRANdplyr_CRANsparklyr.md: -------------------------------------------------------------------------------- 1 | 2 | `dplyr` and `sparklyr` 3 | ---------------------- 4 | 5 | Run CRAN version of `dplyr` and CRAN `sparklyr` as of 5-14-2017. 6 | 7 | ``` r 8 | # devtools::install_github("tidyverse/dplyr") 9 | # devtools::install_github('tidyverse/dbplyr') 10 | # devtools::install_github('rstudio/sparklyr') 11 | suppressPackageStartupMessages(library('dplyr')) 12 | packageVersion("dplyr") 13 | ``` 14 | 15 | ## [1] '0.5.0' 16 | 17 | ``` r 18 | library('sparklyr') 19 | packageVersion("sparklyr") 20 | ``` 21 | 22 | ## [1] '0.5.4' 23 | 24 | ``` r 25 | if(requireNamespace("dbplyr", quietly = TRUE)) { 26 | packageVersion("dbplyr") 27 | } 28 | R.Version()$version.string 29 | ``` 30 | 31 | ## [1] "R version 3.4.0 (2017-04-21)" 32 | 33 | ``` r 34 | base::date() 35 | ``` 36 | 37 | ## [1] "Mon May 15 15:14:01 2017" 38 | 39 | ``` r 40 | sc <- sparklyr::spark_connect(version='2.0.2', 41 | master = "local") 42 | ``` 43 | 44 | ``` r 45 | d1 <- copy_to(sc, data.frame(x=1:3, y=4:6), 'd1', 46 | overwrite = TRUE) 47 | d2 <- copy_to(sc, data.frame(x=1:3, y=7:9), 'd2', 48 | overwrite = TRUE) 49 | 50 | left_join(d1, d2, by='x') 51 | ``` 52 | 53 | ## Source: query [3 x 3] 54 | ## Database: spark connection master=local[4] app=sparklyr local=TRUE 55 | ## 56 | ## # A tibble: 3 x 3 57 | ## x y.x y.y 58 | ## 59 | ## 1 1 4 7 60 | ## 2 2 5 8 61 | ## 3 3 6 9 62 | 63 | ``` r 64 | dLocal <- data.frame(x = 1:2, 65 | origCol = c('a', 'b'), 66 | stringsAsFactors = FALSE) 67 | 68 | d <- copy_to(sc, dLocal, 'd', 69 | overwrite = TRUE) 70 | 71 | # local 72 | rename(dLocal, x2 = x, origCol2 = origCol) 73 | ``` 74 | 75 | ## x2 origCol2 76 | ## 1 1 a 77 | ## 2 2 b 78 | 79 | ``` r 80 | # Spark 81 | rename(d, x2 = x, origCol2 = origCol) 82 | ``` 83 | 84 | ## Source: query [2 x 2] 85 | ## Database: spark connection master=local[4] app=sparklyr local=TRUE 86 | ## 87 | ## # A tibble: 2 x 2 88 | ## x2 origCol2 89 | ## 90 | ## 1 1 a 91 | ## 2 2 b 92 | 93 | ``` r 94 | spark_disconnect(sc) 95 | rm(list=ls()) 96 | gc(verbose = FALSE) 97 | ``` 98 | 99 | ## used (Mb) gc trigger (Mb) max used (Mb) 100 | ## Ncells 669821 35.8 1168576 62.5 1168576 62.5 101 | ## Vcells 1163675 8.9 2060183 15.8 1395946 10.7 102 | -------------------------------------------------------------------------------- /issues/DplyrSparklyr_DEVdplyr_CRANsparklyr.md: -------------------------------------------------------------------------------- 1 | 2 | `dplyr` and `sparklyr` 3 | ---------------------- 4 | 5 | Run DEV 
version of `dplyr` and CRAN `sparklyr` as of 5-14-2017. 6 | 7 | ``` r 8 | # devtools::install_github("tidyverse/dplyr") 9 | # devtools::install_github('tidyverse/dbplyr') 10 | # devtools::install_github('rstudio/sparklyr') 11 | suppressPackageStartupMessages(library('dplyr')) 12 | packageVersion("dplyr") 13 | ``` 14 | 15 | ## [1] '0.5.0.9005' 16 | 17 | ``` r 18 | library('sparklyr') 19 | packageVersion("sparklyr") 20 | ``` 21 | 22 | ## [1] '0.5.4' 23 | 24 | ``` r 25 | if(requireNamespace("dbplyr", quietly = TRUE)) { 26 | packageVersion("dbplyr") 27 | } 28 | ``` 29 | 30 | ## [1] '0.0.0.9001' 31 | 32 | ``` r 33 | R.Version()$version.string 34 | ``` 35 | 36 | ## [1] "R version 3.4.0 (2017-04-21)" 37 | 38 | ``` r 39 | base::date() 40 | ``` 41 | 42 | ## [1] "Mon May 15 15:10:00 2017" 43 | 44 | ``` r 45 | sc <- sparklyr::spark_connect(version='2.0.2', 46 | master = "local") 47 | ``` 48 | 49 | ``` r 50 | d1 <- copy_to(sc, data.frame(x=1:3, y=4:6), 'd1', 51 | overwrite = TRUE) 52 | d2 <- copy_to(sc, data.frame(x=1:3, y=7:9), 'd2', 53 | overwrite = TRUE) 54 | 55 | left_join(d1, d2, by='x') 56 | ``` 57 | 58 | ## Source: lazy query [?? x 3] 59 | ## Database: spark_connection 60 | 61 | ## Error: Column `y` must have a unique name 62 | 63 | ``` r 64 | dLocal <- data.frame(x = 1:2, 65 | origCol = c('a', 'b'), 66 | stringsAsFactors = FALSE) 67 | 68 | d <- copy_to(sc, dLocal, 'd', 69 | overwrite = TRUE) 70 | 71 | # local 72 | rename(dLocal, x2 = x, origCol2 = origCol) 73 | ``` 74 | 75 | ## x2 origCol2 76 | ## 1 1 a 77 | ## 2 2 b 78 | 79 | ``` r 80 | # Spark 81 | rename(d, x2 = x, origCol2 = origCol) 82 | ``` 83 | 84 | ## Source: lazy query [?? x 2] 85 | ## Database: spark_connection 86 | 87 | ## Error in names(select)[match(old_vars, vars)] <- new_vars: NAs are not allowed in subscripted assignments 88 | 89 | ``` r 90 | spark_disconnect(sc) 91 | rm(list=ls()) 92 | gc(verbose = FALSE) 93 | ``` 94 | 95 | ## used (Mb) gc trigger (Mb) max used (Mb) 96 | ## Ncells 677545 36.2 1168576 62.5 1168576 62.5 97 | ## Vcells 1167395 9.0 2060183 15.8 1420902 10.9 98 | -------------------------------------------------------------------------------- /issues/HeadIssue.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | ### head(d, n=1) has problems on sparklyr (possibly to to blank values) 8 | 9 | 10 | 11 | 12 | ```{r, echo = FALSE} 13 | knitr::opts_chunk$set( 14 | collapse = TRUE, 15 | comment = " # " 16 | ) 17 | options(width =100) 18 | ``` 19 | 20 | 21 | OSX 10.11.6. 22 | Spark installed as described at http://spark.rstudio.com 23 | 24 | ``` 25 | library('sparklyr') 26 | spark_install(version = "2.0.0") 27 | ``` 28 | 29 | ```{r setup} 30 | library('dplyr') 31 | library('sparklyr') 32 | R.Version()$version.string 33 | packageVersion('dplyr') 34 | packageVersion('sparklyr') 35 | my_db <- sparklyr::spark_connect(version='2.0.0', master = "local") 36 | class(my_db) 37 | my_db$spark_home 38 | print(my_db) 39 | ``` 40 | 41 | ```{r issue, error=TRUE} 42 | support <- copy_to(my_db, 43 | data.frame(year=2005:2010), 44 | 'support') 45 | 46 | # This works. 47 | head(support) 48 | 49 | # This works. 50 | head(support, n=1) 51 | 52 | support <- mutate(support, name='') 53 | 54 | # This works. 55 | head(support) 56 | 57 | # This does not work. 58 | head(support, n=1) 59 | ``` 60 | 61 | Submitted as [sparklyr issue 506](https://github.com/rstudio/sparklyr/issues/506). 
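Until that is resolved, one possible workaround — only a sketch, not run as part of this reproduction — is to take a small prefix with the default `head()` (which does work above) and finish the subsetting locally after `collect()`.

``` r
# Sketch (not run): avoid head(n=1) on the Spark handle by finishing locally.
support %>%
  head() %>%      # the default-n form works on the remote handle above
  collect() %>%   # bring the few returned rows into local R
  head(n = 1)
```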
62 | 63 | ```{r printversion} 64 | version 65 | ``` 66 | 67 | 68 | -------------------------------------------------------------------------------- /issues/JoinNamesDups.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | 8 | 9 | ## Duplicate columns not allowed in join 10 | 11 | Instead of adding suffixes join fails with duplicate column message. I think this depends on the version of dplyr used. Seeing the failure with the dev-version of dplyr '0.5.0.9004' as of 5-14-2017. Confirmed it works properly for dplyr 0.5.0. 12 | 13 | Submitted as [`Sparklyr` issue 677](https://github.com/rstudio/sparklyr/issues/677) and [`dplyr` issue 2774](https://github.com/tidyverse/dplyr/issues/2774). 14 | 15 | ```{r example, error = TRUE} 16 | # devtools::install_github("tidyverse/dplyr") 17 | # devtools::install_github('tidyverse/dbplyr') 18 | suppressPackageStartupMessages(library('dplyr')) 19 | packageVersion("dplyr") 20 | library('sparklyr') 21 | packageVersion("sparklyr") 22 | if(requireNamespace("dbplyr", quietly = TRUE)) { 23 | packageVersion("dbplyr") 24 | } 25 | R.Version()$version.string 26 | 27 | sc <- sparklyr::spark_connect(version='2.0.2', 28 | master = "local") 29 | d1 <- copy_to(sc, data.frame(x=1:3, y=4:6), 'd1') 30 | d2 <- copy_to(sc, data.frame(x=1:3, y=7:9), 'd2') 31 | 32 | left_join(d1, d2, by='x') 33 | ``` 34 | 35 | ```{r cleanup} 36 | spark_disconnect(sc) 37 | rm(list=ls()) 38 | gc(verbose = FALSE) 39 | ``` -------------------------------------------------------------------------------- /issues/JoinNamesDups.md: -------------------------------------------------------------------------------- 1 | 2 | Duplicate columns not allowed in join 3 | ------------------------------------- 4 | 5 | Instead of adding suffixes join fails with duplicate column message. I think this depends on the version of dplyr used. Seeing the failure with the dev-version of dplyr '0.5.0.9004' as of 5-14-2017. Confirmed it works properly for dplyr 0.5.0. 6 | 7 | Submitted as [`Sparklyr` issue 677](https://github.com/rstudio/sparklyr/issues/677) and [`dplyr` issue 2774](https://github.com/tidyverse/dplyr/issues/2774). 8 | 9 | ``` r 10 | # devtools::install_github("tidyverse/dplyr") 11 | # devtools::install_github('tidyverse/dbplyr') 12 | suppressPackageStartupMessages(library('dplyr')) 13 | packageVersion("dplyr") 14 | ``` 15 | 16 | ## [1] '0.5.0.9004' 17 | 18 | ``` r 19 | library('sparklyr') 20 | packageVersion("sparklyr") 21 | ``` 22 | 23 | ## [1] '0.5.4' 24 | 25 | ``` r 26 | if(requireNamespace("dbplyr", quietly = TRUE)) { 27 | packageVersion("dbplyr") 28 | } 29 | ``` 30 | 31 | ## [1] '0.0.0.9001' 32 | 33 | ``` r 34 | R.Version()$version.string 35 | ``` 36 | 37 | ## [1] "R version 3.4.0 (2017-04-21)" 38 | 39 | ``` r 40 | sc <- sparklyr::spark_connect(version='2.0.2', 41 | master = "local") 42 | d1 <- copy_to(sc, data.frame(x=1:3, y=4:6), 'd1') 43 | d2 <- copy_to(sc, data.frame(x=1:3, y=7:9), 'd2') 44 | 45 | left_join(d1, d2, by='x') 46 | ``` 47 | 48 | ## Source: lazy query [?? x 3] 49 | ## Database: spark_connection 50 | 51 | ## Error: Each variable must have a unique name. 
52 | ## Problem variables: 'y' 53 | 54 | ``` r 55 | spark_disconnect(sc) 56 | rm(list=ls()) 57 | gc(verbose = FALSE) 58 | ``` 59 | 60 | ## used (Mb) gc trigger (Mb) max used (Mb) 61 | ## Ncells 673696 36.0 1168576 62.5 1168576 62.5 62 | ## Vcells 1160680 8.9 2060183 15.8 1372249 10.5 63 | -------------------------------------------------------------------------------- /issues/MySQLSelfJoin.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | 8 | 9 | ## MySQL fails on self-join 10 | 11 | 12 | Submitted as [`dplyr` issue 2777](https://github.com/tidyverse/dplyr/issues/2777). 13 | 14 | ```{r setup} 15 | # devtools::install_github("tidyverse/dplyr") 16 | # devtools::install_github('tidyverse/dbplyr') 17 | if(requireNamespace("dbplyr", quietly = TRUE)) { 18 | packageVersion("dbplyr") 19 | } 20 | packageVersion("RMySQL") 21 | packageVersion("dplyr") 22 | R.Version()$version.string 23 | ``` 24 | 25 | ```{r example, error=TRUE} 26 | suppressPackageStartupMessages(library('dplyr')) 27 | sc <- src_mysql('mysql', '127.0.0.1', 3306, 28 | 'root', '') 29 | d <- copy_to(sc, data.frame(x=1:3), 'd') 30 | 31 | # copy 32 | d2 <- d %>% 33 | filter(TRUE) %>% 34 | compute() 35 | 36 | # works 37 | left_join(d, d2, by='x') 38 | 39 | # throws 40 | left_join(d, d, by='x') 41 | ``` 42 | 43 | -------------------------------------------------------------------------------- /issues/MySQLSelfJoin.md: -------------------------------------------------------------------------------- 1 | 2 | MySQL fails on self-join 3 | ------------------------ 4 | 5 | Submitted as [`dplyr` issue 2777](https://github.com/tidyverse/dplyr/issues/2777). 6 | 7 | ``` r 8 | # devtools::install_github("tidyverse/dplyr") 9 | # devtools::install_github('tidyverse/dbplyr') 10 | if(requireNamespace("dbplyr", quietly = TRUE)) { 11 | packageVersion("dbplyr") 12 | } 13 | packageVersion("RMySQL") 14 | ``` 15 | 16 | ## [1] '0.10.11' 17 | 18 | ``` r 19 | packageVersion("dplyr") 20 | ``` 21 | 22 | ## [1] '0.5.0' 23 | 24 | ``` r 25 | R.Version()$version.string 26 | ``` 27 | 28 | ## [1] "R version 3.4.0 (2017-04-21)" 29 | 30 | ``` r 31 | suppressPackageStartupMessages(library('dplyr')) 32 | sc <- src_mysql('mysql', '127.0.0.1', 3306, 33 | 'root', '') 34 | d <- copy_to(sc, data.frame(x=1:3), 'd') 35 | 36 | # copy 37 | d2 <- d %>% 38 | filter(TRUE) %>% 39 | compute() 40 | 41 | # works 42 | left_join(d, d2, by='x') 43 | ``` 44 | 45 | ## Source: query [?? x 1] 46 | ## Database: mysql 10.1.23-MariaDB [root@127.0.0.1:/mysql] 47 | ## 48 | ## # A tibble: ?? x 1 49 | ## x 50 | ## 51 | ## 1 1 52 | ## 2 2 53 | ## 3 3 54 | 55 | ``` r 56 | # throws 57 | left_join(d, d, by='x') 58 | ``` 59 | 60 | ## Source: query [?? x 1] 61 | ## Database: mysql 10.1.23-MariaDB [root@127.0.0.1:/mysql] 62 | 63 | ## Error in .local(conn, statement, ...): could not run statement: Not unique table/alias: 'd' 64 | -------------------------------------------------------------------------------- /issues/MySQL_mutate.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | Issues with `dplyr::mutate` and `RMySQL`. 8 | 9 | 10 | 11 | ```{r, echo = FALSE} 12 | knitr::opts_chunk$set( 13 | collapse = TRUE, 14 | comment = " # " 15 | ) 16 | options(width =100) 17 | ``` 18 | 19 | Can not prevent the warning. 
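The chunk below shows the attempt. The warning appears to be raised when the lazy result is actually fetched for printing, not by `compute()` itself, so wrapping only the pipeline does not silence it. A possible mitigation — a sketch only, not verified against this `RMySQL` version, using the `d4` handle created in the chunk below — is to also suppress at the point the rows are materialized.

``` r
# Sketch (not run): suppress at fetch/print time, where the warning seems to be emitted.
suppressWarnings(print(d4))
# or materialize locally once, then work with the local copy:
d4_local <- suppressWarnings(collect(d4))
print(d4_local)
```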
20 | 21 | ```{r} 22 | library('dplyr') 23 | packageVersion('dplyr') 24 | packageVersion('RMySQL') 25 | 26 | my_db <- src_mysql('mysql','127.0.0.1',3306,'root','passwd') 27 | d4 <- copy_to(my_db,data.frame(x=c(1.1,2,3,3)),'d4') 28 | suppressWarnings( 29 | d4 %>% mutate(z=1) %>% compute() -> d4 30 | ) 31 | print(d4) 32 | ``` 33 | 34 | Submitted as [RMySQL 176](https://github.com/rstats-db/RMySQL/issues/176). 35 | 36 | ```{r printversion} 37 | version 38 | ``` 39 | 40 | 41 | -------------------------------------------------------------------------------- /issues/MySQL_mutate.md: -------------------------------------------------------------------------------- 1 | Issues with `dplyr::mutate` and `RMySQL`. 2 | 3 | 4 | Can not prevent the warning. 5 | 6 | ``` r 7 | library('dplyr') 8 | # 9 | # Attaching package: 'dplyr' 10 | # The following objects are masked from 'package:stats': 11 | # 12 | # filter, lag 13 | # The following objects are masked from 'package:base': 14 | # 15 | # intersect, setdiff, setequal, union 16 | packageVersion('dplyr') 17 | # [1] '0.5.0' 18 | packageVersion('RMySQL') 19 | # [1] '0.10.9' 20 | 21 | my_db <- src_mysql('mysql','127.0.0.1',3306,'root','passwd') 22 | d4 <- copy_to(my_db,data.frame(x=c(1.1,2,3,3)),'d4') 23 | suppressWarnings( 24 | d4 %>% mutate(z=1) %>% compute() -> d4 25 | ) 26 | print(d4) 27 | # Source: query [?? x 2] 28 | # Database: mysql 5.6.34 [root@127.0.0.1:/mysql] 29 | # Warning in .local(conn, statement, ...): Decimal MySQL column 1 imported as numeric 30 | # x z 31 | # 32 | # 1 1.1 1 33 | # 2 2.0 1 34 | # 3 3.0 1 35 | # 4 3.0 1 36 | ``` 37 | 38 | Submitted as [RMySQL 176](https://github.com/rstats-db/RMySQL/issues/176). 39 | 40 | ``` r 41 | version 42 | # _ 43 | # platform x86_64-apple-darwin13.4.0 44 | # arch x86_64 45 | # os darwin13.4.0 46 | # system x86_64, darwin13.4.0 47 | # status 48 | # major 3 49 | # minor 3.2 50 | # year 2016 51 | # month 10 52 | # day 31 53 | # svn rev 71607 54 | # language R 55 | # version.string R version 3.3.2 (2016-10-31) 56 | # nickname Sincere Pumpkin Patch 57 | ``` 58 | -------------------------------------------------------------------------------- /issues/MySQLcast.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | 8 | 9 | ## Problem with MySQL cast 10 | 11 | Simple cast emits `SQL` not accepted by `MySQL`. 12 | 13 | Submitted as [`dplyr` issue 2775](https://github.com/tidyverse/dplyr/issues/2775) as `dbplyr` currently asks that issues be filed there. 
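For context, MySQL's `CAST()` accepts `CHAR` but not `TEXT` as a target type, which is why the emitted SQL is rejected. A possible workaround — a sketch only, not run here, assuming `sql()` literals pass through the translation unchanged and using the `d1` handle created in the chunk below — is to hand the back-end a cast it accepts.

``` r
# Sketch (not run): supply a MySQL-accepted cast directly instead of as.character().
mutate(d1, newCol = sql("CAST('a' AS CHAR)"))
```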
14 | 15 | ```{r example, error = TRUE} 16 | # devtools::install_github("tidyverse/dplyr") 17 | # devtools::install_github('tidyverse/dbplyr') 18 | suppressPackageStartupMessages(library('dplyr')) 19 | packageVersion("dplyr") 20 | if(requireNamespace("dbplyr", quietly = TRUE)) { 21 | packageVersion("dbplyr") 22 | } 23 | R.Version()$version.string 24 | 25 | sc <- dplyr::src_mysql('mysql', 26 | '127.0.0.1', 27 | 3306, 28 | 'root', 'passwd') 29 | d1 <- copy_to(sc, data.frame(x=1:3), 'd1') 30 | 31 | # works, Note PostgreSQL needs this form 32 | # or it doesn't know type of newCol 33 | mutate(d1, newCol= 'a') 34 | 35 | # throws 36 | mutate(d1, newCol= as.character('a')) 37 | 38 | ``` 39 | 40 | ```{r cleanup} 41 | rm(list=ls()) 42 | gc(verbose = FALSE) 43 | ``` 44 | -------------------------------------------------------------------------------- /issues/MySQLcast.md: -------------------------------------------------------------------------------- 1 | 2 | Problem with MySQL cast 3 | ----------------------- 4 | 5 | Simple cast emits `SQL` not accepted by `MySQL`. 6 | 7 | Submitted as [`dplyr` issue 2775](https://github.com/tidyverse/dplyr/issues/2775) as `dbplyr` currently asks that issues be filed there. 8 | 9 | ``` r 10 | # devtools::install_github("tidyverse/dplyr") 11 | # devtools::install_github('tidyverse/dbplyr') 12 | suppressPackageStartupMessages(library('dplyr')) 13 | packageVersion("dplyr") 14 | ``` 15 | 16 | ## [1] '0.5.0.9004' 17 | 18 | ``` r 19 | if(requireNamespace("dbplyr", quietly = TRUE)) { 20 | packageVersion("dbplyr") 21 | } 22 | ``` 23 | 24 | ## [1] '0.0.0.9001' 25 | 26 | ``` r 27 | R.Version()$version.string 28 | ``` 29 | 30 | ## [1] "R version 3.4.0 (2017-04-21)" 31 | 32 | ``` r 33 | sc <- dplyr::src_mysql('mysql', 34 | '127.0.0.1', 35 | 3306, 36 | 'root', 'passwd') 37 | d1 <- copy_to(sc, data.frame(x=1:3), 'd1') 38 | 39 | # works, Note PostgreSQL needs this form 40 | # or it doesn't know type of newCol 41 | mutate(d1, newCol= 'a') 42 | ``` 43 | 44 | ## Source: lazy query [?? x 2] 45 | ## Database: mysql 5.6.34 [root@127.0.0.1:/mysql] 46 | ## 47 | ## # A tibble: ?? x 2 48 | ## x newCol 49 | ## 50 | ## 1 1 a 51 | ## 2 2 a 52 | ## 3 3 a 53 | 54 | ``` r 55 | # throws 56 | mutate(d1, newCol= as.character('a')) 57 | ``` 58 | 59 | ## Source: lazy query [?? x 2] 60 | ## Database: mysql 5.6.34 [root@127.0.0.1:/mysql] 61 | 62 | ## Error in .local(conn, statement, ...): could not run statement: You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'TEXT) AS `newCol` 63 | ## FROM `d1` 64 | ## LIMIT 10' at line 1 65 | 66 | ``` r 67 | rm(list=ls()) 68 | gc(verbose = FALSE) 69 | ``` 70 | 71 | ## Auto-disconnecting MySQLConnection 72 | 73 | ## used (Mb) gc trigger (Mb) max used (Mb) 74 | ## Ncells 627439 33.6 1168576 62.5 940480 50.3 75 | ## Vcells 1091446 8.4 2060183 15.8 1316802 10.1 76 | -------------------------------------------------------------------------------- /issues/NAvalues.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | Be aware different `dplyr` back-ends represent `NA` much differently. Expect numeric `NA` to be presented as `NaN` quite often, and expect database based implementations to use `NULL` (in their sense of `NULL`, *not* in R's sense) especially in character types. 
Also some `dplyr` back-ends may not have a currently accessible `NULL` concept for character types (such as Spark). 8 | 9 | `dplyr` `0.5.0` with `RMySQL` `0.10.9` (both current on [Cran](https://cran.r-project.org) 11-27-2016) failing to insert `NULL` into `MySQL` (filed as [dplyr issue 2259](https://github.com/hadley/dplyr/issues/2259), status moved to duplicate of [dplyr issue 2256](https://github.com/hadley/dplyr/issues/2256)). 10 | 11 | ```{r mysqlnull, error=TRUE} 12 | library('dplyr') 13 | library('nycflights13') 14 | packageVersion('dplyr') 15 | packageVersion('RMySQL') 16 | mysql <- src_mysql('mysql','127.0.0.1',3306,'root','passwd') 17 | flts <- flights 18 | flights_mysql <- copy_to(mysql,flts, 19 | temporary = TRUE,overwrite = TRUE, 20 | indexes = list(c("year", "month", "day"), "carrier", "tailnum")) 21 | ``` 22 | 23 | `Spark` `2.0.0` with `sparklyr` `0.4.26` not faithful to `NA` values in character 24 | or factor columns of `data.frame`. As we see below they get converted to blank 25 | in a round trip between local `data.frame`s and `Spark` representations. Obviously 26 | the round trip can not be fully faithful (we fully expect factors types to become character types, and can live with numeric `NA` becoming `NaN`) due to differences in representation. But `Spark` can represent missing values in character columns (for example see [here](http://stackoverflow.com/questions/32067467/create-new-dataframe-with-empty-null-field-values)). 27 | 28 | Filed as [sparklyr issue 340](https://github.com/rstudio/sparklyr/issues/340). 29 | 30 | ```{r sparklyr200} 31 | library('sparklyr') 32 | packageVersion('sparklyr') 33 | s200 <- my_db <- sparklyr::spark_connect(version='2.0.0', 34 | master = "local") 35 | 36 | d2 <- data.frame(x=factor(c('z1',NA,'z3')),y=c(3,5,NA),z=c(NA,'a','z'), 37 | stringsAsFactors = FALSE) 38 | print(d2) 39 | 40 | d2r <- copy_to(s200,d2,'d2', 41 | temporary = FALSE,overwrite = TRUE) 42 | print(d2r) 43 | d2x <- as.data.frame(d2r) 44 | print(d2x) 45 | summary(d2x) 46 | str(d2x) 47 | ``` 48 | 49 | ```{r printversion} 50 | version 51 | ``` 52 | 53 | -------------------------------------------------------------------------------- /issues/README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | 8 | 9 | 10 | ```{r, echo = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = " # " 14 | ) 15 | options(width =100) 16 | ``` 17 | 18 | 19 | A good part of practical or production packages is "shimming" or working around rough edges of supplied services and data. For `replyr` this means trying to make the semantics of multiple `dplyr` data services look similar (including working across different versions of `dplyr`, `Spark`, and `sparklyr`). 20 | 21 | These shimming actions are ugly little work-arounds in code (something the package user gets to then avoid). However, unless you run down the bad effects you think you are preventing you can end up fighting phantoms or paying very much for needless precautions. 22 | 23 | This directory lists the current "wish this wasn't that way" behaviors from other packages that `replyr` attemptings to supply work-arounds for. 
24 | -------------------------------------------------------------------------------- /issues/README.md: -------------------------------------------------------------------------------- 1 | 2 | A good part of practical or production packages is "shimming" or working around rough edges of supplied services and data. For `replyr` this means trying to make the semantics of multiple `dplyr` data services look similar (including working across different versions of `dplyr`, `Spark`, and `sparklyr`). 3 | 4 | These shimming actions are ugly little work-arounds in code (something the package user gets to then avoid). However, unless you run down the bad effects you think you are preventing you can end up fighting phantoms or paying very much for needless precautions. 5 | 6 | This directory lists the current "wish this wasn't that way" behaviors from other packages that `replyr` attemptings to supply work-arounds for. 7 | -------------------------------------------------------------------------------- /issues/SQLiteColtypes.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | 8 | 9 | ```{r, echo = FALSE} 10 | knitr::opts_chunk$set( 11 | collapse = TRUE, 12 | comment = " # " 13 | ) 14 | options(width =100) 15 | ``` 16 | 17 | Filed as [dplyr 2302](https://github.com/hadley/dplyr/issues/2302). 18 | 19 | ## logical to numeric clobber 20 | 21 | ```{r lclob} 22 | library('dplyr') 23 | d <- data.frame(x=c(1,2,2),y=c(3,5,NA),z=c(NA,'a','b'), 24 | rowNum=1:3, 25 | stringsAsFactors = FALSE) 26 | print(d) 27 | 28 | fnam <- tempfile(pattern = "dplyr_doc_narm", tmpdir = tempdir(), fileext = "sqlite3") 29 | my_db <- dplyr::src_sqlite(fnam, create = TRUE) 30 | class(my_db) 31 | dRemote <- copy_to(my_db,d,'d',rowNumberColumn='rowNum',overwrite=TRUE) 32 | 33 | 34 | # correct calculation 35 | dRemote %>% mutate(nna=0) %>% 36 | mutate(nna=nna+ifelse(is.na(x),1,0)) %>% 37 | mutate(nna=nna+ifelse(is.na(y),1,0)) %>% 38 | mutate(nna=nna+ifelse(is.na(z),1,0)) 39 | 40 | # incorrect calculation (last step seems to always clobber the previous result) 41 | dRemote %>% mutate(nna=0) %>% 42 | mutate(nna=nna+is.na(x)) %>% 43 | mutate(nna=nna+is.na(y)) %>% 44 | mutate(nna=nna+is.na(z)) 45 | 46 | # clean up 47 | rm(list=setdiff(ls(),'fnam')) 48 | if(!is.null(fnam)) { 49 | file.remove(fnam) 50 | } 51 | gc() 52 | ``` 53 | -------------------------------------------------------------------------------- /issues/SQLiteColtypes.md: -------------------------------------------------------------------------------- 1 | 2 | Filed as [dplyr 2302](https://github.com/hadley/dplyr/issues/2302). 
3 | 4 | logical to numeric clobber 5 | -------------------------- 6 | 7 | ``` r 8 | library('dplyr') 9 | # 10 | # Attaching package: 'dplyr' 11 | # The following objects are masked from 'package:stats': 12 | # 13 | # filter, lag 14 | # The following objects are masked from 'package:base': 15 | # 16 | # intersect, setdiff, setequal, union 17 | d <- data.frame(x=c(1,2,2),y=c(3,5,NA),z=c(NA,'a','b'), 18 | rowNum=1:3, 19 | stringsAsFactors = FALSE) 20 | print(d) 21 | # x y z rowNum 22 | # 1 1 3 1 23 | # 2 2 5 a 2 24 | # 3 2 NA b 3 25 | 26 | fnam <- tempfile(pattern = "dplyr_doc_narm", tmpdir = tempdir(), fileext = "sqlite3") 27 | my_db <- dplyr::src_sqlite(fnam, create = TRUE) 28 | class(my_db) 29 | # [1] "src_sqlite" "src_sql" "src" 30 | dRemote <- copy_to(my_db,d,'d',rowNumberColumn='rowNum',overwrite=TRUE) 31 | 32 | 33 | # correct calculation 34 | dRemote %>% mutate(nna=0) %>% 35 | mutate(nna=nna+ifelse(is.na(x),1,0)) %>% 36 | mutate(nna=nna+ifelse(is.na(y),1,0)) %>% 37 | mutate(nna=nna+ifelse(is.na(z),1,0)) 38 | # Source: query [?? x 5] 39 | # Database: sqlite 3.8.6 [/var/folders/7q/h_jp2vj131g5799gfnpzhdp80000gn/T//RtmpriMQ9s/dplyr_doc_narm968e465ba628sqlite3] 40 | # 41 | # x y z rowNum nna 42 | # 43 | # 1 1 3 1 1 44 | # 2 2 5 a 2 0 45 | # 3 2 NA b 3 1 46 | 47 | # incorrect calculation (last step seems to always clobber the previous result) 48 | dRemote %>% mutate(nna=0) %>% 49 | mutate(nna=nna+is.na(x)) %>% 50 | mutate(nna=nna+is.na(y)) %>% 51 | mutate(nna=nna+is.na(z)) 52 | # Source: query [?? x 5] 53 | # Database: sqlite 3.8.6 [/var/folders/7q/h_jp2vj131g5799gfnpzhdp80000gn/T//RtmpriMQ9s/dplyr_doc_narm968e465ba628sqlite3] 54 | # 55 | # x y z rowNum nna 56 | # 57 | # 1 1 3 1 1 58 | # 2 2 5 a 2 0 59 | # 3 2 NA b 3 0 60 | 61 | # clean up 62 | rm(list=setdiff(ls(),'fnam')) 63 | if(!is.null(fnam)) { 64 | file.remove(fnam) 65 | } 66 | # [1] TRUE 67 | gc() 68 | # used (Mb) gc trigger (Mb) max used (Mb) 69 | # Ncells 464057 24.8 750400 40.1 592000 31.7 70 | # Vcells 656579 5.1 1308461 10.0 920497 7.1 71 | ``` 72 | -------------------------------------------------------------------------------- /issues/SQLitesd.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | Standard deviation with `SQLite` is zero when there is one data item, not the expected `NA`. 8 | Notice the `sd()` calculation agrees with `R`'s local calculation when `n`>1 so this 9 | isn't just a sample variance versus population variance issue.
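One possible guard (a sketch only, not part of the filed issue; it assumes the remote table `dbData` built in the code below and a column named `x`) is to compute the row count alongside `sd()` and convert the spurious zero back to `NA`:

```r
# sketch: treat a database sd() computed over a single row as NA instead of 0
library('dplyr')
guarded <- dbData %>%
  summarise(n = n(), s = sd(x)) %>%
  collect() %>%
  mutate(s = ifelse(n > 1, s, NA_real_))
```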
10 | 11 | 12 | 13 | ```{r, echo = FALSE} 14 | knitr::opts_chunk$set( 15 | collapse = TRUE, 16 | comment = " # " 17 | ) 18 | options(width =100) 19 | ``` 20 | 21 | 22 | ```{r copyissueover, error=TRUE} 23 | library('dplyr') 24 | library('RSQLite') 25 | packageVersion('dplyr') 26 | packageVersion('RSQLite') 27 | 28 | my_db <- dplyr::src_sqlite(":memory:", create = TRUE) 29 | 30 | # confirm sqlite can represent NA 31 | d <- data.frame(x = c(1,NA,3)) 32 | dbData <- dplyr::copy_to(my_db, d, name='d', 33 | create=TRUE, overwrite=TRUE) 34 | print(dbData) 35 | 36 | for(n in 1:3) { 37 | print("***********") 38 | print(paste('n',n)) 39 | dplyr::db_drop_table(my_db$con, 'd') 40 | d <- data.frame(x= seq_len(n)) 41 | print("local") 42 | print(dplyr::summarise_all(d, dplyr::funs(sd))) 43 | dbData <- dplyr::copy_to(my_db, d, name='d', 44 | create=TRUE, overwrite=TRUE) 45 | print("RSQLite") 46 | print(dplyr::summarise_all(dbData, dplyr::funs(sd))) 47 | print("***********") 48 | } 49 | ``` 50 | 51 | Filed as [RSQLite 201](https://github.com/rstats-db/RSQLite/issues/201). 52 | 53 | ```{r printversion} 54 | version 55 | ``` 56 | 57 | ```{r cleanup} 58 | rm(list=ls()) 59 | gc() 60 | ``` -------------------------------------------------------------------------------- /issues/SQLitesd.md: -------------------------------------------------------------------------------- 1 | Standard deviation with `SQLite` is zero when there is one data item, not the expected `NA`. Notice the `sd()` calculation agrees with `R`'s local calculation when `n`>1 so this isn't just a sample variance versus population variance issue. 2 | 3 | 4 | ``` r 5 | library('dplyr') 6 | # 7 | # Attaching package: 'dplyr' 8 | # The following objects are masked from 'package:stats': 9 | # 10 | # filter, lag 11 | # The following objects are masked from 'package:base': 12 | # 13 | # intersect, setdiff, setequal, union 14 | library('RSQLite') 15 | packageVersion('dplyr') 16 | # [1] '0.5.0' 17 | packageVersion('RSQLite') 18 | # [1] '1.1.2' 19 | 20 | my_db <- dplyr::src_sqlite(":memory:", create = TRUE) 21 | 22 | # confirm sqlite can represent NA 23 | d <- data.frame(x = c(1,NA,3)) 24 | dbData <- dplyr::copy_to(my_db, d, name='d', 25 | create=TRUE, overwrite=TRUE) 26 | print(dbData) 27 | # Source: query [?? x 1] 28 | # Database: sqlite 3.11.1 [:memory:] 29 | # 30 | # x 31 | # 32 | # 1 1 33 | # 2 NA 34 | # 3 3 35 | 36 | for(n in 1:3) { 37 | print("***********") 38 | print(paste('n',n)) 39 | dplyr::db_drop_table(my_db$con, 'd') 40 | d <- data.frame(x= seq_len(n)) 41 | print("local") 42 | print(dplyr::summarise_all(d, dplyr::funs(sd))) 43 | dbData <- dplyr::copy_to(my_db, d, name='d', 44 | create=TRUE, overwrite=TRUE) 45 | print("RSQLite") 46 | print(dplyr::summarise_all(dbData, dplyr::funs(sd))) 47 | print("***********") 48 | } 49 | # [1] "***********" 50 | # [1] "n 1" 51 | # [1] "local" 52 | # x 53 | # 1 NA 54 | # [1] "RSQLite" 55 | # Source: query [?? x 1] 56 | # Database: sqlite 3.11.1 [:memory:] 57 | # 58 | # x 59 | # 60 | # 1 0 61 | # [1] "***********" 62 | # [1] "***********" 63 | # [1] "n 2" 64 | # [1] "local" 65 | # x 66 | # 1 0.7071068 67 | # [1] "RSQLite" 68 | # Source: query [?? x 1] 69 | # Database: sqlite 3.11.1 [:memory:] 70 | # 71 | # x 72 | # 73 | # 1 0.7071068 74 | # [1] "***********" 75 | # [1] "***********" 76 | # [1] "n 3" 77 | # [1] "local" 78 | # x 79 | # 1 1 80 | # [1] "RSQLite" 81 | # Source: query [??
x 1] 82 | # Database: sqlite 3.11.1 [:memory:] 83 | # 84 | # x 85 | # 86 | # 1 1 87 | # [1] "***********" 88 | ``` 89 | 90 | Filed as [RSQLite 201](https://github.com/rstats-db/RSQLite/issues/201). 91 | 92 | ``` r 93 | version 94 | # _ 95 | # platform x86_64-apple-darwin13.4.0 96 | # arch x86_64 97 | # os darwin13.4.0 98 | # system x86_64, darwin13.4.0 99 | # status 100 | # major 3 101 | # minor 3.2 102 | # year 2016 103 | # month 10 104 | # day 31 105 | # svn rev 71607 106 | # language R 107 | # version.string R version 3.3.2 (2016-10-31) 108 | # nickname Sincere Pumpkin Patch 109 | ``` 110 | 111 | ``` r 112 | rm(list=ls()) 113 | gc() 114 | # used (Mb) gc trigger (Mb) max used (Mb) 115 | # Ncells 469545 25.1 750400 40.1 592000 31.7 116 | # Vcells 657764 5.1 1308461 10.0 914785 7.0 117 | ``` 118 | -------------------------------------------------------------------------------- /issues/SparkNAIssue.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | 8 | 9 | `NA` issue while using `sparklyr`, `Spark2`, and `dplyr`. It also looks like in several places 10 | `NA` and `""` are confused, reversed, or suppressed. 11 | 12 | Submitted as [`sparklyr` issue 528](https://github.com/rstudio/sparklyr/issues/528) and [`sparklyr` issue 680](https://github.com/rstudio/sparklyr/issues/680). 13 | 14 | 15 | ```{r opts, echo = FALSE} 16 | knitr::opts_chunk$set( 17 | collapse = TRUE, 18 | comment = " # " 19 | ) 20 | options(width =100) 21 | ``` 22 | 23 | 24 | ```{r d1} 25 | suppressPackageStartupMessages(library('dplyr')) 26 | packageVersion("dplyr") 27 | library('sparklyr') 28 | packageVersion("sparklyr") 29 | sc <- sparklyr::spark_connect(version='2.0.2', 30 | master = "local") 31 | d1 <- data.frame(x= c('a',NA), 32 | stringsAsFactors= FALSE) 33 | # Notice ds1 appears truncated to 1 row 34 | ds1 <- dplyr::copy_to(sc,d1) 35 | print(ds1) 36 | nrow(ds1) 37 | ``` 38 | -------------------------------------------------------------------------------- /issues/SparkNAIssue.md: -------------------------------------------------------------------------------- 1 | 2 | `NA` issue while using `sparklyr`, `Spark2`, and `dplyr`. It also looks like in several places `NA` and `""` are confused, reversed, or suppressed. 3 | 4 | Submitted as [`sparklyr` issue 528](https://github.com/rstudio/sparklyr/issues/528) and [`sparklyr` issue 680](https://github.com/rstudio/sparklyr/issues/680).
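One possible work-around (a sketch only; it reuses the `sc` and `d1` defined in the reproduction below, and the sentinel value is arbitrary) is to replace character `NA` with a sentinel string before `copy_to()`, so no missing character value ever reaches `Spark`, and then decode after collecting back locally:

```r
# sketch: avoid sending character NA to Spark by using a sentinel string
sentinel <- "zzz_NA_zzz"
d1_safe <- d1
d1_safe$x[is.na(d1_safe$x)] <- sentinel
ds1_safe <- dplyr::copy_to(sc, d1_safe, 'd1_safe', overwrite = TRUE)
d1_back <- as.data.frame(ds1_safe)
d1_back$x[d1_back$x == sentinel] <- NA   # restore the missing values locally
```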
5 | 6 | ``` r 7 | suppressPackageStartupMessages(library('dplyr')) 8 | packageVersion("dplyr") 9 | # [1] '0.5.0' 10 | library('sparklyr') 11 | packageVersion("sparklyr") 12 | # [1] '0.5.4' 13 | sc <- sparklyr::spark_connect(version='2.0.2', 14 | master = "local") 15 | d1 <- data.frame(x= c('a',NA), 16 | stringsAsFactors= FALSE) 17 | # Notice ds1 appears truncated to 1 row 18 | ds1 <- dplyr::copy_to(sc,d1) 19 | print(ds1) 20 | # Source: query [1 x 1] 21 | # Database: spark connection master=local[4] app=sparklyr local=TRUE 22 | # 23 | # # A tibble: 1 x 1 24 | # x 25 | # 26 | # 1 a 27 | nrow(ds1) 28 | # [1] 1 29 | ``` 30 | -------------------------------------------------------------------------------- /issues/SparklyrRename.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | 8 | 9 | ## Sparklyr rename fails with dev version of dplyr 10 | 11 | Rename fails in Sparklyr; we think it depends on the version of dplyr. Definitely seeing it in the dev version of dplyr as of 5-14-2017. Confirmed it works properly for dplyr 0.5.0. 12 | 13 | Submitted as [`Sparklyr` issue 678](https://github.com/rstudio/sparklyr/issues/678) and [`dplyr` issue 2776](https://github.com/tidyverse/dplyr/issues/2776). 14 | 15 | ```{r example, error = TRUE} 16 | # devtools::install_github("tidyverse/dplyr") 17 | # devtools::install_github('tidyverse/dbplyr') 18 | suppressPackageStartupMessages(library('dplyr')) 19 | packageVersion("dplyr") 20 | library('sparklyr') 21 | packageVersion("sparklyr") 22 | if(requireNamespace("dbplyr", quietly = TRUE)) { 23 | packageVersion("dbplyr") 24 | } 25 | R.Version()$version.string 26 | 27 | dLocal <- data.frame(x = 1:2, 28 | origCol = c('a', 'b'), 29 | stringsAsFactors = FALSE) 30 | 31 | sc <- sparklyr::spark_connect(version='2.0.2', 32 | master = "local") 33 | 34 | d <- copy_to(sc, dLocal, 'd') 35 | 36 | # works 37 | rename(dLocal, x2 = x, origCol2 = origCol) 38 | 39 | # throws 40 | rename(d, x2 = x, origCol2 = origCol) 41 | ``` 42 | 43 | ```{r cleanup} 44 | spark_disconnect(sc) 45 | rm(list=ls()) 46 | gc(verbose = FALSE) 47 | ``` 48 | -------------------------------------------------------------------------------- /issues/SparklyrRename.md: -------------------------------------------------------------------------------- 1 | 2 | Sparklyr rename fails with dev version of dplyr 3 | ----------------------------------------------- 4 | 5 | Rename fails in Sparklyr; we think it depends on the version of dplyr. Definitely seeing it in the dev version of dplyr as of 5-14-2017. Confirmed it works properly for dplyr 0.5.0. 6 | 7 | Submitted as [`Sparklyr` issue 678](https://github.com/rstudio/sparklyr/issues/678) and [`dplyr` issue 2776](https://github.com/tidyverse/dplyr/issues/2776).
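A possible work-around sketch (using the `d` handle built in the code below; not independently verified against every `dplyr` version) is to rename via `dplyr::select()`, which accepts `new = old` pairs and may avoid the failing `rename()` code path. Note `select()` drops any columns not listed.

```r
# sketch: select() with new = old names also renames columns
d2 <- dplyr::select(d, x2 = x, origCol2 = origCol)
```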
8 | 9 | ``` r 10 | # devtools::install_github("tidyverse/dplyr") 11 | # devtools::install_github('tidyverse/dbplyr') 12 | suppressPackageStartupMessages(library('dplyr')) 13 | packageVersion("dplyr") 14 | ``` 15 | 16 | ## [1] '0.5.0.9004' 17 | 18 | ``` r 19 | library('sparklyr') 20 | packageVersion("sparklyr") 21 | ``` 22 | 23 | ## [1] '0.5.4' 24 | 25 | ``` r 26 | if(requireNamespace("dbplyr", quietly = TRUE)) { 27 | packageVersion("dbplyr") 28 | } 29 | ``` 30 | 31 | ## [1] '0.0.0.9001' 32 | 33 | ``` r 34 | R.Version()$version.string 35 | ``` 36 | 37 | ## [1] "R version 3.4.0 (2017-04-21)" 38 | 39 | ``` r 40 | dLocal <- data.frame(x = 1:2, 41 | origCol = c('a', 'b'), 42 | stringsAsFactors = FALSE) 43 | 44 | sc <- sparklyr::spark_connect(version='2.0.2', 45 | master = "local") 46 | 47 | d <- copy_to(sc, dLocal, 'd') 48 | 49 | # works 50 | rename(dLocal, x2 = x, origCol2 = origCol) 51 | ``` 52 | 53 | ## x2 origCol2 54 | ## 1 1 a 55 | ## 2 2 b 56 | 57 | ``` r 58 | # throws 59 | rename(d, x2 = x, origCol2 = origCol) 60 | ``` 61 | 62 | ## Source: lazy query [?? x 2] 63 | ## Database: spark_connection 64 | 65 | ## Error in names(select)[match(old_vars, vars)] <- new_vars: NAs are not allowed in subscripted assignments 66 | 67 | ``` r 68 | spark_disconnect(sc) 69 | rm(list=ls()) 70 | gc(verbose = FALSE) 71 | ``` 72 | 73 | ## used (Mb) gc trigger (Mb) max used (Mb) 74 | ## Ncells 673122 36.0 1168576 62.5 940480 50.3 75 | ## Vcells 1157466 8.9 2060183 15.8 1364787 10.5 76 | -------------------------------------------------------------------------------- /issues/TrailingRefIssue.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | ### Altering captured reference damages Spark results. 8 | 9 | If you use a variable in `dplyr::mutate()` against a `sparklyr` data source, the lazy eval captures references to user variables. Changing values of those variables implicitly changes the `mutate` and changes the values seen in the `sparklyr` result (which is itself a query). This can be worked around by dropping in `dplyr::compute()`, but it seems like this behavior can produce a lot of incorrect calculations. Below is a small example and a lot of information on the versions of everything being run. I am assuming this is a `sparklyr` issue, as the query views are fairly different from a number of other `dplyr` structures, but it could be a `dplyr` issue. 10 | 11 | 12 | 13 | ```{r, echo = FALSE} 14 | knitr::opts_chunk$set( 15 | collapse = TRUE, 16 | comment = " # " 17 | ) 18 | options(width =100) 19 | ``` 20 | 21 | 22 | OSX 10.11.6. 23 | Spark installed as described at http://spark.rstudio.com 24 | 25 | ``` 26 | library('sparklyr') 27 | spark_install(version = "2.0.0") 28 | ``` 29 | 30 | ```{r setup} 31 | library('dplyr') 32 | library('sparklyr') 33 | R.Version()$version.string 34 | packageVersion('dplyr') 35 | packageVersion('sparklyr') 36 | my_db <- sparklyr::spark_connect(version='2.0.0', master = "local") 37 | class(my_db) 38 | my_db$spark_home 39 | print(my_db) 40 | ``` 41 | 42 | * Expected outcome: `s1` has the same value 43 | * Observed outcome: changing variable `v` changes the `s1` column.
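For reference, here is a sketch of the likely work-around mentioned above (it uses the `support` table and variable `v` from the reproduction that follows): force the lazy query to materialize with `dplyr::compute()` before the captured variable changes.

```r
# sketch: materialize the query so later changes to v do not alter s1
s1 <- dplyr::mutate(support, count = v)
s1 <- dplyr::compute(s1)  # pins the current value of v into the result
v  <- ''                  # s1 should no longer silently change
```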
44 | 45 | ```{r issue} 46 | support <- copy_to(my_db, 47 | data.frame(year=2005:2010), 48 | 'support') 49 | v <- 0 50 | s1 <- dplyr::mutate(support,count=v) 51 | 52 | print(s1) # print 1 53 | 54 | # s1 <- dplyr::compute(s1) # likely work-around 55 | v <- '' 56 | 57 | print(s1) # print 2 58 | ``` 59 | 60 | Notice `s1` changed its value (likely due to lazy evaluation and having captured a reference to `v`). 61 | 62 | 63 | Submitted as [sparklyr issue 503](https://github.com/rstudio/sparklyr/issues/503) and [dplyr issue 2455](https://github.com/hadley/dplyr/issues/2455). Reported fixed in dev ([dplyr issue 2370](https://github.com/hadley/dplyr/issues/2370)). 64 | 65 | ```{r printversion} 66 | version 67 | ``` 68 | 69 | 70 | -------------------------------------------------------------------------------- /issues/UnionIssue.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | ### Union order issue 8 | 9 | 10 | 11 | 12 | ```{r, echo = FALSE} 13 | knitr::opts_chunk$set( 14 | collapse = TRUE, 15 | comment = " # " 16 | ) 17 | options(width =100) 18 | ``` 19 | 20 | 21 | OSX 10.11.6. 22 | Spark installed as described at http://spark.rstudio.com 23 | 24 | ``` 25 | library('sparklyr') 26 | spark_install(version = "2.0.0") 27 | ``` 28 | 29 | ```{r setup} 30 | library('dplyr') 31 | library('sparklyr') 32 | R.Version()$version.string 33 | packageVersion('dplyr') 34 | packageVersion('sparklyr') 35 | my_db <- sparklyr::spark_connect(version='2.0.0', master = "local") 36 | class(my_db) 37 | my_db$spark_home 38 | print(my_db) 39 | ``` 40 | 41 | * Expected outcome: dplyr::union and dplyr::union_all should match columns. 42 | * Observed outcome: matches columns on local data frames, matches positions on Spark 2.0.0. 43 | 44 | ```{r issue} 45 | d1 <- data.frame(year=2005:2010, 46 | name='a', 47 | stringsAsFactors = FALSE) 48 | d2 <- data.frame(name='b', 49 | year=2005:2010, 50 | stringsAsFactors = FALSE) 51 | 52 | # local frames: uses names on union 53 | dplyr::union(d1, d2) 54 | dplyr::union_all(d1, d2) 55 | 56 | 57 | s1 <- copy_to(my_db, d1, 's1') 58 | s2 <- copy_to(my_db, d2, 's2') 59 | 60 | # remote frames: uses position, co-mingling different types 61 | dplyr::union(s1,s2) 62 | dplyr::union_all(s1,s2) 63 | ``` 64 | 65 | Submitted as [sparklyr issue 507](https://github.com/rstudio/sparklyr/issues/507). 66 | 67 | ```{r printversion} 68 | version 69 | ``` 70 | 71 | 72 | -------------------------------------------------------------------------------- /issues/arrangecompute.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | Check durability of `dplyr::arrange` through `dplyr::compute`. 8 | 9 | 10 | 11 | ```{r, echo = FALSE} 12 | knitr::opts_chunk$set( 13 | collapse = TRUE, 14 | comment = " # " 15 | ) 16 | options(width =100) 17 | ``` 18 | 19 | 20 | ```{r ready} 21 | library('dplyr') 22 | library('RPostgreSQL') 23 | packageVersion('dplyr') 24 | packageVersion('RPostgreSQL') 25 | my_db <- dplyr::src_postgres(host = 'localhost',port = 5432,user = 'postgres',password = 'pg') 26 | class(my_db) 27 | set.seed(32525) 28 | dz <- dplyr::copy_to(my_db,data.frame(x=runif(1000)),'dz99',overwrite=TRUE) 29 | ``` 30 | 31 | Notice below: no warnings in frame or runtime.
32 | 33 | ```{r direct} 34 | dz %>% arrange(x) %>% mutate(ccol=1) %>% mutate(rank=cumsum(ccol)) -> dz1 35 | print(dz1) 36 | warnings() 37 | ``` 38 | 39 | Notice below: warning "Warning: Windowed expression 'sum("ccol")' does not have explicit order.". Result may appear the same, but we do not seem to be able to depend on that. 40 | 41 | ```{r compute} 42 | dz %>% arrange(x) %>% compute() %>% mutate(ccol=1) %>% mutate(rank=cumsum(ccol)) -> dz2 43 | print(dz2) 44 | warnings() 45 | ``` 46 | 47 | Notice below: warning "Warning: Windowed expression 'sum("ccol")' does not have explicit order.". Result may appear the same, but we do not seem to be able to depend on that. 48 | 49 | ```{r collapse} 50 | dz %>% arrange(x) %>% collapse() %>% mutate(ccol=1) %>% mutate(rank=cumsum(ccol)) -> dz3 51 | print(dz3) 52 | warnings() 53 | ``` 54 | 55 | Submitted as [dplyr issue 2281](https://github.com/hadley/dplyr/issues/2281). 56 | 57 | 58 | ```{r printversion} 59 | version 60 | ``` -------------------------------------------------------------------------------- /issues/copyIssueMySQL.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | Copy issue with `MySQL`. 8 | 9 | 10 | 11 | ```{r, echo = FALSE} 12 | knitr::opts_chunk$set( 13 | collapse = TRUE, 14 | comment = " # " 15 | ) 16 | options(width =100) 17 | ``` 18 | 19 | `MySQL` doesn't obey `overwrite=TRUE`, but since that is in the `...` region it is hard to say what correct behavior would be. `replyr` already works around it, this is just to explain why we take the trouble. 20 | 21 | ```{r copyissueover, error=TRUE} 22 | library('dplyr') 23 | library('RMySQL') 24 | packageVersion('dplyr') 25 | packageVersion('RMySQL') 26 | my_db <- dplyr::src_sqlite("replyr_sqliteEx.sqlite3", create = TRUE) 27 | d <- dplyr::copy_to(my_db,data.frame(x=c(1,2)),'d',overwrite=TRUE) 28 | d <- dplyr::copy_to(my_db,data.frame(x=c(1,2)),'d',overwrite=TRUE) 29 | ``` 30 | 31 | 32 | ```{r printversion} 33 | version 34 | ``` -------------------------------------------------------------------------------- /issues/copyIssueMySQL.md: -------------------------------------------------------------------------------- 1 | Copy issue with `MySQL`. 2 | 3 | 4 | `MySQL` doesn't obey `overwrite=TRUE`, but since that is in the `...` region it is hard to say what correct behavior would be. `replyr` already works around it, this is just to explain why we take the trouble. 5 | 6 | ``` r 7 | library('dplyr') 8 | # 9 | # Attaching package: 'dplyr' 10 | # The following objects are masked from 'package:stats': 11 | # 12 | # filter, lag 13 | # The following objects are masked from 'package:base': 14 | # 15 | # intersect, setdiff, setequal, union 16 | library('RMySQL') 17 | # Loading required package: DBI 18 | packageVersion('dplyr') 19 | # [1] '0.5.0' 20 | packageVersion('RMySQL') 21 | # [1] '0.10.9' 22 | my_db <- dplyr::src_sqlite("replyr_sqliteEx.sqlite3", create = TRUE) 23 | d <- dplyr::copy_to(my_db,data.frame(x=c(1,2)),'d',overwrite=TRUE) 24 | d <- dplyr::copy_to(my_db,data.frame(x=c(1,2)),'d',overwrite=TRUE) 25 | # Error: Table d already exists. 
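 # A possible work-around (sketch, not run here): drop the existing table first,
 # for example dplyr::db_drop_table(my_db$con, 'd'), and then repeat the
 # dplyr::copy_to() call; this is the sort of shim replyr supplies for the user.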
26 | ``` 27 | 28 | ``` r 29 | version 30 | # _ 31 | # platform x86_64-apple-darwin13.4.0 32 | # arch x86_64 33 | # os darwin13.4.0 34 | # system x86_64, darwin13.4.0 35 | # status 36 | # major 3 37 | # minor 3.2 38 | # year 2016 39 | # month 10 40 | # day 31 41 | # svn rev 71607 42 | # language R 43 | # version.string R version 3.3.2 (2016-10-31) 44 | # nickname Sincere Pumpkin Patch 45 | ``` 46 | -------------------------------------------------------------------------------- /issues/copyissue162.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | Copy issue with `sparklyr` 1.6.2. 8 | 9 | 10 | 11 | ```{r, echo = FALSE} 12 | knitr::opts_chunk$set( 13 | collapse = TRUE, 14 | comment = " # " 15 | ) 16 | options(width =100) 17 | ``` 18 | 19 | 20 | Below is why we use a new column name in joins. 21 | 22 | OSX 10.11.6. 23 | Spark installed as described at http://spark.rstudio.com 24 | 25 | ``` 26 | library('sparklyr') 27 | spark_install(version = "1.6.2") 28 | ``` 29 | 30 | 31 | ```{r issue1.6.2, error=TRUE} 32 | library('dplyr') 33 | library('sparklyr') 34 | R.Version()$version.string 35 | packageVersion('dplyr') 36 | packageVersion('sparklyr') 37 | my_db <- sparklyr::spark_connect(version='1.6.2', master = "local") 38 | class(my_db) 39 | my_db$spark_home 40 | print(my_db) 41 | d1 <- copy_to(my_db,data.frame(x=c(1,2),y=c('a','b')),'d1') 42 | d2 <- copy_to(my_db,data.frame(y=c('a','b'),z=c(3,4)),'d2') 43 | d1 %>% dplyr::inner_join(d2,by='y') 44 | ``` 45 | 46 | Submitted as [sparklyr issue 338](https://github.com/rstudio/sparklyr/issues/338). 47 | 48 | ```{r printversion} 49 | version 50 | ``` 51 | 52 | -------------------------------------------------------------------------------- /issues/copyissue200.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | Copy issue with `sparklyr` 2.0.0. 8 | 9 | 10 | 11 | ```{r, echo = FALSE} 12 | knitr::opts_chunk$set( 13 | collapse = TRUE, 14 | comment = " # " 15 | ) 16 | options(width =100) 17 | ``` 18 | 19 | Below is why we re-try joins against local data without using the 20 | `copy=TRUE` feature. 21 | 22 | OSX 10.11.6. 23 | Spark installed as described at http://spark.rstudio.com 24 | 25 | ``` 26 | library('sparklyr') 27 | spark_install(version = "2.0.0") 28 | ``` 29 | 30 | ```{r issue2.0.0, error=TRUE} 31 | library('dplyr') 32 | library('sparklyr') 33 | R.Version()$version.string 34 | packageVersion('dplyr') 35 | packageVersion('sparklyr') 36 | my_db <- sparklyr::spark_connect(version='2.0.0', master = "local") 37 | class(my_db) 38 | my_db$spark_home 39 | print(my_db) 40 | d1 <- copy_to(my_db,data.frame(x=c(1,2),y=c('a','b')),'d1') 41 | d2 <- data.frame(y=c('a','b'),z=c(3,4)) 42 | d1 %>% dplyr::inner_join(d2,by='y',copy=TRUE) 43 | ``` 44 | 45 | Submitted as [sparklyr issue 339](https://github.com/rstudio/sparklyr/issues/339). 46 | 47 | ```{r printversion} 48 | version 49 | ``` 50 | 51 | 52 | -------------------------------------------------------------------------------- /issues/factorissue.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | Factor with R data.frame. 
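A possible work-around for the failure demonstrated below (a sketch only; `summarise_each`/`funs` as in `dplyr` `0.5`) is to convert factor columns to character before summarizing:

```r
# sketch: convert factors to character so min()/max() are defined locally
library('dplyr')
d2 <- data.frame(y = c('a', 'b'), stringsAsFactors = TRUE)
d2 %>%
  mutate(y = as.character(y)) %>%
  summarise_each(funs(lexmin = min, lexmax = max))
```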
8 | 9 | 10 | 11 | ```{r, echo = FALSE} 12 | knitr::opts_chunk$set( 13 | collapse = TRUE, 14 | comment = " # " 15 | ) 16 | options(width =100) 17 | ``` 18 | 19 | 20 | Some issues with `summarize_each` and factors. 21 | 22 | 23 | ```{r factorissue, error=TRUE} 24 | library('dplyr') 25 | R.Version()$version.string 26 | packageVersion('dplyr') 27 | d1 <- data.frame(y=c('a','b'),stringsAsFactors = FALSE) 28 | d1 %>% dplyr::summarise_each(dplyr::funs(lexmin = min,lexmax = max)) 29 | d2 <- data.frame(y=c('a','b'),stringsAsFactors = TRUE) 30 | d2 %>% dplyr::summarise_each(dplyr::funs(lexmin = min,lexmax = max)) 31 | ``` 32 | 33 | Submitted as [dplyr issue 2269](https://github.com/hadley/dplyr/issues/2269). Closed as "expected behavior" as this is what `min(factor(letters))` does. That is a correct determination, but be aware many `dplyr` backends do support comparison, min, and max on character types. 34 | 35 | ```{r factorissuedb, error=TRUE} 36 | my_db <- dplyr::src_sqlite("replyr_sqliteEx.sqlite3", create = TRUE) 37 | dplyr::copy_to(dest=my_db,df=d1,name='d1',overwrite=TRUE) %>% 38 | dplyr::summarise_each(dplyr::funs(lexmin = min,lexmax = max)) 39 | dplyr::copy_to(dest=my_db,df=d2,name='d2',overwrite=TRUE) %>% 40 | dplyr::summarise_each(dplyr::funs(lexmin = min,lexmax = max)) 41 | ``` 42 | 43 | 44 | ```{r printversion} 45 | version 46 | ``` 47 | 48 | -------------------------------------------------------------------------------- /issues/factorissue.md: -------------------------------------------------------------------------------- 1 | Factor with R data.frame. 2 | 3 | 4 | Some issues with `summarize_each` and factors. 5 | 6 | ``` r 7 | library('dplyr') 8 | # 9 | # Attaching package: 'dplyr' 10 | # The following objects are masked from 'package:stats': 11 | # 12 | # filter, lag 13 | # The following objects are masked from 'package:base': 14 | # 15 | # intersect, setdiff, setequal, union 16 | R.Version()$version.string 17 | # [1] "R version 3.3.2 (2016-10-31)" 18 | packageVersion('dplyr') 19 | # [1] '0.5.0' 20 | d1 <- data.frame(y=c('a','b'),stringsAsFactors = FALSE) 21 | d1 %>% dplyr::summarise_each(dplyr::funs(lexmin = min,lexmax = max)) 22 | # lexmin lexmax 23 | # 1 a b 24 | d2 <- data.frame(y=c('a','b'),stringsAsFactors = TRUE) 25 | d2 %>% dplyr::summarise_each(dplyr::funs(lexmin = min,lexmax = max)) 26 | # Error in eval(expr, envir, enclos): 'min' not meaningful for factors 27 | ``` 28 | 29 | Submitted as [dplyr issue 2269](https://github.com/hadley/dplyr/issues/2269). Closed as "expected behavior" as this is what `min(factor(letters))` does. That is a correct determination, but be aware many `dplyr` backends do support comparison, min, and max on character types. 30 | 31 | ``` r 32 | my_db <- dplyr::src_sqlite("replyr_sqliteEx.sqlite3", create = TRUE) 33 | dplyr::copy_to(dest=my_db,df=d1,name='d1',overwrite=TRUE) %>% 34 | dplyr::summarise_each(dplyr::funs(lexmin = min,lexmax = max)) 35 | # Source: query [?? x 2] 36 | # Database: sqlite 3.8.6 [replyr_sqliteEx.sqlite3] 37 | # 38 | # lexmin lexmax 39 | # 40 | # 1 a b 41 | dplyr::copy_to(dest=my_db,df=d2,name='d2',overwrite=TRUE) %>% 42 | dplyr::summarise_each(dplyr::funs(lexmin = min,lexmax = max)) 43 | # Source: query [??
x 2] 44 | # Database: sqlite 3.8.6 [replyr_sqliteEx.sqlite3] 45 | # 46 | # lexmin lexmax 47 | # 48 | # 1 a b 49 | ``` 50 | 51 | ``` r 52 | version 53 | # _ 54 | # platform x86_64-apple-darwin13.4.0 55 | # arch x86_64 56 | # os darwin13.4.0 57 | # system x86_64, darwin13.4.0 58 | # status 59 | # major 3 60 | # minor 3.2 61 | # year 2016 62 | # month 10 63 | # day 31 64 | # svn rev 71607 65 | # language R 66 | # version.string R version 3.3.2 (2016-10-31) 67 | # nickname Sincere Pumpkin Patch 68 | ``` 69 | -------------------------------------------------------------------------------- /issues/union_all_issue.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | `union_all` issue with `SQLite`. Submitted as [dplyr issue 2270](https://github.com/hadley/dplyr/issues/2270). 8 | 9 | 10 | 11 | ```{r, echo = FALSE} 12 | knitr::opts_chunk$set( 13 | collapse = TRUE, 14 | comment = " # " 15 | ) 16 | options(width =100) 17 | ``` 18 | 19 | ```{r unionx, error=TRUE} 20 | suppressPackageStartupMessages(library('dplyr')) 21 | packageVersion('dplyr') 22 | packageVersion('dbplyr') 23 | my_db <- dplyr::src_sqlite(":memory:", create = TRUE) 24 | dr <- dplyr::copy_to(my_db, 25 | data.frame(x=c(1,2), y=c('a','b'), 26 | stringsAsFactors = FALSE), 27 | 'dr', 28 | overwrite=TRUE) 29 | dr <- head(dr,1) 30 | # dr <- compute(dr) 31 | print(dr) 32 | print(dplyr::union_all(dr,dr)) 33 | ``` 34 | 35 | Filed as [RSQLite 215](https://github.com/rstats-db/RSQLite/issues/215) and [dplyr 2858](https://github.com/tidyverse/dplyr/issues/2858). 36 | 37 | ```{r cleanup} 38 | rm(list=ls()) 39 | gc() 40 | ``` 41 | 42 | Note that calling `compute` doesn't always fix the problem in my more complicated production example. 43 | Also, `union` does not seem to have the same issue as `union_all`. It also seems like nested function calls exacerbate the issue; perhaps a reference to a necessary structure goes out of scope and allows sub-table collection too soon. To trigger the full error in `replyr`, force use of `union_all` in `replyr_bind_rows` and then try knitting `basicChecksSpark200.Rmd`.
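As a cross-reference, the work-around alluded to above lives in `replyr_bind_rows()`, which can bind rows without going through `dplyr::union_all()`. A minimal sketch (the argument name is per the package manual page for `replyr_bind_rows`):

```r
# sketch: bind remote rows while avoiding the union_all code path
res <- replyr::replyr_bind_rows(list(dr, dr), useUnionALL = FALSE)
```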
44 | 45 | The following now works: 46 | 47 | ```{r unionxs200, error=TRUE} 48 | suppressPackageStartupMessages(library('dplyr')) 49 | suppressPackageStartupMessages(library('sparklyr')) 50 | packageVersion('dplyr') 51 | packageVersion('dbplyr') 52 | packageVersion('sparklyr') 53 | my_db <- sparklyr::spark_connect(version='2.0.0', 54 | master = "local") 55 | class(my_db) 56 | my_db$spark_home 57 | da <- dplyr::copy_to(my_db, 58 | data.frame(x=c(1,2),y=c('a','b'), 59 | stringsAsFactors = FALSE), 60 | 'da', 61 | overwrite=TRUE) 62 | da <- head(da,1) 63 | print(da) 64 | db <- dplyr::copy_to(my_db, 65 | data.frame(x=c(3,4),y=c('c','d'), 66 | stringsAsFactors = FALSE), 67 | 'db', 68 | overwrite=TRUE) 69 | db <- head(db,1) 70 | #da <- compute(da) 71 | db <- compute(db) 72 | print(db) 73 | res <- dplyr::union_all(da,db) 74 | res <- dplyr::compute(res) 75 | print(res) 76 | print(da) 77 | print(db) 78 | ``` 79 | 80 | ```{r cleanup2} 81 | rm(list=ls()) 82 | gc() 83 | ``` 84 | 85 | ```{r printversion} 86 | version 87 | ``` 88 | -------------------------------------------------------------------------------- /man/addConstantColumn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/addCol.R 3 | \name{addConstantColumn} 4 | \alias{addConstantColumn} 5 | \title{Add constant to a table.} 6 | \usage{ 7 | addConstantColumn( 8 | d, 9 | colName, 10 | val, 11 | ..., 12 | tempNameGenerator = mk_tmp_name_source("replyr_addConstantColumn") 13 | ) 14 | } 15 | \arguments{ 16 | \item{d}{data.frame like object to add column to.} 17 | 18 | \item{colName}{character, name of column to add.} 19 | 20 | \item{val}{scalar, value to add.} 21 | 22 | \item{...}{force later arguments to be bound by name.} 23 | 24 | \item{tempNameGenerator}{temp name generator produced by wrapr::mk_tmp_name_source, used to record dplyr::compute() effects.} 25 | } 26 | \value{ 27 | table with new column added. 28 | } 29 | \description{ 30 | Work around different treatment of character types across remote 31 | data sources when adding a 32 | constant column to a table. Deals with issues such as Postgresql 33 | requiring a character-cast and MySQL not allowing such. 34 | } 35 | \examples{ 36 | 37 | d <- data.frame(x= c(1:3)) 38 | addConstantColumn(d, 'newCol', 'newVal') 39 | 40 | } 41 | -------------------------------------------------------------------------------- /man/buildJoinPlan.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/joinController.R 3 | \name{buildJoinPlan} 4 | \alias{buildJoinPlan} 5 | \title{Build a join plan} 6 | \usage{ 7 | buildJoinPlan(tDesc, ..., check = TRUE) 8 | } 9 | \arguments{ 10 | \item{tDesc}{description of tables from \code{\link{tableDescription}} (and likely altered by user). Note: no column names must intersect with names of the form \code{table_CLEANEDTABNAME_present}.} 11 | 12 | \item{...}{force later arguments to bind by name.} 13 | 14 | \item{check}{logical, if TRUE check the join plan for consistency.} 15 | } 16 | \value{ 17 | detailed column join plan (appropriate for editing) 18 | } 19 | \description{ 20 | Please see \code{vignette('DependencySorting', package = 'replyr')} and \code{vignette('joinController', package= 'replyr')} for more details. 
21 | } 22 | \examples{ 23 | 24 | d <- data.frame(id=1:3, weight= c(200, 140, 98)) 25 | tDesc <- rbind(tableDescription('d1', d), 26 | tableDescription('d2', d)) 27 | tDesc$keys[[1]] <- list(PrimaryKey= 'id') 28 | tDesc$keys[[2]] <- list(PrimaryKey= 'id') 29 | buildJoinPlan(tDesc) 30 | 31 | } 32 | \seealso{ 33 | \code{\link{tableDescription}}, \code{\link{inspectDescrAndJoinPlan}}, \code{\link{makeJoinDiagramSpec}}, \code{\link{executeLeftJoinPlan}} 34 | } 35 | -------------------------------------------------------------------------------- /man/dplyr_src_to_db_handle.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/serviceName.R 3 | \name{dplyr_src_to_db_handle} 4 | \alias{dplyr_src_to_db_handle} 5 | \title{Obsolete with dplyr 0.7.0 and forward} 6 | \usage{ 7 | dplyr_src_to_db_handle(dplyr_src) 8 | } 9 | \arguments{ 10 | \item{dplyr_src}{remote data handle} 11 | } 12 | \value{ 13 | dplyr_src 14 | } 15 | \description{ 16 | Obsolete with dplyr 0.7.0 and forward 17 | } 18 | -------------------------------------------------------------------------------- /man/example_employeeAndDate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/joinController.R 3 | \name{example_employeeAndDate} 4 | \alias{example_employeeAndDate} 5 | \title{build some example tables} 6 | \usage{ 7 | example_employeeAndDate(con) 8 | } 9 | \arguments{ 10 | \item{con}{db connection} 11 | } 12 | \value{ 13 | example tables 14 | } 15 | \description{ 16 | build some example tables 17 | } 18 | -------------------------------------------------------------------------------- /man/executeLeftJoinPlan.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/joinController.R 3 | \name{executeLeftJoinPlan} 4 | \alias{executeLeftJoinPlan} 5 | \title{Execute an ordered sequence of left joins.} 6 | \usage{ 7 | executeLeftJoinPlan( 8 | tDesc, 9 | columnJoinPlan, 10 | ..., 11 | checkColumns = FALSE, 12 | computeFn = function(x, name) { dplyr::compute(x, name = name) }, 13 | eagerCompute = TRUE, 14 | checkColClasses = FALSE, 15 | verbose = FALSE, 16 | dryRun = FALSE, 17 | tempNameGenerator = mk_tmp_name_source("executeLeftJoinPlan") 18 | ) 19 | } 20 | \arguments{ 21 | \item{tDesc}{description of tables, either a \code{data.frame} from \code{\link{tableDescription}}, or a list mapping from names to handles/frames. Only used to map table names to data.} 22 | 23 | \item{columnJoinPlan}{columns to join, from \code{\link{buildJoinPlan}} (and likely altered by user). 
Note: no column names must intersect with names of the form \code{table_CLEANEDTABNAME_present}.} 24 | 25 | \item{...}{force later arguments to bind by name.} 26 | 27 | \item{checkColumns}{logical if TRUE confirm column names before starting joins.} 28 | 29 | \item{computeFn}{function to call to try and materialize intermediate results.} 30 | 31 | \item{eagerCompute}{logical if TRUE materialize intermediate results with computeFn.} 32 | 33 | \item{checkColClasses}{logical if true check for exact class name matches} 34 | 35 | \item{verbose}{logical if TRUE print more.} 36 | 37 | \item{dryRun}{logical if TRUE do not perform joins, only print steps.} 38 | 39 | \item{tempNameGenerator}{temp name generator produced by wrapr::mk_tmp_name_source, used to record dplyr::compute() effects.} 40 | } 41 | \value{ 42 | joined table 43 | } 44 | \description{ 45 | Please see \code{vignette('DependencySorting', package = 'replyr')} and \code{vignette('joinController', package= 'replyr')} for more details. 46 | } 47 | \examples{ 48 | 49 | 50 | # example data 51 | meas1 <- data.frame(id= c(1,2), 52 | weight= c(200, 120), 53 | height= c(60, 14)) 54 | meas2 <- data.frame(pid= c(2,3), 55 | weight= c(105, 110), 56 | width= 1) 57 | # get the initial description of table defs 58 | tDesc <- rbind(tableDescription('meas1', meas1), 59 | tableDescription('meas2', meas2)) 60 | # declare keys (and give them consitent names) 61 | tDesc$keys[[1]] <- list(PatientID= 'id') 62 | tDesc$keys[[2]] <- list(PatientID= 'pid') 63 | # build the column join plan 64 | columnJoinPlan <- buildJoinPlan(tDesc) 65 | # decide we don't want the width column 66 | columnJoinPlan$want[columnJoinPlan$resultColumn=='width'] <- FALSE 67 | # double check our plan 68 | if(!is.null(inspectDescrAndJoinPlan(tDesc, columnJoinPlan, 69 | checkColClasses= TRUE))) { 70 | stop("bad join plan") 71 | } 72 | # execute the left joins 73 | executeLeftJoinPlan(tDesc, columnJoinPlan, 74 | checkColClasses= TRUE, 75 | verbose= TRUE) 76 | # also good 77 | executeLeftJoinPlan(list('meas1'=meas1, 'meas2'=meas2), 78 | columnJoinPlan, 79 | checkColClasses= TRUE, 80 | verbose= TRUE) 81 | 82 | } 83 | \seealso{ 84 | \code{\link{tableDescription}}, \code{\link{buildJoinPlan}}, \code{\link{inspectDescrAndJoinPlan}}, \code{\link{makeJoinDiagramSpec}} 85 | } 86 | -------------------------------------------------------------------------------- /man/expandColumn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/expandColumn.R 3 | \name{expandColumn} 4 | \alias{expandColumn} 5 | \title{Expand a column of vectors into one row per value of each vector.} 6 | \usage{ 7 | expandColumn( 8 | data, 9 | colName, 10 | ..., 11 | rowidSource = NULL, 12 | rowidDest = NULL, 13 | idxDest = NULL, 14 | tempNameGenerator = mk_tmp_name_source("replyr_expandColumn") 15 | ) 16 | } 17 | \arguments{ 18 | \item{data}{data.frame to work with.} 19 | 20 | \item{colName}{character name of column to expand.} 21 | 22 | \item{...}{force later arguments to be bound by name} 23 | 24 | \item{rowidSource}{optional character name of column to take row indices from (rowidDest must be NULL to use this).} 25 | 26 | \item{rowidDest}{optional character name of column to write row indices to (must not be an existing column name, rowidSource must be NULL to use this).} 27 | 28 | \item{idxDest}{optional character name of column to write value indices to (must not be an existing column name).} 29 | 30 | 
\item{tempNameGenerator}{temp name generator produced by wrapr::mk_tmp_name_source, used to record dplyr::compute() effects.} 31 | } 32 | \value{ 33 | expanded data frame where each value of colName column is in a new row. 34 | } 35 | \description{ 36 | Similar to \code{tidyr::unnest} but lands rowids and value ids, and can work on remote data sources. Fairly expensive per-row operation, not suitable for big data. 37 | } 38 | \examples{ 39 | 40 | 41 | d <- data.frame(name= c('a','b')) 42 | d$value <- list(c('x','y'),'z') 43 | expandColumn(d, 'value', 44 | rowidDest= 'origRowId', 45 | idxDest= 'valueIndex') 46 | 47 | 48 | } 49 | -------------------------------------------------------------------------------- /man/gapply.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/groupedApply.R 3 | \name{gapply} 4 | \alias{gapply} 5 | \title{grouped ordered apply} 6 | \usage{ 7 | gapply( 8 | df, 9 | gcolumn, 10 | f, 11 | ..., 12 | ocolumn = NULL, 13 | decreasing = FALSE, 14 | partitionMethod = "split", 15 | bindrows = TRUE, 16 | maxgroups = 100, 17 | eagerCompute = FALSE, 18 | restoreGroup = FALSE, 19 | tempNameGenerator = mk_tmp_name_source("replyr_gapply") 20 | ) 21 | } 22 | \arguments{ 23 | \item{df}{remote dplyr data item} 24 | 25 | \item{gcolumn}{grouping column} 26 | 27 | \item{f}{transform function or pipeline} 28 | 29 | \item{...}{force later values to be bound by name} 30 | 31 | \item{ocolumn}{ordering column (optional)} 32 | 33 | \item{decreasing}{logical, if TRUE sort in decreasing order by ocolumn} 34 | 35 | \item{partitionMethod}{method to partition the data, one of 'group_by' (depends on f being dplyr compatible), 'split' (only works over local data frames), or 'extract'} 36 | 37 | \item{bindrows}{logical, if TRUE bind the rows back into a data item, else return split list} 38 | 39 | \item{maxgroups}{maximum number of groups to work over (intentionally not enforced if \code{partitionMethod=='group_by'})} 40 | 41 | \item{eagerCompute}{logical, if TRUE call compute on split results} 42 | 43 | \item{restoreGroup}{logical, if TRUE restore group column after apply when \code{partitionMethod \%in\% c('extract', 'split')}} 44 | 45 | \item{tempNameGenerator}{temp name generator produced by \code{wrapr::mk_tmp_name_source}, used to record \code{dplyr::compute()} effects.} 46 | } 47 | \value{ 48 | transformed frame 49 | } 50 | \description{ 51 | Partitions from by values in grouping column, applies a generic transform 52 | to each group and then binds the groups back together. Only advised for a 53 | moderate number of groups and better if grouping column is an index. 54 | This is powerful 55 | enough to implement "The Split-Apply-Combine Strategy for Data Analysis" 56 | https://www.jstatsoft.org/article/view/v040i01 57 | } 58 | \details{ 59 | Note this is a fairly expensive operator, so it only makes sense to use 60 | in situations where \code{f} itself is fairly complicated and/or expensive. 61 | } 62 | \examples{ 63 | 64 | d <- data.frame( 65 | group = c(1, 1, 2, 2, 2), 66 | order = c(.1, .2, .3, .4, .5), 67 | values = c(10, 20, 2, 4, 8) 68 | ) 69 | 70 | # User supplied window functions. They depend on known column names and 71 | # the data back-end matching function names (as cumsum). 
72 | cumulative_sum <- function(d) { 73 | dplyr::mutate(d, cv = cumsum(values)) 74 | } 75 | rank_in_group <- function(d) { 76 | d \%.>\% 77 | dplyr::mutate(., constcol = 1) \%.>\% 78 | dplyr::mutate(., rank = cumsum(constcol)) \%.>\% 79 | dplyr::select(., -constcol) 80 | } 81 | 82 | for (partitionMethod in c('group_by', 'split', 'extract')) { 83 | print(partitionMethod) 84 | print('cumulative sum example') 85 | print( 86 | gapply( 87 | d, 88 | 'group', 89 | cumulative_sum, 90 | ocolumn = 'order', 91 | partitionMethod = partitionMethod 92 | ) 93 | ) 94 | print('ranking example') 95 | print( 96 | gapply( 97 | d, 98 | 'group', 99 | rank_in_group, 100 | ocolumn = 'order', 101 | partitionMethod = partitionMethod 102 | ) 103 | ) 104 | print('ranking example (decreasing)') 105 | print( 106 | gapply( 107 | d, 108 | 'group', 109 | rank_in_group, 110 | ocolumn = 'order', 111 | decreasing = TRUE, 112 | partitionMethod = partitionMethod 113 | ) 114 | ) 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /man/grapes-land-grapes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/land.R 3 | \name{\%land\%} 4 | \alias{\%land\%} 5 | \alias{\%->\%} 6 | \alias{\%->_\%} 7 | \alias{\%land_\%} 8 | \title{Land a value to variable from a pipeline.} 9 | \usage{ 10 | value \%land\% name 11 | 12 | value \%->\% name 13 | 14 | value \%->_\% name 15 | 16 | value \%land_\% name 17 | } 18 | \arguments{ 19 | \item{value}{value to write} 20 | 21 | \item{name}{variable to write to} 22 | } 23 | \value{ 24 | value 25 | } 26 | \description{ 27 | \%land\% and \%->\% ("writearrow") copy a pipeline value to a variable on the 28 | right hand side. 29 | \%land_\% and \%->_\% copy a pipeline value to 30 | a variable named by the value referenced by its right hand side argument. 31 | } 32 | \details{ 33 | Technically these operators are 34 | not "-> assignment", so they might not be specifically prohibited in an 35 | oppugnant reading of some style guides. 36 | } 37 | \examples{ 38 | 39 | sin(7) \%->\% z1 40 | sin(7) \%->_\% 'z2' 41 | varname <- 'z3' 42 | sin(7) \%->_\% varname 43 | 44 | } 45 | -------------------------------------------------------------------------------- /man/inspectDescrAndJoinPlan.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/joinController.R 3 | \name{inspectDescrAndJoinPlan} 4 | \alias{inspectDescrAndJoinPlan} 5 | \title{check that a join plan is consistent with table descriptions} 6 | \usage{ 7 | inspectDescrAndJoinPlan(tDesc, columnJoinPlan, ..., checkColClasses = FALSE) 8 | } 9 | \arguments{ 10 | \item{tDesc}{description of tables, from \code{\link{tableDescription}} (and likely altered by user).} 11 | 12 | \item{columnJoinPlan}{columns to join, from \code{\link{buildJoinPlan}} (and likely altered by user). Note: no column names must intersect with names of the form \code{table_CLEANEDTABNAME_present}.} 13 | 14 | \item{...}{force later arguments to bind by name.} 15 | 16 | \item{checkColClasses}{logical if true check for exact class name matches} 17 | } 18 | \value{ 19 | NULL if okay, else a string 20 | } 21 | \description{ 22 | Please see \code{vignette('DependencySorting', package = 'replyr')} and \code{vignette('joinController', package= 'replyr')} for more details. 
23 | } 24 | \examples{ 25 | 26 | # example data 27 | d1 <- data.frame(id= 1:3, 28 | weight= c(200, 140, 98), 29 | height= c(60, 24, 12)) 30 | d2 <- data.frame(pid= 2:3, 31 | weight= c(130, 110), 32 | width= 1) 33 | # get the initial description of table defs 34 | tDesc <- rbind(tableDescription('d1', d1), 35 | tableDescription('d2', d2)) 36 | # declare keys (and give them consistent names) 37 | tDesc$keys[[1]] <- list(PrimaryKey= 'id') 38 | tDesc$keys[[2]] <- list(PrimaryKey= 'pid') 39 | # build the join plan 40 | columnJoinPlan <- buildJoinPlan(tDesc) 41 | # confirm the plan 42 | inspectDescrAndJoinPlan(tDesc, columnJoinPlan, 43 | checkColClasses= TRUE) 44 | # damage the plan 45 | columnJoinPlan$sourceColumn[columnJoinPlan$sourceColumn=='width'] <- 'wd' 46 | # find a problem 47 | inspectDescrAndJoinPlan(tDesc, columnJoinPlan, 48 | checkColClasses= TRUE) 49 | 50 | } 51 | \seealso{ 52 | \code{\link{tableDescription}}, \code{\link{buildJoinPlan}}, \code{\link{makeJoinDiagramSpec}}, \code{\link{executeLeftJoinPlan}} 53 | } 54 | -------------------------------------------------------------------------------- /man/key_inspector_all_cols.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/joinController.R 3 | \name{key_inspector_all_cols} 4 | \alias{key_inspector_all_cols} 5 | \title{Return all columns as guess at preferred primary keys.} 6 | \usage{ 7 | key_inspector_all_cols(handle) 8 | } 9 | \arguments{ 10 | \item{handle}{data handle} 11 | } 12 | \value{ 13 | map of keys to keys 14 | } 15 | \description{ 16 | Return all columns as guess at preferred primary keys. 17 | } 18 | \examples{ 19 | 20 | d <- data.frame(x=1:3, y=NA) 21 | key_inspector_all_cols(d) 22 | 23 | } 24 | \seealso{ 25 | \code{tableDescription} 26 | } 27 | -------------------------------------------------------------------------------- /man/key_inspector_postgresql.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/joinController.R 3 | \name{key_inspector_postgresql} 4 | \alias{key_inspector_postgresql} 5 | \title{Return all primary key columns as guess at preferred primary keys for a PostgreSQL handle.} 6 | \usage{ 7 | key_inspector_postgresql(handle) 8 | } 9 | \arguments{ 10 | \item{handle}{data handle} 11 | } 12 | \value{ 13 | map of keys to keys 14 | } 15 | \description{ 16 | Return all primary key columns as guess at preferred primary keys for a PostgreSQL handle. 17 | } 18 | \seealso{ 19 | \code{tableDescription} 20 | } 21 | -------------------------------------------------------------------------------- /man/key_inspector_sqlite.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/joinController.R 3 | \name{key_inspector_sqlite} 4 | \alias{key_inspector_sqlite} 5 | \title{Return all primary key columns as guess at preferred primary keys for a SQLite handle.} 6 | \usage{ 7 | key_inspector_sqlite(handle) 8 | } 9 | \arguments{ 10 | \item{handle}{data handle} 11 | } 12 | \value{ 13 | map of keys to keys 14 | } 15 | \description{ 16 | Return all primary key columns as guess at preferred primary keys for a SQLite handle. 
17 | } 18 | \seealso{ 19 | \code{tableDescription} 20 | } 21 | -------------------------------------------------------------------------------- /man/keysAreUnique.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/joinController.R 3 | \name{keysAreUnique} 4 | \alias{keysAreUnique} 5 | \title{Check uniqueness of rows with respect to keys.} 6 | \usage{ 7 | keysAreUnique(tDesc) 8 | } 9 | \arguments{ 10 | \item{tDesc}{description of tables, from \code{\link{tableDescription}} (and likely altered by user).} 11 | } 12 | \value{ 13 | logical TRUE if keys are unique 14 | } 15 | \description{ 16 | Can be an expensive operation. 17 | } 18 | \examples{ 19 | 20 | d <- data.frame(x=c(1,1,2,2,3,3), y=c(1,2,1,2,1,2)) 21 | tDesc1 <- tableDescription('d1', d) 22 | tDesc2 <- tableDescription('d2', d) 23 | tDesc <- rbind(tDesc1, tDesc2) 24 | tDesc$keys[[2]] <- c(x='x') 25 | keysAreUnique(tDesc) 26 | 27 | } 28 | \seealso{ 29 | \code{\link{tableDescription}} 30 | } 31 | -------------------------------------------------------------------------------- /man/makeJoinDiagramSpec.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/joinController.R 3 | \name{makeJoinDiagramSpec} 4 | \alias{makeJoinDiagramSpec} 5 | \title{Build a drawable specification of the join diagram} 6 | \usage{ 7 | makeJoinDiagramSpec(columnJoinPlan, ..., groupByKeys = TRUE, graphOpts = NULL) 8 | } 9 | \arguments{ 10 | \item{columnJoinPlan}{join plan} 11 | 12 | \item{...}{force later arguments to bind by name} 13 | 14 | \item{groupByKeys}{logical if true build key-equivalent sub-graphs} 15 | 16 | \item{graphOpts}{options for graphViz} 17 | } 18 | \value{ 19 | grViz diagram spec 20 | } 21 | \description{ 22 | Please see \code{vignette('DependencySorting', package = 'replyr')} and \code{vignette('joinController', package= 'replyr')} for more details. 23 | } 24 | \examples{ 25 | 26 | 27 | if (requireNamespace("RSQLite", quietly = TRUE)) { 28 | # note: employeeanddate is likely built as a cross-product 29 | # join of an employee table and set of dates of interest 30 | # before getting to the join controller step. We call 31 | # such a table "row control" or "experimental design." 32 | my_db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") 33 | RSQLite::initExtension(my_db) 34 | tDesc <- example_employeeAndDate(my_db) 35 | # fix order by hand, please see replyr::topoSortTables for 36 | # how to automate this. 
37 | ord <- match(c('employeeanddate', 'orgtable', 'activity', 'revenue'), 38 | tDesc$tableName) 39 | tDesc <- tDesc[ord, , drop=FALSE] 40 | columnJoinPlan <- buildJoinPlan(tDesc, check= FALSE) 41 | # unify keys 42 | columnJoinPlan$resultColumn[columnJoinPlan$resultColumn=='id'] <- 'eid' 43 | # look at plan defects 44 | print(paste('problems:', 45 | inspectDescrAndJoinPlan(tDesc, columnJoinPlan))) 46 | diagramSpec <- makeJoinDiagramSpec(columnJoinPlan) 47 | # to render as JavaScript: 48 | # DiagrammeR::grViz(diagramSpec) 49 | DBI::dbDisconnect(my_db) 50 | my_db <- NULL 51 | } 52 | 53 | } 54 | \seealso{ 55 | \code{\link{tableDescription}}, \code{\link{buildJoinPlan}}, \code{\link{executeLeftJoinPlan}} 56 | } 57 | -------------------------------------------------------------------------------- /man/replyr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/replyr.R 3 | \docType{package} 4 | \name{replyr} 5 | \alias{replyr} 6 | \title{replyr: Patches to Use dplyr on Remote Data Sources} 7 | \description{ 8 | Methods to reliably use \code{dplyr} on remote data sources in \code{R} (\code{SQL} databases, 9 | \code{Spark} \code{2.0.0} and above) in a generic fashion. 10 | } 11 | \details{ 12 | \code{replyr} is going into maintenance mode. It has been hard to track 13 | shifting \code{dplyr}/\code{dbplyr}/\code{rlang} APIs and data structures post \code{dplyr} \code{0.5}. 14 | Most of what it does is now done better in one of the newer non-monolithic packages: 15 | 16 | \itemize{ 17 | \item Programming and meta-programming tools: \code{wrapr} \url{https://CRAN.R-project.org/package=wrapr}. 18 | \item Adapting \code{dplyr} to standard evaluation interfaces: \code{seplyr} \url{https://CRAN.R-project.org/package=seplyr}. 19 | \item Big data data manipulation: \code{rquery} \url{https://CRAN.R-project.org/package=rquery} and \code{cdata} \url{https://CRAN.R-project.org/package=cdata}. 20 | } 21 | 22 | 23 | \code{replyr} helps with the following: 24 | 25 | \itemize{ 26 | \item Summarizing remote data (via \code{replyr_summarize}). 27 | \item Facilitating writing "source generic" code that works similarly on multiple 'dplyr' data sources. 28 | \item Providing big data versions of functions for splitting data, binding rows, pivoting, adding row-ids, ranking, and completing experimental designs. 29 | \item Packaging common data manipulation tasks into operators such as the \code{\link{gapply}} function. 30 | \item Providing support code for common \code{SparklyR} tasks, such as tracking temporary handle IDs. 31 | } 32 | 33 | \code{replyr} is in maintenance mode. Better version of the functionality have been ported to the following packages: 34 | \code{wrapr}, \code{cdata}, \code{rquery}, and \code{seplyr}. 35 | 36 | 37 | To learn more about replyr, please start with the vignette: 38 | \code{vignette('replyr','replyr')} 39 | } 40 | -------------------------------------------------------------------------------- /man/replyr_add_ids.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/addIds.R 3 | \name{replyr_add_ids} 4 | \alias{replyr_add_ids} 5 | \title{Add unique ids to rows. 
Note: re-arranges rows in many cases.} 6 | \usage{ 7 | replyr_add_ids(df, idColName, env = parent.frame(), local_short_cut = TRUE) 8 | } 9 | \arguments{ 10 | \item{df}{data.frame object to work with} 11 | 12 | \item{idColName}{name of column to add} 13 | 14 | \item{env}{environment to evaluate in (not used).} 15 | 16 | \item{local_short_cut}{logical, if TRUE use base R on local data.} 17 | } 18 | \description{ 19 | Add unique ids to rows. Note: re-arranges rows in many cases. 20 | } 21 | \examples{ 22 | 23 | replyr_add_ids(data.frame(x=c('a','b')), 'id', local_short_cut = FALSE) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /man/replyr_apply_f_mapped.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/renameRestrictCols.R 3 | \name{replyr_apply_f_mapped} 4 | \alias{replyr_apply_f_mapped} 5 | \title{Apply a function to a re-mapped data frame.} 6 | \usage{ 7 | replyr_apply_f_mapped( 8 | d, 9 | f, 10 | nmap, 11 | ..., 12 | restrictMapIn = FALSE, 13 | rmap = replyr::replyr_reverseMap(nmap), 14 | restrictMapOut = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{d}{data.frame to work on} 19 | 20 | \item{f}{function to apply.} 21 | 22 | \item{nmap}{named list mapping with keys specifying new column names, and values as original column names.} 23 | 24 | \item{...}{force later arguments to bind by name} 25 | 26 | \item{restrictMapIn}{logical if TRUE restrict columns when mapping in.} 27 | 28 | \item{rmap}{reverse map (for after f is applied).} 29 | 30 | \item{restrictMapOut}{logical if TRUE restrict columns when mapping out.} 31 | } 32 | \description{ 33 | Apply a function to a re-mapped data frame. 
34 | } 35 | \examples{ 36 | 37 | # an external function with hard-coded column names 38 | DecreaseRankColumnByOne <- function(d) { 39 | d$RankColumn <- d$RankColumn - 1 40 | d 41 | } 42 | 43 | # our example data, with different column names 44 | d <- data.frame(Sepal_Length=c(5.8,5.7), 45 | Sepal_Width=c(4.0,4.4), 46 | Species='setosa',rank=c(1,2)) 47 | print(d) 48 | 49 | 50 | # map our data to expected column names so we can use function 51 | nmap <- c(GroupColumn='Species', 52 | ValueColumn='Sepal_Length', 53 | RankColumn='rank') 54 | print(nmap) 55 | 56 | dF <- replyr_apply_f_mapped(d, DecreaseRankColumnByOne, nmap) 57 | print(dF) 58 | 59 | 60 | 61 | } 62 | \seealso{ 63 | \code{\link{let}}, \code{\link{replyr_reverseMap}}, \code{\link{replyr_mapRestrictCols}} 64 | } 65 | -------------------------------------------------------------------------------- /man/replyr_arrange.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/underscoreReplacements.R 3 | \name{replyr_arrange} 4 | \alias{replyr_arrange} 5 | \title{arrange by a single column} 6 | \usage{ 7 | replyr_arrange(.data, colname, descending = FALSE) 8 | } 9 | \arguments{ 10 | \item{.data}{data object to work on} 11 | 12 | \item{colname}{character column name} 13 | 14 | \item{descending}{logical if true sort descending (else sort ascending)} 15 | } 16 | \description{ 17 | arrange by a single column 18 | } 19 | \examples{ 20 | 21 | d <- data.frame(Sepal_Length= c(5.8,5.7), 22 | Sepal_Width= c(4.0,4.4)) 23 | replyr_arrange(d, 'Sepal_Length', descending= TRUE) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /man/replyr_bind_rows.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bind_rows.R 3 | \name{replyr_bind_rows} 4 | \alias{replyr_bind_rows} 5 | \title{Bind a list of items by rows (can't use dplyr::bind_rows or dplyr::combine on remote sources). Columns are intersected.} 6 | \usage{ 7 | replyr_bind_rows( 8 | lst, 9 | ..., 10 | useDplyrLocal = TRUE, 11 | useSparkRbind = TRUE, 12 | useUnionALL = TRUE, 13 | tempNameGenerator = mk_tmp_name_source("replyr_bind_rows") 14 | ) 15 | } 16 | \arguments{ 17 | \item{lst}{list of items to combine, must be all in same dplyr data service} 18 | 19 | \item{...}{force other arguments to be used by name} 20 | 21 | \item{useDplyrLocal}{logical if TRUE use dplyr for local data.} 22 | 23 | \item{useSparkRbind}{logical if TRUE try to use rbind on Sparklyr data} 24 | 25 | \item{useUnionALL}{logical if TRUE try to use union all binding} 26 | 27 | \item{tempNameGenerator}{temp name generator produced by wrapr::mk_tmp_name_source, used to record dplyr::compute() effects.} 28 | } 29 | \value{ 30 | single data item 31 | } 32 | \description{ 33 | Bind a list of items by rows (can't use dplyr::bind_rows or dplyr::combine on remote sources). Columns are intersected. 
34 | } 35 | \examples{ 36 | 37 | if (requireNamespace("RSQLite", quietly = TRUE)) { 38 | my_db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") 39 | # my_db <- sparklyr::spark_connect(master = "local") 40 | d <- replyr_copy_to(my_db, data.frame(x = 1:2), 'd', 41 | temporary = TRUE) 42 | # dplyr::bind_rows(list(d, d)) 43 | # # Argument 1 must be a data frame or a named atomic vector, 44 | # # not a tbl_dbi/tbl_sql/tbl_lazy/tbl 45 | print(replyr_bind_rows(list(d, d))) 46 | DBI::dbDisconnect(my_db) 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /man/replyr_check_ranks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/checkRank.R 3 | \name{replyr_check_ranks} 4 | \alias{replyr_check_ranks} 5 | \title{confirm data has good ranked groups} 6 | \usage{ 7 | replyr_check_ranks( 8 | x, 9 | GroupColumnName, 10 | ValueColumnName, 11 | RankColumnName, 12 | ..., 13 | decreasing = FALSE, 14 | tempNameGenerator = mk_tmp_name_source("replyr_check_ranks") 15 | ) 16 | } 17 | \arguments{ 18 | \item{x}{data item to work with} 19 | 20 | \item{GroupColumnName}{column to group by} 21 | 22 | \item{ValueColumnName}{column determining order} 23 | 24 | \item{RankColumnName}{column having proposed rank (function of order)} 25 | 26 | \item{...}{force later arguments to bind by name} 27 | 28 | \item{decreasing}{if true make order decreasing instead of increasing.} 29 | 30 | \item{tempNameGenerator}{temp name generator produced by wrapr::mk_tmp_name_source, used to record dplyr::compute() effects.} 31 | } 32 | \value{ 33 | summary of quality of ranking. 34 | } 35 | \description{ 36 | confirm data has good ranked groups 37 | } 38 | \examples{ 39 | 40 | d <- data.frame(Sepal_Length=c(5.8,5.7),Sepal_Width=c(4.0,4.4), 41 | Species='setosa',rank=c(1,2)) 42 | replyr_check_ranks(d,'Species','Sepal_Length','rank', decreasing=TRUE) 43 | 44 | } 45 | -------------------------------------------------------------------------------- /man/replyr_coalesce.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/coalesce.R 3 | \name{replyr_coalesce} 4 | \alias{replyr_coalesce} 5 | \title{Augment a data frame by adding additional rows.} 6 | \usage{ 7 | replyr_coalesce( 8 | data, 9 | support, 10 | ..., 11 | fills = NULL, 12 | newRowColumn = NULL, 13 | copy = TRUE, 14 | tempNameGenerator = mk_tmp_name_source("replyr_coalesce") 15 | ) 16 | } 17 | \arguments{ 18 | \item{data}{data.frame data to augment} 19 | 20 | \item{support}{data.frame rows of unique key-values into data} 21 | 22 | \item{...}{not used, force later arguments to bind by name} 23 | 24 | \item{fills}{list default values to fill in columns} 25 | 26 | \item{newRowColumn}{character if not null name to use for new row indicator} 27 | 28 | \item{copy}{logical if TRUE copy support to data's source} 29 | 30 | \item{tempNameGenerator}{temp name generator produced by wrapr::mk_tmp_name_source, used to record dplyr::compute() effects.} 31 | } 32 | \value{ 33 | augmented data 34 | } 35 | \description{ 36 | Note: do not count on order of resulting data. Also only added rows 37 | are altered by the fill instructions. 
38 | } 39 | \examples{ 40 | 41 | 42 | # single column key example 43 | data <- data.frame(year = c(2005,2007,2010), 44 | count = c(6,1,NA), 45 | name = c('a','b','c'), 46 | stringsAsFactors = FALSE) 47 | support <- data.frame(year=2005:2010) 48 | filled <- replyr_coalesce(data, support, 49 | fills=list(count=0)) 50 | filled <- filled[order(filled$year), ] 51 | filled 52 | 53 | # complex key example 54 | data <- data.frame(year = c(2005,2007,2010), 55 | count = c(6,1,NA), 56 | name = c('a','b','c'), 57 | stringsAsFactors = FALSE) 58 | support <- expand.grid(year=2005:2010, 59 | name= c('a','b','c','d'), 60 | stringsAsFactors = FALSE) 61 | filled <- replyr_coalesce(data, support, 62 | fills=list(count=0)) 63 | filled <- filled[order(filled$year, filled$name), ] 64 | filled 65 | 66 | } 67 | -------------------------------------------------------------------------------- /man/replyr_colClasses.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/colClasses.R 3 | \name{replyr_colClasses} 4 | \alias{replyr_colClasses} 5 | \title{Get column classes.} 6 | \usage{ 7 | replyr_colClasses(x) 8 | } 9 | \arguments{ 10 | \item{x}{tbl or item that can be coerced into such.} 11 | } 12 | \value{ 13 | list of column classes. 14 | } 15 | \description{ 16 | Get column classes. 17 | } 18 | \examples{ 19 | 20 | d <- data.frame(x=c(1,2)) 21 | replyr_colClasses(d) 22 | 23 | } 24 | -------------------------------------------------------------------------------- /man/replyr_copy_from.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/copyToFrom.R 3 | \name{replyr_copy_from} 4 | \alias{replyr_copy_from} 5 | \title{Bring remote data back as a local data frame tbl.} 6 | \usage{ 7 | replyr_copy_from(d, maxrow = 1e+06) 8 | } 9 | \arguments{ 10 | \item{d}{remote dplyr data item} 11 | 12 | \item{maxrow}{max rows to allow (stop otherwise, set to NULL to allow any size).} 13 | } 14 | \value{ 15 | local tbl. 16 | } 17 | \description{ 18 | Bring remote data back as a local data frame tbl. 
19 | } 20 | \examples{ 21 | 22 | 23 | if (requireNamespace("RSQLite", quietly = TRUE)) { 24 | my_db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") 25 | RSQLite::initExtension(my_db) 26 | d <- replyr_copy_to(my_db,data.frame(x=c(1,2)),'d') 27 | d2 <- replyr_copy_from(d) 28 | print(d2) 29 | DBI::dbDisconnect(my_db) 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /man/replyr_copy_to.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/copyToFrom.R 3 | \name{replyr_copy_to} 4 | \alias{replyr_copy_to} 5 | \title{Copy data to remote service.} 6 | \usage{ 7 | replyr_copy_to( 8 | dest, 9 | df, 10 | name = paste(deparse(substitute(df)), collapse = " "), 11 | ..., 12 | rowNumberColumn = NULL, 13 | temporary = FALSE, 14 | overwrite = TRUE, 15 | maxrow = 1e+06 16 | ) 17 | } 18 | \arguments{ 19 | \item{dest}{remote data source} 20 | 21 | \item{df}{local data frame} 22 | 23 | \item{name}{name for new remote table} 24 | 25 | \item{...}{force later values to be bound by name} 26 | 27 | \item{rowNumberColumn}{if not null name to add row numbers to} 28 | 29 | \item{temporary}{logical, if TRUE try to create a temporary table} 30 | 31 | \item{overwrite}{logical, if TRUE try to overwrite} 32 | 33 | \item{maxrow}{max rows to allow in a remote to remote copy.} 34 | } 35 | \value{ 36 | remote handle 37 | } 38 | \description{ 39 | Copy data to remote service. 40 | } 41 | \examples{ 42 | 43 | 44 | if (requireNamespace("RSQLite", quietly = TRUE)) { 45 | my_db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") 46 | RSQLite::initExtension(my_db) 47 | d <- replyr_copy_to(my_db, data.frame(x=c(1,2)), 'd') 48 | print(d) 49 | DBI::dbDisconnect(my_db) 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /man/replyr_dim.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dim.R 3 | \name{replyr_dim} 4 | \alias{replyr_dim} 5 | \title{Compute dimensions of a data.frame (work around https://github.com/rstudio/sparklyr/issues/976 ).} 6 | \usage{ 7 | replyr_dim(x) 8 | } 9 | \arguments{ 10 | \item{x}{tbl or item that can be coerced into such.} 11 | } 12 | \value{ 13 | dimensions (including rows) 14 | } 15 | \description{ 16 | Compute dimensions of a data.frame (work around https://github.com/rstudio/sparklyr/issues/976 ). 
17 | } 18 | \examples{ 19 | 20 | d <- data.frame(x=c(1,2)) 21 | replyr_dim(d) 22 | 23 | } 24 | -------------------------------------------------------------------------------- /man/replyr_filter.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/filter.R 3 | \name{replyr_filter} 4 | \alias{replyr_filter} 5 | \title{Filter a tbl on a column having values in a given set.} 6 | \usage{ 7 | replyr_filter( 8 | x, 9 | cname, 10 | values, 11 | ..., 12 | verbose = TRUE, 13 | tempNameGenerator = mk_tmp_name_source("replyr_filter") 14 | ) 15 | } 16 | \arguments{ 17 | \item{x}{tbl or item that can be coerced into such.} 18 | 19 | \item{cname}{name of the column to test values of.} 20 | 21 | \item{values}{set of values to check set membership of.} 22 | 23 | \item{...}{force later arguments to bind by name.} 24 | 25 | \item{verbose}{logical if TRUE echo warnings} 26 | 27 | \item{tempNameGenerator}{temp name generator produced by wrapr::mk_tmp_name_source, used to record dplyr::compute() effects.} 28 | } 29 | \value{ 30 | new tbl with only rows where cname value is in values set. 31 | } 32 | \description{ 33 | Filter a tbl on a column having values in a given set. 34 | } 35 | \examples{ 36 | 37 | values <- c('a','c') 38 | d <- data.frame(x=c('a','a','b','b','c','c'),y=1:6, 39 | stringsAsFactors=FALSE) 40 | replyr_filter(d,'x',values) 41 | 42 | } 43 | -------------------------------------------------------------------------------- /man/replyr_get_src.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/serviceName.R 3 | \name{replyr_get_src} 4 | \alias{replyr_get_src} 5 | \title{Get the "remote data source" where a data.frame like object lives.} 6 | \usage{ 7 | replyr_get_src(df) 8 | } 9 | \arguments{ 10 | \item{df}{data.frame style object} 11 | } 12 | \value{ 13 | source (string if data.frame, tbl, or data.table, NULL if unknown, remote source otherwise) 14 | } 15 | \description{ 16 | Get the "remote data source" where a data.frame like object lives. 
17 | } 18 | \examples{ 19 | 20 | replyr_get_src(data.frame(x=1:2)) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/replyr_group_by.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/underscoreReplacements.R 3 | \name{replyr_group_by} 4 | \alias{replyr_group_by} 5 | \title{group_by columns} 6 | \usage{ 7 | replyr_group_by(.data, colnames) 8 | } 9 | \arguments{ 10 | \item{.data}{data.frame} 11 | 12 | \item{colnames}{character vector of column names to group by.} 13 | } 14 | \value{ 15 | .data grouped by columns named in colnames 16 | } 17 | \description{ 18 | See also: \url{https://gist.github.com/skranz/9681509} 19 | } 20 | \examples{ 21 | 22 | d <- data.frame(Sepal_Length= c(5.8,5.7), 23 | Sepal_Width= c(4.0,4.4), 24 | Species= 'setosa') 25 | replyr_group_by(d, 'Species') 26 | 27 | } 28 | \seealso{ 29 | \code{\link[dplyr]{group_by}}, \code{\link[dplyr]{group_by_at}} 30 | } 31 | -------------------------------------------------------------------------------- /man/replyr_has_table.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/serviceName.R 3 | \name{replyr_has_table} 4 | \alias{replyr_has_table} 5 | \title{check for a table} 6 | \usage{ 7 | replyr_has_table(con, name) 8 | } 9 | \arguments{ 10 | \item{con}{connection} 11 | 12 | \item{name}{character name to check for} 13 | } 14 | \value{ 15 | TRUE if table present 16 | } 17 | \description{ 18 | Work around connection v.s. handle issues \url{https://github.com/tidyverse/dplyr/issues/2849} 19 | } 20 | \examples{ 21 | 22 | if (requireNamespace("RSQLite", quietly = TRUE)) { 23 | my_db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") 24 | RSQLite::initExtension(my_db) 25 | d <- replyr_copy_to(my_db, data.frame(x=c(1,2)), 'd') 26 | print(d) 27 | print(replyr_has_table(my_db, 'd')) 28 | DBI::dbDisconnect(my_db) 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /man/replyr_hasrows.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/nrow.R 3 | \name{replyr_hasrows} 4 | \alias{replyr_hasrows} 5 | \title{Check if a table has rows.} 6 | \usage{ 7 | replyr_hasrows(d) 8 | } 9 | \arguments{ 10 | \item{d}{tbl or item that can be coerced into such.} 11 | } 12 | \value{ 13 | number of rows 14 | } 15 | \description{ 16 | Check if a table has rows. 
17 | } 18 | \examples{ 19 | 20 | d <- data.frame(x=c(1,2)) 21 | replyr_hasrows(d) 22 | 23 | } 24 | -------------------------------------------------------------------------------- /man/replyr_inTest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inTest.R 3 | \name{replyr_inTest} 4 | \alias{replyr_inTest} 5 | \title{Produce a column noting if another column's values are in a given set.} 6 | \usage{ 7 | replyr_inTest( 8 | x, 9 | cname, 10 | values, 11 | nname, 12 | ..., 13 | tempNameGenerator = mk_tmp_name_source("replyr_inTest"), 14 | verbose = TRUE 15 | ) 16 | } 17 | \arguments{ 18 | \item{x}{tbl or item that can be coerced into such.} 19 | 20 | \item{cname}{name of the column to test values of.} 21 | 22 | \item{values}{set of values to check set membership of.} 23 | 24 | \item{nname}{name for new column} 25 | 26 | \item{...}{force later parameters to bind by name} 27 | 28 | \item{tempNameGenerator}{temp name generator produced by wrapr::mk_tmp_name_source, used to record dplyr::compute() effects.} 29 | 30 | \item{verbose}{logical if TRUE echo warnings} 31 | } 32 | \value{ 33 | table with membership indications. 34 | } 35 | \description{ 36 | Produce a column noting if another column's values are in a given set. 37 | } 38 | \examples{ 39 | 40 | values <- c('a','c') 41 | d <- data.frame(x=c('a','a','b',NA,'c','c'),y=1:6, 42 | stringsAsFactors=FALSE) 43 | replyr_inTest(d,'x',values,'match') 44 | 45 | } 46 | -------------------------------------------------------------------------------- /man/replyr_is_MySQL_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/serviceName.R 3 | \name{replyr_is_MySQL_data} 4 | \alias{replyr_is_MySQL_data} 5 | \title{Test if data is MySQL.} 6 | \usage{ 7 | replyr_is_MySQL_data(d) 8 | } 9 | \arguments{ 10 | \item{d}{data frame} 11 | } 12 | \value{ 13 | TRUE if MySQL data 14 | } 15 | \description{ 16 | Test if data is MySQL. 17 | } 18 | \examples{ 19 | 20 | replyr_is_MySQL_data(data.frame(x=1:3)) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/replyr_is_Spark_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/serviceName.R 3 | \name{replyr_is_Spark_data} 4 | \alias{replyr_is_Spark_data} 5 | \title{Test if data is Spark.} 6 | \usage{ 7 | replyr_is_Spark_data(d) 8 | } 9 | \arguments{ 10 | \item{d}{data frame} 11 | } 12 | \value{ 13 | TRUE if Spark data 14 | } 15 | \description{ 16 | Test if data is Spark. 17 | } 18 | \examples{ 19 | 20 | replyr_is_Spark_data(data.frame(x=1:3)) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/replyr_is_local_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/serviceName.R 3 | \name{replyr_is_local_data} 4 | \alias{replyr_is_local_data} 5 | \title{Test if data is local.} 6 | \usage{ 7 | replyr_is_local_data(d) 8 | } 9 | \arguments{ 10 | \item{d}{data frame} 11 | } 12 | \value{ 13 | TRUE if local data (data.frame, tbl/tibble) 14 | } 15 | \description{ 16 | Test if data is local. 
17 | } 18 | \examples{ 19 | 20 | replyr_is_local_data(data.frame(x=1:3)) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/replyr_list_tables.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/serviceName.R 3 | \name{replyr_list_tables} 4 | \alias{replyr_list_tables} 5 | \title{list tables} 6 | \usage{ 7 | replyr_list_tables(con) 8 | } 9 | \arguments{ 10 | \item{con}{connection} 11 | } 12 | \value{ 13 | list of tables names 14 | } 15 | \description{ 16 | Work around connection v.s. handle issues \url{https://github.com/tidyverse/dplyr/issues/2849} 17 | } 18 | \examples{ 19 | 20 | if (requireNamespace("RSQLite", quietly = TRUE)) { 21 | my_db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") 22 | RSQLite::initExtension(my_db) 23 | d <- replyr_copy_to(my_db, data.frame(x=c(1,2)), 'd', 24 | overwrite=TRUE, temporary=TRUE) 25 | print(d) 26 | print(replyr_list_tables(my_db)) 27 | DBI::dbDisconnect(my_db) 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /man/replyr_mapRestrictCols.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/renameRestrictCols.R 3 | \name{replyr_mapRestrictCols} 4 | \alias{replyr_mapRestrictCols} 5 | \title{Map names of columns to known values and drop other columns.} 6 | \usage{ 7 | replyr_mapRestrictCols(x, nmap, ..., restrict = FALSE, reverse = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{data item to work on} 11 | 12 | \item{nmap}{named list mapping with keys specifying new column names, and values as original column names.} 13 | 14 | \item{...}{force later arguments to bind by name} 15 | 16 | \item{restrict}{logical if TRUE restrict to columns mentioned in nmap.} 17 | 18 | \item{reverse}{logical if TRUE apply the inverse of nmap instead of nmap.} 19 | } 20 | \value{ 21 | data item with columns renamed (and possibly restricted). 22 | } 23 | \description{ 24 | Restrict a data item's column names and re-name them in bulk. 25 | } 26 | \details{ 27 | Something like \code{replyr::replyr_mapRestrictCols} is only useful to get control of a function that is not parameterized 28 | (in the sense it has hard-coded column names inside its implementation that don't the match column names in our data). 29 | } 30 | \examples{ 31 | 32 | # an external function with hard-coded column names 33 | DecreaseRankColumnByOne <- function(d) { 34 | d$RankColumn <- d$RankColumn - 1 35 | d 36 | } 37 | 38 | # our example data, with different column names 39 | d <- data.frame(Sepal_Length=c(5.8,5.7), 40 | Sepal_Width=c(4.0,4.4), 41 | Species='setosa',rank=c(1,2)) 42 | print(d) 43 | 44 | 45 | # map our data to expected column names so we can use function 46 | nmap <- c(GroupColumn='Species', 47 | ValueColumn='Sepal_Length', 48 | RankColumn='rank') 49 | print(nmap) 50 | dm <- replyr_mapRestrictCols(d,nmap) 51 | print(dm) 52 | 53 | # can now apply code that expects hard-coded names. 54 | dm <- DecreaseRankColumnByOne(dm) 55 | 56 | # map back to our original column names (for the columns we retained) 57 | # Note: can only map back columns that were retained in first mapping. 
58 | replyr_mapRestrictCols(dm, nmap, reverse=TRUE) 59 | 60 | } 61 | \seealso{ 62 | \code{\link{let}}, \code{\link{replyr_reverseMap}}, \code{\link{replyr_apply_f_mapped}} 63 | } 64 | -------------------------------------------------------------------------------- /man/replyr_ncol.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dim.R 3 | \name{replyr_ncol} 4 | \alias{replyr_ncol} 5 | \title{Compute number of columns of a data.frame (work around https://github.com/rstudio/sparklyr/issues/976 ).} 6 | \usage{ 7 | replyr_ncol(x) 8 | } 9 | \arguments{ 10 | \item{x}{tbl or item that can be coerced into such.} 11 | } 12 | \value{ 13 | number of columns 14 | } 15 | \description{ 16 | Compute number of columns of a data.frame (work around https://github.com/rstudio/sparklyr/issues/976 ). 17 | } 18 | \examples{ 19 | 20 | d <- data.frame(x=c(1,2)) 21 | replyr_ncol(d) 22 | 23 | } 24 | -------------------------------------------------------------------------------- /man/replyr_nrow.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/nrow.R 3 | \name{replyr_nrow} 4 | \alias{replyr_nrow} 5 | \title{Compute number of rows of a tbl.} 6 | \usage{ 7 | replyr_nrow(x) 8 | } 9 | \arguments{ 10 | \item{x}{tbl or item that can be coerced into such.} 11 | } 12 | \value{ 13 | number of rows 14 | } 15 | \description{ 16 | Number of rows in a table. This function is not "group aware": it returns the total number of rows, not rows per dplyr group. 17 | Also \code{replyr_nrow} depends on data being returned to count, so some corner cases (such as zero columns) will count as zero rows. 18 | In particular work around dplyr issue 2871 \url{https://github.com/tidyverse/dplyr/issues/2871}. 19 | } 20 | \examples{ 21 | 22 | d <- data.frame(x=c(1,2)) 23 | replyr_nrow(d) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /man/replyr_quantile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/quantile.R 3 | \name{replyr_quantile} 4 | \alias{replyr_quantile} 5 | \title{Compute quantiles on remote column (NA's filtered out) using binary search.} 6 | \usage{ 7 | replyr_quantile( 8 | x, 9 | cname, 10 | probs = seq(0, 1, 0.25), 11 | ..., 12 | tempNameGenerator = mk_tmp_name_source("replyr_quantile") 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{tbl or item that can be coerced into such.} 17 | 18 | \item{cname}{column name to compute over} 19 | 20 | \item{probs}{numeric vector of probabilities with values in [0,1].} 21 | 22 | \item{...}{force later arguments to be bound by name.} 23 | 24 | \item{tempNameGenerator}{temp name generator produced by wrapr::mk_tmp_name_source, used to record dplyr::compute() effects.} 25 | } 26 | \description{ 27 | NA's filtered out and does not break ties the same as stats::quantile. 
28 | } 29 | \examples{ 30 | 31 | d <- data.frame(xvals=rev(1:1000)) 32 | replyr_quantile(d,'xvals') 33 | 34 | } 35 | -------------------------------------------------------------------------------- /man/replyr_quantilec.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/quantile.R 3 | \name{replyr_quantilec} 4 | \alias{replyr_quantilec} 5 | \title{Compute quantiles on remote column (NA's filtered out) using cumsum.} 6 | \usage{ 7 | replyr_quantilec( 8 | x, 9 | cname, 10 | probs = seq(0, 1, 0.25), 11 | ..., 12 | tempNameGenerator = mk_tmp_name_source("replyr_quantilec") 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{tbl or item that can be coerced into such.} 17 | 18 | \item{cname}{column name to compute over (not 'n' or 'csum')} 19 | 20 | \item{probs}{numeric vector of probabilities with values in [0,1].} 21 | 22 | \item{...}{force later arguments to bind by name.} 23 | 24 | \item{tempNameGenerator}{temp name generator produced by wrapr::mk_tmp_name_source, used to record dplyr::compute() effects.} 25 | } 26 | \description{ 27 | NA's filtered out and does not break ties the same as stats::quantile. 28 | } 29 | \examples{ 30 | 31 | d <- data.frame(xvals=rev(1:1000)) 32 | replyr_quantilec(d,'xvals') 33 | 34 | } 35 | -------------------------------------------------------------------------------- /man/replyr_rename.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/underscoreReplacements.R 3 | \name{replyr_rename} 4 | \alias{replyr_rename} 5 | \title{Rename a column} 6 | \usage{ 7 | replyr_rename(.data, ..., newName, oldName) 8 | } 9 | \arguments{ 10 | \item{.data}{data object to work on} 11 | 12 | \item{...}{force later arguments to bind by name} 13 | 14 | \item{newName}{character new column name} 15 | 16 | \item{oldName}{character old column name} 17 | } 18 | \description{ 19 | Rename a column 20 | } 21 | \examples{ 22 | 23 | d <- data.frame(Sepal_Length= c(5.8,5.7), 24 | Sepal_Width= c(4.0,4.4), 25 | Species= 'setosa', rank=c(1,2)) 26 | replyr_rename(d, newName = 'family', oldName = 'Species') 27 | 28 | } 29 | -------------------------------------------------------------------------------- /man/replyr_reverseMap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/renameRestrictCols.R 3 | \name{replyr_reverseMap} 4 | \alias{replyr_reverseMap} 5 | \title{Reverse a name assignment map (which are written NEWNAME=OLDNAME).} 6 | \usage{ 7 | replyr_reverseMap(nmap) 8 | } 9 | \arguments{ 10 | \item{nmap}{named list mapping with keys specifying new column names, and values as original column names.} 11 | } 12 | \value{ 13 | inverse map 14 | } 15 | \description{ 16 | Reverse a name assignment map (which are written NEWNAME=OLDNAME). 
17 | } 18 | \examples{ 19 | 20 | mp <- c(A='x', B='y') 21 | print(mp) 22 | replyr_reverseMap(mp) 23 | 24 | } 25 | \seealso{ 26 | \code{\link{let}}, \code{\link{replyr_apply_f_mapped}}, \code{\link{replyr_mapRestrictCols}} 27 | } 28 | -------------------------------------------------------------------------------- /man/replyr_select.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/underscoreReplacements.R 3 | \name{replyr_select} 4 | \alias{replyr_select} 5 | \title{select columns} 6 | \usage{ 7 | replyr_select(.data, colnames) 8 | } 9 | \arguments{ 10 | \item{.data}{data object to work on} 11 | 12 | \item{colnames}{character column names} 13 | } 14 | \description{ 15 | select columns 16 | } 17 | \examples{ 18 | 19 | d <- data.frame(Sepal_Length= c(5.8,5.7), 20 | Sepal_Width= c(4.0,4.4), 21 | Species= 'setosa', rank=c(1,2)) 22 | replyr_select(d, c('Sepal_Length', 'Species')) 23 | 24 | } 25 | -------------------------------------------------------------------------------- /man/replyr_split.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/groupedApply.R 3 | \name{replyr_split} 4 | \alias{replyr_split} 5 | \title{split a data item by values in a column.} 6 | \usage{ 7 | replyr_split( 8 | df, 9 | gcolumn, 10 | ..., 11 | ocolumn = NULL, 12 | decreasing = FALSE, 13 | partitionMethod = "extract", 14 | maxgroups = 100, 15 | eagerCompute = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{df}{remote dplyr data item} 20 | 21 | \item{gcolumn}{grouping column} 22 | 23 | \item{...}{force later values to be bound by name} 24 | 25 | \item{ocolumn}{ordering column (optional)} 26 | 27 | \item{decreasing}{if TRUE sort in decreasing order by ocolumn} 28 | 29 | \item{partitionMethod}{method to partition the data, one of 'split' (only works over local data frames), or 'extract'} 30 | 31 | \item{maxgroups}{maximum number of groups to work over} 32 | 33 | \item{eagerCompute}{if TRUE call compute on split results} 34 | } 35 | \value{ 36 | list of data items 37 | } 38 | \description{ 39 | Partitions a data item by values in the grouping column, and returns a list. Only advised for a 40 | moderate number of groups and better if the grouping column is an index. 
41 | This plus lapply and replyr::bind_rows is powerful 42 | enough to implement "The Split-Apply-Combine Strategy for Data Analysis" 43 | https://www.jstatsoft.org/article/view/v040i01 44 | } 45 | \examples{ 46 | 47 | d <- data.frame(group=c(1,1,2,2,2), 48 | order=c(.1,.2,.3,.4,.5), 49 | values=c(10,20,2,4,8)) 50 | dSplit <- replyr_split(d, 'group', partitionMethod='extract') 51 | dApp <- lapply(dSplit, function(di) data.frame(as.list(colMeans(di)))) 52 | replyr_bind_rows(dApp) 53 | 54 | } 55 | -------------------------------------------------------------------------------- /man/replyr_summary.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summary.R 3 | \name{replyr_summary} 4 | \alias{replyr_summary} 5 | \title{Compute usable summary of columns of tbl.} 6 | \usage{ 7 | replyr_summary( 8 | x, 9 | ..., 10 | countUniqueNum = FALSE, 11 | countUniqueNonNum = FALSE, 12 | cols = NULL, 13 | compute = TRUE 14 | ) 15 | } 16 | \arguments{ 17 | \item{x}{tbl or item that can be coerced into such.} 18 | 19 | \item{...}{force additional arguments to be bound by name.} 20 | 21 | \item{countUniqueNum}{logical, if true include unique non-NA counts for numeric cols.} 22 | 23 | \item{countUniqueNonNum}{logical, if true include unique non-NA counts for non-numeric cols.} 24 | 25 | \item{cols}{if not NULL set of columns to restrict to.} 26 | 27 | \item{compute}{logical if TRUE call compute before working} 28 | } 29 | \value{ 30 | summary of columns. 31 | } 32 | \description{ 33 | Compute per-column summaries and return as a \code{data.frame}. Warning: can be an expensive operation. 34 | } 35 | \details{ 36 | Can be slow compared to \code{dplyr::summarize_all()} (but serves a different purpose). 37 | Also, for numeric columns includes \code{NaN} in \code{nna} count (as is typical for \code{R}, e.g., 38 | \code{is.na(NaN)}). And note: \code{replyr_summary()} currently skips "raw" columns. 
39 | } 40 | \examples{ 41 | 42 | d <- data.frame(p= c(TRUE, FALSE, NA), 43 | r= I(list(1,2,3)), 44 | s= NA, 45 | t= as.raw(3:5), 46 | w= 1:3, 47 | x= c(NA,2,3), 48 | y= factor(c(3,5,NA)), 49 | z= c('a',NA,'z'), 50 | stringsAsFactors=FALSE) 51 | # sc <- sparklyr::spark_connect(version='2.2.0', 52 | # master = "local") 53 | # dS <- replyr_copy_to(sc, dplyr::select(d, -r, -t), 'dS', 54 | # temporary=TRUE, overwrite=TRUE) 55 | # replyr_summary(dS) 56 | # sparklyr::spark_disconnect(sc) 57 | if (requireNamespace("RSQLite", quietly = TRUE)) { 58 | my_db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") 59 | RSQLite::initExtension(my_db) 60 | dM <- replyr_copy_to(my_db, dplyr::select(d, -r, -t), 'dM', 61 | temporary=TRUE, overwrite=TRUE) 62 | print(replyr_summary(dM)) 63 | DBI::dbDisconnect(my_db) 64 | } 65 | d$q <- list(1,2,3) 66 | replyr_summary(d) 67 | 68 | } 69 | \seealso{ 70 | \code{\link[rquery]{rsummary}} 71 | } 72 | -------------------------------------------------------------------------------- /man/replyr_testCols.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/colClasses.R 3 | \name{replyr_testCols} 4 | \alias{replyr_testCols} 5 | \title{Run test on columns.} 6 | \usage{ 7 | replyr_testCols(x, f, n = 6L) 8 | } 9 | \arguments{ 10 | \item{x}{tbl or item that can be coerced into such.} 11 | 12 | \item{f}{test function (returning logical, not depending on data length).} 13 | 14 | \item{n}{number of rows to use in calculation.} 15 | } 16 | \value{ 17 | logical vector of results. 18 | } 19 | \description{ 20 | Applies a user function to the head of each column. Good for determining things 21 | such as column class. 22 | } 23 | \examples{ 24 | 25 | d <- data.frame(x=c(1,2),y=c('a','b')) 26 | replyr_testCols(d,is.numeric) 27 | 28 | } 29 | -------------------------------------------------------------------------------- /man/replyr_union_all.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bind_rows.R 3 | \name{replyr_union_all} 4 | \alias{replyr_union_all} 5 | \title{Union two tables.} 6 | \usage{ 7 | replyr_union_all( 8 | tabA, 9 | tabB, 10 | ..., 11 | useDplyrLocal = TRUE, 12 | useSparkRbind = TRUE, 13 | tempNameGenerator = mk_tmp_name_source("replyr_union_all") 14 | ) 15 | } 16 | \arguments{ 17 | \item{tabA}{not-NULL table with at least 1 row.} 18 | 19 | \item{tabB}{not-NULL table with at least 1 row on same data source as tabA and common columns.} 20 | 21 | \item{...}{force later arguments to be bound by name.} 22 | 23 | \item{useDplyrLocal}{logical if TRUE use dplyr::bind_rows for local data.} 24 | 25 | \item{useSparkRbind}{logical if TRUE try to use rbind on Sparklyr data} 26 | 27 | \item{tempNameGenerator}{temp name generator produced by wrapr::mk_tmp_name_source, used to record dplyr::compute() effects.} 28 | } 29 | \value{ 30 | table with all rows of tabA and tabB (union_all). 31 | } 32 | \description{ 33 | Spark 2* union_all has issues ( https://github.com/WinVector/replyr/blob/master/issues/UnionIssue.md ), 34 | and exposed union_all semantics differ from data-source back-end to back-end. 35 | This is an attempt to provide a join-based replacement. 
36 | } 37 | \examples{ 38 | 39 | d1 <- data.frame(x = c('a','b'), y = 1, stringsAsFactors= FALSE) 40 | d2 <- data.frame(x = 'c', z = 1, stringsAsFactors= FALSE) 41 | replyr_union_all(d1, d2, useDplyrLocal= FALSE) 42 | 43 | } 44 | -------------------------------------------------------------------------------- /man/replyr_uniqueValues.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/uniqueValues.R 3 | \name{replyr_uniqueValues} 4 | \alias{replyr_uniqueValues} 5 | \title{Compute number of unique values for each level in a column.} 6 | \usage{ 7 | replyr_uniqueValues(x, cname) 8 | } 9 | \arguments{ 10 | \item{x}{tbl or item that can be coerced into such.} 11 | 12 | \item{cname}{name of columns to examine, must not be equal to 'replyr_private_value_n'.} 13 | } 14 | \value{ 15 | unique values for the column. 16 | } 17 | \description{ 18 | Compute number of unique values for each level in a column. 19 | } 20 | \examples{ 21 | 22 | d <- data.frame(x=c(1,2,3,3)) 23 | replyr_uniqueValues(d,'x') 24 | 25 | } 26 | -------------------------------------------------------------------------------- /man/tableDescription.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/joinController.R 3 | \name{tableDescription} 4 | \alias{tableDescription} 5 | \title{Build a nice description of a table.} 6 | \usage{ 7 | tableDescription(tableName, handle, ..., keyInspector = key_inspector_all_cols) 8 | } 9 | \arguments{ 10 | \item{tableName}{name of table to add to join plan.} 11 | 12 | \item{handle}{table or table handle to add to join plan (can already be in the plan).} 13 | 14 | \item{...}{force later arguments to bind by name.} 15 | 16 | \item{keyInspector}{function that determines preferred primary key set for table.} 17 | } 18 | \value{ 19 | table describing the data. 20 | } 21 | \description{ 22 | Please see \url{http://www.win-vector.com/blog/2017/05/managing-spark-data-handles-in-r/} for details. 23 | Note: one usually needs to alter the keys column which is just populated with all columns. 24 | } 25 | \details{ 26 | Please see \code{vignette('DependencySorting', package = 'replyr')} and \code{vignette('joinController', package= 'replyr')} for more details. 27 | } 28 | \examples{ 29 | 30 | d <- data.frame(x=1:3, y=NA) 31 | tableDescription('d', d) 32 | 33 | 34 | } 35 | \seealso{ 36 | \code{\link{buildJoinPlan}}, \code{\link{keysAreUnique}}, \code{\link{makeJoinDiagramSpec}}, \code{\link{executeLeftJoinPlan}} 37 | } 38 | -------------------------------------------------------------------------------- /man/topoSortTables.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/joinController.R 3 | \name{topoSortTables} 4 | \alias{topoSortTables} 5 | \title{Topologically sort join plan so values are available before uses.} 6 | \usage{ 7 | topoSortTables(columnJoinPlan, leftTableName, ...) 8 | } 9 | \arguments{ 10 | \item{columnJoinPlan}{join plan} 11 | 12 | \item{leftTableName}{which table is left} 13 | 14 | \item{...}{force later arguments to bind by name} 15 | } 16 | \value{ 17 | list with dependencyGraph and sorted columnJoinPlan 18 | } 19 | \description{ 20 | Depends on \code{igraph} package. 
21 | Please see \code{vignette('DependencySorting', package = 'replyr')} and \code{vignette('joinController', package= 'replyr')} for more details. 22 | } 23 | \examples{ 24 | 25 | if (requireNamespace("RSQLite", quietly = TRUE)) { 26 | # note: employeeanddate is likely built as a cross-product 27 | # join of an employee table and set of dates of interest 28 | # before getting to the join controller step. We call 29 | # such a table "row control" or "experimental design." 30 | my_db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") 31 | RSQLite::initExtension(my_db) 32 | tDesc <- example_employeeAndDate(my_db) 33 | columnJoinPlan <- buildJoinPlan(tDesc, check= FALSE) 34 | # unify keys 35 | columnJoinPlan$resultColumn[columnJoinPlan$resultColumn=='id'] <- 'eid' 36 | # look at plan defects 37 | print(paste('problems:', 38 | inspectDescrAndJoinPlan(tDesc, columnJoinPlan))) 39 | # fix plan 40 | if(requireNamespace('igraph', quietly = TRUE)) { 41 | sorted <- topoSortTables(columnJoinPlan, 'employeeanddate') 42 | print(paste('problems:', 43 | inspectDescrAndJoinPlan(tDesc, sorted$columnJoinPlan))) 44 | # plot(sorted$dependencyGraph) 45 | } 46 | DBI::dbDisconnect(my_db) 47 | my_db <- NULL 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /replyr.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | StripTrailingWhitespace: Yes 16 | 17 | BuildType: Package 18 | PackageUseDevtools: Yes 19 | PackageInstallArgs: --no-multiarch --with-keep.source 20 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(replyr) 3 | 4 | test_check("replyr") 5 | -------------------------------------------------------------------------------- /tests/testthat/testOne.R: -------------------------------------------------------------------------------- 1 | library('replyr') 2 | 3 | context("Excercise Operations") 4 | 5 | test_that("testOne: Works As Expected", { 6 | d <- data.frame(x = c(1, 2)) 7 | n <- replyr_nrow(d) 8 | expect_true(n == 2) 9 | }) -------------------------------------------------------------------------------- /tests/testthat/test_gapply.R: -------------------------------------------------------------------------------- 1 | library('replyr') 2 | 3 | context("gapply") 4 | 5 | test_that("test_gapply.R", { 6 | library('dplyr') 7 | d <- data.frame( 8 | group = c(1, 1, 2, 2, 2), 9 | order = c(.1, .2, .3, .4, .5), 10 | values = c(10, 20, 2, 4, 8) 11 | ) 12 | 13 | # User supplied window functions. They depend on known column names and 14 | # the data back-end matching function names (as cumsum). 
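# (For instance, cumulative_sum below assumes the back-end can translate cumsum();
# rank_in_group derives a per-group rank by cumulatively summing a constant column of ones.)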
15 | cumulative_sum <- function(d) { 16 | mutate(d, cv = cumsum(values)) 17 | } 18 | rank_in_group <- function(d) { 19 | d <- mutate(d, constcol = 1) 20 | d <- mutate(d, rank = cumsum(constcol)) 21 | select(d, -constcol) 22 | } 23 | 24 | for (partitionMethod in c('group_by', 'split', 'extract')) { 25 | #print(partitionMethod) 26 | #print('cumulative sum example') 27 | #print( 28 | gapply( 29 | d, 30 | 'group', 31 | cumulative_sum, 32 | ocolumn = 'order', 33 | partitionMethod = partitionMethod 34 | ) 35 | #) 36 | #print('ranking example') 37 | #print( 38 | gapply( 39 | d, 40 | 'group', 41 | rank_in_group, 42 | ocolumn = 'order', 43 | partitionMethod = partitionMethod 44 | ) 45 | #) 46 | #print('ranking example (decreasing)') 47 | #print( 48 | gapply( 49 | d, 50 | 'group', 51 | rank_in_group, 52 | ocolumn = 'order', 53 | decreasing = TRUE, 54 | partitionMethod = partitionMethod 55 | ) 56 | #) 57 | } 58 | 59 | }) 60 | -------------------------------------------------------------------------------- /tests/testthat/test_grapes-land-grapes.R: -------------------------------------------------------------------------------- 1 | library('replyr') 2 | 3 | context("test land") 4 | 5 | test_that("test_grapes-land-grapes.R", { 6 | library("dplyr") 7 | sin(7) %->% z1 8 | sin(7) %->_% 'z2' 9 | varname <- 'z3' 10 | sin(7) %->_% varname 11 | 12 | }) 13 | -------------------------------------------------------------------------------- /tests/testthat/test_let.R: -------------------------------------------------------------------------------- 1 | library('replyr') 2 | 3 | context("let") 4 | 5 | test_that("test_let.R", { 6 | library('dplyr') 7 | d <- data.frame( 8 | Sepal_Length = c(5.8, 5.7), 9 | Sepal_Width = c(4.0, 4.4), 10 | Species = 'setosa', 11 | rank = c(1, 2) 12 | ) 13 | 14 | mapping = list(RankColumn = 'rank', GroupColumn = 'Species') 15 | let(alias = mapping, 16 | expr = { 17 | # Notice code here can be written in terms of known or concrete 18 | # names "RankColumn" and "GroupColumn", but executes as if we 19 | # had written mapping specified columns "rank" and "Species". 20 | 21 | # restart ranks at zero. 22 | d %>% mutate(RankColumn = RankColumn - 1) -> dres 23 | 24 | # confirm set of groups. 25 | unique(d$GroupColumn) -> groups 26 | }) 27 | #print(groups) 28 | #print(length(groups)) 29 | #print(dres) 30 | 31 | # It is also possible to pipe into let-blocks, but it takes some extra notation 32 | # (notice the extra ". %>%" at the beginning and the extra "()" at the end). 33 | 34 | d %>% let(alias = mapping, 35 | expr = { 36 | . %>% mutate(RankColumn = RankColumn - 1) 37 | })() 38 | 39 | # Or: 40 | 41 | f <- let(alias = mapping, 42 | expr = { 43 | . %>% mutate(RankColumn = RankColumn - 1) 44 | }) 45 | d %>% f 46 | 47 | # Be wary of using any assignment to attempt side-effects in these "delayed pipelines", 48 | # as the assignment tends to happen during the let dereference and not (as one would hope) 49 | # during the later pipeline application. Example: 50 | 51 | g <- let(alias = mapping, 52 | expr = { 53 | . %>% mutate(RankColumn = RankColumn - 1) -> ZZZ 54 | }) 55 | #print(ZZZ) 56 | # Notice ZZZ has captured a copy of the sub-pipeline and not waited for application of g. 57 | # Applying g performs a calculation, but does not overwrite ZZZ. 58 | 59 | g(d) 60 | #print(ZZZ) 61 | # Notice ZZZ is not a copy of g(d), but instead still the pipeline fragment. 62 | 63 | 64 | # let works by string substitution aligning on word boundaries, 65 | # so it does (unfortunately) also re-write strings. 
66 | let(list(x = 'y'), 'x') 67 | 68 | }) 69 | -------------------------------------------------------------------------------- /tests/testthat/test_replyr_bind_rows.R: -------------------------------------------------------------------------------- 1 | library('replyr') 2 | 3 | context("bind_rows") 4 | 5 | test_that("test_replyr_bind_rows", { 6 | d <- data.frame(x = 1:2) 7 | replyr_bind_rows(list(d, d, d)) 8 | 9 | }) 10 | -------------------------------------------------------------------------------- /tests/testthat/test_replyr_check_ranks.R: -------------------------------------------------------------------------------- 1 | library('replyr') 2 | 3 | context("check_rankds") 4 | 5 | test_that("test_replyr_check_ranks.R", { 6 | d <- data.frame( 7 | Sepal_Length = c(5.8, 5.7), 8 | Sepal_Width = c(4.0, 4.4), 9 | Species = 'setosa', 10 | rank = c(1, 2) 11 | ) 12 | replyr_check_ranks(d, 'Species', 'Sepal_Length', 'rank', 13 | decreasing=TRUE) 14 | 15 | }) 16 | -------------------------------------------------------------------------------- /tests/testthat/test_replyr_colClasses.R: -------------------------------------------------------------------------------- 1 | library('replyr') 2 | 3 | context("colClasses") 4 | 5 | test_that("test_replyr_colClasses.R", { 6 | d <- data.frame(x = c(1, 2)) 7 | replyr_colClasses(d) 8 | 9 | }) 10 | -------------------------------------------------------------------------------- /tests/testthat/test_replyr_copy_from.R: -------------------------------------------------------------------------------- 1 | library('replyr') 2 | 3 | context("copy_from") 4 | 5 | test_that("test_replyr_copy_from.R", { 6 | if ( requireNamespace("RSQLite", quietly = TRUE)) { 7 | my_db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") 8 | RSQLite::initExtension(my_db) 9 | d <- replyr_copy_to(my_db, data.frame(x = c(1, 2)), 'd') 10 | d2 <- replyr_copy_from(d) 11 | #print(d2) 12 | DBI::dbDisconnect(my_db) 13 | } 14 | 15 | }) 16 | -------------------------------------------------------------------------------- /tests/testthat/test_replyr_copy_to.R: -------------------------------------------------------------------------------- 1 | library('replyr') 2 | 3 | context("copy_to") 4 | 5 | test_that("test_replyr_copy_to.R", { 6 | if (requireNamespace("RSQLite", quietly = TRUE)) { 7 | my_db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") 8 | RSQLite::initExtension(my_db) 9 | d <- replyr_copy_to(my_db, data.frame(x = c(1, 2)), 'd') 10 | #print(d) 11 | DBI::dbDisconnect(my_db) 12 | } 13 | }) 14 | -------------------------------------------------------------------------------- /tests/testthat/test_replyr_dim.R: -------------------------------------------------------------------------------- 1 | library('replyr') 2 | 3 | context("dim") 4 | 5 | test_that("test_replyr_dim", { 6 | d <- data.frame(x = c(1, 2)) 7 | replyr_dim(d) 8 | 9 | }) 10 | -------------------------------------------------------------------------------- /tests/testthat/test_replyr_filter.R: -------------------------------------------------------------------------------- 1 | library('replyr') 2 | 3 | context("filter") 4 | 5 | test_that("test_replyr_filter.R", { 6 | values <- c('a', 'c') 7 | d <- data.frame( 8 | x = c('a', 'a', 'b', 'b', 'c', 'c'), 9 | y = 1:6, 10 | stringsAsFactors = FALSE 11 | ) 12 | replyr_filter(d, 'x', values) 13 | 14 | }) 15 | -------------------------------------------------------------------------------- /tests/testthat/test_replyr_inTest.R: 
-------------------------------------------------------------------------------- 1 | library('replyr') 2 | 3 | context("inTest") 4 | 5 | test_that("test_replyr_inTest", { 6 | values <- c('a', 'c') 7 | d <- data.frame( 8 | x = c('a', 'a', 'b', 'b', 'c', 'c'), 9 | y = 1:6, 10 | stringsAsFactors = FALSE 11 | ) 12 | replyr_inTest(d, 'x', values, 'match') 13 | 14 | }) 15 | -------------------------------------------------------------------------------- /tests/testthat/test_replyr_mapRestrictCols.R: -------------------------------------------------------------------------------- 1 | library('replyr') 2 | 3 | context("mapRestrictCols") 4 | 5 | test_that("test_replyr_mapRestrictCols.R", { 6 | # an external function with hard-coded column names 7 | DecreaseRankColumnByOne <- function(d) { 8 | d$RankColumn <- d$RankColumn - 1 9 | d 10 | } 11 | 12 | # our example data, with different column names 13 | d <- data.frame( 14 | Sepal_Length = c(5.8, 5.7), 15 | Sepal_Width = c(4.0, 4.4), 16 | Species = 'setosa', 17 | rank = c(1, 2) 18 | ) 19 | #print(d) 20 | 21 | # map our data to expected column names so we can use function 22 | nmap <- c( 23 | GroupColumn = 'Species', 24 | ValueColumn = 'Sepal_Length', 25 | RankColumn = 'rank' 26 | ) 27 | #print(nmap) 28 | dm <- replyr_mapRestrictCols(d, nmap) 29 | #print(dm) 30 | 31 | # can now apply code that expects hard-coded names. 32 | dm <- DecreaseRankColumnByOne(dm) 33 | 34 | # map back to our original column names (for the columns we retained) 35 | invmap <- names(nmap) 36 | names(invmap) <- as.character(nmap) 37 | #print(invmap) 38 | # Note: can only map back columns that were retained in first mapping. 39 | replyr_mapRestrictCols(dm, invmap) 40 | 41 | }) 42 | -------------------------------------------------------------------------------- /tests/testthat/test_replyr_nrow.R: -------------------------------------------------------------------------------- 1 | library('replyr') 2 | 3 | context("nrow") 4 | 5 | test_that("test_replyr_nrow.R", { 6 | d <- data.frame(x = c(1, 2)) 7 | replyr_nrow(d) 8 | 9 | }) 10 | -------------------------------------------------------------------------------- /tests/testthat/test_replyr_quantile.R: -------------------------------------------------------------------------------- 1 | library('replyr') 2 | 3 | context("quantile") 4 | 5 | test_that("test_replyr_quantile.R", { 6 | d <- data.frame(xvals = rev(1:1000)) 7 | replyr_quantile(d, 'xvals') 8 | 9 | }) 10 | -------------------------------------------------------------------------------- /tests/testthat/test_replyr_quantilec.R: -------------------------------------------------------------------------------- 1 | library('replyr') 2 | 3 | context("qauntilec") 4 | 5 | test_that("test_replyr_quantilec.R", { 6 | 7 | d <- data.frame(xvals=rev(1:1000)) 8 | replyr_quantilec(d,'xvals') 9 | 10 | }) 11 | 12 | -------------------------------------------------------------------------------- /tests/testthat/test_replyr_split.R: -------------------------------------------------------------------------------- 1 | library('replyr') 2 | 3 | context("split") 4 | 5 | test_that("test_replyr_split.R", { 6 | library('dplyr') 7 | d <- data.frame( 8 | group = c(1, 1, 2, 2, 2), 9 | order = c(.1, .2, .3, .4, .5), 10 | values = c(10, 20, 2, 4, 8) 11 | ) 12 | replyr_split(d, 'group') 13 | 14 | }) 15 | -------------------------------------------------------------------------------- /tests/testthat/test_replyr_summary.R: -------------------------------------------------------------------------------- 1 | 
library('replyr') 2 | 3 | context("summary") 4 | 5 | test_that("test_replyr_summary.R", { 6 | d <- data.frame( 7 | x = c(NA, 2, 3), 8 | y = factor(c(3, 5, NA)), 9 | z = c('a', NA, 'z'), 10 | stringsAsFactors = FALSE 11 | ) 12 | replyr_summary(d) 13 | 14 | }) 15 | -------------------------------------------------------------------------------- /tests/testthat/test_replyr_testCols.R: -------------------------------------------------------------------------------- 1 | library('replyr') 2 | 3 | context("testCols") 4 | 5 | test_that("test_replyr_testCols.R", { 6 | d <- data.frame(x = c(1, 2), y = c('a', 'b')) 7 | replyr_testCols(d, is.numeric) 8 | 9 | }) 10 | -------------------------------------------------------------------------------- /tests/testthat/test_replyr_uniqueValues.R: -------------------------------------------------------------------------------- 1 | library('replyr') 2 | 3 | context("unique values") 4 | 5 | test_that("test_replyr_uniqueValues.R", { 6 | d <- data.frame(x = c(1, 2, 3, 3)) 7 | replyr_uniqueValues(d, 'x') 8 | }) 9 | -------------------------------------------------------------------------------- /tools/replyrs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WinVector/replyr/681693f875bcf490d6651c20dadad1db0fa75d9e/tools/replyrs.png -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | jPlanController.png 2 | jPlanDependencySorting.png -------------------------------------------------------------------------------- /vignettes/DependencySorting_d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WinVector/replyr/681693f875bcf490d6651c20dadad1db0fa75d9e/vignettes/DependencySorting_d.png -------------------------------------------------------------------------------- /vignettes/DependencySorting_ig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WinVector/replyr/681693f875bcf490d6651c20dadad1db0fa75d9e/vignettes/DependencySorting_ig.png -------------------------------------------------------------------------------- /vignettes/joinController1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WinVector/replyr/681693f875bcf490d6651c20dadad1db0fa75d9e/vignettes/joinController1.png -------------------------------------------------------------------------------- /vignettes/letExample.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "let Example" 3 | author: "Nina Zumel" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{let Example} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r, echo = FALSE} 13 | knitr::opts_chunk$set( 14 | collapse = TRUE, 15 | comment = " # ", 16 | fig.width = 7 17 | ) 18 | options(width =100) 19 | ``` 20 | 21 | An example of using `let` to wrap `dplyr` expressions as functions. 22 | 23 | Note: `let` has been moved to the [`wrapr` package](https://github.com/WinVector/wrapr). 
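Since the move, the same construct can be called from `wrapr` directly. A minimal sketch (illustrative only; it assumes `wrapr` and `dplyr` are installed, and mirrors the usage shown later in this vignette):

```{r wrapr_let_sketch, eval=FALSE}
# substitute the concrete column name 'rank' for the alias RankColumn,
# then evaluate the dplyr expression with the substituted names
d <- data.frame(rank = c(1, 2))
wrapr::let(
  alias = list(RankColumn = 'rank'),
  expr = {
    dplyr::mutate(d, RankColumn = RankColumn - 1)
  })
```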
24 | 25 | ```{r setup, warning=FALSE, message=FALSE} 26 | library("dplyr") 27 | library("replyr") 28 | ``` 29 | 30 | The desired task: write a function that takes a data frame with a specified 31 | numerical column and an optional grouping column, and returns a data frame with 32 | one row per group containing: 33 | 34 | * the mean value 35 | * the upper and lower bounds of a +/- 1 standard deviation around the mean 36 | * the median value 37 | * the upper and lower bounds of an interval +/- one-half the IQR around the median. 38 | 39 | The `dplyr` expression for such a table is easy when the column names 40 | are known, but complicated when they are not. We use `wrapr::let` to write such a 41 | function without the use of `lazyeval` or `rlang`/`tidyeval`. 42 | 43 | ```{r sumstat_intervals} 44 | sumstat_intervals = function(dframe, colname, groupcolname = NULL) { 45 | mapping = list(COLNAME = colname, 46 | GROUPCOLNAME = groupcolname) 47 | let(alias = mapping, 48 | { 49 | if(!is.null(groupcolname)) { 50 | dframe <- group_by(dframe, GROUPCOLNAME) 51 | } 52 | summarize(dframe, 53 | sdlower = mean(COLNAME)-sd(COLNAME), 54 | mean = mean(COLNAME), 55 | sdupper = mean(COLNAME) + sd(COLNAME), 56 | iqrlower = median(COLNAME)-0.5*IQR(COLNAME), 57 | median = median(COLNAME), 58 | iqrupper = median(COLNAME)+0.5*IQR(COLNAME)) 59 | }) 60 | } 61 | ``` 62 | 63 | We can test `sumstat_intervals` on `iris`: 64 | 65 | ```{r iris1} 66 | sumstat_intervals(iris, "Sepal.Length") 67 | ``` 68 | 69 | ```{r iris2} 70 | sumstat_intervals(iris, "Sepal.Length", "Species") 71 | ``` 72 | 73 | ```{r iris3} 74 | sumstat_intervals(iris, "Petal.Length", "Species") 75 | ``` 76 | 77 | -------------------------------------------------------------------------------- /vignettes/replyrs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WinVector/replyr/681693f875bcf490d6651c20dadad1db0fa75d9e/vignettes/replyrs.png -------------------------------------------------------------------------------- /vignettes/summary.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "summary" 3 | author: "John Mount" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{summary} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r, echo = FALSE} 13 | knitr::opts_chunk$set( 14 | collapse = TRUE, 15 | comment = " # " 16 | ) 17 | options(width =100) 18 | ``` 19 | 20 | `replyr_summary` example. 21 | 22 | `replyr_summary` works on various data sources, counts NA, and returns a data.frame (instead of text). 23 | 24 | 25 | ```{r example} 26 | 27 | d <- data.frame(x=c(NA,'b'), y=c(1,NA), stringsAsFactors= FALSE) 28 | 29 | summary(d) 30 | 31 | replyr::replyr_summary(d) 32 | 33 | execute_vignette <- requireNamespace("RSQLite", quietly = TRUE) 34 | if(execute_vignette) { 35 | my_db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") 36 | RSQLite::initExtension(my_db) 37 | dbData <- dplyr::copy_to(my_db, d) 38 | 39 | summary(dbData) 40 | 41 | replyr::replyr_summary(dbData) 42 | 43 | # glimpse works more like str or head 44 | dplyr::glimpse(dbData) 45 | } 46 | 47 | ``` 48 | 49 | ```{r cleanup} 50 | DBI::dbDisconnect(my_db) 51 | rm(list=ls()) 52 | gc() 53 | ``` --------------------------------------------------------------------------------
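One caveat on the final cleanup chunk of `vignettes/summary.Rmd` above: `my_db` is only created when `RSQLite` is available, so the unconditional `DBI::dbDisconnect(my_db)` can error on systems without that package. A guarded variant (a sketch only, reusing the `execute_vignette` flag defined in the example chunk) could be:

```r
# only disconnect when the in-memory SQLite connection was actually opened
if (execute_vignette) {
  DBI::dbDisconnect(my_db)
}
rm(list = ls())
gc()
```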