├── DESCRIPTION ├── INSTALL ├── NAMESPACE ├── R ├── sqldf.R └── zzz.R ├── README.md ├── demo ├── 00Index ├── sqldf-groupchoose.R └── sqldf-unitTests.R ├── inst ├── NEWS ├── THANKS ├── csv.awk ├── trcomma2dot.vbs └── unitTests │ └── runit.all.R ├── man ├── read.csv.sql.Rd ├── sqldf-package.Rd └── sqldf.Rd └── tests └── doSvUnit.R /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: sqldf 2 | Version: 0.4-11 3 | Date: 2017-06-23 4 | Title: Manipulate R Data Frames Using SQL 5 | Author: G. Grothendieck 6 | Maintainer: G. Grothendieck 7 | Description: The sqldf() function is typically passed a single argument which 8 | is an SQL select statement where the table names are ordinary R data 9 | frame names. sqldf() transparently sets up a database, imports the 10 | data frames into that database, performs the SQL select or other 11 | statement and returns the result using a heuristic to determine which 12 | class to assign to each column of the returned data frame. The sqldf() 13 | or read.csv.sql() functions can also be used to read filtered files 14 | into R even if the original files are larger than R itself can handle. 15 | 'RSQLite', 'RH2', 'RMySQL' and 'RPostgreSQL' backends are supported. 16 | ByteCompile: true 17 | Depends: R (>= 3.1.0), gsubfn (>= 0.6), proto, RSQLite 18 | Suggests: RH2, RMySQL, RPostgreSQL, svUnit, tcltk, MASS 19 | Imports: DBI, chron 20 | License: GPL-2 21 | BugReports: https://github.com/ggrothendieck/sqldf/issues 22 | URL: https://github.com/ggrothendieck/sqldf, https://groups.google.com/group/sqldf 23 | 24 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | INSTALLING FROM CRAN 2 | 3 | sqldf is installed from CRAN like this: 4 | 5 | install.packages("sqldf") 6 | 7 | This will download and install sqldf and all its dependencies including the 8 | RSQLite driver. 
The RSQLite driver includes SQLite itself so the single
line given above is all that is needed to install everything you need.


INSTALLING FROM GITHUB

The development version of sqldf can be installed from github like this:

library(devtools)
install_github("ggrothendieck/sqldf")


BACKENDS

sqldf can also be used with alternate backends: H2, PostgreSQL and MySQL.

H2 is the next easiest to install since the H2 software itself is included
right in the RH2 R package providing the driver. Just install RH2 (which
in turn requires that you install java).

See FAQ #12 on the sqldf github home page for information on installing and
using PostgreSQL with sqldf.
https://github.com/ggrothendieck/sqldf


PROBLEMS

sqldf uses gsubfn which can optionally use the R tcltk package but can also run
without that package substituting R code for it. On Windows and most other R
installations, the tcltk package comes with the core of R and so you
automatically have it installed when you install R itself. If the tcltk
package is not present this fact will be automatically detected and it will
automatically switch to R code.

There have been reports of bad R builds in which R reports that it has tcltk
capability even though it is missing. In that case you will get an error message
about tcltk being missing. To address this issue it is easiest to just run this
command prior to sqldf to force gsubfn to use R code in place of tcltk:
options(gsubfn.engine = "R")

See FAQ #5 on the sqldf github home page for more info.
https://github.com/ggrothendieck/sqldf


MORE INFO

There is more information on sqldf on the sqldf github home page.
Note the TROUBLESHOOTING, FAQs and EXAMPLES sections there.
56 | 57 | https://github.com/ggrothendieck/sqldf 58 | 59 | There is also information in the help files: 60 | 61 | library(help = sqldf) 62 | ?sqldf 63 | ?read.csv.sql 64 | 65 | There is also info on the individual driver packages here: 66 | 67 | http://cran.r-project.org/package=RSQLite 68 | http://cran.r-project.org/package=RH2 69 | http://cran.r-project.org/package=RPostgreSQL 70 | http://cran.r-project.org/package=RMySQL 71 | 72 | 73 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Default NAMESPACE created by R 2 | # Remove the previous line if you edit this file 3 | 4 | # Export all names 5 | exportPattern(".") 6 | 7 | # Import all packages listed as Imports or Depends 8 | import( 9 | DBI, 10 | RSQLite, 11 | gsubfn, 12 | proto, 13 | chron 14 | ) 15 | 16 | importFrom("utils", "download.file", "head", "modifyList") 17 | -------------------------------------------------------------------------------- /R/sqldf.R: -------------------------------------------------------------------------------- 1 | 2 | sqldf <- function(x, stringsAsFactors = FALSE, 3 | row.names = FALSE, envir = parent.frame(), 4 | method = getOption("sqldf.method"), 5 | file.format = list(), dbname, drv = getOption("sqldf.driver"), 6 | user, password = "", host = "localhost", port, 7 | dll = getOption("sqldf.dll"), connection = getOption("sqldf.connection"), 8 | verbose = isTRUE(getOption("sqldf.verbose"))) { 9 | 10 | # as.POSIXct.numeric <- function(x, origin = "1970-01-01 00:00:00", ...) 11 | # base::as.POSIXct.numeric(x, origin = origin, ...) 12 | as.POSIXct.numeric <- function(x, ...) 
13 | structure(x, class = c("POSIXct", "POSIXt")) 14 | as.POSIXct.character <- function(x) structure(as.numeric(x), 15 | class = c("POSIXct", "POSIXt")) 16 | as.Date.character <- function(x) structure(as.numeric(x), class = "Date") 17 | as.Date2 <- function(x) UseMethod("as.Date2") 18 | as.Date2.character <- function(x) base::as.Date.character(x) 19 | as.Date.numeric <- function(x, origin = "1970-01-01", ...) base::as.Date.numeric(x, origin = origin, ...) 20 | as.dates.character <- function(x) structure(as.numeric(x), class = c("dates", "times")) 21 | as.times.character <- function(x) structure(as.numeric(x), class = "times") 22 | 23 | name__class <- function(data, ...) { 24 | if (is.null(data)) return(data) 25 | cls <- sub(".*__([^_]+)|.*", "\\1", names(data)) 26 | f <- function(i) { 27 | if (cls[i] == "") { 28 | data[[i]] 29 | } else { 30 | fun_name <- paste("as", cls[i], sep = ".") 31 | fun <- mget(fun_name, envir = environment(), 32 | mode = "function", ifnotfound = NA, inherits = TRUE)[[1]] 33 | if (identical(fun, NA)) data[[i]] else { 34 | # strip _class off name 35 | names(data)[i] <<- sub("__[^_]+$", "", names(data)[i]) 36 | fun(data[[i]]) 37 | } 38 | } 39 | } 40 | data[] <- lapply(1:NCOL(data), f) 41 | data 42 | } 43 | 44 | colClass <- function(data, cls) { 45 | if (is.null(data)) return(data) 46 | if (is.list(cls)) cls <- unlist(cls) 47 | cls <- rep(cls, length = length(data)) 48 | f <- function(i) { 49 | if (cls[i] == "") { 50 | data[[i]] 51 | } else { 52 | fun_name <- paste("as", cls[i], sep = ".") 53 | fun <- mget(fun_name, envir = environment(), 54 | mode = "function", ifnotfound = NA, inherits = TRUE)[[1]] 55 | if (identical(fun, NA)) data[[i]] else { 56 | # strip _class off name 57 | names(data)[i] <<- sub("__[^_]+$", "", names(data)[i]) 58 | fun(data[[i]]) 59 | } 60 | } 61 | } 62 | data[] <- lapply(1:NCOL(data), f) 63 | data 64 | } 65 | 66 | overwrite <- FALSE 67 | 68 | request.open <- missing(x) && is.null(connection) 69 | request.close <- 
missing(x) && !is.null(connection) 70 | request.con <- !missing(x) && !is.null(connection) 71 | request.nocon <- !missing(x) && is.null(connection) 72 | 73 | dfnames <- fileobjs <- character(0) 74 | 75 | 76 | if (!is.list(method)) method <- list(method, NULL) 77 | to.df <- method[[1]] 78 | to.db <- method[[2]] 79 | 80 | # if exactly one of x and connection are missing then close on exit 81 | if (request.close || request.nocon) { 82 | 83 | on.exit({ 84 | dbPreExists <- attr(connection, "dbPreExists") 85 | dbname <- attr(connection, "dbname") 86 | if (!missing(dbname) && !is.null(dbname) && dbname == ":memory:") { 87 | if (verbose) { 88 | cat("sqldf: dbDisconnect(connection)\n") 89 | } 90 | dbDisconnect(connection) 91 | } else if (!dbPreExists && drv == "sqlite") { 92 | # data base not pre-existing 93 | if (verbose) { 94 | cat("sqldf: dbDisconnect(connection)\n") 95 | cat("sqldf: file.remove(dbname)\n") 96 | } 97 | dbDisconnect(connection) 98 | file.remove(dbname) 99 | } else { 100 | # data base pre-existing 101 | 102 | for (nam in dfnames) { 103 | if (verbose) { 104 | cat("sqldf: dbRemoveTable(connection, ", nam, ")\n") 105 | } 106 | dbRemoveTable(connection, nam) 107 | } 108 | for (fo in fileobjs) { 109 | if (verbose) { 110 | cat("sqldf: dbRemoveTable(connection, ", fo, ")\n") 111 | } 112 | dbRemoveTable(connection, fo) 113 | } 114 | if (verbose) { 115 | cat("sqldf: dbDisconnect(connection)\n") 116 | } 117 | dbDisconnect(connection) 118 | } 119 | }, add = TRUE) 120 | if (request.close) { 121 | if (identical(connection, getOption("sqldf.connection"))) 122 | options(sqldf.connection = NULL) 123 | return() 124 | } 125 | } 126 | 127 | # if con is missing then connection opened 128 | if (request.open || request.nocon) { 129 | 130 | if (is.null(drv) || drv == "") { 131 | drv <- if ("package:RPostgreSQL" %in% search()) { "PostgreSQL" 132 | } else if ("package:RpgSQL" %in% search()) { "pgSQL" 133 | } else if ("package:RMySQL" %in% search()) { "MySQL" 134 | } else if 
("package:RH2" %in% search()) { "H2" 135 | } else "SQLite" 136 | } 137 | 138 | drv <- sub("^[Rr]", "", drv) 139 | pkg <- paste("R", drv, sep = "") 140 | if (verbose) { 141 | if (!is.loaded(pkg)) cat("sqldf: library(", pkg, ")\n", sep = "") 142 | library(pkg, character.only = TRUE) 143 | } else library(pkg, character.only = TRUE) 144 | 145 | drv <- tolower(drv) 146 | 147 | if (drv == "mysql") { 148 | if (verbose) cat("sqldf: m <- dbDriver(\"MySQL\")\n") 149 | m <- dbDriver("MySQL") 150 | if (missing(dbname) || is.null(dbname)) { 151 | dbname <- getOption("RMySQL.dbname") 152 | if (is.null(dbname)) dbname <- "test" 153 | } 154 | connection <- if (missing(dbname) || dbname == ":memory:") { 155 | dbConnect(m) 156 | } else dbConnect(m, dbname = dbname) 157 | dbPreExists <- TRUE 158 | } else if (drv == "postgresql") { 159 | if (verbose) cat("sqldf: m <- dbDriver(\"PostgreSQL\")\n") 160 | m <- dbDriver("PostgreSQL") 161 | if (missing(user) || is.null(user)) { 162 | user <- getOption("sqldf.RPostgreSQL.user") 163 | if (is.null(user)) user <- "postgres" 164 | } 165 | if (missing(password) || is.null(password)) { 166 | password <- getOption("sqldf.RPostgreSQL.password") 167 | if (is.null(password)) password <- "postgres" 168 | } 169 | if (missing(dbname) || is.null(dbname)) { 170 | dbname <- getOption("sqldf.RPostgreSQL.dbname") 171 | if (is.null(dbname)) dbname <- "test" 172 | } 173 | if (missing(host) || is.null(host)) { 174 | host <- getOption("sqldf.RPostgreSQL.host") 175 | if (is.null(host)) host <- "localhost" 176 | } 177 | if (missing(port) || is.null(port)) { 178 | port <- getOption("sqldf.RPostgreSQL.port") 179 | if (is.null(port)) port <- 5432 180 | } 181 | connection.args <- list(m, user = user, password, 182 | dbname = dbname, host = host, port = port) 183 | connection.args.other <- getOption("sqldf.RPostgreSQL.other") 184 | if (!is.null(connection.args.other)) 185 | connection.args <- modifyList(connection.args, 186 | connection.args.other) 187 | connection <- 
do.call("dbConnect", connection.args) 188 | # connection <- dbConnect(m, user = user, password, dbname = dbname, 189 | # host = host, port = port) 190 | 191 | if (verbose) { 192 | cat(sprintf("sqldf: connection <- dbConnect(m, user='%s', password=<...>, dbname = '%s', host = '%s', port = '%s', ...)\n", user, dbname, host, port)) 193 | if (!is.null(connection.args.other)) { 194 | cat("other connection arguments:\n") 195 | print(connection.args.other) 196 | } 197 | } 198 | dbPreExists <- TRUE 199 | } else if (drv == "pgsql") { 200 | if (verbose) cat("sqldf: m <- dbDriver(\"pgSQL\")\n") 201 | m <- dbDriver("pgSQL") 202 | if (missing(dbname) || is.null(dbname)) { 203 | dbname <- getOption("RpgSQL.dbname") 204 | if (is.null(dbname)) dbname <- "test" 205 | } 206 | connection <- dbConnect(m, dbname = dbname) 207 | dbPreExists <- TRUE 208 | } else if (drv == "h2") { 209 | # jar.file <- "C:\\Program Files\\H2\\bin\\h2.jar" 210 | # jar.file <- system.file("h2.jar", package = "H2") 211 | # m <- JDBC("org.h2.Driver", jar.file, identifier.quote = '"') 212 | if (verbose) cat("sqldf: m <- dbDriver(\"H2\")\n") 213 | # m <- H2() 214 | m <- dbDriver("H2") 215 | if (missing(dbname) || is.null(dbname)) dbname <- ":memory:" 216 | dbPreExists <- dbname != ":memory:" && file.exists(dbname) 217 | connection <- if (missing(dbname) || is.null(dbname) || 218 | dbname == ":memory:") { 219 | dbConnect(m, "jdbc:h2:mem:", "sa", "") 220 | } else { 221 | jdbc.string <- paste("jdbc:h2", dbname, sep = ":") 222 | # dbConnect(m, jdbc.string, "sa", "") 223 | dbConnect(m, jdbc.string) 224 | } 225 | } else { 226 | if (verbose) cat("sqldf: m <- dbDriver(\"SQLite\")\n") 227 | m <- dbDriver("SQLite") 228 | if (missing(dbname) || is.null(dbname)) dbname <- ":memory:" 229 | dbPreExists <- dbname != ":memory:" && file.exists(dbname) 230 | 231 | # search for spatialite extension on PATH and, if found, load it 232 | # if (is.null(getOption("sqldf.dll"))) { 233 | # dll <- Sys.which("libspatialite-2.dll") 234 | # 
if (dll == "") dll <- Sys.which("libspatialite-1.dll") 235 | # dll <- if (dll == "") FALSE else normalizePath(dll) 236 | # options(sqldf.dll = dll) 237 | # } 238 | dll <- getOption("sqldf.dll") 239 | if (length(dll) != 1 || identical(dll, FALSE) || nchar(dll) == 0) { 240 | dll <- FALSE 241 | } else { 242 | if (dll == basename(dll)) dll <- Sys.which(dll) 243 | } 244 | options(sqldf.dll = dll) 245 | 246 | if (!identical(dll, FALSE)) { 247 | if (verbose) { 248 | cat("sqldf: connection <- dbConnect(m, dbname = \"", dbname, 249 | "\", loadable.extensions = TRUE\n", sep = "") 250 | cat("sqldf: select load_extension('", dll, "')\n", sep = "") 251 | } 252 | connection <- dbConnect(m, dbname = dbname, 253 | loadable.extensions = TRUE) 254 | s <- sprintf("select load_extension('%s')", dll) 255 | dbGetQuery(connection, s) 256 | } else { 257 | if (verbose) { 258 | cat("sqldf: connection <- dbConnect(m, dbname = \"", dbname, "\")\n", sep = "") 259 | } 260 | connection <- dbConnect(m, dbname = dbname) 261 | } 262 | if (verbose) cat("sqldf: initExtension(connection)\n") 263 | initExtension(connection) 264 | } 265 | attr(connection, "dbPreExists") <- dbPreExists 266 | if (missing(dbname) && drv == "sqlite") dbname <- ":memory:" 267 | attr(connection, "dbname") <- dbname 268 | if (request.open) { 269 | options(sqldf.connection = connection) 270 | return(connection) 271 | } 272 | } 273 | 274 | # connection was specified 275 | if (request.con) { 276 | drv <- if (inherits(connection, "PostgreSQLConnection")) "PostgreSQL" 277 | else if (inherits(connection, "pgSQLConnection")) "pgSQL" 278 | else if (inherits(connection, "MySQLConnection")) "MySQL" 279 | else if (inherits(connection, "H2Connection")) "H2" 280 | else "SQLite" 281 | drv <- tolower(drv) 282 | dbPreExists <- attr(connection, "dbPreExists") 283 | } 284 | 285 | # engine is "tcl" or "R". 
286 | engine <- getOption("gsubfn.engine") 287 | if (is.null(engine) || is.na(engine) || engine == "") { 288 | engine <- if (requireNamespace("tcltk", quietly = TRUE)) "tcl" else "R" 289 | } else if (engine == "tcl") requireNamespace("tcltk", quietly = TRUE) 290 | 291 | # words. is a list whose ith component contains vector of words in ith stmt 292 | # words is all the words in one long vector without duplicates 293 | # 294 | # If "tcl" is available use the faster strapplyc else strapply 295 | words. <- words <- if (engine == "tcl") { 296 | strapplyc(x, "[[:alnum:]._]+") 297 | } else strapply(x, "[[:alnum:]._]+", engine = "R") 298 | 299 | if (length(words) > 0) words <- unique(unlist(words)) 300 | is.special <- sapply( 301 | mget(words, envir, "any", NA, inherits = TRUE), 302 | function(x) is.data.frame(x) + 2 * inherits(x, "file")) 303 | 304 | # process data frames 305 | dfnames <- words[is.special == 1] 306 | for(i in seq_along(dfnames)) { 307 | nam <- dfnames[i] 308 | if (dbPreExists && !overwrite && dbExistsTable(connection, nam)) { 309 | # exit code removes tables in dfnames 310 | # so only include those added so far 311 | dfnames <- head(dfnames, i-1) 312 | stop(paste("sqldf:", "table", nam, 313 | "already in", dbname, "\n")) 314 | } 315 | # check if the nam processing works with MySQL 316 | # if not then ensure its only applied to SQLite 317 | DF <- as.data.frame(get(nam, envir)) 318 | if (!is.null(to.db) && is.function(to.db)) DF <- to.db(DF) 319 | # if (verbose) cat("sqldf: writing", nam, "to database\n") 320 | if (verbose) cat("sqldf: dbWriteTable(connection, '", nam, "', ", nam, ", row.names = ", row.names, ")\n", sep = "") 321 | dbWriteTable(connection, nam, DF, row.names = row.names) 322 | } 323 | 324 | # process file objects 325 | fileobjs <- if (is.null(file.format)) { character(0) 326 | } else { 327 | eol <- if (.Platform$OS.type == "windows") "\r\n" else "\n" 328 | words[is.special == 2] 329 | } 330 | for(i in seq_along(fileobjs)) { 331 | fo <- 
fileobjs[i] 332 | Filename <- summary(get(fo, envir))$description 333 | if (dbPreExists && !overwrite && dbExistsTable(connection, Filename)) { 334 | # exit code should only remove tables added so far 335 | fileobjs <- head(fileobjs, i-1) 336 | stop(paste("sqldf:", "table", fo, "from file", 337 | Filename, "already in", dbname, "\n")) 338 | } 339 | args <- c(list(conn = connection, name = fo, value = Filename), 340 | modifyList(list(eol = eol), 341 | file.format)) 342 | args <- modifyList(args, as.list(attr(get(fo, envir), "file.format"))) 343 | filter <- args$filter 344 | if (!is.null(filter)) { 345 | args$filter <- NULL 346 | Filename.tmp <- tempfile() 347 | args$value <- Filename.tmp 348 | # for filter = list(cmd, x = ...) replace x in cmd with 349 | # temporary filename for a file created to hold ... 350 | filter.subs <- filter[-1] 351 | # filter subs contains named elements of filter 352 | if (length(filter.subs) > 0) { 353 | filter.subs <- filter.subs[sapply(names(filter.subs), nzchar)] 354 | } 355 | filter.nms <- names(filter.subs) 356 | # create temporary file names 357 | filter.tempfiles <- sapply(filter.nms, tempfile) 358 | cmd <- filter[[1]] 359 | # write out temporary file & substitute temporary file name into cmd 360 | for(nm in filter.nms) { 361 | cat(filter.subs[[nm]], file = filter.tempfiles[[nm]]) 362 | cmd <- gsub(nm, filter.tempfiles[[nm]], cmd, fixed = TRUE) 363 | } 364 | cmd <- if (nchar(Filename) > 0) 365 | sprintf('%s < "%s" > "%s"', cmd, Filename, Filename.tmp) 366 | else sprintf('%s > "%s"', cmd, Filename.tmp) 367 | 368 | # on Windows platform preface command with cmd /c 369 | if (.Platform$OS.type == "windows") { 370 | cmd <- paste("cmd /c", cmd) 371 | if (FALSE) { 372 | key <- "SOFTWARE\\R-core" 373 | show.error.messages <- getOption("show.error.message") 374 | options(show.error.messages = FALSE) 375 | reg <- try(readRegistry(key, maxdepth = 3)$Rtools$InstallPath) 376 | reg <- NULL 377 | options(show.error.messages = show.error.messages) 
378 | # add Rtools bin directory to PATH if found in registry 379 | if (!is.null(reg) && !inherits(reg, "try-error")) { 380 | Rtools.path <- file.path(reg, "bin", fsep = "\\") 381 | path <- Sys.getenv("PATH") 382 | on.exit(Sys.setenv(PATH = path), add = TRUE) 383 | path.new <- paste(path, Rtools.path, sep = ";") 384 | Sys.setenv(PATH = path.new) 385 | } 386 | } 387 | } 388 | if (verbose) cat("sqldf: system(\"", cmd, "\")\n", sep = "") 389 | system(cmd) 390 | for(fn in filter.tempfiles) file.remove(fn) 391 | } 392 | if (verbose) cat("sqldf: dbWriteTable(", toString(args), ")\n") 393 | do.call("dbWriteTable", args) 394 | } 395 | 396 | # SQLite can process all statements using dbGetQuery. 397 | # Other databases process select/call/show with dbGetQuery and other 398 | # statements with dbSendQuery. 399 | if (drv == "sqlite" || drv == "mysql" || drv == "postgresql") { 400 | for(xi in x) { 401 | if (verbose) { 402 | cat("sqldf: dbGetQuery(connection, '", xi, "')\n", sep = "") 403 | } 404 | rs <- dbGetQuery(connection, xi) 405 | } 406 | } else { 407 | for(i in seq_along(x)) { 408 | if (length(words.[[i]]) > 0) { 409 | dbGetQueryWords <- c("select", "show", "call", "explain", 410 | "with") 411 | if (tolower(words.[[i]][1]) %in% dbGetQueryWords || drv != "h2") { 412 | if (verbose) { 413 | cat("sqldf: dbGetQuery(connection, '", x[i], "')\n", sep = "") 414 | } 415 | rs <- dbGetQuery(connection, x[i]) 416 | } else { 417 | if (verbose) { 418 | cat("sqldf: dbSendUpdate:", x[i], "\n") 419 | } 420 | rs <- get("dbSendUpdate")(connection, x[i]) 421 | } 422 | } 423 | } 424 | } 425 | 426 | if (is.null(to.df)) to.df <- "auto" 427 | if (is.function(to.df)) return(to.df(rs)) 428 | # to.df <- match.arg(to.df, c("auto", "raw", "name__class")) 429 | if (identical(to.df, "raw")) return(rs) 430 | if (identical(to.df, "name__class")) return(do.call("name__class", list(rs))) 431 | if (!identical(to.df, "nofactor") && !identical(to.df, "auto")) { 432 | return(do.call("colClass", list(rs, 
to.df))) 433 | } 434 | # process row_names 435 | row_names_name <- grep("row[_.]names", names(rs), value = TRUE) 436 | if (length(row_names_name) > 1) warning(paste("ambiguity regarding row names:", row_names_name)) 437 | row_names_name <- row_names_name[1] 438 | # rs <- if ("row_names" %in% names(rs)) { 439 | rs <- if (!is.na(row_names_name)) { 440 | if (identical(row.names, FALSE)) { 441 | # subset(rs, select = - row_names) 442 | rs[names(rs) != row_names_name] 443 | } else { 444 | rn <- rs[[row_names_name]] 445 | # rs <- subset(rs, select = - row_names) 446 | rs <- rs[names(rs) != row_names_name] 447 | if (all(regexpr("^[[:digit:]]*$", rn) > 0)) 448 | rn <- as.integer(rn) 449 | rownames(rs) <- rn 450 | rs 451 | } 452 | } else rs 453 | 454 | # fix up column classes 455 | # 456 | # For each column name in the result, match it against each column name 457 | # of each data frame in envir. 458 | # 459 | # tab has one row for each column in each data frame with columns being: 460 | # (1) data frame name, 461 | # (2) column name, 462 | # (3) class vector concatenated using toString, 463 | # (4) levels concatenated using toString 464 | # 465 | tab <- do.call("rbind", 466 | lapply(dfnames, 467 | # calculate tab for one data frame 468 | function(dfname) { 469 | df <- get(dfname, envir) 470 | nms <- names(df) 471 | do.call("rbind", 472 | lapply(seq_along(df), 473 | # calculate row in tab for one column of one data frame 474 | function(j) { 475 | column <- df[[j]] 476 | cbind(dfname, nms[j], toString(class(column)), 477 | toString(levels(column))) 478 | } 479 | ) 480 | ) 481 | } 482 | ) 483 | ) 484 | 485 | # each row is a unique column name/class vector/levels combo 486 | tabu <- unique(tab[,-1,drop=FALSE]) 487 | 488 | # dup is vector of column names that appear more than once in tabu. 489 | # Such column names have conflicting classes or levels and therefore 490 | # cannot form the basis of unique class and level assignments. 
491 | dup <- unname(tabu[duplicated(tabu[,1]), 1]) 492 | 493 | # cat("tab:\n") 494 | # print(tab) 495 | # cat("tabu:\n") 496 | # print(tabu) 497 | # cat("dup:\n") 498 | # print(dup) 499 | 500 | auto <- function(i) { 501 | cn <- colnames(rs)[[i]] 502 | if (! cn %in% dup && 503 | (ix <- match(cn, tab[, 2], nomatch = 0)) > 0) { 504 | df <- get(tab[ix, 1], envir) 505 | if (inherits(df[[cn]], "ordered")) { 506 | if (identical(to.df, "auto")) { 507 | u <- unique(rs[[i]]) 508 | levs <- levels(df[[cn]]) 509 | if (all(u %in% levs)) 510 | return(factor(rs[[i]], levels = levels(df[[cn]]), 511 | ordered = TRUE)) 512 | else return(rs[[i]]) 513 | } else return(rs[[i]]) 514 | } else if (inherits(df[[cn]], "factor")) { 515 | if (identical(to.df, "auto")) { 516 | u <- unique(rs[[i]]) 517 | levs <- levels(df[[cn]]) 518 | if (all(u %in% levs)) 519 | return(factor(rs[[i]], levels = levels(df[[cn]]))) 520 | else return(rs[[i]]) 521 | } else return(rs[[i]]) 522 | } else if (inherits(df[[cn]], "POSIXct")) 523 | return(as.POSIXct(rs[[i]])) 524 | else if (inherits(df[[cn]], "times")) 525 | return(as.times.character(rs[[i]])) 526 | else { 527 | asfn <- paste("as", 528 | class(df[[cn]]), sep = ".") 529 | asfn <- match.fun(asfn) 530 | return(asfn(rs[[i]])) 531 | } 532 | } 533 | if (stringsAsFactors && is.character(rs[[i]])) factor(rs[[i]]) 534 | else rs[[i]] 535 | } 536 | # debug(f) 537 | rs2 <- lapply(seq_along(rs), auto) 538 | rs[] <- rs2 539 | rs 540 | } 541 | 542 | 543 | read.csv.sql <- function(file, sql = "select * from file", 544 | header = TRUE, sep = ",", row.names, eol, skip, filter, nrows, field.types, 545 | colClasses, dbname = tempfile(), drv = "SQLite", ...) 
{ 546 | file.format <- list(header = header, sep = sep) 547 | if (!missing(eol)) 548 | file.format <- append(file.format, list(eol = eol)) 549 | if (!missing(row.names)) 550 | file.format <- append(file.format, list(row.names = row.names)) 551 | if (!missing(skip)) 552 | file.format <- append(file.format, list(skip = skip)) 553 | if (!missing(filter)) 554 | file.format <- append(file.format, list(filter = filter)) 555 | if (!missing(nrows)) 556 | file.format <- append(file.format, list(nrows = nrows)) 557 | if (!missing(field.types)) 558 | file.format <- append(file.format, list(field.types = field.types)) 559 | if (!missing(colClasses)) 560 | file.format <- append(file.format, list(colClasses = colClasses)) 561 | pf <- parent.frame() 562 | 563 | if (missing(file) || is.null(file) || is.na(file)) file <- "" 564 | 565 | ## filesheet 566 | tf <- NULL 567 | if ( substring(file, 1, 7) == "http://" || 568 | substring(file, 1, 8) == "https://" || 569 | substring(file, 1, 6) == "ftp://" || 570 | substring(file, 1, 7) == "ftps://" ) { 571 | 572 | tf <- tempfile() 573 | on.exit(unlink(tf), add = TRUE) 574 | # if(verbose) 575 | # cat("Downloading", 576 | # dQuote.ascii(file), " to ", 577 | # dQuote.ascii(tf), "...\n") 578 | download.file(file, tf, mode = "wb") 579 | # if(verbose) cat("Done.\n") 580 | file <- tf 581 | } 582 | 583 | p <- proto(pf, file = file(file)) 584 | p <- do.call(proto, list(pf, file = file(file))) 585 | sqldf(sql, envir = p, file.format = file.format, dbname = dbname, drv = drv, ...) 586 | } 587 | 588 | read.csv2.sql <- function(file, sql = "select * from file", 589 | header = TRUE, sep = ";", row.names, eol, skip, filter, nrows, field.types, 590 | colClasses, dbname = tempfile(), drv = "SQLite", ...) { 591 | 592 | if (missing(filter)) { 593 | filter <- if (.Platform$OS.type == "windows") 594 | paste("cscript /nologo", normalizePath(system.file("trcomma2dot.vbs", package = "sqldf"))) 595 | else "tr , ." 
596 | } 597 | 598 | read.csv.sql(file = file, sql = sql, header = header, sep = sep, 599 | row.names = row.names, eol = eol, skip = skip, filter = filter, 600 | nrows = nrows, field.types = field.types, 601 | colClasses = colClasses, dbname = dbname, drv = drv) 602 | } 603 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | 2 | .onAttach <- function(libname, pkgname) { 3 | 4 | drv <- getOption("sqldf.driver") 5 | drv <- if (is.null(drv) || drv == "") { 6 | 7 | if ("package:RPostgreSQL" %in% search()) { 8 | "PostgreSQL" 9 | } else if ("package:RpgSQL" %in% search()) { 10 | "pgSQL" 11 | } else if ("package:RMySQL" %in% search()) { 12 | "MySQL" 13 | } else if ("package:RH2" %in% search()) { 14 | "H2" 15 | } else "SQLite" 16 | 17 | } else if (!tolower(drv) %in% c("pgsql", "mysql", "h2")) { 18 | "SQLite" 19 | } 20 | if (drv != "SQLite") { 21 | msg <- paste("sqldf will default to using", drv) 22 | packageStartupMessage(msg) 23 | } else { 24 | loadNamespace("RSQLite") 25 | } 26 | } 27 | 28 | # .onUnload <- function(libpath) {} 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | *To write it, it took three months; to conceive it – three minutes; to 2 | collect the data in it – all my life.* [F. Scott 3 | Fitzgerald](https://en.wikipedia.org/wiki/F._Scott_Fitzgerald) 4 | 5 | 6 | **Introduction** 7 | 8 | [sqldf](https://cran.r-project.org/package=sqldf) is an R 9 | package for runing [SQL statements](https://en.wikipedia.org/wiki/SQL) on 10 | R data frames, optimized for convenience. 
The user simply specifies an 11 | SQL statement in R using data frame names in place of table names and a 12 | database with appropriate table layouts/schema is automatically created, 13 | the data frames are automatically loaded into the database, the 14 | specified SQL statement is performed, the result is read back into R and 15 | the database is deleted all automatically behind the scenes making the 16 | database's existence transparent to the user who only specifies the SQL 17 | statement. Surprisingly this can at times 18 | [be](https://stackoverflow.com/questions/1727772/quickly-reading-very-large-tables-as-dataframes-in-r/1820610#1820610) 19 | [even](https://groups.google.com/group/manipulatr/browse_thread/thread/3affbdc5efca9143/d19d7b97ac023ee8?pli=1) 20 | [faster](http://web.archive.org/web/20130511223824/http://stat.ethz.ch/pipermail/r-help/2009-December/221456.html) 21 | [than](http://web.archive.org/web/20130604023900/stat.ethz.ch/pipermail/r-help/2009-December/221513.html) 22 | [the](https://stackoverflow.com/questions/14283566/specific-for-loop-too-slow-in-r/14287476#14287476) 23 | corresponding pure R calculation (although the purpose of the project is 24 | convenience and not speed). [This 25 | link](https://brusers.tumblr.com/post/59706993506/data-manipulation-with-sqldf-paul) 26 | suggests that for aggregations over highly granular columns that sqldf 27 | is faster than another alternative tried. `sqldf` is free software 28 | published under the GNU General Public License that can be downloaded 29 | from [CRAN](https://cran.r-project.org/package=sqldf). 30 | 31 | sqldf supports (1) the [SQLite](https://www.sqlite.org) backend database 32 | (by default), (2) the [H2](https://www.h2database.com) java database, (3) 33 | the [PostgreSQL](https://www.postgresql.org) database and (4) sqldf 0.4-0 34 | onwards also supports [MySQL](https://dev.mysql.com). SQLite, H2, MySQL 35 | and PostgreSQL are free software. 
SQLite and H2 are embedded serverless
zero administration databases that are included right in the R driver
packages,
[RSQLite](https://cran.r-project.org/package=RSQLite) and
[RH2](https://cran.r-project.org/package=RH2), so that
there is no separate installation for either one. A number of [high
profile projects](https://www.sqlite.org/famous.html) use SQLite.
H2 is a java database which contains a large collection of SQL functions
and supports Date and other data types. It is the most popular database
package among [scala
packages](http://blog.takipi.com/the-top-100-most-popular-scala-libraries-based-on-10000-github-projects/).
PostgreSQL is a client/server database and unlike SQLite and H2 must be
separately installed but it has a particularly powerful version of SQL,
e.g. its
[window](https://developer.postgresql.org/pgdocs/postgres/tutorial-window.html)
[functions](https://developer.postgresql.org/pgdocs/postgres/functions-window.html),
so the extra installation work can be worth it. sqldf supports the
`RPostgreSQL` driver in R. Like PostgreSQL, MySQL is a client/server
database that must be installed independently so it's not as easy to
install as SQLite or H2 but it's very popular and is widely used as the
back end for web sites.

The information below mostly concerns the default SQLite database. The
use of H2 with sqldf is discussed in [FAQ
\#10](https://code.google.com/p/sqldf/#10.__What_are_some_of_the_differences_between_using_SQLite_and_H)
which discusses differences between using sqldf with SQLite and H2 and
also shows how to modify the code in the [Examples](#Examples) section
to use sqldf/H2 rather than sqldf/SQLite. There is some information on
using PostgreSQL with sqldf in [FAQ
\#12](https://code.google.com/p/sqldf/#12._How_does_one_use_sqldf_with_PostgreSQL?)
and an example in [Example 17.
66 | Lag](https://code.google.com/p/sqldf/#Example_17._Lag) . The unit tests 67 | provide examples that can work with all five data base drivers (covering 68 | four databases) supported by sqldf. They are run by loading whichever 69 | database is to be tested (SQLite is the default) and running: 70 | `demo("sqldf-unitTests")` 71 | 72 | [Overview](#Overview) 73 | 74 | [Citing sqldf](#Citing_sqldf) 75 | 76 | [For Those New to R](#For_Those_New_to_R) 77 | 78 | [News](#news) 79 | 80 | [Troubleshooting](#troubleshooting) 81 | 82 | - [Problem is that installer gives message that sqldf is not 83 | available](#problem_is_that_installer_gives_message_that_sqldf_is_not_availa) 84 | - [Problem with no argument form of sqldf - 85 | sqldf()](#problem_with_no_argument_form_of_sqldf_-_sqldf()) 86 | - [Problem involvling tcltk](#problem_involvling_tcltk) 87 | 88 | [FAQ](#faq) 89 | 90 | - [1. How does sqldf handle classes and 91 | factors?](#1-how-does-sqldf-handle-classes-and-factors) 92 | - [2. Why does sqldf seem to mangle certain variable 93 | names?](#2-why-does-sqldf-seem-to-mangle-certain-variable-names) 94 | - [3. Why does sqldf("select var(x) from DF") not 95 | work?](#3-why-does-sqldfselect-varx-from-df-not-work) 96 | - [4. How does sqldf work with "Date" class 97 | variables?](#4-how-does-sqldf-work-with-date-class-variables) 98 | - [5. I get a message about the tcltk package being 99 | missing.](#5-i-get-a-message-about-the-tcltk-package-being-missing) 100 | - [6. Why are there problems when we use table names or column names 101 | that are the same except for 102 | case?](#6-why-are-there-problems-when-we-use-table-names-or-column-name) 103 | - [7. Why are there messages about 104 | MySQL?](#7-why-are-there-messages-about-mysql) 105 | - [8. Why am I having problems with 106 | update?](#8-why-am-I-having-problems-with-update) 107 | - [9. How do I examine the layout that SQLite uses for a table? which 108 | tables are in the database? 
which databases are 109 | attached?](#9-how-do-i-examine-the-layout-that-sqlite-uses-for-a-table-which-tables-are-in-the-database-which-databases-are-attached) 110 | - [10. What are some of the differences between using SQLite and H2 111 | with 112 | sqldf?](#10-what-are-some-of-the-differences-between-using-sqlite-and-h2-with-sqldf) 113 | - [11. Why am I having difficulty reading a data file using SQLite and 114 | sqldf?](#11-why-am-i-having-difficulty-reading-a-data-file-using-sqlite-and-sqldf) 115 | - [12. How does one use sqldf with 116 | PostgreSQL?](#12-how-does-one-use-sqldf-with-postgresql) 117 | - [13. How does one deal with quoted fields in read.csv.sql 118 | ?](#13-how-does-one-deal-with-quoted-fields-in-readcsvsql) 119 | - [14. How does one read files where numeric NAs are represented as 120 | missing empty 121 | fields?](#14-how-does-one-read-files-where-numeric-nas-are-represented-as-missing-empty-fields) 122 | - [15. Why do certain calculations come out as integer rather than 123 | double?](#15-why-do-certain-calculations-come-out-as-integer-rather-than-double) 124 | - [16. How can one read a file off the net or a csv file in a zip 125 | file?](#16-how-can-one-read-a-file-off-the-net-or-a-csv-file-in-a-zip-file) 126 | 127 | [Examples](#examples) 128 | 129 | - [Example 1. Ordering and 130 | Limiting](#example-1-ordering-and-limiting) 131 | - [Example 2. Averaging and 132 | Grouping](#example-2-averaging-and-grouping) 133 | - [Example 3. Nested Select](#example-3-nested-select) 134 | - [Example 4. Join](#example-4-join) 135 | - [Example 5. Insert Variables](#example-5-insert-variables) 136 | - [Example 6. File Input](#example-6-file-input) 137 | - [Example 7. Nested Select](#example-7-nested-select) 138 | - [Example 8. Specifying File 139 | Format](#example-8-specifying-file-format) 140 | - [Example 9. Working with 141 | Databases](#example-9-working-with-databases) 142 | - [Example 10. 
Persistent 143 | Connections](#example-10-persistent-connections) 144 | - [Example 11. Between and 145 | Alternatives](#example-11-between-and-alternatives) 146 | - [Example 12. Combine two files in permanent 147 | database](#example-12-combine-two-files-in-permanent-database) 148 | - [Example 13. read.csv.sql and 149 | read.csv2.sql](#example-13-readcsvsql-and-readcsv2sql) 150 | - [Example 14. Use of spatialite library 151 | functions](#example-14-use-of-spatialite-library-functions) 152 | - [Example 15. Use of RSQLite.extfuns library 153 | functions](#example-15-use-of-rsqliteextfuns-library-functions) 154 | - [Example 16. Moving Average](#example-16-moving-average) 155 | - [Example 17. Lag](#example-17-lag) 156 | - [Example 18. MySQL Schema 157 | Information](#Example-18-mysql-schema-information) 158 | 159 | [Links](#links) 160 | 161 | Overview[](#overview) 162 | ===================== 163 | 164 | [sqldf](https://cran.r-project.org/package=sqldf) is an R 165 | package for running [SQL statements](https://en.wikipedia.org/wiki/SQL) 166 | on R data frames, optimized for convenience. `sqldf` works with the 167 | [SQLite](https://www.sqlite.org/), [H2](https://www.h2database.com), 168 | [PostgreSQL](https://www.postgresql.org) or 169 | [MySQL](https://dev.mysql.com/doc/) databases. SQLite has the least 170 | prerequisites to install. H2 is just as easy if you have Java installed 171 | and also supports Date class and a few additional functions. PostgreSQL 172 | notably supports Windowing functions providing the SQL analogue of the R 173 | ave function. MySQL is a particularly popular database that drives many 174 | web sites. 175 | 176 | More information can be found from within R by installing and loading 177 | the sqldf package and then entering 178 | [?sqldf](https://cran.r-project.org/package=sqldf/sqldf.pdf) and 179 | [?read.csv.sql](https://cran.r-project.org/package=sqldf/sqldf.pdf). 
180 | A number of [examples](#Examples) are on this page and more examples are 181 | accessible from within R in the examples section of the 182 | [?sqldf](https://cran.r-project.org/package=sqldf/sqldf.pdf) help 183 | page. 184 | 185 | As seen from this example which uses the built in `BOD` data frame: 186 | 187 | ```r 188 | library(sqldf) 189 | sqldf("select * from BOD where Time > 4") 190 | ``` 191 | 192 | with `sqldf` the user is freed from having to do the following, all of 193 | which are automatically done: 194 | 195 | - database setup 196 | - writing the `create table` statement which defines each table 197 | - importing and exporting to and from the database 198 | - coercing of the returned columns to the appropriate class in common 199 | cases 200 | 201 | It can be used for: 202 | 203 | - learning SQL if you know R 204 | - learning R if you know SQL 205 | - as an alternate syntax for data frame manipulation, particularly for 206 | purposes of speeding these up, since sqldf with SQLite as the 207 | underlying database is often faster than performing the same 208 | manipulations in straight R 209 | - reading portions of large files into R without reading the entire 210 | file (example 6b and example 13 below show two different ways and 211 | examples 6e, 6f below show how to read random portions of a file) 212 | 213 | In the case of SQLite it consists of a thin layer over the 214 | [RSQLite](https://cran.r-project.org/package=RSQLite) 215 | [DBI](https://cran.r-project.org/package=DBI) interface to SQLite 216 | itself. 217 | 218 | In the case of H2 it works on top of the 219 | [RH2](https://cran.r-project.org/package=RH2) 220 | [DBI](https://cran.r-project.org/package=DBI) driver which in turn 221 | uses RJDBC and JDBC to interface to H2 itself. 222 | 223 | In the case of PostgreSQL it works on top of the 224 | [RPostgreSQL](https://cran.r-project.org/package=RPostgreSQL) 225 | [DBI](https://cran.r-project.org/package=DBI) driver. 
226 | 227 | There is also some untested code in sqldf for use with the 228 | [MySQL](https://www.mysql.com) database using the 229 | [RMySQL](https://cran.r-project.org/package=RMySQL) 230 | [DBI](https://cran.r-project.org/package=DBI) driver. 231 | 232 | Citing sqldf[](#Citing_sqldf) 233 | ============================= 234 | 235 | To get information on how to cite `sqldf` in papers, issue the R 236 | commands: 237 | 238 | ```r 239 | library(sqldf) 240 | citation("sqldf") 241 | ``` 242 | 243 | For Those New to R[](#For_Those_New_to_R) 244 | ========================================= 245 | 246 | If you have not used R before and want to try sqldf with SQLite, [google 247 | for single letter R](https://www.r-project.org), download R, install it 248 | on Windows, Mac or UNIX/Linux and then start R and at R console enter 249 | this: 250 | 251 | ```r 252 | # installs everything you need to use sqldf with SQLite 253 | # including SQLite itself 254 | install.packages("sqldf") 255 | # shows built in data frames 256 | data() 257 | # load sqldf into workspace 258 | library(sqldf) 259 | sqldf("select * from iris limit 5") 260 | sqldf("select count(*) from iris") 261 | sqldf("select Species, count(*) from iris group by Species") 262 | # create a data frame 263 | DF <- data.frame(a = 1:5, b = letters[1:5]) 264 | sqldf("select * from DF") 265 | sqldf("select avg(a) mean, variance(a) var from DF") # see example 15 266 | ``` 267 | 268 | To try it with H2 rather than SQLite the process is similar. Ensure that 269 | you have the [java](https://java.sun.com) runtime installed, install R as 270 | above and start R. 
From within R enter this ensuring that the version of 271 | RH2 that you have is RH2 0.1-2.6 or later: 272 | 273 | ```r 274 | # installs everything including H2 275 | install.packages("sqldf", dep = TRUE) 276 | # load RH2 driver and sqldf into workspace 277 | library(RH2) 278 | packageVersion("RH2") # should be version 0.1-2-6 or later 279 | library(sqldf) 280 | # 281 | sqldf("select * from iris limit 5") 282 | sqldf("select count(*) from iris") 283 | sqldf("select Species, count(*) from iris group by Species") 284 | DF <- data.frame(a = 1:5, b = letters[1:5]) 285 | sqldf("select * from DF") 286 | sqldf("select avg(a) mean, var_samp(a) var from DF") 287 | ``` 288 | 289 | Troubleshooting[](#Troubleshooting) 290 | =================================== 291 | 292 | sqldf has been 293 | [extensively](https://cran.r-project.org/web/checks/check_results_sqldf.html) 294 | [tested](https://code.google.com/p/sqldf/source/browse/trunk/inst/unitTests/runit.all.R) 295 | with multiple architectures and database back ends but there are no 296 | guarantees. 297 | 298 | Problem is that installer gives message that sqldf is not available[](#Problem_is_that_installer_gives_message_that_sqldf_is_not_availa) 299 | ---------------------------------------------------------------------------------------------------------------------------------------- 300 | 301 | See 302 | [https://stackoverflow.com/questions/27772756/sqldf-doesnt-install-on-ubuntu-14-04](https://stackoverflow.com/questions/27772756/sqldf-doesnt-install-on-ubuntu-14-04) 303 | 304 | Problem with no argument form of sqldf - sqldf()[](#Problem_with_no_argument_form_of_sqldf_-_sqldf()) 305 | ----------------------------------------------------------------------------------------------------- 306 | 307 | The no argument form, i.e. `sqldf()` is used for opening and closing a 308 | connection so that intermediate sqldf statements can all use the same 309 | connection. 
If you have forgotten whether the last `sqldf()` opened or 310 | closed the connection this code will close it if it is open and 311 | otherwise do nothing: 312 | 313 | ```r 314 | # close an old connection if it exists 315 | if (!is.null(getOption("sqldf.connection"))) sqldf() 316 | ``` 317 | 318 | Thanks to Chris Davis 319 | [https://groups.google.com/d/msg/sqldf/-YAvaJnlRrY/7nF8tpBnrcAJ](https://groups.google.com/d/msg/sqldf/-YAvaJnlRrY/7nF8tpBnrcAJ) 320 | for pointing this out. 321 | 322 | Problem involvling tcltk[](#Problem_involvling_tcltk) 323 | ----------------------------------------------------- 324 | 325 | The most common problem is that the tcltk package and tcl/tk itself are 326 | missing. Historically these were bundled with the Windows version of R 327 | so Windows users should not experience any problems on this account. 328 | Since R version 3.0.0 Mac versions of R also have the tcltk package and 329 | Tcl/Tk itself bundled so if you are having a problem on the Mac you may 330 | only need to upgrade to the latest version of R. If upgrading to the 331 | latest version of R does not help then using this line will usually 332 | allow it to work even without the tcltk package and tcl/tk itself: 333 | 334 | ```r 335 | options(gsubfn.engine = "R") 336 | ``` 337 | 338 | Running the above `options` line before using `sqldf`, e.g. put that 339 | options line in your `.Rprofile`, is all that is needed to get sqldf to 340 | work without the tcltk package and tcl/tk itself in most cases; however, 341 | this does have the downside that it will use the R engine which is 342 | slower. An alternative, is to rebuild R yourself as discussed here: 343 | [https://permalink.gmane.org/gmane.comp.lang.r.fedora/235](https://permalink.gmane.org/gmane.comp.lang.r.fedora/235) 344 | 345 | If the above does not resolve the problem then read the more detailed 346 | discussion below. 
A related problem is that your R installation is flawed or incomplete in some way and the main way to fix that is to fix your installation of R.
(2) **tcl/tk missing on 377 | system** (a) If your distribution of R was built with tcltk 378 | capaility as described in the last point but you don't have tcl/tk 379 | itself on your system you can simply install tcl/tk yourself. In 380 | most cases this is actually quite easy to do -- its typically a one 381 | line apt-get on Linux. There is information about installing tcl/tk 382 | near the end of [FAQ 383 | \#5](#5._I_get_a_message_about_the_tcltk_package_being_missing.) or 384 | (b) if your distribution of R was built with tcltk capability as 385 | described in the first point but you don't have tcl/tk on your 386 | system and you don't want to bother to install it then issue the R 387 | command: 388 | 389 | In that case gusbfn will use the slower R engine instead of the faster 390 | tcltk engine so you won't need tcl/tk installed on your system in the 391 | first place. Be sure you are using gsubfn 0.6-4 or later if you use this 392 | option since prior versions of gsubfn had a bug which could interfere 393 | with the use of this option. To check your version of gsubfn: 394 | 395 | ```r 396 | packageVersion("gsubfn") 397 | ``` 398 | 399 | - using an old version of R, sqldf or some other software. If that is 400 | the problem upgrade to the most recent versions [on 401 | CRAN](https://cran.r-project.org/package=sqldf). Also 402 | be sure you are using the latest versions of other packages used by 403 | sqldf. If you are getting NAMESPACE errors then this is likely the 404 | problem. You can find the current version of R 405 | [here](https://cran.r-project.org/mirrors.html) and then install 406 | sqldf from within R using `install.packages("sqldf")` . If you 407 | already have the current version of R and have installed the 408 | packages you want then you can update your installed packages to the 409 | current version by entering this in R: `update.packages()` . 
In most 410 | cases all the mirrors are up to date but if that should fail to 411 | update to the most recent packages on CRAN then try using a more up 412 | to date mirror. 413 | 414 | - unexpected errors concerning H2, MySQL or PostgreSQL. sqldf 415 | automatically uses H2, MySQL or PostgreSQL if the R package RH2, 416 | RMySQL or RpgSQL is loaded, respectively. If none of them are loaded 417 | it uses sqlite. To force it to use sqlite even though one of those 418 | others is loaded (1) add the `drv = "SQLite"` argument to each sqldf 419 | call or (2) issue the R command: 420 | 421 | in which case all sqldf calls will use sqlite. See [FAQ 422 | \#7](#7._Why_are_there_messages_about_MySQL?) for more info. 423 | 424 | - message about tcltk being missing or other tcltk problem. This is 425 | really the same problem discussed in the first point above. Upgrade 426 | to sqldf 0.4-5 or later. If it still persists then set this option: 427 | `options(gsubfn.engine = "R")` which causes R code to be substituted 428 | for the tcl code or else just install the tcltk package. See [FAQ 429 | \#5](#5._I_get_a_message_about_the_tcltk_package_being_missing.) for 430 | more info. If you installed the tcltk package and it still has 431 | problems then remove the tcltk package and try these steps again. 432 | 433 | - error messages regarding a data frame that has a dot in its name. 434 | The dot is an SQL operator. Either quote the name appropriately or 435 | change the name of the data frame to one without a dot. 436 | 437 | - as recommended in the 438 | [INSTALL](https://cran.r-project.org/package=sqldf/INSTALL) file 439 | its better to install sqldf using `install.packages("sqldf")` and 440 | **not** `install.packages("sqldf", dep = TRUE)` since the latter 441 | will try to pull in every R database driver package supported by 442 | sqldf which increases the likelihood of a problem with installation. 
It's unlikely that you need every database that sqldf supports so doing this is really asking for trouble. The recommended way does install SQLite automatically anyway and if you want any of the additional ones just install them separately.
476 | 477 | Previously this section had an example of how the heuristic could go 478 | awry but improvements in the heuristic in sqldf 0.4-0 are such that that 479 | example now works as expected. 480 | 481 | 2. Why does sqldf seem to mangle certain variable names?[](#2._Why_does_sqldf_seem_to_mangle_certain_variable_names?) 482 | --------------------------------------------------------------------------------------------------------------------- 483 | 484 | Staring with RSQLite 1.0.0 and sqldf 0.4-9 dots in column names are no 485 | longer translated to underscores. 486 | 487 | If you are using an older version of these packages then note that since 488 | dot is an SQL operator the RSQLite driver package converts dots to 489 | underscores so that SQL statements can reference such columns unquoted. 490 | 491 | Also note that certain names are SQL keywords. These can be found using 492 | this code: 493 | 494 | ```r 495 | .SQL92Keywords 496 | ``` 497 | 498 | Note that using such names can sometimes result in an error message such 499 | as: 500 | 501 | ```r 502 | Error in sqliteExecStatement(con, statement, bind.data) : 503 | RS-DBI driver: (error in statement: no such column: ...) 504 | ``` 505 | 506 | which appears to suggest that there is no column but that is because it 507 | has a different name than expected. 
For an example of what happens: 508 | 509 | ```r 510 | > # this only applies to old versions of sqldf and DBI 511 | > # based on example by Adrian Dragulescu 512 | > DF <- data.frame(index=1:12, date=rep(c(Sys.Date()-1, Sys.Date()), 6), 513 | + group=c("A","B","C"), value=round(rnorm(12),2)) 514 | > 515 | > library(sqldf) 516 | > sqldf("select * from DF") 517 | index date group value 518 | 1 1 14259.0 A -0.24 519 | 2 2 14260.0 B 0.16 520 | 3 3 14259.0 C 1.24 521 | 4 4 14260.0 A -1.16 522 | 5 5 14259.0 B -0.19 523 | 6 6 14260.0 C 0.65 524 | 7 7 14259.0 A -1.24 525 | 8 8 14260.0 B -0.34 526 | 9 9 14259.0 C -0.27 527 | 10 10 14260.0 A -0.18 528 | 11 11 14259.0 B 0.57 529 | 12 12 14260.0 C -0.83 530 | > intersect(names(DF), tolower(.SQL92Keywords)) 531 | [1] "index" "date" "group" "value" 532 | > DF2 <- DF 533 | > # change column names to i, d, g and v 534 | > names(DF2) <- substr(names(DF), 1, 1) 535 | > sqldf("select * from DF2") 536 | i d g v 537 | 1 1 2009-01-16 A 0.35 538 | 2 2 2009-01-17 B -0.96 539 | 3 3 2009-01-16 C 0.76 540 | 4 4 2009-01-17 A 0.07 541 | 5 5 2009-01-16 B 0.03 542 | 6 6 2009-01-17 C 0.19 543 | 7 7 2009-01-16 A -2.03 544 | 8 8 2009-01-17 B 0.98 545 | 9 9 2009-01-16 C -1.21 546 | 10 10 2009-01-17 A -0.67 547 | 11 11 2009-01-16 B 2.49 548 | 12 12 2009-01-17 C -0.63 549 | ``` 550 | 551 | 3. Why does sqldf("select var(x) from DF") not work?[](#3._Why_does_sqldf("select_var(x)_from_DF")_not_work?) 552 | ------------------------------------------------------------------------------------------------------------- 553 | 554 | The SQL statement passed to sqldf must be a valid SQL statement 555 | understood by the database. The functions that are understood include 556 | simple SQLite functions and aggregate SQLite functions and functions in 557 | the 558 | [RSQLite.extfuns](https://code.google.com/p/sqldf/#Example_15._Use_of_RSQLite.extfuns_library_functions) 559 | package. 
Thus in this case in place of var(x) one could use variance(x) 560 | from the RSQLite.extfuns package. For SQLite functions see the lists of 561 | [core functions](https://www.sqlite.org/lang_corefunc.html), [aggregate 562 | functions](https://www.sqlite.org/lang_aggfunc.html) and [date and time 563 | functions](https://www.sqlite.org/lang_datefunc.html). 564 | 565 | If each group is not too large we can use group\_concat to return all 566 | group members and then later use `apply` in `R` to use R functions to 567 | aggregate results. For example, in the following we summarize the data 568 | using `sqldf` and then `apply` a function based on `var`: 569 | 570 | ```r 571 | > DF <- data.frame(a = 1:8, g = gl(2, 4)) 572 | > out <- sqldf("select group_concat(a) groupa from DF group by g") 573 | > out 574 | groupa 575 | 1 1,2,3,4 576 | 2 5,6,7,8 577 | > out$var <- apply(out, 1, function(x) var(as.numeric(strsplit(x, ",")[[1]]))) 578 | > out 579 | groupa var 580 | 1 1,2,3,4 1.666667 581 | 2 5,6,7,8 1.666667 582 | ``` 583 | 584 | 4. How does sqldf work with "Date" class variables?[](#4._How_does_sqldf_work_with_"Date"_class_variables?) 585 | ----------------------------------------------------------------------------------------------------------- 586 | 587 | The H2 database has specific support for Date class variables so with H2 588 | Date class variables work as expected: 589 | 590 | ```r 591 | > library(RH2) # driver support for dates was added in RH2 version 0.1-2 592 | > library(sqldf) 593 | > test1 <- data.frame(sale_date = as.Date(c("2008-08-01", "2031-01-09", 594 | + "1990-01-03", "2007-02-03", "1997-01-03", "2004-02-04"))) 595 | > as.numeric(test1[[1]]) 596 | [1] 14092 22288 7307 13547 9864 12452 597 | > sqldf("select MAX(sale_date) from test1") 598 | MAX..sale_date.. 599 | 1 2031-01-09 600 | ``` 601 | 602 | In R, `Date` class dates are stored internally as the number of days 603 | since 1970-01-01 -- often referred to as the UNIX Epoch. 
(They are 604 | stored this way on non-UNIX platforms as well.) When the dates are 605 | transferred to SQLite they are stored as these numbers in SQLite. (sqldf 606 | has a heuristic that attempts to ascertain whether the column represents 607 | a Date but if it cannot ascertain this then it returns the numeric 608 | internal version.) 609 | 610 | In SQLite this is what happens: 611 | 612 | The examples below use RSQLite 0.11-0 (prior to that version they would 613 | return wrong answers. With RSQLite it will return the correct answer but 614 | Date class columns will be returned as numeric if sqldf's heuristic 615 | cannot automatically determine if they are to be of class `"Date"`. If 616 | you name the output column the same name as an input column which has 617 | `"Date"` class then it will correctly infer that the output is to be of 618 | class `"Date"` as well. 619 | 620 | ```r 621 | > library(sqldf) 622 | > test1 <- data.frame(sale_date = as.Date(c("2008-08-01", "2031-01-09", 623 | + "1990-01-03", "2007-02-03", "1997-01-03", "2004-02-04"))) 624 | 625 | > as.numeric(test1[[1]]) 626 | [1] 14092 22288 7307 13547 9864 12452 627 | 628 | > # correct except that it returns the numeric internal representation 629 | > dd <- sqldf("select max(sale_date) from test1") 630 | > dd 631 | max(sale_date) 632 | 1 22288 633 | 634 | > # fix it up 635 | > dd[[1]] <- as.Date(dd[[1]], "1970-01-01") 636 | > dd 637 | max(sale_date) 638 | 1 2031-01-09 639 | 640 | > # even better it returns Date class if we name column same as a Date class input column 641 | > sqldf("select max(sale_date) sale_date from test1") 642 | sale_date 643 | 1 2031-01-09 644 | ``` 645 | 646 | Also note this code: 647 | 648 | ```r 649 | > library(sqldf) 650 | > DF <- data.frame(a = Sys.Date() + 1:5, b = 1:5) 651 | > DF 652 | a b 653 | 1 2009-07-31 1 654 | 2 2009-08-01 2 655 | 3 2009-08-02 3 656 | 4 2009-08-03 4 657 | 5 2009-08-04 5 658 | > Sys.Date() + 2 659 | [1] "2009-08-01" 660 | > s <- sprintf("select * 
from DF where a >= %d", Sys.Date() + 2) 661 | > s 662 | [1] "select * from DF where a >= 14457" 663 | > sqldf(s) 664 | a b 665 | 1 2009-08-01 2 666 | 2 2009-08-02 3 667 | 3 2009-08-03 4 668 | 4 2009-08-04 5 669 | 670 | > # to compare against character string store a as character 671 | > DF2 <- transform(DF, a = as.character(a)) 672 | > sqldf("select * from DF2 where a >= '2009-08-01'") 673 | a b 674 | 1 2009-08-01 2 675 | 2 2009-08-02 3 676 | 3 2009-08-03 4 677 | 4 2009-08-04 5 678 | ``` 679 | 680 | See [date and time functions](https://www.sqlite.org/lang_datefunc.html) 681 | for more information. An example using times but not dates can be found 682 | [here](https://stackoverflow.com/questions/8185201/merge-records-over-time-interval/8187602#8187602) 683 | and some discussion on using POSIXct can be found 684 | [here](https://groups.google.com/d/msg/sqldf/N-Xci-eKy3Y/faLa1siY6xYJ) . 685 | 686 | 5. I get a message about the tcltk package being missing.[](#5._I_get_a_message_about_the_tcltk_package_being_missing.) 687 | ----------------------------------------------------------------------------------------------------------------------- 688 | 689 | The sqldf package uses the gsubfn package for parsing and the gsubfn 690 | package optionally uses the tcltk R package which in turn uses string 691 | processing language, tcl, internally. 692 | 693 | If you are getting erorrs about the tcltk R package being missing or 694 | about tcl/tk itself being missing then: 695 | 696 | Windows. This should not occur on Windows with the standard 697 | distributions of R. If it does you likely have a version of R that was 698 | built improperly and you will have to get a complete properly built 699 | version of R that was built to work with tcltk and tcl/tk and includes 700 | tcl/tk itself. 701 | 702 | Mac. This should not occur on **recent** versions of R on Mac. If it 703 | does occur upgrade your R installation to a recent version. 
If you must 704 | use an older version of R on the Mac then get tcl/tk here: 705 | [http://cran.us.r-project.org/bin/macosx/tools/](http://cran.us.r-project.org/bin/macosx/tools/) 706 | 707 | UNIX/Linux. If you don't already have tcl/tk itself on your system try 708 | installing it like this (thanks to Eric Iverson): 709 | 710 | ```r 711 | sudo apt-get install tcl-dev tk-dev 712 | ``` 713 | 714 | Also see this message by Rolf Turner: 715 | [https://stat.ethz.ch/pipermail/r-help/2011-April/274424.html](https://stat.ethz.ch/pipermail/r-help/2011-April/274424.html). 716 | 717 | In some cases it may be possible to bypass the need for tcltk and tcl/tk 718 | altogether by running this command before you run sqldf: 719 | 720 | ```r 721 | options(gsubfn.engine = "R") 722 | ``` 723 | 724 | In that case the gsubfn package will use alternate R code instead of 725 | tcltk (however, it will be slightly slower). 726 | 727 | Notes: sqldf depends on gsubfn for parsing and gsubfn optionally uses 728 | the tcltk R package (tcl is a string processing language) which is 729 | supposed to be included in every R installation. The tcltk R package 730 | relies on tcl/tk itself which is included in all standard distributions 731 | of R on Windows and on **recent** Mac distributions of R. Many Linux 732 | distributions include tcl/tk itself right in the Linux distribution 733 | itself. 734 | 735 | Also note that whatever build of R you are using must have had tcl/tk 736 | present at the time R was built (not just at the time it's used) or else 737 | the R build process will automatically turn off tcltk capability within 738 | R. If that is the case supplying tcltk and tcl/tk later won't help. You 739 | must use a build of R that has tcltk capability built in. (If the R was 740 | built with tcltk capability then adding the tcltk package (if it's 741 | missing) and tcl/tk will work.) 742 | 743 | 6. 
Why are there problems when we use table names or column names that are the same except for case?[](#6._Why_are_there_problems_when_we_use_table_names_or_column_name) 744 | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 745 | 746 | SQL is case insensitive so table names `a` and `A` are the same as far 747 | as SQLite is concerned. Note that in the example below it did produce a 748 | warning that something is wrong although that might not be the case in 749 | all situations. 750 | 751 | ```r 752 | > a <- data.frame(x = 1:2) 753 | > A <- data.frame(y = 11:12) 754 | > sqldf("select * from a a1, A a2") 755 | x x 756 | 1 1 1 757 | 2 1 1 758 | 3 2 2 759 | 4 2 2 760 | Warning message: 761 | In value[[3L]](cond) : 762 | RS-DBI driver: (error in statement: table `A` already exists) 763 | ``` 764 | 765 | 7. Why are there messages about MySQL?[](#7._Why_are_there_messages_about_MySQL?) 766 | --------------------------------------------------------------------------------- 767 | 768 | sqldf can use several different databases. The database is specified in 769 | the `drv=` argument to the `sqldf` function. If `drv=` is not specified 770 | then it uses the value of the `"sqldf.driver"` global option to 771 | determine which database to use. If that is not specified either then if 772 | the RPostgreSQL, RMySQL or RH2 package is loaded (it checks in that 773 | order) it uses the associated database and otherwise uses SQLite. Thus 774 | if you do not specify the database and you have one of those packages 775 | loaded it will think you intended to use that database. If it's likely 776 | that you will have one of these packages loaded but you do not want to 777 | use that package with sqldf be sure to set the sqldf.driver option, e.g. 778 | `options(sqldf.driver = "SQLite")` . 779 | 780 | 8. 
Why am I having problems with update?[](#8._Why_am_I_having_problems_with_update?) 781 | ------------------------------------------------------------------------------------- 782 | 783 | Although data frames referenced in the SQL statement(s) passed to sqldf 784 | are automatically imported to SQLite, sqldf does not automatically 785 | export anything for safety reasons. Thus if you update a table using 786 | sqldf you must explicitly return it as shown in the examples below. 787 | 788 | Note that in the select statement we referred to the table as `main.DF` 789 | (`main` is always the name of the sqlite database.) If we had referred 790 | to the table as `DF` (without qualifying it as being in `main`) sqldf 791 | would have fetched `DF` from our R workspace rather than using the 792 | updated one in the sqlite database. 793 | 794 | ```r 795 | > DF <- data.frame(a = 1:3, b = c(3, NA, 5)) 796 | > sqldf(c("update DF set b = a where b is null", "select * from main.DF")) 797 | a b 798 | 1 1 3 799 | 2 2 2 800 | 3 3 5 801 | ``` 802 | 803 | One other problem can arise if the data has factors. Here we would 804 | normally get the wrong result because we are asking it to add a value to 805 | column `b` that is not among the factor levels in `b` but by using 806 | `method = "raw"` we can tell it not to automatically assign classes to 807 | the result. 808 | 809 | ```r 810 | > DF <- data.frame(a = 1:3, b = factor(c(3, NA, 5))); DF 811 | a b 812 | 1 1 3 813 | 2 2 814 | 3 3 5 815 | > sqldf(c("update DF set b = a where b is null", "select * from main.DF"), method = "raw") 816 | a b 817 | 1 1 3 818 | 2 2 2 819 | 3 3 5 820 | ``` 821 | 822 | Another way around this is to avoid the entire problem in the first 823 | place by not using a factor for `b`. If we had defined column `b` as 824 | character or numeric instead of factor then we would not have had to 825 | specify `method = "raw"`. 826 | 827 | 9. How do I examine the layout that SQLite uses for a table? 
which tables are in the database? which databases are attached?[](#9._How_do_I_examine_the_layout_that_SQLite_uses_for_a_table?_whi) 828 | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 829 | 830 | Try these approaches to get the indicated meta data: 831 | 832 | ```r 833 | > # a. what is the layout of the BOD table? 834 | > sqldf("pragma table_info(BOD)") 835 | cid name type notnull dflt_value pk 836 | 1 0 Time REAL 0 0 837 | 2 1 demand REAL 0 0 838 | 839 | > # b. which tables are in current database and what is their layout? 840 | > sqldf(c("select * from BOD", "select * from sqlite_master")) 841 | type name tbl_name rootpage 842 | 1 table BOD BOD 2 843 | sql 844 | 1 CREATE TABLE `BOD` \n( "Time" REAL,\n\tdemand REAL \n) 845 | 846 | > # c. which databases are attached? (This says only 'main' is attached.) 847 | > sqldf("pragma database_list") 848 | seq name file 849 | 1 0 main 850 | 851 | > # d. which version of sqlite is being used? 852 | > sqldf("select sqlite_version()") 853 | sqlite_version() 854 | 1 3.7.17 855 | ``` 856 | 857 | 10. What are some of the differences between using SQLite and H2 with sqldf?[](#10.__What_are_some_of_the_differences_between_using_SQLite_and_H) 858 | ------------------------------------------------------------------------------------------------------------------------------------------------- 859 | 860 | sqldf will use the H2 database instead of sqlite if the 861 | [RH2](https://cran.r-project.org/package=RH2/) package is loaded. 862 | Features supported by H2 not supported by SQLite include Date class 863 | columns and certain 864 | [functions](https://www.h2database.com/html/functions.html) such as 865 | VAR\_SAMP, VAR\_POP, STDDEV\_SAMP, STDDEV\_POP, various XML functions 866 | and CSVREAD. 
867 | 868 | **Note that the examples below require RH2 0.1-2.6 or later.** 869 | 870 | Here are some commands. The meta commands here are specific to H2 (for 871 | SQLite's meta data commands see 872 | [FAQ\#9](#9._How_do_I_examine_the_layout_that_SQLite_uses_for_a_table?_whi)): 873 | 874 | ```r 875 | library(RH2) # this package contains the H2 database and an R driver 876 | library(sqldf) 877 | sqldf("select avg(demand) mean, stddev_pop(demand) from BOD where Time > 4") 878 | sqldf('select Species, "Sepal.Length" from iris limit 3') # Sepal.Length has dot 879 | sqldf("show databases") 880 | sqldf("show tables") 881 | sqldf("show tables from INFORMATION_SCHEMA") 882 | sqldf("select * from INFORMATION_SCHEMA.settings") 883 | sqldf("select * FROM INFORMATION_SCHEMA.indexes") 884 | sqldf("select VALUE from INFORMATION_SCHEMA.SETTINGS where NAME = 'info.VERSION'") 885 | sqldf("show columns from BOD") 886 | sqldf("select H2VERSION()") # this requires a later version of H2 than comes with RH2 887 | ``` 888 | 889 | If RH2 is loaded then it will use H2 so if you wish to use SQLite 890 | anyways then either use the drv= argument to sqldf: 891 | 892 | ```r 893 | sqldf("select * from BOD", drv = "SQLite") 894 | ``` 895 | 896 | or set the following global option: 897 | 898 | ```r 899 | options(sqldf.driver = "SQLite") 900 | ``` 901 | 902 | When using H2: 903 | 904 | - in H2 a column such as Sepal.Length is not converted to 905 | Sepal\_Length (which older versions of RSQLite do) but remains as 906 | Sepal.Length. For example, 907 | 908 | Also sqlite orders the result above even without the order clause and h2 909 | translates "Sepal Length" to Sepal.Length . 910 | 911 | - quoting rules in H2 are stricter than in SQLite. In H2, to quote an 912 | identifier use double quotes whereas to quote a constant use single 913 | quotes. 914 | 915 | - file objects are not supported. 
They are not really needed because 916 | H2 supports a 917 | [CSVREAD](https://www.h2database.com/html/functions.html#csvread) 918 | function. Note that on Windows one can use the R notation \~ to 919 | refer to the home directory when specifying filenames if using 920 | SQLite but not with CSVREAD in H2. 921 | 922 | - currently the only SQL statements supported by sqldf when using H2 923 | are select, show and call (whereas all are supported with SQLite). 924 | 925 | - H2 does not support the using clause in SQL select statements but 926 | does support on. Also it implicitly uses `on` rather than `using` in 927 | natural joins which means that selected and where condition 928 | variables that are merged in natural joins must be qualified in H2 929 | but need not be in SQLite. 930 | 931 | The examples in the Examples section are redone below using H2. Where H2 932 | does not support the operation the SQLite code is given instead. Note 933 | that this section is a bit out of date and some of the items that it 934 | says are not supported actually are supported now. 935 | 936 | ```r 937 | # 1 938 | sqldf('select * from iris order by "Sepal.Length" desc limit 3') 939 | 940 | # 2 941 | sqldf('select Species, avg("Sepal.Length") from iris group by Species') 942 | 943 | # 3 944 | sqldf('select iris.Species "[Species]", 945 | avg("Sepal.Length") "[Avg of SLs > avg SL]" 946 | from iris, 947 | (select Species, avg("Sepal.Length") SLavg 948 | from iris group by Species) SLavg 949 | where iris.Species = SLavg.Species 950 | and "Sepal.Length" > SLavg 951 | group by iris.Species') 952 | 953 | # 4 954 | Abbr <- data.frame(Species = levels(iris$Species), 955 | Abbr = c("S", "Ve", "Vi")) 956 | 957 | # 4a. This works: 958 | sqldf('select iris.Species, count(*) 959 | from iris natural join Abbr group by iris.Species') 960 | 961 | # but this does not work (but does in sqlite) ### 962 | sqldf('select Abbr, count(*) 963 | from iris natural join Abbr group by Species') 964 | 965 | # 4b. 
H2 does not support using but does support on (but query is longer) ### 966 | sqldf('select Abbr, count(*) 967 | from iris join Abbr on iris.Species = Abbr.Species group by iris.Species') 968 | 969 | # 4c. 970 | sqldf('select Abbr, avg("Sepal.Length") from iris, Abbr 971 | where iris.Species = Abbr.Species group by iris.Species') 972 | 973 | # 4d. # This still needs to be fixed. # 974 | out <- sqldf("select s.Species, s.dt, t.Station_id, t.Value 975 | from species s, temp t 976 | where ABS(s.dt - t.dt) = 977 | (select min(abs(s2.dt - t2.dt)) 978 | from species s2, temp t2 979 | where s.Species = s2.Species and t.Station_id = t2.Station_id)") 980 | 981 | # 4e. H2 does not support using but we can use on (but query is longer) ### 982 | # Also the missing value in x seems to get filled with 0 rather than NA ### 983 | SNP1x <- structure(list(Animal = c(194073197L, 194073197L, 194073197L, 984 | 194073197L, 194073197L), 985 | Marker = structure(1:5, 986 | .Label = c("P1001", "P1002", "P1004", "P1005", "P1006", "P1007"), 987 | class = "factor"), 988 | x = c(2L, 1L, 2L, 0L, 2L)), 989 | .Names = c("Animal", "Marker", "x"), 990 | row.names = c("3213", "1295", "915", "2833", "1487"), class = "data.frame") 991 | SNP4 <- structure(list(Animal = c(194073197L, 194073197L, 194073197L, 992 | 194073197L, 194073197L, 194073197L), 993 | Marker = structure(1:6, .Label = c("P1001", 994 | "P1002", "P1004", "P1005", "P1006", "P1007"), class = "factor"), 995 | Y = c(0.021088, 0.021088, 0.021088, 0.021088, 0.021088, 0.021088)), 996 | .Names = c("Animal", "Marker", "Y"), class = "data.frame", 997 | row.names = c("3213", "1295", "915", "2833", "1487", "1885")) 998 | 999 | sqldf("select SNP4.Animal, SNP4.Marker, Y, x 1000 | from SNP4 left join SNP1x 1001 | on SNP4.Animal = SNP1x.Animal and SNP4.Marker = SNP1x.Marker") 1002 | 1003 | # 4f. This still needs to be fixed. 
# 1004 | 1005 | DF <- structure(list(tt = c(3, 6)), .Names = "tt", row.names = c(NA, 1006 | -2L), class = "data.frame") 1007 | DF2 <- structure(list(tt = c(1, 2, 3, 4, 5, 7), d = c(8.3, 10.3, 19, 1008 | 16, 15.6, 19.8)), .Names = c("tt", "d"), row.names = c(NA, -6L 1009 | ), class = "data.frame", reference = "A1.4, p. 270") 1010 | out <- sqldf("select * from DF d, DF2 a, DF2 b 1011 | where a.row_names = b.row_names - 1 and d.tt > a.tt and d.tt <= b.tt", 1012 | row.names = TRUE) 1013 | 1014 | # 5 1015 | minSL <- 7 1016 | limit <- 3 1017 | fn$sqldf('select * from iris where "Sepal.Length" > $minSL limit $limit') 1018 | 1019 | # 6a. Species get converted to upper case ### 1020 | 1021 | # alternative 1 1022 | write.table(head(iris, 3), "iris3.dat", sep = ",", quote = FALSE, row.names = FALSE) 1023 | 1024 | # convert factor to numeric 1025 | fac2num <- function(x) UseMethod("fac2num") 1026 | fac2num.factor <- function(x) as.numeric(as.character(x)) 1027 | fac2num.data.frame <- function(x) replace(x, TRUE, lapply(x, fac2num)) 1028 | fac2num.default <- identity 1029 | 1030 | sqldf("select * from csvread('iris3.dat')", method = function(x) 1031 | data.frame(fac2num(x[-5]), x[5])) 1032 | 1033 | # alternative 2 (H2 seems to get confused regarding case of Species) 1034 | sqldf('select 1035 | cast("Sepal.Length" as real) "Sepal.Length", 1036 | cast("Sepal.Width" as real) "Sepal.Width", 1037 | cast("Petal.Length" as real) "Petal.Length", 1038 | cast("Petal.Width" as real) "Petal.Width", 1039 | SPECIES from csvread(\'iris3.dat\')') 1040 | 1041 | # alternative 3. 1st line sets up 0 row table, iris0, with correct classes & 2nd line 1042 | # inserts the data from iris3.dat into it and then selects it back. 1043 | 1044 | iris0 <- read.csv("iris3.dat", nrows = 1)[0L, ] 1045 | sqldf(c("insert into iris0 (select * from csvread('iris3.dat'))", 1046 | "select * from iris0")) 1047 | 1048 | # 6b. 
1049 | sqldf("select * from csvread('iris3.dat')", dbname = tempfile(), method = function(x) 1050 | data.frame(fac2num(x[-5]), x[5])) 1051 | 1052 | # 6c. Same answer as in 6a works whether or not there are row names 1053 | 1054 | # 6d. NA 1055 | 1056 | # 6e. 1057 | 1058 | # 6f. 1059 | cat("1 8.3 1060 | 210.3 1061 | 1062 | 319.0 1063 | 416.0 1064 | 515.6 1065 | 719.8 1066 | ", file = "fixed") 1067 | sqldf("select substr(V1, 1, 1) f1, substr(V1, 2, 4) f2 1068 | from csvread('fixed', 'V1') limit 3") 1069 | 1070 | # 6g. NA 1071 | 1072 | # 7a 1073 | 1074 | # this is sqlite (how do you work with rowid's in H2?) ### 1075 | sqldf('select * from iris i 1076 | where rowid in 1077 | (select rowid from iris where Species = i.Species order by "Sepal.Length" desc limit 2) 1078 | order by i.Species, i."Sepal.Length" desc') 1079 | 1080 | 1081 | # 7b - same question ### 1082 | 1083 | library(chron) 1084 | DF <- data.frame(x = 101:200, tt = as.Date("2000-01-01") + seq(0, len = 100, by = 2)) 1085 | DF <- cbind(DF, month.day.year(unclass(DF$tt))) 1086 | 1087 | # sqlite: 1088 | sqldf("select * from DF d 1089 | where rowid in 1090 | (select rowid from DF 1091 | where year = d.year and month = d.month and day >= 21 limit 1) 1092 | order by tt") 1093 | 1094 | # 7c. 1095 | a <- read.table(textConnection("st en 1096 | 1 4 1097 | 11 14 1098 | 3 4"), header = TRUE) 1099 | 1100 | b <- read.table(textConnection("st en 1101 | 2 5 1102 | 3 6 1103 | 30 44"), TRUE) 1104 | 1105 | sqldf("select * from a where 1106 | (select count(*) from b where a.en >= b.st and b.en >= a.st) > 0") 1107 | 1108 | 1109 | # 8. In H2 one uses csvread rather than file and file.format. 
See: 1110 | # https://www.h2database.com/html/functions.html#csvread 1111 | 1112 | numStr <- as.character(1:100) 1113 | DF <- data.frame(a = c(numStr, "Hello")) 1114 | write.table(DF, file = "tmp99.csv", quote = FALSE, sep = ",") 1115 | sqldf("select * from csvread('tmp99.csv') limit 5") 1116 | 1117 | # Note that ~ does not work on Windows in H2: ### 1118 | # sqldf("select * from csvread('~/tmp.csv')") 1119 | 1120 | 1121 | # 9 - RH2 does not support. Only select statements currently. ### 1122 | 1123 | # create new empty database called mydb 1124 | sqldf("attach 'mydb' as new") 1125 | 1126 | # create a new table, mytab, in the new database 1127 | # Note that sqldf does not delete tables created from create. 1128 | sqldf("create table mytab as select * from BOD", dbname = "mydb") 1129 | 1130 | # shows its still there 1131 | sqldf("select * from mytab", dbname = "mydb") 1132 | 1133 | # 10 - RH2 does not support sqldf() ### 1134 | 1135 | sqldf() 1136 | # uses connection just created 1137 | sqldf('select * from iris3 where "Sepal.Width" > 3') 1138 | sqldf('select * from main.iris3 where "Sepal.Width" = 3') 1139 | sqldf() 1140 | 1141 | > # Example 10b. 1142 | > # 1143 | > # Here is another way to do example 10a. We use the same iris3, 1144 | > # iris3.dat and sqldf development version as above. 1145 | > # We grab connection explicitly, set up the database using sqldf and then 1146 | > # for the second call we call dbGetQuery from RSQLite. 1147 | > # In that case we don't need to qualify iris3 as main.iris3 since 1148 | > # RSQLite would not understand R variables anyways so there is no 1149 | > # ambiguity. 
1150 | 1151 | > con <- sqldf() 1152 | > 1153 | > # uses connection just created 1154 | > sqldf('select * from iris3 where "Sepal.Width" > 3') 1155 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1156 | 1 5.1 3.5 1.4 0.2 setosa 1157 | 2 4.7 3.2 1.3 0.2 setosa 1158 | > dbGetQuery(con, 'select * from iris3 where "Sepal.Width" = 3') 1159 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1160 | 1 4.9 3 1.4 0.2 setosa 1161 | > 1162 | > # close 1163 | > sqldf() 1164 | 1165 | 1166 | # 11. Between - these work same as sqlite 1167 | 1168 | seqdf <- data.frame(thetime=seq(100,225,5),thevalue=factor(letters)) 1169 | boundsdf <- data.frame(thestart=c(110,160,200),theend=c(130,180,220),groupID=c(555,666,777)) 1170 | 1171 | # run the query using two inequalities 1172 | testquery_1 <- sqldf("select seqdf.thetime, seqdf.thevalue, boundsdf.groupID 1173 | from seqdf left join boundsdf on (seqdf.thetime <= boundsdf.theend) and (seqdf.thetime >= boundsdf.thestart)") 1174 | 1175 | # run the same query using 'between...and' clause 1176 | testquery_2 <- sqldf("select seqdf.thetime, seqdf.thevalue, boundsdf.groupID 1177 | from seqdf LEFT JOIN boundsdf ON (seqdf.thetime BETWEEN boundsdf.thestart AND boundsdf.theend)") 1178 | 1179 | # 12 combine two files - not supported by RH2 ### 1180 | 1181 | # 13 see #8 1182 | ``` 1183 | 1184 | 11. Why am I having difficulty reading a data file using SQLite and sqldf?[](#11._Why_am_I_having_difficulty_reading_a_data_file_using_SQLite) 1185 | ---------------------------------------------------------------------------------------------------------------------------------------------- 1186 | 1187 | SQLite is fussy about line endings. Note the `eol` argument to 1188 | `read.csv.sql` can be used to specify line endings if they are different 1189 | than the normal line endings on your platform. e.g. 
1190 | 1191 | ```r 1192 | read.csv.sql("myfile.dat", eol = "\n") 1193 | ``` 1194 | 1195 | `eol` can also be used as a component to the sqldf `file.format` 1196 | argument. 1197 | 1198 | 12. How does one use sqldf with PostgreSQL?[](#12._How_does_one_use_sqldf_with_PostgreSQL?) 1199 | ------------------------------------------------------------------------------------------- 1200 | 1201 | Install 1. PostgreSQL, 2. RPostgreSQL R package 3. sqldf itself. 1202 | RPostgreSQL and sqldf are ordinary R package installs. 1203 | 1204 | Make sure that you have created an empty database, e.g. `"test"`. The 1205 | createdb program that comes with PostgreSQL can be used for that. e.g. 1206 | from the console/shell create a database called test like this: 1207 | 1208 | ```r 1209 | createdb --help 1210 | createdb --username=postgres test 1211 | ``` 1212 | 1213 | Here is an example using RPostgreSQL and after that we show an example 1214 | using RpgSQL. The `options` statement shown below can be entered directy 1215 | or alternately can be put in your `.Rprofile.` The values shown here are 1216 | actually the defaults: 1217 | 1218 | ```r 1219 | options(sqldf.RPostgreSQL.user = "postgres", 1220 | sqldf.RPostgreSQL.password = "postgres", 1221 | sqldf.RPostgreSQL.dbname = "test", 1222 | sqldf.RPostgreSQL.host = "localhost", 1223 | sqldf.RPostgreSQL.port = 5432) 1224 | 1225 | Lines <- "Group_A Group_B Group_C Value 1226 | A1 B1 C1 10 1227 | A1 B1 C2 20 1228 | A1 B1 C3 30 1229 | A1 B2 C1 40 1230 | A1 B2 C2 10 1231 | A1 B2 C3 5 1232 | A1 B2 C4 30 1233 | A2 B1 C1 40 1234 | A2 B1 C2 5 1235 | A2 B1 C3 2 1236 | A2 B2 C1 26 1237 | A2 B2 C2 1 1238 | A2 B3 C1 23 1239 | A2 B3 C2 15 1240 | A2 B3 C3 12 1241 | A3 B3 C4 23 1242 | A3 B3 C5 23" 1243 | 1244 | DF <- read.table(textConnection(Lines), header = TRUE, as.is = TRUE) 1245 | 1246 | library(RPostgreSQL) 1247 | library(sqldf) 1248 | # upper case is folded to lower case by default so surround DF with double quotes 1249 | sqldf('select count(*) 
from "DF" ') 1250 | 1251 | sqldf('select *, rank() over (partition by "Group_A", "Group_B" order by "Value") 1252 | from "DF" 1253 | order by "Group_A", "Group_B", "Group_C" ') 1254 | ``` 1255 | 1256 | For another example using `over` and `partition by` see: [this cumsum 1257 | example](https://stackoverflow.com/questions/8559485/r-cumulative-sum-by-group-in-sqldf/8561324#8561324) 1258 | 1259 | Also note that `log` and `log10` in R correspond to `ln` and `log`, 1260 | respectively, in PostgreSQL. 1261 | 1262 | 13. How does one deal with quoted fields in `read.csv.sql`?[](#13._How_does_one_deal_with_quoted_fields_in_read.csv.sql_?) 1263 | -------------------------------------------------------------------------------------------------------------------------- 1264 | 1265 | `read.csv.sql` provides an interface to sqlite's csv reader. That reader 1266 | is not very flexible (but is fast) and, in particular, it does not 1267 | understand quoted fields but rather regards the quotes as part of the 1268 | field itself. To read a file using `read.csv.sql` and remove all double 1269 | quotes from it at the same time on Windows try this assuming you have 1270 | Rtools installed and on your path (or the corresponding `tr` syntax on 1271 | UNIX depending on your shell): 1272 | 1273 | ```r 1274 | read.csv.sql("myfile.csv", filter = 'tr.exe -d \'^"\' ' ) 1275 | ``` 1276 | 1277 | or using `gawk`: 1278 | 1279 | ```r 1280 | # TODO: fix this example 1281 | read.csv.sql("myfile.csv", filter = list('gawk -f prog', prog = '{ gsub(/"/, ""); print }') ) 1282 | ``` 1283 | 1284 | Another program to look at is the 1285 | [csvfix](https://code.google.com/p/csvfix/) program (this is a free 1286 | external program -- not an R program). 
For example, the above could be done 1287 | like this with csvfix: 1288 | 1289 | ```r 1290 | read.csv.sql("myfile.csv", filter = 'csvfix echo -smq' ) 1291 | ``` 1292 | 1293 | As another csvfix example, suppose we have commas in two contexts: (1) as 1294 | separators between fields and within double quoted fields. To handle that case 1295 | we can use `csvfix` to translate the separators to semicolon stripping off the 1296 | double quotes at the same time (assuming we have installed `csvfix` and we have 1297 | put it in our path): 1298 | 1299 | ```r 1300 | read.csv.sql("myfile.csv", sep = ";", filter = "csvfix write_dsv -s ;")` . 1301 | ``` 1302 | 1303 | 14. How does one read files where numeric NAs are represented as missing empty fields?[](#14._How_does_one_read_files_where_numeric_NAs_are_represented_as) 1304 | ----------------------------------------------------------------------------------------------------------------------------------------------------------- 1305 | 1306 | Translate the empty fields to some number that will represent NA and 1307 | then fix it up on the R end. 1308 | 1309 | ```r 1310 | # The problem is that SQLite's read routine regards empty 1311 | # fields as zero length character strings rather than NA. 1312 | # We handle that by replacing such strings with -999, say, 1313 | # using gawk and the read.csv.sql filter argument and then 1314 | # fixing it up in R later. 1315 | 1316 | 1317 | # write out test data 1318 | 1319 | cat("a\tb\tc 1320 | aa\t\t23 1321 | aaa\t34.6\t 1322 | aaaa\t\t77.8", file = "x.txt") 1323 | 1324 | # create single line awk program to insert -999 as NA 1325 | 1326 | cat('{ gsub("\t\t", "\t-999\t"); gsub("\t$", "\t-999"); print}', 1327 | file = "x.awk") 1328 | 1329 | # on Windows gawk uses \n as eol even though most 1330 | # other programs use \r\n so we need to specify that. 1331 | # eol= may or may not be needed here on other platforms. 
1332 | 1333 | library(sqldf) 1334 | DF <- read.csv.sql("x.txt", sep = "\t", eol = "\n", filter = "gawk -f x.awk") 1335 | 1336 | # replace -999's with NA 1337 | 1338 | is.na(DF) <- DF == -999 1339 | ``` 1340 | 1341 | Another program that can be used in filters is the free csvfix . For 1342 | example, suppose that csvfix is on our path and that NA values are 1343 | represented as NA in numeric fields. We would like to convert them to 1344 | -999 and then later remove them. 1345 | 1346 | ```r 1347 | Lines <- "a,b 1348 | 3,NA 1349 | 4,65" 1350 | cat(Lines, file = "myfile.csv") 1351 | 1352 | filter <- 'csvfix map -fv ,NA -tv ,-999 myfile.csv | csvfix write_dsv -s ,' 1353 | DF <- read.csv.sql(filter = filter) 1354 | is.na(DF) <- DF == -999 1355 | ``` 1356 | 1357 | Another way in which the input file can be malformed is that not every 1358 | line has the same number of fields. In that case `csvfix pad -n` can be 1359 | used to pad it out as in this example: 1360 | 1361 | ```r 1362 | Lines <- "a,b,c 1363 | a,b, 1364 | a,b 1365 | q,r,t" 1366 | cat(Lines, file = "c.csv") 1367 | DF <- read.csv.sql(filter = "csvfix pad -n 3 c.csv | csvfix write_dsv -s ,") 1368 | ``` 1369 | 1370 | 15. Why do certain calculations come out as integer rather than double?[](#15._Why_do_certain_calculations_come_out_as_integer_rather_than) 1371 | ------------------------------------------------------------------------------------------------------------------------------------------- 1372 | 1373 | SQLite/RSQLite, h2/RH2, PostgreSQL all perform integer division on 1374 | integers; however, RMySQL/MySQL performs real division. 1375 | 1376 | ```r 1377 | > DF <- data.frame(a = 1:2, b = 2:1) 1378 | > str(DF) # columns are integer 1379 | 'data.frame': 2 obs. 
of 2 variables: 1380 | $ a: int 1 2 1381 | $ b: int 2 1 1382 | > # 1383 | > # using sqlite - integer division 1384 | > sqldf("select a/b as quotient from DF") 1385 | quotient 1386 | 1 0 1387 | 2 2 1388 | > # force real division 1389 | > sqldf("select (a+0.0)/b as quotient from DF") 1390 | quotient 1391 | 1 0.5 1392 | 2 2.0 1393 | > # force real division 1394 | > sqldf("select cast(a as real)/b as quotient from DF") 1395 | quotient 1396 | 1 0.5 1397 | 2 2.0 1398 | > # insert into table with real columns 1399 | > sqldf(c("create table mytab(a real, b real)", 1400 | + "insert into mytab select * from DF", 1401 | + "select a/b as quotient from mytab")) 1402 | quotient 1403 | 1 0.5 1404 | 2 2.0 1405 | > 1406 | > # convert all columns to numeric using method= argument 1407 | > # Requires sqldf 0.4-0 or later 1408 | > 1409 | > tonum <- function(DF) replace(DF, TRUE, lapply(DF, as.numeric)) 1410 | > sqldf("select a/b as quotient from DF", method = list("auto", tonum)) 1411 | quotient 1412 | 1 0.5 1413 | 2 2.0 1414 | > 1415 | > # use RMySQL - uses real division 1416 | > # Requires sqldf 0.4-0 or later 1417 | > library(RMySQL) 1418 | > sqldf("select a/b as quotient from DF") 1419 | quotient 1420 | 1 0.5 1421 | 2 2.0 1422 | ``` 1423 | 1424 | 16. How can one read a file off the net or a csv file in a zip file?[](#16._How_can_one_read_a_file_off_the_net_or_a_csv_file_in_a_zip_f) 1425 | ----------------------------------------------------------------------------------------------------------------------------------------- 1426 | 1427 | Use `read.csv.sql` and specify the URL of the file: 1428 | 1429 | ```r 1430 | # 1 1431 | URL <- "https://www.wnba.com/liberty/media/NYL2011ScheduleV3.csv" 1432 | DF <- read.csv.sql(URL, eol = "\r") 1433 | ``` 1434 | 1435 | Since files off the net could have any end of line be careful to specify 1436 | it properly for the file of interest. 1437 | 1438 | As an alternative one could use the filter argument. 
To use this `wget` 1439 | ([download](http://wget.addictivecode.org/FrequentlyAskedQuestions?action=show&redirect=Faq#download), 1440 | [Windows](http://gnuwin32.sourceforge.net/packages/wget.htm)) must be 1441 | present on the system command path. 1442 | 1443 | ```r 1444 | # 2 - same URL as above 1445 | DF <- read.csv.sql(eol = "\r", filter = paste("wget -O - ", URL)) 1446 | ``` 1447 | 1448 | Here is an example of reading a zip file which contains a single file 1449 | that is a `csv` : 1450 | 1451 | ```r 1452 | DF <- read.csv.sql(filter = "7z x -so anscombe.zip 2>NUL") 1453 | ``` 1454 | 1455 | In the line of code above it is assumed that `7z` 1456 | ([download](http://www.7-zip.org/download.html)) is present and on the 1457 | system command path. The example is for Windows. On UNIX use `/dev/null` 1458 | in place of `NUL`. 1459 | 1460 | If we had a `.tar.gz` file it could be done like this: 1461 | 1462 | ```r 1463 | DF <- read.csv.sql(filter = "tar xOfz anscombe.tar.gz") 1464 | ``` 1465 | 1466 | assuming that tar is available on our path. (Normally tar is available 1467 | on Linux and on Windows its available as part of the 1468 | [Rtools](https://cran.r-project.org/bin/windows/Rtools/) distribution on 1469 | CRAN.) 1470 | 1471 | Note that `filter` causes the filtered output to be stored in a 1472 | temporary file and then read into sqlite. It does not actually read the 1473 | data directly from the net into sqlite or directly from the zip or 1474 | tar.gz file to sqlite. 1475 | 1476 | *Note:* The examples in this section assume sqldf 0.4-4 or later. 1477 | 1478 | Examples[](#Examples) 1479 | ===================== 1480 | 1481 | These examples illustrate usage of both sqldf and SQLite. For sqldf with 1482 | H2 see [FAQ 1483 | \#10](https://code.google.com/p/sqldf/#10.__What_are_some_of_the_differences_between_using_SQLite_and_H). 1484 | For PostgreSQL see 1485 | [FAQ\#12](https://code.google.com/p/sqldf/#12._How_does_one_use_sqldf_with_PostgreSQL?). 
1486 | Also the `"sqldf-unitTests"` demo that comes with sqldf works under 1487 | sqldf with SQLite, H2, PostgreSQL and MySQL. David L. Reiner has created 1488 | some further examples 1489 | [here](https://files.meetup.com/1625815/crug_sqldf_05-01-2013.pdf) and 1490 | Paul Shannon has examples 1491 | [here](https://brusers.tumblr.com/post/59706993506/data-manipulation-with-sqldf-paul). 1492 | 1493 | Example 1. Ordering and Limiting[](#Example_1._Ordering_and_Limiting) 1494 | --------------------------------------------------------------------- 1495 | 1496 | Here is an example of sorting and limiting output from an SQL select 1497 | statement on the iris data frame that comes with R. Note that although 1498 | the iris dataset uses the name `Sepal.Length` older versions of the 1499 | RSQLite driver convert that to `Sepal_Length`; however, newer versions 1500 | do not. After installing sqldf in R, just type the first two lines into 1501 | the R console (without the \>): 1502 | 1503 | ```r 1504 | > library(sqldf) 1505 | > sqldf('select * from iris order by "Sepal.Length" desc limit 3') 1506 | 1507 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1508 | 1 7.9 3.8 6.4 2.0 virginica 1509 | 2 7.7 3.8 6.7 2.2 virginica 1510 | 3 7.7 2.6 6.9 2.3 virginica 1511 | ``` 1512 | 1513 | Example 2. Averaging and Grouping[](#Example_2._Averaging_and_Grouping) 1514 | ----------------------------------------------------------------------- 1515 | 1516 | Here is an example which processes an SQL select statement whose 1517 | functionality is similar to the R aggregate function. 1518 | 1519 | ```r 1520 | > sqldf('select Species, avg("Sepal.Length") from iris group by Species') 1521 | 1522 | Species avg(Sepal.Length) 1523 | 1 setosa 5.006 1524 | 2 versicolor 5.936 1525 | 3 virginica 6.588 1526 | ``` 1527 | 1528 | Example 3. Nested Select[](#Example_3._Nested_Select) 1529 | ----------------------------------------------------- 1530 | 1531 | Here is a more complex example.
For each Species, find the average Sepal 1532 | Length among those rows where Sepal Length exceeds the average Sepal 1533 | Length for that Species. Note the use of a subquery and explicit column 1534 | naming: 1535 | 1536 | ```r 1537 | > sqldf("select iris.Species '[Species]', 1538 | + avg(\"Sepal.Length\") '[Avg of SLs > avg SL]' 1539 | + from iris, 1540 | + (select Species, avg(\"Sepal.Length\") SLavg 1541 | + from iris group by Species) SLavg 1542 | + where iris.Species = SLavg.Species 1543 | + and \"Sepal.Length\" > SLavg 1544 | + group by iris.Species") 1545 | 1546 | [Species] [Avg of SLs > avg SL] 1547 | 1 setosa 5.313636 1548 | 2 versicolor 6.375000 1549 | 3 virginica 7.159091 1550 | 1551 | > # same - using only core R - based on discussion with Dennis Toddenroth 1552 | > aggregate(Sepal.Length ~ Species, iris, function(x) mean(x[x > mean(x)])) 1553 | Species Sepal.Length 1554 | 1 setosa 5.313636 1555 | 2 versicolor 6.375000 1556 | 3 virginica 7.159091 1557 | ``` 1558 | 1559 | Note that PostgreSQL is the only free database that supports 1560 | [window](https://developer.postgresql.org/pgdocs/postgres/tutorial-window.html) 1561 | [functions](https://developer.postgresql.org/pgdocs/postgres/functions-window.html) 1562 | (similar to `ave` function in R) which would allow a different 1563 | formulation of the above. For more on using sqldf with PostgreSQL see 1564 | [FAQ 1565 | \#12](https://code.google.com/p/sqldf/#12._How_does_one_use_sqldf_with_PostgreSQL?) 
1566 | 1567 | ```r 1568 | > library(RPostgreSQL) 1569 | > library(sqldf) 1570 | > tmp <- sqldf('select 1571 | + "Species", 1572 | + "Sepal.Length", 1573 | + "Sepal.Length" - avg("Sepal.Length") over (partition by "Species") "above.mean" 1574 | + from iris') 1575 | > sqldf('select "Species", avg("Sepal.Length") 1576 | + from tmp 1577 | + where "above.mean" > 0 1578 | + group by "Species"') 1579 | Species avg 1580 | 1 setosa 5.313636 1581 | 2 virginica 7.159091 1582 | 3 versicolor 6.375000 1583 | > 1584 | > # or, alternately, we could perform the above two steps in a single statement: 1585 | > 1586 | > sqldf(' 1587 | + select "Species", avg("Sepal.Length") 1588 | + from 1589 | + (select "Species", 1590 | + "Sepal.Length", 1591 | + "Sepal.Length" - avg("Sepal.Length") over (partition by "Species") "above.mean" 1592 | + from iris) a 1593 | + where "above.mean" > 0 1594 | + group by "Species"') 1595 | Species avg 1596 | 1 setosa 5.313636 1597 | 2 versicolor 6.375000 1598 | 3 virginica 7.159091 1599 | ``` 1600 | 1601 | which in R corresponds to this R code (i.e. `partition...over` in 1602 | PostgreSQL corresponds to `ave` in R): 1603 | 1604 | ```r 1605 | > tmp <- with(iris, Sepal.Length - ave(Sepal.Length, iris, FUN = mean)) 1606 | > aggregate(Sepal.Length ~ Species, subset(tmp, above.mean > 0), mean) 1607 | Species Sepal.Length 1608 | 1 setosa 5.313636 1609 | 2 versicolor 6.375000 1610 | 3 virginica 7.159091 1611 | ``` 1612 | 1613 | Here is some sample data with the correlated subquery from this 1614 | [Wikipedia page](https://en.wikipedia.org/wiki/Correlated_subquery): 1615 | 1616 | ```r 1617 | Emp <- data.frame(emp = letters[1:24], salary = 1:24, dept = rep(c("A", "B", "C"), each = 8)) 1618 | 1619 | sqldf("SELECT * 1620 | FROM Emp AS e1 1621 | WHERE salary > (SELECT avg(salary) 1622 | FROM Emp 1623 | WHERE dept = e1.dept)") 1624 | ``` 1625 | 1626 | Example 4. 
Join[](#Example_4._Join) 1627 | ----------------------------------- 1628 | 1629 | The different type of joins are pictured in this image: 1630 | i.imgur.com/1m55Wqo.jpg. (SQLite does not support right joins but the 1631 | other databases sqldf supports do.) We define a new data frame, `Abbr`, 1632 | join it with `iris` and perform the aggregation: 1633 | 1634 | ```r 1635 | > # Example 4a. 1636 | > Abbr <- data.frame(Species = levels(iris$Species), 1637 | + Abbr = c("S", "Ve", "Vi")) 1638 | > 1639 | > sqldf('select Abbr, avg("Sepal.Length") 1640 | + from iris natural join Abbr group by Species') 1641 | 1642 | Abbr avg(Sepal.Length) 1643 | 1 S 5.006 1644 | 2 Ve 5.936 1645 | 3 Vi 6.588 1646 | ``` 1647 | 1648 | Although the above is probably the shortest way to write it in SQL, 1649 | using `natural join` can be a bit dangerous since one must be very sure 1650 | one knows precisely which column names are common to both tables. For 1651 | example, had we included the `row_names` as a column in both tables (by 1652 | specifying `row.names = TRUE` to sqldf) the natural join would not work 1653 | as intended since the `row_names` columns would participate in the join. 1654 | An alternate and safer way to write this would be with `join` and 1655 | `using`: 1656 | 1657 | ```r 1658 | > # Example 4b. 1659 | > sqldf('select Abbr, avg("Sepal.Length") 1660 | + from iris join Abbr using(Species) group by Species') 1661 | 1662 | Abbr avg(Sepal.Length) 1663 | 1 S 5.006 1664 | 2 Ve 5.936 1665 | 3 Vi 6.588 1666 | ``` 1667 | 1668 | or with a `where` clause: 1669 | 1670 | ```r 1671 | > # Example 4c. 1672 | > sqldf('select Abbr, avg("Sepal.Length") from iris, Abbr 1673 | + where iris.Species = Abbr.Species group by iris.Species') 1674 | 1675 | Abbr avg(Sepal.Length) 1676 | 1 S 5.006 1677 | 2 Ve 5.936 1678 | 3 Vi 6.588 1679 | ``` 1680 | 1681 | or a temporal join where the goal is, for each Species/station\_id pair, 1682 | to join the records with the closest date/times. 
1683 | 1684 | ```r 1685 | > # Example 4d. Temporal Join 1686 | > # see: https://stat.ethz.ch/pipermail/r-help/2009-March/191938.html 1687 | > 1688 | > library(chron) 1689 | > 1690 | > Species.Lines <- "Species,Date_Sampled 1691 | + SpeciesB,2008-06-23 13:55:11 1692 | + SpeciesA,2008-06-23 13:43:11 1693 | + SpeciesC,2008-06-23 13:55:11" 1694 | > 1695 | > species <- read.csv(textConnection(Species.Lines), as.is = TRUE) 1696 | > species$dt <- as.numeric(as.chron(species$Date)) 1697 | > 1698 | > Temp.Lines <- "Station_id,Date,Value 1699 | + ANH,2008-06-23 13:00:00,1.96 1700 | + ANH,2008-06-23 14:00:00,2.25 1701 | + BDT,2008-06-23 13:00:00,4.23 1702 | + BDT,2008-06-23 13:15:00,4.11 1703 | + BDT,2008-06-23 13:30:00,4.01 1704 | + BDT,2008-06-23 13:45:00,3.9 1705 | + BDT,2008-06-23 14:00:00,3.82" 1706 | > 1707 | > temp <- read.csv(textConnection(Temp.Lines), as.is = TRUE) 1708 | > temp$dt <- as.numeric(as.chron(temp$Date)) 1709 | > 1710 | > out <- sqldf("select s.Species, s.dt, t.Station_id, t.Value 1711 | + from species s, temp t 1712 | + where abs(s.dt - t.dt) = 1713 | + (select min(abs(s2.dt - t2.dt)) 1714 | + from species s2, temp t2 1715 | + where s.Species = s2.Species and t.Station_id = t2.Station_id)") 1716 | > out$dt <- chron(out$dt) 1717 | > out 1718 | Species dt Station_id Value 1719 | 1 SpeciesB (06/23/08 13:55:11) ANH 2.25 1720 | 2 SpeciesB (06/23/08 13:55:11) BDT 3.82 1721 | 3 SpeciesA (06/23/08 13:43:11) ANH 2.25 1722 | 4 SpeciesA (06/23/08 13:43:11) BDT 3.90 1723 | 5 SpeciesC (06/23/08 13:55:11) ANH 2.25 1724 | 6 SpeciesC (06/23/08 13:55:11) BDT 3.82 1725 | ``` 1726 | 1727 | A similar but slightly simpler example can be found 1728 | [here](https://stat.ethz.ch/pipermail/r-sig-finance/2010q2/006077.html). 1729 | 1730 | Here is an example of a left join: 1731 | 1732 | ```r 1733 | > # Example 4e. 
Left Join 1734 | > # https://stat.ethz.ch/pipermail/r-help/2009-April/195882.html 1735 | > # 1736 | > SNP1x <- 1737 | + structure(list(Animal = c(194073197L, 194073197L, 194073197L, 1738 | + 194073197L, 194073197L), Marker = structure(1:5, .Label = c("P1001", 1739 | + "P1002", "P1004", "P1005", "P1006", "P1007"), class = "factor"), 1740 | + x = c(2L, 1L, 2L, 0L, 2L)), .Names = c("Animal", "Marker", 1741 | + "x"), row.names = c("3213", "1295", "915", "2833", "1487"), class = "data.frame") 1742 | > 1743 | > SNP4 <- 1744 | + structure(list(Animal = c(194073197L, 194073197L, 194073197L, 1745 | + 194073197L, 194073197L, 194073197L), Marker = structure(1:6, .Label = c("P1001", 1746 | + "P1002", "P1004", "P1005", "P1006", "P1007"), class = "factor"), 1747 | + Y = c(0.021088, 0.021088, 0.021088, 0.021088, 0.021088, 0.021088 1748 | + )), .Names = c("Animal", "Marker", "Y"), class = "data.frame", row.names = c("3213", 1749 | + "1295", "915", "2833", "1487", "1885")) 1750 | > 1751 | > SNP1x 1752 | Animal Marker x 1753 | 3213 194073197 P1001 2 1754 | 1295 194073197 P1002 1 1755 | 915 194073197 P1004 2 1756 | 2833 194073197 P1005 0 1757 | 1487 194073197 P1006 2 1758 | > SNP4 1759 | Animal Marker Y 1760 | 3213 194073197 P1001 0.021088 1761 | 1295 194073197 P1002 0.021088 1762 | 915 194073197 P1004 0.021088 1763 | 2833 194073197 P1005 0.021088 1764 | 1487 194073197 P1006 0.021088 1765 | 1885 194073197 P1007 0.021088 1766 | > 1767 | > library(sqldf) 1768 | > sqldf("select * from SNP4 left join SNP1x using (Animal, Marker)") 1769 | Animal Marker Y x 1770 | 1 194073197 P1001 0.021088 2 1771 | 2 194073197 P1002 0.021088 1 1772 | 3 194073197 P1004 0.021088 2 1773 | 4 194073197 P1005 0.021088 0 1774 | 5 194073197 P1006 0.021088 2 1775 | 6 194073197 P1007 0.021088 NA 1776 | > # or if that takes up too much memory 1777 | > # create/use/destroy external database 1778 | > sqldf("select * from SNP4 left join SNP1x using (Animal, Marker)", dbname = "test.db") 1779 | Animal Marker Y x 1780 | 
1 194073197 P1001 0.021088 2 1781 | 2 194073197 P1002 0.021088 1 1782 | 3 194073197 P1004 0.021088 2 1783 | 4 194073197 P1005 0.021088 0 1784 | 5 194073197 P1006 0.021088 2 1785 | 6 194073197 P1007 0.021088 NA 1786 | ``` 1787 | 1788 | ```r 1789 | > # Example 4f. Another temporal join. 1790 | > # join DF2 to row in DF for which DF.tt and DF2.tt are closest 1791 | > 1792 | > DF <- structure(list(tt = c(3, 6)), .Names = "tt", row.names = c(NA, 1793 | + -2L), class = "data.frame") 1794 | > DF 1795 | tt 1796 | 1 3 1797 | 2 6 1798 | > 1799 | > DF2 <- structure(list(tt = c(1, 2, 3, 4, 5, 7), d = c(8.3, 10.3, 19, 1800 | + 16, 15.6, 19.8)), .Names = c("tt", "d"), row.names = c(NA, -6L 1801 | + ), class = "data.frame", reference = "A1.4, p. 270") 1802 | > DF2 1803 | tt d 1804 | 1 1 8.3 1805 | 2 2 10.3 1806 | 3 3 19.0 1807 | 4 4 16.0 1808 | 5 5 15.6 1809 | 6 7 19.8 1810 | > 1811 | > out <- sqldf("select * from DF d, DF2 a, DF2 b 1812 | + where a.row_names = b.row_names - 1 1813 | + and d.tt > a.tt and d.tt <= b.tt", 1814 | + row.names = TRUE) 1815 | > 1816 | > out$dd <- with(out, ifelse(tt < (tt.1 + tt.2) / 2, d, d.1)) 1817 | > out 1818 | tt tt.1 d tt.2 d.1 dd 1819 | 1 3 2 10.3 3 19.0 19.0 1820 | 2 6 5 15.6 7 19.8 19.8 1821 | ``` 1822 | 1823 | Example 4g. Self Join. 
There is an example of a self-join here: 1824 | [problem](https://stat.ethz.ch/pipermail/r-help/2010-March/232314.html) 1825 | and answer here: 1826 | 1827 | ```r 1828 | > DF <- structure(list(Actor = c("Jim", "Bob", "Bob", "Larry", "Alice", "Tom", "Tom", "Tom", "Alice", "Nancy"), Act = c("A", "A", "C", "D", "C", "F", "D", "A", "B", "B")), .Names = c("Actor", "Act" ), class = "data.frame", row.names = c(NA, -10L)) 1829 | 1830 | > subset(unique(merge(DF, DF, by = 2)), Actor.x < Actor.y) 1831 | Act Actor.x Actor.y 1832 | 3 A Jim Tom 1833 | 4 A Bob Jim 1834 | 6 A Bob Tom 1835 | 11 B Alice Nancy 1836 | 16 C Alice Bob 1837 | 20 D Larry Tom 1838 | 1839 | > sqldf("select A.Act, A.Actor, B.Actor 1840 | + from DF A join DF B 1841 | + where A.Act = B.Act and A.Actor < B.Actor 1842 | + order by A.Act, A.Actor") 1843 | Act Actor Actor 1844 | 1 A Bob Jim 1845 | 2 A Bob Tom 1846 | 3 A Jim Tom 1847 | 4 B Alice Nancy 1848 | 5 C Alice Bob 1849 | 6 D Larry Tom 1850 | ``` 1851 | 1852 | Thanks to Raj Morejoys for the correction. 1853 | 1854 | Here is [another example of a self 1855 | join](https://stat.ethz.ch/pipermail/r-help/2011-February/269680.html) 1856 | to create pairs which is followed by a second self join to produce pairs 1857 | of pairs. This [stackoverflow 1858 | example](https://stackoverflow.com/questions/11448133/double-merge-two-data-frames-in-r) 1859 | illustrates an sqldf triple join in which one table participates twice. 1860 | 1861 | Example 4h. Join nearby times. There is an example of joining records 1862 | that are close but not necessarily exactly the same here: 1863 | [problem](https://stat.ethz.ch/pipermail/r-help/2010-March/232588.html) 1864 | and 1865 | [answer](https://stat.ethz.ch/pipermail/r-help/attachments/20100320/4ccb548f/attachment.pl) 1866 | .
Also taking successive differences involves joining adjacent times and 1867 | this is illustrated 1868 | [here](https://stackoverflow.com/questions/6695673/find-standard-deviation-of-first-differences-of-series-defined-with-group-by-usin) 1869 | . 1870 | 1871 | Here is an example where we align time series Sy to series Sx by 1872 | averaging all points of Sy within w = 0.25 units of each Sx time point. 1873 | Tx and X are the times and values of Sx and Ty and Y are the times and 1874 | values of Sy. 1875 | 1876 | ```r 1877 | Tx <- seq(1, N, 0.5) 1878 | Tx <- Tx + rnorm(length(Tx), 0, 0.1) 1879 | X <- sin(Tx/10.0) + sin(Tx/5.0) + rnorm(length(Tx), 0, 0.1) 1880 | Ty <- seq(1, N, 0.3333) 1881 | Ty <- Ty + rnorm(length(Ty), 0, 0.02) 1882 | Y <- sin(Ty/10.0) + sin(Ty/5.0) + rnorm(length(Ty), 0, 0.1) 1883 | w <- 0.25 1884 | 1885 | system.time(out1 <- sapply(Tx, function(tx) mean(Y[Ty >= tx-w & Ty <= tx+w]))) 1886 | 1887 | library(sqldf) 1888 | Sx <- data.frame(Tx, X) 1889 | Sy <- data.frame(Ty, Y) 1890 | 1891 | system.time(out.sqldf <- sqldf(c("create index idx on Sx(Tx)", 1892 | "select Tx, avg(Y) from main.Sx, Sy 1893 | where Ty + 0.25 >= Tx and Ty - 0.25 <= Tx group by Tx"))) 1894 | 1895 | all.equal(out.sqldf[,2], out1) # TRUE 1896 | ``` 1897 | 1898 | Example 4i. Speeding up joins with indexes. Here is an example of 1899 | speeding up a join by using indexes on a single join column 1900 | [here](https://statcompute.wordpress.com/2013/06/09/improve-the-efficiency-in-joining-data-with-index/) 1901 | and [here](https://stat.ethz.ch/pipermail/r-help/2010-March/232688.html) 1902 | and on two join columns below. Note that the `create index` statements 1903 | in each example also has the effect of reading in the data frames into 1904 | the `main` database of SQLite. The `select` statement refers to 1905 | `main.DF1` rather than just `DF1` so that it accesses that copy of `DF1` 1906 | in `main` which we just indexed rather than the unindexed `DF1` in R. 
1907 | Similar comments apply to `DF2`. The statement 1908 | `sqldf("select * from sqlite_master")` will list the names and related 1909 | info for all tables in `main`. 1910 | 1911 | ```r 1912 | > set.seed(1) 1913 | > n <- 1000000 1914 | > 1915 | > DF1 <- data.frame(a = sample(n, n, replace = TRUE), 1916 | + b = sample(4, n, replace = TRUE), c1 = runif(n)) 1917 | > 1918 | > DF2 <- data.frame(a = sample(n, n, replace = TRUE), 1919 | + b = sample(4, n, replace = TRUE), c2 = runif(n)) 1920 | > 1921 | > library(sqldf) 1922 | Loading required package: DBI 1923 | Loading required package: RSQLite 1924 | Loading required package: gsubfn 1925 | Loading required package: proto 1926 | Loading required package: chron 1927 | > 1928 | > sqldf() 1929 | 1930 | > system.time(sqldf("create index ai1 on DF1(a, b)")) 1931 | Loading required package: tcltk 1932 | Loading Tcl/Tk interface ... done 1933 | user system elapsed 1934 | 16.69 0.19 19.12 1935 | > system.time(sqldf("create index ai2 on DF2(a, b)")) 1936 | user system elapsed 1937 | 16.60 0.03 17.48 1938 | > system.time(sqldf("select * from main.DF1 natural join main.DF2")) 1939 | user system elapsed 1940 | 7.76 0.06 8.23 1941 | > sqldf() 1942 | ``` 1943 | 1944 | The sqldf statements above could also be done in one sqldf call like 1945 | this: 1946 | 1947 | ```r 1948 | # define DF1 and DF2 as before 1949 | set.seed(1) 1950 | n <- 1000000 1951 | DF1 <- data.frame(a = sample(n, n, replace = TRUE), 1952 | b = sample(4, n, replace = TRUE), c1 = runif(n)) 1953 | DF2 <- data.frame(a = sample(n, n, replace = TRUE), 1954 | b = sample(4, n, replace = TRUE), c2 = runif(n)) 1955 | 1956 | # combine all sqldf calls from before into one call 1957 | 1958 | result <- sqldf(c("create index ai1 on DF1(a, b)", 1959 | "create index ai2 on DF2(a, b)", 1960 | "select * from main.DF1 natural join main.DF2")) 1961 | ``` 1962 | 1963 | Note that if your data is so large that you need indexes it may be too 1964 | large to store the database in memory. 
If you find its overflowing 1965 | memory then use the `dbname=` sqldf argument, e.g. 1966 | `sqldf(c("create...", "create...", "select..."), dbname = tempfile())` 1967 | so that it stores the intermediate results in an external database 1968 | rather than memory. 1969 | 1970 | *Note:* The index `ai1` is not actually used so we could have saved the 1971 | time it took to create it, creating only `ai2`. 1972 | 1973 | ```r 1974 | sqldf(c("create index ai2 on DF2(a, b)", "select * from DF1 natural join main.DF2")) 1975 | ``` 1976 | 1977 | Example 4j. Per Group Max and Min 1978 | 1979 | Note that the Date variable gets passed to SQLite as number of days 1980 | since 1970-01-01 whereas SQLite uses an earlier origin so we add 1981 | `julianday('1970-01-01')` to convert the origin of R's `"Date"` class to 1982 | SQLite's origin. Note that the output column called `Date` is 1983 | automatically converted to `"Date"` class by the sqldf heuristic because 1984 | there is an input column that has the same name. 
1985 | 1986 | ```r 1987 | > URL <- "https://ichart.finance.yahoo.com/table.csv?s=GOOG&a=07&b=19&c=2004&d=03&e=16&f=2010&g=d&ignore=.csv" 1988 | > DF25 <- read.csv(URL, nrows = 25) 1989 | > DF25$Date <- as.Date(DF25$Date) 1990 | > 1991 | > sqldf("select Date, a.High, a.Low, b.Close, a.Volume 1992 | + from (select max(Date) Date, min(Low) Low, max(High) High, sum(Volume) Volume 1993 | + from DF25 1994 | + group by date(Date + julianday('1970-01-01'), 'start of month') 1995 | + ) as a join DF25 b using(Date)") 1996 | Date High Low Close Volume 1997 | 1 2010-03-31 588.28 539.70 567.12 51541600 1998 | 2 2010-04-16 597.84 549.63 550.15 41201900 1999 | ``` 2000 | 2001 | and here is another shorter one that uses a trick of Magnus Hagander in 2002 | the second Stackoverflow link below: 2003 | 2004 | ```r 2005 | > sqldf("select 2006 | + max(Date) Date, 2007 | + max(High) High, 2008 | + min(Low) Low, 2009 | + max(100000 * Date + Close) % 100000 Close, 2010 | + sum(Volume) Volume 2011 | + from DF25 2012 | + group by date(Date + julianday('1970-01-01'), 'start of month')") 2013 | Date High Low Close Volume 2014 | 1 2010-03-31 588.28 539.70 567 51541600 2015 | 2 2010-04-16 597.84 549.63 550 41201900 2016 | ``` 2017 | 2018 | Also see [this Xaprb 2019 | link](https://www.xaprb.com/blog/2007/03/14/how-to-find-the-max-row-per-group-in-sql-without-subqueries/) 2020 | for an approach without subqueries and for more discussion see [this 2021 | stackoverflow 2022 | link](https://stackoverflow.com/questions/121387/sql-fetch-the-row-which-has-the-max-value-for-a-column) 2023 | and [this stackoverflow 2024 | link](https://stackoverflow.com/questions/1140254/postgresql-vlookup). 2025 | The last link shows how to use analytical queries which are available in 2026 | PostgreSQL -- the PostgreSQL database, like SQLite and H2, is supported 2027 | by sqldf. 2028 | 2029 | Example 5. 
Insert Variables[](#Example_5._Insert_Variables) 2030 | ----------------------------------------------------------- 2031 | 2032 | Here is an example of inserting evaluated variables into a query using 2033 | [gsubfn](https://code.google.com/p/gsubfn/) quasi-perl-style string 2034 | interpolation. gsubfn is used by sqldf so it's already loaded. Note that 2035 | we must use the `fn$` prefix to invoke the interpolation functionality: 2036 | 2037 | ```r 2038 | > minSL <- 7 2039 | > limit <- 3 2040 | > species <- "virginica" 2041 | > fn$sqldf("select * from iris where \"Sepal.Length\" > $minSL and species = '$species' limit $limit") 2042 | 2043 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2044 | 1 7.1 3.0 5.9 2.1 virginica 2045 | 2 7.6 3.0 6.6 2.1 virginica 2046 | 3 7.3 2.9 6.3 1.8 virginica 2047 | ``` 2048 | 2049 | Example 6. File Input[](#Example_6._File_Input) 2050 | ----------------------------------------------- 2051 | 2052 | Note that there is a new command `read.csv.sql` which provides an 2053 | alternate interface to the approach discussed in this section. See 2054 | Example 13 for that. 2055 | 2056 | sqldf normally deletes any database it creates after completion but the 2057 | sample code [at the bottom of this 2058 | post](https://stat.ethz.ch/pipermail/r-help/2010-October/257270.html) 2059 | shows how to set up a database and read a file into it without having 2060 | the database destroyed afterwards. 2061 | 2062 | sqldf will not only look for data frames used in the SQL statement but 2063 | will also look for R objects of class `"file"`. For such objects it will 2064 | directly import the associated file into the database without going 2065 | through R allowing files that are larger than an R workspace to be 2066 | handled and also providing for potential speed advantages.
That is, if 2067 | `f <- file("abc.csv")` is a file object and `f` is used as the table 2068 | name in the sql statement then the file `abc.csv` is imported into the 2069 | database as table `f`. With SQLite, the actual reading of the file into 2070 | the database is done in a C routine in RSQLite so the file is 2071 | transferred directly to the database without going through R. If the 2072 | `sqldf` argument `dbname` is used then it specifies a filename (either 2073 | existing or created by `sqldf` if not existing). That filename is used 2074 | as a database (rather than memory) allowing larger files than physical 2075 | memory. By using an appropriate `where` statement or a subset of column 2076 | names a portion of the table can be retrieved into R even if the file 2077 | itself is too large for R or for memory. 2078 | 2079 | There are some caveats. The RSQLite `dbWriteTable`/`sqliteImportFile` 2080 | routines that `sqldf` uses to transfer the file directly to the database 2081 | are intended for speed thus they are not as flexible as `read.table`. 2082 | Also they have slightly different defaults. The default for `sep` is 2083 | `file.format = list(sep = ",")`. If the first row of the file has one 2084 | fewer component than subsequent ones then it assumes that 2085 | `file.format = list(header = TRUE, row.names = TRUE)` and otherwise that 2086 | `file.format = list(header = FALSE, row.names = FALSE)`. `.csv` file 2087 | format is only partly supported -- quotes are not regarded as special. 2088 | 2089 | In addition to the examples below there is an example 2090 | [here](http://web.archive.org/web/20140429215324/http://stat.ethz.ch/pipermail/r-help/2009-May/199991.html) and 2091 | another one with performance results 2092 | [here](http://www.cerebralmastication.com/2009/11/loading-big-data-into-r/). 2093 | 2094 | ```r 2095 | > # Example 6a. 
2096 | > # test of file connections with sqldf 2097 | > 2098 | > # create test .csv file of just 3 records 2099 | > write.table(head(iris, 3), "iris3.dat", sep = ",", quote = FALSE) 2100 | > 2101 | > # look at contents of iris3.dat 2102 | > readLines("iris3.dat") 2103 | [1] "Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species" 2104 | [2] "1,5.1,3.5,1.4,0.2,setosa" 2105 | [3] "2,4.9,3,1.4,0.2,setosa" 2106 | [4] "3,4.7,3.2,1.3,0.2,setosa" 2107 | > 2108 | > # set up file connection 2109 | > iris3 <- file("iris3.dat") 2110 | > sqldf('select * from iris3 where "Sepal.Width" > 3') 2111 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2112 | 1 5.1 3.5 1.4 0.2 setosa 2113 | 2 4.7 3.2 1.3 0.2 setosa 2114 | > 2115 | > # Example 6b. 2116 | > # similar but uses disk - useful if file were large 2117 | > # According to https://www.sqlite.org/whentouse.html 2118 | > # SQLite can handle files up to several dozen gigabytes. 2119 | > # (Note in this case readTable and readTableIndex in R.utils 2120 | > # package or read.table from the base of R, setting the colClasses 2121 | > # argument to "NULL" for columns you don't want read in, might be 2122 | > # alternatives.) 2123 | > sqldf('select * from iris3 where "Sepal.Width" > 3', dbname = tempfile()) 2124 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2125 | 1 5.1 3.5 1.4 0.2 setosa 2126 | 2 4.7 3.2 1.3 0.2 setosa 2127 | 2128 | > # Example 6c. 2129 | > # with this format, header=TRUE needs to be specified 2130 | > write.table(head(iris, 3), "iris3a.dat", sep = ",", quote = FALSE, 2131 | + row.names = FALSE) 2132 | > iris3a <- file("iris3a.dat") 2133 | > sqldf("select * from iris3a", file.format = list(header = TRUE)) 2134 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2135 | 1 5.1 3.5 1.4 0.2 setosa 2136 | 2 4.9 3.0 1.4 0.2 setosa 2137 | 3 4.7 3.2 1.3 0.2 setosa 2138 | 2139 | > # Example 6d. 
2140 | > # header can alternately be specified as object attribute 2141 | > attr(iris3a, "file.format") <- list(header = TRUE) 2142 | > sqldf("select * from iris3a") 2143 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2144 | 1 5.1 3.5 1.4 0.2 setosa 2145 | 2 4.9 3.0 1.4 0.2 setosa 2146 | 3 4.7 3.2 1.3 0.2 setosa 2147 | 2148 | > # Example 6e. 2149 | > # create a test file with all 150 records from iris 2150 | > # and select 4 records at random without reading entire file into R 2151 | > write.table(iris, "iris150.dat", sep = ",", quote = FALSE) 2152 | > iris150 <- file("iris150.dat") 2153 | > sqldf("select * from iris150 order by random(*) limit 4") 2154 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2155 | 1 4.9 2.5 4.5 1.7 virginica 2156 | 2 4.8 3.0 1.4 0.1 setosa 2157 | 3 6.1 2.6 5.6 1.4 virginica 2158 | 4 7.4 2.8 6.1 1.9 virginica 2159 | > 2160 | > # or use read.csv.sql and its just one line 2161 | > read.csv.sql("iris150.dat", sql = "select * from file order by random(*) limit 4") 2162 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2163 | 1 4.9 2.4 3.3 1.0 versicolor 2164 | 2 5.8 2.7 4.1 1.0 versicolor 2165 | 3 7.4 2.8 6.1 1.9 virginica 2166 | 4 5.1 3.5 1.4 0.3 setosa 2167 | ``` 2168 | 2169 | Example 6f. 
If our file has fixed width fields rather than delimited 2170 | then we can still handle it if we parse the lines manually with substr: 2171 | 2172 | ```r 2173 | # write some test data to "fixed" 2174 | # Field 1 has width of 1 column and field 2 has 4 columns 2175 | cat("1 8.3 2176 | 210.3 2177 | 319.0 2178 | 416.0 2179 | 515.6 2180 | 719.8 2181 | ", file = "fixed") 2182 | 2183 | # get 3 random records using sqldf 2184 | fixed <- file("fixed") 2185 | attr(fixed, "file.format") <- list(sep = ";") # ; can be any char not in file 2186 | sqldf("select substr(V1, 1, 1) f1, substr(V1, 2, 4) f2 from fixed order by random(*) limit 3") 2187 | ``` 2188 | 2189 | Another example of fixed width data is 2190 | [here](https://sites.google.com/site/timriffepersonal/DemogBlog/newformetrickforworkingwithbigishdatainr) 2191 | (however, note that changing the sep needs to be done in the example in 2192 | that link too). 2193 | 2194 | Example 6g. Defaults. 2195 | 2196 | ```r 2197 | # If first row has one fewer columns than subsequent rows then 2198 | # header <- row.names <- TRUE is assumed as in example 6a; otherwise, 2199 | # header <- row.names <- FALSE is assumed as shown here: 2200 | 2201 | > write.table(head(iris, 3), "iris3nohdr.dat", col.names = FALSE, row.names = FALSE, sep = ",", quote = FALSE) 2202 | > readLines("iris3nohdr.dat") 2203 | [1] "5.1,3.5,1.4,0.2,setosa" "4.9,3,1.4,0.2,setosa" "4.7,3.2,1.3,0.2,setosa" 2204 | > sqldf("select * from iris3nohdr") 2205 | V1 V2 V3 V4 V5 2206 | 1 5.1 3.5 1.4 0.2 setosa 2207 | 2 4.9 3.0 1.4 0.2 setosa 2208 | 3 4.7 3.2 1.3 0.2 setosa 2209 | ``` 2210 | 2211 | Example 7. Nested Select[](#Example_7._Nested_Select) 2212 | ----------------------------------------------------- 2213 | 2214 | For each species show the two rows with the largest sepal lengths: 2215 | 2216 | ```r 2217 | > # Example 7a. 
2218 | > sqldf('select * from iris i 2219 | + where rowid in 2220 | + (select rowid from iris where Species = i.Species order by "Sepal.Length" desc limit 2) 2221 | + order by i.Species, i."Sepal.Length" desc') 2222 | 2223 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2224 | 1 5.8 4.0 1.2 0.2 setosa 2225 | 2 5.7 4.4 1.5 0.4 setosa 2226 | 3 7.0 3.2 4.7 1.4 versicolor 2227 | 4 6.9 3.1 4.9 1.5 versicolor 2228 | 5 7.9 3.8 6.4 2.0 virginica 2229 | 6 7.7 3.8 6.7 2.2 virginica 2230 | ``` 2231 | 2232 | Here is a similar example. In this one `DF` represents a time series 2233 | whose values are in column `x` and whose times are dates in column `tt`. 2234 | The times have gaps -- in fact only every other day is present. The code 2235 | below displays the first row at or past the 21st of the month for each 2236 | year/month. First we append year, month and day columns using 2237 | `month.day.year` from the `chron` package and then do the computation 2238 | using `sqldf`. (For a version of this using the `zoo` package rather 2239 | than `sqldf` see: 2240 | [https://stat.ethz.ch/pipermail/r-help/2007-November/145925.html](https://stat.ethz.ch/pipermail/r-help/2007-November/145925.html)). 2241 | 2242 | ```r 2243 | > # Example 7b. 2244 | > # 2245 | > library(chron) 2246 | > DF <- data.frame(x = 101:200, tt = as.Date("2000-01-01") + seq(0, len = 100, by = 2)) 2247 | > DF <- cbind(DF, month.day.year(unclass(DF$tt))) 2248 | > 2249 | > sqldf("select * from DF d 2250 | + where rowid in 2251 | + (select rowid from DF 2252 | + where year = d.year and month = d.month and day >= 21 limit 1) 2253 | + order by tt") 2254 | x tt month day year 2255 | 1 111 2000-01-21 1 21 2000 2256 | 2 127 2000-02-22 2 22 2000 2257 | 3 141 2000-03-21 3 21 2000 2258 | 4 157 2000-04-22 4 22 2000 2259 | 5 172 2000-05-22 5 22 2000 2260 | 6 187 2000-06-21 6 21 2000 2261 | ``` 2262 | 2263 | Here is another example of a nested select. 
We select each row of a for 2264 | which st/en overlaps with some st/en of b. 2265 | 2266 | ```r 2267 | > # Example 7c. 2268 | > # 2269 | > a <- read.table(textConnection("st en 2270 | + 1 4 2271 | + 11 14 2272 | + 3 4"), header = TRUE) 2273 | > 2274 | > b <- read.table(textConnection("st en 2275 | + 2 5 2276 | + 3 6 2277 | + 30 44"), TRUE) 2278 | > 2279 | > sqldf("select * from a where 2280 | + (select count(*) from b where a.en >= b.st and b.en >= a.st) > 0") 2281 | st en 2282 | 1 1 4 2283 | 2 3 4 2284 | ``` 2285 | 2286 | 7d. Another example of a nested select with sqldf is shown 2287 | [here](https://stat.ethz.ch/pipermail/r-help/2010-March/231975.html) 2288 | 2289 | Example 8. Specifying File Format[](#Example_8._Specifying_File_Format) 2290 | ----------------------------------------------------------------------- 2291 | 2292 | When using file() as used as in Example 6 RSQLite reads in the first 50 2293 | lines to determine the column classes. What if they all have numbers in 2294 | them but then later we start to see letters? In that case we will have 2295 | to override its choice. Here are two ways: 2296 | 2297 | ```r 2298 | library(sqldf) 2299 | 2300 | # example example 8a - file.format attribute on file.object 2301 | 2302 | numStr <- as.character(1:100) 2303 | DF <- data.frame(a = c(numStr, "Hello")) 2304 | write.table(DF, file = "~/tmp.csv", quote = FALSE, sep = ",") 2305 | ff <- file("~/tmp.csv") 2306 | 2307 | attr(ff, "file.format") <- list(colClasses = c(a = "character")) 2308 | 2309 | tail(sqldf("select * from ff")) 2310 | 2311 | 2312 | # example 8b - using file.format argument 2313 | 2314 | numStr <- as.character(1:100) 2315 | DF <- data.frame(a = c(numStr, "Hello")) 2316 | write.table(DF, file = "~/tmp.csv", quote = FALSE, sep = ",") 2317 | ff <- file("~/tmp.csv") 2318 | 2319 | tail(sqldf("select * from ff", 2320 | file.format = list(colClasses = c(a = "character")))) 2321 | ``` 2322 | 2323 | Example 9. 
Working with Databases[](#Example_9.__Working_with_Databases) 2324 | ------------------------------------------------------------------------ 2325 | 2326 | sqldf is usually used to operate on data frames but it can be used to 2327 | store a table in a database and repeatedly query it in subsequent sqldf 2328 | statements (although in that case you might be better off just using 2329 | RSQLite or other database directly). There are two ways to do this. In 2330 | this Example section we show how to do it using the fact that if you 2331 | specify the database explicitly then it does not delete the database at 2332 | the end and if you create a table explicitly using create table then it 2333 | does not delete the table (however, note that that will result in 2334 | duplicate tables in the database so it will take up twice as much space 2335 | as one table). A second way to do this is to use persistent connections 2336 | as shown in the Example section after this one. 2337 | 2338 | ```r 2339 | # create new empty database called mydb 2340 | sqldf("attach 'mydb' as new") 2341 | 2342 | # create a new table, mytab, in the new database 2343 | # Note that sqldf does not delete tables created from create. 2344 | sqldf("create table mytab as select * from BOD", dbname = "mydb") 2345 | 2346 | # shows its still there 2347 | sqldf("select * from mytab", dbname = "mydb") 2348 | 2349 | # read a file into the mydb data base using read.csv.sql without deleting it 2350 | # 2351 | # 1. First create a test file. 2352 | # 2. Then read it into the mydb database we created using the sqldf("attach...") above. 2353 | # Since sqldf automatically cleans up after itself we hide 2354 | # the table creation in an sql statement so table is not deleted. 2355 | # 3. Finally list the table names in the database. 
2356 | 2357 | write.table(BOD, file = "~/tmp.csv", quote = FALSE, sep = ",") 2358 | read.csv.sql("~/tmp.csv", sql = "create table mytab as select * from file", 2359 | dbname = "mydb") 2360 | sqldf("select * from sqlite_master", dbname = "mydb") 2361 | ``` 2362 | 2363 | Example 10. Persistent Connections[](#Example_10._Persistent_Connections) 2364 | ------------------------------------------------------------------------- 2365 | 2366 | These three examples show the use of persistent connections in sqldf. 2367 | This would be used when one has a large database that one wants to store 2368 | and then make queries from so that one does not have to reload it on 2369 | each execution of sqldf. (Note that if one just needs a series of sql 2370 | statements ending in a single query an alternative would be just to use 2371 | a vector of sql statements in a single sqldf call.) 2372 | 2373 | ```r 2374 | > # Example 10a. 2375 | > 2376 | > # create test .csv file of just 3 records (same as example 6) 2377 | > write.table(head(iris, 3), "iris3.dat", sep = ",", quote = FALSE) 2378 | > # set up file connection 2379 | > iris3 <- file("iris3.dat") 2380 | > # creates connection so in memory database persists after sqldf call 2381 | > sqldf() 2382 | 2383 | > 2384 | > # uses connection just created 2385 | > sqldf('select * from iris3 where "Sepal.Width" > 3') 2386 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2387 | 1 5.1 3.5 1.4 0.2 setosa 2388 | 2 4.7 3.2 1.3 0.2 setosa 2389 | > # we now have iris3 variable in R workspace and an iris3 table 2390 | > # so ensure sqldf uses the one in the main database by writing 2391 | > # main.iris3. (Another possibility here would have been to 2392 | > # delete the iris3 variable from the R workspace to avoid the 2393 | > # ambiguity -- in that case one could just write iris3 instead 2394 | > # of main.iris3.) 
2395 | > sqldf('select * from main.iris3 where "Sepal.Width" = 3') 2396 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2397 | 1 4.9 3 1.4 0.2 setosa 2398 | > 2399 | > # close 2400 | > sqldf() 2401 | NULL 2402 | 2403 | > # Example 10b. 2404 | > # 2405 | > # Here is another way to do example 10a. We use the same iris3, 2406 | > # iris3.dat and sqldf development version as above. 2407 | > # We grab connection explicitly, set up the database using sqldf and then 2408 | > # for the second call we call dbGetQuery from RSQLite. 2409 | > # In that case we don't need to qualify iris3 as main.iris3 since 2410 | > # RSQLite would not understand R variables anyways so there is no 2411 | > # ambiguity. 2412 | 2413 | > con <- sqldf() 2414 | > 2415 | > # uses connection just created 2416 | > sqldf('select * from iris3 where "Sepal.Width" > 3') 2417 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2418 | 1 5.1 3.5 1.4 0.2 setosa 2419 | 2 4.7 3.2 1.3 0.2 setosa 2420 | > dbGetQuery(con, 'select * from iris3 where "Sepal.Width" = 3') 2421 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2422 | 1 4.9 3 1.4 0.2 setosa 2423 | > 2424 | > # close 2425 | > sqldf() 2426 | NULL 2427 | ``` 2428 | 2429 | Here is an example of reading a csv file using read.csv.sql and then 2430 | reading it again using a persistent connection: 2431 | 2432 | ```r 2433 | # Example 10c. 2434 | 2435 | write.table(iris, "iris.csv", sep = ",", quote = FALSE) 2436 | 2437 | sqldf() 2438 | read.csv.sql("iris.csv", sql = "select count(*) from file") 2439 | 2440 | # now re-read it from the sqlite database 2441 | dd <- sqldf("select * from file") 2442 | 2443 | # now close the connection and destroy the database 2444 | sqldf() 2445 | ``` 2446 | 2447 | Example 11. 
Between and Alternatives[](#Example_11._Between_and_Alternatives) 2448 | ----------------------------------------------------------------------------- 2449 | 2450 | ```r 2451 | # example thanks to Michael Rehberg 2452 | # 2453 | # build sample dataframes 2454 | seqdf <- data.frame(thetime=seq(100,225,5),thevalue=factor(letters)) 2455 | boundsdf <- data.frame(thestart=c(110,160,200),theend=c(130,180,220),groupID=c(555,666,777)) 2456 | 2457 | # run the query using two inequalities 2458 | testquery_1 <- sqldf("select seqdf.thetime, seqdf.thevalue, boundsdf.groupID 2459 | from seqdf left join boundsdf on (seqdf.thetime <= boundsdf.theend) and (seqdf.thetime >= boundsdf.thestart)") 2460 | 2461 | # run the same query using 'between...and' clause 2462 | testquery_2 <- sqldf("select seqdf.thetime, seqdf.thevalue, boundsdf.groupID 2463 | from seqdf LEFT JOIN boundsdf ON (seqdf.thetime BETWEEN boundsdf.thestart AND boundsdf.theend)") 2464 | ``` 2465 | 2466 | Example 12. Combine two files in permanent database[](#Example_12._Combine_two_files_in_permanent_database) 2467 | ----------------------------------------------------------------------------------------------------------- 2468 | 2469 | When we issue a series of normal `sqldf` statements after each one sqldf 2470 | automatically removes any tables and databases it creates in that 2471 | statement; however, it does not know about ones that `sqlite` creates so 2472 | a database created using `attach` and the tables created using 2473 | `create table` won't be deleted. 2474 | 2475 | Also if `sqldf` is used without the `x=` argument (omitting x= denotes 2476 | the opening of a persistent connection) then objects created in the 2477 | database including those by `sqldf` and `sqlite` are not deleted when 2478 | the persistent connection is destroyed by the next `sqldf` statement 2479 | with no `x=` argument. 
2480 | 2481 | If we have forgotten whether we have a connection open or not we can 2482 | check either of these: 2483 | 2484 | ```r 2485 | dbListConnections(SQLite()) # from DBI 2486 | 2487 | getOption("sqldf.connection") # set by sqldf 2488 | ``` 2489 | 2490 | Here is an example that illustrates part of the above. See the prior 2491 | examples for more. 2492 | 2493 | ```r 2494 | > # set up some test data 2495 | > write.table(head(iris, 3), "irishead.dat", sep = ",", quote = FALSE) 2496 | > write.table(tail(iris, 3), "iristail.dat", sep = ",", quote = FALSE) 2497 | > 2498 | > library(sqldf) 2499 | > 2500 | > # create new empty database called mydb 2501 | > sqldf("attach 'mydb' as new") 2502 | NULL 2503 | > 2504 | > irishead <- file("irishead.dat") 2505 | > iristail <- file("iristail.dat") 2506 | > 2507 | > # read tables into mydb 2508 | > sqldf("select count(*) from irishead", dbname = "mydb") 2509 | count(*) 2510 | 1 3 2511 | > sqldf("select count(*) from iristail", dbname = "mydb") 2512 | count(*) 2513 | 1 3 2514 | > 2515 | > # get count of all records from union 2516 | > sqldf('select count(*) from (select * from main.irishead 2517 | + union 2518 | + select * from main.iristail)', dbname = "mydb") 2519 | count(*) 2520 | 1 6 2521 | ``` 2522 | 2523 | Example 13. read.csv.sql and read.csv2.sql[](#Example_13._read.csv.sql_and_read.csv2.sql) 2524 | ----------------------------------------------------------------------------------------- 2525 | 2526 | `read.csv.sql` is an interface to `sqldf` that works like `read.csv` in 2527 | R except that it also provides an `sql=` argument and not all of the 2528 | other arguments of `read.csv` are supported. It uses (1) SQLite's import 2529 | facility via RSQLite to read the input file into a temporary disk-based 2530 | SQLite database which is created on the fly. (2) Then it uses the 2531 | provided SQL statement to read the table so created into R. 
As the first 2532 | step imports the data directly into SQLite without going through R it 2533 | can handle larger files than R itself can handle as long as the SQL 2534 | statement filters it to a size that R can handle. Here is Example 6c 2535 | redone using this facility: 2536 | 2537 | ```r 2538 | # Example 13a. 2539 | library(sqldf) 2540 | 2541 | write.table(iris, "iris.csv", sep = ",", quote = FALSE, row.names = FALSE) 2542 | iris.csv <- read.csv.sql("iris.csv", 2543 | sql = 'select * from file where "Sepal.Length" > 5') 2544 | 2545 | # Example 13b. read.csv2.sql. Commas are decimals and ; is sep. 2546 | 2547 | library(sqldf) 2548 | Lines <- "Sepal.Length;Sepal.Width;Petal.Length;Petal.Width;Species 2549 | 5,1;3,5;1,4;0,2;setosa 2550 | 4,9;3;1,4;0,2;setosa 2551 | 4,7;3,2;1,3;0,2;setosa 2552 | 4,6;3,1;1,5;0,2;setosa 2553 | " 2554 | cat(Lines, file = "iris2.csv") 2555 | 2556 | iris.csv2 <- read.csv2.sql("iris2.csv", sql = 'select * from file where "Sepal.Length" > 5') 2557 | 2558 | # Example 13c. Use of filter= to process fixed field widths. 2559 | 2560 | # This example assumes gawk is available for use as a filter: 2561 | # https://www.icewalkers.com/Linux/Software/514530/Gawk.html 2562 | # https://gnuwin32.sourceforge.net/packages/gawk.htm 2563 | 2564 | library(sqldf) 2565 | cat("112333 2566 | 123456", file = "fixed.dat") 2567 | cat('BEGIN { FIELDWIDTHS = "2 1 3"; OFS = ","; print "A,B,C" } 2568 | { $1 = $1; print }', file = "fixed.awk") 2569 | 2570 | # the following worked on Windows Vista. One user told me that it only worked if he 2571 | # omitted the eol= argument so try it both ways on your system and use the way that 2572 | # works for your system. 2573 | 2574 | fixed <- read.csv.sql("fixed.dat", eol = "\n", filter = "gawk -f fixed.awk") 2575 | 2576 | # Example 13d. 
Read a csv file into the database but do not drop the database or table 2577 | 2578 | # create test file 2579 | write.table(iris, "iris.csv", sep = ",", quote = FALSE, row.names = FALSE) 2580 | 2581 | # create an empty database (can skip this step if database already exists) 2582 | sqldf("attach mytestdb as new") 2583 | 2584 | # read into table called iris in the mytestdb sqlite database 2585 | read.csv.sql("iris.csv", sql = "create table main.iris as select * from file", dbname = "mytestdb") 2586 | 2587 | # look at first three lines 2588 | sqldf("select * from main.iris limit 3", dbname = "mytestdb") 2589 | 2590 | # example 13e. Read in only column j of a csv file where j may vary. 2591 | 2592 | library(sqldf) 2593 | 2594 | # create test data file 2595 | nms <- names(anscombe) 2596 | write.table(anscombe, "anscombe.dat", sep = ",", quote = FALSE, 2597 | row.names = FALSE) 2598 | 2599 | j <- 2 2600 | DF2 <- fn$read.csv.sql("anscombe.dat", sql = "select `nms[j]` from file") 2601 | ``` 2602 | 2603 | Also see this 2604 | [example](https://stat.ethz.ch/pipermail/r-help/2010-November/260931.html) 2605 | and this further 2606 | [example](https://stackoverflow.com/questions/6966723/how-to-allocate-append-a-large-column-of-date-objects-to-a-data-frame/6966771#6966771). 2607 | The latter illustrates the use of the `method=` argument. 2608 | 2609 | Example 14. Use of spatialite library functions[](#Example_14._Use_of_spatialite_library_functions) 2610 | --------------------------------------------------------------------------------------------------- 2611 | 2612 | ******This example needs to be revised as automatic loading of 2613 | spatialite has been removed from sqldf and replaced with the functions 2614 | in RSQLite.extfuns which are loaded instead****** 2615 | 2616 | This example will only work if spatialite-1.dll is on your PATH. It 2617 | shows accessing a function in that dll. Other than placing it on your 2618 | PATH there is no other setup needed. 
(Note that libspatialite-1.dll is 2619 | only looked up the first time sqldf runs in a session so you should be 2620 | sure that it has been put there before starting sqldf.) 2621 | 2622 | ```r 2623 | > library(sqldf) 2624 | > # stddev_pop is a function in spatialite library similar to sd in R 2625 | > # Note bug: spatialite has stddev_pop and stddev_samp reversed and ditto for var_pop and var_samp. More on bug at: 2626 | > # https://groups.google.com/group/spatialite-users/msg/182f1f629c922607 2627 | > sqldf("select avg(demand), stddev_pop(demand) from BOD") 2628 | avg(demand) stddev_pop(demand) 2629 | 1 14.83333 4.630623 2630 | > c(mean(BOD$demand), sd(BOD$demand)) 2631 | [1] 14.833333 4.630623 2632 | ``` 2633 | 2634 | Example 15. Use of RSQLite.extfuns library functions[](#Example_15._Use_of_RSQLite.extfuns_library_functions) 2635 | ------------------------------------------------------------------------------------------------------------- 2636 | The RSQLite R package includes Liam Healy's extension functions for SQLite. 2637 | In addition to all the [core 2638 | functions](https://www.sqlite.org/lang_corefunc.html), [date 2639 | functions](https://www.sqlite.org/lang_datefunc.html) and [aggregate 2640 | functions](https://www.sqlite.org/lang_aggfunc.html) that SQLite itself 2641 | provides, the following extension functions are available for use within 2642 | SQL select statements: **Math:** acos, asin, atan, atn2, atan2, acosh, 2643 | asinh, atanh, difference, degrees, radians, cos, sin, tan, cot, cosh, 2644 | sinh, tanh, coth, exp, log, log10, power, sign, sqrt, square, ceil, 2645 | floor, pi. **String:** replicate, charindex, leftstr, rightstr, ltrim, 2646 | rtrim, trim, replace, reverse, proper, padl, padr, padc, strfilter. 2647 | **Aggregate:** stdev, variance, mode, median, lower\_quartile, 2648 | upper\_quartile. See the bottom of 2649 | [https://www.sqlite.org/contrib/](https://www.sqlite.org/contrib/) for 2650 | more info on these extension functions. 
2651 | 2652 | ```r 2653 | > sqldf("select avg(demand) mean, variance(demand) var from BOD") 2654 | mean var 2655 | 1 14.83333 21.44267 2656 | > var(BOD$demand) 2657 | [1] 21.44267 2658 | ``` 2659 | 2660 | Example 16. Moving Average[](#Example_16._Moving_Average) 2661 | --------------------------------------------------------- 2662 | 2663 | This is a simplified version of the example in this [r-help 2664 | post](https://stat.ethz.ch/pipermail/r-help/2010-August/249996.html). 2665 | Here we compute the moving average of x for the 3rd to 9th preceding 2666 | values of each date performing it separately for each illness. 2667 | 2668 | ```r 2669 | > Lines <- "date illness x 2670 | + 2006/01/01 DERM 319 2671 | + 2006/01/02 DERM 388 2672 | + 2006/01/03 DERM 336 2673 | + 2006/01/04 DERM 255 2674 | + 2006/01/05 DERM 177 2675 | + 2006/01/06 DERM 377 2676 | + 2006/01/07 DERM 113 2677 | + 2006/01/08 DERM 253 2678 | + 2006/01/09 DERM 316 2679 | + 2006/01/10 DERM 187 2680 | + 2006/01/11 DERM 292 2681 | + 2006/01/12 DERM 275 2682 | + 2006/01/13 DERM 355 2683 | + 2006/01/01 FEVER 3190 2684 | + 2006/01/02 FEVER 3880 2685 | + 2006/01/03 FEVER 3360 2686 | + 2006/01/04 FEVER 2550 2687 | + 2006/01/05 FEVER 1770 2688 | + 2006/01/06 FEVER 3770 2689 | + 2006/01/07 FEVER 1130 2690 | + 2006/01/08 FEVER 2530 2691 | + 2006/01/09 FEVER 3160 2692 | + 2006/01/10 FEVER 1870 2693 | + 2006/01/11 FEVER 2920 2694 | + 2006/01/12 FEVER 2750 2695 | + 2006/01/13 FEVER 3550" 2696 | > 2697 | > DF <- read.table(textConnection(Lines), header = TRUE) 2698 | > DF$date <- as.Date(DF$date) 2699 | > 2700 | > sqldf("select 2701 | + t1.date, 2702 | + avg(t2.x) mean, 2703 | + date(min(t2.date) * 24 * 60 * 60, 'unixepoch') fromdate, 2704 | + date(max(t2.date) * 24 * 60 * 60, 'unixepoch') todate, 2705 | + max(t2.illness) illness 2706 | + from DF t1, DF t2 2707 | + where julianday(t1.date) between julianday(t2.date) + 3 and 2708 | + julianday(t2.date) + 9 2709 | + and t1.illness = t2.illness 2710 | + group by 
t1.illness, t1.date 2711 | + order by t1.illness, t1.date") 2712 | date mean fromdate todate illness 2713 | 1 2006-01-04 319.0000 2006-01-01 2006-01-01 DERM 2714 | 2 2006-01-05 353.5000 2006-01-01 2006-01-02 DERM 2715 | 3 2006-01-06 347.6667 2006-01-01 2006-01-03 DERM 2716 | 4 2006-01-07 324.5000 2006-01-01 2006-01-04 DERM 2717 | 5 2006-01-08 295.0000 2006-01-01 2006-01-05 DERM 2718 | 6 2006-01-09 308.6667 2006-01-01 2006-01-06 DERM 2719 | 7 2006-01-10 280.7143 2006-01-01 2006-01-07 DERM 2720 | 8 2006-01-11 271.2857 2006-01-02 2006-01-08 DERM 2721 | 9 2006-01-12 261.0000 2006-01-03 2006-01-09 DERM 2722 | 10 2006-01-13 239.7143 2006-01-04 2006-01-10 DERM 2723 | 11 2006-01-04 3190.0000 2006-01-01 2006-01-01 FEVER 2724 | 12 2006-01-05 3535.0000 2006-01-01 2006-01-02 FEVER 2725 | 13 2006-01-06 3476.6667 2006-01-01 2006-01-03 FEVER 2726 | 14 2006-01-07 3245.0000 2006-01-01 2006-01-04 FEVER 2727 | 15 2006-01-08 2950.0000 2006-01-01 2006-01-05 FEVER 2728 | 16 2006-01-09 3086.6667 2006-01-01 2006-01-06 FEVER 2729 | 17 2006-01-10 2807.1429 2006-01-01 2006-01-07 FEVER 2730 | 18 2006-01-11 2712.8571 2006-01-02 2006-01-08 FEVER 2731 | 19 2006-01-12 2610.0000 2006-01-03 2006-01-09 FEVER 2732 | 20 2006-01-13 2397.1429 2006-01-04 2006-01-10 FEVER 2733 | ``` 2734 | 2735 | Because of the date processing this is a bit more conveniently done in 2736 | H2 with its support of date class. Using the same `DF` that we just 2737 | defined. Note that SQL functions like AVG and MIN must be written in 2738 | upper case when using H2. 
2739 | 2740 | ```r 2741 | > library(RH2) 2742 | > sqldf("select 2743 | + t1.date, 2744 | + AVG(t2.x) mean, 2745 | + MIN(t2.date) fromdate, 2746 | + MAX(t2.date) todate, 2747 | + t2.illness illness 2748 | + from DF t1, DF t2 2749 | + where t1.date between t2.date + 3 and t2.date + 9 2750 | + and t1.illness = t2.illness 2751 | + group by t1.illness, t1.date 2752 | + order by t1.illness, t1.date") 2753 | date mean fromdate todate illness 2754 | 1 2006-01-04 319 2006-01-01 2006-01-01 DERM 2755 | 2 2006-01-05 353 2006-01-01 2006-01-02 DERM 2756 | 3 2006-01-06 347 2006-01-01 2006-01-03 DERM 2757 | 4 2006-01-07 324 2006-01-01 2006-01-04 DERM 2758 | 5 2006-01-08 295 2006-01-01 2006-01-05 DERM 2759 | 6 2006-01-09 308 2006-01-01 2006-01-06 DERM 2760 | 7 2006-01-10 280 2006-01-01 2006-01-07 DERM 2761 | 8 2006-01-11 271 2006-01-02 2006-01-08 DERM 2762 | 9 2006-01-12 261 2006-01-03 2006-01-09 DERM 2763 | 10 2006-01-13 239 2006-01-04 2006-01-10 DERM 2764 | 11 2006-01-04 3190 2006-01-01 2006-01-01 FEVER 2765 | 12 2006-01-05 3535 2006-01-01 2006-01-02 FEVER 2766 | 13 2006-01-06 3476 2006-01-01 2006-01-03 FEVER 2767 | 14 2006-01-07 3245 2006-01-01 2006-01-04 FEVER 2768 | 15 2006-01-08 2950 2006-01-01 2006-01-05 FEVER 2769 | 16 2006-01-09 3086 2006-01-01 2006-01-06 FEVER 2770 | 17 2006-01-10 2807 2006-01-01 2006-01-07 FEVER 2771 | 18 2006-01-11 2712 2006-01-02 2006-01-08 FEVER 2772 | 19 2006-01-12 2610 2006-01-03 2006-01-09 FEVER 2773 | 20 2006-01-13 2397 2006-01-04 2006-01-10 FEVER 2774 | ``` 2775 | 2776 | Another example which varies somewhat from a strict moving average can 2777 | be found [in this 2778 | post](https://stat.ethz.ch/pipermail/r-help/2011-June/280081.html). 2779 | 2780 | Example 17. Lag[](#Example_17._Lag) 2781 | ----------------------------------- 2782 | 2783 | The following example contributed by Søren Højsgaard shows how to lag a 2784 | column. 
2785 | 2786 | ```r 2787 | ## Create a lagged variable for grouped data 2788 | ## ----------------------------------------- 2789 | # Meaning that in the i'th row we not only have y[i] but also y[i-1]. 2790 | # This is done on a groupwise basis 2791 | library(sqldf) 2792 | set.seed(123) 2793 | DF <- data.frame(id=rep(1:2, each=5), tvar=rep(1:5,2), y=rnorm(1:10)) 2794 | # Data with lagged variable added 2795 | BB <- 2796 | sqldf("select A.id, A.tvar, A.y, B.y as lag 2797 | from DF as A join DF as B 2798 | where A.rowid-1 = B.rowid and A.id=B.id 2799 | order by A.id, A.tvar") 2800 | # Merge with original data: 2801 | DD <- 2802 | sqldf("select DF.*, BB.lag 2803 | from DF left join BB 2804 | on DF.id=BB.id and DF.tvar=BB.tvar") 2805 | # Do it all in one step: 2806 | DD <- 2807 | sqldf("select DF.*, BB.lag 2808 | from DF left join 2809 | ( 2810 | select A.id, A.tvar, A.y, B.y as lag 2811 | from DF as A join DF as B 2812 | where A.rowid-1 = B.rowid and A.id=B.id 2813 | order by A.id, A.tvar 2814 | ) as BB 2815 | on DF.id=BB.id and DF.tvar=BB.tvar") 2816 | ``` 2817 | 2818 | In PostgreSQL's 2819 | [window](https://developer.postgresql.org/pgdocs/postgres/tutorial-window.html) 2820 | [functions](https://developer.postgresql.org/pgdocs/postgres/functions-window.html) 2821 | (similar to R's `ave` function) makes reference to other rows 2822 | particularly easy. Below we repeat the SQLite example in PostgreSQL 2823 | (except that the following fills with NA): 2824 | 2825 | ```r 2826 | # Be sure PostgreSQL is installed and running. 2827 | 2828 | library(RPostgreSQL) 2829 | library(sqldf) 2830 | sqldf("select *, lag(y) over (partition by id order by tvar) from DF") 2831 | ``` 2832 | 2833 | Example 18. 
MySQL Schema Information[](#Example_18._MySQL_Schema_Information) 2834 | ----------------------------------------------------------------------------- 2835 | 2836 | ```r 2837 | library(RMySQL) 2838 | library(sqldf) 2839 | sqldf("show databases") 2840 | sqldf("show tables") 2841 | ``` 2842 | 2843 | The following SQL statements to query the MySQL table schemas are taken 2844 | from the [blog of Christophe 2845 | Ladroue](https://chrisladroue.com/2012/03/a-graphical-overview-of-your-mysql-database/): 2846 | 2847 | ```r 2848 | library(RMySQL) 2849 | library(sqldf) 2850 | 2851 | # list each schema and its length 2852 | sqldf("SELECT TABLE_SCHEMA,SUM(DATA_LENGTH) SCHEMA_LENGTH 2853 | FROM information_schema.TABLES 2854 | WHERE TABLE_SCHEMA!='information_schema' 2855 | GROUP BY TABLE_SCHEMA") 2856 | 2857 | # list each table in each schema and some info about it 2858 | sqldf("SELECT TABLE_SCHEMA,TABLE_NAME,TABLE_ROWS,DATA_LENGTH 2859 | FROM information_schema.TABLES 2860 | WHERE TABLE_SCHEMA!='information_schema'") 2861 | ``` 2862 | 2863 | The following SQL statement to query the MySQL table schemas are taken 2864 | from [the MySQL Performance 2865 | Blog](https://www.percona.com/blog/2008/03/17/researching-your-mysql-table-sizes/): 2866 | 2867 | ```r 2868 | # Find total number of tables, rows, total data in index size 2869 | sqldf("SELECT count(*) tables, 2870 | concat(round(sum(table_rows)/1000000,2),'M') rows, 2871 | concat(round(sum(data_length)/(1024*1024*1024),2),'G') data, 2872 | concat(round(sum(index_length)/(1024*1024*1024),2),'G') idx, 2873 | concat(round(sum(data_length+index_length)/(1024*1024*1024),2),'G') total_size, 2874 | round(sum(index_length)/sum(data_length),2) idxfrac 2875 | FROM information_schema.TABLES") 2876 | 2877 | # find biggest databases 2878 | sqldf("SELECT 2879 | count(*) tables, 2880 | table_schema,concat(round(sum(table_rows)/1000000,2),'M') rows, 2881 | concat(round(sum(data_length)/(1024*1024*1024),2),'G') data, 2882 | 
concat(round(sum(index_length)/(1024*1024*1024),2),'G') idx, 2883 | concat(round(sum(data_length+index_length)/(1024*1024*1024),2),'G') total_size, 2884 | round(sum(index_length)/sum(data_length),2) idxfrac 2885 | FROM information_schema.TABLES 2886 | GROUP BY table_schema 2887 | ORDER BY sum(data_length+index_length) DESC LIMIT 10") 2888 | 2889 | # data distribution by storage engine 2890 | sqldf("SELECT engine, 2891 | count(*) tables, 2892 | concat(round(sum(table_rows)/1000000,2),'M') rows, 2893 | concat(round(sum(data_length)/(1024*1024*1024),2),'G') data, 2894 | concat(round(sum(index_length)/(1024*1024*1024),2),'G') idx, 2895 | concat(round(sum(data_length+index_length)/(1024*1024*1024),2),'G') total_size, 2896 | round(sum(index_length)/sum(data_length),2) idxfrac 2897 | FROM information_schema.TABLES 2898 | GROUP BY engine 2899 | ORDER BY sum(data_length+index_length) DESC LIMIT 10") 2900 | ``` 2901 | 2902 | Links[](#Links) 2903 | =============== 2904 | 2905 | [Visual Representation of SQL 2906 | Joins](https://www.codeproject.com/Articles/33052/Visual-Representation-of-SQL-Joins) 2907 | -------------------------------------------------------------------------------- /demo/00Index: -------------------------------------------------------------------------------- 1 | sqldf-groupchoose Choose one element from each group. 2 | sqldf-unitTests Run unit tests using svUnit package. 
3 | -------------------------------------------------------------------------------- /demo/sqldf-groupchoose.R: -------------------------------------------------------------------------------- 1 | 2 | Lines <- "DeployID Date.Time LocationQuality Latitude Longitude 3 | 4 | STM05-1 28/02/2005 17:35 Good -35.562 177.158 5 | STM05-1 28/02/2005 19:44 Good -35.487 177.129 6 | STM05-1 28/02/2005 23:01 Unknown -35.399 177.064 7 | STM05-1 01/03/2005 07:28 Unknown -34.978 177.268 8 | STM05-1 01/03/2005 18:06 Poor -34.799 177.027 9 | STM05-1 01/03/2005 18:47 Poor -34.85 177.059 10 | STM05-2 28/02/2005 12:49 Good -35.928 177.328 11 | STM05-2 28/02/2005 21:23 Poor -35.926 177.314 12 | " 13 | 14 | ################################# 15 | ## in R 16 | ################################# 17 | 18 | library(chron) 19 | # in next line replace textConnection(Lines) with "myfile.dat" 20 | DF <- read.table(textConnection(Lines), skip = 1, as.is = TRUE, 21 | col.names = c("Id", "Date", "Time", "Quality", "Lat", "Long")) 22 | 23 | DF2 <- transform(DF, 24 | Date = chron(Date, format = "d/m/y"), 25 | Time = times(paste(Time, "00", sep = ":")), 26 | Quality = factor(Quality, levels = c("Good", "Poor", "Unknown"))) 27 | 28 | o <- order(DF2$Date, as.numeric(DF2$Quality), abs(DF2$Time - times("12:00:00"))) 29 | DF2 <- DF2[o,] 30 | 31 | DF2[tapply(row.names(DF2), DF2$Date, head, 1), ] 32 | 33 | # The last line above could alternately be written like this: 34 | do.call("rbind", by(DF2, DF2$Date, head, 1)) 35 | 36 | ################################# 37 | ## in sqldf 38 | ################################# 39 | 40 | DFo <- sqldf("select * from DF order by 41 | substr(Date, 7, 4) || substr(Date, 4, 2) || substr(Date, 1, 2) DESC, 42 | Quality DESC, 43 | abs(substr(Time, 1, 2) + substr(Time, 4, 2) /60 - 12) DESC") 44 | sqldf("select * from DFo group by Date") 45 | 46 | ################################# 47 | # Another way to do it also using sqldf is via nested selects like this using 48 | # the same DF as 
above 49 | ################################# 50 | 51 | sqldf("select * from DF u 52 | where abs(substr(Time, 1, 2) + substr(Time, 4, 2) /60 - 12) = 53 | (select min(abs(substr(Time, 1, 2) + substr(Time, 4, 2) /60 - 12)) 54 | from DF x where Quality = 55 | (select min(Quality) from DF y 56 | where x.Date = y.Date) and x.Date = u.Date)") 57 | 58 | 59 | -------------------------------------------------------------------------------- /demo/sqldf-unitTests.R: -------------------------------------------------------------------------------- 1 | library(sqldf) 2 | library(svUnit) 3 | sqldf.tests <- system.file("unitTests", "runit.all.R", package = "sqldf") 4 | cat("Running:", sqldf.tests, "\n") 5 | source(sqldf.tests) 6 | clearLog() 7 | test.all() 8 | Log() 9 | -------------------------------------------------------------------------------- /inst/NEWS: -------------------------------------------------------------------------------- 1 | Version 0.4-11 2 | 3 | o modifications for RSQLite 2.0.0 4 | 5 | o read.csv.sql now accepts https and ftps 6 | 7 | Version 0.4-10 8 | 9 | o more modifications for RSQLite 1.0.0 10 | 11 | Version 0.4-9 12 | 13 | o modifications for RSQLite 1.0.0 14 | 15 | Version 0.4-8 16 | 17 | o new R option "sqldf.RPostgresSQL.other" 18 | 19 | Version 0.4-7.1 20 | 21 | o bug fix involving dates; also added corresponding unit test 22 | 23 | Version 0.4-7 24 | 25 | o misc changes to satisfy R CMD CHECK in R 3.0.2 26 | 27 | Version 0.4-6.5 28 | 29 | o fixed the a2r/a2s unit test 30 | 31 | Version 0.4-6.4 32 | 33 | o uses ByteCompile hence dependence on R 2.14.0 or higher 34 | 35 | o now uses strapplyc if tcltk available hence dependence on gsubfn 0.6 or higher 36 | 37 | o fixed time zone bug in dealing with POSIXct 38 | 39 | o added MASS to Suggests to pass R CMD check in R 2.15.0 40 | 41 | o RSQLite and RSQLite.extfuns namespaces are loaded at startup if there 42 | are no other drivers supported by sqldf currently on search path 43 | 44 | Version 0.4-6.1 45 
| 46 | o message regarding RPostgreSQL was not issued when RPostgreSQL is loaded 47 | 48 | Version 0.4-6 49 | 50 | o support for the RPostgreSQL driver was added 51 | 52 | o RSQLite and RSQLite.extfuns have been moved from Depends to both Imports and 53 | Suggests and are loaded only if actually used. 54 | 55 | o any database backend specified via drv= or the "sqldf.driver" option is 56 | loaded if its not already loaded. Previously user was responsible for loading. 57 | 58 | o the drv= argument of sqldf and the "sqldf.driver" option can specify either 59 | the database name or the driver name (i.e. SQLite or RSQLite, H2 or RH2, 60 | etc.) 61 | 62 | o a possible bug regarding checking for tcltk was fixed. This makes it more 63 | likely that substitute R code will be used rather than giving an error in 64 | the absence of tcltk 65 | 66 | o unit tests upgraded to latest versions of drivers 67 | 68 | Version 0.4-5 69 | 70 | o fixed bug that occurred when times class columns were in output 71 | 72 | o added INSTALL file 73 | 74 | o improve tcltk checking. 75 | 76 | Version 0.4-4 77 | 78 | o NAMESPACE added 79 | 80 | o fixed an erroneous example in ?sqldf 81 | 82 | o appends the sqldf library directory to AWKPATH. This allows gawk calls in 83 | the filter argument of read.csv.sql to refer to awk programs that are 84 | supplied with sqldf without supplying the path. (See discussion of AWKPATH 85 | in the gawk documentation.) 86 | 87 | o sqldf includes an awk program, csv.awk, to process csv files for use as 88 | a filter with read.csv.sql when its necessary to handle commas within quoted 89 | fields and quotes within quoted fields (such quotes must be doubled). The 90 | following arguments (defaults shown) can be passed to csv.awk: isep = ",", 91 | osep = "\"", quote "\"", escape = "\"", newline = "\\n", trim = 1. Note that 92 | the escape character is used if quotes are within quotes. Newlines are also 93 | allowed in fields between quotes. 
If trim=1 then whitespace is trimmed from 94 | beginning and end of fields. 95 | 96 | # write out test data and then read it back using read.csv.sql 97 | cat('A,B\n"Joe, Jr.","Mac ""The Knife"""\n', file = "temp.csv") 98 | temp.out <- read.csv.sql("temp.csv", sep = ";", eol = "\n", 99 | filter = "gawk -v osep=; -f csv.awk") 100 | 101 | (In most cases there are no commas or quotes in fields so this filter is not needed. 102 | Also note that the above example assumes Windows console syntax and 103 | the corresponding shell syntax needs to be used instead on UNIX.) 104 | 105 | o changes to eliminate the NOTE produced during build/INSTALL on Linux. 106 | 107 | Version 0.4-3 108 | 109 | o dbname = NULL can be used to force default behavior 110 | 111 | o in read.csv.sql and read.csv2.sql the file argument may optionally be a URL 112 | 113 | o in read.csv.sql and read.csv2.sql the file argument may be omitted (or 114 | equivalently NULL or NA or "") if filter is specified and no file is to 115 | be input to the filter. 116 | 117 | Version 0.4-2 118 | 119 | o read.csv.sql and read.csv2.sql now handle dbWriteTable's nrows and 120 | field.types arguments. (Previously sqldf did but not these two routines.) 121 | 122 | Version 0.4-1.2 123 | 124 | o bug fixes 125 | 126 | Version 0.4-1.1 127 | 128 | o bug fixes 129 | 130 | Version 0.4-1 131 | 132 | o startup message giving database that will be used as default (if not sqlite) 133 | 134 | Version 0.4-0 135 | 136 | o the heuristic which assigns classes to columns for the data frame output 137 | from sqldf has been improved 138 | 139 | o method argument can now be a list of two transformations. The first 140 | component corresponds to the old method and can be a keyword ("auto", "raw" 141 | or "name__class"), a function or a character vector of class names. The 142 | second may be a function which is used to transform all data frames input 143 | to the data base prior to sending them to the data base. 
144 | 145 | o automatic searching for spatialite is no longer performed. It seems less 146 | useful now that RSQLite.extfuns is automatically loaded. To use 147 | the spatialite extension specify its path using the dll argument. 148 | 149 | o sqldf has a new optional argument, verbose, which gives more output if 150 | it is set to TRUE. Any other value suppresses the extra output. 151 | 152 | o filter may be a list whose first component is a command with keywords 153 | naming the second and subsequent components. These components are each 154 | specified as character vectors which are written out to temporary files 155 | of the indicated names. This can be used for specifying gawk and other 156 | filters/preprocessors of the input file without having to worry about 157 | shell interpretation of special characters. e.g. 158 | filter = list("gawk -f prog", prog = '{ gsub(",", ".") }') 159 | 160 | o MySQL is now supported (in addition to SQLite, PostgreSQL and h2). 161 | 162 | o stringsAsFactors now defaults to FALSE 163 | 164 | o unit tests have been added. To run them for sqlite do this: 165 | R CMD check sqldf 166 | or: 167 | demo("sqldf-unitTests") 168 | To run for all 4 databases: 169 | demo("sqldf-unitTests") # sqlite 170 | library(RH2); demo("sqldf-unitTests"); detach() 171 | library(RMySQL); demo("sqldf-unitTests"); detach() 172 | library(RpgSQL); demo("sqldf-unitTests"); detach() 173 | 174 | Version 0.3-5 175 | 176 | o global option "sqldf.method" used as default value of method argument. If 177 | not set then default is method = "auto", as before. 178 | 179 | o extension functions from the RSQLite.extfuns package are automatically loaded 180 | 181 | o heuristic improved. It now checks for ambiguous situations and uses "raw" 182 | for those columns rather than the class of the first match. 
183 | 184 | Version 0.3-4 185 | 186 | o all SQL statements now work with H2 rather than just a subset 187 | 188 | Version 0.3-3 189 | 190 | o if RpgSQL is loaded or if sqldf drv argument is "RpgSQL" then sqldf will use 191 | PostgreSQL database 192 | 193 | o database persistence (sqldf without arguments) now works with H2 database 194 | (previously only SQLite) 195 | 196 | o bug fixes 197 | 198 | Version 0.3-2 199 | 200 | o if RH2 package is loaded then sqldf will use H2 database 201 | 202 | Version 0.2-1 203 | 204 | o bug fixes 205 | 206 | o certain Date conversions which previously required as.Date.numeric from zoo 207 | package no longer depend on zoo 208 | 209 | Version 0.2-0 210 | 211 | o column names which are SQL reserved words are no longer mangled 212 | 213 | o file.format list can contain a filter which is a batch or shell command 214 | that filters the input file 215 | 216 | o read.csv2.sql is like read.csv.sql but has a default filter which translates 217 | each comma in the input file to a dot 218 | 219 | o if getOption("sqldf.dll") defaulting to libspatialite-1.dll is found on the PATH 220 | then it will be loaded as an SQLite loadable extension. With the default this 221 | gives access to these SQL functions: 222 | http://www.gaia-gis.it/spatialite/spatialite-sql-2.3.1.html 223 | The sqldf package does not itself include this or any other SQLite loadable 224 | extension. The user must download and place it on the PATH if they wish to 225 | use it. libspatialite-1.dll can be found here: 226 | http://www.gaia-gis.it/spatialite 227 | If sqldf does not find the dll on the PATH then sqldf will continue to work 228 | but without access to these functions. 
229 | 230 | Version 0.1-7 231 | 232 | o now supports table names with a dot in them provided that they are 233 | placed within backquotes 234 | 235 | o bug fixes (thanks to Soren Hojsgaard for bug report) 236 | 237 | Version 0.1-6 238 | 239 | o new command read.csv2.sql 240 | 241 | Version 0.1-5 242 | 243 | o minor improvement in second example in ?sqldf (thanks to Wacek Kusnierczyk) 244 | 245 | o new command read.csv.sql 246 | 247 | Version 0.1-4 248 | 249 | o removed junk files 250 | 251 | Version 0.1-3 252 | 253 | o corrected DESCRIPTION file 254 | 255 | Version 0.1-2 256 | 257 | o searches for and uses "file" objects as well as data frame objects. 258 | file.format argument or "file.format" attribute on file object 259 | is a list with arguments accepted by sqliteImportFile. 260 | [Thanks to Soren Hojsgaard for suggestion to support files.] 261 | 262 | o dots in argument list replaced with a single variable x. 263 | x can be a "character" vector in which case each component is 264 | executed by SQLite in turn and result of the last is returned. 265 | 266 | o new dbname argument can be specified. For SQLite it defaults to ":memory:", 267 | i.e. an embedded data base. If database does not exist it is created 268 | and deleted upon exit. If it does exist then only tables created by sqldf 269 | are deleted on exit but not the database itself. 270 | 271 | o improvements in sqldf.Rd 272 | 273 | o MySQL testing 274 | 275 | o added a demo illustrating sorting and grouping vs. 
nested selects 276 | 277 | o new example illustrating vector x 278 | 279 | o support for POSIXct, Date and chron dates and times 280 | 281 | Version 0.1-1 282 | 283 | o removed use of subset in favor of subscripts since codetools chokes on it 284 | 285 | o added table example to sqldf.Rd 286 | 287 | o improved explanation in sqldf.Rd 288 | 289 | Version 0.1-0 290 | 291 | o initial release 292 | 293 | -------------------------------------------------------------------------------- /inst/THANKS: -------------------------------------------------------------------------------- 1 | 2 | Thanks to: 3 | 4 | - Ben Escoto for some useful suggestions, bug reports and code suggestions. 5 | - Soren Hojsgaard for some useful suggestions and bug reports. 6 | - Jussi Lehto for pointing out a bug and suggesting a fix. 7 | - Cela-Diaz, Fernando for reporting a bug. 8 | - Vadlamani, Satis for reporting a bug. 9 | - Wacek Kusnierczyk for suggesting an improvement to 2 examples. 10 | - Jim Holtman for pointing out a bug and suggesting a fix. 11 | - Tomoaki NISHIYAMA for fixes. 12 | 13 | -------------------------------------------------------------------------------- /inst/csv.awk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/awk -f 2 | #************************************************************************** 3 | # 4 | # 2011/11/12 use OFS as output field separator 5 | # e.g. gawk -f csv.awk -v OFS=, myfile.csv 6 | # 7 | # This file is in the public domain. 8 | # 9 | # For more information email LoranceStinson+csv@gmail.com. 10 | # Or see http://lorance.freeshell.org/csv/ 11 | # 12 | # Parse a CSV string into an array. 13 | # The number of fields found is returned. 14 | # In the event of an error a negative value is returned and csverr is set to 15 | # the error. See below for the error values. 16 | # 17 | # Parameters: 18 | # string = The string to parse. 19 | # csv = The array to parse the fields into. 
20 | # sep = The field separator character. Normally , 21 | # quote = The string quote character. Normally " 22 | # escape = The quote escape character. Normally " 23 | # newline = Handle embedded newlines. Provide either a newline or the 24 | # string to use in place of a newline. If left empty embedded 25 | # newlines cause an error. 26 | # trim = When true spaces around the separator are removed. 27 | # This affects parsing. Without this a space between the 28 | # separator and quote result in the quote being ignored. 29 | # 30 | # These variables are private: 31 | # fields = The number of fields found thus far. 32 | # pos = Where to pull a field from the string. 33 | # strtrim = True when a string is found so we know to remove the quotes. 34 | # 35 | # Error conditions: 36 | # -1 = Unable to read the next line. 37 | # -2 = Missing end quote. 38 | # -3 = Missing separator. 39 | # 40 | # Notes: 41 | # The code assumes that every field is preceded by a separator, even the 42 | # first field. This makes the logic much simpler, but also requires a 43 | # separator be prepended to the string before parsing. 44 | #************************************************************************** 45 | function parse_csv(string,csv,sep,quote,escape,newline,trim, fields,pos,strtrim) { 46 | # Make sure there is something to parse. 47 | if (length(string) == 0) return 0; 48 | string = sep string; # The code below assumes ,FIELD. 49 | fields = 0; # The number of fields found thus far. 50 | while (length(string) > 0) { 51 | # Remove spaces after the separator if requested. 52 | if (trim && substr(string, 2, 1) == " ") { 53 | if (length(string) == 1) return fields; 54 | string = substr(string, 2); 55 | continue; 56 | } 57 | strtrim = 0; # Used to trim quotes off strings. 58 | # Handle a quoted field. 
59 | if (substr(string, 2, 1) == quote) { 60 | pos = 2; 61 | do { 62 | pos++ 63 | if (pos != length(string) && 64 | substr(string, pos, 1) == escape && 65 | (substr(string, pos + 1, 1) == quote || 66 | substr(string, pos + 1, 1) == escape)) { 67 | # Remove escaped quote characters. 68 | string = substr(string, 1, pos - 1) substr(string, pos + 1); 69 | } else if (substr(string, pos, 1) == quote) { 70 | # Found the end of the string. 71 | strtrim = 1; 72 | } else if (newline && pos >= length(string)) { 73 | # Handle embedded newlines if requested. 74 | if (getline == -1) { 75 | csverr = "Unable to read the next line."; 76 | return -1; 77 | } 78 | string = string newline $0; 79 | } 80 | } while (pos < length(string) && strtrim == 0) 81 | if (strtrim == 0) { 82 | csverr = "Missing end quote."; 83 | return -2; 84 | } 85 | } else { 86 | # Handle an empty field. 87 | if (length(string) == 1 || substr(string, 2, 1) == sep) { 88 | csv[fields] = ""; 89 | fields++; 90 | if (length(string) == 1) 91 | return fields; 92 | string = substr(string, 2); 93 | continue; 94 | } 95 | # Search for a separator. 96 | pos = index(substr(string, 2), sep); 97 | # If there is no separator the rest of the string is a field. 98 | if (pos == 0) { 99 | csv[fields] = substr(string, 2); 100 | fields++; 101 | return fields; 102 | } 103 | } 104 | # Remove spaces after the separator if requested. 105 | if (trim && pos != length(string) && substr(string, pos + strtrim, 1) == " ") { 106 | trim = strtrim 107 | # Count the number fo spaces found. 108 | while (pos < length(string) && substr(string, pos + trim, 1) == " ") { 109 | trim++ 110 | } 111 | # Remove them from the string. 112 | string = substr(string, 1, pos + strtrim - 1) substr(string, pos + trim); 113 | # Adjust pos with the trimmed spaces if a quotes string was not found. 114 | if (!strtrim) { 115 | pos -= trim; 116 | } 117 | } 118 | # Make sure we are at the end of the string or there is a separator. 
119 | if ((pos != length(string) && substr(string, pos + 1, 1) != sep)) { 120 | csverr = "Missing separator."; 121 | return -3; 122 | } 123 | # Gather the field. 124 | csv[fields] = substr(string, 2 + strtrim, pos - (1 + strtrim * 2)); 125 | fields++; 126 | # Remove the field from the string for the next pass. 127 | string = substr(string, pos + 1); 128 | } 129 | return fields; 130 | } 131 | 132 | BEGIN { 133 | if (osep == "") osep = ","; 134 | OFS = osep; 135 | if (isep == "") isep = ","; 136 | if (quote == "") quote = "\""; 137 | if (escape == "") escape = "\""; 138 | if (newline == "") newline = "\\n"; 139 | if (trim == "") trim = 1; 140 | } 141 | 142 | { 143 | num_fields = parse_csv($0, csv, isep, quote, escape, newline, trim); 144 | if (num_fields < 0) { 145 | printf "ERROR: %s (%d) -> %s\n", csverr, num_fields, $0; 146 | } else { 147 | printf "%s", csv[0] 148 | for (i = 1;i < num_fields;i++) { 149 | printf "%s%s", OFS, csv[i]; 150 | } 151 | printf "\n"; 152 | } 153 | } 154 | 155 | -------------------------------------------------------------------------------- /inst/trcomma2dot.vbs: -------------------------------------------------------------------------------- 1 | '' convert commas to dots 2 | '' based on code by Ekkehard Horner 3 | 4 | Option Explicit 5 | 6 | WScript.Quit miniTr() 7 | 8 | Function miniTr() 9 | Dim sSearch : sSearch = "," 10 | Dim sReplace : sReplace = "." 
11 | Do Until WScript.StdIn.AtEndOfStream 12 | Dim sInpChr : sInpChr = WScript.StdIn.Read( 1 ) 13 | Dim sOutChr : sOutChr = sInpChr 14 | Dim nPos : nPos = InStr( sSearch, sInpChr ) 15 | If 0 < nPos Then 16 | sOutChr = Mid( sReplace, nPos, 1 ) 17 | End If 18 | WScript.StdOut.Write sOutChr 19 | Loop 20 | miniTr = 0 21 | End Function 22 | -------------------------------------------------------------------------------- /inst/unitTests/runit.all.R: -------------------------------------------------------------------------------- 1 | 2 | # To run these tests: 3 | # library(sqldf) 4 | # library(svUnit) 5 | # # optionally: library(RH2), library(RMySQL) or library(RpgSQL) 6 | # # otherwise sqlite will be used. 7 | # # optionally: options(sqldf.verbose = TRUE) 8 | # runit.all <- system.file("unitTests", "runit.all.R", package = "sqldf") 9 | # source(runit.all); clearLog(); test.all() 10 | # Log() 11 | 12 | test.all <- function() { 13 | 14 | # set drv 15 | drv <- getOption("sqldf.driver") 16 | if (is.null(drv)) { 17 | drv <- if ("package:RPostgreSQL" %in% search()) { "PostgreSQL" 18 | } else if ("package:RpgSQL" %in% search()) { "pgSQL" 19 | } else if ("package:RMySQL" %in% search()) { "MySQL" 20 | } else if ("package:RH2" %in% search()) { "H2" 21 | } else "SQLite" 22 | } 23 | drv <- tolower(drv) 24 | cat("using driver:", drv, "\n") 25 | 26 | # head 27 | a1r <- head(warpbreaks) 28 | a1s <- sqldf("select * from warpbreaks limit 6") 29 | checkIdentical(a1r, a1s) 30 | 31 | # subset / like 32 | a2r <- subset(as.data.frame(CO2), grepl("^Qn", Plant)) 33 | class(a2r) <- "data.frame" 34 | if (drv == "postgresql") { # pgsql needs quotes for Plant 35 | a2s <- sqldf("select * from \"CO2\" where \"Plant\" like 'Qn%'") 36 | } else if (drv == "pgsql") { 37 | a2s <- sqldf("select * from CO2 where \"Plant\" like 'Qn%'") 38 | } else a2s <- sqldf("select * from CO2 where Plant like 'Qn%'") 39 | # checkEquals(a2r, a2s, check.attributes = FALSE) 40 | checkIdentical(a2r, a2s) 41 | 42 | data(farms, 
package = "MASS") 43 | # subset / in 44 | a3r <- subset(farms, Manag %in% c("BF", "HF")) 45 | row.names(a3r) <- NULL 46 | if (drv == "pgsql" || drv == "postgresql") { # pgsql needs quotes for Manag 47 | a3s <- sqldf("select * from farms where \"Manag\" in ('BF', 'HF')") 48 | } else { 49 | a3s <- sqldf("select * from farms where Manag in ('BF', 'HF')") 50 | } 51 | checkIdentical(a3r, a3s) 52 | 53 | # subset / multiple inequality constraints 54 | a4r <- subset(warpbreaks, breaks >= 20 & breaks <= 30) 55 | a4s <- sqldf("select * from warpbreaks where breaks between 20 and 30", 56 | row.names = TRUE) 57 | if (drv == "h2" || drv == "pgsql" || drv == "postgresql") { 58 | checkEquals(a4r, a4s, check.attributes = FALSE) 59 | } else checkIdentical(a4r, a4s) 60 | 61 | # subset 62 | a5r <- subset(farms, Mois == 'M1') 63 | if (drv == "pgsql" || drv == "postgresql") { 64 | a5s <- sqldf("select * from farms where \"Mois\" = 'M1'", 65 | row.names = TRUE) 66 | } else a5s <- sqldf("select * from farms where Mois = 'M1'", row.names = TRUE) 67 | if (drv == "sqlite") { 68 | checkIdentical(a5r, a5s) 69 | # } else if (drv != "mysql") checkEquals(a5r, a5s, check.attributes = FALSE) 70 | } else checkEquals(a5r, a5s, check.attributes = FALSE) 71 | 72 | # subset 73 | a6r <- subset(farms, Mois == 'M2') 74 | if (drv == "pgsql" || drv == "postgresql") { 75 | a6s <- sqldf("select * from farms where \"Mois\" = 'M2'", row.names = TRUE) 76 | } else { 77 | a6s <- sqldf("select * from farms where Mois = 'M2'", row.names = TRUE) 78 | } 79 | if (drv == "sqlite") { 80 | checkIdentical(a6r, a6s) 81 | } else checkEquals(a6r, a6s, check.attributes = FALSE) 82 | 83 | # rbind 84 | a7r <- rbind(a5r, a6r) 85 | a7s <- sqldf("select * from a5s union all select * from a6s") 86 | 87 | # sqldf drops the unused levels of Mois but rbind does not; however, 88 | # all data is the same and the other columns are identical 89 | row.names(a7r) <- NULL 90 | checkIdentical(a7r[-1], a7s[-1]) 91 | # } 92 | 93 | # aggregate - 
avg conc and uptake by Plant and Type 94 | # pgsql and h2 require double quote quoting 95 | # whereas sqlite and mysql can use backquote 96 | # Species needs to be quoted in pgsql but not in the other data bases. 97 | a8r <- aggregate(iris[1:2], iris[5], mean) 98 | if (drv == "pgsql" || drv == "postgresql" || drv == "h2" || drv == "sqlite") { 99 | a8s <- sqldf('select "Species", avg("Sepal.Length") \"Sepal.Length\", 100 | avg("Sepal.Width") \"Sepal.Width\" from iris 101 | group by "Species" order by "Species"') 102 | if (drv == "postgresql") checkIdentical(a8r, a8s) else 103 | checkEquals(a8r, a8s, check.attributes = FALSE) 104 | } else { 105 | a8s <- sqldf("select Species, avg(Sepal_Length) `Sepal.Length`, 106 | avg(Sepal_Width) `Sepal.Width` from iris 107 | group by Species order by Species") 108 | # checkEquals(a8r, a8s, check.attributes = drv != "mysql") 109 | checkEquals(a8r, a8s) 110 | } 111 | 112 | 113 | # by - avg conc and total uptake by Plant and Type 114 | a9r <- do.call(rbind, by(iris, iris[5], function(x) with(x, 115 | data.frame(Species = Species[1], 116 | mean.Sepal.Length = mean(Sepal.Length), 117 | mean.Sepal.Width = mean(Sepal.Width), 118 | mean.Sepal.ratio = mean(Sepal.Length/Sepal.Width))))) 119 | row.names(a9r) <- NULL 120 | if (drv == "pgsql" || drv == "postgresql" || drv == "h2" || drv == "sqlite") { 121 | a9s <- sqldf('select "Species", avg("Sepal.Length") "mean.Sepal.Length", 122 | avg("Sepal.Width") "mean.Sepal.Width", 123 | avg("Sepal.Length"/"Sepal.Width") "mean.Sepal.ratio" from iris 124 | group by "Species" order by "Species"') 125 | checkEquals(a9r, a9s, check.attributes = FALSE) 126 | } else { 127 | a9s <- sqldf("select Species, avg(Sepal_Length) `mean.Sepal.Length`, 128 | avg(Sepal_Width) `mean.Sepal.Width`, 129 | avg(Sepal_Length/Sepal_Width) `mean.Sepal.ratio` from iris 130 | group by Species order by Species") 131 | # checkEquals(a9r, a9s, check.attributes = drv != "mysql") 132 | checkEquals(a9r, a9s) 133 | } 134 | 135 | # head - 
top 3 breaks 136 | a10r <- head(warpbreaks[order(warpbreaks$breaks, decreasing = TRUE), ], 3) 137 | a10s <- sqldf("select * from warpbreaks order by breaks desc limit 3") 138 | row.names(a10r) <- NULL 139 | checkIdentical(a10r, a10s) 140 | 141 | # head - bottom 3 breaks 142 | a11r <- head(warpbreaks[order(warpbreaks$breaks), ], 3) 143 | a11s <- sqldf("select * from warpbreaks order by breaks limit 3") 144 | # attributes(a11r) <- attributes(a11s) <- NULL 145 | row.names(a11r) <- NULL 146 | checkIdentical(a11r, a11s) 147 | 148 | # ave - rows for which v exceeds its group average where g is group 149 | DF <- data.frame(g = rep(1:2, each = 5), t = rep(1:5, 2), v = 1:10) 150 | a12r <- subset(DF, v > ave(v, g, FUN = mean)) 151 | if (drv == "postgresql") { 152 | Gavg <- sqldf('select g, avg(v) as avg_v from "DF" group by g') 153 | a12s <- sqldf('select "DF".g, t, v from "DF", "Gavg" where "DF".g = "Gavg".g and v > avg_v') 154 | } else { 155 | Gavg <- sqldf("select g, avg(v) as avg_v from DF group by g") 156 | a12s <- sqldf("select DF.g, t, v from DF, Gavg where DF.g = Gavg.g and v > avg_v") 157 | } 158 | row.names(a12r) <- NULL 159 | checkIdentical(a12r, a12s) 160 | 161 | # same but reduce the two select statements to one using a subquery 162 | a13s <- if (drv == "postgresql") { 163 | sqldf('select g, t, v from "DF" d1, (select g as g2, avg(v) as avg_v from "DF" group by g) d2 where d1.g = g2 and v > avg_v') 164 | } else { 165 | sqldf("select g, t, v from DF d1, (select g as g2, avg(v) as avg_v from DF group by g) d2 where d1.g = g2 and v > avg_v") 166 | } 167 | checkIdentical(a12r, a13s) 168 | 169 | # same but shorten using natural join 170 | a14s <- if (drv == "postgresql") { 171 | sqldf('select d1.g, t, v from "DF" d1 natural join 172 | (select g, avg(v) as avg_v from "DF" group by g) d2 where v > avg_v') 173 | } else { 174 | sqldf("select d1.g, t, v from DF d1 natural join 175 | (select g, avg(v) as avg_v from DF group by g) d2 where v > avg_v") 176 | } 177 | 
checkIdentical(a12r, a14s) 178 | 179 | # table 180 | a15r <- table(warpbreaks$tension, warpbreaks$wool) 181 | if (drv == "pgsql" || drv == "postgresql") { 182 | a15s <- sqldf("select 183 | sum(cast(wool = 'A' as integer)), 184 | sum(cast(wool = 'B' as integer)) 185 | from warpbreaks group by tension") 186 | } else { 187 | a15s <- sqldf("select SUM(wool = 'A'), SUM(wool = 'B') 188 | from warpbreaks group by tension") 189 | } 190 | 191 | checkEquals(as.data.frame.matrix(a15r), a15s, check.attributes = FALSE) 192 | 193 | # reshape 194 | t.names <- paste("t", unique(as.character(DF$t)), sep = "_") 195 | a16r <- reshape(DF, direction = "wide", timevar = "t", idvar = "g", varying = list(t.names)) 196 | if (drv == "postgresql") { 197 | a16s <- sqldf("select g, 198 | sum(cast(t = 1 as integer) * v) t_1, 199 | sum(cast(t = 2 as integer) * v) t_2, 200 | sum(cast(t = 3 as integer) * v) t_3, 201 | sum(cast(t = 4 as integer) * v) t_4, 202 | sum(cast(t = 5 as integer) * v) t_5 from \"DF\" group by g") 203 | } else if (drv == "pgsql") { 204 | a16s <- sqldf("select g, 205 | sum(cast(t = 1 as integer) * v) t_1, 206 | sum(cast(t = 2 as integer) * v) t_2, 207 | sum(cast(t = 3 as integer) * v) t_3, 208 | sum(cast(t = 4 as integer) * v) t_4, 209 | sum(cast(t = 5 as integer) * v) t_5 from DF group by g") 210 | } else a16s <- sqldf("select g, 211 | SUM((t = 1) * v) t_1, 212 | SUM((t = 2) * v) t_2, 213 | SUM((t = 3) * v) t_3, 214 | SUM((t = 4) * v) t_4, 215 | SUM((t = 5) * v) t_5 from DF group by g") 216 | checkEquals(a16r, a16s, check.attributes = FALSE) 217 | 218 | # order 219 | a17r <- Formaldehyde[order(Formaldehyde$optden, decreasing = TRUE), ] 220 | if (drv == "postgresql") { 221 | a17s <- sqldf("select * from \"Formaldehyde\" order by optden desc") 222 | } else a17s <- sqldf("select * from Formaldehyde order by optden desc") 223 | row.names(a17r) <- NULL 224 | checkIdentical(a17r, a17s) 225 | 226 | DF <- data.frame(x = rnorm(15, 1:15)) 227 | a18r <- data.frame(x = DF[4:12,], 
movavgx = rowMeans(embed(DF$x, 7))) 228 | # centered moving average of length 7 229 | if (drv == "postgresql" || drv == "h2") { 230 | DF2 <- cbind(id = 1:nrow(DF), DF) 231 | a18s <- sqldf("select min(a.x) x, avg(b.x) movavgx 232 | from \"DF2\" a, \"DF2\" b 233 | where a.id - b.id between -3 and 3 234 | group by a.id having count(*) = 7 235 | order by a.id") 236 | } else if (drv == "pgsql") { 237 | DF2 <- cbind(id = 1:nrow(DF), DF) 238 | a18s <- sqldf("select min(a.x) x, avg(b.x) movavgx 239 | from DF2 a, DF2 b 240 | where a.id - b.id between -3 and 3 241 | group by a.id having count(*) = 7 242 | order by a.id") 243 | } else { 244 | a18s <- sqldf("select a.x x, avg(b.x) movavgx from DF a, DF b 245 | where a.row_names - b.row_names between -3 and 3 246 | group by a.row_names having count(*) = 7 247 | order by a.row_names+0", 248 | row.names = TRUE) 249 | } 250 | checkEquals(a18r, a18s) 251 | 252 | # merge. a19r and a19s are same except row order and row names 253 | A <- data.frame(a1 = c(1, 2, 1), a2 = c(2, 3, 3), a3 = c(3, 1, 2)) 254 | B <- data.frame(b1 = 1:2, b2 = 2:1) 255 | a19s <- if (drv == "postgresql") { 256 | sqldf('select * from "A", "B"') 257 | } else sqldf("select * from A, B") 258 | a19r <- merge(A, B) 259 | Sort <- function(DF) DF[do.call(order, DF),] 260 | checkEquals(Sort(a19s), Sort(a19r), check.attributes = FALSE) 261 | 262 | if (drv != "postgresql") { 263 | 264 | # check Date class 265 | DF.Date <- structure(list(date = structure(c(-15676, -15648), 266 | class = "Date"), x = c(2, 3), y = c(4, 5)), 267 | .Names = c("date", "x", "y"), row.names = 1:2, class = "data.frame") 268 | g <- data.frame(date=as.Date(c("1927-01-31","1927-02-28")),x=c(2,3)) 269 | h <- data.frame(date=as.Date(c("1927-01-31","1927-02-28")),y=c(4,5)) 270 | final <- sqldf("select d1.*, d2.y 271 | from g d1 left join h d2 on d1.date=d2.date") 272 | checkEquals(final, DF.Date) 273 | 274 | } 275 | 276 | # test sqlite system tables 277 | 278 | if (drv == "sqlite") { 279 | 280 | 
checkIdentical(dim(sqldf("pragma table_info(BOD)")), c(2L, 6L)) 281 | 282 | sql <- c("select * from BOD", "select * from sqlite_master") 283 | checkIdentical(dim(sqldf(sql)), c(1L, 5L)) 284 | 285 | checkTrue(sqldf("pragma database_list")$name == "main") 286 | 287 | DF <- data.frame(a = 1:2, b = 2:1) 288 | 289 | checkIdentical(sqldf("select a/b as quotient from DF")$quotient, c(0L, 2L)) 290 | 291 | checkIdentical(sqldf("select (a+0.0)/b as quotient from DF")$quotient, c(0.5, 2.0)) 292 | 293 | checkIdentical(sqldf("select cast(a as real)/b as quotient from DF")$quotient, c(0.5, 2.0)) 294 | 295 | checkIdentical(sqldf(c("create table mytab(a real, b real)", 296 | "insert into mytab select * from DF", 297 | "select a/b as quotient from mytab"))$quotient, c(0.5, 2.0)) 298 | 299 | tonum <- function(DF) replace(DF, TRUE, lapply(DF, as.numeric)) 300 | checkIdentical(sqldf("select a/b as quotient from DF", 301 | method = list("auto", tonum))$quotient, c(0.5, 2.0)) 302 | } 303 | 304 | } 305 | -------------------------------------------------------------------------------- /man/read.csv.sql.Rd: -------------------------------------------------------------------------------- 1 | \name{read.csv.sql} 2 | \Rdversion{1.1} 3 | \alias{read.csv.sql} 4 | \alias{read.csv2.sql} 5 | \title{ 6 | Read File Filtered by SQL 7 | } 8 | \description{ 9 | Read a file into R filtering it with an sql statement. Only the filtered 10 | portion is processed by R so that files larger than R can otherwise 11 | handle can be accommodated. 12 | } 13 | \usage{ 14 | read.csv.sql(file, sql = "select * from file", header = TRUE, sep = ",", 15 | row.names, eol, skip, filter, nrows, field.types, 16 | colClasses, dbname = tempfile(), drv = "SQLite", ...) 17 | read.csv2.sql(file, sql = "select * from file", header = TRUE, sep = ";", 18 | row.names, eol, skip, filter, nrows, field.types, 19 | colClasses, dbname = tempfile(), drv = "SQLite", ...) 20 | } 21 | %- maybe also 'usage' for other objects documented here. 
22 | \arguments{ 23 | \item{file}{ 24 | A file path or a URL (beginning with \code{http://} or \code{ftp://}). If 25 | the \code{filter} argument is used and no file is to be input to the filter 26 | then \code{file} can be omitted, \code{NULL}, \code{NA} or \code{""}. 27 | } 28 | \item{sql}{ 29 | character string holding an SQL statement. The table representing the 30 | file should be referred to as \code{file}. 31 | } 32 | \item{header}{ 33 | As in \code{read.csv}. 34 | } 35 | \item{sep}{ 36 | As in \code{read.csv}. 37 | } 38 | \item{row.names}{ 39 | As in \code{read.csv}. 40 | } 41 | \item{eol}{ 42 | Character which ends line. 43 | } 44 | \item{skip}{ 45 | Skip indicated number of lines in input file. 46 | } 47 | \item{filter}{ 48 | If specified, this should be a shell/batch command that the input file is piped through. For \code{read.csv2.sql} it is by default the following on non-Windows systems: \code{tr , .}. This translates all commas in the file to dots. On Windows similar functionalty is provided but to do that using a vbscript file that is included with \code{sqldf} to emulate the \code{tr} command. 49 | } 50 | \item{nrows}{ 51 | Number of rows used to determine column types. It defaults to 50. Using 52 | \code{-1} causes it to use all rows for determining column types. 53 | This argument is rarely needed. 54 | } 55 | \item{field.types}{ 56 | A list whose names are the column names and whose 57 | contents are the SQLite types (not the R class names) of the 58 | columns. Specifying these types improves how fast it takes. 59 | Unless speed is very important this argument is not normally used. 60 | } 61 | \item{colClasses}{As in \code{read.csv}. 62 | } 63 | \item{dbname}{ 64 | As in \code{sqldf} except that the default is \code{tempfile()}. 65 | Specifying \code{NULL} will put the database in memory which may improve speed 66 | but will limit the size of the database by the available memory. 67 | } 68 | \item{drv}{ 69 | This argument is ignored. 
70 | Currently the only database supported by \code{read.csv.sql} and 71 | \code{read.csv2.sql} is SQLite. 72 | Note that the H2 database has a builtin SQL function, 73 | \code{CSVREAD}, which can be used in place of \code{read.csv.sql}. 74 | } 75 | \item{\dots}{ 76 | Passed to \code{sqldf}. 77 | } 78 | } 79 | \details{ 80 | Reads the indicated file into an sql database creating the database 81 | if it does not already exist. Then it applies the sql statement 82 | returning the result as a data frame. If the database did not exist 83 | prior to this statement it is removed. 84 | 85 | Note that it uses facilities of \code{SQLite} to read the file 86 | which are intended for speed and therefore 87 | not as flexible as in R. For example, it does not 88 | recognize quoted fields as special but will regard the quotes as 89 | part of the field. See the 90 | \code{sqldf} help for more information. 91 | 92 | \code{read.csv2.sql} is like \code{read.csv.sql} except 93 | the default \code{sep} is \code{";"} and the default \code{filter} translates 94 | all commas in the file to decimal points (i.e. to dots). 95 | 96 | On Windows, if the \code{filter} argument is used and if Rtools is detected 97 | in the registry then the Rtools bin directory is added to the search path 98 | facilitating use of those tools without explicitly setting the path. 99 | 100 | } 101 | \value{ 102 | If the sql statement is a select statement then a data frame 103 | is returned. 
104 | } 105 | 106 | \examples{ 107 | \dontrun{ 108 | # might need to specify eol= too depending on your system 109 | write.csv(iris, "iris.csv", quote = FALSE, row.names = FALSE) 110 | iris2 <- read.csv.sql("iris.csv", 111 | sql = "select * from file where Species = 'setosa' ") 112 | 113 | } 114 | } 115 | \keyword{ manip } 116 | -------------------------------------------------------------------------------- /man/sqldf-package.Rd: -------------------------------------------------------------------------------- 1 | \name{sqldf-package} 2 | \alias{sqldf-package} 3 | \docType{package} 4 | \title{ 5 | sqldf package overview 6 | } 7 | \description{ 8 | Provides an easy way to perform SQL selects on R data frames. 9 | } 10 | \details{ 11 | The package contains a single function \link{sqldf} whose help file 12 | contains more information and examples. 13 | } 14 | \references{ 15 | The \link{sqldf} help page contains the primary documentation. 16 | The sqldf github home page \url{https://github.com/ggrothendieck/sqldf} 17 | contains links to SQLite pages that may be helpful in formulating queries. 
18 | } 19 | \keyword{ package } 20 | 21 | 22 | -------------------------------------------------------------------------------- /man/sqldf.Rd: -------------------------------------------------------------------------------- 1 | \name{sqldf} 2 | \alias{sqldf} 3 | \title{SQL select on data frames} 4 | \description{ 5 | SQL select on data frames 6 | } 7 | \usage{ 8 | 9 | sqldf(x, stringsAsFactors = FALSE, 10 | row.names = FALSE, envir = parent.frame(), 11 | method = getOption("sqldf.method"), 12 | file.format = list(), dbname, drv = getOption("sqldf.driver"), 13 | user, password = "", host = "localhost", port, 14 | dll = getOption("sqldf.dll"), connection = getOption("sqldf.connection"), 15 | verbose = isTRUE(getOption("sqldf.verbose"))) 16 | 17 | } 18 | 19 | \arguments{ 20 | \item{x}{Character string representing an SQL select statement or 21 | character vector whose components each represent a successive SQL 22 | statement to be executed. 23 | The select statement syntax must conform to the particular database 24 | being used. If x is missing then it establishes a connection 25 | which subsequent sqldf statements access. In that case the database 26 | is not destroyed until the next sqldf statement with no x.} 27 | \item{stringsAsFactors}{ If \code{TRUE} then those columns output 28 | from the database as \code{"character"} are 29 | converted to \code{"factor"} if the heuristic is unable to determine 30 | the class.} 31 | \item{row.names}{For \code{TRUE} the tables in the data base are given 32 | a \code{row_names} column filled with the row names of the corresponding 33 | data frames. Note that in SQLite a special \code{rowid} (or equivalently 34 | \code{oid} or \code{_rowid_}) is available in any case.} 35 | \item{envir}{ The environment where the data frames representing the tables 36 | are to be found.} 37 | \item{method}{This argument is a list of two functions, keywords or character 38 | vectors. 
If the second component of the list 39 | is \code{NULL} (the default) then the first component 40 | of the list can be specified without wrapping it in a list. The 41 | first component 42 | specifies a transformation of the data frame output from the database 43 | and the second specifies a transformation to each data frame that is 44 | passed to the data base just before it is read into the database. 45 | The second component is less frequently used. 46 | If the first component is \code{NULL} or not 47 | specified then it defaults to "auto". If the second component is \code{NULL} 48 | or not specified then no transformation is performed on the input. 49 | 50 | The allowable keywords for the first component are (1) \code{"auto"} 51 | which is the default and automatically assigns the class of each 52 | column using the heuristic described later, (2) \code{"auto.factor"} which is 53 | the same as \code{"auto"} but also assigns \code{"factor"} and 54 | \code{"ordered"} classes, 55 | (3) \code{"raw"} or \code{NULL} which means use 56 | whatever classes are returned by the database with no automatic processing 57 | and (4) \code{"name__class"} which 58 | means that column names that end in \code{__class} 59 | with two underscores 60 | where \code{class} is an R class (such as \code{Date}) are converted to 61 | that class and the \code{__class} portion is removed from the column name. 62 | For example, 63 | \code{sqldf("select a as x__Date from DF", method = "name__class")} 64 | would cause 65 | column \code{a} to be coerced to class \code{Date} and have the column name 66 | \code{x}. 67 | The first component of \code{method} 68 | can also be a character vector of classes to assign to the 69 | returned data.frame. 
The example just given could alternately be implemented 70 | using 71 | \code{sqldf("select a as x from DF", method = "Date")} 72 | Note that when \code{Date} is used in this way it assumes the database 73 | contains the number of days since January 1, 1970. If the date is in 74 | the format \code{yyyy-mm-dd} then use \code{Date2} as the class. 75 | } 76 | \item{file.format}{A list whose components are passed to 77 | \code{sqliteImportFile}. Components may include \code{sep}, 78 | \code{header}, \code{row.names}, \code{skip}, \code{eol} and \code{filter}. 79 | Except for \code{filter} they are passed to 80 | \code{sqliteImportFile} and have the same default values as 81 | in \code{sqliteImportFile} (except for \code{eol} which defaults 82 | to the end of line character(s) for the operating system in use -- note 83 | that 84 | if the file being read does not have the line endings for the platform 85 | being used then \code{eol} will have to be specified. In particular, 86 | certain UNIX-like tools on Windows may produce files with UNIX line endings 87 | in which case \code{eol="\n"} should be specified). 88 | \code{filter} may optionally contain a batch/shell command through which the 89 | input file is piped prior to reading it in. Alternately \code{filter} 90 | may be a list whose first component is a batch/shell command containing 91 | names which correspond to the names of the subsequent list components. 92 | These subsequent components should each be a character 93 | vector which \code{sqldf} will read into a temporary file. The name of 94 | the temporary file will be replaced into the command. For example, 95 | \code{filter = list("gawk -f prog", prog = '{ print gensub(/,/, ".", "g") }')} . 96 | This avoids the need for command line quoting, which may vary among shells and Windows. 97 | Note that if the filter produces files with UNIX line endings on Windows 98 | then \code{eol} must be specified, as discussed above. 
99 | \code{file.format} may be set to \code{NULL} in order not to search 100 | for input file objects at all. The \code{file.format} can also 101 | be specified as an attribute in each file object itself in which case 102 | such specification overrides any given through the argument list. There 103 | is further discussion of \code{file.format} below.} 104 | \item{dbname}{Name of the database. For SQLite and h2 data bases this 105 | defaults to 106 | \code{":memory:"} which results in an embedded database. For MySQL this 107 | defaults to \code{getOption("RMysql.dbname")} and if that is not specified 108 | then \code{"test"} is used. 109 | For RPostgreSQL this 110 | defaults to \code{getOption("sqldf.RPostgreSQL.dbname")} and if that is 111 | not specified then \code{"test"} is used. 112 | } 113 | \item{drv}{\code{"SQLite"}, \code{"MySQL"}, \code{"h2"}, 114 | \code{"PostgreSQL"} or \code{"pgSQL"} or any of those names prefaced with 115 | \code{"R"}. 116 | If not specified then 117 | the \code{"sqldf.driver"} option is checked and if that is not set then 118 | \code{sqldf} checks whether \code{RPostgreSQL}, 119 | \code{RMySQL} or \code{RH2} 120 | is loaded in that order and the driver corresponding to the first one 121 | found is used. If none are loaded then \code{"SQLite"} is used. 122 | \code{drv=NULL} causes the default to be used.} 123 | \item{user}{user name. Not needed for embedded databases. 124 | For RPostgreSQL 125 | the default is taken from option \code{sqldf.RPostgreSQL.user} and 126 | if that is not specified either then \code{"postgres"} is used. 127 | } 128 | \item{password}{password. Not needed for embedded databases. 129 | For RPostgreSQL 130 | the default is taken from option \code{sqldf.RPostgreSQL.password} and 131 | if that is not specified then \code{"postgres"} is used. 132 | } 133 | \item{host}{host. Default of "localhost" is normally sufficient. 
134 | For RPostgreSQL 135 | the default is taken from option \code{sqldf.RPostgreSQL.host} and 136 | if that is not specified then \code{"localhost"} is used. 137 | 138 | } 139 | \item{port}{port. For RPostgreSQL the default 140 | is taken from the option \code{sqldf.RPostgreSQL.port} and if that is not 141 | specified then \code{5432} is used. 142 | } 143 | \item{dll}{Name of an SQLite loadable extension to automatically load. 144 | If found on PATH then it is 145 | automatically loaded and the SQLite functions in it will be accessible.} 146 | \item{connection}{If this is \code{NULL} then a connection is created; 147 | otherwise the indicated connection is used. The default is 148 | the value of the option \code{sqldf.connection}. If neither 149 | \code{connection} nor \code{sqldf.connection} are specified a connection 150 | is automatically generated on-the-fly and closed on exit of the call to 151 | \code{sqldf}. If this argument is not \code{NULL} then the specified 152 | connection is left open on termination of the \code{sqldf} call. Usually 153 | this argument is left unspecified. It can be used to make repeated calls 154 | to a database without reloading it.} 155 | \item{verbose}{If \code{TRUE} then verbose output is shown. Anything else 156 | suppresses verbose output. Can be set globally using option 157 | \code{"sqldf.verbose"}.} 158 | } 159 | \details{ 160 | The typical action of \code{sqldf} is to 161 | \describe{ 162 | \item{create a database}{in memory} 163 | \item{read in the data frames and files}{used in the select statement. 164 | This is done by scanning the select statement to see which words in 165 | the select statement are of class "data.frame" or "file" in the parent frame, or the 166 | specified environment if \code{envir} is used, and for each object 167 | found by reading it into the database if it is a data frame. 
Note 168 | that this heuristic usually reads in the wanted data frames and files 169 | but on occasion may harmlessly read in extra ones too.} 170 | \item{run the select statement}{getting the result as a data frame} 171 | \item{assign the classes}{of the returned data frame's columns if 172 | \code{method = "auto"}. This is done by checking all the column 173 | names in the read-in data frames and if any are the same 174 | as a column output from the data base then that column is coerced to the 175 | class of the column whose name matched. 176 | If the class of the column is \code{"factor"} or \code{"ordered"} or if the 177 | column is not matched then the column is returned 178 | as is. If \code{method = "auto.factor"} then processing is similar except 179 | that \code{"factor"} and \code{"ordered"} classes and their levels will be 180 | assigned as well. The \code{"auto.factor"} heuristic 181 | is less reliable than the \code{"auto"} heuristic. 182 | If \code{method = "raw"} then the classes are returned 183 | as is from the database. } 184 | \item{cleanup}{If the database was created by sqldf then it is deleted; 185 | otherwise, all tables that were created are dropped in order to leave 186 | the database in the same state that it was before. The database 187 | connection is terminated.} 188 | } 189 | 190 | \code{sqldf} supports the following R options for RPostgreSQL: 191 | \code{"sqldf.RPostgreSQL.dbname"}, 192 | \code{"sqldf.RPostgreSQL.user"}, 193 | \code{"sqldf.RPostgreSQL.password"}, 194 | \code{"sqldf.RPostgreSQL.host"} and 195 | \code{"sqldf.RPostgreSQL.port"} which have defaults \code{"test"}, 196 | \code{"postgres"}, \code{"postgres"}, \code{"localhost"} and \code{5432}, 197 | respectively. It also supports 198 | \code{"sqldf.RPostgreSQL.other"} which is a list of named parameters. These 199 | may include 200 | \code{dbname}, 201 | \code{user}, 202 | \code{password}, 203 | \code{host} and 204 | \code{port}. 
205 | Individually these take precedence over otherwise specified connection 206 | arguments. 207 | 208 | 209 | 210 | Warning. Although sqldf is usually used with on-the-fly databases 211 | which it automatically sets up and destroys, if you wish to use it 212 | with existing databases, be sure to back up your database prior to 213 | using it since incorrect operation could destroy the entire 214 | database. 215 | 216 | } 217 | \note{ 218 | If \code{row.names = TRUE} is used then 219 | any \code{NATURAL JOIN} will make use of it which may not be what was 220 | intended. 221 | 222 | 3/2 and 3.0/2 are the same in R but in SQLite the first one 223 | causes integer arithmetic to be used whereas the second uses floating point. 224 | Thus both evaluate to 225 | 1.5 in R but they evaluate to 1 and 1.5 respectively in SQLite. 226 | 227 | The \code{dbWriteTable}/\code{sqliteImportFile} routines that sqldf uses to transfer files to the data base are intended for speed and they are not as flexible as \code{\link{read.table}}. Also they have slightly different defaults. (If more flexible input is needed use the slower \code{read.table} to read the data into a data frame instead of reading directly from a file.) The default for \code{sep} is \code{sep = ","}. If the first row of the file has one fewer entry than subsequent ones then it is assumed that \code{header <- row.names <- TRUE} and otherwise that \code{header <- row.names <- FALSE}. The \code{header} can be forced to \code{header <- TRUE} by specifying \code{file.format = list(header = TRUE)} as an argument to \code{sqldf}. \code{sep} and \code{row.names} are other \code{file.format} subarguments. Also, one limitation with .csv files is that quotes are not regarded as special within files so a comma within a data field such as \code{"Smith, James"} would be regarded as a field delimiter and the quotes would be entered as part of the data which probably is not what is intended. 
228 | 229 | Typically the SQL result will have the same data as the analogous 230 | non-database \code{R} code manipulations using data frames 231 | but may differ in row names and other attributes. In the 232 | examples below we use \code{identical} in those cases where the two 233 | results are the same in all respects or set the row names to \code{NULL} 234 | if they would have otherwise differed only in row names or use 235 | \code{all.equal} if the data portion is the same but attributes aside 236 | from row names differ. 237 | 238 | On MySQL the database must pre-exist. Create a \code{c:\\my.ini} 239 | or \code{\%MYSQL_HOME\%\\my.ini} file on Windows or a \code{/etc/my.cnf} 240 | file on UNIX to contain information about the database. This file may 241 | specify the username, password and port. The password 242 | can be omitted if one has not been set. If using a standard port 243 | setup then the \code{port} can be omitted as well. 244 | The database is taken from the \code{dbname} argument of the \code{sqldf} 245 | command or if not set from \code{getOption("sqldf.dbname")} or if that option 246 | is not set it is assumed to be \code{"test"}. 247 | Note that MySQL does not use the \code{user}, \code{password}, \code{host} 248 | and \code{port} arguments of sqldf. 249 | See \url{http://dev.mysql.com/doc/refman/5.6/en/option-files.html} for 250 | additional locations that the configuration files can be placed as well 251 | as other information. 252 | 253 | If \code{getOption("sqldf.dll")} is specified 254 | then the named dll will be loaded as an SQLite loadable extension. 255 | This is in addition to the extension functions included with RSQLite. 256 | 257 | } 
If the \code{x} and \code{connection} 263 | arguments are missing then it returns a new connection and also places 264 | this connection in the option \code{sqldf.connection}. 265 | 266 | } 267 | \references{ 268 | The sqldf home page \url{https://github.com/ggrothendieck/sqldf} contains 269 | more examples as well as links to SQLite pages that may be helpful in 270 | formulating queries. It also contains pointers to using sqldf with H2 271 | and PostgreSQL. 272 | } 273 | \examples{ 274 | 275 | # 276 | # These examples show how to run a variety of data frame manipulations 277 | # in R without SQL and then again with SQL 278 | # 279 | 280 | # head 281 | a1r <- head(warpbreaks) 282 | a1s <- sqldf("select * from warpbreaks limit 6") 283 | identical(a1r, a1s) 284 | 285 | # subset 286 | 287 | a2r <- subset(CO2, grepl("^Qn", Plant)) 288 | a2s <- sqldf("select * from CO2 where Plant like 'Qn\%'") 289 | all.equal(as.data.frame(a2r), a2s) 290 | 291 | data(farms, package = "MASS") 292 | a3r <- subset(farms, Manag \%in\% c("BF", "HF")) 293 | a3s <- sqldf("select * from farms where Manag in ('BF', 'HF')") 294 | row.names(a3r) <- NULL 295 | identical(a3r, a3s) 296 | 297 | a4r <- subset(warpbreaks, breaks >= 20 & breaks <= 30) 298 | a4s <- sqldf("select * from warpbreaks where breaks between 20 and 30", 299 | row.names = TRUE) 300 | identical(a4r, a4s) 301 | 302 | a5r <- subset(farms, Mois == 'M1') 303 | a5s <- sqldf("select * from farms where Mois = 'M1'", row.names = TRUE) 304 | identical(a5r, a5s) 305 | 306 | a6r <- subset(farms, Mois == 'M2') 307 | a6s <- sqldf("select * from farms where Mois = 'M2'", row.names = TRUE) 308 | identical(a6r, a6s) 309 | 310 | # rbind 311 | a7r <- rbind(a5r, a6r) 312 | a7s <- sqldf("select * from a5s union all select * from a6s") 313 | 314 | # sqldf drops the unused levels of Mois but rbind does not; however, 315 | # all data is the same and the other columns are identical 316 | row.names(a7r) <- NULL 317 | identical(a7r[-1], a7s[-1]) 318 | 
319 | # aggregate - mean Sepal.Length and Sepal.Width by Species 320 | a8r <- aggregate(iris[1:2], iris[5], mean) 321 | a8s <- sqldf('select Species, avg("Sepal.Length") `Sepal.Length`, 322 | avg("Sepal.Width") `Sepal.Width` from iris group by Species') 323 | all.equal(a8r, a8s) 324 | 325 | # by - mean Sepal.Length, Sepal.Width and their ratio by Species 326 | a9r <- do.call(rbind, by(iris, iris[5], function(x) with(x, 327 | data.frame(Species = Species[1], 328 | mean.Sepal.Length = mean(Sepal.Length), 329 | mean.Sepal.Width = mean(Sepal.Width), 330 | mean.Sepal.ratio = mean(Sepal.Length/Sepal.Width))))) 331 | row.names(a9r) <- NULL 332 | a9s <- sqldf('select Species, avg("Sepal.Length") `mean.Sepal.Length`, 333 | avg("Sepal.Width") `mean.Sepal.Width`, 334 | avg("Sepal.Length"/"Sepal.Width") `mean.Sepal.ratio` from iris 335 | group by Species') 336 | all.equal(a9r, a9s) 337 | 338 | # head - top 3 breaks 339 | a10r <- head(warpbreaks[order(warpbreaks$breaks, decreasing = TRUE), ], 3) 340 | a10s <- sqldf("select * from warpbreaks order by breaks desc limit 3") 341 | row.names(a10r) <- NULL 342 | identical(a10r, a10s) 343 | 344 | # head - bottom 3 breaks 345 | a11r <- head(warpbreaks[order(warpbreaks$breaks), ], 3) 346 | a11s <- sqldf("select * from warpbreaks order by breaks limit 3") 347 | # attributes(a11r) <- attributes(a11s) <- NULL 348 | row.names(a11r) <- NULL 349 | identical(a11r, a11s) 350 | 351 | # ave - rows for which v exceeds its group average where g is group 352 | DF <- data.frame(g = rep(1:2, each = 5), t = rep(1:5, 2), v = 1:10) 353 | a12r <- subset(DF, v > ave(v, g, FUN = mean)) 354 | Gavg <- sqldf("select g, avg(v) as avg_v from DF group by g") 355 | a12s <- sqldf("select DF.g, t, v from DF, Gavg where DF.g = Gavg.g and v > avg_v") 356 | row.names(a12r) <- NULL 357 | identical(a12r, a12s) 358 | 359 | # same but reduce the two select statements to one using a subquery 360 | a13s <- sqldf("select g, t, v 361 | from DF d1, (select g as g2, avg(v) as avg_v from DF group by 
g) 362 | where d1.g = g2 and v > avg_v") 363 | identical(a12r, a13s) 364 | 365 | # same but shorten using natural join 366 | a14s <- sqldf("select g, t, v 367 | from DF 368 | natural join (select g, avg(v) as avg_v from DF group by g) 369 | where v > avg_v") 370 | identical(a12r, a14s) 371 | 372 | # table 373 | a15r <- table(warpbreaks$tension, warpbreaks$wool) 374 | a15s <- sqldf("select sum(wool = 'A'), sum(wool = 'B') 375 | from warpbreaks group by tension") 376 | all.equal(as.data.frame.matrix(a15r), a15s, check.attributes = FALSE) 377 | 378 | # reshape 379 | t.names <- paste("t", unique(as.character(DF$t)), sep = "_") 380 | a16r <- reshape(DF, direction = "wide", timevar = "t", idvar = "g", varying = list(t.names)) 381 | a16s <- sqldf("select 382 | g, 383 | sum((t == 1) * v) t_1, 384 | sum((t == 2) * v) t_2, 385 | sum((t == 3) * v) t_3, 386 | sum((t == 4) * v) t_4, 387 | sum((t == 5) * v) t_5 388 | from DF group by g") 389 | all.equal(a16r, a16s, check.attributes = FALSE) 390 | 391 | # order 392 | a17r <- Formaldehyde[order(Formaldehyde$optden, decreasing = TRUE), ] 393 | a17s <- sqldf("select * from Formaldehyde order by optden desc") 394 | row.names(a17r) <- NULL 395 | identical(a17r, a17s) 396 | 397 | # centered moving average of length 7 398 | set.seed(1) 399 | DF <- data.frame(x = rnorm(15, 1:15)) 400 | s18 <- sqldf("select a.x x, avg(b.x) movavgx from DF a, DF b 401 | where a.row_names - b.row_names between -3 and 3 402 | group by a.row_names having count(*) = 7 403 | order by a.row_names+0", 404 | row.names = TRUE) 405 | r18 <- data.frame(x = DF[4:12,], movavgx = rowMeans(embed(DF$x, 7))) 406 | row.names(r18) <- NULL 407 | all.equal(r18, s18) 408 | 409 | # merge. 
a19r and a19s are same except row order and row names 410 | A <- data.frame(a1 = c(1, 2, 1), a2 = c(2, 3, 3), a3 = c(3, 1, 2)) 411 | B <- data.frame(b1 = 1:2, b2 = 2:1) 412 | a19s <- sqldf("select * from A, B") 413 | a19r <- merge(A, B) 414 | Sort <- function(DF) DF[do.call(order, DF),] 415 | all.equal(Sort(a19s), Sort(a19r), check.attributes = FALSE) 416 | 417 | # within Date, of the highest quality records list the one closest 418 | # to noon. Note use of two sql statements in one call to sqldf. 419 | 420 | Lines <- "DeployID Date.Time LocationQuality Latitude Longitude 421 | STM05-1 2005/02/28 17:35 Good -35.562 177.158 422 | STM05-1 2005/02/28 19:44 Good -35.487 177.129 423 | STM05-1 2005/02/28 23:01 Unknown -35.399 177.064 424 | STM05-1 2005/03/01 07:28 Unknown -34.978 177.268 425 | STM05-1 2005/03/01 18:06 Poor -34.799 177.027 426 | STM05-1 2005/03/01 18:47 Poor -34.85 177.059 427 | STM05-2 2005/02/28 12:49 Good -35.928 177.328 428 | STM05-2 2005/02/28 21:23 Poor -35.926 177.314 429 | " 430 | 431 | DF <- read.table(textConnection(Lines), skip = 1, as.is = TRUE, 432 | col.names = c("Id", "Date", "Time", "Quality", "Lat", "Long")) 433 | 434 | sqldf(c("create temp table DFo as select * from DF order by 435 | Date DESC, Quality DESC, 436 | abs(substr(Time, 1, 2) + substr(Time, 4, 2) /60 - 12) DESC", 437 | "select * from DFo group by Date")) 438 | 439 | \dontrun{ 440 | 441 | # test of file connections with sqldf 442 | 443 | # create test .csv file of just 3 records 444 | write.table(head(iris, 3), "iris3.dat", sep = ",", quote = FALSE) 445 | 446 | # look at contents of iris3.dat 447 | readLines("iris3.dat") 448 | 449 | # set up file connection 450 | iris3 <- file("iris3.dat") 451 | sqldf('select * from iris3 where "Sepal.Width" > 3') 452 | 453 | # using a non-default separator 454 | # file.format can be an attribute of file object or an arg passed to sqldf 455 | write.table(head(iris, 3), "iris3.dat", sep = ";", quote = FALSE) 456 | iris3 <- file("iris3.dat") 457 
| sqldf('select * from iris3 where "Sepal.Width" > 3', file.format = list(sep = ";")) 458 | 459 | # same but pass file.format through attribute of file object 460 | attr(iris3, "file.format") <- list(sep = ";") 461 | sqldf('select * from iris3 where "Sepal.Width" > 3') 462 | 463 | # copy file straight to disk without going through R 464 | # and then retrieve portion into R 465 | sqldf('select * from iris3 where "Sepal.Width" > 3', dbname = tempfile()) 466 | 467 | ### same as previous example except it allows multiple queries against 468 | ### the database. We use iris3 from before. This time we use an 469 | ### in memory SQLite database. 470 | 471 | sqldf() # open a connection 472 | sqldf('select * from iris3 where "Sepal.Width" > 3') 473 | 474 | # At this point we have an iris3 variable in both 475 | # the R workspace and in the SQLite database so we need to 476 | # explicitly let it know we want the version in the database. 477 | # If we were not to do that it would try to use the R version 478 | # by default and fail since sqldf would prevent it from 479 | # overwriting the version already in the database to protect 480 | # the user from inadvertent errors. 481 | sqldf('select * from main.iris3 where "Sepal.Width" > 4') 482 | sqldf('select * from main.iris3 where "Sepal_Width" < 4') 483 | sqldf() # close connection 484 | 485 | ### another way to do this is a mix of sqldf and RSQLite statements 486 | ### In that case we need to fetch the connection for use with RSQLite 487 | ### and do not have to specifically refer to main since RSQLite can 488 | ### only access the database. 
489 | 490 | con <- sqldf() 491 | # this iris3 refers to the R variable and file 492 | sqldf('select * from iris3 where "Sepal.Width" > 3') 493 | sqldf("select count(*) from iris3") 494 | # these iris3 refer to the database table 495 | dbGetQuery(con, 'select * from iris3 where "Sepal.Width" > 4') 496 | dbGetQuery(con, 'select * from iris3 where "Sepal.Width" < 4') 497 | sqldf() 498 | 499 | } 500 | } 501 | \keyword{manip} 502 | -------------------------------------------------------------------------------- /tests/doSvUnit.R: -------------------------------------------------------------------------------- 1 | #! /usr/bin/Rscript --vanilla --default-packages=utils,stats,methods,svUnit 2 | pkg <- "sqldf" 3 | require(svUnit) # Needed if run from R CMD BATCH 4 | require(pkg, character.only = TRUE) # Needed if run from R CMD BATCH 5 | # unlink("mypkgTest.txt") # Make sure we generate a new report 6 | mypkgSuite <- svSuiteList(pkg, excludeList = NULL) # List all our test suites 7 | runTest(mypkgSuite, name = "svUnit") # Run them... 8 | # protocol(Log(), type = "text", file = "mypkgTest.txt") # ... and write report 9 | Log() 10 | 11 | --------------------------------------------------------------------------------