├── DESCRIPTION ├── INSTALL ├── NAMESPACE ├── R ├── sqldf.R └── zzz.R ├── README.md ├── demo ├── 00Index ├── sqldf-groupchoose.R └── sqldf-unitTests.R ├── inst ├── NEWS ├── THANKS ├── csv.awk ├── trcomma2dot.vbs └── unitTests │ └── runit.all.R ├── man ├── read.csv.sql.Rd ├── sqldf-package.Rd └── sqldf.Rd └── tests └── doSvUnit.R /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: sqldf 2 | Version: 0.4-11 3 | Date: 2017-06-23 4 | Title: Manipulate R Data Frames Using SQL 5 | Author: G. Grothendieck 6 | Maintainer: G. Grothendieck 7 | Description: The sqldf() function is typically passed a single argument which 8 | is an SQL select statement where the table names are ordinary R data 9 | frame names. sqldf() transparently sets up a database, imports the 10 | data frames into that database, performs the SQL select or other 11 | statement and returns the result using a heuristic to determine which 12 | class to assign to each column of the returned data frame. The sqldf() 13 | or read.csv.sql() functions can also be used to read filtered files 14 | into R even if the original files are larger than R itself can handle. 15 | 'RSQLite', 'RH2', 'RMySQL' and 'RPostgreSQL' backends are supported. 16 | ByteCompile: true 17 | Depends: R (>= 3.1.0), gsubfn (>= 0.6), proto, RSQLite 18 | Suggests: RH2, RMySQL, RPostgreSQL, svUnit, tcltk, MASS 19 | Imports: DBI, chron 20 | License: GPL-2 21 | BugReports: https://github.com/ggrothendieck/sqldf/issues 22 | URL: https://github.com/ggrothendieck/sqldf, https://groups.google.com/group/sqldf 23 | 24 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | INSTALLING FROM CRAN 2 | 3 | sqldf is installed from CRAN like this: 4 | 5 | install.packages("sqldf") 6 | 7 | This will download and install sqldf and all its dependencies including the 8 | RSQLite driver. 
The RSQLite driver includes SQLite itself so the single
line given above is all that is needed to install everything you need.


INSTALLING FROM GITHUB

The development version of sqldf can be installed from github like this:

library(devtools)
install_github("ggrothendieck/sqldf")


BACKENDS

sqldf can also be used with alternate backends: H2, PostgreSQL and MySQL.

H2 is the next easiest to install since the H2 software itself is included
right in the RH2 R package providing the driver. Just install RH2 (which
in turn requires that you install java).

See FAQ #12 on the sqldf github home page for information on installing and
using PostgreSQL with sqldf.
https://github.com/ggrothendieck/sqldf


PROBLEMS

sqldf uses gsubfn which can optionally use the R tcltk package but can also run
without that package substituting R code for it. On Windows and most other R
installations, the tcltk package comes with the core of R and so you
automatically have it installed when you install R itself. If the tcltk
package is not present this fact will be automatically detected and it will
automatically switch to R code.

There have been reports of bad R builds in which R reports that it has tcltk
capability even though it is missing. In that case you will get an error message
about tcltk being missing. To address this issue it is easiest to just run this
command prior to sqldf to force gsubfn to use R code in place of tcltk:
options(gsubfn.engine = "R")

See FAQ #5 on the sqldf github home page for more info.
https://github.com/ggrothendieck/sqldf


MORE INFO

There is more information on sqldf on the sqldf github home page.
Note the TROUBLESHOOTING, FAQs and EXAMPLES sections there.
56 | 57 | https://github.com/ggrothendieck/sqldf 58 | 59 | There is also information in the help files: 60 | 61 | library(help = sqldf) 62 | ?sqldf 63 | ?read.csv.sql 64 | 65 | There is also info on the individual driver packages here: 66 | 67 | http://cran.r-project.org/package=RSQLite 68 | http://cran.r-project.org/package=RH2 69 | http://cran.r-project.org/package=RPostgreSQL 70 | http://cran.r-project.org/package=RMySQL 71 | 72 | 73 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Default NAMESPACE created by R 2 | # Remove the previous line if you edit this file 3 | 4 | # Export all names 5 | exportPattern(".") 6 | 7 | # Import all packages listed as Imports or Depends 8 | import( 9 | DBI, 10 | RSQLite, 11 | gsubfn, 12 | proto, 13 | chron 14 | ) 15 | 16 | importFrom("utils", "download.file", "head", "modifyList") 17 | -------------------------------------------------------------------------------- /R/sqldf.R: -------------------------------------------------------------------------------- 1 | 2 | sqldf <- function(x, stringsAsFactors = FALSE, 3 | row.names = FALSE, envir = parent.frame(), 4 | method = getOption("sqldf.method"), 5 | file.format = list(), dbname, drv = getOption("sqldf.driver"), 6 | user, password = "", host = "localhost", port, 7 | dll = getOption("sqldf.dll"), connection = getOption("sqldf.connection"), 8 | verbose = isTRUE(getOption("sqldf.verbose"))) { 9 | 10 | # as.POSIXct.numeric <- function(x, origin = "1970-01-01 00:00:00", ...) 11 | # base::as.POSIXct.numeric(x, origin = origin, ...) 12 | as.POSIXct.numeric <- function(x, ...) 
13 | structure(x, class = c("POSIXct", "POSIXt")) 14 | as.POSIXct.character <- function(x) structure(as.numeric(x), 15 | class = c("POSIXct", "POSIXt")) 16 | as.Date.character <- function(x) structure(as.numeric(x), class = "Date") 17 | as.Date2 <- function(x) UseMethod("as.Date2") 18 | as.Date2.character <- function(x) base::as.Date.character(x) 19 | as.Date.numeric <- function(x, origin = "1970-01-01", ...) base::as.Date.numeric(x, origin = origin, ...) 20 | as.dates.character <- function(x) structure(as.numeric(x), class = c("dates", "times")) 21 | as.times.character <- function(x) structure(as.numeric(x), class = "times") 22 | 23 | name__class <- function(data, ...) { 24 | if (is.null(data)) return(data) 25 | cls <- sub(".*__([^_]+)|.*", "\\1", names(data)) 26 | f <- function(i) { 27 | if (cls[i] == "") { 28 | data[[i]] 29 | } else { 30 | fun_name <- paste("as", cls[i], sep = ".") 31 | fun <- mget(fun_name, envir = environment(), 32 | mode = "function", ifnotfound = NA, inherits = TRUE)[[1]] 33 | if (identical(fun, NA)) data[[i]] else { 34 | # strip _class off name 35 | names(data)[i] <<- sub("__[^_]+$", "", names(data)[i]) 36 | fun(data[[i]]) 37 | } 38 | } 39 | } 40 | data[] <- lapply(1:NCOL(data), f) 41 | data 42 | } 43 | 44 | colClass <- function(data, cls) { 45 | if (is.null(data)) return(data) 46 | if (is.list(cls)) cls <- unlist(cls) 47 | cls <- rep(cls, length = length(data)) 48 | f <- function(i) { 49 | if (cls[i] == "") { 50 | data[[i]] 51 | } else { 52 | fun_name <- paste("as", cls[i], sep = ".") 53 | fun <- mget(fun_name, envir = environment(), 54 | mode = "function", ifnotfound = NA, inherits = TRUE)[[1]] 55 | if (identical(fun, NA)) data[[i]] else { 56 | # strip _class off name 57 | names(data)[i] <<- sub("__[^_]+$", "", names(data)[i]) 58 | fun(data[[i]]) 59 | } 60 | } 61 | } 62 | data[] <- lapply(1:NCOL(data), f) 63 | data 64 | } 65 | 66 | overwrite <- FALSE 67 | 68 | request.open <- missing(x) && is.null(connection) 69 | request.close <- 
missing(x) && !is.null(connection) 70 | request.con <- !missing(x) && !is.null(connection) 71 | request.nocon <- !missing(x) && is.null(connection) 72 | 73 | dfnames <- fileobjs <- character(0) 74 | 75 | 76 | if (!is.list(method)) method <- list(method, NULL) 77 | to.df <- method[[1]] 78 | to.db <- method[[2]] 79 | 80 | # if exactly one of x and connection are missing then close on exit 81 | if (request.close || request.nocon) { 82 | 83 | on.exit({ 84 | dbPreExists <- attr(connection, "dbPreExists") 85 | dbname <- attr(connection, "dbname") 86 | if (!missing(dbname) && !is.null(dbname) && dbname == ":memory:") { 87 | if (verbose) { 88 | cat("sqldf: dbDisconnect(connection)\n") 89 | } 90 | dbDisconnect(connection) 91 | } else if (!dbPreExists && drv == "sqlite") { 92 | # data base not pre-existing 93 | if (verbose) { 94 | cat("sqldf: dbDisconnect(connection)\n") 95 | cat("sqldf: file.remove(dbname)\n") 96 | } 97 | dbDisconnect(connection) 98 | file.remove(dbname) 99 | } else { 100 | # data base pre-existing 101 | 102 | for (nam in dfnames) { 103 | if (verbose) { 104 | cat("sqldf: dbRemoveTable(connection, ", nam, ")\n") 105 | } 106 | dbRemoveTable(connection, nam) 107 | } 108 | for (fo in fileobjs) { 109 | if (verbose) { 110 | cat("sqldf: dbRemoveTable(connection, ", fo, ")\n") 111 | } 112 | dbRemoveTable(connection, fo) 113 | } 114 | if (verbose) { 115 | cat("sqldf: dbDisconnect(connection)\n") 116 | } 117 | dbDisconnect(connection) 118 | } 119 | }, add = TRUE) 120 | if (request.close) { 121 | if (identical(connection, getOption("sqldf.connection"))) 122 | options(sqldf.connection = NULL) 123 | return() 124 | } 125 | } 126 | 127 | # if con is missing then connection opened 128 | if (request.open || request.nocon) { 129 | 130 | if (is.null(drv) || drv == "") { 131 | drv <- if ("package:RPostgreSQL" %in% search()) { "PostgreSQL" 132 | } else if ("package:RpgSQL" %in% search()) { "pgSQL" 133 | } else if ("package:RMySQL" %in% search()) { "MySQL" 134 | } else if 
("package:RH2" %in% search()) { "H2" 135 | } else "SQLite" 136 | } 137 | 138 | drv <- sub("^[Rr]", "", drv) 139 | pkg <- paste("R", drv, sep = "") 140 | if (verbose) { 141 | if (!is.loaded(pkg)) cat("sqldf: library(", pkg, ")\n", sep = "") 142 | library(pkg, character.only = TRUE) 143 | } else library(pkg, character.only = TRUE) 144 | 145 | drv <- tolower(drv) 146 | 147 | if (drv == "mysql") { 148 | if (verbose) cat("sqldf: m <- dbDriver(\"MySQL\")\n") 149 | m <- dbDriver("MySQL") 150 | if (missing(dbname) || is.null(dbname)) { 151 | dbname <- getOption("RMySQL.dbname") 152 | if (is.null(dbname)) dbname <- "test" 153 | } 154 | connection <- if (missing(dbname) || dbname == ":memory:") { 155 | dbConnect(m) 156 | } else dbConnect(m, dbname = dbname) 157 | dbPreExists <- TRUE 158 | } else if (drv == "postgresql") { 159 | if (verbose) cat("sqldf: m <- dbDriver(\"PostgreSQL\")\n") 160 | m <- dbDriver("PostgreSQL") 161 | if (missing(user) || is.null(user)) { 162 | user <- getOption("sqldf.RPostgreSQL.user") 163 | if (is.null(user)) user <- "postgres" 164 | } 165 | if (missing(password) || is.null(password)) { 166 | password <- getOption("sqldf.RPostgreSQL.password") 167 | if (is.null(password)) password <- "postgres" 168 | } 169 | if (missing(dbname) || is.null(dbname)) { 170 | dbname <- getOption("sqldf.RPostgreSQL.dbname") 171 | if (is.null(dbname)) dbname <- "test" 172 | } 173 | if (missing(host) || is.null(host)) { 174 | host <- getOption("sqldf.RPostgreSQL.host") 175 | if (is.null(host)) host <- "localhost" 176 | } 177 | if (missing(port) || is.null(port)) { 178 | port <- getOption("sqldf.RPostgreSQL.port") 179 | if (is.null(port)) port <- 5432 180 | } 181 | connection.args <- list(m, user = user, password, 182 | dbname = dbname, host = host, port = port) 183 | connection.args.other <- getOption("sqldf.RPostgreSQL.other") 184 | if (!is.null(connection.args.other)) 185 | connection.args <- modifyList(connection.args, 186 | connection.args.other) 187 | connection <- 
do.call("dbConnect", connection.args) 188 | # connection <- dbConnect(m, user = user, password, dbname = dbname, 189 | # host = host, port = port) 190 | 191 | if (verbose) { 192 | cat(sprintf("sqldf: connection <- dbConnect(m, user='%s', password=<...>, dbname = '%s', host = '%s', port = '%s', ...)\n", user, dbname, host, port)) 193 | if (!is.null(connection.args.other)) { 194 | cat("other connection arguments:\n") 195 | print(connection.args.other) 196 | } 197 | } 198 | dbPreExists <- TRUE 199 | } else if (drv == "pgsql") { 200 | if (verbose) cat("sqldf: m <- dbDriver(\"pgSQL\")\n") 201 | m <- dbDriver("pgSQL") 202 | if (missing(dbname) || is.null(dbname)) { 203 | dbname <- getOption("RpgSQL.dbname") 204 | if (is.null(dbname)) dbname <- "test" 205 | } 206 | connection <- dbConnect(m, dbname = dbname) 207 | dbPreExists <- TRUE 208 | } else if (drv == "h2") { 209 | # jar.file <- "C:\\Program Files\\H2\\bin\\h2.jar" 210 | # jar.file <- system.file("h2.jar", package = "H2") 211 | # m <- JDBC("org.h2.Driver", jar.file, identifier.quote = '"') 212 | if (verbose) cat("sqldf: m <- dbDriver(\"H2\")\n") 213 | # m <- H2() 214 | m <- dbDriver("H2") 215 | if (missing(dbname) || is.null(dbname)) dbname <- ":memory:" 216 | dbPreExists <- dbname != ":memory:" && file.exists(dbname) 217 | connection <- if (missing(dbname) || is.null(dbname) || 218 | dbname == ":memory:") { 219 | dbConnect(m, "jdbc:h2:mem:", "sa", "") 220 | } else { 221 | jdbc.string <- paste("jdbc:h2", dbname, sep = ":") 222 | # dbConnect(m, jdbc.string, "sa", "") 223 | dbConnect(m, jdbc.string) 224 | } 225 | } else { 226 | if (verbose) cat("sqldf: m <- dbDriver(\"SQLite\")\n") 227 | m <- dbDriver("SQLite") 228 | if (missing(dbname) || is.null(dbname)) dbname <- ":memory:" 229 | dbPreExists <- dbname != ":memory:" && file.exists(dbname) 230 | 231 | # search for spatialite extension on PATH and, if found, load it 232 | # if (is.null(getOption("sqldf.dll"))) { 233 | # dll <- Sys.which("libspatialite-2.dll") 234 | # 
if (dll == "") dll <- Sys.which("libspatialite-1.dll") 235 | # dll <- if (dll == "") FALSE else normalizePath(dll) 236 | # options(sqldf.dll = dll) 237 | # } 238 | dll <- getOption("sqldf.dll") 239 | if (length(dll) != 1 || identical(dll, FALSE) || nchar(dll) == 0) { 240 | dll <- FALSE 241 | } else { 242 | if (dll == basename(dll)) dll <- Sys.which(dll) 243 | } 244 | options(sqldf.dll = dll) 245 | 246 | if (!identical(dll, FALSE)) { 247 | if (verbose) { 248 | cat("sqldf: connection <- dbConnect(m, dbname = \"", dbname, 249 | "\", loadable.extensions = TRUE\n", sep = "") 250 | cat("sqldf: select load_extension('", dll, "')\n", sep = "") 251 | } 252 | connection <- dbConnect(m, dbname = dbname, 253 | loadable.extensions = TRUE) 254 | s <- sprintf("select load_extension('%s')", dll) 255 | dbGetQuery(connection, s) 256 | } else { 257 | if (verbose) { 258 | cat("sqldf: connection <- dbConnect(m, dbname = \"", dbname, "\")\n", sep = "") 259 | } 260 | connection <- dbConnect(m, dbname = dbname) 261 | } 262 | if (verbose) cat("sqldf: initExtension(connection)\n") 263 | initExtension(connection) 264 | } 265 | attr(connection, "dbPreExists") <- dbPreExists 266 | if (missing(dbname) && drv == "sqlite") dbname <- ":memory:" 267 | attr(connection, "dbname") <- dbname 268 | if (request.open) { 269 | options(sqldf.connection = connection) 270 | return(connection) 271 | } 272 | } 273 | 274 | # connection was specified 275 | if (request.con) { 276 | drv <- if (inherits(connection, "PostgreSQLConnection")) "PostgreSQL" 277 | else if (inherits(connection, "pgSQLConnection")) "pgSQL" 278 | else if (inherits(connection, "MySQLConnection")) "MySQL" 279 | else if (inherits(connection, "H2Connection")) "H2" 280 | else "SQLite" 281 | drv <- tolower(drv) 282 | dbPreExists <- attr(connection, "dbPreExists") 283 | } 284 | 285 | # engine is "tcl" or "R". 
286 | engine <- getOption("gsubfn.engine") 287 | if (is.null(engine) || is.na(engine) || engine == "") { 288 | engine <- if (requireNamespace("tcltk", quietly = TRUE)) "tcl" else "R" 289 | } else if (engine == "tcl") requireNamespace("tcltk", quietly = TRUE) 290 | 291 | # words. is a list whose ith component contains vector of words in ith stmt 292 | # words is all the words in one long vector without duplicates 293 | # 294 | # If "tcl" is available use the faster strapplyc else strapply 295 | words. <- words <- if (engine == "tcl") { 296 | strapplyc(x, "[[:alnum:]._]+") 297 | } else strapply(x, "[[:alnum:]._]+", engine = "R") 298 | 299 | if (length(words) > 0) words <- unique(unlist(words)) 300 | is.special <- sapply( 301 | mget(words, envir, "any", NA, inherits = TRUE), 302 | function(x) is.data.frame(x) + 2 * inherits(x, "file")) 303 | 304 | # process data frames 305 | dfnames <- words[is.special == 1] 306 | for(i in seq_along(dfnames)) { 307 | nam <- dfnames[i] 308 | if (dbPreExists && !overwrite && dbExistsTable(connection, nam)) { 309 | # exit code removes tables in dfnames 310 | # so only include those added so far 311 | dfnames <- head(dfnames, i-1) 312 | stop(paste("sqldf:", "table", nam, 313 | "already in", dbname, "\n")) 314 | } 315 | # check if the nam processing works with MySQL 316 | # if not then ensure its only applied to SQLite 317 | DF <- as.data.frame(get(nam, envir)) 318 | if (!is.null(to.db) && is.function(to.db)) DF <- to.db(DF) 319 | # if (verbose) cat("sqldf: writing", nam, "to database\n") 320 | if (verbose) cat("sqldf: dbWriteTable(connection, '", nam, "', ", nam, ", row.names = ", row.names, ")\n", sep = "") 321 | dbWriteTable(connection, nam, DF, row.names = row.names) 322 | } 323 | 324 | # process file objects 325 | fileobjs <- if (is.null(file.format)) { character(0) 326 | } else { 327 | eol <- if (.Platform$OS.type == "windows") "\r\n" else "\n" 328 | words[is.special == 2] 329 | } 330 | for(i in seq_along(fileobjs)) { 331 | fo <- 
fileobjs[i] 332 | Filename <- summary(get(fo, envir))$description 333 | if (dbPreExists && !overwrite && dbExistsTable(connection, Filename)) { 334 | # exit code should only remove tables added so far 335 | fileobjs <- head(fileobjs, i-1) 336 | stop(paste("sqldf:", "table", fo, "from file", 337 | Filename, "already in", dbname, "\n")) 338 | } 339 | args <- c(list(conn = connection, name = fo, value = Filename), 340 | modifyList(list(eol = eol), 341 | file.format)) 342 | args <- modifyList(args, as.list(attr(get(fo, envir), "file.format"))) 343 | filter <- args$filter 344 | if (!is.null(filter)) { 345 | args$filter <- NULL 346 | Filename.tmp <- tempfile() 347 | args$value <- Filename.tmp 348 | # for filter = list(cmd, x = ...) replace x in cmd with 349 | # temporary filename for a file created to hold ... 350 | filter.subs <- filter[-1] 351 | # filter subs contains named elements of filter 352 | if (length(filter.subs) > 0) { 353 | filter.subs <- filter.subs[sapply(names(filter.subs), nzchar)] 354 | } 355 | filter.nms <- names(filter.subs) 356 | # create temporary file names 357 | filter.tempfiles <- sapply(filter.nms, tempfile) 358 | cmd <- filter[[1]] 359 | # write out temporary file & substitute temporary file name into cmd 360 | for(nm in filter.nms) { 361 | cat(filter.subs[[nm]], file = filter.tempfiles[[nm]]) 362 | cmd <- gsub(nm, filter.tempfiles[[nm]], cmd, fixed = TRUE) 363 | } 364 | cmd <- if (nchar(Filename) > 0) 365 | sprintf('%s < "%s" > "%s"', cmd, Filename, Filename.tmp) 366 | else sprintf('%s > "%s"', cmd, Filename.tmp) 367 | 368 | # on Windows platform preface command with cmd /c 369 | if (.Platform$OS.type == "windows") { 370 | cmd <- paste("cmd /c", cmd) 371 | if (FALSE) { 372 | key <- "SOFTWARE\\R-core" 373 | show.error.messages <- getOption("show.error.message") 374 | options(show.error.messages = FALSE) 375 | reg <- try(readRegistry(key, maxdepth = 3)$Rtools$InstallPath) 376 | reg <- NULL 377 | options(show.error.messages = show.error.messages) 
378 | # add Rtools bin directory to PATH if found in registry 379 | if (!is.null(reg) && !inherits(reg, "try-error")) { 380 | Rtools.path <- file.path(reg, "bin", fsep = "\\") 381 | path <- Sys.getenv("PATH") 382 | on.exit(Sys.setenv(PATH = path), add = TRUE) 383 | path.new <- paste(path, Rtools.path, sep = ";") 384 | Sys.setenv(PATH = path.new) 385 | } 386 | } 387 | } 388 | if (verbose) cat("sqldf: system(\"", cmd, "\")\n", sep = "") 389 | system(cmd) 390 | for(fn in filter.tempfiles) file.remove(fn) 391 | } 392 | if (verbose) cat("sqldf: dbWriteTable(", toString(args), ")\n") 393 | do.call("dbWriteTable", args) 394 | } 395 | 396 | # SQLite can process all statements using dbGetQuery. 397 | # Other databases process select/call/show with dbGetQuery and other 398 | # statements with dbSendQuery. 399 | if (drv == "sqlite" || drv == "mysql" || drv == "postgresql") { 400 | for(xi in x) { 401 | if (verbose) { 402 | cat("sqldf: dbGetQuery(connection, '", xi, "')\n", sep = "") 403 | } 404 | rs <- dbGetQuery(connection, xi) 405 | } 406 | } else { 407 | for(i in seq_along(x)) { 408 | if (length(words.[[i]]) > 0) { 409 | dbGetQueryWords <- c("select", "show", "call", "explain", 410 | "with") 411 | if (tolower(words.[[i]][1]) %in% dbGetQueryWords || drv != "h2") { 412 | if (verbose) { 413 | cat("sqldf: dbGetQuery(connection, '", x[i], "')\n", sep = "") 414 | } 415 | rs <- dbGetQuery(connection, x[i]) 416 | } else { 417 | if (verbose) { 418 | cat("sqldf: dbSendUpdate:", x[i], "\n") 419 | } 420 | rs <- get("dbSendUpdate")(connection, x[i]) 421 | } 422 | } 423 | } 424 | } 425 | 426 | if (is.null(to.df)) to.df <- "auto" 427 | if (is.function(to.df)) return(to.df(rs)) 428 | # to.df <- match.arg(to.df, c("auto", "raw", "name__class")) 429 | if (identical(to.df, "raw")) return(rs) 430 | if (identical(to.df, "name__class")) return(do.call("name__class", list(rs))) 431 | if (!identical(to.df, "nofactor") && !identical(to.df, "auto")) { 432 | return(do.call("colClass", list(rs, 
to.df))) 433 | } 434 | # process row_names 435 | row_names_name <- grep("row[_.]names", names(rs), value = TRUE) 436 | if (length(row_names_name) > 1) warning(paste("ambiguity regarding row names:", row_names_name)) 437 | row_names_name <- row_names_name[1] 438 | # rs <- if ("row_names" %in% names(rs)) { 439 | rs <- if (!is.na(row_names_name)) { 440 | if (identical(row.names, FALSE)) { 441 | # subset(rs, select = - row_names) 442 | rs[names(rs) != row_names_name] 443 | } else { 444 | rn <- rs[[row_names_name]] 445 | # rs <- subset(rs, select = - row_names) 446 | rs <- rs[names(rs) != row_names_name] 447 | if (all(regexpr("^[[:digit:]]*$", rn) > 0)) 448 | rn <- as.integer(rn) 449 | rownames(rs) <- rn 450 | rs 451 | } 452 | } else rs 453 | 454 | # fix up column classes 455 | # 456 | # For each column name in the result, match it against each column name 457 | # of each data frame in envir. 458 | # 459 | # tab has one row for each column in each data frame with columns being: 460 | # (1) data frame name, 461 | # (2) column name, 462 | # (3) class vector concatenated using toString, 463 | # (4) levels concatenated using toString 464 | # 465 | tab <- do.call("rbind", 466 | lapply(dfnames, 467 | # calculate tab for one data frame 468 | function(dfname) { 469 | df <- get(dfname, envir) 470 | nms <- names(df) 471 | do.call("rbind", 472 | lapply(seq_along(df), 473 | # calculate row in tab for one column of one data frame 474 | function(j) { 475 | column <- df[[j]] 476 | cbind(dfname, nms[j], toString(class(column)), 477 | toString(levels(column))) 478 | } 479 | ) 480 | ) 481 | } 482 | ) 483 | ) 484 | 485 | # each row is a unique column name/class vector/levels combo 486 | tabu <- unique(tab[,-1,drop=FALSE]) 487 | 488 | # dup is vector of column names that appear more than once in tabu. 489 | # Such column names have conflicting classes or levels and therefore 490 | # cannot form the basis of unique class and level assignments. 
491 | dup <- unname(tabu[duplicated(tabu[,1]), 1]) 492 | 493 | # cat("tab:\n") 494 | # print(tab) 495 | # cat("tabu:\n") 496 | # print(tabu) 497 | # cat("dup:\n") 498 | # print(dup) 499 | 500 | auto <- function(i) { 501 | cn <- colnames(rs)[[i]] 502 | if (! cn %in% dup && 503 | (ix <- match(cn, tab[, 2], nomatch = 0)) > 0) { 504 | df <- get(tab[ix, 1], envir) 505 | if (inherits(df[[cn]], "ordered")) { 506 | if (identical(to.df, "auto")) { 507 | u <- unique(rs[[i]]) 508 | levs <- levels(df[[cn]]) 509 | if (all(u %in% levs)) 510 | return(factor(rs[[i]], levels = levels(df[[cn]]), 511 | ordered = TRUE)) 512 | else return(rs[[i]]) 513 | } else return(rs[[i]]) 514 | } else if (inherits(df[[cn]], "factor")) { 515 | if (identical(to.df, "auto")) { 516 | u <- unique(rs[[i]]) 517 | levs <- levels(df[[cn]]) 518 | if (all(u %in% levs)) 519 | return(factor(rs[[i]], levels = levels(df[[cn]]))) 520 | else return(rs[[i]]) 521 | } else return(rs[[i]]) 522 | } else if (inherits(df[[cn]], "POSIXct")) 523 | return(as.POSIXct(rs[[i]])) 524 | else if (inherits(df[[cn]], "times")) 525 | return(as.times.character(rs[[i]])) 526 | else { 527 | asfn <- paste("as", 528 | class(df[[cn]]), sep = ".") 529 | asfn <- match.fun(asfn) 530 | return(asfn(rs[[i]])) 531 | } 532 | } 533 | if (stringsAsFactors && is.character(rs[[i]])) factor(rs[[i]]) 534 | else rs[[i]] 535 | } 536 | # debug(f) 537 | rs2 <- lapply(seq_along(rs), auto) 538 | rs[] <- rs2 539 | rs 540 | } 541 | 542 | 543 | read.csv.sql <- function(file, sql = "select * from file", 544 | header = TRUE, sep = ",", row.names, eol, skip, filter, nrows, field.types, 545 | colClasses, dbname = tempfile(), drv = "SQLite", ...) 
{ 546 | file.format <- list(header = header, sep = sep) 547 | if (!missing(eol)) 548 | file.format <- append(file.format, list(eol = eol)) 549 | if (!missing(row.names)) 550 | file.format <- append(file.format, list(row.names = row.names)) 551 | if (!missing(skip)) 552 | file.format <- append(file.format, list(skip = skip)) 553 | if (!missing(filter)) 554 | file.format <- append(file.format, list(filter = filter)) 555 | if (!missing(nrows)) 556 | file.format <- append(file.format, list(nrows = nrows)) 557 | if (!missing(field.types)) 558 | file.format <- append(file.format, list(field.types = field.types)) 559 | if (!missing(colClasses)) 560 | file.format <- append(file.format, list(colClasses = colClasses)) 561 | pf <- parent.frame() 562 | 563 | if (missing(file) || is.null(file) || is.na(file)) file <- "" 564 | 565 | ## filesheet 566 | tf <- NULL 567 | if ( substring(file, 1, 7) == "http://" || 568 | substring(file, 1, 8) == "https://" || 569 | substring(file, 1, 6) == "ftp://" || 570 | substring(file, 1, 7) == "ftps://" ) { 571 | 572 | tf <- tempfile() 573 | on.exit(unlink(tf), add = TRUE) 574 | # if(verbose) 575 | # cat("Downloading", 576 | # dQuote.ascii(file), " to ", 577 | # dQuote.ascii(tf), "...\n") 578 | download.file(file, tf, mode = "wb") 579 | # if(verbose) cat("Done.\n") 580 | file <- tf 581 | } 582 | 583 | p <- proto(pf, file = file(file)) 584 | p <- do.call(proto, list(pf, file = file(file))) 585 | sqldf(sql, envir = p, file.format = file.format, dbname = dbname, drv = drv, ...) 586 | } 587 | 588 | read.csv2.sql <- function(file, sql = "select * from file", 589 | header = TRUE, sep = ";", row.names, eol, skip, filter, nrows, field.types, 590 | colClasses, dbname = tempfile(), drv = "SQLite", ...) { 591 | 592 | if (missing(filter)) { 593 | filter <- if (.Platform$OS.type == "windows") 594 | paste("cscript /nologo", normalizePath(system.file("trcomma2dot.vbs", package = "sqldf"))) 595 | else "tr , ." 
596 | } 597 | 598 | read.csv.sql(file = file, sql = sql, header = header, sep = sep, 599 | row.names = row.names, eol = eol, skip = skip, filter = filter, 600 | nrows = nrows, field.types = field.types, 601 | colClasses = colClasses, dbname = dbname, drv = drv) 602 | } 603 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | 2 | .onAttach <- function(libname, pkgname) { 3 | 4 | drv <- getOption("sqldf.driver") 5 | drv <- if (is.null(drv) || drv == "") { 6 | 7 | if ("package:RPostgreSQL" %in% search()) { 8 | "PostgreSQL" 9 | } else if ("package:RpgSQL" %in% search()) { 10 | "pgSQL" 11 | } else if ("package:RMySQL" %in% search()) { 12 | "MySQL" 13 | } else if ("package:RH2" %in% search()) { 14 | "H2" 15 | } else "SQLite" 16 | 17 | } else if (!tolower(drv) %in% c("pgsql", "mysql", "h2")) { 18 | "SQLite" 19 | } 20 | if (drv != "SQLite") { 21 | msg <- paste("sqldf will default to using", drv) 22 | packageStartupMessage(msg) 23 | } else { 24 | loadNamespace("RSQLite") 25 | } 26 | } 27 | 28 | # .onUnload <- function(libpath) {} 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | *To write it, it took three months; to conceive it – three minutes; to 2 | collect the data in it – all my life.* [F. Scott 3 | Fitzgerald](https://en.wikipedia.org/wiki/F._Scott_Fitzgerald) 4 | 5 | 6 | **Introduction** 7 | 8 | [sqldf](https://cran.r-project.org/package=sqldf) is an R 9 | package for runing [SQL statements](https://en.wikipedia.org/wiki/SQL) on 10 | R data frames, optimized for convenience. 
The user simply specifies an 11 | SQL statement in R using data frame names in place of table names and a 12 | database with appropriate table layouts/schema is automatically created, 13 | the data frames are automatically loaded into the database, the 14 | specified SQL statement is performed, the result is read back into R and 15 | the database is deleted all automatically behind the scenes making the 16 | database's existence transparent to the user who only specifies the SQL 17 | statement. Surprisingly this can at times 18 | [be](https://stackoverflow.com/questions/1727772/quickly-reading-very-large-tables-as-dataframes-in-r/1820610#1820610) 19 | [even](https://groups.google.com/group/manipulatr/browse_thread/thread/3affbdc5efca9143/d19d7b97ac023ee8?pli=1) 20 | [faster](http://web.archive.org/web/20130511223824/http://stat.ethz.ch/pipermail/r-help/2009-December/221456.html) 21 | [than](http://web.archive.org/web/20130604023900/stat.ethz.ch/pipermail/r-help/2009-December/221513.html) 22 | [the](https://stackoverflow.com/questions/14283566/specific-for-loop-too-slow-in-r/14287476#14287476) 23 | corresponding pure R calculation (although the purpose of the project is 24 | convenience and not speed). [This 25 | link](https://brusers.tumblr.com/post/59706993506/data-manipulation-with-sqldf-paul) 26 | suggests that for aggregations over highly granular columns that sqldf 27 | is faster than another alternative tried. `sqldf` is free software 28 | published under the GNU General Public License that can be downloaded 29 | from [CRAN](https://cran.r-project.org/package=sqldf). 30 | 31 | sqldf supports (1) the [SQLite](https://www.sqlite.org) backend database 32 | (by default), (2) the [H2](https://www.h2database.com) java database, (3) 33 | the [PostgreSQL](https://www.postgresql.org) database and (4) sqldf 0.4-0 34 | onwards also supports [MySQL](https://dev.mysql.com). SQLite, H2, MySQL 35 | and PostgreSQL are free software. 
SQLite and H2 are embedded serverless
zero administration databases that are included right in the R driver
packages,
[RSQLite](https://cran.r-project.org/package=RSQLite) and
[RH2](https://cran.r-project.org/package=RH2), so that
there is no separate installation for either one. A number of [high
profile projects](https://www.sqlite.org/famous.html) use SQLite.
H2 is a java database which contains a large collection of SQL functions
and supports Date and other data types. It is the most popular database
package among [scala
packages](http://blog.takipi.com/the-top-100-most-popular-scala-libraries-based-on-10000-github-projects/).
PostgreSQL is a client/server database and unlike SQLite and H2 must be
separately installed but it has a particularly powerful version of SQL,
e.g. its
[window](https://developer.postgresql.org/pgdocs/postgres/tutorial-window.html)
[functions](https://developer.postgresql.org/pgdocs/postgres/functions-window.html),
so the extra installation work can be worth it. sqldf supports the
`RPostgreSQL` driver in R. Like PostgreSQL, MySQL is a client/server
database that must be installed independently so it's not as easy to
install as SQLite or H2 but it's very popular and is widely used as the
back end for web sites.

The information below mostly concerns the default SQLite database. The
use of H2 with sqldf is discussed in [FAQ
\#10](https://code.google.com/p/sqldf/#10.__What_are_some_of_the_differences_between_using_SQLite_and_H)
which discusses differences between using sqldf with SQLite and H2 and
also shows how to modify the code in the [Examples](#Examples) section
to use sqldf/H2 rather than sqldf/SQLite. There is some information on
using PostgreSQL with sqldf in [FAQ
\#12](https://code.google.com/p/sqldf/#12._How_does_one_use_sqldf_with_PostgreSQL?)
and an example in [Example 17.
66 | Lag](https://code.google.com/p/sqldf/#Example_17._Lag) . The unit tests 67 | provide examples that can work with all five data base drivers (covering 68 | four databases) supported by sqldf. They are run by loading whichever 69 | database is to be tested (SQLite is the default) and running: 70 | `demo("sqldf-unitTests")` 71 | 72 | [Overview](#Overview) 73 | 74 | [Citing sqldf](#Citing_sqldf) 75 | 76 | [For Those New to R](#For_Those_New_to_R) 77 | 78 | [News](#news) 79 | 80 | [Troubleshooting](#troubleshooting) 81 | 82 | - [Problem is that installer gives message that sqldf is not 83 | available](#problem_is_that_installer_gives_message_that_sqldf_is_not_availa) 84 | - [Problem with no argument form of sqldf - 85 | sqldf()](#problem_with_no_argument_form_of_sqldf_-_sqldf()) 86 | - [Problem involvling tcltk](#problem_involvling_tcltk) 87 | 88 | [FAQ](#faq) 89 | 90 | - [1. How does sqldf handle classes and 91 | factors?](#1-how-does-sqldf-handle-classes-and-factors) 92 | - [2. Why does sqldf seem to mangle certain variable 93 | names?](#2-why-does-sqldf-seem-to-mangle-certain-variable-names) 94 | - [3. Why does sqldf("select var(x) from DF") not 95 | work?](#3-why-does-sqldfselect-varx-from-df-not-work) 96 | - [4. How does sqldf work with "Date" class 97 | variables?](#4-how-does-sqldf-work-with-date-class-variables) 98 | - [5. I get a message about the tcltk package being 99 | missing.](#5-i-get-a-message-about-the-tcltk-package-being-missing) 100 | - [6. Why are there problems when we use table names or column names 101 | that are the same except for 102 | case?](#6-why-are-there-problems-when-we-use-table-names-or-column-name) 103 | - [7. Why are there messages about 104 | MySQL?](#7-why-are-there-messages-about-mysql) 105 | - [8. Why am I having problems with 106 | update?](#8-why-am-I-having-problems-with-update) 107 | - [9. How do I examine the layout that SQLite uses for a table? which 108 | tables are in the database? 
which databases are 109 | attached?](#9-how-do-i-examine-the-layout-that-sqlite-uses-for-a-table-which-tables-are-in-the-database-which-databases-are-attached) 110 | - [10. What are some of the differences between using SQLite and H2 111 | with 112 | sqldf?](#10-what-are-some-of-the-differences-between-using-sqlite-and-h2-with-sqldf) 113 | - [11. Why am I having difficulty reading a data file using SQLite and 114 | sqldf?](#11-why-am-i-having-difficulty-reading-a-data-file-using-sqlite-and-sqldf) 115 | - [12. How does one use sqldf with 116 | PostgreSQL?](#12-how-does-one-use-sqldf-with-postgresql) 117 | - [13. How does one deal with quoted fields in read.csv.sql 118 | ?](#13-how-does-one-deal-with-quoted-fields-in-readcsvsql) 119 | - [14. How does one read files where numeric NAs are represented as 120 | missing empty 121 | fields?](#14-how-does-one-read-files-where-numeric-nas-are-represented-as-missing-empty-fields) 122 | - [15. Why do certain calculations come out as integer rather than 123 | double?](#15-why-do-certain-calculations-come-out-as-integer-rather-than-double) 124 | - [16. How can one read a file off the net or a csv file in a zip 125 | file?](#16-how-can-one-read-a-file-off-the-net-or-a-csv-file-in-a-zip-file) 126 | 127 | [Examples](#examples) 128 | 129 | - [Example 1. Ordering and 130 | Limiting](#example-1-ordering-and-limiting) 131 | - [Example 2. Averaging and 132 | Grouping](#example-2-averaging-and-grouping) 133 | - [Example 3. Nested Select](#example-3-nested-select) 134 | - [Example 4. Join](#example-4-join) 135 | - [Example 5. Insert Variables](#example-5-insert-variables) 136 | - [Example 6. File Input](#example-6-file-input) 137 | - [Example 7. Nested Select](#example-7-nested-select) 138 | - [Example 8. Specifying File 139 | Format](#example-8-specifying-file-format) 140 | - [Example 9. Working with 141 | Databases](#example-9-working-with-databases) 142 | - [Example 10. 
Persistent 143 | Connections](#example-10-persistent-connections) 144 | - [Example 11. Between and 145 | Alternatives](#example-11-between-and-alternatives) 146 | - [Example 12. Combine two files in permanent 147 | database](#example-12-combine-two-files-in-permanent-database) 148 | - [Example 13. read.csv.sql and 149 | read.csv2.sql](#example-13-readcsvsql-and-readcsv2sql) 150 | - [Example 14. Use of spatialite library 151 | functions](#example-14-use-of-spatialite-library-functions) 152 | - [Example 15. Use of RSQLite.extfuns library 153 | functions](#example-15-use-of-rsqliteextfuns-library-functions) 154 | - [Example 16. Moving Average](#example-16-moving-average) 155 | - [Example 17. Lag](#example-17-lag) 156 | - [Example 18. MySQL Schema 157 | Information](#Example-18-mysql-schema-information) 158 | 159 | [Links](#links) 160 | 161 | Overview[](#overview) 162 | ===================== 163 | 164 | [sqldf](https://cran.r-project.org/package=sqldf) is an R 165 | package for running [SQL statements](https://en.wikipedia.org/wiki/SQL) 166 | on R data frames, optimized for convenience. `sqldf` works with the 167 | [SQLite](https://www.sqlite.org/), [H2](https://www.h2database.com), 168 | [PostgreSQL](https://www.postgresql.org) or 169 | [MySQL](https://dev.mysql.com/doc/) databases. SQLite has the least 170 | prerequisites to install. H2 is just as easy if you have Java installed 171 | and also supports Date class and a few additional functions. PostgreSQL 172 | notably supports Windowing functions providing the SQL analogue of the R 173 | ave function. MySQL is a particularly popular database that drives many 174 | web sites. 175 | 176 | More information can be found from within R by installing and loading 177 | the sqldf package and then entering 178 | [?sqldf](https://cran.r-project.org/package=sqldf/sqldf.pdf) and 179 | [?read.csv.sql](https://cran.r-project.org/package=sqldf/sqldf.pdf). 
180 | A number of [examples](#Examples) are on this page and more examples are 181 | accessible from within R in the examples section of the 182 | [?sqldf](https://cran.r-project.org/package=sqldf/sqldf.pdf) help 183 | page. 184 | 185 | As seen from this example which uses the built in `BOD` data frame: 186 | 187 | ```r 188 | library(sqldf) 189 | sqldf("select * from BOD where Time > 4") 190 | ``` 191 | 192 | with `sqldf` the user is freed from having to do the following, all of 193 | which are automatically done: 194 | 195 | - database setup 196 | - writing the `create table` statement which defines each table 197 | - importing and exporting to and from the database 198 | - coercing of the returned columns to the appropriate class in common 199 | cases 200 | 201 | It can be used for: 202 | 203 | - learning SQL if you know R 204 | - learning R if you know SQL 205 | - as an alternate syntax for data frame manipulation, particularly for 206 | purposes of speeding these up, since sqldf with SQLite as the 207 | underlying database is often faster than performing the same 208 | manipulations in straight R 209 | - reading portions of large files into R without reading the entire 210 | file (example 6b and example 13 below show two different ways and 211 | examples 6e, 6f below show how to read random portions of a file) 212 | 213 | In the case of SQLite it consists of a thin layer over the 214 | [RSQLite](https://cran.r-project.org/package=RSQLite) 215 | [DBI](https://cran.r-project.org/package=DBI) interface to SQLite 216 | itself. 217 | 218 | In the case of H2 it works on top of the 219 | [RH2](https://cran.r-project.org/package=RH2) 220 | [DBI](https://cran.r-project.org/package=DBI) driver which in turn 221 | uses RJDBC and JDBC to interface to H2 itself. 222 | 223 | In the case of PostgreSQL it works on top of the 224 | [RPostgreSQL](https://cran.r-project.org/package=RPostgreSQL) 225 | [DBI](https://cran.r-project.org/package=DBI) driver. 
226 | 227 | There is also some untested code in sqldf for use with the 228 | [MySQL](https://www.mysql.com) database using the 229 | [RMySQL](https://cran.r-project.org/package=RMySQL) 230 | [DBI](https://cran.r-project.org/package=DBI) driver. 231 | 232 | Citing sqldf[](#Citing_sqldf) 233 | ============================= 234 | 235 | To get information on how to cite `sqldf` in papers, issue the R 236 | commands: 237 | 238 | ```r 239 | library(sqldf) 240 | citation("sqldf") 241 | ``` 242 | 243 | For Those New to R[](#For_Those_New_to_R) 244 | ========================================= 245 | 246 | If you have not used R before and want to try sqldf with SQLite, [google 247 | for single letter R](https://www.r-project.org), download R, install it 248 | on Windows, Mac or UNIX/Linux and then start R and at R console enter 249 | this: 250 | 251 | ```r 252 | # installs everything you need to use sqldf with SQLite 253 | # including SQLite itself 254 | install.packages("sqldf") 255 | # shows built in data frames 256 | data() 257 | # load sqldf into workspace 258 | library(sqldf) 259 | sqldf("select * from iris limit 5") 260 | sqldf("select count(*) from iris") 261 | sqldf("select Species, count(*) from iris group by Species") 262 | # create a data frame 263 | DF <- data.frame(a = 1:5, b = letters[1:5]) 264 | sqldf("select * from DF") 265 | sqldf("select avg(a) mean, variance(a) var from DF") # see example 15 266 | ``` 267 | 268 | To try it with H2 rather than SQLite the process is similar. Ensure that 269 | you have the [java](https://java.sun.com) runtime installed, install R as 270 | above and start R. 
From within R enter this ensuring that the version of 271 | RH2 that you have is RH2 0.1-2.6 or later: 272 | 273 | ```r 274 | # installs everything including H2 275 | install.packages("sqldf", dep = TRUE) 276 | # load RH2 driver and sqldf into workspace 277 | library(RH2) 278 | packageVersion("RH2") # should be version 0.1-2-6 or later 279 | library(sqldf) 280 | # 281 | sqldf("select * from iris limit 5") 282 | sqldf("select count(*) from iris") 283 | sqldf("select Species, count(*) from iris group by Species") 284 | DF <- data.frame(a = 1:5, b = letters[1:5]) 285 | sqldf("select * from DF") 286 | sqldf("select avg(a) mean, var_samp(a) var from DF") 287 | ``` 288 | 289 | Troubleshooting[](#Troubleshooting) 290 | =================================== 291 | 292 | sqldf has been 293 | [extensively](https://cran.r-project.org/web/checks/check_results_sqldf.html) 294 | [tested](https://code.google.com/p/sqldf/source/browse/trunk/inst/unitTests/runit.all.R) 295 | with multiple architectures and database back ends but there are no 296 | guarantees. 297 | 298 | Problem is that installer gives message that sqldf is not available[](#Problem_is_that_installer_gives_message_that_sqldf_is_not_availa) 299 | ---------------------------------------------------------------------------------------------------------------------------------------- 300 | 301 | See 302 | [https://stackoverflow.com/questions/27772756/sqldf-doesnt-install-on-ubuntu-14-04](https://stackoverflow.com/questions/27772756/sqldf-doesnt-install-on-ubuntu-14-04) 303 | 304 | Problem with no argument form of sqldf - sqldf()[](#Problem_with_no_argument_form_of_sqldf_-_sqldf()) 305 | ----------------------------------------------------------------------------------------------------- 306 | 307 | The no argument form, i.e. `sqldf()` is used for opening and closing a 308 | connection so that intermediate sqldf statements can all use the same 309 | connection. 
If you have forgotten whether the last `sqldf()` opened or 310 | closed the connection this code will close it if it is open and 311 | otherwise do nothing: 312 | 313 | ```r 314 | # close an old connection if it exists 315 | if (!is.null(getOption("sqldf.connection"))) sqldf() 316 | ``` 317 | 318 | Thanks to Chris Davis 319 | [https://groups.google.com/d/msg/sqldf/-YAvaJnlRrY/7nF8tpBnrcAJ](https://groups.google.com/d/msg/sqldf/-YAvaJnlRrY/7nF8tpBnrcAJ) 320 | for pointing this out. 321 | 322 | Problem involvling tcltk[](#Problem_involvling_tcltk) 323 | ----------------------------------------------------- 324 | 325 | The most common problem is that the tcltk package and tcl/tk itself are 326 | missing. Historically these were bundled with the Windows version of R 327 | so Windows users should not experience any problems on this account. 328 | Since R version 3.0.0 Mac versions of R also have the tcltk package and 329 | Tcl/Tk itself bundled so if you are having a problem on the Mac you may 330 | only need to upgrade to the latest version of R. If upgrading to the 331 | latest version of R does not help then using this line will usually 332 | allow it to work even without the tcltk package and tcl/tk itself: 333 | 334 | ```r 335 | options(gsubfn.engine = "R") 336 | ``` 337 | 338 | Running the above `options` line before using `sqldf`, e.g. put that 339 | options line in your `.Rprofile`, is all that is needed to get sqldf to 340 | work without the tcltk package and tcl/tk itself in most cases; however, 341 | this does have the downside that it will use the R engine which is 342 | slower. An alternative, is to rebuild R yourself as discussed here: 343 | [https://permalink.gmane.org/gmane.comp.lang.r.fedora/235](https://permalink.gmane.org/gmane.comp.lang.r.fedora/235) 344 | 345 | If the above does not resolve the problem then read the more detailed 346 | discussion below. 
A related problem is that your R installation is flawed or incomplete in some way and the main way to fix that is to fix your installation of R.
(2) **tcl/tk missing on 377 | system** (a) If your distribution of R was built with tcltk 378 | capaility as described in the last point but you don't have tcl/tk 379 | itself on your system you can simply install tcl/tk yourself. In 380 | most cases this is actually quite easy to do -- its typically a one 381 | line apt-get on Linux. There is information about installing tcl/tk 382 | near the end of [FAQ 383 | \#5](#5._I_get_a_message_about_the_tcltk_package_being_missing.) or 384 | (b) if your distribution of R was built with tcltk capability as 385 | described in the first point but you don't have tcl/tk on your 386 | system and you don't want to bother to install it then issue the R 387 | command: 388 | 389 | In that case gusbfn will use the slower R engine instead of the faster 390 | tcltk engine so you won't need tcl/tk installed on your system in the 391 | first place. Be sure you are using gsubfn 0.6-4 or later if you use this 392 | option since prior versions of gsubfn had a bug which could interfere 393 | with the use of this option. To check your version of gsubfn: 394 | 395 | ```r 396 | packageVersion("gsubfn") 397 | ``` 398 | 399 | - using an old version of R, sqldf or some other software. If that is 400 | the problem upgrade to the most recent versions [on 401 | CRAN](https://cran.r-project.org/package=sqldf). Also 402 | be sure you are using the latest versions of other packages used by 403 | sqldf. If you are getting NAMESPACE errors then this is likely the 404 | problem. You can find the current version of R 405 | [here](https://cran.r-project.org/mirrors.html) and then install 406 | sqldf from within R using `install.packages("sqldf")` . If you 407 | already have the current version of R and have installed the 408 | packages you want then you can update your installed packages to the 409 | current version by entering this in R: `update.packages()` . 
In most 410 | cases all the mirrors are up to date but if that should fail to 411 | update to the most recent packages on CRAN then try using a more up 412 | to date mirror. 413 | 414 | - unexpected errors concerning H2, MySQL or PostgreSQL. sqldf 415 | automatically uses H2, MySQL or PostgreSQL if the R package RH2, 416 | RMySQL or RpgSQL is loaded, respectively. If none of them are loaded 417 | it uses sqlite. To force it to use sqlite even though one of those 418 | others is loaded (1) add the `drv = "SQLite"` argument to each sqldf 419 | call or (2) issue the R command: 420 | 421 | in which case all sqldf calls will use sqlite. See [FAQ 422 | \#7](#7._Why_are_there_messages_about_MySQL?) for more info. 423 | 424 | - message about tcltk being missing or other tcltk problem. This is 425 | really the same problem discussed in the first point above. Upgrade 426 | to sqldf 0.4-5 or later. If it still persists then set this option: 427 | `options(gsubfn.engine = "R")` which causes R code to be substituted 428 | for the tcl code or else just install the tcltk package. See [FAQ 429 | \#5](#5._I_get_a_message_about_the_tcltk_package_being_missing.) for 430 | more info. If you installed the tcltk package and it still has 431 | problems then remove the tcltk package and try these steps again. 432 | 433 | - error messages regarding a data frame that has a dot in its name. 434 | The dot is an SQL operator. Either quote the name appropriately or 435 | change the name of the data frame to one without a dot. 436 | 437 | - as recommended in the 438 | [INSTALL](https://cran.r-project.org/package=sqldf/INSTALL) file 439 | its better to install sqldf using `install.packages("sqldf")` and 440 | **not** `install.packages("sqldf", dep = TRUE)` since the latter 441 | will try to pull in every R database driver package supported by 442 | sqldf which increases the likelihood of a problem with installation. 
It's unlikely that you need every database that sqldf supports so doing this is really asking for trouble. The recommended way does install SQLite automatically anyway and if you want any of the additional ones just install them separately.
476 | 477 | Previously this section had an example of how the heuristic could go 478 | awry but improvements in the heuristic in sqldf 0.4-0 are such that that 479 | example now works as expected. 480 | 481 | 2. Why does sqldf seem to mangle certain variable names?[](#2._Why_does_sqldf_seem_to_mangle_certain_variable_names?) 482 | --------------------------------------------------------------------------------------------------------------------- 483 | 484 | Staring with RSQLite 1.0.0 and sqldf 0.4-9 dots in column names are no 485 | longer translated to underscores. 486 | 487 | If you are using an older version of these packages then note that since 488 | dot is an SQL operator the RSQLite driver package converts dots to 489 | underscores so that SQL statements can reference such columns unquoted. 490 | 491 | Also note that certain names are SQL keywords. These can be found using 492 | this code: 493 | 494 | ```r 495 | .SQL92Keywords 496 | ``` 497 | 498 | Note that using such names can sometimes result in an error message such 499 | as: 500 | 501 | ```r 502 | Error in sqliteExecStatement(con, statement, bind.data) : 503 | RS-DBI driver: (error in statement: no such column: ...) 504 | ``` 505 | 506 | which appears to suggest that there is no column but that is because it 507 | has a different name than expected. 
For an example of what happens: 508 | 509 | ```r 510 | > # this only applies to old versions of sqldf and DBI 511 | > # based on example by Adrian Dragulescu 512 | > DF <- data.frame(index=1:12, date=rep(c(Sys.Date()-1, Sys.Date()), 6), 513 | + group=c("A","B","C"), value=round(rnorm(12),2)) 514 | > 515 | > library(sqldf) 516 | > sqldf("select * from DF") 517 | index date group value 518 | 1 1 14259.0 A -0.24 519 | 2 2 14260.0 B 0.16 520 | 3 3 14259.0 C 1.24 521 | 4 4 14260.0 A -1.16 522 | 5 5 14259.0 B -0.19 523 | 6 6 14260.0 C 0.65 524 | 7 7 14259.0 A -1.24 525 | 8 8 14260.0 B -0.34 526 | 9 9 14259.0 C -0.27 527 | 10 10 14260.0 A -0.18 528 | 11 11 14259.0 B 0.57 529 | 12 12 14260.0 C -0.83 530 | > intersect(names(DF), tolower(.SQL92Keywords)) 531 | [1] "index" "date" "group" "value" 532 | > DF2 <- DF 533 | > # change column names to i, d, g and v 534 | > names(DF2) <- substr(names(DF), 1, 1) 535 | > sqldf("select * from DF2") 536 | i d g v 537 | 1 1 2009-01-16 A 0.35 538 | 2 2 2009-01-17 B -0.96 539 | 3 3 2009-01-16 C 0.76 540 | 4 4 2009-01-17 A 0.07 541 | 5 5 2009-01-16 B 0.03 542 | 6 6 2009-01-17 C 0.19 543 | 7 7 2009-01-16 A -2.03 544 | 8 8 2009-01-17 B 0.98 545 | 9 9 2009-01-16 C -1.21 546 | 10 10 2009-01-17 A -0.67 547 | 11 11 2009-01-16 B 2.49 548 | 12 12 2009-01-17 C -0.63 549 | ``` 550 | 551 | 3. Why does sqldf("select var(x) from DF") not work?[](#3._Why_does_sqldf("select_var(x)_from_DF")_not_work?) 552 | ------------------------------------------------------------------------------------------------------------- 553 | 554 | The SQL statement passed to sqldf must be a valid SQL statement 555 | understood by the database. The functions that are understood include 556 | simple SQLite functions and aggregate SQLite functions and functions in 557 | the 558 | [RSQLite.extfuns](https://code.google.com/p/sqldf/#Example_15._Use_of_RSQLite.extfuns_library_functions) 559 | package. 
Thus in this case in place of var(x) one could use variance(x) 560 | from the RSQLite.extfuns package. For SQLite functions see the lists of 561 | [core functions](https://www.sqlite.org/lang_corefunc.html), [aggregate 562 | functions](https://www.sqlite.org/lang_aggfunc.html) and [date and time 563 | functions](https://www.sqlite.org/lang_datefunc.html). 564 | 565 | If each group is not too large we can use group\_concat to return all 566 | group members and then later use `apply` in `R` to use R functions to 567 | aggregate results. For example, in the following we summarize the data 568 | using `sqldf` and then `apply` a function based on `var`: 569 | 570 | ```r 571 | > DF <- data.frame(a = 1:8, g = gl(2, 4)) 572 | > out <- sqldf("select group_concat(a) groupa from DF group by g") 573 | > out 574 | groupa 575 | 1 1,2,3,4 576 | 2 5,6,7,8 577 | > out$var <- apply(out, 1, function(x) var(as.numeric(strsplit(x, ",")[[1]]))) 578 | > out 579 | groupa var 580 | 1 1,2,3,4 1.666667 581 | 2 5,6,7,8 1.666667 582 | ``` 583 | 584 | 4. How does sqldf work with "Date" class variables?[](#4._How_does_sqldf_work_with_"Date"_class_variables?) 585 | ----------------------------------------------------------------------------------------------------------- 586 | 587 | The H2 database has specific support for Date class variables so with H2 588 | Date class variables work as expected: 589 | 590 | ```r 591 | > library(RH2) # driver support for dates was added in RH2 version 0.1-2 592 | > library(sqldf) 593 | > test1 <- data.frame(sale_date = as.Date(c("2008-08-01", "2031-01-09", 594 | + "1990-01-03", "2007-02-03", "1997-01-03", "2004-02-04"))) 595 | > as.numeric(test1[[1]]) 596 | [1] 14092 22288 7307 13547 9864 12452 597 | > sqldf("select MAX(sale_date) from test1") 598 | MAX..sale_date.. 599 | 1 2031-01-09 600 | ``` 601 | 602 | In R, `Date` class dates are stored internally as the number of days 603 | since 1970-01-01 -- often referred to as the UNIX Epoch. 
(They are 604 | stored this way on non-UNIX platforms as well.) When the dates are 605 | transferred to SQLite they are stored as these numbers in SQLite. (sqldf 606 | has a heuristic that attempts to ascertain whether the column represents 607 | a Date but if it cannot ascertain this then it returns the numeric 608 | internal version.) 609 | 610 | In SQLite this is what happens: 611 | 612 | The examples below use RSQLite 0.11-0 (prior to that version they would 613 | return wrong answers. With RSQLite it will return the correct answer but 614 | Date class columns will be returned as numeric if sqldf's heuristic 615 | cannot automatically determine if they are to be of class `"Date"`. If 616 | you name the output column the same name as an input column which has 617 | `"Date"` class then it will correctly infer that the output is to be of 618 | class `"Date"` as well. 619 | 620 | ```r 621 | > library(sqldf) 622 | > test1 <- data.frame(sale_date = as.Date(c("2008-08-01", "2031-01-09", 623 | + "1990-01-03", "2007-02-03", "1997-01-03", "2004-02-04"))) 624 | 625 | > as.numeric(test1[[1]]) 626 | [1] 14092 22288 7307 13547 9864 12452 627 | 628 | > # correct except that it returns the numeric internal representation 629 | > dd <- sqldf("select max(sale_date) from test1") 630 | > dd 631 | max(sale_date) 632 | 1 22288 633 | 634 | > # fix it up 635 | > dd[[1]] <- as.Date(dd[[1]], "1970-01-01") 636 | > dd 637 | max(sale_date) 638 | 1 2031-01-09 639 | 640 | > # even better it returns Date class if we name column same as a Date class input column 641 | > sqldf("select max(sale_date) sale_date from test1") 642 | sale_date 643 | 1 2031-01-09 644 | ``` 645 | 646 | Also note this code: 647 | 648 | ```r 649 | > library(sqldf) 650 | > DF <- data.frame(a = Sys.Date() + 1:5, b = 1:5) 651 | > DF 652 | a b 653 | 1 2009-07-31 1 654 | 2 2009-08-01 2 655 | 3 2009-08-02 3 656 | 4 2009-08-03 4 657 | 5 2009-08-04 5 658 | > Sys.Date() + 2 659 | [1] "2009-08-01" 660 | > s <- sprintf("select * 
from DF where a >= %d", Sys.Date() + 2) 661 | > s 662 | [1] "select * from DF where a >= 14457" 663 | > sqldf(s) 664 | a b 665 | 1 2009-08-01 2 666 | 2 2009-08-02 3 667 | 3 2009-08-03 4 668 | 4 2009-08-04 5 669 | 670 | > # to compare against character string store a as character 671 | > DF2 <- transform(DF, a = as.character(a)) 672 | > sqldf("select * from DF2 where a >= '2009-08-01'") 673 | a b 674 | 1 2009-08-01 2 675 | 2 2009-08-02 3 676 | 3 2009-08-03 4 677 | 4 2009-08-04 5 678 | ``` 679 | 680 | See [date and time functions](https://www.sqlite.org/lang_datefunc.html) 681 | for more information. An example using times but not dates can be found 682 | [here](https://stackoverflow.com/questions/8185201/merge-records-over-time-interval/8187602#8187602) 683 | and some discussion on using POSIXct can be found 684 | [here](https://groups.google.com/d/msg/sqldf/N-Xci-eKy3Y/faLa1siY6xYJ) . 685 | 686 | 5. I get a message about the tcltk package being missing.[](#5._I_get_a_message_about_the_tcltk_package_being_missing.) 687 | ----------------------------------------------------------------------------------------------------------------------- 688 | 689 | The sqldf package uses the gsubfn package for parsing and the gsubfn 690 | package optionally uses the tcltk R package which in turn uses string 691 | processing language, tcl, internally. 692 | 693 | If you are getting erorrs about the tcltk R package being missing or 694 | about tcl/tk itself being missing then: 695 | 696 | Windows. This should not occur on Windows with the standard 697 | distributions of R. If it does you likely have a version of R that was 698 | built improperly and you will have to get a complete properly built 699 | version of R that was built to work with tcltk and tcl/tk and includes 700 | tcl/tk itself. 701 | 702 | Mac. This should not occur on **recent** versions of R on Mac. If it 703 | does occur upgrade your R installation to a recent version. 
If you must 704 | use an older version of R on the Mac then get tcl/tk here: 705 | [http://cran.us.r-project.org/bin/macosx/tools/](http://cran.us.r-project.org/bin/macosx/tools/) 706 | 707 | UNIX/Linux. If you don't already have tcl/tk itself on your system try 708 | installing it like this (thanks to Eric Iverson): 709 | 710 | ```r 711 | sudo apt-get install tcl-dev tk-dev 712 | ``` 713 | 714 | Also see this message by Rolf Turner: 715 | [https://stat.ethz.ch/pipermail/r-help/2011-April/274424.html](https://stat.ethz.ch/pipermail/r-help/2011-April/274424.html). 716 | 717 | In some cases it may be possible to bypass the need for tcltk and tcl/tk 718 | altogether by running this command before you run sqldf: 719 | 720 | ```r 721 | options(gsubfn.engine = "R") 722 | ``` 723 | 724 | In that case the gsubfn package will use alternate R code instead of 725 | tcltk (however, it will be slightly slower). 726 | 727 | Notes: sqldf depends on gsubfn for parsing and gsubfn optionally uses 728 | the tcltk R package (tcl is a string processing language) which is 729 | supposed to be included in every R installation. The tcltk R package 730 | relies on tcl/tk itself which is included in all standard distributions 731 | of R on Windows and on **recent** Mac distributions of R. Many Linux 732 | distributions include tcl/tk itself right in the Linux distribution 733 | itself. 734 | 735 | Also note that whatever build of R you are using must have had tcl/tk 736 | present at the time R was built (not just at the time it's used) or else 737 | the R build process will automatically turn off tcltk capability within 738 | R. If that is the case supplying tcltk and tcl/tk later won't help. You 739 | must use a build of R that has tcltk capability built in. (If the R was 740 | built with tcltk capability then adding the tcltk package (if it's 741 | missing) and tcl/tk will work.) 742 | 743 | 6. 
Why are there problems when we use table names or column names that are the same except for case?[](#6._Why_are_there_problems_when_we_use_table_names_or_column_name) 744 | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 745 | 746 | SQL is case insensitive so table names `a` and `A` are the same as far 747 | as SQLite is concerned. Note that in the example below it did produce a 748 | warning that something is wrong although that might not be the case in 749 | all situations. 750 | 751 | ```r 752 | > a <- data.frame(x = 1:2) 753 | > A <- data.frame(y = 11:12) 754 | > sqldf("select * from a a1, A a2") 755 | x x 756 | 1 1 1 757 | 2 1 1 758 | 3 2 2 759 | 4 2 2 760 | Warning message: 761 | In value[[3L]](cond) : 762 | RS-DBI driver: (error in statement: table `A` already exists) 763 | ``` 764 | 765 | 7. Why are there messages about MySQL?[](#7._Why_are_there_messages_about_MySQL?) 766 | --------------------------------------------------------------------------------- 767 | 768 | sqldf can use several different databases. The database is specified in 769 | the `drv=` argument to the `sqldf` function. If `drv=` is not specified 770 | then it uses the value of the `"sqldf.driver"` global option to 771 | determine which database to use. If that is not specified either then if 772 | the RPostgreSQL, RMySQL or RH2 package is loaded (it checks in that 773 | order) it uses the associated database and otherwise uses SQLite. Thus 774 | if you do not specify the database and you have one of those packages 775 | loaded it will think you intended to use that database. If it's likely 776 | that you will have one of these packages loaded but you do not want to 777 | use that package with sqldf be sure to set the sqldf.driver option, e.g. 778 | `options(sqldf.driver = "SQLite")` . 779 | 780 | 8. 
Why am I having problems with update?[](#8._Why_am_I_having_problems_with_update?) 781 | ------------------------------------------------------------------------------------- 782 | 783 | Although data frames referenced in the SQL statement(s) passed to sqldf 784 | are automatically imported to SQLite, sqldf does not automatically 785 | export anything for safety reasons. Thus if you update a table using 786 | sqldf you must explicitly return it as shown in the examples below. 787 | 788 | Note that in the select statement we referred to the table as `main.DF` 789 | (`main` is always the name of the sqlite database.) If we had referred 790 | to the table as `DF` (without qualifying it as being in `main`) sqldf 791 | would have fetched `DF` from our R workspace rather than using the 792 | updated one in the sqlite database. 793 | 794 | ```r 795 | > DF <- data.frame(a = 1:3, b = c(3, NA, 5)) 796 | > sqldf(c("update DF set b = a where b is null", "select * from main.DF")) 797 | a b 798 | 1 1 3 799 | 2 2 2 800 | 3 3 5 801 | ``` 802 | 803 | One other problem can arise if the data has factors. Here we would 804 | normally get the wrong result because we are asking it to add a value to 805 | column `b` that is not among the factor levels in `b` but by using 806 | `method = "raw"` we can tell it not to automatically assign classes to 807 | the result. 808 | 809 | ```r 810 | > DF <- data.frame(a = 1:3, b = factor(c(3, NA, 5))); DF 811 | a b 812 | 1 1 3 813 | 2 2 814 | 3 3 5 815 | > sqldf(c("update DF set b = a where b is null", "select * from main.DF"), method = "raw") 816 | a b 817 | 1 1 3 818 | 2 2 2 819 | 3 3 5 820 | ``` 821 | 822 | Another way around this is to avoid the entire problem in the first 823 | place by not using a factor for `b`. If we had defined column `b` as 824 | character or numeric instead of factor then we would not have had to 825 | specify `method = "raw"`. 826 | 827 | 9. How do I examine the layout that SQLite uses for a table? 
which tables are in the database? which databases are attached?[](#9._How_do_I_examine_the_layout_that_SQLite_uses_for_a_table?_whi) 828 | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 829 | 830 | Try these approaches to get the indicated meta data: 831 | 832 | ```r 833 | > # a. what is the layout of the BOD table? 834 | > sqldf("pragma table_info(BOD)") 835 | cid name type notnull dflt_value pk 836 | 1 0 Time REAL 0 0 837 | 2 1 demand REAL 0 0 838 | 839 | > # b. which tables are in current database and what is their layout? 840 | > sqldf(c("select * from BOD", "select * from sqlite_master")) 841 | type name tbl_name rootpage 842 | 1 table BOD BOD 2 843 | sql 844 | 1 CREATE TABLE `BOD` \n( "Time" REAL,\n\tdemand REAL \n) 845 | 846 | > # c. which databases are attached? (This says only 'main' is attached.) 847 | > sqldf("pragma database_list") 848 | seq name file 849 | 1 0 main 850 | 851 | > # d. which version of sqlite is being used? 852 | > sqldf("select sqlite_version()") 853 | sqlite_version() 854 | 1 3.7.17 855 | ``` 856 | 857 | 10. What are some of the differences between using SQLite and H2 with sqldf?[](#10.__What_are_some_of_the_differences_between_using_SQLite_and_H) 858 | ------------------------------------------------------------------------------------------------------------------------------------------------- 859 | 860 | sqldf will use the H2 database instead of sqlite if the 861 | [RH2](https://cran.r-project.org/package=RH2/) package is loaded. 862 | Features supported by H2 not supported by SQLite include Date class 863 | columns and certain 864 | [functions](https://www.h2database.com/html/functions.html) such as 865 | VAR\_SAMP, VAR\_POP, STDDEV\_SAMP, STDDEV\_POP, various XML functions 866 | and CSVREAD. 
867 | 868 | **Note that the examples below require RH2 0.1-2.6 or later.** 869 | 870 | Here are some commands. The meta commands here are specific to H2 (for 871 | SQLite's meta data commands see 872 | [FAQ\#9](#9._How_do_I_examine_the_layout_that_SQLite_uses_for_a_table?_whi)): 873 | 874 | ```r 875 | library(RH2) # this package contains the H2 database and an R driver 876 | library(sqldf) 877 | sqldf("select avg(demand) mean, stddev_pop(demand) from BOD where Time > 4") 878 | sqldf('select Species, "Sepal.Length" from iris limit 3') # Sepal.Length has dot 879 | sqldf("show databases") 880 | sqldf("show tables") 881 | sqldf("show tables from INFORMATION_SCHEMA") 882 | sqldf("select * from INFORMATION_SCHEMA.settings") 883 | sqldf("select * FROM INFORMATION_SCHEMA.indexes") 884 | sqldf("select VALUE from INFORMATION_SCHEMA.SETTINGS where NAME = 'info.VERSION'") 885 | sqldf("show columns from BOD") 886 | sqldf("select H2VERSION()") # this requires a later version of H2 than comes with RH2 887 | ``` 888 | 889 | If RH2 is loaded then it will use H2 so if you wish to use SQLite 890 | anyways then either use the drv= argument to sqldf: 891 | 892 | ```r 893 | sqldf("select * from BOD", drv = "SQLite") 894 | ``` 895 | 896 | or set the following global option: 897 | 898 | ```r 899 | options(sqldf.driver = "SQLite") 900 | ``` 901 | 902 | When using H2: 903 | 904 | - in H2 a column such as Sepal.Length is not converted to 905 | Sepal\_Length (which older versions of RSQLite do) but remains as 906 | Sepal.Length. For example, 907 | 908 | Also sqlite orders the result above even without the order clause and h2 909 | translates "Sepal Length" to Sepal.Length . 910 | 911 | - quoting rules in H2 are stricter than in SQLite. In H2, to quote an 912 | identifier use double quotes whereas to quote a constant use single 913 | quotes. 914 | 915 | - file objects are not supported. 
They are not really needed because 916 | H2 supports a 917 | [CSVREAD](https://www.h2database.com/html/functions.html#csvread) 918 | function. Note that on Windows one can use the R notation \~ to 919 | refer to the home directory when specifying filenames if using 920 | SQLite but not with CSVREAD in H2. 921 | 922 | - currently the only SQL statements supported by sqldf when using H2 923 | are select, show and call (whereas all are supported with SQLite). 924 | 925 | - H2 does not support the using clause in SQL select statements but 926 | does support on. Also it implicitly uses `on` rather than `using` in 927 | natural joins which means that selected and where condition 928 | variables that are merged in natural joins must be qualified in H2 929 | but need not be in SQLite. 930 | 931 | The examples in the Examples section are redone below using H2. Where H2 932 | does not support the operation the SQLite code is given instead. Note 933 | that this section is a bit out of date and some of the items that it 934 | says are not supported actually are supported now. 935 | 936 | ```r 937 | # 1 938 | sqldf('select * from iris order by "Sepal.Length" desc limit 3') 939 | 940 | # 2 941 | sqldf('select Species, avg("Sepal.Length") from iris group by Species') 942 | 943 | # 3 944 | sqldf('select iris.Species "[Species]", 945 | avg("Sepal.Length") "[Avg of SLs > avg SL]" 946 | from iris, 947 | (select Species, avg("Sepal.Length") SLavg 948 | from iris group by Species) SLavg 949 | where iris.Species = SLavg.Species 950 | and "Sepal.Length" > SLavg 951 | group by iris.Species') 952 | 953 | # 4 954 | Abbr <- data.frame(Species = levels(iris$Species), 955 | Abbr = c("S", "Ve", "Vi")) 956 | 957 | # 4a. This works: 958 | sqldf('select iris.Species, count(*) 959 | from iris natural join Abbr group by iris.Species') 960 | 961 | # but this does not work (but does in sqlite) ### 962 | sqldf('select Abbr, count(*) 963 | from iris natural join Abbr group by Species') 964 | 965 | # 4b. 
H2 does not support using but does support on (but query is longer) ### 966 | sqldf('select Abbr, count(*) 967 | from iris join Abbr on iris.Species = Abbr.Species group by iris.Species') 968 | 969 | # 4c. 970 | sqldf('select Abbr, avg("Sepal.Length") from iris, Abbr 971 | where iris.Species = Abbr.Species group by iris.Species') 972 | 973 | # 4d. # This still needs to be fixed. # 974 | out <- sqldf("select s.Species, s.dt, t.Station_id, t.Value 975 | from species s, temp t 976 | where ABS(s.dt - t.dt) = 977 | (select min(abs(s2.dt - t2.dt)) 978 | from species s2, temp t2 979 | where s.Species = s2.Species and t.Station_id = t2.Station_id)") 980 | 981 | # 4e. H2 does not support using but we can use on (but query is longer) ### 982 | # Also the missing value in x seems to get filled with 0 rather than NA ### 983 | SNP1x <- structure(list(Animal = c(194073197L, 194073197L, 194073197L, 984 | 194073197L, 194073197L), 985 | Marker = structure(1:5, 986 | .Label = c("P1001", "P1002", "P1004", "P1005", "P1006", "P1007"), 987 | class = "factor"), 988 | x = c(2L, 1L, 2L, 0L, 2L)), 989 | .Names = c("Animal", "Marker", "x"), 990 | row.names = c("3213", "1295", "915", "2833", "1487"), class = "data.frame") 991 | SNP4 <- structure(list(Animal = c(194073197L, 194073197L, 194073197L, 992 | 194073197L, 194073197L, 194073197L), 993 | Marker = structure(1:6, .Label = c("P1001", 994 | "P1002", "P1004", "P1005", "P1006", "P1007"), class = "factor"), 995 | Y = c(0.021088, 0.021088, 0.021088, 0.021088, 0.021088, 0.021088)), 996 | .Names = c("Animal", "Marker", "Y"), class = "data.frame", 997 | row.names = c("3213", "1295", "915", "2833", "1487", "1885")) 998 | 999 | sqldf("select SNP4.Animal, SNP4.Marker, Y, x 1000 | from SNP4 left join SNP1x 1001 | on SNP4.Animal = SNP1x.Animal and SNP4.Marker = SNP1x.Marker") 1002 | 1003 | # 4f. This still needs to be fixed. 
# 1004 | 1005 | DF <- structure(list(tt = c(3, 6)), .Names = "tt", row.names = c(NA, 1006 | -2L), class = "data.frame") 1007 | DF2 <- structure(list(tt = c(1, 2, 3, 4, 5, 7), d = c(8.3, 10.3, 19, 1008 | 16, 15.6, 19.8)), .Names = c("tt", "d"), row.names = c(NA, -6L 1009 | ), class = "data.frame", reference = "A1.4, p. 270") 1010 | out <- sqldf("select * from DF d, DF2 a, DF2 b 1011 | where a.row_names = b.row_names - 1 and d.tt > a.tt and d.tt <= b.tt", 1012 | row.names = TRUE) 1013 | 1014 | # 5 1015 | minSL <- 7 1016 | limit <- 3 1017 | fn$sqldf('select * from iris where "Sepal.Length" > $minSL limit $limit') 1018 | 1019 | # 6a. Species get converted to upper case ### 1020 | 1021 | # alternative 1 1022 | write.table(head(iris, 3), "iris3.dat", sep = ",", quote = FALSE, row.names = FALSE) 1023 | 1024 | # convert factor to numeric 1025 | fac2num <- function(x) UseMethod("fac2num") 1026 | fac2num.factor <- function(x) as.numeric(as.character(x)) 1027 | fac2num.data.frame <- function(x) replace(x, TRUE, lapply(x, fac2num)) 1028 | fac2num.default <- identity 1029 | 1030 | sqldf("select * from csvread('iris3.dat')", method = function(x) 1031 | data.frame(fac2num(x[-5]), x[5])) 1032 | 1033 | # alternative 2 (H2 seems to get confused regarding case of Species) 1034 | sqldf('select 1035 | cast("Sepal.Length" as real) "Sepal.Length", 1036 | cast("Sepal.Width" as real) "Sepal.Width", 1037 | cast("Petal.Length" as real) "Petal.Length", 1038 | cast("Petal.Width" as real) "Petal.Width", 1039 | SPECIES from csvread(\'iris3.dat\')') 1040 | 1041 | # alternative 3. 1st line sets up 0 row table, iris0, with correct classes & 2nd line 1042 | # inserts the data from iris3.dat into it and then selects it back. 1043 | 1044 | iris0 <- read.csv("iris3.dat", nrows = 1)[0L, ] 1045 | sqldf(c("insert into iris0 (select * from csvread('iris3.dat'))", 1046 | "select * from iris0")) 1047 | 1048 | # 6b. 
1049 | sqldf("select * from csvread('iris3.dat')", dbname = tempfile(), method = function(x) 1050 | data.frame(fac2num(x[-5]), x[5])) 1051 | 1052 | # 6c. Same answer as in 6a works whether or not there are row names 1053 | 1054 | # 6d. NA 1055 | 1056 | # 6e. 1057 | 1058 | # 6f. 1059 | cat("1 8.3 1060 | 210.3 1061 | 1062 | 319.0 1063 | 416.0 1064 | 515.6 1065 | 719.8 1066 | ", file = "fixed") 1067 | sqldf("select substr(V1, 1, 1) f1, substr(V1, 2, 4) f2 1068 | from csvread('fixed', 'V1') limit 3") 1069 | 1070 | # 6g. NA 1071 | 1072 | # 7a 1073 | 1074 | # this is sqlite (how do you work with rowid's in H2?) ### 1075 | sqldf('select * from iris i 1076 | where rowid in 1077 | (select rowid from iris where Species = i.Species order by "Sepal.Length" desc limit 2) 1078 | order by i.Species, i."Sepal.Length" desc') 1079 | 1080 | 1081 | # 7b - same question ### 1082 | 1083 | library(chron) 1084 | DF <- data.frame(x = 101:200, tt = as.Date("2000-01-01") + seq(0, len = 100, by = 2)) 1085 | DF <- cbind(DF, month.day.year(unclass(DF$tt))) 1086 | 1087 | # sqlite: 1088 | sqldf("select * from DF d 1089 | where rowid in 1090 | (select rowid from DF 1091 | where year = d.year and month = d.month and day >= 21 limit 1) 1092 | order by tt") 1093 | 1094 | # 7c. 1095 | a <- read.table(textConnection("st en 1096 | 1 4 1097 | 11 14 1098 | 3 4"), header = TRUE) 1099 | 1100 | b <- read.table(textConnection("st en 1101 | 2 5 1102 | 3 6 1103 | 30 44"), TRUE) 1104 | 1105 | sqldf("select * from a where 1106 | (select count(*) from b where a.en >= b.st and b.en >= a.st) > 0") 1107 | 1108 | 1109 | # 8. In H2 one uses csvread rather than file and file.format. 
See: 1110 | # https://www.h2database.com/html/functions.html#csvread 1111 | 1112 | numStr <- as.character(1:100) 1113 | DF <- data.frame(a = c(numStr, "Hello")) 1114 | write.table(DF, file = "tmp99.csv", quote = FALSE, sep = ",") 1115 | sqldf("select * from csvread('tmp99.csv') limit 5") 1116 | 1117 | # Note that ~ does not work on Windows in H2: ### 1118 | # sqldf("select * from csvread('~/tmp.csv')") 1119 | 1120 | 1121 | # 9 - RH2 does not support. Only select statements currently. ### 1122 | 1123 | # create new empty database called mydb 1124 | sqldf("attach 'mydb' as new") 1125 | 1126 | # create a new table, mytab, in the new database 1127 | # Note that sqldf does not delete tables created from create. 1128 | sqldf("create table mytab as select * from BOD", dbname = "mydb") 1129 | 1130 | # shows its still there 1131 | sqldf("select * from mytab", dbname = "mydb") 1132 | 1133 | # 10 - RH2 does not support sqldf() ### 1134 | 1135 | sqldf() 1136 | # uses connection just created 1137 | sqldf('select * from iris3 where "Sepal.Width" > 3') 1138 | sqldf('select * from main.iris3 where "Sepal.Width" = 3') 1139 | sqldf() 1140 | 1141 | > # Example 10b. 1142 | > # 1143 | > # Here is another way to do example 10a. We use the same iris3, 1144 | > # iris3.dat and sqldf development version as above. 1145 | > # We grab connection explicitly, set up the database using sqldf and then 1146 | > # for the second call we call dbGetQuery from RSQLite. 1147 | > # In that case we don't need to qualify iris3 as main.iris3 since 1148 | > # RSQLite would not understand R variables anyways so there is no 1149 | > # ambiguity. 
1150 | 1151 | > con <- sqldf() 1152 | > 1153 | > # uses connection just created 1154 | > sqldf('select * from iris3 where "Sepal.Width" > 3') 1155 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1156 | 1 5.1 3.5 1.4 0.2 setosa 1157 | 2 4.7 3.2 1.3 0.2 setosa 1158 | > dbGetQuery(con, 'select * from iris3 where "Sepal.Width" = 3') 1159 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1160 | 1 4.9 3 1.4 0.2 setosa 1161 | > 1162 | > # close 1163 | > sqldf() 1164 | 1165 | 1166 | # 11. Between - these work same as sqlite 1167 | 1168 | seqdf <- data.frame(thetime=seq(100,225,5),thevalue=factor(letters)) 1169 | boundsdf <- data.frame(thestart=c(110,160,200),theend=c(130,180,220),groupID=c(555,666,777)) 1170 | 1171 | # run the query using two inequalities 1172 | testquery_1 <- sqldf("select seqdf.thetime, seqdf.thevalue, boundsdf.groupID 1173 | from seqdf left join boundsdf on (seqdf.thetime <= boundsdf.theend) and (seqdf.thetime >= boundsdf.thestart)") 1174 | 1175 | # run the same query using 'between...and' clause 1176 | testquery_2 <- sqldf("select seqdf.thetime, seqdf.thevalue, boundsdf.groupID 1177 | from seqdf LEFT JOIN boundsdf ON (seqdf.thetime BETWEEN boundsdf.thestart AND boundsdf.theend)") 1178 | 1179 | # 12 combine two files - not supported by RH2 ### 1180 | 1181 | # 13 see #8 1182 | ``` 1183 | 1184 | 11. Why am I having difficulty reading a data file using SQLite and sqldf?[](#11._Why_am_I_having_difficulty_reading_a_data_file_using_SQLite) 1185 | ---------------------------------------------------------------------------------------------------------------------------------------------- 1186 | 1187 | SQLite is fussy about line endings. Note the `eol` argument to 1188 | `read.csv.sql` can be used to specify line endings if they are different 1189 | than the normal line endings on your platform. e.g. 
1190 | 1191 | ```r 1192 | read.csv.sql("myfile.dat", eol = "\n") 1193 | ``` 1194 | 1195 | `eol` can also be used as a component to the sqldf `file.format` 1196 | argument. 1197 | 1198 | 12. How does one use sqldf with PostgreSQL?[](#12._How_does_one_use_sqldf_with_PostgreSQL?) 1199 | ------------------------------------------------------------------------------------------- 1200 | 1201 | Install 1. PostgreSQL, 2. RPostgreSQL R package 3. sqldf itself. 1202 | RPostgreSQL and sqldf are ordinary R package installs. 1203 | 1204 | Make sure that you have created an empty database, e.g. `"test"`. The 1205 | createdb program that comes with PostgreSQL can be used for that. e.g. 1206 | from the console/shell create a database called test like this: 1207 | 1208 | ```r 1209 | createdb --help 1210 | createdb --username=postgres test 1211 | ``` 1212 | 1213 | Here is an example using RPostgreSQL and after that we show an example 1214 | using RpgSQL. The `options` statement shown below can be entered directy 1215 | or alternately can be put in your `.Rprofile.` The values shown here are 1216 | actually the defaults: 1217 | 1218 | ```r 1219 | options(sqldf.RPostgreSQL.user = "postgres", 1220 | sqldf.RPostgreSQL.password = "postgres", 1221 | sqldf.RPostgreSQL.dbname = "test", 1222 | sqldf.RPostgreSQL.host = "localhost", 1223 | sqldf.RPostgreSQL.port = 5432) 1224 | 1225 | Lines <- "Group_A Group_B Group_C Value 1226 | A1 B1 C1 10 1227 | A1 B1 C2 20 1228 | A1 B1 C3 30 1229 | A1 B2 C1 40 1230 | A1 B2 C2 10 1231 | A1 B2 C3 5 1232 | A1 B2 C4 30 1233 | A2 B1 C1 40 1234 | A2 B1 C2 5 1235 | A2 B1 C3 2 1236 | A2 B2 C1 26 1237 | A2 B2 C2 1 1238 | A2 B3 C1 23 1239 | A2 B3 C2 15 1240 | A2 B3 C3 12 1241 | A3 B3 C4 23 1242 | A3 B3 C5 23" 1243 | 1244 | DF <- read.table(textConnection(Lines), header = TRUE, as.is = TRUE) 1245 | 1246 | library(RPostgreSQL) 1247 | library(sqldf) 1248 | # upper case is folded to lower case by default so surround DF with double quotes 1249 | sqldf('select count(*) 
from "DF" ') 1250 | 1251 | sqldf('select *, rank() over (partition by "Group_A", "Group_B" order by "Value") 1252 | from "DF" 1253 | order by "Group_A", "Group_B", "Group_C" ') 1254 | ``` 1255 | 1256 | For another example using `over` and `partition by` see: [this cumsum 1257 | example](https://stackoverflow.com/questions/8559485/r-cumulative-sum-by-group-in-sqldf/8561324#8561324) 1258 | 1259 | Also note that `log` and `log10` in R correspond to `ln` and `log`, 1260 | respectively, in PostgreSQL. 1261 | 1262 | 13. How does one deal with quoted fields in `read.csv.sql`?[](#13._How_does_one_deal_with_quoted_fields_in_read.csv.sql_?) 1263 | -------------------------------------------------------------------------------------------------------------------------- 1264 | 1265 | `read.csv.sql` provides an interface to sqlite's csv reader. That reader 1266 | is not very flexible (but is fast) and, in particular, it does not 1267 | understand quoted fields but rather regards the quotes as part of the 1268 | field itself. To read a file using `read.csv.sql` and remove all double 1269 | quotes from it at the same time on Windows try this assuming you have 1270 | Rtools installed and on your path (or the corresponding `tr` syntax on 1271 | UNIX depending on your shell): 1272 | 1273 | ```r 1274 | read.csv.sql("myfile.csv", filter = 'tr.exe -d \'^"\' ' ) 1275 | ``` 1276 | 1277 | or using `gawk`: 1278 | 1279 | ```r 1280 | # TODO: fix this example 1281 | read.csv.sql("myfile.csv", filter = list('gawk -f prog', prog = '{ gsub(/"/, ""); print }') ) 1282 | ``` 1283 | 1284 | Another program to look at is the 1285 | [csvfix](https://code.google.com/p/csvfix/) program (this is a free 1286 | external program -- not an R program). 
For example, the above could be done 1287 | like this with csvfix: 1288 | 1289 | ```r 1290 | read.csv.sql("myfile.csv", filter = 'csvfix echo -smq' ) 1291 | ``` 1292 | 1293 | As another csvfix example, suppose we have commas in two contexts: (1) as 1294 | separators between fields and within double quoted fields. To handle that case 1295 | we can use `csvfix` to translate the separators to semicolon stripping off the 1296 | double quotes at the same time (assuming we have installed `csvfix` and we have 1297 | put it in our path): 1298 | 1299 | ```r 1300 | read.csv.sql("myfile.csv", sep = ";", filter = "csvfix write_dsv -s ;")` . 1301 | ``` 1302 | 1303 | 14. How does one read files where numeric NAs are represented as missing empty fields?[](#14._How_does_one_read_files_where_numeric_NAs_are_represented_as) 1304 | ----------------------------------------------------------------------------------------------------------------------------------------------------------- 1305 | 1306 | Translate the empty fields to some number that will represent NA and 1307 | then fix it up on the R end. 1308 | 1309 | ```r 1310 | # The problem is that SQLite's read routine regards empty 1311 | # fields as zero length character strings rather than NA. 1312 | # We handle that by replacing such strings with -999, say, 1313 | # using gawk and the read.csv.sql filter argument and then 1314 | # fixing it up in R later. 1315 | 1316 | 1317 | # write out test data 1318 | 1319 | cat("a\tb\tc 1320 | aa\t\t23 1321 | aaa\t34.6\t 1322 | aaaa\t\t77.8", file = "x.txt") 1323 | 1324 | # create single line awk program to insert -999 as NA 1325 | 1326 | cat('{ gsub("\t\t", "\t-999\t"); gsub("\t$", "\t-999"); print}', 1327 | file = "x.awk") 1328 | 1329 | # on Windows gawk uses \n as eol even though most 1330 | # other programs use \r\n so we need to specify that. 1331 | # eol= may or may not be needed here on other platforms. 
1332 | 1333 | library(sqldf) 1334 | DF <- read.csv.sql("x.txt", sep = "\t", eol = "\n", filter = "gawk -f x.awk") 1335 | 1336 | # replace -999's with NA 1337 | 1338 | is.na(DF) <- DF == -999 1339 | ``` 1340 | 1341 | Another program that can be used in filters is the free csvfix . For 1342 | example, suppose that csvfix is on our path and that NA values are 1343 | represented as NA in numeric fields. We would like to convert them to 1344 | -999 and then later remove them. 1345 | 1346 | ```r 1347 | Lines <- "a,b 1348 | 3,NA 1349 | 4,65" 1350 | cat(Lines, file = "myfile.csv") 1351 | 1352 | filter <- 'csvfix map -fv ,NA -tv ,-999 myfile.csv | csvfix write_dsv -s ,' 1353 | DF <- read.csv.sql(filter = filter) 1354 | is.na(DF) <- DF == -999 1355 | ``` 1356 | 1357 | Another way in which the input file can be malformed is that not every 1358 | line has the same number of fields. In that case `csvfix pad -n` can be 1359 | used to pad it out as in this example: 1360 | 1361 | ```r 1362 | Lines <- "a,b,c 1363 | a,b, 1364 | a,b 1365 | q,r,t" 1366 | cat(Lines, file = "c.csv") 1367 | DF <- read.csv.sql(filter = "csvfix pad -n 3 c.csv | csvfix write_dsv -s ,") 1368 | ``` 1369 | 1370 | 15. Why do certain calculations come out as integer rather than double?[](#15._Why_do_certain_calculations_come_out_as_integer_rather_than) 1371 | ------------------------------------------------------------------------------------------------------------------------------------------- 1372 | 1373 | SQLite/RSQLite, h2/RH2, PostgreSQL all perform integer division on 1374 | integers; however, RMySQL/MySQL performs real division. 1375 | 1376 | ```r 1377 | > DF <- data.frame(a = 1:2, b = 2:1) 1378 | > str(DF) # columns are integer 1379 | 'data.frame': 2 obs. 
of 2 variables: 1380 | $ a: int 1 2 1381 | $ b: int 2 1 1382 | > # 1383 | > # using sqlite - integer division 1384 | > sqldf("select a/b as quotient from DF") 1385 | quotient 1386 | 1 0 1387 | 2 2 1388 | > # force real division 1389 | > sqldf("select (a+0.0)/b as quotient from DF") 1390 | quotient 1391 | 1 0.5 1392 | 2 2.0 1393 | > # force real division 1394 | > sqldf("select cast(a as real)/b as quotient from DF") 1395 | quotient 1396 | 1 0.5 1397 | 2 2.0 1398 | > # insert into table with real columns 1399 | > sqldf(c("create table mytab(a real, b real)", 1400 | + "insert into mytab select * from DF", 1401 | + "select a/b as quotient from mytab")) 1402 | quotient 1403 | 1 0.5 1404 | 2 2.0 1405 | > 1406 | > # convert all columns to numeric using method= argument 1407 | > # Requires sqldf 0.4-0 or later 1408 | > 1409 | > tonum <- function(DF) replace(DF, TRUE, lapply(DF, as.numeric)) 1410 | > sqldf("select a/b as quotient from DF", method = list("auto", tonum)) 1411 | quotient 1412 | 1 0.5 1413 | 2 2.0 1414 | > 1415 | > # use RMySQL - uses real division 1416 | > # Requires sqldf 0.4-0 or later 1417 | > library(RMySQL) 1418 | > sqldf("select a/b as quotient from DF") 1419 | quotient 1420 | 1 0.5 1421 | 2 2.0 1422 | ``` 1423 | 1424 | 16. How can one read a file off the net or a csv file in a zip file?[](#16._How_can_one_read_a_file_off_the_net_or_a_csv_file_in_a_zip_f) 1425 | ----------------------------------------------------------------------------------------------------------------------------------------- 1426 | 1427 | Use `read.csv.sql` and specify the URL of the file: 1428 | 1429 | ```r 1430 | # 1 1431 | URL <- "https://www.wnba.com/liberty/media/NYL2011ScheduleV3.csv" 1432 | DF <- read.csv.sql(URL, eol = "\r") 1433 | ``` 1434 | 1435 | Since files off the net could have any end of line be careful to specify 1436 | it properly for the file of interest. 1437 | 1438 | As an alternative one could use the filter argument. 
To use this `wget` 1439 | ([download](http://wget.addictivecode.org/FrequentlyAskedQuestions?action=show&redirect=Faq#download), 1440 | [Windows](http://gnuwin32.sourceforge.net/packages/wget.htm)) must be 1441 | present on the system command path. 1442 | 1443 | ```r 1444 | # 2 - same URL as above 1445 | DF <- read.csv.sql(eol = "\r", filter = paste("wget -O - ", URL)) 1446 | ``` 1447 | 1448 | Here is an example of reading a zip file which contains a single file 1449 | that is a `csv` : 1450 | 1451 | ```r 1452 | DF <- read.csv.sql(filter = "7z x -so anscombe.zip 2>NUL") 1453 | ``` 1454 | 1455 | In the line of code above it is assumed that `7z` 1456 | ([download](http://www.7-zip.org/download.html)) is present and on the 1457 | system command path. The example is for Windows. On UNIX use `/dev/null` 1458 | in place of `NUL`. 1459 | 1460 | If we had a `.tar.gz` file it could be done like this: 1461 | 1462 | ```r 1463 | DF <- read.csv.sql(filter = "tar xOfz anscombe.tar.gz") 1464 | ``` 1465 | 1466 | assuming that tar is available on our path. (Normally tar is available 1467 | on Linux and on Windows its available as part of the 1468 | [Rtools](https://cran.r-project.org/bin/windows/Rtools/) distribution on 1469 | CRAN.) 1470 | 1471 | Note that `filter` causes the filtered output to be stored in a 1472 | temporary file and then read into sqlite. It does not actually read the 1473 | data directly from the net into sqlite or directly from the zip or 1474 | tar.gz file to sqlite. 1475 | 1476 | *Note:* The examples in this section assume sqldf 0.4-4 or later. 1477 | 1478 | Examples[](#Examples) 1479 | ===================== 1480 | 1481 | These examples illustrate usage of both sqldf and SQLite. For sqldf with 1482 | H2 see [FAQ 1483 | \#10](https://code.google.com/p/sqldf/#10.__What_are_some_of_the_differences_between_using_SQLite_and_H). 1484 | For PostgreSQL see 1485 | [FAQ\#12](https://code.google.com/p/sqldf/#12._How_does_one_use_sqldf_with_PostgreSQL?). 
1486 | Also the `"sqldf-unitTests"` demo that comes with sqldf works under 1487 | sqldf with SQLite, H2, PostgreSQL and MySQL. David L. Reiner has created 1488 | some further examples 1489 | [here](https://files.meetup.com/1625815/crug_sqldf_05-01-2013.pdf) and 1490 | Paul Shannon has examples 1491 | [here](https://brusers.tumblr.com/post/59706993506/data-manipulation-with-sqldf-paul). 1492 | 1493 | Example 1. Ordering and Limiting[](#Example_1._Ordering_and_Limiting) 1494 | --------------------------------------------------------------------- 1495 | 1496 | Here is an example of sorting and limiting output from an SQL select 1497 | statement on the iris data frame that comes with R. Note that although 1498 | the iris dataset uses the name `Sepal.Length` older versions of the 1499 | RSQLite driver convert that to `Sepal_Length`; however, newer versions 1500 | do not. After installing sqldf in R, just type the first two lines into 1501 | the R console (without the \>): 1502 | 1503 | ```r 1504 | > library(sqldf) 1505 | > sqldf('select * from iris order by "Sepal.Length" desc limit 3') 1506 | 1507 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1508 | 1 7.9 3.8 6.4 2.0 virginica 1509 | 2 7.7 3.8 6.7 2.2 virginica 1510 | 3 7.7 2.6 6.9 2.3 virginica 1511 | ``` 1512 | 1513 | Example 2. Averaging and Grouping[](#Example_2._Averaging_and_Grouping) 1514 | ----------------------------------------------------------------------- 1515 | 1516 | Here is an example which processes an SQL select statement whose 1517 | functionality is similar to the R aggregate function. 1518 | 1519 | ```r 1520 | > sqldf('select Species, avg("Sepal.Length") from iris group by Species') 1521 | 1522 | Species avg(Sepal.Length) 1523 | 1 setosa 5.006 1524 | 2 versicolor 5.936 1525 | 3 virginica 6.588 1526 | ``` 1527 | 1528 | Example 3. Nested Select[](#Example_3._Nested_Select) 1529 | ----------------------------------------------------- 1530 | 1531 | Here is a more complex example.
For each Species, find the average Sepal 1532 | Length among those rows where Sepal Length exceeds the average Sepal 1533 | Length for that Species. Note the use of a subquery and explicit column 1534 | naming: 1535 | 1536 | ```r 1537 | > sqldf("select iris.Species '[Species]', 1538 | + avg(\"Sepal.Length\") '[Avg of SLs > avg SL]' 1539 | + from iris, 1540 | + (select Species, avg(\"Sepal.Length\") SLavg 1541 | + from iris group by Species) SLavg 1542 | + where iris.Species = SLavg.Species 1543 | + and \"Sepal.Length\" > SLavg 1544 | + group by iris.Species") 1545 | 1546 | [Species] [Avg of SLs > avg SL] 1547 | 1 setosa 5.313636 1548 | 2 versicolor 6.375000 1549 | 3 virginica 7.159091 1550 | 1551 | > # same - using only core R - based on discussion with Dennis Toddenroth 1552 | > aggregate(Sepal.Length ~ Species, iris, function(x) mean(x[x > mean(x)])) 1553 | Species Sepal.Length 1554 | 1 setosa 5.313636 1555 | 2 versicolor 6.375000 1556 | 3 virginica 7.159091 1557 | ``` 1558 | 1559 | Note that PostgreSQL is the only free database that supports 1560 | [window](https://developer.postgresql.org/pgdocs/postgres/tutorial-window.html) 1561 | [functions](https://developer.postgresql.org/pgdocs/postgres/functions-window.html) 1562 | (similar to `ave` function in R) which would allow a different 1563 | formulation of the above. For more on using sqldf with PostgreSQL see 1564 | [FAQ 1565 | \#12](https://code.google.com/p/sqldf/#12._How_does_one_use_sqldf_with_PostgreSQL?) 
1566 | 1567 | ```r 1568 | > library(RPostgreSQL) 1569 | > library(sqldf) 1570 | > tmp <- sqldf('select 1571 | + "Species", 1572 | + "Sepal.Length", 1573 | + "Sepal.Length" - avg("Sepal.Length") over (partition by "Species") "above.mean" 1574 | + from iris') 1575 | > sqldf('select "Species", avg("Sepal.Length") 1576 | + from tmp 1577 | + where "above.mean" > 0 1578 | + group by "Species"') 1579 | Species avg 1580 | 1 setosa 5.313636 1581 | 2 virginica 7.159091 1582 | 3 versicolor 6.375000 1583 | > 1584 | > # or, alternately, we could perform the above two steps in a single statement: 1585 | > 1586 | > sqldf(' 1587 | + select "Species", avg("Sepal.Length") 1588 | + from 1589 | + (select "Species", 1590 | + "Sepal.Length", 1591 | + "Sepal.Length" - avg("Sepal.Length") over (partition by "Species") "above.mean" 1592 | + from iris) a 1593 | + where "above.mean" > 0 1594 | + group by "Species"') 1595 | Species avg 1596 | 1 setosa 5.313636 1597 | 2 versicolor 6.375000 1598 | 3 virginica 7.159091 1599 | ``` 1600 | 1601 | which in R corresponds to this R code (i.e. `partition...over` in 1602 | PostgreSQL corresponds to `ave` in R): 1603 | 1604 | ```r 1605 | > tmp <- with(iris, Sepal.Length - ave(Sepal.Length, iris, FUN = mean)) 1606 | > aggregate(Sepal.Length ~ Species, subset(tmp, above.mean > 0), mean) 1607 | Species Sepal.Length 1608 | 1 setosa 5.313636 1609 | 2 versicolor 6.375000 1610 | 3 virginica 7.159091 1611 | ``` 1612 | 1613 | Here is some sample data with the correlated subquery from this 1614 | [Wikipedia page](https://en.wikipedia.org/wiki/Correlated_subquery): 1615 | 1616 | ```r 1617 | Emp <- data.frame(emp = letters[1:24], salary = 1:24, dept = rep(c("A", "B", "C"), each = 8)) 1618 | 1619 | sqldf("SELECT * 1620 | FROM Emp AS e1 1621 | WHERE salary > (SELECT avg(salary) 1622 | FROM Emp 1623 | WHERE dept = e1.dept)") 1624 | ``` 1625 | 1626 | Example 4. 
Join[](#Example_4._Join) 1627 | ----------------------------------- 1628 | 1629 | The different type of joins are pictured in this image: 1630 | i.imgur.com/1m55Wqo.jpg. (SQLite does not support right joins but the 1631 | other databases sqldf supports do.) We define a new data frame, `Abbr`, 1632 | join it with `iris` and perform the aggregation: 1633 | 1634 | ```r 1635 | > # Example 4a. 1636 | > Abbr <- data.frame(Species = levels(iris$Species), 1637 | + Abbr = c("S", "Ve", "Vi")) 1638 | > 1639 | > sqldf('select Abbr, avg("Sepal.Length") 1640 | + from iris natural join Abbr group by Species') 1641 | 1642 | Abbr avg(Sepal.Length) 1643 | 1 S 5.006 1644 | 2 Ve 5.936 1645 | 3 Vi 6.588 1646 | ``` 1647 | 1648 | Although the above is probably the shortest way to write it in SQL, 1649 | using `natural join` can be a bit dangerous since one must be very sure 1650 | one knows precisely which column names are common to both tables. For 1651 | example, had we included the `row_names` as a column in both tables (by 1652 | specifying `row.names = TRUE` to sqldf) the natural join would not work 1653 | as intended since the `row_names` columns would participate in the join. 1654 | An alternate and safer way to write this would be with `join` and 1655 | `using`: 1656 | 1657 | ```r 1658 | > # Example 4b. 1659 | > sqldf('select Abbr, avg("Sepal.Length") 1660 | + from iris join Abbr using(Species) group by Species') 1661 | 1662 | Abbr avg(Sepal.Length) 1663 | 1 S 5.006 1664 | 2 Ve 5.936 1665 | 3 Vi 6.588 1666 | ``` 1667 | 1668 | or with a `where` clause: 1669 | 1670 | ```r 1671 | > # Example 4c. 1672 | > sqldf('select Abbr, avg("Sepal.Length") from iris, Abbr 1673 | + where iris.Species = Abbr.Species group by iris.Species') 1674 | 1675 | Abbr avg(Sepal.Length) 1676 | 1 S 5.006 1677 | 2 Ve 5.936 1678 | 3 Vi 6.588 1679 | ``` 1680 | 1681 | or a temporal join where the goal is, for each Species/station\_id pair, 1682 | to join the records with the closest date/times. 
1683 | 1684 | ```r 1685 | > # Example 4d. Temporal Join 1686 | > # see: https://stat.ethz.ch/pipermail/r-help/2009-March/191938.html 1687 | > 1688 | > library(chron) 1689 | > 1690 | > Species.Lines <- "Species,Date_Sampled 1691 | + SpeciesB,2008-06-23 13:55:11 1692 | + SpeciesA,2008-06-23 13:43:11 1693 | + SpeciesC,2008-06-23 13:55:11" 1694 | > 1695 | > species <- read.csv(textConnection(Species.Lines), as.is = TRUE) 1696 | > species$dt <- as.numeric(as.chron(species$Date)) 1697 | > 1698 | > Temp.Lines <- "Station_id,Date,Value 1699 | + ANH,2008-06-23 13:00:00,1.96 1700 | + ANH,2008-06-23 14:00:00,2.25 1701 | + BDT,2008-06-23 13:00:00,4.23 1702 | + BDT,2008-06-23 13:15:00,4.11 1703 | + BDT,2008-06-23 13:30:00,4.01 1704 | + BDT,2008-06-23 13:45:00,3.9 1705 | + BDT,2008-06-23 14:00:00,3.82" 1706 | > 1707 | > temp <- read.csv(textConnection(Temp.Lines), as.is = TRUE) 1708 | > temp$dt <- as.numeric(as.chron(temp$Date)) 1709 | > 1710 | > out <- sqldf("select s.Species, s.dt, t.Station_id, t.Value 1711 | + from species s, temp t 1712 | + where abs(s.dt - t.dt) = 1713 | + (select min(abs(s2.dt - t2.dt)) 1714 | + from species s2, temp t2 1715 | + where s.Species = s2.Species and t.Station_id = t2.Station_id)") 1716 | > out$dt <- chron(out$dt) 1717 | > out 1718 | Species dt Station_id Value 1719 | 1 SpeciesB (06/23/08 13:55:11) ANH 2.25 1720 | 2 SpeciesB (06/23/08 13:55:11) BDT 3.82 1721 | 3 SpeciesA (06/23/08 13:43:11) ANH 2.25 1722 | 4 SpeciesA (06/23/08 13:43:11) BDT 3.90 1723 | 5 SpeciesC (06/23/08 13:55:11) ANH 2.25 1724 | 6 SpeciesC (06/23/08 13:55:11) BDT 3.82 1725 | ``` 1726 | 1727 | A similar but slightly simpler example can be found 1728 | [here](https://stat.ethz.ch/pipermail/r-sig-finance/2010q2/006077.html). 1729 | 1730 | Here is an example of a left join: 1731 | 1732 | ```r 1733 | > # Example 4e. 
Left Join 1734 | > # https://stat.ethz.ch/pipermail/r-help/2009-April/195882.html 1735 | > # 1736 | > SNP1x <- 1737 | + structure(list(Animal = c(194073197L, 194073197L, 194073197L, 1738 | + 194073197L, 194073197L), Marker = structure(1:5, .Label = c("P1001", 1739 | + "P1002", "P1004", "P1005", "P1006", "P1007"), class = "factor"), 1740 | + x = c(2L, 1L, 2L, 0L, 2L)), .Names = c("Animal", "Marker", 1741 | + "x"), row.names = c("3213", "1295", "915", "2833", "1487"), class = "data.frame") 1742 | > 1743 | > SNP4 <- 1744 | + structure(list(Animal = c(194073197L, 194073197L, 194073197L, 1745 | + 194073197L, 194073197L, 194073197L), Marker = structure(1:6, .Label = c("P1001", 1746 | + "P1002", "P1004", "P1005", "P1006", "P1007"), class = "factor"), 1747 | + Y = c(0.021088, 0.021088, 0.021088, 0.021088, 0.021088, 0.021088 1748 | + )), .Names = c("Animal", "Marker", "Y"), class = "data.frame", row.names = c("3213", 1749 | + "1295", "915", "2833", "1487", "1885")) 1750 | > 1751 | > SNP1x 1752 | Animal Marker x 1753 | 3213 194073197 P1001 2 1754 | 1295 194073197 P1002 1 1755 | 915 194073197 P1004 2 1756 | 2833 194073197 P1005 0 1757 | 1487 194073197 P1006 2 1758 | > SNP4 1759 | Animal Marker Y 1760 | 3213 194073197 P1001 0.021088 1761 | 1295 194073197 P1002 0.021088 1762 | 915 194073197 P1004 0.021088 1763 | 2833 194073197 P1005 0.021088 1764 | 1487 194073197 P1006 0.021088 1765 | 1885 194073197 P1007 0.021088 1766 | > 1767 | > library(sqldf) 1768 | > sqldf("select * from SNP4 left join SNP1x using (Animal, Marker)") 1769 | Animal Marker Y x 1770 | 1 194073197 P1001 0.021088 2 1771 | 2 194073197 P1002 0.021088 1 1772 | 3 194073197 P1004 0.021088 2 1773 | 4 194073197 P1005 0.021088 0 1774 | 5 194073197 P1006 0.021088 2 1775 | 6 194073197 P1007 0.021088 NA 1776 | > # or if that takes up too much memory 1777 | > # create/use/destroy external database 1778 | > sqldf("select * from SNP4 left join SNP1x using (Animal, Marker)", dbname = "test.db") 1779 | Animal Marker Y x 1780 | 
1 194073197 P1001 0.021088 2 1781 | 2 194073197 P1002 0.021088 1 1782 | 3 194073197 P1004 0.021088 2 1783 | 4 194073197 P1005 0.021088 0 1784 | 5 194073197 P1006 0.021088 2 1785 | 6 194073197 P1007 0.021088 NA 1786 | ``` 1787 | 1788 | ```r 1789 | > # Example 4f. Another temporal join. 1790 | > # join DF2 to row in DF for which DF.tt and DF2.tt are closest 1791 | > 1792 | > DF <- structure(list(tt = c(3, 6)), .Names = "tt", row.names = c(NA, 1793 | + -2L), class = "data.frame") 1794 | > DF 1795 | tt 1796 | 1 3 1797 | 2 6 1798 | > 1799 | > DF2 <- structure(list(tt = c(1, 2, 3, 4, 5, 7), d = c(8.3, 10.3, 19, 1800 | + 16, 15.6, 19.8)), .Names = c("tt", "d"), row.names = c(NA, -6L 1801 | + ), class = "data.frame", reference = "A1.4, p. 270") 1802 | > DF2 1803 | tt d 1804 | 1 1 8.3 1805 | 2 2 10.3 1806 | 3 3 19.0 1807 | 4 4 16.0 1808 | 5 5 15.6 1809 | 6 7 19.8 1810 | > 1811 | > out <- sqldf("select * from DF d, DF2 a, DF2 b 1812 | + where a.row_names = b.row_names - 1 1813 | + and d.tt > a.tt and d.tt <= b.tt", 1814 | + row.names = TRUE) 1815 | > 1816 | > out$dd <- with(out, ifelse(tt < (tt.1 + tt.2) / 2, d, d.1)) 1817 | > out 1818 | tt tt.1 d tt.2 d.1 dd 1819 | 1 3 2 10.3 3 19.0 19.0 1820 | 2 6 5 15.6 7 19.8 19.8 1821 | ``` 1822 | 1823 | Example 4g. Self Join. 
There is an example of a self-join here: 1824 | [problem](https://stat.ethz.ch/pipermail/r-help/2010-March/232314.html) 1825 | and answer here: 1826 | 1827 | ```r 1828 | > DF <- structure(list(Actor = c("Jim", "Bob", "Bob", "Larry", "Alice", "Tom", "Tom", "Tom", "Alice", "Nancy"), Act = c("A", "A", "C", "D", "C", "F", "D", "A", "B", "B")), .Names = c("Actor", "Act" ), class = "data.frame", row.names = c(NA, -10L)) 1829 | 1830 | > subset(unique(merge(DF, DF, by = 2)), Actor.x < Actor.y) 1831 | Act Actor.x Actor.y 1832 | 3 A Jim Tom 1833 | 4 A Bob Jim 1834 | 6 A Bob Tom 1835 | 11 B Alice Nancy 1836 | 16 C Alice Bob 1837 | 20 D Larry Tom 1838 | 1839 | > sqldf("select A.Act, A.Actor, B.Actor 1840 | + from DF A join DF B 1841 | + where A.Act = B.Act and A.Actor < B.Actor 1842 | + order by A.Act, A.Actor") 1843 | Act Actor Actor 1844 | 1 A Bob Jim 1845 | 2 A Bob Tom 1846 | 3 A Jim Tom 1847 | 4 B Alice Nancy 1848 | 5 C Alice Bob 1849 | 6 D Larry Tom 1850 | ``` 1851 | 1852 | Thanks to Raj Morejoys for the correction. 1853 | 1854 | Here is [another example of a self 1855 | join](https://stat.ethz.ch/pipermail/r-help/2011-February/269680.html) 1856 | to create pairs which is followed by a second self join to produce pairs 1857 | of pairs. This [stackoverflow 1858 | example](https://stackoverflow.com/questions/11448133/double-merge-two-data-frames-in-r) 1859 | illustrates an sqldf triple join in which one table participates twice. 1860 | 1861 | Example 4h. Join nearby times. There is an example of joining records 1862 | that are close but not necessarily exactly the same here: 1863 | [problem](https://stat.ethz.ch/pipermail/r-help/2010-March/232588.html) 1864 | and 1865 | [answer](https://stat.ethz.ch/pipermail/r-help/attachments/20100320/4ccb548f/attachment.pl) 1866 | .
Also taking successive differences involves joining adjacent times and 1867 | this is illustrated 1868 | [here](https://stackoverflow.com/questions/6695673/find-standard-deviation-of-first-differences-of-series-defined-with-group-by-usin) 1869 | . 1870 | 1871 | Here is an example where we align time series Sy to series Sx by 1872 | averaging all points of Sy within w = 0.25 units of each Sx time point. 1873 | Tx and X are the times and values of Sx and Ty and Y are the times and 1874 | values of Sy. 1875 | 1876 | ```r 1877 | Tx <- seq(1, N, 0.5) 1878 | Tx <- Tx + rnorm(length(Tx), 0, 0.1) 1879 | X <- sin(Tx/10.0) + sin(Tx/5.0) + rnorm(length(Tx), 0, 0.1) 1880 | Ty <- seq(1, N, 0.3333) 1881 | Ty <- Ty + rnorm(length(Ty), 0, 0.02) 1882 | Y <- sin(Ty/10.0) + sin(Ty/5.0) + rnorm(length(Ty), 0, 0.1) 1883 | w <- 0.25 1884 | 1885 | system.time(out1 <- sapply(Tx, function(tx) mean(Y[Ty >= tx-w & Ty <= tx+w]))) 1886 | 1887 | library(sqldf) 1888 | Sx <- data.frame(Tx, X) 1889 | Sy <- data.frame(Ty, Y) 1890 | 1891 | system.time(out.sqldf <- sqldf(c("create index idx on Sx(Tx)", 1892 | "select Tx, avg(Y) from main.Sx, Sy 1893 | where Ty + 0.25 >= Tx and Ty - 0.25 <= Tx group by Tx"))) 1894 | 1895 | all.equal(out.sqldf[,2], out1) # TRUE 1896 | ``` 1897 | 1898 | Example 4i. Speeding up joins with indexes. Here is an example of 1899 | speeding up a join by using indexes on a single join column 1900 | [here](https://statcompute.wordpress.com/2013/06/09/improve-the-efficiency-in-joining-data-with-index/) 1901 | and [here](https://stat.ethz.ch/pipermail/r-help/2010-March/232688.html) 1902 | and on two join columns below. Note that the `create index` statements 1903 | in each example also has the effect of reading in the data frames into 1904 | the `main` database of SQLite. The `select` statement refers to 1905 | `main.DF1` rather than just `DF1` so that it accesses that copy of `DF1` 1906 | in `main` which we just indexed rather than the unindexed `DF1` in R. 
1907 | Similar comments apply to `DF2`. The statement 1908 | `sqldf("select * from sqlite_master")` will list the names and related 1909 | info for all tables in `main`. 1910 | 1911 | ```r 1912 | > set.seed(1) 1913 | > n <- 1000000 1914 | > 1915 | > DF1 <- data.frame(a = sample(n, n, replace = TRUE), 1916 | + b = sample(4, n, replace = TRUE), c1 = runif(n)) 1917 | > 1918 | > DF2 <- data.frame(a = sample(n, n, replace = TRUE), 1919 | + b = sample(4, n, replace = TRUE), c2 = runif(n)) 1920 | > 1921 | > library(sqldf) 1922 | Loading required package: DBI 1923 | Loading required package: RSQLite 1924 | Loading required package: gsubfn 1925 | Loading required package: proto 1926 | Loading required package: chron 1927 | > 1928 | > sqldf() 1929 | 1930 | > system.time(sqldf("create index ai1 on DF1(a, b)")) 1931 | Loading required package: tcltk 1932 | Loading Tcl/Tk interface ... done 1933 | user system elapsed 1934 | 16.69 0.19 19.12 1935 | > system.time(sqldf("create index ai2 on DF2(a, b)")) 1936 | user system elapsed 1937 | 16.60 0.03 17.48 1938 | > system.time(sqldf("select * from main.DF1 natural join main.DF2")) 1939 | user system elapsed 1940 | 7.76 0.06 8.23 1941 | > sqldf() 1942 | ``` 1943 | 1944 | The sqldf statements above could also be done in one sqldf call like 1945 | this: 1946 | 1947 | ```r 1948 | # define DF1 and DF2 as before 1949 | set.seed(1) 1950 | n <- 1000000 1951 | DF1 <- data.frame(a = sample(n, n, replace = TRUE), 1952 | b = sample(4, n, replace = TRUE), c1 = runif(n)) 1953 | DF2 <- data.frame(a = sample(n, n, replace = TRUE), 1954 | b = sample(4, n, replace = TRUE), c2 = runif(n)) 1955 | 1956 | # combine all sqldf calls from before into one call 1957 | 1958 | result <- sqldf(c("create index ai1 on DF1(a, b)", 1959 | "create index ai2 on DF2(a, b)", 1960 | "select * from main.DF1 natural join main.DF2")) 1961 | ``` 1962 | 1963 | Note that if your data is so large that you need indexes it may be too 1964 | large to store the database in memory. 
If you find its overflowing 1965 | memory then use the `dbname=` sqldf argument, e.g. 1966 | `sqldf(c("create...", "create...", "select..."), dbname = tempfile())` 1967 | so that it stores the intermediate results in an external database 1968 | rather than memory. 1969 | 1970 | *Note:* The index `ai1` is not actually used so we could have saved the 1971 | time it took to create it, creating only `ai2`. 1972 | 1973 | ```r 1974 | sqldf(c("create index ai2 on DF2(a, b)", "select * from DF1 natural join main.DF2")) 1975 | ``` 1976 | 1977 | Example 4j. Per Group Max and Min 1978 | 1979 | Note that the Date variable gets passed to SQLite as number of days 1980 | since 1970-01-01 whereas SQLite uses an earlier origin so we add 1981 | `julianday('1970-01-01')` to convert the origin of R's `"Date"` class to 1982 | SQLite's origin. Note that the output column called `Date` is 1983 | automatically converted to `"Date"` class by the sqldf heuristic because 1984 | there is an input column that has the same name. 
1985 | 1986 | ```r 1987 | > URL <- "https://ichart.finance.yahoo.com/table.csv?s=GOOG&a=07&b=19&c=2004&d=03&e=16&f=2010&g=d&ignore=.csv" 1988 | > DF25 <- read.csv(URL, nrows = 25) 1989 | > DF25$Date <- as.Date(DF25$Date) 1990 | > 1991 | > sqldf("select Date, a.High, a.Low, b.Close, a.Volume 1992 | + from (select max(Date) Date, min(Low) Low, max(High) High, sum(Volume) Volume 1993 | + from DF25 1994 | + group by date(Date + julianday('1970-01-01'), 'start of month') 1995 | + ) as a join DF25 b using(Date)") 1996 | Date High Low Close Volume 1997 | 1 2010-03-31 588.28 539.70 567.12 51541600 1998 | 2 2010-04-16 597.84 549.63 550.15 41201900 1999 | ``` 2000 | 2001 | and here is another shorter one that uses a trick of Magnus Hagander in 2002 | the second Stackoverflow link below: 2003 | 2004 | ```r 2005 | > sqldf("select 2006 | + max(Date) Date, 2007 | + max(High) High, 2008 | + min(Low) Low, 2009 | + max(100000 * Date + Close) % 100000 Close, 2010 | + sum(Volume) Volume 2011 | + from DF25 2012 | + group by date(Date + julianday('1970-01-01'), 'start of month')") 2013 | Date High Low Close Volume 2014 | 1 2010-03-31 588.28 539.70 567 51541600 2015 | 2 2010-04-16 597.84 549.63 550 41201900 2016 | ``` 2017 | 2018 | Also see [this Xaprb 2019 | link](https://www.xaprb.com/blog/2007/03/14/how-to-find-the-max-row-per-group-in-sql-without-subqueries/) 2020 | for an approach without subqueries and for more discussion see [this 2021 | stackoverflow 2022 | link](https://stackoverflow.com/questions/121387/sql-fetch-the-row-which-has-the-max-value-for-a-column) 2023 | and [this stackoverflow 2024 | link](https://stackoverflow.com/questions/1140254/postgresql-vlookup). 2025 | The last link shows how to use analytical queries which are available in 2026 | PostgreSQL -- the PostgreSQL database, like SQLite and H2, is supported 2027 | by sqldf. 2028 | 2029 | Example 5. 
Insert Variables[](#Example_5._Insert_Variables) 2030 | ----------------------------------------------------------- 2031 | 2032 | Here is an example of inserting evaluated variables into a query using 2033 | [gsubfn](https://code.google.com/p/gsubfn/) quasi-perl-style string 2034 | interpolation. gsubfn is used by sqldf so it's already loaded. Note that 2035 | we must use the `fn$` prefix to invoke the interpolation functionality: 2036 | 2037 | ```r 2038 | > minSL <- 7 2039 | > limit <- 3 2040 | > species <- "virginica" 2041 | > fn$sqldf("select * from iris where \"Sepal.Length\" > $minSL and species = '$species' limit $limit") 2042 | 2043 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2044 | 1 7.1 3.0 5.9 2.1 virginica 2045 | 2 7.6 3.0 6.6 2.1 virginica 2046 | 3 7.3 2.9 6.3 1.8 virginica 2047 | ``` 2048 | 2049 | Example 6. File Input[](#Example_6._File_Input) 2050 | ----------------------------------------------- 2051 | 2052 | Note that there is a new command `read.csv.sql` which provides an 2053 | alternate interface to the approach discussed in this section. See 2054 | Example 13 for that. 2055 | 2056 | sqldf normally deletes any database it creates after completion but the 2057 | sample code [at the bottom of this 2058 | post](https://stat.ethz.ch/pipermail/r-help/2010-October/257270.html) 2059 | shows how to set up a database and read a file into it without having 2060 | the database destroyed afterwards. 2061 | 2062 | sqldf will not only look for data frames used in the SQL statement but 2063 | will also look for R objects of class `"file"`. For such objects it will 2064 | directly import the associated file into the database without going 2065 | through R allowing files that are larger than an R workspace to be 2066 | handled and also providing for potential speed advantages.
That is, if 2067 | `f <- file("abc.csv")` is a file object and `f` is used as the table 2068 | name in the sql statement then the file `abc.csv` is imported into the 2069 | database as table `f`. With SQLite, the actual reading of the file into 2070 | the database is done in a C routine in RSQLite so the file is 2071 | transferred directly to the database without going through R. If the 2072 | `sqldf` argument `dbname` is used then it specifies a filename (either 2073 | existing or created by `sqldf` if not existing). That filename is used 2074 | as a database (rather than memory) allowing larger files than physical 2075 | memory. By using an appropriate `where` statement or a subset of column 2076 | names a portion of the table can be retrieved into R even if the file 2077 | itself is too large for R or for memory. 2078 | 2079 | There are some caveats. The RSQLite `dbWriteTable`/`sqliteImportFile` 2080 | routines that `sqldf` uses to transfer the file directly to the database 2081 | are intended for speed thus they are not as flexible as `read.table`. 2082 | Also they have slightly different defaults. The default for `sep` is 2083 | `file.format = list(sep = ",")`. If the first row of the file has one 2084 | fewer component than subsequent ones then it assumes that 2085 | `file.format = list(header = TRUE, row.names = TRUE)` and otherwise that 2086 | `file.format = list(header = FALSE, row.names = FALSE)`. `.csv` file 2087 | format is only partly supported -- quotes are not regarded as special. 2088 | 2089 | In addition to the examples below there is an example 2090 | [here](http://web.archive.org/web/20140429215324/http://stat.ethz.ch/pipermail/r-help/2009-May/199991.html) and 2091 | another one with performance results 2092 | [here](http://www.cerebralmastication.com/2009/11/loading-big-data-into-r/). 2093 | 2094 | ```r 2095 | > # Example 6a. 
2096 | > # test of file connections with sqldf 2097 | > 2098 | > # create test .csv file of just 3 records 2099 | > write.table(head(iris, 3), "iris3.dat", sep = ",", quote = FALSE) 2100 | > 2101 | > # look at contents of iris3.dat 2102 | > readLines("iris3.dat") 2103 | [1] "Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species" 2104 | [2] "1,5.1,3.5,1.4,0.2,setosa" 2105 | [3] "2,4.9,3,1.4,0.2,setosa" 2106 | [4] "3,4.7,3.2,1.3,0.2,setosa" 2107 | > 2108 | > # set up file connection 2109 | > iris3 <- file("iris3.dat") 2110 | > sqldf('select * from iris3 where "Sepal.Width" > 3') 2111 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2112 | 1 5.1 3.5 1.4 0.2 setosa 2113 | 2 4.7 3.2 1.3 0.2 setosa 2114 | > 2115 | > # Example 6b. 2116 | > # similar but uses disk - useful if file were large 2117 | > # According to https://www.sqlite.org/whentouse.html 2118 | > # SQLite can handle files up to several dozen gigabytes. 2119 | > # (Note in this case readTable and readTableIndex in R.utils 2120 | > # package or read.table from the base of R, setting the colClasses 2121 | > # argument to "NULL" for columns you don't want read in, might be 2122 | > # alternatives.) 2123 | > sqldf('select * from iris3 where "Sepal.Width" > 3', dbname = tempfile()) 2124 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2125 | 1 5.1 3.5 1.4 0.2 setosa 2126 | 2 4.7 3.2 1.3 0.2 setosa 2127 | 2128 | > # Example 6c. 2129 | > # with this format, header=TRUE needs to be specified 2130 | > write.table(head(iris, 3), "iris3a.dat", sep = ",", quote = FALSE, 2131 | + row.names = FALSE) 2132 | > iris3a <- file("iris3a.dat") 2133 | > sqldf("select * from iris3a", file.format = list(header = TRUE)) 2134 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2135 | 1 5.1 3.5 1.4 0.2 setosa 2136 | 2 4.9 3.0 1.4 0.2 setosa 2137 | 3 4.7 3.2 1.3 0.2 setosa 2138 | 2139 | > # Example 6d. 
2140 | > # header can alternately be specified as object attribute 2141 | > attr(iris3a, "file.format") <- list(header = TRUE) 2142 | > sqldf("select * from iris3a") 2143 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2144 | 1 5.1 3.5 1.4 0.2 setosa 2145 | 2 4.9 3.0 1.4 0.2 setosa 2146 | 3 4.7 3.2 1.3 0.2 setosa 2147 | 2148 | > # Example 6e. 2149 | > # create a test file with all 150 records from iris 2150 | > # and select 4 records at random without reading entire file into R 2151 | > write.table(iris, "iris150.dat", sep = ",", quote = FALSE) 2152 | > iris150 <- file("iris150.dat") 2153 | > sqldf("select * from iris150 order by random(*) limit 4") 2154 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2155 | 1 4.9 2.5 4.5 1.7 virginica 2156 | 2 4.8 3.0 1.4 0.1 setosa 2157 | 3 6.1 2.6 5.6 1.4 virginica 2158 | 4 7.4 2.8 6.1 1.9 virginica 2159 | > 2160 | > # or use read.csv.sql and its just one line 2161 | > read.csv.sql("iris150.dat", sql = "select * from file order by random(*) limit 4") 2162 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2163 | 1 4.9 2.4 3.3 1.0 versicolor 2164 | 2 5.8 2.7 4.1 1.0 versicolor 2165 | 3 7.4 2.8 6.1 1.9 virginica 2166 | 4 5.1 3.5 1.4 0.3 setosa 2167 | ``` 2168 | 2169 | Example 6f. 
If our file has fixed width fields rather than delimited 2170 | then we can still handle it if we parse the lines manually with substr: 2171 | 2172 | ```r 2173 | # write some test data to "fixed" 2174 | # Field 1 has width of 1 column and field 2 has 4 columns 2175 | cat("1 8.3 2176 | 210.3 2177 | 319.0 2178 | 416.0 2179 | 515.6 2180 | 719.8 2181 | ", file = "fixed") 2182 | 2183 | # get 3 random records using sqldf 2184 | fixed <- file("fixed") 2185 | attr(fixed, "file.format") <- list(sep = ";") # ; can be any char not in file 2186 | sqldf("select substr(V1, 1, 1) f1, substr(V1, 2, 4) f2 from fixed order by random(*) limit 3") 2187 | ``` 2188 | 2189 | Another example of fixed width data is 2190 | [here](https://sites.google.com/site/timriffepersonal/DemogBlog/newformetrickforworkingwithbigishdatainr) 2191 | (however, note that changing the sep needs to be done in the example in 2192 | that link too). 2193 | 2194 | Example 6g. Defaults. 2195 | 2196 | ```r 2197 | # If first row has one fewer columns than subsequent rows then 2198 | # header <- row.names <- TRUE is assumed as in example 6a; otherwise, 2199 | # header <- row.names <- FALSE is assumed as shown here: 2200 | 2201 | > write.table(head(iris, 3), "iris3nohdr.dat", col.names = FALSE, row.names = FALSE, sep = ",", quote = FALSE) 2202 | > readLines("iris3nohdr.dat") 2203 | [1] "5.1,3.5,1.4,0.2,setosa" "4.9,3,1.4,0.2,setosa" "4.7,3.2,1.3,0.2,setosa" 2204 | > sqldf("select * from iris3nohdr") 2205 | V1 V2 V3 V4 V5 2206 | 1 5.1 3.5 1.4 0.2 setosa 2207 | 2 4.9 3.0 1.4 0.2 setosa 2208 | 3 4.7 3.2 1.3 0.2 setosa 2209 | ``` 2210 | 2211 | Example 7. Nested Select[](#Example_7._Nested_Select) 2212 | ----------------------------------------------------- 2213 | 2214 | For each species show the two rows with the largest sepal lengths: 2215 | 2216 | ```r 2217 | > # Example 7a. 
2218 | > sqldf('select * from iris i 2219 | + where rowid in 2220 | + (select rowid from iris where Species = i.Species order by "Sepal.Length" desc limit 2) 2221 | + order by i.Species, i."Sepal.Length" desc') 2222 | 2223 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2224 | 1 5.8 4.0 1.2 0.2 setosa 2225 | 2 5.7 4.4 1.5 0.4 setosa 2226 | 3 7.0 3.2 4.7 1.4 versicolor 2227 | 4 6.9 3.1 4.9 1.5 versicolor 2228 | 5 7.9 3.8 6.4 2.0 virginica 2229 | 6 7.7 3.8 6.7 2.2 virginica 2230 | ``` 2231 | 2232 | Here is a similar example. In this one `DF` represents a time series 2233 | whose values are in column `x` and whose times are dates in column `tt`. 2234 | The times have gaps -- in fact only every other day is present. The code 2235 | below displays the first row at or past the 21st of the month for each 2236 | year/month. First we append year, month and day columns using 2237 | `month.day.year` from the `chron` package and then do the computation 2238 | using `sqldf`. (For a version of this using the `zoo` package rather 2239 | than `sqldf` see: 2240 | [https://stat.ethz.ch/pipermail/r-help/2007-November/145925.html](https://stat.ethz.ch/pipermail/r-help/2007-November/145925.html)). 2241 | 2242 | ```r 2243 | > # Example 7b. 2244 | > # 2245 | > library(chron) 2246 | > DF <- data.frame(x = 101:200, tt = as.Date("2000-01-01") + seq(0, len = 100, by = 2)) 2247 | > DF <- cbind(DF, month.day.year(unclass(DF$tt))) 2248 | > 2249 | > sqldf("select * from DF d 2250 | + where rowid in 2251 | + (select rowid from DF 2252 | + where year = d.year and month = d.month and day >= 21 limit 1) 2253 | + order by tt") 2254 | x tt month day year 2255 | 1 111 2000-01-21 1 21 2000 2256 | 2 127 2000-02-22 2 22 2000 2257 | 3 141 2000-03-21 3 21 2000 2258 | 4 157 2000-04-22 4 22 2000 2259 | 5 172 2000-05-22 5 22 2000 2260 | 6 187 2000-06-21 6 21 2000 2261 | ``` 2262 | 2263 | Here is another example of a nested select. 
We select each row of a for 2264 | which st/en overlaps with some st/en of b. 2265 | 2266 | ```r 2267 | > # Example 7c. 2268 | > # 2269 | > a <- read.table(textConnection("st en 2270 | + 1 4 2271 | + 11 14 2272 | + 3 4"), header = TRUE) 2273 | > 2274 | > b <- read.table(textConnection("st en 2275 | + 2 5 2276 | + 3 6 2277 | + 30 44"), TRUE) 2278 | > 2279 | > sqldf("select * from a where 2280 | + (select count(*) from b where a.en >= b.st and b.en >= a.st) > 0") 2281 | st en 2282 | 1 1 4 2283 | 2 3 4 2284 | ``` 2285 | 2286 | 7d. Another example of a nested select with sqldf is shown 2287 | [here](https://stat.ethz.ch/pipermail/r-help/2010-March/231975.html) 2288 | 2289 | Example 8. Specifying File Format[](#Example_8._Specifying_File_Format) 2290 | ----------------------------------------------------------------------- 2291 | 2292 | When using file() as used as in Example 6 RSQLite reads in the first 50 2293 | lines to determine the column classes. What if they all have numbers in 2294 | them but then later we start to see letters? In that case we will have 2295 | to override its choice. Here are two ways: 2296 | 2297 | ```r 2298 | library(sqldf) 2299 | 2300 | # example example 8a - file.format attribute on file.object 2301 | 2302 | numStr <- as.character(1:100) 2303 | DF <- data.frame(a = c(numStr, "Hello")) 2304 | write.table(DF, file = "~/tmp.csv", quote = FALSE, sep = ",") 2305 | ff <- file("~/tmp.csv") 2306 | 2307 | attr(ff, "file.format") <- list(colClasses = c(a = "character")) 2308 | 2309 | tail(sqldf("select * from ff")) 2310 | 2311 | 2312 | # example 8b - using file.format argument 2313 | 2314 | numStr <- as.character(1:100) 2315 | DF <- data.frame(a = c(numStr, "Hello")) 2316 | write.table(DF, file = "~/tmp.csv", quote = FALSE, sep = ",") 2317 | ff <- file("~/tmp.csv") 2318 | 2319 | tail(sqldf("select * from ff", 2320 | file.format = list(colClasses = c(a = "character")))) 2321 | ``` 2322 | 2323 | Example 9. 
Working with Databases[](#Example_9.__Working_with_Databases) 2324 | ------------------------------------------------------------------------ 2325 | 2326 | sqldf is usually used to operate on data frames but it can be used to 2327 | store a table in a database and repeatedly query it in subsequent sqldf 2328 | statements (although in that case you might be better off just using 2329 | RSQLite or other database directly). There are two ways to do this. In 2330 | this Example section we show how to do it using the fact that if you 2331 | specify the database explicitly then it does not delete the database at 2332 | the end and if you create a table explicitly using create table then it 2333 | does not delete the table (however, note that that will result in 2334 | duplicate tables in the database so it will take up twice as much space 2335 | as one table). A second way to do this is to use persistent connections 2336 | as shown in the Example section after this one. 2337 | 2338 | ```r 2339 | # create new empty database called mydb 2340 | sqldf("attach 'mydb' as new") 2341 | 2342 | # create a new table, mytab, in the new database 2343 | # Note that sqldf does not delete tables created from create. 2344 | sqldf("create table mytab as select * from BOD", dbname = "mydb") 2345 | 2346 | # shows its still there 2347 | sqldf("select * from mytab", dbname = "mydb") 2348 | 2349 | # read a file into the mydb data base using read.csv.sql without deleting it 2350 | # 2351 | # 1. First create a test file. 2352 | # 2. Then read it into the mydb database we created using the sqldf("attach...") above. 2353 | # Since sqldf automatically cleans up after itself we hide 2354 | # the table creation in an sql statement so table is not deleted. 2355 | # 3. Finally list the table names in the database. 
2356 | 2357 | write.table(BOD, file = "~/tmp.csv", quote = FALSE, sep = ",") 2358 | read.csv.sql("~/tmp.csv", sql = "create table mytab as select * from file", 2359 | dbname = "mydb") 2360 | sqldf("select * from sqlite_master", dbname = "mydb") 2361 | ``` 2362 | 2363 | Example 10. Persistent Connections[](#Example_10._Persistent_Connections) 2364 | ------------------------------------------------------------------------- 2365 | 2366 | These three examples show the use of persistent connections in sqldf. 2367 | This would be used when one has a large database that one wants to store 2368 | and then make queries from so that one does not have to reload it on 2369 | each execution of sqldf. (Note that if one just needs a series of sql 2370 | statements ending in a single query an alternative would be just to use 2371 | a vector of sql statements in a single sqldf call.) 2372 | 2373 | ```r 2374 | > # Example 10a. 2375 | > 2376 | > # create test .csv file of just 3 records (same as example 6) 2377 | > write.table(head(iris, 3), "iris3.dat", sep = ",", quote = FALSE) 2378 | > # set up file connection 2379 | > iris3 <- file("iris3.dat") 2380 | > # creates connection so in memory database persists after sqldf call 2381 | > sqldf() 2382 | 2383 | > 2384 | > # uses connection just created 2385 | > sqldf('select * from iris3 where "Sepal.Width" > 3') 2386 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2387 | 1 5.1 3.5 1.4 0.2 setosa 2388 | 2 4.7 3.2 1.3 0.2 setosa 2389 | > # we now have iris3 variable in R workspace and an iris3 table 2390 | > # so ensure sqldf uses the one in the main database by writing 2391 | > # main.iris3. (Another possibility here would have been to 2392 | > # delete the iris3 variable from the R workspace to avoid the 2393 | > # ambiguity -- in that case one could just write iris3 instead 2394 | > # of main.iris3.) 
2395 | > sqldf('select * from main.iris3 where "Sepal.Width" = 3') 2396 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2397 | 1 4.9 3 1.4 0.2 setosa 2398 | > 2399 | > # close 2400 | > sqldf() 2401 | NULL 2402 | 2403 | > # Example 10b. 2404 | > # 2405 | > # Here is another way to do example 10a. We use the same iris3, 2406 | > # iris3.dat and sqldf development version as above. 2407 | > # We grab connection explicitly, set up the database using sqldf and then 2408 | > # for the second call we call dbGetQuery from RSQLite. 2409 | > # In that case we don't need to qualify iris3 as main.iris3 since 2410 | > # RSQLite would not understand R variables anyways so there is no 2411 | > # ambiguity. 2412 | 2413 | > con <- sqldf() 2414 | > 2415 | > # uses connection just created 2416 | > sqldf('select * from iris3 where "Sepal.Width" > 3') 2417 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2418 | 1 5.1 3.5 1.4 0.2 setosa 2419 | 2 4.7 3.2 1.3 0.2 setosa 2420 | > dbGetQuery(con, 'select * from iris3 where "Sepal.Width" = 3') 2421 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species 2422 | 1 4.9 3 1.4 0.2 setosa 2423 | > 2424 | > # close 2425 | > sqldf() 2426 | NULL 2427 | ``` 2428 | 2429 | Here is an example of reading a csv file using read.csv.sql and then 2430 | reading it again using a persistent connection: 2431 | 2432 | ```r 2433 | # Example 10c. 2434 | 2435 | write.table(iris, "iris.csv", sep = ",", quote = FALSE) 2436 | 2437 | sqldf() 2438 | read.csv.sql("iris.csv", sql = "select count(*) from file") 2439 | 2440 | # now re-read it from the sqlite database 2441 | dd <- sqldf("select * from file") 2442 | 2443 | # now close the connection and destroy the database 2444 | sqldf() 2445 | ``` 2446 | 2447 | Example 11. 
Between and Alternatives[](#Example_11._Between_and_Alternatives) 2448 | ----------------------------------------------------------------------------- 2449 | 2450 | ```r 2451 | # example thanks to Michael Rehberg 2452 | # 2453 | # build sample dataframes 2454 | seqdf <- data.frame(thetime=seq(100,225,5),thevalue=factor(letters)) 2455 | boundsdf <- data.frame(thestart=c(110,160,200),theend=c(130,180,220),groupID=c(555,666,777)) 2456 | 2457 | # run the query using two inequalities 2458 | testquery_1 <- sqldf("select seqdf.thetime, seqdf.thevalue, boundsdf.groupID 2459 | from seqdf left join boundsdf on (seqdf.thetime <= boundsdf.theend) and (seqdf.thetime >= boundsdf.thestart)") 2460 | 2461 | # run the same query using 'between...and' clause 2462 | testquery_2 <- sqldf("select seqdf.thetime, seqdf.thevalue, boundsdf.groupID 2463 | from seqdf LEFT JOIN boundsdf ON (seqdf.thetime BETWEEN boundsdf.thestart AND boundsdf.theend)") 2464 | ``` 2465 | 2466 | Example 12. Combine two files in permanent database[](#Example_12._Combine_two_files_in_permanent_database) 2467 | ----------------------------------------------------------------------------------------------------------- 2468 | 2469 | When we issue a series of normal `sqldf` statements after each one sqldf 2470 | automatically removes any tables and databases it creates in that 2471 | statement; however, it does not know about ones that `sqlite` creates so 2472 | a database created using `attach` and the tables created using 2473 | `create table` won't be deleted. 2474 | 2475 | Also if `sqldf` is used without the `x=` argument (omitting x= denotes 2476 | the opening of a persistent connection) then objects created in the 2477 | database including those by `sqldf` and `sqlite` are not deleted when 2478 | the persistent connection is destroyed by the next `sqldf` statement 2479 | with no `x=` argument. 
2480 | 2481 | If we have forgotten whether we have a connection open or not we can 2482 | check either of these: 2483 | 2484 | ```r 2485 | dbListConnections(SQLite()) # from DBI 2486 | 2487 | getOption("sqldf.connection") # set by sqldf 2488 | ``` 2489 | 2490 | Here is an example that illustrates part of the above. See the prior 2491 | examples for more. 2492 | 2493 | ```r 2494 | > # set up some test data 2495 | > write.table(head(iris, 3), "irishead.dat", sep = ",", quote = FALSE) 2496 | > write.table(tail(iris, 3), "iristail.dat", sep = ",", quote = FALSE) 2497 | > 2498 | > library(sqldf) 2499 | > 2500 | > # create new empty database called mydb 2501 | > sqldf("attach 'mydb' as new") 2502 | NULL 2503 | > 2504 | > irishead <- file("irishead.dat") 2505 | > iristail <- file("iristail.dat") 2506 | > 2507 | > # read tables into mydb 2508 | > sqldf("select count(*) from irishead", dbname = "mydb") 2509 | count(*) 2510 | 1 3 2511 | > sqldf("select count(*) from iristail", dbname = "mydb") 2512 | count(*) 2513 | 1 3 2514 | > 2515 | > # get count of all records from union 2516 | > sqldf('select count(*) from (select * from main.irishead 2517 | + union 2518 | + select * from main.iristail)', dbname = "mydb") 2519 | count(*) 2520 | 1 6 2521 | ``` 2522 | 2523 | Example 13. read.csv.sql and read.csv2.sql[](#Example_13._read.csv.sql_and_read.csv2.sql) 2524 | ----------------------------------------------------------------------------------------- 2525 | 2526 | `read.csv.sql` is an interface to `sqldf` that works like `read.csv` in 2527 | R except that it also provides an `sql=` argument and not all of the 2528 | other arguments of `read.csv` are supported. It uses (1) SQLite's import 2529 | facility via RSQLite to read the input file into a temporary disk-based 2530 | SQLite database which is created on the fly. (2) Then it uses the 2531 | provided SQL statement to read the table so created into R. 
As the first 2532 | step imports the data directly into SQLite without going through R it 2533 | can handle larger files than R itself can handle as long as the SQL 2534 | statement filters it to a size that R can handle. Here is Example 6c 2535 | redone using this facility: 2536 | 2537 | ```r 2538 | # Example 13a. 2539 | library(sqldf) 2540 | 2541 | write.table(iris, "iris.csv", sep = ",", quote = FALSE, row.names = FALSE) 2542 | iris.csv <- read.csv.sql("iris.csv", 2543 | sql = 'select * from file where "Sepal.Length" > 5') 2544 | 2545 | # Example 13b. read.csv2.sql. Commas are decimals and ; is sep. 2546 | 2547 | library(sqldf) 2548 | Lines <- "Sepal.Length;Sepal.Width;Petal.Length;Petal.Width;Species 2549 | 5,1;3,5;1,4;0,2;setosa 2550 | 4,9;3;1,4;0,2;setosa 2551 | 4,7;3,2;1,3;0,2;setosa 2552 | 4,6;3,1;1,5;0,2;setosa 2553 | " 2554 | cat(Lines, file = "iris2.csv") 2555 | 2556 | iris.csv2 <- read.csv2.sql("iris2.csv", sql = 'select * from file where "Sepal.Length" > 5') 2557 | 2558 | # Example 13c. Use of filter= to process fixed field widths. 2559 | 2560 | # This example assumes gawk is available for use as a filter: 2561 | # https://www.icewalkers.com/Linux/Software/514530/Gawk.html 2562 | # https://gnuwin32.sourceforge.net/packages/gawk.htm 2563 | 2564 | library(sqldf) 2565 | cat("112333 2566 | 123456", file = "fixed.dat") 2567 | cat('BEGIN { FIELDWIDTHS = "2 1 3"; OFS = ","; print "A,B,C" } 2568 | { $1 = $1; print }', file = "fixed.awk") 2569 | 2570 | # the following worked on Windows Vista. One user told me that it only worked if he 2571 | # omitted the eol= argument so try it both ways on your system and use the way that 2572 | # works for your system. 2573 | 2574 | fixed <- read.csv.sql("fixed.dat", eol = "\n", filter = "gawk -f fixed.awk") 2575 | 2576 | # Example 13d. 
Read a csv file into the database but do not drop the database or table 2577 | 2578 | # create test file 2579 | write.table(iris, "iris.csv", sep = ",", quote = FALSE, row.names = FALSE) 2580 | 2581 | # create an empty database (can skip this step if database already exists) 2582 | sqldf("attach mytestdb as new") 2583 | 2584 | # read into table called iris in the mytestdb sqlite database 2585 | read.csv.sql("iris.csv", sql = "create table main.iris as select * from file", dbname = "mytestdb") 2586 | 2587 | # look at first three lines 2588 | sqldf("select * from main.iris limit 3", dbname = "mytestdb") 2589 | 2590 | # example 13e. Read in only column j of a csv file where j may vary. 2591 | 2592 | library(sqldf) 2593 | 2594 | # create test data file 2595 | nms <- names(anscombe) 2596 | write.table(anscombe, "anscombe.dat", sep = ",", quote = FALSE, 2597 | row.names = FALSE) 2598 | 2599 | j <- 2 2600 | DF2 <- fn$read.csv.sql("anscombe.dat", sql = "select `nms[j]` from file") 2601 | ``` 2602 | 2603 | Also see this 2604 | [example](https://stat.ethz.ch/pipermail/r-help/2010-November/260931.html) 2605 | and this further 2606 | [example](https://stackoverflow.com/questions/6966723/how-to-allocate-append-a-large-column-of-date-objects-to-a-data-frame/6966771#6966771). 2607 | The latter illustrates the use of the `method=` argument. 2608 | 2609 | Example 14. Use of spatialite library functions[](#Example_14._Use_of_spatialite_library_functions) 2610 | --------------------------------------------------------------------------------------------------- 2611 | 2612 | ******This example needs to be revised as automatic loading of 2613 | spatialite has been removed from sqldf and replaced with the functions 2614 | in RSQLite.extfuns which are loaded instead****** 2615 | 2616 | This example will only work if spatialite-1.dll is on your PATH. It 2617 | shows accessing a function in that dll. Other than placing it on your 2618 | PATH there is no other setup needed. 
(Note that libspatialite-1.dll is 2619 | only looked up the first time sqldf runs in a session so you should be 2620 | sure that it has been put there before starting sqldf.) 2621 | 2622 | ```r 2623 | > library(sqldf) 2624 | > # stddev_pop is a function in spatialite library similar to sd in R 2625 | > # Note bug: spatialite has stddev_pop and stddev_samp reversed and ditto for var_pop and var_samp. More on bug at: 2626 | > # https://groups.google.com/group/spatialite-users/msg/182f1f629c922607 2627 | > sqldf("select avg(demand), stddev_pop(demand) from BOD") 2628 | avg(demand) stddev_pop(demand) 2629 | 1 14.83333 4.630623 2630 | > c(mean(BOD$demand), sd(BOD$demand)) 2631 | [1] 14.833333 4.630623 2632 | ``` 2633 | 2634 | Example 15. Use of RSQLite.extfuns library functions[](#Example_15._Use_of_RSQLite.extfuns_library_functions) 2635 | ------------------------------------------------------------------------------------------------------------- 2636 | The RSQLite R package includes Liam Healy's extension functions for SQLite. 2637 | In addition to all the [core 2638 | functions](https://www.sqlite.org/lang_corefunc.html), [date 2639 | functions](https://www.sqlite.org/lang_datefunc.html) and [aggregate 2640 | functions](https://www.sqlite.org/lang_aggfunc.html) that SQLite itself 2641 | provides, the following extension functions are available for use within 2642 | SQL select statements: **Math:** acos, asin, atan, atn2, atan2, acosh, 2643 | asinh, atanh, difference, degrees, radians, cos, sin, tan, cot, cosh, 2644 | sinh, tanh, coth, exp, log, log10, power, sign, sqrt, square, ceil, 2645 | floor, pi. **String:** replicate, charindex, leftstr, rightstr, ltrim, 2646 | rtrim, trim, replace, reverse, proper, padl, padr, padc, strfilter. 2647 | **Aggregate:** stdev, variance, mode, median, lower\_quartile, 2648 | upper\_quartile. See the bottom of 2649 | [https://www.sqlite.org/contrib/](https://www.sqlite.org/contrib/) for 2650 | more info on these extension functions. 
2651 | 2652 | ```r 2653 | > sqldf("select avg(demand) mean, variance(demand) var from BOD") 2654 | mean var 2655 | 1 14.83333 21.44267 2656 | > var(BOD$demand) 2657 | [1] 21.44267 2658 | ``` 2659 | 2660 | Example 16. Moving Average[](#Example_16._Moving_Average) 2661 | --------------------------------------------------------- 2662 | 2663 | This is a simplified version of the example in this [r-help 2664 | post](https://stat.ethz.ch/pipermail/r-help/2010-August/249996.html). 2665 | Here we compute the moving average of x for the 3rd to 9th preceding 2666 | values of each date performing it separately for each illness. 2667 | 2668 | ```r 2669 | > Lines <- "date illness x 2670 | + 2006/01/01 DERM 319 2671 | + 2006/01/02 DERM 388 2672 | + 2006/01/03 DERM 336 2673 | + 2006/01/04 DERM 255 2674 | + 2006/01/05 DERM 177 2675 | + 2006/01/06 DERM 377 2676 | + 2006/01/07 DERM 113 2677 | + 2006/01/08 DERM 253 2678 | + 2006/01/09 DERM 316 2679 | + 2006/01/10 DERM 187 2680 | + 2006/01/11 DERM 292 2681 | + 2006/01/12 DERM 275 2682 | + 2006/01/13 DERM 355 2683 | + 2006/01/01 FEVER 3190 2684 | + 2006/01/02 FEVER 3880 2685 | + 2006/01/03 FEVER 3360 2686 | + 2006/01/04 FEVER 2550 2687 | + 2006/01/05 FEVER 1770 2688 | + 2006/01/06 FEVER 3770 2689 | + 2006/01/07 FEVER 1130 2690 | + 2006/01/08 FEVER 2530 2691 | + 2006/01/09 FEVER 3160 2692 | + 2006/01/10 FEVER 1870 2693 | + 2006/01/11 FEVER 2920 2694 | + 2006/01/12 FEVER 2750 2695 | + 2006/01/13 FEVER 3550" 2696 | > 2697 | > DF <- read.table(textConnection(Lines), header = TRUE) 2698 | > DF$date <- as.Date(DF$date) 2699 | > 2700 | > sqldf("select 2701 | + t1.date, 2702 | + avg(t2.x) mean, 2703 | + date(min(t2.date) * 24 * 60 * 60, 'unixepoch') fromdate, 2704 | + date(max(t2.date) * 24 * 60 * 60, 'unixepoch') todate, 2705 | + max(t2.illness) illness 2706 | + from DF t1, DF t2 2707 | + where julianday(t1.date) between julianday(t2.date) + 3 and 2708 | + julianday(t2.date) + 9 2709 | + and t1.illness = t2.illness 2710 | + group by 
t1.illness, t1.date 2711 | + order by t1.illness, t1.date") 2712 | date mean fromdate todate illness 2713 | 1 2006-01-04 319.0000 2006-01-01 2006-01-01 DERM 2714 | 2 2006-01-05 353.5000 2006-01-01 2006-01-02 DERM 2715 | 3 2006-01-06 347.6667 2006-01-01 2006-01-03 DERM 2716 | 4 2006-01-07 324.5000 2006-01-01 2006-01-04 DERM 2717 | 5 2006-01-08 295.0000 2006-01-01 2006-01-05 DERM 2718 | 6 2006-01-09 308.6667 2006-01-01 2006-01-06 DERM 2719 | 7 2006-01-10 280.7143 2006-01-01 2006-01-07 DERM 2720 | 8 2006-01-11 271.2857 2006-01-02 2006-01-08 DERM 2721 | 9 2006-01-12 261.0000 2006-01-03 2006-01-09 DERM 2722 | 10 2006-01-13 239.7143 2006-01-04 2006-01-10 DERM 2723 | 11 2006-01-04 3190.0000 2006-01-01 2006-01-01 FEVER 2724 | 12 2006-01-05 3535.0000 2006-01-01 2006-01-02 FEVER 2725 | 13 2006-01-06 3476.6667 2006-01-01 2006-01-03 FEVER 2726 | 14 2006-01-07 3245.0000 2006-01-01 2006-01-04 FEVER 2727 | 15 2006-01-08 2950.0000 2006-01-01 2006-01-05 FEVER 2728 | 16 2006-01-09 3086.6667 2006-01-01 2006-01-06 FEVER 2729 | 17 2006-01-10 2807.1429 2006-01-01 2006-01-07 FEVER 2730 | 18 2006-01-11 2712.8571 2006-01-02 2006-01-08 FEVER 2731 | 19 2006-01-12 2610.0000 2006-01-03 2006-01-09 FEVER 2732 | 20 2006-01-13 2397.1429 2006-01-04 2006-01-10 FEVER 2733 | ``` 2734 | 2735 | Because of the date processing this is a bit more conveniently done in 2736 | H2 with its support of date class. Using the same `DF` that we just 2737 | defined. Note that SQL functions like AVG and MIN must be written in 2738 | upper case when using H2. 
2739 | 2740 | ```r 2741 | > library(RH2) 2742 | > sqldf("select 2743 | + t1.date, 2744 | + AVG(t2.x) mean, 2745 | + MIN(t2.date) fromdate, 2746 | + MAX(t2.date) todate, 2747 | + t2.illness illness 2748 | + from DF t1, DF t2 2749 | + where t1.date between t2.date + 3 and t2.date + 9 2750 | + and t1.illness = t2.illness 2751 | + group by t1.illness, t1.date 2752 | + order by t1.illness, t1.date") 2753 | date mean fromdate todate illness 2754 | 1 2006-01-04 319 2006-01-01 2006-01-01 DERM 2755 | 2 2006-01-05 353 2006-01-01 2006-01-02 DERM 2756 | 3 2006-01-06 347 2006-01-01 2006-01-03 DERM 2757 | 4 2006-01-07 324 2006-01-01 2006-01-04 DERM 2758 | 5 2006-01-08 295 2006-01-01 2006-01-05 DERM 2759 | 6 2006-01-09 308 2006-01-01 2006-01-06 DERM 2760 | 7 2006-01-10 280 2006-01-01 2006-01-07 DERM 2761 | 8 2006-01-11 271 2006-01-02 2006-01-08 DERM 2762 | 9 2006-01-12 261 2006-01-03 2006-01-09 DERM 2763 | 10 2006-01-13 239 2006-01-04 2006-01-10 DERM 2764 | 11 2006-01-04 3190 2006-01-01 2006-01-01 FEVER 2765 | 12 2006-01-05 3535 2006-01-01 2006-01-02 FEVER 2766 | 13 2006-01-06 3476 2006-01-01 2006-01-03 FEVER 2767 | 14 2006-01-07 3245 2006-01-01 2006-01-04 FEVER 2768 | 15 2006-01-08 2950 2006-01-01 2006-01-05 FEVER 2769 | 16 2006-01-09 3086 2006-01-01 2006-01-06 FEVER 2770 | 17 2006-01-10 2807 2006-01-01 2006-01-07 FEVER 2771 | 18 2006-01-11 2712 2006-01-02 2006-01-08 FEVER 2772 | 19 2006-01-12 2610 2006-01-03 2006-01-09 FEVER 2773 | 20 2006-01-13 2397 2006-01-04 2006-01-10 FEVER 2774 | ``` 2775 | 2776 | Another example which varies somewhat from a strict moving average can 2777 | be found [in this 2778 | post](https://stat.ethz.ch/pipermail/r-help/2011-June/280081.html). 2779 | 2780 | Example 17. Lag[](#Example_17._Lag) 2781 | ----------------------------------- 2782 | 2783 | The following example contributed by Søren Højsgaard shows how to lag a 2784 | column. 
2785 | 2786 | ```r 2787 | ## Create a lagged variable for grouped data 2788 | ## ----------------------------------------- 2789 | # Meaning that in the i'th row we not only have y[i] but also y[i-1]. 2790 | # This is done on a groupwise basis 2791 | library(sqldf) 2792 | set.seed(123) 2793 | DF <- data.frame(id=rep(1:2, each=5), tvar=rep(1:5,2), y=rnorm(1:10)) 2794 | # Data with lagged variable added 2795 | BB <- 2796 | sqldf("select A.id, A.tvar, A.y, B.y as lag 2797 | from DF as A join DF as B 2798 | where A.rowid-1 = B.rowid and A.id=B.id 2799 | order by A.id, A.tvar") 2800 | # Merge with original data: 2801 | DD <- 2802 | sqldf("select DF.*, BB.lag 2803 | from DF left join BB 2804 | on DF.id=BB.id and DF.tvar=BB.tvar") 2805 | # Do it all in one step: 2806 | DD <- 2807 | sqldf("select DF.*, BB.lag 2808 | from DF left join 2809 | ( 2810 | select A.id, A.tvar, A.y, B.y as lag 2811 | from DF as A join DF as B 2812 | where A.rowid-1 = B.rowid and A.id=B.id 2813 | order by A.id, A.tvar 2814 | ) as BB 2815 | on DF.id=BB.id and DF.tvar=BB.tvar") 2816 | ``` 2817 | 2818 | In PostgreSQL's 2819 | [window](https://developer.postgresql.org/pgdocs/postgres/tutorial-window.html) 2820 | [functions](https://developer.postgresql.org/pgdocs/postgres/functions-window.html) 2821 | (similar to R's `ave` function) makes reference to other rows 2822 | particularly easy. Below we repeat the SQLite example in PostgreSQL 2823 | (except that the following fills with NA): 2824 | 2825 | ```r 2826 | # Be sure PostgreSQL is installed and running. 2827 | 2828 | library(RPostgreSQL) 2829 | library(sqldf) 2830 | sqldf("select *, lag(y) over (partition by id order by tvar) from DF") 2831 | ``` 2832 | 2833 | Example 18. 
MySQL Schema Information[](#Example_18._MySQL_Schema_Information) 2834 | ----------------------------------------------------------------------------- 2835 | 2836 | ```r 2837 | library(RMySQL) 2838 | library(sqldf) 2839 | sqldf("show databases") 2840 | sqldf("show tables") 2841 | ``` 2842 | 2843 | The following SQL statements to query the MySQL table schemas are taken 2844 | from the [blog of Christophe 2845 | Ladroue](https://chrisladroue.com/2012/03/a-graphical-overview-of-your-mysql-database/): 2846 | 2847 | ```r 2848 | library(RMySQL) 2849 | library(sqldf) 2850 | 2851 | # list each schema and its length 2852 | sqldf("SELECT TABLE_SCHEMA,SUM(DATA_LENGTH) SCHEMA_LENGTH 2853 | FROM information_schema.TABLES 2854 | WHERE TABLE_SCHEMA!='information_schema' 2855 | GROUP BY TABLE_SCHEMA") 2856 | 2857 | # list each table in each schema and some info about it 2858 | sqldf("SELECT TABLE_SCHEMA,TABLE_NAME,TABLE_ROWS,DATA_LENGTH 2859 | FROM information_schema.TABLES 2860 | WHERE TABLE_SCHEMA!='information_schema'") 2861 | ``` 2862 | 2863 | The following SQL statement to query the MySQL table schemas are taken 2864 | from [the MySQL Performance 2865 | Blog](https://www.percona.com/blog/2008/03/17/researching-your-mysql-table-sizes/): 2866 | 2867 | ```r 2868 | # Find total number of tables, rows, total data in index size 2869 | sqldf("SELECT count(*) tables, 2870 | concat(round(sum(table_rows)/1000000,2),'M') rows, 2871 | concat(round(sum(data_length)/(1024*1024*1024),2),'G') data, 2872 | concat(round(sum(index_length)/(1024*1024*1024),2),'G') idx, 2873 | concat(round(sum(data_length+index_length)/(1024*1024*1024),2),'G') total_size, 2874 | round(sum(index_length)/sum(data_length),2) idxfrac 2875 | FROM information_schema.TABLES") 2876 | 2877 | # find biggest databases 2878 | sqldf("SELECT 2879 | count(*) tables, 2880 | table_schema,concat(round(sum(table_rows)/1000000,2),'M') rows, 2881 | concat(round(sum(data_length)/(1024*1024*1024),2),'G') data, 2882 | 
concat(round(sum(index_length)/(1024*1024*1024),2),'G') idx, 2883 | concat(round(sum(data_length+index_length)/(1024*1024*1024),2),'G') total_size, 2884 | round(sum(index_length)/sum(data_length),2) idxfrac 2885 | FROM information_schema.TABLES 2886 | GROUP BY table_schema 2887 | ORDER BY sum(data_length+index_length) DESC LIMIT 10") 2888 | 2889 | # data distribution by storage engine 2890 | sqldf("SELECT engine, 2891 | count(*) tables, 2892 | concat(round(sum(table_rows)/1000000,2),'M') rows, 2893 | concat(round(sum(data_length)/(1024*1024*1024),2),'G') data, 2894 | concat(round(sum(index_length)/(1024*1024*1024),2),'G') idx, 2895 | concat(round(sum(data_length+index_length)/(1024*1024*1024),2),'G') total_size, 2896 | round(sum(index_length)/sum(data_length),2) idxfrac 2897 | FROM information_schema.TABLES 2898 | GROUP BY engine 2899 | ORDER BY sum(data_length+index_length) DESC LIMIT 10") 2900 | ``` 2901 | 2902 | Links[](#Links) 2903 | =============== 2904 | 2905 | [Visual Representation of SQL 2906 | Joins](https://www.codeproject.com/Articles/33052/Visual-Representation-of-SQL-Joins) 2907 | -------------------------------------------------------------------------------- /demo/00Index: -------------------------------------------------------------------------------- 1 | sqldf-groupchoose Choose one element from each group. 2 | sqldf-unitTests Run unit tests using svUnit package. 
3 | -------------------------------------------------------------------------------- /demo/sqldf-groupchoose.R: -------------------------------------------------------------------------------- 1 | 2 | Lines <- "DeployID Date.Time LocationQuality Latitude Longitude 3 | 4 | STM05-1 28/02/2005 17:35 Good -35.562 177.158 5 | STM05-1 28/02/2005 19:44 Good -35.487 177.129 6 | STM05-1 28/02/2005 23:01 Unknown -35.399 177.064 7 | STM05-1 01/03/2005 07:28 Unknown -34.978 177.268 8 | STM05-1 01/03/2005 18:06 Poor -34.799 177.027 9 | STM05-1 01/03/2005 18:47 Poor -34.85 177.059 10 | STM05-2 28/02/2005 12:49 Good -35.928 177.328 11 | STM05-2 28/02/2005 21:23 Poor -35.926 177.314 12 | " 13 | 14 | ################################# 15 | ## in R 16 | ################################# 17 | 18 | library(chron) 19 | # in next line replace textConnection(Lines) with "myfile.dat" 20 | DF <- read.table(textConnection(Lines), skip = 1, as.is = TRUE, 21 | col.names = c("Id", "Date", "Time", "Quality", "Lat", "Long")) 22 | 23 | DF2 <- transform(DF, 24 | Date = chron(Date, format = "d/m/y"), 25 | Time = times(paste(Time, "00", sep = ":")), 26 | Quality = factor(Quality, levels = c("Good", "Poor", "Unknown"))) 27 | 28 | o <- order(DF2$Date, as.numeric(DF2$Quality), abs(DF2$Time - times("12:00:00"))) 29 | DF2 <- DF2[o,] 30 | 31 | DF2[tapply(row.names(DF2), DF2$Date, head, 1), ] 32 | 33 | # The last line above could alternately be written like this: 34 | do.call("rbind", by(DF2, DF2$Date, head, 1)) 35 | 36 | ################################# 37 | ## in sqldf 38 | ################################# 39 | 40 | DFo <- sqldf("select * from DF order by 41 | substr(Date, 7, 4) || substr(Date, 4, 2) || substr(Date, 1, 2) DESC, 42 | Quality DESC, 43 | abs(substr(Time, 1, 2) + substr(Time, 4, 2) /60 - 12) DESC") 44 | sqldf("select * from DFo group by Date") 45 | 46 | ################################# 47 | # Another way to do it also using sqldf is via nested selects like this using 48 | # the same DF as 
above 49 | ################################# 50 | 51 | sqldf("select * from DF u 52 | where abs(substr(Time, 1, 2) + substr(Time, 4, 2) /60 - 12) = 53 | (select min(abs(substr(Time, 1, 2) + substr(Time, 4, 2) /60 - 12)) 54 | from DF x where Quality = 55 | (select min(Quality) from DF y 56 | where x.Date = y.Date) and x.Date = u.Date)") 57 | 58 | 59 | -------------------------------------------------------------------------------- /demo/sqldf-unitTests.R: -------------------------------------------------------------------------------- 1 | library(sqldf) 2 | library(svUnit) 3 | sqldf.tests <- system.file("unitTests", "runit.all.R", package = "sqldf") 4 | cat("Running:", sqldf.tests, "\n") 5 | source(sqldf.tests) 6 | clearLog() 7 | test.all() 8 | Log() 9 | -------------------------------------------------------------------------------- /inst/NEWS: -------------------------------------------------------------------------------- 1 | Version 0.4-11 2 | 3 | o modifications for RSQLite 2.0.0 4 | 5 | o read.csv.sql now accepts https and ftps 6 | 7 | Version 0.4-10 8 | 9 | o more modifications for RSQLite 1.0.0 10 | 11 | Version 0.4-9 12 | 13 | o modifications for RSQLite 1.0.0 14 | 15 | Version 0.4-8 16 | 17 | o new R option "sqldf.RPostgresSQL.other" 18 | 19 | Version 0.4-7.1 20 | 21 | o bug fix involving dates; also added corresponding unit test 22 | 23 | Version 0.4-7 24 | 25 | o misc changes to satisfy R CMD CHECK in R 3.0.2 26 | 27 | Version 0.4-6.5 28 | 29 | o fixed the a2r/a2s unit test 30 | 31 | Version 0.4-6.4 32 | 33 | o uses ByteCompile hence dependence on R 2.14.0 or higher 34 | 35 | o now uses strapplyc if tcltk available hence dependence on gsubfn 0.6 or higher 36 | 37 | o fixed time zone bug in dealing with POSIXct 38 | 39 | o added MASS to Suggests to pass R CMD check in R 2.15.0 40 | 41 | o RSQLite and RSQLite.extfuns namespaces are loaded at startup if there 42 | are no other drivers supported by sqldf currently on search path 43 | 44 | Version 0.4-6.1 45 
| 46 | o message regarding RPostgreSQL was not issued when RPostgreSQL is loaded 47 | 48 | Version 0.4-6 49 | 50 | o support for the RPostgreSQL driver was added 51 | 52 | o RSQLite and RSQLite.extfuns have been moved from Depends to both Imports and 53 | Suggests and are loaded only if actually used. 54 | 55 | o any database backend specified via drv= or the "sqldf.driver" option is 56 | loaded if its not already loaded. Previously user was responsible for loading. 57 | 58 | o the drv= argument of sqldf and the "sqldf.driver" option can specify either 59 | the database name or the driver name (i.e. SQLite or RSQLite, H2 or RH2, 60 | etc.) 61 | 62 | o a possible bug regarding checking for tcltk was fixed. This makes it more 63 | likely that substitute R code will be used rather than giving an error in 64 | the absence of tcltk 65 | 66 | o unit tests upgraded to latest versions of drivers 67 | 68 | Version 0.4-5 69 | 70 | o fixed bug that occurred when times class columns were in output 71 | 72 | o added INSTALL file 73 | 74 | o improve tcltk checking. 75 | 76 | Version 0.4-4 77 | 78 | o NAMESPACE added 79 | 80 | o fixed an erroneous example in ?sqldf 81 | 82 | o appends the sqldf library directory to AWKPATH. This allows gawk calls in 83 | the filter argument of read.csv.sql to refer to awk programs that are 84 | supplied with sqldf without supplying the path. (See discussion of AWKPATH 85 | in the gawk documentation.) 86 | 87 | o sqldf includes an awk program, csv.awk, to process csv files for use as 88 | a filter with read.csv.sql when its necessary to handle commas within quoted 89 | fields and quotes within quoted fields (such quotes must be doubled). The 90 | following arguments (defaults shown) can be passed to csv.awk: isep = ",", 91 | osep = "\"", quote "\"", escape = "\"", newline = "\\n", trim = 1. Note that 92 | the escape character is used if quotes are within quotes. Newlines are also 93 | allowed in fields between quotes. 
If trim=1 then whitespace is trimmed from 94 | beginning and end of fields. 95 | 96 | # write out test data and then read it back using read.csv.sql 97 | cat('A,B\n"Joe, Jr.","Mac ""The Knife"""\n', file = "temp.csv") 98 | temp.out <- read.csv.sql("temp.csv", sep = ";", eol = "\n", 99 | filter = "gawk -v osep=; -f csv.awk") 100 | 101 | (In most cases there are no commas or quotes in fields so this filter is not needed. 102 | Also note that the above example assumes Windows console syntax and 103 | the corresponding shell syntax needs to be used instead on UNIX.) 104 | 105 | o changes to eliminate the NOTE produced during build/INSTALL on Linux. 106 | 107 | Version 0.4-3 108 | 109 | o dbname = NULL can be used to force default behavior 110 | 111 | o in read.csv.sql and read.csv2.sql the file argument may optionally be a URL 112 | 113 | o in read.csv.sql and read.csv2.sql the file argument may be omitted (or 114 | equivalently NULL or NA or "") if filter is specified and no file is to 115 | be input to the filter. 116 | 117 | Version 0.4-2 118 | 119 | o read.csv.sql and read.csv2.sql now handle dbWriteTable's nrows and 120 | field.types arguments. (Previously sqldf did but not these two routines.) 121 | 122 | Version 0.4-1.2 123 | 124 | o bug fixes 125 | 126 | Version 0.4-1.1 127 | 128 | o bug fixes 129 | 130 | Version 0.4-1 131 | 132 | o startup message giving database that will be used as default (if not sqlite) 133 | 134 | Version 0.4-0 135 | 136 | o the heuristic which assigns classes to columns for the data frame output 137 | from sqldf has been improved 138 | 139 | o method argument can now be a list of two transformations. The first 140 | component corresponds to the old method and can be a keyword ("auto", "raw" 141 | or "name__class"), a function or a character vector of class names. The 142 | second may be a function which is used to transform all data frames input 143 | to the data base prior to sending them to the data base. 
144 | 145 | o automatic searching for spatialite is no longer performed. It seems less 146 | useful now that RSQLite.extfuns is automatically loaded. To use 147 | the spatialite extension specify its path using the dll argument. 148 | 149 | o sqldf has a new optional argument, verbose, which gives more output if 150 | it is set to TRUE. Any other value suppresses the extra output. 151 | 152 | o filter may be a list whose first component is a command with keywords 153 | naming the second and subsequent components. These components are each 154 | specified as character vectors which are written out to temporary files 155 | of the indicated names. This can be used for specifying gawk and other 156 | filters/preprocessors of the input file without having to worry about 157 | shell interpretation of special characters. e.g. 158 | filter = list("gawk -f prog", prog = '{ gsub(",", ".") }') 159 | 160 | o MySQL is now supported (in addition to SQLite, PostgreSQL and h2). 161 | 162 | o stringsAsFactors now defaults to FALSE 163 | 164 | o unit tests have been added. To run them for sqlite do this: 165 | R CMD check sqldf 166 | or: 167 | demo("sqldf-unitTests") 168 | To run for all 4 databases: 169 | demo("sqldf-unitTests") # sqlite 170 | library(RH2); demo("sqldf-unitTests"); detach() 171 | library(RMySQL); demo("sqldf-unitTests"); detach() 172 | library(RpgSQL); demo("sqldf-unitTests"); detach() 173 | 174 | Version 0.3-5 175 | 176 | o global option "sqldf.method" used as default value of method argument. If 177 | not set then default is method = "auto", as before. 178 | 179 | o extension functions from the RSQLite.extfuns package are automatically loaded 180 | 181 | o heuristic improved. It now checks for ambiguous situations and uses "raw" 182 | for those columns rather than the class of the first match. 
183 | 184 | Version 0.3-4 185 | 186 | o all SQL statements now work with H2 rather than just a subset 187 | 188 | Version 0.3-3 189 | 190 | o if RpgSQL is loaded or if sqldf drv argument is "RpgSQL" then sqldf will use 191 | PostgreSQL database 192 | 193 | o database persistence (sqldf without arguments) now works with H2 database 194 | (previously only SQLite) 195 | 196 | o bug fixes 197 | 198 | Version 0.3-2 199 | 200 | o if RH2 package is loaded then sqldf will use H2 database 201 | 202 | Version 0.2-1 203 | 204 | o bug fixes 205 | 206 | o certain Date conversions which previously required as.Date.numeric from zoo 207 | package no longer depend on zoo 208 | 209 | Version 0.2-0 210 | 211 | o column names which are SQL reserved words are no longer mangled 212 | 213 | o file.format list can contain a filter which is a batch or shell command 214 | that filters the input file 215 | 216 | o read.csv2.sql is like read.csv.sql but has a default filter which translates 217 | each comma in the input file to a dot 218 | 219 | o if getOption("sqldf.dll") defaulting to libspatialite-1.dll is found on the PATH 220 | then it will be loaded as an SQLite loadable extension. With the default this 221 | gives access to these SQL functions: 222 | http://www.gaia-gis.it/spatialite/spatialite-sql-2.3.1.html 223 | The sqldf package does not itself include this or any other SQLite loadable 224 | extension. The user must download and place it on the PATH if they wish to 225 | use it. libspatialite-1.dll can be found here: 226 | http://www.gaia-gis.it/spatialite 227 | If sqldf does not find the dll on the PATH then sqldf will continue to work 228 | but without access to these functions. 
229 | 230 | Version 0.1-7 231 | 232 | o now supports table names with a dot in them provided that they are 233 | placed within backquotes 234 | 235 | o bug fixes (thanks to Soren Hojsgaard for bug report) 236 | 237 | Version 0.1-6 238 | 239 | o new command read.csv2.sql 240 | 241 | Version 0.1-5 242 | 243 | o minor improvement in second example in ?sqldf (thanks to Wacek Kusnierczyk) 244 | 245 | o new command read.csv.sql 246 | 247 | Version 0.1-4 248 | 249 | o removed junk files 250 | 251 | Version 0.1-3 252 | 253 | o corrected DESCRIPTION file 254 | 255 | Version 0.1-2 256 | 257 | o searches for and uses "file" objects as well as data frame objects. 258 | file.format argument or "file.format" attribute on file object 259 | is a list with arguments accepted by sqliteImportFile. 260 | [Thanks to Soren Hojsgaard for suggestion to support files.] 261 | 262 | o dots in argument list replaced with a single variable x. 263 | x can be a "character" vector in which case each component is 264 | executed by SQLite in turn and result of the last is returned. 265 | 266 | o new dbname argument can be specified. For SQLite it defaults to ":memory:", 267 | i.e. an embedded data base. If database does not exist it is created 268 | and deleted upon exit. If it does exist then only tables created by sqldf 269 | are deleted on exit but not the database itself. 270 | 271 | o improvements in sqldf.Rd 272 | 273 | o MySQL testing 274 | 275 | o added a demo illustrating sorting and grouping vs. 
nested selects 276 | 277 | o new example illustrating vector x 278 | 279 | o support for POSIXct, Date and chron dates and times 280 | 281 | Version 0.1-1 282 | 283 | o removed use of subset in favor of subscripts since codetools chokes on it 284 | 285 | o added table example to sqldf.Rd 286 | 287 | o improved explanation in sqldf.Rd 288 | 289 | Version 0.1-0 290 | 291 | o initial release 292 | 293 | -------------------------------------------------------------------------------- /inst/THANKS: -------------------------------------------------------------------------------- 1 | 2 | Thanks to: 3 | 4 | - Ben Escoto for some useful suggestions, bug reports and code suggestions. 5 | - Soren Hojsgaard for some useful suggestions and bug reports. 6 | - Jussi Lehto for pointing out a bug and suggesting a fix. 7 | - Cela-Diaz, Fernando for reporting a bug. 8 | - Vadlamani, Satis for reporting a bug. 9 | - Wacek Kusnierczyk for suggesting an improvement to 2 examples. 10 | - Jim Holtman for pointing out a bug and suggesting a fix. 11 | - Tomoaki NISHIYAMA for fixes. 12 | 13 | -------------------------------------------------------------------------------- /inst/csv.awk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/awk -f 2 | #************************************************************************** 3 | # 4 | # 2011/11/12 use OFS as output field separator 5 | # e.g. gawk -f csv.awk -v OFS=, myfile.csv 6 | # 7 | # This file is in the public domain. 8 | # 9 | # For more information email LoranceStinson+csv@gmail.com. 10 | # Or see http://lorance.freeshell.org/csv/ 11 | # 12 | # Parse a CSV string into an array. 13 | # The number of fields found is returned. 14 | # In the event of an error a negative value is returned and csverr is set to 15 | # the error. See below for the error values. 16 | # 17 | # Parameters: 18 | # string = The string to parse. 19 | # csv = The array to parse the fields into. 
20 | # sep = The field separator character. Normally , 21 | # quote = The string quote character. Normally " 22 | # escape = The quote escape character. Normally " 23 | # newline = Handle embedded newlines. Provide either a newline or the 24 | # string to use in place of a newline. If left empty embedded 25 | # newlines cause an error. 26 | # trim = When true spaces around the separator are removed. 27 | # This affects parsing. Without this a space between the 28 | # separator and quote result in the quote being ignored. 29 | # 30 | # These variables are private: 31 | # fields = The number of fields found thus far. 32 | # pos = Where to pull a field from the string. 33 | # strtrim = True when a string is found so we know to remove the quotes. 34 | # 35 | # Error conditions: 36 | # -1 = Unable to read the next line. 37 | # -2 = Missing end quote. 38 | # -3 = Missing separator. 39 | # 40 | # Notes: 41 | # The code assumes that every field is preceded by a separator, even the 42 | # first field. This makes the logic much simpler, but also requires a 43 | # separator be prepended to the string before parsing. 44 | #************************************************************************** 45 | function parse_csv(string,csv,sep,quote,escape,newline,trim, fields,pos,strtrim) { 46 | # Make sure there is something to parse. 47 | if (length(string) == 0) return 0; 48 | string = sep string; # The code below assumes ,FIELD. 49 | fields = 0; # The number of fields found thus far. 50 | while (length(string) > 0) { 51 | # Remove spaces after the separator if requested. 52 | if (trim && substr(string, 2, 1) == " ") { 53 | if (length(string) == 1) return fields; 54 | string = substr(string, 2); 55 | continue; 56 | } 57 | strtrim = 0; # Used to trim quotes off strings. 58 | # Handle a quoted field. 
59 | if (substr(string, 2, 1) == quote) { 60 | pos = 2; 61 | do { 62 | pos++ 63 | if (pos != length(string) && 64 | substr(string, pos, 1) == escape && 65 | (substr(string, pos + 1, 1) == quote || 66 | substr(string, pos + 1, 1) == escape)) { 67 | # Remove escaped quote characters. 68 | string = substr(string, 1, pos - 1) substr(string, pos + 1); 69 | } else if (substr(string, pos, 1) == quote) { 70 | # Found the end of the string. 71 | strtrim = 1; 72 | } else if (newline && pos >= length(string)) { 73 | # Handle embedded newlines if requested. 74 | if (getline == -1) { 75 | csverr = "Unable to read the next line."; 76 | return -1; 77 | } 78 | string = string newline $0; 79 | } 80 | } while (pos < length(string) && strtrim == 0) 81 | if (strtrim == 0) { 82 | csverr = "Missing end quote."; 83 | return -2; 84 | } 85 | } else { 86 | # Handle an empty field. 87 | if (length(string) == 1 || substr(string, 2, 1) == sep) { 88 | csv[fields] = ""; 89 | fields++; 90 | if (length(string) == 1) 91 | return fields; 92 | string = substr(string, 2); 93 | continue; 94 | } 95 | # Search for a separator. 96 | pos = index(substr(string, 2), sep); 97 | # If there is no separator the rest of the string is a field. 98 | if (pos == 0) { 99 | csv[fields] = substr(string, 2); 100 | fields++; 101 | return fields; 102 | } 103 | } 104 | # Remove spaces after the separator if requested. 105 | if (trim && pos != length(string) && substr(string, pos + strtrim, 1) == " ") { 106 | trim = strtrim 107 | # Count the number fo spaces found. 108 | while (pos < length(string) && substr(string, pos + trim, 1) == " ") { 109 | trim++ 110 | } 111 | # Remove them from the string. 112 | string = substr(string, 1, pos + strtrim - 1) substr(string, pos + trim); 113 | # Adjust pos with the trimmed spaces if a quotes string was not found. 114 | if (!strtrim) { 115 | pos -= trim; 116 | } 117 | } 118 | # Make sure we are at the end of the string or there is a separator. 
119 | if ((pos != length(string) && substr(string, pos + 1, 1) != sep)) { 120 | csverr = "Missing separator."; 121 | return -3; 122 | } 123 | # Gather the field. 124 | csv[fields] = substr(string, 2 + strtrim, pos - (1 + strtrim * 2)); 125 | fields++; 126 | # Remove the field from the string for the next pass. 127 | string = substr(string, pos + 1); 128 | } 129 | return fields; 130 | } 131 | 132 | BEGIN { 133 | if (osep == "") osep = ","; 134 | OFS = osep; 135 | if (isep == "") isep = ","; 136 | if (quote == "") quote = "\""; 137 | if (escape == "") escape = "\""; 138 | if (newline == "") newline = "\\n"; 139 | if (trim == "") trim = 1; 140 | } 141 | 142 | { 143 | num_fields = parse_csv($0, csv, isep, quote, escape, newline, trim); 144 | if (num_fields < 0) { 145 | printf "ERROR: %s (%d) -> %s\n", csverr, num_fields, $0; 146 | } else { 147 | printf "%s", csv[0] 148 | for (i = 1;i < num_fields;i++) { 149 | printf "%s%s", OFS, csv[i]; 150 | } 151 | printf "\n"; 152 | } 153 | } 154 | 155 | -------------------------------------------------------------------------------- /inst/trcomma2dot.vbs: -------------------------------------------------------------------------------- 1 | '' convert commas to dots 2 | '' based on code by Ekkehard Horner 3 | 4 | Option Explicit 5 | 6 | WScript.Quit miniTr() 7 | 8 | Function miniTr() 9 | Dim sSearch : sSearch = "," 10 | Dim sReplace : sReplace = "." 
11 | Do Until WScript.StdIn.AtEndOfStream 12 | Dim sInpChr : sInpChr = WScript.StdIn.Read( 1 ) 13 | Dim sOutChr : sOutChr = sInpChr 14 | Dim nPos : nPos = InStr( sSearch, sInpChr ) 15 | If 0 < nPos Then 16 | sOutChr = Mid( sReplace, nPos, 1 ) 17 | End If 18 | WScript.StdOut.Write sOutChr 19 | Loop 20 | miniTr = 0 21 | End Function 22 | -------------------------------------------------------------------------------- /inst/unitTests/runit.all.R: -------------------------------------------------------------------------------- 1 | 2 | # To run these tests: 3 | # library(sqldf) 4 | # library(svUnit) 5 | # # optionally: library(RH2), library(RMySQL) or library(RpgSQL) 6 | # # otherwise sqlite will be used. 7 | # # optionally: options(sqldf.verbose = TRUE) 8 | # runit.all <- system.file("unitTests", "runit.all.R", package = "sqldf") 9 | # source(runit.all); clearLog(); test.all() 10 | # Log() 11 | 12 | test.all <- function() { 13 | 14 | # set drv 15 | drv <- getOption("sqldf.driver") 16 | if (is.null(drv)) { 17 | drv <- if ("package:RPostgreSQL" %in% search()) { "PostgreSQL" 18 | } else if ("package:RpgSQL" %in% search()) { "pgSQL" 19 | } else if ("package:RMySQL" %in% search()) { "MySQL" 20 | } else if ("package:RH2" %in% search()) { "H2" 21 | } else "SQLite" 22 | } 23 | drv <- tolower(drv) 24 | cat("using driver:", drv, "\n") 25 | 26 | # head 27 | a1r <- head(warpbreaks) 28 | a1s <- sqldf("select * from warpbreaks limit 6") 29 | checkIdentical(a1r, a1s) 30 | 31 | # subset / like 32 | a2r <- subset(as.data.frame(CO2), grepl("^Qn", Plant)) 33 | class(a2r) <- "data.frame" 34 | if (drv == "postgresql") { # pgsql needs quotes for Plant 35 | a2s <- sqldf("select * from \"CO2\" where \"Plant\" like 'Qn%'") 36 | } else if (drv == "pgsql") { 37 | a2s <- sqldf("select * from CO2 where \"Plant\" like 'Qn%'") 38 | } else a2s <- sqldf("select * from CO2 where Plant like 'Qn%'") 39 | # checkEquals(a2r, a2s, check.attributes = FALSE) 40 | checkIdentical(a2r, a2s) 41 | 42 | data(farms, 
package = "MASS") 43 | # subset / in 44 | a3r <- subset(farms, Manag %in% c("BF", "HF")) 45 | row.names(a3r) <- NULL 46 | if (drv == "pgsql" || drv == "postgresql") { # pgsql needs quotes for Manag 47 | a3s <- sqldf("select * from farms where \"Manag\" in ('BF', 'HF')") 48 | } else { 49 | a3s <- sqldf("select * from farms where Manag in ('BF', 'HF')") 50 | } 51 | checkIdentical(a3r, a3s) 52 | 53 | # subset / multiple inequality constraints 54 | a4r <- subset(warpbreaks, breaks >= 20 & breaks <= 30) 55 | a4s <- sqldf("select * from warpbreaks where breaks between 20 and 30", 56 | row.names = TRUE) 57 | if (drv == "h2" || drv == "pgsql" || drv == "postgresql") { 58 | checkEquals(a4r, a4s, check.attributes = FALSE) 59 | } else checkIdentical(a4r, a4s) 60 | 61 | # subset 62 | a5r <- subset(farms, Mois == 'M1') 63 | if (drv == "pgsql" || drv == "postgresql") { 64 | a5s <- sqldf("select * from farms where \"Mois\" = 'M1'", 65 | row.names = TRUE) 66 | } else a5s <- sqldf("select * from farms where Mois = 'M1'", row.names = TRUE) 67 | if (drv == "sqlite") { 68 | checkIdentical(a5r, a5s) 69 | # } else if (drv != "mysql") checkEquals(a5r, a5s, check.attributes = FALSE) 70 | } else checkEquals(a5r, a5s, check.attributes = FALSE) 71 | 72 | # subset 73 | a6r <- subset(farms, Mois == 'M2') 74 | if (drv == "pgsql" || drv == "postgresql") { 75 | a6s <- sqldf("select * from farms where \"Mois\" = 'M2'", row.names = TRUE) 76 | } else { 77 | a6s <- sqldf("select * from farms where Mois = 'M2'", row.names = TRUE) 78 | } 79 | if (drv == "sqlite") { 80 | checkIdentical(a6r, a6s) 81 | } else checkEquals(a6r, a6s, check.attributes = FALSE) 82 | 83 | # rbind 84 | a7r <- rbind(a5r, a6r) 85 | a7s <- sqldf("select * from a5s union all select * from a6s") 86 | 87 | # sqldf drops the unused levels of Mois but rbind does not; however, 88 | # all data is the same and the other columns are identical 89 | row.names(a7r) <- NULL 90 | checkIdentical(a7r[-1], a7s[-1]) 91 | # } 92 | 93 | # aggregate - 
avg conc and uptake by Plant and Type 94 | # pgsql and h2 require double quote quoting 95 | # whereas sqlite and mysql can use backquote 96 | # Species needs to be quoted in pgsql but not in the other data bases. 97 | a8r <- aggregate(iris[1:2], iris[5], mean) 98 | if (drv == "pgsql" || drv == "postgresql" || drv == "h2" || drv == "sqlite") { 99 | a8s <- sqldf('select "Species", avg("Sepal.Length") \"Sepal.Length\", 100 | avg("Sepal.Width") \"Sepal.Width\" from iris 101 | group by "Species" order by "Species"') 102 | if (drv == "postgresql") checkIdentical(a8r, a8s) else 103 | checkEquals(a8r, a8s, check.attributes = FALSE) 104 | } else { 105 | a8s <- sqldf("select Species, avg(Sepal_Length) `Sepal.Length`, 106 | avg(Sepal_Width) `Sepal.Width` from iris 107 | group by Species order by Species") 108 | # checkEquals(a8r, a8s, check.attributes = drv != "mysql") 109 | checkEquals(a8r, a8s) 110 | } 111 | 112 | 113 | # by - avg conc and total uptake by Plant and Type 114 | a9r <- do.call(rbind, by(iris, iris[5], function(x) with(x, 115 | data.frame(Species = Species[1], 116 | mean.Sepal.Length = mean(Sepal.Length), 117 | mean.Sepal.Width = mean(Sepal.Width), 118 | mean.Sepal.ratio = mean(Sepal.Length/Sepal.Width))))) 119 | row.names(a9r) <- NULL 120 | if (drv == "pgsql" || drv == "postgresql" || drv == "h2" || drv == "sqlite") { 121 | a9s <- sqldf('select "Species", avg("Sepal.Length") "mean.Sepal.Length", 122 | avg("Sepal.Width") "mean.Sepal.Width", 123 | avg("Sepal.Length"/"Sepal.Width") "mean.Sepal.ratio" from iris 124 | group by "Species" order by "Species"') 125 | checkEquals(a9r, a9s, check.attributes = FALSE) 126 | } else { 127 | a9s <- sqldf("select Species, avg(Sepal_Length) `mean.Sepal.Length`, 128 | avg(Sepal_Width) `mean.Sepal.Width`, 129 | avg(Sepal_Length/Sepal_Width) `mean.Sepal.ratio` from iris 130 | group by Species order by Species") 131 | # checkEquals(a9r, a9s, check.attributes = drv != "mysql") 132 | checkEquals(a9r, a9s) 133 | } 134 | 135 | # head - 
top 3 breaks 136 | a10r <- head(warpbreaks[order(warpbreaks$breaks, decreasing = TRUE), ], 3) 137 | a10s <- sqldf("select * from warpbreaks order by breaks desc limit 3") 138 | row.names(a10r) <- NULL 139 | checkIdentical(a10r, a10s) 140 | 141 | # head - bottom 3 breaks 142 | a11r <- head(warpbreaks[order(warpbreaks$breaks), ], 3) 143 | a11s <- sqldf("select * from warpbreaks order by breaks limit 3") 144 | # attributes(a11r) <- attributes(a11s) <- NULL 145 | row.names(a11r) <- NULL 146 | checkIdentical(a11r, a11s) 147 | 148 | # ave - rows for which v exceeds its group average where g is group 149 | DF <- data.frame(g = rep(1:2, each = 5), t = rep(1:5, 2), v = 1:10) 150 | a12r <- subset(DF, v > ave(v, g, FUN = mean)) 151 | if (drv == "postgresql") { 152 | Gavg <- sqldf('select g, avg(v) as avg_v from "DF" group by g') 153 | a12s <- sqldf('select "DF".g, t, v from "DF", "Gavg" where "DF".g = "Gavg".g and v > avg_v') 154 | } else { 155 | Gavg <- sqldf("select g, avg(v) as avg_v from DF group by g") 156 | a12s <- sqldf("select DF.g, t, v from DF, Gavg where DF.g = Gavg.g and v > avg_v") 157 | } 158 | row.names(a12r) <- NULL 159 | checkIdentical(a12r, a12s) 160 | 161 | # same but reduce the two select statements to one using a subquery 162 | a13s <- if (drv == "postgresql") { 163 | sqldf('select g, t, v from "DF" d1, (select g as g2, avg(v) as avg_v from "DF" group by g) d2 where d1.g = g2 and v > avg_v') 164 | } else { 165 | sqldf("select g, t, v from DF d1, (select g as g2, avg(v) as avg_v from DF group by g) d2 where d1.g = g2 and v > avg_v") 166 | } 167 | checkIdentical(a12r, a13s) 168 | 169 | # same but shorten using natural join 170 | a14s <- if (drv == "postgresql") { 171 | sqldf('select d1.g, t, v from "DF" d1 natural join 172 | (select g, avg(v) as avg_v from "DF" group by g) d2 where v > avg_v') 173 | } else { 174 | sqldf("select d1.g, t, v from DF d1 natural join 175 | (select g, avg(v) as avg_v from DF group by g) d2 where v > avg_v") 176 | } 177 | 
checkIdentical(a12r, a14s) 178 | 179 | # table 180 | a15r <- table(warpbreaks$tension, warpbreaks$wool) 181 | if (drv == "pgsql" || drv == "postgresql") { 182 | a15s <- sqldf("select 183 | sum(cast(wool = 'A' as integer)), 184 | sum(cast(wool = 'B' as integer)) 185 | from warpbreaks group by tension") 186 | } else { 187 | a15s <- sqldf("select SUM(wool = 'A'), SUM(wool = 'B') 188 | from warpbreaks group by tension") 189 | } 190 | 191 | checkEquals(as.data.frame.matrix(a15r), a15s, check.attributes = FALSE) 192 | 193 | # reshape 194 | t.names <- paste("t", unique(as.character(DF$t)), sep = "_") 195 | a16r <- reshape(DF, direction = "wide", timevar = "t", idvar = "g", varying = list(t.names)) 196 | if (drv == "postgresql") { 197 | a16s <- sqldf("select g, 198 | sum(cast(t = 1 as integer) * v) t_1, 199 | sum(cast(t = 2 as integer) * v) t_2, 200 | sum(cast(t = 3 as integer) * v) t_3, 201 | sum(cast(t = 4 as integer) * v) t_4, 202 | sum(cast(t = 5 as integer) * v) t_5 from \"DF\" group by g") 203 | } else if (drv == "pgsql") { 204 | a16s <- sqldf("select g, 205 | sum(cast(t = 1 as integer) * v) t_1, 206 | sum(cast(t = 2 as integer) * v) t_2, 207 | sum(cast(t = 3 as integer) * v) t_3, 208 | sum(cast(t = 4 as integer) * v) t_4, 209 | sum(cast(t = 5 as integer) * v) t_5 from DF group by g") 210 | } else a16s <- sqldf("select g, 211 | SUM((t = 1) * v) t_1, 212 | SUM((t = 2) * v) t_2, 213 | SUM((t = 3) * v) t_3, 214 | SUM((t = 4) * v) t_4, 215 | SUM((t = 5) * v) t_5 from DF group by g") 216 | checkEquals(a16r, a16s, check.attributes = FALSE) 217 | 218 | # order 219 | a17r <- Formaldehyde[order(Formaldehyde$optden, decreasing = TRUE), ] 220 | if (drv == "postgresql") { 221 | a17s <- sqldf("select * from \"Formaldehyde\" order by optden desc") 222 | } else a17s <- sqldf("select * from Formaldehyde order by optden desc") 223 | row.names(a17r) <- NULL 224 | checkIdentical(a17r, a17s) 225 | 226 | DF <- data.frame(x = rnorm(15, 1:15)) 227 | a18r <- data.frame(x = DF[4:12,], 
movavgx = rowMeans(embed(DF$x, 7))) 228 | # centered moving average of length 7 229 | if (drv == "postgresql" || drv == "h2") { 230 | DF2 <- cbind(id = 1:nrow(DF), DF) 231 | a18s <- sqldf("select min(a.x) x, avg(b.x) movavgx 232 | from \"DF2\" a, \"DF2\" b 233 | where a.id - b.id between -3 and 3 234 | group by a.id having count(*) = 7 235 | order by a.id") 236 | } else if (drv == "pgsql") { 237 | DF2 <- cbind(id = 1:nrow(DF), DF) 238 | a18s <- sqldf("select min(a.x) x, avg(b.x) movavgx 239 | from DF2 a, DF2 b 240 | where a.id - b.id between -3 and 3 241 | group by a.id having count(*) = 7 242 | order by a.id") 243 | } else { 244 | a18s <- sqldf("select a.x x, avg(b.x) movavgx from DF a, DF b 245 | where a.row_names - b.row_names between -3 and 3 246 | group by a.row_names having count(*) = 7 247 | order by a.row_names+0", 248 | row.names = TRUE) 249 | } 250 | checkEquals(a18r, a18s) 251 | 252 | # merge. a19r and a19s are same except row order and row names 253 | A <- data.frame(a1 = c(1, 2, 1), a2 = c(2, 3, 3), a3 = c(3, 1, 2)) 254 | B <- data.frame(b1 = 1:2, b2 = 2:1) 255 | a19s <- if (drv == "postgresql") { 256 | sqldf('select * from "A", "B"') 257 | } else sqldf("select * from A, B") 258 | a19r <- merge(A, B) 259 | Sort <- function(DF) DF[do.call(order, DF),] 260 | checkEquals(Sort(a19s), Sort(a19r), check.attributes = FALSE) 261 | 262 | if (drv != "postgresql") { 263 | 264 | # check Date class 265 | DF.Date <- structure(list(date = structure(c(-15676, -15648), 266 | class = "Date"), x = c(2, 3), y = c(4, 5)), 267 | .Names = c("date", "x", "y"), row.names = 1:2, class = "data.frame") 268 | g <- data.frame(date=as.Date(c("1927-01-31","1927-02-28")),x=c(2,3)) 269 | h <- data.frame(date=as.Date(c("1927-01-31","1927-02-28")),y=c(4,5)) 270 | final <- sqldf("select d1.*, d2.y 271 | from g d1 left join h d2 on d1.date=d2.date") 272 | checkEquals(final, DF.Date) 273 | 274 | } 275 | 276 | # test sqlite system tables 277 | 278 | if (drv == "sqlite") { 279 | 280 | 
checkIdentical(dim(sqldf("pragma table_info(BOD)")), c(2L, 6L)) 281 | 282 | sql <- c("select * from BOD", "select * from sqlite_master") 283 | checkIdentical(dim(sqldf(sql)), c(1L, 5L)) 284 | 285 | checkTrue(sqldf("pragma database_list")$name == "main") 286 | 287 | DF <- data.frame(a = 1:2, b = 2:1) 288 | 289 | checkIdentical(sqldf("select a/b as quotient from DF")$quotient, c(0L, 2L)) 290 | 291 | checkIdentical(sqldf("select (a+0.0)/b as quotient from DF")$quotient, c(0.5, 2.0)) 292 | 293 | checkIdentical(sqldf("select cast(a as real)/b as quotient from DF")$quotient, c(0.5, 2.0)) 294 | 295 | checkIdentical(sqldf(c("create table mytab(a real, b real)", 296 | "insert into mytab select * from DF", 297 | "select a/b as quotient from mytab"))$quotient, c(0.5, 2.0)) 298 | 299 | tonum <- function(DF) replace(DF, TRUE, lapply(DF, as.numeric)) 300 | checkIdentical(sqldf("select a/b as quotient from DF", 301 | method = list("auto", tonum))$quotient, c(0.5, 2.0)) 302 | } 303 | 304 | } 305 | -------------------------------------------------------------------------------- /man/read.csv.sql.Rd: -------------------------------------------------------------------------------- 1 | \name{read.csv.sql} 2 | \Rdversion{1.1} 3 | \alias{read.csv.sql} 4 | \alias{read.csv2.sql} 5 | \title{ 6 | Read File Filtered by SQL 7 | } 8 | \description{ 9 | Read a file into R filtering it with an sql statement. Only the filtered 10 | portion is processed by R so that files larger than R can otherwise 11 | handle can be accommodated. 12 | } 13 | \usage{ 14 | read.csv.sql(file, sql = "select * from file", header = TRUE, sep = ",", 15 | row.names, eol, skip, filter, nrows, field.types, 16 | colClasses, dbname = tempfile(), drv = "SQLite", ...) 17 | read.csv2.sql(file, sql = "select * from file", header = TRUE, sep = ";", 18 | row.names, eol, skip, filter, nrows, field.types, 19 | colClasses, dbname = tempfile(), drv = "SQLite", ...) 20 | } 21 | %- maybe also 'usage' for other objects documented here. 
22 | \arguments{ 23 | \item{file}{ 24 | A file path or a URL (beginning with \code{http://} or \code{ftp://}). If 25 | the \code{filter} argument is used and no file is to be input to the filter 26 | then \code{file} can be omitted, \code{NULL}, \code{NA} or \code{""}. 27 | } 28 | \item{sql}{ 29 | character string holding an SQL statement. The table representing the 30 | file should be referred to as \code{file}. 31 | } 32 | \item{header}{ 33 | As in \code{read.csv}. 34 | } 35 | \item{sep}{ 36 | As in \code{read.csv}. 37 | } 38 | \item{row.names}{ 39 | As in \code{read.csv}. 40 | } 41 | \item{eol}{ 42 | Character which ends line. 43 | } 44 | \item{skip}{ 45 | Skip indicated number of lines in input file. 46 | } 47 | \item{filter}{ 48 | If specified, this should be a shell/batch command that the input file is piped through. For \code{read.csv2.sql} it is by default the following on non-Windows systems: \code{tr , .}. This translates all commas in the file to dots. On Windows similar functionalty is provided but to do that using a vbscript file that is included with \code{sqldf} to emulate the \code{tr} command. 49 | } 50 | \item{nrows}{ 51 | Number of rows used to determine column types. It defaults to 50. Using 52 | \code{-1} causes it to use all rows for determining column types. 53 | This argument is rarely needed. 54 | } 55 | \item{field.types}{ 56 | A list whose names are the column names and whose 57 | contents are the SQLite types (not the R class names) of the 58 | columns. Specifying these types improves how fast it takes. 59 | Unless speed is very important this argument is not normally used. 60 | } 61 | \item{colClasses}{As in \code{read.csv}. 62 | } 63 | \item{dbname}{ 64 | As in \code{sqldf} except that the default is \code{tempfile()}. 65 | Specifying \code{NULL} will put the database in memory which may improve speed 66 | but will limit the size of the database by the available memory. 67 | } 68 | \item{drv}{ 69 | This argument is ignored. 
70 | Currently the only database supported by \code{read.csv.sql} and 71 | \code{read.csv2.sql} is SQLite. 72 | Note that the H2 database has a builtin SQL function, 73 | \code{CSVREAD}, which can be used in place of \code{read.csv.sql}. 74 | } 75 | \item{\dots}{ 76 | Passed to \code{sqldf}. 77 | } 78 | } 79 | \details{ 80 | Reads the indicated file into an sql database creating the database 81 | if it does not already exist. Then it applies the sql statement 82 | returning the result as a data frame. If the database did not exist 83 | prior to this statement it is removed. 84 | 85 | Note that it uses facilities of \code{SQLite} to read the file 86 | which are intended for speed and therefore 87 | not as flexible as in R. For example, it does not 88 | recognize quoted fields as special but will regard the quotes as 89 | part of the field. See the 90 | \code{sqldf} help for more information. 91 | 92 | \code{read.csv2.sql} is like \code{read.csv.sql} except 93 | the default \code{sep} is \code{";"} and the default \code{filter} translates 94 | all commas in the file to decimal points (i.e. to dots). 95 | 96 | On Windows, if the \code{filter} argument is used and if Rtools is detected 97 | in the registry then the Rtools bin directory is added to the search path 98 | facilitating use of those tools without explicitly setting the path. 99 | 100 | } 101 | \value{ 102 | If the sql statement is a select statement then a data frame 103 | is returned. 
104 | } 105 | 106 | \examples{ 107 | \dontrun{ 108 | # might need to specify eol= too depending on your system 109 | write.csv(iris, "iris.csv", quote = FALSE, row.names = FALSE) 110 | iris2 <- read.csv.sql("iris.csv", 111 | sql = "select * from file where Species = 'setosa' ") 112 | 113 | } 114 | } 115 | \keyword{ manip } 116 | -------------------------------------------------------------------------------- /man/sqldf-package.Rd: -------------------------------------------------------------------------------- 1 | \name{sqldf-package} 2 | \alias{sqldf-package} 3 | \docType{package} 4 | \title{ 5 | sqldf package overview 6 | } 7 | \description{ 8 | Provides an easy way to perform SQL selects on R data frames. 9 | } 10 | \details{ 11 | The package contains a single function \link{sqldf} whose help file 12 | contains more information and examples. 13 | } 14 | \references{ 15 | The \link{sqldf} help page contains the primary documentation. 16 | The sqldf github home page \url{https://github.com/ggrothendieck/sqldf} 17 | contains links to SQLite pages that may be helpful in formulating queries. 
18 | } 19 | \keyword{ package } 20 | 21 | 22 | -------------------------------------------------------------------------------- /man/sqldf.Rd: -------------------------------------------------------------------------------- 1 | \name{sqldf} 2 | \alias{sqldf} 3 | \title{SQL select on data frames} 4 | \description{ 5 | SQL select on data frames 6 | } 7 | \usage{ 8 | 9 | sqldf(x, stringsAsFactors = FALSE, 10 | row.names = FALSE, envir = parent.frame(), 11 | method = getOption("sqldf.method"), 12 | file.format = list(), dbname, drv = getOption("sqldf.driver"), 13 | user, password = "", host = "localhost", port, 14 | dll = getOption("sqldf.dll"), connection = getOption("sqldf.connection"), 15 | verbose = isTRUE(getOption("sqldf.verbose"))) 16 | 17 | } 18 | 19 | \arguments{ 20 | \item{x}{Character string representing an SQL select statement or 21 | character vector whose components each represent a successive SQL 22 | statement to be executed. 23 | The select statement syntax must conform to the particular database 24 | being used. If x is missing then it establishes a connection 25 | which subsequent sqldf statements access. In that case the database 26 | is not destroyed until the next sqldf statement with no x.} 27 | \item{stringsAsFactors}{ If \code{TRUE} then those columns output 28 | from the database as \code{"character"} are 29 | converted to \code{"factor"} if the heuristic is unable to determine 30 | the class.} 31 | \item{row.names}{For \code{TRUE} the tables in the data base are given 32 | a \code{row_names} column filled with the row names of the corresponding 33 | data frames. Note that in SQLite a special \code{rowid} (or equivalently 34 | \code{oid} or \code{_rowid_}) is available in any case.} 35 | \item{envir}{ The environment where the data frames representing the tables 36 | are to be found.} 37 | \item{method}{This argument is a list of two functions, keywords or character 38 | vectors. 
If the second component of the list 39 | is \code{NULL} (the default) then the first component 40 | of the list can be specified without wrapping it in a list. The 41 | first component 42 | specifies a transformation of the data frame output from the database 43 | and the second specifies a transformation to each data frame that is 44 | passed to the data base just before it is read into the database. 45 | The second component is less frequently used. 46 | If the first component is \code{NULL} or not 47 | specified then it defaults to "auto". If the second component is \code{NULL} 48 | or not specified then no transformation is performed on the input. 49 | 50 | The allowable keywords for the first component are (1) \code{"auto"} 51 | which is the default and automatically assigns the class of each 52 | column using the heuristic described later, (2) \code{"auto.factor"} which is 53 | the same as \code{"auto"} but also assigns \code{"factor"} and 54 | \code{"ordered"} classes, 55 | (3) \code{"raw"} or \code{NULL} which means use 56 | whatever classes are returned by the database with no automatic processing 57 | and (4) \code{"name__class"} which 58 | means that column names that end in \code{__class} 59 | with two underscores 60 | where \code{class} is an R class (such as \code{Date}) are converted to 61 | that class and the \code{__class} portion is removed from the column name. 62 | For example, 63 | \code{sqldf("select a as x__Date from DF", method = "name__class")} 64 | would cause 65 | column \code{a} to be coerced to class \code{Date} and have the column name 66 | \code{x}. 67 | The first component of \code{method} 68 | can also be a character vector of classes to assign to the 69 | returned data.frame. 
The example just given could alternately be implemented 70 | using 71 | \code{sqldf("select a as x from DF", method = "Date")} 72 | Note that when \code{Date} is used in this way it assumes the database 73 | contains the number of days since January 1, 1970. If the date is in 74 | the format \code{yyyy-mm-dd} then use \code{Date2} as the class. 75 | } 76 | \item{file.format}{A list whose components are passed to 77 | \code{sqliteImportFile}. Components may include \code{sep}, 78 | \code{header}, \code{row.names}, \code{skip}, \code{eol} and \code{filter}. 79 | Except for \code{filter} they are passed to 80 | \code{sqliteImportFile} and have the same default values as 81 | in \code{sqliteImportFile} (except for \code{eol} which defaults 82 | to the end of line character(s) for the operating system in use -- note 83 | that 84 | if the file being read does not have the line endings for the platform 85 | being used then \code{eol} will have to be specified. In particular, 86 | certain UNIX-like tools on Windows may produce files with UNIX line endings 87 | in which case \code{eol="\n"} should be specified). 88 | \code{filter} may optionally contain a batch/shell command through which the 89 | input file is piped prior to reading it in. Alternately \code{filter} 90 | may be a list whose first component is a batch/shell command containing 91 | names which correspond to the names of the subsequent list components. 92 | These subsequent components should each be a character 93 | vector which \code{sqldf} will read into a temporary file. The name of 94 | the temporary file will be replaced into the command. For example, 95 | \code{filter = list("gawk -f prog", prog = '{ print gensub(/,/, ".", "g") }')} . 96 | This avoids the need for command line quoting, which may vary among shells and Windows. 97 | Note that if the filter produces files with UNIX line endings on Windows 98 | then \code{eol} must be specified, as discussed above. 
99 | \code{file.format} may be set to \code{NULL} in order not to search 100 | for input file objects at all. The \code{file.format} can also 101 | be specified as an attribute in each file object itself in which case 102 | such specification overrides any given through the argument list. There 103 | is further discussion of \code{file.format} below.} 104 | \item{dbname}{Name of the database. For SQLite and h2 data bases this 105 | defaults to 106 | \code{":memory:"} which results in an embedded database. For MySQL this 107 | defaults to \code{getOption("RMysql.dbname")} and if that is not specified 108 | then \code{"test"} is used. 109 | For RPostgreSQL this 110 | defaults to \code{getOption("sqldf.RPostgreSQL.dbname")} and if that is 111 | not specified then \code{"test"} is used. 112 | } 113 | \item{drv}{\code{"SQLite"}, \code{"MySQL"}, \code{"h2"}, 114 | \code{"PostgreSQL"} or \code{"pgSQL"} or any of those names prefaced with 115 | \code{"R"}. 116 | If not specified then 117 | the \code{"sqldf.driver"} option is checked and if that is not set then 118 | \code{sqldf} checks whether \code{RPostgreSQL}, 119 | \code{RMySQL} or \code{RH2} 120 | is loaded in that order and the driver corresponding to the first one 121 | found is used. If none are loaded then \code{"SQLite"} is used. 122 | \code{drv=NULL} causes the default to be used.} 123 | \item{user}{user name. Not needed for embedded databases. 124 | For RPostgreSQL 125 | the default is taken from option \code{sqldf.RPostgreSQL.user} and 126 | if that is not specified either then \code{"postgres"} is used. 127 | } 128 | \item{password}{password. Not needed for embedded databases. 129 | For RPostgreSQL 130 | the default is taken from option \code{sqldf.RPostgreSQL.password} and 131 | if that is not specified then \code{"postgres"} is used. 132 | } 133 | \item{host}{host. Default of "localhost" is normally sufficient. 
134 | For RPostgreSQL 135 | the default is taken from option \code{sqldf.RPostgreSQL.host} and 136 | if that is not specified then \code{"localhost"} is used. 137 | 138 | } 139 | \item{port}{port. For RPostgreSQL the default 140 | is taken from the option \code{sqldf.RPostgreSQL.port} and if that is not 141 | specified then \code{5432} is used. 142 | } 143 | \item{dll}{Name of an SQLite loadable extension to automatically load. 144 | If found on PATH then it is 145 | automatically loaded and the SQLite functions in it will be accessible.} 146 | \item{connection}{If this is \code{NULL} then a connection is created; 147 | otherwise the indicated connection is used. The default is 148 | the value of the option \code{sqldf.connection}. If neither 149 | \code{connection} nor \code{sqldf.connection} are specified a connection 150 | is automatically generated on-the-fly and closed on exit of the call to 151 | \code{sqldf}. If this argument is not \code{NULL} then the specified 152 | connection is left open on termination of the \code{sqldf} call. Usually 153 | this argument is left unspecified. It can be used to make repeated calls 154 | to a database without reloading it.} 155 | \item{verbose}{If \code{TRUE} then verbose output is shown. Anything else 156 | suppresses verbose output. Can be set globally using option 157 | \code{"sqldf.verbose"}.} 158 | } 159 | \details{ 160 | The typical action of \code{sqldf} is to 161 | \describe{ 162 | \item{create a database}{in memory} 163 | \item{read in the data frames and files}{used in the select statement. 164 | This is done by scanning the select statement to see which words in 165 | the select statement are of class "data.frame" or "file" in the parent frame, or the 166 | specified environment if \code{envir} is used, and for each object 167 | found by reading it into the database if it is a data frame. 
Note 168 | that this heuristic usually reads in the wanted data frames and files 169 | but on occasion may harmlessly read in extra ones too.} 170 | \item{run the select statement}{getting the result as a data frame} 171 | \item{assign the classes}{of the returned data frame's columns if 172 | \code{method = "auto"}. This is done by checking all the column 173 | names in the read-in data frames and if any are the same 174 | as a column output from the data base then that column is coerced to the 175 | class of the column whose name matched. 176 | If the class of the column is \code{"factor"} or \code{"ordered"} or if the 177 | column is not matched then the column is returned 178 | as is. If \code{method = "auto.factor"} then processing is similar except 179 | that \code{"factor"} and \code{"ordered"} classes and their levels will be 180 | assigned as well. The \code{"auto.factor"} heuristic 181 | is less reliable than the \code{"auto"} heuristic. 182 | If \code{method = "raw"} then the classes are returned 183 | as is from the database. } 184 | \item{cleanup}{If the database was created by sqldf then it is deleted; 185 | otherwise, all tables that were created are dropped in order to leave 186 | the database in the same state that it was before. The database 187 | connection is terminated.} 188 | } 189 | 190 | \code{sqldf} supports the following R options for RPostgreSQL: 191 | \code{"sqldf.RPostgreSQL.dbname"}, 192 | \code{"sqldf.RPostgreSQL.user"}, 193 | \code{"sqldf.RPostgreSQL.password"}, 194 | \code{"sqldf.RPostgreSQL.host"} and 195 | \code{"sqldf.RPostgreSQL.port"} which have defaults \code{"test"}, 196 | \code{"postgres"}, \code{"postgres"}, \code{"localhost"} and \code{5432}, 197 | respectively. It also supports 198 | \code{"sqldf.RPostgreSQL.other"} which is a list of named parameters. These 199 | may include 200 | \code{dbname}, 201 | \code{user}, 202 | \code{password}, 203 | \code{host} and 204 | \code{port}. 
205 | Individually these take precedence over otherwise specified connection 206 | arguments. 207 | 208 | 209 | 210 | Warning. Although sqldf is usually used with on-the-fly databases 211 | which it automatically sets up and destroys, if you wish to use it 212 | with existing databases, be sure to back up your database prior to 213 | using it since incorrect operation could destroy the entire 214 | database. 215 | 216 | } 217 | \note{ 218 | If \code{row.names = TRUE} is used then 219 | any \code{NATURAL JOIN} will make use of it which may not be what was 220 | intended. 221 | 222 | 3/2 and 3.0/2 are the same in R but in SQLite the first one 223 | causes integer arithmetic to be used whereas the second uses floating point. 224 | Thus both evaluate to 225 | 1.5 in R but they evaluate to 1 and 1.5 respectively in SQLite. 226 | 227 | The \code{dbWriteTable}/\code{sqliteImportFile} routines that sqldf uses to transfer files to the data base are intended for speed and they are not as flexible as \code{\link{read.table}}. Also they have slightly different defaults. (If more flexible input is needed use the slower \code{read.table} to read the data into a data frame instead of reading directly from a file.) The default for \code{sep} is \code{sep = ","}. If the first row of the file has one fewer entry than subsequent ones then it is assumed that \code{header <- row.names <- TRUE} and otherwise that \code{header <- row.names <- FALSE}. The \code{header} can be forced to \code{header <- TRUE} by specifying \code{file.format = list(header = TRUE)} as an argument to \code{sqldf}. \code{sep} and \code{row.names} are other \code{file.format} subarguments. Also, one limitation with .csv files is that quotes are not regarded as special within files so a comma within a data field such as \code{"Smith, James"} would be regarded as a field delimiter and the quotes would be entered as part of the data which probably is not what is intended. 
228 | 229 | Typically the SQL result will have the same data as the analogous 230 | non-database \code{R} code manipulations using data frames 231 | but may differ in row names and other attributes. In the 232 | examples below we use \code{identical} in those cases where the two 233 | results are the same in all respects or set the row names to \code{NULL} 234 | if they would have otherwise differed only in row names or use 235 | \code{all.equal} if the data portion is the same but attributes aside 236 | from row names differ. 237 | 238 | On MySQL the database must pre-exist. Create a \code{c:\\my.ini} 239 | or \code{\%MYSQL_HOME\%\\my.ini} file on Windows or a \code{/etc/my.cnf} 240 | file on UNIX to contain information about the database. This file may 241 | specify the username, password and port. The password 242 | can be omitted if one has not been set. If using a standard port 243 | setup then the \code{port} can be omitted as well. 244 | The database is taken from the \code{dbname} argument of the \code{sqldf} 245 | command or if not set from \code{getOption("sqldf.dbname")} or if that option 246 | is not set it is assumed to be \code{"test"}. 247 | Note that MySQL does not use the \code{user}, \code{password}, \code{host} 248 | and \code{port} arguments of sqldf. 249 | See \url{http://dev.mysql.com/doc/refman/5.6/en/option-files.html} for 250 | additional locations that the configuration files can be placed as well 251 | as other information. 252 | 253 | If \code{getOption("sqldf.dll")} is specified 254 | then the named dll will be loaded as an SQLite loadable extension. 255 | This is in addition to the extension functions included with RSQLite. 256 | 257 | } 
If the \code{x} and \code{connection} 263 | arguments are missing then it returns a new connection and also places 264 | this connection in the option \code{sqldf.connection}. 265 | 266 | } 267 | \references{ 268 | The sqldf home page \url{https://github.com/ggrothendieck/sqldf} contains 269 | more examples as well as links to SQLite pages that may be helpful in 270 | formulating queries. It also contains pointers to using sqldf with H2 271 | and PostgreSQL. 272 | } 273 | \examples{ 274 | 275 | # 276 | # These examples show how to run a variety of data frame manipulations 277 | # in R without SQL and then again with SQL 278 | # 279 | 280 | # head 281 | a1r <- head(warpbreaks) 282 | a1s <- sqldf("select * from warpbreaks limit 6") 283 | identical(a1r, a1s) 284 | 285 | # subset 286 | 287 | a2r <- subset(CO2, grepl("^Qn", Plant)) 288 | a2s <- sqldf("select * from CO2 where Plant like 'Qn\%'") 289 | all.equal(as.data.frame(a2r), a2s) 290 | 291 | data(farms, package = "MASS") 292 | a3r <- subset(farms, Manag \%in\% c("BF", "HF")) 293 | a3s <- sqldf("select * from farms where Manag in ('BF', 'HF')") 294 | row.names(a3r) <- NULL 295 | identical(a3r, a3s) 296 | 297 | a4r <- subset(warpbreaks, breaks >= 20 & breaks <= 30) 298 | a4s <- sqldf("select * from warpbreaks where breaks between 20 and 30", 299 | row.names = TRUE) 300 | identical(a4r, a4s) 301 | 302 | a5r <- subset(farms, Mois == 'M1') 303 | a5s <- sqldf("select * from farms where Mois = 'M1'", row.names = TRUE) 304 | identical(a5r, a5s) 305 | 306 | a6r <- subset(farms, Mois == 'M2') 307 | a6s <- sqldf("select * from farms where Mois = 'M2'", row.names = TRUE) 308 | identical(a6r, a6s) 309 | 310 | # rbind 311 | a7r <- rbind(a5r, a6r) 312 | a7s <- sqldf("select * from a5s union all select * from a6s") 313 | 314 | # sqldf drops the unused levels of Mois but rbind does not; however, 315 | # all data is the same and the other columns are identical 316 | row.names(a7r) <- NULL 317 | identical(a7r[-1], a7s[-1]) 318 | 
319 | # aggregate - mean Sepal.Length and Sepal.Width by Species 320 | a8r <- aggregate(iris[1:2], iris[5], mean) 321 | a8s <- sqldf('select Species, avg("Sepal.Length") `Sepal.Length`, 322 | avg("Sepal.Width") `Sepal.Width` from iris group by Species') 323 | all.equal(a8r, a8s) 324 | 325 | # by - mean Sepal.Length, Sepal.Width and their ratio by Species 326 | a9r <- do.call(rbind, by(iris, iris[5], function(x) with(x, 327 | data.frame(Species = Species[1], 328 | mean.Sepal.Length = mean(Sepal.Length), 329 | mean.Sepal.Width = mean(Sepal.Width), 330 | mean.Sepal.ratio = mean(Sepal.Length/Sepal.Width))))) 331 | row.names(a9r) <- NULL 332 | a9s <- sqldf('select Species, avg("Sepal.Length") `mean.Sepal.Length`, 333 | avg("Sepal.Width") `mean.Sepal.Width`, 334 | avg("Sepal.Length"/"Sepal.Width") `mean.Sepal.ratio` from iris 335 | group by Species') 336 | all.equal(a9r, a9s) 337 | 338 | # head - top 3 breaks 339 | a10r <- head(warpbreaks[order(warpbreaks$breaks, decreasing = TRUE), ], 3) 340 | a10s <- sqldf("select * from warpbreaks order by breaks desc limit 3") 341 | row.names(a10r) <- NULL 342 | identical(a10r, a10s) 343 | 344 | # head - bottom 3 breaks 345 | a11r <- head(warpbreaks[order(warpbreaks$breaks), ], 3) 346 | a11s <- sqldf("select * from warpbreaks order by breaks limit 3") 347 | # attributes(a11r) <- attributes(a11s) <- NULL 348 | row.names(a11r) <- NULL 349 | identical(a11r, a11s) 350 | 351 | # ave - rows for which v exceeds its group average where g is group 352 | DF <- data.frame(g = rep(1:2, each = 5), t = rep(1:5, 2), v = 1:10) 353 | a12r <- subset(DF, v > ave(v, g, FUN = mean)) 354 | Gavg <- sqldf("select g, avg(v) as avg_v from DF group by g") 355 | a12s <- sqldf("select DF.g, t, v from DF, Gavg where DF.g = Gavg.g and v > avg_v") 356 | row.names(a12r) <- NULL 357 | identical(a12r, a12s) 358 | 359 | # same but reduce the two select statements to one using a subquery 360 | a13s <- sqldf("select g, t, v 361 | from DF d1, (select g as g2, avg(v) as avg_v from DF group by 
g) 362 | where d1.g = g2 and v > avg_v") 363 | identical(a12r, a13s) 364 | 365 | # same but shorten using natural join 366 | a14s <- sqldf("select g, t, v 367 | from DF 368 | natural join (select g, avg(v) as avg_v from DF group by g) 369 | where v > avg_v") 370 | identical(a12r, a14s) 371 | 372 | # table 373 | a15r <- table(warpbreaks$tension, warpbreaks$wool) 374 | a15s <- sqldf("select sum(wool = 'A'), sum(wool = 'B') 375 | from warpbreaks group by tension") 376 | all.equal(as.data.frame.matrix(a15r), a15s, check.attributes = FALSE) 377 | 378 | # reshape 379 | t.names <- paste("t", unique(as.character(DF$t)), sep = "_") 380 | a16r <- reshape(DF, direction = "wide", timevar = "t", idvar = "g", varying = list(t.names)) 381 | a16s <- sqldf("select 382 | g, 383 | sum((t == 1) * v) t_1, 384 | sum((t == 2) * v) t_2, 385 | sum((t == 3) * v) t_3, 386 | sum((t == 4) * v) t_4, 387 | sum((t == 5) * v) t_5 388 | from DF group by g") 389 | all.equal(a16r, a16s, check.attributes = FALSE) 390 | 391 | # order 392 | a17r <- Formaldehyde[order(Formaldehyde$optden, decreasing = TRUE), ] 393 | a17s <- sqldf("select * from Formaldehyde order by optden desc") 394 | row.names(a17r) <- NULL 395 | identical(a17r, a17s) 396 | 397 | # centered moving average of length 7 398 | set.seed(1) 399 | DF <- data.frame(x = rnorm(15, 1:15)) 400 | s18 <- sqldf("select a.x x, avg(b.x) movavgx from DF a, DF b 401 | where a.row_names - b.row_names between -3 and 3 402 | group by a.row_names having count(*) = 7 403 | order by a.row_names+0", 404 | row.names = TRUE) 405 | r18 <- data.frame(x = DF[4:12,], movavgx = rowMeans(embed(DF$x, 7))) 406 | row.names(r18) <- NULL 407 | all.equal(r18, s18) 408 | 409 | # merge. 
a19r and a19s are same except row order and row names 410 | A <- data.frame(a1 = c(1, 2, 1), a2 = c(2, 3, 3), a3 = c(3, 1, 2)) 411 | B <- data.frame(b1 = 1:2, b2 = 2:1) 412 | a19s <- sqldf("select * from A, B") 413 | a19r <- merge(A, B) 414 | Sort <- function(DF) DF[do.call(order, DF),] 415 | all.equal(Sort(a19s), Sort(a19r), check.attributes = FALSE) 416 | 417 | # within Date, of the highest quality records list the one closest 418 | # to noon. Note use of two sql statements in one call to sqldf. 419 | 420 | Lines <- "DeployID Date.Time LocationQuality Latitude Longitude 421 | STM05-1 2005/02/28 17:35 Good -35.562 177.158 422 | STM05-1 2005/02/28 19:44 Good -35.487 177.129 423 | STM05-1 2005/02/28 23:01 Unknown -35.399 177.064 424 | STM05-1 2005/03/01 07:28 Unknown -34.978 177.268 425 | STM05-1 2005/03/01 18:06 Poor -34.799 177.027 426 | STM05-1 2005/03/01 18:47 Poor -34.85 177.059 427 | STM05-2 2005/02/28 12:49 Good -35.928 177.328 428 | STM05-2 2005/02/28 21:23 Poor -35.926 177.314 429 | " 430 | 431 | DF <- read.table(textConnection(Lines), skip = 1, as.is = TRUE, 432 | col.names = c("Id", "Date", "Time", "Quality", "Lat", "Long")) 433 | 434 | sqldf(c("create temp table DFo as select * from DF order by 435 | Date DESC, Quality DESC, 436 | abs(substr(Time, 1, 2) + substr(Time, 4, 2) /60 - 12) DESC", 437 | "select * from DFo group by Date")) 438 | 439 | \dontrun{ 440 | 441 | # test of file connections with sqldf 442 | 443 | # create test .csv file of just 3 records 444 | write.table(head(iris, 3), "iris3.dat", sep = ",", quote = FALSE) 445 | 446 | # look at contents of iris3.dat 447 | readLines("iris3.dat") 448 | 449 | # set up file connection 450 | iris3 <- file("iris3.dat") 451 | sqldf('select * from iris3 where "Sepal.Width" > 3') 452 | 453 | # using a non-default separator 454 | # file.format can be an attribute of file object or an arg passed to sqldf 455 | write.table(head(iris, 3), "iris3.dat", sep = ";", quote = FALSE) 456 | iris3 <- file("iris3.dat") 457 
| sqldf('select * from iris3 where "Sepal.Width" > 3', file.format = list(sep = ";")) 458 | 459 | # same but pass file.format through attribute of file object 460 | attr(iris3, "file.format") <- list(sep = ";") 461 | sqldf('select * from iris3 where "Sepal.Width" > 3') 462 | 463 | # copy file straight to disk without going through R 464 | # and then retrieve portion into R 465 | sqldf('select * from iris3 where "Sepal.Width" > 3', dbname = tempfile()) 466 | 467 | ### same as previous example except it allows multiple queries against 468 | ### the database. We use iris3 from before. This time we use an 469 | ### in memory SQLite database. 470 | 471 | sqldf() # open a connection 472 | sqldf('select * from iris3 where "Sepal.Width" > 3') 473 | 474 | # At this point we have an iris3 variable in both 475 | # the R workspace and in the SQLite database so we need to 476 | # explicitly let it know we want the version in the database. 477 | # If we were not to do that it would try to use the R version 478 | # by default and fail since sqldf would prevent it from 479 | # overwriting the version already in the database to protect 480 | # the user from inadvertent errors. 481 | sqldf('select * from main.iris3 where "Sepal.Width" > 4') 482 | sqldf('select * from main.iris3 where "Sepal_Width" < 4') 483 | sqldf() # close connection 484 | 485 | ### another way to do this is a mix of sqldf and RSQLite statements 486 | ### In that case we need to fetch the connection for use with RSQLite 487 | ### and do not have to specifically refer to main since RSQLite can 488 | ### only access the database. 
489 | 490 | con <- sqldf() 491 | # this iris3 refers to the R variable and file 492 | sqldf('select * from iris3 where "Sepal.Width" > 3') 493 | sqldf("select count(*) from iris3") 494 | # these iris3 refer to the database table 495 | dbGetQuery(con, 'select * from iris3 where "Sepal.Width" > 4') 496 | dbGetQuery(con, 'select * from iris3 where "Sepal.Width" < 4') 497 | sqldf() 498 | 499 | } 500 | } 501 | \keyword{manip} 502 | -------------------------------------------------------------------------------- /tests/doSvUnit.R: -------------------------------------------------------------------------------- 1 | #! /usr/bin/Rscript --vanilla --default-packages=utils,stats,methods,svUnit 2 | pkg <- "sqldf" 3 | require(svUnit) # Needed if run from R CMD BATCH 4 | require(pkg, character.only = TRUE) # Needed if run from R CMD BATCH 5 | # unlink("mypkgTest.txt") # Make sure we generate a new report 6 | mypkgSuite <- svSuiteList(pkg, excludeList = NULL) # List all our test suites 7 | runTest(mypkgSuite, name = "svUnit") # Run them... 8 | # protocol(Log(), type = "text", file = "mypkgTest.txt") # ... and write report 9 | Log() 10 | 11 | --------------------------------------------------------------------------------