├── DESCRIPTION
├── NAMESPACE
├── R
│   └── dbglm.R
├── README.md
├── inst
│   ├── dbglm-poisson.R
│   ├── nzcars.R
│   └── taxis.R
└── man
    └── dbglm.Rd

/DESCRIPTION:
--------------------------------------------------------------------------------
Package: dbglm
Title: Generalised linear models by subsampling and one-step polishing
Version: 0.1
Author: Thomas Lumley
Description: Fast fitting of generalised linear models on moderately large datasets, by taking an initial sample, fitting in memory, then evaluating the score function for the full data in the database.
Imports: DBI, tidypredict, purrr, rlang, methods, dplyr
Suggests: MonetDBLite, RSQLite
License: GPL-3
Maintainer: Thomas Lumley

--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
importFrom("methods", "is")
importFrom("stats", "binomial", "sd", "terms", "glm", "coef", "vcov")
import("DBI")
import("dplyr")
importFrom("purrr", "map", "map2", "reduce", "flatten")
import("rlang")
import("tidypredict")
export(dbglm)

--------------------------------------------------------------------------------
/R/dbglm.R:
--------------------------------------------------------------------------------

## Draw a subsample of (approximately) n of the N rows of a possibly
## database-backed table.
dbsample <- function(tbl, ...) UseMethod("dbsample")


dbsample.tbl_monetdb <- function(tbl, n, N, variables, ...) {
  dbGetQuery(tbl$src$con,
             dbplyr::build_sql("select ", dbplyr::ident(variables), " from (",
                               dbplyr::sql(dbplyr::sql_render(tbl)),
                               ") as foo sample ", as.integer(n)))
}

dbsample.tbl_sql <- function(tbl, n, N, variables, ...) {
  if (!is(tbl$src$con, "SQLiteConnection")) stop("only implemented for RSQLite so far")
  dbGetQuery(tbl$src$con,
             dbplyr::build_sql("select ", dbplyr::ident(variables), " from (",
                               dbplyr::sql(dbplyr::sql_render(tbl)),
                               ") where abs(CAST(random() AS REAL))/9223372036854775808 <",
                               as.double(n / N)))
}


dbsample.tbl_df <- function(tbl, n, N, variables, ...) {
  tbl[sample(N, n), ]
}


dbglm <- function(formula, family = binomial(), tbl, sd = FALSE,
                  weights = .NotYetImplemented(), subset = .NotYetImplemented(), ...) {

  variables <- all.vars(formula)
  if (!all(variables %in% colnames(tbl))) stop("variables must be in data tbl")
  tbl2 <- select(tbl, !!!syms(all.vars(formula)))

  N <- pull(summarise(tbl2, n()))
  n <- round(N^(5/9))
  sdf <- dbsample(tbl2, n, N, variables, ...)

  ## fit the model in memory on the subsample
  model0 <- glm(formula = formula, family = family, data = sdf, ...)
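
  ## One-step polishing: model0 provides the starting value (tildebeta). The
  ## full-data score at tildebeta is estimated in the database as N times the
  ## mean score (see score_mean / score_meansd below), and the subsample
  ## variance matrix rescaled by n/N approximates the inverse information for
  ## the full data, giving the update hatbeta = tildebeta + hatV %*% U.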

  if (sd) {
    rval <- t(as.matrix(tbl2 %>% score_meansd(model0)))
    U <- rval[, 1] * N
    beta0 <- coef(model0)
    V0 <- vcov(model0)
    inf <- solve(summary(model0)$cov.unscaled)
    seratio <- sqrt(diag(inf)) / (rval[, 2] * sqrt(n))
    V1 <- V0 * n / N
    V2 <- outer(seratio, seratio) * V1
    beta1 <- beta0 + V1 %*% U
    beta2 <- beta0 + V2 %*% U

    list(beta0, beta1, beta2, V1, V2)

  } else {
    U <- t(as.matrix(tbl2 %>% score_mean(model0))) * N
    beta0 <- coef(model0)
    V0 <- vcov(model0)
    V1 <- vcov(model0) * (n / N)
    beta1 <- beta0 + V1 %*% U
    list(tildebeta = beta0, hatbeta = beta1, tildeV = V0, hatV = V1)
  }
}

## turn "factor(x)" into "x" in term labels
strip_factor <- function(x) gsub("factor\\((.+)\\)", "\\1", x)

## score_mean: build, for each regression coefficient, a database expression for
## the corresponding score component -- the (response - fitted) residual times
## that coefficient's column of the model matrix -- and return the mean of each
## component over the whole table.
score_mean <- function(df, model, fitname = "_fit_", residname = "_resid_") {
  df <- df %>% tidypredict_to_column(model, vars = c(fitname, "", ""))

  parsedmodel <- parse_model(model)
  labels <- parsedmodel %>%
    filter(labels == "labels") %>%
    as.character()

  labels <- labels[4:length(labels)]
  labels <- c("estimate", labels)
  all_terms <- parsedmodel %>%
    filter(.data$type == "term") %>%
    select(-.data$type, -.data$labels)

  selection <- which(labels != "NA")
  all_terms <- all_terms[, which(labels != "NA")]
  colnames(all_terms) <- labels[which(labels != "NA")]

  response <- attr(terms(model), "variables")[[2]]
  fit <- sym(fitname)

  f <- seq_len(nrow(all_terms)) %>%
    map(~{
      vars <- strip_factor(colnames(all_terms))
      vals <- as.character(all_terms[.x, ])

      resid <- expr((!!!response) - (!!!fit))

      reg <- vars[vals == "{{:}}" & !is.na(vals) & vars != "estimate"]
      reg <- expr(!!syms(reg))

      field <- vars[vals != "{{:}}" & !is.na(vals) & vars != "estimate"]
      val <- vals[vals != "{{:}}" & !is.na(vals) & vars != "estimate"]
      ie <- map2(syms(field), val, function(x, y) expr((!!x) == (!!y)))
      ie <- map(ie, function(x) expr(ifelse(!!x, 1.0, 0.0)))
      set <- c(reg, ie, resid)
      reduce(set, function(l, r) expr((!!!l) * (!!!r)))
    })

  offset <- filter(parsedmodel, labels == "offset")
  if (nrow(offset) > 0) {
    f <- c(f, sym(offset$vals))
  }

  names(f) <- paste0("_u", seq_along(coef(model)))

  df %>%
    mutate(!!!f) %>%
    summarise(!!!map(paste0("_u", seq_along(coef(model))),
                     function(x) expr(mean(!!sym(x))))) %>%
    collect()
}

## score_meansd: as score_mean, but also returns the standard deviation of each
## score component; the result is a 2-row matrix (means in row 1, SDs in row 2)
## used by dbglm(sd = TRUE).
score_meansd <- function(df, model, fitname = "_fit_", residname = "_resid_") {
  df <- df %>% tidypredict_to_column(model, vars = c(fitname, "", ""))

  parsedmodel <- parse_model(model)
  labels <- parsedmodel %>%
    filter(labels == "labels") %>%
    as.character()

  labels <- labels[4:length(labels)]
  labels <- c("estimate", labels)
  all_terms <- parsedmodel %>%
    filter(.data$type == "term") %>%
    select(-.data$type, -.data$labels)

  selection <- which(labels != "NA")
  all_terms <- all_terms[, which(labels != "NA")]
  colnames(all_terms) <- labels[which(labels != "NA")]

  response <- attr(terms(model), "variables")[[2]]
  fit <- sym(fitname)

  f <- seq_len(nrow(all_terms)) %>%
    map(~{
      vars <- strip_factor(colnames(all_terms))
      vals <- as.character(all_terms[.x, ])

      resid <- expr((!!!response) - (!!!fit))

      reg <- vars[vals == "{{:}}" & !is.na(vals) & vars != "estimate"]
      reg <- expr(!!syms(reg))

      field <- vars[vals != "{{:}}" & !is.na(vals) & vars != "estimate"]
      val <- vals[vals != "{{:}}" & !is.na(vals) & vars != "estimate"]
      ie <- map2(syms(field), val, function(x, y) expr((!!x) == (!!y)))
      ie <- map(ie, function(x) expr(ifelse(!!x, 1, 0)))
      set <- c(reg, ie, resid)
      reduce(set, function(l, r) expr((!!!l) * (!!!r)))
    })

  offset <- filter(parsedmodel, labels == "offset")
  if (nrow(offset) > 0) {
    f <- c(f, sym(offset$vals))
  }

  names(f) <- paste0("_u", seq_along(coef(model)))

  rval <- df %>%
    mutate(!!!f) %>%
    summarise(!!!flatten(map(paste0("_u", seq_along(coef(model))),
                             function(x) c(expr(mean(!!sym(x))), expr(sd(!!sym(x))))))) %>%
    collect()

  matrix(as.matrix(rval), nrow = 2)
}

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This package fits generalised linear models to moderately large data sets stored in a relational database. The code has implementations for MonetDB and SQLite, but should be easy to adapt to any other database that has EXP and RAND.

The code takes a subsample of the data, fits the model in memory, and then improves the estimate with one step of Fisher scoring, computed with a single SQL aggregation query. A minimal usage sketch is shown below; complete worked examples are in the scripts under `inst/`.
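
The sketch below assumes an existing SQLite database `nzcars.db` containing a table `vehicles`; the database, table, and column names are only illustrative.

```r
library(DBI)
library(RSQLite)
library(dplyr)
library(dbglm)

con <- dbConnect(SQLite(), "nzcars.db")
vehicles <- tbl(con, "vehicles")   # lazy, database-backed tbl

## subsample + one-step polishing; the heavy computation runs in the database
fit <- dbglm(isred ~ power_rating + number_of_seats, tbl = vehicles)

fit$tildebeta   # coefficients from the subsample fit
fit$hatbeta     # polished (one-step updated) coefficients
fit$hatV        # estimated variance matrix of the polished coefficients

dbDisconnect(con)
```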

--------------------------------------------------------------------------------
/inst/dbglm-poisson.R:
--------------------------------------------------------------------------------
set.seed(2018-7-7)
df <- data.frame(x = rep(1:1000, 10000), z = rnorm(1e7))
df$y <- with(df, rpois(1e7, exp(x/1000 + z/1000 + 1)))

system.time(inmem <- glm(y ~ x + z, family = poisson(), data = df))

library(MonetDBLite)
library(DBI)
td <- tempdir()
con <- dbConnect(MonetDBLite(), td)
dbWriteTable(con, "simulated", df)
dbDisconnect(con, shutdown = TRUE)

rm(df)

library(dbglm)
library(dplyr)
ms <- MonetDBLite::src_monetdblite(td)

system.time(indb <- dbglm(y ~ x + z, family = poisson(), tbl = tbl(ms, "simulated")))

dbDisconnect(ms$con, shutdown = TRUE)

--------------------------------------------------------------------------------
/inst/nzcars.R:
--------------------------------------------------------------------------------
##
## Download data from figshare
## https://figshare.com/articles/NZ_vehicles_database/5971471
##
### Code for MonetDB

library(dbglm)
library(DBI)
library(MonetDBLite)
library(dplyr)
library(dbplyr)

ms <- MonetDBLite::src_monetdblite("~/VEHICLE")
monetdb.read.csv(ms$con, "Fleet30Nov2017.csv", tablename = "vehicles", quote = "",
                 nrow.check = 10000, best.effort = TRUE, lower.case.names = TRUE)
vehicles <- tbl(ms, "vehicles")
cars <- filter(vehicles, vehicle_type == "PASSENGER CAR/VAN") %>%
  mutate(isred = ifelse(basic_colour == "RED", 1, 0)) %>%
  filter(number_of_seats > 1 & number_of_seats < 7) %>%
  filter(number_of_axles == 2) %>%
  compute()

system.time({
  model <- dbglm(isred ~ power_rating + number_of_seats + gross_vehicle_mass, tbl = cars)
})


### Code for SQLite
library(dbglm)
library(RSQLite)
library(dplyr)
library(dbplyr)


# data setup

vehicles <- read.csv("Fleet30Nov2017.csv")
names(vehicles) <- tolower(names(vehicles))
vehicles$power_rating <- as.numeric(as.character(vehicles$power_rating))
vehicles$number_of_seats <- as.numeric(as.character(vehicles$number_of_seats))
vehicles$number_of_axles <- as.numeric(as.character(vehicles$number_of_axles))

sqlite <- dbDriver("SQLite")
con <- dbConnect(sqlite, "nzcars.db")
RSQLite:::initExtension(con)
dbWriteTable(con, "vehicles", vehicles)
rm(vehicles)
dbDisconnect(con)

# analysis
library(dbglm)
library(RSQLite)
library(dplyr)
library(dbplyr)

sqlite <- dbDriver("SQLite")
con <- dbConnect(sqlite, "nzcars.db")
RSQLite:::initExtension(con)
sqlitevehicles <- tbl(con, "vehicles")


cars <- filter(sqlitevehicles, vehicle_type == "PASSENGER CAR/VAN") %>%
  mutate(isred = ifelse(basic_colour == "RED", 1, 0)) %>%
  filter(number_of_seats > 1 & number_of_seats < 7) %>%
  filter(number_of_axles == 2) %>%
  compute()

system.time({
  sqlitemodel <- dbglm(isred ~ power_rating + number_of_seats + gross_vehicle_mass, tbl = cars)
})

--------------------------------------------------------------------------------
/inst/taxis.R:
--------------------------------------------------------------------------------
##
## Download January to June 2016 data from
## https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-01.csv
## https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-02.csv
## https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-03.csv
## https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-04.csv
## https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-05.csv
## https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-06.csv
## and July to December data from
## https://figshare.com/articles/yellow_tripdata_2016-07trim_csv/5965522
##
##
## This is the directory you put them in
datadir <- "~/TAXITEST"
##


library(DBI)
library(MonetDBLite)
library(dplyr)
library(dbplyr)

ms <- MonetDBLite::src_monetdblite(paste(datadir, "db", sep = "/"))

## read the January to June data
inputs <- list.files(datadir, "-0[1-6]", full.names = TRUE)
monetdb.read.csv(ms$con, inputs, tablename = "yellow", header = TRUE, lower.case.names = TRUE)

## read the July to December data (the header on the original file is wrong)
nm <- tolower(strsplit("VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,junk1,junk2", ",")[[1]])
inputs <- list.files(datadir, "trim", full.names = TRUE)
monetdb.read.csv(ms$con, inputs, tablename = "yellowlate", col.names = nm, header = FALSE)

## Combine the two data sets

dbSendQuery(ms$con, "create table taxis as (select vendorid, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, ratecodeid, store_and_fwd_flag, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount from yellow union all select vendorid, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, ratecodeid, store_and_fwd_flag, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount from yellowlate)")

## Create some additional variables

dbSendQuery(ms$con, "alter table taxis add column dropoff_timestamp TIMESTAMP")
dbSendQuery(ms$con, "update taxis set dropoff_timestamp = str_to_timestamp(tpep_dropoff_datetime, '%Y-%m-%d %H:%M:%S')")
dbSendQuery(ms$con, "alter table taxis add column drop_hour FLOAT")
dbSendQuery(ms$con, "update taxis set drop_hour = extract( hour from dropoff_timestamp)")
dbSendQuery(ms$con, "alter table taxis add column drop_dow FLOAT")
dbSendQuery(ms$con, "update taxis set drop_dow = dayofweek(dropoff_timestamp)")

## Subset and create more variables

taxis <- tbl(ms, "taxis")
cc_taxis <- taxis %>%
  filter(payment_type == 1) %>%
  filter(fare_amount > 0) %>%
  filter(trip_distance < 50) %>%
  filter(ratecodeid != 99) %>%
  mutate(bad_tip = ifelse(tip_amount/(fare_amount + improvement_surcharge) < 0.20, 1.0, 0.0)) %>%
  mutate(night = ifelse(drop_hour < 5 | drop_hour > 7, 1.0, 0)) %>%
  mutate(weekend = ifelse(drop_dow == 1 | drop_dow == 7 | (drop_dow == 6 & drop_hour > 7), 1, 0)) %>%
  compute()

## The actual model fitting

system.time({
  model0 <- dbglm(bad_tip ~ factor(weekend)*night + trip_distance + passenger_count + factor(ratecodeid), tbl = cc_taxis)
})

system.time({
  model1 <- dbglm(bad_tip ~ factor(weekend)*factor(night)*passenger_count + trip_distance + factor(ratecodeid), tbl = cc_taxis, family = binomial)
})


## Comparison with bigglm, reading the data in chunks from the same database.
## "xryqxldcdd" is the (run-specific) temporary table backing cc_taxis, created
## by compute() above; the name will differ in each run.

library(biglm)

system.time({
  model0biglm <- bigglm(bad_tip ~ weekend*night*passenger_count + trip_distance + factor(ratecodeid, levels = 1:5),
                        data = cc_taxis$src$con, family = binomial(), tablename = "xryqxldcdd", chunksize = 4e5)
})

system.time({
  model1biglm <- bigglm(bad_tip ~ weekend*night*passenger_count + trip_distance + factor(ratecodeid, levels = 1:5),
                        data = cc_taxis$src$con, family = binomial(), tablename = "xryqxldcdd", chunksize = 4e5)
})

--------------------------------------------------------------------------------
/man/dbglm.Rd:
--------------------------------------------------------------------------------
\name{dbglm}
\alias{dbglm}
%- Also NEED an '\alias' for EACH other topic documented here.
\title{
Fast generalised linear models in a database
}
\description{
Fit a generalised linear model to a large dataset, by fitting the model to a subsample and using the subsample estimate as the starting value for one iteration of Fisher scoring. The one-step update is computed in a \code{dbplyr} expression that will translate to a single database query.
}
\usage{
dbglm(formula, family = binomial(), tbl, sd = FALSE, weights = .NotYetImplemented(), subset = .NotYetImplemented(), ...)
}
%- maybe also 'usage' for other objects documented here.
\arguments{
  \item{formula}{
A model formula. It can have interactions but cannot have any transformations except \code{factor}
}
  \item{family}{
Model family
}
  \item{tbl}{
An object inheriting from \code{tbl}. Will typically be a database-backed lazy \code{tbl} from the \code{dbplyr} package
}
  \item{sd}{
Experimental: compute the standard deviation of the score as well as the mean in the update and use it to improve the information matrix estimate
}
  \item{weights}{
We don't support weights
}
  \item{subset}{
If you want to analyze a subset, use \code{filter()} on the data
}
  \item{\dots}{
Additional arguments, passed to \code{glm} for the subsample fit and to the \code{dbsample} method
}
}
\details{
For a dataset of size \code{N} the subsample is of size \code{N^(5/9)}.
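
In the notation of the returned components below, the polished estimate is a single Fisher-scoring step from the subsample fit,
\deqn{\widehat{\beta} = \tilde{\beta} + \widehat{V} U(\tilde{\beta}),}{hatbeta = tildebeta + hatV U(tildebeta),}
where \eqn{U(\tilde{\beta})}{U(tildebeta)} is the score for the full data at the subsample estimate (computed in the database as \eqn{N} times the mean score) and \eqn{\widehat{V}}{hatV} is the subsample variance matrix rescaled by \eqn{n/N}, an estimate of the inverse information for the full data.
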
Unless \code{N} is large the approximation won't be very good. Also, with small \code{N} it's quite likely that, e.g., some factor levels will be missing in the subsample.
}
\value{
A list with elements
\item{tildebeta}{coefficients from the subsample fit}
\item{hatbeta}{final (one-step updated) coefficient estimates}
\item{tildeV}{variance matrix from the subsample fit}
\item{hatV}{variance matrix for the final estimate}
}
\references{
\url{http://notstatschat.tumblr.com/post/171570186286/faster-generalised-linear-models-in-largeish-data}
}


\examples{

}
% Add one or more standard keywords, see file 'KEYWORDS' in the
% R documentation directory.
\keyword{models}% use one of RShowDoc("KEYWORDS")

--------------------------------------------------------------------------------