├── DESCRIPTION
├── NAMESPACE
├── R
│   └── dbglm.R
├── README.md
├── inst
│   ├── dbglm-poisson.R
│   ├── nzcars.R
│   └── taxis.R
└── man
    └── dbglm.Rd

/DESCRIPTION:
--------------------------------------------------------------------------------
Package: dbglm
Title: Generalised linear models by subsampling and one-step polishing
Version: 0.1
Author: Thomas Lumley
Description: Fast fitting of generalised linear models on moderately large datasets, by taking an initial sample, fitting in memory, then evaluating the score function for the full data in the database.
Imports: DBI, tidypredict, purrr, rlang, methods, dplyr
Suggests: MonetDBLite, RSQLite
License: GPL-3
Maintainer: Thomas Lumley

--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
importFrom("methods", "is")
importFrom("stats", "binomial", "sd", "terms", "glm", "coef", "vcov")
import("DBI")
import("dplyr")
importFrom("purrr", "map", "map2", "reduce", "flatten")
import("rlang")
import("tidypredict")
export(dbglm)

--------------------------------------------------------------------------------
/R/dbglm.R:
--------------------------------------------------------------------------------

## Draw a subsample of (approximately) n of the N rows of a possibly
## database-backed table.
dbsample <- function(tbl, ...) UseMethod("dbsample")


dbsample.tbl_monetdb <- function(tbl, n, N, variables, ...) {
  dbGetQuery(tbl$src$con,
             dbplyr::build_sql("select ", dbplyr::ident(variables), " from (",
                               dbplyr::sql(dbplyr::sql_render(tbl)),
                               ") as foo sample ", as.integer(n)))
}

dbsample.tbl_sql <- function(tbl, n, N, variables, ...) {
  if (!is(tbl$src$con, "SQLiteConnection")) stop("only implemented for RSQLite so far")
  dbGetQuery(tbl$src$con,
             dbplyr::build_sql("select ", dbplyr::ident(variables), " from (",
                               dbplyr::sql(dbplyr::sql_render(tbl)),
                               ") where abs(CAST(random() AS REAL))/9223372036854775808 <",
                               as.double(n / N)))
}


dbsample.tbl_df <- function(tbl, n, N, variables, ...) {
  tbl[sample(N, n), ]
}


dbglm <- function(formula, family = binomial(), tbl, sd = FALSE,
                  weights = .NotYetImplemented(), subset = .NotYetImplemented(), ...) {

  variables <- all.vars(formula)
  if (!all(variables %in% colnames(tbl))) stop("variables must be in data tbl")
  tbl2 <- select(tbl, !!!syms(all.vars(formula)))

  N <- pull(summarise(tbl2, n()))
  n <- round(N^(5/9))
  sdf <- dbsample(tbl2, n, N, variables, ...)

  ## fit the model in memory on the subsample
  model0 <- glm(formula = formula, family = family, data = sdf, ...)
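
  ## One-step polishing: model0 provides the starting value (tildebeta). The
  ## full-data score at tildebeta is estimated in the database as N times the
  ## mean score (see score_mean / score_meansd below), and the subsample
  ## variance matrix rescaled by n/N approximates the inverse information for
  ## the full data, giving the update hatbeta = tildebeta + hatV %*% U.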

  if (sd) {
    rval <- t(as.matrix(tbl2 %>% score_meansd(model0)))
    U <- rval[, 1] * N
    beta0 <- coef(model0)
    V0 <- vcov(model0)
    inf <- solve(summary(model0)$cov.unscaled)
    seratio <- sqrt(diag(inf)) / (rval[, 2] * sqrt(n))
    V1 <- V0 * n / N
    V2 <- outer(seratio, seratio) * V1
    beta1 <- beta0 + V1 %*% U
    beta2 <- beta0 + V2 %*% U

    list(beta0, beta1, beta2, V1, V2)

  } else {
    U <- t(as.matrix(tbl2 %>% score_mean(model0))) * N
    beta0 <- coef(model0)
    V0 <- vcov(model0)
    V1 <- vcov(model0) * (n / N)
    beta1 <- beta0 + V1 %*% U
    list(tildebeta = beta0, hatbeta = beta1, tildeV = V0, hatV = V1)
  }
}

## turn "factor(x)" into "x" in term labels
strip_factor <- function(x) gsub("factor\\((.+)\\)", "\\1", x)

## score_mean: build, for each regression coefficient, a database expression for
## the corresponding score component -- the (response - fitted) residual times
## that coefficient's column of the model matrix -- and return the mean of each
## component over the whole table.
score_mean <- function(df, model, fitname = "_fit_", residname = "_resid_") {
  df <- df %>% tidypredict_to_column(model, vars = c(fitname, "", ""))

  parsedmodel <- parse_model(model)
  labels <- parsedmodel %>%
    filter(labels == "labels") %>%
    as.character()

  labels <- labels[4:length(labels)]
  labels <- c("estimate", labels)
  all_terms <- parsedmodel %>%
    filter(.data$type == "term") %>%
    select(-.data$type, -.data$labels)

  selection <- which(labels != "NA")
  all_terms <- all_terms[, which(labels != "NA")]
  colnames(all_terms) <- labels[which(labels != "NA")]

  response <- attr(terms(model), "variables")[[2]]
  fit <- sym(fitname)

  f <- seq_len(nrow(all_terms)) %>%
    map(~{
      vars <- strip_factor(colnames(all_terms))
      vals <- as.character(all_terms[.x, ])

      resid <- expr((!!!response) - (!!!fit))

      reg <- vars[vals == "{{:}}" & !is.na(vals) & vars != "estimate"]
      reg <- expr(!!syms(reg))

      field <- vars[vals != "{{:}}" & !is.na(vals) & vars != "estimate"]
      val <- vals[vals != "{{:}}" & !is.na(vals) & vars != "estimate"]
      ie <- map2(syms(field), val, function(x, y) expr((!!x) == (!!y)))
      ie <- map(ie, function(x) expr(ifelse(!!x, 1.0, 0.0)))
      set <- c(reg, ie, resid)
      reduce(set, function(l, r) expr((!!!l) * (!!!r)))
    })

  offset <- filter(parsedmodel, labels == "offset")
  if (nrow(offset) > 0) {
    f <- c(f, sym(offset$vals))
  }

  names(f) <- paste0("_u", seq_along(coef(model)))

  df %>%
    mutate(!!!f) %>%
    summarise(!!!map(paste0("_u", seq_along(coef(model))),
                     function(x) expr(mean(!!sym(x))))) %>%
    collect()
}

## score_meansd: as score_mean, but also returns the standard deviation of each
## score component; the result is a 2-row matrix (means in row 1, SDs in row 2)
## used by dbglm(sd = TRUE).
score_meansd <- function(df, model, fitname = "_fit_", residname = "_resid_") {
  df <- df %>% tidypredict_to_column(model, vars = c(fitname, "", ""))

  parsedmodel <- parse_model(model)
  labels <- parsedmodel %>%
    filter(labels == "labels") %>%
    as.character()

  labels <- labels[4:length(labels)]
  labels <- c("estimate", labels)
  all_terms <- parsedmodel %>%
    filter(.data$type == "term") %>%
    select(-.data$type, -.data$labels)

  selection <- which(labels != "NA")
  all_terms <- all_terms[, which(labels != "NA")]
  colnames(all_terms) <- labels[which(labels != "NA")]

  response <- attr(terms(model), "variables")[[2]]
  fit <- sym(fitname)

  f <- seq_len(nrow(all_terms)) %>%
    map(~{
      vars <- strip_factor(colnames(all_terms))
      vals <- as.character(all_terms[.x, ])

      resid <- expr((!!!response) - (!!!fit))

      reg <- vars[vals == "{{:}}" & !is.na(vals) & vars != "estimate"]
      reg <- expr(!!syms(reg))

      field <- vars[vals != "{{:}}" & !is.na(vals) & vars != "estimate"]
      val <- vals[vals != "{{:}}" & !is.na(vals) & vars != "estimate"]
      ie <- map2(syms(field), val, function(x, y) expr((!!x) == (!!y)))
      ie <- map(ie, function(x) expr(ifelse(!!x, 1, 0)))
      set <- c(reg, ie, resid)
      reduce(set, function(l, r) expr((!!!l) * (!!!r)))
    })

  offset <- filter(parsedmodel, labels == "offset")
  if (nrow(offset) > 0) {
    f <- c(f, sym(offset$vals))
  }

  names(f) <- paste0("_u", seq_along(coef(model)))

  rval <- df %>%
    mutate(!!!f) %>%
    summarise(!!!flatten(map(paste0("_u", seq_along(coef(model))),
                             function(x) c(expr(mean(!!sym(x))), expr(sd(!!sym(x))))))) %>%
    collect()

  matrix(as.matrix(rval), nrow = 2)
}

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This package fits generalised linear models to moderately large data sets stored in a relational database. The code has implementations for MonetDB and SQLite, but should be easy to adapt to any other database that has EXP and RAND.

The code takes a subsample of the data, fits the model in memory, and then improves the estimate with one step of Fisher scoring, computed with a single SQL aggregation query. A minimal usage sketch is shown below; complete worked examples are in the scripts under `inst/`.
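
The sketch below assumes an existing SQLite database `nzcars.db` containing a table `vehicles`; the database, table, and column names are only illustrative.

```r
library(DBI)
library(RSQLite)
library(dplyr)
library(dbglm)

con <- dbConnect(SQLite(), "nzcars.db")
vehicles <- tbl(con, "vehicles")   # lazy, database-backed tbl

## subsample + one-step polishing; the heavy computation runs in the database
fit <- dbglm(isred ~ power_rating + number_of_seats, tbl = vehicles)

fit$tildebeta   # coefficients from the subsample fit
fit$hatbeta     # polished (one-step updated) coefficients
fit$hatV        # estimated variance matrix of the polished coefficients

dbDisconnect(con)
```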

--------------------------------------------------------------------------------
/inst/dbglm-poisson.R:
--------------------------------------------------------------------------------
set.seed(2018-7-7)
df <- data.frame(x = rep(1:1000, 10000), z = rnorm(1e7))
df$y <- with(df, rpois(1e7, exp(x/1000 + z/1000 + 1)))

system.time(inmem <- glm(y ~ x + z, family = poisson(), data = df))

library(MonetDBLite)
library(DBI)
td <- tempdir()
con <- dbConnect(MonetDBLite(), td)
dbWriteTable(con, "simulated", df)
dbDisconnect(con, shutdown = TRUE)

rm(df)

library(dbglm)
library(dplyr)
ms <- MonetDBLite::src_monetdblite(td)

system.time(indb <- dbglm(y ~ x + z, family = poisson(), tbl = tbl(ms, "simulated")))

dbDisconnect(ms$con, shutdown = TRUE)

--------------------------------------------------------------------------------
/inst/nzcars.R:
--------------------------------------------------------------------------------
##
## Download data from figshare
## https://figshare.com/articles/NZ_vehicles_database/5971471
##
### Code for MonetDB

library(dbglm)
library(DBI)
library(MonetDBLite)
library(dplyr)
library(dbplyr)

ms <- MonetDBLite::src_monetdblite("~/VEHICLE")
monetdb.read.csv(ms$con, "Fleet30Nov2017.csv", tablename = "vehicles", quote = "",
                 nrow.check = 10000, best.effort = TRUE, lower.case.names = TRUE)
vehicles <- tbl(ms, "vehicles")
cars <- filter(vehicles, vehicle_type == "PASSENGER CAR/VAN") %>%
  mutate(isred = ifelse(basic_colour == "RED", 1, 0)) %>%
  filter(number_of_seats > 1 & number_of_seats < 7) %>%
  filter(number_of_axles == 2) %>%
  compute()

system.time({
  model <- dbglm(isred ~ power_rating + number_of_seats + gross_vehicle_mass, tbl = cars)
})


### Code for SQLite
library(dbglm)
library(RSQLite)
library(dplyr)
library(dbplyr)


# data setup

vehicles <- read.csv("Fleet30Nov2017.csv")
names(vehicles) <- tolower(names(vehicles))
vehicles$power_rating <- as.numeric(as.character(vehicles$power_rating))
vehicles$number_of_seats <- as.numeric(as.character(vehicles$number_of_seats))
vehicles$number_of_axles <- as.numeric(as.character(vehicles$number_of_axles))

sqlite <- dbDriver("SQLite")
con <- dbConnect(sqlite, "nzcars.db")
RSQLite:::initExtension(con)
dbWriteTable(con, "vehicles", vehicles)
rm(vehicles)
dbDisconnect(con)

# analysis
library(dbglm)
library(RSQLite)
library(dplyr)
library(dbplyr)

sqlite <- dbDriver("SQLite")
con <- dbConnect(sqlite, "nzcars.db")
RSQLite:::initExtension(con)
sqlitevehicles <- tbl(con, "vehicles")


cars <- filter(sqlitevehicles, vehicle_type == "PASSENGER CAR/VAN") %>%
  mutate(isred = ifelse(basic_colour == "RED", 1, 0)) %>%
  filter(number_of_seats > 1 & number_of_seats < 7) %>%
  filter(number_of_axles == 2) %>%
  compute()

system.time({
  sqlitemodel <- dbglm(isred ~ power_rating + number_of_seats + gross_vehicle_mass, tbl = cars)
})

--------------------------------------------------------------------------------
/inst/taxis.R:
--------------------------------------------------------------------------------
##
## Download January to June 2016 data from
## https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-01.csv
## https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-02.csv
## https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-03.csv
## https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-04.csv
## https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-05.csv
## https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-06.csv
## and July to December data from
## https://figshare.com/articles/yellow_tripdata_2016-07trim_csv/5965522
##
##
## This is the directory you put them in
datadir <- "~/TAXITEST"
##


library(DBI)
library(MonetDBLite)
library(dplyr)
library(dbplyr)

ms <- MonetDBLite::src_monetdblite(paste(datadir, "db", sep = "/"))

## read the January to June data
inputs <- list.files(datadir, "-0[1-6]", full.names = TRUE)
monetdb.read.csv(ms$con, inputs, tablename = "yellow", header = TRUE, lower.case.names = TRUE)

## read the July to December data (the header on the original file is wrong)
nm <- tolower(strsplit("VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,junk1,junk2", ",")[[1]])
inputs <- list.files(datadir, "trim", full.names = TRUE)
monetdb.read.csv(ms$con, inputs, tablename = "yellowlate", col.names = nm, header = FALSE)

## Combine the two data sets

dbSendQuery(ms$con, "create table taxis as (select vendorid, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, ratecodeid, store_and_fwd_flag, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount from yellow union all select vendorid, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, ratecodeid, store_and_fwd_flag, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount from yellowlate)")

## Create some additional variables

dbSendQuery(ms$con, "alter table taxis add column dropoff_timestamp TIMESTAMP")
dbSendQuery(ms$con, "update taxis set dropoff_timestamp = str_to_timestamp(tpep_dropoff_datetime, '%Y-%m-%d %H:%M:%S')")
dbSendQuery(ms$con, "alter table taxis add column drop_hour FLOAT")
dbSendQuery(ms$con, "update taxis set drop_hour = extract( hour from dropoff_timestamp)")
dbSendQuery(ms$con, "alter table taxis add column drop_dow FLOAT")
dbSendQuery(ms$con, "update taxis set drop_dow = dayofweek(dropoff_timestamp)")

## Subset and create more variables

taxis <- tbl(ms, "taxis")
cc_taxis <- taxis %>%
  filter(payment_type == 1) %>%
  filter(fare_amount > 0) %>%
  filter(trip_distance < 50) %>%
  filter(ratecodeid != 99) %>%
  mutate(bad_tip = ifelse(tip_amount/(fare_amount + improvement_surcharge) < 0.20, 1.0, 0.0)) %>%
  mutate(night = ifelse(drop_hour < 5 | drop_hour > 7, 1.0, 0)) %>%
  mutate(weekend = ifelse(drop_dow == 1 | drop_dow == 7 | (drop_dow == 6 & drop_hour > 7), 1, 0)) %>%
  compute()

## The actual model fitting

system.time({
  model0 <- dbglm(bad_tip ~ factor(weekend)*night + trip_distance + passenger_count + factor(ratecodeid), tbl = cc_taxis)
})

system.time({
  model1 <- dbglm(bad_tip ~ factor(weekend)*factor(night)*passenger_count + trip_distance + factor(ratecodeid), tbl = cc_taxis, family = binomial)
})


## Comparison with bigglm, reading the data in chunks from the same database.
## "xryqxldcdd" is the (run-specific) temporary table backing cc_taxis, created
## by compute() above; the name will differ in each run.

library(biglm)

system.time({
  model0biglm <- bigglm(bad_tip ~ weekend*night*passenger_count + trip_distance + factor(ratecodeid, levels = 1:5),
                        data = cc_taxis$src$con, family = binomial(), tablename = "xryqxldcdd", chunksize = 4e5)
})

system.time({
  model1biglm <- bigglm(bad_tip ~ weekend*night*passenger_count + trip_distance + factor(ratecodeid, levels = 1:5),
                        data = cc_taxis$src$con, family = binomial(), tablename = "xryqxldcdd", chunksize = 4e5)
})

--------------------------------------------------------------------------------
/man/dbglm.Rd:
--------------------------------------------------------------------------------
\name{dbglm}
\alias{dbglm}
%- Also NEED an '\alias' for EACH other topic documented here.
\title{
Fast generalised linear models in a database
}
\description{
Fit a generalised linear model to a large dataset, by fitting the model to a subsample and using the subsample estimate as the starting value for one iteration of Fisher scoring. The one-step update is computed in a \code{dbplyr} expression that will translate to a single database query.
}
\usage{
dbglm(formula, family = binomial(), tbl, sd = FALSE, weights = .NotYetImplemented(), subset = .NotYetImplemented(), ...)
}
%- maybe also 'usage' for other objects documented here.
\arguments{
  \item{formula}{
A model formula. It can have interactions but cannot have any transformations except \code{factor}
}
  \item{family}{
Model family
}
  \item{tbl}{
An object inheriting from \code{tbl}. Will typically be a database-backed lazy \code{tbl} from the \code{dbplyr} package
}
  \item{sd}{
Experimental: compute the standard deviation of the score as well as the mean in the update and use it to improve the information matrix estimate
}
  \item{weights}{
We don't support weights
}
  \item{subset}{
If you want to analyze a subset, use \code{filter()} on the data
}
  \item{\dots}{
Additional arguments, passed to \code{glm} for the subsample fit and to the \code{dbsample} method
}
}
\details{
For a dataset of size \code{N} the subsample is of size \code{N^(5/9)}.
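
In the notation of the returned components below, the polished estimate is a single Fisher-scoring step from the subsample fit,
\deqn{\widehat{\beta} = \tilde{\beta} + \widehat{V} U(\tilde{\beta}),}{hatbeta = tildebeta + hatV U(tildebeta),}
where \eqn{U(\tilde{\beta})}{U(tildebeta)} is the score for the full data at the subsample estimate (computed in the database as \eqn{N} times the mean score) and \eqn{\widehat{V}}{hatV} is the subsample variance matrix rescaled by \eqn{n/N}, an estimate of the inverse information for the full data.
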
Unless \code{N} is large the approximation won't be very good. Also, with small \code{N} it's quite likely that, e.g., some factor levels will be missing in the subsample.
}
\value{
A list with elements
\item{tildebeta}{coefficients from the subsample fit}
\item{hatbeta}{final (one-step updated) coefficient estimates}
\item{tildeV}{variance matrix from the subsample fit}
\item{hatV}{variance matrix for the final estimate}
}
\references{
\url{http://notstatschat.tumblr.com/post/171570186286/faster-generalised-linear-models-in-largeish-data}
}


\examples{

}
% Add one or more standard keywords, see file 'KEYWORDS' in the
% R documentation directory.
\keyword{models}% use one of RShowDoc("KEYWORDS")

--------------------------------------------------------------------------------