├── R
│   ├── .Rapp.history
│   ├── rfImp.R
│   ├── missdf.R
│   ├── quantile_df.R
│   ├── filterRules.R
│   ├── removePeriods.R
│   ├── quantileScore.R
│   ├── addRuleQuality.R
│   ├── mergeRules2trans.R
│   ├── rules2df.R
│   ├── shiny_table.R
│   ├── makeSQL_createTable.R
│   ├── balPanel.R
│   ├── mergeEdgeLists.R
│   ├── makeVertexAtt.R
│   ├── plotAvgBy.R
│   ├── overlap.R
│   ├── plotdf.R
│   ├── shiny_leafletDF.R
│   ├── smartAgg.R
│   ├── partialDependence.R
│   ├── bin.R
│   ├── igraph_network.R
│   ├── shiny_arules.R
│   ├── shiny_tableNet.R
│   └── modeling.R
├── draft
│   ├── .Rapp.history
│   └── horseRaceModel.R
├── .Rbuildignore
├── .gitignore
├── Rsenal.Rproj
├── man
│   ├── shinyTable.Rd
│   ├── rfImp.Rd
│   ├── makeForm.Rd
│   ├── missdf.Rd
│   ├── plotdf.Rd
│   ├── quantileScore.Rd
│   ├── quantileFeatureScore.Rd
│   ├── plotBestglm.Rd
│   ├── rules2df.Rd
│   ├── editForm.Rd
│   ├── roundCut.Rd
│   ├── shuffleAnova.Rd
│   ├── overlap.Rd
│   ├── getEfromVlist.Rd
│   ├── Rsenal.Rd
│   ├── filterRules.Rd
│   ├── predSortPlot.Rd
│   ├── getV1fromVlist.Rd
│   ├── logit2tab.Rd
│   ├── quantile_df.Rd
│   ├── travOut.Rd
│   ├── makeSQL_createTable.Rd
│   ├── addRuleQuality.Rd
│   ├── getVfromE.Rd
│   ├── pruneEdge.Rd
│   ├── removePeriods.Rd
│   ├── mergeEdgeLists.Rd
│   ├── travCount.Rd
│   ├── balPanel.Rd
│   ├── uniglm.Rd
│   ├── mergeRules2trans.Rd
│   ├── optimizeModelWeight.Rd
│   ├── dir2dfList.Rd
│   ├── leafletMapDF.Rd
│   ├── makeVertexAtt.Rd
│   ├── isKey.Rd
│   ├── depthbin.Rd
│   ├── predQuantile.Rd
│   ├── ipipApp.Rd
│   ├── plotAvgBy.Rd
│   ├── tableNet.Rd
│   ├── binCat.Rd
│   ├── partialDep.Rd
│   ├── partialDepAll.Rd
│   ├── arulesApp.Rd
│   └── smartAgg.Rd
├── DESCRIPTION
├── LICENSE
├── README.md
└── NAMESPACE
/R/.Rapp.history: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /draft/.Rapp.history: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .DS_Store -------------------------------------------------------------------------------- /Rsenal.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageInstallArgs: --no-multiarch --with-keep.source 17 | PackageRoxygenize: rd,namespace 18 | -------------------------------------------------------------------------------- /man/shinyTable.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{shinyTable} 3 | \alias{shinyTable} 4 | \title{Shiny app to visualize dataframe as simple interactive datatable} 5 | \usage{ 6 | shinyTable(df) 7 | } 8 | \arguments{ 9 | \item{df}{dataframe to be visualized} 10 | } 11 | \value{ 12 | Shiny App 13 | } 14 | \description{ 15 | Launches a basic Shiny App that renders the given dataframe into an interactive datatable using \code{renderDataTable} 16 | } 17 | \examples{ 18 | \dontrun{ 19 | shinyTable(mtcars) 20 | } 21 | } 22 | 23 | 
-------------------------------------------------------------------------------- /man/rfImp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{rfImp} 3 | \alias{rfImp} 4 | \title{Sort Random Forest Variable Importance Features} 5 | \usage{ 6 | rfImp(rf) 7 | } 8 | \arguments{ 9 | \item{rf}{randomForest object from randomForest package} 10 | } 11 | \value{ 12 | data.frame 13 | } 14 | \description{ 15 | Simple utility function to sort random forest variable importance features. Very simple. 16 | } 17 | \examples{ 18 | library('randomForest') 19 | myrf <- randomForest(iris[1:4], iris$Species) 20 | rfImp(myrf) 21 | } 22 | 23 | -------------------------------------------------------------------------------- /R/rfImp.R: -------------------------------------------------------------------------------- 1 | #' @title Sort Random Forest Variable Importance Features 2 | #' @description Simple utility function to sort random forest variable importance features. Very simple. 3 | #' @param rf randomForest object from randomForest package 4 | #' @return data.frame 5 | #' @export 6 | #' @examples 7 | #' library('randomForest') 8 | #' myrf <- randomForest(iris[1:4], iris$Species) 9 | #' rfImp(myrf) 10 | 11 | rfImp <- function(rf) { 12 | df <- data.frame(rf$importance[order(rf$importance, decreasing=T),,drop=F]) 13 | df$variable <- row.names(df) 14 | df <- df[,c(2,1)] 15 | row.names(df) <- NULL 16 | return(df) 17 | } -------------------------------------------------------------------------------- /man/makeForm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{makeForm} 3 | \alias{makeForm} 4 | \title{Create formula} 5 | \usage{ 6 | makeForm(y, xList) 7 | } 8 | \arguments{ 9 | \item{y}{character string, target variable} 10 | 11 | \item{xList}{character vector, predictor variables (Right Hand Side variables) for formula} 12 | } 13 | \value{ 14 | formula object 15 | } 16 | \description{ 17 | Creates a formula object which can be passed to many R modeling functions from a vector of variable names. 18 | } 19 | \examples{ 20 | form <- makeForm('mpg', c('drat', 'wt', 'hp')) 21 | summary(lm(form, data=mtcars)) 22 | } 23 | 24 | -------------------------------------------------------------------------------- /man/missdf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{missdf} 3 | \alias{missdf} 4 | \title{Inspect missing data in a data.frame} 5 | \usage{ 6 | missdf(df, criteria = "NA") 7 | } 8 | \arguments{ 9 | \item{df}{data.frame to inspect for missing values} 10 | 11 | \item{criteria}{character string, criteria to search for. 
\code{'NA'} or \code{'Inf'}} 12 | } 13 | \value{ 14 | summary table 15 | } 16 | \description{ 17 | Currently supports \code{NA} and \code{Inf} 18 | } 19 | \examples{ 20 | mtcars2 <- mtcars 21 | for(i in 1:ncol(mtcars2)) mtcars2[sample(nrow(mtcars2), sample(1:5,1), replace=T),i] <- NA 22 | missdf(mtcars2) 23 | } 24 | 25 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: Rsenal 2 | Title: Rsenal (arsenal) of assorted R functions developed over the years 3 | Description: stats, data transformation, visualization 4 | Version: 0.1 5 | Author: Andrew Brooks 6 | Maintainer: Andrew Brooks 7 | Depends: 8 | R (>= 3.0.2) 9 | License: GPL-2 10 | LazyData: true 11 | Imports: 12 | data.table, 13 | roxygen2, 14 | igraph, 15 | leaps, 16 | bestglm, 17 | randomForest, 18 | Hmisc, 19 | shiny, 20 | ggplot2, 21 | arules, 22 | arulesViz, 23 | vcd, 24 | RColorBrewer, 25 | fmsb, 26 | MASS, 27 | gplots, 28 | memoise, 29 | BBmisc 30 | Suggests: 31 | RoxygenNote: 5.0.1 -------------------------------------------------------------------------------- /man/plotdf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{plotdf} 3 | \alias{plotdf} 4 | \title{Plot data.frame to PDF} 5 | \usage{ 6 | plotdf(df, file = "output.pdf", wordy = F) 7 | } 8 | \arguments{ 9 | \item{df}{data.frame to plot.} 10 | 11 | \item{file}{string - name of the PDF that will be created} 12 | 13 | \item{wordy}{boolean \code{TRUE} or \code{FALSE}. Sequentially prints status of each chart to the console. 14 | Could be useful for large data.frames.} 15 | } 16 | \value{ 17 | PDF file of plots 18 | } 19 | \description{ 20 | Plots every column of a data.frame as an individual plot (one plot per page) 21 | in a PDF file. 22 | } 23 | \examples{ 24 | \dontrun{ 25 | plotdf(df=mtcars, file='mtcars_plots.pdf') 26 | } 27 | } 28 | 29 | -------------------------------------------------------------------------------- /man/quantileScore.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{quantileScore} 3 | \alias{quantileScore} 4 | \title{Quantile scoring function for continuous or integer valued data} 5 | \usage{ 6 | quantileScore(x) 7 | } 8 | \arguments{ 9 | \item{x}{numeric vector to scale} 10 | } 11 | \value{ 12 | numeric vector of quantile scores between 0 and 1, corresponding in order with \code{x} 13 | } 14 | \description{ 15 | This function sorts a series from beginning to end. It uses each observation's place in the quantile to assign it a score. 
Useful for transforming variables into the same units (0-1). 
Handles NAs. 17 | } 18 | \examples{ 19 | cbind(quantileScore(mtcars$mpg), mtcars$mpg) 20 | cbind(quantileScore(mtcars$cyl), mtcars$cyl) 21 | } 22 | 23 | -------------------------------------------------------------------------------- /man/quantileFeatureScore.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{quantileFeatureScore} 3 | \alias{quantileFeatureScore} 4 | \title{Quantile scoring function for continuous or integer valued data} 5 | \usage{ 6 | quantileFeatureScore(x, wq = 0.5, wf = 0.5) 7 | } 8 | \arguments{ 9 | \item{x}{numeric vector to scale} \item{wq}{numeric, weight on the quantile score component} \item{wf}{numeric, weight on the min-max feature scaling component} 10 | } 11 | \value{ 12 | numeric vector of scores between 0 and 1, corresponding in order with \code{x} 13 | } 14 | \description{ 15 | This function combines the quantile function with the basic feature normalization technique, 16 | useful for getting a stable normalization. 17 | } 18 | \examples{ 19 | cbind(quantileFeatureScore(sort(mtcars$mpg)), sort(mtcars$mpg)) 20 | cbind(quantileFeatureScore(mtcars$cyl), mtcars$cyl) 21 | } 22 | 23 | -------------------------------------------------------------------------------- /man/plotBestglm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{plotBestglm} 3 | \alias{plotBestglm} 4 | \title{All-subsets logistic regression} 5 | \usage{ 6 | plotBestglm(bglm, rc = 2) 7 | } 8 | \arguments{ 9 | \item{bglm}{\code{bestglm} object from \code{\link{bestglm}}} 10 | 11 | \item{rc}{decimal places to display on y-axis of plot} 12 | } 13 | \value{ 14 | plot 15 | } 16 | \description{ 17 | Plots the output of the bestglm BestModels object. Similar to the visual output of plotting a regsubsets object 18 | } 19 | \examples{ 20 | require('bestglm') 21 | b <- bestglm(Xy=mtcars[,c('mpg', 'hp', 'drat', 'cyl', 'wt', 'qsec', 'vs')], family=binomial(logit), IC='BIC', nvmax=4) 22 | plotBestglm(b$BestModels, rc=3) 23 | } 24 | \seealso{ 25 | \code{\link{bestglm}}, \code{\link{leaps}} 26 | } 27 | 28 | -------------------------------------------------------------------------------- /man/rules2df.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{rules2df} 3 | \alias{rules2df} 4 | \title{Transform association rule results into data.frame} 5 | \usage{ 6 | rules2df(rules, list = F) 7 | } 8 | \arguments{ 9 | \item{rules}{list of association rules (S4 arules object). Output of \code{apriori} function.} 10 | 11 | \item{list}{logical \code{TRUE} or \code{FALSE}. 
Sets LHS items in a list, rather than one character string} 12 | } 13 | \value{ 14 | association rules in a data.frame 15 | } 16 | \description{ 17 | Note: this function currently only works when the itemsets are of size 1 on the LHS and RHS 18 | } 19 | \examples{ 20 | library('arules') 21 | data("Adult") 22 | ar <- apriori(Adult, parameter = list(supp = 0.5, conf = 0.6, target = "rules", minlen=2)) 23 | df <- rules2df(ar, list=T) 24 | } 25 | 26 | -------------------------------------------------------------------------------- /man/editForm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{editForm} 3 | \alias{editForm} 4 | \title{Edit a formula} 5 | \usage{ 6 | editForm(form, add = NULL, sub = NULL) 7 | } 8 | \arguments{ 9 | \item{form}{formula, the formula you will be editing} 10 | 11 | \item{add}{character vector, predictor variables to add to formula} 12 | 13 | \item{sub}{character vector, predictor variables to subtract (remove) from formula} 14 | } 15 | \value{ 16 | formula, edited formula object 17 | } 18 | \description{ 19 | Easy way to add or subtract predictor variables to a formula 20 | } 21 | \examples{ 22 | form <- as.formula('mpg~cyl+hp+drat+qsec') 23 | editForm(form, add=c('wt', 'gear', 'carb')) 24 | editForm(form, sub=c('cyl', 'hp', 'qsec', 'variableNotInFormula')) 25 | editForm(form, sub=c('cyl', 'hp', 'qsec'), add=c('wt')) 26 | } 27 | 28 | -------------------------------------------------------------------------------- /man/roundCut.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{roundCut} 3 | \alias{roundCut} 4 | \title{Round numbers in interval} 5 | \usage{ 6 | roundCut(x, r = 1) 7 | } 8 | \arguments{ 9 | \item{x}{character vector of bins to format} 10 | 11 | \item{r}{number, 0 to 10 (or higher I suppose) indicating how many decimals to display} 12 | } 13 | \value{ 14 | formatted character vector with length of input vector. 15 | } 16 | \description{ 17 | Formats an interval of form \code{(5.234,11.783]} to something like \code{(5.2,11.8]}. 18 | Used for formatting only, mainly with binning functions like \code{\link{depthbin}}. Intervals can be open or closed with 19 | \code{(} and \code{[} respectively and are maintained as such when formatted. Useful for prettifying graphs and reports. 20 | } 21 | \examples{ 22 | x1 <- cut(quantile(rnorm(100)), breaks=4) 23 | roundCut(x1, 1) 24 | } 25 | \seealso{ 26 | \code{\link{depthbin}} 27 | } 28 | 29 | -------------------------------------------------------------------------------- /man/shuffleAnova.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{shuffleAnova} 3 | \alias{shuffleAnova} 4 | \title{Variable importance from iteratively shuffled orders of predictor variables} 5 | \usage{ 6 | shuffleAnova(form, df, n = 5, test = "Chisq") 7 | } 8 | \arguments{ 9 | \item{form}{formula to be passed to \code{glm}.} 10 | 11 | \item{df}{data.frame of data to be used for analysis} 12 | 13 | \item{n}{number of iterations for shuffled ANOVA analysis} 14 | 15 | \item{test}{character string, statistical test to run. Default is 'Chisq'.} 16 | } 17 | \value{ 18 | data.frame of results 19 | } 20 | \description{ 21 | Runs multiple anova analyses to assess deviance explained by each predictor in shuffled orders, iteratively. 
22 | Currently using only logistic regression. Could be generalized. 23 | } 24 | \examples{ 25 | form <- as.formula('am~wt+gear+carb+cyl+hp+drat+qsec') 26 | shuffleAnova(form, mtcars, n=50) 27 | } 28 | 29 | -------------------------------------------------------------------------------- /man/overlap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{overlap} 3 | \alias{overlap} 4 | \title{Assess overlap in vectors} 5 | \usage{ 6 | overlap(x1, x2, na.rm = F, wordy = T, x1name = NULL, x2name = NULL) 7 | } 8 | \arguments{ 9 | \item{x1}{vector of same class as \code{x2}} 10 | 11 | \item{x2}{vector of same class as \code{x1}} 12 | 13 | \item{na.rm}{logical, remove NAs from analysis} 14 | 15 | \item{wordy}{logical, prints convenience information} \item{x1name}{character, optional display name for \code{x1} in the output} \item{x2name}{character, optional display name for \code{x2} in the output} 16 | } 17 | \value{ 18 | summary data.frame 19 | } 20 | \description{ 21 | Useful for learning the overlap of absolute and unique values between two vectors of the same type. They 22 | do not have to be the same length, and are allowed to contain NAs. 23 | } 24 | \examples{ 25 | x1 <- sample(1:50, 40, replace=T) 26 | x2 <- sample(c(1:60, rep(NA, 30)), 55, replace=T) 27 | overlap(x1, x2, na.rm=T) 28 | overlap(x1, x2) 29 | overlap(mtcars$gear, mtcars$cyl, x1name='gear', x2name='cyl') 30 | } 31 | 32 | -------------------------------------------------------------------------------- /man/getEfromVlist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{getEfromVlist} 3 | \alias{getEfromVlist} 4 | \title{Get edge IDs from vertices} 5 | \usage{ 6 | getEfromVlist(g, vlist, mode = "out") 7 | } 8 | \arguments{ 9 | \item{g}{igraph object} 10 | 11 | \item{vlist}{list of node/vertex names} 12 | 13 | \item{mode}{character string: 'in', 'out' or 'all'} 14 | } 15 | \value{ 16 | edge IDs that go OUT or IN from the specified list of vertices 17 | } 18 | \description{ 19 | Returns the edge IDs that go OUT or IN from the specified list of vertices 20 | } 21 | \examples{ 22 | require('igraph') 23 | 24 | ## build sample network 25 | from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O') 26 | to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P') 27 | relations<-cbind(from,to) 28 | g <- graph.data.frame(relations) 29 | tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2) 30 | V(g)$tier <- tiers 31 | 32 | getEfromVlist(g, c('A', 'D'), 'out') 33 | } 34 | 35 | -------------------------------------------------------------------------------- /man/Rsenal.Rd: -------------------------------------------------------------------------------- 1 | \docType{package} 2 | \name{Rsenal} 3 | \alias{Rsenal} 4 | \title{a(R)senal of assorted R functions and apps} 5 | \description{ 6 | This is mainly a collection of random, or more accurately, clusters of random R functions 7 | I've written for personal and work projects over the years. I don't expect it to be immediately 8 | useful to the general population of R users. For now it's mostly my cloud storage (with nice documentation) for R functions 9 | that I use repeatedly. Some functions are fairly general purpose, while others were developed for specific 10 | problems and are not super generalizable in the current state. 
\cr 11 | 12 | \bold{Current clusters of functionality:} \cr 13 | - manipulating igraph objects \cr 14 | - modeling and model validation \cr 15 | - cleaning data \cr 16 | - scaling and transforming data 17 | } 18 | 19 | \author{Andrew Brooks \cr \email{andrewbrooksct@gmail.com}} 20 | 21 | 22 | \details{ 23 | \tabular{ll}{ 24 | Package: \tab Rsenal\cr 25 | Type: \tab Package\cr 26 | Version: \tab 0.1\cr 27 | Date: \tab 2014-10-08\cr 28 | } 29 | } -------------------------------------------------------------------------------- /man/filterRules.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{filterRules} 3 | \alias{filterRules} 4 | \title{Post process association rules} 5 | \usage{ 6 | filterRules(rules, oneSideOnly) 7 | } 8 | \arguments{ 9 | \item{rules}{list of association rules (S4 arules object). Output of \code{apriori} function.} 10 | 11 | \item{oneSideOnly}{list of character vectors. Each character vector of the list contains variables that are likely very similar 12 | and will generate uninteresting rules. So the filtering algorithm will prune rules where variables within this list appear on 13 | both the RHS & LHS} 14 | } 15 | \value{ 16 | association rules 17 | } 18 | \description{ 19 | This function filters rules that have already been mined. 20 | } 21 | \examples{ 22 | library('arules') 23 | data('Adult') 24 | rules <- apriori(Adult, parameter = list(supp=0.01, conf=0.9, target = "rules")) 25 | oneSideOnly <- list(c('age', 'workclass', 'education'), c('marital-status', 'occupation', 'race')) 26 | f2 <- filterRules(rules, oneSideOnly=oneSideOnly) 27 | length(rules) 28 | length(f2) 29 | } 30 | 31 | -------------------------------------------------------------------------------- /man/predSortPlot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{predSortPlot} 3 | \alias{predSortPlot} 4 | \title{Devil's Horn} 5 | \usage{ 6 | predSortPlot(pred, ytest, jitterPlot = NULL) 7 | } 8 | \arguments{ 9 | \item{pred}{vector of predictions for target variable on test data} 10 | 11 | \item{ytest}{vector of the target variable from test data. (0s and 1s)} 12 | 13 | \item{jitterPlot}{\code{TRUE} or \code{FALSE}. Jitters points on plot when \code{TRUE}} 14 | } 15 | \value{ 16 | plot 17 | } 18 | \description{ 19 | Visualize results of supervised model predictions on test data. Currently supports binary target variable. 20 | Red dots represent observations where target variable = 1, black dots where target variable = 0. 21 | } 22 | \examples{ 23 | ## Setting up some data and building a basic model on training data. 
24 | mylogit <- glm(vs~drat+hp+mpg, family=binomial('logit'), data=mtcars[1:25,]) 25 | mtcarsTestPred <- predict(mylogit, mtcars[26:32, ], type='response') 26 | predSortPlot(pred=mtcarsTestPred, ytest=mtcars$vs[26:32]) 27 | } 28 | \seealso{ 29 | \code{\link{predQuantile}} 30 | } 31 | 32 | -------------------------------------------------------------------------------- /man/getV1fromVlist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{getV1fromVlist} 3 | \alias{getV1fromVlist} 4 | \title{Get 1 degree away vertexes} 5 | \usage{ 6 | getV1fromVlist(g, vlist, mode = "out") 7 | } 8 | \arguments{ 9 | \item{g}{igraph object} 10 | 11 | \item{vlist}{list of starting node/vertex names} 12 | 13 | \item{mode}{'out' or 'in'. 'out' returns all the vertex/node names 1 degree out from \code{vlist}. 'in' returns all 14 | vertex/node names directed into vlist nodes.} 15 | } 16 | \value{ 17 | vector of node names that are 1 degree away from \code{vlist} 18 | } 19 | \description{ 20 | Returns a list of all vertex names that touch (going IN or OUT) from the 21 | specified vertex list. 22 | } 23 | \examples{ 24 | require('igraph') ## build sample network 25 | from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O') 26 | to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P') 27 | relations<-cbind(from,to) 28 | g <- graph.data.frame(relations) 29 | tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2) 30 | V(g)$tier <- tiers 31 | 32 | getV1fromVlist(g, c('A', 'D'), 'out') 33 | } 34 | 35 | -------------------------------------------------------------------------------- /man/logit2tab.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{logit2tab} 3 | \alias{logit2tab} 4 | \title{Format logistic regression table} 5 | \usage{ 6 | logit2tab(reg, file = NULL, xvlab = NULL, stats = F) 7 | } 8 | \arguments{ 9 | \item{reg}{logistic regression object. output from \code{\link{glm}}} 10 | 11 | \item{file}{filepath of output. \code{txt} or \code{csv} object.} 12 | 13 | \item{xvlab}{data.frame, lookup table for variable names. 
First column is codename, second column is the pretty printed name.} 14 | 15 | \item{stats}{include statistics in output} 16 | } 17 | \value{ 18 | data.frame of results formatted nicely 19 | } 20 | \description{ 21 | Turns a logistic regression object into a regression table 22 | and writes it out to a csv if you choose 23 | } 24 | \examples{ 25 | reg <- glm(am~qsec+hp, data=mtcars, family=binomial(logit)) 26 | logit2tab(reg) 27 | longnames <- data.frame(short = c('wt', 'mpg', 'cyl', 'drat', 'hp', 'am', 'qsec'), 28 | long = c('Weight', 'Miles Per Gallon', 'Cylinder', 'D.R.A.T', 'Horsepower', 'A.M.', 'Q Seconds')) 29 | logit2tab(reg, xvlab=longnames, stats=T) 30 | } 31 | 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Andrew Brooks 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /man/quantile_df.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{quantile_df} 3 | \alias{quantile_df} 4 | \title{Pretty print quantiles} 5 | \usage{ 6 | quantile_df(x, probs = seq(0, 1, 0.05), na.rm = F, names = F, type = 7, 7 | colname = NULL, round = NULL, ...) 8 | } 9 | \arguments{ 10 | \item{x}{numeric vector to calculate quantile with} 11 | 12 | \item{probs}{numeric vector of probabilities to use for quantile} 13 | 14 | \item{na.rm}{logical, pass to quantile function} 15 | 16 | \item{names}{logical, pass to quantile function} 17 | 18 | \item{type}{number, pass to quantile function} 19 | 20 | \item{colname}{character, name of variable column in output} 21 | 22 | \item{round}{integer, number of digits to round quantile value. 
23 | passed to \code{digits} argument of \code{round} function} 24 | } 25 | \value{ 26 | prettified data.frame of probabilities 27 | } 28 | \description{ 29 | Useful for R Markdown reports 30 | } 31 | \examples{ 32 | quantile_df(mtcars$mpg, seq(0,1,.1)) 33 | quantile_df(mtcars$mpg, seq(0,1,.1), names=T) 34 | quantile_df(mtcars$mpg, seq(0,1,.2), colname='mpg') 35 | quantile_df(mtcars$mpg, round=1) 36 | } 37 | 38 | -------------------------------------------------------------------------------- /man/travOut.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{travOut} 3 | \alias{travOut} 4 | \title{Find all nodes connected to root node} 5 | \usage{ 6 | travOut(g, root, orderN = -1) 7 | } 8 | \arguments{ 9 | \item{g}{igraph object} 10 | 11 | \item{root}{character string: name of root node} 12 | 13 | \item{orderN}{number: # of degrees away from root node to search.} 14 | } 15 | \value{ 16 | igraph object, subgraph of \code{g} 17 | } 18 | \description{ 19 | Returns a subset of the original graph (all edges and vertices that a 20 | directed path can take from the root node). \code{orderN} limits the growth of these paths. 21 | } 22 | \examples{ 23 | require('igraph') 24 | 25 | ## build sample network 26 | from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O') 27 | to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P') 28 | relations<-cbind(from,to) 29 | g <- graph.data.frame(relations) 30 | tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2) 31 | V(g)$tier <- tiers 32 | 33 | plot(g) ## full network 34 | plot(travOut(g, 'D')) ## sub network 35 | } 36 | \seealso{ 37 | Simpler version: \code{\link{travCount}} 38 | } 39 | 40 | -------------------------------------------------------------------------------- /R/missdf.R: -------------------------------------------------------------------------------- 1 | 2 | #' @title Inspect missing data in a data.frame 3 | #' @description Currently supports \code{NA} and \code{Inf} 4 | #' @param df data.frame to inspect for missing values 5 | #' @param criteria character string, criteria to search for. \code{'NA'} or \code{'Inf'} 6 | #' @return summary table 7 | #' @export 8 | #' @examples 9 | #' mtcars2 <- mtcars 10 | #' for(i in 1:ncol(mtcars2)) mtcars2[sample(nrow(mtcars2), sample(1:5,1), replace=T),i] <- NA 11 | #' missdf(mtcars2) 12 | 13 | 14 | missdf <- function(df, criteria='NA') { 15 | n <- nrow(df) 16 | if(criteria=='NA') a <- sapply(df, function(x) sum(is.na(x))) 17 | if(criteria=='Inf') a <- sapply(df, function(x) sum(x==Inf)) 18 | miss <- unlist(a) 19 | missdf <- data.frame(name=names(miss), missing=miss) 20 | missdf$name <- gsub('.TRUE', '', missdf$name) 21 | missdf$name <- gsub('.NA', '', missdf$name) 22 | missdf$nomiss <- unlist(lapply(a, function(x) x['FALSE'])) 23 | 24 | missdf$missing[is.na(missdf$missing)] <- 0 25 | missdf$nomiss[is.na(missdf$nomiss)] <- 0 26 | 27 | missdf$misspct <- missdf$missing/n 28 | missdf <- missdf[order(missdf$misspct, decreasing=T),] 29 | return(missdf) 30 | } 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Rsenal R package 3 | 4 | This is mainly a collection of random, or more accurately, clusters of random R functions I've written for personal and work projects over the years. I don't expect it to be immediately useful to the general population of R users. 
For now it's mostly my cloud storage (with nice documentation) for R functions that I use repeatedly. Some functions are fairly general purpose, while others were developed for specific problems and are not super generalizable in the current state. 5 | 6 | Current clusters of functionality: 7 | 8 | * manipulating igraph objects 9 | * modeling and model validation 10 | * cleaning data 11 | * scaling and transforming data 12 | * manipulating/summarizing outputs from models and association rules 13 | * Shiny apps: relational db schema explorer, association rule explorer, IPIP 14 | 15 | ### How to download 16 | 17 | `install.packages('devtools') # if devtools not already installed` 18 | `library('devtools') ` 19 | `install_github('brooksandrew/Rsenal')` 20 | `library('Rsenal')` 21 | 22 | That's it, you got it. 23 | 24 | List all functions 25 | 26 | `library(help='Rsenal')` 27 | 28 | OR 29 | 30 | `?Rsenal` and then click the `Index` link at the bottom. 31 | -------------------------------------------------------------------------------- /man/makeSQL_createTable.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{makeSQL_createTable} 3 | \alias{makeSQL_createTable} 4 | \title{generate SQL script to create table in SQL database from data.frame} 5 | \usage{ 6 | makeSQL_createTable(df, tablename = deparse(substitute(df)), 7 | saveQuery = NULL, wordy = T) 8 | } 9 | \arguments{ 10 | \item{df}{data.frame we will generate a SQL create table script for} 11 | 12 | \item{tablename}{(optional) name of the table we want to create in a sql database} 13 | 14 | \item{saveQuery}{(optional) name and filepath of the .sql script that will create the table.} 15 | 16 | \item{wordy}{(optional) prints the sql query out to the console.} 17 | } 18 | \value{ 19 | sql query with the create table script 20 | } 21 | \description{ 22 | Creates a sql Query that will create a table in a SQL Database from an R dataframe. 23 | It initializes with variable types (naively), but prevents the user from manually writing the query, 24 | which can be especially cumbersome when there are many columns. 25 | Initialized with basic SQL Server variable types. 26 | } 27 | \examples{ 28 | makeSQL_createTable(mtcars) 29 | \dontrun{ 30 | makeSQL_createTable(mtcars, saveQuery='mySQL_query.sql') 31 | } 32 | } 33 | 34 | -------------------------------------------------------------------------------- /man/addRuleQuality.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{addRuleQuality} 3 | \alias{addRuleQuality} 4 | \title{Add quality measures to association rules} 5 | \usage{ 6 | addRuleQuality(trans, rules, include = NULL, exclude = "improvement") 7 | } 8 | \arguments{ 9 | \item{trans}{transaction set (s4 class from arules package)} 10 | 11 | \item{rules}{set of rules from \code{apriori}} 12 | 13 | \item{include}{character vector specifying which quality measures to include. Default is to include everything.} 14 | 15 | \item{exclude}{character vector specifying which quality measures to exclude. Default is to exclude 'improvement' because it seems to be slow.} 16 | } 17 | \value{ 18 | ruleset with additional quality measures 19 | } 20 | \description{ 21 | Adds measures of rule quality (conviction, hyperConfidence, cosine, chiSquare, coverage, doc, gini, hyperlift) to a set of 22 | association rules mined from \code{apriori}. 
Usually used before converting ruleset to data.frame and exporting to some sort of text file. 23 | } 24 | \examples{ 25 | library('arules') 26 | data("Adult") 27 | ar <- apriori(Adult, parameter = list(supp = 0.5, conf = 0.9, target = "rules")) 28 | ar <- addRuleQuality(trans=Adult, rules=ar) 29 | df <- Rsenal::rules2df(ar) 30 | } 31 | 32 | -------------------------------------------------------------------------------- /man/getVfromE.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{getVfromE} 3 | \alias{getVfromE} 4 | \title{Get vertex names from edge IDs} 5 | \usage{ 6 | getVfromE(g, eid, mode = "to", unique = T) 7 | } 8 | \arguments{ 9 | \item{g}{igraph object to mine vertex names from} 10 | 11 | \item{eid}{edge IDs referenced by a numeric vector or the edge object themselves} 12 | 13 | \item{mode}{"to" or "from". Indicates whether to return vertex names for nodes going to (in) or from (out) from the edge.} 14 | 15 | \item{unique}{\code{TRUE} or \code{FALSE}. TRUE simply removes duplicate vertex names and returns a unique list.} 16 | } 17 | \value{ 18 | vertex names associated with the specified edge IDs (FROM or TO) 19 | } 20 | \description{ 21 | Returns the vertex names associated with the specified edge IDs (FROM or TO) 22 | } 23 | \examples{ 24 | require('igraph') 25 | 26 | ## build sample network 27 | from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O') 28 | to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P') 29 | relations<-cbind(from,to) 30 | g <- graph.data.frame(relations) 31 | tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2) 32 | V(g)$tier <- tiers 33 | 34 | getVfromE(g, E(g)[1:5]) 35 | getVfromE(g, 1:5) 36 | } 37 | 38 | -------------------------------------------------------------------------------- /R/quantile_df.R: -------------------------------------------------------------------------------- 1 | #' @title Pretty print quantiles 2 | #' @description Useful for R Markdown reports 3 | #' @param x numeric vector to calculate quantile with 4 | #' @param probs numeric vector of probabilities to use for quantile 5 | #' @param na.rm logical, pass to quantile function 6 | #' @param names logical, pass to quantile function 7 | #' @param type number, pass to quantile function 8 | #' @param colname character, name of variable column in output 9 | #' @param round integer, number of digits to round quantile value. 
10 | #' passed to \code{digits} argument of \code{round} function 11 | #' @return prettified data.frame of probabilities 12 | #' @export 13 | #' @examples 14 | #' 15 | #' quantile_df(mtcars$mpg, seq(0,1,.1)) 16 | #' quantile_df(mtcars$mpg, seq(0,1,.1), names=T) 17 | #' quantile_df(mtcars$mpg, seq(0,1,.2), colname='mpg') 18 | #' quantile_df(mtcars$mpg, round=1) 19 | 20 | quantile_df <- function(x, probs=seq(0,1,.05), na.rm=F, names=F, type=7, colname=NULL, round=NULL, ...){ 21 | z <- quantile(x, probs, na.rm, names, type) 22 | probsprint <- paste0(round(probs*100,3), '%') 23 | df <- data.frame(quantile=probsprint, values=z) 24 | if(is.null(round)==F) df[,2] <- round(df[,2], round) 25 | if(is.null(colname)==F) names(df)[2] <- colname 26 | return(df) 27 | } 28 | 29 | -------------------------------------------------------------------------------- /man/pruneEdge.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{pruneEdge} 3 | \alias{pruneEdge} 4 | \title{Prune edges of igraph} 5 | \usage{ 6 | pruneEdge(g, root) 7 | } 8 | \arguments{ 9 | \item{g}{igraph object} 10 | 11 | \item{root}{character string: name of root node} 12 | } 13 | \value{ 14 | igraph object, pruned subgraph of \code{g} 15 | } 16 | \description{ 17 | This function takes a very connected network graph and prunes the edges down 18 | so that it focuses on depth from the root node to the end nodes (inter-connective 19 | edges are deleted). \cr \cr 20 | 1. Find all nodes 1 step out from root. These edges must stay. \cr 21 | 2. Find all nodes 2 steps from root that have more than 2 edges in. 22 | Keep only one edge (the one that leads to the shortest path back to the root).\cr 23 | 3. Repeat for the nodes one more degree away from root. 24 | } 25 | \examples{ 26 | require('igraph') 27 | 28 | ## build sample network 29 | from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O') 30 | to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P') 31 | relations<-cbind(from,to) 32 | g <- graph.data.frame(relations) 33 | tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2) 34 | V(g)$tier <- tiers 35 | 36 | pruneEdge(g, 'A') 37 | } 38 | 39 | -------------------------------------------------------------------------------- /man/removePeriods.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{removePeriods} 3 | \alias{removePeriods} 4 | \title{Removes duplicative periods in column names of data.frame} 5 | \usage{ 6 | removePeriods(x) 7 | } 8 | \arguments{ 9 | \item{x}{character vector of names (e.g. column names) to clean} 10 | } 11 | \value{ 12 | vector of strings without duplicate periods 13 | } 14 | \description{ 15 | When reading in a flat file (csv, txt, etc) that was converted from an Excel spreadsheet, 16 | column names often have spaces between words. R reads these spaces as periods. If there are multiple spaces or special characters that 17 | are not valid in column names, R replaces them with a period. When the data is messy, you sometimes get several consecutive periods 18 | or trailing periods at the end of column names. I use this function as a coarse tool to standardize this ugliness. 19 | Periods can then be easily replaced (\code{gsub}'ed) with a single character if periods aren't your thing. 
20 | } 21 | \examples{ 22 | ## making some messed up data to fix 23 | data(mtcars) 24 | names(mtcars)[1] <- paste(names(mtcars)[1], '..', sep='') 25 | names(mtcars)[3] <- paste(names(mtcars)[3], '.', sep='') 26 | names(mtcars)[4] <- paste(names(mtcars)[3], '..also.known..as.horsepower', sep='') 27 | removePeriods(names(mtcars)) 28 | } 29 | 30 | -------------------------------------------------------------------------------- /man/mergeEdgeLists.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{mergeEdgeLists} 3 | \alias{mergeEdgeLists} 4 | \title{Merge edgelists => Master edgelist => igraph} 5 | \usage{ 6 | mergeEdgeLists(edgeLists, from = "from.ID", to = "to.ID", keepDups = F) 7 | } 8 | \arguments{ 9 | \item{edgeLists}{list of edgelists (stored as data.frames)} 10 | 11 | \item{from}{name of the column that specifies the "from" node in each data.frame in \code{edgeLists}} 12 | 13 | \item{to}{name of the column that specifies the "to" node in each data.frame in \code{edgeLists}} 14 | 15 | \item{keepDups}{deletes duplicate from-to relations when set to FALSE} 16 | } 17 | \description{ 18 | This function turns a list of edgelists (data.frames) into a single "master" edgelist. 19 | It maintains the edge attributes of each individual edge list 20 | } 21 | \examples{ 22 | require('igraph') 23 | cars <- data.frame(mtcars, to=sample(rownames(mtcars), replace=T), from=sample(rownames(mtcars), replace=T)) 24 | df1<-cars[sample(1:32,30), c('to', 'from', 'cyl', 'mpg')] 25 | df2<-cars[sample(1:32,15), c('to', 'from', 'cyl', 'qsec')] 26 | df3<-cars[sample(1:32,32), c('to', 'from', 'hp', 'drat')] 27 | 28 | df1$cyl[1:10] <- df1$cyl[1:10]+2 29 | el <- list(df1, df2, df3) 30 | 31 | mel <- mergeEdgeLists(el, from='from', to='to', keepDups=T) 32 | } 33 | 34 | -------------------------------------------------------------------------------- /man/travCount.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{travCount} 3 | \alias{travCount} 4 | \title{Count # of nodes connected to root node} 5 | \usage{ 6 | travCount(g, root, orderN = -1, vmode = "out") 7 | } 8 | \arguments{ 9 | \item{g}{igraph object} 10 | 11 | \item{root}{character string: name of root node} 12 | 13 | \item{orderN}{number, # of degrees away from root node to search. Default is -1 which searches all degrees} 14 | 15 | \item{vmode}{character string: 'out', 'in' or 'all', determines how to subgraph \code{g} from \code{root}. Default is 'out'.} 16 | } 17 | \value{ 18 | number, count of nodes connected to root node. 19 | } 20 | \description{ 21 | Count doesn't include root node. Simpler version of \code{\link{travOut}}. 22 | Might be faster? Developed for a different project. 
23 | } 24 | \examples{ 25 | require('igraph') 26 | 27 | ## build sample network 28 | from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O') 29 | to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P') 30 | relations<-cbind(from,to) 31 | g <- graph.data.frame(relations) 32 | tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2) 33 | V(g)$tier <- tiers 34 | 35 | plot(g) 36 | travCount(g, 'B') 37 | travCount(g, 'L', vmode='out') 38 | } 39 | \seealso{ 40 | \code{\link{travOut}} 41 | } 42 | 43 | -------------------------------------------------------------------------------- /man/balPanel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{balPanel} 3 | \alias{balPanel} 4 | \title{Create balanced panel dataset} 5 | \usage{ 6 | balPanel(df, datev, id, freq = "month") 7 | } 8 | \arguments{ 9 | \item{df}{data.frame of unbalanced panel data} 10 | 11 | \item{datev}{name of the date variable in data.frame \code{df}} 12 | 13 | \item{id}{name of individual/entity variable. Company ID, country or people's names, for example.} 14 | 15 | \item{freq}{desired frequency of the time-series'} 16 | } 17 | \value{ 18 | balanced panel dataset as a data.frame. Values for dates that are missing in the input dataset show up as NAs 19 | } 20 | \description{ 21 | Takes an unbalanced panel dataset and makes it balanced by inserting rows for missing time periods 22 | and initializing with NAs for other columns that are not the time or individual variable. 23 | } 24 | \details{ 25 | Doesn't have any special treatment for duplicate values in the input time panel dataset. 26 | Currently handles time/date objects of type "Date" or "numeric". Can add POSIX and other date/time classes as needed. 27 | } 28 | \examples{ 29 | df <- data.frame(date=sort(sample(seq(Sys.Date()-20, Sys.Date(), by=1), 30, replace=T)), 30 | x1=runif(30), 31 | id=rep(c('A', 'B', 'C'), 10), stringsAsFactors=F) 32 | 33 | balPanel(df, datev='date', id='id', freq='days') 34 | } 35 | 36 | -------------------------------------------------------------------------------- /R/filterRules.R: -------------------------------------------------------------------------------- 1 | #' @title Post process association rules 2 | #' 3 | #' @description This function filters rules that have already been mined. 4 | #' 5 | #' @param rules list of association rules (S4 arules object). Output of \code{apriori} function. 6 | #' @param oneSideOnly list of character vectors. Each character vector of the list contains variables that are likely very similar 7 | #' and will generate uninteresting rules. 
So the filtering algorithm will prune rules where variables within this list appear on 8 | #' both the RHS & LHS 9 | #' @return association rules 10 | #' @import arules 11 | #' @export 12 | #' @examples 13 | #' 14 | #' library('arules') 15 | #' data('Adult') 16 | #' rules <- apriori(Adult, parameter = list(supp=0.01, conf=0.9, target = "rules")) 17 | #' oneSideOnly <- list(c('age', 'workclass', 'education'), c('marital-status', 'occupation', 'race')) 18 | #' f2 <- filterRules(rules, oneSideOnly=oneSideOnly) 19 | #' length(rules) 20 | #' length(f2) 21 | 22 | filterRules <- function(rules, oneSideOnly) { 23 | frules <- rules 24 | for(i in 1:length(oneSideOnly)) { 25 | cond <- paste0(unlist(oneSideOnly[i]), '=') 26 | comb <- expand.grid(cond, cond, stringsAsFactors=F) 27 | for(j in 1:nrow(comb)){ 28 | frules <- subset(frules, subset=(!(lhs %pin% comb[j,1] & (rhs %pin% comb[j,2])))) 29 | } 30 | } 31 | return(frules) 32 | } 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /man/uniglm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{uniglm} 3 | \alias{uniglm} 4 | \title{Univariate glm regression} 5 | \usage{ 6 | uniglm(df, yv, xv, file = NULL, sortby = "aic", xvlab = NULL, test = T) 7 | } 8 | \arguments{ 9 | \item{df}{data.frame with variables for analysis} 10 | 11 | \item{yv}{character string, target variable} 12 | 13 | \item{xv}{character vector, predictor variables to test univariately} 14 | 15 | \item{file}{character string, filepath to write results out to. txt or csv file.} 16 | 17 | \item{sortby}{character string, criteria to sort variables by. Default = 'aic'} 18 | 19 | \item{xvlab}{data.frame, lookup table for variable names. First column is codename, second column is the pretty printed name.} 20 | 21 | \item{test}{\code{TRUE} or \code{FALSE}. Includes Chi square test, or not.} 22 | } 23 | \value{ 24 | data.frame of results 25 | } 26 | \description{ 27 | Runs a univariate logistic regression on each predictor variable of interest. 28 | } 29 | \examples{ 30 | require('Hmisc') 31 | 32 | ##setting up some data 33 | longnames <- data.frame(long = c('Weight', 'Miles Per Gallon', 'Cylinder', 'D.R.A.T', 'Horsepower', 'A.M.'), 34 | short = c('wt', 'mpg', 'cyl', 'drat', 'hp', 'am'), stringsAsFactors=F) 35 | 36 | glm.out <- uniglm(df=mtcars, yv='vs', xv=c('hp','drat','cyl','mpg','wt'), xvlab=longnames) 37 | } 38 | \seealso{ 39 | \code{\link{bestglm}}, \code{\link{leaps}} 40 | } 41 | 42 | -------------------------------------------------------------------------------- /R/removePeriods.R: -------------------------------------------------------------------------------- 1 | #' @title Removes duplicative periods in column names of data.frame 2 | #' 3 | #' @description When reading in a flat file (csv, txt, etc) that was converted from an Excel spreadsheet, 4 | #' column names often have spaces between words. R reads these spaces as periods. If there are multiple spaces or special characters that 5 | #' are not valid in column names, R replaces them with a period. When the data is messy, you sometimes get several consecutive periods 6 | #' or trailing periods at the end of column names. I use this function as a coarse tool to standardize this ugliness. 7 | #' Periods can then be easily replaced (\code{gsub}'ed) with a single character if periods aren't your thing. 
8 | #' 9 | #' @param x character vector of names (e.g. column names) to clean 10 | 11 | #' @return vector of strings without duplicate periods 12 | #' @export 13 | #' @examples 14 | #' ## making some messed up data to fix 15 | #' data(mtcars) 16 | #' names(mtcars)[1] <- paste(names(mtcars)[1], '..', sep='') 17 | #' names(mtcars)[3] <- paste(names(mtcars)[3], '.', sep='') 18 | #' names(mtcars)[4] <- paste(names(mtcars)[3], '..also.known..as.horsepower', sep='') 19 | 20 | #' removePeriods(names(mtcars)) 21 | 22 | removePeriods <- function(x) { 23 | ret <- gsub('\\.+', '\\.', x) 24 | lastE <- sapply(ret, function(z) substr(z,nchar(z),nchar(z))) 25 | ret[which(lastE=='.')] <- sapply(ret[which(lastE=='.')], function(z) substr(z, 0, nchar(z)-1)) 26 | return(unlist(ret)) 27 | } -------------------------------------------------------------------------------- /man/mergeRules2trans.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{mergeRules2trans} 3 | \alias{mergeRules2trans} 4 | \title{Link transactions/observations to association rules} 5 | \usage{ 6 | mergeRules2trans(trans, rhs, lhs) 7 | } 8 | \arguments{ 9 | \item{trans}{list of transactions} 10 | 11 | \item{rhs}{list of RHS of association rules} 12 | 13 | \item{lhs}{list of LHS of association rules} 14 | } 15 | \value{ 16 | matrix tying obs/transactions to association rules 17 | } 18 | \description{ 19 | Creates a matrix of transactions/observations (rows) by association rules (columns). 20 | 1 in the matrix indicates that the observation/transaction is eligible for and follows a given rule (LHS & RHS of rule are in the obs/transaction). 21 | -1 in the matrix indicates that obs/transaction is eligible for, but violates the rule (LHS of rule, but not RHS of rule are in obs/transaction). 22 | 0 in the matrix indicates that the rule does not apply (LHS of rule not in obs/transaction). 
23 | } 24 | \examples{ 25 | library('arules') 26 | library('BBmisc') 27 | data(Adult) 28 | rules <- apriori(Adult, parameter=list(support=0.5, confidence=0.95)) 29 | rdf <- rules2df(rules, list=T) 30 | trdf <- as(Adult, 'data.frame') 31 | trdf$items <- gsub('\\\\{', '', trdf$items) 32 | trdf$items <- gsub('\\\\}', '', trdf$items) 33 | trdf$items2 <- strsplit(as.character(trdf$items), split=',') 34 | M <- mergeRules2trans(trans=trdf$items2[1:1000], rhs=rdf$rhs, lhs=rdf$lhs) 35 | } 36 | 37 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2 (4.0.2): do not edit by hand 2 | 3 | export(addRuleQuality) 4 | export(arulesApp) 5 | export(balPanel) 6 | export(binCat) 7 | export(depthbin) 8 | export(dir2dfList) 9 | export(editForm) 10 | export(filterRules) 11 | export(getEfromVlist) 12 | export(getV1fromVlist) 13 | export(getVfromE) 14 | export(ipipApp) 15 | export(isKey) 16 | export(leafletMapDF) 17 | export(logit2tab) 18 | export(makeForm) 19 | export(makeSQL_createTable) 20 | export(makeVertexAtt) 21 | export(mergeEdgeLists) 22 | export(mergeRules2trans) 23 | export(missdf) 24 | export(optimizeModelWeight) 25 | export(overlap) 26 | export(partialDep) 27 | export(partialDepAll) 28 | export(plotAvgBy) 29 | export(plotBestglm) 30 | export(plotdf) 31 | export(predQuantile) 32 | export(predSortPlot) 33 | export(pruneEdge) 34 | export(quantileFeatureScore) 35 | export(quantileScore) 36 | export(quantile_df) 37 | export(removePeriods) 38 | export(rfImp) 39 | export(roundCut) 40 | export(rules2df) 41 | export(shinyTable) 42 | export(shuffleAnova) 43 | export(smartAgg) 44 | export(tableNet) 45 | export(travCount) 46 | export(travOut) 47 | export(uniglm) 48 | import(BBmisc) 49 | import(Hmisc) 50 | import(MASS) 51 | import(RColorBrewer) 52 | import(arules) 53 | import(arulesViz) 54 | import(bestglm) 55 | import(data.table) 56 | import(fmsb) 57 | import(ggplot2) 58 | import(gplots) 59 | import(htmlwidgets) 60 | import(igraph) 61 | import(leaflet) 62 | import(memoise) 63 | import(randomForest) 64 | import(shiny) 65 | import(vcd) 66 | -------------------------------------------------------------------------------- /man/optimizeModelWeight.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{optimizeModelWeight} 3 | \alias{optimizeModelWeight} 4 | \title{Optimize weight on ensemble of 2 supervised models} 5 | \usage{ 6 | optimizeModelWeight(pred1, pred2, actual, steps = 50, cutoff = 0.5) 7 | } 8 | \arguments{ 9 | \item{pred1}{numeric vector of probabilities, prediction from model 1} 10 | 11 | \item{pred2}{numeric vector of probabilities, prediction from model 2} 12 | 13 | \item{actual}{vector of 1s and 0s. The target variable test data} 14 | 15 | \item{steps}{number, high numbers compute a more exhaustive combination of model weights} 16 | 17 | \item{cutoff}{Cutoff used to demarcate predictions into positive or negative class.} 18 | } 19 | \value{ 20 | data.frame of results 21 | } 22 | \description{ 23 | This function creates a weighted average of predictions from two models 24 | and evaluates F1, precision, recall, auc or c for each combination of the 25 | models to determine the best weights for each. 26 | } 27 | \examples{ 28 | require('Hmisc') 29 | fit_glm1 <- glm(am~cyl, data=mtcars, family=binomial(logit)) 30 | fit_glm2 <- glm(am~disp, data=mtcars, family=binomial(logit)) 31 | ow <- optimizeModelWeight(fit_glm1$fitted.values, fit_glm2$fitted.values, actual=fit_glm1$model$am) 32 | plot(ow$weights, ow$precision, type='l', xlab='weight on model 1') 33 | } 34 | 35 | -------------------------------------------------------------------------------- /man/dir2dfList.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{dir2dfList} 3 | \alias{dir2dfList} 4 | \title{Turn a directory of flat files into a list of data.frames} 5 | \usage{ 6 | dir2dfList(dfdir, ext = ".txt", exclude = NULL, ...) 7 | } 8 | \arguments{ 9 | \item{dfdir}{character string of the directory where you want to load flat files} 10 | 11 | \item{ext}{file extension on the type of files to load. Usually \code{.csv} or \code{.txt}} 12 | 13 | \item{exclude}{character string of table names to be excluded from app. Needs to be specified as \code{NULL} or a character 14 | vector, or else \code{...} arguments will not be handled properly.} 15 | 16 | \item{...}{parameters to pass to \code{\link{read.delim}}. Commonly \code{nrow}, \code{sep}.} 17 | } 18 | \value{ 19 | list of data.frames 20 | } 21 | \description{ 22 | Useful to prepare data for \code{\link{tableNet}} 23 | } 24 | \examples{ 25 | \dontrun{ 26 | ## download some baseball data. NOTE This will download 30MB of data (25 csv files) into a temporary directory 27 | temp <- tempfile() 28 | localDataDir <- paste0(tempdir(), '\\\\lahman2012-csv-onYourComp.zip') 29 | download.file('http://seanlahman.com/files/database/lahman2012-csv.zip', localDataDir) 30 | unzip(localDataDir, exdir=paste0(tempdir(), '\\\\lahman2012-csv-onYourComp')) ## may not be necessary 31 | 32 | ## create a list of data.frames from .CSVs 33 | dfL <- dir2dfList(paste0(tempdir(), '\\\\lahman2012-csv-onYourComp'), ext='.csv', exclude=NULL, sep=',', stringsAsFactors=F) 34 | } 35 | } 36 | \seealso{ 37 | \code{\link{tableNet}} \code{\link{isKey}} 38 | } 39 | 40 | -------------------------------------------------------------------------------- /R/quantileScore.R: -------------------------------------------------------------------------------- 1 | 2 | #' @title Quantile scoring function for continuous or integer valued data 3 | #' 4 | #' @description This function sorts a series from beginning to end. It uses each observation's place in the quantile to assign it a score. 5 | #' useful for transforming variables into the same units (0-1). 
/R/quantileScore.R: -------------------------------------------------------------------------------- 1 | 2 | #' @title Quantile scoring function for continuous or integer valued data 3 | #' 4 | #' @description This function sorts a series from beginning to end. It uses each observation's place in the quantile to assign it a score. 5 | #' Useful for transforming variables into the same units (0-1). Handles NAs. 6 | #' 7 | #' @param x numeric vector to scale 8 | #' @return numeric vector of quantile scores (0 to 1) that correspond in order with the elements of \code{x} 9 | #' @export 10 | #' @examples 11 | #' 12 | #' cbind(quantileScore(mtcars$mpg), mtcars$mpg) 13 | #' cbind(quantileScore(mtcars$cyl), mtcars$cyl) 14 | 15 | quantileScore <- function(x) { 16 | xo <- x 17 | x <- x[is.na(x)==F] 18 | xs <- data.frame(x=sort(x), id=1:length(x), stringsAsFactors=F) 19 | xsd <- xs[duplicated(xs$x)==F,] 20 | xsd$q <- (xsd$id)/length(x) 21 | ret <- xsd$q[match(xo, xsd[,1])] 22 | return(ret) 23 | } 24 | 25 | 26 | #' @title Quantile scoring function for continuous or integer valued data 27 | #' 28 | #' @description This function combines the quantile score with basic min-max feature scaling, 29 | #' useful for getting a stable normalization 30 | #' 31 | #' @param x numeric vector to scale 32 | #' @param wq numeric, weight on the quantile score component 33 | #' @param wf numeric, weight on the min-max scaled component 34 | #' @return numeric vector of blended scores (0 to 1) that correspond in order with the elements of \code{x} 35 | #' @export 36 | #' @examples 37 | #' 38 | #' cbind(quantileFeatureScore(sort(mtcars$mpg)), sort(mtcars$mpg)) 39 | #' cbind(quantileFeatureScore(mtcars$cyl), mtcars$cyl) 40 | 41 | quantileFeatureScore <- function(x, wq=0.5, wf=0.5){ 42 | ret <- wq*quantileScore(x) + wf*(x-min(x, na.rm=T))/(max(x, na.rm=T)-min(x, na.rm=T)) 43 | return(ret) 44 | } 45 | -------------------------------------------------------------------------------- /man/leafletMapDF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{leafletMapDF} 3 | \alias{leafletMapDF} 4 | \title{Leaflet interactive map widget} 5 | \usage{ 6 | leafletMapDF(data, vars = c(lon = "longitude", lat = "latitude")) 7 | } 8 | \arguments{ 9 | \item{data}{data.frame or data.table of data to visualize on map. It must include columns for longitude and latitude coordinates. 10 | Four additional columns are allowed for filtering and coloring the data. Note, the filters are currently categorical only; numeric 11 | ranges are not yet supported. Recommend only passing columns needed for filtering. Default is to pick the first 4} 12 | 13 | \item{vars}{named character vector. The names of the vector should always be 'lon' and 'lat'. The corresponding values 14 | of this vector are the column names of the columns in \code{data} with the lat and lon data.} 15 | } 16 | \value{ 17 | Shiny App 18 | } 19 | \description{ 20 | Launches a Shiny App with an interactive Leaflet map which can be used to visualize coordinate data.
Intended 21 | for quick exploratory geographic analyses 22 | } 23 | \examples{ 24 | \dontrun{ 25 | n <- 5000 26 | df <- data.frame(latitude=runif(n, 35, 40), 27 | longitude=runif(n, -100, -85), 28 | animals=sample(c('dogs', 'cats', 'turtles'), n, replace=T) 29 | ) 30 | df$westness <- cut(df$longitude, breaks=seq(min(df$longitude), max(df$longitude), length.out=10)) 31 | df$northness <- cut(df$latitude, breaks=seq(min(df$latitude), max(df$latitude), length.out=10)) 32 | leafletMapDF(df) 33 | } 34 | } 35 | \seealso{ 36 | \code{leaflet}, \code{shiny} 37 | } 38 | 39 | -------------------------------------------------------------------------------- /man/makeVertexAtt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{makeVertexAtt} 3 | \alias{makeVertexAtt} 4 | \title{merge data.frame attributes to an igraph (network) object} 5 | \usage{ 6 | makeVertexAtt(g, df, vname, by.df, by.g = "name") 7 | } 8 | \arguments{ 9 | \item{g}{igraph object to which you want to add a node/vertex attribute} 10 | 11 | \item{df}{data.frame containing the attribute you want to add to igraph} 12 | 13 | \item{vname}{name of column in data.frame \code{df} that you will merge into igraph} 14 | 15 | \item{by.df}{unique key in data.frame that you will use to merge attribute into igraph} 16 | 17 | \item{by.g}{unique key in igraph that you will use to merge attribute from data.frame} 18 | } 19 | \value{ 20 | a vector of attributes that correspond in order with the nodes in your igraph 21 | } 22 | \description{ 23 | Enriches an igraph object with node/vertex attributes from a data.frame 24 | } 25 | \examples{ 26 | require('igraph') 27 | actors <- data.frame(names=c("Alice", "Bob", "Cecil", "David","Esmeralda"), 28 | age=c(48,33,45,34,21), 29 | gender=c("F","M","F","M","F")) 30 | relations <- data.frame(from=c("Bob", "Cecil", "Cecil", "David", 31 | "David", "Esmeralda"), 32 | to=c("Alice", "Bob", "Alice", "Alice", "Bob", "Alice"), 33 | same.dept=c(FALSE,FALSE,TRUE,FALSE,FALSE,TRUE), 34 | friendship=c(4,5,5,2,1,1), advice=c(4,5,5,4,2,3)) 35 | g <- igraph::graph.edgelist(as.matrix(relations[,c('from', 'to')]), directed=T) 36 | 37 | V(g)$age <- makeVertexAtt(g, df=actors, vname='age', by.df='names', by.g='name') 38 | } 39 | 40 | -------------------------------------------------------------------------------- /R/addRuleQuality.R: -------------------------------------------------------------------------------- 1 | #' @title Add quality measures to association rules 2 | #' @description Adds measures of rule quality (conviction, hyperConfidence, cosine, chiSquare, coverage, doc, gini, hyperlift) to a set of 3 | #' association rules mined from \code{apriori}. Usually used before converting ruleset to data.frame and exporting to some sort of text file. 4 | #' @param trans transaction set (s4 class from arules package) 5 | #' @param rules set of rules from \code{apriori} 6 | #' @param include character vector specifying which quality measures to include. Default is to include everything. 7 | #' @param exclude character vector specifying which quality measures to exclude. Default is to exclude 'improvement' because it seems to be slow. 
8 | #' @return ruleset with additional quality measures 9 | #' @import arules 10 | #' @export 11 | #' @examples 12 | #' library('arules') 13 | #' data("Adult") 14 | #' ar <- apriori(Adult, parameter = list(supp = 0.5, conf = 0.9, target = "rules")) 15 | #' ar <- addRuleQuality(trans=Adult, rules=ar) 16 | #' df <- Rsenal::rules2df(ar) 17 | 18 | addRuleQuality <- function(trans, rules, include=NULL, exclude='improvement') { 19 | allMeasures <- c("support", "confidence", "lift", "conviction", "hyperConfidence", "cosine", "chiSquare", "coverage", "doc", 20 | "gini", "hyperLift", "fishersExactTest", "improvement", "leverage", "oddsRatio", "phi", "RLD") 21 | if(is.null(include)==F) allMeasures <- include 22 | if(is.null(exclude)==F) allMeasures <- setdiff(allMeasures, exclude) 23 | for(i in allMeasures) quality(rules)[i] <- interestMeasure(rules, method=i, transactions=trans) 24 | return(rules) 25 | } 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /R/mergeRules2trans.R: -------------------------------------------------------------------------------- 1 | #' @title Link transactions/observations to association rules 2 | #' 3 | #' @description Creates a matrix of transactions/observations (rows) by association rules (columns). 4 | #' 1 in the matrix indicates that the observation/transaction is eligible for and follows a given rule (LHS & RHS of rule are in the obs/transaction). 5 | #' -1 in the matrix indicates that the obs/transaction is eligible for, but violates, the rule (LHS of rule, but not RHS of rule, is in the obs/transaction). 6 | #' 0 in the matrix indicates that the rule does not apply (LHS of rule not in obs/transaction). 7 | #' 8 | #' @param trans list of transactions 9 | #' @param rhs list of RHS of association rules 10 | #' @param lhs list of LHS of association rules 11 | #' @return matrix tying obs/transactions to association rules 12 | #' @import BBmisc 13 | #' @export 14 | #' @examples 15 | #' library('arules') 16 | #' library('BBmisc') 17 | #' data(Adult) 18 | #' rules <- apriori(Adult, parameter=list(support=0.5, confidence=0.95)) 19 | #' rdf <- rules2df(rules, list=T) 20 | 21 | #' trdf <- as(Adult, 'data.frame') 22 | #' trdf$items <- gsub('\\{', '', trdf$items) 23 | #' trdf$items <- gsub('\\}', '', trdf$items) 24 | #' trdf$items2 <- strsplit(as.character(trdf$items), split=',') 25 | 26 | #' M <- mergeRules2trans(trans=trdf$items2[1:1000], rhs=rdf$rhs, lhs=rdf$lhs) 27 | 28 | mergeRules2trans <- function(trans, rhs, lhs){ 29 | M <- data.frame(matrix(0, nrow=length(trans), ncol=length(lhs))) 30 | for(i in 1:ncol(M)) M[sapply(trans, function(x) isSubset(c(lhs[[i]], rhs[[i]]), x)), i] <- 1 31 | for(i in 1:ncol(M)) M[sapply(trans, function(x) isSubset(lhs[[i]], x) & isSubset(rhs[[i]], x)==F), i] <- -1 32 | return(M) 33 | } 34 | 35 | --------------------------------------------------------------------------------
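Reading the output of the example above (a small sketch; M holds one column per rule):

colSums(M == 1)   # transactions that satisfy each rule (LHS and RHS both present)
colSums(M == -1)  # transactions that violate each rule (LHS present, RHS absent)
rowSums(M != 0)   # how many rules apply to each transaction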
/R/rules2df.R: -------------------------------------------------------------------------------- 1 | #' @title Transform association rule results into data.frame 2 | #' 3 | #' @description Note this function currently only works when the itemsets are of size 1 on the LHS and RHS 4 | #' 5 | #' @param rules list of association rules (S4 arules object). Output of \code{apriori} function. 6 | #' @param list logical \code{TRUE} or \code{FALSE}. Sets LHS items in a list, rather than one character string 7 | #' @return association rules in a data.frame 8 | #' @export 9 | #' @examples 10 | #' library('arules') 11 | #' data("Adult") 12 | #' ar <- apriori(Adult, parameter = list(supp = 0.5, conf = 0.6, target = "rules", minlen=2)) 13 | #' df <- rules2df(ar, list=T) 14 | 15 | rules2df <- function(rules, list=F){ 16 | df <- as(rules, 'data.frame') 17 | df[,1] <- as.character(df[,1]) 18 | df$lhs <- sapply(df[,1], function(x) strsplit(x, split=' => ')[[1]][1]) 19 | df$rhs <- sapply(df[,1], function(x) strsplit(x, split=' => ')[[1]][2]) 20 | df$lhs <- gsub(pattern='\\{', replacement='', x=df$lhs) 21 | df$lhs <- gsub(pattern='}', replacement='', x=df$lhs) 22 | df$rhs <- gsub(pattern='\\{', replacement='', x=df$rhs) 23 | df$rhs <- gsub(pattern='}', replacement='', x=df$rhs) 24 | 25 | if(list==T){ ## recover per-rule item labels by walking the sparse itemMatrix slots (p = column pointers, i = row indices) 26 | p <- rules@lhs@data@p 27 | i <- rules@lhs@data@i+1 28 | lhsItems <- unlist(rules@lhs@itemInfo@.Data) 29 | lhsL <- list() 30 | for(j in 2:length(p)) lhsL[[j-1]] <- lhsItems[i[(p[j-1]+1):(p[j])]] 31 | df$lhs <- lhsL 32 | 33 | p <- rules@rhs@data@p 34 | i <- rules@rhs@data@i+1 35 | rhsItems <- unlist(rules@rhs@itemInfo@.Data) 36 | rhsL <- list() 37 | for(j in 2:length(p)) rhsL[[j-1]] <- rhsItems[i[(p[j-1]+1):(p[j])]] 38 | df$rhs <- rhsL 39 | } 40 | return(df) 41 | } 42 | --------------------------------------------------------------------------------
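Despite the size-1 note above, the list=TRUE path walks the sparse item slots and keeps multi-item sides intact; a short sketch:

library('arules')
data("Adult")
ar <- apriori(Adult, parameter = list(supp = 0.5, conf = 0.6, target = "rules", minlen=2))
rdf <- rules2df(ar, list=TRUE)
rdf$lhs[[1]]  # a character vector with one element per LHS item of rule 1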
/man/isKey.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{isKey} 3 | \alias{isKey} 4 | \title{Determine strength of linking variables} 5 | \usage{ 6 | isKey(dfL, xvar) 7 | } 8 | \arguments{ 9 | \item{dfL}{list of data.frames. easily generated from \code{\link{dir2dfList}}} 10 | 11 | \item{xvar}{character string, name of the variable to calculate strength for across all tables in \code{dfL}} 12 | } 13 | \value{ 14 | list of data.frames 15 | } 16 | \description{ 17 | This function computes the percentage of unique values of a column \code{x} from \code{table1} that appear 18 | in \code{table2}. It is called and computed on the fly in \code{\link{tableNet}}. However, these computations can be 19 | slow on large datasets, so it is provided as a standalone function that can be run once to store the output, which can then be fed into the 20 | \code{\link{tableNet}} app to prevent repetitive slow computations on the fly. 21 | } 22 | \examples{ 23 | \dontrun{ 24 | ## download some baseball data. NOTE This will download 30MB of data (25 csv files) into a temporary directory 25 | temp <- tempfile() 26 | localDataDir <- paste0(tempdir(), '\\\\lahman2012-csv-onYourComp.zip') 27 | download.file('http://seanlahman.com/files/database/lahman2012-csv.zip', localDataDir) 28 | unzip(localDataDir, exdir=paste0(tempdir(), '\\\\lahman2012-csv-onYourComp')) ## may not be necessary 29 | 30 | ## create a list of data.frames from .CSVs 31 | dfL <- dir2dfList(paste0(tempdir(), '\\\\lahman2012-csv-onYourComp'), ext='.csv', exclude=NULL, sep=',', stringsAsFactors=F) 32 | isKey(dfL, 'playerID') 33 | } 34 | } 35 | \seealso{ 36 | \code{\link{tableNet}} \code{\link{dir2dfList}} 37 | } 38 | 39 | -------------------------------------------------------------------------------- /R/shiny_table.R: -------------------------------------------------------------------------------- 1 | #' @title Shiny app to visualize dataframe as simple interactive datatable 2 | #' @description Launches a basic Shiny App that renders the given dataframe into an interactive datatable using \code{renderDataTable} 3 | #' @param df dataframe to be visualized 4 | #' @return Shiny App 5 | #' @import shiny 6 | #' @export 7 | #' @examples 8 | #' \dontrun{ 9 | #' shinyTable(mtcars) 10 | #' } 11 | 12 | 13 | shinyTable <- function(df) { 14 | shinyApp( 15 | shinyUI(fluidPage( 16 | h2(paste0(deparse(substitute(df)))), 17 | sidebarLayout(sidebarPanel( 18 | textInput('filter', 'Filter rows using R logical operators'), 19 | uiOutput('querystatus'), 20 | br(), 21 | br(), 22 | checkboxGroupInput('show_vars', 'Columns to show', names(df), selected=names(df)) 23 | ), 24 | mainPanel( 25 | dataTableOutput('mytable') 26 | ) 27 | ) 28 | )), 29 | server = function(input, output) { 30 | output$mytable = renderDataTable({ 31 | df1 <- df[,input$show_vars, drop=F] 32 | df2 <- try(eval(parse(text=paste0('subset(df1,', input$filter, ')')))) 33 | if(class(df2)!='try-error') {dfp <- df2 34 | output$querystatus <- renderText({'query successful :)'}) 35 | } else {dfp <- df1 36 | output$querystatus <- renderText({'query unsuccessful :( (showing full dataset)'})
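          # the filter box is evaluated as subset(df1, <input>), so expressions like mpg > 20 & cyl == 4 work;
          # when the expression fails to parse or evaluate, the try() above errors out and the app falls back to the unfiltered data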
37 | } 38 | dfp}, 39 | options=list(lengthMenu=list(c(25,50,100,500,1000,5000,-1), c('25', '50', '100', '500', '1000', '5000', 'All')), 40 | pageLength=500) 41 | ) 42 | } 43 | ) 44 | } 45 | -------------------------------------------------------------------------------- /R/makeSQL_createTable.R: -------------------------------------------------------------------------------- 1 | #' generate SQL script to create table in SQL database from data.frame 2 | #' 3 | #' Creates a SQL query that will create a table in a SQL database from an R dataframe. 4 | #' It initializes with variable types (naively), but prevents the user from manually writing the query, 5 | #' which can be especially cumbersome when there are many columns. 6 | #' Initialized with basic SQL Server variable types. 7 | #' 8 | #' @param df data.frame we will generate a SQL create table script for 9 | #' @param tablename (optional) name of the table we want to create in a sql database 10 | #' @param saveQuery (optional) name and filepath of the .sql script that will create the table. 11 | #' @param wordy (optional) prints the sql query out to the console. 12 | #' @return sql query which is the create table script 13 | #' @export 14 | #' @examples 15 | #' makeSQL_createTable(mtcars) 16 | #' \dontrun{ 17 | #' makeSQL_createTable(mtcars, saveQuery='mySQL_query.sql') 18 | #' } 19 | 20 | 21 | makeSQL_createTable <- function(df, tablename=deparse(substitute(df)), saveQuery=NULL, wordy=T) { 22 | 23 | types <- list( 24 | 'numeric'='decimal', 25 | 'integer'='int', 26 | 'character'='varchar(40)', 27 | 'logical'='varchar(40)', 28 | 'factor'='varchar(40)', 29 | 'POSIXct'='datetime' 30 | ) 31 | 32 | sql <- paste('create table ', tablename, '\n', '(', sep='') 33 | for(i in 1:ncol(df)) { 34 | k <- names(df)[i] 35 | vclass <- ifelse(class(df[,k]) %in% names(types), types[[class(df[,k])]], 'varchar(40)') 36 | if(i < ncol(df)) sql <- paste(sql, '\n', k, ' ', vclass, ',', sep='') else sql <- paste(sql, '\n', k, ' ', vclass, '\n)', sep='') 37 | } 38 | if(wordy==T) cat(sql) 39 | if(is.null(saveQuery)==F) writeLines(sql, saveQuery) 40 | return(sql) 41 | } -------------------------------------------------------------------------------- /man/binCat.Rd: -------------------------------------------------------------------------------- 39 | d[d>26] <- sample(1:26, length(d[d>26]), replace=T) 40 | dl <- letters[d] 41 | barplot(table(dl)) 42 | table(binCat(dl, results=F, ncat=5)) 43 | table(binCat(dl, results=F, maxp=0.5)) 44 | table(binCat(dl, results=F, maxp=0.9)) 45 | 46 | ## With missings 47 | ff <- sample(letters[1:15], 100, replace=T) 48 | ff[sample(100, 10)] <- NA 49 | binCat(ff, ncat=7, setNA='missing') 50 | } 51 | 52 | --------------------------------------------------------------------------------
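Only the tail of binCat.Rd survives above (its preamble was lost in extraction). A hedged reading of the surviving calls, assuming binCat pools a categorical vector's infrequent levels, with ncat capping the number of categories kept, maxp a cumulative-proportion cutoff, and setNA a label for missing values:

ff <- sample(letters[1:15], 100, replace=TRUE)
ff[sample(100, 10)] <- NA
table(binCat(ff, results=FALSE, ncat=7, setNA='missing'))  # presumably 7 levels plus a 'missing' bucket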
/R/mergeEdgeLists.R: -------------------------------------------------------------------------------- 1 | #' @title Merge edgelists => Master edgelist => igraph 2 | #' 3 | #' @description This function turns a list of edgelists (data.frames) into a single "master" edgelist. 4 | #' It maintains the edge attributes of each individual edge list. 5 | #' 6 | #' @param edgeLists list of edgelists (stored as data.frames) 7 | #' @param from name of the column that specifies the "from" node in each data.frame in \code{edgeLists} 8 | #' @param to name of the column that specifies the "to" node in each data.frame in \code{edgeLists} 9 | #' @param keepDups when set to \code{FALSE}, drops the per-edgelist duplicates of a shared attribute column after they are merged into one column 10 | #' @export 11 | #' @examples 12 | #' require('igraph') 13 | #' cars <- data.frame(mtcars, to=sample(rownames(mtcars), replace=T), from=sample(rownames(mtcars), replace=T)) 14 | #' df1<-cars[sample(1:32,30), c('to', 'from', 'cyl', 'mpg')] 15 | #' df2<-cars[sample(1:32,15), c('to', 'from', 'cyl', 'qsec')] 16 | #' df3<-cars[sample(1:32,32), c('to', 'from', 'hp', 'drat')] 17 | #' 18 | #' df1$cyl[1:10] <- df1$cyl[1:10]+2 19 | #' el <- list(df1, df2, df3) 20 | #' 21 | #' mel <- mergeEdgeLists(el, from='from', to='to', keepDups=T) 22 | 23 | mergeEdgeLists <- function(edgeLists, from='from.ID', to='to.ID', keepDups=F){ 24 | n <- length(edgeLists) 25 | 26 | mel <- edgeLists[[1]] 27 | for(i in 2:n){ 28 | mel <- merge(mel, edgeLists[[i]], by=c(to, from), all=T, suffixes=c(paste('.',i-1, sep=''), paste('.',i, sep=''))) 29 | dupVars <- intersect(setdiff(names(edgeLists[[i]]), c(from, to)), setdiff(names(edgeLists[[i-1]]), c(from, to))) 30 | 31 | ## if we have to merge columns with repeat names 32 | for(d in dupVars){ 33 | print(paste('merging columns with same name in multiple edgelists: ',d)) 34 | dupVarName1 <- paste(d, '.',i-1, sep='') 35 | dupVarName2 <- paste(d, '.',i, sep='') 36 | mel[, d] <- mel[,dupVarName1] 37 | mel[is.na(mel[,d]), d] <- mel[is.na(mel[,d]), dupVarName2] 38 | nonmissingRows <- is.na(mel[,dupVarName1])==F & is.na(mel[,dupVarName2])==F
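        ## nonmissingRows flags edges where both edgelists supply the shared attribute;
        ## if those supplied values disagree, the mismatch warning below fires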
39 | if(sum(mel[nonmissingRows, dupVarName1]!=mel[nonmissingRows, dupVarName2])>0) print(paste('Warning!!! Some values of ', dupVarName1, ' and ', dupVarName2, ' do not match.', sep='')) 40 | if(keepDups==F) mel <- mel[,setdiff(names(mel), c(dupVarName1, dupVarName2))] 41 | } 42 | } 43 | return(mel) 44 | } 45 | -------------------------------------------------------------------------------- /R/makeVertexAtt.R: -------------------------------------------------------------------------------- 1 | #' @title merge data.frame attributes to an igraph (network) object 2 | #' 3 | #' @description Enriches an igraph object with node/vertex attributes from a data.frame 4 | #' 5 | #' @param g igraph object to which you want to add a node/vertex attribute 6 | #' @param df data.frame containing the attribute you want to add to igraph 7 | #' @param vname name of column in data.frame \code{df} that you will merge into igraph 8 | #' @param by.df unique key in data.frame that you will use to merge attribute into igraph 9 | #' @param by.g unique key in igraph that you will use to merge attribute from data.frame 10 | #' @return a vector of attributes that correspond in order with the nodes in your igraph 11 | #' @import igraph 12 | #' @export 13 | #' @examples 14 | #' require('igraph') 15 | #' actors <- data.frame(names=c("Alice", "Bob", "Cecil", "David","Esmeralda"), 16 | #' age=c(48,33,45,34,21), 17 | #' gender=c("F","M","F","M","F")) 18 | #' relations <- data.frame(from=c("Bob", "Cecil", "Cecil", "David", 19 | #' "David", "Esmeralda"), 20 | #' to=c("Alice", "Bob", "Alice", "Alice", "Bob", "Alice"), 21 | #' same.dept=c(FALSE,FALSE,TRUE,FALSE,FALSE,TRUE), 22 | #' friendship=c(4,5,5,2,1,1), advice=c(4,5,5,4,2,3)) 23 | #' g <- igraph::graph.edgelist(as.matrix(relations[,c('from', 'to')]), directed=T) 24 | #' 25 | #' V(g)$age <- makeVertexAtt(g, df=actors, vname='age', by.df='names', by.g='name') 26 | 27 | 28 | makeVertexAtt <- function(g, df, vname, by.df, by.g='name') { 29 | gdf <- data.frame(id=igraph::get.vertex.attribute(g, by.g), stringsAsFactors=F) 30 | mdf <- merge(gdf, df[,c(by.df, vname)], by.x='id', by.y=by.df, all.x=T, all.y=F) 31 | check <- igraph::get.vertex.attribute(g, by.g) 32 | mdf <- mdf[match(check, mdf$id),] 33 | 34 | ## checking that order of assets is preserved 35 | mdf$check01 <- check == mdf$id 36 | if(sum(check!=mdf$id)>0) cat(paste('Warning: Order of Network Nodes might be misaligned!!! \n', 37 | sum(mdf$check01==F), ' out of ', nrow(mdf), ' nodes are misaligned', sep='')) 38 | if(sum(is.na(mdf[,vname]))>0) print(paste('Warning: ', sum(is.na(mdf[,vname])), ' NAs out of ', nrow(mdf), ' node attributes in graph.', sep='')) 39 | 40 | return(mdf[,vname]) 41 | } 42 | 43 | 44 | --------------------------------------------------------------------------------
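Nodes absent from the data.frame simply come back NA (with a printed warning); a sketch continuing the actors example above:

V(g)$age2 <- makeVertexAtt(g, df=actors[1:3,], vname='age', by.df='names', by.g='name')
sum(is.na(V(g)$age2))  # 2: the actors dropped from df get NA attributes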
/man/partialDep.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{partialDep} 3 | \alias{partialDep} 4 | \title{Partial Dependence (single variable)} 5 | \usage{ 6 | partialDep(model, df, xvar, n = 10, target.class = "1", ci = c(0.9, 0.5, 7 | 0.3)) 8 | } 9 | \arguments{ 10 | \item{model}{model object used to generate predictions. Currently only built and tested for random forest.} 11 | 12 | \item{df}{data.frame or data.table used to generate predictions with \code{model}} 13 | 14 | \item{xvar}{character of length one; the x variable in \code{df} to assess for partial dependence with the response variable from \code{model}.} 15 | 16 | \item{n}{numeric of length one; number of values between the min and max of \code{xvar} to score the model. 17 | Note: this number is also how many replicates of \code{df} must be created and stored in memory. Default is 10.} 18 | 19 | \item{target.class}{character: Which category (class) of the target variable to use for predictions} 20 | 21 | \item{ci}{numeric: specify any confidence intervals around the median response.} 22 | } 23 | \value{ 24 | data.table of output. \code{cnt} refers to how many obs from \code{df} are within the fixed-width bin specified by \code{xvar}. 25 | } 26 | \description{ 27 | Calculate the partial dependence of a predictor variable on the response variable from a random forest classification model. 28 | Rather than sequence through values of the predictor variable of interest and keep the other predictors at their median, this partial dependence 29 | technique creates replicates of the entire dataset for each level of the x variable of interest from its min to max. This gives a more 30 | realistic idea of the magnitude and direction of the x variable on the response. 31 | } 32 | \examples{ 33 | library('randomForest') 34 | library('data.table') 35 | DF <- mtcars 36 | DF$vs <- factor(DF$vs) 37 | rf <- randomForest(vs~mpg+cyl+drat+qsec+disp+gear+carb+hp, DF, ntrees=100) 38 | pd <- partialDep(model=rf, df=DF, xvar='mpg') 39 | pd[ci==0.5,] # median of response when sequenced through 'mpg' 40 | 41 | ## Plotting 42 | plot(pd[cilev==0, xvar], pd[cilev==0, pred], type='l', ylim=c(0,1)) 43 | lines(pd[ci==.95, xvar], pd[ci==.95, pred], type='l', col='red') 44 | lines(pd[ci==.05, xvar], pd[ci==.05, pred], type='l', col='green') 45 | } 46 | \seealso{ 47 | \code{\link{partialDepAll}} 48 | } 49 | 50 | -------------------------------------------------------------------------------- /man/partialDepAll.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{partialDepAll} 3 | \alias{partialDepAll} 4 | \title{Partial Dependence Spark Lines (multiple variables)} 5 | \usage{ 6 | partialDepAll(model, df, n = 10, xvars = NULL, target.class = "1", 7 | ci = c(0.75), plot = T, data = T, plot.yaxis.fixed = T, wordy = T) 8 | } 9 | \arguments{ 10 | \item{model}{model object used to generate predictions. Currently only built and tested for random forest.} 11 | 12 | \item{df}{data.frame or data.table used to generate predictions with \code{model}} 13 | 14 | \item{n}{numeric of length one; number of values between the min and max of \code{xvar} to score the model. 15 | Note: this number is also how many replicates of \code{df} must be created and stored in memory. Default is 10.} 16 | 17 | \item{xvars}{character vector; the x variables in \code{df} to assess for partial dependence with the response variable from \code{model}. Defaults to choosing all variables from model.} 18 | 19 | \item{target.class}{character: Which category (class) of the target variable to use for predictions} 20 | 21 | \item{ci}{numeric: specify any confidence intervals around the median response.} 22 | 23 | \item{plot}{logical: plot sparklines (\code{TRUE}), or no (\code{FALSE})} 24 | 25 | \item{data}{logical: return summary table of output. yes (\code{TRUE}), or no (\code{FALSE})} 26 | 27 | \item{plot.yaxis.fixed}{logical: \code{TRUE} uses same yaxis for all x variables (biggest max and smallest min across all variables).
\code{FALSE} uses variable-specific axes.} 28 | 29 | \item{wordy}{logical: print progress to the console} 30 | } 31 | \value{ 32 | list of output and plot 33 | } 34 | \description{ 35 | Calculate the partial dependence of a predictor variable on the response variable from a random forest classification model. 36 | Rather than sequence through values of the predictor variable of interest and keep the other predictors at their median, this partial dependence 37 | technique creates replicates of the entire dataset for each level of the x variable of interest from its min to max. This gives a more 38 | realistic idea of the magnitude and direction of the x variable on the response. 39 | } 40 | \examples{ 41 | library('randomForest') 42 | library('data.table') 43 | library('ggplot2') 44 | DF <- mtcars 45 | DF$vs <- factor(DF$vs) 46 | rf <- randomForest(vs~mpg+cyl+drat+qsec+disp+gear+carb+hp, DF, ntrees=100) 47 | pda <- partialDepAll(model=rf, df=DF, n=10) 48 | } 49 | \seealso{ 50 | \code{\link{partialDep}} 51 | } 52 | 53 | -------------------------------------------------------------------------------- /man/arulesApp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{arulesApp} 3 | \alias{arulesApp} 4 | \title{Association Rules Visualization Shiny App} 5 | \usage{ 6 | arulesApp(dataset, bin = T, vars = 5, supp = 0.1, conf = 0.5) 7 | } 8 | \arguments{ 9 | \item{dataset}{data.frame, this is the dataset that association rules will be mined from. Each row is treated as a transaction. Seems to work 10 | OK when the S4 transactions class from \code{arules} is used, however this is not thoroughly tested.} 11 | 12 | \item{bin}{logical, \code{TRUE} will automatically discretize/bin numerical data into categorical features that can be used for association analysis.} 13 | 14 | \item{vars}{integer, how many variables to include in initial rule mining} 15 | 16 | \item{supp}{numeric, the support parameter for initializing visualization. Useful when it is known that a high support is needed to not crash computationally.} 17 | 18 | \item{conf}{numeric, the confidence parameter for initializing visualization. Similarly useful when it is known that a high confidence is needed to not crash computationally.} 19 | } 20 | \value{ 21 | Shiny App 22 | } 23 | \description{ 24 | Launches a Shiny App that provides an interactive interface to the visualizations of the \code{arulesViz} package. 25 | The app allows users to mine rules based on all or just subsets of features, sort by criteria (lift, support, confidence) and visualize 26 | using network graph, grouped bubble and scatter plots. \cr 27 | Users filter rules to target only those with a certain variable on the RHS or LHS of the rule. 28 | Rule mining is computed using the \link{apriori} algorithm from \code{arules}.
29 | } 30 | \examples{ 31 | ## creating some data 32 | n <- 10000 # of obs 33 | d <- data.frame( 34 | eye = sample(c('brown', 'green', 'blue', 'hazel'), n, replace=T), 35 | gender = sample(c('male', 'female'), n, replace=T), 36 | height = sort(sample(c('dwarf', 'short', 'average', 'above average', 'giant'), n, replace=T)), 37 | wealth = sort(sample(c('poor', 'struggling', 'middle', 'uppermiddle', 'comfortable', 'rich', '1\%', 'millionaire', 'billionaire'), n, replace=T)), 38 | favoriteAnimal = sample(c('dog', 'cat', 'bat', 'frog', 'lion', 'cheetah', 'lion', 'walrus', 'squirrel'), n, replace=T), 39 | numkids = abs(round(rnorm(n, 2, 1))) 40 | ) 41 | 42 | ## adding some pattern 43 | d$numkids[d$gender=='male'] <- d$numkids[d$gender=='male'] + sample(0:3, sum(d$gender=='male'), replace=T) 44 | d$numkids <- factor(d$numkids) 45 | 46 | ## calling Shiny App to visualize association rules 47 | arulesApp(d) 48 | } 49 | \seealso{ 50 | \code{arulesViz}, \code{arules} 51 | } 52 | 53 | -------------------------------------------------------------------------------- /R/plotAvgBy.R: -------------------------------------------------------------------------------- 1 | #' @title Plot average of an indicator over bins/categories of another continuous variable 2 | #' 3 | #' @description Useful for assessing the relationship between two variables of interest. There are cases, especially when 4 | #' outliers are involved, or many obs, that a scatterplot can be difficult to read. This function bins up one of the continuous 5 | #' variables that would be used in a scatterplot and calculates the mean (or other function) of the continuous variable over 6 | #' a range (discretized into categories) of the second continuous variable. It uses an equal depth binning algorithm 7 | #' to compute these bins on the \code{by} variable. 8 | #' 9 | #' It can be used to assess the average of a binary target variable/prediction 10 | #' over a range of levels of a continuous or categorical variable. 11 | #' 12 | #' @param indv vector. This is the variable whose mean will be calculated over the categories of the \code{byv} vector binned up 13 | #' @param byv vector. This is the variable to be binned up, by which the \code{indv} variable will be averaged 14 | #' @param nbins numeric. Number of bins to create when discretizing \code{byv}. Passed to \code{depthbin} function. 15 | #' @param data logical. TRUE returns the aggregated data.table. FALSE returns nothing. FALSE is default. 16 | #' @param plotNbin logical. TRUE plots the count of obs in each bin on top of each bar. TRUE is default. 17 | #' @param ... additional barplot arguments.
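#' For the binary-target use mentioned above, something along these lines (a sketch) plots the share of
#' manual transmissions across equal-depth bins of horsepower:
#' plotAvgBy(mtcars[,'am'], mtcars[,'hp'], nbins=4)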
18 | 19 | #' @return prints a barplot unless data==T, in which case the aggregated data.table is returned 20 | #' @import data.table 21 | #' @export 22 | #' @examples 23 | #' 24 | #' plotAvgBy(mtcars[,'mpg'], mtcars[,'drat'], nbins=8) 25 | #' plotAvgBy(mtcars[,'mpg'], mtcars[,'drat'], nbins=5, plotNbin=F) 26 | #' plotAvgBy(mtcars[,'mpg'], mtcars[,'drat'], nbins=5, plotNbin=F, data=T) 27 | #' 28 | #' ## Example with missing data 29 | #' df <- mtcars 30 | #' df$mpg[sample(1:nrow(mtcars), 5)] <- NA 31 | #' df$drat[sample(1:nrow(mtcars), 5)] <- NA 32 | #' 33 | #' plotAvgBy(df[,'mpg'], df[,'drat'], nbins=5, plotNbin=F, data=T) 34 | 35 | plotAvgBy <- function(indv, byv, nbins=5, data=F, plotNbin=T, ...){ 36 | dt <- data.table(indv, byv) 37 | warning(sprintf('removing %s observations where indv is NA', sum(is.na(dt$indv)))) 38 | dt <- dt[is.na(indv)==F,] 39 | tmp <- dt[, .(N=.N, hits=sum(as.numeric(as.character(indv)))), keyby=depthbin(byv, nbins=nbins)][,hitrate:=hits/N] 40 | plt <- tmp[,barplot(hitrate, names=depthbin,las=2, col='navy', ...)] 41 | if(plotNbin==T) text(plt, tmp$hitrate-abs(diff(range(tmp$hitrate)))/20, labels=tmp$N, col='white') 42 | if(data==T) return(tmp) 43 | } 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /R/overlap.R: -------------------------------------------------------------------------------- 1 | #' @title Assess overlap in vectors 2 | #' @description Useful for learning the overlap of absolute and unique values between two vectors of the same type. They 3 | #' do not have to be the same length, and are allowed to contain NAs. 4 | #' @param x1 vector of same class as \code{x2} 5 | #' @param x2 vector of same class as \code{x1} 6 | #' @param na.rm logical, remove NAs from analysis 7 | #' @param wordy logical, prints convenience information 8 | #' @param x1name optional character, display name for \code{x1} in the output row labels 9 | #' @param x2name optional character, display name for \code{x2} in the output row labels 10 | #' @return summary data.frame 11 | #' @export 12 | #' @examples 13 | #' x1 <- sample(1:50, 40, replace=T) 14 | #' x2 <- sample(c(1:60, rep(NA, 30)), 55, replace=T) 15 | #' overlap(x1, x2, na.rm=T) 16 | #' overlap(x1, x2) 17 | #' overlap(mtcars$gear, mtcars$cyl, x1name='gear', x2name='cyl') 18 | 19 | overlap <- function(x1, x2, na.rm=F, wordy=T, x1name=NULL, x2name=NULL) { 20 | 21 | # printing which variable is which. 22 | if(wordy==T) { 23 | cat(paste0('x1 is ', deparse(substitute(x1)))) 24 | cat('\n') 25 | cat(paste0('x2 is ', deparse(substitute(x2)))) 26 | cat('\n\n') 27 | } 28 | 29 | # checking for class mismatch 30 | if(class(x1) != class(x2)) { 31 | warning('classes of x1 and x2 are not equal. Coercing both to character')
32 | if(class(x1)!='character') x1 <- as.character(x1) 33 | if(class(x2)!='character') x2 <- as.character(x2) 34 | } 35 | 36 | # adding optional vector names 37 | x1name <- ifelse(is.null(x1name), 'x1', x1name) 38 | x2name <- ifelse(is.null(x2name), 'x2', x2name) 39 | 40 | # dealing w NAs 41 | if(na.rm==T){ 42 | x1 <- x1[is.na(x1)==F] 43 | x2 <- x2[is.na(x2)==F] 44 | } 45 | 46 | # creating statistics to return 47 | ret <- list() 48 | ret[[sprintf('%s exist in %s', x1name, x2name)]] <- c( 49 | length(x1), 50 | length(x2), 51 | sum(x1 %in% x2), 52 | sum(x1 %in% x2)/length(x1) 53 | ) 54 | ret[[sprintf('%s exist in %s', x2name, x1name)]] <- c( 55 | length(x2), 56 | length(x1), 57 | sum(x2 %in% x1), 58 | sum(x2 %in% x1)/length(x2) 59 | ) 60 | ret[[sprintf('unique(%s) exist in unique(%s)', x1name, x2name)]] <- c( 61 | length(unique(x1)), 62 | length(unique(x2)), 63 | sum(unique(x1) %in% unique(x2)), 64 | sum(unique(x1) %in% unique(x2))/length(unique(x1)) 65 | ) 66 | ret[[sprintf('unique(%s) exist in unique(%s)', x2name, x1name)]] <- c( 67 | length(unique(x2)), 68 | length(unique(x1)), 69 | sum(unique(x2) %in% unique(x1)), 70 | sum(unique(x2) %in% unique(x1))/length(unique(x2)) 71 | ) 72 | 73 | 74 | df <- t(data.frame(ret)) 75 | row.names(df) <- names(ret) 76 | colnames(df) <- c('LHS_count', 'RHS_count', 'count', 'percent') 77 | 78 | return(df) 79 | } 80 | 81 | 82 | -------------------------------------------------------------------------------- /man/smartAgg.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{smartAgg} 3 | \alias{smartAgg} 4 | \title{Aggregate multiple columns using different functions harnessing data.table efficiency} 5 | \usage{ 6 | smartAgg(df, by, ..., catN = T, printAgg = F) 7 | } 8 | \arguments{ 9 | \item{df}{input data.frame to be aggregated} 10 | 11 | \item{by}{variable name of data.frame \code{df} to aggregate on. Same as \code{by} in base R \code{aggregate} function} 12 | 13 | \item{...}{method to identify the variables to aggregate and the functions used to do so. 14 | Specify the function first as a string argument and then a vector of the column names to aggregate using that function. 15 | You can specify as many different functions as necessary, but every function must be followed by a vector of column names.} 16 | 17 | \item{catN}{adds a column named "countPerBin" with the # of observations aggregated in each row of the output data.frame.} 18 | 19 | \item{printAgg}{prints the line of data.table code used to perform the aggregation.} 20 | } 21 | \value{ 22 | aggregated data.frame with columns corresponding to the grouping variables in by followed by aggregated columns from df. 23 | } 24 | \description{ 25 | aggregates columns of a data.frame on 1 or multiple dimensions using 1 or multiple functions for different columns. 26 | It is equivalent to the base R \code{aggregate} function, except that it allows the user to 27 | aggregate sets of columns (referred to by name or column #) with different functions... and it's fast!
28 | } 29 | \examples{ 30 | require('data.table') 31 | 32 | ## establishing variables to aggregate on 33 | lengthVs <- c('Sepal.Length', 'Petal.Length') 34 | widthVs <- c('Sepal.Width', 'Petal.Width') 35 | 36 | ## aggregating using 2 different functions and identifying columns to aggregate by variable names 37 | irisAgg1 <- smartAgg(df=iris, by='Species', 'mean', lengthVs, 'sum', widthVs) 38 | 39 | ## aggregating using 2 dimensions ("Species" and "randthing") 40 | iris$randthing <- as.character(sample(1:5, nrow(iris), replace=T)) 41 | irisAgg2 <- smartAgg(df=iris, by=c('Species', 'randthing'), 'mean', lengthVs, 'sum', widthVs, catN=T, printAgg=T) 42 | 43 | ## aggregating variables by column number 44 | irisAgg3 <- smartAgg(df=iris, by=c('Species', 'randthing'), 'mean', 1:2, 'sum', 3:4, catN=T, printAgg=T) 45 | 46 | ## use anonymous functions 47 | data(mtcars) 48 | smartAgg(mtcars, by='cyl', function(x) sum(x*100), c('drat', 'mpg', 'disp')) 49 | 50 | ## use anonymous functions with more than 1 argument. Uses the provided variables for all unassigned arguments in anonymous function 51 | smartAgg(mtcars, by='cyl', function(x,y='carb') sum(x*y), c('drat', 'mpg', 'disp')) 52 | with(mtcars[mtcars$cyl==6,], c(sum(drat*carb), sum(mpg*carb), sum(disp*carb))) 53 | 54 | ## with anonymous functions with more than 1 argument. 55 | ## Example of possible unintended behavior - the user-provided variable is used for both x and y in this example. 56 | smartAgg(mtcars, by='cyl', function(x,y) sum(x*y), c('drat', 'mpg', 'disp')) 57 | with(mtcars[mtcars$cyl==6,], c(sum(drat*drat), sum(mpg*mpg), sum(disp*disp))) 58 | 59 | ## demonstrating speed gain of smartAgg using data.table over aggregate 60 | n <- 300000 61 | df <- data.frame(x1=rnorm(n), x2=rbinom(n,5,0.5), x3=sample(letters, n, replace=T)) 62 | system.time(aggFast <- smartAgg(df, by='x3', 'mean', c('x1', 'x2'))) 63 | system.time(aggSlow <- aggregate(df[,c('x2', 'x1')], by=list(df$x3), FUN='mean')) 64 | } 65 | 66 | -------------------------------------------------------------------------------- /R/plotdf.R: -------------------------------------------------------------------------------- 1 | 2 | #' @title Plot data.frame to PDF 3 | #' @description Plots every column of a data.frame as an individual plot (one plot per page) 4 | #' in a PDF file. 5 | #' 6 | #' @param df data.frame to plot. 7 | #' @param file string - name of the PDF that will be created 8 | #' @param wordy - boolean \code{TRUE} or \code{FALSE}. Sequentially prints status of each chart to the console. 9 | #' Could be useful for large data.frames. 10 | #' 11 | #' @return PDF file of plots 12 | #' @export 13 | #' 14 | #' @examples 15 | #' \dontrun{ 16 | #' plotdf(df=mtcars, file='mtcars_plots.pdf') 17 | #' } 18 | 19 | plotdf <- function(df, file='output.pdf', wordy=F){ 20 | pdf(file) 21 | for(i in 1:ncol(df)){ 22 | if(wordy==T) print(i) 23 | if((class(df[,i]) %in% c('numeric', 'integer')) & length(unique(df[,i]))>15) { plotNum(df[,i], vn=names(df)[i]) 24 | } else { plotChar(df[,i], vn=names(df)[i])} 25 | } 26 | graphics.off() 27 | print(paste0('charts saved in ', getwd(), '/', file)) 28 | } 29 | 30 | 31 | ################################################################## 32 | ## Helper functions for plotdf (not included in Rsenal package) 33 | ################################################################## 34 | 35 | plotNum <- function(x, ...)
{ 36 | par(mfrow=c(3, 2)) 37 | par(oma=c(1,2,2,2)) 38 | layout(matrix(c(1,1,2,2,2,2), 3, 2, byrow=T), widths=c(1,1), heights=c(1,1,1)) 39 | ptitle <- ifelse(is.null(list(...)$vn), '', list(...)$vn) 40 | 41 | p1<-plot(0:1, 0:1, col='white', yaxt='n', ylab = '', xaxt='n', xlab='', main=ptitle) 42 | text(x=0.2, y=0.8, label='type:', pos=4, font=2) 43 | text(x=0.2, y=0.7, label='# of unique values:', pos=4, font=2) 44 | text(x=0.2, y=0.6, label='% NA:', pos=4, font=2) 45 | text(x=0.2, y=0.5, label='min:', pos=4, font=2) 46 | text(x=0.2, y=0.4, label='median:', pos=4, font=2) 47 | text(x=0.2, y=0.3, label='mean:', pos=4, font=2) 48 | text(x=0.2, y=0.2, label='max:', pos=4, font=2) 49 | 50 | text(x=0.4, y=0.8, label=class(x), pos=4) 51 | text(x=0.4, y=0.7, label=length(unique(x)), pos=4) 52 | text(x=0.4, y=0.6, label=paste0(round(sum(is.na(x))/length(x)*100, 4), '%'), pos=4) 53 | text(x=0.4, y=0.5, label=min(x, na.rm=T), pos=4) 54 | text(x=0.4, y=0.4, label=median(x, na.rm=T), pos=4) 55 | text(x=0.4, y=0.3, label=mean(x, na.rm=T), pos=4) 56 | text(x=0.4, y=0.2, label=max(x, na.rm=T), pos=4) 57 | 58 | p2<-hist(x, main=ptitle, breaks=30, xlab=ptitle, col='grey') 59 | rug(x, col='red') 60 | 61 | return(list(p1, p2)) 62 | } 63 | 64 | ## example 65 | ## plotNum(rnorm(1000)^2, vn='ssds') 66 | 67 | 68 | plotChar <- function(x, ...) { 69 | par(mfrow=c(3, 2)) 70 | layout(matrix(c(1,1,2,2,2,2), 3, 2, byrow=T), widths=c(1,1), heights=c(1,1,1)) 71 | 72 | ptitle <- ifelse(is.null(list(...)$vn), '', list(...)$vn) 73 | tab <- sort(table(x), decreasing=T)[1:min(length(unique(x)),50)] 74 | tabmiss <- round(sum(tab, na.rm=T)/length(x)*100, 1) 75 | 76 | p1 <- plot(0:1, 0:1, col='white', yaxt='n', ylab = '', xaxt='n', xlab='', main=ptitle) 77 | text(x=0.2, y=0.8, label='type:', pos=4, font=2) 78 | text(x=0.2, y=0.7, label='# of unique values:', pos=4, font=2) 79 | text(x=0.2, y=0.6, label='% NA:', pos=4, font=2) 80 | 81 | text(x=0.4, y=0.8, label=class(x), pos=4) 82 | text(x=0.4, y=0.7, label=length(unique(x)), pos=4) 83 | text(x=0.4, y=0.6, label=paste0(round(sum(is.na(x))/length(x)*100, 4), '%'), pos=4) 84 | 85 | par(mar=c(10,4.1,4.1,2.1)) 86 | p2 <- barplot(tab, las=2, cex.names=0.6, col='dodgerblue', 87 | main=paste(ptitle, ': ', tabmiss, '% of data shown', sep='')) 88 | par(mar=c(5.1,4.1,4.1,2.1)) 89 | 90 | return(list(p1, p2)) 91 | } -------------------------------------------------------------------------------- /R/shiny_leafletDF.R: -------------------------------------------------------------------------------- 1 | #' @title Leaflet interactive map widget 2 | #' @description Launches a Shiny App with an interactive Leaflet map which can be used to visualize coordinate data. Intended 3 | #' for quick exploratory geographic analyses 4 | #' 5 | #' @param data data.frame or data.table of data to visualize on map. It must include columns for longitude and latitude coordinates. 6 | #' Four additional columns are allowed for filtering and coloring the data. Note, the filters are currently categorical only; numeric 7 | #' ranges are not yet supported. Recommend only passing columns needed for filtering. Default is to pick the first 4 8 | #' @param vars named character vector. The names of the vector should always be 'lon' and 'lat'. The corresponding values 9 | #' of this vector are the column names of the columns in \code{data} with the lat and lon data.
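#' For data whose coordinate columns are named differently, remap them, e.g. with hypothetical column names: \code{leafletMapDF(df, vars=c(lon='lng', lat='phi'))}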
10 | #' @seealso \code{leaflet}, \code{shiny} 11 | #' @return Shiny App 12 | #' @import shiny leaflet data.table RColorBrewer htmlwidgets 13 | #' @export 14 | #' 15 | #' @examples 16 | #' 17 | #' \dontrun{ 18 | #' n <- 5000 19 | #' df <- data.frame(latitude=runif(n, 35, 40), 20 | #' longitude=runif(n, -100, -85), 21 | #' animals=sample(c('dogs', 'cats', 'turtles'), n, replace=T) 22 | #' ) 23 | #' df$westness <- cut(df$longitude, breaks=seq(min(df$longitude), max(df$longitude), length.out=10)) 24 | #' df$northness <- cut(df$latitude, breaks=seq(min(df$latitude), max(df$latitude), length.out=10)) 25 | #' leafletMapDF(df) 26 | #' } 27 | 28 | leafletMapDF <- function (data, vars=c('lon'='longitude', 'lat'='latitude')) { 29 | 30 | qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',] 31 | col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals))) 32 | set.seed(110); n=length(col_vector); col_vector <- sample(col_vector, n) 33 | uivars <- setdiff(names(data), c(vars['lon'], vars['lat'])) 34 | 35 | if(length(uivars)>4) warning('currently supports only 4 variables for filtering. picking first 4 columns in data') 36 | 37 | shinyApp( 38 | ui = shinyUI(fluidPage( 39 | leafletOutput('mymap'), 40 | p(), 41 | h3('Filter data'), 42 | if(length(uivars)>=1) selectInput('ui1', label=sprintf('Filter by %s:', uivars[1]), choices=as.character(unique(data[[uivars[1]]])), multiple=T), 43 | if(length(uivars)>=2) selectInput('ui2', label=sprintf('Filter by %s:', uivars[2]), choices=as.character(unique(data[[uivars[2]]])), multiple=T), 44 | if(length(uivars)>=3) selectInput('ui3', label=sprintf('Filter by %s:', uivars[3]), choices=as.character(unique(data[[uivars[3]]])), multiple=T), 45 | if(length(uivars)>=4) selectInput('ui4', label=sprintf('Filter by %s:', uivars[4]), choices=as.character(unique(data[[uivars[4]]])), multiple=T), 46 | h3('Map parameters'), 47 | selectInput('colby', 'Color by', uivars), 48 | numericInput('radiusid', 'Circle radius (meters)', min=1, max=100000, value=10), 49 | downloadButton('downloadid', 'Save as HTML') 50 | )), 51 | 52 | server = function(input, output) { 53 | 54 | datar <- reactive({ 55 | # filtering data based on ui. showing rows that meet ALL (not ANY) of the filters. blanks/NULLS are assumed to be all points. 
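        # with the example data from the roxygen block above, picking 'dogs' in the first widget keeps rows where
        # data$animals %in% 'dogs'; an empty widget yields rep(T, nrow(data)), i.e. that filter is a no-op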
56 | if(is.null(input[['ui1']])) cond1 <- rep(T, nrow(data)) else cond1 <- data[[uivars[1]]] %in% input[['ui1']] 57 | if(is.null(input[['ui2']])) cond2 <- rep(T, nrow(data)) else cond2 <- data[[uivars[2]]] %in% input[['ui2']] 58 | if(is.null(input[['ui3']])) cond3 <- rep(T, nrow(data)) else cond3 <- data[[uivars[3]]] %in% input[['ui3']] 59 | if(is.null(input[['ui4']])) cond4 <- rep(T, nrow(data)) else cond4 <- data[[uivars[4]]] %in% input[['ui4']] 60 | cond <- data.table(cond1, cond2, cond3, cond4)[,which(apply(.SD, 1, all))] 61 | 62 | data[cond,] 63 | }) 64 | 65 | # Map plot function ############################ 66 | makemap <- function() { 67 | tmp <- datar() 68 | xvec <- factor(tmp[[input$colby]]) 69 | legcols <- col_vector[1:length(levels(xvec))] 70 | leglabs <- levels(xvec) 71 | 72 | m <- leaflet() %>% 73 | addProviderTiles("Stamen.TonerLite", options=providerTileOptions(noWrap = TRUE) 74 | ) %>% 75 | addCircles(data=cbind(tmp[[vars['lon']]], tmp[[vars['lat']]]), 76 | color=col_vector[as.numeric(factor(tmp[[input$colby]]))], 77 | radius=input[['radiusid']]) %>% 78 | addLegend("topright", colors=legcols, labels=leglabs, opacity=2, title=input$colby) 79 | return(m) 80 | } 81 | 82 | ## Render map ################################### 83 | output$mymap <- renderLeaflet({ 84 | mm <- makemap() 85 | #saveWidget(mm, 'leafletmap.html') # this will save on each render without browser directory choice 86 | mm 87 | }) 88 | 89 | ## Download map as HTML ######################## 90 | output$downloadid <- downloadHandler( 91 | filename = function() 'leafletdownload.html', 92 | content = function(con) saveWidget(makemap(), con) 93 | ) 94 | 95 | } 96 | 97 | )} 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /R/smartAgg.R: -------------------------------------------------------------------------------- 1 | #' @title Aggregate multiple columns using different functions harnessing data.table efficiency 2 | #' 3 | #' @description aggregates columns of a data.frame on 1 or multiple dimensions using 1 or multiple functions for different columns. 4 | #' It is equivalent to the base R \code{aggregate} function, except that it allows the user to 5 | #' aggregate sets of columns (referred to by name or column #) with different functions... and it's fast! 6 | #' 7 | #' @param df input data.frame to be aggregated 8 | #' @param by variable name of data.frame \code{df} to aggregate on. Same as \code{by} in base R \code{aggregate} function 9 | #' @param ... method to identify the variables to aggregate and the functions used to do so. 10 | #' Specify the function first as a string argument and then a vector of the column names to aggregate using that function. 11 | #' You can specify as many different functions as necessary, but every function must be followed by a vector of column names. 12 | #' @param catN adds a column named "countPerBin" with the # of observations aggregated in each row of the output data.frame. 13 | #' @param printAgg prints the line of data.table code used to perform the aggregation. 14 | #' @return aggregated data.frame with columns corresponding to the grouping variables in by followed by aggregated columns from df.
15 | #' @import data.table 16 | #' @export 17 | #' @examples 18 | #' 19 | #' require('data.table') 20 | #' 21 | #' ## establishing variables to aggregate on 22 | #' lengthVs <- c('Sepal.Length', 'Petal.Length') 23 | #' widthVs <- c('Sepal.Width', 'Petal.Width') 24 | #' 25 | #' ## aggregating using 2 different functions and identifying columns to aggregate by variable names 26 | #' irisAgg1 <- smartAgg(df=iris, by='Species', 'mean', lengthVs, 'sum', widthVs) 27 | #' 28 | #' ## aggregating using 2 dimensions ("Species" and "randthing") 29 | #' iris$randthing <- as.character(sample(1:5, nrow(iris), replace=T)) 30 | #' irisAgg2 <- smartAgg(df=iris, by=c('Species', 'randthing'), 'mean', lengthVs, 'sum', widthVs, catN=T, printAgg=T) 31 | #' 32 | #' ## aggregating variables by column number 33 | #' irisAgg3 <- smartAgg(df=iris, by=c('Species', 'randthing'), 'mean', 1:2, 'sum', 3:4, catN=T, printAgg=T) 34 | #' 35 | #' ## use anonymous functions 36 | #' data(mtcars) 37 | #' smartAgg(mtcars, by='cyl', function(x) sum(x*100), c('drat', 'mpg', 'disp')) 38 | #' 39 | #' ## use anonymous functions with more than 1 argument. Uses the provided variables for all unassigned arguments in anonymous function 40 | #' smartAgg(mtcars, by='cyl', function(x,y='carb') sum(x*y), c('drat', 'mpg', 'disp')) 41 | #' with(mtcars[mtcars$cyl==6,], c(sum(drat*carb), sum(mpg*carb), sum(disp*carb))) 42 | #' 43 | #' ## with anonymous functions with more than 1 argument. 44 | #' ## Example of possible unintended behavior - the user-provided variable is used for both x and y in this example. 45 | #' smartAgg(mtcars, by='cyl', function(x,y) sum(x*y), c('drat', 'mpg', 'disp')) 46 | #' with(mtcars[mtcars$cyl==6,], c(sum(drat*drat), sum(mpg*mpg), sum(disp*disp))) 47 | #' 48 | #' ## demonstrating speed gain of smartAgg using data.table over aggregate 49 | #' n <- 300000 50 | #' df <- data.frame(x1=rnorm(n), x2=rbinom(n,5,0.5), x3=sample(letters, n, replace=T)) 51 | #' system.time(aggFast <- smartAgg(df, by='x3', 'mean', c('x1', 'x2'))) 52 | #' system.time(aggSlow <- aggregate(df[,c('x2', 'x1')], by=list(df$x3), FUN='mean')) 53 | 54 | smartAgg <- function(df, by, ..., catN=T, printAgg=F) { 55 | args <- list(...)
56 | dt <- as.data.table(df) 57 | 58 | ## organizing agg Methods and variable names into 2 separate lists 59 | aggMethod <- list() 60 | vars <- list() 61 | j<-1 62 | for(i in seq(1,length(args),2)) { 63 | aggMethod[[j]] <- args[[i]] 64 | vars[[j]] <- args[[i+1]] 65 | if(class(vars[[j]]) %in% c('integer', 'numeric')) vars[[j]] <- names(df)[vars[[j]]] 66 | j<-j+1 67 | } 68 | 69 | ## create the line of code to execute 70 | k<-0 71 | varL <- vector() 72 | for(j in 1:length(aggMethod)){ 73 | for(i in 1:length(vars[[j]])){ 74 | if(vars[[j]][i] %in% names(df)){ 75 | if(class(aggMethod[[j]])=='function') { 76 | afun <- paste0('af',j) 77 | assign(afun, aggMethod[[j]]) 78 | laf2 <- as.list(formals(get(afun))) 79 | laf2[which(lapply(laf2, nchar)==0)] <- vars[[j]][i] # fill each unassigned formal with the current column name 80 | rhstmp <- paste(unlist(lapply(seq_along(laf2), function(y,n,i) paste0(n[[i]], '=', y[[i]]), n=names(laf2), y=laf2)), collapse=',') 81 | tmp <- paste(vars[[j]][i], '=', afun, '(', rhstmp, ')', sep='') # anonymous functions 82 | } else { 83 | tmp <- paste(vars[[j]][i], '=', aggMethod[[j]], '(', vars[[j]][i], ')', sep='') #non-anonymous functions 84 | } 85 | k <- k+1 86 | varL[k] <- tmp 87 | } else {print(paste('WARNING: ', vars[[j]][i], ' not in dataframe', sep=''))} 88 | } 89 | } 90 | varL <- paste(varL, collapse=', ') 91 | if(catN==T) varL <- paste(varL, ',countPerBin=length(', vars[[1]][1], ')', sep='') 92 | 93 | ## actually creating aggregation command and executing it 94 | line2exec <- paste('dtAgg <- dt[,list(', varL, '), by=list(', paste(by,collapse=','), ')]', sep='') 95 | if(printAgg==T) print(line2exec) 96 | eval(parse(text=line2exec)) 97 | dfAgg <- data.frame(dtAgg) 98 | 99 | return(dfAgg) 100 | } 101 | 102 | 103 | --------------------------------------------------------------------------------
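With printAgg=T, the constructed data.table call that gets eval'd is printed; for the first iris example it should look roughly like (a sketch):

dtAgg <- dt[,list(Sepal.Length=mean(Sepal.Length), Petal.Length=mean(Petal.Length),
                  Sepal.Width=sum(Sepal.Width), Petal.Width=sum(Petal.Width),
                  countPerBin=length(Sepal.Length)), by=list(Species)]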
/R/partialDependence.R: -------------------------------------------------------------------------------- 1 | #' @title Partial Dependence (single variable) 2 | #' @description Calculate the partial dependence of a predictor variable on the response variable from a random forest classification model. 3 | #' Rather than sequence through values of the predictor variable of interest and keep the other predictors at their median, this partial dependence 4 | #' technique creates replicates of the entire dataset for each level of the x variable of interest from its min to max. This gives a more 5 | #' realistic idea of the magnitude and direction of the x variable on the response. 6 | #' 7 | #' @param model model object used to generate predictions. Currently only built and tested for random forest. 8 | #' @param df data.frame or data.table used to generate predictions with \code{model} 9 | #' @param xvar character of length one; the x variable in \code{df} to assess for partial dependence with the response variable from \code{model}. 10 | #' @param n numeric of length one; number of values between the min and max of \code{xvar} to score the model. 11 | #' Note: this number is also how many replicates of \code{df} must be created and stored in memory. Default is 10. 12 | #' @param target.class character: Which category (class) of the target variable to use for predictions 13 | #' @param ci numeric: specify any confidence intervals around the median response. 14 | #' @seealso \code{\link{partialDepAll}} 15 | #' @return data.table of output. \code{cnt} refers to how many obs from \code{df} are within the fixed-width bin specified by \code{xvar}. 16 | #' @import randomForest 17 | #' @import data.table 18 | #' @export 19 | #' 20 | #' @examples 21 | #' library('randomForest') 22 | #' library('data.table') 23 | #' DF <- mtcars 24 | #' DF$vs <- factor(DF$vs) 25 | #' rf <- randomForest(vs~mpg+cyl+drat+qsec+disp+gear+carb+hp, DF, ntrees=100) 26 | #' pd <- partialDep(model=rf, df=DF, xvar='mpg') 27 | #' pd[ci==0.5,] # median of response when sequenced through 'mpg' 28 | #' 29 | #' ## Plotting 30 | #' plot(pd[cilev==0, xvar], pd[cilev==0, pred], type='l', ylim=c(0,1)) 31 | #' lines(pd[ci==.95, xvar], pd[ci==.95, pred], type='l', col='red') 32 | #' lines(pd[ci==.05, xvar], pd[ci==.05, pred], type='l', col='green') 33 | 34 | partialDep <- function(model, df, xvar, n=10, target.class='1', ci=c(.9,.5,.3)) { 35 | if(!'data.table' %in% class(df)) df <- data.table(df) 36 | xv <- df[,get(xvar)] # x vector of interest 37 | xvr <- seq(min(xv), max(xv), length.out=n) # fixed width break points 38 | dfb <- df[rep(1:.N, each=n), ] # creating replicate of data for each 39 | dfb[,xvar] <- rep(xvr, nrow(df)) # replacing x variable of interest with range of x var 40 | 41 | dfb[,cnt:=c(0, table(cut(df[,get(xvar)], breaks=xvr, include.lowest=T)))] 42 | pred <- predict(model, dfb, type='prob')[,target.class] 43 | 44 | retdf <- data.table(xvar=dfb[,get(xvar)], pred=pred, cnt=dfb[,cnt]) 45 | ci <- c(ci, 0) 46 | ci <- unique(c(0.5+ci/2, 0.5-ci/2)) 47 | ret <- retdf[,.(pred=quantile(pred, ci), cnt=cnt[1]), by='xvar'][,ci:=rep(ci, n)][,cilev:=abs(ci-0.5)*2][] 48 | return(ret) 49 | } 50 | 51 | 52 | 53 | #' @title Partial Dependence Spark Lines (multiple variables) 54 | #' @description Calculate the partial dependence of a predictor variable on the response variable from a random forest classification model. 55 | #' Rather than sequence through values of the predictor variable of interest and keep the other predictors at their median, this partial dependence 56 | #' technique creates replicates of the entire dataset for each level of the x variable of interest from its min to max. This gives a more 57 | #' realistic idea of the magnitude and direction of the x variable on the response. 58 | #' 59 | #' @param model model object used to generate predictions. Currently only built and tested for random forest. 60 | #' @param df data.frame or data.table used to generate predictions with \code{model} 61 | #' @param n numeric of length one; number of values between the min and max of \code{xvar} to score the model. 62 | #' Note: this number is also how many replicates of \code{df} must be created and stored in memory. Default is 10. 63 | #' @param xvars character vector; the x variables in \code{df} to assess for partial dependence with the response variable from \code{model}. Defaults to choosing all variables from model. 64 | #' @param target.class character: Which category (class) of the target variable to use for predictions 65 | #' @param ci numeric: specify any confidence intervals around the median response. 66 | #' @param plot logical: plot sparklines (\code{TRUE}), or no (\code{FALSE}) 67 | #' @param data logical: return summary table of output. yes (\code{TRUE}), or no (\code{FALSE}) 68 | #' @param plot.yaxis.fixed logical: \code{TRUE} uses same yaxis for all x variables (biggest max and smallest min across all variables). \code{FALSE} uses variable-specific axes.
69 | #' @param wordy logical: print progress to the console
70 | #' @seealso \code{\link{partialDep}}
71 | #' @return list of output and plot
72 | #' @import randomForest
73 | #' @import data.table
74 | #' @import ggplot2
75 | #' @export
76 | #' 
77 | #' @examples
78 | #' library('randomForest')
79 | #' library('data.table')
80 | #' library('ggplot2')
81 | #' DF <- mtcars
82 | #' DF$vs <- factor(DF$vs)
83 | #' rf <- randomForest(vs~mpg+cyl+drat+qsec+disp+gear+carb+hp, DF, ntree=100)
84 | #' pda <- partialDepAll(model=rf, df=DF, n=10)
85 | 
86 | partialDepAll <- function(model, df, n=10, xvars=NULL, target.class='1', ci=c(.75), plot=T, data=T, plot.yaxis.fixed=T, wordy=T) {
87 | # currently only works with random forest
88 | if(is.null(xvars)) xvars <- names(model$forest$ncat)
89 | L <- list()
90 | for(i in xvars){
91 | tmpdf <- data.table(partialDep(model=model, df=df, xvar=i, n=n, target.class=target.class, ci=ci))
92 | tmpdf[,xname:=i]
93 | tmpdf[,x:=length(L)+1] # keeping order of xvars
94 | L[[i]] <- tmpdf
95 | if(wordy==T) cat('.')
96 | }
97 | ret <- rbindlist(L)
98 | ret[,xname:=factor(xname, levels=xvars)] # to conserve order using facet_wrap
99 | 
100 | # producing data for sparklines plot
101 | pdplot <- ret
102 | pdplot[,y:=rep(rep(1:n, each=length(unique(pdplot$ci))), length(xvars))]
103 | setkey(pdplot,x,ci,y)
104 | pdplot[,preddiff:=pred-shift(pred), by=.(x,ci)] # change in prediction between consecutive grid points
105 | pdplot[,prednorm:=(pred-min(pred))/(max(pred)-min(pred)), by=.(x,ci)] # normalizing pred size
106 | pdplot[,cntrug:=cumsum(cnt)/nrow(df)*n, by=.(x,ci)] # creating ticks for rug plot
107 | gg <- NULL # so the ggobj slot in the return list exists even when plot=FALSE
108 | # plotting spark lines
109 | if(plot==T){
110 | lwdlen <- c(rep(5, 3), rep(4, 6), rep(3, 9), rep(2, length(xvars)))[length(xvars)]
111 | wd <- reshape(pdplot, idvar = c('x', 'y'), v.names='pred', timevar="ci", direction = "wide") # reshape wide for geom_ribbons
112 | gg <- ggplot()
113 | for(cl in ci) gg <- gg + geom_ribbon(data=wd, aes_string(x='y', ymin=paste0('pred.', 0.5-cl/2), ymax=paste0('pred.', 0.5+cl/2)), alpha=0.75*1/length(ci), fill='gray')
114 | gg <- gg + geom_line(data=pdplot[ci==0.5,], aes(x=y, y=pred, colour=pred), lwd=lwdlen) +
115 | facet_wrap(~xname, scales=ifelse(plot.yaxis.fixed==T, 'fixed', 'free')) +
116 | geom_rug(data=pdplot[ci==0.5,], aes(x=jitter(cntrug))) +
117 | scale_colour_gradientn(colours=colorRampPalette(c('firebrick', 'grey', 'forestgreen'))(n)) +
118 | theme_bw() + xlab('') + ylab('prediction')
119 | plot(gg)
120 | }
121 | 
122 | if(data==T){
123 | retL <- list(pdplot=data.frame(pdplot),
124 | cilev=data.frame(ci),
125 | n=n,
126 | ggobj=gg
127 | )
128 | return(retL)
129 | }
130 | }
131 | 
132 | 
--------------------------------------------------------------------------------
/R/bin.R:
--------------------------------------------------------------------------------
1 | #' @title Equal depth binning
2 | #' @description Simple equal depth binning algorithm.
3 | #' 
4 | #' @param ser numeric vector to bin
5 | #' @param nbins number of bins desired
6 | #' @param qtype an integer between 1 and 9 selecting one of the nine quantile algorithms to be used. See \code{\link{quantile}} for more details. Default is 7.
7 | #' @param digits number, number of digits to display in bin categories
8 | #' @param labelRange logical: \code{TRUE} labels each bin with its numeric range (e.g. \code{(5.2,11.8]}).
Can be combined with \code{labelOrder} and \code{labelPct}
9 | #' @param labelPct logical: \code{TRUE} appends the percent of observations assigned to the bin to the factor level (name). Can be combined with \code{labelOrder} and \code{labelRange}
10 | #' @param labelOrder logical: \code{TRUE} appends the ordinal position of the bin (ex. 1/3, 2/3, or 3/3 if 3 bins) to the factor level (name). Can be combined with \code{labelPct} and \code{labelRange}
11 | #' 
12 | #' @seealso \code{\link{quantile}}
13 | #' @return ordered factor vector with bins
14 | #' @export
15 | #' 
16 | #' @examples
17 | #' ## perfect equal depth bins
18 | #' x1 <- rnorm(1000, 0, 20)
19 | #' binned1 <- depthbin(x1, nbins=10)
20 | #' table(binned1)
21 | #' 
22 | #' ## slightly uneven bins with integer data
23 | #' x2 <- rpois(1000, 3)
24 | #' binned2 <- depthbin(x2, nbins=5)
25 | #' summary(binned2)
26 | #' 
27 | #' ## as good as we can get with skewed integer data
28 | #' x3 <- round(abs(log(abs(rnorm(1000)))))
29 | #' binned3 <- depthbin(x3, nbins=5)
30 | #' summary(binned3)
31 | #' 
32 | #' ## including more information in category names (levels of factor variable)
33 | #' x4 <- round(abs(log(abs(rnorm(1000)))))
34 | #' binned4 <- depthbin(x4, nbins=3, labelRange=T, labelPct=T, labelOrder=T)
35 | #' summary(binned4)
36 | 
37 | depthbin <- function(ser, nbins=10, qtype=7, digits=10, labelRange=T, labelPct=F, labelOrder=F) {
38 | cutpts <- quantile(ser, probs=seq(0, 1, 1/nbins), na.rm=T, type=qtype)
39 | if(length(unique(cutpts))==nbins+1) {
40 | returnser <- cut(ser, breaks=cutpts, right=T, include.lowest=T, dig.lab=digits)
41 | } else { # duplicated cutpoints (heavy ties): pull the tied values out and re-quantile the rest
42 | alldup <- vector()
43 | while(length(unique(cutpts))+length(alldup) < nbins+1) {
44 | dup <- cutpts[duplicated(cutpts)]
45 | dups <- unique(dup)
46 | alldup <- c(alldup, dups)
47 | dupL <- length(alldup) + length(dups)
48 | ser2 <- ser[which(!ser %in% alldup)] # drop the tied values...
49 | cutpts <- quantile(ser2, probs=seq(0, 1, 1/(nbins-length(dups))), na.rm=T, type=qtype) # ...and re-quantile what is left
50 | }
51 | cutpts <- c(unique(cutpts), alldup) # tied values become their own cutpoints
52 | returnser <- cut(ser, breaks=cutpts, include.lowest=T, dig.lab=digits, right=F)
53 | }
54 | if(sum(labelRange, labelPct, labelOrder)==0) {
55 | labelRange <- T
56 | warning('arguments labelRange, labelOrder, labelPct should not all be set to FALSE. Setting labelRange to TRUE.')
57 | }
58 | rawlev <- levels(returnser)
59 | if (labelRange==T) levels(returnser) <- paste0(levels(returnser), rawlev)
60 | if (labelOrder==T) levels(returnser) <- paste0(levels(returnser), ' ', 1:length(rawlev), '/', length(rawlev))
61 | if (labelPct==T) levels(returnser) <- paste0(levels(returnser), ' ', paste0('(', as.character(round(table(returnser)/length(returnser)*100, 1)), '%)'))
62 | for(i in 1:length(levels(returnser))) levels(returnser)[i] <- substr(levels(returnser)[i], nchar(rawlev[i])+1, nchar(levels(returnser)[i])) # strip the leading raw range so only the requested labels remain
63 | return(returnser)
64 | }
65 | 
66 | #' @title Round numbers in interval
67 | #' @description Formats an interval of form \code{(5.234,11.783]} to something like \code{(5.2,11.8]}.
68 | #' Used for formatting only, mainly with binning functions like \code{\link{depthbin}}. Intervals can be open or closed with
69 | #' \code{(} and \code{[} respectively and are maintained as such when formatted. Useful for prettifying graphs and reports.
70 | #' 
71 | #' @param x character vector of bins to format
72 | #' @param r number, 0 to 10 (or higher I suppose) indicating how many decimals to display
73 | #' 
74 | #' @seealso \code{\link{depthbin}}
75 | #' @return formatted character vector with length of input vector.
76 | #' @export
77 | #' @examples
78 | #' x1 <- cut(quantile(rnorm(100)), breaks=4)
79 | #' roundCut(x1, 1)
80 | 
81 | roundCut <- function(x, r=1){
82 | x <- as.character(x)
83 | b <- substr(x, 1, 1) # opening bracket: '(' or '['
84 | e <- substr(x, nchar(x), nchar(x)) # closing bracket: ')' or ']'
85 | xx <- substr(x, 2, nchar(x)-1)
86 | xx1 <- round(as.numeric(sapply(xx, function(z) strsplit(z, ',')[[1]][1])), r)
87 | xx2 <- round(as.numeric(sapply(xx, function(z) strsplit(z, ',')[[1]][2])), r)
88 | return(paste(b, xx1, ', ', xx2, e, sep=''))
89 | }
90 | 
91 | #' @title categorical data binning by collapsing
92 | #' @description Bins categorical variables into a smaller number of bins. Useful when modeling with variables that have many small categories.
93 | #' The largest categories are taken as is and the smaller categories are collapsed into a new field named 'other.'
94 | #' There are two options for determining the number of bins: \cr
95 | #' 1. Specify the exact number of bins desired (\code{ncat}) \cr
96 | #' 2. Specify the share of the data that will be represented with actual categories before everything else is named 'other' (\code{maxp}) \cr
97 | #' @details It is advisable to use only one of the \code{ncat} OR \code{maxp} parameters. When both are used together, the function
98 | #' stops at whichever criterion is met first, i.e. whichever yields the smaller number of bins. \cr
99 | #' Behavior can be unexpected when \code{setNA=NA} and \code{keepNA=TRUE}: to keep NAs as a standalone category, \code{setNA} must be set to something that is not NA.
100 | #' 
101 | #' @param x vector to bin. It is transformed to a character, so any type is acceptable
102 | #' @param ncat number 0 to 100 (or higher I suppose). Number of bins to collapse data to
103 | #' @param maxp number 0 to 1. Percentage of data that will be represented "as is" before categories are collapsed to "other"
104 | #' @param results logical \code{TRUE} or \code{FALSE}. Prints a frequency table of the new categories.
105 | #' @param setNA value to set NAs to. Default is to keep NA. Can be set to a character string to make NAs a category
106 | #' @param keepNA logical. \code{TRUE} keeps NAs as their own category. \code{FALSE} bundles NAs into the 'other' category.
107 | #' @return vector of binned data
108 | #' @export
109 | #' @examples
110 | #' d <- rpois(1000, 20)
111 | #' d[d>26] <- sample(1:26, length(d[d>26]), replace=T)
112 | #' dl <- letters[d]
113 | #' barplot(table(dl))
114 | #' table(binCat(dl, results=F, ncat=5))
115 | #' table(binCat(dl, results=F, maxp=0.5))
116 | #' table(binCat(dl, results=F, maxp=0.9))
117 | #' 
118 | #' ## With missings
119 | #' ff <- sample(letters[1:15], 100, replace=T)
120 | #' ff[sample(100, 10)] <- NA
121 | #' binCat(ff, ncat=7, setNA='missing')
122 | 
123 | 
124 | binCat <- function(x, ncat=NULL, maxp=NULL, results=F, setNA=NA, keepNA=F) {
125 | if(is.null(maxp)==F & is.null(ncat)==F) warning("Parameters 'ncat' and 'maxp' are both specified. It is advisable to only specify one of these criteria.
Algorithm will stop at the first criterion met.")
126 | if(is.na(setNA)==F) x[is.na(x)] <- setNA
127 | 
128 | ncat <- min(ncat, length(unique(x)))
129 | x <- as.character(x)
130 | n <- length(x)
131 | if(is.null(maxp)) maxp <- 1
132 | 
133 | for(i in 1:length(unique(x))){
134 | xc <- x
135 | x1 <- sort(table(xc, exclude=NULL), decreasing=T)[1:i]
136 | catp <- sum(x1)/n # share of the data covered by the top i categories
137 | if(i==ncat | catp>maxp) {
138 | x2 <- sort(table(xc, exclude=NULL), decreasing=T)[1:(i+1)]
139 | if(keepNA==T) {xc[which(!xc %in% c(names(x2), setNA))] <- 'other'
140 | } else {xc[which(!xc %in% names(x2))] <- 'other'}
141 | returnser <- xc
142 | break
143 | }
144 | }
145 | if(results==T) print(sort(table(returnser)/n, decreasing=T))
146 | return(returnser)
147 | }
148 | 
149 | 
150 | 
151 | 
152 | 
--------------------------------------------------------------------------------
/draft/horseRaceModel.R:
--------------------------------------------------------------------------------
1 | #' @title Horse race model comparison
2 | #' @description This function performs repeated sub-sampling cross-validation using 3 different models and
3 | #' saves the AUC and lift for each model at each iteration. Models used are Random Forest, LASSO Regression, logistic regression (using
4 | #' LASSO for variable selection). An ensemble of the LASSO and Random Forest is also included.
5 | #' @param x data.frame or data.table of predictor variables
6 | #' @param y vector target variable, binary 0 or 1.
7 | #' @param n number, number of iterations
8 | #' @param trainPct numeric scalar. percent of data to be used for training at each iteration.
9 | #' @param liftQuantile numeric scalar. quantile to be used for assessing lift.
10 | #' @return horseRace S3 object.
11 | #' @export
12 | #' @examples
13 | #' \dontrun{
14 | #' ## with a predictor data.frame df and a 0/1 target ybin (see the worked example at the bottom of this file):
15 | #' hr <- horseRaceModel(x=df, y=ybin, n=8, trainPct=.75, liftQuantile=.1) }
16 | 
17 | library('glmnet')
18 | library('randomForest')
19 | library('Rsenal')
20 | library('pROC')
21 | 
22 | 
23 | horseRaceModel <- function(x, y, n=10, trainPct=0.75, liftQuantile=.04) {
24 | 
25 | if(class(y) %in% c('character', 'factor')) yn <- as.numeric(as.character(y)) else yn <- y
26 | if(max(yn)>1 | min(yn)<0) stop('y must be between 0 and 1')
27 | 
28 | pb <- txtProgressBar(min=0, max=n, style=3, width=n)
29 | 
30 | # setting up return object
31 | ret <- replicate(2, data.frame(rf=rep(NA, n), lasso=rep(NA, n), glm=rep(NA, n), ensemble=rep(NA,n)), simplify=F)
32 | names(ret) <- c('lift', 'auc')
33 | ret[['lasso_coef']] <- NULL
34 | 
35 | for(i in 1:n) {
36 | itrain <- sample(1:nrow(x), round(nrow(x)*trainPct))
37 | itest <- setdiff(1:nrow(x), itrain)
38 | xtr <- x[itrain,]
39 | xte <- x[itest,]
40 | ytr <- y[itrain]; yntr <- yn[itrain]
41 | yte <- y[itest]; ynte <- yn[itest]
42 | 
43 | # if we have factor variables, dummy up columns for LASSO
44 | if(length(setdiff(sapply(xtr, class), c('numeric', 'integer')))>0) {
45 | xtr_d <- model.matrix(~.-1, xtr)
46 | xte_d <- model.matrix(~.-1, xte)
47 | colnames(xtr_d) <- gsub('\\-', '\\_', colnames(xtr_d))
48 | colnames(xte_d) <- gsub('\\-', '\\_', colnames(xte_d))
49 | facvars <- names(attr(xtr_d, "contrasts"))
50 | } else { xtr_d <- as.matrix(xtr); xte_d <- as.matrix(xte) } # all-numeric case: nothing to dummy
51 | 
52 | ## LASSO
53 | fitlasso <- glmnet(x=as.matrix(xtr_d), y=yntr, alpha=1, family='binomial')
54 | cvLasso <- cv.glmnet(x=as.matrix(xtr_d), y=yntr, alpha=1, family='binomial', type.measure='deviance')
55 | predLasso <- predict(fitlasso, s=cvLasso$lambda.min, newx=as.matrix(xte_d), type='response')[,1]
56 | pqLasso <- predQuantile(ytest=ynte, testPred=predLasso, n=round(1/liftQuantile))
57 | glmvars <- lassoDummyVarsToFactors(fitlasso=fitlasso, s=cvLasso$lambda.min, contrasts=names(attr(xtr_d, 'contrasts')), xtr=xtr) # map LASSO-selected dummies back to their root factor names for the GLM
58 | 
59 | ## Logistic
60 | df4glm <- data.frame(y=ytr, data.frame(xtr)[, glmvars, drop=FALSE])
61 | fitglm <- glm(makeForm('y', intersect(colnames(xtr), glmvars)), data=df4glm, family=binomial(logit))
62 | predGlm <- predict(fitglm, newdata=xte, type='response')
63 | pqGlm <- predQuantile(ytest=ynte, testPred=predGlm, n=round(1/liftQuantile))
64 | 
65 | ## RANDOM FOREST
66 | rf <- randomForest(x=xtr, y=factor(ytr), ntree=1000)
67 | predRf <- predict(rf, newdata=xte, type='prob')[,2]
68 | pqRf <- predQuantile(ytest=ynte, testPred=predRf, n=round(1/liftQuantile))
69 | 
70 | ## ENSEMBLE
71 | predRfLasso <- (predRf+predLasso)/2 # simple average of the two model scores
72 | pqRfLasso <- predQuantile(ytest=ynte, testPred=predRfLasso, n=round(1/liftQuantile))
73 | 
74 | ## Saving Results
75 | ret$lift[i, 'rf'] <- pqRf$cumLift[1]
76 | ret$lift[i, 'lasso'] <- pqLasso$cumLift[1]
77 | ret$lift[i, 'glm'] <- pqGlm$cumLift[1]
78 | ret$lift[i, 'ensemble'] <- pqRfLasso$cumLift[1]
79 | ret$lift[i, 'iteration'] <- i
80 | 
81 | ret$auc[i, 'rf'] <- glmnet::auc(y=ynte, prob=predRf)
82 | ret$auc[i, 'lasso'] <- glmnet::auc(y=ynte, prob=predLasso)
83 | ret$auc[i, 'glm'] <- glmnet::auc(y=ynte, prob=predGlm)
84 | ret$auc[i, 'ensemble'] <- glmnet::auc(y=ynte, prob=predRfLasso)
85 | ret$auc[i, 'iteration'] <- i
86 | 
87 | if(i==1) {
88 | ret$lasso_coef <- as.matrix(coef(fitlasso, s=cvLasso$lambda.min))
89 | } else {
90 | ret$lasso_coef <- cbind(ret$lasso_coef, as.matrix(coef(fitlasso, s=cvLasso$lambda.min)))
91 | }
92 | 
93 | setTxtProgressBar(pb, i)
94 | }
95 | 
96 | ret$lasso_coef <- as(ret$lasso_coef, 'dgCMatrix')
97 | 
98 | class(ret) <- 'horseRace'
99 | attr(ret, 'n') <- n
100 | 
101 | return(ret)
102 | }
103 | 
104 | ######################
105 | ## Helper Functions ##
106 | ######################
107 | ## NEEDS TO HANDLE ORDERED FACTORS (LIKE EDUCATION)
108 | 
109 | lassoDummyVarsToFactors <- function(fitlasso, s, contrasts, xtr){
110 | # grabbing the dummied variables chosen by lasso regression
111 | ltmp <- coef(fitlasso, s=s)
112 | lassoVars <- ltmp@Dimnames[[1]][2:ltmp@Dim[1]][ltmp@i] # names of the non-zero coefficients (the intercept drops out via the 0-based @i index)
113 | 
114 | # create mapping between factor variable (root) and its dummied variable names
115 | if(length(contrasts)>0) {
116 | rl <- list()
117 | for(i in contrasts) rl[[i]] <- paste(i, gsub('\\-', '\\_', unique(xtr[,i])), sep='')
118 | }
119 | 
120 | # creating list of just root (factor) variables for GLM
121 | ret <- c()
122 | for(i in lassoVars){
123 | if(!i %in% names(xtr)) {
124 | addvar <- names(rl)[[grep(i, rl)]]
125 | if(addvar %in% ret) next
126 | ret <- c(ret, addvar)
127 | } else {
128 | ret <- c(ret, i)
129 | }
130 | }
131 | return(ret)
132 | }
133 | 
134 | 
135 | #lassoDummyVarsToFactors(fitlasso, s=cvLasso$lambda.min, contrasts=names(attr(xtr_d, 'contrasts')), xtr=xtr)
136 | 
137 | 
138 | ######################
139 | ## Methods ###########
140 | ######################
141 | 
142 | 
143 | plot.horseRace <- function(object, measure){
144 | if (!inherits(object, "horseRace"))
145 | stop("object not of class horseRace")
146 | if (is.null(object[[measure]]))
147 | stop("Measure argument not recognized")
148 | 
149 | ggdf <- data.table(melt(object[[measure]], id='iteration'))
150 | setnames(ggdf, 'variable', 'model')
151 | ggplot(data=ggdf, aes(x=iteration, y=value, group=model, color=model)) + geom_line() +
152 | geom_point(size=4, shape=19) + ylab(measure) + ggtitle(measure) + theme_bw()
153 | }
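## Usage sketch (hedged; assumes a fitted `hr` object from horseRaceModel(), as in the example at the bottom of this file):
## plot(hr, measure='auc')   # per-iteration AUC for each model
## plot(hr, measure='lift')  # per-iteration lift for each model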
154 | 
155 | summary.horseRace <- function(object){
156 | cat('AUC: \n')
157 | dt <- data.frame(data.table(object[['auc']])[,.(rf, lasso, glm, ensemble)][,lapply(.SD, function(x) quantile(x)[2:4])])
158 | row.names(dt) <- c('25%', '50%', '75%')
159 | print(dt)
160 | 
161 | cat('\n')
162 | cat('Lift: \n')
163 | dt <- data.frame(data.table(object[['lift']])[,.(rf, lasso, glm, ensemble)][,lapply(.SD, function(x) quantile(x)[2:4])])
164 | row.names(dt) <- c('25%', '50%', '75%')
165 | print(dt)
166 | 
167 | cat('\n')
168 | cat('variables chosen by LASSO: \n')
169 | lassoB <- apply(as(object$lasso_coef, 'matrix'), 1, function(x) sum(x>0))
170 | lassoB <- lassoB[setdiff(names(lassoB), '(Intercept)')]
171 | data.frame(N=lassoB[order(lassoB, decreasing=T)])
172 | }
173 | 
174 | 
175 | #' @examples
176 | #' \dontrun{
177 | library('arules') # for grabbing AdultUCI dataset
178 | library('data.table')
179 | library('ggplot2')
180 | data(AdultUCI)
181 | df <- AdultUCI[1:1000,]
182 | df <- df[complete.cases(df),]
183 | names(df) <- gsub('\\-', '_', names(df)) # dashes mess things up
184 | # still need to handle categorical variables
185 | hr <- horseRaceModel(x=df[,c('age', 'capital_gain', 'capital_loss', 'hours_per_week',
186 | 'race', 'workclass', 'marital_status', 'occupation')],
187 | y=as.numeric(df$sex)-1, n=8, trainPct=.75, liftQuantile=.1)
188 | 
189 | plot(hr, measure='lift')
190 | summary(hr)
191 | 
192 | #setdiff(names(df), 'sex')
193 | #c('capital_gain', 'hours_per_week','age')
194 | 
195 | 
196 | #' }
197 | 
198 | 
199 | 
200 | 
201 | 
--------------------------------------------------------------------------------
/R/igraph_network.R:
--------------------------------------------------------------------------------
1 | #' @title Get 1 degree away vertexes
2 | #' 
3 | #' @description Returns a vector of all vertex names that touch (going IN or OUT) the
4 | #' specified vertex list.
5 | #' 
6 | #' @param g igraph object
7 | #' @param vlist list of starting node/vertex names
8 | #' @param mode 'out' or 'in'. 'out' returns all the vertex/node names 1 degree out from \code{vlist}. 'in' returns all
9 | #' vertex/node names directed into vlist nodes.
10 | #' @return vector of node names that are 1 degree away from \code{vlist}
11 | #' @import igraph
12 | #' @export
13 | #' @examples
14 | #' ## build sample network
15 | #' from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O')
16 | #' to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P')
17 | #' relations<-cbind(from,to)
18 | #' g <- graph.data.frame(relations)
19 | #' tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2)
20 | #' V(g)$tier <- tiers
21 | #' 
22 | #' getV1fromVlist(g, c('A', 'D'), 'out')
23 | 
24 | getV1fromVlist <- function(g, vlist, mode='out'){
25 | if(!mode %in% c('out', 'in', 'all')) stop("mode argument must be equal to 'in', 'out' or 'all'")
26 | eidList <- array()
27 | for (i in vlist) {
28 | eidList <- c(eidList, igraph::incident(g, i, mode))
29 | }
30 | eidList <- unique(eidList[!is.na(eidList)])
31 | mode2 <- ifelse(mode=='out', 'to', 'from')
32 | v <- getVfromE(g, eidList, mode2)
33 | return(v)
34 | }
35 | 
36 | 
37 | #' @title Get vertex names from edge IDs
38 | #' @description Returns the vertex names associated with the specified edge IDs (FROM or TO)
39 | #' @param g igraph object to mine vertex names from
40 | #' @param eid edge IDs referenced by a numeric vector or the edge object themselves
41 | #' @param mode "to" or "from".
Indicates whether to return the vertex names at the head ("to") or the tail ("from") of each edge.
42 | #' @param unique \code{TRUE} or \code{FALSE}. TRUE simply removes duplicate vertex names and returns a unique list.
43 | #' @return vertex names associated with the specified edge IDs (FROM or TO)
44 | #' @import igraph
45 | #' @export
46 | #' @examples
47 | #' require('igraph')
48 | #' 
49 | #' ## build sample network
50 | #' from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O')
51 | #' to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P')
52 | #' relations<-cbind(from,to)
53 | #' g <- graph.data.frame(relations)
54 | #' tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2)
55 | #' V(g)$tier <- tiers
56 | #' 
57 | #' getVfromE(g, E(g)[1:5])
58 | #' getVfromE(g, 1:5)
59 | 
60 | getVfromE <- function(g, eid, mode='to', unique=T) {
61 | eid <- unique(eid[!is.na(eid)])
62 | df <- get.edgelist(subgraph.edges(g, eid))
63 | col <- ifelse(mode=='from', 1, 2)
64 | retdf <- df[,col]
65 | if(unique==T) retdf <- unique(df[,col])
66 | return(retdf)
67 | }
68 | 
69 | 
70 | 
71 | #' @title Get edge IDs from vertices
72 | #' @description returns the edge IDs that go OUT or IN from the specified list of vertices
73 | #' @param g igraph object
74 | #' @param vlist list of node/vertex names
75 | #' @param mode character string: 'in', 'out' or 'all'
76 | #' @return edge IDs that go OUT or IN from the specified list of vertices
77 | #' @import igraph
78 | #' @export
79 | #' @examples
80 | #' require('igraph')
81 | #' 
82 | #' ## build sample network
83 | #' from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O')
84 | #' to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P')
85 | #' relations<-cbind(from,to)
86 | #' g <- graph.data.frame(relations)
87 | #' tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2)
88 | #' V(g)$tier <- tiers
89 | #' 
90 | #' getEfromVlist(g, c('A', 'D'), 'out')
91 | 
92 | getEfromVlist <- function(g, vlist, mode='out'){
93 | eidList <- array()
94 | for (i in vlist) {
95 | eidList <- c(eidList, igraph::incident(g, i, mode))
96 | }
97 | eidList <- unique(eidList[!is.na(eidList)])
98 | return(eidList)
99 | }
100 | 
101 | #' @title Prune edges of igraph
102 | #' @description This function takes a very connected network graph and prunes the edges down
103 | #' so that it focuses on depth from the root node to the end nodes (inter-connective
104 | #' edges are deleted). \cr \cr
105 | #' 1. Find all nodes 1 step out from root. These edges must stay. \cr
106 | #' 2. Find all nodes 2 steps from root that have more than one edge in.
107 | #' Keep only one edge (the one that leads to the shortest path back to the root).\cr
108 | #' 3. Repeat for the nodes one more degree away from root.
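#' For example, in the sample network used in the examples below, node \code{G} has incoming
#' edges from both \code{C} and \code{F}; because \code{C} is closer to the root \code{A}
#' (one step vs. two), the edge from \code{C} to \code{G} is kept and the edge from \code{F} is deleted.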
109 | #' @param g igraph object
110 | #' @param root character string: name of root node
111 | #' @return igraph object, pruned subgraph of \code{g}
112 | #' @import igraph
113 | #' @export
114 | #' @examples
115 | #' require('igraph')
116 | #' 
117 | #' ## build sample network
118 | #' from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O')
119 | #' to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P')
120 | #' relations<-cbind(from,to)
121 | #' g <- graph.data.frame(relations)
122 | #' tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2)
123 | #' V(g)$tier <- tiers
124 | #' 
125 | #' pruneEdge(g, 'A')
126 | 
127 | pruneEdge <- function(g, root){
128 | gF <- g
129 | d <- degree(gF, mode='in')
130 | 
131 | nhood <- V(g)[neighbors(gF, root, 'out')]$name
132 | nhood <- c(nhood, root)
133 | d <- d[!names(d) %in% nhood] # exclude root and its 1-step neighborhood
134 | dP <- names(d[d>1]) # nodes with more than one incoming edge
135 | 
136 | for(v in dP){
137 | spv <- shortest.paths(gF, v, root)
138 | eid <- getEfromVlist(gF, v, 'in')
139 | 
140 | df <- t(shortest.paths(gF, root, getVfromE(gF, eid, 'from'), 'out'))
141 | df <- df[order(df[,1]),1]
142 | shortPv <- names(df)[1] # source node closest to the root
143 | eidKeep <- as.numeric(E(gF)[shortPv %->% v])
144 | 
145 | if(length(eid)>1){
146 | eidDel <- eid[eid!=eidKeep]
147 | gF <- delete.edges(gF, eidDel)
148 | }
149 | }
150 | return(gF)
151 | }
152 | 
153 | #' @title Find all nodes connected to root node
154 | #' @description Returns a subset of the original graph (all edges and vertices that a
155 | #' directed path can reach from the root node). \code{orderN} limits the growth of these paths.
156 | #' @param g igraph object
157 | #' @param root character string: name of root node
158 | #' @param orderN number: # of degrees away from root node to search.
159 | #' @seealso Simpler version: \code{\link{travCount}}
160 | #' @return igraph object, subgraph of \code{g}
161 | #' @import igraph
162 | #' @export
163 | #' @examples
164 | #' require('igraph')
165 | #' 
166 | #' ## build sample network
167 | #' from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O')
168 | #' to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P')
169 | #' relations<-cbind(from,to)
170 | #' g <- graph.data.frame(relations)
171 | #' tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2)
172 | #' V(g)$tier <- tiers
173 | #' 
174 | #' plot(g) ## full network
175 | #' plot(travOut(g, 'D')) ## sub network
176 | 
177 | travOut <- function(g, root, orderN=-1) {
178 | eid <- incident(g, root, 'out')
179 | noEdgesTest <- identical(eid, numeric(0)) # TRUE when the root has no outgoing edges
180 | 
181 | if(noEdgesTest==F){
182 | v <- getVfromE(g, eid, 'to')
183 | 
184 | eidKids <- array()
185 | v2<-v
186 | spv <- shortest.paths(g, root)
187 | sp <- max(spv[spv!=Inf]) # deepest reachable node sets the default search depth
188 | n <- ifelse(orderN==-1, sp, orderN)
189 | for(i in 1:n) {
190 | eidIter <- getEfromVlist(g, v2)
191 | v2 <- getV1fromVlist(g, v2, 'out')
192 | eidKids <- c(eidKids, eidIter)
193 | }
194 | 
195 | reteid <- unique(c(eid, eidKids))
196 | reteid <- reteid[!is.na(reteid)]
197 | ret <- subgraph.edges(g, reteid)
198 | 
199 | } else {ret <- graph.neighborhood(g, 1, root, 'out')[[1]]}
200 | if(noEdgesTest==F & orderN==1) ret <- subgraph.edges(g, eid)
201 | return(ret)
202 | }
203 | 
204 | 
205 | 
206 | #' @title Count # of nodes connected to root node
207 | #' @description Count doesn't include root node. Simpler version of \code{\link{travOut}}.
#' Might be faster? Developed for a different project.
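#' For instance, in the sample network built in the examples below, \code{travCount(g, 'B')}
#' returns 7: the nodes D, E, H, I, J, K and M are all reachable from \code{B}.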
208 | #' @param g igraph object
209 | #' @param root character string: name of root node
210 | #' @param orderN number, # of degrees away from root node to search. Default is -1 which searches all degrees
211 | #' @param vmode character string: 'out', 'in' or 'all', determines how to subgraph \code{g} from \code{root}. Default is 'out'.
212 | #' @seealso \code{\link{travOut}}
213 | #' @return number, count of nodes connected to root node.
214 | #' @import igraph
215 | #' @export
216 | #' @examples
217 | #' require('igraph')
218 | #' 
219 | #' ## build sample network
220 | #' from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O')
221 | #' to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P')
222 | #' relations<-cbind(from,to)
223 | #' g <- graph.data.frame(relations)
224 | #' tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2)
225 | #' V(g)$tier <- tiers
226 | #' 
227 | #' plot(g)
228 | #' travCount(g, 'B')
229 | #' travCount(g, 'L', vmode='out')
230 | 
231 | travCount <- function(g, root, orderN=-1, vmode='out') {
232 | #eid <- which(!E(g)$type %in% vtype)
233 | #g <- subgraph.edges(g, eid, delete.vertices=T)
234 | 
235 | vb <- root
236 | v <- getV1fromVlist(g, root, vmode)
237 | vv <- root
238 | if(orderN==-1){
239 | spv <- igraph::shortest.paths(g, root)
240 | sp <- max(spv[spv!=Inf])
241 | n <- sp
242 | } else {n <- orderN}
243 | for(i in 1:n) {
244 | vtmp <- getV1fromVlist(g, vb, vmode)
245 | v <- unique(c(v, vtmp))
246 | vb <- vtmp
247 | }
248 | return(length(v))
249 | }
250 | 
251 | 
252 | 
253 | 
254 | 
255 | 
--------------------------------------------------------------------------------
/R/shiny_arules.R:
--------------------------------------------------------------------------------
1 | #' @title Association Rules Visualization Shiny App
2 | #' @description Launches a Shiny App that provides an interactive interface to the visualizations of the \code{arulesViz} package.
3 | #' The app allows users to mine rules based on all or just subsets of features, sort by criteria (lift, support, confidence) and visualize
4 | #' using network graph, grouped bubble and scatter plots. \cr
5 | #' Users can filter rules to target only those with a certain variable on the RHS or LHS of the rule.
6 | #' Rule mining is computed using the \link{apriori} algorithm from \code{arules}.
7 | #' 
8 | #' @param dataset data.frame, this is the dataset that association rules will be mined from. Each row is treated as a transaction. Seems to work
9 | #' OK when the S4 transactions class from \code{arules} is used; however, this is not thoroughly tested.
10 | #' @param bin logical, \code{TRUE} will automatically discretize/bin numerical data into categorical features that can be used for association analysis.
11 | #' @param vars integer, how many variables to include in initial rule mining
12 | #' @param supp numeric, the support parameter for initializing visualization. Useful when it is known that a high support is needed to not crash computationally.
13 | #' @param conf numeric, the confidence parameter for initializing visualization. Similarly useful when it is known that a high confidence is needed to not crash computationally.
14 | #' @seealso \code{arulesViz}, \code{arules}
15 | #' @return Shiny App
16 | #' @import shiny arulesViz arules
17 | #' @export
18 | #' 
19 | #' @examples
20 | #' ## creating some data
21 | #' n <- 10000 # of obs
22 | #' d <- data.frame(
23 | #' eye = sample(c('brown', 'green', 'blue', 'hazel'), n, replace=T),
24 | #' gender = sample(c('male', 'female'), n, replace=T),
25 | #' height = sort(sample(c('dwarf', 'short', 'average', 'above average', 'giant'), n, replace=T)),
26 | #' wealth = sort(sample(c('poor', 'struggling', 'middle', 'uppermiddle', 'comfortable', 'rich', '1%', 'millionaire', 'billionaire'), n, replace=T)),
27 | #' favoriteAnimal = sample(c('dog', 'cat', 'bat', 'frog', 'lion', 'cheetah', 'lion', 'walrus', 'squirrel'), n, replace=T),
28 | #' numkids = abs(round(rnorm(n, 2, 1)))
29 | #' )
30 | #' 
31 | #' ## adding some pattern
32 | #' d$numkids[d$gender=='male'] <- d$numkids[d$gender=='male'] + sample(0:3, sum(d$gender=='male'), replace=T)
33 | #' d$numkids <- factor(d$numkids)
34 | #' 
35 | #' ## calling Shiny App to visualize association rules
36 | #' arulesApp(d)
37 | 
38 | arulesApp <- function (dataset, bin=T, vars=5, supp=0.1, conf=0.5) {
39 | 
40 | ## binning numeric data
41 | for(i in 1:ncol(dataset)) {
42 | if(bin==T & class(dataset[,i]) %in% c('numeric', 'integer')) dataset[,i] <- Rsenal::depthbin(dataset[,i], nbins=10)
43 | }
44 | 
45 | ## calling Shiny App
46 | shinyApp(ui = shinyUI(pageWithSidebar(
47 | 
48 | headerPanel("Association Rules"),
49 | 
50 | sidebarPanel(
51 | 
52 | conditionalPanel(
53 | condition = "input.samp=='Sample'",
54 | numericInput("nrule", 'Number of Rules', 5), br()
55 | ),
56 | 
57 | conditionalPanel(
58 | condition = "input.mytab=='graph'",
59 | radioButtons('graphType', label='Graph Type', choices=c('itemsets','items'), inline=T), br()
60 | ),
61 | 
62 | conditionalPanel(
63 | condition = "input.lhsv=='Subset'",
64 | uiOutput("choose_lhs"), br()
65 | ),
66 | 
67 | conditionalPanel(
68 | condition = "input.rhsv=='Subset'",
69 | uiOutput("choose_rhs"), br()
70 | ),
71 | 
72 | conditionalPanel(
73 | condition = "input.mytab=='grouped'",
74 | sliderInput('k', label='Choose # of rule clusters', min=1, max=150, step=1, value=15), br()
75 | ),
76 | 
77 | conditionalPanel(
78 | condition = "input.mytab %in% c('grouped', 'graph', 'table', 'datatable', 'scatter', 'paracoord', 'matrix', 'itemFreq')",
79 | radioButtons('samp', label='Sample', choices=c('All Rules', 'Sample'), inline=T), br(),
80 | uiOutput("choose_columns"), br(),
81 | sliderInput("supp", "Support:", min = 0, max = 1, value = supp , step = 1/10000), br(),
82 | sliderInput("conf", "Confidence:", min = 0, max = 1, value = conf , step = 1/10000), br(),
83 | selectInput('sort', label='Sorting Criteria:', choices = c('lift', 'confidence', 'support')), br(), br(),
84 | numericInput("minL", "Min. items per set:", 2), br(),
85 | numericInput("maxL", "Max. items per set:", 3), br(),
86 | radioButtons('lhsv', label='LHS variables', choices=c('All', 'Subset')), br(),
87 | radioButtons('rhsv', label='RHS variables', choices=c('All', 'Subset')), br(),
88 | downloadButton('downloadData', 'Download Rules as CSV')
89 | )
90 | 
91 | ),
92 | 
93 | mainPanel(
94 | tabsetPanel(id='mytab',
95 | tabPanel('Grouped', value='grouped', plotOutput("groupedPlot", width='100%', height='100%')),
96 | tabPanel('Graph', value='graph', plotOutput("graphPlot", width='100%', height='100%')),
97 | tabPanel('Scatter', value='scatter', plotOutput("scatterPlot", width='100%', height='100%')),
98 | tabPanel('Parallel Coordinates', value='paracoord', plotOutput("paracoordPlot", width='100%', height='100%')),
99 | tabPanel('Matrix', value='matrix', plotOutput("matrixPlot", width='100%', height='100%')),
100 | tabPanel('ItemFreq', value='itemFreq', plotOutput("itemFreqPlot", width='100%', height='100%')),
101 | tabPanel('Table', value='table', verbatimTextOutput("rulesTable")),
102 | tabPanel('Data Table', value='datatable', dataTableOutput("rulesDataTable"))
103 | )
104 | )
105 | 
106 | )),
107 | 
108 | server = function(input, output) {
109 | 
110 | output$choose_columns <- renderUI({
111 | checkboxGroupInput("cols", "Choose variables:",
112 | choices = colnames(dataset),
113 | selected = colnames(dataset)[1:vars])
114 | })
115 | 
116 | 
117 | output$choose_lhs <- renderUI({
118 | checkboxGroupInput("colsLHS", "Choose LHS variables:",
119 | choices = input$cols,
120 | selected = input$cols[1])
121 | })
122 | 
123 | output$choose_rhs <- renderUI({
124 | checkboxGroupInput("colsRHS", "Choose RHS variables:",
125 | choices = input$cols,
126 | selected = input$cols[1])
127 | })
128 | 
129 | ## Extracting and Defining arules
130 | rules <- reactive({
131 | tr <- as(dataset[,input$cols], 'transactions')
132 | arAll <- apriori(tr, parameter=list(support=input$supp, confidence=input$conf, minlen=input$minL, maxlen=input$maxL))
133 | 
134 | if(input$rhsv=='Subset' & input$lhsv!='Subset'){
135 | varsR <- character()
136 | for(i in 1:length(input$colsRHS)){
137 | tmp <- with(dataset, paste(input$colsRHS[i], '=', levels(as.factor(get(input$colsRHS[i]))), sep=''))
138 | varsR <- c(varsR, tmp)
139 | }
140 | ar <- subset(arAll, subset=rhs %in% varsR)
141 | 
142 | } else if(input$lhsv=='Subset' & input$rhsv!='Subset') {
143 | varsL <- character()
144 | for(i in 1:length(input$colsLHS)){
145 | tmp <- with(dataset, paste(input$colsLHS[i], '=', levels(as.factor(get(input$colsLHS[i]))), sep=''))
146 | varsL <- c(varsL, tmp)
147 | }
148 | ar <- subset(arAll, subset=lhs %in% varsL)
149 | 
150 | } else if(input$lhsv=='Subset' & input$rhsv=='Subset') {
151 | varsL <- character()
152 | for(i in 1:length(input$colsLHS)){
153 | tmp <- with(dataset, paste(input$colsLHS[i], '=', levels(as.factor(get(input$colsLHS[i]))), sep=''))
154 | varsL <- c(varsL, tmp)
155 | }
156 | varsR <- character()
157 | for(i in 1:length(input$colsRHS)){
158 | tmp <- with(dataset, paste(input$colsRHS[i], '=', levels(as.factor(get(input$colsRHS[i]))), sep=''))
159 | varsR <- c(varsR, tmp)
160 | }
161 | ar <- subset(arAll, subset=lhs %in% varsL & rhs %in% varsR)
162 | 
163 | } else {
164 | ar <- arAll
165 | }
166 | quality(ar)$conviction <- interestMeasure(ar, 'conviction', transactions=tr) # append extra rule-quality measures
167 | quality(ar)$hyperConfidence <- interestMeasure(ar, 'hyperConfidence', transactions=tr)
168 | quality(ar)$cosine <- interestMeasure(ar, 'cosine', transactions=tr)
169 | quality(ar)$chiSquare <- interestMeasure(ar, 'chiSquare', transactions=tr)
170 | quality(ar)$coverage <- interestMeasure(ar, 'coverage', transactions=tr)
171 | quality(ar)$doc <- interestMeasure(ar, 'doc', transactions=tr)
172 | quality(ar)$gini <- interestMeasure(ar, 'gini', transactions=tr)
173 | quality(ar)$hyperLift <- interestMeasure(ar, 'hyperLift', transactions=tr)
174 | ar
175 | })
176 | 
177 | # Rule length
178 | nR <- reactive({
179 | nRule <- ifelse(input$samp == 'All Rules', length(rules()), input$nrule)
180 | })
181 | 
182 | ## Grouped Plot #########################
183 | output$groupedPlot <- renderPlot({
184 | ar <- rules()
185 | plot(sort(ar, by=input$sort)[1:nR()], method='grouped', control=list(k=input$k))
186 | }, height=800, width=800)
187 | 
188 | ## Graph Plot ##########################
189 | output$graphPlot <- renderPlot({
190 | ar <- rules()
191 | plot(sort(ar, by=input$sort)[1:nR()], method='graph', control=list(type=input$graphType))
192 | }, height=800, width=800)
193 | 
194 | ## Scatter Plot ##########################
195 | output$scatterPlot <- renderPlot({
196 | ar <- rules()
197 | plot(sort(ar, by=input$sort)[1:nR()], method='scatterplot')
198 | }, height=800, width=800)
199 | 
200 | ## Parallel Coordinates Plot ###################
201 | output$paracoordPlot <- renderPlot({
202 | ar <- rules()
203 | plot(sort(ar, by=input$sort)[1:nR()], method='paracoord')
204 | }, height=800, width=800)
205 | 
206 | ## Matrix Plot ###################
207 | output$matrixPlot <- renderPlot({
208 | ar <- rules()
209 | plot(sort(ar, by=input$sort)[1:nR()], method='matrix', control=list(reorder=T))
210 | }, height=800, width=800)
211 | 
212 | ## Item Frequency Plot ##########################
213 | output$itemFreqPlot <- renderPlot({
214 | trans <- as(dataset[,input$cols], 'transactions')
215 | itemFrequencyPlot(trans)
216 | }, height=800, width=800)
217 | 
218 | ## Rules Data Table ##########################
219 | output$rulesDataTable <- renderDataTable({
220 | ar <- rules()
221 | rulesdt <- rules2df(ar)
222 | rulesdt
223 | })
224 | 
225 | ## Rules Printed ########################
226 | output$rulesTable <- renderPrint({
227 | # hack to display results... make sure this matches the line above!!
228 | #ar <- apriori(dataset[,input$cols], parameter=list(support=input$supp, confidence=input$conf, minlen=input$minL, maxlen=input$maxL))
229 | ar <- rules()
230 | inspect(sort(ar, by=input$sort))
231 | })
232 | 
233 | ## Download data to csv ########################
234 | output$downloadData <- downloadHandler(
235 | filename = 'arules_data.csv',
236 | content = function(file) {
237 | write.csv(rules2df(rules()), file)
238 | }
239 | )
240 | 
241 | 
242 | }
243 | )
244 | }
245 | 
246 | 
247 | 
248 | 
249 | 
250 | 
251 | 
252 | 
--------------------------------------------------------------------------------
/R/shiny_tableNet.R:
--------------------------------------------------------------------------------
1 | #' @title Shiny app to visualize schema of relational tables
2 | #' @description Launches a Shiny App that provides a visual representation of the relationships between a collection of tables
3 | #' (data.frames) with some relational structure.
If you are given a dump of dozens of flat files without a formal schema or
4 | #' documentation on your data, this app will help explore and understand the underlying schema - which tables can be joined
5 | #' to which other tables, which variables can be used to join which tables, etc. It also gives a read on how strongly each variable with
6 | #' the same name in multiple tables actually connects tables (how many of the values of the linking variable \code{x} that are in
7 | #' \code{table1} are also in \code{table2}). \cr \cr
8 | #' @details Note, the Key-strength tables can be slow to display because these computations actually dive into the contents of the
9 | #' data.frames and perform set operations on every row of the variable of interest. It is possible to use the \code{\link{isKey}} function
10 | #' to compute these similarity matrices ahead of time to prevent the Shiny app from doing these computations each time.
11 | #' @param dfL list of data.frames used to generate schema. This is easily generated from \code{\link{dir2dfList}}
12 | #' @seealso \code{\link{dir2dfList}} \code{\link{isKey}}
13 | #' @return Shiny App
14 | #' @import memoise shiny gplots igraph
15 | #' @export
16 | #' @examples
17 | #' \dontrun{
18 | #' ## download some baseball data. NOTE This will download 30MB of data (25 csv files) into a temporary directory
19 | #' temp <- tempfile()
20 | #' localDataDir <- paste0(tempdir(), '\\lahman2012-csv-onYourComp.zip')
21 | #' download.file('http://seanlahman.com/files/database/lahman2012-csv.zip', localDataDir)
22 | #' unzip(localDataDir, exdir=paste0(tempdir(), '\\lahman2012-csv-onYourComp')) ## may not be necessary
23 | #' 
24 | #' ## create a list of data.frames from .CSVs
25 | #' dfL <- dir2dfList(paste0(tempdir(), '\\lahman2012-csv-onYourComp'), ext='.csv', exclude=NULL, sep=',', stringsAsFactors=F)
26 | #' 
27 | #' ## launch app
28 | #' tableNet(dfL)
29 | #' }
30 | 
31 | 
32 | tableNet <- function(dfL) {
33 | 
34 | ###############################################################################
35 | ## takes list of Data frames and makes igraph network ########################
36 | ###############################################################################
37 | dfL2network <- function(dfL, islands=T){
38 | # create adjacency list
39 | edgeL <- data.frame(v1=as.character(), v2=as.character(), v3=as.character(), stringsAsFactors=F)
40 | k<-1
41 | for(i in 1:length(dfL)){
42 | for(j in 1:length(dfL)){
43 | commonv <- intersect(names(dfL[[i]]), names(dfL[[j]]))
44 | if(length(commonv)>0){
45 | for(cv in 1:length(commonv)){
46 | edgeL[k,1] <- names(dfL)[i]
47 | edgeL[k,2] <- names(dfL)[j]
48 | edgeL[k,3] <- commonv[cv]
49 | k<-k+1
50 | }
51 | }
52 | }
53 | }
54 | 
55 | ## creating graph from adjacency
56 | edgeL <- edgeL[edgeL[,1]!=edgeL[,2],]
57 | g <- graph.data.frame(edgeL[,c(1,2)], directed=F)
58 | if(islands==T) {
59 | xvars <- unlist(lapply(dfL, names),use.names=F)
60 | keys <- unique(xvars[duplicated(xvars)==T])
61 | islandtab <- names(dfL)[sapply(dfL, function(x) sum(names(x) %in% keys)==0)] # tables that share no keys with any other table
62 | g <- g + vertices(islandtab)
63 | }
64 | E(g)$color <- 'gray'
65 | E(g)$name <- edgeL[,3]
66 | 
67 | return(g)
68 | }
69 | 
70 | ## create schema network graph
71 | g <- dfL2network(dfL)
72 | 
73 | ## create memoised version of isKey to cache the key-strength computations
74 | isKeym <- memoise(isKey)
75 | 
76 | #######################################################
77 | ## ACTUALLY RUN APP ###################################
78 | #######################################################
79 | 
80 | shinyApp(ui
= shinyUI(pageWithSidebar( 81 | 82 | headerPanel("Database Schema Visualization"), 83 | 84 | ## NETWORK 85 | sidebarPanel( 86 | conditionalPanel( 87 | condition="input.mytab=='network'", 88 | 89 | sliderInput("vlabcex", "Vertex label size:", value=1, min=.05, max=5, step=0.05), 90 | sliderInput("sizeNodesSlider", "Vertex size", value=10, min=0.1, max=30, step=0.1), 91 | sliderInput("ewidth", "Edge width:", value=1, min=1, max=10, step=1), 92 | selectInput("lay2", "Choose a layout", choices = c('circle', 'fruchterman.reingold')), 93 | selectInput("vcolor", "Color vertices by:", choices = c('# of keys', '# of connections', 'strength of keys')), 94 | checkboxInput(inputId='sizeNodes', label='Size vertices by connections', value=T), 95 | checkboxInput(inputId='curved', label='Curve edges', value=F), 96 | checkboxInput(inputId='islands', label='Remove unconnected Tables', value=T), 97 | radioButtons(inputId='subEdges', label='Select Edges:', choices=c('all', 'some')), 98 | 99 | conditionalPanel("input.subEdges=='some'", 100 | checkboxInput(inputId='delE', label='Delete un-selected edges', value=F), 101 | selectInput(inputId='edgev', label='Variables', choices=sort(unique(E(g)$name)), multiple=T) 102 | ) 103 | 104 | ), 105 | 106 | ## STRENGTH TABLE 107 | conditionalPanel( 108 | condition="input.mytab=='strength'", 109 | checkboxInput(inputId='keyList', label='Optional: Use pre-computed key strength matrix', value=FALSE), 110 | 111 | conditionalPanel("input.keyList==true", 112 | selectInput(inputId='keyListObject', label='Pick pre-computed key strength matrix object', choices=ls(name='.GlobalEnv')) 113 | ), 114 | 115 | selectInput(inputId='key', label='Choose a key', choices=sort(unique(E(g)$name))), 116 | sliderInput("keylab", "Variable labels size", value=1, min=0.1, max=10, step=0.1) 117 | 118 | ), 119 | 120 | 121 | ## KEY-TABLE HEATMAP 122 | conditionalPanel( 123 | condition="input.mytab=='keyTab'", 124 | sliderInput("colLabCex", "Table Label Size", value=1, min=0.1, max=10, step=0.1), 125 | sliderInput("rowLabCex", "Key Label Size", value=1, min=.1, max=10, step=.1), 126 | sliderInput("marginSize", "Margin Size", value=15, min=0, max=100, step=1), 127 | sliderInput("wsize", "width size", value=800, min=0, max=5000, step=50) 128 | ), 129 | 130 | ## ADJACENCY LIST 131 | conditionalPanel( 132 | condition="input.mytab=='adjlist'", 133 | selectInput(inputId='adjcommonvar', label='Choose a key', choices=sort(unique(E(g)$name)), multiple=T) 134 | ) 135 | 136 | 137 | ), 138 | 139 | ## PANELS TO SHOW 140 | mainPanel( 141 | tabsetPanel(id='mytab', 142 | tabPanel('Network', value='network', plotOutput('circle', height='150%')), 143 | tabPanel('Key Strength', value='strength', plotOutput('strengthPlot', height='150%')), 144 | tabPanel('Key-Table Matrix', value='keyTab', plotOutput('keyTabMat', height='150%')), 145 | tabPanel('Adjacency List', value='adjlist', dataTableOutput('adjlisttab')) 146 | ) 147 | ) 148 | 149 | )), 150 | 151 | server = function(input, output) { 152 | 153 | ################################################ 154 | ## NETWORK 155 | ################################################ 156 | 157 | ## DEFINING COMMON VARIABLES 158 | commonv <- unlist(lapply(dfL, names),use.names=F) 159 | commonv <- unique(commonv[duplicated(commonv)==T]) 160 | 161 | output$circle <- renderPlot({ 162 | 163 | ## delete edges 164 | if(input$delE==T){ 165 | eid <- which(E(g)$name %in% input$edgev) 166 | g <- subgraph.edges(g, eids=eid, delete.vertices=T) 167 | } 168 | 169 | ## remove island vertices 170 | 
if(input$islands==T){
171 | islandTabs <- which(degree(g)==0)
172 | g <- delete.vertices(g, v=islandTabs)
173 | }
174 | 
175 | ## setting default graphic parameters
176 | E(g)$width <- input$ewidth
177 | E(g)$color <- 'gray'
178 | 
179 | ## edge colors
180 | if(input$subEdges=='some'){
181 | E(g)$color[E(g)$name %in% input$edgev] <- 'red'
182 | E(g)$width[E(g)$name %in% input$edgev] <- input$ewidth + 3
183 | }
184 | 
185 | ## vertex size
186 | if(input$sizeNodes==T) {
187 | deg <- igraph::degree(g, V(g)$name, mode='total')
188 | V(g)$size <- (deg/mean(deg)*input$sizeNodesSlider) + 5
189 | } else {
190 | V(g)$size <- 5 + input$sizeNodesSlider}
191 | 
192 | ## vertex colors
193 | if(input$vcolor=='# of keys') {
194 | namesL <- lapply(dfL, names)
195 | keysInTab <- unlist(lapply(namesL, function(x) sum((x %in% commonv)==T)))
196 | keysInTab[is.na(keysInTab)] <- 0
197 | dfn <- data.frame(keysInTab)
198 | dfn$vnames <- row.names(dfn)
199 | V(g)$nkeys <- as.numeric(makeVertexAtt(g, df=dfn, vname='keysInTab', by.df='vnames', by.g='name'))
200 | vcolPal <- colorRampPalette(c("white", "purple"))(n = max(V(g)$nkeys)+1)
201 | V(g)$color <- vcolPal[V(g)$nkeys+1]
202 | 
203 | } else if(input$vcolor=='# of connections') {
204 | vcolPal <- colorRampPalette(c("white", "purple"))(n = max(degree(g))+1)
205 | V(g)$color <- vcolPal[degree(g)+1]
206 | }
207 | 
208 | ## layout
209 | lay <- get(paste('layout.', as.character(input$lay2), sep=''))
210 | 
211 | ## ACTUALLY PLOTTING NETWORK
212 | plot(g, vertex.label.cex=input$vlabcex, layout=lay, edge.curved=input$curved, vertex.label.color='black')
213 | }, width=800, height=800)
214 | 
215 | 
216 | ################################################
217 | ## STRENGTH CHART
218 | ################################################
219 | 
220 | output$strengthPlot <- renderPlot({
221 | 
222 | ## defining keyL object
223 | if(input$keyList==TRUE) {keyL <- get(input$keyListObject)[[input$key]]
224 | } else {
225 | keyL <- isKeym(dfL, input$key)
226 | }
227 | 
228 | myPalette <- colorRampPalette(c("white", "firebrick"))(n=20)
229 | checklab <- round(keyL,2)
230 | heatmap.2(keyL, trace='none', dendrogram='none', Rowv=F, Colv=F, margins=c(18,18), col=myPalette,
231 | cellnote=checklab, notecol='black', na.rm=T, cexRow=input$keylab, cexCol=input$keylab)
232 | text(x=.5, y=.9, '1. Take unique values of the \n key variable in each table. \n
233 | 2. Look at the share of these unique \n values from table 1 (right) \n that appear in table 2 (bottom) \n
234 | 3. 
So variables with high row scores are strong keys \n'
235 | , cex=1.2)
236 | 
237 | }, width=800, height=800)
238 | 
239 | 
240 | ################################################
241 | ## STRENGTH TABLE
242 | ################################################
243 | 
244 | plotSize <- reactive({
245 | return(input$wsize)
246 | })
247 | 
248 | output$keyTabMat <- renderPlot({
249 | ## setting up matrix
250 | tabv <- names(dfL)
251 | mat <- matrix(nrow=length(commonv), ncol=length(tabv))
252 | colnames(mat) <- tabv
253 | rownames(mat) <- commonv
254 | 
255 | for(i in 1:nrow(mat)){
256 | colindx <- which(sapply(lapply(dfL, names), function(x) rownames(mat)[i] %in% x))
257 | mat[i, colindx] <- 1
258 | }
259 | mat[is.na(mat)] <- 0
260 | mat <- mat[rev(order(rowSums(mat))),]
261 | 
262 | myPalette <- colorRampPalette(c("white", "firebrick"))(n = 2)
263 | checklab <- mat
264 | heatmap.2(mat, trace='none', dendrogram='none', Rowv=F, Colv=F, key=F,
265 | margins=c(input$marginSize, input$marginSize), lhei = c(0.1,0.9),
266 | main='Linking variables (rows) in Tables (columns)',
267 | col=myPalette, cellnote=checklab, notecol='black', cexRow=input$rowLabCex, cexCol=input$colLabCex)
268 | }, width=plotSize, height=plotSize, units='px')
269 | 
270 | ################################################
271 | ## ADJACENCY LIST
272 | ################################################
273 | output$adjlisttab <- renderDataTable({
274 | df <- get.data.frame(g)[,c('from', 'to', 'name')]
275 | names(df) <- c('table1', 'table2', 'commonVariable')
276 | if(length(input$adjcommonvar)>0) df <- df[df$commonVariable %in% input$adjcommonvar,]
277 | df
278 | })
279 | 
280 | }
281 | )
282 | 
283 | }
284 | 
285 | 
286 | 
287 | #' @title Turn a directory of flat files into a list of data.frames
288 | #' @description Useful to prepare data for \code{\link{tableNet}}
289 | #' @param dfdir character string of the directory to load flat files from
290 | #' @param ext file extension of the files to load. Usually \code{.csv} or \code{.txt}
291 | #' @param exclude character vector of column names to be excluded from each table. Must be set to \code{NULL} or a character
292 | #' vector, or else \code{...} arguments will not be handled properly.
293 | #' @param ... parameters to pass to \code{\link{read.delim}}. Commonly \code{nrows}, \code{sep}.
294 | #' @seealso \code{\link{tableNet}} \code{\link{isKey}}
295 | #' @return list of data.frames
296 | #' @export
297 | #' 
298 | #' @examples
299 | #' \dontrun{
300 | #' ## download some baseball data. NOTE This will download 30MB of data (25 csv files) into a temporary directory
301 | #' temp <- tempfile()
302 | #' localDataDir <- paste0(tempdir(), '\\lahman2012-csv-onYourComp.zip')
303 | #' download.file('http://seanlahman.com/files/database/lahman2012-csv.zip', localDataDir)
304 | #' unzip(localDataDir, exdir=paste0(tempdir(), '\\lahman2012-csv-onYourComp')) ## may not be necessary
305 | #' 
306 | #' ## create a list of data.frames from .CSVs
307 | #' dfL <- dir2dfList(paste0(tempdir(), '\\lahman2012-csv-onYourComp'), ext='.csv', exclude=NULL, sep=',', stringsAsFactors=F)
308 | #' }
309 | 
310 | dir2dfList <- function(dfdir, ext='.txt', exclude=NULL, ...)
{
311 | # get list of files in the directory with the given extension
312 | setwd(dfdir)
313 | tables <- list.files()[sapply(list.files(), function(x) substr(x,nchar(x)-3, nchar(x)))==ext]
314 | tableNames <- sapply(tables, function(x) substr(x,0, nchar(x)-4), USE.NAMES=F)
315 | 
316 | # create list of dfs from directory
317 | dfL <- list()
318 | for(i in 1:length(tables)) {
319 | dfL[[tableNames[i]]] <- read.delim(tables[i], ...)
320 | dfL[[tableNames[i]]] <- dfL[[tableNames[i]]][,!names(dfL[[tableNames[i]]]) %in% exclude]
321 | print(paste(tableNames[i], nrow(dfL[[tableNames[i]]]), Sys.time()))
322 | }
323 | 
324 | return(dfL)
325 | }
326 | 
327 | #' @title Determine strength of linking variables
328 | #' @description This function computes the percentage of unique values of a column \code{x} from \code{table1} that appear
329 | #' in \code{table2}. It is called and computed on the fly in \code{\link{tableNet}}. However, these computations can be
330 | #' slow on large datasets, so it is provided as a standalone function that can be run once to store the output and feed it into the
331 | #' \code{\link{tableNet}} app to prevent repetitive slow computations on the fly.
332 | #' @param dfL list of data.frames. easily generated from \code{\link{dir2dfList}}
333 | #' @param xvar character string, name of the variable to calculate strength for across all tables in \code{dfL}
334 | #'
335 | #' @seealso \code{\link{tableNet}} \code{\link{dir2dfList}}
336 | #' @return matrix of key-strength scores, with one row and column per table containing \code{xvar}
337 | #' @export
338 | #' 
339 | #' @examples
340 | #' \dontrun{
341 | #' ## download some baseball data. NOTE This will download 30MB of data (25 csv files) into a temporary directory
342 | #' temp <- tempfile()
343 | #' localDataDir <- paste0(tempdir(), '\\lahman2012-csv-onYourComp.zip')
344 | #' download.file('http://seanlahman.com/files/database/lahman2012-csv.zip', localDataDir)
345 | #' unzip(localDataDir, exdir=paste0(tempdir(), '\\lahman2012-csv-onYourComp')) ## may not be necessary
346 | #' 
347 | #' ## create a list of data.frames from .CSVs
348 | #' dfL <- dir2dfList(paste0(tempdir(), '\\lahman2012-csv-onYourComp'), ext='.csv', exclude=NULL, sep=',', stringsAsFactors=F)
349 | #' isKey(dfL, 'playerID')
350 | #' }
351 | 
352 | 
353 | isKey <- function(dfL, xvar) {
354 | 
355 | tabNames <- lapply(dfL, names)
356 | tabs <- names(which(lapply(tabNames, function(x) xvar %in% x)==T))
357 | mat <- matrix(nrow=length(tabs), ncol=length(tabs))
358 | ii <- 1;
359 | for(i in tabs){
360 | iivar <- dfL[[i]][,xvar]
361 | jj <- 1
362 | for(j in tabs){
363 | jjvar <- dfL[[j]][,xvar]
364 | 
365 | mat[jj,ii] <- sum(jjvar %in% iivar)/length(jjvar) # share of table j's values found in table i
366 | jj<-jj+1
367 | }
368 | ii<-ii+1
369 | }
370 | 
371 | mat[is.na(mat)] <- 0
372 | colnames(mat) <- tabs
373 | rownames(mat) <- tabs
374 | return(mat)
375 | }
376 | 
377 | 
--------------------------------------------------------------------------------
/R/modeling.R:
--------------------------------------------------------------------------------
1 | #' @title Create formula
2 | #' @description Creates a formula object which can be passed to many R modeling functions from a vector of variable names.
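#' @details If \code{y} arrives as an unevaluated call such as \code{as.factor(am)} (which is what
#' \code{editForm} passes along when the left-hand side of an existing formula is wrapped in \code{as.factor}),
#' it is re-assembled into the string \code{"as.factor(am)"} before the formula is built.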
3 | #' @param y character string, target variable
4 | #' @param xList character vector, predictor variables (Right Hand Side variables) for formula
5 | #' @return formula object
6 | #' @export
7 | #' @examples
8 | #' form <- makeForm('mpg', c('drat', 'wt', 'hp'))
9 | #' summary(lm(form, data=mtcars))
10 | 
11 | makeForm <- function(y, xList) {
12 | if(as.character(y)[1]=='as.factor') y<-paste(paste(as.character(y), collapse='('), ')', sep='') # re-assemble a call like as.factor(am) into the string 'as.factor(am)'
13 | form <- (as.formula(paste(y, '~', paste(xList, collapse='+'), sep='')))
14 | return(form)
15 | }
16 | 
17 | #' @title Edit a formula
18 | #' @description Easy way to add or subtract predictor variables to a formula
19 | #' @param form formula, the formula you will be editing
20 | #' @param sub character vector, predictor variables to subtract (remove) from formula
21 | #' @param add character vector, predictor variables to add to formula
22 | #' @return formula, edited formula object
23 | #' @export
24 | #' @examples
25 | #' form <- as.formula('mpg~cyl+hp+drat+qsec')
26 | #' editForm(form, add=c('wt', 'gear', 'carb'))
27 | #' editForm(form, sub=c('cyl', 'hp', 'qsec', 'variableNotInFormula'))
28 | #' editForm(form, sub=c('cyl', 'hp', 'qsec'), add=c('wt'))
29 | 
30 | editForm <- function(form, add=NULL, sub=NULL){
31 | eform <- form
32 | X <- unlist(strsplit(as.character(form)[[3]], ' +'))
33 | X <- X[X!='+']
34 | Y <- form[[2]]
35 | 
36 | if(is.null(sub)==F){
37 | Xsub <- X[!X %in% sub]
38 | eform <- makeForm(Y,Xsub)
39 | }
40 | 
41 | if(is.null(add)==F){
42 | if('Xsub' %in% ls()) X <- Xsub
43 | Xadd <- c(X, add)
44 | eform <- makeForm(Y,Xadd)
45 | }
46 | 
47 | return(eform)
48 | }
49 | 
50 | shuffleForm <- function(form){ # randomly re-order the RHS variables of a formula
51 | eform <- form
52 | X <- unlist(strsplit(as.character(form)[[3]], ' +'))
53 | X <- X[X!='+']
54 | X <- X[sample(1:length(X), replace=F)]
55 | Y <- form[[2]]
56 | shuffledForm <- makeForm(Y, X)
57 | return(shuffledForm)
58 | }
59 | 
60 | #' @title Variable importance from iteratively shuffled orders of predictor variables
61 | #' @description Runs multiple anova analyses to assess the deviance explained by each predictor under shuffled variable orders, iteratively.
62 | #' Currently using only logistic regression. Could be generalized.
63 | #' @param form formula for the logistic regression model to be fit with \code{\link{glm}}.
64 | #' @param df data.frame of data to be used for analysis
65 | #' @param n number of iterations for shuffled ANOVA analysis
66 | #' @param test character string, statistical test to run. Default is 'Chisq'.
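#' @details Why shuffle? The ANOVA on a glm is sequential, so the deviance credited to each predictor
#' depends on where it appears in the formula; correlated predictors can trade deviance depending on
#' their order. Summarizing the deviance per predictor over \code{n} shuffled orders gives an
#' order-robust importance score.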
#' @title Variable importance from iteratively shuffled orders of predictor variables
#' @description Runs the same ANOVA analysis \code{n} times, shuffling the order of the predictor variables each time,
#' to assess the deviance explained by each predictor regardless of where it enters the model.
#' Currently supports only logistic regression; could be generalized.
#' @param form formula for the logistic regression model
#' @param df data.frame of data to be used for analysis
#' @param n number of iterations for shuffled ANOVA analysis
#' @param test character string, statistical test to run. Default is 'Chisq'.
#' @return data.frame of results
#' @export
#' @examples
#' form <- as.formula('am~wt+gear+carb+cyl+hp+drat+qsec')
#' shuffleAnova(form, mtcars, n=50)

shuffleAnova <- function(form, df, n=5, test='Chisq'){
  # create a list of n anova analyses, each with a shuffled order of x variables
  anovaL <- list()
  for(i in 1:n){
    sform <- shuffleForm(form)
    anovaL[[i]] <- anova(glm(sform, data=df, family=binomial(logit)), test=test)
  }

  # summarize each predictor's deviance and p-value across the n shuffled-order analyses
  pred <- unique(unlist(lapply(anovaL, function(x) unique(rownames(x)))))
  res <- data.frame(xvar=pred)
  for(i in pred){
    dev <- unlist(lapply(anovaL, function(x) x[rownames(x)==i, 2]))
    res$deviance_mean[res$xvar==i] <- mean(dev)
    res$deviance_median[res$xvar==i] <- median(dev)
    res$deviance_sd[res$xvar==i] <- sd(dev)
    res$deviance_min[res$xvar==i] <- min(dev)
    res$deviance_max[res$xvar==i] <- max(dev)
    res$pvalue_median[res$xvar==i] <- median(unlist(lapply(anovaL, function(x) x[rownames(x)==i, 5])), na.rm=T)
  }
  p <- cut(res$pvalue_median, breaks=c(0, .001, .01, .05, .1, 1))
  res$pstar <- factor(x=p, levels=levels(p), labels=c('***', '**', '*', '.', ''))
  res <- res[order(res$deviance_mean, decreasing=T),]
  return(res)
}
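
## Illustrative sketch (wt and hp are arbitrary mtcars predictors; wrapped in
## if (FALSE) so it never runs at package load). In a sequential ANOVA each
## term is credited only with the deviance it explains beyond the terms
## entered before it, so the same predictor earns different deviance in
## different orders -- the motivation for averaging over shuffled orders above.
if (FALSE) {
  anova(glm(am ~ wt + hp, data = mtcars, family = binomial(logit)), test = 'Chisq')
  anova(glm(am ~ hp + wt, data = mtcars, family = binomial(logit)), test = 'Chisq')
}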
#' @title Format logistic regression table
#' @description Turns a logistic regression object into a regression table
#' and optionally writes it out to a csv
#' @param reg logistic regression object, output from \code{\link{glm}}
#' @param file filepath for output. \code{txt} or \code{csv} file.
#' @param xvlab data.frame, lookup table for variable names. First column is the codename, second column is the pretty printed name.
#' @param stats logical, if \code{TRUE} appends model-level statistics (AIC, n, model Chi Sq. and its p-value) to the output
#' @return data.frame of results formatted nicely
#' @export
#' @examples
#' reg <- glm(am~qsec+hp, data=mtcars, family=binomial(logit))
#' logit2tab(reg)
#' longnames <- data.frame(short = c('wt', 'mpg', 'cyl', 'drat', 'hp', 'am', 'qsec'),
#'                         long = c('Weight', 'Miles Per Gallon', 'Cylinder', 'D.R.A.T', 'Horsepower', 'A.M.', 'Q Seconds'))
#' logit2tab(reg, xvlab=longnames, stats=T)

logit2tab <- function(reg, file=NULL, xvlab=NULL, stats=F){
  tab <- summary(reg)$coefficients
  tab <- data.frame(coef=tab[,'Estimate'], oddsRatio=exp(tab[,'Estimate']), stdError=tab[,'Std. Error'], p=tab[,'Pr(>|z|)'], stringsAsFactors=F)
  p <- cut(tab$p, breaks=c(0, .001, .01, .05, .1, 1))
  tab$pstar <- factor(x=p, levels=levels(p), labels=c('***', '**', '*', '.', ''))
  if(is.null(xvlab)==F) {
    tab$Metric <- as.character(xvlab[match(rownames(tab), xvlab[,1]), 2])
    tab$Metric[is.na(tab$Metric)] <- ''
    tab$Metric[rownames(tab)=='(Intercept)'] <- '(Intercept)'
    tab <- tab[, c('Metric', names(tab)[!names(tab) %in% 'Metric'])]
  }
  if(stats==T){
    tab[nrow(tab)+1, ] <- ''; rownames(tab)[nrow(tab)] <- ''
    tab[nrow(tab)+1, 1:2] <- c('aic', reg$aic)
    tab[nrow(tab)+1, 1:2] <- c('n', nrow(reg$model))
    # likelihood-ratio test of the fitted model against the null model
    tab[nrow(tab)+1, 1:2] <- c('Chi Sq.', reg$null.deviance - reg$deviance)
    tab[nrow(tab)+1, 1:2] <- c('P(>|Chi|)', pchisq(reg$null.deviance-reg$deviance, df=reg$df.null-reg$df.residual, lower.tail=F))
    tab[is.na(tab)] <- ''
  }
  # write after the stats block so the csv matches the returned table
  if(is.null(file)==F) write.csv(tab, file=file)
  return(tab)
}

#' @title All-subsets logistic regression
#' @description Plots the \code{BestModels} output of a \code{bestglm} object. Similar to the visual output of plotting a \code{regsubsets} object
#' @param bglm the \code{BestModels} data.frame from a \code{\link{bestglm}} object
#' @param rc decimal places to display on y-axis of plot
#' @return plot
#' @seealso \code{\link{bestglm}}, \code{\link{leaps}}
#' @import bestglm
#' @export
#' @examples
#' require('bestglm')
#' b <- bestglm(Xy=mtcars[,c('mpg', 'hp', 'drat', 'cyl', 'wt', 'qsec', 'vs')], family=binomial(logit), IC='BIC', nvmax=4)
#' plotBestglm(b$BestModels, rc=3)

plotBestglm <- function(bglm, rc=2) {
  # last column of BestModels is the criterion; the rest are inclusion flags
  image(as.matrix(t(bglm[, 1:(ncol(bglm)-1)])), col=c(0:1), xaxt='n', yaxt='n')
  axis(1, at=seq(0, 1, length.out=ncol(bglm)-1), labels=colnames(bglm)[1:(ncol(bglm)-1)], las=2)
  axis(2, at=seq(0, 1, length.out=nrow(bglm)), labels=rev(round(bglm$Criterion, rc)), las=2)
}
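
## Illustrative sketch (mirrors the roxygen example above; wrapped in
## if (FALSE) so it never runs at package load). The 'Chi Sq.' and 'P(>|Chi|)'
## rows appended by logit2tab(stats=T) are the overall likelihood-ratio test
## of the fitted model against the null model, reproducible by hand from any
## glm object.
if (FALSE) {
  reg <- glm(am ~ qsec + hp, data = mtcars, family = binomial(logit))
  chisq <- reg$null.deviance - reg$deviance
  pchisq(chisq, df = reg$df.null - reg$df.residual, lower.tail = FALSE)
}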
#' @title Univariate glm regression
#' @description Runs a univariate logistic regression on each predictor variable of interest.
#' @param df data.frame with variables for analysis
#' @param yv character string, target variable
#' @param xv character vector, predictor variables to test univariately
#' @param file character string, filepath to write results out to. txt or csv file.
#' @param sortby character string, criteria to sort variables by. Default = 'aic'
#' @param xvlab data.frame, lookup table for variable names. First column is the pretty printed name, second column is the codename.
#' @param test logical. If \code{TRUE}, includes a Chi square test for each predictor.
#' @return data.frame of results
#' @seealso \code{\link{bestglm}}, \code{\link{leaps}}
#' @import Hmisc
#' @export
#' @examples
#'
#' require('Hmisc')
#'
#' ## setting up some data
#' longnames <- data.frame(long = c('Weight', 'Miles Per Gallon', 'Cylinder', 'D.R.A.T', 'Horsepower', 'A.M.'),
#'                         short = c('wt', 'mpg', 'cyl', 'drat', 'hp', 'am'), stringsAsFactors=F)
#'
#' glm.out <- uniglm(df=mtcars, yv='vs', xv=c('hp','drat','cyl','mpg','wt'), xvlab=longnames)

uniglm <- function(df, yv, xv, file=NULL, sortby='aic', xvlab=NULL, test=T){
  # preallocate generously: up to 25 coefficient rows per predictor, to accommodate factors
  mat <- data.frame(matrix(nrow=25*length(xv), ncol=12, dimnames=list(NULL, c('Predictor', 'Name', 'fac', 'ref', 'coef', 'oddsRatio', 'p', 'pstar', 'aic', 'c', 'Chisq_pvalue', 'Chisq_pstar'))))
  i <- 0
  for(x in xv){
    formt <- makeForm(yv, x)
    reg <- glm(formt, data=df, family=binomial(logit))
    for(j in 2:length(reg$coefficients)){
      i <- i+1
      try({
        mat$Predictor[i] <- names(reg$coefficients[j])

        ## if we have nice clean labels to replace variable names in code
        if(is.null(xvlab)==F) {mat$Name[i] <- as.character(xvlab[match(x, xvlab[,2]), 1])
        } else {mat <- mat[, setdiff(names(mat), 'Name')]}

        ## if we have a categorical variable with a reference level
        if(length(reg$coef)>2) {
          regv <- gsub(x, '', names(reg$coef))
          uxv <- unique(df[,x])
          mat$ref[i] <- setdiff(uxv, regv)
          mat$Predictor[i] <- x
          mat$fac[i] <- gsub(x, '', names(reg$coef[j]))
        }

        if(test==T){
          av <- anova(reg, test='Chisq')['Pr(>Chi)'][[1]][2]
          mat$Chisq_pvalue[i] <- av
          avp <- cut(mat$Chisq_pvalue[i], breaks=c(0, .001, .01, .05, .1, 1))
          mat$Chisq_pstar[i] <- as.character(factor(x=avp, levels=levels(avp), labels=c('***', '**', '*', '.', '')))
        } else {mat <- mat[, setdiff(names(mat), c('Chisq_pvalue', 'Chisq_pstar'))]}

        mat$coef[i] <- as.numeric(reg$coefficients[j])
        mat$oddsRatio[i] <- as.numeric(exp(reg$coefficients[j]))
        mat$p[i] <- as.numeric(summary(reg)$coefficients[j, 4])
        p <- cut(mat$p[i], breaks=c(0, .001, .01, .05, .1, 1))
        mat$pstar[i] <- as.character(factor(x=p, levels=levels(p), labels=c('***', '**', '*', '.', '')))
        mat$aic[i] <- reg$aic
        mat$c[i] <- round(somers2(x=reg$fitted.values, y=reg$y)['C'], 3)
      })
    }
  }
  # trim unused preallocated rows and drop columns that stayed empty
  mat <- mat[1:i,]
  mat$ref[is.na(mat$ref)] <- ''
  mat$fac[is.na(mat$fac)] <- ''
  if(sum(mat$ref=='')==nrow(mat)) mat <- mat[, setdiff(names(mat), 'ref')]
  if(sum(mat$fac=='')==nrow(mat)) mat <- mat[, setdiff(names(mat), 'fac')]
  decreasingTF <- ifelse(sortby %in% c('c'), T, F)
  mat <- mat[order(mat[,sortby], decreasing=decreasingTF),]
  if(is.null(file)==F) write.csv(mat, file=file)
  return(mat)
}
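
## Illustrative sketch (vs ~ hp is an arbitrary mtcars model; assumes Hmisc is
## installed; wrapped in if (FALSE) so it never runs at package load). The 'c'
## column reported by uniglm is the concordance statistic from Hmisc::somers2,
## computed from each univariate model's fitted values.
if (FALSE) {
  require('Hmisc')
  reg <- glm(vs ~ hp, data = mtcars, family = binomial(logit))
  somers2(x = reg$fitted.values, y = reg$y)['C']
}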
#' @title Analyze predictions of supervised model in quantiles
#' @description Analyzes results of a supervised model using test data and quantiles. Note that only one of
#' \code{model} or \code{testPred} is necessary, not both; \code{testPred} is advised.
#' @param model the model to get predictions from. Uses the \code{predict} method. If \code{testPred}
#' is specified, this parameter is not needed. It is probably safer to just use \code{testPred}, unless the model you have
#' has a predict method that will work as intended without any other arguments.
#' @param xtest data.frame of test data (predictor variables only)
#' @param ytest vector of test data (target variable)
#' @param n number of quantiles
#' @param roundText decimals to print
#' @param testPred vector of predictions from test data. Do not use if using \code{model} parameter.
#' @param fw numeric vector of bin breaks passed to \code{cut}. Defaults to \code{NULL}, in which case \code{n} equal-count quantile bins are used.
#' @return data.frame of results
#' @seealso \code{\link{predSortPlot}}
#' @export
#' @examples
#' require('randomForest')
#' rf <- randomForest(x=mtcars[1:25,1:8], y=as.factor(mtcars[1:25, 'am']), ntree=5)
#' mtcarsTestPred <- predict(rf, mtcars[26:32, 1:8], type='prob')[,2]
#' pq2 <- predQuantile(xtest=mtcars[26:32, 1:8], ytest=mtcars[26:32, 'am'], n=3, roundText=4, testPred=mtcarsTestPred)
#' pq <- predQuantile(xtest=mtcars[26:32, 1:8], ytest=mtcars[26:32, 'am'], fw=seq(.1,1,length.out=10), roundText=4, testPred=mtcarsTestPred)
#'
#' barplot(pq$hits, names=pq$predRange, ylab='True Positives', las=2, cex.names=.7)
#' barplot(pq$hitRate, names=pq$predRange, ylab='True Positive Hit Rate', las=2, cex.names=.7)
#' barplot(pq$predMax-pq$predMin, pq$hitRate, names=pq$predRange, ylab='prediction', las=2, cex.names=.7)

predQuantile <- function(model=NULL, xtest=NULL, ytest, n=5, roundText=3, testPred=NULL, fw=NULL){
  if(is.null(testPred)) {pred <- predict(model, xtest)
  } else {pred <- testPred}

  # bin predictions: n equal-count bins by default, or user-supplied breaks fw
  tf <- data.frame(pred=pred, hits=ytest)
  tf <- tf[order(tf$pred),]
  if(is.null(fw)) {tf$cat <- sort(rep(1:n, ceiling(nrow(tf)/n))[1:nrow(tf)])
  } else {tf$cat <- cut(tf$pred, breaks=fw)}

  aggSum <- aggregate(hits~cat, data=tf, sum)
  aggLength <- aggregate(hits~cat, data=tf, length)
  aggRange <- aggregate(pred~cat, data=tf, range)
  aggRange <- cbind(aggRange[,1], data.frame(aggRange[,2]))
  aggRangeText <- aggregate(pred~cat, data=tf, function(x) paste(round(range(x), roundText), collapse=' to '))

  names(aggLength)[2] <- 'N'
  names(aggRange) <- c('cat', 'predMin', 'predMax')
  names(aggRangeText)[2] <- 'predRange'

  agg <- merge(aggSum, aggLength, by='cat')
  agg$hitRate <- agg$hits/agg$N
  agg <- merge(agg, aggRange, by='cat')
  agg <- merge(agg, aggRangeText, by='cat')

  # cumulative stats, accumulated from the highest-scoring bin downward
  guessrate <- sum(ytest==1)/length(ytest)
  agg$cumHitsPct <- rev(cumsum(rev(agg$hits)/sum(agg$hits)))
  agg$cumHitRate <- rev(cumsum(rev(agg$hits))/cumsum(rev(agg$N)))
  agg$cumNPct <- rev(cumsum(rev(agg$N)/sum(agg$N)))
  agg$cumLift <- agg$cumHitRate/guessrate

  ## flag user-supplied cut ranges that contain no scores
  if(!is.null(fw)) {
    a <- paste(levels(tf$cat)[!levels(tf$cat) %in% tf$cat], collapse=', ')
    print(paste('no scores in these ranges:', a))
    agg$cutoff <- fw[which(levels(tf$cat) %in% tf$cat)]
  }
  agg <- agg[nrow(agg):1,]
  return(agg)
}
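
## Illustrative sketch (reuses the randomForest setup from the roxygen example
## above; wrapped in if (FALSE) so it never runs at package load). The output
## is ordered from the highest-scoring bin downward, so the first row's
## cumLift reads as "the top bin converts at cumLift times the base rate".
if (FALSE) {
  require('randomForest')
  rf <- randomForest(x=mtcars[1:25, 1:8], y=as.factor(mtcars[1:25, 'am']), ntree=5)
  mtcarsTestPred <- predict(rf, mtcars[26:32, 1:8], type='prob')[, 2]
  pq <- predQuantile(xtest=mtcars[26:32, 1:8], ytest=mtcars[26:32, 'am'], n=3, testPred=mtcarsTestPred)
  pq[1, c('predRange', 'hitRate', 'cumLift')]
}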
#' @title Devil's Horn
#' @description Visualize results of supervised model predictions on test data. Currently supports a binary target variable.
#' Red dots represent observations where the target variable = 1, black dots where the target variable = 0.
#' @param pred vector of predictions for target variable on test data
#' @param ytest vector of the target variable from test data (0s and 1s)
#' @param jitterPlot numeric, amount of jitter passed to \code{\link{jitter}}. Default \code{NULL} plots the predictions without jitter.
#' @return plot
#' @seealso \code{\link{predQuantile}}
#' @export
#' @examples
#' ## Setting up some data and building a basic model on training data.
#' mylogit <- glm(vs~drat+hp+mpg, family=binomial('logit'), data=mtcars[1:25,])
#' mtcarsTestPred <- predict(mylogit, mtcars[26:32, ], type='response')
#' predSortPlot(pred=mtcarsTestPred, ytest=mtcars$vs[26:32])

predSortPlot <- function(pred, ytest, jitterPlot=NULL) {
  tf <- data.frame(pred=pred, actual=ytest)
  tf <- tf[order(tf$pred),]
  if(is.null(jitterPlot)==T) plot(tf$pred, col=ifelse(tf$actual==1, 'red', 'black'), cex=ifelse(tf$actual==1, 3, 2), ylab='Prediction')
  if(is.null(jitterPlot)==F) plot(jitter(tf$pred, jitterPlot), col=ifelse(tf$actual==1, 'red', 'black'), pch=ifelse(tf$actual==1, 19, 21), ylab='Prediction')
}


#' @title Optimize weight on ensemble of 2 supervised models
#' @description This function creates a weighted average of predictions from two models
#' and evaluates the c-statistic (AUC), precision, recall, F1 and F0.5 at each candidate
#' weighting to determine the best weight on each model.
#' @param pred1 numeric vector of probabilities, prediction from model 1
#' @param pred2 numeric vector of probabilities, prediction from model 2
#' @param actual vector of 1s and 0s, the target variable test data
#' @param steps number, higher numbers compute a more exhaustive grid of model weights
#' @param cutoff cutoff used to demarcate predictions into positive or negative class
#' @return data.frame of results
#' @import Hmisc
#' @export
#' @examples
#' require('Hmisc')
#' fit_glm1 <- glm(am~cyl, data=mtcars, family=binomial(logit))
#' fit_glm2 <- glm(am~disp, data=mtcars, family=binomial(logit))
#' ow <- optimizeModelWeight(fit_glm1$fitted.values, fit_glm2$fitted.values, actual=fit_glm1$model$am)
#' plot(ow$weights, ow$precision, type='l', xlab='weight on model 1')


optimizeModelWeight <- function(pred1, pred2, actual, steps=50, cutoff=.5) {
  s <- seq(0, 1, length.out=steps)
  df <- data.frame(weights=s, c=NA, precision=NA, recall=NA, f1=NA, f.5=NA, stringsAsFactors=F)
  j <- 1
  for(i in s){
    # weighted average of the two models' predictions: weight i on model 1, (1-i) on model 2
    mds <- i*pred1 + (1-i)*pred2

    df$c[j] <- somers2(mds, actual)['C']
    df$precision[j] <- sum(actual[mds>cutoff])/(length(actual[mds>cutoff]))
    df$recall[j] <- sum(actual[mds>cutoff])/(sum(actual))
    df$f1[j] <- (2*df$precision[j]*df$recall[j])/(df$precision[j]+df$recall[j])
    df$f.5[j] <- ((1+.5^2)*df$precision[j]*df$recall[j])/(((.5^2)*df$precision[j])+df$recall[j])
    j <- j+1
  }
  return(df)
}

--------------------------------------------------------------------------------