├── R
│   ├── .Rapp.history
│   ├── rfImp.R
│   ├── missdf.R
│   ├── quantile_df.R
│   ├── filterRules.R
│   ├── removePeriods.R
│   ├── quantileScore.R
│   ├── addRuleQuality.R
│   ├── mergeRules2trans.R
│   ├── rules2df.R
│   ├── shiny_table.R
│   ├── makeSQL_createTable.R
│   ├── balPanel.R
│   ├── mergeEdgeLists.R
│   ├── makeVertexAtt.R
│   ├── plotAvgBy.R
│   ├── overlap.R
│   ├── plotdf.R
│   ├── shiny_leafletDF.R
│   ├── smartAgg.R
│   ├── partialDependence.R
│   ├── bin.R
│   ├── igraph_network.R
│   ├── shiny_arules.R
│   ├── shiny_tableNet.R
│   └── modeling.R
├── draft
│   ├── .Rapp.history
│   └── horseRaceModel.R
├── .Rbuildignore
├── .gitignore
├── Rsenal.Rproj
├── man
│   ├── shinyTable.Rd
│   ├── rfImp.Rd
│   ├── makeForm.Rd
│   ├── missdf.Rd
│   ├── plotdf.Rd
│   ├── quantileScore.Rd
│   ├── quantileFeatureScore.Rd
│   ├── plotBestglm.Rd
│   ├── rules2df.Rd
│   ├── editForm.Rd
│   ├── roundCut.Rd
│   ├── shuffleAnova.Rd
│   ├── overlap.Rd
│   ├── getEfromVlist.Rd
│   ├── Rsenal.Rd
│   ├── filterRules.Rd
│   ├── predSortPlot.Rd
│   ├── getV1fromVlist.Rd
│   ├── logit2tab.Rd
│   ├── quantile_df.Rd
│   ├── travOut.Rd
│   ├── makeSQL_createTable.Rd
│   ├── addRuleQuality.Rd
│   ├── getVfromE.Rd
│   ├── pruneEdge.Rd
│   ├── removePeriods.Rd
│   ├── mergeEdgeLists.Rd
│   ├── travCount.Rd
│   ├── balPanel.Rd
│   ├── uniglm.Rd
│   ├── mergeRules2trans.Rd
│   ├── optimizeModelWeight.Rd
│   ├── dir2dfList.Rd
│   ├── leafletMapDF.Rd
│   ├── makeVertexAtt.Rd
│   ├── isKey.Rd
│   ├── depthbin.Rd
│   ├── predQuantile.Rd
│   ├── ipipApp.Rd
│   ├── plotAvgBy.Rd
│   ├── tableNet.Rd
│   ├── binCat.Rd
│   ├── partialDep.Rd
│   ├── partialDepAll.Rd
│   ├── arulesApp.Rd
│   └── smartAgg.Rd
├── DESCRIPTION
├── LICENSE
├── README.md
└── NAMESPACE
/R/.Rapp.history: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /draft/.Rapp.history: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .DS_Store -------------------------------------------------------------------------------- /Rsenal.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageInstallArgs: --no-multiarch --with-keep.source 17 | PackageRoxygenize: rd,namespace 18 | -------------------------------------------------------------------------------- /man/shinyTable.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{shinyTable} 3 | \alias{shinyTable} 4 | \title{Shiny app to visualize dataframe as simple interactive datatable} 5 | \usage{ 6 | shinyTable(df) 7 | } 8 | \arguments{ 9 | \item{df}{dataframe to be visualized} 10 | } 11 | \value{ 12 | Shiny App 13 | } 14 | \description{ 15 | Launches a basic Shiny App that renders the given dataframe into an interactive datatable using \code{renderDataTable} 16 | } 17 | \examples{ 18 | \dontrun{ 19 | shinyTable(mtcars) 20 | } 21 | } 22 | 23 | 
-------------------------------------------------------------------------------- /man/rfImp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{rfImp} 3 | \alias{rfImp} 4 | \title{Sort Random Forest Variable Importance Features} 5 | \usage{ 6 | rfImp(rf) 7 | } 8 | \arguments{ 9 | \item{rf}{randomForest object from randomForest package} 10 | } 11 | \value{ 12 | data.frame 13 | } 14 | \description{ 15 | Simple utility function to sort random forest variable importance features. Very simple. 16 | } 17 | \examples{ 18 | library('randomForest') 19 | myrf <- randomForest(iris[1:4], iris$Species) 20 | rfImp(myrf) 21 | } 22 | 23 | -------------------------------------------------------------------------------- /R/rfImp.R: -------------------------------------------------------------------------------- 1 | #' @title Sort Random Forest Variable Importance Features 2 | #' @description Simple utility function to sort random forest variable importance features. Very simple. 3 | #' @param rf randomForest object from randomForest package 4 | #' @return data.frame 5 | #' @export 6 | #' @examples 7 | #' library('randomForest') 8 | #' myrf <- randomForest(iris[1:4], iris$Species) 9 | #' rfImp(myrf) 10 | 11 | rfImp <- function(rf) { 12 | df <- data.frame(rf$importance[order(rf$importance, decreasing=T),,drop=F]) 13 | df$variable <- row.names(df) 14 | df <- df[,c(2,1)] 15 | row.names(df) <- NULL 16 | return(df) 17 | } -------------------------------------------------------------------------------- /man/makeForm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{makeForm} 3 | \alias{makeForm} 4 | \title{Create formula} 5 | \usage{ 6 | makeForm(y, xList) 7 | } 8 | \arguments{ 9 | \item{y}{character string, target variable} 10 | 11 | \item{xList}{character vector, predictor variables (Right Hand Side variables) for formula} 12 | } 13 | \value{ 14 | formula object 15 | } 16 | \description{ 17 | Creates a formula object which can be passed to many R modeling functions from a vector of variable names. 18 | } 19 | \examples{ 20 | form <- makeForm('mpg', c('drat', 'wt', 'hp')) 21 | summary(lm(form, data=mtcars)) 22 | } 23 | 24 | -------------------------------------------------------------------------------- /man/missdf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{missdf} 3 | \alias{missdf} 4 | \title{Inspect missing data in a data.frame} 5 | \usage{ 6 | missdf(df, criteria = "NA") 7 | } 8 | \arguments{ 9 | \item{df}{data.frame to inspect for missing values} 10 | 11 | \item{criteria}{character string, criteria to search for. 
\code{'NA'} or \code{'Inf'}} 12 | } 13 | \value{ 14 | summary table 15 | } 16 | \description{ 17 | Currently supports \code{NA} and \code{Inf} 18 | } 19 | \examples{ 20 | mtcars2 <- mtcars 21 | for(i in 1:ncol(mtcars2)) mtcars2[sample(nrow(mtcars2), sample(1:5,1), replace=T),i] <- NA 22 | missdf(mtcars2) 23 | } 24 | 25 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: Rsenal 2 | Title: Rsenal (arsenal) of assorted R functions developed over the years 3 | Description: stats, data transformation, visualization 4 | Version: 0.1 5 | Author: Andrew Brooks 6 | Maintainer: Andrew Brooks 7 | Depends: 8 | R (>= 3.0.2) 9 | License: GPL-2 10 | LazyData: true 11 | Imports: 12 | data.table, 13 | roxygen2, 14 | igraph, 15 | leaps, 16 | bestglm, 17 | randomForest, 18 | Hmisc, 19 | shiny, 20 | ggplot2, 21 | arules, 22 | arulesViz, 23 | vcd, 24 | RColorBrewer, 25 | fmsb, 26 | MASS, 27 | gplots, 28 | memoise, 29 | BBmisc 30 | Suggests: 31 | RoxygenNote: 5.0.1 -------------------------------------------------------------------------------- /man/plotdf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{plotdf} 3 | \alias{plotdf} 4 | \title{Plot data.frame to PDF} 5 | \usage{ 6 | plotdf(df, file = "output.pdf", wordy = F) 7 | } 8 | \arguments{ 9 | \item{df}{data.frame to plot.} 10 | 11 | \item{file}{string - name of the PDF that will be created} 12 | 13 | \item{wordy}{boolean \code{TRUE} or \code{FALSE}. Sequentially prints status of each chart to the console. 14 | Could be useful for large data.frames.} 15 | } 16 | \value{ 17 | PDF file of plots 18 | } 19 | \description{ 20 | Plots every column of a data.frame as an individual plot (one plot per page) 21 | in a PDF file. 22 | } 23 | \examples{ 24 | \dontrun{ 25 | plotdf(df=mtcars, file='mtcars_plots.pdf') 26 | } 27 | } 28 | 29 | -------------------------------------------------------------------------------- /man/quantileScore.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{quantileScore} 3 | \alias{quantileScore} 4 | \title{Quantile scoring function for continuous or integer valued data} 5 | \usage{ 6 | quantileScore(x) 7 | } 8 | \arguments{ 9 | \item{x}{numeric vector to scale} 10 | } 11 | \value{ 12 | numeric vector of quantile scores between 0 and 1, corresponding in order with \code{x} 13 | } 14 | \description{ 15 | This function sorts a series from beginning to end. It uses each observation's place in the quantile to assign it a score. 
Useful for transforming variables into the same units (0-1). 
Handles NAs. 17 | } 18 | \examples{ 19 | cbind(quantileScore(mtcars$mpg), mtcars$mpg) 20 | cbind(quantileScore(mtcars$cyl), mtcars$cyl) 21 | } 22 | 23 | -------------------------------------------------------------------------------- /man/quantileFeatureScore.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{quantileFeatureScore} 3 | \alias{quantileFeatureScore} 4 | \title{Quantile scoring function for continuous or integer valued data} 5 | \usage{ 6 | quantileFeatureScore(x, wq = 0.5, wf = 0.5) 7 | } 8 | \arguments{ 9 | \item{x}{numeric vector to scale} \item{wq}{numeric, weight on the quantile score component} \item{wf}{numeric, weight on the min-max feature scaling component} 10 | } 11 | \value{ 12 | numeric vector of scores between 0 and 1, corresponding in order with \code{x} 13 | } 14 | \description{ 15 | This function combines the quantile function with the basic feature normalization technique, 16 | useful for getting a stable normalization. 17 | } 18 | \examples{ 19 | cbind(quantileFeatureScore(sort(mtcars$mpg)), sort(mtcars$mpg)) 20 | cbind(quantileFeatureScore(mtcars$cyl), mtcars$cyl) 21 | } 22 | 23 | -------------------------------------------------------------------------------- /man/plotBestglm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{plotBestglm} 3 | \alias{plotBestglm} 4 | \title{All-subsets logistic regression} 5 | \usage{ 6 | plotBestglm(bglm, rc = 2) 7 | } 8 | \arguments{ 9 | \item{bglm}{\code{bestglm} object from \code{\link{bestglm}}} 10 | 11 | \item{rc}{decimal places to display on y-axis of plot} 12 | } 13 | \value{ 14 | plot 15 | } 16 | \description{ 17 | Plots the output of the bestglm BestModels object. Similar to the visual output of plotting a regsubsets object 18 | } 19 | \examples{ 20 | require('bestglm') 21 | b <- bestglm(Xy=mtcars[,c('mpg', 'hp', 'drat', 'cyl', 'wt', 'qsec', 'vs')], family=binomial(logit), IC='BIC', nvmax=4) 22 | plotBestglm(b$BestModels, rc=3) 23 | } 24 | \seealso{ 25 | \code{\link{bestglm}}, \code{\link{leaps}} 26 | } 27 | 28 | -------------------------------------------------------------------------------- /man/rules2df.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{rules2df} 3 | \alias{rules2df} 4 | \title{Transform association rule results into data.frame} 5 | \usage{ 6 | rules2df(rules, list = F) 7 | } 8 | \arguments{ 9 | \item{rules}{list of association rules (S4 arules object). Output of \code{apriori} function.} 10 | 11 | \item{list}{logical \code{TRUE} or \code{FALSE}. 
Sets LHS items in a list, rather than one character string} 12 | } 13 | \value{ 14 | association rules in a data.frame 15 | } 16 | \description{ 17 | Note: this function currently only works when the itemsets are of size 1 on the LHS and RHS 18 | } 19 | \examples{ 20 | library('arules') 21 | data("Adult") 22 | ar <- apriori(Adult, parameter = list(supp = 0.5, conf = 0.6, target = "rules", minlen=2)) 23 | df <- rules2df(ar, list=T) 24 | } 25 | 26 | -------------------------------------------------------------------------------- /man/editForm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{editForm} 3 | \alias{editForm} 4 | \title{Edit a formula} 5 | \usage{ 6 | editForm(form, add = NULL, sub = NULL) 7 | } 8 | \arguments{ 9 | \item{form}{formula, the formula you will be editing} 10 | 11 | \item{add}{character vector, predictor variables to add to formula} 12 | 13 | \item{sub}{character vector, predictor variables to subtract (remove) from formula} 14 | } 15 | \value{ 16 | formula, edited formula object 17 | } 18 | \description{ 19 | Easy way to add or subtract predictor variables to a formula 20 | } 21 | \examples{ 22 | form <- as.formula('mpg~cyl+hp+drat+qsec') 23 | editForm(form, add=c('wt', 'gear', 'carb')) 24 | editForm(form, sub=c('cyl', 'hp', 'qsec', 'variableNotInFormula')) 25 | editForm(form, sub=c('cyl', 'hp', 'qsec'), add=c('wt')) 26 | } 27 | 28 | -------------------------------------------------------------------------------- /man/roundCut.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{roundCut} 3 | \alias{roundCut} 4 | \title{Round numbers in interval} 5 | \usage{ 6 | roundCut(x, r = 1) 7 | } 8 | \arguments{ 9 | \item{x}{character vector of bins to format} 10 | 11 | \item{r}{number, 0 to 10 (or higher I suppose) indicating how many decimals to display} 12 | } 13 | \value{ 14 | formatted character vector with length of input vector. 15 | } 16 | \description{ 17 | Formats an interval of form \code{(5.234,11.783]} to something like \code{(5.2,11.8]}. 18 | Used for formatting only, mainly with binning functions like \code{\link{depthbin}}. Intervals can be open or closed with 19 | \code{(} and \code{[} respectively and are maintained as such when formatted. Useful for prettifying graphs and reports. 20 | } 21 | \examples{ 22 | x1 <- cut(quantile(rnorm(100)), breaks=4) 23 | roundCut(x1, 1) 24 | } 25 | \seealso{ 26 | \code{\link{depthbin}} 27 | } 28 | 29 | -------------------------------------------------------------------------------- /man/shuffleAnova.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{shuffleAnova} 3 | \alias{shuffleAnova} 4 | \title{Variable importance from iteratively shuffled orders of predictor variables} 5 | \usage{ 6 | shuffleAnova(form, df, n = 5, test = "Chisq") 7 | } 8 | \arguments{ 9 | \item{form}{formula to be passed to \code{glm}.} 10 | 11 | \item{df}{data.frame of data to be used for analysis} 12 | 13 | \item{n}{number of iterations for shuffled ANOVA analysis} 14 | 15 | \item{test}{character string, statistical test to run. Default is 'Chisq'.} 16 | } 17 | \value{ 18 | data.frame of results 19 | } 20 | \description{ 21 | Runs multiple anova analyses to assess deviance explained by each predictor in shuffled orders, iteratively. 
22 | Currently using only logistic regression. Could be generalized. 23 | } 24 | \examples{ 25 | form <- as.formula('am~wt+gear+carb+cyl+hp+drat+qsec') 26 | shuffleAnova(form, mtcars, n=50) 27 | } 28 | 29 | -------------------------------------------------------------------------------- /man/overlap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{overlap} 3 | \alias{overlap} 4 | \title{Assess overlap in vectors} 5 | \usage{ 6 | overlap(x1, x2, na.rm = F, wordy = T, x1name = NULL, x2name = NULL) 7 | } 8 | \arguments{ 9 | \item{x1}{vector of same class as \code{x2}} 10 | 11 | \item{x2}{vector of same class as \code{x1}} 12 | 13 | \item{na.rm}{logical, remove NAs from analysis} 14 | 15 | \item{wordy}{logical, prints convenience information} \item{x1name}{character, optional display name for \code{x1} in the output} \item{x2name}{character, optional display name for \code{x2} in the output} 16 | } 17 | \value{ 18 | summary data.frame 19 | } 20 | \description{ 21 | Useful for learning the overlap of absolute and unique values between two vectors of the same type. They 22 | do not have to be the same length, and are allowed to contain NAs. 23 | } 24 | \examples{ 25 | x1 <- sample(1:50, 40, replace=T) 26 | x2 <- sample(c(1:60, rep(NA, 30)), 55, replace=T) 27 | overlap(x1, x2, na.rm=T) 28 | overlap(x1, x2) 29 | overlap(mtcars$gear, mtcars$cyl, x1name='gear', x2name='cyl') 30 | } 31 | 32 | -------------------------------------------------------------------------------- /man/getEfromVlist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{getEfromVlist} 3 | \alias{getEfromVlist} 4 | \title{Get edge IDs from vertices} 5 | \usage{ 6 | getEfromVlist(g, vlist, mode = "out") 7 | } 8 | \arguments{ 9 | \item{g}{igraph object} 10 | 11 | \item{vlist}{list of node/vertex names} 12 | 13 | \item{mode}{character string: 'in', 'out' or 'all'} 14 | } 15 | \value{ 16 | edge IDs that go OUT or IN from the specified list of vertices 17 | } 18 | \description{ 19 | Returns the edge IDs that go OUT or IN from the specified list of vertices 20 | } 21 | \examples{ 22 | require('igraph') 23 | 24 | ## build sample network 25 | from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O') 26 | to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P') 27 | relations<-cbind(from,to) 28 | g <- graph.data.frame(relations) 29 | tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2) 30 | V(g)$tier <- tiers 31 | 32 | getEfromVlist(g, c('A', 'D'), 'out') 33 | } 34 | 35 | -------------------------------------------------------------------------------- /man/Rsenal.Rd: -------------------------------------------------------------------------------- 1 | \docType{package} 2 | \name{Rsenal} 3 | \alias{Rsenal} 4 | \title{a(R)senal of assorted R functions and apps} 5 | \description{ 6 | This is mainly a collection of random, or more accurately, clusters of random R functions 7 | I've written for personal and work projects over the years. I don't expect it to be immediately 8 | useful to the general population of R users. For now it's mostly my cloud storage (with nice documentation) for R functions 9 | that I use repeatedly. Some functions are fairly general purpose, while others were developed for specific 10 | problems and are not super generalizable in the current state. 
\cr 11 | 12 | \bold{Current clusters of functionality:} \cr 13 | - manipulating igraph objects \cr 14 | - modeling and model validation \cr 15 | - cleaning data \cr 16 | - scaling and transforming data 17 | } 18 | 19 | \author{Andrew Brooks \cr \email{andrewbrooksct@gmail.com}} 20 | 21 | 22 | \details{ 23 | \tabular{ll}{ 24 | Package: \tab Rsenal\cr 25 | Type: \tab Package\cr 26 | Version: \tab 0.1\cr 27 | Date: \tab 2014-10-08\cr 28 | } 29 | } -------------------------------------------------------------------------------- /man/filterRules.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{filterRules} 3 | \alias{filterRules} 4 | \title{Post process association rules} 5 | \usage{ 6 | filterRules(rules, oneSideOnly) 7 | } 8 | \arguments{ 9 | \item{rules}{list of association rules (S4 arules object). Output of \code{apriori} function.} 10 | 11 | \item{oneSideOnly}{list of character vectors. Each character vector of the list contains variables that are likely very similar 12 | and will generate uninteresting rules. So the filtering algorithm will prune rules where variables within this list appear on 13 | both the RHS & LHS} 14 | } 15 | \value{ 16 | association rules 17 | } 18 | \description{ 19 | This function filters rules that have already been mined. 20 | } 21 | \examples{ 22 | library('arules') 23 | data('Adult') 24 | rules <- apriori(Adult, parameter = list(supp=0.01, conf=0.9, target = "rules")) 25 | oneSideOnly <- list(c('age', 'workclass', 'education'), c('marital-status', 'occupation', 'race')) 26 | f2 <- filterRules(rules, oneSideOnly=oneSideOnly) 27 | length(rules) 28 | length(f2) 29 | } 30 | 31 | -------------------------------------------------------------------------------- /man/predSortPlot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{predSortPlot} 3 | \alias{predSortPlot} 4 | \title{Devil's Horn} 5 | \usage{ 6 | predSortPlot(pred, ytest, jitterPlot = NULL) 7 | } 8 | \arguments{ 9 | \item{pred}{vector of predictions for target variable on test data} 10 | 11 | \item{ytest}{vector of the target variable from test data. (0s and 1s)} 12 | 13 | \item{jitterPlot}{\code{TRUE} or \code{FALSE}. Jitters points on plot when \code{TRUE}} 14 | } 15 | \value{ 16 | plot 17 | } 18 | \description{ 19 | Visualize results of supervised model predictions on test data. Currently supports binary target variable. 20 | Red dots represent observations where target variable = 1, black dots where target variable = 0. 21 | } 22 | \examples{ 23 | ## Setting up some data and building a basic model on training data. 
24 | mylogit <- glm(vs~drat+hp+mpg, family=binomial('logit'), data=mtcars[1:25,]) 25 | mtcarsTestPred <- predict(mylogit, mtcars[26:32, ], type='response') 26 | predSortPlot(pred=mtcarsTestPred, ytest=mtcars$vs[26:32]) 27 | } 28 | \seealso{ 29 | \code{\link{predQuantile}} 30 | } 31 | 32 | -------------------------------------------------------------------------------- /man/getV1fromVlist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{getV1fromVlist} 3 | \alias{getV1fromVlist} 4 | \title{Get 1 degree away vertexes} 5 | \usage{ 6 | getV1fromVlist(g, vlist, mode = "out") 7 | } 8 | \arguments{ 9 | \item{g}{igraph object} 10 | 11 | \item{vlist}{list of starting node/vertex names} 12 | 13 | \item{mode}{'out' or 'in'. 'out' returns all the vertex/node names 1 degree out from \code{vlist}. 'in' returns all 14 | vertex/node names directed into vlist nodes.} 15 | } 16 | \value{ 17 | vector of node names that are 1 degree away from \code{vlist} 18 | } 19 | \description{ 20 | Returns a list of all vertex names that touch (going IN or OUT) from the 21 | specified vertex list. 22 | } 23 | \examples{ 24 | require('igraph') ## build sample network 25 | from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O') 26 | to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P') 27 | relations<-cbind(from,to) 28 | g <- graph.data.frame(relations) 29 | tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2) 30 | V(g)$tier <- tiers 31 | 32 | getV1fromVlist(g, c('A', 'D'), 'out') 33 | } 34 | 35 | -------------------------------------------------------------------------------- /man/logit2tab.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{logit2tab} 3 | \alias{logit2tab} 4 | \title{Format logistic regression table} 5 | \usage{ 6 | logit2tab(reg, file = NULL, xvlab = NULL, stats = F) 7 | } 8 | \arguments{ 9 | \item{reg}{logistic regression object. output from \code{\link{glm}}} 10 | 11 | \item{file}{filepath of output. \code{txt} or \code{csv} object.} 12 | 13 | \item{xvlab}{data.frame, lookup table for variable names. 
First column is codename, second column is the pretty printed name.} 14 | 15 | \item{stats}{include statistics in output} 16 | } 17 | \value{ 18 | data.frame of results formatted nicely 19 | } 20 | \description{ 21 | Turns a logistic regression object into a regression table 22 | and writes it out to a csv if you choose 23 | } 24 | \examples{ 25 | reg <- glm(am~qsec+hp, data=mtcars, family=binomial(logit)) 26 | logit2tab(reg) 27 | longnames <- data.frame(short = c('wt', 'mpg', 'cyl', 'drat', 'hp', 'am', 'qsec'), 28 | long = c('Weight', 'Miles Per Gallon', 'Cylinder', 'D.R.A.T', 'Horsepower', 'A.M.', 'Q Seconds')) 29 | logit2tab(reg, xvlab=longnames, stats=T) 30 | } 31 | 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Andrew Brooks 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /man/quantile_df.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{quantile_df} 3 | \alias{quantile_df} 4 | \title{Pretty print quantiles} 5 | \usage{ 6 | quantile_df(x, probs = seq(0, 1, 0.05), na.rm = F, names = F, type = 7, 7 | colname = NULL, round = NULL, ...) 8 | } 9 | \arguments{ 10 | \item{x}{numeric vector to calculate quantile with} 11 | 12 | \item{probs}{numeric vector of probabilities to use for quantile} 13 | 14 | \item{na.rm}{logical, pass to quantile function} 15 | 16 | \item{names}{logical, pass to quantile function} 17 | 18 | \item{type}{number, pass to quantile function} 19 | 20 | \item{colname}{character, name of variable column in output} 21 | 22 | \item{round}{integer, number of digits to round quantile value. 
23 | passed to \code{digits} argument of \code{round} function} 24 | } 25 | \value{ 26 | prettified data.frame of probabilities 27 | } 28 | \description{ 29 | Useful for R Markdown reports 30 | } 31 | \examples{ 32 | quantile_df(mtcars$mpg, seq(0,1,.1)) 33 | quantile_df(mtcars$mpg, seq(0,1,.1), names=T) 34 | quantile_df(mtcars$mpg, seq(0,1,.2), colname='mpg') 35 | quantile_df(mtcars$mpg, round=1) 36 | } 37 | 38 | -------------------------------------------------------------------------------- /man/travOut.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{travOut} 3 | \alias{travOut} 4 | \title{Find all nodes connected to root node} 5 | \usage{ 6 | travOut(g, root, orderN = -1) 7 | } 8 | \arguments{ 9 | \item{g}{igraph object} 10 | 11 | \item{root}{character string: name of root node} 12 | 13 | \item{orderN}{number: # of degrees away from root node to search.} 14 | } 15 | \value{ 16 | igraph object, subgraph of \code{g} 17 | } 18 | \description{ 19 | Returns a subset of the original graph (all edges and vertices that a 20 | directed path can take from the root node). \code{orderN} limits the growth of these paths. 21 | } 22 | \examples{ 23 | require('igraph') 24 | 25 | ## build sample network 26 | from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O') 27 | to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P') 28 | relations<-cbind(from,to) 29 | g <- graph.data.frame(relations) 30 | tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2) 31 | V(g)$tier <- tiers 32 | 33 | plot(g) ## full network 34 | plot(travOut(g, 'D')) ## sub network 35 | } 36 | \seealso{ 37 | Simpler version: \code{\link{travCount}} 38 | } 39 | 40 | -------------------------------------------------------------------------------- /R/missdf.R: -------------------------------------------------------------------------------- 1 | 2 | #' @title Inspect missing data in a data.frame 3 | #' @description Currently supports \code{NA} and \code{Inf} 4 | #' @param df data.frame to inspect for missing values 5 | #' @param criteria character string, criteria to search for. \code{'NA'} or \code{'Inf'} 6 | #' @return summary table 7 | #' @export 8 | #' @examples 9 | #' mtcars2 <- mtcars 10 | #' for(i in 1:ncol(mtcars2)) mtcars2[sample(nrow(mtcars2), sample(1:5,1), replace=T),i] <- NA 11 | #' missdf(mtcars2) 12 | 13 | 14 | missdf <- function(df, criteria='NA') { 15 | n <- nrow(df) 16 | if(criteria=='NA') a <- sapply(df, function(x) sum(is.na(x))) 17 | if(criteria=='Inf') a <- sapply(df, function(x) sum(x==Inf)) 18 | miss <- unlist(a) 19 | missdf <- data.frame(name=names(miss), missing=miss) 20 | missdf$name <- gsub('.TRUE', '', missdf$name) 21 | missdf$name <- gsub('.NA', '', missdf$name) 22 | missdf$nomiss <- unlist(lapply(a, function(x) x['FALSE'])) 23 | 24 | missdf$missing[is.na(missdf$missing)] <- 0 25 | missdf$nomiss[is.na(missdf$nomiss)] <- 0 26 | 27 | missdf$misspct <- missdf$missing/n 28 | missdf <- missdf[order(missdf$misspct, decreasing=T),] 29 | return(missdf) 30 | } 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Rsenal R package 3 | 4 | This is mainly a collection of random, or more accurately, clusters of random R functions I've written for personal and work projects over the years. I don't expect it to be immediately useful to the general population of R users. 
For now it's mostly my cloud storage (with nice documentation) for R functions that I use repeatedly. Some functions are fairly general purpose, while others were developed for specific problems and are not super generalizable in the current state. 5 | 6 | Current clusters of functionality: 7 | 8 | * manipulating igraph objects 9 | * modeling and model validation 10 | * cleaning data 11 | * scaling and transforming data 12 | * manipulating/summarizing outputs from models and association rules 13 | * Shiny apps: relational db schema explorer, association rule explorer, IPIP 14 | 15 | ### How to download 16 | 17 | `install.packages('devtools') # if devtools not already installed` 18 | `library('devtools') ` 19 | `install_github('brooksandrew/Rsenal')` 20 | `library('Rsenal')` 21 | 22 | That's it, you got it. 23 | 24 | List all functions 25 | 26 | `library(help='Rsenal')` 27 | 28 | OR 29 | 30 | `?Rsenal` and then click the `Index` link at the bottom. 31 | -------------------------------------------------------------------------------- /man/makeSQL_createTable.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{makeSQL_createTable} 3 | \alias{makeSQL_createTable} 4 | \title{generate SQL script to create table in SQL database from data.frame} 5 | \usage{ 6 | makeSQL_createTable(df, tablename = deparse(substitute(df)), 7 | saveQuery = NULL, wordy = T) 8 | } 9 | \arguments{ 10 | \item{df}{data.frame we will generate a SQL create table script for} 11 | 12 | \item{tablename}{(optional) name of the table we want to create in a sql database} 13 | 14 | \item{saveQuery}{(optional) name and filepath of the .sql script that will create the table.} 15 | 16 | \item{wordy}{(optional) prints the sql query out to the console.} 17 | } 18 | \value{ 19 | sql query with the create table script 20 | } 21 | \description{ 22 | Creates a sql Query that will create a table in a SQL Database from an R dataframe. 23 | It initializes with variable types (naively), but prevents the user from manually writing the query, 24 | which can be especially cumbersome when there are many columns. 25 | Initialized with basic SQL Server variable types. 26 | } 27 | \examples{ 28 | makeSQL_createTable(mtcars) 29 | \dontrun{ 30 | makeSQL_createTable(mtcars, saveQuery='mySQL_query.sql') 31 | } 32 | } 33 | 34 | -------------------------------------------------------------------------------- /man/addRuleQuality.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{addRuleQuality} 3 | \alias{addRuleQuality} 4 | \title{Add quality measures to association rules} 5 | \usage{ 6 | addRuleQuality(trans, rules, include = NULL, exclude = "improvement") 7 | } 8 | \arguments{ 9 | \item{trans}{transaction set (s4 class from arules package)} 10 | 11 | \item{rules}{set of rules from \code{apriori}} 12 | 13 | \item{include}{character vector specifying which quality measures to include. Default is to include everything.} 14 | 15 | \item{exclude}{character vector specifying which quality measures to exclude. Default is to exclude 'improvement' because it seems to be slow.} 16 | } 17 | \value{ 18 | ruleset with additional quality measures 19 | } 20 | \description{ 21 | Adds measures of rule quality (conviction, hyperConfidence, cosine, chiSquare, coverage, doc, gini, hyperlift) to a set of 22 | association rules mined from \code{apriori}. 
Usually used before converting ruleset to data.frame and exporting to some sort of text file. 23 | } 24 | \examples{ 25 | library('arules') 26 | data("Adult") 27 | ar <- apriori(Adult, parameter = list(supp = 0.5, conf = 0.9, target = "rules")) 28 | ar <- addRuleQuality(trans=Adult, rules=ar) 29 | df <- Rsenal::rules2df(ar) 30 | } 31 | 32 | -------------------------------------------------------------------------------- /man/getVfromE.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{getVfromE} 3 | \alias{getVfromE} 4 | \title{Get vertex names from edge IDs} 5 | \usage{ 6 | getVfromE(g, eid, mode = "to", unique = T) 7 | } 8 | \arguments{ 9 | \item{g}{igraph object to mine vertex names from} 10 | 11 | \item{eid}{edge IDs referenced by a numeric vector or the edge object themselves} 12 | 13 | \item{mode}{"to" or "from". Indicates whether to return vertex names for nodes going to (in) or from (out) from the edge.} 14 | 15 | \item{unique}{\code{TRUE} or \code{FALSE}. TRUE simply removes duplicate vertex names and returns a unique list.} 16 | } 17 | \value{ 18 | vertex names associated with the specified edge IDs (FROM or TO) 19 | } 20 | \description{ 21 | Returns the vertex names associated with the specified edge IDs (FROM or TO) 22 | } 23 | \examples{ 24 | require('igraph') 25 | 26 | ## build sample network 27 | from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O') 28 | to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P') 29 | relations<-cbind(from,to) 30 | g <- graph.data.frame(relations) 31 | tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2) 32 | V(g)$tier <- tiers 33 | 34 | getVfromE(g, E(g)[1:5]) 35 | getVfromE(g, 1:5) 36 | } 37 | 38 | -------------------------------------------------------------------------------- /R/quantile_df.R: -------------------------------------------------------------------------------- 1 | #' @title Pretty print quantiles 2 | #' @description Useful for R Markdown reports 3 | #' @param x numeric vector to calculate quantile with 4 | #' @param probs numeric vector of probabilities to use for quantile 5 | #' @param na.rm logical, pass to quantile function 6 | #' @param names logical, pass to quantile function 7 | #' @param type number, pass to quantile function 8 | #' @param colname character, name of variable column in output 9 | #' @param round integer, number of digits to round quantile value. 
10 | #' passed to \code{digits} argument of \code{round} function 11 | #' @return prettified data.frame of probabilities 12 | #' @export 13 | #' @examples 14 | #' 15 | #' quantile_df(mtcars$mpg, seq(0,1,.1)) 16 | #' quantile_df(mtcars$mpg, seq(0,1,.1), names=T) 17 | #' quantile_df(mtcars$mpg, seq(0,1,.2), colname='mpg') 18 | #' quantile_df(mtcars$mpg, round=1) 19 | 20 | quantile_df <- function(x, probs=seq(0,1,.05), na.rm=F, names=F, type=7, colname=NULL, round=NULL, ...){ 21 | z <- quantile(x, probs, na.rm, names, type) 22 | probsprint <- paste0(round(probs*100,3), '%') 23 | df <- data.frame(quantile=probsprint, values=z) 24 | if(is.null(round)==F) df[,2] <- round(df[,2], round) 25 | if(is.null(colname)==F) names(df)[2] <- colname 26 | return(df) 27 | } 28 | 29 | -------------------------------------------------------------------------------- /man/pruneEdge.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{pruneEdge} 3 | \alias{pruneEdge} 4 | \title{Prune edges of igraph} 5 | \usage{ 6 | pruneEdge(g, root) 7 | } 8 | \arguments{ 9 | \item{g}{igraph object} 10 | 11 | \item{root}{character string: name of root node} 12 | } 13 | \value{ 14 | igraph object, pruned subgraph of \code{g} 15 | } 16 | \description{ 17 | This function takes a very connected network graph and prunes the edges down 18 | so that it focuses on depth from the root node to the end nodes (inter-connective 19 | edges are deleted). \cr \cr 20 | 1. Find all nodes 1 step out from root. These edges must stay. \cr 21 | 2. Find all nodes 2 steps from root that have more than 2 edges in. 22 | Keep only one edge (the one that leads to the shortest path back to the root).\cr 23 | 3. Repeat for the nodes one more degree away from root. 24 | } 25 | \examples{ 26 | require('igraph') 27 | 28 | ## build sample network 29 | from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O') 30 | to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P') 31 | relations<-cbind(from,to) 32 | g <- graph.data.frame(relations) 33 | tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2) 34 | V(g)$tier <- tiers 35 | 36 | pruneEdge(g, 'A') 37 | } 38 | 39 | -------------------------------------------------------------------------------- /man/removePeriods.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{removePeriods} 3 | \alias{removePeriods} 4 | \title{Removes duplicative periods in column names of data.frame} 5 | \usage{ 6 | removePeriods(x) 7 | } 8 | \arguments{ 9 | \item{x}{character vector of names (e.g. column names) to clean} 10 | } 11 | \value{ 12 | vector of strings without duplicate periods 13 | } 14 | \description{ 15 | When reading in a flat file (csv, txt, etc) that was converted from an Excel spreadsheet, 16 | column names often have spaces between words. R reads these spaces as periods. If there are multiple spaces or special characters that 17 | are not valid in column names, R replaces them with a period. When the data is messy, you sometimes get several consecutive periods 18 | or trailing periods at the end of column names. I use this function as a coarse tool to standardize this ugliness. 19 | Periods can then be easily replaced (\code{gsub}'ed) with a single character if periods aren't your thing. 
20 | } 21 | \examples{ 22 | ## making some messed up data to fix 23 | data(mtcars) 24 | names(mtcars)[1] <- paste(names(mtcars)[1], '..', sep='') 25 | names(mtcars)[3] <- paste(names(mtcars)[3], '.', sep='') 26 | names(mtcars)[4] <- paste(names(mtcars)[3], '..also.known..as.horsepower', sep='') 27 | removePeriods(names(mtcars)) 28 | } 29 | 30 | -------------------------------------------------------------------------------- /man/mergeEdgeLists.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{mergeEdgeLists} 3 | \alias{mergeEdgeLists} 4 | \title{Merge edgelists => Master edgelist => igraph} 5 | \usage{ 6 | mergeEdgeLists(edgeLists, from = "from.ID", to = "to.ID", keepDups = F) 7 | } 8 | \arguments{ 9 | \item{edgeLists}{list of edgelists (stored as data.frames)} 10 | 11 | \item{from}{name of the column that specifies the "from" node in each data.frame in \code{edgeLists}} 12 | 13 | \item{to}{name of the column that specifies the "to" node in each data.frame in \code{edgeLists}} 14 | 15 | \item{keepDups}{deletes duplicate from-to relations when set to FALSE} 16 | } 17 | \description{ 18 | This function turns a list of edgelists (data.frames) into a single "master" edgelist. 19 | It maintains the edge attributes of each individual edge list 20 | } 21 | \examples{ 22 | require('igraph') 23 | cars <- data.frame(mtcars, to=sample(rownames(mtcars), replace=T), from=sample(rownames(mtcars), replace=T)) 24 | df1<-cars[sample(1:32,30), c('to', 'from', 'cyl', 'mpg')] 25 | df2<-cars[sample(1:32,15), c('to', 'from', 'cyl', 'qsec')] 26 | df3<-cars[sample(1:32,32), c('to', 'from', 'hp', 'drat')] 27 | 28 | df1$cyl[1:10] <- df1$cyl[1:10]+2 29 | el <- list(df1, df2, df3) 30 | 31 | mel <- mergeEdgeLists(el, from='from', to='to', keepDups=T) 32 | } 33 | 34 | -------------------------------------------------------------------------------- /man/travCount.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{travCount} 3 | \alias{travCount} 4 | \title{Count # of nodes connected to root node} 5 | \usage{ 6 | travCount(g, root, orderN = -1, vmode = "out") 7 | } 8 | \arguments{ 9 | \item{g}{igraph object} 10 | 11 | \item{root}{character string: name of root node} 12 | 13 | \item{orderN}{number, # of degrees away from root node to search. Default is -1 which searches all degrees} 14 | 15 | \item{vmode}{character string: 'out', 'in' or 'all', determines how to subgraph \code{g} from \code{root}. Default is 'out'.} 16 | } 17 | \value{ 18 | number, count of nodes connected to root node. 19 | } 20 | \description{ 21 | Count doesn't include root node. Simpler version of \code{\link{travOut}}. 22 | Might be faster? Developed for a different project. 
23 | } 24 | \examples{ 25 | require('igraph') 26 | 27 | ## build sample network 28 | from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O') 29 | to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P') 30 | relations<-cbind(from,to) 31 | g <- graph.data.frame(relations) 32 | tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2) 33 | V(g)$tier <- tiers 34 | 35 | plot(g) 36 | travCount(g, 'B') 37 | travCount(g, 'L', vmode='out') 38 | } 39 | \seealso{ 40 | \code{\link{travOut}} 41 | } 42 | 43 | -------------------------------------------------------------------------------- /man/balPanel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{balPanel} 3 | \alias{balPanel} 4 | \title{Create balanced panel dataset} 5 | \usage{ 6 | balPanel(df, datev, id, freq = "month") 7 | } 8 | \arguments{ 9 | \item{df}{data.frame of unbalanced panel data} 10 | 11 | \item{datev}{name of the date variable in data.frame \code{df}} 12 | 13 | \item{id}{name of individual/entity variable. Company ID, country or people's names, for example.} 14 | 15 | \item{freq}{desired frequency of the time-series'} 16 | } 17 | \value{ 18 | balanced panel dataset as a data.frame. Values for dates that are missing in the input dataset show up as NAs 19 | } 20 | \description{ 21 | Takes an unbalanced panel dataset and makes it balanced by inserting rows for missing time periods 22 | and initializing with NAs for other columns that are not the time or individual variable. 23 | } 24 | \details{ 25 | Doesn't have any special treatment for duplicate values in the input time panel dataset. 26 | Currently handles time/date objects of type "Date" or "numeric". Can add POSIX and other date/time classes as needed. 27 | } 28 | \examples{ 29 | df <- data.frame(date=sort(sample(seq(Sys.Date()-20, Sys.Date(), by=1), 30, replace=T)), 30 | x1=runif(30), 31 | id=rep(c('A', 'B', 'C'), 10), stringsAsFactors=F) 32 | 33 | balPanel(df, datev='date', id='id', freq='days') 34 | } 35 | 36 | -------------------------------------------------------------------------------- /R/filterRules.R: -------------------------------------------------------------------------------- 1 | #' @title Post process association rules 2 | #' 3 | #' @description This function filters rules that have already been mined. 4 | #' 5 | #' @param rules list of association rules (S4 arules object). Output of \code{apriori} function. 6 | #' @param oneSideOnly list of character vectors. Each character vector of the list contains variables that are likely very similar 7 | #' and will generate uninteresting rules. 
So the filtering algorithm will prune rules where variables within this list appear on 8 | #' both the RHS & LHS 9 | #' @return association rules 10 | #' @import arules 11 | #' @export 12 | #' @examples 13 | #' 14 | #' library('arules') 15 | #' data('Adult') 16 | #' rules <- apriori(Adult, parameter = list(supp=0.01, conf=0.9, target = "rules")) 17 | #' oneSideOnly <- list(c('age', 'workclass', 'education'), c('marital-status', 'occupation', 'race')) 18 | #' f2 <- filterRules(rules, oneSideOnly=oneSideOnly) 19 | #' length(rules) 20 | #' length(f2) 21 | 22 | filterRules <- function(rules, oneSideOnly) { 23 | frules <- rules 24 | for(i in 1:length(oneSideOnly)) { 25 | cond <- paste0(unlist(oneSideOnly[i]), '=') 26 | comb <- expand.grid(cond, cond, stringsAsFactors=F) 27 | for(j in 1:nrow(comb)){ 28 | frules <- subset(frules, subset=(!(lhs %pin% comb[j,1] & (rhs %pin% comb[j,2])))) 29 | } 30 | } 31 | return(frules) 32 | } 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /man/uniglm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{uniglm} 3 | \alias{uniglm} 4 | \title{Univariate glm regression} 5 | \usage{ 6 | uniglm(df, yv, xv, file = NULL, sortby = "aic", xvlab = NULL, test = T) 7 | } 8 | \arguments{ 9 | \item{df}{data.frame with variables for analysis} 10 | 11 | \item{yv}{character string, target variable} 12 | 13 | \item{xv}{character vector, predictor variables to test univariately} 14 | 15 | \item{file}{character string, filepath to write results out to. txt or csv file.} 16 | 17 | \item{sortby}{character string, criteria to sort variables by. Default = 'aic'} 18 | 19 | \item{xvlab}{data.frame, lookup table for variable names. First column is codename, second column is the pretty printed name.} 20 | 21 | \item{test}{\code{TRUE} or \code{FALSE}. Includes Chi square test, or not.} 22 | } 23 | \value{ 24 | data.frame of results 25 | } 26 | \description{ 27 | Runs a univariate logistic regression on each predictor variable of interest. 28 | } 29 | \examples{ 30 | require('Hmisc') 31 | 32 | ##setting up some data 33 | longnames <- data.frame(long = c('Weight', 'Miles Per Gallon', 'Cylinder', 'D.R.A.T', 'Horsepower', 'A.M.'), 34 | short = c('wt', 'mpg', 'cyl', 'drat', 'hp', 'am'), stringsAsFactors=F) 35 | 36 | glm.out <- uniglm(df=mtcars, yv='vs', xv=c('hp','drat','cyl','mpg','wt'), xvlab=longnames) 37 | } 38 | \seealso{ 39 | \code{\link{bestglm}}, \code{\link{leaps}} 40 | } 41 | 42 | -------------------------------------------------------------------------------- /R/removePeriods.R: -------------------------------------------------------------------------------- 1 | #' @title Removes duplicative periods in column names of data.frame 2 | #' 3 | #' @description When reading in a flat file (csv, txt, etc) that was converted from an Excel spreadsheet, 4 | #' column names often have spaces between words. R reads these spaces as periods. If there are multiple spaces or special characters that 5 | #' are not valid in column names, R replaces them with a period. When the data is messy, you sometimes get several consecutive periods 6 | #' or trailing periods at the end of column names. I use this function as a coarse tool to standardize this ugliness. 7 | #' Periods can then be easily replaced (\code{gsub}'ed) with a single character if periods aren't your thing. 
8 | #' 9 | #' @param x character vector of names (e.g. column names) to clean 10 | 11 | #' @return vector of strings without duplicate periods 12 | #' @export 13 | #' @examples 14 | #' ## making some messed up data to fix 15 | #' data(mtcars) 16 | #' names(mtcars)[1] <- paste(names(mtcars)[1], '..', sep='') 17 | #' names(mtcars)[3] <- paste(names(mtcars)[3], '.', sep='') 18 | #' names(mtcars)[4] <- paste(names(mtcars)[3], '..also.known..as.horsepower', sep='') 19 | 20 | #' removePeriods(names(mtcars)) 21 | 22 | removePeriods <- function(x) { 23 | ret <- gsub('\\.+', '\\.', x) 24 | lastE <- sapply(ret, function(z) substr(z,nchar(z),nchar(z))) 25 | ret[which(lastE=='.')] <- sapply(ret[which(lastE=='.')], function(z) substr(z, 0, nchar(z)-1)) 26 | return(unlist(ret)) 27 | } -------------------------------------------------------------------------------- /man/mergeRules2trans.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{mergeRules2trans} 3 | \alias{mergeRules2trans} 4 | \title{Link transactions/observations to association rules} 5 | \usage{ 6 | mergeRules2trans(trans, rhs, lhs) 7 | } 8 | \arguments{ 9 | \item{trans}{list of transactions} 10 | 11 | \item{rhs}{list of RHS of association rules} 12 | 13 | \item{lhs}{list of LHS of association rules} 14 | } 15 | \value{ 16 | matrix tying obs/transactions to association rules 17 | } 18 | \description{ 19 | Creates a matrix of transactions/observations (rows) by association rules (columns). 20 | 1 in the matrix indicates that the observation/transaction is eligible for and follows a given rule (LHS & RHS of rule are in the obs/transaction). 21 | -1 in the matrix indicates that obs/transaction is eligible for, but violates the rule (LHS of rule, but not RHS of rule are in obs/transaction). 22 | 0 in the matrix indicates that the rule does not apply (LHS of rule not in obs/transaction). 
23 | } 24 | \examples{ 25 | library('arules') 26 | library('BBmisc') 27 | data(Adult) 28 | rules <- apriori(Adult, parameter=list(support=0.5, confidence=0.95)) 29 | rdf <- rules2df(rules, list=T) 30 | trdf <- as(Adult, 'data.frame') 31 | trdf$items <- gsub('\\\\{', '', trdf$items) 32 | trdf$items <- gsub('\\\\}', '', trdf$items) 33 | trdf$items2 <- strsplit(as.character(trdf$items), split=',') 34 | M <- mergeRules2trans(trans=trdf$items2[1:1000], rhs=rdf$rhs, lhs=rdf$lhs) 35 | } 36 | 37 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2 (4.0.2): do not edit by hand 2 | 3 | export(addRuleQuality) 4 | export(arulesApp) 5 | export(balPanel) 6 | export(binCat) 7 | export(depthbin) 8 | export(dir2dfList) 9 | export(editForm) 10 | export(filterRules) 11 | export(getEfromVlist) 12 | export(getV1fromVlist) 13 | export(getVfromE) 14 | export(ipipApp) 15 | export(isKey) 16 | export(leafletMapDF) 17 | export(logit2tab) 18 | export(makeForm) 19 | export(makeSQL_createTable) 20 | export(makeVertexAtt) 21 | export(mergeEdgeLists) 22 | export(mergeRules2trans) 23 | export(missdf) 24 | export(optimizeModelWeight) 25 | export(overlap) 26 | export(partialDep) 27 | export(partialDepAll) 28 | export(plotAvgBy) 29 | export(plotBestglm) 30 | export(plotdf) 31 | export(predQuantile) 32 | export(predSortPlot) 33 | export(pruneEdge) 34 | export(quantileFeatureScore) 35 | export(quantileScore) 36 | export(quantile_df) 37 | export(removePeriods) 38 | export(rfImp) 39 | export(roundCut) 40 | export(rules2df) 41 | export(shinyTable) 42 | export(shuffleAnova) 43 | export(smartAgg) 44 | export(tableNet) 45 | export(travCount) 46 | export(travOut) 47 | export(uniglm) 48 | import(BBmisc) 49 | import(Hmisc) 50 | import(MASS) 51 | import(RColorBrewer) 52 | import(arules) 53 | import(arulesViz) 54 | import(bestglm) 55 | import(data.table) 56 | import(fmsb) 57 | import(ggplot2) 58 | import(gplots) 59 | import(htmlwidgets) 60 | import(igraph) 61 | import(leaflet) 62 | import(memoise) 63 | import(randomForest) 64 | import(shiny) 65 | import(vcd) 66 | -------------------------------------------------------------------------------- /man/optimizeModelWeight.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{optimizeModelWeight} 3 | \alias{optimizeModelWeight} 4 | \title{Optimize weight on ensemble of 2 supervised models} 5 | \usage{ 6 | optimizeModelWeight(pred1, pred2, actual, steps = 50, cutoff = 0.5) 7 | } 8 | \arguments{ 9 | \item{pred1}{numeric vector of probabilities, prediction from model 1} 10 | 11 | \item{pred2}{numeric vector of probabilities, prediction from model 2} 12 | 13 | \item{actual}{vector of 1s and 0s. The target variable test data} 14 | 15 | \item{steps}{number, high numbers compute a more exhaustive combination of model weights} 16 | 17 | \item{cutoff}{Cutoff used to demarcate predictions into positive or negative class.} 18 | } 19 | \value{ 20 | data.frame of results 21 | } 22 | \description{ 23 | This function creates a weighted average of predictions from two models 24 | and evaluates F1, precision, recall, auc or c for each combination of the 25 | models to determine the best weights for each. 26 | } 27 | \examples{ 28 | require('Hmisc') 29 | fit_glm1 <- glm(am~cyl, data=mtcars, family=binomial(logit)) 30 | fit_glm2 <- glm(am~disp, data=mtcars, family=binomial(logit)) 31 | ow <- optimizeModelWeight(fit_glm1$fitted.values, fit_glm2$fitted.values, actual=fit_glm1$model$am) 32 | plot(ow$weights, ow$precision, type='l', xlab='weight on model 1') 33 | } 34 | 35 | -------------------------------------------------------------------------------- /man/dir2dfList.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{dir2dfList} 3 | \alias{dir2dfList} 4 | \title{Turn a directory of flat files into a list of data.frames} 5 | \usage{ 6 | dir2dfList(dfdir, ext = ".txt", exclude = NULL, ...) 7 | } 8 | \arguments{ 9 | \item{dfdir}{character string of the directory where you want to load flat files} 10 | 11 | \item{ext}{file extension on the type of files to load. Usually \code{.csv} or \code{.txt}} 12 | 13 | \item{exclude}{character string of table names to be excluded from app. Needs to be specified as \code{NULL} or a character 14 | vector, or else \code{...} arguments will not be handled properly.} 15 | 16 | \item{...}{parameters to pass to \code{\link{read.delim}}. Commonly \code{nrow}, \code{sep}.} 17 | } 18 | \value{ 19 | list of data.frames 20 | } 21 | \description{ 22 | Useful to prepare data for \code{\link{tableNet}} 23 | } 24 | \examples{ 25 | \dontrun{ 26 | ## download some baseball data. NOTE This will download 30MB of data (25 csv files) into a temporary directory 27 | temp <- tempfile() 28 | localDataDir <- paste0(tempdir(), '\\\\lahman2012-csv-onYourComp.zip') 29 | download.file('http://seanlahman.com/files/database/lahman2012-csv.zip', localDataDir) 30 | unzip(localDataDir, exdir=paste0(tempdir(), '\\\\lahman2012-csv-onYourComp')) ## may not be necessary 31 | 32 | ## create a list of data.frames from .CSVs 33 | dfL <- dir2dfList(paste0(tempdir(), '\\\\lahman2012-csv-onYourComp'), ext='.csv', exclude=NULL, sep=',', stringsAsFactors=F) 34 | } 35 | } 36 | \seealso{ 37 | \code{\link{tableNet}} \code{\link{isKey}} 38 | } 39 | 40 | -------------------------------------------------------------------------------- /R/quantileScore.R: -------------------------------------------------------------------------------- 1 | 2 | #' @title Quantile scoring function for continuous or integer valued data 3 | #' 4 | #' @description This function sorts a series from beginning to end. It uses each observation's place in the quantile to assign it a score. 5 | #' useful for transforming variables into the same units (0-1). 
/R/quantileScore.R: -------------------------------------------------------------------------------- 1 | 2 | #' @title Quantile scoring function for continuous or integer valued data 3 | #' 4 | #' @description This function sorts a series from beginning to end. It uses each observation's place in the quantile to assign it a score. 5 | #' Useful for transforming variables into the same units (0-1). Handles NAs. 6 | #' 7 | #' @param x numeric vector to scale 8 | #' @return numeric vector of quantile scores (0 to 1) that correspond in order with the elements of \code{x} 9 | #' @export 10 | #' @examples 11 | #' 12 | #' cbind(quantileScore(mtcars$mpg), mtcars$mpg) 13 | #' cbind(quantileScore(mtcars$cyl), mtcars$cyl) 14 | 15 | quantileScore <- function(x) { 16 | xo <- x 17 | x <- x[is.na(x)==F] 18 | xs <- data.frame(x=sort(x), id=1:length(x), stringsAsFactors=F) 19 | xsd <- xs[duplicated(xs$x)==F,] 20 | xsd$q <- (xsd$id)/length(x) 21 | ret <- xsd$q[match(xo, xsd[,1])] 22 | return(ret) 23 | } 24 | 25 | 26 | #' @title Quantile scoring function for continuous or integer valued data 27 | #' 28 | #' @description This function combines the quantile score with basic min-max feature scaling, 29 | #' useful for getting a stable normalization 30 | #' 31 | #' @param x numeric vector to scale 32 | #' @param wq numeric, weight on the quantile score component 33 | #' @param wf numeric, weight on the min-max scaled component 34 | #' @return numeric vector of blended scores (0 to 1) that correspond in order with the elements of \code{x} 35 | #' @export 36 | #' @examples 37 | #' 38 | #' cbind(quantileFeatureScore(sort(mtcars$mpg)), sort(mtcars$mpg)) 39 | #' cbind(quantileFeatureScore(mtcars$cyl), mtcars$cyl) 40 | 41 | quantileFeatureScore <- function(x, wq=0.5, wf=0.5){ 42 | ret <- wq*quantileScore(x) + wf*(x-min(x, na.rm=T))/(max(x, na.rm=T)-min(x, na.rm=T)) 43 | return(ret) 44 | } 45 | -------------------------------------------------------------------------------- /man/leafletMapDF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{leafletMapDF} 3 | \alias{leafletMapDF} 4 | \title{Leaflet interactive map widget} 5 | \usage{ 6 | leafletMapDF(data, vars = c(lon = "longitude", lat = "latitude")) 7 | } 8 | \arguments{ 9 | \item{data}{data.frame or data.table of data to visualize on map. It must include columns for longitude and latitude coordinates. 10 | Four additional columns are allowed for filtering and coloring the data. Note, the filters are currently categorical only; numeric 11 | ranges are not yet supported. Recommend only passing columns needed for filtering. Default is to pick the first 4} 12 | 13 | \item{vars}{named character vector. The names of the vector should always be 'lon' and 'lat'. The corresponding values 14 | of this vector are the column names of the columns in \code{data} with the lat and lon data.} 15 | } 16 | \value{ 17 | Shiny App 18 | } 19 | \description{ 20 | Launches a Shiny App with an interactive Leaflet map which can be used to visualize coordinate data.
Intended 21 | for quick exploratory geographic analyses 22 | } 23 | \examples{ 24 | \dontrun{ 25 | n <- 5000 26 | df <- data.frame(latitude=runif(n, 35, 40), 27 | longitude=runif(n, -100, -85), 28 | animals=sample(c('dogs', 'cats', 'turtles'), n, replace=T) 29 | ) 30 | df$westness <- cut(df$longitude, breaks=seq(min(df$longitude), max(df$longitude), length.out=10)) 31 | df$northness <- cut(df$latitude, breaks=seq(min(df$latitude), max(df$latitude), length.out=10)) 32 | leafletMapDF(df) 33 | } 34 | } 35 | \seealso{ 36 | \code{leaflet}, \code{shiny} 37 | } 38 | 39 | -------------------------------------------------------------------------------- /man/makeVertexAtt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{makeVertexAtt} 3 | \alias{makeVertexAtt} 4 | \title{merge data.frame attributes to an igraph (network) object} 5 | \usage{ 6 | makeVertexAtt(g, df, vname, by.df, by.g = "name") 7 | } 8 | \arguments{ 9 | \item{g}{igraph object to which you want to add a node/vertex attribute} 10 | 11 | \item{df}{data.frame containing the attribute you want to add to igraph} 12 | 13 | \item{vname}{name of column in data.frame \code{df} that you will merge into igraph} 14 | 15 | \item{by.df}{unique key in data.frame that you will use to merge attribute into igraph} 16 | 17 | \item{by.g}{unique key in igraph that you will use to merge attribute from data.frame} 18 | } 19 | \value{ 20 | a vector of attributes that correspond in order with the nodes in your igraph 21 | } 22 | \description{ 23 | Enriches an igraph object with node/vertex attributes from a data.frame 24 | } 25 | \examples{ 26 | require('igraph') 27 | actors <- data.frame(names=c("Alice", "Bob", "Cecil", "David","Esmeralda"), 28 | age=c(48,33,45,34,21), 29 | gender=c("F","M","F","M","F")) 30 | relations <- data.frame(from=c("Bob", "Cecil", "Cecil", "David", 31 | "David", "Esmeralda"), 32 | to=c("Alice", "Bob", "Alice", "Alice", "Bob", "Alice"), 33 | same.dept=c(FALSE,FALSE,TRUE,FALSE,FALSE,TRUE), 34 | friendship=c(4,5,5,2,1,1), advice=c(4,5,5,4,2,3)) 35 | g <- igraph::graph.edgelist(as.matrix(relations[,c('from', 'to')]), directed=T) 36 | 37 | V(g)$age <- makeVertexAtt(g, df=actors, vname='age', by.df='names', by.g='name') 38 | } 39 | 40 | -------------------------------------------------------------------------------- /R/addRuleQuality.R: -------------------------------------------------------------------------------- 1 | #' @title Add quality measures to association rules 2 | #' @description Adds measures of rule quality (conviction, hyperConfidence, cosine, chiSquare, coverage, doc, gini, hyperlift) to a set of 3 | #' association rules mined from \code{apriori}. Usually used before converting ruleset to data.frame and exporting to some sort of text file. 4 | #' @param trans transaction set (s4 class from arules package) 5 | #' @param rules set of rules from \code{apriori} 6 | #' @param include character vector specifying which quality measures to include. Default is to include everything. 7 | #' @param exclude character vector specifying which quality measures to exclude. Default is to exclude 'improvement' because it seems to be slow. 
8 | #' @return ruleset with additional quality measures 9 | #' @import arules 10 | #' @export 11 | #' @examples 12 | #' library('arules') 13 | #' data("Adult") 14 | #' ar <- apriori(Adult, parameter = list(supp = 0.5, conf = 0.9, target = "rules")) 15 | #' ar <- addRuleQuality(trans=Adult, rules=ar) 16 | #' df <- Rsenal::rules2df(ar) 17 | 18 | addRuleQuality <- function(trans, rules, include=NULL, exclude='improvement') { 19 | allMeasures <- c("support", "confidence", "lift", "conviction", "hyperConfidence", "cosine", "chiSquare", "coverage", "doc", 20 | "gini", "hyperLift", "fishersExactTest", "improvement", "leverage", "oddsRatio", "phi", "RLD") 21 | if(is.null(include)==F) allMeasures <- include 22 | if(is.null(exclude)==F) allMeasures <- setdiff(allMeasures, exclude) 23 | for(i in allMeasures) quality(rules)[i] <- interestMeasure(rules, method=i, transactions=trans) 24 | return(rules) 25 | } 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /R/mergeRules2trans.R: -------------------------------------------------------------------------------- 1 | #' @title Link transactions/observations to association rules 2 | #' 3 | #' @description Creates a matrix of transactions/observations (rows) by association rules (columns). 4 | #' 1 in the matrix indicates that the observation/transaction is eligible for and follows a given rule (LHS & RHS of rule are in the obs/transaction). 5 | #' -1 in the matrix indicates that the obs/transaction is eligible for, but violates, the rule (LHS of rule, but not RHS of rule, is in the obs/transaction). 6 | #' 0 in the matrix indicates that the rule does not apply (LHS of rule not in obs/transaction). 7 | #' 8 | #' @param trans list of transactions 9 | #' @param rhs list of RHS of association rules 10 | #' @param lhs list of LHS of association rules 11 | #' @return matrix tying obs/transactions to association rules 12 | #' @import BBmisc 13 | #' @export 14 | #' @examples 15 | #' library('arules') 16 | #' library('BBmisc') 17 | #' data(Adult) 18 | #' rules <- apriori(Adult, parameter=list(support=0.5, confidence=0.95)) 19 | #' rdf <- rules2df(rules, list=T) 20 | 21 | #' trdf <- as(Adult, 'data.frame') 22 | #' trdf$items <- gsub('\\{', '', trdf$items) 23 | #' trdf$items <- gsub('\\}', '', trdf$items) 24 | #' trdf$items2 <- strsplit(as.character(trdf$items), split=',') 25 | 26 | #' M <- mergeRules2trans(trans=trdf$items2[1:1000], rhs=rdf$rhs, lhs=rdf$lhs) 27 | 28 | mergeRules2trans <- function(trans, rhs, lhs){ 29 | M <- data.frame(matrix(0, nrow=length(trans), ncol=length(lhs))) 30 | for(i in 1:ncol(M)) M[sapply(trans, function(x) isSubset(c(lhs[[i]], rhs[[i]]), x)), i] <- 1 31 | for(i in 1:ncol(M)) M[sapply(trans, function(x) isSubset(lhs[[i]], x) & isSubset(rhs[[i]], x)==F), i] <- -1 32 | return(M) 33 | } 34 | 35 | --------------------------------------------------------------------------------
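Reading the output of the example above (a small sketch; M holds one column per rule):

colSums(M == 1)   # transactions that satisfy each rule (LHS and RHS both present)
colSums(M == -1)  # transactions that violate each rule (LHS present, RHS absent)
rowSums(M != 0)   # how many rules apply to each transaction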
/R/rules2df.R: -------------------------------------------------------------------------------- 1 | #' @title Transform association rule results into data.frame 2 | #' 3 | #' @description Note this function currently only works when the itemsets are of size 1 on the LHS and RHS 4 | #' 5 | #' @param rules list of association rules (S4 arules object). Output of \code{apriori} function. 6 | #' @param list logical \code{TRUE} or \code{FALSE}. Sets LHS items in a list, rather than one character string 7 | #' @return association rules in a data.frame 8 | #' @export 9 | #' @examples 10 | #' library('arules') 11 | #' data("Adult") 12 | #' ar <- apriori(Adult, parameter = list(supp = 0.5, conf = 0.6, target = "rules", minlen=2)) 13 | #' df <- rules2df(ar, list=T) 14 | 15 | rules2df <- function(rules, list=F){ 16 | df <- as(rules, 'data.frame') 17 | df[,1] <- as.character(df[,1]) 18 | df$lhs <- sapply(df[,1], function(x) strsplit(x, split=' => ')[[1]][1]) 19 | df$rhs <- sapply(df[,1], function(x) strsplit(x, split=' => ')[[1]][2]) 20 | df$lhs <- gsub(pattern='\\{', replacement='', x=df$lhs) 21 | df$lhs <- gsub(pattern='}', replacement='', x=df$lhs) 22 | df$rhs <- gsub(pattern='\\{', replacement='', x=df$rhs) 23 | df$rhs <- gsub(pattern='}', replacement='', x=df$rhs) 24 | 25 | if(list==T){ ## recover per-rule item labels by walking the sparse itemMatrix slots (p = column pointers, i = row indices) 26 | p <- rules@lhs@data@p 27 | i <- rules@lhs@data@i+1 28 | lhsItems <- unlist(rules@lhs@itemInfo@.Data) 29 | lhsL <- list() 30 | for(j in 2:length(p)) lhsL[[j-1]] <- lhsItems[i[(p[j-1]+1):(p[j])]] 31 | df$lhs <- lhsL 32 | 33 | p <- rules@rhs@data@p 34 | i <- rules@rhs@data@i+1 35 | rhsItems <- unlist(rules@rhs@itemInfo@.Data) 36 | rhsL <- list() 37 | for(j in 2:length(p)) rhsL[[j-1]] <- rhsItems[i[(p[j-1]+1):(p[j])]] 38 | df$rhs <- rhsL 39 | } 40 | return(df) 41 | } 42 | --------------------------------------------------------------------------------
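Despite the size-1 note above, the list=TRUE path walks the sparse item slots and keeps multi-item sides intact; a short sketch:

library('arules')
data("Adult")
ar <- apriori(Adult, parameter = list(supp = 0.5, conf = 0.6, target = "rules", minlen=2))
rdf <- rules2df(ar, list=TRUE)
rdf$lhs[[1]]  # a character vector with one element per LHS item of rule 1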
/man/isKey.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{isKey} 3 | \alias{isKey} 4 | \title{Determine strength of linking variables} 5 | \usage{ 6 | isKey(dfL, xvar) 7 | } 8 | \arguments{ 9 | \item{dfL}{list of data.frames. easily generated from \code{\link{dir2dfList}}} 10 | 11 | \item{xvar}{character string, name of the variable to calculate strength for across all tables in \code{dfL}} 12 | } 13 | \value{ 14 | list of data.frames 15 | } 16 | \description{ 17 | This function computes the percentage of unique values of a column \code{x} from \code{table1} that appear 18 | in \code{table2}. It is called and computed on the fly in \code{\link{tableNet}}. However, these computations can be 19 | slow on large datasets, so it is provided as a standalone function that can be run once to store the output, which can then be fed into the 20 | \code{\link{tableNet}} app to prevent repetitive slow computations on the fly. 21 | } 22 | \examples{ 23 | \dontrun{ 24 | ## download some baseball data. NOTE This will download 30MB of data (25 csv files) into a temporary directory 25 | temp <- tempfile() 26 | localDataDir <- paste0(tempdir(), '\\\\lahman2012-csv-onYourComp.zip') 27 | download.file('http://seanlahman.com/files/database/lahman2012-csv.zip', localDataDir) 28 | unzip(localDataDir, exdir=paste0(tempdir(), '\\\\lahman2012-csv-onYourComp')) ## may not be necessary 29 | 30 | ## create a list of data.frames from .CSVs 31 | dfL <- dir2dfList(paste0(tempdir(), '\\\\lahman2012-csv-onYourComp'), ext='.csv', exclude=NULL, sep=',', stringsAsFactors=F) 32 | isKey(dfL, 'playerID') 33 | } 34 | } 35 | \seealso{ 36 | \code{\link{tableNet}} \code{\link{dir2dfList}} 37 | } 38 | 39 | -------------------------------------------------------------------------------- /R/shiny_table.R: -------------------------------------------------------------------------------- 1 | #' @title Shiny app to visualize dataframe as simple interactive datatable 2 | #' @description Launches a basic Shiny App that renders the given dataframe into an interactive datatable using \code{renderDataTable} 3 | #' @param df dataframe to be visualized 4 | #' @return Shiny App 5 | #' @import shiny 6 | #' @export 7 | #' @examples 8 | #' \dontrun{ 9 | #' shinyTable(mtcars) 10 | #' } 11 | 12 | 13 | shinyTable <- function(df) { 14 | shinyApp( 15 | shinyUI(fluidPage( 16 | h2(paste0(deparse(substitute(df)))), 17 | sidebarLayout(sidebarPanel( 18 | textInput('filter', 'Filter rows using R logical operators'), 19 | uiOutput('querystatus'), 20 | br(), 21 | br(), 22 | checkboxGroupInput('show_vars', 'Columns to show', names(df), selected=names(df)) 23 | ), 24 | mainPanel( 25 | dataTableOutput('mytable') 26 | ) 27 | ) 28 | )), 29 | server = function(input, output) { 30 | output$mytable = renderDataTable({ 31 | df1 <- df[,input$show_vars, drop=F] 32 | df2 <- try(eval(parse(text=paste0('subset(df1,', input$filter, ')')))) 33 | if(class(df2)!='try-error') {dfp <- df2 34 | output$querystatus <- renderText({'query successful :)'}) 35 | } else {dfp <- df1 36 | output$querystatus <- renderText({'query unsuccessful :( (showing full dataset)'})
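          # the filter box is evaluated as subset(df1, <input>), so expressions like mpg > 20 & cyl == 4 work;
          # when the expression fails to parse or evaluate, the try() above errors out and the app falls back to the unfiltered data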
37 | } 38 | dfp}, 39 | options=list(lengthMenu=list(c(25,50,100,500,1000,5000,-1), c('25', '50', '100', '500', '1000', '5000', 'All')), 40 | pageLength=500) 41 | ) 42 | } 43 | ) 44 | } 45 | -------------------------------------------------------------------------------- /R/makeSQL_createTable.R: -------------------------------------------------------------------------------- 1 | #' generate SQL script to create table in SQL database from data.frame 2 | #' 3 | #' Creates a SQL query that will create a table in a SQL database from an R dataframe. 4 | #' It initializes with variable types (naively), but prevents the user from manually writing the query, 5 | #' which can be especially cumbersome when there are many columns. 6 | #' Initialized with basic SQL Server variable types. 7 | #' 8 | #' @param df data.frame we will generate a SQL create table script for 9 | #' @param tablename (optional) name of the table we want to create in a sql database 10 | #' @param saveQuery (optional) name and filepath of the .sql script that will create the table. 11 | #' @param wordy (optional) prints the sql query out to the console. 12 | #' @return sql query which is the create table script 13 | #' @export 14 | #' @examples 15 | #' makeSQL_createTable(mtcars) 16 | #' \dontrun{ 17 | #' makeSQL_createTable(mtcars, saveQuery='mySQL_query.sql') 18 | #' } 19 | 20 | 21 | makeSQL_createTable <- function(df, tablename=deparse(substitute(df)), saveQuery=NULL, wordy=T) { 22 | 23 | types <- list( 24 | 'numeric'='decimal', 25 | 'integer'='int', 26 | 'character'='varchar(40)', 27 | 'logical'='varchar(40)', 28 | 'factor'='varchar(40)', 29 | 'POSIXct'='datetime' 30 | ) 31 | 32 | sql <- paste('create table ', tablename, '\n', '(', sep='') 33 | for(i in 1:ncol(df)) { 34 | k <- names(df)[i] 35 | vclass <- ifelse(class(df[,k]) %in% names(types), types[[class(df[,k])]], 'varchar(40)') 36 | if(i < ncol(df)) sql <- paste(sql, '\n', k, ' ', vclass, ',', sep='') else sql <- paste(sql, '\n', k, ' ', vclass, '\n)', sep='') 37 | } 38 | if(wordy==T) cat(sql) 39 | if(is.null(saveQuery)==F) writeLines(sql, saveQuery) 40 | return(sql) 41 | } -------------------------------------------------------------------------------- /man/binCat.Rd: -------------------------------------------------------------------------------- 39 | d[d>26] <- sample(1:26, length(d[d>26]), replace=T) 40 | dl <- letters[d] 41 | barplot(table(dl)) 42 | table(binCat(dl, results=F, ncat=5)) 43 | table(binCat(dl, results=F, maxp=0.5)) 44 | table(binCat(dl, results=F, maxp=0.9)) 45 | 46 | ## With missings 47 | ff <- sample(letters[1:15], 100, replace=T) 48 | ff[sample(100, 10)] <- NA 49 | binCat(ff, ncat=7, setNA='missing') 50 | } 51 | 52 | --------------------------------------------------------------------------------
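Only the tail of binCat.Rd survives above (its preamble was lost in extraction). A hedged reading of the surviving calls, assuming binCat pools a categorical vector's infrequent levels, with ncat capping the number of categories kept, maxp a cumulative-proportion cutoff, and setNA a label for missing values:

ff <- sample(letters[1:15], 100, replace=TRUE)
ff[sample(100, 10)] <- NA
table(binCat(ff, results=FALSE, ncat=7, setNA='missing'))  # presumably 7 levels plus a 'missing' bucket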
/R/mergeEdgeLists.R: -------------------------------------------------------------------------------- 1 | #' @title Merge edgelists => Master edgelist => igraph 2 | #' 3 | #' @description This function turns a list of edgelists (data.frames) into a single "master" edgelist. 4 | #' It maintains the edge attributes of each individual edge list. 5 | #' 6 | #' @param edgeLists list of edgelists (stored as data.frames) 7 | #' @param from name of the column that specifies the "from" node in each data.frame in \code{edgeLists} 8 | #' @param to name of the column that specifies the "to" node in each data.frame in \code{edgeLists} 9 | #' @param keepDups when set to \code{FALSE}, drops the per-edgelist duplicates of a shared attribute column after they are merged into one column 10 | #' @export 11 | #' @examples 12 | #' require('igraph') 13 | #' cars <- data.frame(mtcars, to=sample(rownames(mtcars), replace=T), from=sample(rownames(mtcars), replace=T)) 14 | #' df1<-cars[sample(1:32,30), c('to', 'from', 'cyl', 'mpg')] 15 | #' df2<-cars[sample(1:32,15), c('to', 'from', 'cyl', 'qsec')] 16 | #' df3<-cars[sample(1:32,32), c('to', 'from', 'hp', 'drat')] 17 | #' 18 | #' df1$cyl[1:10] <- df1$cyl[1:10]+2 19 | #' el <- list(df1, df2, df3) 20 | #' 21 | #' mel <- mergeEdgeLists(el, from='from', to='to', keepDups=T) 22 | 23 | mergeEdgeLists <- function(edgeLists, from='from.ID', to='to.ID', keepDups=F){ 24 | n <- length(edgeLists) 25 | 26 | mel <- edgeLists[[1]] 27 | for(i in 2:n){ 28 | mel <- merge(mel, edgeLists[[i]], by=c(to, from), all=T, suffixes=c(paste('.',i-1, sep=''), paste('.',i, sep=''))) 29 | dupVars <- intersect(setdiff(names(edgeLists[[i]]), c(from, to)), setdiff(names(edgeLists[[i-1]]), c(from, to))) 30 | 31 | ## if we have to merge columns with repeat names 32 | for(d in dupVars){ 33 | print(paste('merging columns with same name in multiple edgelists: ',d)) 34 | dupVarName1 <- paste(d, '.',i-1, sep='') 35 | dupVarName2 <- paste(d, '.',i, sep='') 36 | mel[, d] <- mel[,dupVarName1] 37 | mel[is.na(mel[,d]), d] <- mel[is.na(mel[,d]), dupVarName2] 38 | nonmissingRows <- is.na(mel[,dupVarName1])==F & is.na(mel[,dupVarName2])==F
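        ## nonmissingRows flags edges where both edgelists supply the shared attribute;
        ## if those supplied values disagree, the mismatch warning below fires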
39 | if(sum(mel[nonmissingRows, dupVarName1]!=mel[nonmissingRows, dupVarName2])>0) print(paste('Warning!!! Some values of ', dupVarName1, ' and ', dupVarName2, ' do not match.', sep='')) 40 | if(keepDups==F) mel <- mel[,setdiff(names(mel), c(dupVarName1, dupVarName2))] 41 | } 42 | } 43 | return(mel) 44 | } 45 | -------------------------------------------------------------------------------- /R/makeVertexAtt.R: -------------------------------------------------------------------------------- 1 | #' @title merge data.frame attributes to an igraph (network) object 2 | #' 3 | #' @description Enriches an igraph object with node/vertex attributes from a data.frame 4 | #' 5 | #' @param g igraph object to which you want to add a node/vertex attribute 6 | #' @param df data.frame containing the attribute you want to add to igraph 7 | #' @param vname name of column in data.frame \code{df} that you will merge into igraph 8 | #' @param by.df unique key in data.frame that you will use to merge attribute into igraph 9 | #' @param by.g unique key in igraph that you will use to merge attribute from data.frame 10 | #' @return a vector of attributes that correspond in order with the nodes in your igraph 11 | #' @import igraph 12 | #' @export 13 | #' @examples 14 | #' require('igraph') 15 | #' actors <- data.frame(names=c("Alice", "Bob", "Cecil", "David","Esmeralda"), 16 | #' age=c(48,33,45,34,21), 17 | #' gender=c("F","M","F","M","F")) 18 | #' relations <- data.frame(from=c("Bob", "Cecil", "Cecil", "David", 19 | #' "David", "Esmeralda"), 20 | #' to=c("Alice", "Bob", "Alice", "Alice", "Bob", "Alice"), 21 | #' same.dept=c(FALSE,FALSE,TRUE,FALSE,FALSE,TRUE), 22 | #' friendship=c(4,5,5,2,1,1), advice=c(4,5,5,4,2,3)) 23 | #' g <- igraph::graph.edgelist(as.matrix(relations[,c('from', 'to')]), directed=T) 24 | #' 25 | #' V(g)$age <- makeVertexAtt(g, df=actors, vname='age', by.df='names', by.g='name') 26 | 27 | 28 | makeVertexAtt <- function(g, df, vname, by.df, by.g='name') { 29 | gdf <- data.frame(id=igraph::get.vertex.attribute(g, by.g), stringsAsFactors=F) 30 | mdf <- merge(gdf, df[,c(by.df, vname)], by.x='id', by.y=by.df, all.x=T, all.y=F) 31 | check <- igraph::get.vertex.attribute(g, by.g) 32 | mdf <- mdf[match(check, mdf$id),] 33 | 34 | ## checking that order of assets is preserved 35 | mdf$check01 <- check == mdf$id 36 | if(sum(check!=mdf$id)>0) cat(paste('Warning: Order of Network Nodes might be misaligned!!! \n', 37 | sum(mdf$check01==F), ' out of ', nrow(mdf), ' nodes are misaligned', sep='')) 38 | if(sum(is.na(mdf[,vname]))>0) print(paste('Warning: ', sum(is.na(mdf[,vname])), ' NAs out of ', nrow(mdf), ' node attributes in graph.', sep='')) 39 | 40 | return(mdf[,vname]) 41 | } 42 | 43 | 44 | --------------------------------------------------------------------------------
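Nodes absent from the data.frame simply come back NA (with a printed warning); a sketch continuing the actors example above:

V(g)$age2 <- makeVertexAtt(g, df=actors[1:3,], vname='age', by.df='names', by.g='name')
sum(is.na(V(g)$age2))  # 2: the actors dropped from df get NA attributes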
/man/partialDep.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{partialDep} 3 | \alias{partialDep} 4 | \title{Partial Dependence (single variable)} 5 | \usage{ 6 | partialDep(model, df, xvar, n = 10, target.class = "1", ci = c(0.9, 0.5, 7 | 0.3)) 8 | } 9 | \arguments{ 10 | \item{model}{model object used to generate predictions. Currently only built and tested for random forest.} 11 | 12 | \item{df}{data.frame or data.table used to generate predictions with \code{model}} 13 | 14 | \item{xvar}{character of length one; the x variable in \code{df} to assess for partial dependence with the response variable from \code{model}.} 15 | 16 | \item{n}{numeric of length one; number of values between the min and max of \code{xvar} to score the model. 17 | Note: this number is also how many replicates of \code{df} must be created and stored in memory. Default is 10.} 18 | 19 | \item{target.class}{character: Which category (class) of the target variable to use for predictions} 20 | 21 | \item{ci}{numeric: specify any confidence intervals around the median response.} 22 | } 23 | \value{ 24 | data.table of output. \code{cnt} refers to how many obs from \code{df} are within the fixed-width bin specified by \code{xvar}. 25 | } 26 | \description{ 27 | Calculate the partial dependence of a predictor variable on the response variable from a random forest classification model. 28 | Rather than sequence through values of the predictor variable of interest and keep the other predictors at their median, this partial dependence 29 | technique creates replicates of the entire dataset for each level of the x variable of interest from its min to max. This gives a more 30 | realistic idea of the magnitude and direction of the x variable on the response. 31 | } 32 | \examples{ 33 | library('randomForest') 34 | library('data.table') 35 | DF <- mtcars 36 | DF$vs <- factor(DF$vs) 37 | rf <- randomForest(vs~mpg+cyl+drat+qsec+disp+gear+carb+hp, DF, ntrees=100) 38 | pd <- partialDep(model=rf, df=DF, xvar='mpg') 39 | pd[ci==0.5,] # median of response when sequenced through 'mpg' 40 | 41 | ## Plotting 42 | plot(pd[cilev==0, xvar], pd[cilev==0, pred], type='l', ylim=c(0,1)) 43 | lines(pd[ci==.95, xvar], pd[ci==.95, pred], type='l', col='red') 44 | lines(pd[ci==.05, xvar], pd[ci==.05, pred], type='l', col='green') 45 | } 46 | \seealso{ 47 | \code{\link{partialDepAll}} 48 | } 49 | 50 | -------------------------------------------------------------------------------- /man/partialDepAll.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{partialDepAll} 3 | \alias{partialDepAll} 4 | \title{Partial Dependence Spark Lines (multiple variables)} 5 | \usage{ 6 | partialDepAll(model, df, n = 10, xvars = NULL, target.class = "1", 7 | ci = c(0.75), plot = T, data = T, plot.yaxis.fixed = T, wordy = T) 8 | } 9 | \arguments{ 10 | \item{model}{model object used to generate predictions. Currently only built and tested for random forest.} 11 | 12 | \item{df}{data.frame or data.table used to generate predictions with \code{model}} 13 | 14 | \item{n}{numeric of length one; number of values between the min and max of \code{xvar} to score the model. 15 | Note: this number is also how many replicates of \code{df} must be created and stored in memory. Default is 10.} 16 | 17 | \item{xvars}{character vector; the x variables in \code{df} to assess for partial dependence with the response variable from \code{model}. Defaults to choosing all variables from model.} 18 | 19 | \item{target.class}{character: Which category (class) of the target variable to use for predictions} 20 | 21 | \item{ci}{numeric: specify any confidence intervals around the median response.} 22 | 23 | \item{plot}{logical: plot sparklines (\code{TRUE}), or no (\code{FALSE})} 24 | 25 | \item{data}{logical: return summary table of output. yes (\code{TRUE}), or no (\code{FALSE})} 26 | 27 | \item{plot.yaxis.fixed}{logical: \code{TRUE} uses same yaxis for all x variables (biggest max and smallest min across all variables).
\code{FALSE} uses variable-specific axes.} 28 | 29 | \item{wordy}{logical: print progress to the console} 30 | } 31 | \value{ 32 | list of output and plot 33 | } 34 | \description{ 35 | Calculate the partial dependence of a predictor variable on the response variable from a random forest classification model. 36 | Rather than sequence through values of the predictor variable of interest and keep the other predictors at their median, this partial dependence 37 | technique creates replicates of the entire dataset for each level of the x variable of interest from its min to max. This gives a more 38 | realistic idea of the magnitude and direction of the x variable on the response. 39 | } 40 | \examples{ 41 | library('randomForest') 42 | library('data.table') 43 | library('ggplot2') 44 | DF <- mtcars 45 | DF$vs <- factor(DF$vs) 46 | rf <- randomForest(vs~mpg+cyl+drat+qsec+disp+gear+carb+hp, DF, ntrees=100) 47 | pda <- partialDepAll(model=rf, df=DF, n=10) 48 | } 49 | \seealso{ 50 | \code{\link{partialDep}} 51 | } 52 | 53 | -------------------------------------------------------------------------------- /man/arulesApp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{arulesApp} 3 | \alias{arulesApp} 4 | \title{Association Rules Visualization Shiny App} 5 | \usage{ 6 | arulesApp(dataset, bin = T, vars = 5, supp = 0.1, conf = 0.5) 7 | } 8 | \arguments{ 9 | \item{dataset}{data.frame, this is the dataset that association rules will be mined from. Each row is treated as a transaction. Seems to work 10 | OK when the S4 transactions class from \code{arules} is used, however this is not thoroughly tested.} 11 | 12 | \item{bin}{logical, \code{TRUE} will automatically discretize/bin numerical data into categorical features that can be used for association analysis.} 13 | 14 | \item{vars}{integer, how many variables to include in initial rule mining} 15 | 16 | \item{supp}{numeric, the support parameter for initializing visualization. Useful when it is known that a high support is needed to not crash computationally.} 17 | 18 | \item{conf}{numeric, the confidence parameter for initializing visualization. Similarly useful when it is known that a high confidence is needed to not crash computationally.} 19 | } 20 | \value{ 21 | Shiny App 22 | } 23 | \description{ 24 | Launches a Shiny App that provides an interactive interface to the visualizations of the \code{arulesViz} package. 25 | The app allows users to mine rules based on all or just subsets of features, sort by criteria (lift, support, confidence) and visualize 26 | using network graph, grouped bubble and scatter plots. \cr 27 | Users filter rules to target only those with a certain variable on the RHS or LHS of the rule. 28 | Rule mining is computed using the \link{apriori} algorithm from \code{arules}.
29 | } 30 | \examples{ 31 | ## creating some data 32 | n <- 10000 # of obs 33 | d <- data.frame( 34 | eye = sample(c('brown', 'green', 'blue', 'hazel'), n, replace=T), 35 | gender = sample(c('male', 'female'), n, replace=T), 36 | height = sort(sample(c('dwarf', 'short', 'average', 'above average', 'giant'), n, replace=T)), 37 | wealth = sort(sample(c('poor', 'struggling', 'middle', 'uppermiddle', 'comfortable', 'rich', '1\%', 'millionaire', 'billionaire'), n, replace=T)), 38 | favoriteAnimal = sample(c('dog', 'cat', 'bat', 'frog', 'lion', 'cheetah', 'lion', 'walrus', 'squirrel'), n, replace=T), 39 | numkids = abs(round(rnorm(n, 2, 1))) 40 | ) 41 | 42 | ## adding some pattern 43 | d$numkids[d$gender=='male'] <- d$numkids[d$gender=='male'] + sample(0:3, sum(d$gender=='male'), replace=T) 44 | d$numkids <- factor(d$numkids) 45 | 46 | ## calling Shiny App to visualize association rules 47 | arulesApp(d) 48 | } 49 | \seealso{ 50 | \code{arulesViz}, \code{arules} 51 | } 52 | 53 | -------------------------------------------------------------------------------- /R/plotAvgBy.R: -------------------------------------------------------------------------------- 1 | #' @title Plot average of an indicator over bins/categories of another continuous variable 2 | #' 3 | #' @description Useful for assessing the relationship between two variables of interest. There are cases, especially when 4 | #' outliers are involved, or many obs, that a scatterplot can be difficult to read. This function bins up one of the continuous 5 | #' variables that would be used in a scatterplot and calculates the mean (or other function) of the continuous variable over 6 | #' a range (discretized into categories) of the second continuous variable. It uses an equal depth binning algorithm 7 | #' to compute these bins on the \code{by} variable. 8 | #' 9 | #' It can be used to assess the average of a binary target variable/prediction 10 | #' over a range of levels of a continuous or categorical variable. 11 | #' 12 | #' @param indv vector. This is the variable whose mean will be calculated over the categories of the \code{byv} vector binned up 13 | #' @param byv vector. This is the variable to be binned up, by which the \code{indv} variable will be averaged 14 | #' @param nbins numeric. Number of bins to create when discretizing \code{byv}. Passed to \code{depthbin} function. 15 | #' @param data logical. TRUE returns the aggregated data.table. FALSE returns nothing. FALSE is default. 16 | #' @param plotNbin logical. TRUE plots the count of obs in each bin on top of each bar. TRUE is default. 17 | #' @param ... additional barplot arguments.
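#' For the binary-target use mentioned above, something along these lines (a sketch) plots the share of
#' manual transmissions across equal-depth bins of horsepower:
#' plotAvgBy(mtcars[,'am'], mtcars[,'hp'], nbins=4)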
18 | 19 | #' @return prints a barplot unless data==T, in which case the aggregated data.table is returned 20 | #' @import data.table 21 | #' @export 22 | #' @examples 23 | #' 24 | #' plotAvgBy(mtcars[,'mpg'], mtcars[,'drat'], nbins=8) 25 | #' plotAvgBy(mtcars[,'mpg'], mtcars[,'drat'], nbins=5, plotNbin=F) 26 | #' plotAvgBy(mtcars[,'mpg'], mtcars[,'drat'], nbins=5, plotNbin=F, data=T) 27 | #' 28 | #' ## Example with missing data 29 | #' df <- mtcars 30 | #' df$mpg[sample(1:nrow(mtcars), 5)] <- NA 31 | #' df$drat[sample(1:nrow(mtcars), 5)] <- NA 32 | #' 33 | #' plotAvgBy(df[,'mpg'], df[,'drat'], nbins=5, plotNbin=F, data=T) 34 | 35 | plotAvgBy <- function(indv, byv, nbins=5, data=F, plotNbin=T, ...){ 36 | dt <- data.table(indv, byv) 37 | warning(sprintf('removing %s observations where indv is NA', sum(is.na(dt$indv)))) 38 | dt <- dt[is.na(indv)==F,] 39 | tmp <- dt[, .(N=.N, hits=sum(as.numeric(as.character(indv)))), keyby=depthbin(byv, nbins=nbins)][,hitrate:=hits/N] 40 | plt <- tmp[,barplot(hitrate, names=depthbin,las=2, col='navy', ...)] 41 | if(plotNbin==T) text(plt, tmp$hitrate-abs(diff(range(tmp$hitrate)))/20, labels=tmp$N, col='white') 42 | if(data==T) return(tmp) 43 | } 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /R/overlap.R: -------------------------------------------------------------------------------- 1 | #' @title Assess overlap in vectors 2 | #' @description Useful for learning the overlap of absolute and unique values between two vectors of the same type. They 3 | #' do not have to be the same length, and are allowed to contain NAs. 4 | #' @param x1 vector of same class as \code{x2} 5 | #' @param x2 vector of same class as \code{x1} 6 | #' @param na.rm logical, remove NAs from analysis 7 | #' @param wordy logical, prints convenience information 8 | #' @param x1name optional character, display name for \code{x1} in the output row labels 9 | #' @param x2name optional character, display name for \code{x2} in the output row labels 10 | #' @return summary data.frame 11 | #' @export 12 | #' @examples 13 | #' x1 <- sample(1:50, 40, replace=T) 14 | #' x2 <- sample(c(1:60, rep(NA, 30)), 55, replace=T) 15 | #' overlap(x1, x2, na.rm=T) 16 | #' overlap(x1, x2) 17 | #' overlap(mtcars$gear, mtcars$cyl, x1name='gear', x2name='cyl') 18 | 19 | overlap <- function(x1, x2, na.rm=F, wordy=T, x1name=NULL, x2name=NULL) { 20 | 21 | # printing which variable is which. 22 | if(wordy==T) { 23 | cat(paste0('x1 is ', deparse(substitute(x1)))) 24 | cat('\n') 25 | cat(paste0('x2 is ', deparse(substitute(x2)))) 26 | cat('\n\n') 27 | } 28 | 29 | # checking for class mismatch 30 | if(class(x1) != class(x2)) { 31 | warning('classes of x1 and x2 are not equal. Coercing both to character')
32 | if(class(x1)!='character') x1 <- as.character(x1) 33 | if(class(x2)!='character') x2 <- as.character(x2) 34 | } 35 | 36 | # adding optional vector names 37 | x1name <- ifelse(is.null(x1name), 'x1', x1name) 38 | x2name <- ifelse(is.null(x2name), 'x2', x2name) 39 | 40 | # dealing w NAs 41 | if(na.rm==T){ 42 | x1 <- x1[is.na(x1)==F] 43 | x2 <- x2[is.na(x2)==F] 44 | } 45 | 46 | # creating statistics to return 47 | ret <- list() 48 | ret[[sprintf('%s exist in %s', x1name, x2name)]] <- c( 49 | length(x1), 50 | length(x2), 51 | sum(x1 %in% x2), 52 | sum(x1 %in% x2)/length(x1) 53 | ) 54 | ret[[sprintf('%s exist in %s', x2name, x1name)]] <- c( 55 | length(x2), 56 | length(x1), 57 | sum(x2 %in% x1), 58 | sum(x2 %in% x1)/length(x2) 59 | ) 60 | ret[[sprintf('unique(%s) exist in unique(%s)', x1name, x2name)]] <- c( 61 | length(unique(x1)), 62 | length(unique(x2)), 63 | sum(unique(x1) %in% unique(x2)), 64 | sum(unique(x1) %in% unique(x2))/length(unique(x1)) 65 | ) 66 | ret[[sprintf('unique(%s) exist in unique(%s)', x2name, x1name)]] <- c( 67 | length(unique(x2)), 68 | length(unique(x1)), 69 | sum(unique(x2) %in% unique(x1)), 70 | sum(unique(x2) %in% unique(x1))/length(unique(x2)) 71 | ) 72 | 73 | 74 | df <- t(data.frame(ret)) 75 | row.names(df) <- names(ret) 76 | colnames(df) <- c('LHS_count', 'RHS_count', 'count', 'percent') 77 | 78 | return(df) 79 | } 80 | 81 | 82 | -------------------------------------------------------------------------------- /man/smartAgg.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.2): do not edit by hand 2 | \name{smartAgg} 3 | \alias{smartAgg} 4 | \title{Aggregate multiple columns using different functions harnessing data.table efficiency} 5 | \usage{ 6 | smartAgg(df, by, ..., catN = T, printAgg = F) 7 | } 8 | \arguments{ 9 | \item{df}{input data.frame to be aggregated} 10 | 11 | \item{by}{variable name of data.frame \code{df} to aggregate on. Same as \code{by} in base R \code{aggregate} function} 12 | 13 | \item{...}{method to identify the variables to aggregate and the functions used to do so. 14 | Specify the function first as a string argument and then a vector of the column names to aggregate using that function. 15 | You can specify as many different functions as necessary, but every function must be followed by a vector of column names.} 16 | 17 | \item{catN}{adds a column named "countPerBin" with the # of observations aggregated in each row of the output data.frame.} 18 | 19 | \item{printAgg}{prints the line of data.table code used to perform the aggregation.} 20 | } 21 | \value{ 22 | aggregated data.frame with columns corresponding to the grouping variables in by followed by aggregated columns from df. 23 | } 24 | \description{ 25 | aggregates columns of a data.frame on 1 or multiple dimensions using 1 or multiple functions for different columns. 26 | It is equivalent to the base R \code{aggregate} function, except that it allows the user to 27 | aggregate sets of columns (referred to by name or column #) with different functions... and it's fast!
28 | } 29 | \examples{ 30 | require('data.table') 31 | 32 | ## establishing variables to aggregate on 33 | lengthVs <- c('Sepal.Length', 'Petal.Length') 34 | widthVs <- c('Sepal.Width', 'Petal.Width') 35 | 36 | ## aggregating using 2 different functions and identifying columns to aggregate by variable names 37 | irisAgg1 <- smartAgg(df=iris, by='Species', 'mean', lengthVs, 'sum', widthVs) 38 | 39 | ## aggregating using 2 dimensions ("Species" and "randthing") 40 | iris$randthing <- as.character(sample(1:5, nrow(iris), replace=T)) 41 | irisAgg2 <- smartAgg(df=iris, by=c('Species', 'randthing'), 'mean', lengthVs, 'sum', widthVs, catN=T, printAgg=T) 42 | 43 | ## aggregating variables by column number 44 | irisAgg3 <- smartAgg(df=iris, by=c('Species', 'randthing'), 'mean', 1:2, 'sum', 3:4, catN=T, printAgg=T) 45 | 46 | ## use anonymous functions 47 | data(mtcars) 48 | smartAgg(mtcars, by='cyl', function(x) sum(x*100), c('drat', 'mpg', 'disp')) 49 | 50 | ## use anonymous functions with more than 1 argument. Uses the provided variables for all unassigned arguments in anonymous function 51 | smartAgg(mtcars, by='cyl', function(x,y='carb') sum(x*y), c('drat', 'mpg', 'disp')) 52 | with(mtcars[mtcars$cyl==6,], c(sum(drat*carb), sum(mpg*carb), sum(disp*carb))) 53 | 54 | ## with anonymous functions with more than 1 argument. 55 | ## Example of possible unintended behavior - the user-provided variable is used for both x and y in this example. 56 | smartAgg(mtcars, by='cyl', function(x,y) sum(x*y), c('drat', 'mpg', 'disp')) 57 | with(mtcars[mtcars$cyl==6,], c(sum(drat*drat), sum(mpg*mpg), sum(disp*disp))) 58 | 59 | ## demonstrating speed gain of smartAgg using data.table over aggregate 60 | n <- 300000 61 | df <- data.frame(x1=rnorm(n), x2=rbinom(n,5,0.5), x3=sample(letters, n, replace=T)) 62 | system.time(aggFast <- smartAgg(df, by='x3', 'mean', c('x1', 'x2'))) 63 | system.time(aggSlow <- aggregate(df[,c('x2', 'x1')], by=list(df$x3), FUN='mean')) 64 | } 65 | 66 | -------------------------------------------------------------------------------- /R/plotdf.R: -------------------------------------------------------------------------------- 1 | 2 | #' @title Plot data.frame to PDF 3 | #' @description Plots every column of a data.frame as an individual plot (one plot per page) 4 | #' in a PDF file. 5 | #' 6 | #' @param df data.frame to plot. 7 | #' @param file string - name of the PDF that will be created 8 | #' @param wordy - boolean \code{TRUE} or \code{FALSE}. Sequentially prints status of each chart to the console. 9 | #' Could be useful for large data.frames. 10 | #' 11 | #' @return PDF file of plots 12 | #' @export 13 | #' 14 | #' @examples 15 | #' \dontrun{ 16 | #' plotdf(df=mtcars, file='mtcars_plots.pdf') 17 | #' } 18 | 19 | plotdf <- function(df, file='output.pdf', wordy=F){ 20 | pdf(file) 21 | for(i in 1:ncol(df)){ 22 | if(wordy==T) print(i) 23 | if((class(df[,i]) %in% c('numeric', 'integer')) & length(unique(df[,i]))>15) { plotNum(df[,i], vn=names(df)[i]) 24 | } else { plotChar(df[,i], vn=names(df)[i])} 25 | } 26 | graphics.off() 27 | print(paste0('charts saved in ', getwd(), '/', file)) 28 | } 29 | 30 | 31 | ################################################################## 32 | ## Helper functions for plotdf (not included in Rsenal package) 33 | ################################################################## 34 | 35 | plotNum <- function(x, ...)
{ 36 | par(mfrow=c(3, 2)) 37 | par(oma=c(1,2,2,2)) 38 | layout(matrix(c(1,1,2,2,2,2), 3, 2, byrow=T), widths=c(1,1), heights=c(1,1,1)) 39 | ptitle <- ifelse(is.null(list(...)$vn), '', list(...)$vn) 40 | 41 | p1<-plot(0:1, 0:1, col='white', yaxt='n', ylab = '', xaxt='n', xlab='', main=ptitle) 42 | text(x=0.2, y=0.8, label='type:', pos=4, font=2) 43 | text(x=0.2, y=0.7, label='# of unique values:', pos=4, font=2) 44 | text(x=0.2, y=0.6, label='% NA:', pos=4, font=2) 45 | text(x=0.2, y=0.5, label='min:', pos=4, font=2) 46 | text(x=0.2, y=0.4, label='median:', pos=4, font=2) 47 | text(x=0.2, y=0.3, label='mean:', pos=4, font=2) 48 | text(x=0.2, y=0.2, label='max:', pos=4, font=2) 49 | 50 | text(x=0.4, y=0.8, label=class(x), pos=4) 51 | text(x=0.4, y=0.7, label=length(unique(x)), pos=4) 52 | text(x=0.4, y=0.6, label=paste0(round(sum(is.na(x))/length(x)*100, 4), '%'), pos=4) 53 | text(x=0.4, y=0.5, label=min(x, na.rm=T), pos=4) 54 | text(x=0.4, y=0.4, label=median(x, na.rm=T), pos=4) 55 | text(x=0.4, y=0.3, label=mean(x, na.rm=T), pos=4) 56 | text(x=0.4, y=0.2, label=max(x, na.rm=T), pos=4) 57 | 58 | p2<-hist(x, main=ptitle, breaks=30, xlab=ptitle, col='grey') 59 | rug(x, col='red') 60 | 61 | return(list(p1, p2)) 62 | } 63 | 64 | ## example 65 | ## plotNum(rnorm(1000)^2, vn='ssds') 66 | 67 | 68 | plotChar <- function(x, ...) { 69 | par(mfrow=c(3, 2)) 70 | layout(matrix(c(1,1,2,2,2,2), 3, 2, byrow=T), widths=c(1,1), heights=c(1,1,1)) 71 | 72 | ptitle <- ifelse(is.null(list(...)$vn), '', list(...)$vn) 73 | tab <- sort(table(x), decreasing=T)[1:min(length(unique(x)),50)] 74 | tabmiss <- round(sum(tab, na.rm=T)/length(x)*100, 1) 75 | 76 | p1 <- plot(0:1, 0:1, col='white', yaxt='n', ylab = '', xaxt='n', xlab='', main=ptitle) 77 | text(x=0.2, y=0.8, label='type:', pos=4, font=2) 78 | text(x=0.2, y=0.7, label='# of unique values:', pos=4, font=2) 79 | text(x=0.2, y=0.6, label='% NA:', pos=4, font=2) 80 | 81 | text(x=0.4, y=0.8, label=class(x), pos=4) 82 | text(x=0.4, y=0.7, label=length(unique(x)), pos=4) 83 | text(x=0.4, y=0.6, label=paste0(round(sum(is.na(x))/length(x)*100, 4), '%'), pos=4) 84 | 85 | par(mar=c(10,4.1,4.1,2.1)) 86 | p2 <- barplot(tab, las=2, cex.names=0.6, col='dodgerblue', 87 | main=paste(ptitle, ': ', tabmiss, '% of data shown', sep='')) 88 | par(mar=c(5.1,4.1,4.1,2.1)) 89 | 90 | return(list(p1, p2)) 91 | } -------------------------------------------------------------------------------- /R/shiny_leafletDF.R: -------------------------------------------------------------------------------- 1 | #' @title Leaflet interactive map widget 2 | #' @description Launches a Shiny App with an interactive Leaflet map which can be used to visualize coordinate data. Intended 3 | #' for quick exploratory geographic analyses 4 | #' 5 | #' @param data data.frame or data.table of data to visualize on map. It must include columns for longitude and latitude coordinates. 6 | #' Four additional columns are allowed for filtering and coloring the data. Note, the filters are currently categorical only; numeric 7 | #' ranges are not yet supported. Recommend only passing columns needed for filtering. Default is to pick the first 4 8 | #' @param vars named character vector. The names of the vector should always be 'lon' and 'lat'. The corresponding values 9 | #' of this vector are the column names of the columns in \code{data} with the lat and lon data.
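#' For data whose coordinate columns are named differently, remap them, e.g. with hypothetical column names: \code{leafletMapDF(df, vars=c(lon='lng', lat='phi'))}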
10 | #' @seealso \code{leaflet}, \code{shiny} 11 | #' @return Shiny App 12 | #' @import shiny leaflet data.table RColorBrewer htmlwidgets 13 | #' @export 14 | #' 15 | #' @examples 16 | #' 17 | #' \dontrun{ 18 | #' n <- 5000 19 | #' df <- data.frame(latitude=runif(n, 35, 40), 20 | #' longitude=runif(n, -100, -85), 21 | #' animals=sample(c('dogs', 'cats', 'turtles'), n, replace=T) 22 | #' ) 23 | #' df$westness <- cut(df$longitude, breaks=seq(min(df$longitude), max(df$longitude), length.out=10)) 24 | #' df$northness <- cut(df$latitude, breaks=seq(min(df$latitude), max(df$latitude), length.out=10)) 25 | #' leafletMapDF(df) 26 | #' } 27 | 28 | leafletMapDF <- function (data, vars=c('lon'='longitude', 'lat'='latitude')) { 29 | 30 | qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',] 31 | col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals))) 32 | set.seed(110); n=length(col_vector); col_vector <- sample(col_vector, n) 33 | uivars <- setdiff(names(data), c(vars['lon'], vars['lat'])) 34 | 35 | if(length(uivars)>4) warning('currently supports only 4 variables for filtering. picking first 4 columns in data') 36 | 37 | shinyApp( 38 | ui = shinyUI(fluidPage( 39 | leafletOutput('mymap'), 40 | p(), 41 | h3('Filter data'), 42 | if(length(uivars)>=1) selectInput('ui1', label=sprintf('Filter by %s:', uivars[1]), choices=as.character(unique(data[[uivars[1]]])), multiple=T), 43 | if(length(uivars)>=2) selectInput('ui2', label=sprintf('Filter by %s:', uivars[2]), choices=as.character(unique(data[[uivars[2]]])), multiple=T), 44 | if(length(uivars)>=3) selectInput('ui3', label=sprintf('Filter by %s:', uivars[3]), choices=as.character(unique(data[[uivars[3]]])), multiple=T), 45 | if(length(uivars)>=4) selectInput('ui4', label=sprintf('Filter by %s:', uivars[4]), choices=as.character(unique(data[[uivars[4]]])), multiple=T), 46 | h3('Map parameters'), 47 | selectInput('colby', 'Color by', uivars), 48 | numericInput('radiusid', 'Circle radius (meters)', min=1, max=100000, value=10), 49 | downloadButton('downloadid', 'Save as HTML') 50 | )), 51 | 52 | server = function(input, output) { 53 | 54 | datar <- reactive({ 55 | # filtering data based on ui. showing rows that meet ALL (not ANY) of the filters. blanks/NULLS are assumed to be all points. 
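        # with the example data from the roxygen block above, picking 'dogs' in the first widget keeps rows where
        # data$animals %in% 'dogs'; an empty widget yields rep(T, nrow(data)), i.e. that filter is a no-op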
56 | if(is.null(input[['ui1']])) cond1 <- rep(T, nrow(data)) else cond1 <- data[[uivars[1]]] %in% input[['ui1']] 57 | if(is.null(input[['ui2']])) cond2 <- rep(T, nrow(data)) else cond2 <- data[[uivars[2]]] %in% input[['ui2']] 58 | if(is.null(input[['ui3']])) cond3 <- rep(T, nrow(data)) else cond3 <- data[[uivars[3]]] %in% input[['ui3']] 59 | if(is.null(input[['ui4']])) cond4 <- rep(T, nrow(data)) else cond4 <- data[[uivars[4]]] %in% input[['ui4']] 60 | cond <- data.table(cond1, cond2, cond3, cond4)[,which(apply(.SD, 1, all))] 61 | 62 | data[cond,] 63 | }) 64 | 65 | # Map plot function ############################ 66 | makemap <- function() { 67 | tmp <- datar() 68 | xvec <- factor(tmp[[input$colby]]) 69 | legcols <- col_vector[1:length(levels(xvec))] 70 | leglabs <- levels(xvec) 71 | 72 | m <- leaflet() %>% 73 | addProviderTiles("Stamen.TonerLite", options=providerTileOptions(noWrap = TRUE) 74 | ) %>% 75 | addCircles(data=cbind(tmp[[vars['lon']]], tmp[[vars['lat']]]), 76 | color=col_vector[as.numeric(factor(tmp[[input$colby]]))], 77 | radius=input[['radiusid']]) %>% 78 | addLegend("topright", colors=legcols, labels=leglabs, opacity=2, title=input$colby) 79 | return(m) 80 | } 81 | 82 | ## Render map ################################### 83 | output$mymap <- renderLeaflet({ 84 | mm <- makemap() 85 | #saveWidget(mm, 'leafletmap.html') # this will save on each render without browser directory choice 86 | mm 87 | }) 88 | 89 | ## Download map as HTML ######################## 90 | output$downloadid <- downloadHandler( 91 | filename = function() 'leafletdownload.html', 92 | content = function(con) saveWidget(makemap(), con) 93 | ) 94 | 95 | } 96 | 97 | )} 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /R/smartAgg.R: -------------------------------------------------------------------------------- 1 | #' @title Aggregate multiple columns using different functions harnessing data.table efficiency 2 | #' 3 | #' @description aggregates columns of a data.frame on 1 or multiple dimensions using 1 or multiple functions for different columns. 4 | #' It is equivalent to the base R \code{aggregate} function, except that it allows the user to 5 | #' aggregate sets of columns (referred to by name or column #) with different functions... and it's fast! 6 | #' 7 | #' @param df input data.frame to be aggregated 8 | #' @param by variable name of data.frame \code{df} to aggregate on. Same as \code{by} in base R \code{aggregate} function 9 | #' @param ... method to identify the variables to aggregate and the functions used to do so. 10 | #' Specify the function first as a string argument and then a vector of the column names to aggregate using that function. 11 | #' You can specify as many different functions as necessary, but every function must be followed by a vector of column names. 12 | #' @param catN adds a column named "countPerBin" with the # of observations aggregated in each row of the output data.frame. 13 | #' @param printAgg prints the line of data.table code used to perform the aggregation. 14 | #' @return aggregated data.frame with columns corresponding to the grouping variables in by followed by aggregated columns from df.
15 | #' @import data.table 16 | #' @export 17 | #' @examples 18 | #' 19 | #' require('data.table') 20 | #' 21 | #' ## establishing variables to aggregate on 22 | #' lengthVs <- c('Sepal.Length', 'Petal.Length') 23 | #' widthVs <- c('Sepal.Width', 'Petal.Width') 24 | #' 25 | #' ## aggregating using 2 different functions and identifying columns to aggregate by variable names 26 | #' irisAgg1 <- smartAgg(df=iris, by='Species', 'mean', lengthVs, 'sum', widthVs) 27 | #' 28 | #' ## aggregating using 2 dimensions ("Species" and "randthing") 29 | #' iris$randthing <- as.character(sample(1:5, nrow(iris), replace=T)) 30 | #' irisAgg2 <- smartAgg(df=iris, by=c('Species', 'randthing'), 'mean', lengthVs, 'sum', widthVs, catN=T, printAgg=T) 31 | #' 32 | #' ## aggregating variables by column number 33 | #' irisAgg3 <- smartAgg(df=iris, by=c('Species', 'randthing'), 'mean', 1:2, 'sum', 3:4, catN=T, printAgg=T) 34 | #' 35 | #' ## use anonymous functions 36 | #' data(mtcars) 37 | #' smartAgg(mtcars, by='cyl', function(x) sum(x*100), c('drat', 'mpg', 'disp')) 38 | #' 39 | #' ## use anonymous functions with more than 1 argument. Uses the provided variables for all unassigned arguments in anonymous function 40 | #' smartAgg(mtcars, by='cyl', function(x,y='carb') sum(x*y), c('drat', 'mpg', 'disp')) 41 | #' with(mtcars[mtcars$cyl==6,], c(sum(drat*carb), sum(mpg*carb), sum(disp*carb))) 42 | #' 43 | #' ## with anonymous functions with more than 1 argument. 44 | #' ## Example of possible unintended behavior - the user-provided variable is used for both x and y in this example. 45 | #' smartAgg(mtcars, by='cyl', function(x,y) sum(x*y), c('drat', 'mpg', 'disp')) 46 | #' with(mtcars[mtcars$cyl==6,], c(sum(drat*drat), sum(mpg*mpg), sum(disp*disp))) 47 | #' 48 | #' ## demonstrating speed gain of smartAgg using data.table over aggregate 49 | #' n <- 300000 50 | #' df <- data.frame(x1=rnorm(n), x2=rbinom(n,5,0.5), x3=sample(letters, n, replace=T)) 51 | #' system.time(aggFast <- smartAgg(df, by='x3', 'mean', c('x1', 'x2'))) 52 | #' system.time(aggSlow <- aggregate(df[,c('x2', 'x1')], by=list(df$x3), FUN='mean')) 53 | 54 | smartAgg <- function(df, by, ..., catN=T, printAgg=F) { 55 | args <- list(...)
56 | dt <- as.data.table(df) 57 | 58 | ## organizing agg Methods and variable names into 2 separate lists 59 | aggMethod <- list() 60 | vars <- list() 61 | j<-1 62 | for(i in seq(1,length(args),2)) { 63 | aggMethod[[j]] <- args[[i]] 64 | vars[[j]] <- args[[i+1]] 65 | if(class(vars[[j]]) %in% c('integer', 'numeric')) vars[[j]] <- names(df)[vars[[j]]] 66 | j<-j+1 67 | } 68 | 69 | ## create the line of code to execute 70 | k<-0 71 | varL <- vector() 72 | for(j in 1:length(aggMethod)){ 73 | for(i in 1:length(vars[[j]])){ 74 | if(vars[[j]][i] %in% names(df)){ 75 | if(class(aggMethod[[j]])=='function') { 76 | afun <- paste0('af',j) 77 | assign(afun, aggMethod[[j]]) 78 | laf2 <- as.list(formals(get(afun))) 79 | laf2[which(lapply(laf2, nchar)==0)] <- vars[[j]][i] # fill each unassigned formal with the current column name 80 | rhstmp <- paste(unlist(lapply(seq_along(laf2), function(y,n,i) paste0(n[[i]], '=', y[[i]]), n=names(laf2), y=laf2)), collapse=',') 81 | tmp <- paste(vars[[j]][i], '=', afun, '(', rhstmp, ')', sep='') # anonymous functions 82 | } else { 83 | tmp <- paste(vars[[j]][i], '=', aggMethod[[j]], '(', vars[[j]][i], ')', sep='') #non-anonymous functions 84 | } 85 | k <- k+1 86 | varL[k] <- tmp 87 | } else {print(paste('WARNING: ', vars[[j]][i], ' not in dataframe', sep=''))} 88 | } 89 | } 90 | varL <- paste(varL, collapse=', ') 91 | if(catN==T) varL <- paste(varL, ',countPerBin=length(', vars[[1]][1], ')', sep='') 92 | 93 | ## actually creating aggregation command and executing it 94 | line2exec <- paste('dtAgg <- dt[,list(', varL, '), by=list(', paste(by,collapse=','), ')]', sep='') 95 | if(printAgg==T) print(line2exec) 96 | eval(parse(text=line2exec)) 97 | dfAgg <- data.frame(dtAgg) 98 | 99 | return(dfAgg) 100 | } 101 | 102 | 103 | --------------------------------------------------------------------------------
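With printAgg=T, the constructed data.table call that gets eval'd is printed; for the first iris example it should look roughly like (a sketch):

dtAgg <- dt[,list(Sepal.Length=mean(Sepal.Length), Petal.Length=mean(Petal.Length),
                  Sepal.Width=sum(Sepal.Width), Petal.Width=sum(Petal.Width),
                  countPerBin=length(Sepal.Length)), by=list(Species)]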
/R/partialDependence.R: -------------------------------------------------------------------------------- 1 | #' @title Partial Dependence (single variable) 2 | #' @description Calculate the partial dependence of a predictor variable on the response variable from a random forest classification model. 3 | #' Rather than sequence through values of the predictor variable of interest and keep the other predictors at their median, this partial dependence 4 | #' technique creates replicates of the entire dataset for each level of the x variable of interest from its min to max. This gives a more 5 | #' realistic idea of the magnitude and direction of the x variable on the response. 6 | #' 7 | #' @param model model object used to generate predictions. Currently only built and tested for random forest. 8 | #' @param df data.frame or data.table used to generate predictions with \code{model} 9 | #' @param xvar character of length one; the x variable in \code{df} to assess for partial dependence with the response variable from \code{model}. 10 | #' @param n numeric of length one; number of values between the min and max of \code{xvar} to score the model. 11 | #' Note: this number is also how many replicates of \code{df} must be created and stored in memory. Default is 10. 12 | #' @param target.class character: Which category (class) of the target variable to use for predictions 13 | #' @param ci numeric: specify any confidence intervals around the median response. 14 | #' @seealso \code{\link{partialDepAll}} 15 | #' @return data.table of output. \code{cnt} refers to how many obs from \code{df} are within the fixed-width bin specified by \code{xvar}. 16 | #' @import randomForest 17 | #' @import data.table 18 | #' @export 19 | #' 20 | #' @examples 21 | #' library('randomForest') 22 | #' library('data.table') 23 | #' DF <- mtcars 24 | #' DF$vs <- factor(DF$vs) 25 | #' rf <- randomForest(vs~mpg+cyl+drat+qsec+disp+gear+carb+hp, DF, ntrees=100) 26 | #' pd <- partialDep(model=rf, df=DF, xvar='mpg') 27 | #' pd[ci==0.5,] # median of response when sequenced through 'mpg' 28 | #' 29 | #' ## Plotting 30 | #' plot(pd[cilev==0, xvar], pd[cilev==0, pred], type='l', ylim=c(0,1)) 31 | #' lines(pd[ci==.95, xvar], pd[ci==.95, pred], type='l', col='red') 32 | #' lines(pd[ci==.05, xvar], pd[ci==.05, pred], type='l', col='green') 33 | 34 | partialDep <- function(model, df, xvar, n=10, target.class='1', ci=c(.9,.5,.3)) { 35 | if(!'data.table' %in% class(df)) df <- data.table(df) 36 | xv <- df[,get(xvar)] # x vector of interest 37 | xvr <- seq(min(xv), max(xv), length.out=n) # fixed width break points 38 | dfb <- df[rep(1:.N, each=n), ] # creating replicate of data for each 39 | dfb[,xvar] <- rep(xvr, nrow(df)) # replacing x variable of interest with range of x var 40 | 41 | dfb[,cnt:=c(0, table(cut(df[,get(xvar)], breaks=xvr, include.lowest=T)))] 42 | pred <- predict(model, dfb, type='prob')[,target.class] 43 | 44 | retdf <- data.table(xvar=dfb[,get(xvar)], pred=pred, cnt=dfb[,cnt]) 45 | ci <- c(ci, 0) 46 | ci <- unique(c(0.5+ci/2, 0.5-ci/2)) 47 | ret <- retdf[,.(pred=quantile(pred, ci), cnt=cnt[1]), by='xvar'][,ci:=rep(ci, n)][,cilev:=abs(ci-0.5)*2][] 48 | return(ret) 49 | } 50 | 51 | 52 | 53 | #' @title Partial Dependence Spark Lines (multiple variables) 54 | #' @description Calculate the partial dependence of a predictor variable on the response variable from a random forest classification model. 55 | #' Rather than sequence through values of the predictor variable of interest and keep the other predictors at their median, this partial dependence 56 | #' technique creates replicates of the entire dataset for each level of the x variable of interest from its min to max. This gives a more 57 | #' realistic idea of the magnitude and direction of the x variable on the response. 58 | #' 59 | #' @param model model object used to generate predictions. Currently only built and tested for random forest. 60 | #' @param df data.frame or data.table used to generate predictions with \code{model} 61 | #' @param n numeric of length one; number of values between the min and max of \code{xvar} to score the model. 62 | #' Note: this number is also how many replicates of \code{df} must be created and stored in memory. Default is 10. 63 | #' @param xvars character vector; the x variables in \code{df} to assess for partial dependence with the response variable from \code{model}. Defaults to choosing all variables from model. 64 | #' @param target.class character: Which category (class) of the target variable to use for predictions 65 | #' @param ci numeric: specify any confidence intervals around the median response. 66 | #' @param plot logical: plot sparklines (\code{TRUE}), or no (\code{FALSE}) 67 | #' @param data logical: return summary table of output. yes (\code{TRUE}), or no (\code{FALSE}) 68 | #' @param plot.yaxis.fixed logical: \code{TRUE} uses same yaxis for all x variables (biggest max and smallest min across all variables). \code{FALSE} uses variable-specific axes.
69 | #' @param wordy logical: print progress to the console
70 | #' @seealso \code{\link{partialDep}}
71 | #' @return list of output and plot
72 | #' @import randomForest
73 | #' @import data.table
74 | #' @import ggplot2
75 | #' @export
76 | #' 
77 | #' @examples
78 | #' library('randomForest')
79 | #' library('data.table')
80 | #' library('ggplot2')
81 | #' DF <- mtcars
82 | #' DF$vs <- factor(DF$vs)
83 | #' rf <- randomForest(vs~mpg+cyl+drat+qsec+disp+gear+carb+hp, DF, ntree=100)
84 | #' pda <- partialDepAll(model=rf, df=DF, n=10)
85 | 
86 | partialDepAll <- function(model, df, n=10, xvars=NULL, target.class='1', ci=c(.75), plot=T, data=T, plot.yaxis.fixed=T, wordy=T) {
87 | # currently only works with random forest
88 | if(is.null(xvars)) xvars <- names(model$forest$ncat)
89 | L <- list()
90 | for(i in xvars){
91 | tmpdf <- data.table(partialDep(model=model, df=df, xvar=i, n=n, target.class=target.class, ci=ci))
92 | tmpdf[,xname:=i]
93 | tmpdf[,x:=length(L)+1] # keeping order of xvars
94 | L[[i]] <- tmpdf
95 | if(wordy==T) cat('.')
96 | }
97 | ret <- rbindlist(L)
98 | ret[,xname:=factor(xname, levels=xvars)] # to conserve order using facet_wrap
99 | 
100 | # producing data for sparklines plot
101 | pdplot <- ret
102 | pdplot[,y:=rep(rep(1:n, each=length(unique(pdplot$ci))), length(xvars))]
103 | setkey(pdplot,x,ci,y)
104 | pdplot[,preddiff:=pred-shift(pred), by=.(x,ci)] # change in prediction between consecutive grid points
105 | pdplot[,prednorm:=(pred-min(pred))/(max(pred)-min(pred)), by=.(x,ci)] # normalizing pred size
106 | pdplot[,cntrug:=cumsum(cnt)/nrow(df)*n, by=.(x,ci)] # creating ticks for rug plot
107 | gg <- NULL # so the ggobj slot in the return list exists even when plot=FALSE
108 | # plotting spark lines
109 | if(plot==T){
110 | lwdlen <- c(rep(5, 3), rep(4, 6), rep(3, 9), rep(2, length(xvars)))[length(xvars)]
111 | wd <- reshape(pdplot, idvar = c('x', 'y'), v.names='pred', timevar="ci", direction = "wide") # reshape wide for geom_ribbons
112 | gg <- ggplot()
113 | for(cl in ci) gg <- gg + geom_ribbon(data=wd, aes_string(x='y', ymin=paste0('pred.', 0.5-cl/2), ymax=paste0('pred.', 0.5+cl/2)), alpha=0.75*1/length(ci), fill='gray')
114 | gg <- gg + geom_line(data=pdplot[ci==0.5,], aes(x=y, y=pred, colour=pred), lwd=lwdlen) +
115 | facet_wrap(~xname, scales=ifelse(plot.yaxis.fixed==T, 'fixed', 'free')) +
116 | geom_rug(data=pdplot[ci==0.5,], aes(x=jitter(cntrug))) +
117 | scale_colour_gradientn(colours=colorRampPalette(c('firebrick', 'grey', 'forestgreen'))(n)) +
118 | theme_bw() + xlab('') + ylab('prediction')
119 | plot(gg)
120 | }
121 | 
122 | if(data==T){
123 | retL <- list(pdplot=data.frame(pdplot),
124 | cilev=data.frame(ci),
125 | n=n,
126 | ggobj=gg
127 | )
128 | return(retL)
129 | }
130 | }
131 | 
132 | 
--------------------------------------------------------------------------------
/R/bin.R:
--------------------------------------------------------------------------------
1 | #' @title Equal depth binning
2 | #' @description Simple equal depth binning algorithm.
3 | #' 
4 | #' @param ser numeric vector to bin
5 | #' @param nbins number of bins desired
6 | #' @param qtype an integer between 1 and 9 selecting one of the nine quantile algorithms to be used. See \code{\link{quantile}} for more details. Default is 7.
7 | #' @param digits number, number of digits to display in bin categories
8 | #' @param labelRange logical: \code{TRUE} labels each bin with its numeric range (e.g. \code{(5.2,11.8]}).
Can be combined with \code{labelOrder} and \code{labelPct}
9 | #' @param labelPct logical: \code{TRUE} appends the percent of observations assigned to the bin to the factor level (name). Can be combined with \code{labelOrder} and \code{labelRange}
10 | #' @param labelOrder logical: \code{TRUE} appends the ordinal position of the bin (ex. 1/3, 2/3, or 3/3 if 3 bins) to the factor level (name). Can be combined with \code{labelPct} and \code{labelRange}
11 | #' 
12 | #' @seealso \code{\link{quantile}}
13 | #' @return ordered factor vector with bins
14 | #' @export
15 | #' 
16 | #' @examples
17 | #' ## perfect equal depth bins
18 | #' x1 <- rnorm(1000, 0, 20)
19 | #' binned1 <- depthbin(x1, nbins=10)
20 | #' table(binned1)
21 | #' 
22 | #' ## slightly uneven bins with integer data
23 | #' x2 <- rpois(1000, 3)
24 | #' binned2 <- depthbin(x2, nbins=5)
25 | #' summary(binned2)
26 | #' 
27 | #' ## as good as we can get with skewed integer data
28 | #' x3 <- round(abs(log(abs(rnorm(1000)))))
29 | #' binned3 <- depthbin(x3, nbins=5)
30 | #' summary(binned3)
31 | #' 
32 | #' ## including more information in category names (levels of factor variable)
33 | #' x4 <- round(abs(log(abs(rnorm(1000)))))
34 | #' binned4 <- depthbin(x4, nbins=3, labelRange=T, labelPct=T, labelOrder=T)
35 | #' summary(binned4)
36 | 
37 | depthbin <- function(ser, nbins=10, qtype=7, digits=10, labelRange=T, labelPct=F, labelOrder=F) {
38 | cutpts <- quantile(ser, probs=seq(0, 1, 1/nbins), na.rm=T, type=qtype)
39 | if(length(unique(cutpts))==nbins+1) {
40 | returnser <- cut(ser, breaks=cutpts, right=T, include.lowest=T, dig.lab=digits)
41 | } else { # duplicated cutpoints (heavy ties): pull the tied values out and re-quantile the rest
42 | alldup <- vector()
43 | while(length(unique(cutpts))+length(alldup) < nbins+1) {
44 | dup <- cutpts[duplicated(cutpts)]
45 | dups <- unique(dup)
46 | alldup <- c(alldup, dups)
47 | dupL <- length(alldup) + length(dups)
48 | ser2 <- ser[which(!ser %in% alldup)] # drop the tied values...
49 | cutpts <- quantile(ser2, probs=seq(0, 1, 1/(nbins-length(dups))), na.rm=T, type=qtype) # ...and re-quantile what is left
50 | }
51 | cutpts <- c(unique(cutpts), alldup) # tied values become their own cutpoints
52 | returnser <- cut(ser, breaks=cutpts, include.lowest=T, dig.lab=digits, right=F)
53 | }
54 | if(sum(labelRange, labelPct, labelOrder)==0) {
55 | labelRange <- T
56 | warning('arguments labelRange, labelOrder, labelPct should not all be set to FALSE. Setting labelRange to TRUE.')
57 | }
58 | rawlev <- levels(returnser)
59 | if (labelRange==T) levels(returnser) <- paste0(levels(returnser), rawlev)
60 | if (labelOrder==T) levels(returnser) <- paste0(levels(returnser), ' ', 1:length(rawlev), '/', length(rawlev))
61 | if (labelPct==T) levels(returnser) <- paste0(levels(returnser), ' ', paste0('(', as.character(round(table(returnser)/length(returnser)*100, 1)), '%)'))
62 | for(i in 1:length(levels(returnser))) levels(returnser)[i] <- substr(levels(returnser)[i], nchar(rawlev[i])+1, nchar(levels(returnser)[i])) # strip the leading raw range so only the requested labels remain
63 | return(returnser)
64 | }
65 | 
66 | #' @title Round numbers in interval
67 | #' @description Formats an interval of form \code{(5.234,11.783]} to something like \code{(5.2,11.8]}.
68 | #' Used for formatting only, mainly with binning functions like \code{\link{depthbin}}. Intervals can be open or closed with
69 | #' \code{(} and \code{[} respectively and are maintained as such when formatted. Useful for prettifying graphs and reports.
70 | #' 
71 | #' @param x character vector of bins to format
72 | #' @param r number, 0 to 10 (or higher I suppose) indicating how many decimals to display
73 | #' 
74 | #' @seealso \code{\link{depthbin}}
75 | #' @return formatted character vector with length of input vector.
76 | #' @export
77 | #' @examples
78 | #' x1 <- cut(quantile(rnorm(100)), breaks=4)
79 | #' roundCut(x1, 1)
80 | 
81 | roundCut <- function(x, r=1){
82 | x <- as.character(x)
83 | b <- substr(x, 1, 1) # opening bracket: '(' or '['
84 | e <- substr(x, nchar(x), nchar(x)) # closing bracket: ')' or ']'
85 | xx <- substr(x, 2, nchar(x)-1)
86 | xx1 <- round(as.numeric(sapply(xx, function(z) strsplit(z, ',')[[1]][1])), r)
87 | xx2 <- round(as.numeric(sapply(xx, function(z) strsplit(z, ',')[[1]][2])), r)
88 | return(paste(b, xx1, ', ', xx2, e, sep=''))
89 | }
90 | 
91 | #' @title categorical data binning by collapsing
92 | #' @description Bins categorical variables into a smaller number of bins. Useful when modeling with variables that have many small categories.
93 | #' The largest categories are taken as is and the smaller categories are collapsed into a new field named 'other.'
94 | #' There are two options for determining the number of bins: \cr
95 | #' 1. Specify the exact number of bins desired (\code{ncat}) \cr
96 | #' 2. Specify the share of the data that will be represented with actual categories before everything else is named 'other' (\code{maxp}) \cr
97 | #' @details It is advisable to use only one of the \code{ncat} OR \code{maxp} parameters. When both are used together, the function
98 | #' stops at whichever criterion is met first, i.e. whichever yields the smaller number of bins. \cr
99 | #' Behavior can be unexpected when \code{setNA=NA} and \code{keepNA=TRUE}: to keep NAs as a standalone category, \code{setNA} must be set to something that is not NA.
100 | #' 
101 | #' @param x vector to bin. It is transformed to a character, so any type is acceptable
102 | #' @param ncat number 0 to 100 (or higher I suppose). Number of bins to collapse data to
103 | #' @param maxp number 0 to 1. Percentage of data that will be represented "as is" before categories are collapsed to "other"
104 | #' @param results logical \code{TRUE} or \code{FALSE}. Prints a frequency table of the new categories.
105 | #' @param setNA value to set NAs to. Default is to keep NA. Can be set to a character string to make NAs a category
106 | #' @param keepNA logical. \code{TRUE} keeps NAs as their own category. \code{FALSE} bundles NAs into the 'other' category.
107 | #' @return vector of binned data
108 | #' @export
109 | #' @examples
110 | #' d <- rpois(1000, 20)
111 | #' d[d>26] <- sample(1:26, length(d[d>26]), replace=T)
112 | #' dl <- letters[d]
113 | #' barplot(table(dl))
114 | #' table(binCat(dl, results=F, ncat=5))
115 | #' table(binCat(dl, results=F, maxp=0.5))
116 | #' table(binCat(dl, results=F, maxp=0.9))
117 | #' 
118 | #' ## With missings
119 | #' ff <- sample(letters[1:15], 100, replace=T)
120 | #' ff[sample(100, 10)] <- NA
121 | #' binCat(ff, ncat=7, setNA='missing')
122 | 
123 | 
124 | binCat <- function(x, ncat=NULL, maxp=NULL, results=F, setNA=NA, keepNA=F) {
125 | if(is.null(maxp)==F & is.null(ncat)==F) warning("Parameters 'ncat' and 'maxp' are both specified. It is advisable to only specify one of these criteria.
Algorithm will stop at the first criterion met.")
126 | if(is.na(setNA)==F) x[is.na(x)] <- setNA
127 | 
128 | ncat <- min(ncat, length(unique(x)))
129 | x <- as.character(x)
130 | n <- length(x)
131 | if(is.null(maxp)) maxp <- 1
132 | 
133 | for(i in 1:length(unique(x))){
134 | xc <- x
135 | x1 <- sort(table(xc, exclude=NULL), decreasing=T)[1:i]
136 | catp <- sum(x1)/n # share of the data covered by the top i categories
137 | if(i==ncat | catp>maxp) {
138 | x2 <- sort(table(xc, exclude=NULL), decreasing=T)[1:(i+1)]
139 | if(keepNA==T) {xc[which(!xc %in% c(names(x2), setNA))] <- 'other'
140 | } else {xc[which(!xc %in% names(x2))] <- 'other'}
141 | returnser <- xc
142 | break
143 | }
144 | }
145 | if(results==T) print(sort(table(returnser)/n, decreasing=T))
146 | return(returnser)
147 | }
148 | 
149 | 
150 | 
151 | 
152 | 
--------------------------------------------------------------------------------
/draft/horseRaceModel.R:
--------------------------------------------------------------------------------
1 | #' @title Horse race model comparison
2 | #' @description This function performs repeated sub-sampling cross-validation using 3 different models and
3 | #' saves the AUC and lift for each model at each iteration. Models used are Random Forest, LASSO Regression, logistic regression (using
4 | #' LASSO for variable selection). An ensemble of the LASSO and Random Forest is also included.
5 | #' @param x data.frame or data.table of predictor variables
6 | #' @param y vector target variable, binary 0 or 1.
7 | #' @param n number, number of iterations
8 | #' @param trainPct numeric scalar. percent of data to be used for training at each iteration.
9 | #' @param liftQuantile numeric scalar. quantile to be used for assessing lift.
10 | #' @return horseRace S3 object.
11 | #' @export
12 | #' @examples
13 | #' \dontrun{
14 | #' ## with a predictor data.frame df and a 0/1 target ybin (see the worked example at the bottom of this file):
15 | #' hr <- horseRaceModel(x=df, y=ybin, n=8, trainPct=.75, liftQuantile=.1) }
16 | 
17 | library('glmnet')
18 | library('randomForest')
19 | library('Rsenal')
20 | library('pROC')
21 | 
22 | 
23 | horseRaceModel <- function(x, y, n=10, trainPct=0.75, liftQuantile=.04) {
24 | 
25 | if(class(y) %in% c('character', 'factor')) yn <- as.numeric(as.character(y)) else yn <- y
26 | if(max(yn)>1 | min(yn)<0) stop('y must be between 0 and 1')
27 | 
28 | pb <- txtProgressBar(min=0, max=n, style=3, width=n)
29 | 
30 | # setting up return object
31 | ret <- replicate(2, data.frame(rf=rep(NA, n), lasso=rep(NA, n), glm=rep(NA, n), ensemble=rep(NA,n)), simplify=F)
32 | names(ret) <- c('lift', 'auc')
33 | ret[['lasso_coef']] <- NULL
34 | 
35 | for(i in 1:n) {
36 | itrain <- sample(1:nrow(x), round(nrow(x)*trainPct))
37 | itest <- setdiff(1:nrow(x), itrain)
38 | xtr <- x[itrain,]
39 | xte <- x[itest,]
40 | ytr <- y[itrain]; yntr <- yn[itrain]
41 | yte <- y[itest]; ynte <- yn[itest]
42 | 
43 | # if we have factor variables, dummy up columns for LASSO
44 | if(length(setdiff(sapply(xtr, class), c('numeric', 'integer')))>0) {
45 | xtr_d <- model.matrix(~.-1, xtr)
46 | xte_d <- model.matrix(~.-1, xte)
47 | colnames(xtr_d) <- gsub('\\-', '\\_', colnames(xtr_d))
48 | colnames(xte_d) <- gsub('\\-', '\\_', colnames(xte_d))
49 | facvars <- names(attr(xtr_d, "contrasts"))
50 | } else { xtr_d <- as.matrix(xtr); xte_d <- as.matrix(xte) } # all-numeric case: nothing to dummy
51 | 
52 | ## LASSO
53 | fitlasso <- glmnet(x=as.matrix(xtr_d), y=yntr, alpha=1, family='binomial')
54 | cvLasso <- cv.glmnet(x=as.matrix(xtr_d), y=yntr, alpha=1, family='binomial', type.measure='deviance')
55 | predLasso <- predict(fitlasso, s=cvLasso$lambda.min, newx=as.matrix(xte_d), type='response')[,1]
56 | pqLasso <- predQuantile(ytest=ynte, testPred=predLasso, n=round(1/liftQuantile))
57 | glmvars <- lassoDummyVarsToFactors(fitlasso=fitlasso, s=cvLasso$lambda.min, contrasts=names(attr(xtr_d, 'contrasts')), xtr=xtr) # map LASSO-selected dummies back to their root factor names for the GLM
58 | 
59 | ## Logistic
60 | df4glm <- data.frame(y=ytr, data.frame(xtr)[, glmvars, drop=FALSE])
61 | fitglm <- glm(makeForm('y', intersect(colnames(xtr), glmvars)), data=df4glm, family=binomial(logit))
62 | predGlm <- predict(fitglm, newdata=xte, type='response')
63 | pqGlm <- predQuantile(ytest=ynte, testPred=predGlm, n=round(1/liftQuantile))
64 | 
65 | ## RANDOM FOREST
66 | rf <- randomForest(x=xtr, y=factor(ytr), ntree=1000)
67 | predRf <- predict(rf, newdata=xte, type='prob')[,2]
68 | pqRf <- predQuantile(ytest=ynte, testPred=predRf, n=round(1/liftQuantile))
69 | 
70 | ## ENSEMBLE
71 | predRfLasso <- (predRf+predLasso)/2 # simple average of the two model scores
72 | pqRfLasso <- predQuantile(ytest=ynte, testPred=predRfLasso, n=round(1/liftQuantile))
73 | 
74 | ## Saving Results
75 | ret$lift[i, 'rf'] <- pqRf$cumLift[1]
76 | ret$lift[i, 'lasso'] <- pqLasso$cumLift[1]
77 | ret$lift[i, 'glm'] <- pqGlm$cumLift[1]
78 | ret$lift[i, 'ensemble'] <- pqRfLasso$cumLift[1]
79 | ret$lift[i, 'iteration'] <- i
80 | 
81 | ret$auc[i, 'rf'] <- glmnet::auc(y=ynte, prob=predRf)
82 | ret$auc[i, 'lasso'] <- glmnet::auc(y=ynte, prob=predLasso)
83 | ret$auc[i, 'glm'] <- glmnet::auc(y=ynte, prob=predGlm)
84 | ret$auc[i, 'ensemble'] <- glmnet::auc(y=ynte, prob=predRfLasso)
85 | ret$auc[i, 'iteration'] <- i
86 | 
87 | if(i==1) {
88 | ret$lasso_coef <- as.matrix(coef(fitlasso, s=cvLasso$lambda.min))
89 | } else {
90 | ret$lasso_coef <- cbind(ret$lasso_coef, as.matrix(coef(fitlasso, s=cvLasso$lambda.min)))
91 | }
92 | 
93 | setTxtProgressBar(pb, i)
94 | }
95 | 
96 | ret$lasso_coef <- as(ret$lasso_coef, 'dgCMatrix')
97 | 
98 | class(ret) <- 'horseRace'
99 | attr(ret, 'n') <- n
100 | 
101 | return(ret)
102 | }
103 | 
104 | ######################
105 | ## Helper Functions ##
106 | ######################
107 | ## NEEDS TO HANDLE ORDERED FACTORS (LIKE EDUCATION)
108 | 
109 | lassoDummyVarsToFactors <- function(fitlasso, s, contrasts, xtr){
110 | # grabbing the dummied variables chosen by lasso regression
111 | ltmp <- coef(fitlasso, s=s)
112 | lassoVars <- ltmp@Dimnames[[1]][2:ltmp@Dim[1]][ltmp@i] # names of the non-zero coefficients (the intercept drops out via the 0-based @i index)
113 | 
114 | # create mapping between factor variable (root) and its dummied variable names
115 | if(length(contrasts)>0) {
116 | rl <- list()
117 | for(i in contrasts) rl[[i]] <- paste(i, gsub('\\-', '\\_', unique(xtr[,i])), sep='')
118 | }
119 | 
120 | # creating list of just root (factor) variables for GLM
121 | ret <- c()
122 | for(i in lassoVars){
123 | if(!i %in% names(xtr)) {
124 | addvar <- names(rl)[[grep(i, rl)]]
125 | if(addvar %in% ret) next
126 | ret <- c(ret, addvar)
127 | } else {
128 | ret <- c(ret, i)
129 | }
130 | }
131 | return(ret)
132 | }
133 | 
134 | 
135 | #lassoDummyVarsToFactors(fitlasso, s=cvLasso$lambda.min, contrasts=names(attr(xtr_d, 'contrasts')), xtr=xtr)
136 | 
137 | 
138 | ######################
139 | ## Methods ###########
140 | ######################
141 | 
142 | 
143 | plot.horseRace <- function(object, measure){
144 | if (!inherits(object, "horseRace"))
145 | stop("object not of class horseRace")
146 | if (is.null(object[[measure]]))
147 | stop("Measure argument not recognized")
148 | 
149 | ggdf <- data.table(melt(object[[measure]], id='iteration'))
150 | setnames(ggdf, 'variable', 'model')
151 | ggplot(data=ggdf, aes(x=iteration, y=value, group=model, color=model)) + geom_line() +
152 | geom_point(size=4, shape=19) + ylab(measure) + ggtitle(measure) + theme_bw()
153 | }
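## Usage sketch (hedged; assumes a fitted `hr` object from horseRaceModel(), as in the example at the bottom of this file):
## plot(hr, measure='auc')   # per-iteration AUC for each model
## plot(hr, measure='lift')  # per-iteration lift for each model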
154 | 
155 | summary.horseRace <- function(object){
156 | cat('AUC: \n')
157 | dt <- data.frame(data.table(object[['auc']])[,.(rf, lasso, glm, ensemble)][,lapply(.SD, function(x) quantile(x)[2:4])])
158 | row.names(dt) <- c('25%', '50%', '75%')
159 | print(dt)
160 | 
161 | cat('\n')
162 | cat('Lift: \n')
163 | dt <- data.frame(data.table(object[['lift']])[,.(rf, lasso, glm, ensemble)][,lapply(.SD, function(x) quantile(x)[2:4])])
164 | row.names(dt) <- c('25%', '50%', '75%')
165 | print(dt)
166 | 
167 | cat('\n')
168 | cat('variables chosen by LASSO: \n')
169 | lassoB <- apply(as(object$lasso_coef, 'matrix'), 1, function(x) sum(x>0))
170 | lassoB <- lassoB[setdiff(names(lassoB), '(Intercept)')]
171 | data.frame(N=lassoB[order(lassoB, decreasing=T)])
172 | }
173 | 
174 | 
175 | #' @examples
176 | #' \dontrun{
177 | library('arules') # for grabbing AdultUCI dataset
178 | library('data.table')
179 | library('ggplot2')
180 | data(AdultUCI)
181 | df <- AdultUCI[1:1000,]
182 | df <- df[complete.cases(df),]
183 | names(df) <- gsub('\\-', '_', names(df)) # dashes mess things up
184 | # still need to handle categorical variables
185 | hr <- horseRaceModel(x=df[,c('age', 'capital_gain', 'capital_loss', 'hours_per_week',
186 | 'race', 'workclass', 'marital_status', 'occupation')],
187 | y=as.numeric(df$sex)-1, n=8, trainPct=.75, liftQuantile=.1)
188 | 
189 | plot(hr, measure='lift')
190 | summary(hr)
191 | 
192 | #setdiff(names(df), 'sex')
193 | #c('capital_gain', 'hours_per_week','age')
194 | 
195 | 
196 | #' }
197 | 
198 | 
199 | 
200 | 
201 | 
--------------------------------------------------------------------------------
/R/igraph_network.R:
--------------------------------------------------------------------------------
1 | #' @title Get 1 degree away vertexes
2 | #' 
3 | #' @description Returns a vector of all vertex names that touch (going IN or OUT) the
4 | #' specified vertex list.
5 | #' 
6 | #' @param g igraph object
7 | #' @param vlist list of starting node/vertex names
8 | #' @param mode 'out' or 'in'. 'out' returns all the vertex/node names 1 degree out from \code{vlist}. 'in' returns all
9 | #' vertex/node names directed into vlist nodes.
10 | #' @return vector of node names that are 1 degree away from \code{vlist}
11 | #' @import igraph
12 | #' @export
13 | #' @examples
14 | #' ## build sample network
15 | #' from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O')
16 | #' to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P')
17 | #' relations<-cbind(from,to)
18 | #' g <- graph.data.frame(relations)
19 | #' tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2)
20 | #' V(g)$tier <- tiers
21 | #' 
22 | #' getV1fromVlist(g, c('A', 'D'), 'out')
23 | 
24 | getV1fromVlist <- function(g, vlist, mode='out'){
25 | if(!mode %in% c('out', 'in', 'all')) stop("mode argument must be equal to 'in', 'out' or 'all'")
26 | eidList <- array()
27 | for (i in vlist) {
28 | eidList <- c(eidList, igraph::incident(g, i, mode))
29 | }
30 | eidList <- unique(eidList[!is.na(eidList)])
31 | mode2 <- ifelse(mode=='out', 'to', 'from')
32 | v <- getVfromE(g, eidList, mode2)
33 | return(v)
34 | }
35 | 
36 | 
37 | #' @title Get vertex names from edge IDs
38 | #' @description Returns the vertex names associated with the specified edge IDs (FROM or TO)
39 | #' @param g igraph object to mine vertex names from
40 | #' @param eid edge IDs referenced by a numeric vector or the edge object themselves
41 | #' @param mode "to" or "from".
Indicates whether to return the vertex names at the head ("to") or the tail ("from") of each edge.
42 | #' @param unique \code{TRUE} or \code{FALSE}. TRUE simply removes duplicate vertex names and returns a unique list.
43 | #' @return vertex names associated with the specified edge IDs (FROM or TO)
44 | #' @import igraph
45 | #' @export
46 | #' @examples
47 | #' require('igraph')
48 | #' 
49 | #' ## build sample network
50 | #' from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O')
51 | #' to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P')
52 | #' relations<-cbind(from,to)
53 | #' g <- graph.data.frame(relations)
54 | #' tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2)
55 | #' V(g)$tier <- tiers
56 | #' 
57 | #' getVfromE(g, E(g)[1:5])
58 | #' getVfromE(g, 1:5)
59 | 
60 | getVfromE <- function(g, eid, mode='to', unique=T) {
61 | eid <- unique(eid[!is.na(eid)])
62 | df <- get.edgelist(subgraph.edges(g, eid))
63 | col <- ifelse(mode=='from', 1, 2)
64 | retdf <- df[,col]
65 | if(unique==T) retdf <- unique(df[,col])
66 | return(retdf)
67 | }
68 | 
69 | 
70 | 
71 | #' @title Get edge IDs from vertices
72 | #' @description returns the edge IDs that go OUT or IN from the specified list of vertices
73 | #' @param g igraph object
74 | #' @param vlist list of node/vertex names
75 | #' @param mode character string: 'in', 'out' or 'all'
76 | #' @return edge IDs that go OUT or IN from the specified list of vertices
77 | #' @import igraph
78 | #' @export
79 | #' @examples
80 | #' require('igraph')
81 | #' 
82 | #' ## build sample network
83 | #' from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O')
84 | #' to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P')
85 | #' relations<-cbind(from,to)
86 | #' g <- graph.data.frame(relations)
87 | #' tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2)
88 | #' V(g)$tier <- tiers
89 | #' 
90 | #' getEfromVlist(g, c('A', 'D'), 'out')
91 | 
92 | getEfromVlist <- function(g, vlist, mode='out'){
93 | eidList <- array()
94 | for (i in vlist) {
95 | eidList <- c(eidList, igraph::incident(g, i, mode))
96 | }
97 | eidList <- unique(eidList[!is.na(eidList)])
98 | return(eidList)
99 | }
100 | 
101 | #' @title Prune edges of igraph
102 | #' @description This function takes a very connected network graph and prunes the edges down
103 | #' so that it focuses on depth from the root node to the end nodes (inter-connective
104 | #' edges are deleted). \cr \cr
105 | #' 1. Find all nodes 1 step out from root. These edges must stay. \cr
106 | #' 2. Find all nodes 2 steps from root that have more than one edge in.
107 | #' Keep only one edge (the one that leads to the shortest path back to the root).\cr
108 | #' 3. Repeat for the nodes one more degree away from root.
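#' For example, in the sample network used in the examples below, node \code{G} has incoming
#' edges from both \code{C} and \code{F}; because \code{C} is closer to the root \code{A}
#' (one step vs. two), the edge from \code{C} to \code{G} is kept and the edge from \code{F} is deleted.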
109 | #' @param g igraph object
110 | #' @param root character string: name of root node
111 | #' @return igraph object, pruned subgraph of \code{g}
112 | #' @import igraph
113 | #' @export
114 | #' @examples
115 | #' require('igraph')
116 | #' 
117 | #' ## build sample network
118 | #' from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O')
119 | #' to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P')
120 | #' relations<-cbind(from,to)
121 | #' g <- graph.data.frame(relations)
122 | #' tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2)
123 | #' V(g)$tier <- tiers
124 | #' 
125 | #' pruneEdge(g, 'A')
126 | 
127 | pruneEdge <- function(g, root){
128 | gF <- g
129 | d <- degree(gF, mode='in')
130 | 
131 | nhood <- V(g)[neighbors(gF, root, 'out')]$name
132 | nhood <- c(nhood, root)
133 | d <- d[!names(d) %in% nhood] # exclude root and its 1-step neighborhood
134 | dP <- names(d[d>1]) # nodes with more than one incoming edge
135 | 
136 | for(v in dP){
137 | spv <- shortest.paths(gF, v, root)
138 | eid <- getEfromVlist(gF, v, 'in')
139 | 
140 | df <- t(shortest.paths(gF, root, getVfromE(gF, eid, 'from'), 'out'))
141 | df <- df[order(df[,1]),1]
142 | shortPv <- names(df)[1] # source node closest to the root
143 | eidKeep <- as.numeric(E(gF)[shortPv %->% v])
144 | 
145 | if(length(eid)>1){
146 | eidDel <- eid[eid!=eidKeep]
147 | gF <- delete.edges(gF, eidDel)
148 | }
149 | }
150 | return(gF)
151 | }
152 | 
153 | #' @title Find all nodes connected to root node
154 | #' @description Returns a subset of the original graph (all edges and vertices that a
155 | #' directed path can reach from the root node). \code{orderN} limits the growth of these paths.
156 | #' @param g igraph object
157 | #' @param root character string: name of root node
158 | #' @param orderN number: # of degrees away from root node to search.
159 | #' @seealso Simpler version: \code{\link{travCount}}
160 | #' @return igraph object, subgraph of \code{g}
161 | #' @import igraph
162 | #' @export
163 | #' @examples
164 | #' require('igraph')
165 | #' 
166 | #' ## build sample network
167 | #' from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O')
168 | #' to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P')
169 | #' relations<-cbind(from,to)
170 | #' g <- graph.data.frame(relations)
171 | #' tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2)
172 | #' V(g)$tier <- tiers
173 | #' 
174 | #' plot(g) ## full network
175 | #' plot(travOut(g, 'D')) ## sub network
176 | 
177 | travOut <- function(g, root, orderN=-1) {
178 | eid <- incident(g, root, 'out')
179 | noEdgesTest <- identical(eid, numeric(0)) # TRUE when the root has no outgoing edges
180 | 
181 | if(noEdgesTest==F){
182 | v <- getVfromE(g, eid, 'to')
183 | 
184 | eidKids <- array()
185 | v2<-v
186 | spv <- shortest.paths(g, root)
187 | sp <- max(spv[spv!=Inf]) # deepest reachable node sets the default search depth
188 | n <- ifelse(orderN==-1, sp, orderN)
189 | for(i in 1:n) {
190 | eidIter <- getEfromVlist(g, v2)
191 | v2 <- getV1fromVlist(g, v2, 'out')
192 | eidKids <- c(eidKids, eidIter)
193 | }
194 | 
195 | reteid <- unique(c(eid, eidKids))
196 | reteid <- reteid[!is.na(reteid)]
197 | ret <- subgraph.edges(g, reteid)
198 | 
199 | } else {ret <- graph.neighborhood(g, 1, root, 'out')[[1]]}
200 | if(noEdgesTest==F & orderN==1) ret <- subgraph.edges(g, eid)
201 | return(ret)
202 | }
203 | 
204 | 
205 | 
206 | #' @title Count # of nodes connected to root node
207 | #' @description Count doesn't include root node. Simpler version of \code{\link{travOut}}.
#' Might be faster? Developed for a different project.
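#' For instance, in the sample network built in the examples below, \code{travCount(g, 'B')}
#' returns 7: the nodes D, E, H, I, J, K and M are all reachable from \code{B}.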
208 | #' @param g igraph object
209 | #' @param root character string: name of root node
210 | #' @param orderN number, # of degrees away from root node to search. Default is -1 which searches all degrees
211 | #' @param vmode character string: 'out', 'in' or 'all', determines how to subgraph \code{g} from \code{root}. Default is 'out'.
212 | #' @seealso \code{\link{travOut}}
213 | #' @return number, count of nodes connected to root node.
214 | #' @import igraph
215 | #' @export
216 | #' @examples
217 | #' require('igraph')
218 | #' 
219 | #' ## build sample network
220 | #' from <- c('A','A','B','B','C','C','D','D','E','E','E','F','F','H','L','L','O')
221 | #' to <- c('B','C','D','E','F','G','H','I','J','J','K','L','G','M','N','O','P')
222 | #' relations<-cbind(from,to)
223 | #' g <- graph.data.frame(relations)
224 | #' tiers <- c(1,2,3,4,1,3,4,2,1,2,3,4,0,0,1,2)
225 | #' V(g)$tier <- tiers
226 | #' 
227 | #' plot(g)
228 | #' travCount(g, 'B')
229 | #' travCount(g, 'L', vmode='out')
230 | 
231 | travCount <- function(g, root, orderN=-1, vmode='out') {
232 | #eid <- which(!E(g)$type %in% vtype)
233 | #g <- subgraph.edges(g, eid, delete.vertices=T)
234 | 
235 | vb <- root
236 | v <- getV1fromVlist(g, root, vmode)
237 | vv <- root
238 | if(orderN==-1){
239 | spv <- igraph::shortest.paths(g, root)
240 | sp <- max(spv[spv!=Inf])
241 | n <- sp
242 | } else {n <- orderN}
243 | for(i in 1:n) {
244 | vtmp <- getV1fromVlist(g, vb, vmode)
245 | v <- unique(c(v, vtmp))
246 | vb <- vtmp
247 | }
248 | return(length(v))
249 | }
250 | 
251 | 
252 | 
253 | 
254 | 
255 | 
--------------------------------------------------------------------------------
/R/shiny_arules.R:
--------------------------------------------------------------------------------
1 | #' @title Association Rules Visualization Shiny App
2 | #' @description Launches a Shiny App that provides an interactive interface to the visualizations of the \code{arulesViz} package.
3 | #' The app allows users to mine rules based on all or just subsets of features, sort by criteria (lift, support, confidence) and visualize
4 | #' using network graph, grouped bubble and scatter plots. \cr
5 | #' Users can filter rules to target only those with a certain variable on the RHS or LHS of the rule.
6 | #' Rule mining is computed using the \link{apriori} algorithm from \code{arules}.
7 | #' 
8 | #' @param dataset data.frame, this is the dataset that association rules will be mined from. Each row is treated as a transaction. Seems to work
9 | #' OK when the S4 transactions class from \code{arules} is used; however, this is not thoroughly tested.
10 | #' @param bin logical, \code{TRUE} will automatically discretize/bin numerical data into categorical features that can be used for association analysis.
11 | #' @param vars integer, how many variables to include in initial rule mining
12 | #' @param supp numeric, the support parameter for initializing visualization. Useful when it is known that a high support is needed to not crash computationally.
13 | #' @param conf numeric, the confidence parameter for initializing visualization. Similarly useful when it is known that a high confidence is needed to not crash computationally.
14 | #' @seealso \code{arulesViz}, \code{arules}
15 | #' @return Shiny App
16 | #' @import shiny arulesViz arules
17 | #' @export
18 | #' 
19 | #' @examples
20 | #' ## creating some data
21 | #' n <- 10000 # of obs
22 | #' d <- data.frame(
23 | #' eye = sample(c('brown', 'green', 'blue', 'hazel'), n, replace=T),
24 | #' gender = sample(c('male', 'female'), n, replace=T),
25 | #' height = sort(sample(c('dwarf', 'short', 'average', 'above average', 'giant'), n, replace=T)),
26 | #' wealth = sort(sample(c('poor', 'struggling', 'middle', 'uppermiddle', 'comfortable', 'rich', '1%', 'millionaire', 'billionaire'), n, replace=T)),
27 | #' favoriteAnimal = sample(c('dog', 'cat', 'bat', 'frog', 'lion', 'cheetah', 'lion', 'walrus', 'squirrel'), n, replace=T),
28 | #' numkids = abs(round(rnorm(n, 2, 1)))
29 | #' )
30 | #' 
31 | #' ## adding some pattern
32 | #' d$numkids[d$gender=='male'] <- d$numkids[d$gender=='male'] + sample(0:3, sum(d$gender=='male'), replace=T)
33 | #' d$numkids <- factor(d$numkids)
34 | #' 
35 | #' ## calling Shiny App to visualize association rules
36 | #' arulesApp(d)
37 | 
38 | arulesApp <- function (dataset, bin=T, vars=5, supp=0.1, conf=0.5) {
39 | 
40 | ## binning numeric data
41 | for(i in 1:ncol(dataset)) {
42 | if(bin==T & class(dataset[,i]) %in% c('numeric', 'integer')) dataset[,i] <- Rsenal::depthbin(dataset[,i], nbins=10)
43 | }
44 | 
45 | ## calling Shiny App
46 | shinyApp(ui = shinyUI(pageWithSidebar(
47 | 
48 | headerPanel("Association Rules"),
49 | 
50 | sidebarPanel(
51 | 
52 | conditionalPanel(
53 | condition = "input.samp=='Sample'",
54 | numericInput("nrule", 'Number of Rules', 5), br()
55 | ),
56 | 
57 | conditionalPanel(
58 | condition = "input.mytab=='graph'",
59 | radioButtons('graphType', label='Graph Type', choices=c('itemsets','items'), inline=T), br()
60 | ),
61 | 
62 | conditionalPanel(
63 | condition = "input.lhsv=='Subset'",
64 | uiOutput("choose_lhs"), br()
65 | ),
66 | 
67 | conditionalPanel(
68 | condition = "input.rhsv=='Subset'",
69 | uiOutput("choose_rhs"), br()
70 | ),
71 | 
72 | conditionalPanel(
73 | condition = "input.mytab=='grouped'",
74 | sliderInput('k', label='Choose # of rule clusters', min=1, max=150, step=1, value=15), br()
75 | ),
76 | 
77 | conditionalPanel(
78 | condition = "input.mytab %in% c('grouped', 'graph', 'table', 'datatable', 'scatter', 'paracoord', 'matrix', 'itemFreq')",
79 | radioButtons('samp', label='Sample', choices=c('All Rules', 'Sample'), inline=T), br(),
80 | uiOutput("choose_columns"), br(),
81 | sliderInput("supp", "Support:", min = 0, max = 1, value = supp , step = 1/10000), br(),
82 | sliderInput("conf", "Confidence:", min = 0, max = 1, value = conf , step = 1/10000), br(),
83 | selectInput('sort', label='Sorting Criteria:', choices = c('lift', 'confidence', 'support')), br(), br(),
84 | numericInput("minL", "Min. items per set:", 2), br(),
85 | numericInput("maxL", "Max. items per set:", 3), br(),
86 | radioButtons('lhsv', label='LHS variables', choices=c('All', 'Subset')), br(),
87 | radioButtons('rhsv', label='RHS variables', choices=c('All', 'Subset')), br(),
88 | downloadButton('downloadData', 'Download Rules as CSV')
89 | )
90 | 
91 | ),
92 | 
93 | mainPanel(
94 | tabsetPanel(id='mytab',
95 | tabPanel('Grouped', value='grouped', plotOutput("groupedPlot", width='100%', height='100%')),
96 | tabPanel('Graph', value='graph', plotOutput("graphPlot", width='100%', height='100%')),
97 | tabPanel('Scatter', value='scatter', plotOutput("scatterPlot", width='100%', height='100%')),
98 | tabPanel('Parallel Coordinates', value='paracoord', plotOutput("paracoordPlot", width='100%', height='100%')),
99 | tabPanel('Matrix', value='matrix', plotOutput("matrixPlot", width='100%', height='100%')),
100 | tabPanel('ItemFreq', value='itemFreq', plotOutput("itemFreqPlot", width='100%', height='100%')),
101 | tabPanel('Table', value='table', verbatimTextOutput("rulesTable")),
102 | tabPanel('Data Table', value='datatable', dataTableOutput("rulesDataTable"))
103 | )
104 | )
105 | 
106 | )),
107 | 
108 | server = function(input, output) {
109 | 
110 | output$choose_columns <- renderUI({
111 | checkboxGroupInput("cols", "Choose variables:",
112 | choices = colnames(dataset),
113 | selected = colnames(dataset)[1:vars])
114 | })
115 | 
116 | 
117 | output$choose_lhs <- renderUI({
118 | checkboxGroupInput("colsLHS", "Choose LHS variables:",
119 | choices = input$cols,
120 | selected = input$cols[1])
121 | })
122 | 
123 | output$choose_rhs <- renderUI({
124 | checkboxGroupInput("colsRHS", "Choose RHS variables:",
125 | choices = input$cols,
126 | selected = input$cols[1])
127 | })
128 | 
129 | ## Extracting and Defining arules
130 | rules <- reactive({
131 | tr <- as(dataset[,input$cols], 'transactions')
132 | arAll <- apriori(tr, parameter=list(support=input$supp, confidence=input$conf, minlen=input$minL, maxlen=input$maxL))
133 | 
134 | if(input$rhsv=='Subset' & input$lhsv!='Subset'){
135 | varsR <- character()
136 | for(i in 1:length(input$colsRHS)){
137 | tmp <- with(dataset, paste(input$colsRHS[i], '=', levels(as.factor(get(input$colsRHS[i]))), sep=''))
138 | varsR <- c(varsR, tmp)
139 | }
140 | ar <- subset(arAll, subset=rhs %in% varsR)
141 | 
142 | } else if(input$lhsv=='Subset' & input$rhsv!='Subset') {
143 | varsL <- character()
144 | for(i in 1:length(input$colsLHS)){
145 | tmp <- with(dataset, paste(input$colsLHS[i], '=', levels(as.factor(get(input$colsLHS[i]))), sep=''))
146 | varsL <- c(varsL, tmp)
147 | }
148 | ar <- subset(arAll, subset=lhs %in% varsL)
149 | 
150 | } else if(input$lhsv=='Subset' & input$rhsv=='Subset') {
151 | varsL <- character()
152 | for(i in 1:length(input$colsLHS)){
153 | tmp <- with(dataset, paste(input$colsLHS[i], '=', levels(as.factor(get(input$colsLHS[i]))), sep=''))
154 | varsL <- c(varsL, tmp)
155 | }
156 | varsR <- character()
157 | for(i in 1:length(input$colsRHS)){
158 | tmp <- with(dataset, paste(input$colsRHS[i], '=', levels(as.factor(get(input$colsRHS[i]))), sep=''))
159 | varsR <- c(varsR, tmp)
160 | }
161 | ar <- subset(arAll, subset=lhs %in% varsL & rhs %in% varsR)
162 | 
163 | } else {
164 | ar <- arAll
165 | }
166 | quality(ar)$conviction <- interestMeasure(ar, 'conviction', transactions=tr) # append extra rule-quality measures
167 | quality(ar)$hyperConfidence <- interestMeasure(ar, 'hyperConfidence', transactions=tr)
168 | quality(ar)$cosine <- interestMeasure(ar, 'cosine', transactions=tr)
169 | quality(ar)$chiSquare <- interestMeasure(ar, 'chiSquare', transactions=tr)
170 | quality(ar)$coverage <- interestMeasure(ar, 'coverage', transactions=tr)
171 | quality(ar)$doc <- interestMeasure(ar, 'doc', transactions=tr)
172 | quality(ar)$gini <- interestMeasure(ar, 'gini', transactions=tr)
173 | quality(ar)$hyperLift <- interestMeasure(ar, 'hyperLift', transactions=tr)
174 | ar
175 | })
176 | 
177 | # Rule length
178 | nR <- reactive({
179 | nRule <- ifelse(input$samp == 'All Rules', length(rules()), input$nrule)
180 | })
181 | 
182 | ## Grouped Plot #########################
183 | output$groupedPlot <- renderPlot({
184 | ar <- rules()
185 | plot(sort(ar, by=input$sort)[1:nR()], method='grouped', control=list(k=input$k))
186 | }, height=800, width=800)
187 | 
188 | ## Graph Plot ##########################
189 | output$graphPlot <- renderPlot({
190 | ar <- rules()
191 | plot(sort(ar, by=input$sort)[1:nR()], method='graph', control=list(type=input$graphType))
192 | }, height=800, width=800)
193 | 
194 | ## Scatter Plot ##########################
195 | output$scatterPlot <- renderPlot({
196 | ar <- rules()
197 | plot(sort(ar, by=input$sort)[1:nR()], method='scatterplot')
198 | }, height=800, width=800)
199 | 
200 | ## Parallel Coordinates Plot ###################
201 | output$paracoordPlot <- renderPlot({
202 | ar <- rules()
203 | plot(sort(ar, by=input$sort)[1:nR()], method='paracoord')
204 | }, height=800, width=800)
205 | 
206 | ## Matrix Plot ###################
207 | output$matrixPlot <- renderPlot({
208 | ar <- rules()
209 | plot(sort(ar, by=input$sort)[1:nR()], method='matrix', control=list(reorder=T))
210 | }, height=800, width=800)
211 | 
212 | ## Item Frequency Plot ##########################
213 | output$itemFreqPlot <- renderPlot({
214 | trans <- as(dataset[,input$cols], 'transactions')
215 | itemFrequencyPlot(trans)
216 | }, height=800, width=800)
217 | 
218 | ## Rules Data Table ##########################
219 | output$rulesDataTable <- renderDataTable({
220 | ar <- rules()
221 | rulesdt <- rules2df(ar)
222 | rulesdt
223 | })
224 | 
225 | ## Rules Printed ########################
226 | output$rulesTable <- renderPrint({
227 | # hack to display results... make sure this matches the line above!!
228 | #ar <- apriori(dataset[,input$cols], parameter=list(support=input$supp, confidence=input$conf, minlen=input$minL, maxlen=input$maxL))
229 | ar <- rules()
230 | inspect(sort(ar, by=input$sort))
231 | })
232 | 
233 | ## Download data to csv ########################
234 | output$downloadData <- downloadHandler(
235 | filename = 'arules_data.csv',
236 | content = function(file) {
237 | write.csv(rules2df(rules()), file)
238 | }
239 | )
240 | 
241 | 
242 | }
243 | )
244 | }
245 | 
246 | 
247 | 
248 | 
249 | 
250 | 
251 | 
252 | 
--------------------------------------------------------------------------------
/R/shiny_tableNet.R:
--------------------------------------------------------------------------------
1 | #' @title Shiny app to visualize schema of relational tables
2 | #' @description Launches a Shiny App that provides a visual representation of the relationships between a collection of tables
3 | #' (data.frames) with some relational structure.
If you are given a dump of dozens of flat files without a formal schema or
4 | #' documentation on your data, this app will help explore and understand the underlying schema - which tables can be joined
5 | #' to which other tables, which variables can be used to join which tables, etc. It also gives a read on how strongly each variable with
6 | #' the same name in multiple tables actually connects tables (how many of the values of the linking variable \code{x} that are in
7 | #' \code{table1} are also in \code{table2}). \cr \cr
8 | #' @details Note, the Key-strength tables can be slow to display because these computations actually dive into the contents of the
9 | #' data.frames and perform set operations on every row of the variable of interest. It is possible to use the \code{\link{isKey}} function
10 | #' to compute these similarity matrices ahead of time to prevent the Shiny app from doing these computations each time.
11 | #' @param dfL list of data.frames used to generate schema. This is easily generated from \code{\link{dir2dfList}}
12 | #' @seealso \code{\link{dir2dfList}} \code{\link{isKey}}
13 | #' @return Shiny App
14 | #' @import memoise shiny gplots igraph
15 | #' @export
16 | #' @examples
17 | #' \dontrun{
18 | #' ## download some baseball data. NOTE This will download 30MB of data (25 csv files) into a temporary directory
19 | #' temp <- tempfile()
20 | #' localDataDir <- paste0(tempdir(), '\\lahman2012-csv-onYourComp.zip')
21 | #' download.file('http://seanlahman.com/files/database/lahman2012-csv.zip', localDataDir)
22 | #' unzip(localDataDir, exdir=paste0(tempdir(), '\\lahman2012-csv-onYourComp')) ## may not be necessary
23 | #' 
24 | #' ## create a list of data.frames from .CSVs
25 | #' dfL <- dir2dfList(paste0(tempdir(), '\\lahman2012-csv-onYourComp'), ext='.csv', exclude=NULL, sep=',', stringsAsFactors=F)
26 | #' 
27 | #' ## launch app
28 | #' tableNet(dfL)
29 | #' }
30 | 
31 | 
32 | tableNet <- function(dfL) {
33 | 
34 | ###############################################################################
35 | ## takes list of Data frames and makes igraph network ########################
36 | ###############################################################################
37 | dfL2network <- function(dfL, islands=T){
38 | # create adjacency list
39 | edgeL <- data.frame(v1=as.character(), v2=as.character(), v3=as.character(), stringsAsFactors=F)
40 | k<-1
41 | for(i in 1:length(dfL)){
42 | for(j in 1:length(dfL)){
43 | commonv <- intersect(names(dfL[[i]]), names(dfL[[j]]))
44 | if(length(commonv)>0){
45 | for(cv in 1:length(commonv)){
46 | edgeL[k,1] <- names(dfL)[i]
47 | edgeL[k,2] <- names(dfL)[j]
48 | edgeL[k,3] <- commonv[cv]
49 | k<-k+1
50 | }
51 | }
52 | }
53 | }
54 | 
55 | ## creating graph from adjacency
56 | edgeL <- edgeL[edgeL[,1]!=edgeL[,2],]
57 | g <- graph.data.frame(edgeL[,c(1,2)], directed=F)
58 | if(islands==T) {
59 | xvars <- unlist(lapply(dfL, names),use.names=F)
60 | keys <- unique(xvars[duplicated(xvars)==T])
61 | islandtab <- names(dfL)[sapply(dfL, function(x) sum(names(x) %in% keys)==0)] # tables that share no keys with any other table
62 | g <- g + vertices(islandtab)
63 | }
64 | E(g)$color <- 'gray'
65 | E(g)$name <- edgeL[,3]
66 | 
67 | return(g)
68 | }
69 | 
70 | ## create schema network graph
71 | g <- dfL2network(dfL)
72 | 
73 | ## create memoised version of isKey to cache the key-strength computations
74 | isKeym <- memoise(isKey)
75 | 
76 | #######################################################
77 | ## ACTUALLY RUN APP ###################################
78 | #######################################################
79 | 
80 | shinyApp(ui
= shinyUI(pageWithSidebar( 81 | 82 | headerPanel("Database Schema Visualization"), 83 | 84 | ## NETWORK 85 | sidebarPanel( 86 | conditionalPanel( 87 | condition="input.mytab=='network'", 88 | 89 | sliderInput("vlabcex", "Vertex label size:", value=1, min=.05, max=5, step=0.05), 90 | sliderInput("sizeNodesSlider", "Vertex size", value=10, min=0.1, max=30, step=0.1), 91 | sliderInput("ewidth", "Edge width:", value=1, min=1, max=10, step=1), 92 | selectInput("lay2", "Choose a layout", choices = c('circle', 'fruchterman.reingold')), 93 | selectInput("vcolor", "Color vertices by:", choices = c('# of keys', '# of connections', 'strength of keys')), 94 | checkboxInput(inputId='sizeNodes', label='Size vertices by connections', value=T), 95 | checkboxInput(inputId='curved', label='Curve edges', value=F), 96 | checkboxInput(inputId='islands', label='Remove unconnected Tables', value=T), 97 | radioButtons(inputId='subEdges', label='Select Edges:', choices=c('all', 'some')), 98 | 99 | conditionalPanel("input.subEdges=='some'", 100 | checkboxInput(inputId='delE', label='Delete un-selected edges', value=F), 101 | selectInput(inputId='edgev', label='Variables', choices=sort(unique(E(g)$name)), multiple=T) 102 | ) 103 | 104 | ), 105 | 106 | ## STRENGTH TABLE 107 | conditionalPanel( 108 | condition="input.mytab=='strength'", 109 | checkboxInput(inputId='keyList', label='Optional: Use pre-computed key strength matrix', value=FALSE), 110 | 111 | conditionalPanel("input.keyList==true", 112 | selectInput(inputId='keyListObject', label='Pick pre-computed key strength matrix object', choices=ls(name='.GlobalEnv')) 113 | ), 114 | 115 | selectInput(inputId='key', label='Choose a key', choices=sort(unique(E(g)$name))), 116 | sliderInput("keylab", "Variable labels size", value=1, min=0.1, max=10, step=0.1) 117 | 118 | ), 119 | 120 | 121 | ## KEY-TABLE HEATMAP 122 | conditionalPanel( 123 | condition="input.mytab=='keyTab'", 124 | sliderInput("colLabCex", "Table Label Size", value=1, min=0.1, max=10, step=0.1), 125 | sliderInput("rowLabCex", "Key Label Size", value=1, min=.1, max=10, step=.1), 126 | sliderInput("marginSize", "Margin Size", value=15, min=0, max=100, step=1), 127 | sliderInput("wsize", "width size", value=800, min=0, max=5000, step=50) 128 | ), 129 | 130 | ## ADJACENCY LIST 131 | conditionalPanel( 132 | condition="input.mytab=='adjlist'", 133 | selectInput(inputId='adjcommonvar', label='Choose a key', choices=sort(unique(E(g)$name)), multiple=T) 134 | ) 135 | 136 | 137 | ), 138 | 139 | ## PANELS TO SHOW 140 | mainPanel( 141 | tabsetPanel(id='mytab', 142 | tabPanel('Network', value='network', plotOutput('circle', height='150%')), 143 | tabPanel('Key Strength', value='strength', plotOutput('strengthPlot', height='150%')), 144 | tabPanel('Key-Table Matrix', value='keyTab', plotOutput('keyTabMat', height='150%')), 145 | tabPanel('Adjacency List', value='adjlist', dataTableOutput('adjlisttab')) 146 | ) 147 | ) 148 | 149 | )), 150 | 151 | server = function(input, output) { 152 | 153 | ################################################ 154 | ## NETWORK 155 | ################################################ 156 | 157 | ## DEFINING COMMON VARIABLES 158 | commonv <- unlist(lapply(dfL, names),use.names=F) 159 | commonv <- unique(commonv[duplicated(commonv)==T]) 160 | 161 | output$circle <- renderPlot({ 162 | 163 | ## delete edges 164 | if(input$delE==T){ 165 | eid <- which(E(g)$name %in% input$edgev) 166 | g <- subgraph.edges(g, eids=eid, delete.vertices=T) 167 | } 168 | 169 | ## remove island vertices 170 | 
if(input$islands==T){
171 | islandTabs <- which(degree(g)==0)
172 | g <- delete.vertices(g, v=islandTabs)
173 | }
174 | 
175 | ## setting default graphic parameters
176 | E(g)$width <- input$ewidth
177 | E(g)$color <- 'gray'
178 | 
179 | ## edge colors
180 | if(input$subEdges=='some'){
181 | E(g)$color[E(g)$name %in% input$edgev] <- 'red'
182 | E(g)$width[E(g)$name %in% input$edgev] <- input$ewidth + 3
183 | }
184 | 
185 | ## vertex size
186 | if(input$sizeNodes==T) {
187 | deg <- igraph::degree(g, V(g)$name, mode='total')
188 | V(g)$size <- (deg/mean(deg)*input$sizeNodesSlider) + 5
189 | } else {
190 | V(g)$size <- 5 + input$sizeNodesSlider}
191 | 
192 | ## vertex colors
193 | if(input$vcolor=='# of keys') {
194 | namesL <- lapply(dfL, names)
195 | keysInTab <- unlist(lapply(namesL, function(x) sum((x %in% commonv)==T)))
196 | keysInTab[is.na(keysInTab)] <- 0
197 | dfn <- data.frame(keysInTab)
198 | dfn$vnames <- row.names(dfn)
199 | V(g)$nkeys <- as.numeric(makeVertexAtt(g, df=dfn, vname='keysInTab', by.df='vnames', by.g='name'))
200 | vcolPal <- colorRampPalette(c("white", "purple"))(n = max(V(g)$nkeys)+1)
201 | V(g)$color <- vcolPal[V(g)$nkeys+1]
202 | 
203 | } else if(input$vcolor=='# of connections') {
204 | vcolPal <- colorRampPalette(c("white", "purple"))(n = max(degree(g))+1)
205 | V(g)$color <- vcolPal[degree(g)+1]
206 | }
207 | 
208 | ## layout
209 | lay <- get(paste('layout.', as.character(input$lay2), sep=''))
210 | 
211 | ## ACTUALLY PLOTTING NETWORK
212 | plot(g, vertex.label.cex=input$vlabcex, layout=lay, edge.curved=input$curved, vertex.label.color='black')
213 | }, width=800, height=800)
214 | 
215 | 
216 | ################################################
217 | ## STRENGTH CHART
218 | ################################################
219 | 
220 | output$strengthPlot <- renderPlot({
221 | 
222 | ## defining keyL object
223 | if(input$keyList==TRUE) {keyL <- get(input$keyListObject)[[input$key]]
224 | } else {
225 | keyL <- isKeym(dfL, input$key)
226 | }
227 | 
228 | myPalette <- colorRampPalette(c("white", "firebrick"))(n=20)
229 | checklab <- round(keyL,2)
230 | heatmap.2(keyL, trace='none', dendrogram='none', Rowv=F, Colv=F, margins=c(18,18), col=myPalette,
231 | cellnote=checklab, notecol='black', na.rm=T, cexRow=input$keylab, cexCol=input$keylab)
232 | text(x=.5, y=.9, '1. Take unique values of the \n key variable in each table. \n
233 | 2. Look at the share of these unique \n values from table 1 (right) \n that appear in table 2 (bottom) \n
234 | 3. 
So variables with high row scores are strong keys \n'
235 | , cex=1.2)
236 | 
237 | }, width=800, height=800)
238 | 
239 | 
240 | ################################################
241 | ## STRENGTH TABLE
242 | ################################################
243 | 
244 | plotSize <- reactive({
245 | return(input$wsize)
246 | })
247 | 
248 | output$keyTabMat <- renderPlot({
249 | ## setting up matrix
250 | tabv <- names(dfL)
251 | mat <- matrix(nrow=length(commonv), ncol=length(tabv))
252 | colnames(mat) <- tabv
253 | rownames(mat) <- commonv
254 | 
255 | for(i in 1:nrow(mat)){
256 | colindx <- which(sapply(lapply(dfL, names), function(x) rownames(mat)[i] %in% x))
257 | mat[i, colindx] <- 1
258 | }
259 | mat[is.na(mat)] <- 0
260 | mat <- mat[rev(order(rowSums(mat))),]
261 | 
262 | myPalette <- colorRampPalette(c("white", "firebrick"))(n = 2)
263 | checklab <- mat
264 | heatmap.2(mat, trace='none', dendrogram='none', Rowv=F, Colv=F, key=F,
265 | margins=c(input$marginSize, input$marginSize), lhei = c(0.1,0.9),
266 | main='Linking variables (rows) in Tables (columns)',
267 | col=myPalette, cellnote=checklab, notecol='black', cexRow=input$rowLabCex, cexCol=input$colLabCex)
268 | }, width=plotSize, height=plotSize, units='px')
269 | 
270 | ################################################
271 | ## ADJACENCY LIST
272 | ################################################
273 | output$adjlisttab <- renderDataTable({
274 | df <- get.data.frame(g)[,c('from', 'to', 'name')]
275 | names(df) <- c('table1', 'table2', 'commonVariable')
276 | if(length(input$adjcommonvar)>0) df <- df[df$commonVariable %in% input$adjcommonvar,]
277 | df
278 | })
279 | 
280 | }
281 | )
282 | 
283 | }
284 | 
285 | 
286 | 
287 | #' @title Turn a directory of flat files into a list of data.frames
288 | #' @description Useful to prepare data for \code{\link{tableNet}}
289 | #' @param dfdir character string of the directory to load flat files from
290 | #' @param ext file extension of the files to load. Usually \code{.csv} or \code{.txt}
291 | #' @param exclude character vector of column names to be excluded from each table. Must be set to \code{NULL} or a character
292 | #' vector, or else \code{...} arguments will not be handled properly.
293 | #' @param ... parameters to pass to \code{\link{read.delim}}. Commonly \code{nrows}, \code{sep}.
294 | #' @seealso \code{\link{tableNet}} \code{\link{isKey}}
295 | #' @return list of data.frames
296 | #' @export
297 | #' 
298 | #' @examples
299 | #' \dontrun{
300 | #' ## download some baseball data. NOTE This will download 30MB of data (25 csv files) into a temporary directory
301 | #' temp <- tempfile()
302 | #' localDataDir <- paste0(tempdir(), '\\lahman2012-csv-onYourComp.zip')
303 | #' download.file('http://seanlahman.com/files/database/lahman2012-csv.zip', localDataDir)
304 | #' unzip(localDataDir, exdir=paste0(tempdir(), '\\lahman2012-csv-onYourComp')) ## may not be necessary
305 | #' 
306 | #' ## create a list of data.frames from .CSVs
307 | #' dfL <- dir2dfList(paste0(tempdir(), '\\lahman2012-csv-onYourComp'), ext='.csv', exclude=NULL, sep=',', stringsAsFactors=F)
308 | #' }
309 | 
310 | dir2dfList <- function(dfdir, ext='.txt', exclude=NULL, ...)
{
311 | # get list of files in the directory with the given extension
312 | setwd(dfdir)
313 | tables <- list.files()[sapply(list.files(), function(x) substr(x,nchar(x)-3, nchar(x)))==ext]
314 | tableNames <- sapply(tables, function(x) substr(x,0, nchar(x)-4), USE.NAMES=F)
315 | 
316 | # create list of dfs from directory
317 | dfL <- list()
318 | for(i in 1:length(tables)) {
319 | dfL[[tableNames[i]]] <- read.delim(tables[i], ...)
320 | dfL[[tableNames[i]]] <- dfL[[tableNames[i]]][,!names(dfL[[tableNames[i]]]) %in% exclude]
321 | print(paste(tableNames[i], nrow(dfL[[tableNames[i]]]), Sys.time()))
322 | }
323 | 
324 | return(dfL)
325 | }
326 | 
327 | #' @title Determine strength of linking variables
328 | #' @description This function computes the percentage of unique values of a column \code{x} from \code{table1} that appear
329 | #' in \code{table2}. It is called and computed on the fly in \code{\link{tableNet}}. However, these computations can be
330 | #' slow on large datasets, so it is provided as a standalone function that can be run once to store the output and feed it into the
331 | #' \code{\link{tableNet}} app to prevent repetitive slow computations on the fly.
332 | #' @param dfL list of data.frames. easily generated from \code{\link{dir2dfList}}
333 | #' @param xvar character string, name of the variable to calculate strength for across all tables in \code{dfL}
334 | #'
335 | #' @seealso \code{\link{tableNet}} \code{\link{dir2dfList}}
336 | #' @return matrix of key-strength scores, with one row and column per table containing \code{xvar}
337 | #' @export
338 | #' 
339 | #' @examples
340 | #' \dontrun{
341 | #' ## download some baseball data. NOTE This will download 30MB of data (25 csv files) into a temporary directory
342 | #' temp <- tempfile()
343 | #' localDataDir <- paste0(tempdir(), '\\lahman2012-csv-onYourComp.zip')
344 | #' download.file('http://seanlahman.com/files/database/lahman2012-csv.zip', localDataDir)
345 | #' unzip(localDataDir, exdir=paste0(tempdir(), '\\lahman2012-csv-onYourComp')) ## may not be necessary
346 | #' 
347 | #' ## create a list of data.frames from .CSVs
348 | #' dfL <- dir2dfList(paste0(tempdir(), '\\lahman2012-csv-onYourComp'), ext='.csv', exclude=NULL, sep=',', stringsAsFactors=F)
349 | #' isKey(dfL, 'playerID')
350 | #' }
351 | 
352 | 
353 | isKey <- function(dfL, xvar) {
354 | 
355 | tabNames <- lapply(dfL, names)
356 | tabs <- names(which(lapply(tabNames, function(x) xvar %in% x)==T))
357 | mat <- matrix(nrow=length(tabs), ncol=length(tabs))
358 | ii <- 1;
359 | for(i in tabs){
360 | iivar <- dfL[[i]][,xvar]
361 | jj <- 1
362 | for(j in tabs){
363 | jjvar <- dfL[[j]][,xvar]
364 | 
365 | mat[jj,ii] <- sum(jjvar %in% iivar)/length(jjvar) # share of table j's values found in table i
366 | jj<-jj+1
367 | }
368 | ii<-ii+1
369 | }
370 | 
371 | mat[is.na(mat)] <- 0
372 | colnames(mat) <- tabs
373 | rownames(mat) <- tabs
374 | return(mat)
375 | }
376 | 
377 | 
--------------------------------------------------------------------------------
/R/modeling.R:
--------------------------------------------------------------------------------
1 | #' @title Create formula
2 | #' @description Creates a formula object which can be passed to many R modeling functions from a vector of variable names.
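#' @details If \code{y} arrives as an unevaluated call such as \code{as.factor(am)} (which is what
#' \code{editForm} passes along when the left-hand side of an existing formula is wrapped in \code{as.factor}),
#' it is re-assembled into the string \code{"as.factor(am)"} before the formula is built.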
3 | #' @param y character string, target variable
4 | #' @param xList character vector, predictor variables (Right Hand Side variables) for formula
5 | #' @return formula object
6 | #' @export
7 | #' @examples
8 | #' form <- makeForm('mpg', c('drat', 'wt', 'hp'))
9 | #' summary(lm(form, data=mtcars))
10 | 
11 | makeForm <- function(y, xList) {
12 | if(as.character(y)[1]=='as.factor') y<-paste(paste(as.character(y), collapse='('), ')', sep='') # re-assemble a call like as.factor(am) into the string 'as.factor(am)'
13 | form <- (as.formula(paste(y, '~', paste(xList, collapse='+'), sep='')))
14 | return(form)
15 | }
16 | 
17 | #' @title Edit a formula
18 | #' @description Easy way to add or subtract predictor variables to a formula
19 | #' @param form formula, the formula you will be editing
20 | #' @param sub character vector, predictor variables to subtract (remove) from formula
21 | #' @param add character vector, predictor variables to add to formula
22 | #' @return formula, edited formula object
23 | #' @export
24 | #' @examples
25 | #' form <- as.formula('mpg~cyl+hp+drat+qsec')
26 | #' editForm(form, add=c('wt', 'gear', 'carb'))
27 | #' editForm(form, sub=c('cyl', 'hp', 'qsec', 'variableNotInFormula'))
28 | #' editForm(form, sub=c('cyl', 'hp', 'qsec'), add=c('wt'))
29 | 
30 | editForm <- function(form, add=NULL, sub=NULL){
31 | eform <- form
32 | X <- unlist(strsplit(as.character(form)[[3]], ' +'))
33 | X <- X[X!='+']
34 | Y <- form[[2]]
35 | 
36 | if(is.null(sub)==F){
37 | Xsub <- X[!X %in% sub]
38 | eform <- makeForm(Y,Xsub)
39 | }
40 | 
41 | if(is.null(add)==F){
42 | if('Xsub' %in% ls()) X <- Xsub
43 | Xadd <- c(X, add)
44 | eform <- makeForm(Y,Xadd)
45 | }
46 | 
47 | return(eform)
48 | }
49 | 
50 | shuffleForm <- function(form){ # randomly re-order the RHS variables of a formula
51 | eform <- form
52 | X <- unlist(strsplit(as.character(form)[[3]], ' +'))
53 | X <- X[X!='+']
54 | X <- X[sample(1:length(X), replace=F)]
55 | Y <- form[[2]]
56 | shuffledForm <- makeForm(Y, X)
57 | return(shuffledForm)
58 | }
59 | 
60 | #' @title Variable importance from iteratively shuffled orders of predictor variables
61 | #' @description Runs multiple anova analyses to assess the deviance explained by each predictor under shuffled variable orders, iteratively.
62 | #' Currently using only logistic regression. Could be generalized.
63 | #' @param form formula for the logistic regression model to be fit with \code{\link{glm}}.
64 | #' @param df data.frame of data to be used for analysis
65 | #' @param n number of iterations for shuffled ANOVA analysis
66 | #' @param test character string, statistical test to run. Default is 'Chisq'.
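#' @details Why shuffle? The ANOVA on a glm is sequential, so the deviance credited to each predictor
#' depends on where it appears in the formula; correlated predictors can trade deviance depending on
#' their order. Summarizing the deviance per predictor over \code{n} shuffled orders gives an
#' order-robust importance score.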
#' @title Variable importance from iteratively shuffled orders of predictor variables
#' @description Runs the same ANOVA analysis \code{n} times, shuffling the order of the predictor variables each time,
#' to assess the deviance explained by each predictor regardless of where it enters the model.
#' Currently supports only logistic regression; could be generalized.
#' @param form formula for the logistic regression model
#' @param df data.frame of data to be used for analysis
#' @param n number of iterations for shuffled ANOVA analysis
#' @param test character string, statistical test to run. Default is 'Chisq'.
#' @return data.frame of results
#' @export
#' @examples
#' form <- as.formula('am~wt+gear+carb+cyl+hp+drat+qsec')
#' shuffleAnova(form, mtcars, n=50)

shuffleAnova <- function(form, df, n=5, test='Chisq'){
  # create a list of n anova analyses, each with a shuffled order of x variables
  anovaL <- list()
  for(i in 1:n){
    sform <- shuffleForm(form)
    anovaL[[i]] <- anova(glm(sform, data=df, family=binomial(logit)), test=test)
  }

  # summarize each predictor's deviance and p-value across the n shuffled-order analyses
  pred <- unique(unlist(lapply(anovaL, function(x) unique(rownames(x)))))
  res <- data.frame(xvar=pred)
  for(i in pred){
    dev <- unlist(lapply(anovaL, function(x) x[rownames(x)==i, 2]))
    res$deviance_mean[res$xvar==i] <- mean(dev)
    res$deviance_median[res$xvar==i] <- median(dev)
    res$deviance_sd[res$xvar==i] <- sd(dev)
    res$deviance_min[res$xvar==i] <- min(dev)
    res$deviance_max[res$xvar==i] <- max(dev)
    res$pvalue_median[res$xvar==i] <- median(unlist(lapply(anovaL, function(x) x[rownames(x)==i, 5])), na.rm=T)
  }
  p <- cut(res$pvalue_median, breaks=c(0, .001, .01, .05, .1, 1))
  res$pstar <- factor(x=p, levels=levels(p), labels=c('***', '**', '*', '.', ''))
  res <- res[order(res$deviance_mean, decreasing=T),]
  return(res)
}
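
## Illustrative sketch (wt and hp are arbitrary mtcars predictors; wrapped in
## if (FALSE) so it never runs at package load). In a sequential ANOVA each
## term is credited only with the deviance it explains beyond the terms
## entered before it, so the same predictor earns different deviance in
## different orders -- the motivation for averaging over shuffled orders above.
if (FALSE) {
  anova(glm(am ~ wt + hp, data = mtcars, family = binomial(logit)), test = 'Chisq')
  anova(glm(am ~ hp + wt, data = mtcars, family = binomial(logit)), test = 'Chisq')
}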
#' @title Format logistic regression table
#' @description Turns a logistic regression object into a regression table
#' and optionally writes it out to a csv
#' @param reg logistic regression object, output from \code{\link{glm}}
#' @param file filepath for output. \code{txt} or \code{csv} file.
#' @param xvlab data.frame, lookup table for variable names. First column is the codename, second column is the pretty printed name.
#' @param stats logical, if \code{TRUE} appends model-level statistics (AIC, n, model Chi Sq. and its p-value) to the output
#' @return data.frame of results formatted nicely
#' @export
#' @examples
#' reg <- glm(am~qsec+hp, data=mtcars, family=binomial(logit))
#' logit2tab(reg)
#' longnames <- data.frame(short = c('wt', 'mpg', 'cyl', 'drat', 'hp', 'am', 'qsec'),
#'                         long = c('Weight', 'Miles Per Gallon', 'Cylinder', 'D.R.A.T', 'Horsepower', 'A.M.', 'Q Seconds'))
#' logit2tab(reg, xvlab=longnames, stats=T)

logit2tab <- function(reg, file=NULL, xvlab=NULL, stats=F){
  tab <- summary(reg)$coefficients
  tab <- data.frame(coef=tab[,'Estimate'], oddsRatio=exp(tab[,'Estimate']), stdError=tab[,'Std. Error'], p=tab[,'Pr(>|z|)'], stringsAsFactors=F)
  p <- cut(tab$p, breaks=c(0, .001, .01, .05, .1, 1))
  tab$pstar <- factor(x=p, levels=levels(p), labels=c('***', '**', '*', '.', ''))
  if(is.null(xvlab)==F) {
    tab$Metric <- as.character(xvlab[match(rownames(tab), xvlab[,1]), 2])
    tab$Metric[is.na(tab$Metric)] <- ''
    tab$Metric[rownames(tab)=='(Intercept)'] <- '(Intercept)'
    tab <- tab[, c('Metric', names(tab)[!names(tab) %in% 'Metric'])]
  }
  if(stats==T){
    tab[nrow(tab)+1, ] <- ''; rownames(tab)[nrow(tab)] <- ''
    tab[nrow(tab)+1, 1:2] <- c('aic', reg$aic)
    tab[nrow(tab)+1, 1:2] <- c('n', nrow(reg$model))
    # likelihood-ratio test of the fitted model against the null model
    tab[nrow(tab)+1, 1:2] <- c('Chi Sq.', reg$null.deviance - reg$deviance)
    tab[nrow(tab)+1, 1:2] <- c('P(>|Chi|)', pchisq(reg$null.deviance-reg$deviance, df=reg$df.null-reg$df.residual, lower.tail=F))
    tab[is.na(tab)] <- ''
  }
  # write after the stats block so the csv matches the returned table
  if(is.null(file)==F) write.csv(tab, file=file)
  return(tab)
}

#' @title All-subsets logistic regression
#' @description Plots the \code{BestModels} output of a \code{bestglm} object. Similar to the visual output of plotting a \code{regsubsets} object
#' @param bglm the \code{BestModels} data.frame from a \code{\link{bestglm}} object
#' @param rc decimal places to display on y-axis of plot
#' @return plot
#' @seealso \code{\link{bestglm}}, \code{\link{leaps}}
#' @import bestglm
#' @export
#' @examples
#' require('bestglm')
#' b <- bestglm(Xy=mtcars[,c('mpg', 'hp', 'drat', 'cyl', 'wt', 'qsec', 'vs')], family=binomial(logit), IC='BIC', nvmax=4)
#' plotBestglm(b$BestModels, rc=3)

plotBestglm <- function(bglm, rc=2) {
  # last column of BestModels is the criterion; the rest are inclusion flags
  image(as.matrix(t(bglm[, 1:(ncol(bglm)-1)])), col=c(0:1), xaxt='n', yaxt='n')
  axis(1, at=seq(0, 1, length.out=ncol(bglm)-1), labels=colnames(bglm)[1:(ncol(bglm)-1)], las=2)
  axis(2, at=seq(0, 1, length.out=nrow(bglm)), labels=rev(round(bglm$Criterion, rc)), las=2)
}
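
## Illustrative sketch (mirrors the roxygen example above; wrapped in
## if (FALSE) so it never runs at package load). The 'Chi Sq.' and 'P(>|Chi|)'
## rows appended by logit2tab(stats=T) are the overall likelihood-ratio test
## of the fitted model against the null model, reproducible by hand from any
## glm object.
if (FALSE) {
  reg <- glm(am ~ qsec + hp, data = mtcars, family = binomial(logit))
  chisq <- reg$null.deviance - reg$deviance
  pchisq(chisq, df = reg$df.null - reg$df.residual, lower.tail = FALSE)
}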
#' @title Univariate glm regression
#' @description Runs a univariate logistic regression on each predictor variable of interest.
#' @param df data.frame with variables for analysis
#' @param yv character string, target variable
#' @param xv character vector, predictor variables to test univariately
#' @param file character string, filepath to write results out to. txt or csv file.
#' @param sortby character string, criteria to sort variables by. Default = 'aic'
#' @param xvlab data.frame, lookup table for variable names. First column is the pretty printed name, second column is the codename.
#' @param test logical. If \code{TRUE}, includes a Chi square test for each predictor.
#' @return data.frame of results
#' @seealso \code{\link{bestglm}}, \code{\link{leaps}}
#' @import Hmisc
#' @export
#' @examples
#'
#' require('Hmisc')
#'
#' ## setting up some data
#' longnames <- data.frame(long = c('Weight', 'Miles Per Gallon', 'Cylinder', 'D.R.A.T', 'Horsepower', 'A.M.'),
#'                         short = c('wt', 'mpg', 'cyl', 'drat', 'hp', 'am'), stringsAsFactors=F)
#'
#' glm.out <- uniglm(df=mtcars, yv='vs', xv=c('hp','drat','cyl','mpg','wt'), xvlab=longnames)

uniglm <- function(df, yv, xv, file=NULL, sortby='aic', xvlab=NULL, test=T){
  # preallocate generously: up to 25 coefficient rows per predictor, to accommodate factors
  mat <- data.frame(matrix(nrow=25*length(xv), ncol=12, dimnames=list(NULL, c('Predictor', 'Name', 'fac', 'ref', 'coef', 'oddsRatio', 'p', 'pstar', 'aic', 'c', 'Chisq_pvalue', 'Chisq_pstar'))))
  i <- 0
  for(x in xv){
    formt <- makeForm(yv, x)
    reg <- glm(formt, data=df, family=binomial(logit))
    for(j in 2:length(reg$coefficients)){
      i <- i+1
      try({
        mat$Predictor[i] <- names(reg$coefficients[j])

        ## if we have nice clean labels to replace variable names in code
        if(is.null(xvlab)==F) {mat$Name[i] <- as.character(xvlab[match(x, xvlab[,2]), 1])
        } else {mat <- mat[, setdiff(names(mat), 'Name')]}

        ## if we have a categorical variable with a reference level
        if(length(reg$coef)>2) {
          regv <- gsub(x, '', names(reg$coef))
          uxv <- unique(df[,x])
          mat$ref[i] <- setdiff(uxv, regv)
          mat$Predictor[i] <- x
          mat$fac[i] <- gsub(x, '', names(reg$coef[j]))
        }

        if(test==T){
          av <- anova(reg, test='Chisq')['Pr(>Chi)'][[1]][2]
          mat$Chisq_pvalue[i] <- av
          avp <- cut(mat$Chisq_pvalue[i], breaks=c(0, .001, .01, .05, .1, 1))
          mat$Chisq_pstar[i] <- as.character(factor(x=avp, levels=levels(avp), labels=c('***', '**', '*', '.', '')))
        } else {mat <- mat[, setdiff(names(mat), c('Chisq_pvalue', 'Chisq_pstar'))]}

        mat$coef[i] <- as.numeric(reg$coefficients[j])
        mat$oddsRatio[i] <- as.numeric(exp(reg$coefficients[j]))
        mat$p[i] <- as.numeric(summary(reg)$coefficients[j, 4])
        p <- cut(mat$p[i], breaks=c(0, .001, .01, .05, .1, 1))
        mat$pstar[i] <- as.character(factor(x=p, levels=levels(p), labels=c('***', '**', '*', '.', '')))
        mat$aic[i] <- reg$aic
        mat$c[i] <- round(somers2(x=reg$fitted.values, y=reg$y)['C'], 3)
      })
    }
  }
  # trim unused preallocated rows and drop columns that stayed empty
  mat <- mat[1:i,]
  mat$ref[is.na(mat$ref)] <- ''
  mat$fac[is.na(mat$fac)] <- ''
  if(sum(mat$ref=='')==nrow(mat)) mat <- mat[, setdiff(names(mat), 'ref')]
  if(sum(mat$fac=='')==nrow(mat)) mat <- mat[, setdiff(names(mat), 'fac')]
  decreasingTF <- ifelse(sortby %in% c('c'), T, F)
  mat <- mat[order(mat[,sortby], decreasing=decreasingTF),]
  if(is.null(file)==F) write.csv(mat, file=file)
  return(mat)
}
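
## Illustrative sketch (vs ~ hp is an arbitrary mtcars model; assumes Hmisc is
## installed; wrapped in if (FALSE) so it never runs at package load). The 'c'
## column reported by uniglm is the concordance statistic from Hmisc::somers2,
## computed from each univariate model's fitted values.
if (FALSE) {
  require('Hmisc')
  reg <- glm(vs ~ hp, data = mtcars, family = binomial(logit))
  somers2(x = reg$fitted.values, y = reg$y)['C']
}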
#' @title Analyze predictions of supervised model in quantiles
#' @description Analyzes results of a supervised model using test data and quantiles. Note that only one of
#' \code{model} or \code{testPred} is necessary, not both; \code{testPred} is advised.
#' @param model the model to get predictions from. Uses the \code{predict} method. If \code{testPred}
#' is specified, this parameter is not needed. It is probably safer to just use \code{testPred}, unless the model you have
#' has a predict method that will work as intended without any other arguments.
#' @param xtest data.frame of test data (predictor variables only)
#' @param ytest vector of test data (target variable)
#' @param n number of quantiles
#' @param roundText decimals to print
#' @param testPred vector of predictions from test data. Do not use if using \code{model} parameter.
#' @param fw numeric vector of bin breaks passed to \code{cut}. Defaults to \code{NULL}, in which case \code{n} equal-count quantile bins are used.
#' @return data.frame of results
#' @seealso \code{\link{predSortPlot}}
#' @export
#' @examples
#' require('randomForest')
#' rf <- randomForest(x=mtcars[1:25,1:8], y=as.factor(mtcars[1:25, 'am']), ntree=5)
#' mtcarsTestPred <- predict(rf, mtcars[26:32, 1:8], type='prob')[,2]
#' pq2 <- predQuantile(xtest=mtcars[26:32, 1:8], ytest=mtcars[26:32, 'am'], n=3, roundText=4, testPred=mtcarsTestPred)
#' pq <- predQuantile(xtest=mtcars[26:32, 1:8], ytest=mtcars[26:32, 'am'], fw=seq(.1,1,length.out=10), roundText=4, testPred=mtcarsTestPred)
#'
#' barplot(pq$hits, names=pq$predRange, ylab='True Positives', las=2, cex.names=.7)
#' barplot(pq$hitRate, names=pq$predRange, ylab='True Positive Hit Rate', las=2, cex.names=.7)
#' barplot(pq$predMax-pq$predMin, pq$hitRate, names=pq$predRange, ylab='prediction', las=2, cex.names=.7)

predQuantile <- function(model=NULL, xtest=NULL, ytest, n=5, roundText=3, testPred=NULL, fw=NULL){
  if(is.null(testPred)) {pred <- predict(model, xtest)
  } else {pred <- testPred}

  # bin predictions: n equal-count bins by default, or user-supplied breaks fw
  tf <- data.frame(pred=pred, hits=ytest)
  tf <- tf[order(tf$pred),]
  if(is.null(fw)) {tf$cat <- sort(rep(1:n, ceiling(nrow(tf)/n))[1:nrow(tf)])
  } else {tf$cat <- cut(tf$pred, breaks=fw)}

  aggSum <- aggregate(hits~cat, data=tf, sum)
  aggLength <- aggregate(hits~cat, data=tf, length)
  aggRange <- aggregate(pred~cat, data=tf, range)
  aggRange <- cbind(aggRange[,1], data.frame(aggRange[,2]))
  aggRangeText <- aggregate(pred~cat, data=tf, function(x) paste(round(range(x), roundText), collapse=' to '))

  names(aggLength)[2] <- 'N'
  names(aggRange) <- c('cat', 'predMin', 'predMax')
  names(aggRangeText)[2] <- 'predRange'

  agg <- merge(aggSum, aggLength, by='cat')
  agg$hitRate <- agg$hits/agg$N
  agg <- merge(agg, aggRange, by='cat')
  agg <- merge(agg, aggRangeText, by='cat')

  # cumulative stats, accumulated from the highest-scoring bin downward
  guessrate <- sum(ytest==1)/length(ytest)
  agg$cumHitsPct <- rev(cumsum(rev(agg$hits)/sum(agg$hits)))
  agg$cumHitRate <- rev(cumsum(rev(agg$hits))/cumsum(rev(agg$N)))
  agg$cumNPct <- rev(cumsum(rev(agg$N)/sum(agg$N)))
  agg$cumLift <- agg$cumHitRate/guessrate

  ## flag user-supplied cut ranges that contain no scores
  if(!is.null(fw)) {
    a <- paste(levels(tf$cat)[!levels(tf$cat) %in% tf$cat], collapse=', ')
    print(paste('no scores in these ranges:', a))
    agg$cutoff <- fw[which(levels(tf$cat) %in% tf$cat)]
  }
  agg <- agg[nrow(agg):1,]
  return(agg)
}
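
## Illustrative sketch (reuses the randomForest setup from the roxygen example
## above; wrapped in if (FALSE) so it never runs at package load). The output
## is ordered from the highest-scoring bin downward, so the first row's
## cumLift reads as "the top bin converts at cumLift times the base rate".
if (FALSE) {
  require('randomForest')
  rf <- randomForest(x=mtcars[1:25, 1:8], y=as.factor(mtcars[1:25, 'am']), ntree=5)
  mtcarsTestPred <- predict(rf, mtcars[26:32, 1:8], type='prob')[, 2]
  pq <- predQuantile(xtest=mtcars[26:32, 1:8], ytest=mtcars[26:32, 'am'], n=3, testPred=mtcarsTestPred)
  pq[1, c('predRange', 'hitRate', 'cumLift')]
}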
#' @title Devil's Horn
#' @description Visualize results of supervised model predictions on test data. Currently supports a binary target variable.
#' Red dots represent observations where the target variable = 1, black dots where the target variable = 0.
#' @param pred vector of predictions for target variable on test data
#' @param ytest vector of the target variable from test data (0s and 1s)
#' @param jitterPlot numeric, amount of jitter passed to \code{\link{jitter}}. Default \code{NULL} plots the predictions without jitter.
#' @return plot
#' @seealso \code{\link{predQuantile}}
#' @export
#' @examples
#' ## Setting up some data and building a basic model on training data.
#' mylogit <- glm(vs~drat+hp+mpg, family=binomial('logit'), data=mtcars[1:25,])
#' mtcarsTestPred <- predict(mylogit, mtcars[26:32, ], type='response')
#' predSortPlot(pred=mtcarsTestPred, ytest=mtcars$vs[26:32])

predSortPlot <- function(pred, ytest, jitterPlot=NULL) {
  tf <- data.frame(pred=pred, actual=ytest)
  tf <- tf[order(tf$pred),]
  if(is.null(jitterPlot)==T) plot(tf$pred, col=ifelse(tf$actual==1, 'red', 'black'), cex=ifelse(tf$actual==1, 3, 2), ylab='Prediction')
  if(is.null(jitterPlot)==F) plot(jitter(tf$pred, jitterPlot), col=ifelse(tf$actual==1, 'red', 'black'), pch=ifelse(tf$actual==1, 19, 21), ylab='Prediction')
}


#' @title Optimize weight on ensemble of 2 supervised models
#' @description This function creates a weighted average of predictions from two models
#' and evaluates the c-statistic (AUC), precision, recall, F1 and F0.5 at each candidate
#' weighting to determine the best weight on each model.
#' @param pred1 numeric vector of probabilities, prediction from model 1
#' @param pred2 numeric vector of probabilities, prediction from model 2
#' @param actual vector of 1s and 0s, the target variable test data
#' @param steps number, higher numbers compute a more exhaustive grid of model weights
#' @param cutoff cutoff used to demarcate predictions into positive or negative class
#' @return data.frame of results
#' @import Hmisc
#' @export
#' @examples
#' require('Hmisc')
#' fit_glm1 <- glm(am~cyl, data=mtcars, family=binomial(logit))
#' fit_glm2 <- glm(am~disp, data=mtcars, family=binomial(logit))
#' ow <- optimizeModelWeight(fit_glm1$fitted.values, fit_glm2$fitted.values, actual=fit_glm1$model$am)
#' plot(ow$weights, ow$precision, type='l', xlab='weight on model 1')


optimizeModelWeight <- function(pred1, pred2, actual, steps=50, cutoff=.5) {
  s <- seq(0, 1, length.out=steps)
  df <- data.frame(weights=s, c=NA, precision=NA, recall=NA, f1=NA, f.5=NA, stringsAsFactors=F)
  j <- 1
  for(i in s){
    # weighted average of the two models' predictions: weight i on model 1, (1-i) on model 2
    mds <- i*pred1 + (1-i)*pred2

    df$c[j] <- somers2(mds, actual)['C']
    df$precision[j] <- sum(actual[mds>cutoff])/(length(actual[mds>cutoff]))
    df$recall[j] <- sum(actual[mds>cutoff])/(sum(actual))
    df$f1[j] <- (2*df$precision[j]*df$recall[j])/(df$precision[j]+df$recall[j])
    df$f.5[j] <- ((1+.5^2)*df$precision[j]*df$recall[j])/(((.5^2)*df$precision[j])+df$recall[j])
    j <- j+1
  }
  return(df)
}

--------------------------------------------------------------------------------