├── .gitignore
├── DESCRIPTION
├── LICENSE
├── NAMESPACE
├── R
    ├── average-comparisons.R
    ├── get-input-vars-from-model.R
    ├── multiple-comparisons.R
    └── pairs.R
├── README.md
├── _site
    ├── DESCRIPTION
    ├── LICENSE
    ├── NAMESPACE
    ├── R
    │   ├── average-comparisons.R
    │   ├── get-input-vars-from-model.R
    │   ├── multiple-comparisons.R
    │   ├── pairs.R
    │   └── transition-plots.R
    ├── README.md
    ├── man
    │   ├── ArrowPlot.Rd
    │   ├── AverageSpecifiedComparison.Rd
    │   ├── ComputeAPCFromPairs.Rd
    │   ├── GetAPC.Rd
    │   ├── GetAPCWithAbsolute.Rd
    │   ├── GetApcDF.Rd
    │   ├── GetComparisonDFFromPairs.Rd
    │   ├── GetPairs.Rd
    │   ├── GetPredCompsDF.Rd
    │   ├── GetPredCompsDFFromPairs.Rd
    │   ├── GetSingleInputPredComps.Rd
    │   ├── PlotApcDF.Rd
    │   ├── PlotApcDF2.Rd
    │   ├── PlotPairCumulativeWeights.Rd
    │   ├── PlotPredCompsDF.Rd
    │   ├── TransitionPlot.Rd
    │   ├── average_specified_comparison.Rd
    │   ├── compute_apc_from_pairs.Rd
    │   ├── get_apc.Rd
    │   ├── get_apc_with_absolute.Rd
    │   ├── get_pairs.Rd
    │   ├── mahal.Rd
    │   └── resample_from_pairs.Rd
    ├── notes
    │   ├── apc.Rmd
    │   ├── examples
    │   │   ├── diamonds.Rmd
    │   │   ├── loan-defaults.Rmd
    │   │   ├── logistic-regression.Rmd
    │   │   ├── overview.Rmd
    │   │   ├── simulated-linear-model-interactions.Rmd
    │   │   └── wine-logistic-regression.Rmd
    │   ├── impact.Rmd
    │   ├── index.Rmd
    │   ├── more
    │   │   ├── compared-with-paper.Rmd
    │   │   ├── future-work.Rmd
    │   │   ├── large-N-limit.Rmd
    │   │   └── renormalize-weights.Rmd
    │   ├── template
    │   └── transition-plots.Rmd
    ├── temp
    │   └── weights-exploration.R
    └── tests
    │   ├── run-all.R
    │   ├── test-apc.R
    │   └── test-pairs.R
├── man
    ├── ComputeAPCFromPairs.Rd
    ├── GetComparisonDFFromPairs.Rd
    ├── GetPairs.Rd
    ├── GetPredCompsDF.Rd
    ├── GetSingleInputApcs.Rd
    ├── GetSingleInputPredComps.Rd
    ├── PlotPairCumulativeWeights.Rd
    └── PlotPredCompsDF.Rd
├── notes
    ├── apc.Rmd
    ├── examples
    │   ├── diamonds.Rmd
    │   ├── loan-defaults.Rmd
    │   ├── logistic-regression.Rmd
    │   ├── overview.Rmd
    │   ├── simulated-linear-model-interactions.Rmd
    │   └── wine-logistic-regression.Rmd
    ├── impact.Rmd
    ├── index.Rmd
    ├── more
    │   ├── compared-with-paper.Rmd
    │   ├── future-work.Rmd
    │   ├── large-N-limit.Rmd
    │   └── pairs-and-weights.Rmd
    ├── presentations
    │   ├── AlpineLunch.Rmd
    │   ├── Bar.Rmd
    │   └── Lunch.Rmd
    └── template
└── tests
    ├── test-apc.R
    └── test-pairs.R


/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .Rhistory
3 | main.R
4 | *figure/
5 | ..Rcheck
6 | *.RData


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: predcomps
 2 | Title: Average Predictive Comparisons
 3 | Description: An implementation of Gelman/Pardoe's average predictive comparisons
 4 | Version: 0.1
 5 | Author: David Chudzicki
 6 | Maintainer: David Chudzicki <dchudz@gmail.com>
 7 | Depends:
 8 |     R (>= 3.0.2),
 9 |     ggplot2,
10 |     gridExtra,
11 |     dplyr,
12 |     reshape2,
13 |     assertthat
14 | License: MIT
15 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 David J Chudzicki
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | S3method(GetComparisonDFFromPairs,"function")
 2 | S3method(GetComparisonDFFromPairs,glm)
 3 | S3method(GetComparisonDFFromPairs,lm)
 4 | S3method(GetComparisonDFFromPairs,randomForest)
 5 | export(ComputeApcFromPairs)
 6 | export(GetComparisonDF)
 7 | export(GetComparisonDFFromPairs)
 8 | export(GetPairs)
 9 | export(GetPredCompsDF)
10 | export(GetSingleInputApcs)
11 | export(PlotPairCumulativeWeights)
12 | export(PlotPredCompsDF)
13 | 


--------------------------------------------------------------------------------
/R/average-comparisons.R:
--------------------------------------------------------------------------------
  1 | #' GetSingleInputApcs
  2 | #' 
  3 | #' makes predictive comparison summaries (both per unit input and impact, both absolute and signed) by forming an data frame of pairs with appropriate weights and then calling `ComputeApcFromPairs`. 
  4 | #' Only works fore continuous inputs right now
  5 | #' 
  6 | #' @param predictionFunction this could be a function (which takes data frame and makes returns a vector of predictions) or an object of class `lm`, `glm`, or `randomForest`
  7 | #' @param X a data frame with all inputs
  8 | #' @param u a string naming the input of interest
  9 | #' @param v a string naming the other inputs
 10 | #' @param ... other arguments to be passed to `GetPairs`
 11 | #' @return a list with: \code{signed} (the usual Apc) and \code{absolute} (Apc applied to the absolute value of the differences)
 12 | #' 
 13 | #' @export
 14 | #' 
 15 | #' @examples
 16 | #' n <- 200
 17 | #' x1 <- runif(n = n, min = 0, max = 1)
 18 | #' x2 <- runif(n = n, min = 0, max = 1)
 19 | #' x3 <- runif(n = n, min = 0, max = 10)
 20 | #' y <- 2 * x1 + (-2) * x2 + 1 * x3 + rnorm(n, sd = 0.1)
 21 | #' df <- data.frame(x1, x2, x3, y)
 22 | #' fittedLm <- lm(y ~ ., data = df)
 23 | #' fittedLm
 24 | #' GetSingleInputApcs(fittedLm, df, "x2", c("x1", "x3"))
 25 | GetSingleInputApcs <- function(predictionFunction, X, u, v, ...) {
 26 |   pairs <- GetPairs(X, u, v, ...)
 27 |   return(
 28 |     list(PerUnitInput.Signed = ComputeApcFromPairs(predictionFunction, pairs, u, v),
 29 |          PerUnitInput.Absolute = ComputeApcFromPairs(predictionFunction, pairs, u, v, absolute=TRUE),
 30 |          Impact.Signed = ComputeApcFromPairs(predictionFunction, pairs, u, v, impact=TRUE),
 31 |          Impact.Absolute = ComputeApcFromPairs(predictionFunction, pairs, u, v, absolute=TRUE, impact=TRUE))
 32 |   )
 33 | }
 34 | 
 35 | 
 36 | #' ComputeApcFromPairs
 37 | #' 
 38 | #' @export
 39 | ComputeApcFromPairs <- function(predictionFunction, pairs, u, v, absolute=FALSE, impact=FALSE) {
 40 |   uNew <- paste(u,".B",sep="")
 41 |   ComparisonDF <- GetComparisonDFFromPairs(predictionFunction, pairs, u, v)
 42 |   absoluteOrIdentity <- if (absolute) abs else identity
 43 |   uDiff <- ComparisonDF[[uNew]] - ComparisonDF[[u]]
 44 |   denom <- if (impact) sum(ComparisonDF$Weight) else sum(ComparisonDF$Weight * uDiff * sign(uDiff))
 45 |   Apc <- sum(absoluteOrIdentity(ComparisonDF$Weight * (ComparisonDF$yHat2 - ComparisonDF$yHat1) * sign(uDiff))) / denom
 46 |   return(Apc)
 47 | }
 48 | 
 49 | 
 50 | #' GetComparisonDFFromPairs
 51 | #' 
 52 | #' (abstracted this into a separate function from \code{GetApc} so we can more easily do things 
 53 | #' like \code{GetApcWithAbsolute})
 54 | #' @export
 55 | GetComparisonDFFromPairs <- function(predictionFunction, pairs, u, v) UseMethod("GetComparisonDFFromPairs")
 56 | 
 57 | 
 58 | 
 59 | # Two methods:
 60 | #  one for predictionFunction (df |--> predictions)
 61 | #  another for a glm object
 62 | 
 63 | #' @export
 64 | GetComparisonDFFromPairs.function <- function(predictionFunction, pairs, u, v) {
 65 |   uNew <- paste(u,".B",sep="")
 66 |   pairs$yHat1 <- predictionFunction(pairs)
 67 |   pairsNew <- structure(pairs[,c(v,uNew)], names=c(v,u)) #renaming u in pairsNew so we can call predictionFunction
 68 |   pairs$yHat2 <- predictionFunction(pairsNew)  
 69 |   return(pairs)
 70 | }
 71 | 
 72 | #' @export
 73 | GetComparisonDFFromPairs.glm <- function(glmFit, pairs, u, v) {
 74 |   predictionFunction <- function(df) predict.glm(glmFit, newdata=df, type="response")
 75 |   return(
 76 |     GetComparisonDFFromPairs.function(predictionFunction, pairs, u, v)
 77 |   ) 
 78 | }
 79 | 
 80 | #' @export
 81 | GetComparisonDFFromPairs.lm <- function(lmFit, pairs, u, v) {
 82 |   predictionFunction <- function(df) predict.glm(lmFit, newdata=df)
 83 |   return(
 84 |     GetComparisonDFFromPairs.function(predictionFunction, pairs, u, v)
 85 |   ) 
 86 | }
 87 | 
 88 | #' @export
 89 | GetComparisonDFFromPairs.randomForest <- function(rfFit, pairs, u, v) {
 90 |   # For classification, we need to specify that the predictions should be probabilties (not classes)
 91 |   if (rfFit$type == "classification") { 
 92 |     if (length(rfFit$classes) > 2) {
 93 |       stop("Sorry, I don't know what to do when there are more than 2 classes.")
 94 |     }
 95 |     predictionFunction <- function(df) predict(rfFit, newdata=df, type="prob")[,2]
 96 |   } else
 97 |   {
 98 |     predictionFunction <- function(df) predict(rfFit, newdata=df)
 99 |   }
100 |   return(
101 |     GetComparisonDFFromPairs.function(predictionFunction, pairs, u, v)
102 |   ) 
103 | }
104 | 
105 | #' @export
106 | GetComparisonDFFromPairs.gbm <- function(rfFit, pairs, u, v) {
107 |   # For classification, we need to specify that the predictions should be probabilties (not classes)
108 |   if (rfFit$type == "classification") { 
109 |     if (length(rfFit$classes) > 2) {
110 |       stop("Sorry, I don't know what to do when there are more than 2 classes.")
111 |     }
112 |     predictionFunction <- function(df) predict(rfFit, newdata=df, type="prob")[,2]
113 |   } else
114 |   {
115 |     predictionFunction <- function(df) predict(rfFit, newdata=df)
116 |   }
117 |   return(
118 |     GetComparisonDFFromPairs.function(predictionFunction, pairs, u, v)
119 |   ) 
120 | }
121 | 
122 | 
123 | #' @export
124 | GetComparisonDF <- function(predictionFunction, df, u, v=NULL, ...) {
125 |   if (is.null(v)) {
126 |     inputNames <- GetInputVarsFromModel(predictionFunction)
127 |     v <- setdiff(inputNames, u)
128 |   }
129 |   pairs <- GetPairs(df, u, v, ...)
130 |   ComparisonDF <- GetComparisonDFFromPairs(predictionFunction, pairs, u, v)
131 | }
132 | 


--------------------------------------------------------------------------------
/R/get-input-vars-from-model.R:
--------------------------------------------------------------------------------
 1 | GetInputVarsFromModel <- function(model) UseMethod("GetInputVarsFromModel")
 2 | 
 3 | GetInputVarsFromModel.lm <- function(model) {
 4 |   # This seems like a robust way to get the input names from an lm object, but I'm not really sure
 5 |   return(attr(model$terms, "term.labels"))
 6 | }
 7 | 
 8 | GetInputVarsFromModel.glm <- function(model) {
 9 |   # This seems like a robust way to get the input names from an glm object, but I'm not really sure
10 |   return(attr(model$terms, "term.labels"))
11 | }
12 | 
13 | GetInputVarsFromModel.randomForest <- function(model) {
14 |   # This seems like a robust way to get the input names from an glm object, but I'm not really sure
15 |   return(attr(model$terms, "term.labels"))
16 | }
17 | 


--------------------------------------------------------------------------------
/R/multiple-comparisons.R:
--------------------------------------------------------------------------------
 1 | #' GetApcDF
 2 | #' 
 3 | #' makes average predictive comparison for all specified inputs
 4 | #' 
 5 | #' @param model Either a function (from a data frame to vector of predictions) or a model we know how to deal with (lm, glm)
 6 | #' @param df data frame with data
 7 | #' @param inputVars inputs to the model
 8 | #' @param ... extra parguments passed to GetPairs used to control Weight function
 9 | #' @export
10 | #' @examples
11 | #' n <- 200
12 | #' x1 <- runif(n = n, min = 0, max = 1)
13 | #' x2 <- runif(n = n, min = 0, max = 1)
14 | #' x3 <- runif(n = n, min = 0, max = 10)
15 | #' y <- 2 * x1 + (-2) * x2 + 1 * x3 + rnorm(n, sd = 0.1)
16 | #' df <- data.frame(x1, x2, x3, y)
17 | #' fittedLm <- lm(y ~ ., data = df)
18 | #' apcDF <- GetPredCompsDF(fittedLm, df = df)
19 | #' apcDF
20 | GetPredCompsDF <- function(model, df, inputVars = NULL, ...) {
21 |   
22 |   # If inputVars is null, we can try to get the list of inputs from the model:
23 |   if (is.null(inputVars)) {
24 |     inputVars <- GetInputVarsFromModel(model)
25 |   }
26 |   apcList <-  Map(function(currentVar) {
27 |     cat(paste("Working on:", currentVar, "\n"))
28 |     data.frame(Input = currentVar,
29 |                GetSingleInputApcs(model, 
30 |                                        df, 
31 |                                        currentVar, 
32 |                                        c(setdiff(inputVars, currentVar)),
33 |                                        ...))},
34 |     inputVars
35 |   )
36 |   apcDF <- do.call(rbind, apcList)
37 |   return(apcDF)
38 | }
39 | 
40 | #' PlotApcDF
41 | #' 
42 | #' plots the output of GetApcDF -- this is my preferred display for now
43 | #' 
44 | #' @param apcDF the output of GetApcDF
45 | #' @export
46 | #' @examples
47 | #' n <- 200
48 | #' x1 <- runif(n = n, min = 0, max = 1)
49 | #' x2 <- runif(n = n, min = 0, max = 1)
50 | #' x3 <- runif(n = n, min = 0, max = 10)
51 | #' y <- 2 * x1 + (-2) * x2 + 1 * x3 + rnorm(n, sd = 0.1)
52 | #' df <- data.frame(x1, x2, x3, y)
53 | #' fittedLm <- lm(y ~ ., data = df)
54 | #' apcDF <- GetPredCompsDF(fittedLm, df = df)
55 | #' PlotPredCompsDF(apcDF, variant = "PerUnitInput") + theme_gray(base_size = 18)
56 | PlotPredCompsDF <- function(apcDF, variant="Impact") {
57 |   xLabel <- switch(variant, 
58 |                    "Impact"="Avg Change in Output",
59 |                    "PerUnitInput"="Avg Change in Output Per Unit Input",
60 |                    stop("Unknown Predcomps Variant"))
61 |   
62 |   apcDF <- apcDF[c("Input", grep(paste0("^",variant,"\\."), names(apcDF), value=TRUE))]
63 |   names(apcDF) <- gsub(paste0("^",variant,"\\."), "", names(apcDF))
64 |   
65 |   maxAPC <- max(abs(apcDF$Absolute))  
66 |   apcDF$Input <- reorder(apcDF$Input, apcDF$Absolute)
67 |   longAPCs <- melt(apcDF, id="Input", value.name = "Value", variable.name = "Type")
68 |   longAPCs2 <- rbind(
69 |     longAPCs,
70 |     transform(subset(longAPCs, Type=="Absolute"), Value=-Value)
71 |   )
72 |   longAPCs2 <- longAPCs2[order(factor(longAPCs2$Type, levels=c("Absolute", "Signed"))), ]  
73 |   
74 |   return(
75 |     ggplot(longAPCs2) +
76 |       geom_point(aes(y = Input, x=Value, color=Type, shape=Type, size=Type)) +
77 |       scale_x_continuous(limits=c(-maxAPC, maxAPC)) +
78 |       scale_size_discrete(range=c(3,4)) +
79 |       ggtitle(variant) +
80 |       geom_vline(aes(xintercept=0), alpha=.5) +
81 |       xlab(xLabel)
82 |   )
83 | }
84 | 


--------------------------------------------------------------------------------
/R/pairs.R:
--------------------------------------------------------------------------------
  1 | #' GetPairs
  2 | #'
  3 | #' Form all pairs of rows in \code{X} and compute Mahalanobis distances based on \code{v}. 
  4 | #' 
  5 | #' To help with computational constraints, you have the option to not form pairs between all rows of \code{X} but instead of specify a certain number (\code{numForTransitionStart}) to randomly be selected as rows from which transitions start, and another number (\code{numForTransitionEnd}) to be randomly selected as where transitions end. We then form all pairs between transition-start rows and transition-end rows. 
  6 | #' 
  7 | #' In order to get a smaller data frame for later manipulations (and maybe just because it's a good idea), you can also specify \code{onlyIncludeNearestN}, in which case we return only the nearest \code{onlyIncludeNearestN} transition ends for each transition start (instead of all pairs).
  8 | #' 
  9 | #' @param X data frame
 10 | #' @param u input of interest
 11 | #' @param v other inputs
 12 | #' @param mahalanobisConstantTerm Weights are (1 / (mahalanobisConstantTerm + Mahalanobis distance))
 13 | #' @param numForTransitionStart number of rows to use as the start points of transitions (defaulting to `NULL`, we use all rows)
 14 | #' @param numForTransitionEnd number of rows to use as potential end points of transitions (defaulting to `NULL`, we use all rows)
 15 | #' @param onlyIncludeNearestN for each transition start, we only include as transition end points the nearest `onlyIncludeNearestN` rows  (defaulting to `NULL`, we use all rows)
 16 | #' @return a data frame with the inputs \code{v} from the first of each pair, \code{u} from each half (with ".B" appended to the second), and the Mahalanobis distances between the pairs.
 17 | #' @export
 18 | #' @examples
 19 | #' v <- rnorm(100)
 20 | #' u <- v + 0.3*rnorm(100)
 21 | #' qplot(v,u)
 22 | #' X = data.frame(v=v,u=u)
 23 | #' pairsDF <- GetPairs(X, "v", "u")
 24 | #' pairsDFRow1 <- subset(pairsDF, OriginalRowNumber==1)
 25 | #' # When we subset to one "original row number", all of the v's are the same:
 26 | #' print(pairsDFRow1$v)
 27 | #' # ... and u's corresponding to closer v.B (the v in the second element of the pair) have higher weight:
 28 | #' qplot(u.B, Weight, data=pairsDFRow1)
 29 | GetPairs <- function(X, u, v,
 30 |                      numForTransitionStart = NULL,
 31 |                      numForTransitionEnd = NULL,
 32 |                      onlyIncludeNearestN = NULL,
 33 |                      mahalanobisConstantTerm=1) {
 34 |   
 35 |   assert_that(length(u) == 1) # make sure we have exactly 1 input var of interest
 36 |   for (columnName in c(u,v)) {
 37 |     assert_that(columnName %in% names(X))
 38 |     columnClass <- class(X[[columnName]])
 39 |     if (!(columnClass) %in% c("integer", "numeric")) {
 40 |       stop(sprintf("Sorry, column %s is of class %s. I can only deal with integer and numeric types for now.", columnName, columnClass))
 41 |     }
 42 |   }
 43 |   
 44 |   if (!is.null(numForTransitionStart)) {
 45 |     X1 <- X[sample.int(nrow(X), size=numForTransitionStart), c(v,u)] 
 46 |   } else {
 47 |     X1 <- X[c(v,u)]
 48 |   }
 49 |   
 50 |   if (!is.null(numForTransitionEnd)) {
 51 |     X2 <- X[sample.int(nrow(X), size=numForTransitionEnd), c(v,u)]
 52 |   } else {
 53 |     X2 <- X[c(v,u)]
 54 |   }
 55 |   
 56 |   X1$OriginalRowNumber <- 1:nrow(X1)
 57 |   X2$OriginalRowNumber.B <- 1:nrow(X2)
 58 |   
 59 |   vMatrix1 <- as.matrix(X1[,v])
 60 |   vMatrix2 <- as.matrix(X2[,v])
 61 |   
 62 |   
 63 |   covV=cov(vMatrix2)
 64 |   
 65 |   distMatrix <- apply(vMatrix1, 1, function(row) mahalanobis(vMatrix2, row, covV))
 66 |   dim(distMatrix)
 67 |   
 68 |   colnames(distMatrix) <- 1:ncol(distMatrix)
 69 |   rownames(distMatrix) <- 1:nrow(distMatrix)
 70 |   distDF <- as.data.frame(as.table(distMatrix))
 71 |   names(distDF) <- c("OriginalRowNumber.B", "OriginalRowNumber", "MahalanobisDistance")
 72 |   
 73 |   if (!is.null(onlyIncludeNearestN)) {
 74 |     distDF <- distDF %>% 
 75 |       group_by(OriginalRowNumber) %>% 
 76 |       filter(rank(MahalanobisDistance, ties.method="random") < onlyIncludeNearestN)
 77 |   }
 78 | 
 79 |   pairs <- merge(X1, distDF, by = "OriginalRowNumber")
 80 |   pairs <- merge(X2, pairs, by = "OriginalRowNumber.B", suffixes = c(".B", ""))
 81 |   pairs$Weight <- 1/(mahalanobisConstantTerm + pairs$MahalanobisDistance)
 82 |   
 83 |   # If we haven't sampled, then OriginalRowNumber == OriginalRowNumber.B means that 
 84 |   # the transition start and end are the same, so we should remove those rows.
 85 |   if (is.null(numForTransitionStart) && is.null(numForTransitionEnd)) {
 86 |     pairs <- subset(pairs, OriginalRowNumber != OriginalRowNumber.B)
 87 |   }
 88 |   
 89 |   # Renormalize weights:
 90 |   pairs <- pairs %>% group_by(OriginalRowNumber) %>% mutate(Weight = Weight/sum(Weight))
 91 |   
 92 |   return(data.frame(pairs))
 93 | }
 94 | 
 95 | 
 96 | #' PlotPairCumulativeWeights
 97 | #'
 98 | #' For a sample of transition start rows, we plot rank of transition end (by increasing weight) vs. cumulative weight. This gives a sense of how much weight is going into the nearest points vs. further ones.
 99 | #' 
100 | #' @export
101 | #' @examples
102 | #' v <- rnorm(100)
103 | #' u <- v + 0.3*rnorm(100)
104 | #' X = data.frame(v=v,u=u)
105 | #' pairsDF <- GetPairs(X, "v", "u")
106 | #' pairsDFRow1 <- subset(pairsDF, OriginalRowNumber==1)
107 | #' # For most original rows, we get 75% of the weight in 50% of the pairs:
108 | #' PlotPairCumulativeWeights(pairsDF)
109 | 
110 | PlotPairCumulativeWeights <- function(pairs, numOriginalRowNumbersToPlot = 20) {
111 |   rowNumSample <- sample(unique(pairs$OriginalRowNumber))[1:numOriginalRowNumbersToPlot]
112 |   pairsWithCumWeightSums <- pairs %>% 
113 |     group_by(OriginalRowNumber) %>% 
114 |     arrange(OriginalRowNumber, -Weight) %>% 
115 |     mutate(CumulativeWeight = cumsum(Weight), Rank = dense_rank(-Weight))
116 |   
117 |   pairsSubset <- subset(pairsWithCumWeightSums, OriginalRowNumber %in% rowNumSample)
118 |   
119 |   ggplot() + 
120 |     geom_line(aes(x=Rank, y=CumulativeWeight, color=factor(OriginalRowNumber)), data = pairsSubset, alpha = .2) + 
121 |     geom_line(aes(x=Rank, y=CumulativeWeight), stat = "summary", fun.y = "median", data=pairsWithCumWeightSums)
122 | }  
123 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [Documentation etc.](http://www.davidchudzicki.com/predcomps/)
 2 | 
 3 | Interested in getting involved? Here are some ways to help:
 4 | 
 5 | - Trying out the package in examples, write about the results. Does it help you better understand your complicated models? If not, what should be different? You can add examples to the documentation here via pull requests, or write in your own space.
 6 | - Clarifying the language in the documentation - what can be made clearer?
 7 | - Working on any of the "future work" below
 8 | 
 9 | 
10 | ## Future Work
11 | 
12 | (Unsure about the priority of the rest, but the top one is definitely most important.)
13 | 
14 | ### Explicit model for p(u|v)
15 | 
16 | Currently [the way we assign weights](http://0.0.0.0:4000/more-pairs-and-weights.html) to sample for p(u|v) (roughly as described in the paper) requires a bit of hand-tweaking to work well in individual examples. It's also hard to generalize to categorical inputs. This may be the biggest barrier to widespread adoption.
17 | 
18 | As an alternative, perhaps we can explicitly build a model for the desired conditional distribution, e.g. maybe by using something like [BART](https://github.com/kapelner/bartMachine).
19 | 
20 | Todo:
21 | 
22 | - implement
23 | - see how well it works
24 | 
25 | 
26 | ### Categorical inputs
27 | 
28 | Once we've done "Explicit model for p(u|v)", allowing categorical inputs should be much easier, but there's still some thought required.
29 | 
30 | ### Sensivity Analysis
31 | 
32 | Some of the examples show how to do sensitivity analysis in the spirit of this package, but it'd be great to have that do it for you.
33 | 
34 | ### "Variable Importance"
35 | 
36 | Implement something like [conditional variable importance](http://www.biomedcentral.com/1471-2105/9/307) in the spirit of this package. (Like "permutation importance", but instead of taking a permutation, you'd sample from the conditional distribution p(u|v).)
37 | 
38 | ### Other tools/methods for understanding complicated models
39 | 
40 | I'd like to compile a list of other work in this direction, maybe comparing them with this.
41 | 
42 | I should add a page discussing other methods people have used to get at somewhat the same idea.
43 | 
44 | - [conditional variable importance](http://www.biomedcentral.com/1471-2105/9/307) - out of everything I've seen, conditional variable importance is the most similar in spirit to this package 
45 | - randomForest package in R (partial plots, variable importance)
46 | - earth package in R (variable importance)
47 | 
48 | 
49 | 


--------------------------------------------------------------------------------
/_site/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: predcomps
 2 | Title: Average Predictive Comparisons
 3 | Description: An implementation of Gelman/Pardoe's average predictive comparisons
 4 | Version: 0.1
 5 | Author: David Chudzicki
 6 | Maintainer: David Chudzicki <dchudz@gmail.com>
 7 | VignetteBuilder: knitr
 8 | Depends:
 9 |     R (>= 3.0.2),
10 |     ggplot2,
11 |     gridExtra,
12 |     dplyr,
13 |     reshape2,
14 |     assertthat
15 | License: MIT
16 | 


--------------------------------------------------------------------------------
/_site/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 David J Chudzicki
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/_site/NAMESPACE:
--------------------------------------------------------------------------------
 1 | S3method(GetComparisonDFFromPairs,"function")
 2 | S3method(GetComparisonDFFromPairs,glm)
 3 | S3method(GetComparisonDFFromPairs,lm)
 4 | S3method(GetComparisonDFFromPairs,randomForest)
 5 | export(ComputeApcFromPairs)
 6 | export(GetComparisonDF)
 7 | export(GetComparisonDFFromPairs)
 8 | export(GetPairs)
 9 | export(GetPredCompsDF)
10 | export(GetSingleInputPredComps)
11 | export(PlotPairCumulativeWeights)
12 | export(PlotPredCompsDF)
13 | export(TransitionPlot)
14 | 


--------------------------------------------------------------------------------
/_site/R/average-comparisons.R:
--------------------------------------------------------------------------------
  1 | #' GetComparisonDF
  2 | #' 
  3 | #' makes average predictive comparison (based on Gelman/Pardoe) by forming pairs with two versions of the input of interest and averaging the predictive difference using Weights. I think Weights should be an approximation of the density p(u1,u2|v) or something like that... I need to look back at this. At present, I believe this is probably implementing the version in the Gelman/Pardoe paper.
  4 | #' returns a list with the Apc and the Apc applied to the absolute value of the prediction function
  5 | #' Only works fore continuous inputs right now
  6 | #' 
  7 | #' @param predictionFunction
  8 | #' @param X 
  9 | #' @param u input of interest
 10 | #' @param v other inputs
 11 | #' @param k Weights are (1 / (k + Mahalanobis distance))
 12 | #' @return a list with: \code{signed} (the usual Apc) and \code{absolute} (Apc applied to the absolute value of the differences)
 13 | #' @export
 14 | GetSingleInputApcs <- function(predictionFunction, X, u, v, ...) {
 15 |   pairs <- GetPairs(X, u, v, ...)
 16 |   return(
 17 |     list(PerUnitInput.Signed = ComputeApcFromPairs(predictionFunction, pairs, u, v),
 18 |          Apc.Absolute = ComputeApcFromPairs(predictionFunction, pairs, u, v, absolute=TRUE),
 19 |          Impact.Signed = ComputeApcFromPairs(predictionFunction, pairs, u, v, impact=TRUE),
 20 |          Impact.Absolute = ComputeApcFromPairs(predictionFunction, pairs, u, v, absolute=TRUE, impact=TRUE))
 21 |   )
 22 | }
 23 | 
 24 | 
 25 | #' ComputeApcFromPairs
 26 | #' 
 27 | #' @export
 28 | ComputeApcFromPairs <- function(predictionFunction, pairs, u, v, absolute=FALSE, impact=FALSE) {
 29 |   uNew <- paste(u,".B",sep="")
 30 |   ComparisonDF <- GetComparisonDFFromPairs(predictionFunction, pairs, u, v)
 31 |   absoluteOrIdentity <- if (absolute) abs else identity
 32 |   uDiff <- ComparisonDF[[uNew]] - ComparisonDF[[u]]
 33 |   denom <- if (impact) sum(ComparisonDF$Weight) else sum(ComparisonDF$Weight * uDiff * sign(uDiff))
 34 |   Apc <- sum(absoluteOrIdentity(ComparisonDF$Weight * (ComparisonDF$yHat2 - ComparisonDF$yHat1) * sign(uDiff))) / denom
 35 |   return(Apc)
 36 | }
 37 | 
 38 | 
 39 | #' GetComparisonDFFromPairs
 40 | #' 
 41 | #' (abstracted this into a separate function from \code{GetApc} so we can more easily do things 
 42 | #' like \code{GetApcWithAbsolute})
 43 | #' @export
 44 | GetComparisonDFFromPairs <- function(predictionFunction, pairs, u, v) UseMethod("GetComparisonDFFromPairs")
 45 | 
 46 | 
 47 | 
 48 | # Two methods:
 49 | #  one for predictionFunction (df |--> predictions)
 50 | #  another for a glm object
 51 | 
 52 | #' @export
 53 | GetComparisonDFFromPairs.function <- function(predictionFunction, pairs, u, v) {
 54 |   uNew <- paste(u,".B",sep="")
 55 |   pairs$yHat1 <- predictionFunction(pairs)
 56 |   pairsNew <- structure(pairs[,c(v,uNew)], names=c(v,u)) #renaming u in pairsNew so we can call predictionFunction
 57 |   pairs$yHat2 <- predictionFunction(pairsNew)  
 58 |   return(pairs)
 59 | }
 60 | 
 61 | #' @export
 62 | GetComparisonDFFromPairs.glm <- function(glmFit, pairs, u, v) {
 63 |   predictionFunction <- function(df) predict.glm(glmFit, newdata=df, type="response")
 64 |   return(
 65 |     GetComparisonDFFromPairs.function(predictionFunction, pairs, u, v)
 66 |   ) 
 67 | }
 68 | 
 69 | #' @export
 70 | GetComparisonDFFromPairs.lm <- function(lmFit, pairs, u, v) {
 71 |   predictionFunction <- function(df) predict.glm(lmFit, newdata=df)
 72 |   return(
 73 |     GetComparisonDFFromPairs.function(predictionFunction, pairs, u, v)
 74 |   ) 
 75 | }
 76 | 
 77 | #' @export
 78 | GetComparisonDFFromPairs.randomForest <- function(rfFit, pairs, u, v) {
 79 |   # For classification, we need to specify that the predictions should be probabilties (not classes)
 80 |   if (rfFit$type == "classification") { 
 81 |     if (length(rfFit$classes) > 2) {
 82 |       stop("Sorry, I don't know what to do when there are more than 2 classes.")
 83 |     }
 84 |     predictionFunction <- function(df) predict(rfFit, newdata=df, type="prob")[,2]
 85 |   } else
 86 |   {
 87 |     predictionFunction <- function(df) predict(rfFit, newdata=df)
 88 |   }
 89 |   return(
 90 |     GetComparisonDFFromPairs.function(predictionFunction, pairs, u, v)
 91 |   ) 
 92 | }
 93 | 
 94 | 
 95 | #' @export
 96 | GetComparisonDF <- function(predictionFunction, df, u, v=NULL, ...) {
 97 |   if (is.null(v)) {
 98 |     inputNames <- GetInputVarsFromModel(predictionFunction)
 99 |     v <- setdiff(inputNames, u)
100 |   }
101 |   pairs <- GetPairs(df, u, v, ...)
102 |   ComparisonDF <- GetComparisonDFFromPairs(predictionFunction, pairs, u, v)
103 | }
104 | 


--------------------------------------------------------------------------------
/_site/R/get-input-vars-from-model.R:
--------------------------------------------------------------------------------
 1 | GetInputVarsFromModel <- function(model) UseMethod("GetInputVarsFromModel")
 2 | 
 3 | GetInputVarsFromModel.lm <- function(model) {
 4 |   # This seems like a robust way to get the input names from an lm object, but I'm not really sure
 5 |   return(attr(model$terms, "term.labels"))
 6 | }
 7 | 
 8 | GetInputVarsFromModel.glm <- function(model) {
 9 |   # This seems like a robust way to get the input names from an glm object, but I'm not really sure
10 |   return(attr(model$terms, "term.labels"))
11 | }
12 | 
13 | GetInputVarsFromModel.randomForest <- function(model) {
14 |   # This seems like a robust way to get the input names from an glm object, but I'm not really sure
15 |   return(attr(model$terms, "term.labels"))
16 | }
17 | 


--------------------------------------------------------------------------------
/_site/R/multiple-comparisons.R:
--------------------------------------------------------------------------------
 1 | #' GetApcDF
 2 | #' 
 3 | #' makes average predictive comparison for all specified inputs
 4 | #' 
 5 | #' @param model Either a function (from a data frame to vector of predictions) or a model we know how to deal with (lm, glm)
 6 | #' @param df data frame with data
 7 | #' @param inputVars inputs to the model
 8 | #' @param ... extra parguments passed to GetPairs used to control Weight function
 9 | #' @export
10 | GetPredCompsDF <- function(model, df, inputVars = NULL, ...) {
11 |   
12 |   # If inputVars is null, we can try to get the list of inputs from the model:
13 |   if (is.null(inputVars)) {
14 |     inputVars <- GetInputVarsFromModel(model)
15 |   }
16 |   apcList <-  Map(function(currentVar) {
17 |     cat(paste("Working on:", currentVar, "\n"))
18 |     data.frame(Input = currentVar,
19 |                GetSingleInputApcs(model, 
20 |                             df, 
21 |                             currentVar, 
22 |                             c(setdiff(inputVars, currentVar)),
23 |                             ...))},
24 |     inputVars
25 |   )
26 |   apcDF <- do.call(rbind, apcList)
27 |   return(apcDF)
28 | }
29 | 
30 | #' PlotApcDF
31 | #' 
32 | #' plots the output of GetApcDF -- this is my preferred display for now
33 | #' 
34 | #' @param apcDF the output of GetApcDF
35 | #' @export
36 | PlotPredCompsDF <- function(apcDF, variant="Impact") {
37 |   apcDF <- apcDF[c("Input", grep(paste0("^",variant,"\\."), names(apcDF), value=TRUE))]
38 |   names(apcDF) <- gsub(paste0("^",variant,"\\."), "", names(apcDF))
39 |   
40 |   maxAPC <- max(abs(apcDF$Absolute))  
41 |   apcDF$Input <- reorder(apcDF$Input, apcDF$Absolute)
42 |   longAPCs <- melt(apcDF, id="Input", value.name = "Value", variable.name = "Type")
43 |   longAPCs2 <- rbind(
44 |     longAPCs,
45 |     transform(subset(longAPCs, Type=="Absolute"), Value=-Value)
46 |   )
47 |   longAPCs2 <- longAPCs2[order(factor(longAPCs2$Type, levels=c("Absolute", "Signed"))), ]  
48 |   return(
49 |     ggplot(longAPCs2) +
50 |       geom_point(aes(y = Input, x=Value, color=Type, shape=Type, size=Type)) +
51 |       scale_x_continuous(limits=c(-maxAPC, maxAPC)) +
52 |       scale_size_discrete(range=c(3,4)) +
53 |       ggtitle(variant) +
54 |       geom_vline(aes(xintercept=0), alpha=.5)
55 |   )
56 | }
57 | 


--------------------------------------------------------------------------------
/_site/R/pairs.R:
--------------------------------------------------------------------------------
  1 | #' GetPairs
  2 | #'
  3 | #' Form all pairs of rows in \code{X} and compute Mahalanobis distances based on \code{v}. 
  4 | #' 
  5 | #' To help with computational constraints, you have the option to not form pairs between all rows of \code{X} but instead of specify a certain number (\code{numForTransitionStart}) to randomly be selected as rows from which transitions start, and another number (\code{numForTransitionEnd}) to be randomly selected as where transitions end. We then form all pairs between transition-start rows and transition-end rows. 
  6 | #' 
  7 | #' In order to get a smaller data frame for later manipulations (and maybe just because it's a good idea), you can also specify \code{onlyIncludeNearestN}, in which case we return only the nearest \code{onlyIncludeNearestN} transition ends for each transition start (instead of all pairs).
  8 | #' 
  9 | #' @param X data frame
 10 | #' @param u input of interest
 11 | #' @param v other inputs
 12 | #' @param mahalanobisConstantTerm Weights are (1 / (mahalanobisConstantTerm + Mahalanobis distance))
 13 | #' @param numForTransitionStart
 14 | #' @param numForTransitionEnd
 15 | #' @return a data frame with the inputs \code{v} from the first of each pair, \code{u} from each half (with ".B" appended to the second), and the Mahalanobis distances between the pairs.
 16 | #' @examples
 17 | #' # Should put unit test example here
 18 | #' 
 19 | #' @export
 20 | GetPairs <- function(X, u, v,
 21 |                      numForTransitionStart = NULL,
 22 |                      numForTransitionEnd = NULL,
 23 |                      onlyIncludeNearestN = NULL,
 24 |                      mahalanobisConstantTerm=1) {
 25 |   
 26 |   assert_that(length(u) == 1) # make sure we have exactly 1 input var of interest
 27 |   
 28 |   if (!is.null(numForTransitionStart)) {
 29 |     X1 <- X[sample.int(nrow(X), size=numForTransitionStart), c(v,u)] 
 30 |   } else {
 31 |     X1 <- X[c(v,u)]
 32 |   }
 33 |   
 34 |   if (!is.null(numForTransitionEnd)) {
 35 |     X2 <- X[sample.int(nrow(X), size=numForTransitionEnd), c(v,u)]
 36 |   } else {
 37 |     X2 <- X[c(v,u)]
 38 |   }
 39 |   
 40 |   X1$OriginalRowNumber <- 1:nrow(X1)
 41 |   X2$OriginalRowNumber.B <- 1:nrow(X2)
 42 |   
 43 |   vMatrix1 <- as.matrix(X1[,v])
 44 |   vMatrix2 <- as.matrix(X2[,v])
 45 |   
 46 |   
 47 |   covV=cov(vMatrix2)
 48 |   
 49 |   distMatrix <- apply(vMatrix1, 1, function(row) mahalanobis(vMatrix2, row, covV))
 50 |   dim(distMatrix)
 51 |   
 52 |   colnames(distMatrix) <- 1:ncol(distMatrix)
 53 |   rownames(distMatrix) <- 1:nrow(distMatrix)
 54 |   distDF <- as.data.frame(as.table(distMatrix))
 55 |   names(distDF) <- c("OriginalRowNumber.B", "OriginalRowNumber", "MahalanobisDistance")
 56 |   
 57 |   #   browser()
 58 |   # This isn't really doing its filter properly. Something is wrong!
 59 |   if (!is.null(onlyIncludeNearestN)) {
 60 |     distDF <- distDF %.% 
 61 |       group_by(OriginalRowNumber) %.% 
 62 |       filter(rank(MahalanobisDistance, ties.method="random") < onlyIncludeNearestN)
 63 |   }
 64 |   #   
 65 |   #   distDF2 <- distDF %.% group_by(OriginalRowNumber) %.% mutate(rn = rank(MahalanobisDistance, ties.method="random"))  
 66 |   #   
 67 |   #   (subset(distDF, OriginalRowNumber == 1))
 68 |   #   (subset(distDF2, OriginalRowNumber == 1))
 69 |   #   
 70 |   
 71 |   #   sort(subset(distDF, OriginalRowNumber == 1)$MahalanobisDistance)
 72 |   #   sort(subset(distDF2, OriginalRowNumber == 1)$MahalanobisDistance)
 73 |   
 74 |   pairs <- merge(X1, distDF, by = "OriginalRowNumber")
 75 |   pairs <- merge(X2, pairs, by = "OriginalRowNumber.B", suffixes = c(".B", ""))
 76 |   pairs$Weight <- 1/(mahalanobisConstantTerm + pairs$MahalanobisDistance)
 77 |   
 78 |   # If we haven't sampled, then OriginalRowNumber == OriginalRowNumber.B means that 
 79 |   # the transition start and end are the same, so we should remove those rows.
 80 |   if (is.null(numForTransitionStart) && is.null(numForTransitionEnd)) {
 81 |     pairs <- subset(pairs, OriginalRowNumber != OriginalRowNumber.B)
 82 |   }
 83 |   
 84 |   # Renormalize weights:
 85 |   pairs <- pairs %.% group_by(OriginalRowNumber) %.% mutate(Weight = Weight/sum(Weight))
 86 |   
 87 |   return(data.frame(pairs))
 88 | }
 89 | 
 90 | 
 91 | #' PlotPairCumulativeWeights
 92 | #'
 93 | #' For a sample of transition start rows, we plot rank of transition end (by increasing weight) vs. cumulative weight. This gives a sense of how much weight is going into the nearest points vs. further ones.
 94 | #' 
 95 | #' @export
 96 | 
 97 | PlotPairCumulativeWeights <- function(pairs, numOriginalRowNumbersToPlot = 20) {
 98 |   rowNumSample <- sample(unique(pairs$OriginalRowNumber))[1:numOriginalRowNumbersToPlot]
 99 |   pairsWithCumWeightSums <- pairs %.% 
100 |     group_by(OriginalRowNumber) %.% 
101 |     arrange(OriginalRowNumber, -Weight) %.% 
102 |     mutate(CumulativeWeight = cumsum(Weight), Rank = dense_rank(-Weight))
103 |   
104 |   pairsSubset <- subset(pairsWithCumWeightSums, OriginalRowNumber %in% rowNumSample)
105 |   
106 |   ggplot() + 
107 |     geom_line(aes(x=Rank, y=CumulativeWeight, color=factor(OriginalRowNumber)), data = pairsSubset, alpha = .2) + 
108 |     geom_line(aes(x=Rank, y=CumulativeWeight), stat = "summary", fun.y = "median", data=pairsWithCumWeightSums)
109 | }  
110 | 


--------------------------------------------------------------------------------
/_site/R/transition-plots.R:
--------------------------------------------------------------------------------
 1 | #' ArrowPlot
 2 | #' 
 3 | #' Sampled transitions in 
 4 | #' 
 5 | #' @param predictionFunction
 6 | #' @param df data frame with data
 7 | #' @param u input of interest
 8 | #' @param v other inputs
 9 | #' @export
10 | #' @examples
11 | #' N <- 100
12 | #' df <- data.frame(u=rnorm(N), v=rnorm(N))
13 | #' df$y1 = 2*df$u + 2*df$v + rnorm(N)
14 | #' lm1 <- lm(y1 ~ u + v, data=df)
15 | #' print(lm1)
16 | #' TransitionPlot(function(df) predict(lm1, df), df, "u", "v") + ylab("y1")
17 | #' df$y2 = df$u + df$v + 2*df$v*df$u + rnorm(N)
18 | #' lm2 <- lm(y2 ~ u*v, data=df)
19 | #' print(lm2)
20 | #' TransitionPlot(function(df) predict(lm2, df), df, "u", "v") + ylab("y2")
21 | TransitionPlot <- function(predictionFunction, df, u, v, plot=TRUE) {
22 |   pairs <- GetPairs(df, u, v)
23 |   
24 |   pairsToPlot <- ddply(pairs[pairs[[paste0(u,".B")]] > pairs[[u]], ], 
25 |                        "OriginalRowNumber", function(df) {
26 |                          df[sample.int(nrow(df), size=1, prob=df$Weight), ]
27 |                        })
28 |   
29 |   pairsToPlot$output <- predictionFunction(pairsToPlot)
30 |   pairsToPlot2 <- pairsToPlot
31 |   pairsToPlot2[[u]] <- pairsToPlot2[[paste0(u,".B")]]
32 |   pairsToPlot$outputNew <- predictionFunction(pairsToPlot2)  
33 |   
34 |   p <- ggplot(pairsToPlot) +
35 |     geom_segment(aes_string(x = u, y = "output", xend = paste0(u,".B"), yend = "outputNew"),  arrow = arrow()) + 
36 |     ggtitle(sprintf("Transitions in %s holding %s constant", u, Reduce(paste, v)))
37 |   if(plot) print(p)
38 |   return(p)
39 | }
40 | 


--------------------------------------------------------------------------------
/_site/README.md:
--------------------------------------------------------------------------------
 1 | This README is mainly for people developing the project (which is just me, at the moment). Everything for users is at the [Project Page](http://www.davidchudzicki.com/predcomps/).
 2 | 
 3 | Lots of people are using complicated/non-parametric models for lots of things, but it’s hard to understand what complicated models are telling you. This is general-purpose way to extract understanding from a very large class of complicated models.
 4 | 
 5 | # Todo 
 6 | 
 7 | ## Must be done before releasing to the world
 8 | 
 9 | Finishing pages:
10 | - add example in quickstart section to index.
11 | - make make diamonds presentable 
12 | 	- including maybe transition curves? (arrow plots are not so nice)
13 | - another example, e.g. give me some credit (nice b/c it's logistic)
14 | - document every function other people would use, with example
15 | - Transition Plots -- maybe add another way to visualize the same effect, e.g. input vs. delta output? and/or improve the original one (opacity?)
16 | 	- something like this?: make 30 pairings for each of 100 original points. Draw lines instead of arrows. (include the original point for this! current pairs DF excludes original point.) maybe very thin lines with large and somewhat transparent points? for a sense of the density. 
17 | 
18 | ## Todo later
19 | 
20 | See the [future work](http://www.davidchudzicki.com/predcomps/more-future-work.html) section of the documentation.
21 | 
22 | ## Documentation
23 | 
24 | The [documentation](http://www.davidchudzicki.com/predcomps/) is hosted on Github pages using a theme by [orderedlist](https://github.com/orderedlist).
25 | 
26 | 
27 | -----
28 | 
29 | Next steps for me:
30 | 
31 | - fix horizontal axis label for impact / APC charts
32 | 
33 | - add chart to wine 
34 | 
35 | - More work on credit example, including:
36 | 	- (just clean it up a bit -- it's mostly better)
37 |  - document weights situation better
38 | 	 - explain more concisely about renormalizing (it's obvious enough not to need long explanation)
39 | 	 - explain about why 1/distance doesn't weight nearby points enough (sphere's surphace grows as distance^n)
40 | 	 - explain my "closest n only" heuristic
41 | 	 - explain bias/variance tradeoff in nearness vs. more points -- replace the "limit as n-->infty" page with a section on this page
42 | 


--------------------------------------------------------------------------------
/_site/man/ArrowPlot.Rd:
--------------------------------------------------------------------------------
 1 | \name{ArrowPlot}
 2 | \alias{ArrowPlot}
 3 | \title{ArrowPlot}
 4 | \usage{
 5 | ArrowPlot(predictionFunction, df, u, v)
 6 | }
 7 | \arguments{
 8 |   \item{predictionFunction}{}
 9 | 
10 |   \item{df}{data frame with data}
11 | 
12 |   \item{u}{input of interest}
13 | 
14 |   \item{v}{other inputs}
15 | }
16 | \description{
17 | Sampled transitions in
18 | }
19 | 
20 | 


--------------------------------------------------------------------------------
/_site/man/AverageSpecifiedComparison.Rd:
--------------------------------------------------------------------------------
 1 | \name{AverageSpecifiedComparison}
 2 | \alias{AverageSpecifiedComparison}
 3 | \title{AverageSpecifiedComparison}
 4 | \usage{
 5 | AverageSpecifiedComparison(fit, df, input, low, high)
 6 | }
 7 | \arguments{
 8 |   \item{fit}{fitted glm object}
 9 | 
10 |   \item{df}{data frame for evaluating predictions}
11 | 
12 |   \item{input}{string representing the name of the input
13 |   feature you want an APC for}
14 | 
15 |   \item{low}{low value of input feature}
16 | 
17 |   \item{high}{high value of input feature}
18 | }
19 | \description{
20 | Makes an average predictive comparison for two specified
21 | values of the input variable of interest, as in section 5.7
22 | of ARM
23 | }
24 | 
25 | 


--------------------------------------------------------------------------------
/_site/man/ComputeAPCFromPairs.Rd:
--------------------------------------------------------------------------------
 1 | \name{ComputeApcFromPairs}
 2 | \alias{ComputeApcFromPairs}
 3 | \title{ComputeApcFromPairs}
 4 | \usage{
 5 | ComputeApcFromPairs(predictionFunction, pairs, u, v, absolute = FALSE,
 6 |   impact = FALSE)
 7 | }
 8 | \description{
 9 | ComputeApcFromPairs
10 | }
11 | 
12 | 


--------------------------------------------------------------------------------
/_site/man/GetAPC.Rd:
--------------------------------------------------------------------------------
 1 | \name{GetAPC}
 2 | \alias{GetAPC}
 3 | \title{GetAPC}
 4 | \usage{
 5 | GetAPC(predictionFunction, X, u, v, ...)
 6 | }
 7 | \arguments{
 8 |   \item{predictionFunction}{}
 9 | 
10 |   \item{X}{}
11 | 
12 |   \item{u}{input of interest}
13 | 
14 |   \item{v}{other inputs}
15 | 
16 |   \item{...}{extra parguments passed to GetPairs used to
17 |   control Weight function}
18 | }
19 | \description{
20 | makes average predictive comparison (based on
21 | Gelman/Pardoe) by forming pairs with two versions of the
22 | input of interest and averaging the predictive difference
23 | using Weights. I think Weights should be an approximation
24 | of the density p(u1,u2|v) or something like that... I need
25 | to look back at this. At present, I believe this is
26 | probably implementing the version in the Gelman/Pardoe
27 | paper.
28 | }
29 | \details{
30 | Only works fore continuous inputs right now
31 | }
32 | 
33 | 


--------------------------------------------------------------------------------
/_site/man/GetAPCWithAbsolute.Rd:
--------------------------------------------------------------------------------
 1 | \name{GetAPCWithAbsolute}
 2 | \alias{GetAPCWithAbsolute}
 3 | \title{GetAPCWithAbsolute}
 4 | \usage{
 5 | GetAPCWithAbsolute(predictionFunction, X, u, v, ...)
 6 | }
 7 | \arguments{
 8 |   \item{predictionFunction}{}
 9 | 
10 |   \item{X}{}
11 | 
12 |   \item{u}{input of interest}
13 | 
14 |   \item{v}{other inputs}
15 | 
16 |   \item{k}{Weights are (1 / (k + Mahalanobis distance))}
17 | }
18 | \value{
19 | a list with: \code{signed} (the usual APC) and
20 | \code{absolute} (APC applied to the absolute value of the
21 | differences)
22 | }
23 | \description{
24 | makes average predictive comparison (based on
25 | Gelman/Pardoe) by forming pairs with two versions of the
26 | input of interest and averaging the predictive difference
27 | using Weights. I think Weights should be an approximation
28 | of the density p(u1,u2|v) or something like that... I need
29 | to look back at this. At present, I believe this is
30 | probably implementing the version in the Gelman/Pardoe
31 | paper. returns a list with the APC and the APC applied to
32 | the absolute value of the prediction function Only works
33 | fore continuous inputs right now
34 | }
35 | \examples{
36 | print("hi")
37 | }
38 | 
39 | 


--------------------------------------------------------------------------------
/_site/man/GetApcDF.Rd:
--------------------------------------------------------------------------------
 1 | \name{GetApcDF}
 2 | \alias{GetApcDF}
 3 | \title{GetApcDF}
 4 | \usage{
 5 | GetApcDF(predictionFunction, df, inputVars, ...)
 6 | }
 7 | \arguments{
 8 |   \item{predictionFunction}{}
 9 | 
10 |   \item{df}{data frame with data}
11 | 
12 |   \item{inputVars}{inputs to the model}
13 | 
14 |   \item{...}{extra parguments passed to GetPairs used to
15 |   control Weight function}
16 | }
17 | \description{
18 | makes average predictive comparison for all specified
19 | inputs
20 | }
21 | 
22 | 


--------------------------------------------------------------------------------
/_site/man/GetComparisonDFFromPairs.Rd:
--------------------------------------------------------------------------------
 1 | \name{GetComparisonDFFromPairs}
 2 | \alias{GetComparisonDFFromPairs}
 3 | \title{GetComparisonDFFromPairs}
 4 | \usage{
 5 | GetComparisonDFFromPairs(predictionFunction, pairs, u, v)
 6 | }
 7 | \description{
 8 | (abstracted this into a separate function from
 9 | \code{GetApc} so we can more easily do things like
10 | \code{GetApcWithAbsolute})
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/_site/man/GetPairs.Rd:
--------------------------------------------------------------------------------
 1 | \name{GetPairs}
 2 | \alias{GetPairs}
 3 | \title{GetPairs}
 4 | \usage{
 5 | GetPairs(X, u, v, numForTransitionStart = NULL, numForTransitionEnd = NULL,
 6 |   onlyIncludeNearestN = NULL, mahalanobisConstantTerm = 1)
 7 | }
 8 | \arguments{
 9 |   \item{X}{data frame}
10 | 
11 |   \item{u}{input of interest}
12 | 
13 |   \item{v}{other inputs}
14 | 
15 |   \item{mahalanobisConstantTerm}{Weights are (1 /
16 |   (mahalanobisConstantTerm + Mahalanobis distance))}
17 | 
18 |   \item{numForTransitionStart}{}
19 | 
20 |   \item{numForTransitionEnd}{}
21 | }
22 | \value{
23 | a data frame with the inputs \code{v} from the first of
24 | each pair, \code{u} from each half (with ".B" appended to
25 | the second), and the Mahalanobis distances between the
26 | pairs.
27 | }
28 | \description{
29 | Form all pairs of rows in \code{X} and compute Mahalanobis
30 | distances based on \code{v}.
31 | }
32 | \details{
33 | To help with computational constraints, you have the option
34 | to not form pairs between all rows of \code{X} but instead
35 | of specify a certain number (\code{numForTransitionStart})
36 | to randomly be selected as rows from which transitions
37 | start, and another number (\code{numForTransitionEnd}) to
38 | be randomly selected as where transitions end. We then form
39 | all pairs between transition-start rows and transition-end
40 | rows.
41 | 
42 | In order to get a smaller data frame for later
43 | manipulations (and maybe just because it's a good idea),
44 | you can also specify \code{onlyIncludeNearestN}, in which
45 | case we return only the nearest \code{onlyIncludeNearestN}
46 | transition ends for each transition start (instead of all
47 | pairs).
48 | }
49 | \examples{
50 | # Should put unit test example here
51 | }
52 | 
53 | 


--------------------------------------------------------------------------------
/_site/man/GetPredCompsDF.Rd:
--------------------------------------------------------------------------------
 1 | \name{GetPredCompsDF}
 2 | \alias{GetPredCompsDF}
 3 | \title{GetApcDF}
 4 | \usage{
 5 | GetPredCompsDF(model, df, numRowsToUse = NULL, inputVars = NULL, ...)
 6 | }
 7 | \arguments{
 8 |   \item{model}{Either a function (from a data frame to
 9 |   vector of predictions) or a model we know how to deal
10 |   with (lm, glm)}
11 | 
12 |   \item{df}{data frame with data}
13 | 
14 |   \item{inputVars}{inputs to the model}
15 | 
16 |   \item{...}{extra parguments passed to GetPairs used to
17 |   control Weight function}
18 | }
19 | \description{
20 | makes average predictive comparison for all specified
21 | inputs
22 | }
23 | 
24 | 


--------------------------------------------------------------------------------
/_site/man/GetPredCompsDFFromPairs.Rd:
--------------------------------------------------------------------------------
 1 | \name{GetPredCompsDFFromPairs}
 2 | \alias{GetPredCompsDFFromPairs}
 3 | \title{GetPredCompsDFFromPairs}
 4 | \usage{
 5 | GetPredCompsDFFromPairs(predictionFunction, pairs, u, v)
 6 | }
 7 | \description{
 8 | (abstracted this into a separate function from
 9 | \code{GetApc} so we can more easily do things like
10 | \code{GetApcWithAbsolute})
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/_site/man/GetSingleInputPredComps.Rd:
--------------------------------------------------------------------------------
 1 | \name{GetSingleInputPredComps}
 2 | \alias{GetSingleInputPredComps}
 3 | \title{GetComparisonDF}
 4 | \usage{
 5 | GetSingleInputPredComps(predictionFunction, X, u, v, ...)
 6 | }
 7 | \arguments{
 8 |   \item{predictionFunction}{}
 9 | 
10 |   \item{X}{}
11 | 
12 |   \item{u}{input of interest}
13 | 
14 |   \item{v}{other inputs}
15 | 
16 |   \item{k}{Weights are (1 / (k + Mahalanobis distance))}
17 | }
18 | \value{
19 | a list with: \code{signed} (the usual Apc) and
20 | \code{absolute} (Apc applied to the absolute value of the
21 | differences)
22 | }
23 | \description{
24 | makes average predictive comparison (based on
25 | Gelman/Pardoe) by forming pairs with two versions of the
26 | input of interest and averaging the predictive difference
27 | using Weights. I think Weights should be an approximation
28 | of the density p(u1,u2|v) or something like that... I need
29 | to look back at this. At present, I believe this is
30 | probably implementing the version in the Gelman/Pardoe
31 | paper. returns a list with the Apc and the Apc applied to
32 | the absolute value of the prediction function Only works
33 | fore continuous inputs right now
34 | }
35 | 
36 | 


--------------------------------------------------------------------------------
/_site/man/PlotApcDF.Rd:
--------------------------------------------------------------------------------
 1 | \name{PlotApcDF}
 2 | \alias{PlotApcDF}
 3 | \title{PlotApcDF}
 4 | \usage{
 5 | PlotApcDF(apcDF, variant = "Impact")
 6 | }
 7 | \arguments{
 8 |   \item{apcDF}{the output of GetApcDF}
 9 | }
10 | \description{
11 | plots the output of GetApcDF -- this is my preferred
12 | display for now
13 | }
14 | 
15 | 


--------------------------------------------------------------------------------
/_site/man/PlotApcDF2.Rd:
--------------------------------------------------------------------------------
 1 | \name{PlotApcDF2}
 2 | \alias{PlotApcDF2}
 3 | \title{PlotApcDF}
 4 | \usage{
 5 | PlotApcDF2(apcDF)
 6 | }
 7 | \arguments{
 8 |   \item{apcDF}{the output of GetApcDF}
 9 | }
10 | \description{
11 | plots the output of GetApcDF -- this is not my preferred
12 | display but may be more self-explanatory
13 | }
14 | 
15 | 


--------------------------------------------------------------------------------
/_site/man/PlotPairCumulativeWeights.Rd:
--------------------------------------------------------------------------------
 1 | \name{PlotPairCumulativeWeights}
 2 | \alias{PlotPairCumulativeWeights}
 3 | \title{PlotPairCumulativeWeights}
 4 | \usage{
 5 | PlotPairCumulativeWeights(pairs, numOriginalRowNumbersToPlot = 20)
 6 | }
 7 | \description{
 8 | For a sample of transition start rows, we plot rank of
 9 | transition end (by increasing weight) vs. cumulative
10 | weight. This gives a sense of how much weight is going into
11 | the nearest points vs. further ones.
12 | }
13 | 
14 | 


--------------------------------------------------------------------------------
/_site/man/PlotPredCompsDF.Rd:
--------------------------------------------------------------------------------
 1 | \name{PlotPredCompsDF}
 2 | \alias{PlotPredCompsDF}
 3 | \title{PlotApcDF}
 4 | \usage{
 5 | PlotPredCompsDF(apcDF, variant = "Impact")
 6 | }
 7 | \arguments{
 8 |   \item{apcDF}{the output of GetApcDF}
 9 | }
10 | \description{
11 | plots the output of GetApcDF -- this is my preferred
12 | display for now
13 | }
14 | 
15 | 


--------------------------------------------------------------------------------
/_site/man/TransitionPlot.Rd:
--------------------------------------------------------------------------------
 1 | \name{TransitionPlot}
 2 | \alias{TransitionPlot}
 3 | \title{ArrowPlot}
 4 | \usage{
 5 | TransitionPlot(predictionFunction, df, u, v, plot = TRUE)
 6 | }
 7 | \arguments{
 8 |   \item{predictionFunction}{}
 9 | 
10 |   \item{df}{data frame with data}
11 | 
12 |   \item{u}{input of interest}
13 | 
14 |   \item{v}{other inputs}
15 | }
16 | \description{
17 | Sampled transitions in
18 | }
19 | \examples{
20 | N <- 100
21 | df <- data.frame(u=rnorm(N), v=rnorm(N))
22 | df$y1 = 2*df$u + 2*df$v + rnorm(N)
23 | lm1 <- lm(y1 ~ u + v, data=df)
24 | print(lm1)
25 | TransitionPlot(function(df) predict(lm1, df), df, "u", "v") + ylab("y1")
26 | df$y2 = df$u + df$v + 2*df$v*df$u + rnorm(N)
27 | lm2 <- lm(y2 ~ u*v, data=df)
28 | print(lm2)
29 | TransitionPlot(function(df) predict(lm2, df), df, "u", "v") + ylab("y2")
30 | }
31 | 
32 | 


--------------------------------------------------------------------------------
/_site/man/average_specified_comparison.Rd:
--------------------------------------------------------------------------------
 1 | \name{average_specified_comparison}
 2 | \alias{average_specified_comparison}
 3 | \title{average_specified_comparison}
 4 | \usage{
 5 | average_specified_comparison(fit, df, input, low, high)
 6 | }
 7 | \arguments{
 8 |   \item{fit}{fitted glm object}
 9 | 
10 |   \item{df}{data frame for evaluating predictions}
11 | 
12 |   \item{input}{string representing the name of the input
13 |   feature you want an APC for}
14 | 
15 |   \item{low}{low value of input feature}
16 | 
17 |   \item{high}{high value of input feature}
18 | }
19 | \description{
20 | Makes an average predictive comparison for two specified
21 | values of the input variable of interest, as in section 5.7
22 | of ARM
23 | }
24 | 
25 | 


--------------------------------------------------------------------------------
/_site/man/compute_apc_from_pairs.Rd:
--------------------------------------------------------------------------------
 1 | \name{compute_apc_from_pairs}
 2 | \alias{compute_apc_from_pairs}
 3 | \title{compute_apc_from_pairs}
 4 | \usage{
 5 | compute_apc_from_pairs(predictionFunction, pairs, u, v, absolute = FALSE)
 6 | }
 7 | \description{
 8 | (abstracted this into a separate function from
 9 | \code{get_apc} so we can more easily do things like
10 | \code{get_apc_with_absolute})
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/_site/man/get_apc.Rd:
--------------------------------------------------------------------------------
 1 | \name{get_apc}
 2 | \alias{get_apc}
 3 | \title{get_apc}
 4 | \usage{
 5 | get_apc(predictionFunction, X, u, v, ...)
 6 | }
 7 | \arguments{
 8 |   \item{predictionFunction}{}
 9 | 
10 |   \item{X}{}
11 | 
12 |   \item{u}{input of interest}
13 | 
14 |   \item{v}{other inputs}
15 | 
16 |   \item{...}{extra parguments passed to get_pairs used to
17 |   control weight function}
18 | }
19 | \description{
20 | makes average predictive comparison (based on
21 | Gelman/Pardoe) by forming pairs with two versions of the
22 | input of interest and averaging the predictive difference
23 | using weights. I think weights should be an approximation
24 | of the density p(u1,u2|v) or something like that... I need
25 | to look back at this. At present, I believe this is
26 | probably implementing the version in the Gelman/Pardoe
27 | paper.
28 | }
29 | \details{
30 | Only works fore continuous inputs right now
31 | }
32 | 
33 | 


--------------------------------------------------------------------------------
/_site/man/get_apc_with_absolute.Rd:
--------------------------------------------------------------------------------
 1 | \name{get_apc_with_absolute}
 2 | \alias{get_apc_with_absolute}
 3 | \title{get_apc_with_absolute}
 4 | \usage{
 5 | get_apc_with_absolute(predictionFunction, X, u, v, ...)
 6 | }
 7 | \arguments{
 8 |   \item{predictionFunction}{}
 9 | 
10 |   \item{X}{}
11 | 
12 |   \item{u}{input of interest}
13 | 
14 |   \item{v}{other inputs}
15 | 
16 |   \item{k}{weights are (1 / (k + mahalanobis distance))}
17 | }
18 | \value{
19 | a list with: \code{signed} (the usual APC) and
20 | \code{absolute} (APC applied to the absolute value of the
21 | differences)
22 | }
23 | \description{
24 | makes average predictive comparison (based on
25 | Gelman/Pardoe) by forming pairs with two versions of the
26 | input of interest and averaging the predictive difference
27 | using weights. I think weights should be an approximation
28 | of the density p(u1,u2|v) or something like that... I need
29 | to look back at this. At present, I believe this is
30 | probably implementing the version in the Gelman/Pardoe
31 | paper. returns a list with the APC and the APC applied to
32 | the absolute value of the prediction function Only works
33 | fore continuous inputs right now
34 | }
35 | 
36 | 


--------------------------------------------------------------------------------
/_site/man/get_pairs.Rd:
--------------------------------------------------------------------------------
 1 | \name{get_pairs}
 2 | \alias{get_pairs}
 3 | \title{get_pairs}
 4 | \usage{
 5 | get_pairs(X, u, v, mahalanobisConstantTerm = 1, renormalizeWeights = TRUE)
 6 | }
 7 | \arguments{
 8 |   \item{X}{data frame}
 9 | 
10 |   \item{u}{input of interest}
11 | 
12 |   \item{v}{other inputs}
13 | 
14 |   \item{mahalanobisConstantTerm}{weights are (1 /
15 |   (mahalanobisConstantTerm + mahalanobis distance))}
16 | 
17 |   \item{renormalizeWeights}{whether to renormalize the
18 |   weights to that they sum to 1 within each group (groups
19 |   based on the first element of the pair). If I'm right,
20 |   there's no reason to use \code{FALSE} ever; I'm only
21 |   leaving the option in so I can compare with the paper.}
22 | }
23 | \value{
24 | a data frame with the inputs \code{v} from the first of
25 | each pair, \code{u} from each half (with ".B" appended to
26 | the second), and the mahalanobis distances between the
27 | pairs.
28 | }
29 | \description{
30 | Form all pairs of rows in X and compute mahalanobis
31 | distances based on \code{v}.
32 | }
33 | \examples{
34 | library("mvtnorm")
35 | sigma <- matrix(c(1,.5,-.5,
36 |                   .5,1,.5,
37 |                   -.5,.5,1), ncol=3)
38 | X <- data.frame(rmvnorm(n=10, sigma=sigma))
39 | get_pairs(X, u="X3", v=c("X1","X2"))
40 | }
41 | 
42 | 


--------------------------------------------------------------------------------
/_site/man/mahal.Rd:
--------------------------------------------------------------------------------
 1 | \name{Mahal}
 2 | \alias{Mahal}
 3 | \title{Mahal}
 4 | \usage{
 5 | Mahal(matrix1, matrix2, covariance)
 6 | }
 7 | \arguments{
 8 |   \item{matrix1}{nxm matrix or data frame representing 1st
 9 |   set of observations -- each row is an observation; each
10 |   column is an input feature}
11 | 
12 |   \item{matrix2}{nxm matrix or data frame representing 2nd
13 |   set of observations}
14 | 
15 |   \item{input}{string representing the name of the input
16 |   feature you want an APC for}
17 | 
18 |   \item{covariance}{covariance to be used for Mahalanobis
19 |   computation}
20 | }
21 | \value{
22 | a vector of Mahalanobis distances between row i of matrix1
23 | and row i of matrix2
24 | }
25 | \description{
26 | Computes Mahalanobis distance between pairs of
27 | observations. I'm using this rather than stats::Mahalanobis
28 | because I want just the distances between corresponding
29 | rows of each matrix, not all pair-wise distances
30 | }
31 | 
32 | 


--------------------------------------------------------------------------------
/_site/man/resample_from_pairs.Rd:
--------------------------------------------------------------------------------
 1 | \name{resample_from_pairs}
 2 | \alias{resample_from_pairs}
 3 | \title{resample_from_pairs}
 4 | \usage{
 5 | resample_from_pairs(X, u, v,
 6 |   samplingProbsAsFunctionOfMahalanobis = function(x) 1/(1 + x))
 7 | }
 8 | \description{
 9 | form pairs with \code{get_pairs} and resample \code{v} and
10 | \code{u} according to a function of the mahalanobis
11 | distances (used diagnostics / understanding how things are
12 | working)
13 | }
14 | 
15 | 


--------------------------------------------------------------------------------
/_site/notes/apc.Rmd:
--------------------------------------------------------------------------------
 1 | ## APCs
 2 | 
 3 | ### Predictive Comparisons Generalize Regression Coefficients
 4 | 
 5 | At the heart of this package is the idea of a *predictive comparison*: We vary the input of interest holding the other inputs constant, and look at the differences in predicted values. Let $u$ represent the input of interest and $v$ the (vector of) other inputs. Let $f$ be a functinon making predictions, so
 6 | 
 7 | $$\hat{y} = f(u,v)$$
 8 | 
 9 | Our $f$ could come from any predictive model. If we have a statistical model, then probably we would choose 
10 | 
11 | $$f(u,v) = \mathcal{E}[y \mid u, v, \theta]$$
12 | 
13 | (where $\theta$ are the parameters of the model). But we need not have a statistical model at all. The prediction function $f$ could come from a random forest, or a support vector machine.
14 | 
15 | Given the function $f$ and a choice of $u_1$, $u_2$, and $v$, we can compute
16 | 
17 | $$\delta_{u_1 \rightarrow u_2, v} = \frac{f(u_2, v) - f(u_1, v)}{u_2-u_1}$$
18 | 
19 | If $f$ were a linear model with no interactions, the above would not depend on the particular choices of $u_1$, $u_2$, and $v$ and would be the regression coefficient corresponding to $u$. This is the formal sense in which predictive comparisons generalize regression coefficients. Since for more complicated models this varies as the inputs vary, we will take an average across a well chosen set of inputs. 
20 | 
21 | ### Choice of Inputs
22 | 
23 | The APC is
24 | 
25 | $$\frac{\mathcal{E}[\Delta_f]}{\mathcal{E}[\Delta_u]}$$
26 | 
27 | where $\Delta_f = f(u_2,v) - f(u_1,v)$, $\Delta_u = u_2 - u_1$, and $\mathcal{E}$ is expectation under the following process:
28 | 
29 | 1. sample $v$ from the (marginal) distribution of the corresponding inputs
30 | 2. sample $u_1$ and $u_2$ independently from the distribution of $u$ conditional on $v$
31 | 
32 | The reason for this definition is that we want to use values of $v$ that are representative of our data generation process, and transitions in $u$ that are representative of what really occurs at those values of $v$.
33 | 
34 | Computing the numerator and denominator separately rather than taking an expected value of $\delta_{u_1 \rightarrow u_2, v}$ amounts to weighting by the size of $(u_2 - u_1)$. This avoids having the result excessively influenced by small changes in $u$.
35 | 
36 | ### Estimation
37 | 
38 | The rows of our data represent samples from the joint distribution $u$, $v$, so each row amounts to a sample from $v$ followed by a sample $u_1$ conditional on $v$. The difficult thing is drawing another sample $u_2$ conditional on $v$. We approximate these by assigning weights to rows based on the proximity of the $v$ in that row to the $v$ in the row in question. The weights are:
39 | 
40 | $$\frac{1}{\text{mahalanobisConstantTerm} + \text{(mahalanobis distance)}}$$
41 | 
42 | The [*mahalanobis distance*](https://en.wikipedia.org/wiki/Mahalanobis_distance) is a unitless version of distance that takes into account the correlation structure of $v$. The *mahalanobisConstantTerm* (which defaults to 1, but this is not always an appropriate choice) prevents all of the weight from going to the closest points. More work needs to be done in thinking about the weights.
43 | 
44 | For more details, see [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract) (section 4), [my note](more-renormalize-weights.html) explaining a small change from the weights described in the paper, or [my code](https://github.com/dchudz/predcomps/blob/master/R/pairs.R) that computes the appropriate weights.
45 | 
46 | ### Absolute Version
47 | 
48 | The absolute APC (as opposed to the signed version described above) replaces $\Delta_f = f(u_2,v) - f(u_1,v)$ above with $|\Delta_f| = |f(u_2,v) - f(u_1,v)|$. See e.g. the input $u_8$ in my [simulated linear model with interactions](examples-simulated-linear-model-interactions.html) for an example where the signed APC is roughly 0 but the absolute APC is large.
49 | 
50 | By default, I always compute and display an absolute version of the APC alongside the signed version. 
51 | 
52 | ### A Small Example
53 | 
54 | This is a example running APCs on a simulated linear model with independent inputs and no interactions. For more involved examples, see the [examples](examples-overview.html) section.
55 | 
56 | The inputs ($x_1$, $x_2$, $x_3$) are independent, with
57 | 
58 | $$y \sim 2x_1 - 2x_2 + x_3 + \mathcal{N}(0,.1)$$
59 | 
60 | First we set up the data:
61 | 
62 | ```{r}
63 | n <- 200
64 | x1 <- runif(n=n, min=0, max=1)
65 | x2 <- runif(n=n, min=0, max=1)
66 | x3 <- runif(n=n, min=0, max=10)
67 | y <- 2*x1 + (-2)*x2 + 1*x3 + rnorm(n, sd=.1)
68 | df <- data.frame(x1, x2, x3, y)
69 | ```
70 | 
71 | Then we fit a linear model:
72 | 
73 | ```{r}
74 | fittedLm <- lm(y ~ ., data=df)
75 | fittedLm
76 | ```
77 | 
78 | We can then plot the average predictive comparisons:
79 | 
80 | ```{r results='hide', message=FALSE}
81 | library(predcomps)
82 | apcDF <- GetPredCompsDF(fittedLm, df=df)
83 | PlotPredCompsDF(apcDF, variant="Apc") + theme_gray(base_size = 18)
84 | ```
85 | 
86 | Using different shapes / colors, both the absolute and signed versions are plotted. For symmetry, the absolute version is plotted with both a positive and negative sign. Since this is a linear model with no interactions, the signed APCs match those from the fitted linear model. 
87 | 
88 | This is what the returned data frame `apcDF` looks like:
89 | 
90 | ```{r}
91 | apcDF
92 | ```
93 | 
94 | The columns plotted here are `PerUnitInput.Signed` and `Apc.Absolute`. The [next section](impact.html) is about those columns labeled "Impact".
95 | 


--------------------------------------------------------------------------------
/_site/notes/examples/diamonds.Rmd:
--------------------------------------------------------------------------------
  1 | # A new way to visualize models
  2 | 
  3 | ```{r message=FALSE, echo=FALSE}
  4 | library(knitr)
  5 | knitr::opts_chunk$set(tidy = FALSE, message=FALSE, warning=FALSE, fig.align='center')
  6 | library(randomForest)
  7 | library(plyr)
  8 | library(predcomps)
  9 | library(ggplot2)
 10 | ```
 11 | 
 12 | ```{r}
 13 | diamonds <- transform(diamonds, clarity = 
 14 |                         factor(clarity, levels =c("SI1", "SI2", "VS1", "VS2", "VVS1", "VVS2", "IF")))
 15 | 
 16 | diamonds2 <- transform(diamonds,
 17 |                        clarity = as.integer(clarity),
 18 |                        cut = as.integer(cut),
 19 |                        color = as.integer(color))
 20 | 
 21 | diamonds3 <- subset(diamonds2, !is.na(clarity))
 22 | 
 23 | rf <- randomForest(price ~ carat + cut + color + clarity, data=diamonds3, ntree=20)
 24 | diamondsSmall <- diamonds3[sample.int(nrow(diamonds3), size=500), ]
 25 | 
 26 | apcDf <- GetPredCompsDF(function(df) predict(rf, df), diamondsSmall, inputVars=row.names(rf$importance))
 27 | PlotPredCompsDF(apcDf)
 28 | PlotPredCompsDF(apcDf, variant="Apc")
 29 | 
 30 | 
 31 | pairs <- GetPairs(diamondsSmall, "carat", c("cut", "color", "clarity"), 
 32 |                   removeDiagonal=FALSE,
 33 |                   mahalanobisConstantTerm=.1)
 34 | 
 35 | u <- "carat"
 36 | pairsSampled <- ddply(pairs, 
 37 |                       "OriginalRowNumber", function(df) {
 38 |                         df[sample.int(nrow(df), size=5, prob=df$Weight), ]
 39 |                         })
 40 | 
 41 | originalRowNumbersToUse <- sample(unique(pairsSampled$OriginalRowNumber), size=10)
 42 | 
 43 | pairsSampled$carat <- pairsSampled$carat.B
 44 | 
 45 | pairsSampled$Prediction <- predict(rf, pairsSampled)
 46 | 
 47 | ggplot(subset(pairsSampled, OriginalRowNumber %in% originalRowNumbersToUse),
 48 |        aes(x=carat, y=Prediction, color=factor(OriginalRowNumber, levels=sample(originalRowNumbersToUse)))) +
 49 |   geom_point() +
 50 |   geom_line(size=.2) 
 51 | 
 52 | last_plot() + scale_x_continuous(limits=c(0,1)) + scale_y_continuous(limits=c(0,5000))
 53 | 
 54 | 
 55 | 
 56 | u <- "clarity"
 57 | v <- c("carat", "cut", "color")
 58 | pairs <- GetPairs(diamondsSmall, u, v, 
 59 |                   removeDiagonal=FALSE,
 60 |                   mahalanobisConstantTerm=.1)
 61 | 
 62 | pairsSampled <- ddply(pairs, 
 63 |                       "OriginalRowNumber", function(df) {
 64 |                         df[sample.int(nrow(df), size=5, prob=df$Weight), ]
 65 |                         })
 66 | 
 67 | originalRowNumbersToUse <- sample(unique(pairsSampled$OriginalRowNumber), size=10)
 68 | 
 69 | pairsSampled[[u]] <- pairsSampled[[paste0(u,".B")]]
 70 | 
 71 | pairsSampled$Prediction <- predict(rf, pairsSampled)
 72 | 
 73 | pairsSampled$OriginalRowNumberFactor <- factor(pairsSampled$OriginalRowNumber, levels=sample(originalRowNumbersToUse))
 74 | ggplot(subset(pairsSampled, OriginalRowNumber %in% originalRowNumbersToUse),
 75 |        aes_string(x=u, y="Prediction", color="OriginalRowNumberFactor")) +
 76 |   geom_point() +
 77 |   geom_line(size=.2) 
 78 | 
 79 | 
 80 | 
 81 | u <- "color"
 82 | v <- c("cut", "clarity", "carat")
 83 | pairs <- GetPairs(diamondsSmall, u, v, 
 84 |                   removeDiagonal=FALSE,
 85 |                   mahalanobisConstantTerm=.1)
 86 | 
 87 | pairsSampled <- ddply(pairs, 
 88 |                       "OriginalRowNumber", function(df) {
 89 |                         df[sample.int(nrow(df), size=5, prob=df$Weight), ]
 90 |                         })
 91 | 
 92 | originalRowNumbersToUse <- sample(unique(pairsSampled$OriginalRowNumber), size=10)
 93 | 
 94 | pairsSampled[[u]] <- pairsSampled[[paste0(u,".B")]]
 95 | 
 96 | pairsSampled$Prediction <- predict(rf, pairsSampled)
 97 | 
 98 | pairsSampled$OriginalRowNumberFactor <- factor(pairsSampled$OriginalRowNumber, levels=sample(originalRowNumbersToUse))
 99 | ggplot(subset(pairsSampled, OriginalRowNumber %in% originalRowNumbersToUse),
100 |        aes_string(x=u, y="Prediction", color="OriginalRowNumberFactor")) +
101 |   geom_point() +
102 |   geom_line(size=.2) 
103 | 
104 | 
105 | 
106 | 
107 | 
108 | ```
109 | 


--------------------------------------------------------------------------------
/_site/notes/examples/loan-defaults.Rmd:
--------------------------------------------------------------------------------
  1 | ## Credit Default Example
  2 | 
  3 | ```{r message=FALSE, echo=FALSE}
  4 | library(knitr)
  5 | knitr::opts_chunk$set(tidy = FALSE, message=FALSE, warning=FALSE, fig.align='center', fig.width = 14, size=8)
  6 | library(randomForest)
  7 | library(predcomps)
  8 | library(ggplot2)
  9 | theme_set(theme_gray(base_size = 18))
 10 | library(dplyr)
 11 | library(gridExtra)
 12 | ```
 13 |  
 14 | This example is based on the training data set [found here](https://www.kaggle.com/c/GiveMeSomeCredit). We build a model to predict:
 15 | 
 16 | - **SeriousDlqin2yrs** (target variable):  Person experienced 90 days past due delinquency or worse 
 17 | 
 18 | The input features are:
 19 | 
 20 | - **RevolvingUtilizationOfUnsecuredLines**:	Total balance on credit cards and personal lines of credit except real estate and no installment debt like car loans divided by the sum of credit limits
 21 | - **age**:	Age of borrower in years
 22 | - **NumberOfTime30-59DaysPastDueNotWorse**:	Number of times borrower has been 30-59 days past due but no worse in the last 2 years.
 23 | - **NumberOfTime60-89DaysPastDueNotWorse**:	Number of times borrower has been 60-89 days past due but no worse in the last 2 years.
 24 | - **NumberOfTimes90DaysLate**:	Number of times borrower has been 90 days or more past due.
 25 | - **DebtRatio**:	Monthly debt payments, alimony,living costs divided by monthy gross income
 26 | - **MonthlyIncome**:	Monthly income
 27 | - **NumberOfOpenCreditLinesAndLoans**:	Number of Open loans (installment like car loan or mortgage) and Lines of credit (e.g. credit cards)
 28 | - **NumberRealEstateLoansOrLines**:	Number of mortgage and real estate loans including home equity lines of credit
 29 | - **NumberOfDependents**:	Number of dependents in family excluding themselves (spouse, children etc.)
 30 | 
 31 | ### Parameters:
 32 | 
 33 | These are some parameters controlling the aggregate predictive comparisons:
 34 | 
 35 | ```{r}
 36 | # Parameters controlling the predictive comparisons computation:
 37 | #  We consider transitions starting at each of 500 random rows
 38 | numForTransitionStart <- 50
 39 | #  ... going to each of 10,000 other random rows:
 40 | numForTransitionEnd <- 1000
 41 | #  ... keeping only the nearest 100 pairs for each start:
 42 | onlyIncludeNearestN = 10
 43 | ```
 44 | 
 45 | And for the random forest:
 46 | 
 47 | ```{r}
 48 | # 100 trees for random forest 
 49 | ntree = 10
 50 | ```
 51 | 
 52 | 
 53 | ```{r echo=FALSE}
 54 | # Remove some outliers
 55 | credit <- read.csv("~/Downloads/cs-training.csv")[,-1]
 56 | credit <- subset(credit, !is.na(MonthlyIncome) &
 57 |                     NumberOfTime30.59DaysPastDueNotWorse < 5 & 
 58 |                     RevolvingUtilizationOfUnsecuredLines <= 2 &
 59 |                     NumberOfTime30.59DaysPastDueNotWorse <= 5 &
 60 |                     NumberOfTime60.89DaysPastDueNotWorse <= 5 &
 61 |                     NumberOfTimes90DaysLate <= 5 &
 62 |                     MonthlyIncome < 5e4 &
 63 |                     DebtRatio < 2 &
 64 |                     NumberRealEstateLoansOrLines <= 12 &
 65 |                     NumberOfDependents < 10                  
 66 |                   )
 67 | ```
 68 | 
 69 | ### Input Distribution
 70 | 
 71 | The distribution of the inputs (after removing some outliers to make things more manageable):
 72 | 
 73 | ```{r fig.height = 12, echo=FALSE}
 74 | histograms <- Map(function(colName) {
 75 |   qplot(credit[[colName]]) + 
 76 |     ggtitle(colName) +
 77 |     xlab("")},
 78 |   setdiff(names(credit), "SeriousDlqin2yrs"))
 79 | allHistograms <- do.call(arrangeGrob, c(histograms, ncol=2))
 80 | print(allHistograms)
 81 | ```
 82 | 
 83 | Build a random forest model:
 84 | 
 85 | ```{r}
 86 | set.seed(1)
 87 | # Turning the response to type "factor" causes the RF to be build for classification:
 88 | credit$SeriousDlqin2yrs <- factor(credit$SeriousDlqin2yrs) 
 89 | rfFit <- randomForest(SeriousDlqin2yrs ~ ., data=credit, ntree=ntree)
 90 | ```
 91 | 
 92 | ### Aggregate Predictive Comparisons
 93 | 
 94 | ```{r message=FALSE, results='hide'}
 95 | set.seed(1)
 96 | apcDF <- GetPredCompsDF(rfFit, credit,
 97 |                         numForTransitionStart = numForTransitionStart,
 98 |                         numForTransitionEnd = numForTransitionEnd,
 99 |                         onlyIncludeNearestN = onlyIncludeNearestN)
100 | ```
101 | 
102 | Hi
103 | 
104 | ```{r echo=FALSE}
105 | kable(apcDF, row.names=FALSE)
106 | ```
107 | 
108 | 
109 | Here [impact](impact.html) chart. Its units are changes in probability, so we can compare it across all of the inputs:
110 | 
111 | ```{r}
112 | PlotPredCompsDF(apcDF)
113 | ```
114 | 
115 | Note that average absolute value of the change in probability associated with changes in age is much larger than the magnitude of the signed average. This indicates either an interact effect between age and other inputs, or a non-linear (non-monotonic) relationship between age an probability of default. We'll look into that a bit later.
116 | 
117 | It wouldn't make sense to chart the average predictive comparisons in this way since they don't share units, but we can chart the inputs corresponding to a number of numbers late for various periods:
118 | 
119 | ```{r}
120 | PlotPredCompsDF(apcDF[grep("NumberOfTime", apcDF$Input), ],
121 |                 variant = "Apc")
122 | ```
123 | 
124 | As you'd expect, greater periods of lateness are worse (per additional incident).
125 | 
126 | However, `NumberOfTime30.59DaysPastDueNotWorse` makes more overall difference to the model, because its variation is larger (it's non-zero more often):
127 | 
128 | ```{r}
129 | PlotPredCompsDF(apcDF[grep("NumberOfTime", apcDF$Input), ])
130 | ```
131 | 
132 | ### More Detailed Examination: `NumberOfTime30.59DaysPastDueNotWorse`
133 | 
134 | Recall that data from which the summarized predictive comparisons are computed consists of groups of rows, where with in each group only the input of interest varies (for the point we imagine transitioning to) and the rest are held constant. We can work directly with this data, visualizing it in more detail to better understand our model:
135 | 
136 | ```{r}
137 | set.seed(6)
138 | pairs <- GetComparisonDF(rfFit, credit, 
139 |                          u="NumberOfTime30.59DaysPastDueNotWorse",
140 |                          numForTransitionStart = 20,
141 |                          numForTransitionEnd = numForTransitionEnd*10,
142 |                          onlyIncludeNearestN = onlyIncludeNearestN*10)
143 | 
144 | pairsSummarized <- pairs[c("OriginalRowNumber", "NumberOfTime30.59DaysPastDueNotWorse.B", "yHat2", "Weight")] %.% 
145 |   group_by(OriginalRowNumber, NumberOfTime30.59DaysPastDueNotWorse.B, yHat2) %.% summarise(Weight = sum(Weight))
146 | 
147 | ggplot(pairsSummarized, aes(x=NumberOfTime30.59DaysPastDueNotWorse.B, y=yHat2, color=factor(OriginalRowNumber))) + 
148 |   geom_point(aes(size = Weight)) +
149 |   geom_line(size=.2) +
150 |   scale_x_continuous(limits=c(0,2)) +
151 |   scale_size_area()
152 | ```
153 | 
154 | I've made the size of the points proportional to weight that the point receives. The summarized predictive comparisons give more weight to points with more weight, and so should we.
155 | 
156 | The relationship is mostly as you'd expect, but let's examine the highlighted one in more detail:
157 | 
158 | ```{r echo=FALSE}
159 | ggplot(pairsSummarized, aes(x=NumberOfTime30.59DaysPastDueNotWorse.B, y=yHat2, color=factor(OriginalRowNumber))) + 
160 |   geom_point(aes(size = n)) +
161 |   geom_line(aes(alpha=ifelse(OriginalRowNumber == 18, 1, .3))) +
162 |   scale_x_continuous(limits=c(0,2)) +
163 |   scale_alpha_identity()
164 | ```
165 | 
166 | Why does the default probability decrease when `NumberOfTime30.59DaysPastDueNotWorse` increases? Values for all the other inputs are held constant, so let's see what they are:
167 | 
168 | ```{r echo=FALSE}
169 | oneOriginalRowNumber <- subset(pairs, OriginalRowNumber == 18)
170 | oneRow <- t(oneOriginalRowNumber[1,intersect(names(oneOriginalRowNumber), names(credit))])
171 | colnames(oneRow) <- "Input values for highlighted row"
172 | grid.newpage()
173 | grid.table(oneRow)
174 | ```
175 | 
176 | Hmm, note that `NumberOfTimes90DaysLate` (almost always $0$) is $1$ in this case. Looking at the definition of the target variable (`SeriousDlqin2yrs`), this makes a lot of sense:
177 | 
178 | `SeriousDlqin2yrs`: "Person experienced 90 days past due delinquency or worse."
179 | 
180 | As we'd expect, previous instances of 90-days-late are a strong indicator of future ones. Adding a previous 30-days-late (but not more) to a previous 90-days-late seems to decrease the chance of future 90-days-lates. This is sensible -- with both, we have evidence that when you're late, you at least *sometimes* pay back in under 60 days. 
181 | 
182 | Further exploratory analysis would further improve our understanding:
183 | 
184 | - Is this really primarily an interaction between `NumberOfTimes90DaysLate` and `NumberOfTime30.59DaysPastDueNotWorse`, or are other inputs involved? We could vary the other inputs and see if we still get this effect.
185 | - Is the effect validated on test data?
186 | 
187 | ### More Detailed Examination: `Age`
188 |  
189 | For one more example, let's examine the `Age` input in more detail.
190 | 
191 | ```{r}
192 | set.seed(3)
193 | pairs <- GetComparisonDF(rfFit, credit, 
194 |                          u="age",
195 |                          numForTransitionStart = 20,
196 |                          numForTransitionEnd = numForTransitionEnd*10,
197 |                          onlyIncludeNearestN = onlyIncludeNearestN*10)
198 | 
199 | pairsSummarized <- pairs[c("OriginalRowNumber", "age.B", "yHat2", "Weight")] %.% 
200 |   group_by(OriginalRowNumber, age.B, yHat2) %.% 
201 |   summarise(Weight = sum(Weight))
202 | 
203 | ggplot(pairsSummarized, aes(x=age.B, y=yHat2, color=factor(OriginalRowNumber))) + 
204 |   geom_point(aes(size = Weight)) +
205 |   geom_line(size=.2)
206 | ```
207 | 
208 | This is a bit of a mess, but we can at least see see that interaction effects and non-monotonicity are both going on.
209 | 
210 | Further exploration would look into which other inputs age is interacting with to determine these differently shaped curves.
211 | 


--------------------------------------------------------------------------------
/_site/notes/examples/logistic-regression.Rmd:
--------------------------------------------------------------------------------
 1 | ## A Logistic Regression with Related Inputs
 2 | 
 3 | ```{r}
 4 | 
 5 | library(predcomps)
 6 | library(ggplot2)
 7 | library(boot)
 8 | 
 9 | N=500
10 | v1 <- rnorm(N)*3
11 | v2 <- rnorm(N) * (abs(v1) > 2)
12 | v3 <- rnorm(N) * (abs(v1) < .5)
13 | 
14 | # should we remove the points with the condition false instead of setting to 0?
15 | # no, it's good how we have it. since we're conditioning on (e.g.) v1 and v3 when we draw v2, 
16 | # setting v2 to 0 in those cases means there really is no variation in v2 for those v1/v3 combos
17 | 
18 | qplot(v2, v1 + v3)
19 | qplot(v3, v1 + v3)
20 | 
21 | # plot we want:
22 | # v2 curve at various v1 + v3 values. sample from the real data to draw this
23 | # (get random rows and then fill in the whole curve.)
24 | # thicken the rows where v2 isn't 0, ie, does vary, or maybe use alpha to make the other ones lighter.
25 | # x = v2, color = v1 + v3 + 
26 | 
27 | table(v2==0)
28 | table(v3==0)
29 | df <- data.frame(v1,v2,v3)
30 | df$y <- rbinom(n=nrow(df), size=1, prob=with(df, inv.logit(v1+v2+v3)))
31 | 
32 | fittedLogit <- glm(y ~ v1 + v2 + v3, data = df, family = "binomial")
33 | 
34 | apcDF <- GetPredCompsDF(function(df) with(df, inv.logit(v1+v2+v3)), df, c("v1","v2","v3"))
35 | 
36 | PlotPredCompsDF(apcDF)
37 | PlotPredCompsDF(apcDF, variant="Apc")
38 | 
39 | 
40 | GetPredCompsDF(fittedLogit, 
41 |                data.frame(v1, v2, v3), 
42 |                c("v1","v2","v3"))
43 | 
44 | 
45 | 
46 | GetPredCompsDF
47 | 
48 | ```
49 | 
50 | 


--------------------------------------------------------------------------------
/_site/notes/examples/overview.Rmd:
--------------------------------------------------------------------------------
 1 | ## Examples Overview
 2 | 
 3 | ### Simulated Examples
 4 | 
 5 | The simulated examples are constructed to demonstrate a property of predictive comparisons.  Simulated examples are nice because (since we set up the data-generating process and we really know what's going on) it's easier to know of the predictive comparisons are telling us what they should be. 
 6 | 
 7 | - [Logistic regression for wine prices](examples-wine-logistic-regression.html): This is a logistic regression model run in two different simulated data situations. Changing the relationship between the inputs (and nothing else) leads to differences in the predictive comparisons (both APC and impact).
 8 | 
 9 | - [Linear model with interactions](examples-simulated-linear-model-interactions.html): This is a linear model with 9 features and 8 interaction terms (an interaction between $v$ and each of $u_1 \ldots u_8$) which are all the same. The way the $u_i$'s are related to $v$ in the distribution of the inputs gives rise to differences in the APC.
10 | 
11 | ### Real Examples
12 | 
13 | Simulated examples are good for gaining understanding of how the package works and what it's doing, but they don't help us understand how it is useful in the real world. These are examples with real-world data sets:
14 | 
15 | - [Diamond prices](examples-diamonds.html): In this example, applying predictive comparisons to a random forest helps understand the influence of carat, cut, color, and clarity on prices.
16 | 


--------------------------------------------------------------------------------
/_site/notes/examples/simulated-linear-model-interactions.Rmd:
--------------------------------------------------------------------------------
 1 | # APCs in a Synthetic Example of a Linear Model with Interactions
 2 | 
 3 | ```{r message=FALSE, echo=FALSE}
 4 | library(knitr)
 5 | knitr::opts_chunk$set(tidy = FALSE, message=FALSE, warning=FALSE, fig.align='center')
 6 | ```
 7 | 
 8 | ```{r echo=FALSE}
 9 | library(predcomps)
10 | library(ggplot2)
11 | library(reshape2)
12 | library(plyr)
13 | ```
14 | 
15 | ## Input Generation
16 | 
17 | This example will have 9 inputs: $v$ and $u_1$, $u_2$, ..., $u_8$. The input $v$ is uniformly distributed between 7 values, $-3, -2, \ldots, 2, 3$. Each $u$ is mostly constant (at $0$) but at one value of $v$ (which is $v=-3$ for $u_1$, $v=-2$ for $u_2$, etc), $u$ can be either $0$ and $10$. The input $u_8$ can transition at two values of $v$: At either $v=-3$ or $v=3$, $u_8$ can be $0$ or $10$; otherwise, $u_8$ is 0. I'll construct the data and plot the relationship between the $u$'s and $v$. 
18 | 
19 | ```{r}
20 | N <- 200
21 | vValues <- (-3):3
22 | v <- sample(vValues, N, replace=TRUE)
23 | 
24 | df <- data.frame(v)
25 | for (i in seq_along(vValues)) {
26 |   df[[paste0("u",i)]] <- 
27 |     ifelse(v==vValues[i], 
28 |            sample(c(0,10), N, replace=TRUE),  # u can be either 0 or 10 at one v value
29 |            rep(0, N)                          # u is always 0 at other ones
30 |            )
31 | }
32 | 
33 | # u8 can transition at either v=-3 or v=3:
34 | df$u8 <- ifelse(v %in% c(-3,3),
35 |                 sample(c(0,10), N, replace=TRUE), 
36 |                 rep(0, N)                         
37 |                 )
38 | ```
39 | 
40 | ```{r ConditionalDistributionOfUsOnVs, echo=FALSE, fig.cap="distribution of $u$ conditional on $v$"}
41 | theme_set(theme_gray(base_size=20))
42 | ggplot(melt(df, id="v")) +
43 |   geom_bar(aes(x=factor(v), fill=factor(value)), position=position_fill()) +
44 |   facet_grid(variable ~ .) +
45 |   scale_fill_discrete("value of u_i") + 
46 |   scale_x_discrete("v") +
47 |   ggtitle("u_i conditional on v") +
48 |   scale_y_continuous("sample conditional distribution") + 
49 |   opts(axis.text.y = theme_blank(), axis.ticks.y = theme_blank())
50 | ```
51 | 
52 | ## Outcome Generation
53 | 
54 | Each $u$ will have the same role in the model, which is $$\mathbb{E}[y] = vu_1 + vu_2 + vu_3 + vu_4 + vu_5 + vu_6 + vu_7 + vu_8.$$ 
55 | 
56 | Note that each $u$ has the same role in this function. Differences between their APCs can only arise from the correlation structure of the inputs in combination with this function, not from this function alone.
57 | 
58 | For simplicity this demonstration, I'll assume that $\mathbb{E}[y \mid v, u_1, \ldots, u_8]$ is known rather than estimated.
59 | 
60 | (**Todo: Find a way to map these inputs/output to a possible story with meaningful features and outcome. That would make it more interesting, easier to follow, and maybe generate more insight.**)
61 | 
62 | ```{r}
63 | outcomeGenerationFunction <- function(df) {
64 |   with(df, v*u1 + v*u2 + v*u3 + v*u4 + v*u5 + v*u6 + v*u7 + v*u8)
65 | }
66 | df$y <- outcomeGenerationFunction(df)
67 | ```
68 | 
69 | ## Computing and Plotting the APC
70 | 
71 | I will compute both the signed and absolute APC. My absolute APCs are different from anything discussed in the paper, but the idea is simple: Everywhere we would use a predictive difference, we use its absolute value instead. I believe that whenever we display the signed APC, we should display the absolute APC alongside it. Transitions for a given input may have a large expected absolute impact on the output variable, but with signs that cancel out. If that's happening in your model, it would be important to know.
72 | 
73 | Here are the APCs for this example, displayed in two different ways. The first shows signed and absolute APCs in separate charts. The second (my preference) shows both types of APC in the same chart. In this case, each absolute APC is plotted twice, once on the positive half of the APC axis and once on the negative half. This is necessary due to the symmetry between positive and negative numbers. The second version is my preference because it makes comparisons between signed and absolute APCs easier. 
74 | 
75 | For example, we can more quickly notice in the second version that the signed APC for $u_8$ is small relative to its absolute APC. This is because $u_8$ transitions between 0 and 10 at either $v=3$ or $v=-3$. Either $v$ (combined with the $u_8$-transition) leads to a relatively large absolute predictive comparison, but with opposite signs depending on $v$. 
76 | 
77 | ```{r ApcPlotsTwoWays}
78 | inputVars <- c("v",paste0("u",1:8))
79 | apcDF <- GetPredCompsDF(outcomeGenerationFunction, df, inputVars = inputVars)
80 | print(apcDF)
81 | PlotPredCompsDF(apcDF, variant="Apc")
82 | ```
83 | 
84 | These APCs are just what we would expect from the setup of the synthetic examples. When $u$-transitions are at small $v$, the APCs are small due to the $u$/$v$ interaction effect (and vice versa). Even though we know the prediction function exactly, the APCs are a bit off due to errors in our estimates of the distribution of each input conditional on the others. We can mitigate that in this case by increasing the weight given to closer points when assigning weights based on the Mahalanobis distance. The weights are $\frac{1}{\text{mahalanobisConstantTerm}+d}$ where $d$ is the distance, so we can do this by decreasing $\text{mahalanobisConstantTerm}$:
85 | 
86 | ```{r DecreasedMahalanobisConstantTerm}
87 | apcDF <- GetPredCompsDF(outcomeGenerationFunction, df, inputVars, mahalanobisConstantTerm=.01)
88 | PlotPredCompsDF(apcDF, variant="Apc")
89 | ```
90 | 
91 | In this case, the APCs are much closer to the correct values. Giving such extreme weight to pairs with small distance does not cause any problems in this example, but it would in more realistic examples. In applications, getting the weights right may be difficult.
92 | 
93 | 
94 | 
95 | 


--------------------------------------------------------------------------------
/_site/notes/examples/wine-logistic-regression.Rmd:
--------------------------------------------------------------------------------
  1 | ```{r echo=FALSE, results='hide', message=FALSE}
  2 | library(predcomps)
  3 | library(ggplot2)
  4 | library(boot)
  5 | library(knitr)
  6 | opts_chunk$set(tidy = FALSE)
  7 | set.seed(1)
  8 | ```
  9 | 
 10 | ## A Logistic Regression with Related Inputs
 11 | 
 12 | *(The source code for this example is [here](https://github.com/dchudz/predcomps/blob/master/notes/examples/wine-logistic-regression.Rmd).)*
 13 | 
 14 | We will set up a simulated data set to use for modeling the probability a customer buys a bottle of wine, given its price and quality. We'll compare a few situation varying the joint distribution of price ($P$) and quality ($Q$). The coefficients of the logistic regression determining the relationship between the inputs and the probability of purchase will not vary.
 15 | 
 16 | In each variation, the probability of purchase is governed by the following logistic regression model:
 17 | 
 18 | $$logit(P(\text{wine is purchased})) = 0.1 Q - 0.12 P$$
 19 | 
 20 | - *Variation 1*: Price is uniform; quality is price plus noise. Quality increases with price, but not enough make up for price, so the expensive wines are rarely purchased and the cheap wines are almost always purchased.
 21 | - *Variation 2*: Just like *Variation 1*, but price is more densely concentrated in its middle range. This leads to price and quality both having a larger APC because the inverse logistic curve is steeper in this middle range.
 22 | - *Variation 3*: Like *Variation 1*, but quality varies more strongly with price. The inverse logistic curve is now steeper at almost all price/quality combinations.
 23 | 
 24 | The APC varies across these variations, but the logistic regression coefficients remain the same. The changes in APC in each of these variations are driven entirely by changes in the distribution of the inputs. The model relating inputs to outputs is unchanged.
 25 | 
 26 | ### Variation 1
 27 | 
 28 | In the first variation, quality and price are independent, with price uniformly distributed and quality set to price plus Gaussian noise:
 29 | 
 30 | ```{r}
 31 | priceCoef <- -.12
 32 | qualityCoef <- .1
 33 | qualityNoiseStdDev <- 5
 34 | nWines=50000
 35 | nRowsForPlottingSample <- 1000
 36 | 
 37 | numForTransitionStart <- 500
 38 | numForTransitionEnd <- 10000
 39 | onlyIncludeNearestN = 100
 40 | 
 41 | priceQualitySlope <- .4
 42 | 
 43 | df1 <- local({
 44 |   price <- sample(20:120, nWines, replace=TRUE)
 45 |   quality <- price * priceQualitySlope + 22 + rnorm(nWines, sd=qualityNoiseStdDev)
 46 |   purchaseProbability <- inv.logit(priceCoef*(price - 70) + qualityCoef*(quality - 50)  )
 47 |   purchased  <- rbinom(n = length(purchaseProbability), size=1, prob=purchaseProbability)
 48 |   data.frame(Quality = quality, 
 49 |              Price = price, 
 50 |              PurchaseProbability = purchaseProbability, 
 51 |              Purchased = purchased)
 52 |   })
 53 | ```
 54 | 
 55 | A scatter plot (using a random subset to avoid overplotting) shows us the relationship between price and quality:
 56 | 
 57 | ```{r}
 58 | df1Sample <- df1[sample.int(nWines, size=nRowsForPlottingSample), ]
 59 | qplot(Price, Quality, alpha=I(.5), data = df1Sample) + 
 60 |   expand_limits(y=c(0,100))
 61 | ```
 62 | 
 63 | When we fit a logistic regression, the coefficients are what we'd expect from the setup above:
 64 | 
 65 | 
 66 | ```{r}
 67 | logitFit1 <- glm(Purchased ~ Price + Quality, data = df1, family = "binomial")
 68 | logitFit1
 69 | ```
 70 | 
 71 | This plot shows the relationship between quality and probability of purchase for a few prices:
 72 | 
 73 | ```{r}
 74 | myScales <- list(scale_x_continuous(limits=c(0,100)),
 75 |                  scale_y_continuous(limits=c(0,1)))
 76 | 
 77 | ggplot(subset(df1Sample, Price %in% seq(20, 120, by=10))) + 
 78 |   geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
 79 |              size = 3, alpha = 1) + 
 80 |   ggtitle("Quality vs. Purchase Probability at Various Prices") + myScales
 81 | 
 82 | ```
 83 | 
 84 | Each colored set of points is one portion of a shifted inverse logistic curve, determined by which Price/Quality combinations actually occur in our data.
 85 | 
 86 | We can also see the portion of the curve that isn't represented in our data:
 87 | 
 88 | ```{r echo=FALSE}
 89 | linesDF <- expand.grid(Price = seq(20, 120, by=10), Quality = 0:100)
 90 | linesDF$PurchaseProbability <- with(linesDF,
 91 |                                     inv.logit(priceCoef*(Price - 70) + qualityCoef*(Quality - 50)))
 92 | last_plot() + geom_line(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
 93 |                         data = linesDF,
 94 |                         size=.2)
 95 | ```
 96 | 
 97 | We can get average predictive comparisons from our fitted regression:
 98 | 
 99 | ```{r}
100 | apc1 <- GetPredCompsDF(logitFit1, df1,
101 |                        numForTransitionStart = numForTransitionStart,
102 |                        numForTransitionEnd = numForTransitionEnd,
103 |                        onlyIncludeNearestN = onlyIncludeNearestN)
104 | ```
105 | 
106 | The `GetPredCompsDF` function produces a few kinds of outputs, but for now let's just focus on the signed average predictive comparison:
107 | 
108 | ```{r}
109 | apc1[c("Input", "PerUnitInput.Signed")]
110 | ```
111 | 
112 | This means that (on average) the probability of purchase increases by about 1.2% per 1 unit increase in quality.
113 | 
114 | ### Variation 2
115 | 
116 | This variation will add some additional wines to the middle range of prices:
117 | 
118 | ```{r}
119 | nAdditionalWines <- nWines
120 | supplementForDF2 <- local({
121 |   price <- sample(55:85, nWines, replace=TRUE)
122 |   quality <- price * .4 + 22 + rnorm(nWines, sd=qualityNoiseStdDev)
123 |   purchaseProbability <- inv.logit(priceCoef*(price - 70) + qualityCoef*(quality - 50)  )
124 |   purchased  <- rbinom(n = length(purchaseProbability), size=1, prob=purchaseProbability)
125 |   data.frame(Quality = quality, 
126 |              Price = price, 
127 |              PurchaseProbability = purchaseProbability, 
128 |              Purchased = purchased)
129 |   })
130 | df2 <- rbind(df1, supplementForDF2)
131 | ```
132 | 
133 | A scatter plot (again, using a random subset to avoid overplotting) shows us the relationship between price and quality:
134 | 
135 | ```{r}
136 | df2Sample <- df2[sample.int(nrow(df2), size=nRowsForPlottingSample), ]
137 | qplot(Price, Quality, alpha=I(.5), data = df2Sample) + 
138 |   expand_limits(y=c(0,100))
139 | ```
140 | 
141 | When we fit a logistic regression, the coefficients are similar to before, since we haven't changed the underlying model:
142 | 
143 | 
144 | ```{r}
145 | logitFit2 <- glm(Purchased ~ Price + Quality, data = df2, family = "binomial")
146 | logitFit2
147 | ```
148 | 
149 | In the plot showing the relationship between quality and probability of purchase, we see more points at the steep section of the inverse logit curve:
150 | 
151 | ```{r}
152 | ggplot(subset(df2Sample, Price %in% seq(20, 120, by=10))) + 
153 |   geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
154 |              size = 3, alpha = 1) + 
155 |   ggtitle("Quality vs. Purchase Probability at Various Prices") +
156 |   myScales
157 | ```
158 | 
159 | The APC for quality is correspondingly larger:
160 | 
161 | ```{r}
162 | apc2 <- GetPredCompsDF(logitFit2, df2,
163 |                        numForTransitionStart = numForTransitionStart,
164 |                        numForTransitionEnd = numForTransitionEnd,
165 |                        onlyIncludeNearestN = onlyIncludeNearestN)
166 | 
167 | apc2[c("Input",  "PerUnitInput.Signed")]
168 | ```
169 | 
170 | This means that in this variation the probability of purchase increases (on average) by about 1.5% (vs. 1.2% in *Variation 1*) per 1-point increase in quality. The magnitude of the APC for price is also larger.
171 | 
172 | ### Variation 3
173 | 
174 | This is just like *Variation 1*, but price increases more with quality:
175 | 
176 | ```{r}
177 | priceQualitySlope <- 1.2
178 | 
179 | df3 <- local({
180 |   price <- sample(20:120, nWines, replace=TRUE)
181 |   quality <- price * priceQualitySlope - 30 + rnorm(nWines, sd=qualityNoiseStdDev)
182 |   purchaseProbability <- inv.logit(priceCoef*(price - 70) + qualityCoef*(quality - 50)  )
183 |   purchased  <- rbinom(n = length(purchaseProbability), size=1, prob=purchaseProbability)
184 |   data.frame(Quality = quality, 
185 |              Price = price, 
186 |              PurchaseProbability = purchaseProbability, 
187 |              Purchased = purchased)
188 |   })
189 | ```
190 | 
191 | ```{r}
192 | df3Sample <- df3[sample.int(nWines, size=nRowsForPlottingSample), ]
193 | qplot(Price, Quality, alpha=I(.5), data = df3Sample) + 
194 |   expand_limits(y=c(0,100))
195 | ```
196 | 
197 | The logistic regression still comes out the same:
198 | 
199 | ```{r}
200 | logitFit3 <- glm(Purchased ~ Price + Quality, data = df3, family = "binomial")
201 | logitFit3
202 | ```
203 | 
204 | In this case, purchase is less certain at the low prices and more plausible at the high prices:
205 | 
206 | ```{r}
207 | ggplot(subset(df3Sample, Price %in% seq(-100, 200, by=10))) + 
208 |   geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
209 |              size = 3, alpha = 1) + 
210 |   ggtitle("Quality vs. Purchase Probability at Various Prices") + 
211 |   myScales
212 | ```
213 | 
214 | We can get average predictive comparisons from our fitted regression:
215 | 
216 | ```{r}
217 | apc3 <- GetPredCompsDF(logitFit3, df3,
218 |                        numForTransitionStart = numForTransitionStart,
219 |                        numForTransitionEnd = numForTransitionEnd,
220 |                        onlyIncludeNearestN = onlyIncludeNearestN)
221 | ```
222 | 
223 | As expected, the APCs are (both) larger than in *Variation 1*:
224 | 
225 | ```{r}
226 | apc3[c("Input", "PerUnitInput.Signed")]
227 | ```
228 | 


--------------------------------------------------------------------------------
/_site/notes/impact.Rmd:
--------------------------------------------------------------------------------
 1 | ## Impact: A variation on APCs with comparable units
 2 | 
 3 | *Impact* answers the question: **What difference does each input tend to make in the output?** (Given the model and the distribution of inputs.) The units are the units of the output variable.
 4 | 
 5 | ### The name "Impact"
 6 | 
 7 | First off, I don't love the name "Impact" for this. "Variable importance" may signal somewhat the right idea to people already familiar with other measures of variable importance, but this notion is (or can be) signed, whereas variable importance generally isn't.
 8 | 
 9 | Also, I believe that this metric may turn out to be more commonly useful in my work than APCs, in which case perhaps the name of the package and metric should be chosen to emphasize this.
10 | 
11 | ### Impact
12 | 
13 | *Impact* is a statistic similar to the APC, but which addresses two issues I've had with APCs:
14 | 
15 | 1. APCs are good for their purpose (determining the expected difference in outcome per unit change in input), but this doesn't tell me how important an input is to my predictions. The APC could be high while the variation in the input is so small that it doesn't make a difference.
16 | 
17 | 2. Relatedly, APCs across inputs with different units have different units themselves and so are not directly comparable. The example in the paper (see p. 47) uses mostly binary inputs, so this is mostly not a problem there. But I'm not sure the other inputs in that example belong on the same chart, and I would like to visualize the influence .
18 | 
19 | Both (1) and (2) could be addressed by standardizing the coefficients before computing the APC, but this feels a bit ad hoc and arbitrary. Instead, I take the simpler and more elegant approach of just not dividing by the difference in inputs in the computation of the APC. That is, impact is just the expectation of
20 | 
21 | $\Delta_f = f(u_2,v) - f(u_1,v)$
22 | 
23 | under the same process as in the APC:
24 | 
25 | 1. sample $v$ from the (marginal) distribution of the corresponding inputs
26 | 2. sample $u_1$ and $u_2$ independently from the distribution of $u$ conditional on $v$
27 | 
28 | The computed quantity is therefore the expected value of the predictive difference caused by a random transition for the input of interest. The units are the same as the output variable. This statistic depends on the model, the variation in the input of interest, and the relationship between that inputs and the other inputs.
29 | 
30 | ### Example (cont.)
31 | 
32 | The same example used to demonstrate APCs demonstrates the difference between *impact* and APCs. Recall that the inputs ($x_1$, $x_2$, $x_3$) are independent, with
33 | 
34 | $$y \sim 2x_1 - 2x_2 + x_3 + \mathcal{N}(0,.1)$$
35 | 
36 | However, the variation in input $x_3$ is much larger than the others:
37 | 
38 | ```{r}
39 | n <- 200
40 | x1 <- runif(n=n, min=0, max=1)
41 | x2 <- runif(n=n, min=0, max=1)
42 | x3 <- runif(n=n, min=0, max=10)
43 | y <- 2*x1 + (-2)*x2 + 1*x3
44 | df <- data.frame(x1, x2, x3, y)
45 | fittedLm <- lm(y ~ ., data=df)
46 | ```
47 | 
48 | We can then compute and plot the *impact*:
49 | 
50 | ```{r results='hide', message=FALSE}
51 | library(predcomps)
52 | apcDF <- GetPredCompsDF(fittedLm, df=df)
53 | PlotPredCompsDF(apcDF) + theme_gray(base_size = 18)
54 | ```
55 | 
56 | The *impact* for $x_3$ is about 5 times the impact for $x_1$, which makes sense as $x_3$ varies on a scale that is 10 times as large but with a coefficient half as big.
57 | 
58 | ```{r}
59 | apcDF
60 | ```
61 | 
62 | The [examples](examples-overview.html) section goes through more interesting examples demonstrating more subtle features of how *impact* and APCs work.
63 | 


--------------------------------------------------------------------------------
/_site/notes/index.Rmd:
--------------------------------------------------------------------------------
 1 | ## An R Package for Understanding Arbitrary Complex Models
 2 | 
 3 | As complex models become widely used, it's more important than ever to have ways of understanding them. Whether a model is built primarily for prediction or primarily for understanding, we need to know what it's telling us. It's (relatively) clear how to interpret the coefficients of a linear regression, at least when there are few inputs and no interaction terms. 
 4 | 
 5 | This R package is a collection of tools that are meant to generalize the idea of these coefficients to arbitrary predictive models: Holding all else equal, how does the output vary with the input of interest? The tools here apply to any model that lets you map inputs to outputs, whether it's a linear model, a GLM (with any link function), a neural network, a random forest, etc.
 6 | 
 7 | ### Inspiration
 8 | 
 9 | The ideas implemented here originate in [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract). If you are already familiar with the paper, you can [skip to the differences between that and what's here](more-compared-with-paper.html). As far as I know, this is the only implementation intended for general use.
10 | 
11 | ### Installation
12 | 
13 | The package is not hosted on CRAN, but it's still easy to install:
14 | 
15 | ```{r eval=FALSE}
16 | library(devtools) # first install devtools if you haven't
17 | install_github("predcomps", user="dchudz")
18 | ```
19 | 
20 | ### Quick Start
21 | 
22 | If you want to get started quickly before reading about the functions in more detail, here's an example:
23 | 
24 | .
25 | .
26 | .
27 | 
28 | 
29 | ### Current Limitations
30 | 
31 | The package has some major limitations in its current form, but we have to start somewhere:
32 | 
33 | **Input types**: At the time of this writing, all inputs to the model must be numerical (or coercable to such). In practice this means that binary or ordered categorical inputs are okay, but unordered categorical inputs are not.
34 | 
35 | **Computational efficiency**: At the moment, the package's implementation is not particularly efficient. You should probably not ask it to handle more than 200-500 observations. (Note however that your model may be trained on any number of observations.) 
36 | 
37 | **Weight function**: The weight function used to estimate conditional distributions doesn't get much attention in the paper, and I haven't given it much more here (yet).
38 | 
39 | 
40 | ## Contact
41 | 
42 | I'm very interested in feedback from folks who are trying this out. Please get in touch with me at dchudz@gmail.com.}
43 | 


--------------------------------------------------------------------------------
/_site/notes/more/compared-with-paper.Rmd:
--------------------------------------------------------------------------------
 1 | ### One Difference: Renormalizing Weights
 2 | 
 3 | There is only one outright difference between APCs as described in [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract) and as implemented here: After weights are assigned to pairs of observations (as in section __), the weights are normalized across the first element of the pair. [(explanation and justification)](more-renormalize-weights.html)
 4 | 
 5 | ### Absolute APCs
 6 | 
 7 | Gelman & Pardoe mention an unsigned version APCs in the case where the input of interest is an unordered categorical variable (in which case signs wouldn't make sense). They propose a root mean squared APC (equaton 4), but I prefer absolute values and I believe this absolute-values version is more generally useful. By default, I always compute and display an absolute version of the APC alongside the signed version. See e.g. the input $u_8$ in my [simulated linear model with interactions](examples-simulated-linear-model-interactions.html) for an example demonstrating the importance of this notion, and [my explanation of APCs](apc.html) for more detail.
 8 | 
 9 | ### Impact: A variation on APCs with comparable units
10 | 
11 | I've created a statistic similar to the APC, but which addresses two issues I've had with APCs:
12 | 
13 | 1. APCs are good for their purpose (the expected difference in outcome per unit change in input), but it doesn't tell me the what difference an input makes to my predictions. The APC could be high while the variation in the input is so small that it doesn't make a difference.
14 | 
15 | 2. APCs across inputs with different units have different units themselves and so are not directly comparable. The example in the paper (see p. 47) uses mostly binary inputs, so this is mostly not a problem there. But I'm not sure the other inputs belong on the same chart.
16 | 
17 | Both (1) and (2) could be addressed by standardizing the coefficients before computing the APC, but this feels a bit ad hoc and arbitrary. Instead, I take the simpler and more elegant approach of just not dividing by the difference in inputs. The computed quantity is therefore the expected value of the predictive difference caused by a random transition for the input of interest. The units are the same as the output variable. It depends on the model, the variation in the input of interest, and the relationship between that inputs and the other inputs.
18 | 
19 | I'm calling this notion *impact* (feel free to suggest another name), and it's described in more detail [here](impact.html) and in the examples. Just like APCs, it comes in signed and absolute forms.
20 | 
21 | ### Transition Plots
22 | 
23 | (Again, feel free to suggest a better name!)
24 | 
25 | Both APCs and impact summarize the role of an input with one number. This is useful, but I also want to examine the model in more detail. For example, how does the influence of an input vary across its range? Breiman & Cutler's "partial dependence plots" as implemented in the [randomForests](http://cran.r-project.org/web/packages/randomForest/randomForest.pdf) package partly get at this question, but I wanted a visualization more in line with the spirit of APCs. See [my explanation of transition plots](transition-plots.html) and the examples for more detail.
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/_site/notes/more/future-work.Rmd:
--------------------------------------------------------------------------------
 1 | ## Future work
 2 | 
 3 | ### Speed/scalability
 4 | 
 5 | I often find myself cutting down the data set size for computational reasons. E.g. starting with 10k rows and looking only at 300. It might be better to use different numbers of rows in different roles, e.g. pairs between 300 rows and 1000 other rows.
 6 | 
 7 | I should make it faster / work with more data.
 8 | 
 9 | 
10 | ### Comparison with other approaches
11 | 
12 | For example, I'd like to compare with the tools (variable importance and partial plots) available in the `randomForest` package. I would guess that those tools are the most similar thing to this that's in common use.
13 | 
14 | Other notions of variable importance to compare with:
15 | 
16 | - the one in the `earth` package (implementing MARS)
17 | - I should look through `caret` to see what's available there
18 | 
19 | ### Categorical inputs
20 | 
21 | I'm not quite sure about the best way to sample from the distribution for an input of interest, conditional on the other inputs, when those other inputs include categorical variables.
22 | 
23 | Gelman & Pardoe 2007 discuss what to do when the input of interest is an unordered categorical variable, but I'm mainly interested in applying the statistic I'm calling [*impact*](impact.html) to categorical variables, since *impact* is comparable across all types of inputs.
24 | 
25 | 
26 | ### Uncertainty
27 | 
28 | As discussed in the paper, it is entirely natural to allow for multiple samples of parameters and plot these measures with their uncertainties as in the paper.
29 | 
30 | I'm also interested in the uncertainty that arises from the density estimation. 
31 | 
32 | ### Weights
33 | 
34 | We should have better & more automated choice of weights. Weights should be chosen in such a way that our estimates of conditional distributions approach the true distributions [in the limit as we get more data](more-large-N-limit.html).
35 | 
36 | ### Code robustness
37 | 
38 | I should deal with potential column name conflicts in data frames (e.g. I use a "weight" column, so I'll have problems if one of the inputs has that name).
39 | 
40 | I should add unit tests to make further development more efficient.
41 | 


--------------------------------------------------------------------------------
/_site/notes/more/large-N-limit.Rmd:
--------------------------------------------------------------------------------
 1 | ```{r echo=FALSE, message=FALSE}
 2 | library("predcomps")
 3 | print.data.frame <- function(...) base::print.data.frame(..., row.names=FALSE)
 4 | ```
 5 | 
 6 | **This is a note about how the weights are (should be) constructed. The observation here is that our estimated APCs do not approach the theoretical APCs in the limit as we get more data. This is a clue that we should tweak the definition of the weights, but I haven't implemented such an improvement yet.**
 7 | 
 8 | ## Large $N$ Limit
 9 | 
10 | As we get more data, it would be nice if the APC we compute tends toward the right answer, equation (2) of [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract). I don't care about asymptotics as much as some people, but if we don't get the right answer in the limit, that's at least a clue that we might not be doing as well as we can for smaller $N$. This note shows that unless we adjust the weighting function as we get more data, we won't have that nice property.
11 | 
12 | ```{r}
13 | makeExampleDF <- function(N) {
14 |   exampleDF <- data.frame(
15 |     v=c(3,3,7,7),  
16 |     u=c(10,20,12,22) 
17 |     )[rep(c(1,2,3,4),c(.4*N,.4*N,.1*N,.1*N)),]
18 |   exampleDF <- transform(exampleDF, v = v + rnorm(nrow(exampleDF), sd=.001))
19 |   return(exampleDF)
20 | }
21 | ```
22 | 
23 | Just as in the note "Normalizing Weights", the APC should be:
24 | 
25 | $$.8 \delta_u(10 \rightarrow 20, 3, f) + 0.2 \delta_u(12 \rightarrow 22, 7, f) = (.8)(3) + (.2)(6) = 3.8$$
26 | 
27 | We get almost the same APC with 300 data points as 100:
28 | 
29 | ```{r}
30 | GetSingleInputApcs(function(df) return(df$u * df$v), makeExampleDF(100), u="u", v="v")$PerUnitInput.Signed
31 | GetSingleInputApcs(function(df) return(df$u * df$v), makeExampleDF(300), u="u", v="v")$PerUnitInput.Signed
32 | ```
33 | 
34 | If we're looking at one value for $v$, $v=v_0$, the tradeoff in determining the weights is:
35 | 
36 | 1. $v$'s closer to $v_0$ will do a better job representing the distribution of $u$ conditional on $v=v_0$
37 | 2. but if too few $v$'s get too much of the weight, our estimate for the conditional distribution of $u$ will be too noisy
38 | 
39 | The reason our estimate didn't improve with more data is that we're not moving more weight to nearby points as we get more data. For any $N$, we're presently putting roughly the same amount of mass at the same distances. As we get more data, we can afford to put more weight closer to $v$, because (2) becomes less of a problem. A couple ideas are:
40 | 
41 | - With the weights as $\frac{1}{k+d}$ ($d$ is the Mahalanobis distance), we could scale $k$ down as $N$ goes up.
42 | - Or we could use the weights we are now ($k=1$), except we drop (equivalent to setting the weight to 0) all but the closest $s(N)$ points to each $v$. The function $s$ needs to increases with $N$, but not as fast as $N$, e.g. maybe $s(N) = sqrt(N)$ probably works. This means we're always decreasing bias (sampling from closer to the right $v$) and also decreasing variance (more samples) as N increases. This would also be good for keeping run-times and memory usage under control as $N$ increases.
43 | 


--------------------------------------------------------------------------------
/_site/notes/more/renormalize-weights.Rmd:
--------------------------------------------------------------------------------
  1 | ```{r echo=FALSE, message=FALSE}
  2 | set.seed(89890)
  3 | opts_chunk$set(tidy=FALSE)
  4 | library("plyr")
  5 | library(predcomps)
  6 | print.data.frame <- function(...) base::print.data.frame(..., row.names=FALSE)
  7 | ```
  8 | 
  9 | ## Renormalizing Weights
 10 | 
 11 | In computing the APC, we assign weights to pairs of observations based on the Mahalanobis distance between the corresponding $v$'s. This note uses a toy example to argue that we must *renormalize* the weights so that when we group by the first element of each pair and sum the weights, each group has the same sum-of-weights.
 12 | 
 13 | This suggested renormalization is not discussed in [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract).
 14 | 
 15 | ## Toy example with exact transitions
 16 | 
 17 | If $u$ is an input of interest and $v$ are the other inputs, recall that the APC is the expected value of a quantity formed upon sampling from $v$, sampling twice from $u$ conditional on $v$, and computing predictive comparisons using those $u$'s. See equation (2) of [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract).
 18 | 
 19 | If there were enough pairs of points with identical $v$, we could just use the sample distribution of $u$ given $v$. As noted in the paper, we may have few (if any) pairs of points with identical $v$. Even so, let's think through an example where we do have such identical pairs:
 20 | 
 21 | Suppose $v$ consists of only 1 input, which can take one of two values. For simplicity, assume $u$ has exactly two possible (equally likely) values at each $v$ (which are different depending on the value of $v$), so there is only one possible transition at each $v$. Here's an example:
 22 | 
 23 | ```{r}
 24 | exampleDF <- data.frame(
 25 |   v=c(3,3,7,7),  
 26 |   u=c(10,20,12,22) 
 27 |   )[rep(c(1,2,3,4),c(40,40,10,10)),]
 28 | ```
 29 | 
 30 | ```{r echo=FALSE}
 31 | # Count each u/v combination:
 32 | kable(ddply(exampleDF, c("v","u"), function(df) data.frame(CountOfRows = nrow(df))))
 33 | ```
 34 | 
 35 | Say we have a model $\hat{y} = f(u,v)$. I'll choose $\hat{y} = f(u,v) = uv$ for a simple example. (How the model is estimated is completely orthogonal to the questions addressed here, and I'll just pretend it's known exactly.)
 36 | 
 37 | Equation (2) in the paper says the numerator in the APC should be:
 38 | 
 39 | $$(.4)(.5)(.5)(f(20,3) - f(10, 3)) + (0.1)(.5)(.5)(f(22,7) - f(12,7)) $$
 40 | 
 41 | The .5's are the $p(u|v)$'s (and will cancel out in this case). (Terms with transition size 0 aren't included.)
 42 | 
 43 | The denominator is:
 44 | 
 45 | $$(.4)(.5)(.5)((20 - 10) + (.1)(.5)(.5)((22 - 12)$$
 46 | 
 47 | The ratio simplifies to:
 48 | 
 49 | $$.8 \delta_u(10 \rightarrow 20, 3, f) + 0.2 \delta_u(12 \rightarrow 22, 7, f)$$
 50 | 
 51 | This is all overkill for our very simple example, where it's easy to see that the APC is just $(.8)(3) + (.2)(6)$. But I wanted to be very concrete.
 52 | 
 53 | I'll compute it:
 54 | 
 55 | ```{r}
 56 | f <- function(u, v) return(u*v)
 57 | ApcExact <- .8*(f(20,3) - f(10,3))/10 + .2*(f(22,7) - f(12,7))/10
 58 | ApcExact
 59 | ```
 60 | 
 61 | ## Now without exact duplicates
 62 | 
 63 | Now imagine we don't have any exact duplicates of $v$. To get a corresponding example like that, I'll modify the first example by adding a really tiny bit of noise to $v$: $v_{new} = v + N(0,\epsilon)$.
 64 | 
 65 | ```{r}
 66 | exampleDF2 <- transform(exampleDF, v = v + rnorm(nrow(exampleDF), sd=.001))
 67 | ```
 68 | 
 69 | Now we form pairs and compute weights as described in the paper. Here's a sample of the resulting data frame of pairs, just to get a sense of what it looks like:
 70 | 
 71 | ```{r}
 72 | pairsDF <- GetPairs(exampleDF2, u="u", v="v", renormalizeWeights=FALSE)
 73 | ```
 74 | 
 75 | ```{r echo=FALSE}
 76 | kable(pairsDF[sample(1:nrow(pairsDF), 12), ], row.names=FALSE)
 77 | ```
 78 | 
 79 | Now pairs with nearby $v$'s (which would have been the same $v$'s previously) have high weights, where pairs from far-away $v$'s (which were different $v$'s in the previous example) have low weights. That's good.
 80 | 
 81 | But $v$ near 3 now has more weight in the data set for two reasons:
 82 | 
 83 | 1. we started with more $v$'s near 3, so there are more rows with $v$ near 3 as the first element of the pair; and
 84 | 2. each time $v$ is near $3$ in the first element of each pair, there are more nearby $v$'s to pair with, so we get higher Weights.
 85 | 
 86 | Reason (1) is good, but reason (2) is not so good.
 87 | 
 88 | In the data frame of pairs, the weights are all close to 0.14 or 1. Let's look at the joint distribution of $u$ and $v$ in just the pairs with weights close to 1:
 89 | 
 90 | ```{r}
 91 | pairsDF <- data.frame(vRounded = round(pairsDF$v), pairsDF)
 92 | pairsHighWeightsDF <- subset(pairsDF, Weight > 0.9)
 93 | ddply(pairsHighWeightsDF,
 94 |       c("vRounded","u"), 
 95 |       function(df) data.frame(CountOfRows = nrow(df),
 96 |                               ProportionOfRows = nrow(df)/nrow(pairsHighWeightsDF)))
 97 | ```
 98 | 
 99 | We see that $v$'s near 7 makes up only about 5.7% of the pairs. (It would be exactly $(.2)(.2) = 4$%, except that when we form pairs to compute the APC we don't pair any row with itself.)
100 | 
101 | If we form the APC based on these pairs and these weights, we weight the $v$'s near 3 too much, so our APC is too low:
102 | 
103 | ```{r}
104 | pairsDF$yHat1 <- f(pairsDF$u, pairsDF$v)
105 | pairsDF$yHat2 <- f(pairsDF$u.B, pairsDF$v)
106 | pairsDF$uDiff <- pairsDF$u.B - pairsDF$u
107 | ApcApprox1 <- 
108 |   with(pairsDF,
109 |        sum(Weight * (yHat2 - yHat1) * sign(uDiff)) / sum(Weight * uDiff * sign(uDiff)))
110 | ApcApprox1
111 | ```
112 | 
113 | I showed the full computation above, but we can also use the ```GetAPC``` function from this package:
114 | 
115 | ```{r}
116 | GetSingleInputApcs(function(df) return(df$u * df$v), exampleDF2, u="u", v="v", renormalizeWeights=FALSE)$PerUnitInput.Signed
117 | ```
118 | 
119 | Instead, we can normalize Weights so that within each first element of the pair. 
120 | 
121 | ```{r}
122 | pairsDFWeightsNormalized <- ddply(pairsDF, "OriginalRowNumber", transform, Weight = Weight/sum(Weight))
123 | ApcApprox2 <- 
124 |   with(pairsDFWeightsNormalized, 
125 |        sum(Weight * (yHat2 - yHat1) * sign(uDiff)) / sum(Weight * uDiff * sign(uDiff)))
126 | ApcApprox2
127 | ```
128 | 
129 | These renormalized Weights are the ones returned from ```GetPairs``` by default, and used in ```GetAPC``` by default:
130 | 
131 | ```{r}
132 | GetSingleInputApcs(function(df) return(df$u * df$v), exampleDF2, u="u", v="v", renormalizeWeights=TRUE)$PerUnitInput.Signed
133 | ```
134 | 
135 | 


--------------------------------------------------------------------------------
/_site/notes/template:
--------------------------------------------------------------------------------
1 | #!html_output#
2 | 


--------------------------------------------------------------------------------
/_site/notes/transition-plots.Rmd:
--------------------------------------------------------------------------------
 1 | ```{r message=FALSE, echo=FALSE}
 2 | library(knitr)
 3 | knitr::opts_chunk$set(tidy = FALSE, message=FALSE, warning=FALSE, fig.align='center')
 4 | ```
 5 | 
 6 | ```{r echo=TRUE}
 7 | library(predcomps)
 8 | library(ggplot2)
 9 | ```
10 | 
11 | - remove arrow heads -- dots at both ends maybe
12 | 
13 | 
14 | # Transition Plots: A new way to visualize models
15 | 
16 | Transition plots are a way of looking at looking at the comparisons behind APCs in a less aggregated form.
17 | 
18 | Instead of computing an average (one-point summary), we directly show the predictive comparisons. Since showing all of the pairs would cause overplotting (and not reflect the weights), we consider each original data point $(u_{orig},v)$ (where $u_{orig}$ is the input of interest and $v$ the other inputs) with only one resampled $u$ (sampled from all of the pairs between that data point and the others, with the same weights as used in APCs).
19 | 
20 | We  predict the output $y_{orig}$ at $(u_{orig},v)$ with the original $u$, and $y_{new}$ at $(u_{new},v)$ at the resampled $u$, and draw an arrow from $(u_{orig},y_{orig})$ to $(u_{new},y_{new})$. As in APCs, $v$ is held constant for each transition.
21 | 
22 | I will consider a small example with two inputs, first with no interaction and then with an interaction. (In this simple model with only two independent inputs, the transition plot is probably not the best way to visualize the model, but it serves as an example to illustrate a technique that will be useful for more complicated models. For example, the varying size of the arrows in this case is just a distraction, as the value of $v$ does not in this case affect the distribution of $u$-transitions.)
23 | 
24 | The simulated examples here will have $u$ and $v$ independent, and the first model will have:
25 | 
26 | $$\mathcal{E}[y_1] = 2v_1 + 2v_2$$
27 | 
28 | ```{r CreateInputFeatures}
29 | N <- 100
30 | df <- data.frame(u=rnorm(N), v=rnorm(N))
31 | df$y1 = 2*df$u + 2*df$v + rnorm(N)
32 | lm1 <- lm(y1 ~ u + v, data=df)
33 | print(lm1)
34 | TransitionPlot(function(df) predict(lm1, df), df, "u", "v") + ylab("y1")
35 | ```
36 | 
37 | Here is an example with an interaction:
38 | 
39 | $$\mathcal{E}[y_1] =  v_1 + v_2 + 2u_1v_1$$
40 | 
41 | ```{r}
42 | df$y2 = df$u + df$v + 2*df$v*df$u + rnorm(N)
43 | lm2 <- lm(y2 ~ u*v, data=df)
44 | print(lm2)
45 | TransitionPlot(function(df) predict(lm2, df), df, "u", "v") + ylab("y2")
46 | ```
47 | 
48 | Since the plot object returned by `TransitionPlot` includes its data, we can examine things in more detail by drawing our own arrows:
49 | 
50 | ```{r}
51 | p <- TransitionPlot(function(df) predict(lm2, df), df, "u", "v", plot=FALSE)
52 | 
53 | ggplot(p$data) +
54 |   geom_segment(aes(x = u, y = output, xend = u.B, yend = outputNew, color=v), arrow = arrow()) +
55 |   ylab("y2")
56 | ```
57 | 


--------------------------------------------------------------------------------
/_site/temp/weights-exploration.R:
--------------------------------------------------------------------------------
  1 | library(randomForest)
  2 | library(predcomps)
  3 | library(ggplot2)
  4 | theme_set(theme_gray(base_size = 18))
  5 | 
  6 | credit <- read.csv("~/Downloads/cs-training.csv")[,-1]
  7 | 
  8 | credit2 <- credit
  9 | credit2 <- subset(credit2, !is.na(MonthlyIncome) &
 10 |                     NumberOfTime30.59DaysPastDueNotWorse < 5 & 
 11 |                     RevolvingUtilizationOfUnsecuredLines <= 2 &
 12 |                     NumberOfTime30.59DaysPastDueNotWorse <= 5 &
 13 |                     NumberOfTime60.89DaysPastDueNotWorse <= 5 &
 14 |                     NumberOfTimes90DaysLate <= 5 &
 15 |                     MonthlyIncome < 5e4 &
 16 |                     DebtRatio < 2 &
 17 |                     NumberRealEstateLoansOrLines <= 12 &
 18 |                     NumberOfDependents < 10                  
 19 | )
 20 | 
 21 | for (col in names(credit2)) {
 22 |   print(qplot(credit2[[col]]) + ggtitle(col))
 23 | }
 24 | 
 25 | credit2$SeriousDlqin2yrs <- factor(credit2$SeriousDlqin2yrs)
 26 | rfFit <- randomForest(SeriousDlqin2yrs ~ ., data=credit2, ntree=10)
 27 | 
 28 | credit2Small <- credit2[sample.int(nrow(credit2), size = 1000), ]
 29 | 
 30 | 
 31 | X1 <- credit2[sample.int(nrow(credit2), size = 50), ]
 32 | X2 <- credit2
 33 | 
 34 | dim(X1)
 35 | 
 36 | u="age"
 37 | v = c("RevolvingUtilizationOfUnsecuredLines", "NumberOfTime30.59DaysPastDueNotWorse", 
 38 |       "DebtRatio", "MonthlyIncome", "NumberOfOpenCreditLinesAndLoans", 
 39 |       "NumberOfTimes90DaysLate", "NumberRealEstateLoansOrLines", "NumberOfTime60.89DaysPastDueNotWorse", 
 40 |       "NumberOfDependents")
 41 | mahalanobisConstantTerm = 1
 42 | renormalizeWeights=TRUE
 43 | removeDiagonal=TRUE
 44 | onlyIncludeNearestN=500
 45 | 
 46 | X1 <- X1[c(v,u)]
 47 | X2 <- X2[c(v,u)]
 48 | 
 49 | X1$OriginalRowNumber <- 1:nrow(X1)
 50 | X2$OriginalRowNumber.B <- 1:nrow(X2)
 51 | 
 52 | vMatrix1 <- as.matrix(X1[,v])
 53 | vMatrix2 <- as.matrix(X2[,v])
 54 | 
 55 | 
 56 | covV=cov(vMatrix2)
 57 | 
 58 | distMatrix <- apply(vMatrix1, 1, function(row) mahalanobis(vMatrix2, row, covV))
 59 | dim(distMatrix)
 60 | 
 61 | colnames(distMatrix) <- 1:ncol(distMatrix)
 62 | rownames(distMatrix) <- 1:nrow(distMatrix)
 63 | distDF <- as.data.frame(as.table(distMatrix))
 64 | names(distDF) <- c("OriginalRowNumber.B", "OriginalRowNumber", "MahalanobisDistance")
 65 | 
 66 | distDF <- distDF %.% group_by(OriginalRowNumber) %.% filter(rank(MahalanobisDistance) < onlyIncludeNearestN)
 67 | pairs <- merge(X1, distDF, by = "OriginalRowNumber")
 68 | pairs <- merge(X2, pairs, by = "OriginalRowNumber.B", suffixes = c(".B", ""))
 69 | pairs$Weight <- 1/(mahalanobisConstantTerm + pairs$MahalanobisDistance)
 70 | if (removeDiagonal) {
 71 |   pairs <- subset(pairs, OriginalRowNumber != OriginalRowNumber.B) #remove pairs where both elements are the same
 72 | }
 73 | 
 74 | 
 75 | if (renormalizeWeights) {
 76 |   #     pairs <- pairs %.% group_by(OriginalRowNumber) %.% mutate(Weight = Weight/sum(Weight))
 77 |   #     browser()
 78 |   pairs <- mutate(group_by(pairs, OriginalRowNumber),
 79 |                   Weight = Weight/sum(Weight))
 80 |   pairs <- data.frame(pairs)
 81 | } #normalizing AFTER removing pairs from same row as each other
 82 | 
 83 | 
 84 | # pairs <- pairs[c("OriginalRowNumber",u,v,paste0(u,".B"),"Weight")]
 85 | 
 86 | 
 87 | PlotPairCumulativeWeights(pairs)
 88 | 
 89 | pairs$OriginalRowNumber
 90 | ggplot(pairs) + geom_histogram(aes(x=MahalanobisDistance)) + 
 91 |   #scale_x_log10(limits=c(1e-3,1e3)) + 
 92 |   facet_wrap(~ OriginalRowNumber)
 93 | ggplot(pairs) + geom_histogram(aes(x=MahalanobisDistance, weight = Weight)) + 
 94 |   #scale_x_log10(limits=c(1e-3,1e3)) + 
 95 |   facet_wrap(~ OriginalRowNumber)
 96 | 
 97 | library(dplyr)
 98 | 
 99 |   
100 | PlotPairCumulativeWeights(GetPairs(credit2Small, u=u, v=v, mahalanobisConstantTerm=0))
101 | 


--------------------------------------------------------------------------------
/_site/tests/run-all.R:
--------------------------------------------------------------------------------
1 | # should be executed from package root
2 | library(testthat)
3 | library(devtools)
4 | 
5 | setwd("~/github/predcomps")
6 | document(".")
7 | install(".")
8 | test_package("predcomps")
9 | 


--------------------------------------------------------------------------------
/_site/tests/test-apc.R:
--------------------------------------------------------------------------------
 1 | test_that("APC matches coefficient exactly for linear model", {
 2 |   
 3 |   df <- data.frame(X = rep(c(1,2),2),
 4 |                    Y = c(1,2,3,4))
 5 |   
 6 |   predictionFunction <- function(df) 2*df$X + 3*df$Y
 7 |   result <- GetSingleInputApcs(predictionFunction, df, u="X", v="Y")
 8 |   expect_that(result$PerUnitInput.Signed, equals(2)) 
 9 | })
10 | 


--------------------------------------------------------------------------------
/_site/tests/test-pairs.R:
--------------------------------------------------------------------------------
 1 | MakeComparable <- function(df) {
 2 |   df <- round(df, digits = 5)
 3 |   return(df[do.call(order, df), ])
 4 | }
 5 | 
 6 | test_that("GetPairs works right in a small example", {
 7 |   
 8 |   df <- data.frame(X = rep(c(1,2),2),
 9 |                    Y = rep(c(2,4),2))
10 |   pairsActual <- GetPairs(df, "X", "Y")
11 |   pairsExpected <- data.frame(OriginalRowNumber = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L), 
12 |                               X = c(1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2), 
13 |                               Y = c(2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4), 
14 |                               X.B = c(2, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1), 
15 |                               Weight = c(0.166666666666667, 0.666666666666667, 0.166666666666667, 0.166666666666667, 0.166666666666667, 0.666666666666667, 0.666666666666667, 0.166666666666667, 0.166666666666667, 0.166666666666667, 0.666666666666667, 0.166666666666667))
16 |   pairsActual <- pairsActual[names(pairsExpected)]
17 |   expect_that(all.equal(MakeComparable(pairsActual), MakeComparable(pairsExpected)), is_true()) 
18 | })
19 | 


--------------------------------------------------------------------------------
/man/ComputeAPCFromPairs.Rd:
--------------------------------------------------------------------------------
 1 | \name{ComputeApcFromPairs}
 2 | \alias{ComputeApcFromPairs}
 3 | \title{ComputeApcFromPairs}
 4 | \usage{
 5 | ComputeApcFromPairs(predictionFunction, pairs, u, v, absolute = FALSE,
 6 |   impact = FALSE)
 7 | }
 8 | \description{
 9 | ComputeApcFromPairs
10 | }
11 | 
12 | 


--------------------------------------------------------------------------------
/man/GetComparisonDFFromPairs.Rd:
--------------------------------------------------------------------------------
 1 | \name{GetComparisonDFFromPairs}
 2 | \alias{GetComparisonDFFromPairs}
 3 | \title{GetComparisonDFFromPairs}
 4 | \usage{
 5 | GetComparisonDFFromPairs(predictionFunction, pairs, u, v)
 6 | }
 7 | \description{
 8 | (abstracted this into a separate function from
 9 | \code{GetApc} so we can more easily do things like
10 | \code{GetApcWithAbsolute})
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/man/GetPairs.Rd:
--------------------------------------------------------------------------------
 1 | \name{GetPairs}
 2 | \alias{GetPairs}
 3 | \title{GetPairs}
 4 | \usage{
 5 | GetPairs(X, u, v, numForTransitionStart = NULL, numForTransitionEnd = NULL,
 6 |   onlyIncludeNearestN = NULL, mahalanobisConstantTerm = 1)
 7 | }
 8 | \arguments{
 9 |   \item{X}{data frame}
10 | 
11 |   \item{u}{input of interest}
12 | 
13 |   \item{v}{other inputs}
14 | 
15 |   \item{mahalanobisConstantTerm}{Weights are (1 /
16 |   (mahalanobisConstantTerm + Mahalanobis distance))}
17 | 
18 |   \item{numForTransitionStart}{number of rows to use as the
19 |   start points of transitions (defaulting to `NULL`, we use
20 |   all rows)}
21 | 
22 |   \item{numForTransitionEnd}{number of rows to use as
23 |   potential end points of transitions (defaulting to
24 |   `NULL`, we use all rows)}
25 | 
26 |   \item{onlyIncludeNearestN}{for each transition start, we
27 |   only include as transition end points the nearest
28 |   `onlyIncludeNearestN` rows (defaulting to `NULL`, we use
29 |   all rows)}
30 | }
31 | \value{
32 | a data frame with the inputs \code{v} from the first of
33 | each pair, \code{u} from each half (with ".B" appended to
34 | the second), and the Mahalanobis distances between the
35 | pairs.
36 | }
37 | \description{
38 | Form all pairs of rows in \code{X} and compute Mahalanobis
39 | distances based on \code{v}.
40 | }
41 | \details{
42 | To help with computational constraints, you have the option
43 | to not form pairs between all rows of \code{X} but instead
44 | of specify a certain number (\code{numForTransitionStart})
45 | to randomly be selected as rows from which transitions
46 | start, and another number (\code{numForTransitionEnd}) to
47 | be randomly selected as where transitions end. We then form
48 | all pairs between transition-start rows and transition-end
49 | rows.
50 | 
51 | In order to get a smaller data frame for later
52 | manipulations (and maybe just because it's a good idea),
53 | you can also specify \code{onlyIncludeNearestN}, in which
54 | case we return only the nearest \code{onlyIncludeNearestN}
55 | transition ends for each transition start (instead of all
56 | pairs).
57 | }
58 | \examples{
59 | v <- rnorm(100)
60 | u <- v + 0.3*rnorm(100)
61 | qplot(v,u)
62 | X = data.frame(v=v,u=u)
63 | pairsDF <- GetPairs(X, "v", "u")
64 | pairsDFRow1 <- subset(pairsDF, OriginalRowNumber==1)
65 | # When we subset to one "original row number", all of the v's are the same:
66 | print(pairsDFRow1$v)
67 | # ... and u's corresponding to closer v.B (the v in the second element of the pair) have higher weight:
68 | qplot(u.B, Weight, data=pairsDFRow1)
69 | }
70 | 
71 | 


--------------------------------------------------------------------------------
/man/GetPredCompsDF.Rd:
--------------------------------------------------------------------------------
 1 | \name{GetPredCompsDF}
 2 | \alias{GetPredCompsDF}
 3 | \title{GetApcDF}
 4 | \usage{
 5 | GetPredCompsDF(model, df, inputVars = NULL, ...)
 6 | }
 7 | \arguments{
 8 |   \item{model}{Either a function (from a data frame to
 9 |   vector of predictions) or a model we know how to deal
10 |   with (lm, glm)}
11 | 
12 |   \item{df}{data frame with data}
13 | 
14 |   \item{inputVars}{inputs to the model}
15 | 
16 |   \item{...}{extra parguments passed to GetPairs used to
17 |   control Weight function}
18 | }
19 | \description{
20 | makes average predictive comparison for all specified
21 | inputs
22 | }
23 | \examples{
24 | n <- 200
25 | x1 <- runif(n = n, min = 0, max = 1)
26 | x2 <- runif(n = n, min = 0, max = 1)
27 | x3 <- runif(n = n, min = 0, max = 10)
28 | y <- 2 * x1 + (-2) * x2 + 1 * x3 + rnorm(n, sd = 0.1)
29 | df <- data.frame(x1, x2, x3, y)
30 | fittedLm <- lm(y ~ ., data = df)
31 | apcDF <- GetPredCompsDF(fittedLm, df = df)
32 | apcDF
33 | }
34 | 
35 | 


--------------------------------------------------------------------------------
/man/GetSingleInputApcs.Rd:
--------------------------------------------------------------------------------
 1 | \name{GetSingleInputApcs}
 2 | \alias{GetSingleInputApcs}
 3 | \title{GetSingleInputApcs}
 4 | \usage{
 5 | GetSingleInputApcs(predictionFunction, X, u, v, ...)
 6 | }
 7 | \arguments{
 8 |   \item{predictionFunction}{this could be a function (which
 9 |   takes data frame and makes returns a vector of
10 |   predictions) or an object of class `lm`, `glm`, or
11 |   `randomForest`}
12 | 
13 |   \item{X}{a data frame with all inputs}
14 | 
15 |   \item{u}{a string naming the input of interest}
16 | 
17 |   \item{v}{a string naming the other inputs}
18 | 
19 |   \item{...}{other arguments to be passed to `GetPairs`}
20 | }
21 | \value{
22 | a list with: \code{signed} (the usual Apc) and
23 | \code{absolute} (Apc applied to the absolute value of the
24 | differences)
25 | }
26 | \description{
27 | makes predictive comparison summaries (both per unit input
28 | and impact, both absolute and signed) by forming an data
29 | frame of pairs with appropriate weights and then calling
30 | `ComputeApcFromPairs`. Only works fore continuous inputs
31 | right now
32 | }
33 | \examples{
34 | n <- 200
35 | x1 <- runif(n = n, min = 0, max = 1)
36 | x2 <- runif(n = n, min = 0, max = 1)
37 | x3 <- runif(n = n, min = 0, max = 10)
38 | y <- 2 * x1 + (-2) * x2 + 1 * x3 + rnorm(n, sd = 0.1)
39 | df <- data.frame(x1, x2, x3, y)
40 | fittedLm <- lm(y ~ ., data = df)
41 | fittedLm
42 | GetSingleInputApcs(fittedLm, df, "x2", c("x1", "x3"))
43 | }
44 | 
45 | 


--------------------------------------------------------------------------------
/man/GetSingleInputPredComps.Rd:
--------------------------------------------------------------------------------
 1 | \name{GetSingleInputPredComps}
 2 | \alias{GetSingleInputPredComps}
 3 | \title{GetSingleInputPredComps}
 4 | \usage{
 5 | GetSingleInputPredComps(predictionFunction, X, u, v, ...)
 6 | }
 7 | \arguments{
 8 |   \item{predictionFunction}{this could be a function (which
 9 |   takes data frame and makes returns a vector of
10 |   predictions) or an object of class `lm`, `glm`, or
11 |   `randomForest`}
12 | 
13 |   \item{X}{a data frame with all inputs}
14 | 
15 |   \item{u}{a string naming the input of interest}
16 | 
17 |   \item{v}{a string naming the other inputs}
18 | 
19 |   \item{...}{other arguments to be passed to `GetPairs`}
20 | }
21 | \value{
22 | a list with: \code{signed} (the usual Apc) and
23 | \code{absolute} (Apc applied to the absolute value of the
24 | differences)
25 | }
26 | \description{
27 | makes predictive comparison summaries (APC and impact,
28 | absolute and signed) by forming an data frame of pairs with
29 | appropriate weights and then calling `ComputeApcFromPairs`.
30 | Only works fore continuous inputs right now
31 | }
32 | 
33 | 


--------------------------------------------------------------------------------
/man/PlotPairCumulativeWeights.Rd:
--------------------------------------------------------------------------------
 1 | \name{PlotPairCumulativeWeights}
 2 | \alias{PlotPairCumulativeWeights}
 3 | \title{PlotPairCumulativeWeights}
 4 | \usage{
 5 | PlotPairCumulativeWeights(pairs, numOriginalRowNumbersToPlot = 20)
 6 | }
 7 | \description{
 8 | For a sample of transition start rows, we plot rank of
 9 | transition end (by increasing weight) vs. cumulative
10 | weight. This gives a sense of how much weight is going into
11 | the nearest points vs. further ones.
12 | }
13 | \examples{
14 | v <- rnorm(100)
15 | u <- v + 0.3*rnorm(100)
16 | X = data.frame(v=v,u=u)
17 | pairsDF <- GetPairs(X, "v", "u")
18 | pairsDFRow1 <- subset(pairsDF, OriginalRowNumber==1)
19 | # For most original rows, we get 75\% of the weight in 50\% of the pairs:
20 | PlotPairCumulativeWeights(pairsDF)
21 | }
22 | 
23 | 


--------------------------------------------------------------------------------
/man/PlotPredCompsDF.Rd:
--------------------------------------------------------------------------------
 1 | \name{PlotPredCompsDF}
 2 | \alias{PlotPredCompsDF}
 3 | \title{PlotApcDF}
 4 | \usage{
 5 | PlotPredCompsDF(apcDF, variant = "Impact")
 6 | }
 7 | \arguments{
 8 |   \item{apcDF}{the output of GetApcDF}
 9 | }
10 | \description{
11 | plots the output of GetApcDF -- this is my preferred
12 | display for now
13 | }
14 | \examples{
15 | n <- 200
16 | x1 <- runif(n = n, min = 0, max = 1)
17 | x2 <- runif(n = n, min = 0, max = 1)
18 | x3 <- runif(n = n, min = 0, max = 10)
19 | y <- 2 * x1 + (-2) * x2 + 1 * x3 + rnorm(n, sd = 0.1)
20 | df <- data.frame(x1, x2, x3, y)
21 | fittedLm <- lm(y ~ ., data = df)
22 | apcDF <- GetPredCompsDF(fittedLm, df = df)
23 | PlotPredCompsDF(apcDF, variant = "PerUnitInput") + theme_gray(base_size = 18)
24 | }
25 | 
26 | 


--------------------------------------------------------------------------------
/notes/apc.Rmd:
--------------------------------------------------------------------------------
 1 | ## APCs
 2 | 
 3 | ### Predictive Comparisons Generalize Regression Coefficients
 4 | 
 5 | At the heart of this package is the idea of a *predictive comparison*: We vary the input of interest holding the other inputs constant, and look at the differences in predicted values. Let $u$ represent the input of interest and $v$ the (vector of) other inputs. Let $f$ be a functinon making predictions, so
 6 | 
 7 | $$\hat{y} = f(u,v)$$
 8 | 
 9 | Our $f$ could come from any predictive model. If we have a statistical model, then probably we would choose 
10 | 
11 | $$f(u,v) = \mathcal{E}[y \mid u, v, \theta]$$
12 | 
13 | (where $\theta$ are the parameters of the model). But we need not have a statistical model at all. For example, the prediction function $f$ could come from a random forest, or a support vector machine.
14 | 
15 | Given the function $f$ and a choice of $u_1$, $u_2$, and $v$, we can compute
16 | 
17 | $$\delta_{u_1 \rightarrow u_2, v} = \frac{f(u_2, v) - f(u_1, v)}{u_2-u_1}$$
18 | 
19 | If $f$ were a linear model with no interactions, the above would not depend on the particular choices of $u_1$, $u_2$, and $v$ and would be the regression coefficient corresponding to $u$. This is the formal sense in which predictive comparisons generalize regression coefficients. Since for more complicated models this varies as the inputs vary, we will take an average across a well chosen set of inputs. 
20 | 
21 | ### Choice of Inputs
22 | 
23 | The APC is defined as
24 | 
25 | $$\frac{\mathcal{E}[\Delta_f]}{\mathcal{E}[\Delta_u]}$$
26 | 
27 | where $\Delta_f = f(u_2,v) - f(u_1,v)$, $\Delta_u = u_2 - u_1$, and $\mathcal{E}$ is expectation under the following process:
28 | 
29 | 1. sample $v$ from the (marginal) distribution of the corresponding inputs
30 | 2. sample $u_1$ and $u_2$ independently from the distribution of $u$ conditional on $v$
31 | 
32 | The reason for this definition is that we want to use representative values of $v$, and transitions in $u$ that are representative of what really occurs at those values of $v$.
33 | 
34 | The fact that we are computing the numerator and denominator separately (rather than taking an expected value of $\delta_{u_1 \rightarrow u_2, v}$) amounts to weighting by the size of $(u_2 - u_1)$. This avoids having the result excessively influenced by small changes in $u$.
35 | 
36 | ### Estimation
37 | 
38 | The rows of our data represent samples from the joint distribution $u$, $v$, so each row amounts to a sample from $v$ followed by a sample $u_1$ conditional on $v$. The difficult thing is drawing another sample $u_2$ conditional on $v$. We approximate these by assigning weights to rows based on the proximity of the $v$ in that row to the $v$ in the row in question. The weights are:
39 | 
40 | $$\frac{1}{\text{mahalanobisConstantTerm} + \text{(mahalanobis distance)}}$$
41 | 
42 | The [*mahalanobis distance*](https://en.wikipedia.org/wiki/Mahalanobis_distance) is a unitless version of distance that takes into account the correlation structure of $v$. The *mahalanobisConstantTerm* (which defaults to 1, but this is not always an appropriate choice) prevents all of the weight from going to the closest points. More work needs to be done in thinking about the weights.
43 | 
44 | For more details, see [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract) (section 4), [my note](more-pairs-and-weights.html) explaining a small change from the weights described in the paper (renormalization) and reasons for considering only a set of the closest $N$ points. You can also look at [my code](https://github.com/dchudz/predcomps/blob/master/R/pairs.R) that computes the appropriate weights.
45 | 
46 | ### Absolute Version
47 | 
48 | The absolute APC (as opposed to the signed version described above) replaces $\Delta_f = f(u_2,v) - f(u_1,v)$ above with $|\Delta_f| = |f(u_2,v) - f(u_1,v)|$. For an extreme artificial example see e.g. the input $u_8$ in my [simulated linear model with interactions](examples-simulated-linear-model-interactions.html), where the signed APC is roughly 0 but the absolute APC is large.
49 | 
50 | By default, I always compute and display an absolute version of the APC alongside the signed version. 
51 | 
52 | ### A Small Example
53 | 
54 | This is a example running APCs on a simulated linear model with independent inputs and no interactions. For more involved examples, see the [examples](examples-overview.html) section.
55 | 
56 | The inputs ($x_1$, $x_2$, $x_3$) are independent, with
57 | 
58 | $$y \sim 2x_1 - 2x_2 + x_3 + \mathcal{N}(0,.1)$$
59 | 
60 | First we set up the data:
61 | 
62 | ```{r}
63 | n <- 200
64 | x1 <- runif(n=n, min=0, max=1)
65 | x2 <- runif(n=n, min=0, max=1)
66 | x3 <- runif(n=n, min=0, max=10)
67 | y <- 2*x1 + (-2)*x2 + 1*x3 + rnorm(n, sd=.1)
68 | df <- data.frame(x1, x2, x3, y)
69 | ```
70 | 
71 | Then we fit a linear model:
72 | 
73 | ```{r}
74 | fittedLm <- lm(y ~ ., data=df)
75 | fittedLm
76 | ```
77 | 
78 | We can then plot the average predictive comparisons:
79 | 
80 | ```{r results='hide', message=FALSE}
81 | library(predcomps)
82 | apcDF <- GetPredCompsDF(fittedLm, df=df)
83 | PlotPredCompsDF(apcDF, variant="PerUnitInput") + theme_gray(base_size = 18)
84 | ```
85 | 
86 | Using different shapes / colors, both the absolute and signed versions are plotted. For symmetry, the absolute version is plotted with both a positive and negative sign. Since this is a linear model with no interactions, the signed APCs match those from the fitted linear model. 
87 | 
88 | The call to `GetPredCompsDF` returns this data frame:
89 | 
90 | ```{r}
91 | apcDF
92 | ```
93 | 
94 | The columns plotted here are `PerUnitInput.Signed` and `Apc.Absolute`. The [next section](impact.html) desceibes those columns labeled "Impact".
95 | 


--------------------------------------------------------------------------------
/notes/examples/diamonds.Rmd:
--------------------------------------------------------------------------------
  1 | # A new way to visualize models
  2 | 
  3 | ```{r message=FALSE, echo=FALSE}
  4 | library(knitr)
  5 | knitr::opts_chunk$set(tidy = FALSE, message=FALSE, warning=FALSE, fig.align='center')
  6 | library(randomForest)
  7 | library(plyr)
  8 | library(predcomps)
  9 | library(ggplot2)
 10 | ```
 11 | 
 12 | ```{r}
 13 | diamonds <- transform(diamonds, clarity = 
 14 |                         factor(clarity, levels =c("SI1", "SI2", "VS1", "VS2", "VVS1", "VVS2", "IF")))
 15 | 
 16 | diamonds2 <- transform(diamonds,
 17 |                        clarity = as.integer(clarity),
 18 |                        cut = as.integer(cut),
 19 |                        color = as.integer(color))
 20 | 
 21 | diamonds3 <- subset(diamonds2, !is.na(clarity))
 22 | 
 23 | rf <- randomForest(price ~ carat + cut + color + clarity, data=diamonds3, ntree=20)
 24 | diamondsSmall <- diamonds3[sample.int(nrow(diamonds3), size=500), ]
 25 | 
 26 | apcDf <- GetPredCompsDF(function(df) predict(rf, df), diamondsSmall, inputVars=row.names(rf$importance))
 27 | PlotPredCompsDF(apcDf)
 28 | PlotPredCompsDF(apcDf, variant="PerUnitInput")
 29 | 
 30 | 
 31 | pairs <- GetPairs(diamondsSmall, "carat", c("cut", "color", "clarity"), 
 32 |                   removeDiagonal=FALSE,
 33 |                   mahalanobisConstantTerm=.1)
 34 | 
 35 | u <- "carat"
 36 | pairsSampled <- ddply(pairs, 
 37 |                       "OriginalRowNumber", function(df) {
 38 |                         df[sample.int(nrow(df), size=5, prob=df$Weight), ]
 39 |                         })
 40 | 
 41 | originalRowNumbersToUse <- sample(unique(pairsSampled$OriginalRowNumber), size=10)
 42 | 
 43 | pairsSampled$carat <- pairsSampled$carat.B
 44 | 
 45 | pairsSampled$Prediction <- predict(rf, pairsSampled)
 46 | 
 47 | ggplot(subset(pairsSampled, OriginalRowNumber %in% originalRowNumbersToUse),
 48 |        aes(x=carat, y=Prediction, color=factor(OriginalRowNumber, levels=sample(originalRowNumbersToUse)))) +
 49 |   geom_point() +
 50 |   geom_line(size=.2) 
 51 | 
 52 | last_plot() + scale_x_continuous(limits=c(0,1)) + scale_y_continuous(limits=c(0,5000))
 53 | 
 54 | 
 55 | 
 56 | u <- "clarity"
 57 | v <- c("carat", "cut", "color")
 58 | pairs <- GetPairs(diamondsSmall, u, v, 
 59 |                   removeDiagonal=FALSE,
 60 |                   mahalanobisConstantTerm=.1)
 61 | 
 62 | pairsSampled <- ddply(pairs, 
 63 |                       "OriginalRowNumber", function(df) {
 64 |                         df[sample.int(nrow(df), size=5, prob=df$Weight), ]
 65 |                         })
 66 | 
 67 | originalRowNumbersToUse <- sample(unique(pairsSampled$OriginalRowNumber), size=10)
 68 | 
 69 | pairsSampled[[u]] <- pairsSampled[[paste0(u,".B")]]
 70 | 
 71 | pairsSampled$Prediction <- predict(rf, pairsSampled)
 72 | 
 73 | pairsSampled$OriginalRowNumberFactor <- factor(pairsSampled$OriginalRowNumber, levels=sample(originalRowNumbersToUse))
 74 | ggplot(subset(pairsSampled, OriginalRowNumber %in% originalRowNumbersToUse),
 75 |        aes_string(x=u, y="Prediction", color="OriginalRowNumberFactor")) +
 76 |   geom_point() +
 77 |   geom_line(size=.2) 
 78 | 
 79 | 
 80 | 
 81 | u <- "color"
 82 | v <- c("cut", "clarity", "carat")
 83 | pairs <- GetPairs(diamondsSmall, u, v, 
 84 |                   removeDiagonal=FALSE,
 85 |                   mahalanobisConstantTerm=.1)
 86 | 
 87 | pairsSampled <- ddply(pairs, 
 88 |                       "OriginalRowNumber", function(df) {
 89 |                         df[sample.int(nrow(df), size=5, prob=df$Weight), ]
 90 |                         })
 91 | 
 92 | originalRowNumbersToUse <- sample(unique(pairsSampled$OriginalRowNumber), size=10)
 93 | 
 94 | pairsSampled[[u]] <- pairsSampled[[paste0(u,".B")]]
 95 | 
 96 | pairsSampled$Prediction <- predict(rf, pairsSampled)
 97 | 
 98 | pairsSampled$OriginalRowNumberFactor <- factor(pairsSampled$OriginalRowNumber, levels=sample(originalRowNumbersToUse))
 99 | ggplot(subset(pairsSampled, OriginalRowNumber %in% originalRowNumbersToUse),
100 |        aes_string(x=u, y="Prediction", color="OriginalRowNumberFactor")) +
101 |   geom_point() +
102 |   geom_line(size=.2) 
103 | 
104 | 
105 | 
106 | 
107 | 
108 | ```
109 | 


--------------------------------------------------------------------------------
/notes/examples/loan-defaults.Rmd:
--------------------------------------------------------------------------------
  1 | ## Credit Default Example
  2 | 
  3 | ```{r message=FALSE, echo=FALSE}
  4 | library(knitr)
  5 | knitr::opts_chunk$set(tidy = FALSE, message=FALSE, warning=FALSE, fig.align='center', fig.width = 14, size=8)
  6 | library(randomForest)
  7 | library(predcomps)
  8 | library(ggplot2)
  9 | theme_set(theme_gray(base_size = 18))
 10 | library(dplyr)
 11 | library(gridExtra)
 12 | ```
 13 |  
 14 | This example is based on the training data set [found here](https://www.kaggle.com/c/GiveMeSomeCredit). We build a model to predict:
 15 | 
 16 | - **SeriousDlqin2yrs** (target variable):  Person experienced 90 days past due delinquency or worse 
 17 | 
 18 | The input features are:
 19 | 
 20 | - **RevolvingUtilizationOfUnsecuredLines**:	Total balance on credit cards and personal lines of credit except real estate and no installment debt like car loans divided by the sum of credit limits
 21 | - **age**:	Age of borrower in years
 22 | - **NumberOfTime30-59DaysPastDueNotWorse**:	Number of times borrower has been 30-59 days past due but no worse in the last 2 years.
 23 | - **NumberOfTime60-89DaysPastDueNotWorse**:	Number of times borrower has been 60-89 days past due but no worse in the last 2 years.
 24 | - **NumberOfTimes90DaysLate**:	Number of times borrower has been 90 days or more past due.
 25 | - **DebtRatio**:	Monthly debt payments, alimony,living costs divided by monthy gross income
 26 | - **MonthlyIncome**:	Monthly income
 27 | - **NumberOfOpenCreditLinesAndLoans**:	Number of Open loans (installment like car loan or mortgage) and Lines of credit (e.g. credit cards)
 28 | - **NumberRealEstateLoansOrLines**:	Number of mortgage and real estate loans including home equity lines of credit
 29 | - **NumberOfDependents**:	Number of dependents in family excluding themselves (spouse, children etc.)
 30 | 
 31 | ### Parameters:
 32 | 
 33 | These are some parameters controlling the aggregate predictive comparisons:
 34 | 
 35 | ```{r}
 36 | #  We will transitions starting at each of 500 random rows
 37 | numForTransitionStart <- 500
 38 | #  ... going to each of 10,000 other random rows:
 39 | numForTransitionEnd <- 10000
 40 | #  ... keeping only the nearest 100 pairs for each start:
 41 | onlyIncludeNearestN = 100
 42 | ```
 43 | 
 44 | And for the random forest:
 45 | 
 46 | ```{r}
 47 | # 100 trees for random forest 
 48 | ntree = 100
 49 | ```
 50 | 
 51 | 
 52 | ```{r echo=FALSE}
 53 | # Remove some outliers
 54 | credit <- read.csv("~/Downloads/cs-training.csv")[,-1]
 55 | credit <- subset(credit, !is.na(MonthlyIncome) &
 56 |                     NumberOfTime30.59DaysPastDueNotWorse < 5 & 
 57 |                     RevolvingUtilizationOfUnsecuredLines <= 2 &
 58 |                     NumberOfTime30.59DaysPastDueNotWorse <= 5 &
 59 |                     NumberOfTime60.89DaysPastDueNotWorse <= 5 &
 60 |                     NumberOfTimes90DaysLate <= 5 &
 61 |                     MonthlyIncome < 5e4 &
 62 |                     DebtRatio < 2 &
 63 |                     NumberRealEstateLoansOrLines <= 12 &
 64 |                     NumberOfDependents < 10                  
 65 |                   )
 66 | ```
 67 | 
 68 | ### Input Distribution
 69 | 
 70 | The distribution of the inputs (after removing some outliers to make things more manageable):
 71 | 
 72 | ```{r fig.height = 12, echo=FALSE}
 73 | histograms <- Map(function(colName) {
 74 |   qplot(credit[[colName]]) + 
 75 |     ggtitle(colName) +
 76 |     xlab("")},
 77 |   setdiff(names(credit), "SeriousDlqin2yrs"))
 78 | allHistograms <- do.call(arrangeGrob, c(histograms, ncol=2))
 79 | print(allHistograms)
 80 | ```
 81 | 
 82 | ### Model Building
 83 | 
 84 | We'll use a random forest for this example:
 85 | 
 86 | ```{r}
 87 | set.seed(1)
 88 | # Turning the response to type "factor" causes the RF to be build for classification:
 89 | credit$SeriousDlqin2yrs <- factor(credit$SeriousDlqin2yrs) 
 90 | rfFit <- randomForest(SeriousDlqin2yrs ~ ., data=credit, ntree=ntree)
 91 | ```
 92 | 
 93 | ### Aggregate Predictive Comparisons
 94 | 
 95 | ```{r message=FALSE, results='hide'}
 96 | set.seed(1)
 97 | apcDF <- GetPredCompsDF(rfFit, credit,
 98 |                         numForTransitionStart = numForTransitionStart,
 99 |                         numForTransitionEnd = numForTransitionEnd,
100 |                         onlyIncludeNearestN = onlyIncludeNearestN)
101 | ```
102 | 
103 | This is a table of the aggregate predictive comparisons:
104 | 
105 | ```{r echo=FALSE}
106 | kable(apcDF, row.names=FALSE)
107 | ```
108 | 
109 | Since the [impact](impact.html) values all have the same units, it makes sense to chart them on the same axes:
110 | 
111 | ```{r LoanDefaultImpact}
112 | PlotPredCompsDF(apcDF)
113 | ```
114 | 
115 | Note (for example) that average absolute value of the change in probability associated with changes in age is much larger than the magnitude of the signed average. This indicates either an interaction effect between age and other inputs, or a non-linear (non-monotonic) relationship between age an probability of default. We'll go into more detail on that in a bit.
116 | 
117 | It wouldn't make sense to chart the average predictive comparisons (which are per unit change in input) in this way since they don't share units, but we can chart the inputs corresponding to a number of times late for various periods:
118 | 
119 | ```{r}
120 | PlotPredCompsDF(apcDF[grep("NumberOfTime", apcDF$Input), ],
121 |                 variant = "PerUnitInput")
122 | ```
123 | 
124 | As you'd expect, this shows that the effect size (per additional incident) increases with the length of lateness, while the previous chart shows that `NumberOfTime30.59DaysPastDueNotWorse` makes more overall difference to the model than `NumberOfTime60.89DaysPastDueNotWorse`. This is because its variation is larger (it's non-zero more often). We can see this more clearly looking at the *impact* chart for just these inputs:
125 | 
126 | 
127 | ```{r}
128 | PlotPredCompsDF(apcDF[grep("NumberOfTime", apcDF$Input), ])
129 | ```
130 | 
131 | The difference between the absolute and signed versions show that there must be some observations where adding an incident of lateness *decreases* the default probability. This is most pronounced for `NumberOfTime30.59DaysPastDueNotWorse`, so we'll look at this input in a bit more detail.
132 | 
133 | ### Looking at `NumberOfTime30.59DaysPastDueNotWorse` in more detail
134 | 
135 | Recall that the data from which the summarized predictive comparisons are computed consists of groups of rows, where with in each group only the input of interest varies (for the point we imagine transitioning to) and the rest are held constant. We can work directly with this data, visualizing it in more detail to better understand our model.
136 | 
137 | I'll plot the predicted default probability vs. `NumberOfTime30.59DaysPastDueNotWorse`, holding the other inputs constant. Each line corresponds to one choice of values for the other inputs:
138 | 
139 | ```{r NumTimes30To59DaysLateDefaultCurves}
140 | set.seed(6)
141 | pairs <- GetComparisonDF(rfFit, credit, 
142 |                          u="NumberOfTime30.59DaysPastDueNotWorse",
143 |                          numForTransitionStart = 20,
144 |                          numForTransitionEnd = numForTransitionEnd*10,
145 |                          onlyIncludeNearestN = onlyIncludeNearestN*10)
146 | 
147 | pairsSummarized <- pairs[c("OriginalRowNumber", "NumberOfTime30.59DaysPastDueNotWorse.B", "yHat2", "Weight")] %.% 
148 |   group_by(OriginalRowNumber, NumberOfTime30.59DaysPastDueNotWorse.B, yHat2) %.% summarise(Weight = sum(Weight))
149 | 
150 | ggplot(pairsSummarized, aes(x=NumberOfTime30.59DaysPastDueNotWorse.B, y=yHat2, color=factor(OriginalRowNumber))) + 
151 |   geom_point(aes(size = Weight)) +
152 |   geom_line(size=.2) +
153 |   scale_x_continuous(limits=c(0,2)) +
154 |   scale_size_area() + 
155 |   xlab("NumberOfTime30.59DaysPastDueNotWorse") +
156 |   ylab("Prediction") + 
157 |   guides(color = FALSE)
158 | ```
159 | 
160 | I've made the size of the points proportional to weight that the point receives. The summarized predictive comparisons give more weight to points with more weight, and so should we.
161 | 
162 | The relationship is mostly as you'd expect, but let's examine the highlighted one in more detail:
163 | 
164 | ```{r echo=FALSE}
165 | ggplot(pairsSummarized, aes(x=NumberOfTime30.59DaysPastDueNotWorse.B, y=yHat2, color=factor(OriginalRowNumber))) + 
166 |   geom_point(aes(size = Weight)) +
167 |   geom_line(aes(alpha=ifelse(OriginalRowNumber == 18, 1, .3))) +
168 |   scale_x_continuous(limits=c(0,2)) +
169 |   scale_alpha_identity() +
170 |   xlab("NumberOfTime30.59DaysPastDueNotWorse") +
171 |   ylab("Prediction") + 
172 |   guides(color = FALSE)
173 | ```
174 | 
175 | Why does the default probability decrease (for this one choice of other input values) when `NumberOfTime30.59DaysPastDueNotWorse` increases? Values for all the other inputs are held constant, so let's see what they are:
176 | 
177 | ```{r echo=FALSE}
178 | oneOriginalRowNumber <- subset(pairs, OriginalRowNumber == 18)
179 | kable(oneOriginalRowNumber[1,intersect(names(oneOriginalRowNumber), names(credit))])
180 | ```
181 | 
182 | Hmm, note that `NumberOfTimes90DaysLate` (almost always $0$) is $1$ in this case. Looking at the definition of the target variable (`SeriousDlqin2yrs`):
183 | 
184 | `SeriousDlqin2yrs`: "Person experienced 90 days past due delinquency or worse."
185 | 
186 | As we'd expect, previous instances of 90-days-late are a strong indicator of future ones. But adding a previous 30-days-late (but not more) to someone with a previous 90-days-late seems to decrease the chance of future 90-days-lates. This is sensible -- with the 30-days-late (but not more), we have evidence that when you're late, you at least sometimes pay back in under 60 days. 
187 | 
188 | Further exploratory analysis would further improve our understanding:
189 | 
190 | - Is this really primarily an interaction between `NumberOfTimes90DaysLate` and `NumberOfTime30.59DaysPastDueNotWorse`, or are other inputs involved? We could vary the other inputs and see if we still get this effect.
191 | - Is the effect validated on test data?
192 | 
193 | ### More Detailed Examination: `Age`
194 |  
195 | For one more example, let's examine the `Age` input in more detail.
196 | 
197 | ```{r AgeDefaultCurves}
198 | set.seed(3)
199 | pairsAge <- GetComparisonDF(rfFit, credit, 
200 |                          u="age",
201 |                          numForTransitionStart = 20,
202 |                          numForTransitionEnd = numForTransitionEnd*10,
203 |                          onlyIncludeNearestN = onlyIncludeNearestN*10)
204 | 
205 | pairsSummarizedAge <- pairsAge[c("OriginalRowNumber", "age.B", "yHat2", "Weight")] %.% 
206 |   group_by(OriginalRowNumber, age.B, yHat2) %.% 
207 |   summarise(Weight = sum(Weight))
208 | 
209 | ggplot(pairsSummarizedAge, aes(x=age.B, y=yHat2, color=factor(OriginalRowNumber))) + 
210 |   geom_point(aes(size = Weight)) +
211 |   geom_line(size=.2) +
212 |   xlab("age") +
213 |   ylab("Prediction") + 
214 |   guides(color = FALSE)
215 | ```
216 | 
217 | This is a bit of a mess, but we can at least see see that both interaction effects and non-monotonicity are present.
218 | 
219 | Further exploration would look into which other inputs age is interacting with to determine these differently shaped curves.
220 | 
221 | ```{r echo=FALSE}
222 | save.image(file="loan-defaults.RData")
223 | ```
224 | 


--------------------------------------------------------------------------------
/notes/examples/logistic-regression.Rmd:
--------------------------------------------------------------------------------
 1 | ## A Logistic Regression with Related Inputs
 2 | 
 3 | ```{r}
 4 | 
 5 | library(predcomps)
 6 | library(ggplot2)
 7 | library(boot)
 8 | 
 9 | N=500
10 | v1 <- rnorm(N)*3
11 | v2 <- rnorm(N) * (abs(v1) > 2)
12 | v3 <- rnorm(N) * (abs(v1) < .5)
13 | 
14 | # should we remove the points with the condition false instead of setting to 0?
15 | # no, it's good how we have it. since we're conditioning on (e.g.) v1 and v3 when we draw v2, 
16 | # setting v2 to 0 in those cases means there really is no variation in v2 for those v1/v3 combos
17 | 
18 | qplot(v2, v1 + v3)
19 | qplot(v3, v1 + v3)
20 | 
21 | # plot we want:
22 | # v2 curve at various v1 + v3 values. sample from the real data to draw this
23 | # (get random rows and then fill in the whole curve.)
24 | # thicken the rows where v2 isn't 0, ie, does vary, or maybe use alpha to make the other ones lighter.
25 | # x = v2, color = v1 + v3 + 
26 | 
27 | table(v2==0)
28 | table(v3==0)
29 | df <- data.frame(v1,v2,v3)
30 | df$y <- rbinom(n=nrow(df), size=1, prob=with(df, inv.logit(v1+v2+v3)))
31 | 
32 | fittedLogit <- glm(y ~ v1 + v2 + v3, data = df, family = "binomial")
33 | 
34 | apcDF <- GetPredCompsDF(function(df) with(df, inv.logit(v1+v2+v3)), df, c("v1","v2","v3"))
35 | 
36 | PlotPredCompsDF(apcDF)
37 | PlotPredCompsDF(apcDF, variant="Apc")
38 | 
39 | 
40 | GetPredCompsDF(fittedLogit, 
41 |                data.frame(v1, v2, v3), 
42 |                c("v1","v2","v3"))
43 | 
44 | 
45 | 
46 | GetPredCompsDF
47 | 
48 | ```
49 | 
50 | 


--------------------------------------------------------------------------------
/notes/examples/overview.Rmd:
--------------------------------------------------------------------------------
 1 | ## Examples Overview
 2 | 
 3 | ### Simulated Examples
 4 | 
 5 | The simulated examples are constructed to demonstrate properties of predictive comparisons in artificial simulations that we can perfectly understand (because we created them).
 6 | 
 7 | - [Logistic regression for wine prices](examples-wine-logistic-regression.html): This is a logistic regression model run in different simulated data situations. Changing the relationship between the inputs (and nothing else) leads to differences in the predictive comparisons, despite the logistic regression model remaining the same.
 8 | 
 9 | - [Linear model with interactions](examples-simulated-linear-model-interactions.html): This is a linear model with 9 features and 8 interaction terms (an interaction between $v$ and each of $u_1 \ldots u_8$) which are all the same. The way the $u_i$'s are related to $v$ in the distribution of the inputs gives rise to differences in the APC, despite each $u_i$ playing the same role in the model.
10 | 
11 | ### Real Examples
12 | 
13 | Simulated examples are good for gaining understanding of how the package works and what it's doing, but they don't help us understand how it is useful in the real world. These are examples with real-world data sets:
14 | 
15 | - [Loan Default Model](examples-loan-defaults.html): In this example, applying predictive comparisons to a random forest helps understand the drivers of a credit risk model.
16 | 


--------------------------------------------------------------------------------
/notes/examples/simulated-linear-model-interactions.Rmd:
--------------------------------------------------------------------------------
 1 | ## APCs in a Synthetic Example of a Linear Model with Interactions
 2 | 
 3 | ```{r message=FALSE, echo=FALSE}
 4 | library(knitr)
 5 | knitr::opts_chunk$set(tidy = FALSE, message=FALSE, warning=FALSE, fig.align='center')
 6 | ```
 7 | 
 8 | ```{r echo=FALSE}
 9 | library(predcomps)
10 | library(ggplot2)
11 | library(reshape2)
12 | library(plyr)
13 | ```
14 | 
15 | ### Input Generation
16 | 
17 | This example will have 9 inputs: $v$ and $u_1$, $u_2$, ..., $u_8$. The input $v$ is uniformly distributed between 7 values, $-3, -2, \ldots, 2, 3$. Each $u$ is mostly constant (at $0$) but at one value of $v$ (which is $v=-3$ for $u_1$, $v=-2$ for $u_2$, etc), $u$ can be either $0$ and $10$. The input $u_8$ can transition at two values of $v$: At either $v=-3$ or $v=3$, $u_8$ can be $0$ or $10$; otherwise, $u_8$ is 0. I'll construct the data and plot the relationship between the $u$'s and $v$. 
18 | 
19 | ```{r}
20 | N <- 200
21 | vValues <- (-3):3
22 | v <- sample(vValues, N, replace=TRUE)
23 | 
24 | df <- data.frame(v)
25 | for (i in seq_along(vValues)) {
26 |   df[[paste0("u",i)]] <- 
27 |     ifelse(v==vValues[i], 
28 |            sample(c(0,10), N, replace=TRUE),  # u can be either 0 or 10 at one v value
29 |            rep(0, N)                          # u is always 0 at other ones
30 |            )
31 | }
32 | 
33 | # u8 can transition at either v=-3 or v=3:
34 | df$u8 <- ifelse(v %in% c(-3,3),
35 |                 sample(c(0,10), N, replace=TRUE), 
36 |                 rep(0, N)                         
37 |                 )
38 | ```
39 | 
40 | ```{r ConditionalDistributionOfUsOnVs, echo=FALSE, fig.cap="distribution of $u$ conditional on $v$"}
41 | theme_set(theme_gray(base_size=20))
42 | ggplot(melt(df, id="v")) +
43 |   geom_bar(aes(x=factor(v), fill=factor(value)), position=position_fill()) +
44 |   facet_grid(variable ~ .) +
45 |   scale_fill_discrete("value of u_i") + 
46 |   scale_x_discrete("v") +
47 |   ggtitle("u_i conditional on v") +
48 |   scale_y_continuous("sample conditional distribution") + 
49 |   theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())
50 | ```
51 | 
52 | ### Outcome Generation
53 | 
54 | Each $u$ will have the same role in the model, which is $$\mathbb{E}[y] = vu_1 + vu_2 + vu_3 + vu_4 + vu_5 + vu_6 + vu_7 + vu_8.$$ 
55 | 
56 | Note that each $u$ has the same role in this function. Differences between their APCs can only arise from the correlation structure of the inputs in combination with this function, not from this function alone.
57 | 
58 | For simplicity this demonstration, I'll assume that $\mathbb{E}[y \mid v, u_1, \ldots, u_8]$ is known rather than estimated.
59 | 
60 | (**Todo: Find a way to map these inputs/output to a possible story with meaningful features and outcome. That would make it more interesting, easier to follow, and maybe generate more insight.**)
61 | 
62 | ```{r}
63 | outcomeGenerationFunction <- function(df) {
64 |   with(df, v*u1 + v*u2 + v*u3 + v*u4 + v*u5 + v*u6 + v*u7 + v*u8)
65 | }
66 | df$y <- outcomeGenerationFunction(df)
67 | ```
68 | 
69 | ### Computing and Plotting the APC
70 | 
71 | I will compute both the signed and absolute APC. My absolute APCs are different from anything discussed in the paper, but the idea is simple: Everywhere we would use a predictive difference, we use its absolute value instead. I believe that whenever we display the signed APC, we should display the absolute APC alongside it. Transitions for a given input may have a large expected absolute impact on the output variable, but with signs that cancel out. If that's happening in your model, it would be important to know.
72 | 
73 | Here are the APCs for this example, displayed in two different ways. The first shows signed and absolute APCs in separate charts. The second (my preference) shows both types of APC in the same chart. In this case, each absolute APC is plotted twice, once on the positive half of the APC axis and once on the negative half. This is necessary due to the symmetry between positive and negative numbers. The second version is my preference because it makes comparisons between signed and absolute APCs easier. 
74 | 
75 | For example, we can more quickly notice in the second version that the signed APC for $u_8$ is small relative to its absolute APC. This is because $u_8$ transitions between 0 and 10 at either $v=3$ or $v=-3$. Either $v$ (combined with the $u_8$-transition) leads to a relatively large absolute predictive comparison, but with opposite signs depending on $v$. 
76 | 
77 | ```{r ApcPlotsTwoWays}
78 | inputVars <- c("v",paste0("u",1:8))
79 | apcDF <- GetPredCompsDF(outcomeGenerationFunction, df, inputVars = inputVars)
80 | print(apcDF)
81 | PlotPredCompsDF(apcDF, variant="PerUnitInput")
82 | ```
83 | 
84 | These APCs are just what we would expect from the setup of the synthetic examples. When $u$-transitions are at small $v$, the APCs are small due to the $u$/$v$ interaction effect (and vice versa). Even though we know the prediction function exactly, the APCs are a bit off due to errors in our estimates of the distribution of each input conditional on the others. We can mitigate that in this case by increasing the weight given to closer points when assigning weights based on the Mahalanobis distance. The weights are $\frac{1}{\text{mahalanobisConstantTerm}+d}$ where $d$ is the distance, so we can do this by decreasing $\text{mahalanobisConstantTerm}$:
85 | 
86 | ```{r DecreasedMahalanobisConstantTerm}
87 | apcDF <- GetPredCompsDF(outcomeGenerationFunction, df, inputVars, mahalanobisConstantTerm=.01)
88 | PlotPredCompsDF(apcDF, variant="PerUnitInput")
89 | ```
90 | 
91 | In this case, the APCs are much closer to the correct values. Giving such extreme weight to pairs with small distance does not cause any problems in this example, but it would in more realistic examples. In applications, getting the weights right may be difficult.
92 | 
93 | 
94 | 
95 | 


--------------------------------------------------------------------------------
/notes/examples/wine-logistic-regression.Rmd:
--------------------------------------------------------------------------------
  1 | ```{r echo=FALSE, results='hide', message=FALSE}
  2 | library(predcomps)
  3 | library(ggplot2)
  4 | library(boot) 
  5 | library(knitr)
  6 | opts_chunk$set(tidy = FALSE)
  7 | set.seed(1)
  8 | ```
  9 | 
 10 | ## A Logistic Regression with Related Inputs
 11 | 
 12 | *(The source code for this example is [here](https://github.com/dchudz/predcomps/blob/master/notes/examples/wine-logistic-regression.Rmd).)*
 13 | 
 14 | We will set up a simulated data set to use for modeling the probability a customer buys a bottle of wine, given its price and quality. We'll compare a few situation varying the joint distribution of price ($P$) and quality ($Q$). The coefficients of the logistic regression determining the relationship between the inputs and the probability of purchase will not vary.
 15 | 
 16 | In each variation, the probability of purchase is governed by the following logistic regression model:
 17 | 
 18 | $$logit(P(\text{wine is purchased})) = 0.1 Q - 0.12 P$$
 19 | 
 20 | - *Variation 1*: Price is uniform; quality is price plus noise. Quality increases with price, but not enough make up for price, so the expensive wines are rarely purchased and the cheap wines are almost always purchased.
 21 | - *Variation 2*: Just like *Variation 1*, but price is more densely concentrated in its middle range. This leads to price and quality both having a larger APC because the inverse logistic curve is steeper in this middle range.
 22 | - *Variation 3*: Like *Variation 1*, but quality varies more strongly with price. The inverse logistic curve is now steeper at almost all price/quality combinations.
 23 | 
 24 | The APC varies across these variations, but the logistic regression coefficients remain the same. The changes in APC in each of these variations are driven entirely by changes in the distribution of the inputs. The model relating inputs to outputs is unchanged.
 25 | 
 26 | ### Variation 1
 27 | 
 28 | In the first variation, quality and price are independent, with price uniformly distributed and quality set to price plus Gaussian noise:
 29 | 
 30 | ```{r}
 31 | priceCoef <- -.12
 32 | qualityCoef <- .1
 33 | qualityNoiseStdDev <- 5
 34 | nWines=50000
 35 | nRowsForPlottingSample <- 1000
 36 | 
 37 | numForTransitionStart <- 500
 38 | numForTransitionEnd <- 10000
 39 | onlyIncludeNearestN = 100
 40 | 
 41 | priceQualitySlope <- .4
 42 | 
 43 | df1 <- local({
 44 |   price <- sample(20:120, nWines, replace=TRUE)
 45 |   quality <- price * priceQualitySlope + 22 + rnorm(nWines, sd=qualityNoiseStdDev)
 46 |   purchaseProbability <- inv.logit(priceCoef*(price - 70) + qualityCoef*(quality - 50)  )
 47 |   purchased  <- rbinom(n = length(purchaseProbability), size=1, prob=purchaseProbability)
 48 |   data.frame(Quality = quality, 
 49 |              Price = price, 
 50 |              PurchaseProbability = purchaseProbability, 
 51 |              Purchased = purchased)
 52 |   })
 53 | print(getwd())
 54 | ```
 55 | 
 56 | A scatter plot (using a random subset to avoid overplotting) shows us the relationship between price and quality:
 57 | 
 58 | ```{r V1Scatter}
 59 | df1Sample <- df1[sample.int(nWines, size=nRowsForPlottingSample), ]
 60 | qplot(Price, Quality, alpha=I(.5), data = df1Sample) + 
 61 |   expand_limits(y=c(0,100))
 62 | ```
 63 | 
 64 | When we fit a logistic regression, the coefficients are what we'd expect from the setup above:
 65 | 
 66 | 
 67 | ```{r}
 68 | logitFit1 <- glm(Purchased ~ Price + Quality, data = df1, family = "binomial")
 69 | logitFit1
 70 | ```
 71 | 
 72 | This plot shows the relationship between quality and probability of purchase for a few prices:
 73 | 
 74 | ```{r V1QualityVsProb}
 75 | myScales <- list(scale_x_continuous(limits=c(0,100)),
 76 |                  scale_y_continuous(limits=c(0,1)))
 77 | 
 78 | ggplot(subset(df1Sample, Price %in% seq(20, 120, by=10))) + 
 79 |   geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
 80 |              size = 3, alpha = 1) + 
 81 |   ggtitle("Quality vs. Purchase Probability at Various Prices") + myScales +
 82 |   scale_color_discrete("Price")
 83 | 
 84 | ```
 85 | 
 86 | Each colored set of points is one portion of a shifted inverse logistic curve, determined by which Price/Quality combinations actually occur in our data.
 87 | 
 88 | We can also see the portion of the curve that isn't represented in our data:
 89 | 
 90 | ```{r V1QualityVsProbWithLines, echo=FALSE}
 91 | linesDF <- expand.grid(Price = seq(20, 120, by=10), Quality = -20:130)
 92 | linesDF$PurchaseProbability <- with(linesDF,
 93 |                                     inv.logit(priceCoef*(Price - 70) + qualityCoef*(Quality - 50)))
 94 | last_plot() + geom_line(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
 95 |                         data = linesDF,
 96 |                         size=.2)
 97 | ```
 98 | 
 99 | We can get average predictive comparisons from our fitted regression:
100 | 
101 | ```{r}
102 | apc1 <- GetPredCompsDF(logitFit1, df1,
103 |                        numForTransitionStart = numForTransitionStart,
104 |                        numForTransitionEnd = numForTransitionEnd,
105 |                        onlyIncludeNearestN = onlyIncludeNearestN)
106 | ```
107 | 
108 | The `GetPredCompsDF` function produces a few kinds of outputs, but for now let's just focus on the signed average predictive comparison:
109 | 
110 | ```{r}
111 | apc1[c("Input", "PerUnitInput.Signed")]
112 | ```
113 | 
114 | This means that (on average) the probability of purchase increases by about 1.2% per 1 unit increase in quality.
115 | 
116 | ### Variation 2
117 | 
118 | This variation will add some additional wines to the middle range of prices:
119 | 
120 | ```{r}
121 | nAdditionalWines <- nWines
122 | supplementForDF2 <- local({
123 |   price <- sample(55:85, nWines, replace=TRUE)
124 |   quality <- price * .4 + 22 + rnorm(nWines, sd=qualityNoiseStdDev)
125 |   purchaseProbability <- inv.logit(priceCoef*(price - 70) + qualityCoef*(quality - 50)  )
126 |   purchased  <- rbinom(n = length(purchaseProbability), size=1, prob=purchaseProbability)
127 |   data.frame(Quality = quality, 
128 |              Price = price, 
129 |              PurchaseProbability = purchaseProbability, 
130 |              Purchased = purchased)
131 |   })
132 | df2 <- rbind(df1, supplementForDF2)
133 | ```
134 | 
135 | A scatter plot (again, using a random subset to avoid overplotting) shows us the relationship between price and quality:
136 | 
137 | ```{r}
138 | df2Sample <- df2[sample.int(nrow(df2), size=nRowsForPlottingSample), ]
139 | qplot(Price, Quality, alpha=I(.5), data = df2Sample) + 
140 |   expand_limits(y=c(0,100))
141 | ```
142 | 
143 | When we fit a logistic regression, the coefficients are similar to before, since we haven't changed the underlying model:
144 | 
145 | 
146 | ```{r}
147 | logitFit2 <- glm(Purchased ~ Price + Quality, data = df2, family = "binomial")
148 | logitFit2
149 | ```
150 | 
151 | In the plot showing the relationship between quality and probability of purchase, we see more points at the steep section of the inverse logit curve:
152 | 
153 | ```{r}
154 | ggplot(subset(df2Sample, Price %in% seq(20, 120, by=10))) + 
155 |   geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
156 |              size = 3, alpha = 1) + 
157 |   ggtitle("Quality vs. Purchase Probability at Various Prices") +
158 |   myScales + 
159 |   scale_color_discrete("Price")
160 | ```
161 | 
162 | The APC for quality is correspondingly larger:
163 | 
164 | ```{r}
165 | apc2 <- GetPredCompsDF(logitFit2, df2,
166 |                        numForTransitionStart = numForTransitionStart,
167 |                        numForTransitionEnd = numForTransitionEnd,
168 |                        onlyIncludeNearestN = onlyIncludeNearestN)
169 | 
170 | apc2[c("Input",  "PerUnitInput.Signed")]
171 | ```
172 | 
173 | This means that in this variation the probability of purchase increases (on average) by about 1.5% (vs. 1.2% in *Variation 1*) per 1-point increase in quality. The magnitude of the APC for price is also larger.
174 | 
175 | ### Variation 3
176 | 
177 | This is just like *Variation 1*, but price increases more with quality:
178 | 
179 | ```{r}
180 | priceQualitySlope <- 1.2
181 | 
182 | df3 <- local({
183 |   price <- sample(20:120, nWines, replace=TRUE)
184 |   quality <- price * priceQualitySlope - 30 + rnorm(nWines, sd=qualityNoiseStdDev)
185 |   purchaseProbability <- inv.logit(priceCoef*(price - 70) + qualityCoef*(quality - 50)  )
186 |   purchased  <- rbinom(n = length(purchaseProbability), size=1, prob=purchaseProbability)
187 |   data.frame(Quality = quality, 
188 |              Price = price, 
189 |              PurchaseProbability = purchaseProbability, 
190 |              Purchased = purchased)
191 |   })
192 | ```
193 | 
194 | ```{r}
195 | df3Sample <- df3[sample.int(nWines, size=nRowsForPlottingSample), ]
196 | qplot(Price, Quality, alpha=I(.5), data = df3Sample) + 
197 |   expand_limits(y=c(0,100))
198 | ```
199 | 
200 | The logistic regression still comes out the same:
201 | 
202 | ```{r}
203 | logitFit3 <- glm(Purchased ~ Price + Quality, data = df3, family = "binomial")
204 | logitFit3
205 | ```
206 | 
207 | In this case, purchase is less certain at the low prices and more plausible at the high prices:
208 | 
209 | ```{r}
210 | ggplot(subset(df3Sample, Price %in% seq(-100, 200, by=10))) + 
211 |   geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
212 |              size = 3, alpha = 1) + 
213 |   ggtitle("Quality vs. Purchase Probability at Various Prices") + 
214 |   myScales +
215 |   scale_color_discrete("Price")
216 | ```
217 | 
218 | We can get average predictive comparisons from our fitted regression:
219 | 
220 | ```{r}
221 | apc3 <- GetPredCompsDF(logitFit3, df3,
222 |                        numForTransitionStart = numForTransitionStart,
223 |                        numForTransitionEnd = numForTransitionEnd,
224 |                        onlyIncludeNearestN = onlyIncludeNearestN)
225 | ```
226 | 
227 | As expected, the APCs are (both) larger than in *Variation 1*:
228 | 
229 | ```{r}
230 | apc3[c("Input", "PerUnitInput.Signed")]
231 | ```
232 | 
233 | ### Comparing the Variations
234 | 
235 | Comparing all of the variation in one plot, we can see the increase in the effect of wine quality on purchase probability going from Variation 1 to Variation 3:
236 | 
237 | ```{r echo=FALSE}
238 | apc1$Variation <- 1
239 | apc2$Variation <- 2
240 | apc3$Variation <- 3
241 | 
242 | apcAllVariations <- do.call(rbind, list(apc1, apc2, apc3))
243 | ggplot(subset(apcAllVariations, Input=="Quality")) +
244 |   geom_point(aes(x=factor(Variation), y=PerUnitInput.Signed), size=3) + 
245 |   expand_limits(y=0) +
246 |   xlab("Variation") + 
247 |   ggtitle("Per Unit Quality APC for Quality across Variations")
248 | 
249 | ```
250 | 
251 | ```{r echo=FALSE}
252 | save.image(file="wine-logistic-regression.RData")
253 | ```
254 | 


--------------------------------------------------------------------------------
/notes/impact.Rmd:
--------------------------------------------------------------------------------
 1 | ## Impact: A variation on APCs with comparable units
 2 | 
 3 | *Impact* answers the question: **What difference does each input tend to make in the output?** (Given the model and the distribution of inputs.) The units are the units of the output variable.
 4 | 
 5 | ### The name "Impact"
 6 | 
 7 | First off, I don't love the name "Impact" for this. "Variable importance" may signal somewhat the right idea to people already familiar with other measures of variable importance, but this notion is (or can be) signed, whereas variable importance generally isn't.
 8 | 
 9 | ### Impact
10 | 
11 | *Impact* is a statistic similar to the APC, but which addresses two issues I've had with APCs:
12 | 
13 | 1. APCs are good for their purpose (determining the expected difference in outcome per unit change in input), but this doesn't tell me how important an input is to my predictions. The APC could be high while the variation in the input is so small that it doesn't make a difference.
14 | 
15 | 2. Relatedly, APCs across inputs with different units have different units themselves and so are not directly comparable. The example in the paper (see p. 47) uses mostly binary inputs, so this is mostly not a problem there. But I'm not sure the other inputs in that example belong on the same chart, and I would like to visualize the influence .
16 | 
17 | Both (1) and (2) could be addressed by standardizing the coefficients before computing the APC, but this feels a bit ad hoc and arbitrary. Instead, I take the simpler and more elegant approach of just not dividing by the difference in inputs in the computation of the APC. That is, impact is just the expectation of
18 | 
19 | $\Delta_f = f(u_2,v) - f(u_1,v)$
20 | 
21 | under the same process as in the APC:
22 | 
23 | 1. sample $v$ from the (marginal) distribution of the corresponding inputs
24 | 2. sample $u_1$ and $u_2$ independently from the distribution of $u$ conditional on $v$
25 | 
26 | The computed quantity is therefore the expected value of the predictive difference caused by a random transition for the input of interest. The units are the same as the output variable. This statistic depends on the model, the variation in the input of interest, and the relationship between that inputs and the other inputs.
27 | 
28 | ### Example (cont.)
29 | 
30 | The same example used to demonstrate APCs demonstrates the difference between *impact* and APCs. Recall that the inputs ($x_1$, $x_2$, $x_3$) are independent, with
31 | 
32 | $$y \sim 2x_1 - 2x_2 + x_3 + \mathcal{N}(0,.1)$$
33 | 
34 | However, the variation in input $x_3$ is much larger than the others:
35 | 
36 | ```{r}
37 | n <- 200
38 | x1 <- runif(n=n, min=0, max=1)
39 | x2 <- runif(n=n, min=0, max=1)
40 | x3 <- runif(n=n, min=0, max=10)
41 | y <- 2*x1 + (-2)*x2 + 1*x3
42 | df <- data.frame(x1, x2, x3, y)
43 | fittedLm <- lm(y ~ ., data=df)
44 | ```
45 | 
46 | We can then compute and plot the *impact*:
47 | 
48 | ```{r results='hide', message=FALSE}
49 | library(predcomps)
50 | apcDF <- GetPredCompsDF(fittedLm, df=df)
51 | PlotPredCompsDF(apcDF) + theme_gray(base_size = 18)
52 | ```
53 | 
54 | The *impact* for $x_3$ is about 5 times the impact for $x_1$, which makes sense as $x_3$ varies on a scale that is 10 times as large but with a coefficient half as big.
55 | 
56 | ```{r}
57 | apcDF
58 | ```
59 | 
60 | The [examples](examples-overview.html) section goes through more interesting examples demonstrating more subtle features of how *impact* and APCs work.
61 | 


--------------------------------------------------------------------------------
/notes/index.Rmd:
--------------------------------------------------------------------------------
 1 | **Please note: This is a very early version of the package. Please take it (and everything written on these pages) as provisional.**
 2 | 
 3 | ## An R Package for Understanding Arbitrary Complex Models
 4 | 
 5 | As complex models become widely used, it's more important than ever to have ways of understanding them. Even when a model is built primarily for prediction (rather than primarily as an aid to understanding), we still need to know what it's telling us. For each input to the model, we should be able to answer questions like these:
 6 | 
 7 | 1. **On average, much does the output increase per unit increase in input?**
 8 | 2. **How does the influence of the input vary? (Across its range? In combination with other inputs?)**
 9 | 3. **How much difference does variation in this input make to the predictions? (Relative to other inputs?)**
10 | 
11 | For example, if our model were a linear regression with no interactions and no transformations of the inputs, the (1) would be answered by our regression coefficients, (2) would be answered "It doesn't vary", and (3) would be a little harder but not too bad. All of these questions are much harder for more complicated models.
12 | 
13 | This R package is a collection of tools that are meant to help answer these questions for arbitrary complicated models. One advantage of the fact that they work work equally well for any model is that they can be used to **compare models**.
14 | 
15 | The key feature of the approach here is that we try to properly **account for relationships between the various inputs**.
16 | 
17 | ### Origin
18 | 
19 | The ideas implemented here originate in [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract). If you are already familiar with the paper, you can [skip to the differences between that and what's here](more-compared-with-paper.html). As far as I know, this is the only implementation intended for general use.
20 | 
21 | ### Installation
22 | 
23 | The package is not hosted on CRAN, but it's still easy to install:
24 | 
25 | ```{r eval=FALSE}
26 | library(devtools) # first install devtools if you haven't
27 | install_github("predcomps", user="dchudz")
28 | ```
29 | 
30 | ### What we give you:
31 | 
32 | 1. [Average Predictive Comparisons Per Unit Input](apc.html) - On average, much does the output increase per unit increase in input? (This generalizes regression coefficients.)
33 | 2. Sensitivity Analysis - the package doesn't yet have functions that fully automate this, but see the bottom of [this example](examples-loan-defaults.html) for how this package can help with sensitivity analysis in a way that's consistent with accounting for relationships between the various inputs.
34 | 3. [Impact](impact.html) - What is the average difference that a random change in each input makes to the predictions? (You can use this to compare the importance of the various inputs.)
35 | 
36 | You can compute both signed and absolute version of (1) and (3) for all inputs in your model with the function `GetPredCompsDF` and plot them using `PlotPredCompsDF`. See [the PDF documentation](predcomps-manual.pdf) for how to call each of these functions.
37 | 
38 | ### Example Output
39 | 
40 | Here's an example of the output of *[impact](impact.html)* (which gets at a idea similar to 'variable importance' in other packages) from [an example predicting loan defaults](examples-loan-defaults.html). The model is a random forest. Note that *impact* is expressed in the units of the output variable (probabilities in this case), and hence it always makes sense to show it for all of the inputs on the same scale:
41 | 
42 | ![LoanDefaultImpact](figure/LoanDefaultImpact.png)
43 | 
44 | The signed version should be interpreted as the expected value of the change in default probability for a random change in input. The absolute version is the expected absolute value, and is shown on both the positive and negative sides of horizontal axes. The signed version will always be between the two points representing the absolute version. When the signed version is close to the absolute version, the impact of a change the corresponding variable more consistently has the same sign. 
45 | 
46 | We can also look in more detail at an individual input, examining the default probability as we vary that input (holding all else equal at a few example values):
47 | 
48 | ![NumTimes30To59DaysLateDefaultCurves](figure/NumTimes30To59DaysLateDefaultCurves.png)
49 | 
50 | This is sometimes called a *sensitivity analysis*. Notice that in this case for one particular choice of other input values, transitioning from 0 to 1 previous time 30-59 days late leads to a *decrease* in predicted default probability. See [the example](examples-loan-defaults.html) for some thoughts on why this might be.
51 | 
52 | ### Current Limitations
53 | 
54 | The package has some major limitations in its current form, but we have to start somewhere:
55 | 
56 | **Input types**: At the time of this writing, all inputs to the model must be numerical (or coercable to such). In practice this means that binary or ordered categorical inputs are okay, but unordered categorical inputs are not.
57 | 
58 | **Parameters controlling the weights between pairs**: These parameters are important for reasons that are both computational and statistical, are not yet well tuned for you, and the defaults may not be reasonable.
59 | 
60 | ## Contact
61 | 
62 | I'm very interested in feedback from folks who are trying this out. Please get in touch with me at dchudz@gmail.com.
63 | 


--------------------------------------------------------------------------------
/notes/more/compared-with-paper.Rmd:
--------------------------------------------------------------------------------
 1 | ## As Compared With the Paper
 2 | 
 3 | There are a few differences between what's here and what's discribed in [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract). Many features described in the paper (such as categorical inputs) are just not implemented here. This page ignores those, and describes only outright differences, or additions.
 4 | 
 5 | ### Pairs and Weights
 6 | 
 7 | [This page](more-pairs-and-weights.html) comments on how the weights are computed and describes from differences between this package and what's described in section 4.1 of the paper.
 8 | 
 9 | ### Absolute APCs
10 | 
11 | Gelman & Pardoe mention an unsigned version APCs in the case where the input of interest is an unordered categorical variable (in which case signs wouldn't make sense). They propose a root mean squared APC (equaton 4), but I prefer absolute values and I believe this absolute-values version is useful for any inputs, not just categorical ones. By default, I always compute and display an absolute version of the APC alongside the signed version. See e.g. the input $u_8$ in my [simulated linear model with interactions](examples-simulated-linear-model-interactions.html) for an artificial example demonstrating the importance of this notion, and see [my explanation of APCs](apc.html) for more detail.
12 | 
13 | ### Impact: A variation on APCs with comparable units
14 | 
15 | I've created a statistic similar to the original APC, but which addresses two issues I've had with APCs:
16 | 
17 | 1. APCs are good for their purpose (the expected difference in outcome per unit change in input), but it doesn't tell me the what difference an input makes to my predictions. The APC could be high while the variation in the input is so small that it doesn't make a difference.
18 | 
19 | 2. APCs across inputs with different units have different units themselves and so are not directly comparable. The example in the paper (see p. 47) uses mostly binary inputs, so this is mostly not a problem there. But I'm not sure the other inputs belong on the same chart.
20 | 
21 | Both (1) and (2) could be addressed by standardizing the coefficients before computing the APC, but this feels a bit ad hoc and arbitrary. Instead, I take the simpler and more elegant approach of just not dividing by the difference in inputs. The computed quantity is therefore the expected value of the predictive difference caused by a random transition for the input of interest. The units are the same as the output variable, and hence are always comparable across different inputs. Just as with APCs, this quantity depends on the model, the variation in the input of interest, and the relationship between that inputs and the other inputs.
22 | 
23 | I'm calling this notion *impact* (feel free to suggest another name), and it's described in more detail [here](impact.html) and in the examples. Just like APCs, it comes in signed and absolute forms.
24 | 
25 | ### All-else-equal Curves / Sensitivity Analysis
26 | 
27 | To visualize the model in more detail than is provided by our aggregated predictive comparisons, we can plot $u$ vs. the prediction at a variaty of values of $v$. In order that these plots represent $p(u|v)$, we can use the same set of pairs/weights as is constructed for computing aggregated predictive comparisons.
28 | 
29 | This plot shows age vs. probability of default, as in [the loan defaults example](examples-loan-defaults.html):
30 | 
31 | ![AgeDefaultCurves](figure/AgeDefaultCurves.png)
32 | 
33 | Plots like this are not yet computed in the package, but see [the example](examples-loan-defaults.html) for how to construct them.
34 | 


--------------------------------------------------------------------------------
/notes/more/future-work.Rmd:
--------------------------------------------------------------------------------
 1 | ## Future work
 2 | 
 3 | ### Categorical inputs
 4 | 
 5 | I'm not quite sure about the best way to sample from the distribution for an input of interest, conditional on the other inputs, when those other inputs include categorical variables.
 6 | 
 7 | Gelman & Pardoe 2007 discuss what to do when the input of interest is an unordered categorical variable, but I'm mainly interested in applying the statistic I'm calling [*impact*](impact.html) to categorical variables, since *impact* is comparable across all types of inputs.
 8 | 
 9 | ### Distribution of predictive comparisons
10 | 
11 | Rigth now we're just providing point summaries. It'd be nice to see the distribution. (This is different from accounting for model uncertainty.)
12 | 
13 | ### Uncertainty
14 | 
15 | As discussed in the paper, it is entirely natural to allow for multiple samples of parameters and plot these measures with their uncertainties as in the paper.
16 | 
17 | I'm also interested in the uncertainty that arises from the density estimation. 
18 | 
19 | ### Weights
20 | 
21 | More work is necessary to understand better what the weights should be / how to best represent the appropriate conditional distributions. Maybe in some cases we should even drop the distance-based approach and use something like BART?
22 | 
23 | ### Code robustness
24 | 
25 | I should deal with potential column name conflicts in data frames (e.g. I use a "Weight" column, so I'll have problems if one of the inputs has that name).
26 | 
27 | ### Other tools/methods for understanding complicated models
28 | 
29 | I'd like to compile a list of other work in this direction, maybe comparing them with this (and perhaps giving examples that show why/when you'd want to use this instead).
30 | 
31 | I should add a page discussing other methods people have used to get at somewhat the same idea.
32 | 
33 | - `randomForest` package in R (partial plots, variable importance)
34 | - `earth` package in R (variable importance)
35 | - I should look through `caret` to see what's available there


--------------------------------------------------------------------------------
/notes/more/large-N-limit.Rmd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dchudz/predcomps/cc3bf1155cc01f496da231af4facfc47d986d046/notes/more/large-N-limit.Rmd


--------------------------------------------------------------------------------
/notes/more/pairs-and-weights.Rmd:
--------------------------------------------------------------------------------
 1 | ## Weighting Pairs of Rows
 2 | 
 3 | ### The Original Version
 4 | 
 5 | In computing the APC, we assign weights to pairs of observations. Taking $v$ (the inputs not of interest) from the first element of the pair, $u_1$ from the first element of the pair, and $u_2$ from the second element of the pair, these samples and their weights are meant to approximate the distribution with density $p(v)p(u_1|v)p(u_2|v)$.
 6 | 
 7 | If we had many observations for each unique value of $v$, this would be easy: We'd just assign weight 1 to the pairs that share the same value of $v$, and weight 0 to the other pairs. In reality, we achieve something like this by assigning more weight to closer pairs.
 8 | 
 9 | The suggestion in [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract) is to consider all pairs of rows from the original data set and use weights based on the Mahalanobis distance $d$ between the corresponding $v$, such as $$\frac{1}{1 + d}.$$
10 | 
11 | 
12 | ### Need for Renormalizing Weights
13 | 
14 | Pairs of rows with nearby $v$'s have high weights, where pairs from far-away $v$'s have low weights. But rows with fewer $v$'s near them end up with less total weight than rows with more $v$'s near them. This isn't so good, since we meant to weight $v$'s according to their distribution in the original data set.
15 | 
16 | The solution is to renormalize the weights so that when grouping by the first element of each pair, the sum for each group is the same. Here's [the line of code that does this in the package](https://github.com/dchudz/predcomps/blob/master/R/pairs.R#L85).
17 | 
18 | Maybe this renormalization goes without saying, but it isn't explicitly mentioned in [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract).
19 | 
20 | ### Far Away Points Dominate
21 | 
22 | Imagine $v$ is a vector of $n$ inputs, spread uniformly throughout space. Then the expected number of points at distance $d$ from a given $v$ would be proportional to $d^n$. This means that using weights inversely proportional to distance will end up letting far-away points dominate, once you account for how many more far-away points there are. 
23 | 
24 | ### Current Implementation: Using a Fixed Number of Nearest Points
25 | 
26 | My current implementation allows you an option that addresses the above concern that far-away points dominate (in an unfortunately ad hoc manner): You can specify that for each transition start point, we keep only certain number of the nearest transition end points. This parameter is specified as `onlyIncludeNearestN`.
27 | 
28 | The appropriate setting for this parameter will depend on the number of points used, and maybe other properties of the data set.
29 | 
30 | ### Limiting the number of points used
31 | 
32 | In the paper, Gelman and Pardoe mention that we can save on computation by, when necessary, estimating the average predictive comparison using a subset of the available data. My suggestion is that the set of points considered for transition starts need not be the same as the set considered for transition ends. For example, we choose a small subset as candidate transition starts, we may still wish to consider the full data set as potential endpoints for the transitions. This could be beneficial, since to best represent the conditional distribution $p(u_2|v_1)$ we're looking for $v_2$ close to $v_1$. We will find a closer $v_2$ by considering a larger set of potential transition endpoints than if we restricted the set of candidates for $v_2$ just as we restricted for $v_1$.
33 | 
34 | The parameters to specify are `numForTransitionStart` and `numForTransitionEnd`. They default to `NULL`, in which case we use the entire data set. 
35 | 
36 | ### Recommendations
37 | 
38 | I recommend that `numForTransitionEnd` should be larger than `numForTransitionStart`, and that `onlyIncludeNearestN` should be a fraction of `numForTransitionEnd`.
39 | 
40 | However, appropriate settings are not well understand. Please send me your feedback on what works well for you.
41 | 


--------------------------------------------------------------------------------
/notes/presentations/Lunch.Rmd:
--------------------------------------------------------------------------------
  1 | Changes to make:
  2 | 
  3 | - Flesh out motivating examples in much more detail
  4 |   - logistic regression: more explanation / focus on logistic regression equation
  5 |   - maybe nothing different for other example?
  6 | 
  7 | - For ML audience: lead more on how you would use it by switching order to:
  8 |  - wine as is
  9 |  - calling API on wine
 10 |  - credit
 11 |  - THEN the math definitions
 12 |  - comparison with other approaches
 13 | 
 14 | Anthony feedback:
 15 | 
 16 | - Say earlier why we're doing wine example
 17 | - why are we looking at wine example?
 18 | - picture in motivation for the equations
 19 | - do the exercise live
 20 | - don't need histograms or feature definitions
 21 | 
 22 | 
 23 | ```{r echo=FALSE, message=FALSE}
 24 | load(file="../examples/wine-logistic-regression.RData")
 25 | library(ggplot2)
 26 | theme_set(theme_gray(base_size = 18))
 27 | library(gridExtra)
 28 | library(knitr)
 29 | library(predcomps)
 30 | opts_chunk$set(fig.cap="", echo=FALSE, 
 31 |                fig.width=2*opts_chunk$get("fig.width"),
 32 |                fig.height=.9*opts_chunk$get("fig.height")
 33 |                )
 34 | ```
 35 | 
 36 | ## An R Package to Help Interpret Predictive Models: 
 37 | 
 38 | ### (Average) Predictive Comparisons
 39 | 
 40 | David Chudzicki
 41 | 
 42 | 
 43 | ## Related concepts
 44 | 
 45 | - sensitivity analysis
 46 | - elasticity
 47 | - variable importance
 48 | - partial dependence
 49 | - etc.
 50 | 
 51 | ## **This** approach:
 52 | 
 53 | - treats model as a black box
 54 | - **tries to properly account for relationships among the inputs of interest**
 55 | - based on Gelman & Pardoe (2007), which asked: "On average, much does the output increase per unit increase in input?"
 56 |   
 57 | 
 58 | ## Plan
 59 | 
 60 | 1. Motivating fake example
 61 | 2. Predictive comparisons definitions: what we want 
 62 | 4. Applying average predictive comparisons to (1)
 63 | 5. An example with real data: credit scoring
 64 | 
 65 | - Discussion!
 66 | - (Appendix) Estimation & Computation: how to get what we want
 67 | - (Appendix) Comparison with other approaches
 68 | 
 69 | 
 70 | ## Silly Example
 71 | 
 72 | - We sell wine
 73 | - Wine varies in: price, quality
 74 | - Customers randomly do/don't buy, depending on price and quality
 75 | 
 76 | ## Logistic Regression
 77 | 
 78 | - $P$: price ($)
 79 | - $Q$: quality (score on arbitrary scale)
 80 | - **Model**: $P(\text{wine is purchased}) = logit^{-1}(\beta_0 + \beta_1 Q + \beta_2 P)$
 81 | 
 82 | ```{r echo=FALSE}
 83 | library(boot)
 84 | s <- seq(-4,4,by=.01)
 85 | qplot(s, 1/(1+exp(-s)), geom="line") + ggtitle("Inverse Logit Curve")
 86 | ```
 87 | 
 88 | - difficulty: interpreting coefficients on logit scale
 89 | 
 90 | 
 91 | **True model**: $P(\text{wine is purchased}) = logit^{-1}(0.1 Q - 0.12 P)$
 92 | 
 93 | ## Distribution of Inputs (variation 1):
 94 | 
 95 | Price and quality are (noisily) related:
 96 | 
 97 | ```{r}
 98 | myScales <- list(scale_x_continuous(limits=c(-15,125)),
 99 |                  scale_y_continuous(limits=c(0,1)))
100 | qualityScale <- ylim(c(-20,130))
101 | qplot(Price, Quality, alpha=I(.5), data = df1Sample) + 
102 |   qualityScale
103 | ```
104 | 
105 | ## We don't really need a model to understand this...
106 | 
107 | (A random subset of the data. For clarity, showing only a discrete subset of prices.)
108 | 
109 | ```{r}
110 | v1Plot <- ggplot(subset(df1Sample, Price %in% seq(20, 120, by=10))) + 
111 |   geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
112 |              size = 3, alpha = 1) + 
113 |   ggtitle("Quality vs. Purchase Probability at Various Prices") +
114 |   myScales +
115 |   scale_color_discrete("Price")
116 | v1Plot
117 | ```
118 | 
119 | ## We don't really need a model to understand this...
120 | 
121 | For each individual price, quality vs. purchase probability forms a portion of a shifted inverse logit curve:
122 | 
123 | ```{r warning=FALSE}
124 | last_plot() + geom_line(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
125 |                         data = linesDF,
126 |                         size=.2)
127 | ```
128 | 
129 | - how changes in price/quality affect $P(\text{wine is purchased})$ depends a lot on where you are in input space
130 | 
131 | ## Variation 2
132 | 
133 | In another possible world, mid-range wines are more common:
134 | 
135 | ```{r}
136 | qplot(Price, Quality, alpha=I(.5), data = df2Sample) + 
137 |   qualityScale
138 | ```
139 | 
140 | - input distribution is changed
141 | - ... but model is not changed
142 | 
143 | ## In this world, quality matters more....
144 | 
145 | - (more precisely, cases where quality matters are more common)
146 | 
147 | ```{r}
148 | v2Plot <- ggplot(subset(df2Sample, Price %in% seq(20, 120, by=10))) + 
149 |   geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
150 |              size = 3, alpha = 1) + 
151 |   ggtitle("Quality vs. Purchase Probability at Various Prices") +
152 |   myScales +
153 |   scale_color_discrete("Price")
154 | v2Plot
155 | ```
156 | 
157 | ## Variation 3
158 | 
159 | In a third possible world, price varies more strongly with quality:
160 | 
161 | ```{r}
162 | qplot(Price, Quality, alpha=I(.5), data = df3Sample) + 
163 |   qualityScale
164 | ```
165 | 
166 | Again:
167 | 
168 | - input distribution is changed
169 | - ... but model is not changed, still $P(\text{wine is purchased}) = logit^{-1}(\beta_0 + \beta_1 Q + \beta_2 P)$
170 | 
171 | ## In this world, quality matters even more...
172 | 
173 | - (more precisely: quality matters in all cases that we actually see)
174 | 
175 | ```{r}
176 | v3Plot <- ggplot(subset(df3Sample, Price %in% seq(20, 120, by=10))) + 
177 |   geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
178 |              size = 3, alpha = 1) + 
179 |   ggtitle("Quality vs. Purchase Probability at Various Prices") +
180 |   myScales +
181 |   scale_color_discrete("Price")
182 | v3Plot
183 | ```
184 | 
185 | ## Now quality matters more
186 | 
187 | ... across all price ranges (for the kinds of variation **that we see in the data**)
188 | 
189 | ```{r warning=FALSE}
190 | v3PlotWithCurves <- last_plot() + geom_line(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
191 |                                             data = linesDF,
192 |                                             size=.2)
193 | v3PlotWithCurves
194 | ```
195 | 
196 | ## Lessons from this example
197 | 
198 | 1. We want to interpret things on the scale we care about (probability in this case)
199 | 2. Relationships among the inputs matter
200 | 
201 | ## Goal is single-number summaries
202 | 
203 | These concepts are vague, but keep them in mind as we try to formalize things in the next few slides:
204 | 
205 | - **For each input, what is the average change in output per unit change in input?**
206 | (generalizes linear regression, units depend on units for input)
207 | 
208 | - **How important is each input in influencing the output?**
209 | (units should be consistent across inputs -- think of standardized regression coefficients)
210 | 
211 | ## Some notation
212 | 
213 | **$u$**: the variable under consideration
214 | 
215 | **$v$**: the vector of other variables (the "all else held equal")
216 | 
217 | **$f(u,v)$**: a function that makes predictions, e.g. maybe $f(u,v) = \mathcal{E}[y \mid u, v, \theta]$
218 | 
219 | * We consider transitions in $u$ holding $v$ constant, e.g. $$\frac{f(u_2, v) - f(u_1, v)}{u_2-u_1}$$
220 | * To get one-point summaries we'll take an average
221 | * All of the subtlety lies in the choice of $v$, $u_1$, $u_2$
222 | 
223 | 
224 | ## What average do we take?
225 | 
226 | The APC is defined as
227 | 
228 | $$\frac{\mathcal{E}[\Delta_f]}{\mathcal{E}[\Delta_u]}$$
229 | 
230 | where
231 | 
232 | * $\Delta_f = f(u_2,v) - f(u_1,v)$
233 | * $\Delta_u = u_2 - u_1$
234 | * $\mathcal{E}$ is expectation under the following process:
235 | 
236 | 1. sample $v$ from the (marginal) distribution of the corresponding inputs
237 | 2. sample $u_1$ and $u_2$ independently from the distribution of $u$ conditional on $v$
238 | 
239 | 
240 | ## Variations
241 | 
242 | - "Impact" (my idea; help me with naming!) is just the expected value of $\Delta_f = f(u_2,v) - f(u_1,v)$
243 | - Absolute versions (of both impact and APC) use $\mathcal{E}[|\Delta_f|]$ and $\mathcal{E}[|\Delta_u|]$
244 | 
245 | ## Computation
246 | 
247 | - Once we've said what we want, computing/estimating it isn't trivial
248 | - More on this in the discussion (depending on time/interest)
249 | 
250 | ## Returning to the wines...
251 | 
252 | ```{r fig.height=4}
253 | apcComparisonPlot <- ggplot(subset(apcAllVariations, Input=="Quality")) +
254 |   geom_bar(aes(x=factor(Variation, levels=3:1), y=PerUnitInput.Signed), stat="identity", width=.5) + 
255 |   expand_limits(y=0) +
256 |   xlab("Variation") + 
257 |   ggtitle("APC for Quality across Variations") +
258 |   coord_flip()
259 | apcComparisonPlot
260 | grid.arrange(v1Plot + ggtitle("V1"), v2Plot + ggtitle("V2"), v3Plot + ggtitle("V3"), nrow=1)
261 | ```
262 | 
263 | Exercise for the reader: Make an example where APC is larger than in Variation 1 but "Impact" is much smaller.
264 | 
265 | ## Credit Scoring Example
266 | 
267 | ```{r}
268 | load(file="../examples/loan-defaults.RData")
269 | ```
270 | 
271 | - **SeriousDlqin2yrs** (target variable, 7% "yes"):  Person experienced 90 days past due delinquency or worse 
272 | - **RevolvingUtilizationOfUnsecuredLines**:  Total balance on credit cards and personal lines of credit except real estate and no installment debt like car loans divided by the sum of credit limits
273 | - **age**:	Age of borrower in years
274 | - **NumberOfTime30-59DaysPastDueNotWorse**:	Number of times borrower has been 30-59 days past due but no worse in the last 2 years.
275 | - **NumberOfTime60-89DaysPastDueNotWorse**:	Number of times borrower has been 60-89 days past due but no worse in the last 2 years.
276 | - **NumberOfTimes90DaysLate**:	Number of times borrower has been 90 days or more past due.
277 | - **DebtRatio**:	Monthly debt payments, alimony,living costs divided by monthy gross income
278 | - **MonthlyIncome**:	Monthly income
279 | - **NumberOfOpenCreditLinesAndLoans**:	Number of Open loans (installment like car loan or mortgage) and Lines of credit (e.g. credit cards)
280 | - **NumberRealEstateLoansOrLines**:	Number of mortgage and real estate loans including home equity lines of credit
281 | - **NumberOfDependents**:	Number of dependents in family excluding themselves (spouse, children etc.)
282 | 
283 | 
284 | ## Input Distribution
285 | 
286 | ```{r fig.height = 12, echo=FALSE}
287 | allHistograms
288 | ```
289 | 
290 | Note: previous lateness (esp. 90+) days is rare.
291 | 
292 | ## Model Building
293 | 
294 | We'll use a random forest for this example:
295 | 
296 | ```{r eval=FALSE, echo=TRUE}
297 | set.seed(1)
298 | # Turning the response to type "factor" causes the RF to be build for classification:
299 | credit$SeriousDlqin2yrs <- factor(credit$SeriousDlqin2yrs) 
300 | rfFit <- randomForest(SeriousDlqin2yrs ~ ., data=credit, ntree=ntree)
301 | ```
302 | 
303 | ## Aggregate Predictive Comparisons
304 | 
305 | ```{r eval=FALSE, echo=TRUE}
306 | set.seed(1)
307 | apcDF <- GetPredCompsDF(rfFit, credit,
308 |                         numForTransitionStart = numForTransitionStart,
309 |                         numForTransitionEnd = numForTransitionEnd,
310 |                         onlyIncludeNearestN = onlyIncludeNearestN)
311 | ```
312 | 
313 | ```{r}
314 | kable(apcDF, row.names=FALSE)
315 | ```
316 | 
317 | ## Impact Plot
318 | 
319 | (Showing +/- the absolute impact, since signed impact is bounded between those numbers)
320 | 
321 | (Showing impact rather than APC b/c the different APC units wouldn't be comparable, shouldn't go on one chart)
322 | 
323 | ```{r LoanDefaultImpact}
324 | PlotPredCompsDF(apcDF)
325 | ```
326 | 
327 | Summaries like this can guide questions that push is to dig deeper, like: 
328 | 
329 | - What's going on with age? (To get the cancellation we see, it must have strong effects that vary in sign)
330 | - Why don't instances of previous lateness always increase your probability of a 90-days-past-due incident? (cancellation?)
331 | 
332 | 
333 | ## Goals for sensitivity analysis
334 | 
335 | - want something that properly accounts for relationships among variables of interest
336 | - want something that emphasizes the values that are plausible given the other values (two reasons: this is what we care about, and this is what our model has better estimates of)
337 | 
338 | 
339 | ```{r warning=FALSE}
340 | v3PlotWithCurves + ggtitle("Wines: Variation 3")
341 | ```
342 | 
343 | ## How we'll do sensitivity analysis
344 | 
345 | - choose a few random values for $v$ (the "all else held equal")
346 | - sample from $u$ conditional on $v$
347 | - plot $u$ vs. the prediction for each $v$
348 | 
349 | ```{r}
350 | ggplot(pairsSummarizedAge, aes(x=age.B, y=yHat2, color=factor(OriginalRowNumber))) + 
351 |   geom_point(aes(size = Weight)) +
352 |   geom_line(size=.2) +
353 |   xlab("age") +
354 |   ylab("Prediction") + 
355 |   guides(color = FALSE)
356 | ```
357 | 
358 | ## Zooming in...
359 | 
360 | ```{r}
361 | ggplot(subset(pairsSummarizedAge, OriginalRowNumber <= 8), 
362 |        aes(x=age.B, y=yHat2, color=factor(OriginalRowNumber))) + 
363 |   geom_point(aes(size = Weight)) +
364 |   geom_line(size=.2) +
365 |   xlab("age") +
366 |   ylab("Prediction") + 
367 |   guides(color = FALSE) + 
368 |   coord_cartesian(ylim=(c(-.01,.2)))
369 | ```
370 | 
371 | - shows off some weird behavior of the model!
372 | - we should dig deeper, get more comfortable with the model
373 | - try other models
374 | 
375 | 
376 | ## Sensitivity: Number of Time 30-35 Days Past Due
377 | 
378 | - Mostly we see the increasing probability that we'd expect...
379 | 
380 | ```{r, warning=FALSE}
381 | ggplot(pairsSummarized, aes(x=NumberOfTime30.59DaysPastDueNotWorse.B, y=yHat2, color=factor(OriginalRowNumber))) + 
382 |   geom_point(aes(size = Weight)) +
383 |   geom_line(size=.2) +
384 |   scale_x_continuous(limits=c(0,2)) +
385 |   scale_size_area() + 
386 |   xlab("NumberOfTime30.59DaysPastDueNotWorse") +
387 |   ylab("Prediction") + 
388 |   guides(color = FALSE)
389 | ```
390 | 
391 | ## Sensitivity: Number of Time 30-35 Days Past Due
392 | 
393 | ... but in one case, probability of default *decreases* with the 0-to-1 transition
394 | 
395 | ```{r echo=FALSE, warning=FALSE}
396 | ggplot(pairsSummarized, aes(x=NumberOfTime30.59DaysPastDueNotWorse.B, y=yHat2, color=factor(OriginalRowNumber))) + 
397 |   geom_point(aes(size = Weight)) +
398 |   geom_line(aes(alpha=ifelse(OriginalRowNumber == 18, 1, .3))) +
399 |   scale_x_continuous(limits=c(0,2)) +
400 |   scale_alpha_identity() +
401 |   scale_size_area() + 
402 |   xlab("NumberOfTime30.59DaysPastDueNotWorse") +
403 |   ylab("Prediction") + 
404 |   guides(color = FALSE)
405 | ```
406 | 
407 | ## Can we explain it?
408 | 
409 | ```{r}
410 | oneRowWithDecreasingDefaultProbability <- oneOriginalRowNumber[1,intersect(names(oneOriginalRowNumber), names(credit))]
411 | kable(oneRowWithDecreasingDefaultProbability[,1:5])
412 | kable(oneRowWithDecreasingDefaultProbability[,6:8])
413 | kable(oneRowWithDecreasingDefaultProbability[,9:10])
414 | ```
415 | 
416 | ## Alternative Approach to Sensitivity Analysis: "Partial Plots"
417 | 
418 | - e.g. `partialPlot` function in `randomForest` library in R
419 | 
420 | 
421 | This accounts for relationships among the "all else held equal" but **not** between those and the input under consideration
422 | 
423 | Make predictions on a new data set constructed as follows:
424 | 
425 | ```{r}
426 | oneRow <- data.frame(v1=1, v2="a", v3=2.3, u=6)
427 | ```
428 | 
429 | One row:
430 | 
431 | ```{r results='markup'}
432 | kable(oneRow)
433 | ```
434 | 
435 | Repeat the row varying $u$ across its whole range:
436 | 
437 | ```{r results='markup'}
438 | oneRowRep <- oneRow[rep(1,20),]
439 | oneRowRep$u <- 1:20
440 | kable(oneRowRep)
441 | ```
442 | 
443 | - Then average predictions grouping by $u$ (or not)
444 | 
445 | ## Partial Plot Method Applied to Wines
446 | 
447 | ```{r}
448 | v3Plot + geom_point(aes(x=Quality, y=PurchaseProbability), stat="summary", fun.y="mean", data=linesDF) +
449 |   geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 
450 |                         data = linesDF,
451 |                         size=1) +
452 |   ggtitle("(V3)")
453 | ```
454 | 
455 | 
456 | 
457 | ## Comparison with other approaches
458 | 
459 | Things that vary
460 | 
461 | - Units - depend on input or consistent across inputs
462 | - Sensitive to univariate distribution of inputs
463 | - Sensitive to dependence between inputs
464 | - Shows shape of non-linearity
465 | - Signed
466 | - Model
467 | - Based on holdout performance
468 | 
469 | 
470 | ## Computing the APC
471 | 
472 | - sampling a $v$ is easy (take a row from your data)
473 | - sampling $u_1$ given $v$ is easy (take the $u$ in that row)
474 | - sampling *another* $u$ ($u_2$) is harder
475 | - would be easy if you had a lot of data with the same $v$
476 | - so take $u$'s from rows of data where the correspinding $v$ is close to your $v$
477 | 
478 | ## In Practice
479 | 
480 | - Gelman & Pardoe look at all pairs of rows and assign weights $$\frac{1}{1 + d}$$
481 | - I think this was giving too much weight to faraway points (volume of $n$-spheres grows much faster than linearly in $d$)
482 | - I use Gelman's weights, but only looking at a fixed number of the closest points
483 | - Computational advantage: fewer points to deal with
484 | - A few example 
485 | 
486 | ```{r}
487 | pairsOrdered <- pairs[order(pairs$OriginalRowNumber),]
488 | 
489 | for (i in 1:20) {
490 |   cat("\n\n")
491 |   kable(head(subset(pairsOrdered, OriginalRowNumber==i)[c("OriginalRowNumber", "age", "DebtRatio", "MonthlyIncome", "NumberOfOpenCreditLinesAndLoans", "NumberOfTime30.59DaysPastDueNotWorse", "NumberOfTime30.59DaysPastDueNotWorse.B","yHat1","yHat2", "Weight")]))
492 | }
493 | 
494 | ```
495 | 
496 | 


--------------------------------------------------------------------------------
/notes/template:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | ---
4 | 
5 | #!html_output#
6 | 


--------------------------------------------------------------------------------
/tests/test-apc.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(predcomps)
 3 | 
 4 | test_that("APC matches coefficient exactly for linear model", {
 5 |   
 6 |   df <- data.frame(X = rep(c(1,2),2),
 7 |                    Y = c(1,2,3,4))
 8 |   
 9 |   predictionFunction <- function(df) 2*df$X + 3*df$Y
10 |   result <- GetSingleInputApcs(predictionFunction, df, u="X", v="Y")
11 |   expect_that(result$PerUnitInput.Signed, equals(2)) 
12 | })
13 | 


--------------------------------------------------------------------------------
/tests/test-pairs.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(predcomps)
 3 | 
 4 | MakeComparable <- function(df) {
 5 |   df <- round(df, digits = 5)
 6 |   return(df[do.call(order, df), ])
 7 | }
 8 | 
 9 | test_that("GetPairs works right in a small example", {
10 |   
11 |   df <- data.frame(X = rep(c(1,2),2),
12 |                    Y = rep(c(2,4),2))
13 |   pairsActual <- GetPairs(df, "X", "Y")
14 |   pairsExpected <- data.frame(OriginalRowNumber = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L), 
15 |                               X = c(1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2), 
16 |                               Y = c(2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4), 
17 |                               X.B = c(2, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1), 
18 |                               Weight = c(0.166666666666667, 0.666666666666667, 0.166666666666667, 0.166666666666667, 0.166666666666667, 0.666666666666667, 0.666666666666667, 0.166666666666667, 0.166666666666667, 0.166666666666667, 0.666666666666667, 0.166666666666667))
19 |   pairsActual <- pairsActual[names(pairsExpected)]
20 |   expect_that(all.equal(MakeComparable(pairsActual), MakeComparable(pairsExpected)), is_true()) 
21 | })
22 | 


--------------------------------------------------------------------------------