├── .gitignore ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── R ├── average-comparisons.R ├── get-input-vars-from-model.R ├── multiple-comparisons.R └── pairs.R ├── README.md ├── _site ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── R │ ├── average-comparisons.R │ ├── get-input-vars-from-model.R │ ├── multiple-comparisons.R │ ├── pairs.R │ └── transition-plots.R ├── README.md ├── man │ ├── ArrowPlot.Rd │ ├── AverageSpecifiedComparison.Rd │ ├── ComputeAPCFromPairs.Rd │ ├── GetAPC.Rd │ ├── GetAPCWithAbsolute.Rd │ ├── GetApcDF.Rd │ ├── GetComparisonDFFromPairs.Rd │ ├── GetPairs.Rd │ ├── GetPredCompsDF.Rd │ ├── GetPredCompsDFFromPairs.Rd │ ├── GetSingleInputPredComps.Rd │ ├── PlotApcDF.Rd │ ├── PlotApcDF2.Rd │ ├── PlotPairCumulativeWeights.Rd │ ├── PlotPredCompsDF.Rd │ ├── TransitionPlot.Rd │ ├── average_specified_comparison.Rd │ ├── compute_apc_from_pairs.Rd │ ├── get_apc.Rd │ ├── get_apc_with_absolute.Rd │ ├── get_pairs.Rd │ ├── mahal.Rd │ └── resample_from_pairs.Rd ├── notes │ ├── apc.Rmd │ ├── examples │ │ ├── diamonds.Rmd │ │ ├── loan-defaults.Rmd │ │ ├── logistic-regression.Rmd │ │ ├── overview.Rmd │ │ ├── simulated-linear-model-interactions.Rmd │ │ └── wine-logistic-regression.Rmd │ ├── impact.Rmd │ ├── index.Rmd │ ├── more │ │ ├── compared-with-paper.Rmd │ │ ├── future-work.Rmd │ │ ├── large-N-limit.Rmd │ │ └── renormalize-weights.Rmd │ ├── template │ └── transition-plots.Rmd ├── temp │ └── weights-exploration.R └── tests │ ├── run-all.R │ ├── test-apc.R │ └── test-pairs.R ├── man ├── ComputeAPCFromPairs.Rd ├── GetComparisonDFFromPairs.Rd ├── GetPairs.Rd ├── GetPredCompsDF.Rd ├── GetSingleInputApcs.Rd ├── GetSingleInputPredComps.Rd ├── PlotPairCumulativeWeights.Rd └── PlotPredCompsDF.Rd ├── notes ├── apc.Rmd ├── examples │ ├── diamonds.Rmd │ ├── loan-defaults.Rmd │ ├── logistic-regression.Rmd │ ├── overview.Rmd │ ├── simulated-linear-model-interactions.Rmd │ └── wine-logistic-regression.Rmd ├── impact.Rmd ├── index.Rmd ├── more │ ├── compared-with-paper.Rmd │ ├── future-work.Rmd │ ├── large-N-limit.Rmd │ └── pairs-and-weights.Rmd ├── presentations │ ├── AlpineLunch.Rmd │ ├── Bar.Rmd │ └── Lunch.Rmd └── template └── tests ├── test-apc.R └── test-pairs.R /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .Rhistory 3 | main.R 4 | *figure/ 5 | ..Rcheck 6 | *.RData -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: predcomps 2 | Title: Average Predictive Comparisons 3 | Description: An implementation of Gelman/Pardoe's average predictive comparisons 4 | Version: 0.1 5 | Author: David Chudzicki 6 | Maintainer: David Chudzicki 7 | Depends: 8 | R (>= 3.0.2), 9 | ggplot2, 10 | gridExtra, 11 | dplyr, 12 | reshape2, 13 | assertthat 14 | License: MIT 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 David J Chudzicki 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | S3method(GetComparisonDFFromPairs,"function") 2 | S3method(GetComparisonDFFromPairs,glm) 3 | S3method(GetComparisonDFFromPairs,lm) 4 | S3method(GetComparisonDFFromPairs,randomForest) 5 | export(ComputeApcFromPairs) 6 | export(GetComparisonDF) 7 | export(GetComparisonDFFromPairs) 8 | export(GetPairs) 9 | export(GetPredCompsDF) 10 | export(GetSingleInputApcs) 11 | export(PlotPairCumulativeWeights) 12 | export(PlotPredCompsDF) 13 | -------------------------------------------------------------------------------- /R/average-comparisons.R: -------------------------------------------------------------------------------- 1 | #' GetSingleInputApcs 2 | #' 3 | #' makes predictive comparison summaries (both per unit input and impact, both absolute and signed) by forming an data frame of pairs with appropriate weights and then calling `ComputeApcFromPairs`. 4 | #' Only works fore continuous inputs right now 5 | #' 6 | #' @param predictionFunction this could be a function (which takes data frame and makes returns a vector of predictions) or an object of class `lm`, `glm`, or `randomForest` 7 | #' @param X a data frame with all inputs 8 | #' @param u a string naming the input of interest 9 | #' @param v a string naming the other inputs 10 | #' @param ... other arguments to be passed to `GetPairs` 11 | #' @return a list with: \code{signed} (the usual Apc) and \code{absolute} (Apc applied to the absolute value of the differences) 12 | #' 13 | #' @export 14 | #' 15 | #' @examples 16 | #' n <- 200 17 | #' x1 <- runif(n = n, min = 0, max = 1) 18 | #' x2 <- runif(n = n, min = 0, max = 1) 19 | #' x3 <- runif(n = n, min = 0, max = 10) 20 | #' y <- 2 * x1 + (-2) * x2 + 1 * x3 + rnorm(n, sd = 0.1) 21 | #' df <- data.frame(x1, x2, x3, y) 22 | #' fittedLm <- lm(y ~ ., data = df) 23 | #' fittedLm 24 | #' GetSingleInputApcs(fittedLm, df, "x2", c("x1", "x3")) 25 | GetSingleInputApcs <- function(predictionFunction, X, u, v, ...) { 26 | pairs <- GetPairs(X, u, v, ...) 27 | return( 28 | list(PerUnitInput.Signed = ComputeApcFromPairs(predictionFunction, pairs, u, v), 29 | PerUnitInput.Absolute = ComputeApcFromPairs(predictionFunction, pairs, u, v, absolute=TRUE), 30 | Impact.Signed = ComputeApcFromPairs(predictionFunction, pairs, u, v, impact=TRUE), 31 | Impact.Absolute = ComputeApcFromPairs(predictionFunction, pairs, u, v, absolute=TRUE, impact=TRUE)) 32 | ) 33 | } 34 | 35 | 36 | #' ComputeApcFromPairs 37 | #' 38 | #' @export 39 | ComputeApcFromPairs <- function(predictionFunction, pairs, u, v, absolute=FALSE, impact=FALSE) { 40 | uNew <- paste(u,".B",sep="") 41 | ComparisonDF <- GetComparisonDFFromPairs(predictionFunction, pairs, u, v) 42 | absoluteOrIdentity <- if (absolute) abs else identity 43 | uDiff <- ComparisonDF[[uNew]] - ComparisonDF[[u]] 44 | denom <- if (impact) sum(ComparisonDF$Weight) else sum(ComparisonDF$Weight * uDiff * sign(uDiff)) 45 | Apc <- sum(absoluteOrIdentity(ComparisonDF$Weight * (ComparisonDF$yHat2 - ComparisonDF$yHat1) * sign(uDiff))) / denom 46 | return(Apc) 47 | } 48 | 49 | 50 | #' GetComparisonDFFromPairs 51 | #' 52 | #' (abstracted this into a separate function from \code{GetApc} so we can more easily do things 53 | #' like \code{GetApcWithAbsolute}) 54 | #' @export 55 | GetComparisonDFFromPairs <- function(predictionFunction, pairs, u, v) UseMethod("GetComparisonDFFromPairs") 56 | 57 | 58 | 59 | # Two methods: 60 | # one for predictionFunction (df |--> predictions) 61 | # another for a glm object 62 | 63 | #' @export 64 | GetComparisonDFFromPairs.function <- function(predictionFunction, pairs, u, v) { 65 | uNew <- paste(u,".B",sep="") 66 | pairs$yHat1 <- predictionFunction(pairs) 67 | pairsNew <- structure(pairs[,c(v,uNew)], names=c(v,u)) #renaming u in pairsNew so we can call predictionFunction 68 | pairs$yHat2 <- predictionFunction(pairsNew) 69 | return(pairs) 70 | } 71 | 72 | #' @export 73 | GetComparisonDFFromPairs.glm <- function(glmFit, pairs, u, v) { 74 | predictionFunction <- function(df) predict.glm(glmFit, newdata=df, type="response") 75 | return( 76 | GetComparisonDFFromPairs.function(predictionFunction, pairs, u, v) 77 | ) 78 | } 79 | 80 | #' @export 81 | GetComparisonDFFromPairs.lm <- function(lmFit, pairs, u, v) { 82 | predictionFunction <- function(df) predict.glm(lmFit, newdata=df) 83 | return( 84 | GetComparisonDFFromPairs.function(predictionFunction, pairs, u, v) 85 | ) 86 | } 87 | 88 | #' @export 89 | GetComparisonDFFromPairs.randomForest <- function(rfFit, pairs, u, v) { 90 | # For classification, we need to specify that the predictions should be probabilties (not classes) 91 | if (rfFit$type == "classification") { 92 | if (length(rfFit$classes) > 2) { 93 | stop("Sorry, I don't know what to do when there are more than 2 classes.") 94 | } 95 | predictionFunction <- function(df) predict(rfFit, newdata=df, type="prob")[,2] 96 | } else 97 | { 98 | predictionFunction <- function(df) predict(rfFit, newdata=df) 99 | } 100 | return( 101 | GetComparisonDFFromPairs.function(predictionFunction, pairs, u, v) 102 | ) 103 | } 104 | 105 | #' @export 106 | GetComparisonDFFromPairs.gbm <- function(rfFit, pairs, u, v) { 107 | # For classification, we need to specify that the predictions should be probabilties (not classes) 108 | if (rfFit$type == "classification") { 109 | if (length(rfFit$classes) > 2) { 110 | stop("Sorry, I don't know what to do when there are more than 2 classes.") 111 | } 112 | predictionFunction <- function(df) predict(rfFit, newdata=df, type="prob")[,2] 113 | } else 114 | { 115 | predictionFunction <- function(df) predict(rfFit, newdata=df) 116 | } 117 | return( 118 | GetComparisonDFFromPairs.function(predictionFunction, pairs, u, v) 119 | ) 120 | } 121 | 122 | 123 | #' @export 124 | GetComparisonDF <- function(predictionFunction, df, u, v=NULL, ...) { 125 | if (is.null(v)) { 126 | inputNames <- GetInputVarsFromModel(predictionFunction) 127 | v <- setdiff(inputNames, u) 128 | } 129 | pairs <- GetPairs(df, u, v, ...) 130 | ComparisonDF <- GetComparisonDFFromPairs(predictionFunction, pairs, u, v) 131 | } 132 | -------------------------------------------------------------------------------- /R/get-input-vars-from-model.R: -------------------------------------------------------------------------------- 1 | GetInputVarsFromModel <- function(model) UseMethod("GetInputVarsFromModel") 2 | 3 | GetInputVarsFromModel.lm <- function(model) { 4 | # This seems like a robust way to get the input names from an lm object, but I'm not really sure 5 | return(attr(model$terms, "term.labels")) 6 | } 7 | 8 | GetInputVarsFromModel.glm <- function(model) { 9 | # This seems like a robust way to get the input names from an glm object, but I'm not really sure 10 | return(attr(model$terms, "term.labels")) 11 | } 12 | 13 | GetInputVarsFromModel.randomForest <- function(model) { 14 | # This seems like a robust way to get the input names from an glm object, but I'm not really sure 15 | return(attr(model$terms, "term.labels")) 16 | } 17 | -------------------------------------------------------------------------------- /R/multiple-comparisons.R: -------------------------------------------------------------------------------- 1 | #' GetApcDF 2 | #' 3 | #' makes average predictive comparison for all specified inputs 4 | #' 5 | #' @param model Either a function (from a data frame to vector of predictions) or a model we know how to deal with (lm, glm) 6 | #' @param df data frame with data 7 | #' @param inputVars inputs to the model 8 | #' @param ... extra parguments passed to GetPairs used to control Weight function 9 | #' @export 10 | #' @examples 11 | #' n <- 200 12 | #' x1 <- runif(n = n, min = 0, max = 1) 13 | #' x2 <- runif(n = n, min = 0, max = 1) 14 | #' x3 <- runif(n = n, min = 0, max = 10) 15 | #' y <- 2 * x1 + (-2) * x2 + 1 * x3 + rnorm(n, sd = 0.1) 16 | #' df <- data.frame(x1, x2, x3, y) 17 | #' fittedLm <- lm(y ~ ., data = df) 18 | #' apcDF <- GetPredCompsDF(fittedLm, df = df) 19 | #' apcDF 20 | GetPredCompsDF <- function(model, df, inputVars = NULL, ...) { 21 | 22 | # If inputVars is null, we can try to get the list of inputs from the model: 23 | if (is.null(inputVars)) { 24 | inputVars <- GetInputVarsFromModel(model) 25 | } 26 | apcList <- Map(function(currentVar) { 27 | cat(paste("Working on:", currentVar, "\n")) 28 | data.frame(Input = currentVar, 29 | GetSingleInputApcs(model, 30 | df, 31 | currentVar, 32 | c(setdiff(inputVars, currentVar)), 33 | ...))}, 34 | inputVars 35 | ) 36 | apcDF <- do.call(rbind, apcList) 37 | return(apcDF) 38 | } 39 | 40 | #' PlotApcDF 41 | #' 42 | #' plots the output of GetApcDF -- this is my preferred display for now 43 | #' 44 | #' @param apcDF the output of GetApcDF 45 | #' @export 46 | #' @examples 47 | #' n <- 200 48 | #' x1 <- runif(n = n, min = 0, max = 1) 49 | #' x2 <- runif(n = n, min = 0, max = 1) 50 | #' x3 <- runif(n = n, min = 0, max = 10) 51 | #' y <- 2 * x1 + (-2) * x2 + 1 * x3 + rnorm(n, sd = 0.1) 52 | #' df <- data.frame(x1, x2, x3, y) 53 | #' fittedLm <- lm(y ~ ., data = df) 54 | #' apcDF <- GetPredCompsDF(fittedLm, df = df) 55 | #' PlotPredCompsDF(apcDF, variant = "PerUnitInput") + theme_gray(base_size = 18) 56 | PlotPredCompsDF <- function(apcDF, variant="Impact") { 57 | xLabel <- switch(variant, 58 | "Impact"="Avg Change in Output", 59 | "PerUnitInput"="Avg Change in Output Per Unit Input", 60 | stop("Unknown Predcomps Variant")) 61 | 62 | apcDF <- apcDF[c("Input", grep(paste0("^",variant,"\\."), names(apcDF), value=TRUE))] 63 | names(apcDF) <- gsub(paste0("^",variant,"\\."), "", names(apcDF)) 64 | 65 | maxAPC <- max(abs(apcDF$Absolute)) 66 | apcDF$Input <- reorder(apcDF$Input, apcDF$Absolute) 67 | longAPCs <- melt(apcDF, id="Input", value.name = "Value", variable.name = "Type") 68 | longAPCs2 <- rbind( 69 | longAPCs, 70 | transform(subset(longAPCs, Type=="Absolute"), Value=-Value) 71 | ) 72 | longAPCs2 <- longAPCs2[order(factor(longAPCs2$Type, levels=c("Absolute", "Signed"))), ] 73 | 74 | return( 75 | ggplot(longAPCs2) + 76 | geom_point(aes(y = Input, x=Value, color=Type, shape=Type, size=Type)) + 77 | scale_x_continuous(limits=c(-maxAPC, maxAPC)) + 78 | scale_size_discrete(range=c(3,4)) + 79 | ggtitle(variant) + 80 | geom_vline(aes(xintercept=0), alpha=.5) + 81 | xlab(xLabel) 82 | ) 83 | } 84 | -------------------------------------------------------------------------------- /R/pairs.R: -------------------------------------------------------------------------------- 1 | #' GetPairs 2 | #' 3 | #' Form all pairs of rows in \code{X} and compute Mahalanobis distances based on \code{v}. 4 | #' 5 | #' To help with computational constraints, you have the option to not form pairs between all rows of \code{X} but instead of specify a certain number (\code{numForTransitionStart}) to randomly be selected as rows from which transitions start, and another number (\code{numForTransitionEnd}) to be randomly selected as where transitions end. We then form all pairs between transition-start rows and transition-end rows. 6 | #' 7 | #' In order to get a smaller data frame for later manipulations (and maybe just because it's a good idea), you can also specify \code{onlyIncludeNearestN}, in which case we return only the nearest \code{onlyIncludeNearestN} transition ends for each transition start (instead of all pairs). 8 | #' 9 | #' @param X data frame 10 | #' @param u input of interest 11 | #' @param v other inputs 12 | #' @param mahalanobisConstantTerm Weights are (1 / (mahalanobisConstantTerm + Mahalanobis distance)) 13 | #' @param numForTransitionStart number of rows to use as the start points of transitions (defaulting to `NULL`, we use all rows) 14 | #' @param numForTransitionEnd number of rows to use as potential end points of transitions (defaulting to `NULL`, we use all rows) 15 | #' @param onlyIncludeNearestN for each transition start, we only include as transition end points the nearest `onlyIncludeNearestN` rows (defaulting to `NULL`, we use all rows) 16 | #' @return a data frame with the inputs \code{v} from the first of each pair, \code{u} from each half (with ".B" appended to the second), and the Mahalanobis distances between the pairs. 17 | #' @export 18 | #' @examples 19 | #' v <- rnorm(100) 20 | #' u <- v + 0.3*rnorm(100) 21 | #' qplot(v,u) 22 | #' X = data.frame(v=v,u=u) 23 | #' pairsDF <- GetPairs(X, "v", "u") 24 | #' pairsDFRow1 <- subset(pairsDF, OriginalRowNumber==1) 25 | #' # When we subset to one "original row number", all of the v's are the same: 26 | #' print(pairsDFRow1$v) 27 | #' # ... and u's corresponding to closer v.B (the v in the second element of the pair) have higher weight: 28 | #' qplot(u.B, Weight, data=pairsDFRow1) 29 | GetPairs <- function(X, u, v, 30 | numForTransitionStart = NULL, 31 | numForTransitionEnd = NULL, 32 | onlyIncludeNearestN = NULL, 33 | mahalanobisConstantTerm=1) { 34 | 35 | assert_that(length(u) == 1) # make sure we have exactly 1 input var of interest 36 | for (columnName in c(u,v)) { 37 | assert_that(columnName %in% names(X)) 38 | columnClass <- class(X[[columnName]]) 39 | if (!(columnClass) %in% c("integer", "numeric")) { 40 | stop(sprintf("Sorry, column %s is of class %s. I can only deal with integer and numeric types for now.", columnName, columnClass)) 41 | } 42 | } 43 | 44 | if (!is.null(numForTransitionStart)) { 45 | X1 <- X[sample.int(nrow(X), size=numForTransitionStart), c(v,u)] 46 | } else { 47 | X1 <- X[c(v,u)] 48 | } 49 | 50 | if (!is.null(numForTransitionEnd)) { 51 | X2 <- X[sample.int(nrow(X), size=numForTransitionEnd), c(v,u)] 52 | } else { 53 | X2 <- X[c(v,u)] 54 | } 55 | 56 | X1$OriginalRowNumber <- 1:nrow(X1) 57 | X2$OriginalRowNumber.B <- 1:nrow(X2) 58 | 59 | vMatrix1 <- as.matrix(X1[,v]) 60 | vMatrix2 <- as.matrix(X2[,v]) 61 | 62 | 63 | covV=cov(vMatrix2) 64 | 65 | distMatrix <- apply(vMatrix1, 1, function(row) mahalanobis(vMatrix2, row, covV)) 66 | dim(distMatrix) 67 | 68 | colnames(distMatrix) <- 1:ncol(distMatrix) 69 | rownames(distMatrix) <- 1:nrow(distMatrix) 70 | distDF <- as.data.frame(as.table(distMatrix)) 71 | names(distDF) <- c("OriginalRowNumber.B", "OriginalRowNumber", "MahalanobisDistance") 72 | 73 | if (!is.null(onlyIncludeNearestN)) { 74 | distDF <- distDF %>% 75 | group_by(OriginalRowNumber) %>% 76 | filter(rank(MahalanobisDistance, ties.method="random") < onlyIncludeNearestN) 77 | } 78 | 79 | pairs <- merge(X1, distDF, by = "OriginalRowNumber") 80 | pairs <- merge(X2, pairs, by = "OriginalRowNumber.B", suffixes = c(".B", "")) 81 | pairs$Weight <- 1/(mahalanobisConstantTerm + pairs$MahalanobisDistance) 82 | 83 | # If we haven't sampled, then OriginalRowNumber == OriginalRowNumber.B means that 84 | # the transition start and end are the same, so we should remove those rows. 85 | if (is.null(numForTransitionStart) && is.null(numForTransitionEnd)) { 86 | pairs <- subset(pairs, OriginalRowNumber != OriginalRowNumber.B) 87 | } 88 | 89 | # Renormalize weights: 90 | pairs <- pairs %>% group_by(OriginalRowNumber) %>% mutate(Weight = Weight/sum(Weight)) 91 | 92 | return(data.frame(pairs)) 93 | } 94 | 95 | 96 | #' PlotPairCumulativeWeights 97 | #' 98 | #' For a sample of transition start rows, we plot rank of transition end (by increasing weight) vs. cumulative weight. This gives a sense of how much weight is going into the nearest points vs. further ones. 99 | #' 100 | #' @export 101 | #' @examples 102 | #' v <- rnorm(100) 103 | #' u <- v + 0.3*rnorm(100) 104 | #' X = data.frame(v=v,u=u) 105 | #' pairsDF <- GetPairs(X, "v", "u") 106 | #' pairsDFRow1 <- subset(pairsDF, OriginalRowNumber==1) 107 | #' # For most original rows, we get 75% of the weight in 50% of the pairs: 108 | #' PlotPairCumulativeWeights(pairsDF) 109 | 110 | PlotPairCumulativeWeights <- function(pairs, numOriginalRowNumbersToPlot = 20) { 111 | rowNumSample <- sample(unique(pairs$OriginalRowNumber))[1:numOriginalRowNumbersToPlot] 112 | pairsWithCumWeightSums <- pairs %>% 113 | group_by(OriginalRowNumber) %>% 114 | arrange(OriginalRowNumber, -Weight) %>% 115 | mutate(CumulativeWeight = cumsum(Weight), Rank = dense_rank(-Weight)) 116 | 117 | pairsSubset <- subset(pairsWithCumWeightSums, OriginalRowNumber %in% rowNumSample) 118 | 119 | ggplot() + 120 | geom_line(aes(x=Rank, y=CumulativeWeight, color=factor(OriginalRowNumber)), data = pairsSubset, alpha = .2) + 121 | geom_line(aes(x=Rank, y=CumulativeWeight), stat = "summary", fun.y = "median", data=pairsWithCumWeightSums) 122 | } 123 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [Documentation etc.](http://www.davidchudzicki.com/predcomps/) 2 | 3 | Interested in getting involved? Here are some ways to help: 4 | 5 | - Trying out the package in examples, write about the results. Does it help you better understand your complicated models? If not, what should be different? You can add examples to the documentation here via pull requests, or write in your own space. 6 | - Clarifying the language in the documentation - what can be made clearer? 7 | - Working on any of the "future work" below 8 | 9 | 10 | ## Future Work 11 | 12 | (Unsure about the priority of the rest, but the top one is definitely most important.) 13 | 14 | ### Explicit model for p(u|v) 15 | 16 | Currently [the way we assign weights](http://0.0.0.0:4000/more-pairs-and-weights.html) to sample for p(u|v) (roughly as described in the paper) requires a bit of hand-tweaking to work well in individual examples. It's also hard to generalize to categorical inputs. This may be the biggest barrier to widespread adoption. 17 | 18 | As an alternative, perhaps we can explicitly build a model for the desired conditional distribution, e.g. maybe by using something like [BART](https://github.com/kapelner/bartMachine). 19 | 20 | Todo: 21 | 22 | - implement 23 | - see how well it works 24 | 25 | 26 | ### Categorical inputs 27 | 28 | Once we've done "Explicit model for p(u|v)", allowing categorical inputs should be much easier, but there's still some thought required. 29 | 30 | ### Sensivity Analysis 31 | 32 | Some of the examples show how to do sensitivity analysis in the spirit of this package, but it'd be great to have that do it for you. 33 | 34 | ### "Variable Importance" 35 | 36 | Implement something like [conditional variable importance](http://www.biomedcentral.com/1471-2105/9/307) in the spirit of this package. (Like "permutation importance", but instead of taking a permutation, you'd sample from the conditional distribution p(u|v).) 37 | 38 | ### Other tools/methods for understanding complicated models 39 | 40 | I'd like to compile a list of other work in this direction, maybe comparing them with this. 41 | 42 | I should add a page discussing other methods people have used to get at somewhat the same idea. 43 | 44 | - [conditional variable importance](http://www.biomedcentral.com/1471-2105/9/307) - out of everything I've seen, conditional variable importance is the most similar in spirit to this package 45 | - randomForest package in R (partial plots, variable importance) 46 | - earth package in R (variable importance) 47 | 48 | 49 | -------------------------------------------------------------------------------- /_site/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: predcomps 2 | Title: Average Predictive Comparisons 3 | Description: An implementation of Gelman/Pardoe's average predictive comparisons 4 | Version: 0.1 5 | Author: David Chudzicki 6 | Maintainer: David Chudzicki 7 | VignetteBuilder: knitr 8 | Depends: 9 | R (>= 3.0.2), 10 | ggplot2, 11 | gridExtra, 12 | dplyr, 13 | reshape2, 14 | assertthat 15 | License: MIT 16 | -------------------------------------------------------------------------------- /_site/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 David J Chudzicki 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /_site/NAMESPACE: -------------------------------------------------------------------------------- 1 | S3method(GetComparisonDFFromPairs,"function") 2 | S3method(GetComparisonDFFromPairs,glm) 3 | S3method(GetComparisonDFFromPairs,lm) 4 | S3method(GetComparisonDFFromPairs,randomForest) 5 | export(ComputeApcFromPairs) 6 | export(GetComparisonDF) 7 | export(GetComparisonDFFromPairs) 8 | export(GetPairs) 9 | export(GetPredCompsDF) 10 | export(GetSingleInputPredComps) 11 | export(PlotPairCumulativeWeights) 12 | export(PlotPredCompsDF) 13 | export(TransitionPlot) 14 | -------------------------------------------------------------------------------- /_site/R/average-comparisons.R: -------------------------------------------------------------------------------- 1 | #' GetComparisonDF 2 | #' 3 | #' makes average predictive comparison (based on Gelman/Pardoe) by forming pairs with two versions of the input of interest and averaging the predictive difference using Weights. I think Weights should be an approximation of the density p(u1,u2|v) or something like that... I need to look back at this. At present, I believe this is probably implementing the version in the Gelman/Pardoe paper. 4 | #' returns a list with the Apc and the Apc applied to the absolute value of the prediction function 5 | #' Only works fore continuous inputs right now 6 | #' 7 | #' @param predictionFunction 8 | #' @param X 9 | #' @param u input of interest 10 | #' @param v other inputs 11 | #' @param k Weights are (1 / (k + Mahalanobis distance)) 12 | #' @return a list with: \code{signed} (the usual Apc) and \code{absolute} (Apc applied to the absolute value of the differences) 13 | #' @export 14 | GetSingleInputApcs <- function(predictionFunction, X, u, v, ...) { 15 | pairs <- GetPairs(X, u, v, ...) 16 | return( 17 | list(PerUnitInput.Signed = ComputeApcFromPairs(predictionFunction, pairs, u, v), 18 | Apc.Absolute = ComputeApcFromPairs(predictionFunction, pairs, u, v, absolute=TRUE), 19 | Impact.Signed = ComputeApcFromPairs(predictionFunction, pairs, u, v, impact=TRUE), 20 | Impact.Absolute = ComputeApcFromPairs(predictionFunction, pairs, u, v, absolute=TRUE, impact=TRUE)) 21 | ) 22 | } 23 | 24 | 25 | #' ComputeApcFromPairs 26 | #' 27 | #' @export 28 | ComputeApcFromPairs <- function(predictionFunction, pairs, u, v, absolute=FALSE, impact=FALSE) { 29 | uNew <- paste(u,".B",sep="") 30 | ComparisonDF <- GetComparisonDFFromPairs(predictionFunction, pairs, u, v) 31 | absoluteOrIdentity <- if (absolute) abs else identity 32 | uDiff <- ComparisonDF[[uNew]] - ComparisonDF[[u]] 33 | denom <- if (impact) sum(ComparisonDF$Weight) else sum(ComparisonDF$Weight * uDiff * sign(uDiff)) 34 | Apc <- sum(absoluteOrIdentity(ComparisonDF$Weight * (ComparisonDF$yHat2 - ComparisonDF$yHat1) * sign(uDiff))) / denom 35 | return(Apc) 36 | } 37 | 38 | 39 | #' GetComparisonDFFromPairs 40 | #' 41 | #' (abstracted this into a separate function from \code{GetApc} so we can more easily do things 42 | #' like \code{GetApcWithAbsolute}) 43 | #' @export 44 | GetComparisonDFFromPairs <- function(predictionFunction, pairs, u, v) UseMethod("GetComparisonDFFromPairs") 45 | 46 | 47 | 48 | # Two methods: 49 | # one for predictionFunction (df |--> predictions) 50 | # another for a glm object 51 | 52 | #' @export 53 | GetComparisonDFFromPairs.function <- function(predictionFunction, pairs, u, v) { 54 | uNew <- paste(u,".B",sep="") 55 | pairs$yHat1 <- predictionFunction(pairs) 56 | pairsNew <- structure(pairs[,c(v,uNew)], names=c(v,u)) #renaming u in pairsNew so we can call predictionFunction 57 | pairs$yHat2 <- predictionFunction(pairsNew) 58 | return(pairs) 59 | } 60 | 61 | #' @export 62 | GetComparisonDFFromPairs.glm <- function(glmFit, pairs, u, v) { 63 | predictionFunction <- function(df) predict.glm(glmFit, newdata=df, type="response") 64 | return( 65 | GetComparisonDFFromPairs.function(predictionFunction, pairs, u, v) 66 | ) 67 | } 68 | 69 | #' @export 70 | GetComparisonDFFromPairs.lm <- function(lmFit, pairs, u, v) { 71 | predictionFunction <- function(df) predict.glm(lmFit, newdata=df) 72 | return( 73 | GetComparisonDFFromPairs.function(predictionFunction, pairs, u, v) 74 | ) 75 | } 76 | 77 | #' @export 78 | GetComparisonDFFromPairs.randomForest <- function(rfFit, pairs, u, v) { 79 | # For classification, we need to specify that the predictions should be probabilties (not classes) 80 | if (rfFit$type == "classification") { 81 | if (length(rfFit$classes) > 2) { 82 | stop("Sorry, I don't know what to do when there are more than 2 classes.") 83 | } 84 | predictionFunction <- function(df) predict(rfFit, newdata=df, type="prob")[,2] 85 | } else 86 | { 87 | predictionFunction <- function(df) predict(rfFit, newdata=df) 88 | } 89 | return( 90 | GetComparisonDFFromPairs.function(predictionFunction, pairs, u, v) 91 | ) 92 | } 93 | 94 | 95 | #' @export 96 | GetComparisonDF <- function(predictionFunction, df, u, v=NULL, ...) { 97 | if (is.null(v)) { 98 | inputNames <- GetInputVarsFromModel(predictionFunction) 99 | v <- setdiff(inputNames, u) 100 | } 101 | pairs <- GetPairs(df, u, v, ...) 102 | ComparisonDF <- GetComparisonDFFromPairs(predictionFunction, pairs, u, v) 103 | } 104 | -------------------------------------------------------------------------------- /_site/R/get-input-vars-from-model.R: -------------------------------------------------------------------------------- 1 | GetInputVarsFromModel <- function(model) UseMethod("GetInputVarsFromModel") 2 | 3 | GetInputVarsFromModel.lm <- function(model) { 4 | # This seems like a robust way to get the input names from an lm object, but I'm not really sure 5 | return(attr(model$terms, "term.labels")) 6 | } 7 | 8 | GetInputVarsFromModel.glm <- function(model) { 9 | # This seems like a robust way to get the input names from an glm object, but I'm not really sure 10 | return(attr(model$terms, "term.labels")) 11 | } 12 | 13 | GetInputVarsFromModel.randomForest <- function(model) { 14 | # This seems like a robust way to get the input names from an glm object, but I'm not really sure 15 | return(attr(model$terms, "term.labels")) 16 | } 17 | -------------------------------------------------------------------------------- /_site/R/multiple-comparisons.R: -------------------------------------------------------------------------------- 1 | #' GetApcDF 2 | #' 3 | #' makes average predictive comparison for all specified inputs 4 | #' 5 | #' @param model Either a function (from a data frame to vector of predictions) or a model we know how to deal with (lm, glm) 6 | #' @param df data frame with data 7 | #' @param inputVars inputs to the model 8 | #' @param ... extra parguments passed to GetPairs used to control Weight function 9 | #' @export 10 | GetPredCompsDF <- function(model, df, inputVars = NULL, ...) { 11 | 12 | # If inputVars is null, we can try to get the list of inputs from the model: 13 | if (is.null(inputVars)) { 14 | inputVars <- GetInputVarsFromModel(model) 15 | } 16 | apcList <- Map(function(currentVar) { 17 | cat(paste("Working on:", currentVar, "\n")) 18 | data.frame(Input = currentVar, 19 | GetSingleInputApcs(model, 20 | df, 21 | currentVar, 22 | c(setdiff(inputVars, currentVar)), 23 | ...))}, 24 | inputVars 25 | ) 26 | apcDF <- do.call(rbind, apcList) 27 | return(apcDF) 28 | } 29 | 30 | #' PlotApcDF 31 | #' 32 | #' plots the output of GetApcDF -- this is my preferred display for now 33 | #' 34 | #' @param apcDF the output of GetApcDF 35 | #' @export 36 | PlotPredCompsDF <- function(apcDF, variant="Impact") { 37 | apcDF <- apcDF[c("Input", grep(paste0("^",variant,"\\."), names(apcDF), value=TRUE))] 38 | names(apcDF) <- gsub(paste0("^",variant,"\\."), "", names(apcDF)) 39 | 40 | maxAPC <- max(abs(apcDF$Absolute)) 41 | apcDF$Input <- reorder(apcDF$Input, apcDF$Absolute) 42 | longAPCs <- melt(apcDF, id="Input", value.name = "Value", variable.name = "Type") 43 | longAPCs2 <- rbind( 44 | longAPCs, 45 | transform(subset(longAPCs, Type=="Absolute"), Value=-Value) 46 | ) 47 | longAPCs2 <- longAPCs2[order(factor(longAPCs2$Type, levels=c("Absolute", "Signed"))), ] 48 | return( 49 | ggplot(longAPCs2) + 50 | geom_point(aes(y = Input, x=Value, color=Type, shape=Type, size=Type)) + 51 | scale_x_continuous(limits=c(-maxAPC, maxAPC)) + 52 | scale_size_discrete(range=c(3,4)) + 53 | ggtitle(variant) + 54 | geom_vline(aes(xintercept=0), alpha=.5) 55 | ) 56 | } 57 | -------------------------------------------------------------------------------- /_site/R/pairs.R: -------------------------------------------------------------------------------- 1 | #' GetPairs 2 | #' 3 | #' Form all pairs of rows in \code{X} and compute Mahalanobis distances based on \code{v}. 4 | #' 5 | #' To help with computational constraints, you have the option to not form pairs between all rows of \code{X} but instead of specify a certain number (\code{numForTransitionStart}) to randomly be selected as rows from which transitions start, and another number (\code{numForTransitionEnd}) to be randomly selected as where transitions end. We then form all pairs between transition-start rows and transition-end rows. 6 | #' 7 | #' In order to get a smaller data frame for later manipulations (and maybe just because it's a good idea), you can also specify \code{onlyIncludeNearestN}, in which case we return only the nearest \code{onlyIncludeNearestN} transition ends for each transition start (instead of all pairs). 8 | #' 9 | #' @param X data frame 10 | #' @param u input of interest 11 | #' @param v other inputs 12 | #' @param mahalanobisConstantTerm Weights are (1 / (mahalanobisConstantTerm + Mahalanobis distance)) 13 | #' @param numForTransitionStart 14 | #' @param numForTransitionEnd 15 | #' @return a data frame with the inputs \code{v} from the first of each pair, \code{u} from each half (with ".B" appended to the second), and the Mahalanobis distances between the pairs. 16 | #' @examples 17 | #' # Should put unit test example here 18 | #' 19 | #' @export 20 | GetPairs <- function(X, u, v, 21 | numForTransitionStart = NULL, 22 | numForTransitionEnd = NULL, 23 | onlyIncludeNearestN = NULL, 24 | mahalanobisConstantTerm=1) { 25 | 26 | assert_that(length(u) == 1) # make sure we have exactly 1 input var of interest 27 | 28 | if (!is.null(numForTransitionStart)) { 29 | X1 <- X[sample.int(nrow(X), size=numForTransitionStart), c(v,u)] 30 | } else { 31 | X1 <- X[c(v,u)] 32 | } 33 | 34 | if (!is.null(numForTransitionEnd)) { 35 | X2 <- X[sample.int(nrow(X), size=numForTransitionEnd), c(v,u)] 36 | } else { 37 | X2 <- X[c(v,u)] 38 | } 39 | 40 | X1$OriginalRowNumber <- 1:nrow(X1) 41 | X2$OriginalRowNumber.B <- 1:nrow(X2) 42 | 43 | vMatrix1 <- as.matrix(X1[,v]) 44 | vMatrix2 <- as.matrix(X2[,v]) 45 | 46 | 47 | covV=cov(vMatrix2) 48 | 49 | distMatrix <- apply(vMatrix1, 1, function(row) mahalanobis(vMatrix2, row, covV)) 50 | dim(distMatrix) 51 | 52 | colnames(distMatrix) <- 1:ncol(distMatrix) 53 | rownames(distMatrix) <- 1:nrow(distMatrix) 54 | distDF <- as.data.frame(as.table(distMatrix)) 55 | names(distDF) <- c("OriginalRowNumber.B", "OriginalRowNumber", "MahalanobisDistance") 56 | 57 | # browser() 58 | # This isn't really doing its filter properly. Something is wrong! 59 | if (!is.null(onlyIncludeNearestN)) { 60 | distDF <- distDF %.% 61 | group_by(OriginalRowNumber) %.% 62 | filter(rank(MahalanobisDistance, ties.method="random") < onlyIncludeNearestN) 63 | } 64 | # 65 | # distDF2 <- distDF %.% group_by(OriginalRowNumber) %.% mutate(rn = rank(MahalanobisDistance, ties.method="random")) 66 | # 67 | # (subset(distDF, OriginalRowNumber == 1)) 68 | # (subset(distDF2, OriginalRowNumber == 1)) 69 | # 70 | 71 | # sort(subset(distDF, OriginalRowNumber == 1)$MahalanobisDistance) 72 | # sort(subset(distDF2, OriginalRowNumber == 1)$MahalanobisDistance) 73 | 74 | pairs <- merge(X1, distDF, by = "OriginalRowNumber") 75 | pairs <- merge(X2, pairs, by = "OriginalRowNumber.B", suffixes = c(".B", "")) 76 | pairs$Weight <- 1/(mahalanobisConstantTerm + pairs$MahalanobisDistance) 77 | 78 | # If we haven't sampled, then OriginalRowNumber == OriginalRowNumber.B means that 79 | # the transition start and end are the same, so we should remove those rows. 80 | if (is.null(numForTransitionStart) && is.null(numForTransitionEnd)) { 81 | pairs <- subset(pairs, OriginalRowNumber != OriginalRowNumber.B) 82 | } 83 | 84 | # Renormalize weights: 85 | pairs <- pairs %.% group_by(OriginalRowNumber) %.% mutate(Weight = Weight/sum(Weight)) 86 | 87 | return(data.frame(pairs)) 88 | } 89 | 90 | 91 | #' PlotPairCumulativeWeights 92 | #' 93 | #' For a sample of transition start rows, we plot rank of transition end (by increasing weight) vs. cumulative weight. This gives a sense of how much weight is going into the nearest points vs. further ones. 94 | #' 95 | #' @export 96 | 97 | PlotPairCumulativeWeights <- function(pairs, numOriginalRowNumbersToPlot = 20) { 98 | rowNumSample <- sample(unique(pairs$OriginalRowNumber))[1:numOriginalRowNumbersToPlot] 99 | pairsWithCumWeightSums <- pairs %.% 100 | group_by(OriginalRowNumber) %.% 101 | arrange(OriginalRowNumber, -Weight) %.% 102 | mutate(CumulativeWeight = cumsum(Weight), Rank = dense_rank(-Weight)) 103 | 104 | pairsSubset <- subset(pairsWithCumWeightSums, OriginalRowNumber %in% rowNumSample) 105 | 106 | ggplot() + 107 | geom_line(aes(x=Rank, y=CumulativeWeight, color=factor(OriginalRowNumber)), data = pairsSubset, alpha = .2) + 108 | geom_line(aes(x=Rank, y=CumulativeWeight), stat = "summary", fun.y = "median", data=pairsWithCumWeightSums) 109 | } 110 | -------------------------------------------------------------------------------- /_site/R/transition-plots.R: -------------------------------------------------------------------------------- 1 | #' ArrowPlot 2 | #' 3 | #' Sampled transitions in 4 | #' 5 | #' @param predictionFunction 6 | #' @param df data frame with data 7 | #' @param u input of interest 8 | #' @param v other inputs 9 | #' @export 10 | #' @examples 11 | #' N <- 100 12 | #' df <- data.frame(u=rnorm(N), v=rnorm(N)) 13 | #' df$y1 = 2*df$u + 2*df$v + rnorm(N) 14 | #' lm1 <- lm(y1 ~ u + v, data=df) 15 | #' print(lm1) 16 | #' TransitionPlot(function(df) predict(lm1, df), df, "u", "v") + ylab("y1") 17 | #' df$y2 = df$u + df$v + 2*df$v*df$u + rnorm(N) 18 | #' lm2 <- lm(y2 ~ u*v, data=df) 19 | #' print(lm2) 20 | #' TransitionPlot(function(df) predict(lm2, df), df, "u", "v") + ylab("y2") 21 | TransitionPlot <- function(predictionFunction, df, u, v, plot=TRUE) { 22 | pairs <- GetPairs(df, u, v) 23 | 24 | pairsToPlot <- ddply(pairs[pairs[[paste0(u,".B")]] > pairs[[u]], ], 25 | "OriginalRowNumber", function(df) { 26 | df[sample.int(nrow(df), size=1, prob=df$Weight), ] 27 | }) 28 | 29 | pairsToPlot$output <- predictionFunction(pairsToPlot) 30 | pairsToPlot2 <- pairsToPlot 31 | pairsToPlot2[[u]] <- pairsToPlot2[[paste0(u,".B")]] 32 | pairsToPlot$outputNew <- predictionFunction(pairsToPlot2) 33 | 34 | p <- ggplot(pairsToPlot) + 35 | geom_segment(aes_string(x = u, y = "output", xend = paste0(u,".B"), yend = "outputNew"), arrow = arrow()) + 36 | ggtitle(sprintf("Transitions in %s holding %s constant", u, Reduce(paste, v))) 37 | if(plot) print(p) 38 | return(p) 39 | } 40 | -------------------------------------------------------------------------------- /_site/README.md: -------------------------------------------------------------------------------- 1 | This README is mainly for people developing the project (which is just me, at the moment). Everything for users is at the [Project Page](http://www.davidchudzicki.com/predcomps/). 2 | 3 | Lots of people are using complicated/non-parametric models for lots of things, but it’s hard to understand what complicated models are telling you. This is general-purpose way to extract understanding from a very large class of complicated models. 4 | 5 | # Todo 6 | 7 | ## Must be done before releasing to the world 8 | 9 | Finishing pages: 10 | - add example in quickstart section to index. 11 | - make make diamonds presentable 12 | - including maybe transition curves? (arrow plots are not so nice) 13 | - another example, e.g. give me some credit (nice b/c it's logistic) 14 | - document every function other people would use, with example 15 | - Transition Plots -- maybe add another way to visualize the same effect, e.g. input vs. delta output? and/or improve the original one (opacity?) 16 | - something like this?: make 30 pairings for each of 100 original points. Draw lines instead of arrows. (include the original point for this! current pairs DF excludes original point.) maybe very thin lines with large and somewhat transparent points? for a sense of the density. 17 | 18 | ## Todo later 19 | 20 | See the [future work](http://www.davidchudzicki.com/predcomps/more-future-work.html) section of the documentation. 21 | 22 | ## Documentation 23 | 24 | The [documentation](http://www.davidchudzicki.com/predcomps/) is hosted on Github pages using a theme by [orderedlist](https://github.com/orderedlist). 25 | 26 | 27 | ----- 28 | 29 | Next steps for me: 30 | 31 | - fix horizontal axis label for impact / APC charts 32 | 33 | - add chart to wine 34 | 35 | - More work on credit example, including: 36 | - (just clean it up a bit -- it's mostly better) 37 | - document weights situation better 38 | - explain more concisely about renormalizing (it's obvious enough not to need long explanation) 39 | - explain about why 1/distance doesn't weight nearby points enough (sphere's surphace grows as distance^n) 40 | - explain my "closest n only" heuristic 41 | - explain bias/variance tradeoff in nearness vs. more points -- replace the "limit as n-->infty" page with a section on this page 42 | -------------------------------------------------------------------------------- /_site/man/ArrowPlot.Rd: -------------------------------------------------------------------------------- 1 | \name{ArrowPlot} 2 | \alias{ArrowPlot} 3 | \title{ArrowPlot} 4 | \usage{ 5 | ArrowPlot(predictionFunction, df, u, v) 6 | } 7 | \arguments{ 8 | \item{predictionFunction}{} 9 | 10 | \item{df}{data frame with data} 11 | 12 | \item{u}{input of interest} 13 | 14 | \item{v}{other inputs} 15 | } 16 | \description{ 17 | Sampled transitions in 18 | } 19 | 20 | -------------------------------------------------------------------------------- /_site/man/AverageSpecifiedComparison.Rd: -------------------------------------------------------------------------------- 1 | \name{AverageSpecifiedComparison} 2 | \alias{AverageSpecifiedComparison} 3 | \title{AverageSpecifiedComparison} 4 | \usage{ 5 | AverageSpecifiedComparison(fit, df, input, low, high) 6 | } 7 | \arguments{ 8 | \item{fit}{fitted glm object} 9 | 10 | \item{df}{data frame for evaluating predictions} 11 | 12 | \item{input}{string representing the name of the input 13 | feature you want an APC for} 14 | 15 | \item{low}{low value of input feature} 16 | 17 | \item{high}{high value of input feature} 18 | } 19 | \description{ 20 | Makes an average predictive comparison for two specified 21 | values of the input variable of interest, as in section 5.7 22 | of ARM 23 | } 24 | 25 | -------------------------------------------------------------------------------- /_site/man/ComputeAPCFromPairs.Rd: -------------------------------------------------------------------------------- 1 | \name{ComputeApcFromPairs} 2 | \alias{ComputeApcFromPairs} 3 | \title{ComputeApcFromPairs} 4 | \usage{ 5 | ComputeApcFromPairs(predictionFunction, pairs, u, v, absolute = FALSE, 6 | impact = FALSE) 7 | } 8 | \description{ 9 | ComputeApcFromPairs 10 | } 11 | 12 | -------------------------------------------------------------------------------- /_site/man/GetAPC.Rd: -------------------------------------------------------------------------------- 1 | \name{GetAPC} 2 | \alias{GetAPC} 3 | \title{GetAPC} 4 | \usage{ 5 | GetAPC(predictionFunction, X, u, v, ...) 6 | } 7 | \arguments{ 8 | \item{predictionFunction}{} 9 | 10 | \item{X}{} 11 | 12 | \item{u}{input of interest} 13 | 14 | \item{v}{other inputs} 15 | 16 | \item{...}{extra parguments passed to GetPairs used to 17 | control Weight function} 18 | } 19 | \description{ 20 | makes average predictive comparison (based on 21 | Gelman/Pardoe) by forming pairs with two versions of the 22 | input of interest and averaging the predictive difference 23 | using Weights. I think Weights should be an approximation 24 | of the density p(u1,u2|v) or something like that... I need 25 | to look back at this. At present, I believe this is 26 | probably implementing the version in the Gelman/Pardoe 27 | paper. 28 | } 29 | \details{ 30 | Only works fore continuous inputs right now 31 | } 32 | 33 | -------------------------------------------------------------------------------- /_site/man/GetAPCWithAbsolute.Rd: -------------------------------------------------------------------------------- 1 | \name{GetAPCWithAbsolute} 2 | \alias{GetAPCWithAbsolute} 3 | \title{GetAPCWithAbsolute} 4 | \usage{ 5 | GetAPCWithAbsolute(predictionFunction, X, u, v, ...) 6 | } 7 | \arguments{ 8 | \item{predictionFunction}{} 9 | 10 | \item{X}{} 11 | 12 | \item{u}{input of interest} 13 | 14 | \item{v}{other inputs} 15 | 16 | \item{k}{Weights are (1 / (k + Mahalanobis distance))} 17 | } 18 | \value{ 19 | a list with: \code{signed} (the usual APC) and 20 | \code{absolute} (APC applied to the absolute value of the 21 | differences) 22 | } 23 | \description{ 24 | makes average predictive comparison (based on 25 | Gelman/Pardoe) by forming pairs with two versions of the 26 | input of interest and averaging the predictive difference 27 | using Weights. I think Weights should be an approximation 28 | of the density p(u1,u2|v) or something like that... I need 29 | to look back at this. At present, I believe this is 30 | probably implementing the version in the Gelman/Pardoe 31 | paper. returns a list with the APC and the APC applied to 32 | the absolute value of the prediction function Only works 33 | fore continuous inputs right now 34 | } 35 | \examples{ 36 | print("hi") 37 | } 38 | 39 | -------------------------------------------------------------------------------- /_site/man/GetApcDF.Rd: -------------------------------------------------------------------------------- 1 | \name{GetApcDF} 2 | \alias{GetApcDF} 3 | \title{GetApcDF} 4 | \usage{ 5 | GetApcDF(predictionFunction, df, inputVars, ...) 6 | } 7 | \arguments{ 8 | \item{predictionFunction}{} 9 | 10 | \item{df}{data frame with data} 11 | 12 | \item{inputVars}{inputs to the model} 13 | 14 | \item{...}{extra parguments passed to GetPairs used to 15 | control Weight function} 16 | } 17 | \description{ 18 | makes average predictive comparison for all specified 19 | inputs 20 | } 21 | 22 | -------------------------------------------------------------------------------- /_site/man/GetComparisonDFFromPairs.Rd: -------------------------------------------------------------------------------- 1 | \name{GetComparisonDFFromPairs} 2 | \alias{GetComparisonDFFromPairs} 3 | \title{GetComparisonDFFromPairs} 4 | \usage{ 5 | GetComparisonDFFromPairs(predictionFunction, pairs, u, v) 6 | } 7 | \description{ 8 | (abstracted this into a separate function from 9 | \code{GetApc} so we can more easily do things like 10 | \code{GetApcWithAbsolute}) 11 | } 12 | 13 | -------------------------------------------------------------------------------- /_site/man/GetPairs.Rd: -------------------------------------------------------------------------------- 1 | \name{GetPairs} 2 | \alias{GetPairs} 3 | \title{GetPairs} 4 | \usage{ 5 | GetPairs(X, u, v, numForTransitionStart = NULL, numForTransitionEnd = NULL, 6 | onlyIncludeNearestN = NULL, mahalanobisConstantTerm = 1) 7 | } 8 | \arguments{ 9 | \item{X}{data frame} 10 | 11 | \item{u}{input of interest} 12 | 13 | \item{v}{other inputs} 14 | 15 | \item{mahalanobisConstantTerm}{Weights are (1 / 16 | (mahalanobisConstantTerm + Mahalanobis distance))} 17 | 18 | \item{numForTransitionStart}{} 19 | 20 | \item{numForTransitionEnd}{} 21 | } 22 | \value{ 23 | a data frame with the inputs \code{v} from the first of 24 | each pair, \code{u} from each half (with ".B" appended to 25 | the second), and the Mahalanobis distances between the 26 | pairs. 27 | } 28 | \description{ 29 | Form all pairs of rows in \code{X} and compute Mahalanobis 30 | distances based on \code{v}. 31 | } 32 | \details{ 33 | To help with computational constraints, you have the option 34 | to not form pairs between all rows of \code{X} but instead 35 | of specify a certain number (\code{numForTransitionStart}) 36 | to randomly be selected as rows from which transitions 37 | start, and another number (\code{numForTransitionEnd}) to 38 | be randomly selected as where transitions end. We then form 39 | all pairs between transition-start rows and transition-end 40 | rows. 41 | 42 | In order to get a smaller data frame for later 43 | manipulations (and maybe just because it's a good idea), 44 | you can also specify \code{onlyIncludeNearestN}, in which 45 | case we return only the nearest \code{onlyIncludeNearestN} 46 | transition ends for each transition start (instead of all 47 | pairs). 48 | } 49 | \examples{ 50 | # Should put unit test example here 51 | } 52 | 53 | -------------------------------------------------------------------------------- /_site/man/GetPredCompsDF.Rd: -------------------------------------------------------------------------------- 1 | \name{GetPredCompsDF} 2 | \alias{GetPredCompsDF} 3 | \title{GetApcDF} 4 | \usage{ 5 | GetPredCompsDF(model, df, numRowsToUse = NULL, inputVars = NULL, ...) 6 | } 7 | \arguments{ 8 | \item{model}{Either a function (from a data frame to 9 | vector of predictions) or a model we know how to deal 10 | with (lm, glm)} 11 | 12 | \item{df}{data frame with data} 13 | 14 | \item{inputVars}{inputs to the model} 15 | 16 | \item{...}{extra parguments passed to GetPairs used to 17 | control Weight function} 18 | } 19 | \description{ 20 | makes average predictive comparison for all specified 21 | inputs 22 | } 23 | 24 | -------------------------------------------------------------------------------- /_site/man/GetPredCompsDFFromPairs.Rd: -------------------------------------------------------------------------------- 1 | \name{GetPredCompsDFFromPairs} 2 | \alias{GetPredCompsDFFromPairs} 3 | \title{GetPredCompsDFFromPairs} 4 | \usage{ 5 | GetPredCompsDFFromPairs(predictionFunction, pairs, u, v) 6 | } 7 | \description{ 8 | (abstracted this into a separate function from 9 | \code{GetApc} so we can more easily do things like 10 | \code{GetApcWithAbsolute}) 11 | } 12 | 13 | -------------------------------------------------------------------------------- /_site/man/GetSingleInputPredComps.Rd: -------------------------------------------------------------------------------- 1 | \name{GetSingleInputPredComps} 2 | \alias{GetSingleInputPredComps} 3 | \title{GetComparisonDF} 4 | \usage{ 5 | GetSingleInputPredComps(predictionFunction, X, u, v, ...) 6 | } 7 | \arguments{ 8 | \item{predictionFunction}{} 9 | 10 | \item{X}{} 11 | 12 | \item{u}{input of interest} 13 | 14 | \item{v}{other inputs} 15 | 16 | \item{k}{Weights are (1 / (k + Mahalanobis distance))} 17 | } 18 | \value{ 19 | a list with: \code{signed} (the usual Apc) and 20 | \code{absolute} (Apc applied to the absolute value of the 21 | differences) 22 | } 23 | \description{ 24 | makes average predictive comparison (based on 25 | Gelman/Pardoe) by forming pairs with two versions of the 26 | input of interest and averaging the predictive difference 27 | using Weights. I think Weights should be an approximation 28 | of the density p(u1,u2|v) or something like that... I need 29 | to look back at this. At present, I believe this is 30 | probably implementing the version in the Gelman/Pardoe 31 | paper. returns a list with the Apc and the Apc applied to 32 | the absolute value of the prediction function Only works 33 | fore continuous inputs right now 34 | } 35 | 36 | -------------------------------------------------------------------------------- /_site/man/PlotApcDF.Rd: -------------------------------------------------------------------------------- 1 | \name{PlotApcDF} 2 | \alias{PlotApcDF} 3 | \title{PlotApcDF} 4 | \usage{ 5 | PlotApcDF(apcDF, variant = "Impact") 6 | } 7 | \arguments{ 8 | \item{apcDF}{the output of GetApcDF} 9 | } 10 | \description{ 11 | plots the output of GetApcDF -- this is my preferred 12 | display for now 13 | } 14 | 15 | -------------------------------------------------------------------------------- /_site/man/PlotApcDF2.Rd: -------------------------------------------------------------------------------- 1 | \name{PlotApcDF2} 2 | \alias{PlotApcDF2} 3 | \title{PlotApcDF} 4 | \usage{ 5 | PlotApcDF2(apcDF) 6 | } 7 | \arguments{ 8 | \item{apcDF}{the output of GetApcDF} 9 | } 10 | \description{ 11 | plots the output of GetApcDF -- this is not my preferred 12 | display but may be more self-explanatory 13 | } 14 | 15 | -------------------------------------------------------------------------------- /_site/man/PlotPairCumulativeWeights.Rd: -------------------------------------------------------------------------------- 1 | \name{PlotPairCumulativeWeights} 2 | \alias{PlotPairCumulativeWeights} 3 | \title{PlotPairCumulativeWeights} 4 | \usage{ 5 | PlotPairCumulativeWeights(pairs, numOriginalRowNumbersToPlot = 20) 6 | } 7 | \description{ 8 | For a sample of transition start rows, we plot rank of 9 | transition end (by increasing weight) vs. cumulative 10 | weight. This gives a sense of how much weight is going into 11 | the nearest points vs. further ones. 12 | } 13 | 14 | -------------------------------------------------------------------------------- /_site/man/PlotPredCompsDF.Rd: -------------------------------------------------------------------------------- 1 | \name{PlotPredCompsDF} 2 | \alias{PlotPredCompsDF} 3 | \title{PlotApcDF} 4 | \usage{ 5 | PlotPredCompsDF(apcDF, variant = "Impact") 6 | } 7 | \arguments{ 8 | \item{apcDF}{the output of GetApcDF} 9 | } 10 | \description{ 11 | plots the output of GetApcDF -- this is my preferred 12 | display for now 13 | } 14 | 15 | -------------------------------------------------------------------------------- /_site/man/TransitionPlot.Rd: -------------------------------------------------------------------------------- 1 | \name{TransitionPlot} 2 | \alias{TransitionPlot} 3 | \title{ArrowPlot} 4 | \usage{ 5 | TransitionPlot(predictionFunction, df, u, v, plot = TRUE) 6 | } 7 | \arguments{ 8 | \item{predictionFunction}{} 9 | 10 | \item{df}{data frame with data} 11 | 12 | \item{u}{input of interest} 13 | 14 | \item{v}{other inputs} 15 | } 16 | \description{ 17 | Sampled transitions in 18 | } 19 | \examples{ 20 | N <- 100 21 | df <- data.frame(u=rnorm(N), v=rnorm(N)) 22 | df$y1 = 2*df$u + 2*df$v + rnorm(N) 23 | lm1 <- lm(y1 ~ u + v, data=df) 24 | print(lm1) 25 | TransitionPlot(function(df) predict(lm1, df), df, "u", "v") + ylab("y1") 26 | df$y2 = df$u + df$v + 2*df$v*df$u + rnorm(N) 27 | lm2 <- lm(y2 ~ u*v, data=df) 28 | print(lm2) 29 | TransitionPlot(function(df) predict(lm2, df), df, "u", "v") + ylab("y2") 30 | } 31 | 32 | -------------------------------------------------------------------------------- /_site/man/average_specified_comparison.Rd: -------------------------------------------------------------------------------- 1 | \name{average_specified_comparison} 2 | \alias{average_specified_comparison} 3 | \title{average_specified_comparison} 4 | \usage{ 5 | average_specified_comparison(fit, df, input, low, high) 6 | } 7 | \arguments{ 8 | \item{fit}{fitted glm object} 9 | 10 | \item{df}{data frame for evaluating predictions} 11 | 12 | \item{input}{string representing the name of the input 13 | feature you want an APC for} 14 | 15 | \item{low}{low value of input feature} 16 | 17 | \item{high}{high value of input feature} 18 | } 19 | \description{ 20 | Makes an average predictive comparison for two specified 21 | values of the input variable of interest, as in section 5.7 22 | of ARM 23 | } 24 | 25 | -------------------------------------------------------------------------------- /_site/man/compute_apc_from_pairs.Rd: -------------------------------------------------------------------------------- 1 | \name{compute_apc_from_pairs} 2 | \alias{compute_apc_from_pairs} 3 | \title{compute_apc_from_pairs} 4 | \usage{ 5 | compute_apc_from_pairs(predictionFunction, pairs, u, v, absolute = FALSE) 6 | } 7 | \description{ 8 | (abstracted this into a separate function from 9 | \code{get_apc} so we can more easily do things like 10 | \code{get_apc_with_absolute}) 11 | } 12 | 13 | -------------------------------------------------------------------------------- /_site/man/get_apc.Rd: -------------------------------------------------------------------------------- 1 | \name{get_apc} 2 | \alias{get_apc} 3 | \title{get_apc} 4 | \usage{ 5 | get_apc(predictionFunction, X, u, v, ...) 6 | } 7 | \arguments{ 8 | \item{predictionFunction}{} 9 | 10 | \item{X}{} 11 | 12 | \item{u}{input of interest} 13 | 14 | \item{v}{other inputs} 15 | 16 | \item{...}{extra parguments passed to get_pairs used to 17 | control weight function} 18 | } 19 | \description{ 20 | makes average predictive comparison (based on 21 | Gelman/Pardoe) by forming pairs with two versions of the 22 | input of interest and averaging the predictive difference 23 | using weights. I think weights should be an approximation 24 | of the density p(u1,u2|v) or something like that... I need 25 | to look back at this. At present, I believe this is 26 | probably implementing the version in the Gelman/Pardoe 27 | paper. 28 | } 29 | \details{ 30 | Only works fore continuous inputs right now 31 | } 32 | 33 | -------------------------------------------------------------------------------- /_site/man/get_apc_with_absolute.Rd: -------------------------------------------------------------------------------- 1 | \name{get_apc_with_absolute} 2 | \alias{get_apc_with_absolute} 3 | \title{get_apc_with_absolute} 4 | \usage{ 5 | get_apc_with_absolute(predictionFunction, X, u, v, ...) 6 | } 7 | \arguments{ 8 | \item{predictionFunction}{} 9 | 10 | \item{X}{} 11 | 12 | \item{u}{input of interest} 13 | 14 | \item{v}{other inputs} 15 | 16 | \item{k}{weights are (1 / (k + mahalanobis distance))} 17 | } 18 | \value{ 19 | a list with: \code{signed} (the usual APC) and 20 | \code{absolute} (APC applied to the absolute value of the 21 | differences) 22 | } 23 | \description{ 24 | makes average predictive comparison (based on 25 | Gelman/Pardoe) by forming pairs with two versions of the 26 | input of interest and averaging the predictive difference 27 | using weights. I think weights should be an approximation 28 | of the density p(u1,u2|v) or something like that... I need 29 | to look back at this. At present, I believe this is 30 | probably implementing the version in the Gelman/Pardoe 31 | paper. returns a list with the APC and the APC applied to 32 | the absolute value of the prediction function Only works 33 | fore continuous inputs right now 34 | } 35 | 36 | -------------------------------------------------------------------------------- /_site/man/get_pairs.Rd: -------------------------------------------------------------------------------- 1 | \name{get_pairs} 2 | \alias{get_pairs} 3 | \title{get_pairs} 4 | \usage{ 5 | get_pairs(X, u, v, mahalanobisConstantTerm = 1, renormalizeWeights = TRUE) 6 | } 7 | \arguments{ 8 | \item{X}{data frame} 9 | 10 | \item{u}{input of interest} 11 | 12 | \item{v}{other inputs} 13 | 14 | \item{mahalanobisConstantTerm}{weights are (1 / 15 | (mahalanobisConstantTerm + mahalanobis distance))} 16 | 17 | \item{renormalizeWeights}{whether to renormalize the 18 | weights to that they sum to 1 within each group (groups 19 | based on the first element of the pair). If I'm right, 20 | there's no reason to use \code{FALSE} ever; I'm only 21 | leaving the option in so I can compare with the paper.} 22 | } 23 | \value{ 24 | a data frame with the inputs \code{v} from the first of 25 | each pair, \code{u} from each half (with ".B" appended to 26 | the second), and the mahalanobis distances between the 27 | pairs. 28 | } 29 | \description{ 30 | Form all pairs of rows in X and compute mahalanobis 31 | distances based on \code{v}. 32 | } 33 | \examples{ 34 | library("mvtnorm") 35 | sigma <- matrix(c(1,.5,-.5, 36 | .5,1,.5, 37 | -.5,.5,1), ncol=3) 38 | X <- data.frame(rmvnorm(n=10, sigma=sigma)) 39 | get_pairs(X, u="X3", v=c("X1","X2")) 40 | } 41 | 42 | -------------------------------------------------------------------------------- /_site/man/mahal.Rd: -------------------------------------------------------------------------------- 1 | \name{Mahal} 2 | \alias{Mahal} 3 | \title{Mahal} 4 | \usage{ 5 | Mahal(matrix1, matrix2, covariance) 6 | } 7 | \arguments{ 8 | \item{matrix1}{nxm matrix or data frame representing 1st 9 | set of observations -- each row is an observation; each 10 | column is an input feature} 11 | 12 | \item{matrix2}{nxm matrix or data frame representing 2nd 13 | set of observations} 14 | 15 | \item{input}{string representing the name of the input 16 | feature you want an APC for} 17 | 18 | \item{covariance}{covariance to be used for Mahalanobis 19 | computation} 20 | } 21 | \value{ 22 | a vector of Mahalanobis distances between row i of matrix1 23 | and row i of matrix2 24 | } 25 | \description{ 26 | Computes Mahalanobis distance between pairs of 27 | observations. I'm using this rather than stats::Mahalanobis 28 | because I want just the distances between corresponding 29 | rows of each matrix, not all pair-wise distances 30 | } 31 | 32 | -------------------------------------------------------------------------------- /_site/man/resample_from_pairs.Rd: -------------------------------------------------------------------------------- 1 | \name{resample_from_pairs} 2 | \alias{resample_from_pairs} 3 | \title{resample_from_pairs} 4 | \usage{ 5 | resample_from_pairs(X, u, v, 6 | samplingProbsAsFunctionOfMahalanobis = function(x) 1/(1 + x)) 7 | } 8 | \description{ 9 | form pairs with \code{get_pairs} and resample \code{v} and 10 | \code{u} according to a function of the mahalanobis 11 | distances (used diagnostics / understanding how things are 12 | working) 13 | } 14 | 15 | -------------------------------------------------------------------------------- /_site/notes/apc.Rmd: -------------------------------------------------------------------------------- 1 | ## APCs 2 | 3 | ### Predictive Comparisons Generalize Regression Coefficients 4 | 5 | At the heart of this package is the idea of a *predictive comparison*: We vary the input of interest holding the other inputs constant, and look at the differences in predicted values. Let $u$ represent the input of interest and $v$ the (vector of) other inputs. Let $f$ be a functinon making predictions, so 6 | 7 | $$\hat{y} = f(u,v)$$ 8 | 9 | Our $f$ could come from any predictive model. If we have a statistical model, then probably we would choose 10 | 11 | $$f(u,v) = \mathcal{E}[y \mid u, v, \theta]$$ 12 | 13 | (where $\theta$ are the parameters of the model). But we need not have a statistical model at all. The prediction function $f$ could come from a random forest, or a support vector machine. 14 | 15 | Given the function $f$ and a choice of $u_1$, $u_2$, and $v$, we can compute 16 | 17 | $$\delta_{u_1 \rightarrow u_2, v} = \frac{f(u_2, v) - f(u_1, v)}{u_2-u_1}$$ 18 | 19 | If $f$ were a linear model with no interactions, the above would not depend on the particular choices of $u_1$, $u_2$, and $v$ and would be the regression coefficient corresponding to $u$. This is the formal sense in which predictive comparisons generalize regression coefficients. Since for more complicated models this varies as the inputs vary, we will take an average across a well chosen set of inputs. 20 | 21 | ### Choice of Inputs 22 | 23 | The APC is 24 | 25 | $$\frac{\mathcal{E}[\Delta_f]}{\mathcal{E}[\Delta_u]}$$ 26 | 27 | where $\Delta_f = f(u_2,v) - f(u_1,v)$, $\Delta_u = u_2 - u_1$, and $\mathcal{E}$ is expectation under the following process: 28 | 29 | 1. sample $v$ from the (marginal) distribution of the corresponding inputs 30 | 2. sample $u_1$ and $u_2$ independently from the distribution of $u$ conditional on $v$ 31 | 32 | The reason for this definition is that we want to use values of $v$ that are representative of our data generation process, and transitions in $u$ that are representative of what really occurs at those values of $v$. 33 | 34 | Computing the numerator and denominator separately rather than taking an expected value of $\delta_{u_1 \rightarrow u_2, v}$ amounts to weighting by the size of $(u_2 - u_1)$. This avoids having the result excessively influenced by small changes in $u$. 35 | 36 | ### Estimation 37 | 38 | The rows of our data represent samples from the joint distribution $u$, $v$, so each row amounts to a sample from $v$ followed by a sample $u_1$ conditional on $v$. The difficult thing is drawing another sample $u_2$ conditional on $v$. We approximate these by assigning weights to rows based on the proximity of the $v$ in that row to the $v$ in the row in question. The weights are: 39 | 40 | $$\frac{1}{\text{mahalanobisConstantTerm} + \text{(mahalanobis distance)}}$$ 41 | 42 | The [*mahalanobis distance*](https://en.wikipedia.org/wiki/Mahalanobis_distance) is a unitless version of distance that takes into account the correlation structure of $v$. The *mahalanobisConstantTerm* (which defaults to 1, but this is not always an appropriate choice) prevents all of the weight from going to the closest points. More work needs to be done in thinking about the weights. 43 | 44 | For more details, see [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract) (section 4), [my note](more-renormalize-weights.html) explaining a small change from the weights described in the paper, or [my code](https://github.com/dchudz/predcomps/blob/master/R/pairs.R) that computes the appropriate weights. 45 | 46 | ### Absolute Version 47 | 48 | The absolute APC (as opposed to the signed version described above) replaces $\Delta_f = f(u_2,v) - f(u_1,v)$ above with $|\Delta_f| = |f(u_2,v) - f(u_1,v)|$. See e.g. the input $u_8$ in my [simulated linear model with interactions](examples-simulated-linear-model-interactions.html) for an example where the signed APC is roughly 0 but the absolute APC is large. 49 | 50 | By default, I always compute and display an absolute version of the APC alongside the signed version. 51 | 52 | ### A Small Example 53 | 54 | This is a example running APCs on a simulated linear model with independent inputs and no interactions. For more involved examples, see the [examples](examples-overview.html) section. 55 | 56 | The inputs ($x_1$, $x_2$, $x_3$) are independent, with 57 | 58 | $$y \sim 2x_1 - 2x_2 + x_3 + \mathcal{N}(0,.1)$$ 59 | 60 | First we set up the data: 61 | 62 | ```{r} 63 | n <- 200 64 | x1 <- runif(n=n, min=0, max=1) 65 | x2 <- runif(n=n, min=0, max=1) 66 | x3 <- runif(n=n, min=0, max=10) 67 | y <- 2*x1 + (-2)*x2 + 1*x3 + rnorm(n, sd=.1) 68 | df <- data.frame(x1, x2, x3, y) 69 | ``` 70 | 71 | Then we fit a linear model: 72 | 73 | ```{r} 74 | fittedLm <- lm(y ~ ., data=df) 75 | fittedLm 76 | ``` 77 | 78 | We can then plot the average predictive comparisons: 79 | 80 | ```{r results='hide', message=FALSE} 81 | library(predcomps) 82 | apcDF <- GetPredCompsDF(fittedLm, df=df) 83 | PlotPredCompsDF(apcDF, variant="Apc") + theme_gray(base_size = 18) 84 | ``` 85 | 86 | Using different shapes / colors, both the absolute and signed versions are plotted. For symmetry, the absolute version is plotted with both a positive and negative sign. Since this is a linear model with no interactions, the signed APCs match those from the fitted linear model. 87 | 88 | This is what the returned data frame `apcDF` looks like: 89 | 90 | ```{r} 91 | apcDF 92 | ``` 93 | 94 | The columns plotted here are `PerUnitInput.Signed` and `Apc.Absolute`. The [next section](impact.html) is about those columns labeled "Impact". 95 | -------------------------------------------------------------------------------- /_site/notes/examples/diamonds.Rmd: -------------------------------------------------------------------------------- 1 | # A new way to visualize models 2 | 3 | ```{r message=FALSE, echo=FALSE} 4 | library(knitr) 5 | knitr::opts_chunk$set(tidy = FALSE, message=FALSE, warning=FALSE, fig.align='center') 6 | library(randomForest) 7 | library(plyr) 8 | library(predcomps) 9 | library(ggplot2) 10 | ``` 11 | 12 | ```{r} 13 | diamonds <- transform(diamonds, clarity = 14 | factor(clarity, levels =c("SI1", "SI2", "VS1", "VS2", "VVS1", "VVS2", "IF"))) 15 | 16 | diamonds2 <- transform(diamonds, 17 | clarity = as.integer(clarity), 18 | cut = as.integer(cut), 19 | color = as.integer(color)) 20 | 21 | diamonds3 <- subset(diamonds2, !is.na(clarity)) 22 | 23 | rf <- randomForest(price ~ carat + cut + color + clarity, data=diamonds3, ntree=20) 24 | diamondsSmall <- diamonds3[sample.int(nrow(diamonds3), size=500), ] 25 | 26 | apcDf <- GetPredCompsDF(function(df) predict(rf, df), diamondsSmall, inputVars=row.names(rf$importance)) 27 | PlotPredCompsDF(apcDf) 28 | PlotPredCompsDF(apcDf, variant="Apc") 29 | 30 | 31 | pairs <- GetPairs(diamondsSmall, "carat", c("cut", "color", "clarity"), 32 | removeDiagonal=FALSE, 33 | mahalanobisConstantTerm=.1) 34 | 35 | u <- "carat" 36 | pairsSampled <- ddply(pairs, 37 | "OriginalRowNumber", function(df) { 38 | df[sample.int(nrow(df), size=5, prob=df$Weight), ] 39 | }) 40 | 41 | originalRowNumbersToUse <- sample(unique(pairsSampled$OriginalRowNumber), size=10) 42 | 43 | pairsSampled$carat <- pairsSampled$carat.B 44 | 45 | pairsSampled$Prediction <- predict(rf, pairsSampled) 46 | 47 | ggplot(subset(pairsSampled, OriginalRowNumber %in% originalRowNumbersToUse), 48 | aes(x=carat, y=Prediction, color=factor(OriginalRowNumber, levels=sample(originalRowNumbersToUse)))) + 49 | geom_point() + 50 | geom_line(size=.2) 51 | 52 | last_plot() + scale_x_continuous(limits=c(0,1)) + scale_y_continuous(limits=c(0,5000)) 53 | 54 | 55 | 56 | u <- "clarity" 57 | v <- c("carat", "cut", "color") 58 | pairs <- GetPairs(diamondsSmall, u, v, 59 | removeDiagonal=FALSE, 60 | mahalanobisConstantTerm=.1) 61 | 62 | pairsSampled <- ddply(pairs, 63 | "OriginalRowNumber", function(df) { 64 | df[sample.int(nrow(df), size=5, prob=df$Weight), ] 65 | }) 66 | 67 | originalRowNumbersToUse <- sample(unique(pairsSampled$OriginalRowNumber), size=10) 68 | 69 | pairsSampled[[u]] <- pairsSampled[[paste0(u,".B")]] 70 | 71 | pairsSampled$Prediction <- predict(rf, pairsSampled) 72 | 73 | pairsSampled$OriginalRowNumberFactor <- factor(pairsSampled$OriginalRowNumber, levels=sample(originalRowNumbersToUse)) 74 | ggplot(subset(pairsSampled, OriginalRowNumber %in% originalRowNumbersToUse), 75 | aes_string(x=u, y="Prediction", color="OriginalRowNumberFactor")) + 76 | geom_point() + 77 | geom_line(size=.2) 78 | 79 | 80 | 81 | u <- "color" 82 | v <- c("cut", "clarity", "carat") 83 | pairs <- GetPairs(diamondsSmall, u, v, 84 | removeDiagonal=FALSE, 85 | mahalanobisConstantTerm=.1) 86 | 87 | pairsSampled <- ddply(pairs, 88 | "OriginalRowNumber", function(df) { 89 | df[sample.int(nrow(df), size=5, prob=df$Weight), ] 90 | }) 91 | 92 | originalRowNumbersToUse <- sample(unique(pairsSampled$OriginalRowNumber), size=10) 93 | 94 | pairsSampled[[u]] <- pairsSampled[[paste0(u,".B")]] 95 | 96 | pairsSampled$Prediction <- predict(rf, pairsSampled) 97 | 98 | pairsSampled$OriginalRowNumberFactor <- factor(pairsSampled$OriginalRowNumber, levels=sample(originalRowNumbersToUse)) 99 | ggplot(subset(pairsSampled, OriginalRowNumber %in% originalRowNumbersToUse), 100 | aes_string(x=u, y="Prediction", color="OriginalRowNumberFactor")) + 101 | geom_point() + 102 | geom_line(size=.2) 103 | 104 | 105 | 106 | 107 | 108 | ``` 109 | -------------------------------------------------------------------------------- /_site/notes/examples/loan-defaults.Rmd: -------------------------------------------------------------------------------- 1 | ## Credit Default Example 2 | 3 | ```{r message=FALSE, echo=FALSE} 4 | library(knitr) 5 | knitr::opts_chunk$set(tidy = FALSE, message=FALSE, warning=FALSE, fig.align='center', fig.width = 14, size=8) 6 | library(randomForest) 7 | library(predcomps) 8 | library(ggplot2) 9 | theme_set(theme_gray(base_size = 18)) 10 | library(dplyr) 11 | library(gridExtra) 12 | ``` 13 | 14 | This example is based on the training data set [found here](https://www.kaggle.com/c/GiveMeSomeCredit). We build a model to predict: 15 | 16 | - **SeriousDlqin2yrs** (target variable): Person experienced 90 days past due delinquency or worse 17 | 18 | The input features are: 19 | 20 | - **RevolvingUtilizationOfUnsecuredLines**: Total balance on credit cards and personal lines of credit except real estate and no installment debt like car loans divided by the sum of credit limits 21 | - **age**: Age of borrower in years 22 | - **NumberOfTime30-59DaysPastDueNotWorse**: Number of times borrower has been 30-59 days past due but no worse in the last 2 years. 23 | - **NumberOfTime60-89DaysPastDueNotWorse**: Number of times borrower has been 60-89 days past due but no worse in the last 2 years. 24 | - **NumberOfTimes90DaysLate**: Number of times borrower has been 90 days or more past due. 25 | - **DebtRatio**: Monthly debt payments, alimony,living costs divided by monthy gross income 26 | - **MonthlyIncome**: Monthly income 27 | - **NumberOfOpenCreditLinesAndLoans**: Number of Open loans (installment like car loan or mortgage) and Lines of credit (e.g. credit cards) 28 | - **NumberRealEstateLoansOrLines**: Number of mortgage and real estate loans including home equity lines of credit 29 | - **NumberOfDependents**: Number of dependents in family excluding themselves (spouse, children etc.) 30 | 31 | ### Parameters: 32 | 33 | These are some parameters controlling the aggregate predictive comparisons: 34 | 35 | ```{r} 36 | # Parameters controlling the predictive comparisons computation: 37 | # We consider transitions starting at each of 500 random rows 38 | numForTransitionStart <- 50 39 | # ... going to each of 10,000 other random rows: 40 | numForTransitionEnd <- 1000 41 | # ... keeping only the nearest 100 pairs for each start: 42 | onlyIncludeNearestN = 10 43 | ``` 44 | 45 | And for the random forest: 46 | 47 | ```{r} 48 | # 100 trees for random forest 49 | ntree = 10 50 | ``` 51 | 52 | 53 | ```{r echo=FALSE} 54 | # Remove some outliers 55 | credit <- read.csv("~/Downloads/cs-training.csv")[,-1] 56 | credit <- subset(credit, !is.na(MonthlyIncome) & 57 | NumberOfTime30.59DaysPastDueNotWorse < 5 & 58 | RevolvingUtilizationOfUnsecuredLines <= 2 & 59 | NumberOfTime30.59DaysPastDueNotWorse <= 5 & 60 | NumberOfTime60.89DaysPastDueNotWorse <= 5 & 61 | NumberOfTimes90DaysLate <= 5 & 62 | MonthlyIncome < 5e4 & 63 | DebtRatio < 2 & 64 | NumberRealEstateLoansOrLines <= 12 & 65 | NumberOfDependents < 10 66 | ) 67 | ``` 68 | 69 | ### Input Distribution 70 | 71 | The distribution of the inputs (after removing some outliers to make things more manageable): 72 | 73 | ```{r fig.height = 12, echo=FALSE} 74 | histograms <- Map(function(colName) { 75 | qplot(credit[[colName]]) + 76 | ggtitle(colName) + 77 | xlab("")}, 78 | setdiff(names(credit), "SeriousDlqin2yrs")) 79 | allHistograms <- do.call(arrangeGrob, c(histograms, ncol=2)) 80 | print(allHistograms) 81 | ``` 82 | 83 | Build a random forest model: 84 | 85 | ```{r} 86 | set.seed(1) 87 | # Turning the response to type "factor" causes the RF to be build for classification: 88 | credit$SeriousDlqin2yrs <- factor(credit$SeriousDlqin2yrs) 89 | rfFit <- randomForest(SeriousDlqin2yrs ~ ., data=credit, ntree=ntree) 90 | ``` 91 | 92 | ### Aggregate Predictive Comparisons 93 | 94 | ```{r message=FALSE, results='hide'} 95 | set.seed(1) 96 | apcDF <- GetPredCompsDF(rfFit, credit, 97 | numForTransitionStart = numForTransitionStart, 98 | numForTransitionEnd = numForTransitionEnd, 99 | onlyIncludeNearestN = onlyIncludeNearestN) 100 | ``` 101 | 102 | Hi 103 | 104 | ```{r echo=FALSE} 105 | kable(apcDF, row.names=FALSE) 106 | ``` 107 | 108 | 109 | Here [impact](impact.html) chart. Its units are changes in probability, so we can compare it across all of the inputs: 110 | 111 | ```{r} 112 | PlotPredCompsDF(apcDF) 113 | ``` 114 | 115 | Note that average absolute value of the change in probability associated with changes in age is much larger than the magnitude of the signed average. This indicates either an interact effect between age and other inputs, or a non-linear (non-monotonic) relationship between age an probability of default. We'll look into that a bit later. 116 | 117 | It wouldn't make sense to chart the average predictive comparisons in this way since they don't share units, but we can chart the inputs corresponding to a number of numbers late for various periods: 118 | 119 | ```{r} 120 | PlotPredCompsDF(apcDF[grep("NumberOfTime", apcDF$Input), ], 121 | variant = "Apc") 122 | ``` 123 | 124 | As you'd expect, greater periods of lateness are worse (per additional incident). 125 | 126 | However, `NumberOfTime30.59DaysPastDueNotWorse` makes more overall difference to the model, because its variation is larger (it's non-zero more often): 127 | 128 | ```{r} 129 | PlotPredCompsDF(apcDF[grep("NumberOfTime", apcDF$Input), ]) 130 | ``` 131 | 132 | ### More Detailed Examination: `NumberOfTime30.59DaysPastDueNotWorse` 133 | 134 | Recall that data from which the summarized predictive comparisons are computed consists of groups of rows, where with in each group only the input of interest varies (for the point we imagine transitioning to) and the rest are held constant. We can work directly with this data, visualizing it in more detail to better understand our model: 135 | 136 | ```{r} 137 | set.seed(6) 138 | pairs <- GetComparisonDF(rfFit, credit, 139 | u="NumberOfTime30.59DaysPastDueNotWorse", 140 | numForTransitionStart = 20, 141 | numForTransitionEnd = numForTransitionEnd*10, 142 | onlyIncludeNearestN = onlyIncludeNearestN*10) 143 | 144 | pairsSummarized <- pairs[c("OriginalRowNumber", "NumberOfTime30.59DaysPastDueNotWorse.B", "yHat2", "Weight")] %.% 145 | group_by(OriginalRowNumber, NumberOfTime30.59DaysPastDueNotWorse.B, yHat2) %.% summarise(Weight = sum(Weight)) 146 | 147 | ggplot(pairsSummarized, aes(x=NumberOfTime30.59DaysPastDueNotWorse.B, y=yHat2, color=factor(OriginalRowNumber))) + 148 | geom_point(aes(size = Weight)) + 149 | geom_line(size=.2) + 150 | scale_x_continuous(limits=c(0,2)) + 151 | scale_size_area() 152 | ``` 153 | 154 | I've made the size of the points proportional to weight that the point receives. The summarized predictive comparisons give more weight to points with more weight, and so should we. 155 | 156 | The relationship is mostly as you'd expect, but let's examine the highlighted one in more detail: 157 | 158 | ```{r echo=FALSE} 159 | ggplot(pairsSummarized, aes(x=NumberOfTime30.59DaysPastDueNotWorse.B, y=yHat2, color=factor(OriginalRowNumber))) + 160 | geom_point(aes(size = n)) + 161 | geom_line(aes(alpha=ifelse(OriginalRowNumber == 18, 1, .3))) + 162 | scale_x_continuous(limits=c(0,2)) + 163 | scale_alpha_identity() 164 | ``` 165 | 166 | Why does the default probability decrease when `NumberOfTime30.59DaysPastDueNotWorse` increases? Values for all the other inputs are held constant, so let's see what they are: 167 | 168 | ```{r echo=FALSE} 169 | oneOriginalRowNumber <- subset(pairs, OriginalRowNumber == 18) 170 | oneRow <- t(oneOriginalRowNumber[1,intersect(names(oneOriginalRowNumber), names(credit))]) 171 | colnames(oneRow) <- "Input values for highlighted row" 172 | grid.newpage() 173 | grid.table(oneRow) 174 | ``` 175 | 176 | Hmm, note that `NumberOfTimes90DaysLate` (almost always $0$) is $1$ in this case. Looking at the definition of the target variable (`SeriousDlqin2yrs`), this makes a lot of sense: 177 | 178 | `SeriousDlqin2yrs`: "Person experienced 90 days past due delinquency or worse." 179 | 180 | As we'd expect, previous instances of 90-days-late are a strong indicator of future ones. Adding a previous 30-days-late (but not more) to a previous 90-days-late seems to decrease the chance of future 90-days-lates. This is sensible -- with both, we have evidence that when you're late, you at least *sometimes* pay back in under 60 days. 181 | 182 | Further exploratory analysis would further improve our understanding: 183 | 184 | - Is this really primarily an interaction between `NumberOfTimes90DaysLate` and `NumberOfTime30.59DaysPastDueNotWorse`, or are other inputs involved? We could vary the other inputs and see if we still get this effect. 185 | - Is the effect validated on test data? 186 | 187 | ### More Detailed Examination: `Age` 188 | 189 | For one more example, let's examine the `Age` input in more detail. 190 | 191 | ```{r} 192 | set.seed(3) 193 | pairs <- GetComparisonDF(rfFit, credit, 194 | u="age", 195 | numForTransitionStart = 20, 196 | numForTransitionEnd = numForTransitionEnd*10, 197 | onlyIncludeNearestN = onlyIncludeNearestN*10) 198 | 199 | pairsSummarized <- pairs[c("OriginalRowNumber", "age.B", "yHat2", "Weight")] %.% 200 | group_by(OriginalRowNumber, age.B, yHat2) %.% 201 | summarise(Weight = sum(Weight)) 202 | 203 | ggplot(pairsSummarized, aes(x=age.B, y=yHat2, color=factor(OriginalRowNumber))) + 204 | geom_point(aes(size = Weight)) + 205 | geom_line(size=.2) 206 | ``` 207 | 208 | This is a bit of a mess, but we can at least see see that interaction effects and non-monotonicity are both going on. 209 | 210 | Further exploration would look into which other inputs age is interacting with to determine these differently shaped curves. 211 | -------------------------------------------------------------------------------- /_site/notes/examples/logistic-regression.Rmd: -------------------------------------------------------------------------------- 1 | ## A Logistic Regression with Related Inputs 2 | 3 | ```{r} 4 | 5 | library(predcomps) 6 | library(ggplot2) 7 | library(boot) 8 | 9 | N=500 10 | v1 <- rnorm(N)*3 11 | v2 <- rnorm(N) * (abs(v1) > 2) 12 | v3 <- rnorm(N) * (abs(v1) < .5) 13 | 14 | # should we remove the points with the condition false instead of setting to 0? 15 | # no, it's good how we have it. since we're conditioning on (e.g.) v1 and v3 when we draw v2, 16 | # setting v2 to 0 in those cases means there really is no variation in v2 for those v1/v3 combos 17 | 18 | qplot(v2, v1 + v3) 19 | qplot(v3, v1 + v3) 20 | 21 | # plot we want: 22 | # v2 curve at various v1 + v3 values. sample from the real data to draw this 23 | # (get random rows and then fill in the whole curve.) 24 | # thicken the rows where v2 isn't 0, ie, does vary, or maybe use alpha to make the other ones lighter. 25 | # x = v2, color = v1 + v3 + 26 | 27 | table(v2==0) 28 | table(v3==0) 29 | df <- data.frame(v1,v2,v3) 30 | df$y <- rbinom(n=nrow(df), size=1, prob=with(df, inv.logit(v1+v2+v3))) 31 | 32 | fittedLogit <- glm(y ~ v1 + v2 + v3, data = df, family = "binomial") 33 | 34 | apcDF <- GetPredCompsDF(function(df) with(df, inv.logit(v1+v2+v3)), df, c("v1","v2","v3")) 35 | 36 | PlotPredCompsDF(apcDF) 37 | PlotPredCompsDF(apcDF, variant="Apc") 38 | 39 | 40 | GetPredCompsDF(fittedLogit, 41 | data.frame(v1, v2, v3), 42 | c("v1","v2","v3")) 43 | 44 | 45 | 46 | GetPredCompsDF 47 | 48 | ``` 49 | 50 | -------------------------------------------------------------------------------- /_site/notes/examples/overview.Rmd: -------------------------------------------------------------------------------- 1 | ## Examples Overview 2 | 3 | ### Simulated Examples 4 | 5 | The simulated examples are constructed to demonstrate a property of predictive comparisons. Simulated examples are nice because (since we set up the data-generating process and we really know what's going on) it's easier to know of the predictive comparisons are telling us what they should be. 6 | 7 | - [Logistic regression for wine prices](examples-wine-logistic-regression.html): This is a logistic regression model run in two different simulated data situations. Changing the relationship between the inputs (and nothing else) leads to differences in the predictive comparisons (both APC and impact). 8 | 9 | - [Linear model with interactions](examples-simulated-linear-model-interactions.html): This is a linear model with 9 features and 8 interaction terms (an interaction between $v$ and each of $u_1 \ldots u_8$) which are all the same. The way the $u_i$'s are related to $v$ in the distribution of the inputs gives rise to differences in the APC. 10 | 11 | ### Real Examples 12 | 13 | Simulated examples are good for gaining understanding of how the package works and what it's doing, but they don't help us understand how it is useful in the real world. These are examples with real-world data sets: 14 | 15 | - [Diamond prices](examples-diamonds.html): In this example, applying predictive comparisons to a random forest helps understand the influence of carat, cut, color, and clarity on prices. 16 | -------------------------------------------------------------------------------- /_site/notes/examples/simulated-linear-model-interactions.Rmd: -------------------------------------------------------------------------------- 1 | # APCs in a Synthetic Example of a Linear Model with Interactions 2 | 3 | ```{r message=FALSE, echo=FALSE} 4 | library(knitr) 5 | knitr::opts_chunk$set(tidy = FALSE, message=FALSE, warning=FALSE, fig.align='center') 6 | ``` 7 | 8 | ```{r echo=FALSE} 9 | library(predcomps) 10 | library(ggplot2) 11 | library(reshape2) 12 | library(plyr) 13 | ``` 14 | 15 | ## Input Generation 16 | 17 | This example will have 9 inputs: $v$ and $u_1$, $u_2$, ..., $u_8$. The input $v$ is uniformly distributed between 7 values, $-3, -2, \ldots, 2, 3$. Each $u$ is mostly constant (at $0$) but at one value of $v$ (which is $v=-3$ for $u_1$, $v=-2$ for $u_2$, etc), $u$ can be either $0$ and $10$. The input $u_8$ can transition at two values of $v$: At either $v=-3$ or $v=3$, $u_8$ can be $0$ or $10$; otherwise, $u_8$ is 0. I'll construct the data and plot the relationship between the $u$'s and $v$. 18 | 19 | ```{r} 20 | N <- 200 21 | vValues <- (-3):3 22 | v <- sample(vValues, N, replace=TRUE) 23 | 24 | df <- data.frame(v) 25 | for (i in seq_along(vValues)) { 26 | df[[paste0("u",i)]] <- 27 | ifelse(v==vValues[i], 28 | sample(c(0,10), N, replace=TRUE), # u can be either 0 or 10 at one v value 29 | rep(0, N) # u is always 0 at other ones 30 | ) 31 | } 32 | 33 | # u8 can transition at either v=-3 or v=3: 34 | df$u8 <- ifelse(v %in% c(-3,3), 35 | sample(c(0,10), N, replace=TRUE), 36 | rep(0, N) 37 | ) 38 | ``` 39 | 40 | ```{r ConditionalDistributionOfUsOnVs, echo=FALSE, fig.cap="distribution of $u$ conditional on $v$"} 41 | theme_set(theme_gray(base_size=20)) 42 | ggplot(melt(df, id="v")) + 43 | geom_bar(aes(x=factor(v), fill=factor(value)), position=position_fill()) + 44 | facet_grid(variable ~ .) + 45 | scale_fill_discrete("value of u_i") + 46 | scale_x_discrete("v") + 47 | ggtitle("u_i conditional on v") + 48 | scale_y_continuous("sample conditional distribution") + 49 | opts(axis.text.y = theme_blank(), axis.ticks.y = theme_blank()) 50 | ``` 51 | 52 | ## Outcome Generation 53 | 54 | Each $u$ will have the same role in the model, which is $$\mathbb{E}[y] = vu_1 + vu_2 + vu_3 + vu_4 + vu_5 + vu_6 + vu_7 + vu_8.$$ 55 | 56 | Note that each $u$ has the same role in this function. Differences between their APCs can only arise from the correlation structure of the inputs in combination with this function, not from this function alone. 57 | 58 | For simplicity this demonstration, I'll assume that $\mathbb{E}[y \mid v, u_1, \ldots, u_8]$ is known rather than estimated. 59 | 60 | (**Todo: Find a way to map these inputs/output to a possible story with meaningful features and outcome. That would make it more interesting, easier to follow, and maybe generate more insight.**) 61 | 62 | ```{r} 63 | outcomeGenerationFunction <- function(df) { 64 | with(df, v*u1 + v*u2 + v*u3 + v*u4 + v*u5 + v*u6 + v*u7 + v*u8) 65 | } 66 | df$y <- outcomeGenerationFunction(df) 67 | ``` 68 | 69 | ## Computing and Plotting the APC 70 | 71 | I will compute both the signed and absolute APC. My absolute APCs are different from anything discussed in the paper, but the idea is simple: Everywhere we would use a predictive difference, we use its absolute value instead. I believe that whenever we display the signed APC, we should display the absolute APC alongside it. Transitions for a given input may have a large expected absolute impact on the output variable, but with signs that cancel out. If that's happening in your model, it would be important to know. 72 | 73 | Here are the APCs for this example, displayed in two different ways. The first shows signed and absolute APCs in separate charts. The second (my preference) shows both types of APC in the same chart. In this case, each absolute APC is plotted twice, once on the positive half of the APC axis and once on the negative half. This is necessary due to the symmetry between positive and negative numbers. The second version is my preference because it makes comparisons between signed and absolute APCs easier. 74 | 75 | For example, we can more quickly notice in the second version that the signed APC for $u_8$ is small relative to its absolute APC. This is because $u_8$ transitions between 0 and 10 at either $v=3$ or $v=-3$. Either $v$ (combined with the $u_8$-transition) leads to a relatively large absolute predictive comparison, but with opposite signs depending on $v$. 76 | 77 | ```{r ApcPlotsTwoWays} 78 | inputVars <- c("v",paste0("u",1:8)) 79 | apcDF <- GetPredCompsDF(outcomeGenerationFunction, df, inputVars = inputVars) 80 | print(apcDF) 81 | PlotPredCompsDF(apcDF, variant="Apc") 82 | ``` 83 | 84 | These APCs are just what we would expect from the setup of the synthetic examples. When $u$-transitions are at small $v$, the APCs are small due to the $u$/$v$ interaction effect (and vice versa). Even though we know the prediction function exactly, the APCs are a bit off due to errors in our estimates of the distribution of each input conditional on the others. We can mitigate that in this case by increasing the weight given to closer points when assigning weights based on the Mahalanobis distance. The weights are $\frac{1}{\text{mahalanobisConstantTerm}+d}$ where $d$ is the distance, so we can do this by decreasing $\text{mahalanobisConstantTerm}$: 85 | 86 | ```{r DecreasedMahalanobisConstantTerm} 87 | apcDF <- GetPredCompsDF(outcomeGenerationFunction, df, inputVars, mahalanobisConstantTerm=.01) 88 | PlotPredCompsDF(apcDF, variant="Apc") 89 | ``` 90 | 91 | In this case, the APCs are much closer to the correct values. Giving such extreme weight to pairs with small distance does not cause any problems in this example, but it would in more realistic examples. In applications, getting the weights right may be difficult. 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /_site/notes/examples/wine-logistic-regression.Rmd: -------------------------------------------------------------------------------- 1 | ```{r echo=FALSE, results='hide', message=FALSE} 2 | library(predcomps) 3 | library(ggplot2) 4 | library(boot) 5 | library(knitr) 6 | opts_chunk$set(tidy = FALSE) 7 | set.seed(1) 8 | ``` 9 | 10 | ## A Logistic Regression with Related Inputs 11 | 12 | *(The source code for this example is [here](https://github.com/dchudz/predcomps/blob/master/notes/examples/wine-logistic-regression.Rmd).)* 13 | 14 | We will set up a simulated data set to use for modeling the probability a customer buys a bottle of wine, given its price and quality. We'll compare a few situation varying the joint distribution of price ($P$) and quality ($Q$). The coefficients of the logistic regression determining the relationship between the inputs and the probability of purchase will not vary. 15 | 16 | In each variation, the probability of purchase is governed by the following logistic regression model: 17 | 18 | $$logit(P(\text{wine is purchased})) = 0.1 Q - 0.12 P$$ 19 | 20 | - *Variation 1*: Price is uniform; quality is price plus noise. Quality increases with price, but not enough make up for price, so the expensive wines are rarely purchased and the cheap wines are almost always purchased. 21 | - *Variation 2*: Just like *Variation 1*, but price is more densely concentrated in its middle range. This leads to price and quality both having a larger APC because the inverse logistic curve is steeper in this middle range. 22 | - *Variation 3*: Like *Variation 1*, but quality varies more strongly with price. The inverse logistic curve is now steeper at almost all price/quality combinations. 23 | 24 | The APC varies across these variations, but the logistic regression coefficients remain the same. The changes in APC in each of these variations are driven entirely by changes in the distribution of the inputs. The model relating inputs to outputs is unchanged. 25 | 26 | ### Variation 1 27 | 28 | In the first variation, quality and price are independent, with price uniformly distributed and quality set to price plus Gaussian noise: 29 | 30 | ```{r} 31 | priceCoef <- -.12 32 | qualityCoef <- .1 33 | qualityNoiseStdDev <- 5 34 | nWines=50000 35 | nRowsForPlottingSample <- 1000 36 | 37 | numForTransitionStart <- 500 38 | numForTransitionEnd <- 10000 39 | onlyIncludeNearestN = 100 40 | 41 | priceQualitySlope <- .4 42 | 43 | df1 <- local({ 44 | price <- sample(20:120, nWines, replace=TRUE) 45 | quality <- price * priceQualitySlope + 22 + rnorm(nWines, sd=qualityNoiseStdDev) 46 | purchaseProbability <- inv.logit(priceCoef*(price - 70) + qualityCoef*(quality - 50) ) 47 | purchased <- rbinom(n = length(purchaseProbability), size=1, prob=purchaseProbability) 48 | data.frame(Quality = quality, 49 | Price = price, 50 | PurchaseProbability = purchaseProbability, 51 | Purchased = purchased) 52 | }) 53 | ``` 54 | 55 | A scatter plot (using a random subset to avoid overplotting) shows us the relationship between price and quality: 56 | 57 | ```{r} 58 | df1Sample <- df1[sample.int(nWines, size=nRowsForPlottingSample), ] 59 | qplot(Price, Quality, alpha=I(.5), data = df1Sample) + 60 | expand_limits(y=c(0,100)) 61 | ``` 62 | 63 | When we fit a logistic regression, the coefficients are what we'd expect from the setup above: 64 | 65 | 66 | ```{r} 67 | logitFit1 <- glm(Purchased ~ Price + Quality, data = df1, family = "binomial") 68 | logitFit1 69 | ``` 70 | 71 | This plot shows the relationship between quality and probability of purchase for a few prices: 72 | 73 | ```{r} 74 | myScales <- list(scale_x_continuous(limits=c(0,100)), 75 | scale_y_continuous(limits=c(0,1))) 76 | 77 | ggplot(subset(df1Sample, Price %in% seq(20, 120, by=10))) + 78 | geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 79 | size = 3, alpha = 1) + 80 | ggtitle("Quality vs. Purchase Probability at Various Prices") + myScales 81 | 82 | ``` 83 | 84 | Each colored set of points is one portion of a shifted inverse logistic curve, determined by which Price/Quality combinations actually occur in our data. 85 | 86 | We can also see the portion of the curve that isn't represented in our data: 87 | 88 | ```{r echo=FALSE} 89 | linesDF <- expand.grid(Price = seq(20, 120, by=10), Quality = 0:100) 90 | linesDF$PurchaseProbability <- with(linesDF, 91 | inv.logit(priceCoef*(Price - 70) + qualityCoef*(Quality - 50))) 92 | last_plot() + geom_line(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 93 | data = linesDF, 94 | size=.2) 95 | ``` 96 | 97 | We can get average predictive comparisons from our fitted regression: 98 | 99 | ```{r} 100 | apc1 <- GetPredCompsDF(logitFit1, df1, 101 | numForTransitionStart = numForTransitionStart, 102 | numForTransitionEnd = numForTransitionEnd, 103 | onlyIncludeNearestN = onlyIncludeNearestN) 104 | ``` 105 | 106 | The `GetPredCompsDF` function produces a few kinds of outputs, but for now let's just focus on the signed average predictive comparison: 107 | 108 | ```{r} 109 | apc1[c("Input", "PerUnitInput.Signed")] 110 | ``` 111 | 112 | This means that (on average) the probability of purchase increases by about 1.2% per 1 unit increase in quality. 113 | 114 | ### Variation 2 115 | 116 | This variation will add some additional wines to the middle range of prices: 117 | 118 | ```{r} 119 | nAdditionalWines <- nWines 120 | supplementForDF2 <- local({ 121 | price <- sample(55:85, nWines, replace=TRUE) 122 | quality <- price * .4 + 22 + rnorm(nWines, sd=qualityNoiseStdDev) 123 | purchaseProbability <- inv.logit(priceCoef*(price - 70) + qualityCoef*(quality - 50) ) 124 | purchased <- rbinom(n = length(purchaseProbability), size=1, prob=purchaseProbability) 125 | data.frame(Quality = quality, 126 | Price = price, 127 | PurchaseProbability = purchaseProbability, 128 | Purchased = purchased) 129 | }) 130 | df2 <- rbind(df1, supplementForDF2) 131 | ``` 132 | 133 | A scatter plot (again, using a random subset to avoid overplotting) shows us the relationship between price and quality: 134 | 135 | ```{r} 136 | df2Sample <- df2[sample.int(nrow(df2), size=nRowsForPlottingSample), ] 137 | qplot(Price, Quality, alpha=I(.5), data = df2Sample) + 138 | expand_limits(y=c(0,100)) 139 | ``` 140 | 141 | When we fit a logistic regression, the coefficients are similar to before, since we haven't changed the underlying model: 142 | 143 | 144 | ```{r} 145 | logitFit2 <- glm(Purchased ~ Price + Quality, data = df2, family = "binomial") 146 | logitFit2 147 | ``` 148 | 149 | In the plot showing the relationship between quality and probability of purchase, we see more points at the steep section of the inverse logit curve: 150 | 151 | ```{r} 152 | ggplot(subset(df2Sample, Price %in% seq(20, 120, by=10))) + 153 | geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 154 | size = 3, alpha = 1) + 155 | ggtitle("Quality vs. Purchase Probability at Various Prices") + 156 | myScales 157 | ``` 158 | 159 | The APC for quality is correspondingly larger: 160 | 161 | ```{r} 162 | apc2 <- GetPredCompsDF(logitFit2, df2, 163 | numForTransitionStart = numForTransitionStart, 164 | numForTransitionEnd = numForTransitionEnd, 165 | onlyIncludeNearestN = onlyIncludeNearestN) 166 | 167 | apc2[c("Input", "PerUnitInput.Signed")] 168 | ``` 169 | 170 | This means that in this variation the probability of purchase increases (on average) by about 1.5% (vs. 1.2% in *Variation 1*) per 1-point increase in quality. The magnitude of the APC for price is also larger. 171 | 172 | ### Variation 3 173 | 174 | This is just like *Variation 1*, but price increases more with quality: 175 | 176 | ```{r} 177 | priceQualitySlope <- 1.2 178 | 179 | df3 <- local({ 180 | price <- sample(20:120, nWines, replace=TRUE) 181 | quality <- price * priceQualitySlope - 30 + rnorm(nWines, sd=qualityNoiseStdDev) 182 | purchaseProbability <- inv.logit(priceCoef*(price - 70) + qualityCoef*(quality - 50) ) 183 | purchased <- rbinom(n = length(purchaseProbability), size=1, prob=purchaseProbability) 184 | data.frame(Quality = quality, 185 | Price = price, 186 | PurchaseProbability = purchaseProbability, 187 | Purchased = purchased) 188 | }) 189 | ``` 190 | 191 | ```{r} 192 | df3Sample <- df3[sample.int(nWines, size=nRowsForPlottingSample), ] 193 | qplot(Price, Quality, alpha=I(.5), data = df3Sample) + 194 | expand_limits(y=c(0,100)) 195 | ``` 196 | 197 | The logistic regression still comes out the same: 198 | 199 | ```{r} 200 | logitFit3 <- glm(Purchased ~ Price + Quality, data = df3, family = "binomial") 201 | logitFit3 202 | ``` 203 | 204 | In this case, purchase is less certain at the low prices and more plausible at the high prices: 205 | 206 | ```{r} 207 | ggplot(subset(df3Sample, Price %in% seq(-100, 200, by=10))) + 208 | geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 209 | size = 3, alpha = 1) + 210 | ggtitle("Quality vs. Purchase Probability at Various Prices") + 211 | myScales 212 | ``` 213 | 214 | We can get average predictive comparisons from our fitted regression: 215 | 216 | ```{r} 217 | apc3 <- GetPredCompsDF(logitFit3, df3, 218 | numForTransitionStart = numForTransitionStart, 219 | numForTransitionEnd = numForTransitionEnd, 220 | onlyIncludeNearestN = onlyIncludeNearestN) 221 | ``` 222 | 223 | As expected, the APCs are (both) larger than in *Variation 1*: 224 | 225 | ```{r} 226 | apc3[c("Input", "PerUnitInput.Signed")] 227 | ``` 228 | -------------------------------------------------------------------------------- /_site/notes/impact.Rmd: -------------------------------------------------------------------------------- 1 | ## Impact: A variation on APCs with comparable units 2 | 3 | *Impact* answers the question: **What difference does each input tend to make in the output?** (Given the model and the distribution of inputs.) The units are the units of the output variable. 4 | 5 | ### The name "Impact" 6 | 7 | First off, I don't love the name "Impact" for this. "Variable importance" may signal somewhat the right idea to people already familiar with other measures of variable importance, but this notion is (or can be) signed, whereas variable importance generally isn't. 8 | 9 | Also, I believe that this metric may turn out to be more commonly useful in my work than APCs, in which case perhaps the name of the package and metric should be chosen to emphasize this. 10 | 11 | ### Impact 12 | 13 | *Impact* is a statistic similar to the APC, but which addresses two issues I've had with APCs: 14 | 15 | 1. APCs are good for their purpose (determining the expected difference in outcome per unit change in input), but this doesn't tell me how important an input is to my predictions. The APC could be high while the variation in the input is so small that it doesn't make a difference. 16 | 17 | 2. Relatedly, APCs across inputs with different units have different units themselves and so are not directly comparable. The example in the paper (see p. 47) uses mostly binary inputs, so this is mostly not a problem there. But I'm not sure the other inputs in that example belong on the same chart, and I would like to visualize the influence . 18 | 19 | Both (1) and (2) could be addressed by standardizing the coefficients before computing the APC, but this feels a bit ad hoc and arbitrary. Instead, I take the simpler and more elegant approach of just not dividing by the difference in inputs in the computation of the APC. That is, impact is just the expectation of 20 | 21 | $\Delta_f = f(u_2,v) - f(u_1,v)$ 22 | 23 | under the same process as in the APC: 24 | 25 | 1. sample $v$ from the (marginal) distribution of the corresponding inputs 26 | 2. sample $u_1$ and $u_2$ independently from the distribution of $u$ conditional on $v$ 27 | 28 | The computed quantity is therefore the expected value of the predictive difference caused by a random transition for the input of interest. The units are the same as the output variable. This statistic depends on the model, the variation in the input of interest, and the relationship between that inputs and the other inputs. 29 | 30 | ### Example (cont.) 31 | 32 | The same example used to demonstrate APCs demonstrates the difference between *impact* and APCs. Recall that the inputs ($x_1$, $x_2$, $x_3$) are independent, with 33 | 34 | $$y \sim 2x_1 - 2x_2 + x_3 + \mathcal{N}(0,.1)$$ 35 | 36 | However, the variation in input $x_3$ is much larger than the others: 37 | 38 | ```{r} 39 | n <- 200 40 | x1 <- runif(n=n, min=0, max=1) 41 | x2 <- runif(n=n, min=0, max=1) 42 | x3 <- runif(n=n, min=0, max=10) 43 | y <- 2*x1 + (-2)*x2 + 1*x3 44 | df <- data.frame(x1, x2, x3, y) 45 | fittedLm <- lm(y ~ ., data=df) 46 | ``` 47 | 48 | We can then compute and plot the *impact*: 49 | 50 | ```{r results='hide', message=FALSE} 51 | library(predcomps) 52 | apcDF <- GetPredCompsDF(fittedLm, df=df) 53 | PlotPredCompsDF(apcDF) + theme_gray(base_size = 18) 54 | ``` 55 | 56 | The *impact* for $x_3$ is about 5 times the impact for $x_1$, which makes sense as $x_3$ varies on a scale that is 10 times as large but with a coefficient half as big. 57 | 58 | ```{r} 59 | apcDF 60 | ``` 61 | 62 | The [examples](examples-overview.html) section goes through more interesting examples demonstrating more subtle features of how *impact* and APCs work. 63 | -------------------------------------------------------------------------------- /_site/notes/index.Rmd: -------------------------------------------------------------------------------- 1 | ## An R Package for Understanding Arbitrary Complex Models 2 | 3 | As complex models become widely used, it's more important than ever to have ways of understanding them. Whether a model is built primarily for prediction or primarily for understanding, we need to know what it's telling us. It's (relatively) clear how to interpret the coefficients of a linear regression, at least when there are few inputs and no interaction terms. 4 | 5 | This R package is a collection of tools that are meant to generalize the idea of these coefficients to arbitrary predictive models: Holding all else equal, how does the output vary with the input of interest? The tools here apply to any model that lets you map inputs to outputs, whether it's a linear model, a GLM (with any link function), a neural network, a random forest, etc. 6 | 7 | ### Inspiration 8 | 9 | The ideas implemented here originate in [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract). If you are already familiar with the paper, you can [skip to the differences between that and what's here](more-compared-with-paper.html). As far as I know, this is the only implementation intended for general use. 10 | 11 | ### Installation 12 | 13 | The package is not hosted on CRAN, but it's still easy to install: 14 | 15 | ```{r eval=FALSE} 16 | library(devtools) # first install devtools if you haven't 17 | install_github("predcomps", user="dchudz") 18 | ``` 19 | 20 | ### Quick Start 21 | 22 | If you want to get started quickly before reading about the functions in more detail, here's an example: 23 | 24 | . 25 | . 26 | . 27 | 28 | 29 | ### Current Limitations 30 | 31 | The package has some major limitations in its current form, but we have to start somewhere: 32 | 33 | **Input types**: At the time of this writing, all inputs to the model must be numerical (or coercable to such). In practice this means that binary or ordered categorical inputs are okay, but unordered categorical inputs are not. 34 | 35 | **Computational efficiency**: At the moment, the package's implementation is not particularly efficient. You should probably not ask it to handle more than 200-500 observations. (Note however that your model may be trained on any number of observations.) 36 | 37 | **Weight function**: The weight function used to estimate conditional distributions doesn't get much attention in the paper, and I haven't given it much more here (yet). 38 | 39 | 40 | ## Contact 41 | 42 | I'm very interested in feedback from folks who are trying this out. Please get in touch with me at dchudz@gmail.com.} 43 | -------------------------------------------------------------------------------- /_site/notes/more/compared-with-paper.Rmd: -------------------------------------------------------------------------------- 1 | ### One Difference: Renormalizing Weights 2 | 3 | There is only one outright difference between APCs as described in [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract) and as implemented here: After weights are assigned to pairs of observations (as in section __), the weights are normalized across the first element of the pair. [(explanation and justification)](more-renormalize-weights.html) 4 | 5 | ### Absolute APCs 6 | 7 | Gelman & Pardoe mention an unsigned version APCs in the case where the input of interest is an unordered categorical variable (in which case signs wouldn't make sense). They propose a root mean squared APC (equaton 4), but I prefer absolute values and I believe this absolute-values version is more generally useful. By default, I always compute and display an absolute version of the APC alongside the signed version. See e.g. the input $u_8$ in my [simulated linear model with interactions](examples-simulated-linear-model-interactions.html) for an example demonstrating the importance of this notion, and [my explanation of APCs](apc.html) for more detail. 8 | 9 | ### Impact: A variation on APCs with comparable units 10 | 11 | I've created a statistic similar to the APC, but which addresses two issues I've had with APCs: 12 | 13 | 1. APCs are good for their purpose (the expected difference in outcome per unit change in input), but it doesn't tell me the what difference an input makes to my predictions. The APC could be high while the variation in the input is so small that it doesn't make a difference. 14 | 15 | 2. APCs across inputs with different units have different units themselves and so are not directly comparable. The example in the paper (see p. 47) uses mostly binary inputs, so this is mostly not a problem there. But I'm not sure the other inputs belong on the same chart. 16 | 17 | Both (1) and (2) could be addressed by standardizing the coefficients before computing the APC, but this feels a bit ad hoc and arbitrary. Instead, I take the simpler and more elegant approach of just not dividing by the difference in inputs. The computed quantity is therefore the expected value of the predictive difference caused by a random transition for the input of interest. The units are the same as the output variable. It depends on the model, the variation in the input of interest, and the relationship between that inputs and the other inputs. 18 | 19 | I'm calling this notion *impact* (feel free to suggest another name), and it's described in more detail [here](impact.html) and in the examples. Just like APCs, it comes in signed and absolute forms. 20 | 21 | ### Transition Plots 22 | 23 | (Again, feel free to suggest a better name!) 24 | 25 | Both APCs and impact summarize the role of an input with one number. This is useful, but I also want to examine the model in more detail. For example, how does the influence of an input vary across its range? Breiman & Cutler's "partial dependence plots" as implemented in the [randomForests](http://cran.r-project.org/web/packages/randomForest/randomForest.pdf) package partly get at this question, but I wanted a visualization more in line with the spirit of APCs. See [my explanation of transition plots](transition-plots.html) and the examples for more detail. 26 | 27 | 28 | -------------------------------------------------------------------------------- /_site/notes/more/future-work.Rmd: -------------------------------------------------------------------------------- 1 | ## Future work 2 | 3 | ### Speed/scalability 4 | 5 | I often find myself cutting down the data set size for computational reasons. E.g. starting with 10k rows and looking only at 300. It might be better to use different numbers of rows in different roles, e.g. pairs between 300 rows and 1000 other rows. 6 | 7 | I should make it faster / work with more data. 8 | 9 | 10 | ### Comparison with other approaches 11 | 12 | For example, I'd like to compare with the tools (variable importance and partial plots) available in the `randomForest` package. I would guess that those tools are the most similar thing to this that's in common use. 13 | 14 | Other notions of variable importance to compare with: 15 | 16 | - the one in the `earth` package (implementing MARS) 17 | - I should look through `caret` to see what's available there 18 | 19 | ### Categorical inputs 20 | 21 | I'm not quite sure about the best way to sample from the distribution for an input of interest, conditional on the other inputs, when those other inputs include categorical variables. 22 | 23 | Gelman & Pardoe 2007 discuss what to do when the input of interest is an unordered categorical variable, but I'm mainly interested in applying the statistic I'm calling [*impact*](impact.html) to categorical variables, since *impact* is comparable across all types of inputs. 24 | 25 | 26 | ### Uncertainty 27 | 28 | As discussed in the paper, it is entirely natural to allow for multiple samples of parameters and plot these measures with their uncertainties as in the paper. 29 | 30 | I'm also interested in the uncertainty that arises from the density estimation. 31 | 32 | ### Weights 33 | 34 | We should have better & more automated choice of weights. Weights should be chosen in such a way that our estimates of conditional distributions approach the true distributions [in the limit as we get more data](more-large-N-limit.html). 35 | 36 | ### Code robustness 37 | 38 | I should deal with potential column name conflicts in data frames (e.g. I use a "weight" column, so I'll have problems if one of the inputs has that name). 39 | 40 | I should add unit tests to make further development more efficient. 41 | -------------------------------------------------------------------------------- /_site/notes/more/large-N-limit.Rmd: -------------------------------------------------------------------------------- 1 | ```{r echo=FALSE, message=FALSE} 2 | library("predcomps") 3 | print.data.frame <- function(...) base::print.data.frame(..., row.names=FALSE) 4 | ``` 5 | 6 | **This is a note about how the weights are (should be) constructed. The observation here is that our estimated APCs do not approach the theoretical APCs in the limit as we get more data. This is a clue that we should tweak the definition of the weights, but I haven't implemented such an improvement yet.** 7 | 8 | ## Large $N$ Limit 9 | 10 | As we get more data, it would be nice if the APC we compute tends toward the right answer, equation (2) of [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract). I don't care about asymptotics as much as some people, but if we don't get the right answer in the limit, that's at least a clue that we might not be doing as well as we can for smaller $N$. This note shows that unless we adjust the weighting function as we get more data, we won't have that nice property. 11 | 12 | ```{r} 13 | makeExampleDF <- function(N) { 14 | exampleDF <- data.frame( 15 | v=c(3,3,7,7), 16 | u=c(10,20,12,22) 17 | )[rep(c(1,2,3,4),c(.4*N,.4*N,.1*N,.1*N)),] 18 | exampleDF <- transform(exampleDF, v = v + rnorm(nrow(exampleDF), sd=.001)) 19 | return(exampleDF) 20 | } 21 | ``` 22 | 23 | Just as in the note "Normalizing Weights", the APC should be: 24 | 25 | $$.8 \delta_u(10 \rightarrow 20, 3, f) + 0.2 \delta_u(12 \rightarrow 22, 7, f) = (.8)(3) + (.2)(6) = 3.8$$ 26 | 27 | We get almost the same APC with 300 data points as 100: 28 | 29 | ```{r} 30 | GetSingleInputApcs(function(df) return(df$u * df$v), makeExampleDF(100), u="u", v="v")$PerUnitInput.Signed 31 | GetSingleInputApcs(function(df) return(df$u * df$v), makeExampleDF(300), u="u", v="v")$PerUnitInput.Signed 32 | ``` 33 | 34 | If we're looking at one value for $v$, $v=v_0$, the tradeoff in determining the weights is: 35 | 36 | 1. $v$'s closer to $v_0$ will do a better job representing the distribution of $u$ conditional on $v=v_0$ 37 | 2. but if too few $v$'s get too much of the weight, our estimate for the conditional distribution of $u$ will be too noisy 38 | 39 | The reason our estimate didn't improve with more data is that we're not moving more weight to nearby points as we get more data. For any $N$, we're presently putting roughly the same amount of mass at the same distances. As we get more data, we can afford to put more weight closer to $v$, because (2) becomes less of a problem. A couple ideas are: 40 | 41 | - With the weights as $\frac{1}{k+d}$ ($d$ is the Mahalanobis distance), we could scale $k$ down as $N$ goes up. 42 | - Or we could use the weights we are now ($k=1$), except we drop (equivalent to setting the weight to 0) all but the closest $s(N)$ points to each $v$. The function $s$ needs to increases with $N$, but not as fast as $N$, e.g. maybe $s(N) = sqrt(N)$ probably works. This means we're always decreasing bias (sampling from closer to the right $v$) and also decreasing variance (more samples) as N increases. This would also be good for keeping run-times and memory usage under control as $N$ increases. 43 | -------------------------------------------------------------------------------- /_site/notes/more/renormalize-weights.Rmd: -------------------------------------------------------------------------------- 1 | ```{r echo=FALSE, message=FALSE} 2 | set.seed(89890) 3 | opts_chunk$set(tidy=FALSE) 4 | library("plyr") 5 | library(predcomps) 6 | print.data.frame <- function(...) base::print.data.frame(..., row.names=FALSE) 7 | ``` 8 | 9 | ## Renormalizing Weights 10 | 11 | In computing the APC, we assign weights to pairs of observations based on the Mahalanobis distance between the corresponding $v$'s. This note uses a toy example to argue that we must *renormalize* the weights so that when we group by the first element of each pair and sum the weights, each group has the same sum-of-weights. 12 | 13 | This suggested renormalization is not discussed in [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract). 14 | 15 | ## Toy example with exact transitions 16 | 17 | If $u$ is an input of interest and $v$ are the other inputs, recall that the APC is the expected value of a quantity formed upon sampling from $v$, sampling twice from $u$ conditional on $v$, and computing predictive comparisons using those $u$'s. See equation (2) of [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract). 18 | 19 | If there were enough pairs of points with identical $v$, we could just use the sample distribution of $u$ given $v$. As noted in the paper, we may have few (if any) pairs of points with identical $v$. Even so, let's think through an example where we do have such identical pairs: 20 | 21 | Suppose $v$ consists of only 1 input, which can take one of two values. For simplicity, assume $u$ has exactly two possible (equally likely) values at each $v$ (which are different depending on the value of $v$), so there is only one possible transition at each $v$. Here's an example: 22 | 23 | ```{r} 24 | exampleDF <- data.frame( 25 | v=c(3,3,7,7), 26 | u=c(10,20,12,22) 27 | )[rep(c(1,2,3,4),c(40,40,10,10)),] 28 | ``` 29 | 30 | ```{r echo=FALSE} 31 | # Count each u/v combination: 32 | kable(ddply(exampleDF, c("v","u"), function(df) data.frame(CountOfRows = nrow(df)))) 33 | ``` 34 | 35 | Say we have a model $\hat{y} = f(u,v)$. I'll choose $\hat{y} = f(u,v) = uv$ for a simple example. (How the model is estimated is completely orthogonal to the questions addressed here, and I'll just pretend it's known exactly.) 36 | 37 | Equation (2) in the paper says the numerator in the APC should be: 38 | 39 | $$(.4)(.5)(.5)(f(20,3) - f(10, 3)) + (0.1)(.5)(.5)(f(22,7) - f(12,7)) $$ 40 | 41 | The .5's are the $p(u|v)$'s (and will cancel out in this case). (Terms with transition size 0 aren't included.) 42 | 43 | The denominator is: 44 | 45 | $$(.4)(.5)(.5)((20 - 10) + (.1)(.5)(.5)((22 - 12)$$ 46 | 47 | The ratio simplifies to: 48 | 49 | $$.8 \delta_u(10 \rightarrow 20, 3, f) + 0.2 \delta_u(12 \rightarrow 22, 7, f)$$ 50 | 51 | This is all overkill for our very simple example, where it's easy to see that the APC is just $(.8)(3) + (.2)(6)$. But I wanted to be very concrete. 52 | 53 | I'll compute it: 54 | 55 | ```{r} 56 | f <- function(u, v) return(u*v) 57 | ApcExact <- .8*(f(20,3) - f(10,3))/10 + .2*(f(22,7) - f(12,7))/10 58 | ApcExact 59 | ``` 60 | 61 | ## Now without exact duplicates 62 | 63 | Now imagine we don't have any exact duplicates of $v$. To get a corresponding example like that, I'll modify the first example by adding a really tiny bit of noise to $v$: $v_{new} = v + N(0,\epsilon)$. 64 | 65 | ```{r} 66 | exampleDF2 <- transform(exampleDF, v = v + rnorm(nrow(exampleDF), sd=.001)) 67 | ``` 68 | 69 | Now we form pairs and compute weights as described in the paper. Here's a sample of the resulting data frame of pairs, just to get a sense of what it looks like: 70 | 71 | ```{r} 72 | pairsDF <- GetPairs(exampleDF2, u="u", v="v", renormalizeWeights=FALSE) 73 | ``` 74 | 75 | ```{r echo=FALSE} 76 | kable(pairsDF[sample(1:nrow(pairsDF), 12), ], row.names=FALSE) 77 | ``` 78 | 79 | Now pairs with nearby $v$'s (which would have been the same $v$'s previously) have high weights, where pairs from far-away $v$'s (which were different $v$'s in the previous example) have low weights. That's good. 80 | 81 | But $v$ near 3 now has more weight in the data set for two reasons: 82 | 83 | 1. we started with more $v$'s near 3, so there are more rows with $v$ near 3 as the first element of the pair; and 84 | 2. each time $v$ is near $3$ in the first element of each pair, there are more nearby $v$'s to pair with, so we get higher Weights. 85 | 86 | Reason (1) is good, but reason (2) is not so good. 87 | 88 | In the data frame of pairs, the weights are all close to 0.14 or 1. Let's look at the joint distribution of $u$ and $v$ in just the pairs with weights close to 1: 89 | 90 | ```{r} 91 | pairsDF <- data.frame(vRounded = round(pairsDF$v), pairsDF) 92 | pairsHighWeightsDF <- subset(pairsDF, Weight > 0.9) 93 | ddply(pairsHighWeightsDF, 94 | c("vRounded","u"), 95 | function(df) data.frame(CountOfRows = nrow(df), 96 | ProportionOfRows = nrow(df)/nrow(pairsHighWeightsDF))) 97 | ``` 98 | 99 | We see that $v$'s near 7 makes up only about 5.7% of the pairs. (It would be exactly $(.2)(.2) = 4$%, except that when we form pairs to compute the APC we don't pair any row with itself.) 100 | 101 | If we form the APC based on these pairs and these weights, we weight the $v$'s near 3 too much, so our APC is too low: 102 | 103 | ```{r} 104 | pairsDF$yHat1 <- f(pairsDF$u, pairsDF$v) 105 | pairsDF$yHat2 <- f(pairsDF$u.B, pairsDF$v) 106 | pairsDF$uDiff <- pairsDF$u.B - pairsDF$u 107 | ApcApprox1 <- 108 | with(pairsDF, 109 | sum(Weight * (yHat2 - yHat1) * sign(uDiff)) / sum(Weight * uDiff * sign(uDiff))) 110 | ApcApprox1 111 | ``` 112 | 113 | I showed the full computation above, but we can also use the ```GetAPC``` function from this package: 114 | 115 | ```{r} 116 | GetSingleInputApcs(function(df) return(df$u * df$v), exampleDF2, u="u", v="v", renormalizeWeights=FALSE)$PerUnitInput.Signed 117 | ``` 118 | 119 | Instead, we can normalize Weights so that within each first element of the pair. 120 | 121 | ```{r} 122 | pairsDFWeightsNormalized <- ddply(pairsDF, "OriginalRowNumber", transform, Weight = Weight/sum(Weight)) 123 | ApcApprox2 <- 124 | with(pairsDFWeightsNormalized, 125 | sum(Weight * (yHat2 - yHat1) * sign(uDiff)) / sum(Weight * uDiff * sign(uDiff))) 126 | ApcApprox2 127 | ``` 128 | 129 | These renormalized Weights are the ones returned from ```GetPairs``` by default, and used in ```GetAPC``` by default: 130 | 131 | ```{r} 132 | GetSingleInputApcs(function(df) return(df$u * df$v), exampleDF2, u="u", v="v", renormalizeWeights=TRUE)$PerUnitInput.Signed 133 | ``` 134 | 135 | -------------------------------------------------------------------------------- /_site/notes/template: -------------------------------------------------------------------------------- 1 | #!html_output# 2 | -------------------------------------------------------------------------------- /_site/notes/transition-plots.Rmd: -------------------------------------------------------------------------------- 1 | ```{r message=FALSE, echo=FALSE} 2 | library(knitr) 3 | knitr::opts_chunk$set(tidy = FALSE, message=FALSE, warning=FALSE, fig.align='center') 4 | ``` 5 | 6 | ```{r echo=TRUE} 7 | library(predcomps) 8 | library(ggplot2) 9 | ``` 10 | 11 | - remove arrow heads -- dots at both ends maybe 12 | 13 | 14 | # Transition Plots: A new way to visualize models 15 | 16 | Transition plots are a way of looking at looking at the comparisons behind APCs in a less aggregated form. 17 | 18 | Instead of computing an average (one-point summary), we directly show the predictive comparisons. Since showing all of the pairs would cause overplotting (and not reflect the weights), we consider each original data point $(u_{orig},v)$ (where $u_{orig}$ is the input of interest and $v$ the other inputs) with only one resampled $u$ (sampled from all of the pairs between that data point and the others, with the same weights as used in APCs). 19 | 20 | We predict the output $y_{orig}$ at $(u_{orig},v)$ with the original $u$, and $y_{new}$ at $(u_{new},v)$ at the resampled $u$, and draw an arrow from $(u_{orig},y_{orig})$ to $(u_{new},y_{new})$. As in APCs, $v$ is held constant for each transition. 21 | 22 | I will consider a small example with two inputs, first with no interaction and then with an interaction. (In this simple model with only two independent inputs, the transition plot is probably not the best way to visualize the model, but it serves as an example to illustrate a technique that will be useful for more complicated models. For example, the varying size of the arrows in this case is just a distraction, as the value of $v$ does not in this case affect the distribution of $u$-transitions.) 23 | 24 | The simulated examples here will have $u$ and $v$ independent, and the first model will have: 25 | 26 | $$\mathcal{E}[y_1] = 2v_1 + 2v_2$$ 27 | 28 | ```{r CreateInputFeatures} 29 | N <- 100 30 | df <- data.frame(u=rnorm(N), v=rnorm(N)) 31 | df$y1 = 2*df$u + 2*df$v + rnorm(N) 32 | lm1 <- lm(y1 ~ u + v, data=df) 33 | print(lm1) 34 | TransitionPlot(function(df) predict(lm1, df), df, "u", "v") + ylab("y1") 35 | ``` 36 | 37 | Here is an example with an interaction: 38 | 39 | $$\mathcal{E}[y_1] = v_1 + v_2 + 2u_1v_1$$ 40 | 41 | ```{r} 42 | df$y2 = df$u + df$v + 2*df$v*df$u + rnorm(N) 43 | lm2 <- lm(y2 ~ u*v, data=df) 44 | print(lm2) 45 | TransitionPlot(function(df) predict(lm2, df), df, "u", "v") + ylab("y2") 46 | ``` 47 | 48 | Since the plot object returned by `TransitionPlot` includes its data, we can examine things in more detail by drawing our own arrows: 49 | 50 | ```{r} 51 | p <- TransitionPlot(function(df) predict(lm2, df), df, "u", "v", plot=FALSE) 52 | 53 | ggplot(p$data) + 54 | geom_segment(aes(x = u, y = output, xend = u.B, yend = outputNew, color=v), arrow = arrow()) + 55 | ylab("y2") 56 | ``` 57 | -------------------------------------------------------------------------------- /_site/temp/weights-exploration.R: -------------------------------------------------------------------------------- 1 | library(randomForest) 2 | library(predcomps) 3 | library(ggplot2) 4 | theme_set(theme_gray(base_size = 18)) 5 | 6 | credit <- read.csv("~/Downloads/cs-training.csv")[,-1] 7 | 8 | credit2 <- credit 9 | credit2 <- subset(credit2, !is.na(MonthlyIncome) & 10 | NumberOfTime30.59DaysPastDueNotWorse < 5 & 11 | RevolvingUtilizationOfUnsecuredLines <= 2 & 12 | NumberOfTime30.59DaysPastDueNotWorse <= 5 & 13 | NumberOfTime60.89DaysPastDueNotWorse <= 5 & 14 | NumberOfTimes90DaysLate <= 5 & 15 | MonthlyIncome < 5e4 & 16 | DebtRatio < 2 & 17 | NumberRealEstateLoansOrLines <= 12 & 18 | NumberOfDependents < 10 19 | ) 20 | 21 | for (col in names(credit2)) { 22 | print(qplot(credit2[[col]]) + ggtitle(col)) 23 | } 24 | 25 | credit2$SeriousDlqin2yrs <- factor(credit2$SeriousDlqin2yrs) 26 | rfFit <- randomForest(SeriousDlqin2yrs ~ ., data=credit2, ntree=10) 27 | 28 | credit2Small <- credit2[sample.int(nrow(credit2), size = 1000), ] 29 | 30 | 31 | X1 <- credit2[sample.int(nrow(credit2), size = 50), ] 32 | X2 <- credit2 33 | 34 | dim(X1) 35 | 36 | u="age" 37 | v = c("RevolvingUtilizationOfUnsecuredLines", "NumberOfTime30.59DaysPastDueNotWorse", 38 | "DebtRatio", "MonthlyIncome", "NumberOfOpenCreditLinesAndLoans", 39 | "NumberOfTimes90DaysLate", "NumberRealEstateLoansOrLines", "NumberOfTime60.89DaysPastDueNotWorse", 40 | "NumberOfDependents") 41 | mahalanobisConstantTerm = 1 42 | renormalizeWeights=TRUE 43 | removeDiagonal=TRUE 44 | onlyIncludeNearestN=500 45 | 46 | X1 <- X1[c(v,u)] 47 | X2 <- X2[c(v,u)] 48 | 49 | X1$OriginalRowNumber <- 1:nrow(X1) 50 | X2$OriginalRowNumber.B <- 1:nrow(X2) 51 | 52 | vMatrix1 <- as.matrix(X1[,v]) 53 | vMatrix2 <- as.matrix(X2[,v]) 54 | 55 | 56 | covV=cov(vMatrix2) 57 | 58 | distMatrix <- apply(vMatrix1, 1, function(row) mahalanobis(vMatrix2, row, covV)) 59 | dim(distMatrix) 60 | 61 | colnames(distMatrix) <- 1:ncol(distMatrix) 62 | rownames(distMatrix) <- 1:nrow(distMatrix) 63 | distDF <- as.data.frame(as.table(distMatrix)) 64 | names(distDF) <- c("OriginalRowNumber.B", "OriginalRowNumber", "MahalanobisDistance") 65 | 66 | distDF <- distDF %.% group_by(OriginalRowNumber) %.% filter(rank(MahalanobisDistance) < onlyIncludeNearestN) 67 | pairs <- merge(X1, distDF, by = "OriginalRowNumber") 68 | pairs <- merge(X2, pairs, by = "OriginalRowNumber.B", suffixes = c(".B", "")) 69 | pairs$Weight <- 1/(mahalanobisConstantTerm + pairs$MahalanobisDistance) 70 | if (removeDiagonal) { 71 | pairs <- subset(pairs, OriginalRowNumber != OriginalRowNumber.B) #remove pairs where both elements are the same 72 | } 73 | 74 | 75 | if (renormalizeWeights) { 76 | # pairs <- pairs %.% group_by(OriginalRowNumber) %.% mutate(Weight = Weight/sum(Weight)) 77 | # browser() 78 | pairs <- mutate(group_by(pairs, OriginalRowNumber), 79 | Weight = Weight/sum(Weight)) 80 | pairs <- data.frame(pairs) 81 | } #normalizing AFTER removing pairs from same row as each other 82 | 83 | 84 | # pairs <- pairs[c("OriginalRowNumber",u,v,paste0(u,".B"),"Weight")] 85 | 86 | 87 | PlotPairCumulativeWeights(pairs) 88 | 89 | pairs$OriginalRowNumber 90 | ggplot(pairs) + geom_histogram(aes(x=MahalanobisDistance)) + 91 | #scale_x_log10(limits=c(1e-3,1e3)) + 92 | facet_wrap(~ OriginalRowNumber) 93 | ggplot(pairs) + geom_histogram(aes(x=MahalanobisDistance, weight = Weight)) + 94 | #scale_x_log10(limits=c(1e-3,1e3)) + 95 | facet_wrap(~ OriginalRowNumber) 96 | 97 | library(dplyr) 98 | 99 | 100 | PlotPairCumulativeWeights(GetPairs(credit2Small, u=u, v=v, mahalanobisConstantTerm=0)) 101 | -------------------------------------------------------------------------------- /_site/tests/run-all.R: -------------------------------------------------------------------------------- 1 | # should be executed from package root 2 | library(testthat) 3 | library(devtools) 4 | 5 | setwd("~/github/predcomps") 6 | document(".") 7 | install(".") 8 | test_package("predcomps") 9 | -------------------------------------------------------------------------------- /_site/tests/test-apc.R: -------------------------------------------------------------------------------- 1 | test_that("APC matches coefficient exactly for linear model", { 2 | 3 | df <- data.frame(X = rep(c(1,2),2), 4 | Y = c(1,2,3,4)) 5 | 6 | predictionFunction <- function(df) 2*df$X + 3*df$Y 7 | result <- GetSingleInputApcs(predictionFunction, df, u="X", v="Y") 8 | expect_that(result$PerUnitInput.Signed, equals(2)) 9 | }) 10 | -------------------------------------------------------------------------------- /_site/tests/test-pairs.R: -------------------------------------------------------------------------------- 1 | MakeComparable <- function(df) { 2 | df <- round(df, digits = 5) 3 | return(df[do.call(order, df), ]) 4 | } 5 | 6 | test_that("GetPairs works right in a small example", { 7 | 8 | df <- data.frame(X = rep(c(1,2),2), 9 | Y = rep(c(2,4),2)) 10 | pairsActual <- GetPairs(df, "X", "Y") 11 | pairsExpected <- data.frame(OriginalRowNumber = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L), 12 | X = c(1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2), 13 | Y = c(2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4), 14 | X.B = c(2, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1), 15 | Weight = c(0.166666666666667, 0.666666666666667, 0.166666666666667, 0.166666666666667, 0.166666666666667, 0.666666666666667, 0.666666666666667, 0.166666666666667, 0.166666666666667, 0.166666666666667, 0.666666666666667, 0.166666666666667)) 16 | pairsActual <- pairsActual[names(pairsExpected)] 17 | expect_that(all.equal(MakeComparable(pairsActual), MakeComparable(pairsExpected)), is_true()) 18 | }) 19 | -------------------------------------------------------------------------------- /man/ComputeAPCFromPairs.Rd: -------------------------------------------------------------------------------- 1 | \name{ComputeApcFromPairs} 2 | \alias{ComputeApcFromPairs} 3 | \title{ComputeApcFromPairs} 4 | \usage{ 5 | ComputeApcFromPairs(predictionFunction, pairs, u, v, absolute = FALSE, 6 | impact = FALSE) 7 | } 8 | \description{ 9 | ComputeApcFromPairs 10 | } 11 | 12 | -------------------------------------------------------------------------------- /man/GetComparisonDFFromPairs.Rd: -------------------------------------------------------------------------------- 1 | \name{GetComparisonDFFromPairs} 2 | \alias{GetComparisonDFFromPairs} 3 | \title{GetComparisonDFFromPairs} 4 | \usage{ 5 | GetComparisonDFFromPairs(predictionFunction, pairs, u, v) 6 | } 7 | \description{ 8 | (abstracted this into a separate function from 9 | \code{GetApc} so we can more easily do things like 10 | \code{GetApcWithAbsolute}) 11 | } 12 | 13 | -------------------------------------------------------------------------------- /man/GetPairs.Rd: -------------------------------------------------------------------------------- 1 | \name{GetPairs} 2 | \alias{GetPairs} 3 | \title{GetPairs} 4 | \usage{ 5 | GetPairs(X, u, v, numForTransitionStart = NULL, numForTransitionEnd = NULL, 6 | onlyIncludeNearestN = NULL, mahalanobisConstantTerm = 1) 7 | } 8 | \arguments{ 9 | \item{X}{data frame} 10 | 11 | \item{u}{input of interest} 12 | 13 | \item{v}{other inputs} 14 | 15 | \item{mahalanobisConstantTerm}{Weights are (1 / 16 | (mahalanobisConstantTerm + Mahalanobis distance))} 17 | 18 | \item{numForTransitionStart}{number of rows to use as the 19 | start points of transitions (defaulting to `NULL`, we use 20 | all rows)} 21 | 22 | \item{numForTransitionEnd}{number of rows to use as 23 | potential end points of transitions (defaulting to 24 | `NULL`, we use all rows)} 25 | 26 | \item{onlyIncludeNearestN}{for each transition start, we 27 | only include as transition end points the nearest 28 | `onlyIncludeNearestN` rows (defaulting to `NULL`, we use 29 | all rows)} 30 | } 31 | \value{ 32 | a data frame with the inputs \code{v} from the first of 33 | each pair, \code{u} from each half (with ".B" appended to 34 | the second), and the Mahalanobis distances between the 35 | pairs. 36 | } 37 | \description{ 38 | Form all pairs of rows in \code{X} and compute Mahalanobis 39 | distances based on \code{v}. 40 | } 41 | \details{ 42 | To help with computational constraints, you have the option 43 | to not form pairs between all rows of \code{X} but instead 44 | of specify a certain number (\code{numForTransitionStart}) 45 | to randomly be selected as rows from which transitions 46 | start, and another number (\code{numForTransitionEnd}) to 47 | be randomly selected as where transitions end. We then form 48 | all pairs between transition-start rows and transition-end 49 | rows. 50 | 51 | In order to get a smaller data frame for later 52 | manipulations (and maybe just because it's a good idea), 53 | you can also specify \code{onlyIncludeNearestN}, in which 54 | case we return only the nearest \code{onlyIncludeNearestN} 55 | transition ends for each transition start (instead of all 56 | pairs). 57 | } 58 | \examples{ 59 | v <- rnorm(100) 60 | u <- v + 0.3*rnorm(100) 61 | qplot(v,u) 62 | X = data.frame(v=v,u=u) 63 | pairsDF <- GetPairs(X, "v", "u") 64 | pairsDFRow1 <- subset(pairsDF, OriginalRowNumber==1) 65 | # When we subset to one "original row number", all of the v's are the same: 66 | print(pairsDFRow1$v) 67 | # ... and u's corresponding to closer v.B (the v in the second element of the pair) have higher weight: 68 | qplot(u.B, Weight, data=pairsDFRow1) 69 | } 70 | 71 | -------------------------------------------------------------------------------- /man/GetPredCompsDF.Rd: -------------------------------------------------------------------------------- 1 | \name{GetPredCompsDF} 2 | \alias{GetPredCompsDF} 3 | \title{GetApcDF} 4 | \usage{ 5 | GetPredCompsDF(model, df, inputVars = NULL, ...) 6 | } 7 | \arguments{ 8 | \item{model}{Either a function (from a data frame to 9 | vector of predictions) or a model we know how to deal 10 | with (lm, glm)} 11 | 12 | \item{df}{data frame with data} 13 | 14 | \item{inputVars}{inputs to the model} 15 | 16 | \item{...}{extra parguments passed to GetPairs used to 17 | control Weight function} 18 | } 19 | \description{ 20 | makes average predictive comparison for all specified 21 | inputs 22 | } 23 | \examples{ 24 | n <- 200 25 | x1 <- runif(n = n, min = 0, max = 1) 26 | x2 <- runif(n = n, min = 0, max = 1) 27 | x3 <- runif(n = n, min = 0, max = 10) 28 | y <- 2 * x1 + (-2) * x2 + 1 * x3 + rnorm(n, sd = 0.1) 29 | df <- data.frame(x1, x2, x3, y) 30 | fittedLm <- lm(y ~ ., data = df) 31 | apcDF <- GetPredCompsDF(fittedLm, df = df) 32 | apcDF 33 | } 34 | 35 | -------------------------------------------------------------------------------- /man/GetSingleInputApcs.Rd: -------------------------------------------------------------------------------- 1 | \name{GetSingleInputApcs} 2 | \alias{GetSingleInputApcs} 3 | \title{GetSingleInputApcs} 4 | \usage{ 5 | GetSingleInputApcs(predictionFunction, X, u, v, ...) 6 | } 7 | \arguments{ 8 | \item{predictionFunction}{this could be a function (which 9 | takes data frame and makes returns a vector of 10 | predictions) or an object of class `lm`, `glm`, or 11 | `randomForest`} 12 | 13 | \item{X}{a data frame with all inputs} 14 | 15 | \item{u}{a string naming the input of interest} 16 | 17 | \item{v}{a string naming the other inputs} 18 | 19 | \item{...}{other arguments to be passed to `GetPairs`} 20 | } 21 | \value{ 22 | a list with: \code{signed} (the usual Apc) and 23 | \code{absolute} (Apc applied to the absolute value of the 24 | differences) 25 | } 26 | \description{ 27 | makes predictive comparison summaries (both per unit input 28 | and impact, both absolute and signed) by forming an data 29 | frame of pairs with appropriate weights and then calling 30 | `ComputeApcFromPairs`. Only works fore continuous inputs 31 | right now 32 | } 33 | \examples{ 34 | n <- 200 35 | x1 <- runif(n = n, min = 0, max = 1) 36 | x2 <- runif(n = n, min = 0, max = 1) 37 | x3 <- runif(n = n, min = 0, max = 10) 38 | y <- 2 * x1 + (-2) * x2 + 1 * x3 + rnorm(n, sd = 0.1) 39 | df <- data.frame(x1, x2, x3, y) 40 | fittedLm <- lm(y ~ ., data = df) 41 | fittedLm 42 | GetSingleInputApcs(fittedLm, df, "x2", c("x1", "x3")) 43 | } 44 | 45 | -------------------------------------------------------------------------------- /man/GetSingleInputPredComps.Rd: -------------------------------------------------------------------------------- 1 | \name{GetSingleInputPredComps} 2 | \alias{GetSingleInputPredComps} 3 | \title{GetSingleInputPredComps} 4 | \usage{ 5 | GetSingleInputPredComps(predictionFunction, X, u, v, ...) 6 | } 7 | \arguments{ 8 | \item{predictionFunction}{this could be a function (which 9 | takes data frame and makes returns a vector of 10 | predictions) or an object of class `lm`, `glm`, or 11 | `randomForest`} 12 | 13 | \item{X}{a data frame with all inputs} 14 | 15 | \item{u}{a string naming the input of interest} 16 | 17 | \item{v}{a string naming the other inputs} 18 | 19 | \item{...}{other arguments to be passed to `GetPairs`} 20 | } 21 | \value{ 22 | a list with: \code{signed} (the usual Apc) and 23 | \code{absolute} (Apc applied to the absolute value of the 24 | differences) 25 | } 26 | \description{ 27 | makes predictive comparison summaries (APC and impact, 28 | absolute and signed) by forming an data frame of pairs with 29 | appropriate weights and then calling `ComputeApcFromPairs`. 30 | Only works fore continuous inputs right now 31 | } 32 | 33 | -------------------------------------------------------------------------------- /man/PlotPairCumulativeWeights.Rd: -------------------------------------------------------------------------------- 1 | \name{PlotPairCumulativeWeights} 2 | \alias{PlotPairCumulativeWeights} 3 | \title{PlotPairCumulativeWeights} 4 | \usage{ 5 | PlotPairCumulativeWeights(pairs, numOriginalRowNumbersToPlot = 20) 6 | } 7 | \description{ 8 | For a sample of transition start rows, we plot rank of 9 | transition end (by increasing weight) vs. cumulative 10 | weight. This gives a sense of how much weight is going into 11 | the nearest points vs. further ones. 12 | } 13 | \examples{ 14 | v <- rnorm(100) 15 | u <- v + 0.3*rnorm(100) 16 | X = data.frame(v=v,u=u) 17 | pairsDF <- GetPairs(X, "v", "u") 18 | pairsDFRow1 <- subset(pairsDF, OriginalRowNumber==1) 19 | # For most original rows, we get 75\% of the weight in 50\% of the pairs: 20 | PlotPairCumulativeWeights(pairsDF) 21 | } 22 | 23 | -------------------------------------------------------------------------------- /man/PlotPredCompsDF.Rd: -------------------------------------------------------------------------------- 1 | \name{PlotPredCompsDF} 2 | \alias{PlotPredCompsDF} 3 | \title{PlotApcDF} 4 | \usage{ 5 | PlotPredCompsDF(apcDF, variant = "Impact") 6 | } 7 | \arguments{ 8 | \item{apcDF}{the output of GetApcDF} 9 | } 10 | \description{ 11 | plots the output of GetApcDF -- this is my preferred 12 | display for now 13 | } 14 | \examples{ 15 | n <- 200 16 | x1 <- runif(n = n, min = 0, max = 1) 17 | x2 <- runif(n = n, min = 0, max = 1) 18 | x3 <- runif(n = n, min = 0, max = 10) 19 | y <- 2 * x1 + (-2) * x2 + 1 * x3 + rnorm(n, sd = 0.1) 20 | df <- data.frame(x1, x2, x3, y) 21 | fittedLm <- lm(y ~ ., data = df) 22 | apcDF <- GetPredCompsDF(fittedLm, df = df) 23 | PlotPredCompsDF(apcDF, variant = "PerUnitInput") + theme_gray(base_size = 18) 24 | } 25 | 26 | -------------------------------------------------------------------------------- /notes/apc.Rmd: -------------------------------------------------------------------------------- 1 | ## APCs 2 | 3 | ### Predictive Comparisons Generalize Regression Coefficients 4 | 5 | At the heart of this package is the idea of a *predictive comparison*: We vary the input of interest holding the other inputs constant, and look at the differences in predicted values. Let $u$ represent the input of interest and $v$ the (vector of) other inputs. Let $f$ be a functinon making predictions, so 6 | 7 | $$\hat{y} = f(u,v)$$ 8 | 9 | Our $f$ could come from any predictive model. If we have a statistical model, then probably we would choose 10 | 11 | $$f(u,v) = \mathcal{E}[y \mid u, v, \theta]$$ 12 | 13 | (where $\theta$ are the parameters of the model). But we need not have a statistical model at all. For example, the prediction function $f$ could come from a random forest, or a support vector machine. 14 | 15 | Given the function $f$ and a choice of $u_1$, $u_2$, and $v$, we can compute 16 | 17 | $$\delta_{u_1 \rightarrow u_2, v} = \frac{f(u_2, v) - f(u_1, v)}{u_2-u_1}$$ 18 | 19 | If $f$ were a linear model with no interactions, the above would not depend on the particular choices of $u_1$, $u_2$, and $v$ and would be the regression coefficient corresponding to $u$. This is the formal sense in which predictive comparisons generalize regression coefficients. Since for more complicated models this varies as the inputs vary, we will take an average across a well chosen set of inputs. 20 | 21 | ### Choice of Inputs 22 | 23 | The APC is defined as 24 | 25 | $$\frac{\mathcal{E}[\Delta_f]}{\mathcal{E}[\Delta_u]}$$ 26 | 27 | where $\Delta_f = f(u_2,v) - f(u_1,v)$, $\Delta_u = u_2 - u_1$, and $\mathcal{E}$ is expectation under the following process: 28 | 29 | 1. sample $v$ from the (marginal) distribution of the corresponding inputs 30 | 2. sample $u_1$ and $u_2$ independently from the distribution of $u$ conditional on $v$ 31 | 32 | The reason for this definition is that we want to use representative values of $v$, and transitions in $u$ that are representative of what really occurs at those values of $v$. 33 | 34 | The fact that we are computing the numerator and denominator separately (rather than taking an expected value of $\delta_{u_1 \rightarrow u_2, v}$) amounts to weighting by the size of $(u_2 - u_1)$. This avoids having the result excessively influenced by small changes in $u$. 35 | 36 | ### Estimation 37 | 38 | The rows of our data represent samples from the joint distribution $u$, $v$, so each row amounts to a sample from $v$ followed by a sample $u_1$ conditional on $v$. The difficult thing is drawing another sample $u_2$ conditional on $v$. We approximate these by assigning weights to rows based on the proximity of the $v$ in that row to the $v$ in the row in question. The weights are: 39 | 40 | $$\frac{1}{\text{mahalanobisConstantTerm} + \text{(mahalanobis distance)}}$$ 41 | 42 | The [*mahalanobis distance*](https://en.wikipedia.org/wiki/Mahalanobis_distance) is a unitless version of distance that takes into account the correlation structure of $v$. The *mahalanobisConstantTerm* (which defaults to 1, but this is not always an appropriate choice) prevents all of the weight from going to the closest points. More work needs to be done in thinking about the weights. 43 | 44 | For more details, see [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract) (section 4), [my note](more-pairs-and-weights.html) explaining a small change from the weights described in the paper (renormalization) and reasons for considering only a set of the closest $N$ points. You can also look at [my code](https://github.com/dchudz/predcomps/blob/master/R/pairs.R) that computes the appropriate weights. 45 | 46 | ### Absolute Version 47 | 48 | The absolute APC (as opposed to the signed version described above) replaces $\Delta_f = f(u_2,v) - f(u_1,v)$ above with $|\Delta_f| = |f(u_2,v) - f(u_1,v)|$. For an extreme artificial example see e.g. the input $u_8$ in my [simulated linear model with interactions](examples-simulated-linear-model-interactions.html), where the signed APC is roughly 0 but the absolute APC is large. 49 | 50 | By default, I always compute and display an absolute version of the APC alongside the signed version. 51 | 52 | ### A Small Example 53 | 54 | This is a example running APCs on a simulated linear model with independent inputs and no interactions. For more involved examples, see the [examples](examples-overview.html) section. 55 | 56 | The inputs ($x_1$, $x_2$, $x_3$) are independent, with 57 | 58 | $$y \sim 2x_1 - 2x_2 + x_3 + \mathcal{N}(0,.1)$$ 59 | 60 | First we set up the data: 61 | 62 | ```{r} 63 | n <- 200 64 | x1 <- runif(n=n, min=0, max=1) 65 | x2 <- runif(n=n, min=0, max=1) 66 | x3 <- runif(n=n, min=0, max=10) 67 | y <- 2*x1 + (-2)*x2 + 1*x3 + rnorm(n, sd=.1) 68 | df <- data.frame(x1, x2, x3, y) 69 | ``` 70 | 71 | Then we fit a linear model: 72 | 73 | ```{r} 74 | fittedLm <- lm(y ~ ., data=df) 75 | fittedLm 76 | ``` 77 | 78 | We can then plot the average predictive comparisons: 79 | 80 | ```{r results='hide', message=FALSE} 81 | library(predcomps) 82 | apcDF <- GetPredCompsDF(fittedLm, df=df) 83 | PlotPredCompsDF(apcDF, variant="PerUnitInput") + theme_gray(base_size = 18) 84 | ``` 85 | 86 | Using different shapes / colors, both the absolute and signed versions are plotted. For symmetry, the absolute version is plotted with both a positive and negative sign. Since this is a linear model with no interactions, the signed APCs match those from the fitted linear model. 87 | 88 | The call to `GetPredCompsDF` returns this data frame: 89 | 90 | ```{r} 91 | apcDF 92 | ``` 93 | 94 | The columns plotted here are `PerUnitInput.Signed` and `Apc.Absolute`. The [next section](impact.html) desceibes those columns labeled "Impact". 95 | -------------------------------------------------------------------------------- /notes/examples/diamonds.Rmd: -------------------------------------------------------------------------------- 1 | # A new way to visualize models 2 | 3 | ```{r message=FALSE, echo=FALSE} 4 | library(knitr) 5 | knitr::opts_chunk$set(tidy = FALSE, message=FALSE, warning=FALSE, fig.align='center') 6 | library(randomForest) 7 | library(plyr) 8 | library(predcomps) 9 | library(ggplot2) 10 | ``` 11 | 12 | ```{r} 13 | diamonds <- transform(diamonds, clarity = 14 | factor(clarity, levels =c("SI1", "SI2", "VS1", "VS2", "VVS1", "VVS2", "IF"))) 15 | 16 | diamonds2 <- transform(diamonds, 17 | clarity = as.integer(clarity), 18 | cut = as.integer(cut), 19 | color = as.integer(color)) 20 | 21 | diamonds3 <- subset(diamonds2, !is.na(clarity)) 22 | 23 | rf <- randomForest(price ~ carat + cut + color + clarity, data=diamonds3, ntree=20) 24 | diamondsSmall <- diamonds3[sample.int(nrow(diamonds3), size=500), ] 25 | 26 | apcDf <- GetPredCompsDF(function(df) predict(rf, df), diamondsSmall, inputVars=row.names(rf$importance)) 27 | PlotPredCompsDF(apcDf) 28 | PlotPredCompsDF(apcDf, variant="PerUnitInput") 29 | 30 | 31 | pairs <- GetPairs(diamondsSmall, "carat", c("cut", "color", "clarity"), 32 | removeDiagonal=FALSE, 33 | mahalanobisConstantTerm=.1) 34 | 35 | u <- "carat" 36 | pairsSampled <- ddply(pairs, 37 | "OriginalRowNumber", function(df) { 38 | df[sample.int(nrow(df), size=5, prob=df$Weight), ] 39 | }) 40 | 41 | originalRowNumbersToUse <- sample(unique(pairsSampled$OriginalRowNumber), size=10) 42 | 43 | pairsSampled$carat <- pairsSampled$carat.B 44 | 45 | pairsSampled$Prediction <- predict(rf, pairsSampled) 46 | 47 | ggplot(subset(pairsSampled, OriginalRowNumber %in% originalRowNumbersToUse), 48 | aes(x=carat, y=Prediction, color=factor(OriginalRowNumber, levels=sample(originalRowNumbersToUse)))) + 49 | geom_point() + 50 | geom_line(size=.2) 51 | 52 | last_plot() + scale_x_continuous(limits=c(0,1)) + scale_y_continuous(limits=c(0,5000)) 53 | 54 | 55 | 56 | u <- "clarity" 57 | v <- c("carat", "cut", "color") 58 | pairs <- GetPairs(diamondsSmall, u, v, 59 | removeDiagonal=FALSE, 60 | mahalanobisConstantTerm=.1) 61 | 62 | pairsSampled <- ddply(pairs, 63 | "OriginalRowNumber", function(df) { 64 | df[sample.int(nrow(df), size=5, prob=df$Weight), ] 65 | }) 66 | 67 | originalRowNumbersToUse <- sample(unique(pairsSampled$OriginalRowNumber), size=10) 68 | 69 | pairsSampled[[u]] <- pairsSampled[[paste0(u,".B")]] 70 | 71 | pairsSampled$Prediction <- predict(rf, pairsSampled) 72 | 73 | pairsSampled$OriginalRowNumberFactor <- factor(pairsSampled$OriginalRowNumber, levels=sample(originalRowNumbersToUse)) 74 | ggplot(subset(pairsSampled, OriginalRowNumber %in% originalRowNumbersToUse), 75 | aes_string(x=u, y="Prediction", color="OriginalRowNumberFactor")) + 76 | geom_point() + 77 | geom_line(size=.2) 78 | 79 | 80 | 81 | u <- "color" 82 | v <- c("cut", "clarity", "carat") 83 | pairs <- GetPairs(diamondsSmall, u, v, 84 | removeDiagonal=FALSE, 85 | mahalanobisConstantTerm=.1) 86 | 87 | pairsSampled <- ddply(pairs, 88 | "OriginalRowNumber", function(df) { 89 | df[sample.int(nrow(df), size=5, prob=df$Weight), ] 90 | }) 91 | 92 | originalRowNumbersToUse <- sample(unique(pairsSampled$OriginalRowNumber), size=10) 93 | 94 | pairsSampled[[u]] <- pairsSampled[[paste0(u,".B")]] 95 | 96 | pairsSampled$Prediction <- predict(rf, pairsSampled) 97 | 98 | pairsSampled$OriginalRowNumberFactor <- factor(pairsSampled$OriginalRowNumber, levels=sample(originalRowNumbersToUse)) 99 | ggplot(subset(pairsSampled, OriginalRowNumber %in% originalRowNumbersToUse), 100 | aes_string(x=u, y="Prediction", color="OriginalRowNumberFactor")) + 101 | geom_point() + 102 | geom_line(size=.2) 103 | 104 | 105 | 106 | 107 | 108 | ``` 109 | -------------------------------------------------------------------------------- /notes/examples/loan-defaults.Rmd: -------------------------------------------------------------------------------- 1 | ## Credit Default Example 2 | 3 | ```{r message=FALSE, echo=FALSE} 4 | library(knitr) 5 | knitr::opts_chunk$set(tidy = FALSE, message=FALSE, warning=FALSE, fig.align='center', fig.width = 14, size=8) 6 | library(randomForest) 7 | library(predcomps) 8 | library(ggplot2) 9 | theme_set(theme_gray(base_size = 18)) 10 | library(dplyr) 11 | library(gridExtra) 12 | ``` 13 | 14 | This example is based on the training data set [found here](https://www.kaggle.com/c/GiveMeSomeCredit). We build a model to predict: 15 | 16 | - **SeriousDlqin2yrs** (target variable): Person experienced 90 days past due delinquency or worse 17 | 18 | The input features are: 19 | 20 | - **RevolvingUtilizationOfUnsecuredLines**: Total balance on credit cards and personal lines of credit except real estate and no installment debt like car loans divided by the sum of credit limits 21 | - **age**: Age of borrower in years 22 | - **NumberOfTime30-59DaysPastDueNotWorse**: Number of times borrower has been 30-59 days past due but no worse in the last 2 years. 23 | - **NumberOfTime60-89DaysPastDueNotWorse**: Number of times borrower has been 60-89 days past due but no worse in the last 2 years. 24 | - **NumberOfTimes90DaysLate**: Number of times borrower has been 90 days or more past due. 25 | - **DebtRatio**: Monthly debt payments, alimony,living costs divided by monthy gross income 26 | - **MonthlyIncome**: Monthly income 27 | - **NumberOfOpenCreditLinesAndLoans**: Number of Open loans (installment like car loan or mortgage) and Lines of credit (e.g. credit cards) 28 | - **NumberRealEstateLoansOrLines**: Number of mortgage and real estate loans including home equity lines of credit 29 | - **NumberOfDependents**: Number of dependents in family excluding themselves (spouse, children etc.) 30 | 31 | ### Parameters: 32 | 33 | These are some parameters controlling the aggregate predictive comparisons: 34 | 35 | ```{r} 36 | # We will transitions starting at each of 500 random rows 37 | numForTransitionStart <- 500 38 | # ... going to each of 10,000 other random rows: 39 | numForTransitionEnd <- 10000 40 | # ... keeping only the nearest 100 pairs for each start: 41 | onlyIncludeNearestN = 100 42 | ``` 43 | 44 | And for the random forest: 45 | 46 | ```{r} 47 | # 100 trees for random forest 48 | ntree = 100 49 | ``` 50 | 51 | 52 | ```{r echo=FALSE} 53 | # Remove some outliers 54 | credit <- read.csv("~/Downloads/cs-training.csv")[,-1] 55 | credit <- subset(credit, !is.na(MonthlyIncome) & 56 | NumberOfTime30.59DaysPastDueNotWorse < 5 & 57 | RevolvingUtilizationOfUnsecuredLines <= 2 & 58 | NumberOfTime30.59DaysPastDueNotWorse <= 5 & 59 | NumberOfTime60.89DaysPastDueNotWorse <= 5 & 60 | NumberOfTimes90DaysLate <= 5 & 61 | MonthlyIncome < 5e4 & 62 | DebtRatio < 2 & 63 | NumberRealEstateLoansOrLines <= 12 & 64 | NumberOfDependents < 10 65 | ) 66 | ``` 67 | 68 | ### Input Distribution 69 | 70 | The distribution of the inputs (after removing some outliers to make things more manageable): 71 | 72 | ```{r fig.height = 12, echo=FALSE} 73 | histograms <- Map(function(colName) { 74 | qplot(credit[[colName]]) + 75 | ggtitle(colName) + 76 | xlab("")}, 77 | setdiff(names(credit), "SeriousDlqin2yrs")) 78 | allHistograms <- do.call(arrangeGrob, c(histograms, ncol=2)) 79 | print(allHistograms) 80 | ``` 81 | 82 | ### Model Building 83 | 84 | We'll use a random forest for this example: 85 | 86 | ```{r} 87 | set.seed(1) 88 | # Turning the response to type "factor" causes the RF to be build for classification: 89 | credit$SeriousDlqin2yrs <- factor(credit$SeriousDlqin2yrs) 90 | rfFit <- randomForest(SeriousDlqin2yrs ~ ., data=credit, ntree=ntree) 91 | ``` 92 | 93 | ### Aggregate Predictive Comparisons 94 | 95 | ```{r message=FALSE, results='hide'} 96 | set.seed(1) 97 | apcDF <- GetPredCompsDF(rfFit, credit, 98 | numForTransitionStart = numForTransitionStart, 99 | numForTransitionEnd = numForTransitionEnd, 100 | onlyIncludeNearestN = onlyIncludeNearestN) 101 | ``` 102 | 103 | This is a table of the aggregate predictive comparisons: 104 | 105 | ```{r echo=FALSE} 106 | kable(apcDF, row.names=FALSE) 107 | ``` 108 | 109 | Since the [impact](impact.html) values all have the same units, it makes sense to chart them on the same axes: 110 | 111 | ```{r LoanDefaultImpact} 112 | PlotPredCompsDF(apcDF) 113 | ``` 114 | 115 | Note (for example) that average absolute value of the change in probability associated with changes in age is much larger than the magnitude of the signed average. This indicates either an interaction effect between age and other inputs, or a non-linear (non-monotonic) relationship between age an probability of default. We'll go into more detail on that in a bit. 116 | 117 | It wouldn't make sense to chart the average predictive comparisons (which are per unit change in input) in this way since they don't share units, but we can chart the inputs corresponding to a number of times late for various periods: 118 | 119 | ```{r} 120 | PlotPredCompsDF(apcDF[grep("NumberOfTime", apcDF$Input), ], 121 | variant = "PerUnitInput") 122 | ``` 123 | 124 | As you'd expect, this shows that the effect size (per additional incident) increases with the length of lateness, while the previous chart shows that `NumberOfTime30.59DaysPastDueNotWorse` makes more overall difference to the model than `NumberOfTime60.89DaysPastDueNotWorse`. This is because its variation is larger (it's non-zero more often). We can see this more clearly looking at the *impact* chart for just these inputs: 125 | 126 | 127 | ```{r} 128 | PlotPredCompsDF(apcDF[grep("NumberOfTime", apcDF$Input), ]) 129 | ``` 130 | 131 | The difference between the absolute and signed versions show that there must be some observations where adding an incident of lateness *decreases* the default probability. This is most pronounced for `NumberOfTime30.59DaysPastDueNotWorse`, so we'll look at this input in a bit more detail. 132 | 133 | ### Looking at `NumberOfTime30.59DaysPastDueNotWorse` in more detail 134 | 135 | Recall that the data from which the summarized predictive comparisons are computed consists of groups of rows, where with in each group only the input of interest varies (for the point we imagine transitioning to) and the rest are held constant. We can work directly with this data, visualizing it in more detail to better understand our model. 136 | 137 | I'll plot the predicted default probability vs. `NumberOfTime30.59DaysPastDueNotWorse`, holding the other inputs constant. Each line corresponds to one choice of values for the other inputs: 138 | 139 | ```{r NumTimes30To59DaysLateDefaultCurves} 140 | set.seed(6) 141 | pairs <- GetComparisonDF(rfFit, credit, 142 | u="NumberOfTime30.59DaysPastDueNotWorse", 143 | numForTransitionStart = 20, 144 | numForTransitionEnd = numForTransitionEnd*10, 145 | onlyIncludeNearestN = onlyIncludeNearestN*10) 146 | 147 | pairsSummarized <- pairs[c("OriginalRowNumber", "NumberOfTime30.59DaysPastDueNotWorse.B", "yHat2", "Weight")] %.% 148 | group_by(OriginalRowNumber, NumberOfTime30.59DaysPastDueNotWorse.B, yHat2) %.% summarise(Weight = sum(Weight)) 149 | 150 | ggplot(pairsSummarized, aes(x=NumberOfTime30.59DaysPastDueNotWorse.B, y=yHat2, color=factor(OriginalRowNumber))) + 151 | geom_point(aes(size = Weight)) + 152 | geom_line(size=.2) + 153 | scale_x_continuous(limits=c(0,2)) + 154 | scale_size_area() + 155 | xlab("NumberOfTime30.59DaysPastDueNotWorse") + 156 | ylab("Prediction") + 157 | guides(color = FALSE) 158 | ``` 159 | 160 | I've made the size of the points proportional to weight that the point receives. The summarized predictive comparisons give more weight to points with more weight, and so should we. 161 | 162 | The relationship is mostly as you'd expect, but let's examine the highlighted one in more detail: 163 | 164 | ```{r echo=FALSE} 165 | ggplot(pairsSummarized, aes(x=NumberOfTime30.59DaysPastDueNotWorse.B, y=yHat2, color=factor(OriginalRowNumber))) + 166 | geom_point(aes(size = Weight)) + 167 | geom_line(aes(alpha=ifelse(OriginalRowNumber == 18, 1, .3))) + 168 | scale_x_continuous(limits=c(0,2)) + 169 | scale_alpha_identity() + 170 | xlab("NumberOfTime30.59DaysPastDueNotWorse") + 171 | ylab("Prediction") + 172 | guides(color = FALSE) 173 | ``` 174 | 175 | Why does the default probability decrease (for this one choice of other input values) when `NumberOfTime30.59DaysPastDueNotWorse` increases? Values for all the other inputs are held constant, so let's see what they are: 176 | 177 | ```{r echo=FALSE} 178 | oneOriginalRowNumber <- subset(pairs, OriginalRowNumber == 18) 179 | kable(oneOriginalRowNumber[1,intersect(names(oneOriginalRowNumber), names(credit))]) 180 | ``` 181 | 182 | Hmm, note that `NumberOfTimes90DaysLate` (almost always $0$) is $1$ in this case. Looking at the definition of the target variable (`SeriousDlqin2yrs`): 183 | 184 | `SeriousDlqin2yrs`: "Person experienced 90 days past due delinquency or worse." 185 | 186 | As we'd expect, previous instances of 90-days-late are a strong indicator of future ones. But adding a previous 30-days-late (but not more) to someone with a previous 90-days-late seems to decrease the chance of future 90-days-lates. This is sensible -- with the 30-days-late (but not more), we have evidence that when you're late, you at least sometimes pay back in under 60 days. 187 | 188 | Further exploratory analysis would further improve our understanding: 189 | 190 | - Is this really primarily an interaction between `NumberOfTimes90DaysLate` and `NumberOfTime30.59DaysPastDueNotWorse`, or are other inputs involved? We could vary the other inputs and see if we still get this effect. 191 | - Is the effect validated on test data? 192 | 193 | ### More Detailed Examination: `Age` 194 | 195 | For one more example, let's examine the `Age` input in more detail. 196 | 197 | ```{r AgeDefaultCurves} 198 | set.seed(3) 199 | pairsAge <- GetComparisonDF(rfFit, credit, 200 | u="age", 201 | numForTransitionStart = 20, 202 | numForTransitionEnd = numForTransitionEnd*10, 203 | onlyIncludeNearestN = onlyIncludeNearestN*10) 204 | 205 | pairsSummarizedAge <- pairsAge[c("OriginalRowNumber", "age.B", "yHat2", "Weight")] %.% 206 | group_by(OriginalRowNumber, age.B, yHat2) %.% 207 | summarise(Weight = sum(Weight)) 208 | 209 | ggplot(pairsSummarizedAge, aes(x=age.B, y=yHat2, color=factor(OriginalRowNumber))) + 210 | geom_point(aes(size = Weight)) + 211 | geom_line(size=.2) + 212 | xlab("age") + 213 | ylab("Prediction") + 214 | guides(color = FALSE) 215 | ``` 216 | 217 | This is a bit of a mess, but we can at least see see that both interaction effects and non-monotonicity are present. 218 | 219 | Further exploration would look into which other inputs age is interacting with to determine these differently shaped curves. 220 | 221 | ```{r echo=FALSE} 222 | save.image(file="loan-defaults.RData") 223 | ``` 224 | -------------------------------------------------------------------------------- /notes/examples/logistic-regression.Rmd: -------------------------------------------------------------------------------- 1 | ## A Logistic Regression with Related Inputs 2 | 3 | ```{r} 4 | 5 | library(predcomps) 6 | library(ggplot2) 7 | library(boot) 8 | 9 | N=500 10 | v1 <- rnorm(N)*3 11 | v2 <- rnorm(N) * (abs(v1) > 2) 12 | v3 <- rnorm(N) * (abs(v1) < .5) 13 | 14 | # should we remove the points with the condition false instead of setting to 0? 15 | # no, it's good how we have it. since we're conditioning on (e.g.) v1 and v3 when we draw v2, 16 | # setting v2 to 0 in those cases means there really is no variation in v2 for those v1/v3 combos 17 | 18 | qplot(v2, v1 + v3) 19 | qplot(v3, v1 + v3) 20 | 21 | # plot we want: 22 | # v2 curve at various v1 + v3 values. sample from the real data to draw this 23 | # (get random rows and then fill in the whole curve.) 24 | # thicken the rows where v2 isn't 0, ie, does vary, or maybe use alpha to make the other ones lighter. 25 | # x = v2, color = v1 + v3 + 26 | 27 | table(v2==0) 28 | table(v3==0) 29 | df <- data.frame(v1,v2,v3) 30 | df$y <- rbinom(n=nrow(df), size=1, prob=with(df, inv.logit(v1+v2+v3))) 31 | 32 | fittedLogit <- glm(y ~ v1 + v2 + v3, data = df, family = "binomial") 33 | 34 | apcDF <- GetPredCompsDF(function(df) with(df, inv.logit(v1+v2+v3)), df, c("v1","v2","v3")) 35 | 36 | PlotPredCompsDF(apcDF) 37 | PlotPredCompsDF(apcDF, variant="Apc") 38 | 39 | 40 | GetPredCompsDF(fittedLogit, 41 | data.frame(v1, v2, v3), 42 | c("v1","v2","v3")) 43 | 44 | 45 | 46 | GetPredCompsDF 47 | 48 | ``` 49 | 50 | -------------------------------------------------------------------------------- /notes/examples/overview.Rmd: -------------------------------------------------------------------------------- 1 | ## Examples Overview 2 | 3 | ### Simulated Examples 4 | 5 | The simulated examples are constructed to demonstrate properties of predictive comparisons in artificial simulations that we can perfectly understand (because we created them). 6 | 7 | - [Logistic regression for wine prices](examples-wine-logistic-regression.html): This is a logistic regression model run in different simulated data situations. Changing the relationship between the inputs (and nothing else) leads to differences in the predictive comparisons, despite the logistic regression model remaining the same. 8 | 9 | - [Linear model with interactions](examples-simulated-linear-model-interactions.html): This is a linear model with 9 features and 8 interaction terms (an interaction between $v$ and each of $u_1 \ldots u_8$) which are all the same. The way the $u_i$'s are related to $v$ in the distribution of the inputs gives rise to differences in the APC, despite each $u_i$ playing the same role in the model. 10 | 11 | ### Real Examples 12 | 13 | Simulated examples are good for gaining understanding of how the package works and what it's doing, but they don't help us understand how it is useful in the real world. These are examples with real-world data sets: 14 | 15 | - [Loan Default Model](examples-loan-defaults.html): In this example, applying predictive comparisons to a random forest helps understand the drivers of a credit risk model. 16 | -------------------------------------------------------------------------------- /notes/examples/simulated-linear-model-interactions.Rmd: -------------------------------------------------------------------------------- 1 | ## APCs in a Synthetic Example of a Linear Model with Interactions 2 | 3 | ```{r message=FALSE, echo=FALSE} 4 | library(knitr) 5 | knitr::opts_chunk$set(tidy = FALSE, message=FALSE, warning=FALSE, fig.align='center') 6 | ``` 7 | 8 | ```{r echo=FALSE} 9 | library(predcomps) 10 | library(ggplot2) 11 | library(reshape2) 12 | library(plyr) 13 | ``` 14 | 15 | ### Input Generation 16 | 17 | This example will have 9 inputs: $v$ and $u_1$, $u_2$, ..., $u_8$. The input $v$ is uniformly distributed between 7 values, $-3, -2, \ldots, 2, 3$. Each $u$ is mostly constant (at $0$) but at one value of $v$ (which is $v=-3$ for $u_1$, $v=-2$ for $u_2$, etc), $u$ can be either $0$ and $10$. The input $u_8$ can transition at two values of $v$: At either $v=-3$ or $v=3$, $u_8$ can be $0$ or $10$; otherwise, $u_8$ is 0. I'll construct the data and plot the relationship between the $u$'s and $v$. 18 | 19 | ```{r} 20 | N <- 200 21 | vValues <- (-3):3 22 | v <- sample(vValues, N, replace=TRUE) 23 | 24 | df <- data.frame(v) 25 | for (i in seq_along(vValues)) { 26 | df[[paste0("u",i)]] <- 27 | ifelse(v==vValues[i], 28 | sample(c(0,10), N, replace=TRUE), # u can be either 0 or 10 at one v value 29 | rep(0, N) # u is always 0 at other ones 30 | ) 31 | } 32 | 33 | # u8 can transition at either v=-3 or v=3: 34 | df$u8 <- ifelse(v %in% c(-3,3), 35 | sample(c(0,10), N, replace=TRUE), 36 | rep(0, N) 37 | ) 38 | ``` 39 | 40 | ```{r ConditionalDistributionOfUsOnVs, echo=FALSE, fig.cap="distribution of $u$ conditional on $v$"} 41 | theme_set(theme_gray(base_size=20)) 42 | ggplot(melt(df, id="v")) + 43 | geom_bar(aes(x=factor(v), fill=factor(value)), position=position_fill()) + 44 | facet_grid(variable ~ .) + 45 | scale_fill_discrete("value of u_i") + 46 | scale_x_discrete("v") + 47 | ggtitle("u_i conditional on v") + 48 | scale_y_continuous("sample conditional distribution") + 49 | theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) 50 | ``` 51 | 52 | ### Outcome Generation 53 | 54 | Each $u$ will have the same role in the model, which is $$\mathbb{E}[y] = vu_1 + vu_2 + vu_3 + vu_4 + vu_5 + vu_6 + vu_7 + vu_8.$$ 55 | 56 | Note that each $u$ has the same role in this function. Differences between their APCs can only arise from the correlation structure of the inputs in combination with this function, not from this function alone. 57 | 58 | For simplicity this demonstration, I'll assume that $\mathbb{E}[y \mid v, u_1, \ldots, u_8]$ is known rather than estimated. 59 | 60 | (**Todo: Find a way to map these inputs/output to a possible story with meaningful features and outcome. That would make it more interesting, easier to follow, and maybe generate more insight.**) 61 | 62 | ```{r} 63 | outcomeGenerationFunction <- function(df) { 64 | with(df, v*u1 + v*u2 + v*u3 + v*u4 + v*u5 + v*u6 + v*u7 + v*u8) 65 | } 66 | df$y <- outcomeGenerationFunction(df) 67 | ``` 68 | 69 | ### Computing and Plotting the APC 70 | 71 | I will compute both the signed and absolute APC. My absolute APCs are different from anything discussed in the paper, but the idea is simple: Everywhere we would use a predictive difference, we use its absolute value instead. I believe that whenever we display the signed APC, we should display the absolute APC alongside it. Transitions for a given input may have a large expected absolute impact on the output variable, but with signs that cancel out. If that's happening in your model, it would be important to know. 72 | 73 | Here are the APCs for this example, displayed in two different ways. The first shows signed and absolute APCs in separate charts. The second (my preference) shows both types of APC in the same chart. In this case, each absolute APC is plotted twice, once on the positive half of the APC axis and once on the negative half. This is necessary due to the symmetry between positive and negative numbers. The second version is my preference because it makes comparisons between signed and absolute APCs easier. 74 | 75 | For example, we can more quickly notice in the second version that the signed APC for $u_8$ is small relative to its absolute APC. This is because $u_8$ transitions between 0 and 10 at either $v=3$ or $v=-3$. Either $v$ (combined with the $u_8$-transition) leads to a relatively large absolute predictive comparison, but with opposite signs depending on $v$. 76 | 77 | ```{r ApcPlotsTwoWays} 78 | inputVars <- c("v",paste0("u",1:8)) 79 | apcDF <- GetPredCompsDF(outcomeGenerationFunction, df, inputVars = inputVars) 80 | print(apcDF) 81 | PlotPredCompsDF(apcDF, variant="PerUnitInput") 82 | ``` 83 | 84 | These APCs are just what we would expect from the setup of the synthetic examples. When $u$-transitions are at small $v$, the APCs are small due to the $u$/$v$ interaction effect (and vice versa). Even though we know the prediction function exactly, the APCs are a bit off due to errors in our estimates of the distribution of each input conditional on the others. We can mitigate that in this case by increasing the weight given to closer points when assigning weights based on the Mahalanobis distance. The weights are $\frac{1}{\text{mahalanobisConstantTerm}+d}$ where $d$ is the distance, so we can do this by decreasing $\text{mahalanobisConstantTerm}$: 85 | 86 | ```{r DecreasedMahalanobisConstantTerm} 87 | apcDF <- GetPredCompsDF(outcomeGenerationFunction, df, inputVars, mahalanobisConstantTerm=.01) 88 | PlotPredCompsDF(apcDF, variant="PerUnitInput") 89 | ``` 90 | 91 | In this case, the APCs are much closer to the correct values. Giving such extreme weight to pairs with small distance does not cause any problems in this example, but it would in more realistic examples. In applications, getting the weights right may be difficult. 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /notes/examples/wine-logistic-regression.Rmd: -------------------------------------------------------------------------------- 1 | ```{r echo=FALSE, results='hide', message=FALSE} 2 | library(predcomps) 3 | library(ggplot2) 4 | library(boot) 5 | library(knitr) 6 | opts_chunk$set(tidy = FALSE) 7 | set.seed(1) 8 | ``` 9 | 10 | ## A Logistic Regression with Related Inputs 11 | 12 | *(The source code for this example is [here](https://github.com/dchudz/predcomps/blob/master/notes/examples/wine-logistic-regression.Rmd).)* 13 | 14 | We will set up a simulated data set to use for modeling the probability a customer buys a bottle of wine, given its price and quality. We'll compare a few situation varying the joint distribution of price ($P$) and quality ($Q$). The coefficients of the logistic regression determining the relationship between the inputs and the probability of purchase will not vary. 15 | 16 | In each variation, the probability of purchase is governed by the following logistic regression model: 17 | 18 | $$logit(P(\text{wine is purchased})) = 0.1 Q - 0.12 P$$ 19 | 20 | - *Variation 1*: Price is uniform; quality is price plus noise. Quality increases with price, but not enough make up for price, so the expensive wines are rarely purchased and the cheap wines are almost always purchased. 21 | - *Variation 2*: Just like *Variation 1*, but price is more densely concentrated in its middle range. This leads to price and quality both having a larger APC because the inverse logistic curve is steeper in this middle range. 22 | - *Variation 3*: Like *Variation 1*, but quality varies more strongly with price. The inverse logistic curve is now steeper at almost all price/quality combinations. 23 | 24 | The APC varies across these variations, but the logistic regression coefficients remain the same. The changes in APC in each of these variations are driven entirely by changes in the distribution of the inputs. The model relating inputs to outputs is unchanged. 25 | 26 | ### Variation 1 27 | 28 | In the first variation, quality and price are independent, with price uniformly distributed and quality set to price plus Gaussian noise: 29 | 30 | ```{r} 31 | priceCoef <- -.12 32 | qualityCoef <- .1 33 | qualityNoiseStdDev <- 5 34 | nWines=50000 35 | nRowsForPlottingSample <- 1000 36 | 37 | numForTransitionStart <- 500 38 | numForTransitionEnd <- 10000 39 | onlyIncludeNearestN = 100 40 | 41 | priceQualitySlope <- .4 42 | 43 | df1 <- local({ 44 | price <- sample(20:120, nWines, replace=TRUE) 45 | quality <- price * priceQualitySlope + 22 + rnorm(nWines, sd=qualityNoiseStdDev) 46 | purchaseProbability <- inv.logit(priceCoef*(price - 70) + qualityCoef*(quality - 50) ) 47 | purchased <- rbinom(n = length(purchaseProbability), size=1, prob=purchaseProbability) 48 | data.frame(Quality = quality, 49 | Price = price, 50 | PurchaseProbability = purchaseProbability, 51 | Purchased = purchased) 52 | }) 53 | print(getwd()) 54 | ``` 55 | 56 | A scatter plot (using a random subset to avoid overplotting) shows us the relationship between price and quality: 57 | 58 | ```{r V1Scatter} 59 | df1Sample <- df1[sample.int(nWines, size=nRowsForPlottingSample), ] 60 | qplot(Price, Quality, alpha=I(.5), data = df1Sample) + 61 | expand_limits(y=c(0,100)) 62 | ``` 63 | 64 | When we fit a logistic regression, the coefficients are what we'd expect from the setup above: 65 | 66 | 67 | ```{r} 68 | logitFit1 <- glm(Purchased ~ Price + Quality, data = df1, family = "binomial") 69 | logitFit1 70 | ``` 71 | 72 | This plot shows the relationship between quality and probability of purchase for a few prices: 73 | 74 | ```{r V1QualityVsProb} 75 | myScales <- list(scale_x_continuous(limits=c(0,100)), 76 | scale_y_continuous(limits=c(0,1))) 77 | 78 | ggplot(subset(df1Sample, Price %in% seq(20, 120, by=10))) + 79 | geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 80 | size = 3, alpha = 1) + 81 | ggtitle("Quality vs. Purchase Probability at Various Prices") + myScales + 82 | scale_color_discrete("Price") 83 | 84 | ``` 85 | 86 | Each colored set of points is one portion of a shifted inverse logistic curve, determined by which Price/Quality combinations actually occur in our data. 87 | 88 | We can also see the portion of the curve that isn't represented in our data: 89 | 90 | ```{r V1QualityVsProbWithLines, echo=FALSE} 91 | linesDF <- expand.grid(Price = seq(20, 120, by=10), Quality = -20:130) 92 | linesDF$PurchaseProbability <- with(linesDF, 93 | inv.logit(priceCoef*(Price - 70) + qualityCoef*(Quality - 50))) 94 | last_plot() + geom_line(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 95 | data = linesDF, 96 | size=.2) 97 | ``` 98 | 99 | We can get average predictive comparisons from our fitted regression: 100 | 101 | ```{r} 102 | apc1 <- GetPredCompsDF(logitFit1, df1, 103 | numForTransitionStart = numForTransitionStart, 104 | numForTransitionEnd = numForTransitionEnd, 105 | onlyIncludeNearestN = onlyIncludeNearestN) 106 | ``` 107 | 108 | The `GetPredCompsDF` function produces a few kinds of outputs, but for now let's just focus on the signed average predictive comparison: 109 | 110 | ```{r} 111 | apc1[c("Input", "PerUnitInput.Signed")] 112 | ``` 113 | 114 | This means that (on average) the probability of purchase increases by about 1.2% per 1 unit increase in quality. 115 | 116 | ### Variation 2 117 | 118 | This variation will add some additional wines to the middle range of prices: 119 | 120 | ```{r} 121 | nAdditionalWines <- nWines 122 | supplementForDF2 <- local({ 123 | price <- sample(55:85, nWines, replace=TRUE) 124 | quality <- price * .4 + 22 + rnorm(nWines, sd=qualityNoiseStdDev) 125 | purchaseProbability <- inv.logit(priceCoef*(price - 70) + qualityCoef*(quality - 50) ) 126 | purchased <- rbinom(n = length(purchaseProbability), size=1, prob=purchaseProbability) 127 | data.frame(Quality = quality, 128 | Price = price, 129 | PurchaseProbability = purchaseProbability, 130 | Purchased = purchased) 131 | }) 132 | df2 <- rbind(df1, supplementForDF2) 133 | ``` 134 | 135 | A scatter plot (again, using a random subset to avoid overplotting) shows us the relationship between price and quality: 136 | 137 | ```{r} 138 | df2Sample <- df2[sample.int(nrow(df2), size=nRowsForPlottingSample), ] 139 | qplot(Price, Quality, alpha=I(.5), data = df2Sample) + 140 | expand_limits(y=c(0,100)) 141 | ``` 142 | 143 | When we fit a logistic regression, the coefficients are similar to before, since we haven't changed the underlying model: 144 | 145 | 146 | ```{r} 147 | logitFit2 <- glm(Purchased ~ Price + Quality, data = df2, family = "binomial") 148 | logitFit2 149 | ``` 150 | 151 | In the plot showing the relationship between quality and probability of purchase, we see more points at the steep section of the inverse logit curve: 152 | 153 | ```{r} 154 | ggplot(subset(df2Sample, Price %in% seq(20, 120, by=10))) + 155 | geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 156 | size = 3, alpha = 1) + 157 | ggtitle("Quality vs. Purchase Probability at Various Prices") + 158 | myScales + 159 | scale_color_discrete("Price") 160 | ``` 161 | 162 | The APC for quality is correspondingly larger: 163 | 164 | ```{r} 165 | apc2 <- GetPredCompsDF(logitFit2, df2, 166 | numForTransitionStart = numForTransitionStart, 167 | numForTransitionEnd = numForTransitionEnd, 168 | onlyIncludeNearestN = onlyIncludeNearestN) 169 | 170 | apc2[c("Input", "PerUnitInput.Signed")] 171 | ``` 172 | 173 | This means that in this variation the probability of purchase increases (on average) by about 1.5% (vs. 1.2% in *Variation 1*) per 1-point increase in quality. The magnitude of the APC for price is also larger. 174 | 175 | ### Variation 3 176 | 177 | This is just like *Variation 1*, but price increases more with quality: 178 | 179 | ```{r} 180 | priceQualitySlope <- 1.2 181 | 182 | df3 <- local({ 183 | price <- sample(20:120, nWines, replace=TRUE) 184 | quality <- price * priceQualitySlope - 30 + rnorm(nWines, sd=qualityNoiseStdDev) 185 | purchaseProbability <- inv.logit(priceCoef*(price - 70) + qualityCoef*(quality - 50) ) 186 | purchased <- rbinom(n = length(purchaseProbability), size=1, prob=purchaseProbability) 187 | data.frame(Quality = quality, 188 | Price = price, 189 | PurchaseProbability = purchaseProbability, 190 | Purchased = purchased) 191 | }) 192 | ``` 193 | 194 | ```{r} 195 | df3Sample <- df3[sample.int(nWines, size=nRowsForPlottingSample), ] 196 | qplot(Price, Quality, alpha=I(.5), data = df3Sample) + 197 | expand_limits(y=c(0,100)) 198 | ``` 199 | 200 | The logistic regression still comes out the same: 201 | 202 | ```{r} 203 | logitFit3 <- glm(Purchased ~ Price + Quality, data = df3, family = "binomial") 204 | logitFit3 205 | ``` 206 | 207 | In this case, purchase is less certain at the low prices and more plausible at the high prices: 208 | 209 | ```{r} 210 | ggplot(subset(df3Sample, Price %in% seq(-100, 200, by=10))) + 211 | geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 212 | size = 3, alpha = 1) + 213 | ggtitle("Quality vs. Purchase Probability at Various Prices") + 214 | myScales + 215 | scale_color_discrete("Price") 216 | ``` 217 | 218 | We can get average predictive comparisons from our fitted regression: 219 | 220 | ```{r} 221 | apc3 <- GetPredCompsDF(logitFit3, df3, 222 | numForTransitionStart = numForTransitionStart, 223 | numForTransitionEnd = numForTransitionEnd, 224 | onlyIncludeNearestN = onlyIncludeNearestN) 225 | ``` 226 | 227 | As expected, the APCs are (both) larger than in *Variation 1*: 228 | 229 | ```{r} 230 | apc3[c("Input", "PerUnitInput.Signed")] 231 | ``` 232 | 233 | ### Comparing the Variations 234 | 235 | Comparing all of the variation in one plot, we can see the increase in the effect of wine quality on purchase probability going from Variation 1 to Variation 3: 236 | 237 | ```{r echo=FALSE} 238 | apc1$Variation <- 1 239 | apc2$Variation <- 2 240 | apc3$Variation <- 3 241 | 242 | apcAllVariations <- do.call(rbind, list(apc1, apc2, apc3)) 243 | ggplot(subset(apcAllVariations, Input=="Quality")) + 244 | geom_point(aes(x=factor(Variation), y=PerUnitInput.Signed), size=3) + 245 | expand_limits(y=0) + 246 | xlab("Variation") + 247 | ggtitle("Per Unit Quality APC for Quality across Variations") 248 | 249 | ``` 250 | 251 | ```{r echo=FALSE} 252 | save.image(file="wine-logistic-regression.RData") 253 | ``` 254 | -------------------------------------------------------------------------------- /notes/impact.Rmd: -------------------------------------------------------------------------------- 1 | ## Impact: A variation on APCs with comparable units 2 | 3 | *Impact* answers the question: **What difference does each input tend to make in the output?** (Given the model and the distribution of inputs.) The units are the units of the output variable. 4 | 5 | ### The name "Impact" 6 | 7 | First off, I don't love the name "Impact" for this. "Variable importance" may signal somewhat the right idea to people already familiar with other measures of variable importance, but this notion is (or can be) signed, whereas variable importance generally isn't. 8 | 9 | ### Impact 10 | 11 | *Impact* is a statistic similar to the APC, but which addresses two issues I've had with APCs: 12 | 13 | 1. APCs are good for their purpose (determining the expected difference in outcome per unit change in input), but this doesn't tell me how important an input is to my predictions. The APC could be high while the variation in the input is so small that it doesn't make a difference. 14 | 15 | 2. Relatedly, APCs across inputs with different units have different units themselves and so are not directly comparable. The example in the paper (see p. 47) uses mostly binary inputs, so this is mostly not a problem there. But I'm not sure the other inputs in that example belong on the same chart, and I would like to visualize the influence . 16 | 17 | Both (1) and (2) could be addressed by standardizing the coefficients before computing the APC, but this feels a bit ad hoc and arbitrary. Instead, I take the simpler and more elegant approach of just not dividing by the difference in inputs in the computation of the APC. That is, impact is just the expectation of 18 | 19 | $\Delta_f = f(u_2,v) - f(u_1,v)$ 20 | 21 | under the same process as in the APC: 22 | 23 | 1. sample $v$ from the (marginal) distribution of the corresponding inputs 24 | 2. sample $u_1$ and $u_2$ independently from the distribution of $u$ conditional on $v$ 25 | 26 | The computed quantity is therefore the expected value of the predictive difference caused by a random transition for the input of interest. The units are the same as the output variable. This statistic depends on the model, the variation in the input of interest, and the relationship between that inputs and the other inputs. 27 | 28 | ### Example (cont.) 29 | 30 | The same example used to demonstrate APCs demonstrates the difference between *impact* and APCs. Recall that the inputs ($x_1$, $x_2$, $x_3$) are independent, with 31 | 32 | $$y \sim 2x_1 - 2x_2 + x_3 + \mathcal{N}(0,.1)$$ 33 | 34 | However, the variation in input $x_3$ is much larger than the others: 35 | 36 | ```{r} 37 | n <- 200 38 | x1 <- runif(n=n, min=0, max=1) 39 | x2 <- runif(n=n, min=0, max=1) 40 | x3 <- runif(n=n, min=0, max=10) 41 | y <- 2*x1 + (-2)*x2 + 1*x3 42 | df <- data.frame(x1, x2, x3, y) 43 | fittedLm <- lm(y ~ ., data=df) 44 | ``` 45 | 46 | We can then compute and plot the *impact*: 47 | 48 | ```{r results='hide', message=FALSE} 49 | library(predcomps) 50 | apcDF <- GetPredCompsDF(fittedLm, df=df) 51 | PlotPredCompsDF(apcDF) + theme_gray(base_size = 18) 52 | ``` 53 | 54 | The *impact* for $x_3$ is about 5 times the impact for $x_1$, which makes sense as $x_3$ varies on a scale that is 10 times as large but with a coefficient half as big. 55 | 56 | ```{r} 57 | apcDF 58 | ``` 59 | 60 | The [examples](examples-overview.html) section goes through more interesting examples demonstrating more subtle features of how *impact* and APCs work. 61 | -------------------------------------------------------------------------------- /notes/index.Rmd: -------------------------------------------------------------------------------- 1 | **Please note: This is a very early version of the package. Please take it (and everything written on these pages) as provisional.** 2 | 3 | ## An R Package for Understanding Arbitrary Complex Models 4 | 5 | As complex models become widely used, it's more important than ever to have ways of understanding them. Even when a model is built primarily for prediction (rather than primarily as an aid to understanding), we still need to know what it's telling us. For each input to the model, we should be able to answer questions like these: 6 | 7 | 1. **On average, much does the output increase per unit increase in input?** 8 | 2. **How does the influence of the input vary? (Across its range? In combination with other inputs?)** 9 | 3. **How much difference does variation in this input make to the predictions? (Relative to other inputs?)** 10 | 11 | For example, if our model were a linear regression with no interactions and no transformations of the inputs, the (1) would be answered by our regression coefficients, (2) would be answered "It doesn't vary", and (3) would be a little harder but not too bad. All of these questions are much harder for more complicated models. 12 | 13 | This R package is a collection of tools that are meant to help answer these questions for arbitrary complicated models. One advantage of the fact that they work work equally well for any model is that they can be used to **compare models**. 14 | 15 | The key feature of the approach here is that we try to properly **account for relationships between the various inputs**. 16 | 17 | ### Origin 18 | 19 | The ideas implemented here originate in [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract). If you are already familiar with the paper, you can [skip to the differences between that and what's here](more-compared-with-paper.html). As far as I know, this is the only implementation intended for general use. 20 | 21 | ### Installation 22 | 23 | The package is not hosted on CRAN, but it's still easy to install: 24 | 25 | ```{r eval=FALSE} 26 | library(devtools) # first install devtools if you haven't 27 | install_github("predcomps", user="dchudz") 28 | ``` 29 | 30 | ### What we give you: 31 | 32 | 1. [Average Predictive Comparisons Per Unit Input](apc.html) - On average, much does the output increase per unit increase in input? (This generalizes regression coefficients.) 33 | 2. Sensitivity Analysis - the package doesn't yet have functions that fully automate this, but see the bottom of [this example](examples-loan-defaults.html) for how this package can help with sensitivity analysis in a way that's consistent with accounting for relationships between the various inputs. 34 | 3. [Impact](impact.html) - What is the average difference that a random change in each input makes to the predictions? (You can use this to compare the importance of the various inputs.) 35 | 36 | You can compute both signed and absolute version of (1) and (3) for all inputs in your model with the function `GetPredCompsDF` and plot them using `PlotPredCompsDF`. See [the PDF documentation](predcomps-manual.pdf) for how to call each of these functions. 37 | 38 | ### Example Output 39 | 40 | Here's an example of the output of *[impact](impact.html)* (which gets at a idea similar to 'variable importance' in other packages) from [an example predicting loan defaults](examples-loan-defaults.html). The model is a random forest. Note that *impact* is expressed in the units of the output variable (probabilities in this case), and hence it always makes sense to show it for all of the inputs on the same scale: 41 | 42 | ![LoanDefaultImpact](figure/LoanDefaultImpact.png) 43 | 44 | The signed version should be interpreted as the expected value of the change in default probability for a random change in input. The absolute version is the expected absolute value, and is shown on both the positive and negative sides of horizontal axes. The signed version will always be between the two points representing the absolute version. When the signed version is close to the absolute version, the impact of a change the corresponding variable more consistently has the same sign. 45 | 46 | We can also look in more detail at an individual input, examining the default probability as we vary that input (holding all else equal at a few example values): 47 | 48 | ![NumTimes30To59DaysLateDefaultCurves](figure/NumTimes30To59DaysLateDefaultCurves.png) 49 | 50 | This is sometimes called a *sensitivity analysis*. Notice that in this case for one particular choice of other input values, transitioning from 0 to 1 previous time 30-59 days late leads to a *decrease* in predicted default probability. See [the example](examples-loan-defaults.html) for some thoughts on why this might be. 51 | 52 | ### Current Limitations 53 | 54 | The package has some major limitations in its current form, but we have to start somewhere: 55 | 56 | **Input types**: At the time of this writing, all inputs to the model must be numerical (or coercable to such). In practice this means that binary or ordered categorical inputs are okay, but unordered categorical inputs are not. 57 | 58 | **Parameters controlling the weights between pairs**: These parameters are important for reasons that are both computational and statistical, are not yet well tuned for you, and the defaults may not be reasonable. 59 | 60 | ## Contact 61 | 62 | I'm very interested in feedback from folks who are trying this out. Please get in touch with me at dchudz@gmail.com. 63 | -------------------------------------------------------------------------------- /notes/more/compared-with-paper.Rmd: -------------------------------------------------------------------------------- 1 | ## As Compared With the Paper 2 | 3 | There are a few differences between what's here and what's discribed in [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract). Many features described in the paper (such as categorical inputs) are just not implemented here. This page ignores those, and describes only outright differences, or additions. 4 | 5 | ### Pairs and Weights 6 | 7 | [This page](more-pairs-and-weights.html) comments on how the weights are computed and describes from differences between this package and what's described in section 4.1 of the paper. 8 | 9 | ### Absolute APCs 10 | 11 | Gelman & Pardoe mention an unsigned version APCs in the case where the input of interest is an unordered categorical variable (in which case signs wouldn't make sense). They propose a root mean squared APC (equaton 4), but I prefer absolute values and I believe this absolute-values version is useful for any inputs, not just categorical ones. By default, I always compute and display an absolute version of the APC alongside the signed version. See e.g. the input $u_8$ in my [simulated linear model with interactions](examples-simulated-linear-model-interactions.html) for an artificial example demonstrating the importance of this notion, and see [my explanation of APCs](apc.html) for more detail. 12 | 13 | ### Impact: A variation on APCs with comparable units 14 | 15 | I've created a statistic similar to the original APC, but which addresses two issues I've had with APCs: 16 | 17 | 1. APCs are good for their purpose (the expected difference in outcome per unit change in input), but it doesn't tell me the what difference an input makes to my predictions. The APC could be high while the variation in the input is so small that it doesn't make a difference. 18 | 19 | 2. APCs across inputs with different units have different units themselves and so are not directly comparable. The example in the paper (see p. 47) uses mostly binary inputs, so this is mostly not a problem there. But I'm not sure the other inputs belong on the same chart. 20 | 21 | Both (1) and (2) could be addressed by standardizing the coefficients before computing the APC, but this feels a bit ad hoc and arbitrary. Instead, I take the simpler and more elegant approach of just not dividing by the difference in inputs. The computed quantity is therefore the expected value of the predictive difference caused by a random transition for the input of interest. The units are the same as the output variable, and hence are always comparable across different inputs. Just as with APCs, this quantity depends on the model, the variation in the input of interest, and the relationship between that inputs and the other inputs. 22 | 23 | I'm calling this notion *impact* (feel free to suggest another name), and it's described in more detail [here](impact.html) and in the examples. Just like APCs, it comes in signed and absolute forms. 24 | 25 | ### All-else-equal Curves / Sensitivity Analysis 26 | 27 | To visualize the model in more detail than is provided by our aggregated predictive comparisons, we can plot $u$ vs. the prediction at a variaty of values of $v$. In order that these plots represent $p(u|v)$, we can use the same set of pairs/weights as is constructed for computing aggregated predictive comparisons. 28 | 29 | This plot shows age vs. probability of default, as in [the loan defaults example](examples-loan-defaults.html): 30 | 31 | ![AgeDefaultCurves](figure/AgeDefaultCurves.png) 32 | 33 | Plots like this are not yet computed in the package, but see [the example](examples-loan-defaults.html) for how to construct them. 34 | -------------------------------------------------------------------------------- /notes/more/future-work.Rmd: -------------------------------------------------------------------------------- 1 | ## Future work 2 | 3 | ### Categorical inputs 4 | 5 | I'm not quite sure about the best way to sample from the distribution for an input of interest, conditional on the other inputs, when those other inputs include categorical variables. 6 | 7 | Gelman & Pardoe 2007 discuss what to do when the input of interest is an unordered categorical variable, but I'm mainly interested in applying the statistic I'm calling [*impact*](impact.html) to categorical variables, since *impact* is comparable across all types of inputs. 8 | 9 | ### Distribution of predictive comparisons 10 | 11 | Rigth now we're just providing point summaries. It'd be nice to see the distribution. (This is different from accounting for model uncertainty.) 12 | 13 | ### Uncertainty 14 | 15 | As discussed in the paper, it is entirely natural to allow for multiple samples of parameters and plot these measures with their uncertainties as in the paper. 16 | 17 | I'm also interested in the uncertainty that arises from the density estimation. 18 | 19 | ### Weights 20 | 21 | More work is necessary to understand better what the weights should be / how to best represent the appropriate conditional distributions. Maybe in some cases we should even drop the distance-based approach and use something like BART? 22 | 23 | ### Code robustness 24 | 25 | I should deal with potential column name conflicts in data frames (e.g. I use a "Weight" column, so I'll have problems if one of the inputs has that name). 26 | 27 | ### Other tools/methods for understanding complicated models 28 | 29 | I'd like to compile a list of other work in this direction, maybe comparing them with this (and perhaps giving examples that show why/when you'd want to use this instead). 30 | 31 | I should add a page discussing other methods people have used to get at somewhat the same idea. 32 | 33 | - `randomForest` package in R (partial plots, variable importance) 34 | - `earth` package in R (variable importance) 35 | - I should look through `caret` to see what's available there -------------------------------------------------------------------------------- /notes/more/large-N-limit.Rmd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dchudz/predcomps/cc3bf1155cc01f496da231af4facfc47d986d046/notes/more/large-N-limit.Rmd -------------------------------------------------------------------------------- /notes/more/pairs-and-weights.Rmd: -------------------------------------------------------------------------------- 1 | ## Weighting Pairs of Rows 2 | 3 | ### The Original Version 4 | 5 | In computing the APC, we assign weights to pairs of observations. Taking $v$ (the inputs not of interest) from the first element of the pair, $u_1$ from the first element of the pair, and $u_2$ from the second element of the pair, these samples and their weights are meant to approximate the distribution with density $p(v)p(u_1|v)p(u_2|v)$. 6 | 7 | If we had many observations for each unique value of $v$, this would be easy: We'd just assign weight 1 to the pairs that share the same value of $v$, and weight 0 to the other pairs. In reality, we achieve something like this by assigning more weight to closer pairs. 8 | 9 | The suggestion in [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract) is to consider all pairs of rows from the original data set and use weights based on the Mahalanobis distance $d$ between the corresponding $v$, such as $$\frac{1}{1 + d}.$$ 10 | 11 | 12 | ### Need for Renormalizing Weights 13 | 14 | Pairs of rows with nearby $v$'s have high weights, where pairs from far-away $v$'s have low weights. But rows with fewer $v$'s near them end up with less total weight than rows with more $v$'s near them. This isn't so good, since we meant to weight $v$'s according to their distribution in the original data set. 15 | 16 | The solution is to renormalize the weights so that when grouping by the first element of each pair, the sum for each group is the same. Here's [the line of code that does this in the package](https://github.com/dchudz/predcomps/blob/master/R/pairs.R#L85). 17 | 18 | Maybe this renormalization goes without saying, but it isn't explicitly mentioned in [Gelman and Pardoe 2007](http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9531.2007.00181.x/abstract). 19 | 20 | ### Far Away Points Dominate 21 | 22 | Imagine $v$ is a vector of $n$ inputs, spread uniformly throughout space. Then the expected number of points at distance $d$ from a given $v$ would be proportional to $d^n$. This means that using weights inversely proportional to distance will end up letting far-away points dominate, once you account for how many more far-away points there are. 23 | 24 | ### Current Implementation: Using a Fixed Number of Nearest Points 25 | 26 | My current implementation allows you an option that addresses the above concern that far-away points dominate (in an unfortunately ad hoc manner): You can specify that for each transition start point, we keep only certain number of the nearest transition end points. This parameter is specified as `onlyIncludeNearestN`. 27 | 28 | The appropriate setting for this parameter will depend on the number of points used, and maybe other properties of the data set. 29 | 30 | ### Limiting the number of points used 31 | 32 | In the paper, Gelman and Pardoe mention that we can save on computation by, when necessary, estimating the average predictive comparison using a subset of the available data. My suggestion is that the set of points considered for transition starts need not be the same as the set considered for transition ends. For example, we choose a small subset as candidate transition starts, we may still wish to consider the full data set as potential endpoints for the transitions. This could be beneficial, since to best represent the conditional distribution $p(u_2|v_1)$ we're looking for $v_2$ close to $v_1$. We will find a closer $v_2$ by considering a larger set of potential transition endpoints than if we restricted the set of candidates for $v_2$ just as we restricted for $v_1$. 33 | 34 | The parameters to specify are `numForTransitionStart` and `numForTransitionEnd`. They default to `NULL`, in which case we use the entire data set. 35 | 36 | ### Recommendations 37 | 38 | I recommend that `numForTransitionEnd` should be larger than `numForTransitionStart`, and that `onlyIncludeNearestN` should be a fraction of `numForTransitionEnd`. 39 | 40 | However, appropriate settings are not well understand. Please send me your feedback on what works well for you. 41 | -------------------------------------------------------------------------------- /notes/presentations/Lunch.Rmd: -------------------------------------------------------------------------------- 1 | Changes to make: 2 | 3 | - Flesh out motivating examples in much more detail 4 | - logistic regression: more explanation / focus on logistic regression equation 5 | - maybe nothing different for other example? 6 | 7 | - For ML audience: lead more on how you would use it by switching order to: 8 | - wine as is 9 | - calling API on wine 10 | - credit 11 | - THEN the math definitions 12 | - comparison with other approaches 13 | 14 | Anthony feedback: 15 | 16 | - Say earlier why we're doing wine example 17 | - why are we looking at wine example? 18 | - picture in motivation for the equations 19 | - do the exercise live 20 | - don't need histograms or feature definitions 21 | 22 | 23 | ```{r echo=FALSE, message=FALSE} 24 | load(file="../examples/wine-logistic-regression.RData") 25 | library(ggplot2) 26 | theme_set(theme_gray(base_size = 18)) 27 | library(gridExtra) 28 | library(knitr) 29 | library(predcomps) 30 | opts_chunk$set(fig.cap="", echo=FALSE, 31 | fig.width=2*opts_chunk$get("fig.width"), 32 | fig.height=.9*opts_chunk$get("fig.height") 33 | ) 34 | ``` 35 | 36 | ## An R Package to Help Interpret Predictive Models: 37 | 38 | ### (Average) Predictive Comparisons 39 | 40 | David Chudzicki 41 | 42 | 43 | ## Related concepts 44 | 45 | - sensitivity analysis 46 | - elasticity 47 | - variable importance 48 | - partial dependence 49 | - etc. 50 | 51 | ## **This** approach: 52 | 53 | - treats model as a black box 54 | - **tries to properly account for relationships among the inputs of interest** 55 | - based on Gelman & Pardoe (2007), which asked: "On average, much does the output increase per unit increase in input?" 56 | 57 | 58 | ## Plan 59 | 60 | 1. Motivating fake example 61 | 2. Predictive comparisons definitions: what we want 62 | 4. Applying average predictive comparisons to (1) 63 | 5. An example with real data: credit scoring 64 | 65 | - Discussion! 66 | - (Appendix) Estimation & Computation: how to get what we want 67 | - (Appendix) Comparison with other approaches 68 | 69 | 70 | ## Silly Example 71 | 72 | - We sell wine 73 | - Wine varies in: price, quality 74 | - Customers randomly do/don't buy, depending on price and quality 75 | 76 | ## Logistic Regression 77 | 78 | - $P$: price ($) 79 | - $Q$: quality (score on arbitrary scale) 80 | - **Model**: $P(\text{wine is purchased}) = logit^{-1}(\beta_0 + \beta_1 Q + \beta_2 P)$ 81 | 82 | ```{r echo=FALSE} 83 | library(boot) 84 | s <- seq(-4,4,by=.01) 85 | qplot(s, 1/(1+exp(-s)), geom="line") + ggtitle("Inverse Logit Curve") 86 | ``` 87 | 88 | - difficulty: interpreting coefficients on logit scale 89 | 90 | 91 | **True model**: $P(\text{wine is purchased}) = logit^{-1}(0.1 Q - 0.12 P)$ 92 | 93 | ## Distribution of Inputs (variation 1): 94 | 95 | Price and quality are (noisily) related: 96 | 97 | ```{r} 98 | myScales <- list(scale_x_continuous(limits=c(-15,125)), 99 | scale_y_continuous(limits=c(0,1))) 100 | qualityScale <- ylim(c(-20,130)) 101 | qplot(Price, Quality, alpha=I(.5), data = df1Sample) + 102 | qualityScale 103 | ``` 104 | 105 | ## We don't really need a model to understand this... 106 | 107 | (A random subset of the data. For clarity, showing only a discrete subset of prices.) 108 | 109 | ```{r} 110 | v1Plot <- ggplot(subset(df1Sample, Price %in% seq(20, 120, by=10))) + 111 | geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 112 | size = 3, alpha = 1) + 113 | ggtitle("Quality vs. Purchase Probability at Various Prices") + 114 | myScales + 115 | scale_color_discrete("Price") 116 | v1Plot 117 | ``` 118 | 119 | ## We don't really need a model to understand this... 120 | 121 | For each individual price, quality vs. purchase probability forms a portion of a shifted inverse logit curve: 122 | 123 | ```{r warning=FALSE} 124 | last_plot() + geom_line(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 125 | data = linesDF, 126 | size=.2) 127 | ``` 128 | 129 | - how changes in price/quality affect $P(\text{wine is purchased})$ depends a lot on where you are in input space 130 | 131 | ## Variation 2 132 | 133 | In another possible world, mid-range wines are more common: 134 | 135 | ```{r} 136 | qplot(Price, Quality, alpha=I(.5), data = df2Sample) + 137 | qualityScale 138 | ``` 139 | 140 | - input distribution is changed 141 | - ... but model is not changed 142 | 143 | ## In this world, quality matters more.... 144 | 145 | - (more precisely, cases where quality matters are more common) 146 | 147 | ```{r} 148 | v2Plot <- ggplot(subset(df2Sample, Price %in% seq(20, 120, by=10))) + 149 | geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 150 | size = 3, alpha = 1) + 151 | ggtitle("Quality vs. Purchase Probability at Various Prices") + 152 | myScales + 153 | scale_color_discrete("Price") 154 | v2Plot 155 | ``` 156 | 157 | ## Variation 3 158 | 159 | In a third possible world, price varies more strongly with quality: 160 | 161 | ```{r} 162 | qplot(Price, Quality, alpha=I(.5), data = df3Sample) + 163 | qualityScale 164 | ``` 165 | 166 | Again: 167 | 168 | - input distribution is changed 169 | - ... but model is not changed, still $P(\text{wine is purchased}) = logit^{-1}(\beta_0 + \beta_1 Q + \beta_2 P)$ 170 | 171 | ## In this world, quality matters even more... 172 | 173 | - (more precisely: quality matters in all cases that we actually see) 174 | 175 | ```{r} 176 | v3Plot <- ggplot(subset(df3Sample, Price %in% seq(20, 120, by=10))) + 177 | geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 178 | size = 3, alpha = 1) + 179 | ggtitle("Quality vs. Purchase Probability at Various Prices") + 180 | myScales + 181 | scale_color_discrete("Price") 182 | v3Plot 183 | ``` 184 | 185 | ## Now quality matters more 186 | 187 | ... across all price ranges (for the kinds of variation **that we see in the data**) 188 | 189 | ```{r warning=FALSE} 190 | v3PlotWithCurves <- last_plot() + geom_line(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 191 | data = linesDF, 192 | size=.2) 193 | v3PlotWithCurves 194 | ``` 195 | 196 | ## Lessons from this example 197 | 198 | 1. We want to interpret things on the scale we care about (probability in this case) 199 | 2. Relationships among the inputs matter 200 | 201 | ## Goal is single-number summaries 202 | 203 | These concepts are vague, but keep them in mind as we try to formalize things in the next few slides: 204 | 205 | - **For each input, what is the average change in output per unit change in input?** 206 | (generalizes linear regression, units depend on units for input) 207 | 208 | - **How important is each input in influencing the output?** 209 | (units should be consistent across inputs -- think of standardized regression coefficients) 210 | 211 | ## Some notation 212 | 213 | **$u$**: the variable under consideration 214 | 215 | **$v$**: the vector of other variables (the "all else held equal") 216 | 217 | **$f(u,v)$**: a function that makes predictions, e.g. maybe $f(u,v) = \mathcal{E}[y \mid u, v, \theta]$ 218 | 219 | * We consider transitions in $u$ holding $v$ constant, e.g. $$\frac{f(u_2, v) - f(u_1, v)}{u_2-u_1}$$ 220 | * To get one-point summaries we'll take an average 221 | * All of the subtlety lies in the choice of $v$, $u_1$, $u_2$ 222 | 223 | 224 | ## What average do we take? 225 | 226 | The APC is defined as 227 | 228 | $$\frac{\mathcal{E}[\Delta_f]}{\mathcal{E}[\Delta_u]}$$ 229 | 230 | where 231 | 232 | * $\Delta_f = f(u_2,v) - f(u_1,v)$ 233 | * $\Delta_u = u_2 - u_1$ 234 | * $\mathcal{E}$ is expectation under the following process: 235 | 236 | 1. sample $v$ from the (marginal) distribution of the corresponding inputs 237 | 2. sample $u_1$ and $u_2$ independently from the distribution of $u$ conditional on $v$ 238 | 239 | 240 | ## Variations 241 | 242 | - "Impact" (my idea; help me with naming!) is just the expected value of $\Delta_f = f(u_2,v) - f(u_1,v)$ 243 | - Absolute versions (of both impact and APC) use $\mathcal{E}[|\Delta_f|]$ and $\mathcal{E}[|\Delta_u|]$ 244 | 245 | ## Computation 246 | 247 | - Once we've said what we want, computing/estimating it isn't trivial 248 | - More on this in the discussion (depending on time/interest) 249 | 250 | ## Returning to the wines... 251 | 252 | ```{r fig.height=4} 253 | apcComparisonPlot <- ggplot(subset(apcAllVariations, Input=="Quality")) + 254 | geom_bar(aes(x=factor(Variation, levels=3:1), y=PerUnitInput.Signed), stat="identity", width=.5) + 255 | expand_limits(y=0) + 256 | xlab("Variation") + 257 | ggtitle("APC for Quality across Variations") + 258 | coord_flip() 259 | apcComparisonPlot 260 | grid.arrange(v1Plot + ggtitle("V1"), v2Plot + ggtitle("V2"), v3Plot + ggtitle("V3"), nrow=1) 261 | ``` 262 | 263 | Exercise for the reader: Make an example where APC is larger than in Variation 1 but "Impact" is much smaller. 264 | 265 | ## Credit Scoring Example 266 | 267 | ```{r} 268 | load(file="../examples/loan-defaults.RData") 269 | ``` 270 | 271 | - **SeriousDlqin2yrs** (target variable, 7% "yes"): Person experienced 90 days past due delinquency or worse 272 | - **RevolvingUtilizationOfUnsecuredLines**: Total balance on credit cards and personal lines of credit except real estate and no installment debt like car loans divided by the sum of credit limits 273 | - **age**: Age of borrower in years 274 | - **NumberOfTime30-59DaysPastDueNotWorse**: Number of times borrower has been 30-59 days past due but no worse in the last 2 years. 275 | - **NumberOfTime60-89DaysPastDueNotWorse**: Number of times borrower has been 60-89 days past due but no worse in the last 2 years. 276 | - **NumberOfTimes90DaysLate**: Number of times borrower has been 90 days or more past due. 277 | - **DebtRatio**: Monthly debt payments, alimony,living costs divided by monthy gross income 278 | - **MonthlyIncome**: Monthly income 279 | - **NumberOfOpenCreditLinesAndLoans**: Number of Open loans (installment like car loan or mortgage) and Lines of credit (e.g. credit cards) 280 | - **NumberRealEstateLoansOrLines**: Number of mortgage and real estate loans including home equity lines of credit 281 | - **NumberOfDependents**: Number of dependents in family excluding themselves (spouse, children etc.) 282 | 283 | 284 | ## Input Distribution 285 | 286 | ```{r fig.height = 12, echo=FALSE} 287 | allHistograms 288 | ``` 289 | 290 | Note: previous lateness (esp. 90+) days is rare. 291 | 292 | ## Model Building 293 | 294 | We'll use a random forest for this example: 295 | 296 | ```{r eval=FALSE, echo=TRUE} 297 | set.seed(1) 298 | # Turning the response to type "factor" causes the RF to be build for classification: 299 | credit$SeriousDlqin2yrs <- factor(credit$SeriousDlqin2yrs) 300 | rfFit <- randomForest(SeriousDlqin2yrs ~ ., data=credit, ntree=ntree) 301 | ``` 302 | 303 | ## Aggregate Predictive Comparisons 304 | 305 | ```{r eval=FALSE, echo=TRUE} 306 | set.seed(1) 307 | apcDF <- GetPredCompsDF(rfFit, credit, 308 | numForTransitionStart = numForTransitionStart, 309 | numForTransitionEnd = numForTransitionEnd, 310 | onlyIncludeNearestN = onlyIncludeNearestN) 311 | ``` 312 | 313 | ```{r} 314 | kable(apcDF, row.names=FALSE) 315 | ``` 316 | 317 | ## Impact Plot 318 | 319 | (Showing +/- the absolute impact, since signed impact is bounded between those numbers) 320 | 321 | (Showing impact rather than APC b/c the different APC units wouldn't be comparable, shouldn't go on one chart) 322 | 323 | ```{r LoanDefaultImpact} 324 | PlotPredCompsDF(apcDF) 325 | ``` 326 | 327 | Summaries like this can guide questions that push is to dig deeper, like: 328 | 329 | - What's going on with age? (To get the cancellation we see, it must have strong effects that vary in sign) 330 | - Why don't instances of previous lateness always increase your probability of a 90-days-past-due incident? (cancellation?) 331 | 332 | 333 | ## Goals for sensitivity analysis 334 | 335 | - want something that properly accounts for relationships among variables of interest 336 | - want something that emphasizes the values that are plausible given the other values (two reasons: this is what we care about, and this is what our model has better estimates of) 337 | 338 | 339 | ```{r warning=FALSE} 340 | v3PlotWithCurves + ggtitle("Wines: Variation 3") 341 | ``` 342 | 343 | ## How we'll do sensitivity analysis 344 | 345 | - choose a few random values for $v$ (the "all else held equal") 346 | - sample from $u$ conditional on $v$ 347 | - plot $u$ vs. the prediction for each $v$ 348 | 349 | ```{r} 350 | ggplot(pairsSummarizedAge, aes(x=age.B, y=yHat2, color=factor(OriginalRowNumber))) + 351 | geom_point(aes(size = Weight)) + 352 | geom_line(size=.2) + 353 | xlab("age") + 354 | ylab("Prediction") + 355 | guides(color = FALSE) 356 | ``` 357 | 358 | ## Zooming in... 359 | 360 | ```{r} 361 | ggplot(subset(pairsSummarizedAge, OriginalRowNumber <= 8), 362 | aes(x=age.B, y=yHat2, color=factor(OriginalRowNumber))) + 363 | geom_point(aes(size = Weight)) + 364 | geom_line(size=.2) + 365 | xlab("age") + 366 | ylab("Prediction") + 367 | guides(color = FALSE) + 368 | coord_cartesian(ylim=(c(-.01,.2))) 369 | ``` 370 | 371 | - shows off some weird behavior of the model! 372 | - we should dig deeper, get more comfortable with the model 373 | - try other models 374 | 375 | 376 | ## Sensitivity: Number of Time 30-35 Days Past Due 377 | 378 | - Mostly we see the increasing probability that we'd expect... 379 | 380 | ```{r, warning=FALSE} 381 | ggplot(pairsSummarized, aes(x=NumberOfTime30.59DaysPastDueNotWorse.B, y=yHat2, color=factor(OriginalRowNumber))) + 382 | geom_point(aes(size = Weight)) + 383 | geom_line(size=.2) + 384 | scale_x_continuous(limits=c(0,2)) + 385 | scale_size_area() + 386 | xlab("NumberOfTime30.59DaysPastDueNotWorse") + 387 | ylab("Prediction") + 388 | guides(color = FALSE) 389 | ``` 390 | 391 | ## Sensitivity: Number of Time 30-35 Days Past Due 392 | 393 | ... but in one case, probability of default *decreases* with the 0-to-1 transition 394 | 395 | ```{r echo=FALSE, warning=FALSE} 396 | ggplot(pairsSummarized, aes(x=NumberOfTime30.59DaysPastDueNotWorse.B, y=yHat2, color=factor(OriginalRowNumber))) + 397 | geom_point(aes(size = Weight)) + 398 | geom_line(aes(alpha=ifelse(OriginalRowNumber == 18, 1, .3))) + 399 | scale_x_continuous(limits=c(0,2)) + 400 | scale_alpha_identity() + 401 | scale_size_area() + 402 | xlab("NumberOfTime30.59DaysPastDueNotWorse") + 403 | ylab("Prediction") + 404 | guides(color = FALSE) 405 | ``` 406 | 407 | ## Can we explain it? 408 | 409 | ```{r} 410 | oneRowWithDecreasingDefaultProbability <- oneOriginalRowNumber[1,intersect(names(oneOriginalRowNumber), names(credit))] 411 | kable(oneRowWithDecreasingDefaultProbability[,1:5]) 412 | kable(oneRowWithDecreasingDefaultProbability[,6:8]) 413 | kable(oneRowWithDecreasingDefaultProbability[,9:10]) 414 | ``` 415 | 416 | ## Alternative Approach to Sensitivity Analysis: "Partial Plots" 417 | 418 | - e.g. `partialPlot` function in `randomForest` library in R 419 | 420 | 421 | This accounts for relationships among the "all else held equal" but **not** between those and the input under consideration 422 | 423 | Make predictions on a new data set constructed as follows: 424 | 425 | ```{r} 426 | oneRow <- data.frame(v1=1, v2="a", v3=2.3, u=6) 427 | ``` 428 | 429 | One row: 430 | 431 | ```{r results='markup'} 432 | kable(oneRow) 433 | ``` 434 | 435 | Repeat the row varying $u$ across its whole range: 436 | 437 | ```{r results='markup'} 438 | oneRowRep <- oneRow[rep(1,20),] 439 | oneRowRep$u <- 1:20 440 | kable(oneRowRep) 441 | ``` 442 | 443 | - Then average predictions grouping by $u$ (or not) 444 | 445 | ## Partial Plot Method Applied to Wines 446 | 447 | ```{r} 448 | v3Plot + geom_point(aes(x=Quality, y=PurchaseProbability), stat="summary", fun.y="mean", data=linesDF) + 449 | geom_point(aes(x = Quality, y = PurchaseProbability, color = factor(Price)), 450 | data = linesDF, 451 | size=1) + 452 | ggtitle("(V3)") 453 | ``` 454 | 455 | 456 | 457 | ## Comparison with other approaches 458 | 459 | Things that vary 460 | 461 | - Units - depend on input or consistent across inputs 462 | - Sensitive to univariate distribution of inputs 463 | - Sensitive to dependence between inputs 464 | - Shows shape of non-linearity 465 | - Signed 466 | - Model 467 | - Based on holdout performance 468 | 469 | 470 | ## Computing the APC 471 | 472 | - sampling a $v$ is easy (take a row from your data) 473 | - sampling $u_1$ given $v$ is easy (take the $u$ in that row) 474 | - sampling *another* $u$ ($u_2$) is harder 475 | - would be easy if you had a lot of data with the same $v$ 476 | - so take $u$'s from rows of data where the correspinding $v$ is close to your $v$ 477 | 478 | ## In Practice 479 | 480 | - Gelman & Pardoe look at all pairs of rows and assign weights $$\frac{1}{1 + d}$$ 481 | - I think this was giving too much weight to faraway points (volume of $n$-spheres grows much faster than linearly in $d$) 482 | - I use Gelman's weights, but only looking at a fixed number of the closest points 483 | - Computational advantage: fewer points to deal with 484 | - A few example 485 | 486 | ```{r} 487 | pairsOrdered <- pairs[order(pairs$OriginalRowNumber),] 488 | 489 | for (i in 1:20) { 490 | cat("\n\n") 491 | kable(head(subset(pairsOrdered, OriginalRowNumber==i)[c("OriginalRowNumber", "age", "DebtRatio", "MonthlyIncome", "NumberOfOpenCreditLinesAndLoans", "NumberOfTime30.59DaysPastDueNotWorse", "NumberOfTime30.59DaysPastDueNotWorse.B","yHat1","yHat2", "Weight")])) 492 | } 493 | 494 | ``` 495 | 496 | -------------------------------------------------------------------------------- /notes/template: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | --- 4 | 5 | #!html_output# 6 | -------------------------------------------------------------------------------- /tests/test-apc.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(predcomps) 3 | 4 | test_that("APC matches coefficient exactly for linear model", { 5 | 6 | df <- data.frame(X = rep(c(1,2),2), 7 | Y = c(1,2,3,4)) 8 | 9 | predictionFunction <- function(df) 2*df$X + 3*df$Y 10 | result <- GetSingleInputApcs(predictionFunction, df, u="X", v="Y") 11 | expect_that(result$PerUnitInput.Signed, equals(2)) 12 | }) 13 | -------------------------------------------------------------------------------- /tests/test-pairs.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(predcomps) 3 | 4 | MakeComparable <- function(df) { 5 | df <- round(df, digits = 5) 6 | return(df[do.call(order, df), ]) 7 | } 8 | 9 | test_that("GetPairs works right in a small example", { 10 | 11 | df <- data.frame(X = rep(c(1,2),2), 12 | Y = rep(c(2,4),2)) 13 | pairsActual <- GetPairs(df, "X", "Y") 14 | pairsExpected <- data.frame(OriginalRowNumber = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L), 15 | X = c(1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2), 16 | Y = c(2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4), 17 | X.B = c(2, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1), 18 | Weight = c(0.166666666666667, 0.666666666666667, 0.166666666666667, 0.166666666666667, 0.166666666666667, 0.666666666666667, 0.666666666666667, 0.166666666666667, 0.166666666666667, 0.166666666666667, 0.666666666666667, 0.166666666666667)) 19 | pairsActual <- pairsActual[names(pairsExpected)] 20 | expect_that(all.equal(MakeComparable(pairsActual), MakeComparable(pairsExpected)), is_true()) 21 | }) 22 | --------------------------------------------------------------------------------