├── data ├── bankData.rda └── bankLabels.rda ├── tests ├── testthat.R └── testthat │ ├── rawData.rda │ ├── drugLabel.rda │ ├── testData.rda │ ├── PlotHistogram.RDS │ ├── test_PlotCatVar.R │ ├── test_PlotNumVar.R │ ├── drugLabel.csv │ ├── test_vlm.R │ ├── test_PlotRates.R │ ├── test_PlotMean.R │ ├── test_PlotQuantiles.R │ ├── test_PlotBarplot.R │ ├── test_PlotRatesOverTime.R │ ├── test_PlotDist.R │ ├── test_CalcR2.R │ ├── test_SummaryStats.R │ ├── rawData_bigint.csv │ ├── test_OrderByR2.R │ ├── rawData.csv │ └── test_PrepData.R ├── .Rbuildignore ├── figures ├── sample_plots_numerical.png └── sample_plots_categorical.png ├── .travis.yml ├── cran-comments.md ├── CODEOWNERS ├── .gitignore ├── man ├── bankLabels.Rd ├── PlotRates.Rd ├── PlotQuantiles.Rd ├── PlotMean.Rd ├── PlotBarplot.Rd ├── PrepLabels.Rd ├── CalcR2.Rd ├── SummaryStats.Rd ├── PlotDist.Rd ├── bankData.Rd ├── PlotRatesOverTime.Rd ├── otvPlots.Rd ├── OrderByR2.Rd ├── PlotNumVar.Rd ├── PlotCatVar.Rd ├── PrintPlots.Rd ├── PrepData.Rd ├── PlotVar.Rd └── vlm.Rd ├── NAMESPACE ├── DESCRIPTION ├── R ├── utils.R ├── data.R ├── package_otvPlots.R ├── plots_order.R ├── vlm.R ├── plot_print.R └── categorical.R ├── README.md └── LICENSE /data/bankData.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/data/bankData.rda -------------------------------------------------------------------------------- /data/bankLabels.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/data/bankLabels.rda -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(otvPlots) 3 | 4 | test_check("otvPlots") 5 | -------------------------------------------------------------------------------- /.Rbuildignore: 
-------------------------------------------------------------------------------- 1 | ^\.travis\.yml$ 2 | figures 3 | cran-comments.md 4 | .whitesource 5 | CODEOWNERS 6 | -------------------------------------------------------------------------------- /tests/testthat/rawData.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/tests/testthat/rawData.rda -------------------------------------------------------------------------------- /tests/testthat/drugLabel.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/tests/testthat/drugLabel.rda -------------------------------------------------------------------------------- /tests/testthat/testData.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/tests/testthat/testData.rda -------------------------------------------------------------------------------- /figures/sample_plots_numerical.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/figures/sample_plots_numerical.png -------------------------------------------------------------------------------- /tests/testthat/PlotHistogram.RDS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/tests/testthat/PlotHistogram.RDS -------------------------------------------------------------------------------- /figures/sample_plots_categorical.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/figures/sample_plots_categorical.png -------------------------------------------------------------------------------- /.travis.yml: 
-------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | sudo: false 5 | cache: packages 6 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Resubmission 2 | This is a resubmission. In this version I have: 3 | 4 | * Removed the VignetteBuilder field in DESCRIPTION. 5 | 6 | * Modified the Description field in DESCRIPTION, by removing “this package” at the beginning. -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # This is a comment. 2 | # Each line is a file pattern followed by one or more owners. 3 | 4 | # These owners will be the default owners for everything in 5 | # the repo. Unless a later match takes precedence, 6 | # @yingboli and @Yingru will be requested for 7 | # review when someone opens a pull request. 
8 | * @yingboli @Yingru 9 | -------------------------------------------------------------------------------- /tests/testthat/test_PlotCatVar.R: -------------------------------------------------------------------------------- 1 | library(otvPlots) 2 | library(proto) 3 | context("Plot categorical variable") 4 | load("../testthat/testData.rda") 5 | setDT(testData) 6 | 7 | test_that("PlotCatVar returns a gtable", { 8 | PrepData(testData, dateNm = "date", dateGp = "weeks", dateGpBp = "months") 9 | p <- PlotCatVar("marital", testData, NULL, "weeks", "months")$p 10 | expect_is(p, "gtable") 11 | }) 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Capital One Services, LLC 2 | # Licensed under the Apache License, Version 2.0 (the "License"); you may not use 3 | # this file except in compliance with the License. You may obtain a copy of the 4 | # License at http://www.apache.org/licenses/LICENSE-2.0 5 | # Unless required by applicable law or agreed to in writing, software 6 | # distributed under the License is distributed on an "AS IS" BASIS, 7 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8 | # See the License for the specific language governing permissions and limitations under the License. 
9 | .Rproj.user 10 | .Rhistory 11 | .RData 12 | inst/doc 13 | .pdf 14 | -------------------------------------------------------------------------------- /tests/testthat/test_PlotNumVar.R: -------------------------------------------------------------------------------- 1 | library(otvPlots) 2 | library(proto) 3 | context("Plot Continuous Variable") 4 | load("../testthat/testData.rda") 5 | setDT(testData) 6 | 7 | test_that("PlotNumVar returns a gtable", { 8 | PrepData(testData, dateNm = "date", dateGp = "weeks", dateGpBp = "months") 9 | p <- PlotNumVar("age", testData, NULL, "weeks", "months", 10 | skewOpt = 3, kSample = NULL)$p 11 | expect_is(p, "gtable") 12 | }) 13 | 14 | test_that("Incorrect skewOpt creates error", { 15 | PrepData(testData, dateNm = "date", dateGp = "weeks", dateGpBp = "months") 16 | expect_error(PlotNumVar("age", testData, NULL, "weeks", "months", 17 | skewOpt = "test", kSample = NULL)$p) 18 | }) 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /man/bankLabels.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{bankLabels} 5 | \alias{bankLabels} 6 | \title{Labels for bankData} 7 | \format{A data frame with 16 rows and 3 variables: 8 | \describe{ 9 | \item{V1}{Name of each variable in \code{\link{bankData}}.} 10 | \item{V2}{Label of each variable in \code{\link{bankData}}.} 11 | \item{V3}{A numeric variable, corresponding to the row number.} 12 | }} 13 | \usage{ 14 | bankLabels 15 | } 16 | \description{ 17 | A dataset containing the attribute labels also found in \code{\link{bankData}}. 18 | This data set is used to illustrate the \code{\link{PrepLabels}} function and 19 | other label functionality in the \code{\link{otvPlots}} package in R. 
20 | } 21 | \keyword{datasets} 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(CalcR2) 4 | export(OrderByR2) 5 | export(PlotBarplot) 6 | export(PlotCatVar) 7 | export(PlotDist) 8 | export(PlotMean) 9 | export(PlotNumVar) 10 | export(PlotQuantiles) 11 | export(PlotRates) 12 | export(PlotRatesOverTime) 13 | export(PlotVar) 14 | export(PrepData) 15 | export(PrepLabels) 16 | export(PrintPlots) 17 | export(SummaryStats) 18 | export(vlm) 19 | import(data.table) 20 | import(ggplot2) 21 | importFrom(Hmisc,wtd.mean) 22 | importFrom(Hmisc,wtd.quantile) 23 | importFrom(Hmisc,wtd.var) 24 | importFrom(grDevices,cairo_pdf) 25 | importFrom(grDevices,dev.off) 26 | importFrom(graphics,par) 27 | importFrom(grid,gpar) 28 | importFrom(grid,grid.draw) 29 | importFrom(grid,grid.newpage) 30 | importFrom(grid,textGrob) 31 | importFrom(grid,unit) 32 | importFrom(grid,unit.c) 33 | importFrom(gridExtra,arrangeGrob) 34 | importFrom(moments,skewness) 35 | importFrom(scales,hue_pal) 36 | importFrom(stats,lm.fit) 37 | importFrom(stats,lm.wfit) 38 | importFrom(stats,quantile) 39 | importFrom(stats,sd) 40 | importFrom(stats,var) 41 | importFrom(stringi,stri_trans_general) 42 | importFrom(utils,tail) 43 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: otvPlots 2 | Title: Over Time Variable Plots 3 | Version: 0.2.1 4 | Authors@R: c( 5 | person("Rebecca", "Payne", role = "aut"), 6 | person("Zoey", "Zhu", role = c("aut")), 7 | person("Yingbo", "Li", email = "yingbo.li@capitalone.com", role = c("aut", "cre")), 8 | person("Capital One", role = "cph")) 9 | Description: Enables automated visualization of variable 10 | distribution and changes over time for predictive model 
building. 11 | Computes summary statistics aggregated by time for 12 | large datasets, and creates plots for variable level monitoring. 13 | Depends: 14 | R (>= 3.2.0) 15 | Imports: 16 | data.table (>= 1.9.6), 17 | ggplot2 (>= 2.1.0), 18 | grid (>= 3.2.0), 19 | gridExtra (>= 2.2.1), 20 | Hmisc (>= 3.17-4), 21 | moments, 22 | quantreg (>= 5.33), 23 | scales (>= 0.4.0), 24 | stringi (>= 1.1.1) 25 | License: Apache License 2.0 | file LICENSE 26 | LazyData: true 27 | Suggests: 28 | bit64, 29 | knitr, 30 | proto, 31 | testthat 32 | URL: https://github.com/capitalone/otvPlots 33 | BugReports: https://github.com/capitalone/otvPlots/issues 34 | RoxygenNote: 6.0.1 35 | -------------------------------------------------------------------------------- /tests/testthat/drugLabel.csv: -------------------------------------------------------------------------------- 1 | col1,col2,,,,, 2 | CaseNumber,The case number,,,,, 3 | date,Date of the test,,,,, 4 | Sex,Gender of the patient,,,,, 5 | Race,Race of the patient,,,,, 6 | Age,Age of the patient,,,,, 7 | "Re""side-nce .City",,,,,, 8 | Residence State,,,,,, 9 | Residence County,,,,,, 10 | Death City,,,,,, 11 | ,Wrong result,,,,, 12 | Death State,,,,,, 13 | Death County,,,,,, 14 | Location,,,,,, 15 | DescriptionofInjury,The kind of injury the patient has,,,,, 16 | InjuryPlace,The place the injury exists,,,,, 17 | ImmediateCauseA,The cause of the injury,,,,, 18 | Heroin,Level of heroin used,,,,, 19 | Cocaine,Level of Cocaine used,,,,, 20 | Fentanyl,Level of Fentanyl used,,,,, 21 | Oxycodone,Level of Oxycodone used,,,,, 22 | Oxymorphone,Level of Oxymorphone used,,,,, 23 | EtOH,Level of EtOH used,,,,, 24 | Hydro-codeine,Level of Hydro-codeine used,,,,, 25 | Benzodiazepine,Level of Benzodiazepine used,,,,, 26 | Methadone,Level of Methadone used,,,,, 27 | Amphet,Level of Amphet used,,,,, 28 | Tramad,Level of Tramad used,,,,, 29 | Morphine_not_heroin,Morphine not heroin,,,,, 30 | Other,Other things,,,,, 31 | Any Opioid,Whether there is 
opioid,,,,, 32 | MannerofDeath,Manner of death,,,,, 33 | DeathLoc,The death location,,,,, -------------------------------------------------------------------------------- /tests/testthat/test_vlm.R: -------------------------------------------------------------------------------- 1 | library(otvPlots) 2 | context("Run the main function: vlm") 3 | drugSASDate <- read.csv("../testthat/drugSASDate.csv") 4 | 5 | test_that("At most one of sortVars and sortFn is passed in", { 6 | expect_error(vlm(dataFl = "../testthat/drugSASDate.csv", 7 | dateNm = "date", sortVars = c("age", "residencecity"))) 8 | }) 9 | 10 | test_that("varNms is a subset of sortVars", { 11 | expect_error(vlm(dataFl = drugSASDate, dateNm = "date", 12 | sortVars = c("age", "residencecity"), varNms = c("age"))) 13 | }) 14 | 15 | test_that("Incorrect file input when prepData is False", { 16 | expect_error(vlm(dataFl = "../testthat/drugRDate.csv", dateNm = "date", 17 | prepData = FALSE)) 18 | }) 19 | 20 | test_that("selectCols and dropCols together give an error", { 21 | expect_error(vlm("../testthat/rawData.csv", dateNm = "date", weightNm = "weight", 22 | dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y", 23 | selectCols = c("age", "balance", "date", "weight"), 24 | dropCols = c("default"), varNms = c("age"))) 25 | 26 | expect_error(vlm("../testthat/rawData.rda", dateNm = "date", weightNm = "weight", 27 | dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y", 28 | selectCols = c("age", "balance", "date", "weight"), 29 | dropCols = c("default"))) 30 | }) 31 | -------------------------------------------------------------------------------- /tests/testthat/test_PlotRates.R: -------------------------------------------------------------------------------- 1 | library(otvPlots) 2 | library(proto) 3 | context("Plot Continuous Rates over Time") 4 | load("../testthat/testData.rda") 5 | testData <- setDT(testData) 6 | testData <- testData[, .(balance, weight, date)] 7 | testData[, weeks := round(date, 
"weeks")] 8 | testDT = testData[, {list("zerorate" = mean(balance == 0), 9 | "missingrate" = mean(is.na(balance)))}, 10 | by = "weeks"] 11 | testMT = melt(testDT, id.vars = "weeks", 12 | measure.vars = c("zerorate", "missingrate")) 13 | 14 | 15 | test_that("Plot layers match expectations",{ 16 | p <- PlotRates(testMT, "balance", "weeks") 17 | expect_is(p$layers[[1]], "ggproto") 18 | expect_is(p$layers[[1]]$geom, "GeomLine") 19 | expect_is(p$layers[[1]]$stat, "StatIdentity") 20 | }) 21 | 22 | test_that("X axis is labelled 'weeks'",{ 23 | p <- PlotRates(testMT, "balance", "weeks") 24 | expect_identical(p$labels$x, "weeks") 25 | expect_identical(p$labels$y, NULL) 26 | }) 27 | 28 | 29 | test_that("Mapping layer contains expected elements", { 30 | p <- PlotRates(testMT, myVar = "balance", dateGp = "weeks") 31 | expect_true( "colour" %in% names(p$mapping)) 32 | expect_true( "group" %in% names(p$mapping)) 33 | expect_true( "x" %in% names(p$mapping)) 34 | expect_true( "y" %in% names(p$mapping)) 35 | expect_length(setdiff(c("colour", "group", "x", "y"), names(p$mapping)), 0) 36 | }) 37 | 38 | -------------------------------------------------------------------------------- /tests/testthat/test_PlotMean.R: -------------------------------------------------------------------------------- 1 | library(otvPlots) 2 | library(proto) 3 | context("Plot Mean over Time") 4 | load("../testthat/testData.rda") 5 | testData <- setDT(testData) 6 | testData <- testData[, .(balance, weight, date)] 7 | testData[, weeks := round(date, "weeks")] 8 | 9 | testDT = testData[, .(Mean = mean(balance)), by = "weeks"] 10 | cl = testData[, c(mean(balance), sd(balance))] 11 | cl = cl %*% matrix(c(1, 1, 1, -1), byrow = TRUE, nrow = 2) # mean +- 1 SD 12 | testDT[, c("cl1", "cl2") := list(cl[1], cl[2]) ] 13 | testMT = melt(testDT, id.vars = "weeks", 14 | measure.vars = c("Mean", "cl1", "cl2")) 15 | 16 | test_that("Plot layers match expectations",{ 17 | p <- PlotMean(testMT, "balance", "weeks") 18 | 
expect_is(p$layers[[1]], "ggproto") 19 | expect_is(p$layers[[1]]$geom, "GeomLine") 20 | expect_is(p$layers[[1]]$stat, "StatIdentity") 21 | }) 22 | 23 | test_that("X axis is labelled 'weeks'",{ 24 | p <- PlotMean(testMT, "balance", "weeks") 25 | expect_identical(p$labels$x, "weeks") 26 | expect_identical(p$labels$y, NULL) 27 | }) 28 | 29 | test_that("Scale is discrete",{ 30 | p <- PlotMean(testMT, "balance", "weeks") 31 | expect_is(p$scales$scales[[1]], "ScaleDiscrete") 32 | }) 33 | 34 | test_that("Mapping layer contains expected elements",{ 35 | p <- PlotMean(testMT, "balance", "weeks") 36 | expect_true( "group" %in% names(p$mapping)) 37 | expect_true("linetype" %in% names(p$mapping)) 38 | expect_true( "x" %in% names(p$mapping)) 39 | expect_true( "y" %in% names(p$mapping)) 40 | expect_length(setdiff(c("group", "linetype", "x", "y"), names(p$mapping)), 0) 41 | }) -------------------------------------------------------------------------------- /man/PlotRates.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/numerical.R 3 | \name{PlotRates} 4 | \alias{PlotRates} 5 | \title{Plot zero and missing rates for a numerical variable} 6 | \usage{ 7 | PlotRates(meltdx, myVar, dateGp) 8 | } 9 | \arguments{ 10 | \item{meltdx}{A \code{data.table} with missing rate and zero rate in long 11 | format, produced by \code{\link{SummaryStats}}} 12 | 13 | \item{myVar}{The name of the variable to be plotted} 14 | 15 | \item{dateGp}{Name of the variable that the time series plots should be 16 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 17 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for 18 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.} 19 | } 20 | \value{ 21 | A \code{ggplot2} object with a \code{missingrate} and 22 | \code{zerorate} grouped by \code{dateGp}. 
23 | } 24 | \description{ 25 | Plot zero and missing rates for a numerical variable 26 | } 27 | \section{License}{ 28 | 29 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License, 30 | Version 2.0 (the "License"); you may not use this file except in compliance 31 | with the License. You may obtain a copy of the License at 32 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law 33 | or agreed to in writing, software distributed under the License is 34 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 35 | KIND, either express or implied. See the License for the specific language 36 | governing permissions and limitations under the License. 37 | } 38 | 39 | -------------------------------------------------------------------------------- /tests/testthat/test_PlotQuantiles.R: -------------------------------------------------------------------------------- 1 | library(otvPlots) 2 | library(proto) 3 | context("Plot Quantiles over Time") 4 | load("../testthat/testData.rda") 5 | setDT(testData) 6 | 7 | testData[, weeks := round(date, "weeks")] 8 | testDT = testData[, { 9 | tmp1 = quantile(balance, p = c(.01, .5, .99)); 10 | list("p1" = tmp1[1] , 11 | "p50" = tmp1[2] , 12 | "p99" = tmp1[3] 13 | )}, by = "weeks"] 14 | 15 | testMT = melt(testDT, id.vars = "weeks", 16 | measure.vars = c("p99", "p50","p1")) 17 | globalPct = testData[ , quantile(balance, p = c(.01, .5, .99) ) ] 18 | globalDT = data.table("weeks" = rep(testMT[variable == "p99", "weeks", 19 | with = FALSE][[1]], 3)) 20 | globalDT[, c("variable", "value") := list(rep(c("p1_g", "p50_g", "p99_g"), 21 | each = .N/3), 22 | rep(globalPct, each = .N/3))] 23 | testMT = rbindlist(list( testMT, globalDT)) 24 | 25 | 26 | test_that("Plot layers match expectations",{ 27 | p <- PlotQuantiles(testMT, myVar = "balance", dateGp = "weeks") 28 | expect_is(p$layers[[1]], "ggproto") 29 | expect_is(p$layers[[1]]$geom, "GeomLine") 30 | expect_is(p$layers[[1]]$stat, 
"StatIdentity") 31 | }) 32 | 33 | test_that("Mapping layer contains expected elements", { 34 | p <- PlotQuantiles(testMT, myVar = "balance", dateGp = "weeks") 35 | expect_true( "colour" %in% names(p$mapping)) 36 | expect_true( "linetype" %in% names(p$mapping)) 37 | expect_true( "group" %in% names(p$mapping)) 38 | expect_true( "x" %in% names(p$mapping)) 39 | expect_true( "y" %in% names(p$mapping)) 40 | expect_length(setdiff(c("colour", "linetype", "group", "x", "y"), names(p$mapping)), 0) 41 | }) 42 | 43 | -------------------------------------------------------------------------------- /man/PlotQuantiles.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/numerical.R 3 | \name{PlotQuantiles} 4 | \alias{PlotQuantiles} 5 | \title{Plot 01, 50, and 99 percentile for a numerical variable} 6 | \usage{ 7 | PlotQuantiles(meltdx, myVar, dateGp) 8 | } 9 | \arguments{ 10 | \item{meltdx}{A data.table with p1, p50, and p99 in long format, produced by 11 | \code{\link{SummaryStats}}} 12 | 13 | \item{myVar}{The name of the variable to be plotted} 14 | 15 | \item{dateGp}{Name of the variable that the time series plots should be 16 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 17 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for 18 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.} 19 | } 20 | \value{ 21 | A \code{ggplot2} object with \code{dateGp} on the x axis, 22 | \code{value} on the y axis, and variables \code{p01}, \code{p50}, and 23 | \code{p99} plotted on the same graph, with grouped and global percentiles 24 | differentiated by line type. 
25 | } 26 | \description{ 27 | Plot 01, 50, and 99 percentile for a numerical variable 28 | } 29 | \section{License}{ 30 | 31 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License, 32 | Version 2.0 (the "License"); you may not use this file except in compliance 33 | with the License. You may obtain a copy of the License at 34 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law 35 | or agreed to in writing, software distributed under the License is 36 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 37 | KIND, either express or implied. See the License for the specific language 38 | governing permissions and limitations under the License. 39 | } 40 | 41 | -------------------------------------------------------------------------------- /man/PlotMean.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/numerical.R 3 | \name{PlotMean} 4 | \alias{PlotMean} 5 | \title{Plot mean with {Mean +- 1SD} control limits for a numerical variable} 6 | \usage{ 7 | PlotMean(meltdx, myVar, dateGp) 8 | } 9 | \arguments{ 10 | \item{meltdx}{A \code{data.table} with Mean and 1SD control limits in long format, 11 | produced by \code{\link{SummaryStats}}} 12 | 13 | \item{myVar}{The name of the variable to be plotted} 14 | 15 | \item{dateGp}{Name of the variable that the time series plots should be 16 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 17 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for 18 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.} 19 | } 20 | \value{ 21 | A \code{ggplot2} object with \code{dateGp} on the x axis, 22 | \code{value} on the y axis, and variables \code{Mean}, \code{cl1}, and 23 | \code{cl2} plotted on the same graph, with mean and control limits 24 | differentiated by line type. 
25 | } 26 | \description{ 27 | Plot mean with {Mean +- 1SD} control limits for a numerical variable 28 | } 29 | \section{License}{ 30 | 31 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License, 32 | Version 2.0 (the "License"); you may not use this file except in compliance 33 | with the License. You may obtain a copy of the License at 34 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law 35 | or agreed to in writing, software distributed under the License is 36 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 37 | KIND, either express or implied. See the License for the specific language 38 | governing permissions and limitations under the License. 39 | } 40 | 41 | -------------------------------------------------------------------------------- /tests/testthat/test_PlotBarplot.R: -------------------------------------------------------------------------------- 1 | library(otvPlots) 2 | library(proto) 3 | context("Plot bar plot") 4 | load("../testthat/testData.rda") 5 | setDT(testData) 6 | suppressMessages(PrepData(testData, dateNm = "date", 7 | dateGp = "weeks", dateGpBp = "weeks", weightNm = "weight")) 8 | 9 | test_that("expected plot elements are returned", { 10 | p <- PlotBarplot(dataFl = testData, myVar = "job", weightNm = "weight") 11 | 12 | expect_is(p$layers[[1]], "ggproto") 13 | expect_is(p$layers[[1]]$geom, "GeomBar") 14 | expect_is(p$layers[[1]]$stat, "StatIdentity") 15 | expect_identical(p$labels$x, "job") 16 | expect_identical(p$labels$y, "count") 17 | expect_is(p$scales$scales[[1]], "ScaleDiscrete") 18 | expect_true( "group" %in% names(p$mapping)) 19 | expect_true( "x" %in% names(p$mapping)) 20 | expect_true( "y" %in% names(p$mapping)) 21 | expect_length(setdiff(c("group", "x", "y"), names(p$mapping)), 0) 22 | }) 23 | 24 | test_that("variable is put in expected order with and without weights", { 25 | p <- PlotBarplot(dataFl = testData, myVar = "job", weightNm = "weight") 26 | 
o1 <- names(rev(sort(xtabs(weight~job, data=testData)))) 27 | o2 <- as.character(p$data[order(-count)][["job"]]) 28 | expect_equal(o1, o2) 29 | 30 | p <- PlotBarplot(dataFl = testData, myVar = "job", weightNm = NULL) 31 | o1 <- names(rev(sort(testData[, table(job)]))) 32 | o2 <- rev(as.character(p$data[order(count)][["job"]])) 33 | expect_equal(o1, o2) 34 | }) 35 | 36 | test_that("global totals are calculated as expected", { 37 | p1 <- PlotBarplot(dataFl = testData, myVar = "job", weightNm = "weight") 38 | expect_equal(as.numeric(p1$data[job=="retired"]$count), as.numeric(testData[job=="retired", sum(weight)])) 39 | p2 <- PlotBarplot(dataFl = testData, myVar = "job", weightNm = NULL) 40 | expect_equal(as.numeric(p2$data[job=="entrepreneur"]$count), as.numeric(testData[job=="entrepreneur", .N])) 41 | }) 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /man/PlotBarplot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/categorical.R 3 | \name{PlotBarplot} 4 | \alias{PlotBarplot} 5 | \title{Creates a bar plot for a discrete (or binary) variable} 6 | \usage{ 7 | PlotBarplot(dataFl, myVar, weightNm = NULL) 8 | } 9 | \arguments{ 10 | \item{dataFl}{A \code{data.table} of data; must be the output of the 11 | \code{\link{PrepData}} function.} 12 | 13 | \item{myVar}{The name of the variable to be plotted} 14 | 15 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 16 | no weights (all rows receiving weight 1).} 17 | } 18 | \value{ 19 | A \code{ggplot} object with a bar plot of \code{myVar} ordered by 20 | category frequency 21 | } 22 | \description{ 23 | Creates a bar plot for a discrete (or binary) variable 24 | } 25 | \section{License}{ 26 | 27 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License, 28 | Version 2.0 (the "License"); you may 
not use this file except in compliance 29 | with the License. You may obtain a copy of the License at 30 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law 31 | or agreed to in writing, software distributed under the License is 32 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 33 | KIND, either express or implied. See the License for the specific language 34 | governing permissions and limitations under the License. 35 | } 36 | 37 | \examples{ 38 | data(bankData) 39 | bankData = PrepData(bankData, dateNm = "date", dateGp = "months", 40 | dateGpBp = "quarters", weightNm = NULL) 41 | PlotBarplot(bankData, "job") 42 | 43 | ## NA will be included as a category if any NA are present 44 | bankData[sample.int(.N)[1:1000], education := NA] 45 | PlotBarplot(bankData, "education") 46 | } 47 | \seealso{ 48 | Functions depend on this function: 49 | \code{\link{PlotCatVar}}. 50 | 51 | This function depends on: 52 | \code{\link{PrepData}}. 53 | } 54 | -------------------------------------------------------------------------------- /man/PrepLabels.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/prep.R 3 | \name{PrepLabels} 4 | \alias{PrepLabels} 5 | \title{Prepare variable labels} 6 | \usage{ 7 | PrepLabels(labelFl, idx = 1:2) 8 | } 9 | \arguments{ 10 | \item{labelFl}{Either the path of a dataset (a csv file) containing 11 | labels, an R object convertible to \code{data.table} (e.g., data frame) or 12 | \code{NULL}. If \code{NULL}, no labels will be used. The label dataset must 13 | contain at least 2 columns: \code{varCol} (variable names) and 14 | \code{labelCol} (variable labels).} 15 | 16 | \item{idx}{A vector of length 2, giving column index of variable names (first 17 | position) and labels (second position).} 18 | } 19 | \value{ 20 | A data table formatted for use by the \code{\link{vlm}} function. 
21 | } 22 | \description{ 23 | This function prepares a dataset containing variable labels for use by 24 | the main plotting function \code{\link{vlm}}. The input must contain 25 | variables' names in the first column and labels in the second column. All other 26 | columns will be dropped. Special characters will create errors and should 27 | be stripped outside of R. All labels will be truncated at 145 characters. 28 | } 29 | \section{License}{ 30 | 31 | Copyright 2017 Capital One Services, LLC Licensed under the 32 | Apache License, Version 2.0 (the "License"); you may not use this file 33 | except in compliance with the License. You may obtain a copy of the 34 | License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by 35 | applicable law or agreed to in writing, software distributed under the 36 | License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 37 | CONDITIONS OF ANY KIND, either express or implied. See the License for the 38 | specific language governing permissions and limitations under the License. 39 | } 40 | 41 | \examples{ 42 | data(bankLabels) 43 | bankLabels <- PrepLabels(bankLabels) 44 | } 45 | \seealso{ 46 | Functions depend on this function: 47 | \code{\link{PrintPlots}}, 48 | \code{\link{vlm}}. 
49 | } 50 | -------------------------------------------------------------------------------- /man/CalcR2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plots_order.R 3 | \name{CalcR2} 4 | \alias{CalcR2} 5 | \title{Calculates R2 of a numerical variable using date as the predictor} 6 | \usage{ 7 | CalcR2(myVar, dataFl, dateNm, weightNm = NULL, imputeValue = NULL) 8 | } 9 | \arguments{ 10 | \item{myVar}{Name of variable to model.} 11 | 12 | \item{dataFl}{A \code{data.table}, containing \code{myVar}, \code{dateNm}, 13 | and \code{weightNm}.} 14 | 15 | \item{dateNm}{Name of column containing the date variable (to be modeled as 16 | numeric); this date column must not have NA's.} 17 | 18 | \item{weightNm}{Name of column containing row weights. If weights equal one, 19 | then the \code{\link{lm.fit}} function will be called, otherwise the 20 | \code{\link{lm.wfit}} will be called. The weights column must not have NA's.} 21 | 22 | \item{imputeValue}{Either \code{NULL} or numeric. If \code{NULL}, model will 23 | be fit on only non-NA components of \code{myVar}. If numeric, missing cases 24 | of \code{myVar} will be imputed to \code{imputeValue}.} 25 | } 26 | \value{ 27 | A numeric value of R2. 28 | } 29 | \description{ 30 | Calculates weighted R2 of a univariate weighted linear model with 31 | \code{dateNm} as x and \code{myVar} as y using the workhorse \code{lm.fit} 32 | and \code{lm.wfit} functions. 33 | } 34 | \section{License}{ 35 | 36 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License, 37 | Version 2.0 (the "License"); you may not use this file except in compliance 38 | with the License. 
You may obtain a copy of the License at 39 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law 40 | or agreed to in writing, software distributed under the License is 41 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 42 | KIND, either express or implied. See the License for the specific language 43 | governing permissions and limitations under the License. 44 | } 45 | 46 | \seealso{ 47 | Functions depend on this function: 48 | \code{\link{OrderByR2}}. 49 | 50 | This function depends on: 51 | \code{\link{PrepData}}. 52 | } 53 | -------------------------------------------------------------------------------- /tests/testthat/test_PlotRatesOverTime.R: -------------------------------------------------------------------------------- 1 | library(otvPlots) 2 | library(proto) 3 | context("Plot trace plots of categories' proportions over time") 4 | load("../testthat/testData.rda") 5 | setDT(testData) 6 | require(ggplot2) 7 | suppressMessages(PrepData(testData, dateNm = "date", 8 | dateGp = "weeks", dateGpBp = "weeks", weightNm = "weight")) 9 | p <- PlotRatesOverTime(dataFl = testData, dateGp = "weeks", myVar = "job", 10 | weightNm = "weight", newLevels = NULL)$p 11 | test_that("expected plot elements are returned", { 12 | expect_is(p$layers[[1]], "ggproto") 13 | expect_is(p$layers[[1]]$geom, "GeomLine") 14 | expect_is(p$layers[[1]]$stat, "StatIdentity") 15 | expect_is(p$layers[[1]]$position, "PositionIdentity") 16 | expect_identical(p$labels$x, "weeks") 17 | expect_identical(p$labels$y, "") 18 | expect_is(p$scales$scales[[1]], "ScaleContinuousDate") 19 | }) 20 | 21 | test_that("rates are calculated correctly normalized by time", { 22 | dat = p$data 23 | # check that all weeks sum to 1 24 | dat[, sum := sum(rate), by = "weeks"] 25 | dat[, table(sum)] 26 | expect_length(dat[, table(sum)], 1) 27 | 28 | # check that 2008-06-03 is correctly calculated 29 | tmpData = testData[weeks == "2008-06-03"] 30 | tmpData[, rate1 := 
sum(weight), by = "job"] 31 | tmpData[, rate0 := sum(weight)] 32 | tmpData[, rate := rate1/rate0] 33 | 34 | tmpData = unique(tmpData[, .(job, weeks, rate)]) 35 | dat = dat[weeks == "2008-06-03" & rate > 0, .(weeks, job, rate)] 36 | dat[, job := as.character(job)] 37 | setkey(dat, job) 38 | setkey(tmpData, job) 39 | expect_equal(dat[, rate], tmpData[, rate]) 40 | }) 41 | 42 | test_that("rates are calculated correctly normalized by var", { 43 | p <- PlotRatesOverTime(dataFl = testData, dateGp = "weeks", myVar = "job", 44 | weightNm = "weight", newLevels = NULL, normBy = "var")$p 45 | dat = p$data 46 | dat[, sum := sum(rate), by = "job"] 47 | 48 | #check all var rates sum to one 49 | expect_length(dat[, table(sum)], 1) 50 | expect_equal(dat[1, sum], 1) 51 | 52 | # check that rates are correctly calculated for technician 53 | tmpData = testData[job == "technician"] 54 | tmpData[, rate1:=sum(weight), by = "weeks"] 55 | tmpData[, rate0:= sum(weight)] 56 | tmpData[, rate := rate1/rate0] 57 | tmpData = unique(tmpData[, .(job, weeks, rate)]) 58 | expect_equal(tmpData[1:4, rate], dat[job=="technician"][2:5, rate]) 59 | }) 60 | 61 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC 2 | # SPDX-License-Identifier: Apache-2.0 3 | # Copyright 2017 Capital One Services, LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # 8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software distributed 11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 12 | # OF ANY KIND, either express or implied. 
13 | # 14 | # See the License for the specific language governing permissions and limitations under the License. 15 | 16 | 17 | ########################################### 18 | # Utility Functions # 19 | ########################################### 20 | 21 | is.nmrcl <- function(x) inherits(x, "nmrcl") 22 | is.ctgrl <- function(x) inherits(x, "ctgrl") 23 | 24 | wtd_quantile_NA <- function(x, weights, probs = c(.0, .25, .5, .75, 1), 25 | ...) { #!# previous name: wtd.quantile_NA 26 | tryCatch(as.double(Hmisc::wtd.quantile(x, weights, probs, 27 | normwt = TRUE, na.rm = TRUE, ...)), 28 | error = function(e) rep(NA_real_, length(probs))) 29 | } 30 | 31 | ## The color-blind friendly color palette 32 | ## Source: http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/#a-colorblind-friendly-palette 33 | cbbPalette <- c("#D55E00", "#009E73", "#0072B2", "#000000", "#E69F00", "#56B4E9", "#F0E442", "#CC79A7") 34 | 35 | # # An example function for fuzzy label matching 36 | # # To be used as an input of the \code{\link{PlotVar}} function. 
37 | # # If variables look like VAR_nameofvar, and the attribute dictionary contains 38 | # # definitions only for nameofvar, then a fuzzy matching function can be 39 | # # provided which would first attempt to match exactly, and then attempt to 40 | # # match on the longest piece after splitting on the underscore: 41 | # 42 | # Fuzzy = function(labelFl, myVar){ 43 | # ll = labelFl[varCol == myVar, labelCol] # exact match 44 | # if (ll == ""){ 45 | # # split on "_", search for exact match of longest piece 46 | # shortNm = names(which.max(sapply(strsplit(myVar, "_")[[1]], nchar))) 47 | # ll = labelFl[varCol == shortNm, labelCol] 48 | # } 49 | # return(ll) 50 | # } 51 | -------------------------------------------------------------------------------- /man/SummaryStats.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/numerical.R 3 | \name{SummaryStats} 4 | \alias{SummaryStats} 5 | \title{Create summary statistics for a numerical variable} 6 | \usage{ 7 | SummaryStats(myVar, dataFl, dateGp, weightNm = NULL) 8 | } 9 | \arguments{ 10 | \item{myVar}{The name of the variable to be plotted} 11 | 12 | \item{dataFl}{A \code{data.table} of data; must be the output of the 13 | \code{\link{PrepData}} function.} 14 | 15 | \item{dateGp}{Name of the variable that the time series plots should be 16 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 17 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for 18 | details. 
If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.} 19 | 20 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 21 | no weights (all rows receiving weight 1).} 22 | } 23 | \value{ 24 | \item{meltdx}{A \code{data.table} for use by the plotting functions 25 | \code{\link{PlotMean}}, \code{\link{PlotQuantiles}}, and 26 | \code{\link{PlotRates}}.} 27 | \item{numVarSummary}{A \code{data.table} of summary statistics.} 28 | } 29 | \description{ 30 | Create summary statistics for a numerical variable 31 | } 32 | \section{License}{ 33 | 34 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License, 35 | Version 2.0 (the "License"); you may not use this file except in compliance 36 | with the License. You may obtain a copy of the License at 37 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law 38 | or agreed to in writing, software distributed under the License is 39 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 40 | KIND, either express or implied. See the License for the specific language 41 | governing permissions and limitations under the License. 
42 | } 43 | 44 | \examples{ 45 | data(bankData) 46 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "quarters", 47 | dateGpBp = "years") 48 | mdx <- SummaryStats(myVar = "age", dataFl = bankData, 49 | dateGp = "quarters")$meltdx 50 | plot(PlotQuantiles(mdx[variable \%in\% c("p99", "p50", "p1", "p99_g", "p50_g", 51 | "p1_g")], "age", "quarters")) 52 | plot(PlotMean(mdx[variable \%in\% c("mean", "cl1", "cl2")], "age", "quarters")) 53 | plot(PlotRates(mdx, "age", "quarters")) 54 | } 55 | -------------------------------------------------------------------------------- /tests/testthat/test_PlotDist.R: -------------------------------------------------------------------------------- 1 | library(otvPlots) 2 | library(proto) 3 | context("Plot Boxplots") 4 | load("../testthat/testData.rda") 5 | setDT(testData) 6 | suppressMessages(PrepData(dataFl = testData, dateNm = "date", dateGp = "weeks", dateGpBp = "weeks")) 7 | 8 | test_that("Plot layers match expectations",{ 9 | p <- PlotDist(dataFl = testData, myVar = "balance", dateGpBp = "weeks", weightNm = "weight") 10 | expect_is(p$layers[[1]], "ggproto") 11 | expect_is(p$layers[[1]]$geom, "GeomBoxplot") 12 | expect_is(p$layers[[1]]$stat, "StatBoxplot") 13 | expect_is(p$layers[[2]]$geom, "GeomRug") 14 | expect_is(p$layers[[2]]$stat, "StatIdentity") 15 | }) 16 | 17 | 18 | test_that("Mapping layer contains expected elements", { 19 | p <- PlotDist(testData, myVar = "balance", dateGpBp = "weeks") 20 | expect_true( "group" %in% names(p$mapping)) 21 | expect_true( "x" %in% names(p$mapping)) 22 | expect_true( "y" %in% names(p$mapping)) 23 | expect_length(setdiff(c("group", "x", "y"), names(p$mapping)), 0) 24 | 25 | expect_true( "x" %in% names(p$layers[[2]]$mapping)) 26 | expect_true( "y" %in% names(p$layers[[2]]$mapping)) 27 | expect_length(setdiff(c("x", "y"), names(p$mapping)), 0) 28 | }) 29 | 30 | 31 | test_that("Y axis is labeled 'balance' and X axis is labeled 'weeks'",{ 32 | p <- PlotDist(testData, "balance", "weeks") 33 
| expect_identical(p$labels$x, "weeks") 34 | expect_identical(p$labels$y, "balance") 35 | }) 36 | 37 | test_that("invalid log transform returns message and untransformed plot", { 38 | expect_message(PlotDist(dataFl = testData, myVar = "balance", dateGpBp = "weeks", skewOpt = 3), 39 | "untransformed boxplot") 40 | p <- PlotDist(dataFl = testData, myVar = "balance", dateGpBp = "weeks", skewOpt = 3) 41 | expect_is(p$layers[[1]], "ggproto") 42 | expect_is(p$layers[[1]]$geom, "GeomBoxplot") 43 | expect_is(p$layers[[1]]$stat, "StatBoxplot") 44 | expect_is(p$layers[[2]]$geom, "GeomRug") 45 | expect_is(p$layers[[2]]$stat, "StatIdentity") 46 | expect_equal(length(grep("log10", p$labels$y)), 0) 47 | }) 48 | 49 | test_that("valid log transform returns transformed scale",{ 50 | testData[, posBalance := ifelse(balance >= 0.1, balance, 0.1)] 51 | p <- PlotDist(dataFl = testData, myVar = "posBalance", dateGpBp = "weeks", skewOpt = 3) 52 | expect_message(PlotDist(dataFl = testData, myVar = "posBalance", dateGpBp = "weeks", skewOpt = 3), 53 | "Scale for 'y' is already present") 54 | expect_is(p$layers[[1]], "ggproto") 55 | expect_is(p$layers[[1]]$geom, "GeomBoxplot") 56 | expect_is(p$layers[[1]]$stat, "StatBoxplot") 57 | expect_is(p$layers[[2]]$geom, "GeomRug") 58 | expect_is(p$layers[[2]]$stat, "StatIdentity") 59 | expect_equal(grep("log10", p$labels$y), 1) 60 | }) 61 | -------------------------------------------------------------------------------- /man/PlotDist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/numerical.R 3 | \name{PlotDist} 4 | \alias{PlotDist} 5 | \title{Side-by-side box plots, for a numerical variable, grouped by \code{dateGpBp}} 6 | \usage{ 7 | PlotDist(dataFl, myVar, dateGpBp, weightNm = NULL, skewOpt = NULL) 8 | } 9 | \arguments{ 10 | \item{dataFl}{A \code{data.table} of data; must be the output of the 11 | \code{\link{PrepData}} 
function.} 12 | 13 | \item{myVar}{The name of the variable to be plotted} 14 | 15 | \item{dateGpBp}{Name of variable the boxplots should be grouped by. Same 16 | options as \code{dateGp}. If \code{NULL}, then \code{dateGp} will be used.} 17 | 18 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 19 | no weights (all rows receiving weight 1).} 20 | 21 | \item{skewOpt}{Either a numeric constant or \code{NULL}. Default is 22 | \code{NULL} (no transformation). If numeric, say 5, then all box plots of 23 | a variable whose skewness exceeds 5 will be on a log10 scale if possible. 24 | Negative input of \code{skewOpt} will be converted to 3.} 25 | } 26 | \value{ 27 | A \code{ggplot2} object with a box plot of \code{myVar} grouped by 28 | \code{dateGpBp} 29 | } 30 | \description{ 31 | For a variable that is all positive (no zeros) and has more than 50 distinct 32 | values, if it is highly skewed, then all box plots can be plotted under the 33 | log base 10 transformation. See the argument \code{skewOpt} for details. 34 | } 35 | \section{License}{ 36 | 37 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License, 38 | Version 2.0 (the "License"); you may not use this file except in compliance 39 | with the License. You may obtain a copy of the License at 40 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law 41 | or agreed to in writing, software distributed under the License is 42 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 43 | KIND, either express or implied. See the License for the specific language 44 | governing permissions and limitations under the License. 
45 | } 46 | 47 | \examples{ 48 | data(bankData) 49 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 50 | dateGpBp = "quarters") 51 | PlotDist(dataFl = bankData, myVar = "balance", dateGpBp = "quarters") 52 | ## The following attempt to log transform will fail due to negative values, 53 | ## and the untransformed version will be returned 54 | PlotDist(dataFl = bankData, myVar = "balance", dateGpBp = "quarters", 55 | skewOpt = 3) 56 | ## This attempt should succeed, as the skew exceeds 3 and there are no 57 | ## negative values 58 | PlotDist(dataFl = bankData, myVar = "duration", dateGpBp = "quarters", 59 | skewOpt = 3) 60 | } 61 | -------------------------------------------------------------------------------- /tests/testthat/test_CalcR2.R: -------------------------------------------------------------------------------- 1 | library(otvPlots) 2 | context("Calculate R-squared") 3 | load("../testthat/testData.rda") 4 | testData <- setDT(testData) 5 | testData <- testData[, .(age, weight, date)] 6 | 7 | test_that("CalcR2 gives correct R2 with weight", { 8 | test.R2 <- CalcR2("age", testData, "date", weightNm = "weight", imputeValue = NULL) 9 | ans.R2 <- summary(lm(age~date, weight=weight, data=testData))$r.squared 10 | expect_equal(test.R2, ans.R2) 11 | }) 12 | 13 | 14 | test_that("CalcR2 gives correct R2 without weight", { 15 | test.R2 <- CalcR2("age", testData, "date", weightNm = NULL, imputeValue = NULL) 16 | ans.R2 <- summary(lm(age~date, data=testData))$r.squared 17 | expect_equal(test.R2, ans.R2) 18 | }) 19 | 20 | #testData1 has missings in Y 21 | idx = sample.int(100, 10) 22 | testData1 = testData[idx, age:=NA] 23 | 24 | test_that("CalcR2 is correct with imputation in Y and weight", { 25 | test.R2 <- CalcR2("age", testData1, "date", weightNm = "weight", imputeValue = 0) 26 | ans.R2 <- summary(lm(age~date, data=testData1[is.na(age), age:=0], weight=weight))$r.squared 27 | expect_equal(test.R2, ans.R2) 28 | }) 29 | 30 | test_that("CalcR2 is 
correct with imputation in Y", { 31 | test.R2 <- CalcR2("age", testData1, "date", weightNm = NULL, imputeValue = 0) 32 | ans.R2 <- summary(lm(age~date, data=testData1[is.na(age), age:=0]))$r.squared 33 | expect_equal(test.R2, ans.R2) 34 | }) 35 | 36 | #testData2 has missings in weight and date, but not in Y 37 | testData2 = testData[sample.int(.N, 10), weight := NA] 38 | testData2 = testData2[sample.int(.N, 10), date := NA] 39 | test_that("CalcR2 is correct with missing values in weight and date", { 40 | test.R2 <- CalcR2("age", testData2, "date", weightNm = "weight", imputeValue = NULL) 41 | ans.R2 <- summary(lm(age~date, data=testData2, weight=weight))$r.squared 42 | expect_equal(test.R2, ans.R2) 43 | }) 44 | 45 | #testData3 has missing in weight, date and Y 46 | testData3 = testData2[idx, age := NA] 47 | test_that("CalcR2 is correct with missing values in weight and date and Y", { 48 | test.R2 <- CalcR2("age", testData3, "date", weightNm = "weight", imputeValue = NULL) 49 | ans.R2 <- summary(lm(age~date, data=testData3, weight=weight))$r.squared 50 | expect_equal(test.R2, ans.R2) 51 | }) 52 | 53 | 54 | test_that("CalcR2 is correct with missing values in weight and date and Y and imputation", { 55 | test.R2 <- CalcR2("age", testData3, "date", weightNm = "weight", imputeValue = 0) 56 | ans.R2 <- summary(lm(age~date, data=testData3[is.na(age), age:=0], weight=weight))$r.squared 57 | expect_equal(test.R2, ans.R2) 58 | }) 59 | 60 | 61 | test_that("CalcR2 is correct with no weight and missing values in date and Y", { 62 | test.R2 <- CalcR2("age", testData3, "date", weightNm = NULL, imputeValue = NULL) 63 | ans.R2 <- summary(lm(age~date, data=testData3))$r.squared 64 | expect_equal(test.R2, ans.R2) 65 | }) 66 | 67 | 68 | test_that("CalcR2 is correct with no weight and missing values in date and Y imputed", { 69 | test.R2 <- CalcR2("age", testData3, "date", weightNm = NULL, imputeValue = 0) 70 | ans.R2 <- summary(lm(age~date, data=testData3[is.na(age), 
age:=0]))$r.squared 71 | expect_equal(test.R2, ans.R2) 72 | }) 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /man/bankData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{bankData} 5 | \alias{bankData} 6 | \title{Direct marketing campaigns of a Portuguese banking institution} 7 | \format{A data frame with 45,211 rows and 19 variables: 8 | \describe{ 9 | \item{age}{Age of the client, numeric.} 10 | \item{job}{Type of job, a categorical variable with the levels: 11 | \code{'admin.'}, \code{'blue-collar'}, \code{'entrepreneur'}, 12 | \code{'housemaid'}, \code{'management'}, \code{'retired'}, 13 | \code{'self-employed'}, \code{'services'}, \code{'student'}, 14 | \code{'technician'}, \code{'unemployed'}, and \code{'unknown'}.} 15 | \item{marital}{Marital status, a categorical variable with levels: 16 | \code{'divorced'}, \code{'married'}, \code{'single'}, and \code{'unknown'}. 
17 | Note that \code{'divorced'} means either divorced or widowed.} 18 | \item{education}{A categorical variable with levels: \code{'basic.4y'}, 19 | \code{'basic.6y'}, \code{'basic.9y'}, \code{'high.school'}, 20 | \code{'illiterate'}, \code{'professional.course'}, 21 | \code{'university.degree'}, and \code{'unknown'}.} 22 | \item{default}{Whether credit is in default, a categorical variable with 23 | levels: \code{'no'}, \code{'yes'}, and \code{'unknown'}.} 24 | \item{balance}{Account balance, numeric.} 25 | \item{housing}{Whether the client has a housing loan, a categorical variable 26 | with levels: \code{'no'}, \code{'yes'}, and \code{'unknown'}.} 27 | \item{loan}{Whether the client has personal loan, a categorical variable 28 | with levels: \code{'no'}, \code{'yes'}, and \code{'unknown'}.} 29 | \item{contact}{Type of contact communication, a categorical variable 30 | with levels: \code{'cellular'} and \code{'telephone'}.} 31 | \item{duration}{Last contact duration in seconds, a numeric variable.} 32 | \item{campaign}{Number of contacts performed during this campaign for 33 | this client, including the last contact; a numeric variable.} 34 | \item{pdays}{Number of days that passed by after the client was last 35 | contacted from a previous campaign; a numeric variable, with \code{999} 36 | means that client was not previously contacted.} 37 | \item{previous}{Number of contacts performed before this campaign for this 38 | client, a numeric variable.} 39 | \item{poutcome}{Outcome of the previous marketing campaign, a categorical 40 | variable with levels: \code{'failure'}, \code{'nonexistent'}, 41 | and \code{'success'}.} 42 | \item{y}{Whether the client has subscribed a term deposit, a categorical 43 | variable with levels: \code{'yes'} and \code{'no'}.} 44 | \item{date}{Last contact date.} 45 | }} 46 | \source{ 47 | \url{https://archive.ics.uci.edu/ml/datasets/Bank+Marketing} 48 | 49 | \cite{Lichman, M. (2013). 
UCI Machine Learning Repository 50 | [\url{http://archive.ics.uci.edu/ml}]. Irvine, CA: University of California, 51 | School of Information and Computer Science.} 52 | 53 | \cite{S. Moro, P. Cortez, and P. Rita. (2014) A Data-Driven Approach 54 | to Predict the Success of Bank Telemarketing. Decision Support Systems, 55 | Elsevier, 62:22-31, June 2014.} 56 | } 57 | \usage{ 58 | bankData 59 | } 60 | \description{ 61 | The marketing campaigns were based on phone calls. 62 | Often, more than one contact to the same client was required, in order to 63 | assess if the product (bank term deposit) would be ('yes') or not ('no') 64 | subscribed. Records are ordered by date (from May 2008 to November 2010), 65 | similar to data analyzed in Moro et al. [2014]. 66 | } 67 | \keyword{datasets} 68 | -------------------------------------------------------------------------------- /man/PlotRatesOverTime.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/categorical.R 3 | \name{PlotRatesOverTime} 4 | \alias{PlotRatesOverTime} 5 | \title{Creates trace plots of categories' proportions over time for a discrete (or 6 | binary) variable} 7 | \usage{ 8 | PlotRatesOverTime(dataFl, dateGp, myVar, normBy = "time", weightNm = NULL, 9 | newLevels = NULL, kCategories = 9) 10 | } 11 | \arguments{ 12 | \item{dataFl}{A \code{data.table} of data; must be the output of the 13 | \code{\link{PrepData}} function.} 14 | 15 | \item{dateGp}{Name of the variable that the time series plots should be 16 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 17 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for 18 | details. 
If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.} 19 | 20 | \item{myVar}{The name of the variable to be plotted} 21 | 22 | \item{normBy}{The normalization factor for rate plots, can be \code{"time"} 23 | or \code{"var"}. If \code{"time"}, then for each time period of 24 | \code{dateGp}, counts are normalized by the total counts over all 25 | categories in that time period. This illustrates changes of categories' 26 | proportions over time. If \code{"var"}, then for each category, its counts 27 | are normalized by the total counts over time from only this category. This 28 | illustrates changes of categories' volumes over time.} 29 | 30 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 31 | no weights (all rows receiving weight 1).} 32 | 33 | \item{newLevels}{categories of \code{myVar} in order of global frequency} 34 | 35 | \item{kCategories}{If a categorical variable has more than \code{kCategories}, 36 | trace plots of only the \code{kCategories} most prevalent categories are 37 | plotted.} 38 | } 39 | \value{ 40 | A list: 41 | \item{p}{\code{ggplot} object, trace plots of categories' proportions 42 | \code{myVar} over time.} 43 | \item{catVarSummary}{A \code{data.table}, contains categories' proportions 44 | globally, and over-time in each time period in \code{dateGp}. Each row is 45 | a category of the categorical (or binary) variable \code{myVar}. The row 46 | whose \code{category == 'NA'} corresponds to missing. Categories are 47 | ordered by global prevalence in a descending order.} 48 | } 49 | \description{ 50 | Creates trace plots of categories' proportions over time for a discrete (or 51 | binary) variable 52 | } 53 | \section{License}{ 54 | 55 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License, 56 | Version 2.0 (the "License"); you may not use this file except in compliance 57 | with the License. 
You may obtain a copy of the License at 58 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law 59 | or agreed to in writing, software distributed under the License is 60 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 61 | KIND, either express or implied. See the License for the specific language 62 | governing permissions and limitations under the License. 63 | } 64 | 65 | \examples{ 66 | data(bankData) 67 | bankData$weight = rpois(nrow(bankData), 5) 68 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 69 | dateGpBp = "quarters", weightNm = "weight") 70 | PlotRatesOverTime(dataFl = bankData, dateGp = "months", weightNm = "weight", 71 | myVar = "job", newLevels = NULL, normBy = "time") 72 | 73 | } 74 | \seealso{ 75 | Functions depend on this function: 76 | \code{\link{PlotCatVar}}. 77 | 78 | This function depends on: 79 | \code{\link{PrepData}}. 80 | } 81 | -------------------------------------------------------------------------------- /man/otvPlots.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/package_otvPlots.R 3 | \docType{package} 4 | \name{otvPlots} 5 | \alias{otvPlots} 6 | \alias{otvPlots-package} 7 | \title{Over time variable plots for predictive modeling (otvPlots)} 8 | \description{ 9 | The \code{otvPlots} package uses \code{data.table} and \code{ggplot2} 10 | packages to efficiently plot time series aggregated from large datasets. 11 | Plots of numerical variables are optionally returned ordered by correlation 12 | with date -- a natural starting point for anomaly detection. Plots are 13 | automatically labeled if a variable dictionary is provided. 14 | } 15 | \details{ 16 | Output files include: 17 | \itemize{ 18 | \item A PDF file of plots saved as \code{outFl}.pdf, with each individual page 19 | on one variable. 
Variables are plotted in the order indicated in the argument 20 | \code{sortVars} or \code{sortFn}. 21 | For each numerical variable, the output plots include 22 | \itemize{ 23 | \item side-by-side boxplots grouped by \code{dateGpBp} (left), 24 | \item a trace plot of p1, p50, and p99 percentiles, grouped by \code{dateGp} 25 | (top right), 26 | \item a trace plot of mean and +-1 SD control limits, grouped by 27 | \code{dateGp} (middle right), and 28 | \item a trace plot of missing and zero rates, grouped by \code{dateGp} 29 | (bottom right). 30 | } 31 | For each categorical variable (including a numerical variable with no more 32 | than 2 unique levels not including NA), the output plots include 33 | \itemize{ 34 | \item a frequency bar plot (left), and 35 | \item a grid of trace plots on categories' proportions over time (right). 36 | If the variable contains more than \code{kCategories} number of 37 | categories, trace plots of only the largest \code{kCategories} will be 38 | plotted. If the variable contains only two categories, then only the 39 | trace plot of the less prevalent category will be plotted. 40 | } 41 | \item CSV file(s) on summary statistics of variables, both globally and over 42 | time aggregated by \code{dateGp}. The order of variables in the CSV files 43 | is the same as in the PDF file. 44 | \itemize{ 45 | \item For numerical variables, number of observations (counts), p1, p25, 46 | p50, p75, and p99 quantiles, mean, SD, missing and zero rates are saved 47 | as \code{outFl}_numerical_summary.csv. 48 | \item For categorical variables, number of observations (counts) and 49 | categories' proportions are saved as \code{outFl}_categorical_summary.csv. 50 | Each row is a category of a categorical (or binary) variable. 51 | The row whose \code{category == 'NA'} corresponds to missing. Categories 52 | among the same variable are ordered by global prevalence in a descending 53 | order. 
library(otvPlots)
context("Summary stats for numerical variables")

# Shared fixture for both tests below: the serialized `testData` object is
# loaded and converted to a data.table in place.
load("../testthat/testData.rda")
setDT(testData)

# The return value is deliberately discarded: PrepData is documented to modify
# its input by reference, and the tests below read the `weeks` column from
# `testData` afterwards.
suppressMessages(
  PrepData(testData, dateNm = "date", dateGp = "weeks", dateGpBp = "weeks",
           weightNm = "weight")
)

test_that("Numerical statistics are calculated correctly without weight", {
  mdx <- SummaryStats(myVar = "age", dataFl = testData, dateGp = "weeks")$meltdx

  # Global statistics are repeated on every row of the melted table, so
  # unique() collapses each of them to a single value.
  p99_g <- unique(mdx[variable == "p99_g", value])
  p1_g <- unique(mdx[variable == "p1_g", value])
  cl1 <- unique(mdx[variable == "cl1", value])
  cl2 <- unique(mdx[variable == "cl2", value])

  allAge <- testData[, age]

  # `probs` is spelled out instead of relying on partial matching of `p`.
  expect_equivalent(p99_g, quantile(allAge, probs = 0.99))
  expect_equivalent(p1_g, quantile(allAge, probs = 0.01))
  expect_equivalent(cl1, mean(allAge) + sd(allAge))
  expect_equivalent(cl2, mean(allAge) - sd(allAge))

  # Per-period statistics are checked for one week only.
  mdx2 <- mdx[weeks == "2008-05-06" &
                variable %in% c("p99", "p50", "p1", "mean", "zerorate",
                                "missingrate")]
  weekAge <- testData[weeks == as.IDate("2008-05-06"), age]

  expect_equivalent(mdx2[variable == "p99", value], quantile(weekAge, 0.99))
  expect_equivalent(mdx2[variable == "p50", value], quantile(weekAge, 0.5))
  expect_equivalent(mdx2[variable == "p1", value], quantile(weekAge, 0.01))
  expect_equivalent(mdx2[variable == "mean", value], mean(weekAge))
  expect_equivalent(mdx2[variable == "zerorate", value], mean(weekAge == 0))
  expect_equivalent(mdx2[variable == "missingrate", value],
                    mean(is.na(weekAge)))
})


test_that("Numerical statistics are calculated correctly with weight", {
  mdx <- SummaryStats(myVar = "age", dataFl = testData, dateGp = "weeks",
                      weightNm = "weight")$meltdx

  p99_g <- unique(mdx[variable == "p99_g", value])
  p1_g <- unique(mdx[variable == "p1_g", value])
  cl1 <- unique(mdx[variable == "cl1", value])
  cl2 <- unique(mdx[variable == "cl2", value])

  allAge <- testData[, age]
  allWt <- testData[, weight]

  expect_equivalent(
    p99_g,
    Hmisc::wtd.quantile(allAge, allWt, probs = 0.99, normwt = TRUE)
  )
  expect_equivalent(
    p1_g,
    Hmisc::wtd.quantile(allAge, allWt, probs = 0.01, normwt = TRUE)
  )

  # Control limits are the weighted mean -/+ one weighted standard deviation.
  wtdMean <- Hmisc::wtd.mean(allAge, allWt, na.rm = TRUE, normwt = TRUE)
  wtdSd <- sqrt(Hmisc::wtd.var(allAge, allWt, na.rm = TRUE, normwt = TRUE))
  expect_equivalent(cl2, wtdMean - wtdSd)
  expect_equivalent(cl1, wtdMean + wtdSd)

  # Per-period statistics are checked for one week only.
  mdx2 <- mdx[weeks == "2008-05-06" &
                variable %in% c("p99", "p50", "p1", "mean", "zerorate",
                                "missingrate")]
  testData2 <- testData[weeks == as.IDate("2008-05-06")]
  weekAge <- testData2[, age]
  weekWt <- testData2[, weight]

  expect_equivalent(mdx2[variable == "p99", value],
                    Hmisc::wtd.quantile(weekAge, weekWt, 0.99, normwt = TRUE))
  expect_equivalent(mdx2[variable == "p50", value],
                    Hmisc::wtd.quantile(weekAge, weekWt, 0.5, normwt = TRUE))
  expect_equivalent(mdx2[variable == "p1", value],
                    Hmisc::wtd.quantile(weekAge, weekWt, 0.01, normwt = TRUE))
  expect_equivalent(mdx2[variable == "mean", value],
                    Hmisc::wtd.mean(weekAge, weekWt))
  expect_equivalent(mdx2[variable == "zerorate", value],
                    Hmisc::wtd.mean(weekAge == 0, weekWt))
  expect_equivalent(mdx2[variable == "missingrate", value],
                    Hmisc::wtd.mean(is.na(weekAge), weekWt))
})
19 | \itemize{ 20 | \item Must be a vector of dates and must be inclusive i.e. buildTm[1] 21 | <= date <= buildTm[2] will define the time period. 22 | \item Must be either \code{NULL}, a vector of length 2, or a vector of 23 | length 3. 24 | \item If \code{NULL}, the entire dataset will be used for 25 | ranking/anomaly detection. 26 | \item If a vector of length 2, the format of the dates must be 27 | a character vector in default R date format (e.g. "2017-01-30"). 28 | \item If a vector of length 3, the first two columns must contain dates 29 | in any strptime format, while the 3rd column contains the strptime 30 | format (see \code{\link{strptime}}). 31 | \item The following are equivalent ways of selecting 32 | all of 2014: 33 | \itemize{ 34 | \item \code{c("2014-01-01","2014-12-31")} 35 | \item \code{c("01JAN2014","31DEC2014", "\%d\%h\%Y")} 36 | } 37 | }} 38 | 39 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 40 | no weights (all rows receiving weight 1).} 41 | 42 | \item{kSample}{Either \code{NULL} or a positive integer. If an integer, 43 | indicates the sample size for both drawing boxplots and ordering numerical 44 | graphs by \eqn{R^2}. When the data is large, setting \code{kSample} to a 45 | reasonable value (default is 50K) dramatically improves processing speed. 46 | Therefore, for larger datasets (e.g. > 10 percent system memory), this 47 | parameter should not be set to \code{NULL}, or boxplots may take a very 48 | long time to render. This setting has no impact on the accuracy of time 49 | series plots on quantiles, mean, SD, and missing and zero rates.} 50 | } 51 | \value{ 52 | A vector of variable names sorted by R2 of \code{lm} of the formula 53 | \code{var} ~ \code{dateNm} (highest R2 to lowest) 54 | } 55 | \description{ 56 | Calculates R2 of a linear model of the formula \code{var} ~ \code{dateNm} for 57 | each \code{var} of class \code{nmrcl} and returns a vector of 58 | variable names ordered by highest R2. 
The linear model can be calculated over 59 | a subset of dates, see details of parameter \code{buildTm}. Non-numerical 60 | variables are returned in alphabetical order after the sorted numerical 61 | variables. 62 | } 63 | \section{License}{ 64 | 65 | Copyright 2017 Capital One Services, LLC Licensed under the 66 | Apache License, Version 2.0 (the "License"); you may not use this file 67 | except in compliance with the License. You may obtain a copy of the 68 | License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by 69 | applicable law or agreed to in writing, software distributed under the 70 | License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 71 | CONDITIONS OF ANY KIND, either express or implied. See the License for the 72 | specific language governing permissions and limitations under the License. 73 | } 74 | 75 | \examples{ 76 | data(bankData) 77 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 78 | dateGpBp = "quarters") 79 | OrderByR2(bankData, dateNm = "date") 80 | } 81 | \seealso{ 82 | Functions depend on this function: 83 | \code{\link{vlm}}. 84 | 85 | This function depends on: 86 | \code{\link{CalcR2}}, 87 | \code{\link{PrepData}}. 
88 | } 89 | -------------------------------------------------------------------------------- /man/PlotNumVar.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/numerical.R 3 | \name{PlotNumVar} 4 | \alias{PlotNumVar} 5 | \title{Create plots and summary statistics for a numerical variable} 6 | \usage{ 7 | PlotNumVar(myVar, dataFl, weightNm, dateGp, dateGpBp, skewOpt = NULL, 8 | kSample = 50000) 9 | } 10 | \arguments{ 11 | \item{myVar}{The name of the variable to be plotted} 12 | 13 | \item{dataFl}{A \code{data.table} of data; must be the output of the 14 | \code{\link{PrepData}} function.} 15 | 16 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 17 | no weights (all rows receiving weight 1).} 18 | 19 | \item{dateGp}{Name of the variable that the time series plots should be 20 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 21 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for 22 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.} 23 | 24 | \item{dateGpBp}{Name of variable the boxplots should be grouped by. Same 25 | options as \code{dateGp}. If \code{NULL}, then \code{dateGp} will be used.} 26 | 27 | \item{skewOpt}{Either a numeric constant or \code{NULL}. Default is 28 | \code{NULL} (no transformation). If numeric, say 5, then all box plots of 29 | a variable whose skewness exceeds 5 will be on a log10 scale if possible. 30 | Negative input of \code{skewOpt} will be converted to 3.} 31 | 32 | \item{kSample}{Either \code{NULL} or a positive integer. If an integer, 33 | indicates the sample size for both drawing boxplots and ordering numerical 34 | graphs by \eqn{R^2}. When the data is large, setting \code{kSample} to a 35 | reasonable value (default is 50K) dramatically improves processing speed. 36 | Therefore, for larger datasets (e.g. 
> 10 percent system memory), this 37 | parameter should not be set to \code{NULL}, or boxplots may take a very 38 | long time to render. This setting has no impact on the accuracy of time 39 | series plots on quantiles, mean, SD, and missing and zero rates.} 40 | } 41 | \value{ 42 | \item{p}{A \code{grob} (i.e., \code{ggplot} grid) object, including a 43 | side-byside boxplot grouped by \code{dateGpBp}, a time series plot of p1, 44 | p50 (median), and p99 grouped by \code{dateGp}, a time series plot of 45 | mean and +-1 SD control limits grouped by \code{dateGp}, and a time 46 | series plot of missing and zerorates grouped by \code{dateGp}.} 47 | \item{numVarSummary}{A \code{data.table}, contains global and over time 48 | summary statistics, including p1, p25, p50, p75, and p99 quantiles, mean 49 | and SD, missing and zero rates.} 50 | } 51 | \description{ 52 | Output plots include a boxplot on the left, grouped by a courser time scale 53 | (\code{dateGpBp}), and three trace plots on the right, on p1, p50, 54 | and p99 qunatiles, mean and +-1 SD control limits, missing and zerorates, 55 | all grouped by a finer time scale as in \code{dateGp}. In addition to plots, 56 | a \code{data.table} of summary statistics are generated, on global and 57 | over time summary statistics. 58 | } 59 | \section{License}{ 60 | 61 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License, 62 | Version 2.0 (the "License"); you may not use this file except in compliance 63 | with the License. You may obtain a copy of the License at 64 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law 65 | or agreed to in writing, software distributed under the License is 66 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 67 | KIND, either express or implied. See the License for the specific language 68 | governing permissions and limitations under the License. 
69 | } 70 | 71 | \examples{ 72 | data(bankData) 73 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 74 | dateGpBp = "years") 75 | plot(PlotNumVar("balance", bankData, NULL, "months", "years", 76 | skewOpt = NULL, kSample = NULL)$p) 77 | } 78 | \seealso{ 79 | Functions depend on this function: 80 | \code{\link{PlotVar}}. 81 | 82 | This function depends on: 83 | \code{\link{SummaryStats}}, 84 | \code{\link{PlotDist}}, 85 | \code{\link{PlotQuantiles}}, 86 | \code{\link{PlotMean}}, 87 | \code{\link{PlotRates}}, 88 | \code{\link{PrepData}}. 89 | } 90 | -------------------------------------------------------------------------------- /man/PlotCatVar.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/categorical.R 3 | \name{PlotCatVar} 4 | \alias{PlotCatVar} 5 | \title{Create plots and summary statistics for a categorical variable} 6 | \usage{ 7 | PlotCatVar(myVar, dataFl, weightNm = NULL, dateNm, dateGp, kCategories = 9, 8 | normBy = "time") 9 | } 10 | \arguments{ 11 | \item{myVar}{The name of the variable to be plotted} 12 | 13 | \item{dataFl}{A \code{data.table} of data; must be the output of the 14 | \code{\link{PrepData}} function.} 15 | 16 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 17 | no weights (all rows receiving weight 1).} 18 | 19 | \item{dateNm}{Name of column containing the date variable.} 20 | 21 | \item{dateGp}{Name of the variable that the time series plots should be 22 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 23 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for 24 | details. 
If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.} 25 | 26 | \item{kCategories}{If a categorical variable has more than \code{kCategories}, 27 | trace plots of only the \code{kCategories} most prevalent categories are 28 | plotted.} 29 | 30 | \item{normBy}{The normalization factor for rate plots, can be \code{"time"} 31 | or \code{"var"}. If \code{"time"}, then for each time period of 32 | \code{dateGp}, counts are normalized by the total counts over all 33 | categories in that time period. This illustrates changes of categories' 34 | proportions over time. If \code{"var"}, then for each category, its counts 35 | are normalized by the total counts over time from only this category. This 36 | illustrates changes of categories' volumes over time.} 37 | } 38 | \value{ 39 | \item{p}{A \code{grob} (i.e., \code{ggplot} grid) object, including a 40 | bar plot, and trace plots of categories' proportions. If the number of 41 | categories is larger than \code{kCategories}, then trace plots of only the 42 | \code{kCategories} most prevalent categories are plotted. For a binary 43 | variable, only the trace plot of the less prevalent category is plotted.} 44 | \item{catVarSummary}{A \code{data.table}, contains categories' proportions 45 | globally, and over-time in each time period in \code{dateGp}. Each row is 46 | a category of the categorical (or binary) variable \code{myVar}. The row 47 | whose \code{category == 'NA'} corresponds to missing. Categories are 48 | ordered by global prevalence in a descending order.} 49 | } 50 | \description{ 51 | Output plots include a bar plot with categories ordered by global counts, 52 | and trace plots of categories' proportions over time. This function is also 53 | applicable to a binary variable, which is treated as categorical in this 54 | package. In addition to plots, a \code{data.table} of summary statistics 55 | is generated, on global counts and proportions by category, and proportions 56 | by category over time. 
57 | } 58 | \section{License}{ 59 | 60 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License, 61 | Version 2.0 (the "License"); you may not use this file except in compliance 62 | with the License. You may obtain a copy of the License at 63 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law 64 | or agreed to in writing, software distributed under the License is 65 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 66 | KIND, either express or implied. See the License for the specific language 67 | governing permissions and limitations under the License. 68 | } 69 | 70 | \examples{ 71 | data(bankData) 72 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 73 | dateGpBp = "quarters", weightNm = NULL) 74 | # Single histogram is plotted for job type since there are 12 categories 75 | plot(PlotCatVar(myVar = "job", dataFl = bankData, weightNm = NULL, 76 | dateNm = "date", dateGp = "months")$p) 77 | 78 | plot(PlotCatVar(myVar = "job", dataFl = bankData, weightNm = NULL, 79 | dateNm = "date", dateGp = "months", kCategories = 12)$p) 80 | 81 | 82 | ## Binary data is treated as categorical, and only the less frequent 83 | ## category is plotted over time. 84 | plot(PlotCatVar(myVar = "default", dataFl = bankData, weightNm = NULL, 85 | dateNm = "date", dateGp = "months")$p) 86 | } 87 | \seealso{ 88 | Functions depend on this function: 89 | \code{\link{PlotVar}}, 90 | \code{\link{PrintPlots}}, 91 | \code{\link{vlm}}. 92 | 93 | This function depends on: 94 | \code{\link{PlotBarplot}}, 95 | \code{\link{PlotRatesOverTime}}, 96 | \code{\link{PrepData}}. 
97 | } 98 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC 2 | # SPDX-License-Identifier: Apache-2.0 3 | # Copyright 2017 Capital One Services, LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # 8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software distributed 11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 12 | # OF ANY KIND, either express or implied. 13 | # 14 | # See the License for the specific language governing permissions and limitations under the License. 15 | 16 | 17 | #' Direct marketing campaigns of a Portuguese banking institution 18 | #' 19 | #' The marketing campaigns were based on phone calls. 20 | #' Often, more than one contact to the same client was required, in order to 21 | #' assess whether the product (bank term deposit) would be ('yes') or not ('no') 22 | #' subscribed. Records are ordered by date (from May 2008 to November 2010), 23 | #' similar to data analyzed in Moro et al. [2014]. 
24 | #' 25 | #' 26 | #' @format A data frame with 45,211 rows and 19 variables: 27 | #' \describe{ 28 | #' \item{age}{Age of the client, numeric.} 29 | #' \item{job}{Type of job, a categorical variable with the levels: 30 | #' \code{'admin.'}, \code{'blue-collar'}, \code{'entrepreneur'}, 31 | #' \code{'housemaid'}, \code{'management'}, \code{'retired'}, 32 | #' \code{'self-employed'}, \code{'services'}, \code{'student'}, 33 | #' \code{'technician'}, \code{'unemployed'}, and \code{'unknown'}.} 34 | #' \item{marital}{Marital status, a categorical variable with levels: 35 | #' \code{'divorced'}, \code{'married'}, \code{'single'}, and \code{'unknown'}. 36 | #' Note that \code{'divorced'} means either divorced or widowed.} 37 | #' \item{education}{A categorical variable with levels: \code{'basic.4y'}, 38 | #' \code{'basic.6y'}, \code{'basic.9y'}, \code{'high.school'}, 39 | #' \code{'illiterate'}, \code{'professional.course'}, 40 | #' \code{'university.degree'}, and \code{'unknown'}.} 41 | #' \item{default}{Whether credit is in default, a categorical variable with 42 | #' levels: \code{'no'}, \code{'yes'}, and \code{'unknown'}.} 43 | #' \item{balance}{Account balance, numeric.} 44 | #' \item{housing}{Whether the client has a housing loan, a categorical variable 45 | #' with levels: \code{'no'}, \code{'yes'}, and \code{'unknown'}.} 46 | #' \item{loan}{Whether the client has personal loan, a categorical variable 47 | #' with levels: \code{'no'}, \code{'yes'}, and \code{'unknown'}.} 48 | #' \item{contact}{Type of contact communication, a categorical variable 49 | #' with levels: \code{'cellular'} and \code{'telephone'}.} 50 | #' \item{duration}{Last contact duration in seconds, a numeric variable.} 51 | #' \item{campaign}{Number of contacts performed during this campaign for 52 | #' this client, including the last contact; a numeric variable.} 53 | #' \item{pdays}{Number of days that passed by after the client was last 54 | #' contacted from a previous campaign; a numeric 
variable, where \code{999} 55 | #' means that the client was not previously contacted.} 56 | #' \item{previous}{Number of contacts performed before this campaign for this 57 | #' client, a numeric variable.} 58 | #' \item{poutcome}{Outcome of the previous marketing campaign, a categorical 59 | #' variable with levels: \code{'failure'}, \code{'nonexistent'}, 60 | #' and \code{'success'}.} 61 | #' \item{y}{Whether the client has subscribed to a term deposit, a categorical 62 | #' variable with levels: \code{'yes'} and \code{'no'}.} 63 | #' \item{date}{Last contact date.} 64 | #' } 65 | #' @source \url{https://archive.ics.uci.edu/ml/datasets/Bank+Marketing} 66 | #' @source \cite{Lichman, M. (2013). UCI Machine Learning Repository 67 | #' [\url{http://archive.ics.uci.edu/ml}]. Irvine, CA: University of California, 68 | #' School of Information and Computer Science.} 69 | #' @source \cite{S. Moro, P. Cortez, and P. Rita. (2014) A Data-Driven Approach 70 | #' to Predict the Success of Bank Telemarketing. Decision Support Systems, 71 | #' Elsevier, 62:22-31, June 2014.} 72 | "bankData" 73 | 74 | #' Labels for bankData 75 | #' 76 | #' A dataset containing the attribute labels also found in \code{\link{bankData}}. 77 | #' This data set is used to illustrate the \code{\link{PrepLabels}} function and 78 | #' other label functionality in the \code{\link{otvPlots}} package in R. 
79 | #' 80 | #' @format A data frame with 16 rows and 3 variables: 81 | #' \describe{ 82 | #' \item{V1}{Name of each variable in \code{\link{bankData}}.} 83 | #' \item{V2}{Label of each variable in \code{\link{bankData}}.} 84 | #' \item{V3}{A numeric variable, corresponding to the row number.} 85 | #' } 86 | "bankLabels" 87 | -------------------------------------------------------------------------------- /R/package_otvPlots.R: -------------------------------------------------------------------------------- 1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC 2 | # SPDX-License-Identifier: Apache-2.0 3 | # Copyright 2017 Capital One Services, LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # 8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software distributed 11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 12 | # OF ANY KIND, either express or implied. 13 | # 14 | # See the License for the specific language governing permissions and limitations under the License. 15 | 16 | 17 | #' Over time variable plots for predictive modeling (otvPlots) 18 | #' 19 | #' The \code{otvPlots} package uses \code{data.table} and \code{ggplot2} 20 | #' packages to efficiently plot time series aggregated from large datasets. 21 | #' Plots of numerical variables are optionally returned ordered by correlation 22 | #' with date -- a natural starting point for anomaly detection. Plots are 23 | #' automatically labeled if a variable dictionary is provided. 24 | #' 25 | #' Output files include: 26 | #' \itemize{ 27 | #' \item A PDF file of plots saved as \code{outFl}.pdf, with each individual page 28 | #' on one variable. Variables are plotted in the order indicated in the argument 29 | #' \code{sortVars} or \code{sortFn}. 
30 | #' For each numerical variable, the output plots include 31 | #' \itemize{ 32 | #' \item side-by-side boxplots grouped by \code{dateGpBp} (left), 33 | #' \item a trace plot of p1, p50, and p99 percentiles, grouped by \code{dateGp} 34 | #' (top right), 35 | #' \item a trace plot of mean and +-1 SD control limits, grouped by 36 | #' \code{dateGp} (middle right), and 37 | #' \item a trace plot of missing and zero rates, grouped by \code{dateGp} 38 | #' (bottom right). 39 | #' } 40 | #' For each categorical variable (including a numerical variable with no more 41 | #' than 2 unique levels not including NA), the output plots include 42 | #' \itemize{ 43 | #' \item a frequency bar plot (left), and 44 | #' \item a grid of trace plots on categories' proportions over time (right). 45 | #' If the variable contains more than \code{kCategories} number of 46 | #' categories, trace plots of only the largest \code{kCategories} will be 47 | #' plotted. If the variable contains only two categories, then only the 48 | #' trace plot of the less prevalent category will be plotted. 49 | #' } 50 | #' \item CSV file(s) on summary statistics of variables, both globally and over 51 | #' time aggregated by \code{dateGp}. The order of variables in the CSV files 52 | #' is the same as in the PDF file. 53 | #' \itemize{ 54 | #' \item For numerical variables, number of observations (counts), p1, p25, 55 | #' p50, p75, and p99 quantiles, mean, SD, missing and zero rates are saved 56 | #' as \code{outFl}_numerical_summary.csv. 57 | #' \item For categorical variables, number of observations (counts) and 58 | #' categories' proportions are saved as \code{outFl}_categorical_summary.csv. 59 | #' Each row is a category of a categorical (or binary) variable. 60 | #' The row whose \code{category == 'NA'} corresponds to missing. Categories 61 | #' among the same variable are ordered by global prevalence in a descending 62 | #' order. 
63 | #' } 64 | #' } 65 | #' 66 | #' @seealso Main function: \code{\link{vlm}}. 67 | #' @seealso Selected supporting functions: 68 | #' \code{\link{PrepData}}, 69 | #' \code{\link{PrepLabels}}, 70 | #' \code{\link{OrderByR2}}. 71 | #' 72 | #' @section License: 73 | #' Copyright 2017 Capital One Services, LLC Licensed under the Apache License, 74 | #' Version 2.0 (the "License"); you may not use this file except in compliance 75 | #' with the License. You may obtain a copy of the License at 76 | #' http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law 77 | #' or agreed to in writing, software distributed under the License is 78 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 79 | #' KIND, either express or implied. See the License for the specific language 80 | #' governing permissions and limitations under the License. 81 | #' @docType package 82 | #' @name otvPlots 83 | #' @import data.table 84 | #' @import ggplot2 85 | #' @importFrom grid grid.draw grid.newpage unit unit.c textGrob gpar 86 | #' @importFrom gridExtra arrangeGrob 87 | #' @importFrom moments skewness 88 | #' @importFrom Hmisc wtd.quantile wtd.mean wtd.var 89 | #' @importFrom stringi stri_trans_general 90 | #' @importFrom scales hue_pal 91 | #' @importFrom grDevices cairo_pdf dev.off 92 | #' @importFrom graphics par 93 | #' @importFrom stats lm.fit lm.wfit quantile sd var 94 | #' @importFrom utils tail 95 | NULL 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /man/PrintPlots.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_print.R 3 | \name{PrintPlots} 4 | \alias{PrintPlots} 5 | \title{Create a pdf file with plots and compute summary statistics for all variables} 6 | \usage{ 7 | PrintPlots(outFl, dataFl, sortVars, dateNm, dateGp, dateGpBp, weightNm = NULL, 8 | labelFl = 
NULL, genCSV = TRUE, highlightNms = NULL, skewOpt = NULL, 9 | kSample = 50000, fuzzyLabelFn = NULL, kCategories = 9) 10 | } 11 | \arguments{ 12 | \item{outFl}{Name of the output file, with no extension names (e.g., "bank"). 13 | A pdf file of plots ("bank.pdf"), and two csv files of summary statistics 14 | ("bank_categorical_summary.csv" and "bank_numerical_summary.csv") will be 15 | saved to your working directory, unless a path is included in \code{outFl} 16 | (e.g. "../plots/bank").} 17 | 18 | \item{dataFl}{A \code{data.table} containing at least the following columns: 19 | \code{myVar}, \code{weightNm}, \code{dateGp}, \code{dateGpBp}; usually an 20 | output of the \code{\link{PrepData}} function.} 21 | 22 | \item{sortVars}{A character vector of variable names in the order they will 23 | be plotted.} 24 | 25 | \item{dateNm}{Name of column containing the date variable.} 26 | 27 | \item{dateGp}{Name of the variable that the time series plots should be 28 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 29 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for 30 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.} 31 | 32 | \item{dateGpBp}{Name of variable the boxplots should be grouped by. Same 33 | options as \code{dateGp}. If \code{NULL}, then \code{dateGp} will be used.} 34 | 35 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 36 | no weights (all rows receiving weight 1).} 37 | 38 | \item{labelFl}{A \code{data.table} containing variable labels, or \code{NULL} 39 | for no labels; usually an output of \code{\link{PrepLabels}}.} 40 | 41 | \item{genCSV}{Logical, whether to generate the two csv files of summary 42 | statistics for numerical and categorical variables.} 43 | 44 | \item{highlightNms}{Either \code{NULL} or a character vector of variables to 45 | receive a red label. Currently \code{NULL} means all variables will get a 46 | black legend. 
This argument is ignored if \code{labelFl == NULL}.} 47 | 48 | \item{skewOpt}{Either a numeric constant or \code{NULL}. Default is 49 | \code{NULL} (no transformation). If numeric, say 5, then all box plots of 50 | a variable whose skewness exceeds 5 will be on a log10 scale if possible. 51 | Negative input of \code{skewOpt} will be converted to 3.} 52 | 53 | \item{kSample}{Either \code{NULL} or a positive integer. If an integer, 54 | indicates the sample size for both drawing boxplots and ordering numerical 55 | graphs by \eqn{R^2}. When the data is large, setting \code{kSample} to a 56 | reasonable value (default is 50K) dramatically improves processing speed. 57 | Therefore, for larger datasets (e.g. > 10 percent system memory), this 58 | parameter should not be set to \code{NULL}, or boxplots may take a very 59 | long time to render. This setting has no impact on the accuracy of time 60 | series plots on quantiles, mean, SD, and missing and zero rates.} 61 | 62 | \item{fuzzyLabelFn}{Either \code{NULL} or a function of 2 parameters: A label 63 | file in the format of an output by \code{\link{PrepLabels}} and a string 64 | giving a variable name. The function should return the label corresponding 65 | to the variable given by the second parameter. This function should 66 | describe how fuzzy matching should be performed to find labels (see example 67 | below). If \code{NULL}, only exact matches will be returned.} 68 | 69 | \item{kCategories}{If a categorical variable has more than \code{kCategories}, 70 | trace plots of only the \code{kCategories} most prevalent categories are 71 | plotted.} 72 | } 73 | \value{ 74 | A pdf of plots saved to file \code{outFl}.pdf, and if the argument 75 | \code{genCSV == TRUE}, also two csv files of summary statistics for 76 | numerical and categorical variables. 77 | } 78 | \description{ 79 | Creates plots and outputs results to a letter-sized pdf file, with each 80 | individual page containing plots on a single variable in the data. 
In 81 | addition, two summary statistics \code{data.table} are returned, one for 82 | numerical variables, and one for categorical (and binary) ones. 83 | } 84 | \section{License}{ 85 | 86 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License, 87 | Version 2.0 (the "License"); you may not use this file except in compliance 88 | with the License. You may obtain a copy of the License at 89 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law 90 | or agreed to in writing, software distributed under the License is 91 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 92 | KIND, either express or implied. See the License for the specific language 93 | governing permissions and limitations under the License. 94 | } 95 | 96 | \seealso{ 97 | Functions depend on this function: 98 | \code{\link{vlm}}. 99 | 100 | This function depends on: 101 | \code{\link{PlotVar}}, 102 | \code{\link{PrepData}}. 103 | } 104 | -------------------------------------------------------------------------------- /man/PrepData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/prep.R 3 | \name{PrepData} 4 | \alias{PrepData} 5 | \title{Prepare an input dataset for plotting} 6 | \usage{ 7 | PrepData(dataFl, dateNm, selectCols = NULL, dropCols = NULL, 8 | dateFt = "\%d\%h\%Y", dateGp = NULL, dateGpBp = NULL, weightNm = NULL, 9 | varNms = NULL, dropConstants = FALSE, ...) 10 | } 11 | \arguments{ 12 | \item{dataFl}{Either the name of an object that can be converted using 13 | \code{\link[data.table]{as.data.table}} (e.g., a data frame), or a 14 | character string containing the name of dataset that can be loaded using 15 | \code{\link[data.table]{fread}} (e.g., a csv file). 
If the dataset is not in 16 | your working directory then \code{dataFl} must include the (relative or 17 | absolute) path to the file.} 18 | 19 | \item{dateNm}{Name of column containing the date variable.} 20 | 21 | \item{selectCols}{Either \code{NULL}, or a vector of names or indices of 22 | variables to read into memory -- must include \code{dateNm}, 23 | \code{weightNm} (if not \code{NULL}) and all variables to be plotted. If 24 | both \code{selectCols} and \code{dropCols} are \code{NULL}, then all 25 | variables will be read in.} 26 | 27 | \item{dropCols}{Either \code{NULL}, or a vector of variable names or indices 28 | of variables not to read into memory. If both \code{selectCols} and 29 | \code{dropCols} are \code{NULL}, then all variables will be read in.} 30 | 31 | \item{dateFt}{\code{\link{strptime}} format of date variable. The default is the SAS 32 | format \code{"\%d\%h\%Y"}, but input data in the R date format 33 | \code{"\%Y-\%m-\%d"} will also be detected. Both formats can be 34 | parsed automatically.} 35 | 36 | \item{dateGp}{Name of the variable that the time series plots should be 37 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 38 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for 39 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.} 40 | 41 | \item{dateGpBp}{Name of variable the boxplots should be grouped by. Same 42 | options as \code{dateGp}. If \code{NULL}, then \code{dateGp} will be used.} 43 | 44 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 45 | no weights (all rows receiving weight 1).} 46 | 47 | \item{varNms}{Either \code{NULL} or a vector of names or indices of variables 48 | to be plotted. If \code{NULL}, will default to all columns which are not 49 | \code{dateNm} or \code{weightNm}. 
Can also be a vector of indices of the 50 | column names, after \code{dropCols} or \code{selectCols} have been applied, 51 | if applicable, and not including \code{dateGp}, \code{dateGpBp} 52 | (which will be added to the \code{dataFl} by the function 53 | \code{\link{PrepData}}).} 54 | 55 | \item{dropConstants}{Logical, indicates whether or not constant (all 56 | duplicated or NA) variables should be dropped from \code{dataFl} prior to 57 | plotting.} 58 | 59 | \item{...}{Additional parameters to be passed to 60 | \code{\link[data.table]{fread}}.} 61 | } 62 | \value{ 63 | A \code{data.table} object, formatted for use by all plotting 64 | functions in this package \code{\link{otvPlots}}, including the main function 65 | \code{\link{vlm}}, and the individual variable plotting function 66 | \code{\link{PlotVar}}. 67 | } 68 | \description{ 69 | This function prepares an input dataset for use by all plotting functions 70 | in this package, including the main function \code{\link{vlm}}. 71 | The input data \code{dataFl} must contain, at a minimum, a date column 72 | \code{dateNm} and a variable to be plotted. \code{dataFl} will be 73 | converted to a \code{data.table} class, and all changes are made to it by 74 | reference. 75 | } 76 | \details{ 77 | If weights (\code{weightNm}) are provided, then it is normalized to have a 78 | sum of weights equal the total sample size, and the weights are used in all 79 | summary statistics calculations and plotting. 80 | } 81 | \section{License}{ 82 | 83 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License, 84 | Version 2.0 (the "License"); you may not use this file except in compliance 85 | with the License. 
You may obtain a copy of the License at 86 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law 87 | or agreed to in writing, software distributed under the License is 88 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 89 | KIND, either express or implied. See the License for the specific language 90 | governing permissions and limitations under the License. 91 | } 92 | 93 | \examples{ 94 | ## Use the bankData dataset in this package 95 | data(bankData) 96 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 97 | dateGpBp = "quarters") 98 | ## Columns have been assigned a plotting class (nmrcl/ctgrl) 99 | str(bankData) 100 | } 101 | \seealso{ 102 | Functions depend on this function: 103 | \code{\link{PlotBarplot}}, 104 | \code{\link{PlotRatesOverTime}}, 105 | \code{\link{PlotCatVar}}, 106 | \code{\link{SummaryStats}}, 107 | \code{\link{PlotMean}}, 108 | \code{\link{PlotQuantiles}}, 109 | \code{\link{PlotRates}}, 110 | \code{\link{PlotDist}}, 111 | \code{\link{PlotNumVar}}, 112 | \code{\link{PlotVar}}, 113 | \code{\link{PrintPlots}}, 114 | \code{\link{CalcR2}}, 115 | \code{\link{OrderByR2}}, 116 | \code{\link{vlm}}. 
117 | } 118 | -------------------------------------------------------------------------------- /tests/testthat/rawData_bigint.csv: -------------------------------------------------------------------------------- 1 | age,job,marital,balance,default,weight,date,bigint 2 | 32,blue-collar,single,23,0,0.005102041,6/5/08,2.3E+12 3 | 46,management,single,-246,0,0.010204082,6/5/08,-2.46E+13 4 | 32,admin.,married,0,0,0.010204082,6/5/08,0 5 | 60,retired,married,100,0,0.010204082,6/5/08,1E+13 6 | 60,admin.,married,39,0,0.010204082,7/5/08,3.9E+12 7 | 58,retired,married,96,0,0.005102041,7/5/08,9.6E+12 8 | 35,blue-collar,single,12223,0,0.005102041,7/5/08,1.2223E+15 9 | 55,services,divorced,1,1,0.010204082,7/5/08,1E+11 10 | 45,admin.,single,13,0,0.020408163,8/5/08,1.3E+12 11 | 47,blue-collar,married,306,0,0.005102041,8/5/08,3.06E+13 12 | 45,admin.,single,206,0,0.010204082,8/5/08,2.06E+13 13 | 60,retired,married,81,0,0.005102041,8/5/08,8.1E+12 14 | 28,management,single,447,0,0.015306122,9/5/08,4.47E+13 15 | 47,blue-collar,married,1506,0,0.015306122,10/5/08,1.506E+14 16 | 35,management,married,231,0,0.010204082,10/5/08,2.31E+13 17 | 40,retired,married,0,0,0.015306122,10/5/08,0 18 | 56,management,married,779,0,0.005102041,11/5/08,7.79E+13 19 | 25,services,married,50,0,0.010204082,11/5/08,5E+12 20 | 29,management,single,0,0,0.005102041,11/5/08,0 21 | 36,admin.,divorced,506,0,0.015306122,12/5/08,5.06E+13 22 | 55,technician,divorced,0,0,0.005102041,12/5/08,0 23 | 57,blue-collar,married,52,0,0.015306122,13-05-2008,5.2E+12 24 | 42,admin.,single,-76,0,0.010204082,13-05-2008,-7.6E+12 25 | 24,technician,single,-103,0,0.005102041,13-05-2008,-1.03E+13 26 | 53,technician,divorced,989,0,0.010204082,13-05-2008,9.89E+13 27 | 59,admin.,married,2343,0,0.005102041,13-05-2008,2.343E+14 28 | 51,blue-collar,married,173,0,0.005102041,13-05-2008,1.73E+13 29 | 44,admin.,married,-372,0,0.015306122,14-05-2008,-3.72E+13 30 | 55,services,divorced,91,0,0.010204082,14-05-2008,9.1E+12 31 | 
49,services,divorced,0,0,0.010204082,14-05-2008,0 32 | 42,management,single,50,0,0.010204082,14-05-2008,5E+12 33 | 58,retired,married,121,0,0.015306122,15-05-2008,1.21E+13 34 | 36,technician,single,265,0,0.015306122,15-05-2008,2.65E+13 35 | 49,management,married,378,0,0.015306122,15-05-2008,3.78E+13 36 | 54,management,married,282,0,0.010204082,15-05-2008,2.82E+13 37 | 44,blue-collar,married,582,0,0.005102041,15-05-2008,5.82E+13 38 | 57,entrepreneur,divorced,-37,0,0.010204082,16-05-2008,-3.7E+12 39 | 60,retired,married,60,0,0.005102041,17-05-2008,6E+12 40 | 38,management,single,424,0,0.010204082,17-05-2008,4.24E+13 41 | 40,blue-collar,single,24,0,0.015306122,17-05-2008,2.4E+12 42 | 46,management,divorced,16,0,0.005102041,18-05-2008,1.6E+12 43 | 46,management,married,229,0,0.015306122,18-05-2008,2.29E+13 44 | 60,blue-collar,married,104,0,0.010204082,20-05-2008,1.04E+13 45 | 46,services,married,179,0,0.010204082,20-05-2008,1.79E+13 46 | 53,technician,married,6,0,0.015306122,21-05-2008,6E+11 47 | 54,retired,married,529,0,0.010204082,21-05-2008,5.29E+13 48 | 58,management,married,2143,0,0.005102041,22-05-2008,2.143E+14 49 | 43,technician,single,593,0,0.005102041,22-05-2008,5.93E+13 50 | 57,technician,divorced,63,0,0.005102041,22-05-2008,6.3E+12 51 | 42,entrepreneur,divorced,2,1,0.010204082,23-05-2008,2E+11 52 | 51,retired,married,229,0,0.005102041,23-05-2008,2.29E+13 53 | 59,blue-collar,married,0,0,0.005102041,23-05-2008,0 54 | 31,services,married,25,0,0.015306122,23-05-2008,2.5E+12 55 | 55,blue-collar,married,383,0,0.010204082,23-05-2008,3.83E+13 56 | 47,services,divorced,164,0,0.010204082,24-05-2008,1.64E+13 57 | 46,self-employed,married,137,0,0.010204082,24-05-2008,1.37E+13 58 | 48,management,divorced,-244,0,0.025510204,25-05-2008,-2.44E+13 59 | 49,blue-collar,married,154,0,0.010204082,25-05-2008,1.54E+13 60 | 59,management,divorced,59,0,0.005102041,25-05-2008,5.9E+12 61 | 25,blue-collar,married,-7,0,0.010204082,26-05-2008,-7E+11 62 | 
50,management,married,49,0,0.010204082,26-05-2008,4.9E+12 63 | 58,self-employed,married,-364,0,0.005102041,26-05-2008,-3.64E+13 64 | 57,retired,married,486,0,0.015306122,26-05-2008,4.86E+13 65 | 33,unknown,single,1,0,0.025510204,27-05-2008,1E+11 66 | 57,services,married,162,0,0.020408163,27-05-2008,1.62E+13 67 | 39,management,single,255,0,0.005102041,27-05-2008,2.55E+13 68 | 57,technician,married,839,0,0.010204082,27-05-2008,8.39E+13 69 | 54,blue-collar,married,1291,0,0.005102041,27-05-2008,1.291E+14 70 | 32,management,married,0,0,0.010204082,27-05-2008,0 71 | 55,blue-collar,married,23,0,0.005102041,27-05-2008,2.3E+12 72 | 33,entrepreneur,married,2,0,0.005102041,28-05-2008,2E+11 73 | 58,technician,married,71,0,0.015306122,28-05-2008,7.1E+12 74 | 51,management,married,10635,0,0.005102041,28-05-2008,1.0635E+15 75 | 36,admin.,single,-171,0,0.020408163,28-05-2008,-1.71E+13 76 | 38,entrepreneur,single,243,0,0.010204082,28-05-2008,2.43E+13 77 | 55,technician,married,1205,0,0.010204082,28-05-2008,1.205E+14 78 | 41,admin.,divorced,270,0,0.005102041,29-05-2008,2.7E+13 79 | 33,services,married,0,0,0.010204082,29-05-2008,0 80 | 28,blue-collar,married,723,0,0.005102041,29-05-2008,7.23E+13 81 | 57,blue-collar,married,5935,0,0.010204082,29-05-2008,5.935E+14 82 | 44,services,divorced,2586,0,0.005102041,30-05-2008,2.586E+14 83 | 56,admin.,married,45,0,0.010204082,30-05-2008,4.5E+12 84 | 30,technician,married,152,0,0.015306122,30-05-2008,1.52E+13 85 | 42,technician,single,690,0,0.010204082,31-05-2008,6.9E+13 86 | 41,technician,married,1270,0,0.015306122,31-05-2008,1.27E+14 87 | 36,management,married,101,0,0.005102041,31-05-2008,1.01E+13 88 | 29,admin.,single,390,0,0.005102041,1/6/08,3.9E+13 89 | 44,technician,married,0,0,0.015306122,1/6/08,0 90 | 33,services,married,790,0,0.005102041,1/6/08,7.9E+13 91 | 60,admin.,married,290,0,0.010204082,1/6/08,2.9E+13 92 | 57,blue-collar,married,249,0,0.010204082,2/6/08,2.49E+13 93 | 53,technician,married,384,0,0.005102041,2/6/08,3.84E+13 94 | 
60,blue-collar,married,54,0,0.005102041,2/6/08,5.4E+12 95 | 37,admin.,single,0,0,0.010204082,3/6/08,0 96 | 43,technician,married,1937,0,0.010204082,3/6/08,1.937E+14 97 | 44,technician,single,29,0,0.005102041,4/6/08,2.9E+12 98 | 52,entrepreneur,married,113,0,0.015306122,4/6/08,1.13E+13 99 | 53,technician,married,-3,0,0.010204082,4/6/08,-3E+11 100 | 51,management,married,6530,0,0.005102041,4/6/08,6.53E+14 101 | 39,technician,married,0,0,0.015306122,4/6/08,0 -------------------------------------------------------------------------------- /tests/testthat/test_OrderByR2.R: -------------------------------------------------------------------------------- 1 | library(otvPlots) 2 | context("Order by R-squared") 3 | load("../testthat/testData.rda") 4 | #testData = setDT(testData) 5 | testData = PrepData(testData, dateNm = "date", weightNm = "weight") 6 | 7 | 8 | testOrder <- function(out, testData){ 9 | cntnsVars <- names(Filter(is.nmrcl, testData)) 10 | dscrtVars <- names(Filter(is.ctgrl, testData)) 11 | 12 | # testing that number of variables in output is equal to number of classed variables in input 13 | expect_equal(length(out), length(cntnsVars) + length(dscrtVars)) 14 | 15 | cntnsOrder <- match(cntnsVars, out) 16 | dscrtOrder <- match(dscrtVars, out) 17 | 18 | #testing that all numeric variables appear before discrete 19 | expect_lt(max(cntnsOrder), min(dscrtOrder)) 20 | 21 | #testing that all discrete variables appear in order 22 | expect_equal(order(dscrtOrder), 1:length(dscrtOrder)) 23 | } 24 | 25 | 26 | 27 | test_that("OrderByR2 gives expected variable order", { 28 | out <- OrderByR2(dataFl = testData, 29 | dateNm = "date", buildTm = NULL, weightNm = "weight", kSample = NULL) 30 | 31 | #testing order of categorical, and order of numeric relative to discrete 32 | testOrder(out, testData) 33 | 34 | #testing that numeric variables appear in order 35 | rSq1 <- CalcR2(out[1] , dataFl = testData, dateNm = "date", weightNm = "weight", imputeValue = NULL) 36 | rSq2 <- 
CalcR2(out[2] , dataFl = testData, dateNm = "date", weightNm = "weight", imputeValue = NULL) 37 | expect_gt(rSq1, rSq2) 38 | }) 39 | 40 | 41 | test_that("OrderByR2 works for buildTm in date range", { 42 | buildTm = range(testData[, date][30:70]) 43 | out <- OrderByR2(dataFl = testData, 44 | dateNm = "date", buildTm = buildTm, weightNm = "weight", kSample = NULL) 45 | 46 | #testing order of categorical, and order of numeric relative to discrete 47 | testOrder(out, testData) 48 | 49 | testData1 = testData[date>=buildTm[1]&date<=buildTm[2]] 50 | #testing that numeric variables appear in order 51 | rSq1 <- CalcR2(out[1] , dataFl = testData1, dateNm = "date", weightNm = "weight", imputeValue = NULL) 52 | rSq2 <- CalcR2(out[2] , dataFl = testData1, dateNm = "date", weightNm = "weight", imputeValue = NULL) 53 | expect_gt(rSq1, rSq2) 54 | }) 55 | 56 | 57 | test_that("OrderByR2 works for buildTm outside date range", { 58 | buildTm = range(testData[, date][30:100] + 15) 59 | out <- OrderByR2(dataFl = testData, 60 | dateNm = "date", buildTm = buildTm, weightNm = "weight", kSample = NULL) 61 | 62 | #testing order of categorical, and order of numeric relative to discrete 63 | testOrder(out, testData) 64 | 65 | #testing that numeric variables appear in order 66 | testData1 = testData[date>=buildTm[1]&date<=buildTm[2]] 67 | rSq1 <- CalcR2(out[1] , dataFl = testData1, dateNm = "date", weightNm = "weight", imputeValue = NULL) 68 | rSq2 <- CalcR2(out[2] , dataFl = testData1, dateNm = "date", weightNm = "weight", imputeValue = NULL) 69 | expect_gt(rSq1, rSq2) 70 | }) 71 | 72 | 73 | test_that("OrderByR2 works for kSample < N, with R2 being calculated on reduced sample", { 74 | set.seed(5555) 75 | out <- OrderByR2(dataFl = testData, 76 | dateNm = "date", buildTm = NULL, weightNm = "weight", kSample = 50) 77 | 78 | #testing order of categorical, and order of numeric relative to discrete 79 | testOrder(out, testData) 80 | 81 | #testing that numeric variables appear in order 82 | 
set.seed(5555) 83 | rSq1 <- CalcR2(out[1] , dataFl = testData[sample(.N, min(.N, 50))], dateNm = "date", weightNm = "weight", imputeValue = NULL) 84 | set.seed(5555) 85 | rSq2 <- CalcR2(out[2] , dataFl = testData[sample(.N, min(.N, 50))], dateNm = "date", weightNm = "weight", imputeValue = NULL) 86 | expect_gt(rSq1, rSq2) 87 | }) 88 | 89 | 90 | test_that("OrderByR2 works for kSample > N", { 91 | out <- OrderByR2(dataFl = testData, 92 | dateNm = "date", buildTm = NULL, weightNm = "weight", kSample = 200) 93 | 94 | #testing order of categorical, and order of numeric relative to discrete 95 | testOrder(out, testData) 96 | 97 | #testing that numeric variables appear in order 98 | rSq1 <- CalcR2(out[1] , dataFl = testData, dateNm = "date", weightNm = "weight", imputeValue = NULL) 99 | rSq2 <- CalcR2(out[2] , dataFl = testData, dateNm = "date", weightNm = "weight", imputeValue = NULL) 100 | expect_gt(rSq1, rSq2) 101 | }) 102 | 103 | 104 | test_that("OrderByR2 works when kSample is too small to calculate R2, with numeric variables returned in 105 | order as given", { 106 | out <- OrderByR2(dataFl = testData, dateNm = "date", buildTm = NULL, weightNm = "weight", kSample = 2) 107 | 108 | #testing order of categorical, and order of numeric relative to discrete 109 | testOrder(out, testData) 110 | 111 | #testing that all continous variables appear in data order 112 | cntnsVars <- names(Filter(is.nmrcl, testData)) 113 | cntnsOrder <- match(cntnsVars, out) 114 | expect_equal(order(cntnsOrder), 1:length(cntnsOrder)) 115 | }) 116 | 117 | test_that("OrderByR2 works when weight is null", { 118 | out <- OrderByR2(dataFl = testData, dateNm = "date", buildTm = NULL, weightNm = NULL, kSample = NULL) 119 | 120 | #testing order of categorical, and order of numeric relative to discrete 121 | testOrder(out, testData) 122 | 123 | #testing that numeric variables appear in order 124 | rSq1 <- CalcR2(out[1] , dataFl = testData, dateNm = "date", weightNm = NULL, imputeValue = NULL) 125 | rSq2 
<- CalcR2(out[2] , dataFl = testData, dateNm = "date", weightNm = NULL, imputeValue = NULL) 126 | expect_gt(rSq1, rSq2) 127 | }) 128 | 129 | 130 | test_that("OrderByR2 gives warning when weight/date contains missing", { 131 | idx1 = sample(1:100, 100)[1:10] 132 | idx2 = sample(1:100, 100)[1:10] 133 | testData[idx1, weight := NA] 134 | testData[idx2, date := NA] 135 | 136 | # testing for warning that weight column contains missings 137 | expect_warning(OrderByR2(dataFl = testData, dateNm = "date", buildTm = NULL, 138 | weightNm = "weight", kSample = NULL), "Weights column") 139 | # testing for warning that date column contains missings 140 | expect_warning(OrderByR2(dataFl = testData, dateNm = "date", buildTm = NULL, 141 | weightNm = "weight", kSample = NULL), "Date column") 142 | }) 143 | 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /man/PlotVar.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_print.R 3 | \name{PlotVar} 4 | \alias{PlotVar} 5 | \title{Create over time variable plots and summary statitsics for one variable} 6 | \usage{ 7 | PlotVar(dataFl, myVar, weightNm, dateNm, dateGp, dateGpBp = NULL, 8 | labelFl = NULL, highlightNms = NULL, skewOpt = NULL, kSample = 50000, 9 | fuzzyLabelFn = NULL, kCategories = 9) 10 | } 11 | \arguments{ 12 | \item{dataFl}{A \code{data.table} containing at least the following columns: 13 | \code{myVar}, \code{weightNm}, \code{dateGp}, \code{dateGpBp}; usually an 14 | output of the \code{\link{PrepData}} function.} 15 | 16 | \item{myVar}{Name of the variable to be plotted.} 17 | 18 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 19 | no weights (all rows receiving weight 1).} 20 | 21 | \item{dateNm}{Name of column containing the date variable.} 22 | 23 | \item{dateGp}{Name of the variable that the time series 
plots should be 24 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 25 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for 26 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.} 27 | 28 | \item{dateGpBp}{Name of variable the boxplots should be grouped by. Same 29 | options as \code{dateGp}. If \code{NULL}, then \code{dateGp} will be used.} 30 | 31 | \item{labelFl}{A \code{data.table} containing variable labels, or \code{NULL} 32 | for no labels; usually an output of \code{\link{PrepLabels}}.} 33 | 34 | \item{highlightNms}{Either \code{NULL} or a character vector of variables to 35 | receive a red label. Currently \code{NULL} means all variables will get a 36 | black legend. This argument is ignored if \code{labelFl == NULL}.} 37 | 38 | \item{skewOpt}{Either a numeric constant or \code{NULL}. Default is 39 | \code{NULL} (no transformation). If numeric, say 5, then all box plots of 40 | a variable whose skewness exceeds 5 will be on a log10 scale if possible. 41 | Negative input of \code{skewOpt} will be converted to 3.} 42 | 43 | \item{kSample}{Either \code{NULL} or a positive integer. If an integer, 44 | indicates the sample size for both drawing boxplots and ordering numerical 45 | graphs by \eqn{R^2}. When the data is large, setting \code{kSample} to a 46 | reasonable value (default is 50K) dramatically improves processing speed. 47 | Therefore, for larger datasets (e.g. > 10 percent system memory), this 48 | parameter should not be set to \code{NULL}, or boxplots may take a very 49 | long time to render. This setting has no impact on the accuracy of time 50 | series plots on quantiles, mean, SD, and missing and zero rates.} 51 | 52 | \item{fuzzyLabelFn}{Either \code{NULL} or a function of 2 parameters: A label 53 | file in the format of an output by \code{\link{PrepLabels}} and a string 54 | giving a variable name. 
The function should return the label corresponding 55 | to the variable given by the second parameter. This function should 56 | describe how fuzzy matching should be performed to find labels (see example 57 | below). If \code{NULL}, only exact matches will be returned.} 58 | 59 | \item{kCategories}{If a categorical variable has more than \code{kCategories}, 60 | trace plots of only the \code{kCategories} most prevalent categories are 61 | plotted.} 62 | } 63 | \value{ 64 | \item{p}{A \code{grob} (i.e., \code{ggplot} grid) object. See the output 65 | \code{p} of the function \code{\link{PlotNumVar}} or 66 | \code{\link{PlotCatVar}} for details.} 67 | \item{varSummary}{A \code{data.table} of summary statistics. See the output 68 | \code{numVarSummary} of the function \code{\link{PlotNumVar}}, or the 69 | output \code{catVarSummary} of the function \code{\link{PlotCatVar}} for 70 | details.} 71 | \item{varType}{Indicator of the variable's type, either \code{"nmrcl"} or 72 | \code{"ctgrl"}.} 73 | } 74 | \description{ 75 | For a numerical variable, the output includes 76 | \itemize{ 77 | \item side-by-side boxplots grouped by \code{dateGpBp} (left), 78 | \item a trace plot of p1, p50, and p99 percentiles, grouped by \code{dateGp} 79 | (top right), 80 | \item a trace plot of mean and +-1 SD control limits, grouped by 81 | \code{dateGp} (middle right), and 82 | \item a trace plot of missing and zero rates, grouped by \code{dateGp} 83 | (bottom right). 84 | } 85 | For a categorical variable (including a numerical variable with no more than 2 86 | unique levels not including NA), the output includes 87 | \itemize{ 88 | \item a frequency bar plot (left), and 89 | \item a grid of trace plots on categories' proportions over time (right). 90 | If the variable contains more than \code{kCategories} number of categories, 91 | trace plots of only the largest \code{kCategories} will be plotted. 
92 | } 93 | In addition to plots, a \code{data.table} of summary statistics is generated, 94 | covering both global and over-time summary statistics. 95 | } 96 | \section{License}{ 97 | Copyright 2017 Capital One Services, LLC Licensed under the 98 | Apache License, Version 2.0 (the "License"); you may not use this file 99 | except in compliance with the License. You may obtain a copy of the License 100 | at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable 101 | law or agreed to in writing, software distributed under the License is 102 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 103 | KIND, either express or implied. See the License for the specific language 104 | governing permissions and limitations under the License. 105 | } 106 | 107 | \examples{ 108 | data(bankData) 109 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 110 | dateGpBp = "quarters") 111 | data(bankLabels) 112 | bankLabels <- PrepLabels(bankLabels) 113 | 114 | ## PlotVar will treat numerical and categorical data differently. 115 | ## Binary data is always treated as categorical. 116 | plot(PlotVar(bankData, myVar = "duration", weightNm = NULL, dateNm = "date", 117 | dateGp = "months", dateGpBp = "quarters", labelFl = bankLabels)$p) 118 | plot(PlotVar(bankData, myVar = "job", weightNm = NULL, dateNm = "date", 119 | dateGp = "months", dateGpBp = "quarters", labelFl = bankLabels)$p) 120 | plot(PlotVar(bankData, myVar = "loan", weightNm = NULL, dateNm = "date", 121 | dateGp = "months", dateGpBp = "quarters", labelFl = bankLabels)$p) 122 | 123 | } 124 | \seealso{ 125 | Functions depend on this function: 126 | \code{\link{PrintPlots}}. 127 | 128 | This function depends on: 129 | \code{\link{PlotCatVar}}, 130 | \code{\link{PlotNumVar}}, 131 | \code{\link{PrepData}}. 
132 | } 133 | -------------------------------------------------------------------------------- /tests/testthat/rawData.csv: -------------------------------------------------------------------------------- 1 | "age","job","marital","balance","default","weight","date" 2 | 32,"blue-collar","single",23,0,0.00510204081632653,"06-05-2008" 3 | 46,"management","single",-246,0,0.0102040816326531,"06-05-2008" 4 | 32,"admin.","married",0,0,0.0102040816326531,"06-05-2008" 5 | 60,"retired","married",100,0,0.0102040816326531,"06-05-2008" 6 | 60,"admin.","married",39,0,0.0102040816326531,"07-05-2008" 7 | 58,"retired","married",96,0,0.00510204081632653,"07-05-2008" 8 | 35,"blue-collar","single",12223,0,0.00510204081632653,"07-05-2008" 9 | 55,"services","divorced",1,1,0.0102040816326531,"07-05-2008" 10 | 45,"admin.","single",13,0,0.0204081632653061,"08-05-2008" 11 | 47,"blue-collar","married",306,0,0.00510204081632653,"08-05-2008" 12 | 45,"admin.","single",206,0,0.0102040816326531,"08-05-2008" 13 | 60,"retired","married",81,0,0.00510204081632653,"08-05-2008" 14 | 28,"management","single",447,0,0.0153061224489796,"09-05-2008" 15 | 47,"blue-collar","married",1506,0,0.0153061224489796,"10-05-2008" 16 | 35,"management","married",231,0,0.0102040816326531,"10-05-2008" 17 | 40,"retired","married",0,0,0.0153061224489796,"10-05-2008" 18 | 56,"management","married",779,0,0.00510204081632653,"11-05-2008" 19 | 25,"services","married",50,0,0.0102040816326531,"11-05-2008" 20 | 29,"management","single",0,0,0.00510204081632653,"11-05-2008" 21 | 36,"admin.","divorced",506,0,0.0153061224489796,"12-05-2008" 22 | 55,"technician","divorced",0,0,0.00510204081632653,"12-05-2008" 23 | 57,"blue-collar","married",52,0,0.0153061224489796,"13-05-2008" 24 | 42,"admin.","single",-76,0,0.0102040816326531,"13-05-2008" 25 | 24,"technician","single",-103,0,0.00510204081632653,"13-05-2008" 26 | 53,"technician","divorced",989,0,0.0102040816326531,"13-05-2008" 27 | 
59,"admin.","married",2343,0,0.00510204081632653,"13-05-2008" 28 | 51,"blue-collar","married",173,0,0.00510204081632653,"13-05-2008" 29 | 44,"admin.","married",-372,0,0.0153061224489796,"14-05-2008" 30 | 55,"services","divorced",91,0,0.0102040816326531,"14-05-2008" 31 | 49,"services","divorced",0,0,0.0102040816326531,"14-05-2008" 32 | 42,"management","single",50,0,0.0102040816326531,"14-05-2008" 33 | 58,"retired","married",121,0,0.0153061224489796,"15-05-2008" 34 | 36,"technician","single",265,0,0.0153061224489796,"15-05-2008" 35 | 49,"management","married",378,0,0.0153061224489796,"15-05-2008" 36 | 54,"management","married",282,0,0.0102040816326531,"15-05-2008" 37 | 44,"blue-collar","married",582,0,0.00510204081632653,"15-05-2008" 38 | 57,"entrepreneur","divorced",-37,0,0.0102040816326531,"16-05-2008" 39 | 60,"retired","married",60,0,0.00510204081632653,"17-05-2008" 40 | 38,"management","single",424,0,0.0102040816326531,"17-05-2008" 41 | 40,"blue-collar","single",24,0,0.0153061224489796,"17-05-2008" 42 | 46,"management","divorced",16,0,0.00510204081632653,"18-05-2008" 43 | 46,"management","married",229,0,0.0153061224489796,"18-05-2008" 44 | 60,"blue-collar","married",104,0,0.0102040816326531,"20-05-2008" 45 | 46,"services","married",179,0,0.0102040816326531,"20-05-2008" 46 | 53,"technician","married",6,0,0.0153061224489796,"21-05-2008" 47 | 54,"retired","married",529,0,0.0102040816326531,"21-05-2008" 48 | 58,"management","married",2143,0,0.00510204081632653,"22-05-2008" 49 | 43,"technician","single",593,0,0.00510204081632653,"22-05-2008" 50 | 57,"technician","divorced",63,0,0.00510204081632653,"22-05-2008" 51 | 42,"entrepreneur","divorced",2,1,0.0102040816326531,"23-05-2008" 52 | 51,"retired","married",229,0,0.00510204081632653,"23-05-2008" 53 | 59,"blue-collar","married",0,0,0.00510204081632653,"23-05-2008" 54 | 31,"services","married",25,0,0.0153061224489796,"23-05-2008" 55 | 55,"blue-collar","married",383,0,0.0102040816326531,"23-05-2008" 56 | 
47,"services","divorced",164,0,0.0102040816326531,"24-05-2008" 57 | 46,"self-employed","married",137,0,0.0102040816326531,"24-05-2008" 58 | 48,"management","divorced",-244,0,0.0255102040816327,"25-05-2008" 59 | 49,"blue-collar","married",154,0,0.0102040816326531,"25-05-2008" 60 | 59,"management","divorced",59,0,0.00510204081632653,"25-05-2008" 61 | 25,"blue-collar","married",-7,0,0.0102040816326531,"26-05-2008" 62 | 50,"management","married",49,0,0.0102040816326531,"26-05-2008" 63 | 58,"self-employed","married",-364,0,0.00510204081632653,"26-05-2008" 64 | 57,"retired","married",486,0,0.0153061224489796,"26-05-2008" 65 | 33,"unknown","single",1,0,0.0255102040816327,"27-05-2008" 66 | 57,"services","married",162,0,0.0204081632653061,"27-05-2008" 67 | 39,"management","single",255,0,0.00510204081632653,"27-05-2008" 68 | 57,"technician","married",839,0,0.0102040816326531,"27-05-2008" 69 | 54,"blue-collar","married",1291,0,0.00510204081632653,"27-05-2008" 70 | 32,"management","married",0,0,0.0102040816326531,"27-05-2008" 71 | 55,"blue-collar","married",23,0,0.00510204081632653,"27-05-2008" 72 | 33,"entrepreneur","married",2,0,0.00510204081632653,"28-05-2008" 73 | 58,"technician","married",71,0,0.0153061224489796,"28-05-2008" 74 | 51,"management","married",10635,0,0.00510204081632653,"28-05-2008" 75 | 36,"admin.","single",-171,0,0.0204081632653061,"28-05-2008" 76 | 38,"entrepreneur","single",243,0,0.0102040816326531,"28-05-2008" 77 | 55,"technician","married",1205,0,0.0102040816326531,"28-05-2008" 78 | 41,"admin.","divorced",270,0,0.00510204081632653,"29-05-2008" 79 | 33,"services","married",0,0,0.0102040816326531,"29-05-2008" 80 | 28,"blue-collar","married",723,0,0.00510204081632653,"29-05-2008" 81 | 57,"blue-collar","married",5935,0,0.0102040816326531,"29-05-2008" 82 | 44,"services","divorced",2586,0,0.00510204081632653,"30-05-2008" 83 | 56,"admin.","married",45,0,0.0102040816326531,"30-05-2008" 84 | 30,"technician","married",152,0,0.0153061224489796,"30-05-2008" 85 | 
42,"technician","single",690,0,0.0102040816326531,"31-05-2008" 86 | 41,"technician","married",1270,0,0.0153061224489796,"31-05-2008" 87 | 36,"management","married",101,0,0.00510204081632653,"31-05-2008" 88 | 29,"admin.","single",390,0,0.00510204081632653,"01-06-2008" 89 | 44,"technician","married",0,0,0.0153061224489796,"01-06-2008" 90 | 33,"services","married",790,0,0.00510204081632653,"01-06-2008" 91 | 60,"admin.","married",290,0,0.0102040816326531,"01-06-2008" 92 | 57,"blue-collar","married",249,0,0.0102040816326531,"02-06-2008" 93 | 53,"technician","married",384,0,0.00510204081632653,"02-06-2008" 94 | 60,"blue-collar","married",54,0,0.00510204081632653,"02-06-2008" 95 | 37,"admin.","single",0,0,0.0102040816326531,"03-06-2008" 96 | 43,"technician","married",1937,0,0.0102040816326531,"03-06-2008" 97 | 44,"technician","single",29,0,0.00510204081632653,"04-06-2008" 98 | 52,"entrepreneur","married",113,0,0.0153061224489796,"04-06-2008" 99 | 53,"technician","married",-3,0,0.0102040816326531,"04-06-2008" 100 | 51,"management","married",6530,0,0.00510204081632653,"04-06-2008" 101 | 39,"technician","married",0,0,0.0153061224489796,"04-06-2008" 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Due to changes in priorities, this project is currently not being supported. The project is archived as of 3/14/24 and will be available in a read-only state. Please note, since archival, the project is not maintained or reviewed. 2 | 3 | # R Package for Variable Level Monitoring 4 | 5 | [![](http://cranlogs.r-pkg.org/badges/otvPlots)](http://cran.rstudio.com/web/packages/otvPlots/index.html) 6 | 7 | An important part of model building is the "proc eyeball" sanity check. It can 8 | also be a painful part of the process, when you are the data scientist tasked 9 | with creating and checking 10,000 or more near-identical plots. 
The `otvPlots` 10 | package is designed to streamline this process. `otvPlots` is 11 | an R package which takes a csv file as input and provides a pdf of VLM plots 12 | and csv files of summary statistics as output, optionally ordered so 13 | that any severely abnormal time series will be at the top of the pdf. The only 14 | strict requirement of the data scientist is to specify which column of the input 15 | data file contains the date variable. 16 | 17 | `otvPlots` is efficiently implemented using `data.table` and `ggplot2` packages in R. 18 | Plots are automatically labeled if a variable dictionary is provided. Important 19 | variables can be given a highlighted label. A custom fuzzy matching algorithm 20 | can be provided by the user. 21 | 22 | Discrete and numeric variables are handled automatically and given separate 23 | treatment. All binary variables are treated as categorical. 24 | 25 | ## Output files generated by this package 26 | 27 | ### A PDF file of plots, with each individual page on one variable. 28 | 29 | For each numerical variable, the output plots include 30 | * side-by-side boxplots (left), 31 | * a trace plot of p1, p50, and p99 percentiles, 32 | * a trace plot of mean and +-1 SD control limits, and 33 | * a trace plot of missing and zero rates (bottom right). 34 | 35 | #### Here is an example page of plots for a numerical variable 36 | numerical plot 40 | 41 | For each categorical variable (including a numerical variable with no more 42 | than 2 unique levels not including NA), the output plots include 43 | * a frequency bar plot (left), and 44 | * a grid of trace plots on categories' proportions over time (right). 45 | 46 | #### Here is an example page of plots for a categorical variable 47 | categorical plot 51 | 52 | ### CSV file(s) on summary statistics of variables, both globally and over time. 53 | 54 | The order of variables in the CSV files is the same as in the PDF file. 
55 | * A CSV file for numerical variables, including the number of observations 56 | (counts), p1, p25, p50, p75, and p99 quantiles, mean, SD, missing and 57 | zero rates. 58 | * A CSV file for categorical variables, including the number of observations 59 | (counts) and categories' proportions. Each row is a category of a 60 | categorical (or binary) variable. The row whose `category == 'NA'` 61 | corresponds to missing values. Categories within the same variable are ordered by 62 | global prevalence in descending order. 63 | 64 | # Installation 65 | Open an R (or RStudio) console and install the package from CRAN 66 | 67 | ``` 68 | install.packages("otvPlots") 69 | ``` 70 | 71 | Alternatively, if you prefer to install from GitHub: 72 | 73 | 1. Install the `devtools` package if it is not installed yet. You only need to do this once, so 74 | feel free to skip this step if `devtools` is already installed. You will be 75 | asked to select a CRAN mirror. 76 | 77 | ``` 78 | install.packages("devtools") 79 | ``` 80 | 81 | 2. Install the `otvPlots` package 82 | ``` 83 | devtools::install_github("capitalone/otvPlots") 84 | ``` 85 | 86 | You can also build the package yourself by cloning the repo, setting your 87 | working directory to the otvPlots folder and running `devtools::build()` 88 | in R, after installing the `devtools` package. 89 | 90 | Note that otvPlots does depend on R and several R packages to run. You can 91 | see a complete and up to date list of dependencies in the Imports field in 92 | the DESCRIPTION file. 93 | 94 | 95 | # Getting Started 96 | 97 | ## Load the package 98 | Open an R console (or RStudio). Load the `otvPlots` package first (all its 99 | dependent packages should be loaded automatically). 100 | 101 | ``` 102 | library(otvPlots) 103 | ``` 104 | 105 | The main function of the package is `vlm`. Before executing this function, 106 | input data need to be prepared using the `PrepData` function. 
107 | **Please check out the help files to see all options and many usage examples 108 | (highly recommended!)** 109 | 110 | ``` 111 | help(vlm) 112 | help(PrepData) 113 | ``` 114 | 115 | ## Examples 116 | 117 | The data `bankData` and its labels `bankLabels` are built-in datasets in the 118 | `otvPlots` package. 119 | 120 | ### The first example 121 | After running the following code, a pdf file named "bank.pdf" and two csv files 122 | named "bank_numerical_summary.csv" and "bank_categorical_summary.csv" will be 123 | generated in the current working directory. 124 | 125 | ``` 126 | ## Load the datasets 127 | data(bankData) 128 | data(bankLabels) 129 | 130 | ## Prepare data and labels 131 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 132 | dateGpBp = "quarters") 133 | bankLabels <- PrepLabels(bankLabels) 134 | 135 | ## Generate a pdf file of vlm plots, and csv files of summary statistics 136 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, 137 | sortFn = "OrderByR2", dateGp = "months", dateGpBp = "quarters", outFl = "bank") 138 | ``` 139 | 140 | ### More examples on the `bankData` data 141 | The `PrepData` function only needs to be run once on a dataset. After that `vlm` 142 | can be run directly with the argument `dataNeedPrep = FALSE` (the default). 143 | 144 | * If csv files of summary statistics are not needed, set `genCSV = FALSE`. 
145 | 146 | ``` 147 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, genCSV = FALSE, 148 | sortFn = "OrderByR2", dateGp = "months", dateGpBp = "quarters", outFl = "bank2") 149 | ``` 150 | * If weights are provided, they will be used in all statistical calculations 151 | 152 | ``` 153 | bankData[, weight := rnorm(.N, 1, .1)] 154 | bankData[, weight := weight / mean(weight)] 155 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, 156 | dateGp = "months", dateGpBp = "quarters", weightNm = "weight", outFl = "bank3") 157 | ``` 158 | 159 | * Customize plotting order by passing a vector of variable names to argument 160 | `sortVars`, but the `"date"` column must be excluded from `sortVars` 161 | 162 | ``` 163 | sortVars <- sort(bankLabels[varCol!="date", varCol]) 164 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, 165 | dateGp = "months", dateGpBp = "quarters", outFl = "bank4", 166 | sortVars = sortVars) 167 | ``` 168 | 169 | * Create plots for a specific variable using the `varNms` argument 170 | 171 | ``` 172 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, 173 | dateGp = "months", dateGpBp = "quarters", outFl = "bank5", 174 | varNms = "age", sortVars = NULL) 175 | ``` 176 | 177 | ## Citations 178 | 179 | All examples for this package come from the 180 | [Bank Marketing dataset](https://archive.ics.uci.edu/ml/datasets/Bank+Marketing) 181 | available at the UCI Machine Learning Repository. The UCI repository maintains 182 | a free collection of datasets for researchers at its 183 | [website](http://archive.ics.uci.edu/ml). 184 | 185 | Moro et al., S. Moro, P. Cortez, and P. Rita (2014). A Data-Driven Approach to 186 | Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier, 187 | 62:22-31, June 2014 188 | 189 | Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science. 
190 | 191 | ## Copyright 2017 Capital One Services, LLC 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and limitations under the License. 201 | 202 | ## External Contributors 203 | Contributors: We welcome your interest in Capital One’s Open Source Projects (the “Project”). 204 | 205 | Any Contributor to the project must accept and sign a CLA indicating agreement to the license terms. Except for the license granted in this CLA to Capital One and to recipients of software distributed by Capital One, you reserve all right, title, and interest in and to your contributions; this CLA does not impact your rights to use your own contributions for any other purpose. 206 | 207 | [Link to Individual CLA](https://docs.google.com/forms/d/19LpBBjykHPox18vrZvBbZUcK6gQTj7qv1O5hCduAZFU/viewform) 208 | 209 | [Link to Corporate CLA ](https://docs.google.com/forms/d/e/1FAIpQLSeAbobIPLCVZD_ccgtMWBDAcN68oqbAJBQyDTSAQ1AkYuCp_g/viewform) 210 | 211 | This project adheres to the 212 | [Open Source Code of Conduct](https://developer.capitalone.com/single/code-of-conduct/). 213 | By participating, you are expected to honor this code. 
214 | 215 | 216 | -------------------------------------------------------------------------------- /R/plots_order.R: -------------------------------------------------------------------------------- 1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC 2 | # SPDX-License-Identifier: Apache-2.0 3 | # Copyright 2017 Capital One Services, LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # 8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software distributed 11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 12 | # OF ANY KIND, either express or implied. 13 | # 14 | # See the License for the specific language governing permissions and limitations under the License. 15 | 16 | 17 | ########################################### 18 | # Order By R2 # 19 | ########################################### 20 | 21 | #' Create numerical variable ranking using R2 between date and variable 22 | #' 23 | #' Calculates R2 of a linear model of the formula \code{var} ~ \code{dateNm} for 24 | #' each \code{var} of class \code{nmrcl} and returns a vector of 25 | #' variable names ordered by highest R2. The linear model can be calculated over 26 | #' a subset of dates, see details of parameter \code{buildTm}. Non-numerical 27 | #' variables are returned, in the order they appear in \code{dataFl}, after the sorted numerical 28 | #' variables. 29 | #' 30 | #' @inheritParams PrepData 31 | #' @inheritParams PlotNumVar 32 | #' @param dataFl A \code{data.table} of data; must be the output of the 33 | #' \code{\link{PrepData}} function. 34 | #' @param buildTm Vector identifying the time period for ranking/anomaly detection 35 | #' (most likely model build period). Allows for a subset of plotting time 36 | #' period to be used for anomaly detection. 
37 | #' \itemize{ 38 | #' \item Must be a vector of dates and must be inclusive i.e. buildTm[1] 39 | #' <= date <= buildTm[2] will define the time period. 40 | #' \item Must be either \code{NULL}, a vector of length 2, or a vector of 41 | #' length 3. 42 | #' \item If \code{NULL}, the entire dataset will be used for 43 | #' ranking/anomaly detection. 44 | #' \item If a vector of length 2, the format of the dates must be 45 | #' a character vector in default R date format (e.g. "2017-01-30"). 46 | #' \item If a vector of length 3, the first two columns must contain dates 47 | #' in any strptime format, while the 3rd column contains the strptime 48 | #' format (see \code{\link{strptime}}). 49 | #' \item The following are equivalent ways of selecting 50 | #' all of 2014: 51 | #' \itemize{ 52 | #' \item \code{c("2014-01-01","2014-12-31")} 53 | #' \item \code{c("01JAN2014","31DEC2014", "\%d\%h\%Y")} 54 | #' } 55 | #' } 56 | #' @export 57 | #' 58 | #' @seealso Functions depend on this function: 59 | #' \code{\link{vlm}}. 60 | #' @seealso This function depends on: 61 | #' \code{\link{CalcR2}}, 62 | #' \code{\link{PrepData}}. 63 | #' 64 | #' @return A vector of variable names sorted by R2 of \code{lm} of the formula 65 | #' \code{var} ~ \code{dateNm} (highest R2 to lowest) 66 | #' @section License: 67 | #' Copyright 2017 Capital One Services, LLC Licensed under the 68 | #' Apache License, Version 2.0 (the "License"); you may not use this file 69 | #' except in compliance with the License. You may obtain a copy of the 70 | #' License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by 71 | #' applicable law or agreed to in writing, software distributed under the 72 | #' License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 73 | #' CONDITIONS OF ANY KIND, either express or implied. See the License for the 74 | #' specific language governing permissions and limitations under the License. 
#' @examples
#' data(bankData)
#' bankData <- PrepData(bankData, dateNm = "date", dateGp = "months",
#'                      dateGpBp = "quarters")
#' OrderByR2(bankData, dateNm = "date")

OrderByR2 <- function(dataFl, dateNm, buildTm = NULL, weightNm = NULL,
                      kSample = 50000) {

  ## Rows with NA weights or NA dates are removed casewise inside CalcR2;
  ## warn here so the user knows some rows are being ignored.
  if (!is.null(weightNm) && anyNA(dataFl[[weightNm]])) {
    warning("Weights column contains NAs--will be deleted casewise")
  }
  if (anyNA(dataFl[[dateNm]])) {
    warning("Date column contains NAs--will be deleted casewise")
  }

  ## Convert buildTm to IDate format.
  ##   length 2: dates in default R format
  ##   length 3: two dates plus their strptime format
  ##   anything else (including NULL): use the full observed date range.
  ## The range is computed with na.rm = TRUE so that missing or unsorted dates
  ## cannot produce an NA or inverted window.
  buildTm <- switch(
    as.character(length(buildTm)),
    "2" = as.IDate(buildTm),
    "3" = as.IDate(buildTm[1:2], format = buildTm[3]),
    range(dataFl[[dateNm]], na.rm = TRUE)
  )

  num_vars <- names(Filter(is.nmrcl, dataFl))
  cat_vars <- names(Filter(is.ctgrl, dataFl))

  ## Sorting by R2 only works for numeric variables.
  if (length(num_vars) == 0) {
    return(cat_vars)
  }

  ## Restrict to the build window, then draw at most kSample rows. Sampling
  ## reorders the rows passed to CalcR2, which does not affect the fitted R2.
  buildData <- dataFl[buildTm[1] <= get(dateNm) &
                        get(dateNm) <= buildTm[2], ][
                          sample(.N, min(.N, kSample))]

  r2 <- vapply(num_vars, CalcR2, numeric(1),
               dataFl = buildData, dateNm = dateNm, weightNm = weightNm,
               imputeValue = NULL)

  ## Numerical variables by decreasing R2, followed by categorical variables
  ## in their column order.
  c(num_vars[order(r2, decreasing = TRUE)], cat_vars)
}


###########################################
#            CalcR2 Function              #
###########################################

#' Calculates R2 of a numerical variable using date as the predictor
#'
#' Calculates weighted R2 of a univariate weighted linear model with
#' \code{dateNm} as x and \code{myVar} as y using the workhorse \code{lm.fit}
#' and \code{lm.wfit} functions.
#'
#' @param myVar Name of variable to model.
#' @param dataFl A \code{data.table}, containing \code{myVar}, \code{dateNm},
#'   and \code{weightNm}.
#' @param dateNm Name of column containing the date variable (to be modeled as
#'   numeric); this date column must not have NA's.
#' @param weightNm Name of column containing row weights. If weights equal one,
#'   then the \code{\link{lm.fit}} function will be called, otherwise the
#'   \code{\link{lm.wfit}} will be called. The weights column must not have NA's.
#' @param imputeValue Either \code{NULL} or numeric. If \code{NULL}, model will
#'   be fit on only non-NA components of \code{myVar}. If numeric, missing cases
#'   of \code{myVar} will be imputed to \code{imputeValue}.
#' @return A numeric value of R2.
#' @export
#'
#' @seealso Functions depend on this function:
#'   \code{\link{OrderByR2}}.
#' @seealso This function depends on:
#'   \code{\link{PrepData}}.
#'
#' @section License:
#' Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
#' Version 2.0 (the "License"); you may not use this file except in compliance
#' with the License. You may obtain a copy of the License at
#' http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
#' or agreed to in writing, software distributed under the License is
#' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#' KIND, either express or implied. See the License for the specific language
#' governing permissions and limitations under the License.

CalcR2 <- function(myVar, dataFl, dateNm, weightNm = NULL, imputeValue = NULL) {
  ## Fits myVar ~ date (date coerced to numeric, with an intercept) and returns
  ## the R2 of that fit. With weightNm = NULL an unweighted fit (lm.fit) is
  ## used; otherwise a weighted fit (lm.wfit) and a weighted R2.
  ## Returns Inf when fewer than two usable observations are available, so that
  ## such variables sort to the top when ranked by decreasing R2.

  message("Calculating R2 of ", myVar)

  y <- dataFl[[myVar]]

  ## The data may be a subsample, so re-check that the variable is not (almost)
  ## entirely missing: with fewer than 2 observed values R2 cannot be computed.
  if (sum(!is.na(y)) < 2) {
    return(Inf)
  }

  ## If imputeValue is available, impute everywhere y is missing
  if (!is.null(imputeValue)) {
    y[is.na(y)] <- imputeValue
  }

  ## Design matrix: a column of ones (intercept) plus the date as numeric
  x <- cbind(1, as.numeric(dataFl[[dateNm]]))

  ## Casewise deletion: keep rows where y, the date and (if used) the weight
  ## are all observed
  keep <- !is.na(y) & !is.na(x[, 2])

  w <- NULL
  if (!is.null(weightNm)) {
    w <- dataFl[[weightNm]]
    if (is.null(w)) {
      stop("Weights column '", weightNm, "' not found in dataFl")
    }
    keep <- keep & !is.na(w)
  }

  ## Casewise deletion can leave too few rows even when y itself had enough
  ## observed values; a line cannot be fitted through fewer than 2 points.
  if (sum(keep) < 2) {
    return(Inf)
  }

  y <- y[keep]
  ## drop = FALSE guarantees x stays a matrix, as lm.fit/lm.wfit require
  x <- x[keep, , drop = FALSE]

  ## Compute R2 or weighted R2
  if (is.null(w)) {
    mod <- lm.fit(x = x, y = y)
    r2 <- 1 - sum(mod$residuals ^ 2) / sum((y - mean(y)) ^ 2)
  } else {
    w <- w[keep]
    mod <- lm.wfit(x = x, y = y, w = w)
    r2 <- 1 - sum(w * mod$residuals ^ 2) /
      sum(w * (y - stats::weighted.mean(y, w)) ^ 2)
  }

  r2
}
# Tests for PrepData: date parsing, variable classification, column
# selection/dropping, constant-column handling, and integer64 input.
library(otvPlots)
context("Prepare Data")
data(bankData); setDT(bankData)

# Class predicates used throughout the tests
is.cntns <- function(x) inherits(x, "nmrcl") #!#previous name: "cntns"
is.dscrt <- function(x) inherits(x, "ctgrl") #!# previous name: "dscrt"
is.IDate <- function(x) inherits(x, "IDate")
is.binary <- function(x) uniqueN(na.omit(x)) == 2

test_that("Names of the variables are transformed correctly", {
  out <- PrepData(dataFl = "../testthat/drugRDate.csv", dateNm = "date",
                  dateGp = "months", dateGpBp = "quarters")
  expect_equal(names(out)[6], "Residence.City")
})

test_that("Parse SAS (eg. 07Apr2017) default date format correctly", {
  out <- PrepData(dataFl = "../testthat/drugSASDate.csv", dateNm = "date",
                  dateGp = "months", dateGpBp = "quarters")
  expect_false(all(is.na(out[, "date"])), "Fail to parse SAS date format")
})

test_that("Parse R (eg. 2017-04-17) default date format correctly", {
  out <- PrepData(dataFl = "../testthat/drugRDate.csv", dateNm = "date",
                  dateGp = "months", dateGpBp = "quarters")
  expect_false(all(is.na(out[, "date"])), "Fail to parse R date format")
})

test_that("Incorrect date format creates warnings with csv input file", {
  expect_warning(
    PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
             dateGp = "weeks", dateGpBp = "weeks"),
    "Formatting date as ")
})

test_that("Incorrect date format creates warnings with Rdata input file", {
  expect_warning(
    PrepData("../testthat/rawData.rda", dateNm = "date", weightNm = "weight",
             dateGp = "weeks", dateGpBp = "weeks"),
    "Formatting date as ")
})

# Shared fixture for the next two tests
out <- suppressMessages(
  PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
           dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y"))

test_that("All columns have exactly 2 classes, except date and weight", {
  cntnsVars <- Filter(is.cntns, out)
  dscrtVars <- Filter(is.dscrt, out)
  dateVars <- Filter(is.IDate, out)
  expect_equal(length(cntnsVars), 2)
  expect_equal(length(dscrtVars), 3)
  expect_equal(length(dateVars), 2)
  expect_equal(length(class(out[, weight])), 1)
  expect_equal(length(cntnsVars) + length(dscrtVars) + length(dateVars) + 1,
               ncol(out))
})

test_that("Variables are assigned to appropriate data type", {
  cntnsVars <- Filter(is.cntns, out)

  # test that all cntns variables are numeric
  expect_equal(length(Filter(Negate(is.numeric), cntnsVars)), 0)

  # test that no cntns variables are binary
  expect_equal(length(Filter(is.binary, cntnsVars)), 0)

  # test that all discrete variables are binary, character, or factor
  dscrtVars <- Filter(is.dscrt, out)
  binVars <- Filter(is.binary, dscrtVars)
  charVars <- Filter(Negate(is.binary), dscrtVars)
  charClasses <- unique(sapply(charVars, function(x) class(x)[1]))
  expect_equal(length(setdiff(charClasses, c("character", "factor"))), 0)

  # test that all remaining variables are IDate, except weight
  dateVars <- Filter(is.IDate, out)

  expect_equal(length(names(dateVars)) + length(names(binVars)) +
                 length(names(charVars)) + length(names(cntnsVars)) + 1,
               length(names(out)))
})

test_that("varNms parameter works", {
  out <- PrepData("../testthat/rawData.csv", dateNm = "date",
                  weightNm = "weight", dateGp = "weeks", dateGpBp = "weeks",
                  dateFt = "%d-%m-%Y", varNms = c("age", "balance"))
  cntnsVars <- Filter(is.cntns, out)
  dscrtVars <- Filter(is.dscrt, out)
  dateVars <- Filter(is.IDate, out)
  expect_equal(length(cntnsVars), 2)
  expect_equal(length(dscrtVars), 0)
  expect_equal(length(dateVars), 2)

  out <- PrepData("../testthat/rawData.csv", dateNm = "date",
                  weightNm = "weight", dateGp = "weeks", dateGpBp = "weeks",
                  dateFt = "%d-%m-%Y", varNms = c(1, 4))
  cntnsVars <- Filter(is.cntns, out)
  dscrtVars <- Filter(is.dscrt, out)
  dateVars <- Filter(is.IDate, out)
  expect_equal(length(cntnsVars), 2)
  expect_equal(length(dscrtVars), 0)
  expect_equal(length(dateVars), 2)
})

test_that("selectCols and dropCols work as expected for csv file", {

  # Test that selectCols works alone
  out <- PrepData("../testthat/rawData.csv", dateNm = "date",
                  weightNm = "weight", dateGp = "weeks", dateGpBp = "weeks",
                  dateFt = "%d-%m-%Y",
                  selectCols = c("age", "balance", "date", "weight"))
  cntnsVars <- Filter(is.cntns, out)
  dscrtVars <- Filter(is.dscrt, out)
  dateVars <- Filter(is.IDate, out)
  expect_equal(length(cntnsVars), 2)
  expect_equal(length(dscrtVars), 0)
  expect_equal(length(dateVars), 2)

  out <- PrepData("../testthat/rawData.csv", dateNm = "date",
                  weightNm = "weight", dateGp = "weeks", dateGpBp = "weeks",
                  dateFt = "%d-%m-%Y",
                  selectCols = c(1, 4, 7, 6))
  cntnsVars <- Filter(is.cntns, out)
  dscrtVars <- Filter(is.dscrt, out)
  dateVars <- Filter(is.IDate, out)
  expect_equal(length(cntnsVars), 2)
  expect_equal(length(dscrtVars), 0)
  expect_equal(length(dateVars), 2)

  # test that dropCols works alone
  out <- PrepData("../testthat/rawData.csv", dateNm = "date",
                  weightNm = "weight", dateGp = "weeks", dateGpBp = "weeks",
                  dateFt = "%d-%m-%Y",
                  dropCols = c("job", "marital", "default"))
  cntnsVars <- Filter(is.cntns, out)
  dscrtVars <- Filter(is.dscrt, out)
  dateVars <- Filter(is.IDate, out)
  expect_equal(length(cntnsVars), 2)
  expect_equal(length(dscrtVars), 0)
  expect_equal(length(dateVars), 2)

  out <- PrepData("../testthat/rawData.csv", dateNm = "date",
                  weightNm = "weight", dateGp = "weeks", dateGpBp = "weeks",
                  dateFt = "%d-%m-%Y",
                  dropCols = c(2:3, 5))
  cntnsVars <- Filter(is.cntns, out)
  dscrtVars <- Filter(is.dscrt, out)
  dateVars <- Filter(is.IDate, out)
  expect_equal(length(cntnsVars), 2)
  expect_equal(length(dscrtVars), 0)
  expect_equal(length(dateVars), 2)
})

test_that("selectCols and dropCols work as expected for RData file", {

  # Test that selectCols works alone
  out <- PrepData("../testthat/rawData.rda", dateNm = "date",
                  weightNm = "weight", dateGp = "weeks", dateGpBp = "weeks",
                  dateFt = "%d-%m-%Y",
                  selectCols = c("age", "balance", "date", "weight"))
  cntnsVars <- Filter(is.cntns, out)
  dscrtVars <- Filter(is.dscrt, out)
  dateVars <- Filter(is.IDate, out)
  expect_equal(length(cntnsVars), 2)
  expect_equal(length(dscrtVars), 0)
  expect_equal(length(dateVars), 2)

  out <- PrepData("../testthat/rawData.rda", dateNm = "date",
                  weightNm = "weight", dateGp = "weeks", dateGpBp = "weeks",
                  dateFt = "%d-%m-%Y",
                  selectCols = c(1, 4, 7, 6))
  cntnsVars <- Filter(is.cntns, out)
  dscrtVars <- Filter(is.dscrt, out)
  dateVars <- Filter(is.IDate, out)
  expect_equal(length(cntnsVars), 2)
  expect_equal(length(dscrtVars), 0)
  expect_equal(length(dateVars), 2)

  # test that dropCols works alone
  out <- PrepData("../testthat/rawData.rda", dateNm = "date",
                  weightNm = "weight", dateGp = "weeks", dateGpBp = "weeks",
                  dateFt = "%d-%m-%Y",
                  dropCols = c("job", "marital", "default"))
  cntnsVars <- Filter(is.cntns, out)
  dscrtVars <- Filter(is.dscrt, out)
  dateVars <- Filter(is.IDate, out)
  expect_equal(length(cntnsVars), 2)
  expect_equal(length(dscrtVars), 0)
  expect_equal(length(dateVars), 2)

  out <- PrepData("../testthat/rawData.rda", dateNm = "date",
                  weightNm = "weight", dateGp = "weeks", dateGpBp = "weeks",
                  dateFt = "%d-%m-%Y",
                  dropCols = c(2:3, 5))
  cntnsVars <- Filter(is.cntns, out)
  dscrtVars <- Filter(is.dscrt, out)
  dateVars <- Filter(is.IDate, out)
  expect_equal(length(cntnsVars), 2)
  expect_equal(length(dscrtVars), 0)
  expect_equal(length(dateVars), 2)
})

test_that("dropConstants works as expected", {

  # test that attempting to group at too coarse a level results in the
  # grouping variable being dropped
  out <- suppressWarnings(
    PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
             dateGp = "weeks", dateGpBp = "quarters", dateFt = "%d-%m-%Y",
             dropConstants = TRUE))
  expect_warning(
    PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
             dateGp = "weeks", dateGpBp = "quarters", dateFt = "%d-%m-%Y",
             dropConstants = TRUE),
    "The following variables have no variability")
  expect_null(out[["quarters"]])

  # test that when dropConstants is set to FALSE, the constant grouping
  # variable is retained, with a warning
  out <- suppressWarnings(
    PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
             dateGp = "weeks", dateGpBp = "quarters", dateFt = "%d-%m-%Y",
             dropConstants = FALSE))
  expect_warning(
    PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
             dateGp = "weeks", dateGpBp = "quarters", dateFt = "%d-%m-%Y",
             dropConstants = FALSE),
    "variability in grouping")
  expect_equal(length(unique(out[["quarters"]])), 1)

})

test_that("integer64 data doesn't cause problems", {
  # Skip (rather than fail obscurely) when bit64 is unavailable; require()
  # would only return FALSE and let the test continue.
  skip_if_not_installed("bit64")
  library(bit64)

  out <- suppressWarnings(
    PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
             dateGp = "weeks", dateGpBp = "months", dateFt = "%d-%m-%Y"))
  out[, balance := as.integer64(balance)]
  # NOTE(review): the result is discarded, so the expectation below presumably
  # relies on PrepData converting `out` by reference -- confirm against
  # PrepData.
  PrepData(out, dateNm = "date", weightNm = "weight",
           dateGp = "weeks", dateGpBp = "months", dateFt = "%d-%m-%Y")
  expect_false(is.integer64(out[, balance]))

  out <- suppressWarnings(
    PrepData("../testthat/rawData_bigint.csv", dateNm = "date",
             weightNm = "weight", dateGp = "weeks", dateGpBp = "months",
             dateFt = "%d-%m-%Y"))
  expect_false(is.integer64(out[, bigint]))
})

test_that("Incorrect data input file generates error", {
  # Call the real PrepData: the previous call to the non-existent `PrepD`
  # made this expectation pass on "could not find function" regardless of how
  # PrepData treats an unsupported input file.
  expect_error(PrepData("../testthat/PlotHistogram.RDS", dateNm = "date"))
})
-------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /R/vlm.R: -------------------------------------------------------------------------------- 1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC 2 | # SPDX-License-Identifier: Apache-2.0 3 | # Copyright 2017 Capital One Services, LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # 8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software distributed 11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 12 | # OF ANY KIND, either express or implied. 13 | # 14 | # See the License for the specific language governing permissions and limitations under the License. 
15 | 16 | 17 | ########################################### 18 | # The Main Function # 19 | ########################################### 20 | 21 | #' Create over time variable plots and summary statistics for variable level monitoring 22 | #' 23 | #' Sorts variables according to either user input or correlation with time 24 | #' (among numerical variables only), and creates output files including: 25 | #' \itemize{ 26 | #' \item A PDF file of plots saved as \code{outFl}.pdf, with each individual page 27 | #' on one variable. Variables are plotted in the order indicated in the argument 28 | #' \code{sortVars} or \code{sortFn}. 29 | #' For each numerical variable, the output plots include 30 | #' \itemize{ 31 | #' \item side-by-side boxplots grouped by \code{dateGpBp} (left), 32 | #' \item a trace plot of p1, p50, and p99 percentiles, grouped by \code{dateGp} 33 | #' (top right), 34 | #' \item a trace plot of mean and +-1 SD control limits, grouped by 35 | #' \code{dateGp} (middle right), and 36 | #' \item a trace plot of missing and zero rates, grouped by \code{dateGp} 37 | #' (bottom right). 38 | #' } 39 | #' For each categorical variable (including a numerical variable with no more 40 | #' than 2 unique levels not including NA), the output plots include 41 | #' \itemize{ 42 | #' \item a frequency bar plot (left), and 43 | #' \item a grid of trace plots on categories' proportions over time (right). 44 | #' If the variable contains more than \code{kCategories} number of 45 | #' categories, trace plots of only the largest \code{kCategories} will be 46 | #' plotted. If the variable contains only two categories, then only the 47 | #' trace plot of the less prevalent category will be plotted. 48 | #' } 49 | #' \item CSV file(s) on summary statistics of variables, both globally and over 50 | #' time aggregated by \code{dateGp}. The order of variables in the CSV files 51 | #' is the same as in the PDF file.
52 | #' \itemize{ 53 | #' \item For numerical variables, number of observations (counts), p1, p25, 54 | #' p50, p75, and p99 quantiles, mean, SD, missing and zero rates are saved 55 | #' as \code{outFl}_numerical_summary.csv. 56 | #' \item For categorical variables, number of observations (counts) and 57 | #' categories' proportions are saved as \code{outFl}_categorical_summary.csv. 58 | #' Each row is a category of a categorical (or binary) variable. 59 | #' The row whose \code{category == 'NA'} corresponds to missing. Categories 60 | #' within the same variable are ordered by global prevalence in descending 61 | #' order. 62 | #' } 63 | #' } 64 | #' 65 | #' If the argument \code{dataNeedPrep} is set to \code{FALSE}, then 66 | #' \itemize{ 67 | #' \item \code{dataFl} must be a \code{data.table} containing variables 68 | #' \code{weightNm}, \code{dateNm}, \code{dateGp}, and \code{dateGpBp}, and 69 | #' names of these variables must be the same as the corresponding arguments 70 | #' of the \code{\link{vlm}} function. 71 | #' \item the arguments \code{selectCols}, \code{dropCols}, \code{dateFt}, 72 | #' \code{dropConstants} will be ignored by the \code{\link{vlm}} function. 73 | #' \item When analyzing a dataset for the first time, it is recommended to first 74 | #' run the \code{\link{PrepData}} function on it, and then apply the 75 | #' \code{\link{vlm}} function with the argument \code{dataNeedPrep = FALSE}. 76 | #' Please see the examples for details. 77 | #' } 78 | #' 79 | #' @inheritParams PrepData 80 | #' @inheritParams PrepLabels 81 | #' @inheritParams OrderByR2 82 | #' @inheritParams PrintPlots 83 | #' @param sortVars Determines which variables are plotted and their order. 84 | #' Either a character vector of variable names (to plot variables in the same 85 | #' order as in the \code{sortVars} argument), or \code{NULL} to keep the 86 | #' original ordering, with numerical variables being plotted before 87 | #' categorical and binary ones.
\code{sortVars} should be \code{NULL} when the 88 | #' \code{sortFn} argument is used. 89 | #' @param sortFn The name of a sorting function, given as a character string 90 | #' (e.g. \code{"OrderByR2"}), which returns \code{sortVars} as an output. The function is called with the following arguments: \code{dataFl}, 91 | #' \code{dateNm}, \code{buildTm}, \code{weightNm}, \code{kSample}. Currently, 92 | #' the only built-in sorting function is \code{\link{OrderByR2}}, which sorts 93 | #' numerical variables in the order of strength of linear association with date, 94 | #' and adds categorical (and binary) variables sorted in alphabetical order 95 | #' after the numerical ones. 96 | #' @param dataNeedPrep Logical, indicates if data should be run through the 97 | #' \code{\link{PrepData}} function. This should be set to \code{TRUE} unless 98 | #' the \code{\link{PrepData}} function has been applied to the input data 99 | #' \code{dataFl}. 100 | #' @export 101 | #' 102 | #' @seealso This function depends on: 103 | #' \code{\link{PrintPlots}}, 104 | #' \code{\link{OrderByR2}}, 105 | #' \code{\link{PrepData}}, 106 | #' \code{\link{PrepLabels}}. 107 | #' 108 | #' @section License: Copyright 2017 Capital One Services, LLC Licensed under the 109 | #' Apache License, Version 2.0 (the "License"); you may not use this file 110 | #' except in compliance with the License. You may obtain a copy of the License 111 | #' at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable 112 | #' law or agreed to in writing, software distributed under the License is 113 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 114 | #' KIND, either express or implied. See the License for the specific language 115 | #' governing permissions and limitations under the License.
#' @examples
#' ## Load the data and its label
#' data(bankData)
#' data(bankLabels)
#'
#' ## The PrepData function should only need to be run once on a dataset,
#' ## after that vlm can be run with the argument dataNeedPrep = FALSE
#' bankData <- PrepData(bankData, dateNm = "date", dateGp = "months",
#'                      dateGpBp = "quarters")
#' bankLabels <- PrepLabels(bankLabels)
#'
#'\dontrun{
#' vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
#'     sortFn = "OrderByR2", dateGp = "months", dateGpBp = "quarters",
#'     outFl = "bank")
#'
#' ## If csv files of summary statistics are not needed, set genCSV = FALSE
#' vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, genCSV = FALSE,
#'     sortFn = "OrderByR2", dateGp = "months", dateGpBp = "quarters",
#'     outFl = "bank")
#'
#' ## If weights are provided, they will be used in all statistical calculations
#' bankData[, weight := rnorm(.N, 1, .1)]
#' vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
#'     dateGp = "months", dateGpBp = "quarters", weightNm = "weight",
#'     outFl = "bank")
#'
#' ## Customize plotting order by passing a vector of variable names to
#' ## sortVars, but the "date" column must be excluded from sortVars
#' sortVars <- sort(bankLabels[varCol!="date", varCol])
#' vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
#'     dateGp = "months", dateGpBp = "quarters", outFl = "bank",
#'     sortVars = sortVars)
#'
#' ## Create plots for a specific variable using the varNms parameter
#' vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
#'     dateGp = "months", dateGpBp = "quarters", outFl = "bank",
#'     varNms = "age", sortVars = NULL)
#'}

vlm <- function(dataFl, dateNm, labelFl = NULL, outFl = "otvplots",
                genCSV = TRUE, dataNeedPrep = FALSE, dateGp = NULL,
                dateGpBp = NULL, weightNm = NULL, varNms = NULL,
                sortVars = NULL, sortFn = NULL, selectCols = NULL,
                dropCols = NULL, dateFt = "%d%h%Y", buildTm = NULL,
                highlightNms = NULL, skewOpt = NULL, kSample = 50000,
                fuzzyLabelFn = NULL, dropConstants = FALSE, kCategories = 9, ...) {

  ## Assert statements about inputs. All of these are scalar conditions, so the
  ## short-circuiting && is used instead of the elementwise &.
  if (!is.null(sortVars) && !is.null(sortFn)) {
    stop("Please choose between sortVars (predetermined order of plotting) and ",
         "sortFn (function to determine plotting order)")
  }

  if (!is.null(sortVars) && !is.null(varNms) && !all(varNms %in% sortVars)) {
    stop("Please make certain that varNms is a subset of sortVars")
  }

  if (!is.null(selectCols) && !is.null(dropCols)) {
    stop("Please choose between selectCols or dropCols.")
  }

  ## Apply the PrepData function if not previously on dataFl
  if (dataNeedPrep) {
    dataFl <- PrepData(dataFl = dataFl, dateNm = dateNm,
                       selectCols = selectCols, dropCols = dropCols,
                       dateFt = dateFt, dateGp = dateGp, dateGpBp = dateGpBp,
                       weightNm = weightNm, varNms = varNms,
                       dropConstants = dropConstants, ...)
  } else {
    ## Without preparation the input must already be a data.table holding
    ## every column that is referred to by name
    stopifnot(is.data.table(dataFl) &&
                all(c(weightNm, dateNm, dateGp, dateGpBp) %in% names(dataFl)))
    ## Change integer64 data type to numeric (updates dataFl by reference)
    for (var in names(dataFl)) {
      if (inherits(dataFl[[var]], "integer64")) {
        dataFl[, (var) := as.numeric(get(var))]
      }
    }
  }

  ## Apply the PrepLabels function
  labelFl <- PrepLabels(labelFl)

  ## Apply sortFn to generate sortVars. do.call accepts either a function
  ## object or the name of one, so both forms are honoured here; previously a
  ## function object was silently ignored and the default ordering was used.
  if (!is.null(sortFn) && (is.character(sortFn) || is.function(sortFn))) {
    sortVars <- do.call(sortFn, list(dataFl = dataFl, dateNm = dateNm,
                                     buildTm = buildTm, weightNm = weightNm,
                                     kSample = kSample))
  } else if (is.null(sortVars)) {
    ## Default ordering: numerical variables first, then categorical ones.
    ## vapply keeps the result a logical vector even when dataFl has no columns.
    isNum <- vapply(dataFl, inherits, logical(1), what = "nmrcl")
    isCat <- vapply(dataFl, inherits, logical(1), what = "ctgrl")
    sortVars <- c(names(dataFl)[isNum], names(dataFl)[isCat])
  }

  ## When only selected variables are requested, restrict both the data and the
  ## plotting order to them. unique() avoids duplicated columns when the same
  ## name is used for several roles (e.g. dateGp == dateGpBp).
  if (!is.null(varNms)) {
    keepCols <- unique(c(varNms, dateNm, dateGp, dateGpBp, weightNm))
    dataFl <- dataFl[, keepCols, with = FALSE]
    sortVars <- sortVars[sortVars %in% varNms]
  }

  ## Create the plots (and, if requested, the summary csv files)
  PrintPlots(outFl = outFl, dataFl = dataFl, sortVars = sortVars,
             dateNm = dateNm, dateGp = dateGp, dateGpBp = dateGpBp,
             weightNm = weightNm, labelFl = labelFl, genCSV = genCSV,
             highlightNms = highlightNms, skewOpt = skewOpt,
             kSample = kSample, fuzzyLabelFn = fuzzyLabelFn,
             kCategories = kCategories)
}

-------------------------------------------------------------------------------- /R/plot_print.R: -------------------------------------------------------------------------------- 1 | # SPDX-Copyright: Copyright (c) Capital One
Services, LLC 2 | # SPDX-License-Identifier: Apache-2.0 3 | # Copyright 2017 Capital One Services, LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # 8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software distributed 11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 12 | # OF ANY KIND, either express or implied. 13 | # 14 | # See the License for the specific language governing permissions and limitations under the License. 15 | 16 | 17 | ########################################### 18 | # Create output # 19 | ########################################### 20 | 21 | #' Create a pdf file with plots and compute summary statistics for all variables 22 | #' 23 | #' Creates plots and outputs results to a letter-sized pdf file, with each 24 | #' individual page containing plots on a single variable in the data. In 25 | #' addition, two summary statistics \code{data.table} are returned, one for 26 | #' numerical variables, and one for categorical (and binary) ones. 27 | #' 28 | #' @inheritParams PlotVar 29 | #' @param outFl Name of the output file, with no extension names (e.g., "bank"). 30 | #' A pdf file of plots ("bank.pdf"), and two csv files of summary statistics 31 | #' ("bank_categorical_summary.csv" and "bank_numerical_summary.csv") will be 32 | #' saved to your working directory, unless a path is included in \code{outFl} 33 | #' (e.g. "../plots/bank"). 34 | #' @param genCSV Logical, whether to generate the two csv files of summary 35 | #' statistics for numerical and categorical variables. 36 | #' @param sortVars A character vector of variable names in the order they will 37 | #' be plotted. 
#' @return A pdf of plots saved to file \code{outFl}.pdf, and if the argument
#'   \code{genCSV == TRUE}, also two csv files of summary statistics for
#'   numerical and categorical variables. The function itself returns
#'   \code{NULL} invisibly.
#'
#' @seealso Functions that depend on this function:
#'          \code{\link{vlm}}.
#' @seealso This function depends on:
#'          \code{\link{PlotVar}},
#'          \code{\link{PrepData}}.
#'
#' @section License:
#' Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
#' Version 2.0 (the "License"); you may not use this file except in compliance
#' with the License. You may obtain a copy of the License at
#' http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
#' or agreed to in writing, software distributed under the License is
#' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#' KIND, either express or implied. See the License for the specific language
#' governing permissions and limitations under the License.
#' @export
PrintPlots <- function(outFl, dataFl, sortVars, dateNm, dateGp, dateGpBp,
                       weightNm = NULL, labelFl = NULL, genCSV = TRUE,
                       highlightNms = NULL, skewOpt = NULL, kSample = 50000,
                       fuzzyLabelFn = NULL, kCategories = 9) {

  ## Placeholder binding for the column named `.` that is dropped from the
  ## reshaped counts table below
  . <- NULL

  ## Build one entry (plot grob, summary table, variable type) per variable.
  ## kSample is forwarded explicitly; it used to be dropped here, so PlotVar
  ## always ran with its own default regardless of the caller's value.
  plotList <-
    lapply(sortVars, PlotVar,
           dataFl = dataFl, weightNm = weightNm, dateNm = dateNm,
           dateGp = dateGp, dateGpBp = dateGpBp, labelFl = labelFl,
           highlightNms = highlightNms, skewOpt = skewOpt, kSample = kSample,
           fuzzyLabelFn = fuzzyLabelFn, kCategories = kCategories)

  ## Draw one page per variable. The device is closed in `finally` so that an
  ## error while drawing does not leave the pdf device open.
  grDevices::pdf(file = paste0(outFl, ".pdf"), width = 11, height = 8,
                 pointsize = 12, onefile = TRUE)
  tryCatch(
    for (x in plotList) {
      grid::grid.newpage()
      grid::grid.draw(x$p)
    },
    finally = grDevices::dev.off()
  )

  if (!isTRUE(genCSV)) {
    return(invisible(NULL))
  }

  ## Stack the per-variable summaries by type, in plotting order. Each result
  ## stays NULL when there is no variable of that type.
  varTypes <- vapply(plotList, function(x) x$varType, character(1))
  summaries <- unname(lapply(plotList, function(x) x$varSummary))
  numSummary <- do.call(rbind, summaries[varTypes == "nmrcl"])
  catSummary <- do.call(rbind, summaries[varTypes == "ctgrl"])

  ## Compute counts (or summed weights) in each time period, reshaped to a
  ## single row with one column per period
  if (is.null(weightNm)) {
    total_counts <- dataFl[, list(count = .N), by = dateGp]
  } else {
    total_counts <- dataFl[, list(count = sum(get(weightNm))), by = dateGp]
  }
  names(total_counts)[1] <- "date_group"
  total_counts <- dcast(total_counts, . ~ date_group, value.var = "count")
  total_counts[, . := NULL]

  ## For numerical variables
  if (!is.null(numSummary)) {
    ## Add a row of counts at the beginning of numSummary: two identifier
    ## columns, the overall total, then the per-period counts
    numSummary <- rbind(as.list(rep(NA, ncol(numSummary))), numSummary)
    numSummary[1, 1:2] <- list("ALL_DATA", "COUNTS")
    numSummary[1, 3] <- sum(total_counts)
    numSummary[1, names(numSummary)[-(1:3)] := total_counts]
    ## Write the csv file
    fwrite(numSummary, file = paste0(outFl, "_numerical_summary.csv"))
  }

  ## For categorical variables
  if (!is.null(catSummary)) {
    ## Add a row of counts at the beginning of catSummary: two identifier
    ## columns, the overall total, a value of 1 in the fourth column, then the
    ## per-period counts
    catSummary <- rbind(as.list(rep(NA, ncol(catSummary))), catSummary)
    catSummary[1, 1:2] <- list("ALL_DATA", "COUNTS")
    catSummary[1, 3:4] <- list(sum(total_counts), 1)
    catSummary[1, names(catSummary)[-(1:4)] := total_counts]
    ## Write the csv file
    fwrite(catSummary, file = paste0(outFl, "_categorical_summary.csv"))
  }

  invisible(NULL)
}

###############################################
#  Main Plot Function for a single variable   #
###############################################

#' Create over time variable plots and summary statistics for one variable
#'
#' For a numerical variable, the output includes
#' \itemize{
#'  \item side-by-side boxplots grouped by \code{dateGpBp} (left),
#'  \item a trace plot of p1, p50, and p99 percentiles, grouped by \code{dateGp}
#'   (top right),
#'  \item a trace plot of mean and +-1 SD control limits, grouped by
#'  \code{dateGp}(middle right), and
#'  \item a trace plot of missing and zerorates, grouped by \code{dateGp}
#'  (bottom right).
141 | #' } 142 | #' For a categorical variable (including a numerical variable with no more than 2 143 | #' unique levels not including NA), the output includes 144 | #' \itemize{ 145 | #' \item a frequency bar plot (left), and 146 | #' \item a grid of trace plots on categories' proportions over time (right). 147 | #' If the variable contains more than \code{kCategories} number of categories, 148 | #' trace plots of only the largest \code{kCategories} will be plotted. 149 | #' } 150 | #' In addition to plots, a \code{data.table} of global and over-time summary 151 | #' statistics is generated. 152 | #' 153 | #' @inheritParams PlotCatVar 154 | #' @inheritParams PlotNumVar 155 | #' @inheritParams OrderByR2 156 | #' @param dataFl A \code{data.table} containing at least the following columns: 157 | #' \code{myVar}, \code{weightNm}, \code{dateGp}, \code{dateGpBp}; usually an 158 | #' output of the \code{\link{PrepData}} function. 159 | #' @param myVar Name of the variable to be plotted. 160 | #' @param labelFl A \code{data.table} containing variable labels, or \code{NULL} 161 | #' for no labels; usually an output of \code{\link{PrepLabels}}. 162 | #' @param highlightNms Either \code{NULL} or a character vector of variables to 163 | #' receive a red page title. \code{NULL} means all variables will get a 164 | #' black title. The highlight applies whether or not \code{labelFl} is supplied. 165 | #' @param fuzzyLabelFn Either \code{NULL} or a function of 2 parameters: A label 166 | #' file in the format of an output by \code{\link{PrepLabels}} and a string 167 | #' giving a variable name. The function should return the label corresponding 168 | #' to the variable given by the second parameter. This function should 169 | #' describe how fuzzy matching should be performed to find labels (see example 170 | #' below). If \code{NULL}, only exact matches will be returned. 171 | #' @return 172 | #' \item{p}{A \code{grob} (i.e., \code{ggplot} grid) object.
See the output 173 | #' \code{p} of the function \code{\link{PlotNumVar}} or 174 | #' \code{\link{PlotCatVar}} for details.} 175 | #' \item{varSummary}{A \code{data.table} of summary statistics. See the output 176 | #' \code{numVarSummary} of the function \code{\link{PlotNumVar}}, or the 177 | #' output \code{catVarSummary} of the function \code{\link{PlotCatVar}} for 178 | #' details.} 179 | #' \item{varType}{Indicator of the variable's type, either \code{"nmrcl"} or 180 | #' \code{"ctgrl"}.} 181 | #' @export 182 | #' 183 | #' @seealso Functions that depend on this function: 184 | #' \code{\link{PrintPlots}}. 185 | #' @seealso This function depends on: 186 | #' \code{\link{PlotCatVar}}, 187 | #' \code{\link{PlotNumVar}}, 188 | #' \code{\link{PrepData}}. 189 | #' 190 | #' @section License: Copyright 2017 Capital One Services, LLC Licensed under the 191 | #' Apache License, Version 2.0 (the "License"); you may not use this file 192 | #' except in compliance with the License. You may obtain a copy of the License 193 | #' at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable 194 | #' law or agreed to in writing, software distributed under the License is 195 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 196 | #' KIND, either express or implied. See the License for the specific language 197 | #' governing permissions and limitations under the License. 198 | #' @examples 199 | #' data(bankData) 200 | #' bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 201 | #' dateGpBp = "quarters") 202 | #' data(bankLabels) 203 | #' bankLabels <- PrepLabels(bankLabels) 204 | #' 205 | #' ## PlotVar will treat numerical and categorical data differently. 206 | #' ## Binary data is always treated as categorical.
#' plot(PlotVar(bankData, myVar = "duration", weightNm = NULL, dateNm = "date",
#'      dateGp = "months", dateGpBp = "quarters", labelFl = bankLabels)$p)
#' plot(PlotVar(bankData, myVar = "job", weightNm = NULL, dateNm = "date",
#'      dateGp = "months", dateGpBp = "quarters", labelFl = bankLabels)$p)
#' plot(PlotVar(bankData, myVar = "loan", weightNm = NULL, dateNm = "date",
#'      dateGp = "months", dateGpBp = "quarters", labelFl = bankLabels)$p)
#'
PlotVar <- function(dataFl, myVar, weightNm, dateNm, dateGp, dateGpBp = NULL,
                    labelFl = NULL, highlightNms = NULL, skewOpt = NULL,
                    kSample = 50000, fuzzyLabelFn = NULL, kCategories = 9) {

  ## Placeholder bindings for the labelFl columns referenced inside the
  ## data.table expression further down
  varCol <- labelCol <- NULL
  message(paste("Plotting ", myVar))

  ## Fail early with a readable message when the variable does not exist
  if (!(myVar %in% names(dataFl))) {
    stop("Variable '", myVar, "' not found in dataFl")
  }
  column <- dataFl[[myVar]]

  ## Make sure that myVar is not a date type
  if (inherits(column, c("Date", "IDate"))) {
    stop("Cannot plot dates")
  }

  ## Label myVar type to be "nmrcl" or "ctgrl" if not labeled yet. The test
  ## must look at the column: myVar itself is just the name (a character
  ## string) and never carries either class, so testing it was always TRUE.
  if (!(inherits(column, "ctgrl") || inherits(column, "nmrcl"))) {
    ## Character/factor columns and columns with exactly two distinct non-NA
    ## values count as categorical. is.character/is.factor return a single
    ## logical even when the column has several classes, which keeps `||` valid.
    isCategorical <- is.character(column) || is.factor(column) ||
      length(unique(stats::na.omit(column))) == 2
    ## NOTE(review): the tagging expression is kept as originally written;
    ## confirm that dataFl[, get(myVar)] hands setattr the column itself rather
    ## than a copy, otherwise the tag never reaches dataFl.
    setattr(dataFl[, get(myVar)], "class",
            if (isCategorical) "ctgrl" else "nmrcl")
    column <- dataFl[[myVar]]
  }

  ## Generate a grid of plots
  if (inherits(column, "ctgrl")) {
    p_all <- PlotCatVar(myVar, dataFl, weightNm, dateNm, dateGp, kCategories)
    p <- p_all$p
    varSummary <- p_all$catVarSummary
    varType <- "ctgrl"
  } else if (inherits(column, "nmrcl")) {
    p_all <- PlotNumVar(myVar, dataFl, weightNm, dateGp, dateGpBp, skewOpt,
                        kSample)
    p <- p_all$p
    varSummary <- p_all$numVarSummary
    varType <- "nmrcl"
  } else {
    ## Previously this case surfaced later as "object 'p' not found"
    stop("Variable '", myVar, "' is labelled neither 'ctgrl' nor 'nmrcl'; ",
         "consider running PrepData() on dataFl first")
  }

  ## If no fuzzy matching functions are provided, provide exact matches on the
  ## first column, otherwise use logic defined in fuzzyLabelFn
  ll <- myVar
  if (!is.null(labelFl)) {
    if (is.null(fuzzyLabelFn)) {
      ll <- paste0(labelFl[varCol == myVar, labelCol])
    } else {
      ll <- fuzzyLabelFn(labelFl, myVar)
    }
    ll <- paste0(myVar, " (", ll, ")", "\n")
  }

  ## Label color: red for highlighted variables, black otherwise. The same
  ## characters are stripped from highlightNms before matching against myVar.
  subCol <- "black"
  if (!is.null(highlightNms)) {
    highlightNms <- gsub("/|\\-|\"|\\s", "", highlightNms)
    if (myVar %in% highlightNms) {
      # should add other ways to trigger red labels
      subCol <- "red"
    }
  }

  ## Add the page title as myVar and its label above the grid of plots
  subText <- grid::textGrob(ll, gp = grid::gpar(col = subCol, fontface = "bold"))
  p <- gridExtra::arrangeGrob(p, top = subText)

  list(p = p, varSummary = varSummary, varType = varType)
}

-------------------------------------------------------------------------------- /man/vlm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/vlm.R 3 | \name{vlm} 4 | \alias{vlm} 5 | \title{Create over time variable plots and summary statistics for variable level monitoring} 6 | \usage{ 7 | vlm(dataFl, dateNm, labelFl = NULL, outFl = "otvplots", genCSV = TRUE, 8 | dataNeedPrep = FALSE, dateGp = NULL, dateGpBp = NULL, weightNm = NULL, 9 | varNms = NULL, sortVars = NULL, sortFn = NULL, selectCols = NULL, 10 | dropCols = NULL, dateFt = "\%d\%h\%Y", buildTm = NULL, 11 | highlightNms = NULL, skewOpt = NULL, kSample = 50000, 12 | fuzzyLabelFn = NULL, dropConstants = FALSE, kCategories = 9, ...)
13 | } 14 | \arguments{ 15 | \item{dataFl}{Either the name of an object that can be converted using 16 | \code{\link[data.table]{as.data.table}} (e.g., a data frame), or a 17 | character string containing the name of dataset that can be loaded using 18 | \code{\link[data.table]{fread}} (e.g., a csv file). If the dataset is not in 19 | your working directory then \code{dataFl} must include (relative or 20 | absolute) path to file.} 21 | 22 | \item{dateNm}{Name of column containing the date variable.} 23 | 24 | \item{labelFl}{Either the path of a dataset (a csv file) containing 25 | labels, an R object convertible to \code{data.table} (e.g., data frame) or 26 | \code{NULL}. If \code{NULL}, no labels will be used. The label dataset must 27 | contain at least 2 columns: \code{varCol} (variable names) and 28 | \code{labelCol} (variable labels).} 29 | 30 | \item{outFl}{Name of the output file, with no extension names (e.g., "bank"). 31 | A pdf file of plots ("bank.pdf"), and two csv files of summary statistics 32 | ("bank_categorical_summary.csv" and "bank_numerical_summary.csv") will be 33 | saved to your working directory, unless a path is included in \code{outFl} 34 | (e.g. "../plots/bank").} 35 | 36 | \item{genCSV}{Logical, whether to generate the two csv files of summary 37 | statistics for numerical and categorical variables.} 38 | 39 | \item{dataNeedPrep}{Logical, indicates if data should be run through the 40 | \code{\link{PrepData}} function. This should be set to \code{TRUE} unless 41 | the \code{\link{PrepData}} function has been applied to the input data 42 | \code{dataFl}.} 43 | 44 | \item{dateGp}{Name of the variable that the time series plots should be 45 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 46 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for 47 | details. 
If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.} 48 | 49 | \item{dateGpBp}{Name of variable the boxplots should be grouped by. Same 50 | options as \code{dateGp}. If \code{NULL}, then \code{dateGp} will be used.} 51 | 52 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 53 | no weights (all rows receiving weight 1).} 54 | 55 | \item{varNms}{Either \code{NULL} or a vector of names or indices of variables 56 | to be plotted. If \code{NULL}, will default to all columns which are not 57 | \code{dateNm} or \code{weightNm}. Can also be a vector of indices of the 58 | column names, after \code{dropCols} or \code{selectCols} have been applied, 59 | if applicable, and not including \code{dateGp}, \code{dateGpBp} 60 | (which will be added to the \code{dataFl} by the function 61 | \code{\link{PrepData}}).} 62 | 63 | \item{sortVars}{Determines which variables are plotted and their order. 64 | Either a character vector of variable names (to plot variables in the same 65 | order as in the \code{sortVars} argument), or \code{NULL} to keep the 66 | original ordering, with numerical variables being plotted before 67 | categorical and binary ones. \code{sortVars} should be \code{NULL} when the 68 | \code{sortFn} argument is used.} 69 | 70 | \item{sortFn}{A sorting function which returns \code{sortVars} as an output. 71 | The function may take the following variables as input: \code{dataFl}, 72 | \code{dateNm}, \code{buildTm}, \code{weightNm}, \code{kSample}. 
Currently, 73 | the only built-in sorting function is \code{\link{OrderByR2}}, which sorts 74 | numerical variables in the order of strength of linear association with date, 75 | and adds categorical (and binary) variables sorted in alphabetical order 76 | after the numerical ones.} 77 | 78 | \item{selectCols}{Either \code{NULL}, or a vector of names or indices of 79 | variables to read into memory -- must include \code{dateNm}, 80 | \code{weightNm} (if not \code{NULL}) and all variables to be plotted. If 81 | both \code{selectCols} and \code{dropCols} are \code{NULL}, then all 82 | variables will be read in.} 83 | 84 | \item{dropCols}{Either \code{NULL}, or a vector of variable names or indices 85 | of variables not to read into memory. If both \code{selectCols} and 86 | \code{dropCols} are \code{NULL}, then all variables will be read in.} 87 | 88 | \item{dateFt}{\code{\link{strptime}} format of date variable. The default is SAS 89 | format \code{"\%d\%h\%Y"}. But input data with R date format 90 | \code{"\%Y-\%m-\%d"} will also be detected. Both formats can be 91 | parsed automatically.} 92 | 93 | \item{buildTm}{Vector identifying the time period for ranking/anomaly detection 94 | (most likely model build period). Allows for a subset of plotting time 95 | period to be used for anomaly detection. 96 | \itemize{ 97 | \item Must be a vector of dates and must be inclusive i.e. buildTm[1] 98 | <= date <= buildTm[2] will define the time period. 99 | \item Must be either \code{NULL}, a vector of length 2, or a vector of 100 | length 3. 101 | \item If \code{NULL}, the entire dataset will be used for 102 | ranking/anomaly detection. 103 | \item If a vector of length 2, the format of the dates must be 104 | a character vector in default R date format (e.g. "2017-01-30"). 105 | \item If a vector of length 3, the first two elements must contain dates 106 | in any strptime format, while the 3rd element contains the strptime 107 | format (see \code{\link{strptime}}). 
108 | \item The following are equivalent ways of selecting 109 | all of 2014: 110 | \itemize{ 111 | \item \code{c("2014-01-01","2014-12-31")} 112 | \item \code{c("01JAN2014","31DEC2014", "\%d\%h\%Y")} 113 | } 114 | }} 115 | 116 | \item{highlightNms}{Either \code{NULL} or a character vector of variables to 117 | receive a red label. Currently \code{NULL} means all variables will get a 118 | black legend. This argument is ignored if \code{labelFl == NULL}.} 119 | 120 | \item{skewOpt}{Either a numeric constant or \code{NULL}. Default is 121 | \code{NULL} (no transformation). If numeric, say 5, then all box plots of 122 | a variable whose skewness exceeds 5 will be on a log10 scale if possible. 123 | Negative input of \code{skewOpt} will be converted to 3.} 124 | 125 | \item{kSample}{Either \code{NULL} or a positive integer. If an integer, 126 | indicates the sample size for both drawing boxplots and ordering numerical 127 | graphs by \eqn{R^2}. When the data is large, setting \code{kSample} to a 128 | reasonable value (default is 50K) dramatically improves processing speed. 129 | Therefore, for larger datasets (e.g. > 10 percent system memory), this 130 | parameter should not be set to \code{NULL}, or boxplots may take a very 131 | long time to render. This setting has no impact on the accuracy of time 132 | series plots on quantiles, mean, SD, and missing and zero rates.} 133 | 134 | \item{fuzzyLabelFn}{Either \code{NULL} or a function of 2 parameters: A label 135 | file in the format of an output by \code{\link{PrepLabels}} and a string 136 | giving a variable name. The function should return the label corresponding 137 | to the variable given by the second parameter. This function should 138 | describe how fuzzy matching should be performed to find labels (see example 139 | below). 
If \code{NULL}, only exact matches will be returned.} 140 | 141 | \item{dropConstants}{Logical, indicates whether or not constant (all 142 | duplicated or NA) variables should be dropped from \code{dataFl} prior to 143 | plotting.} 144 | 145 | \item{kCategories}{If a categorical variable has more than \code{kCategories}, 146 | trace plots of only the \code{kCategories} most prevalent categories are 147 | plotted.} 148 | 149 | \item{...}{Additional parameters to be passed to 150 | \code{\link[data.table]{fread}}.} 151 | } 152 | \description{ 153 | Sorts variables according to either user input or correlation with time 154 | (among numerical variables only), and create output files including: 155 | \itemize{ 156 | \item A PDF file of plots saved as \code{outFl}.pdf, with each individual page 157 | on one variable. Variables are plotted in the order indicated in the argument 158 | \code{sortVars} or \code{sortFn}. 159 | For each numerical variable, the output plots include 160 | \itemize{ 161 | \item side-by-side boxplots grouped by \code{dateGpBp} (left), 162 | \item a trace plot of p1, p50, and p99 percentiles, grouped by \code{dateGp} 163 | (top right), 164 | \item a trace plot of mean and +-1 SD control limits, grouped by 165 | \code{dateGp}(middle right), and 166 | \item a trace plot of missing and zero rates, grouped by \code{dateGp} 167 | (bottom right). 168 | } 169 | For each categorical variable (including a numerical variable with no more 170 | than 2 unique levels not including NA), the output plots include 171 | \itemize{ 172 | \item a frequency bar plot (left), and 173 | \item a grid of trace plots on categories' proportions over time (right). 174 | If the variable contains more than \code{kCategories} number of 175 | categories, trace plots of only the largest \code{kCategories} will be 176 | plotted. If the variable contains only two categories, then only the 177 | trace plot of the less prevalent category will be plotted. 
178 | } 179 | \item CSV file(s) on summary statistics of variable, both globally and over 180 | time aggregated by \code{dateGp}. The order of variables in the CSV files 181 | are the same as in the PDF file. 182 | \itemize{ 183 | \item For numerical variables, number of observations (counts), p1, p25, 184 | p50, p75, and p99 quantiles, mean, SD, missing and zero rates are saved 185 | as \code{outFl}_numerical_summary.csv. 186 | \item For categorical variables, number of observations (counts) and 187 | categories' proportions are saved as \code{outFl}_categorical_summary.csv. 188 | Each row is a category of a categorical (or binary) variable. 189 | The row whose \code{category == 'NA'} corresponds to missing. Categories 190 | among the same variable are ordered by global prevalence in a descending 191 | order. 192 | } 193 | } 194 | } 195 | \details{ 196 | If the argument \code{dataNeedPrep} is set to \code{FALSE}, then 197 | \itemize{ 198 | \item \code{dataFl} must be a \code{data.table} containing variables 199 | \code{weightNm}, \code{dateNm}, \code{dateGp}, and \code{dateGpBp}, and 200 | names of these variables must be the same as the corresponding arguments 201 | of the \code{\link{vlm}} function. 202 | \item the arguments \code{selectCols}, \code{dropCols}, \code{dateFt}, 203 | \code{dropConstants} will be ignored by the \code{\link{vlm}} function. 204 | \item When analyzing a dataset for the first time, it is recommended to first 205 | run the \code{\link{PrepData}} function on it, and then apply the 206 | \code{\link{vlm}} function with the argument \code{dataNeedPrep = FALSE}. 207 | Please see the examples for details. 208 | } 209 | } 210 | \section{License}{ 211 | Copyright 2017 Capital One Services, LLC Licensed under the 212 | Apache License, Version 2.0 (the "License"); you may not use this file 213 | except in compliance with the License. 
You may obtain a copy of the License 214 | at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable 215 | law or agreed to in writing, software distributed under the License is 216 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 217 | KIND, either express or implied. See the License for the specific language 218 | governing permissions and limitations under the License. 219 | } 220 | 221 | \examples{ 222 | ## Load the data and its label 223 | data(bankData) 224 | data(bankLabels) 225 | 226 | ## The PrepData function should only need to be run once on a dataset, 227 | ## after that vlm can be run with the argument dataNeedPrep = FALSE 228 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 229 | dateGpBp = "quarters") 230 | bankLabels <- PrepLabels(bankLabels) 231 | 232 | \dontrun{ 233 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, 234 | sortFn = "OrderByR2", dateGp = "months", dateGpBp = "quarters", 235 | outFl = "bank") 236 | 237 | ## If csv files of summary statistics are not need, set genCSV = FALSE 238 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, genCSV = FALSE, 239 | sortFn = "OrderByR2", dateGp = "months", dateGpBp = "quarters", 240 | outFl = "bank") 241 | 242 | ## If weights are provided, they will be used in all statistical calculations 243 | bankData[, weight := rnorm(.N, 1, .1)] 244 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, 245 | dateGp = "months", dateGpBp = "quarters", weightNm = "weight", 246 | outFl = "bank") 247 | 248 | ## Customize plotting order by passing a vector of variable names to 249 | ## sortVars, but the "date" column must be excluded from sortVars 250 | sortVars <- sort(bankLabels[varCol!="date", varCol]) 251 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, 252 | dateGp = "months", dateGpBp = "quarters", outFl = "bank", 253 | sortVars = sortVars) 254 | 255 | ## Create plots for a specific variable using 
the varNms parameter 256 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, 257 | dateGp = "months", dateGpBp = "quarters", outFl = "bank", 258 | varNms = "age", sortVars = NULL) 259 | } 260 | } 261 | \seealso{ 262 | This function depends on: 263 | \code{\link{PrintPlots}}, 264 | \code{\link{OrderByR2}}, 265 | \code{\link{PrepData}}, 266 | \code{\link{PrepLabels}}. 267 | } 268 | -------------------------------------------------------------------------------- /R/categorical.R: -------------------------------------------------------------------------------- 1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC 2 | # SPDX-License-Identifier: Apache-2.0 3 | # Copyright 2017 Capital One Services, LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # 8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software distributed 11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 12 | # OF ANY KIND, either express or implied. 13 | # 14 | # See the License for the specific language governing permissions and limitations under the License. 15 | 16 | 17 | ########################################### 18 | # Plots for Categorical Data # 19 | ########################################### 20 | #' Create plots and summary statistics for a categorical variable 21 | #' 22 | #' Output plots include a bar plot with cateogries ordered by global counts, 23 | #' and trace plots of categories' proportions over time. This function is also 24 | #' appliable to a binary varible, which is treated as categorical in this 25 | #' package. In addition to plots, a \code{data.table} of summary statistics 26 | #' are generated, on global counts and proportions by cateory, and proportions 27 | #' by category over time. 
28 | #' 29 | #' @inheritParams PrepData 30 | #' @param dataFl A \code{data.table} of data; must be the output of the 31 | #' \code{\link{PrepData}} function. 32 | #' @param myVar The name of the variable to be plotted 33 | #' @param kCategories If a categorical variable has more than \code{kCategories}, 34 | #' trace plots of only the \code{kCategories} most prevalent categories are 35 | #' plotted. 36 | #' @param normBy The normalization factor for rate plots, can be \code{"time"} 37 | #' or \code{"var"}. If \code{"time"}, then for each time period of 38 | #' \code{dateGp}, counts are normalized by the total counts over all 39 | #' categories in that time period. This illustrates changes of categories' 40 | #' proportions over time. If \code{"var"}, then for each category, its counts 41 | #' are normalized by the total counts over time from only this category. This 42 | #' illustrates changes of categories' volumes over time. 43 | #' @export 44 | #' @return 45 | #' \item{p}{A \code{grob} (i.e., \code{ggplot} grid) object, including a 46 | #' bar plot, and trace plots of categories' proportions. If the number of 47 | #' categories is larger than \code{kCategories}, then trace plots of only the 48 | #' \code{kCategories} most prevalent categories are be plotted. For a binary 49 | #' variable, only the trace plot of the less prevalent category is plotted.} 50 | #' \item{catVarSummary}{A \code{data.table}, contains categories' proportions 51 | #' globally, and over-time in each time period in \code{dateGp}. Each row is 52 | #' a category of the categorical (or binary) variable \code{myVar}. The row 53 | #' whose \code{category == 'NA'} corresponds to missing. Categories are 54 | #' ordered by global prevalence in a descending order.} 55 | #' 56 | #' @seealso Functions depend on this function: 57 | #' \code{\link{PlotVar}}, 58 | #' \code{\link{PrintPlots}}, 59 | #' \code{\link{vlm}}. 
60 | #' @seealso This function depends on: 61 | #' \code{\link{PlotBarplot}}, 62 | #' \code{\link{PlotRatesOverTime}}, 63 | #' \code{\link{PrepData}}. 64 | #' 65 | #' @section License: 66 | #' Copyright 2017 Capital One Services, LLC Licensed under the Apache License, 67 | #' Version 2.0 (the "License"); you may not use this file except in compliance 68 | #' with the License. You may obtain a copy of the License at 69 | #' http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law 70 | #' or agreed to in writing, software distributed under the License is 71 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 72 | #' KIND, either express or implied. See the License for the specific language 73 | #' governing permissions and limitations under the License. 74 | #' @examples 75 | #' data(bankData) 76 | #' bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 77 | #' dateGpBp = "quarters", weightNm = NULL) 78 | #' # Single histogram is plotted for job type since there are 12 categories 79 | #' plot(PlotCatVar(myVar = "job", dataFl = bankData, weightNm = NULL, 80 | #' dateNm = "date", dateGp = "months")$p) 81 | #' 82 | #' plot(PlotCatVar(myVar = "job", dataFl = bankData, weightNm = NULL, 83 | #' dateNm = "date", dateGp = "months", kCategories = 12)$p) 84 | #' 85 | #' 86 | #' ## Binary data is treated as categorical, and only the less frequent 87 | #' ## category is plotted over time. 
#' plot(PlotCatVar(myVar = "default", dataFl = bankData, weightNm = NULL,
#'                 dateNm = "date", dateGp = "months")$p)

PlotCatVar <- function(myVar, dataFl, weightNm = NULL, dateNm, dateGp,
                       kCategories = 9, normBy = "time") { #!# previous name: PlotDiscreteVar
  ## `count` is only used as a column name inside a data.table expression
  ## below; it is bound to NULL here (presumably to keep R CMD check quiet
  ## about an undefined global).
  count <- NULL

  ## Left panel: bar plot of the global (optionally weighted) category counts
  barPlot <- PlotBarplot(dataFl = dataFl, myVar = myVar, weightNm = weightNm)

  ## Category names taken from the bar plot's frequency table, most frequent
  ## first; they fix the facet order of the trace plots.
  freqTable <- barPlot$data
  levelsByCount <- as.character(freqTable[order(-count)][[myVar]])

  ## Right panel: trace plots of category rates, plus the summary table
  ratesOverTime <- PlotRatesOverTime(dataFl = dataFl, dateGp = dateGp,
                                     weightNm = weightNm, myVar = myVar,
                                     newLevels = levelsByCount,
                                     normBy = normBy,
                                     kCategories = kCategories)

  ## Bar plot takes one third of the page width, trace plots two thirds
  combined <- gridExtra::arrangeGrob(ggplot2::ggplotGrob(barPlot),
                                     ratesOverTime$p,
                                     widths = c(1, 2))

  list(p = combined, catVarSummary = ratesOverTime$catVarSummary)
}

###########################################
#      Discrete Plotting Functions        #
###########################################
#' Creates a bar plot for a discrete (or binary) variable
#'
#' @inheritParams PlotCatVar
#' @export
#' @return A \code{ggplot} object with a bar plot of \code{myVar}, with
#' categories ordered by descending frequency
#'
#' @seealso Functions depend on this function:
#' \code{\link{PlotCatVar}}.
#' @seealso This function depends on:
#' \code{\link{PrepData}}.
#'
#' @section License:
#' Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
#' Version 2.0 (the "License"); you may not use this file except in compliance
#' with the License. You may obtain a copy of the License at
#' http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
#' or agreed to in writing, software distributed under the License is
#' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#' KIND, either express or implied.
#' See the License for the specific language
#' governing permissions and limitations under the License.
#' @examples
#' data(bankData)
#' bankData <- PrepData(bankData, dateNm = "date", dateGp = "months",
#'                      dateGpBp = "quarters", weightNm = NULL)
#' PlotBarplot(bankData, "job")
#'
#' ## NA will be included as a category if any NA are present
#' bankData[sample.int(.N)[1:1000], education := NA]
#' PlotBarplot(bankData, "education")

PlotBarplot <- function(dataFl, myVar, weightNm = NULL) { #!# previous name: PlotHistogram
  ## `count` is referenced only as a column name in data.table expressions
  count <- NULL

  ## Keep just the plotted variable and, when given, the weight column
  workDt <- dataFl[, c(myVar, weightNm), with = FALSE]

  ## Missing and empty-string values are relabelled "NA" so that they show up
  ## as a category of their own
  workDt[is.na(get(myVar)) | get(myVar) == "", (myVar) := "NA"]

  ## Frequency table of myVar: row counts, or summed weights when weighted
  freqTab <- if (is.null(weightNm)) {
    workDt[, list(count = .N), by = myVar]
  } else {
    workDt[, list(count = sum(get(weightNm))), by = myVar]
  }

  ## Category names in descending order of count; used both as the factor
  ## levels (bar order) and as the axis breaks
  orderedCats <- unlist(freqTab[order(-count), myVar, with = FALSE])
  freqTab[, (myVar) := factor(get(myVar), levels = orderedCats)]

  barAes <- ggplot2::aes_string(x = myVar, y = "count", group = myVar)

  ggplot2::ggplot(freqTab, barAes) +
    ggplot2::geom_bar(stat = "identity") +
    ggplot2::scale_x_discrete(labels = abbreviate, breaks = orderedCats) +
    ggplot2::theme(text = ggplot2::element_text(size = 10))
}


#' Creates trace plots of categories' proportions over time for a discrete (or
#' binary) variable
#'
#' @inheritParams PlotCatVar
#' @param newLevels categories of \code{myVar} in order of global frequency
#' @export
#' @return A list:
#' \item{p}{\code{ggplot}
object, trace plots of categories' proportions 179 | #' \code{myVar} over time.} 180 | #' \item{catVarSummary}{A \code{data.table}, contains categories' proportions 181 | #' globally, and over-time in each time period in \code{dateGp}. Each row is 182 | #' a category of the categorical (or binary) variable \code{myVar}. The row 183 | #' whose \code{category == 'NA'} corresponds to missing. Categories are 184 | #' ordered by global prevalence in a descending order.} 185 | #' 186 | #' @seealso Functions depend on this function: 187 | #' \code{\link{PlotCatVar}}. 188 | #' @seealso This function depends on: 189 | #' \code{\link{PrepData}}. 190 | #' 191 | #' @section License: 192 | #' Copyright 2017 Capital One Services, LLC Licensed under the Apache License, 193 | #' Version 2.0 (the "License"); you may not use this file except in compliance 194 | #' with the License. You may obtain a copy of the License at 195 | #' http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law 196 | #' or agreed to in writing, software distributed under the License is 197 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 198 | #' KIND, either express or implied. See the License for the specific language 199 | #' governing permissions and limitations under the License. 
#' @examples
#' data(bankData)
#' bankData$weight = rpois(nrow(bankData), 5)
#' bankData <- PrepData(bankData, dateNm = "date", dateGp = "months",
#'                      dateGpBp = "quarters", weightNm = "weight")
#' PlotRatesOverTime(dataFl = bankData, dateGp = "months", weightNm = "weight",
#'                   myVar = "job", newLevels = NULL, normBy = "time")
#'
PlotRatesOverTime <- function(dataFl, dateGp, myVar, normBy = "time",
                              weightNm = NULL, newLevels = NULL,
                              kCategories = 9) { #!# previous name: PlotHistOverTime
  ## Fail fast on an unsupported normalization. Without this check the
  ## denominator table is never created and the function stops much later
  ## with an unrelated "object not found" error.
  if (!is.character(normBy) || length(normBy) != 1L ||
      !(normBy %in% c("time", "var"))) {
    stop("normBy must be either \"time\" or \"var\"", call. = FALSE)
  }

  ## These names are only used as column names inside data.table expressions
  N.x <- NULL
  N.y <- NULL
  rate <- NULL
  N <- NULL
  count <- NULL
  global_count <- NULL
  global_rate <- NULL
  variable <- NULL

  ## A subset dataset to work on
  dataSub <- dataFl[, c(dateGp, myVar, weightNm), with = FALSE]
  ## NA (and the empty string) is converted to the character "NA", i.e.,
  ## treated as a category of its own
  dataSub[is.na(get(myVar)) | get(myVar) == "", (myVar) := "NA"]

  ## When the caller gives no category order, derive it from the global
  ## (optionally weighted) frequency table, most frequent first
  if (is.null(newLevels)) {
    if (is.null(weightNm)) {
      glbTotals <- dataSub[, list(count = .N), by = myVar]
    } else {
      glbTotals <- dataSub[, list(count = sum(get(weightNm))), by = myVar]
    }
    newLevels <- glbTotals[order(-count), myVar, with = FALSE][[myVar]]
  }

  ## Column that defines the denominator: the time period ("time") or the
  ## category itself ("var")
  normCol <- if (normBy == "time") dateGp else myVar

  ## countData: counts by category and time (numerator)
  ## countBy:   counts by normCol (denominator)
  if (is.null(weightNm)) {
    countData <- dataSub[, .N, by = c(myVar, dateGp)]
    countBy <- dataSub[, .N, by = c(normCol)]
  } else {
    countData <- dataSub[, list(N = sum(get(weightNm))), by = c(myVar, dateGp)]
    countBy <- dataSub[, list(N = sum(get(weightNm))), by = c(normCol)]
  }

  ## Make sure countData contains every (time, category) combination;
  ## combinations that never occur get a count of zero
  crossLevels <- CJ(unique(countData[[dateGp]]), unique(countData[[myVar]]))
  setnames(crossLevels, c("V1", "V2"), c(dateGp, myVar))
  countData <- merge(crossLevels, countData, all.x = TRUE,
                     by = c(dateGp, myVar))
  countData[is.na(N), N := 0]
  countData[, (myVar) := factor(get(myVar), levels = newLevels)]

  ## Combine numerator and denominator. Both tables carry a column N, so the
  ## merge names them N.x (numerator) and N.y (denominator).
  rateBy <- merge(countData, countBy, by = normCol)
  rateBy[, rate := N.x / N.y]
  rateBy[, (myVar) := factor(get(myVar), levels = newLevels)]

  ## Summary statistics in a wide format: one row per category, one column
  ## per time period, preceded by the global count and rate
  cbytime <- copy(rateBy)
  setnames(cbytime, c(myVar, dateGp), c("category", "date_group"))
  cglobal <- cbytime[, list(global_count = sum(N.x)), by = "category"]
  cglobal[, global_rate := global_count / sum(global_count)]
  cbytime <- dcast(cbytime[, c("date_group", "category", "rate")],
                   category ~ date_group, value.var = "rate")
  cbytime <- merge(cglobal, cbytime, by = "category")
  ## Add the variable name and move it to the first column
  cbytime[, variable := myVar]
  setcolorder(cbytime, c(ncol(cbytime), seq_len(ncol(cbytime) - 1)))
  ## Add an all-zero "NA" row when the variable has no missing values
  if (!("NA" %in% cbytime$category)) {
    cbytime <- rbind(cbytime, as.list(rep(NA, ncol(cbytime))))
    cbytime[nrow(cbytime), 1:2] <- list(myVar, "NA")
    cbytime[nrow(cbytime), 3:ncol(cbytime)] <- 0
  }

  ## Choose the categories to draw.
  ## * Binary variable: only the less frequent category. This helps with
  ##   large class imbalance because all facets share one y-axis range. It is
  ##   decided first so that a small kCategories cannot filter the remaining
  ##   category away and leave an empty plot.
  ## * Otherwise: every category, or only the kCategories most frequent ones.
  if (length(newLevels) == 2) {
    plotData <- rateBy[get(myVar) == newLevels[2]]
  } else if (length(newLevels) <= kCategories) {
    plotData <- rateBy
  } else {
    plotData <- rateBy[get(myVar) %in% newLevels[1:kCategories]]
  }

  p <- ggplot2::ggplot(plotData,
                       ggplot2::aes_string(x = dateGp, y = "rate")) +
    ggplot2::geom_line(stat = "identity") +
    ggplot2::facet_wrap(stats::as.formula(paste("~", myVar))) +
    ggplot2::ylab("") +
    ggplot2::scale_x_date() +
    ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 30, hjust = 1)) +
    ggplot2::scale_y_continuous(labels = scales::percent)

  list(p = p, catVarSummary = cbytime)
}