├── data
    ├── bankData.rda
    └── bankLabels.rda
├── tests
    ├── testthat.R
    └── testthat
    │   ├── rawData.rda
    │   ├── drugLabel.rda
    │   ├── testData.rda
    │   ├── PlotHistogram.RDS
    │   ├── test_PlotCatVar.R
    │   ├── test_PlotNumVar.R
    │   ├── drugLabel.csv
    │   ├── test_vlm.R
    │   ├── test_PlotRates.R
    │   ├── test_PlotMean.R
    │   ├── test_PlotQuantiles.R
    │   ├── test_PlotBarplot.R
    │   ├── test_PlotRatesOverTime.R
    │   ├── test_PlotDist.R
    │   ├── test_CalcR2.R
    │   ├── test_SummaryStats.R
    │   ├── rawData_bigint.csv
    │   ├── test_OrderByR2.R
    │   ├── rawData.csv
    │   └── test_PrepData.R
├── .Rbuildignore
├── figures
    ├── sample_plots_numerical.png
    └── sample_plots_categorical.png
├── .travis.yml
├── cran-comments.md
├── CODEOWNERS
├── .gitignore
├── man
    ├── bankLabels.Rd
    ├── PlotRates.Rd
    ├── PlotQuantiles.Rd
    ├── PlotMean.Rd
    ├── PlotBarplot.Rd
    ├── PrepLabels.Rd
    ├── CalcR2.Rd
    ├── SummaryStats.Rd
    ├── PlotDist.Rd
    ├── bankData.Rd
    ├── PlotRatesOverTime.Rd
    ├── otvPlots.Rd
    ├── OrderByR2.Rd
    ├── PlotNumVar.Rd
    ├── PlotCatVar.Rd
    ├── PrintPlots.Rd
    ├── PrepData.Rd
    ├── PlotVar.Rd
    └── vlm.Rd
├── NAMESPACE
├── DESCRIPTION
├── R
    ├── utils.R
    ├── data.R
    ├── package_otvPlots.R
    ├── plots_order.R
    ├── vlm.R
    ├── plot_print.R
    └── categorical.R
├── README.md
└── LICENSE


/data/bankData.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/data/bankData.rda


--------------------------------------------------------------------------------
/data/bankLabels.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/data/bankLabels.rda


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(otvPlots)
3 | 
4 | test_check("otvPlots")
5 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^\.travis\.yml$
2 | ^figures$
3 | ^cran-comments\.md$
4 | ^\.whitesource$
5 | ^CODEOWNERS$
6 | 


--------------------------------------------------------------------------------
/tests/testthat/rawData.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/tests/testthat/rawData.rda


--------------------------------------------------------------------------------
/tests/testthat/drugLabel.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/tests/testthat/drugLabel.rda


--------------------------------------------------------------------------------
/tests/testthat/testData.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/tests/testthat/testData.rda


--------------------------------------------------------------------------------
/figures/sample_plots_numerical.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/figures/sample_plots_numerical.png


--------------------------------------------------------------------------------
/tests/testthat/PlotHistogram.RDS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/tests/testthat/PlotHistogram.RDS


--------------------------------------------------------------------------------
/figures/sample_plots_categorical.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/figures/sample_plots_categorical.png


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r
2 | 
3 | language: R
4 | sudo: false
5 | cache: packages
6 | 


--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
1 | ## Resubmission
2 | This is a resubmission. In this version I have:
3 | 
4 | * Removed the VignetteBuilder field in DESCRIPTION.
5 | 
6 | * Modified the Description field in DESCRIPTION by removing “this package” from the beginning.


--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # This is a comment.
2 | # Each line is a file pattern followed by one or more owners.
3 | 
4 | # These owners will be the default owners for everything in
5 | # the repo. Unless a later match takes precedence,
6 | # @yingboli and @Yingru will be requested for
7 | # review when someone opens a pull request.
8 | *       @yingboli @Yingru
9 | 


--------------------------------------------------------------------------------
/tests/testthat/test_PlotCatVar.R:
--------------------------------------------------------------------------------
 1 | library(otvPlots)
 2 | library(proto)
 3 | context("Plot categorical variable")
 4 | load("../testthat/testData.rda")
 5 | setDT(testData)
 6 | 
 7 | test_that("PlotCatVar returns a gtable", {
 8 | 	PrepData(testData, dateNm = "date", dateGp = "weeks", dateGpBp = "months")
 9 | 	p <- PlotCatVar("marital", testData, NULL, "weeks", "months")$p	
10 | 	expect_is(p, "gtable")
11 | })
12 | 
13 | 
14 | 
15 | 
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Copyright 2016 Capital One Services, LLC
 2 | # Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 3 | # this file except in compliance with the License.  You may obtain a copy of the
 4 | # License at http://www.apache.org/licenses/LICENSE-2.0
 5 | # Unless required by applicable law or agreed to in writing, software
 6 | # distributed under the License is distributed on an "AS IS" BASIS,
 7 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 8 | # See the License for the specific language governing permissions and limitations under the License. 
 9 | .Rproj.user
10 | .Rhistory
11 | .RData
12 | inst/doc
13 | .pdf
14 | 


--------------------------------------------------------------------------------
/tests/testthat/test_PlotNumVar.R:
--------------------------------------------------------------------------------
 1 | library(otvPlots)
 2 | library(proto)
 3 | context("Plot Continuous Variable")
 4 | load("../testthat/testData.rda")
 5 | setDT(testData)
 6 | 
 7 | test_that("PlotNumVar returns a gtable", {
 8 | 	PrepData(testData, dateNm = "date", dateGp = "weeks", dateGpBp = "months")
 9 | 	p <- PlotNumVar("age", testData, NULL, "weeks", "months", 
10 |                   skewOpt = 3, kSample = NULL)$p	
11 | 	expect_is(p, "gtable")
12 | })
13 | 
14 | test_that("Incorrect skewOpt creates error", {
15 | 	PrepData(testData, dateNm = "date", dateGp = "weeks", dateGpBp = "months")
16 | 	expect_error(PlotNumVar("age", testData, NULL, "weeks", "months", 
17 |                   skewOpt = "test", kSample = NULL)$p)
18 | })
19 | 
20 | 
21 | 
22 | 
23 | 


--------------------------------------------------------------------------------
/man/bankLabels.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{bankLabels}
 5 | \alias{bankLabels}
 6 | \title{Labels for bankData}
 7 | \format{A data frame with 16 rows and 3 variables:
 8 | \describe{
 9 |   \item{V1}{Name of each variable in \code{\link{bankData}}.}
10 |   \item{V2}{Label of each variable in \code{\link{bankData}}.}
11 |   \item{V3}{A numeric variable, corresponding to the row number.}
12 | }}
13 | \usage{
14 | bankLabels
15 | }
16 | \description{
17 | A dataset containing labels for the attributes found in \code{\link{bankData}}.
18 | This data set is used to illustrate the \code{\link{PrepLabels}} function and
19 | other label functionality in the \code{\link{otvPlots}} package in R.
20 | }
21 | \keyword{datasets}
22 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(CalcR2)
 4 | export(OrderByR2)
 5 | export(PlotBarplot)
 6 | export(PlotCatVar)
 7 | export(PlotDist)
 8 | export(PlotMean)
 9 | export(PlotNumVar)
10 | export(PlotQuantiles)
11 | export(PlotRates)
12 | export(PlotRatesOverTime)
13 | export(PlotVar)
14 | export(PrepData)
15 | export(PrepLabels)
16 | export(PrintPlots)
17 | export(SummaryStats)
18 | export(vlm)
19 | import(data.table)
20 | import(ggplot2)
21 | importFrom(Hmisc,wtd.mean)
22 | importFrom(Hmisc,wtd.quantile)
23 | importFrom(Hmisc,wtd.var)
24 | importFrom(grDevices,cairo_pdf)
25 | importFrom(grDevices,dev.off)
26 | importFrom(graphics,par)
27 | importFrom(grid,gpar)
28 | importFrom(grid,grid.draw)
29 | importFrom(grid,grid.newpage)
30 | importFrom(grid,textGrob)
31 | importFrom(grid,unit)
32 | importFrom(grid,unit.c)
33 | importFrom(gridExtra,arrangeGrob)
34 | importFrom(moments,skewness)
35 | importFrom(scales,hue_pal)
36 | importFrom(stats,lm.fit)
37 | importFrom(stats,lm.wfit)
38 | importFrom(stats,quantile)
39 | importFrom(stats,sd)
40 | importFrom(stats,var)
41 | importFrom(stringi,stri_trans_general)
42 | importFrom(utils,tail)
43 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: otvPlots
 2 | Title: Over Time Variable Plots
 3 | Version: 0.2.1
 4 | Authors@R: c(
 5 |     person("Rebecca", "Payne", role = "aut"),
 6 |     person("Zoey", "Zhu", role = c("aut")),
 7 | 		person("Yingbo", "Li", email = "yingbo.li@capitalone.com", role = c("aut", "cre")),
 8 | 		person("Capital One", role = "cph"))
 9 | Description: Enables automated visualization of variable 
10 |     distribution and changes over time for predictive model building.
11 |     Computes summary statistics aggregated by time for 
12 |     large datasets, and creates plots for variable level monitoring.  
13 | Depends:
14 |     R (>= 3.2.0)
15 | Imports:
16 |     data.table (>= 1.9.6),
17 |     ggplot2 (>= 2.1.0),
18 |     grid (>= 3.2.0),
19 |     gridExtra (>= 2.2.1),
20 |     Hmisc (>= 3.17-4),
21 |     moments,
22 |     quantreg (>= 5.33), 
23 |     scales (>= 0.4.0),
24 |     stringi (>= 1.1.1)
25 | License: Apache License 2.0 | file LICENSE
26 | LazyData: true
27 | Suggests:
28 |     bit64,
29 |     knitr,
30 |     proto,
31 |     testthat
32 | URL: https://github.com/capitalone/otvPlots
33 | BugReports: https://github.com/capitalone/otvPlots/issues   
34 | RoxygenNote: 6.0.1
35 | 


--------------------------------------------------------------------------------
/tests/testthat/drugLabel.csv:
--------------------------------------------------------------------------------
 1 | col1,col2,,,,,
 2 | CaseNumber,The case number,,,,,
 3 | date,Date of the test,,,,,
 4 | Sex,Gender of  the patient,,,,,
 5 | Race,Race of the patient,,,,,
 6 | Age,Age of the patient,,,,,
 7 | "Re""side-nce .City",,,,,,
 8 | Residence State,,,,,,
 9 | Residence County,,,,,,
10 | Death City,,,,,,
11 | ,Wrong result,,,,,
12 | Death State,,,,,,
13 | Death County,,,,,,
14 | Location,,,,,,
15 | DescriptionofInjury,The kind of injury the patient has,,,,,
16 | InjuryPlace,The place the injury exists,,,,,
17 | ImmediateCauseA,The cause of the injury,,,,,
18 | Heroin,Level of heroin used,,,,,
19 | Cocaine,Level of Cocaine used,,,,,
20 | Fentanyl,Level of Fentanyl used,,,,,
21 | Oxycodone,Level of Oxycodone used,,,,,
22 | Oxymorphone,Level of Oxymorphone used,,,,,
23 | EtOH,Level of EtOH used,,,,,
24 | Hydro-codeine,Level of Hydro-codeine used,,,,,
25 | Benzodiazepine,Level of Benzodiazepine used,,,,,
26 | Methadone,Level of Methadone used,,,,,
27 | Amphet,Level of Amphet used,,,,,
28 | Tramad,Level of Tramad used,,,,,
29 | Morphine_not_heroin,Morphine not heroin,,,,,
30 | Other,Other things,,,,,
31 | Any Opioid,Whether there is opioid,,,,,
32 | MannerofDeath,Manner of death,,,,,
33 | DeathLoc,The death location,,,,,


--------------------------------------------------------------------------------
/tests/testthat/test_vlm.R:
--------------------------------------------------------------------------------
 1 | library(otvPlots)
 2 | context("Run the main function: vlm")
 3 | drugSASDate <- read.csv("../testthat/drugSASDate.csv")  # NOTE(review): confirm this fixture is shipped in tests/testthat
 4 | 
 5 | test_that("At most one of sortVars and sortFn is passed in", {
 6 |   expect_error(vlm(dataFl = "../testthat/drugSASDate.csv",
 7 | 		  					dateNm = "date", sortVars = c("age", "residencecity")))
 8 | })
 9 | 
10 | test_that("varNms is a subset of sortVars", {
11 |   expect_error(vlm(dataFl = drugSASDate, dateNm = "date", 
12 |   							sortVars = c("age", "residencecity"), varNms = c("age")))
13 | }) 
14 | 
15 | test_that("Incorrect file input when prepData is False", {
16 |   expect_error(vlm(dataFl = "../testthat/drugRDate.csv", dateNm = "date",
17 |                            prepData = FALSE))
18 | }) 
19 | 
20 | test_that("selectCols and dropCols together give an error", {   
21 | 	expect_error(vlm("../testthat/rawData.csv", dateNm = "date", weightNm = "weight", 
22 |  		   dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
23 |  		   selectCols = c("age", "balance", "date", "weight"),
24 |  		   dropCols = c("default"), varNms = c("age")))
25 | 
26 | 	expect_error(vlm("../testthat/rawData.rda", dateNm = "date", weightNm = "weight",
27 |  		   dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
28 |  		   selectCols = c("age", "balance", "date", "weight"),
29 |  		   dropCols = c("default")))
30 | })
31 | 


--------------------------------------------------------------------------------
/tests/testthat/test_PlotRates.R:
--------------------------------------------------------------------------------
 1 | library(otvPlots)
 2 | library(proto)
 3 | context("Plot Continuous Rates over Time")
 4 | load("../testthat/testData.rda")
 5 | testData <- setDT(testData)
 6 | testData <- testData[, .(balance, weight, date)]
 7 | testData[, weeks := round(date, "weeks")]
 8 | testDT = testData[, {list("zerorate" = mean(balance == 0),
 9 |                            "missingrate" = mean(is.na(balance)))}, 
10 |                   by = "weeks"]
11 | testMT = melt(testDT, id.vars = "weeks", 
12 |                measure.vars = c("zerorate", "missingrate"))
13 | 
14 | 
15 | test_that("Plot layers match expectations",{
16 |   p <- PlotRates(testMT, "balance", "weeks")
17 |   expect_is(p$layers[[1]], "ggproto")
18 |   expect_is(p$layers[[1]]$geom, "GeomLine")
19 |   expect_is(p$layers[[1]]$stat, "StatIdentity")
20 | })
21 | 
22 | test_that("X axis is labelled 'weeks'",{
23 |   p <- PlotRates(testMT, "balance", "weeks")
24 |   expect_identical(p$labels$x, "weeks")
25 |   expect_identical(p$labels$y, NULL)
26 | })
27 | 
28 | 
29 | test_that("Mapping layer contains expected elements", {
30 |   p <- PlotRates(testMT, myVar = "balance", dateGp = "weeks")  
31 |   expect_true( "colour" %in% names(p$mapping)) 
32 |   expect_true( "group" %in% names(p$mapping)) 
33 |   expect_true( "x" %in% names(p$mapping)) 
34 |   expect_true( "y" %in% names(p$mapping)) 
35 |   expect_length(setdiff(c("colour", "group", "x", "y"), names(p$mapping)), 0)
36 |  })
37 | 
38 | 


--------------------------------------------------------------------------------
/tests/testthat/test_PlotMean.R:
--------------------------------------------------------------------------------
 1 | library(otvPlots)
 2 | library(proto)
 3 | context("Plot Mean over Time")
 4 | load("../testthat/testData.rda")
 5 | testData <- setDT(testData)
 6 | testData <- testData[, .(balance, weight, date)]
 7 | testData[, weeks := round(date, "weeks")]
 8 | 
 9 | testDT = testData[, .(Mean = mean(balance)), by = "weeks"]
10 | cl = testData[, c(mean(balance), sd(balance))]
11 | cl = cl %*% matrix(c(1, 1, 1, -1), byrow = TRUE, nrow = 2) # mean +- 1 SD
12 | testDT[, c("cl1", "cl2") := list(cl[1], cl[2])  ]
13 | testMT = melt(testDT, id.vars = "weeks", 
14 |           measure.vars = c("Mean", "cl1", "cl2"))
15 | 
16 | test_that("Plot layers match expectations",{
17 |   p <- PlotMean(testMT, "balance", "weeks")
18 |   expect_is(p$layers[[1]], "ggproto")
19 |   expect_is(p$layers[[1]]$geom, "GeomLine")
20 |   expect_is(p$layers[[1]]$stat, "StatIdentity")
21 | })
22 | 
23 | test_that("X axis is labelled 'weeks'",{
24 |   p <- PlotMean(testMT, "balance", "weeks")
25 |   expect_identical(p$labels$x, "weeks")
26 |   expect_identical(p$labels$y, NULL)
27 | })
28 | 
29 | test_that("Scale is discrete",{
30 |    p <- PlotMean(testMT, "balance", "weeks")
31 |    expect_is(p$scales$scales[[1]], "ScaleDiscrete")
32 | })
33 | 
34 | test_that("Mapping layer contains expected elements",{
35 |   p <- PlotMean(testMT, "balance", "weeks") 
36 |   expect_true( "group" %in% names(p$mapping)) 
37 |   expect_true("linetype" %in% names(p$mapping))
38 |   expect_true( "x" %in% names(p$mapping)) 
39 |   expect_true( "y" %in% names(p$mapping)) 
40 |   expect_length(setdiff(c("group", "linetype", "x", "y"), names(p$mapping)), 0)	
41 | })


--------------------------------------------------------------------------------
/man/PlotRates.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/numerical.R
 3 | \name{PlotRates}
 4 | \alias{PlotRates}
 5 | \title{Plot zero and missing rates for a numerical variable}
 6 | \usage{
 7 | PlotRates(meltdx, myVar, dateGp)
 8 | }
 9 | \arguments{
10 | \item{meltdx}{A \code{data.table} with missing rate and zero rate in long
11 | format, produced by \code{\link{SummaryStats}}}
12 | 
13 | \item{myVar}{The name of the variable to be plotted}
14 | 
15 | \item{dateGp}{Name of the variable that the time series plots should be 
16 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 
17 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
18 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
19 | }
20 | \value{
21 | A \code{ggplot2} object showing \code{missingrate} and
22 |   \code{zerorate} grouped by \code{dateGp}.
23 | }
24 | \description{
25 | Plot zero and missing rates for a numerical variable
26 | }
27 | \section{License}{
28 | 
29 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
30 | Version 2.0 (the "License"); you may not use this file except in compliance
31 | with the License. You may obtain a copy of the  License at
32 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
33 | or agreed to in writing, software distributed under the License is 
34 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
35 | KIND, either express or implied. See the License for the specific language 
36 | governing permissions and limitations under the License.
37 | }
38 | 
39 | 


--------------------------------------------------------------------------------
/tests/testthat/test_PlotQuantiles.R:
--------------------------------------------------------------------------------
 1 | library(otvPlots)
 2 | library(proto)
 3 | context("Plot Quantiles over Time")
 4 | load("../testthat/testData.rda")
 5 | setDT(testData)
 6 | # Fixture: weekly and global 1st/50th/99th percentiles of balance, long format.
 7 | testData[, weeks := round(date, "weeks")]
 8 | # Per-week percentiles; `probs` is spelled out to avoid partial matching.
 9 | testDT <- testData[, {
10 |   pct <- quantile(balance, probs = c(.01, .5, .99))
11 |   list("p1" = pct[1], "p50" = pct[2], "p99" = pct[3])
12 | }, by = "weeks"]
13 | testMT <- melt(testDT, id.vars = "weeks",
14 |                measure.vars = c("p99", "p50", "p1"))
15 | # Global percentiles, repeated once per week under the *_g variable names.
16 | globalPct <- testData[, quantile(balance, probs = c(.01, .5, .99))]
17 | globalDT <- data.table("weeks" = rep(testMT[variable == "p99", "weeks",
18 |                        with = FALSE][[1]], 3))
19 | globalDT[, c("variable", "value") := list(rep(c("p1_g", "p50_g", "p99_g"),
20 |                                               each = .N / 3),
21 |                                           rep(globalPct, each = .N / 3))]
22 | # Stack grouped and global series into the table passed to PlotQuantiles.
23 | testMT <- rbindlist(list(testMT, globalDT))
24 | 
25 | 
26 | test_that("Plot layers match expectations",{
27 |   p <- PlotQuantiles(testMT, myVar = "balance", dateGp = "weeks")  
28 |   expect_is(p$layers[[1]], "ggproto")
29 |   expect_is(p$layers[[1]]$geom, "GeomLine")
30 |   expect_is(p$layers[[1]]$stat, "StatIdentity")
31 | })
32 | 
33 | test_that("Mapping layer contains expected elements", {
34 |   p <- PlotQuantiles(testMT, myVar = "balance", dateGp = "weeks")  
35 |   expect_true( "colour" %in% names(p$mapping)) 
36 |   expect_true( "linetype" %in% names(p$mapping)) 
37 |   expect_true( "group" %in% names(p$mapping)) 
38 |   expect_true( "x" %in% names(p$mapping)) 
39 |   expect_true( "y" %in% names(p$mapping)) 
40 |   expect_length(setdiff(c("colour", "linetype", "group", "x", "y"), names(p$mapping)), 0)
41 |  })
42 | 
43 | 


--------------------------------------------------------------------------------
/man/PlotQuantiles.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/numerical.R
 3 | \name{PlotQuantiles}
 4 | \alias{PlotQuantiles}
 5 | \title{Plot the 1st, 50th, and 99th percentiles for a numerical variable}
 6 | \usage{
 7 | PlotQuantiles(meltdx, myVar, dateGp)
 8 | }
 9 | \arguments{
10 | \item{meltdx}{A data.table with p1, p50, and p99 in long format, produced by
11 | \code{\link{SummaryStats}}}
12 | 
13 | \item{myVar}{The name of the variable to be plotted}
14 | 
15 | \item{dateGp}{Name of the variable that the time series plots should be 
16 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 
17 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
18 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
19 | }
20 | \value{
21 | A \code{ggplot2} object with \code{dateGp} on the x axis, 
22 |   \code{value} on the y axis, and variables \code{p1}, \code{p50}, and 
23 |   \code{p99} plotted on the same graph, with grouped and global percentiles 
24 |   differentiated by line type.
25 | }
26 | \description{
27 | Plot the 1st, 50th, and 99th percentiles for a numerical variable
28 | }
29 | \section{License}{
30 | 
31 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
32 | Version 2.0 (the "License"); you may not use this file except in compliance
33 | with the License. You may obtain a copy of the  License at
34 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
35 | or agreed to in writing, software distributed under the License is 
36 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
37 | KIND, either express or implied. See the License for the specific language 
38 | governing permissions and limitations under the License.
39 | }
40 | 
41 | 


--------------------------------------------------------------------------------
/man/PlotMean.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/numerical.R
 3 | \name{PlotMean}
 4 | \alias{PlotMean}
 5 | \title{Plot mean with {Mean +- 1SD} control limits for a numerical variable}
 6 | \usage{
 7 | PlotMean(meltdx, myVar, dateGp)
 8 | }
 9 | \arguments{
10 | \item{meltdx}{A \code{data.table} with Mean and 1SD control limits in long format, 
11 | produced by \code{\link{SummaryStats}}}
12 | 
13 | \item{myVar}{The name of the variable to be plotted}
14 | 
15 | \item{dateGp}{Name of the variable that the time series plots should be 
16 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 
17 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
18 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
19 | }
20 | \value{
21 | A \code{ggplot2} object with \code{dateGp} on the x axis, 
22 |   \code{value} on the y axis, and variables \code{Mean}, \code{cl1}, and 
23 |   \code{cl2} plotted on the same graph, with mean and control limits 
24 |   differentiated by line type.
25 | }
26 | \description{
27 | Plot mean with {Mean +- 1SD} control limits for a numerical variable
28 | }
29 | \section{License}{
30 |  
31 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
32 | Version 2.0 (the "License"); you may not use this file except in compliance
33 | with the License. You may obtain a copy of the  License at
34 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
35 | or agreed to in writing, software distributed under the License is 
36 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
37 | KIND, either express or implied. See the License for the specific language 
38 | governing permissions and limitations under the License.
39 | }
40 | 
41 | 


--------------------------------------------------------------------------------
/tests/testthat/test_PlotBarplot.R:
--------------------------------------------------------------------------------
 1 | library(otvPlots)
 2 | library(proto)
 3 | context("Plot bar plot")
 4 | load("../testthat/testData.rda")
 5 | setDT(testData)
 6 | suppressMessages(PrepData(testData, dateNm = "date", 
 7 | 				 dateGp = "weeks", dateGpBp = "weeks", weightNm = "weight"))
 8 | 
 9 | test_that("expected plot elements are returned", {
10 |   p <- PlotBarplot(dataFl = testData, myVar =  "job", weightNm = "weight")
11 | 	
12 |   expect_is(p$layers[[1]], "ggproto")
13 |   expect_is(p$layers[[1]]$geom, "GeomBar")
14 |   expect_is(p$layers[[1]]$stat, "StatIdentity")
15 |   expect_identical(p$labels$x, "job")
16 |   expect_identical(p$labels$y, "count")
17 |   expect_is(p$scales$scales[[1]], "ScaleDiscrete")
18 |   expect_true( "group" %in% names(p$mapping)) 
19 |   expect_true( "x" %in% names(p$mapping)) 
20 |   expect_true( "y" %in% names(p$mapping)) 
21 |   expect_length(setdiff(c("group", "x", "y"), names(p$mapping)), 0)
22 | })
23 | 
24 | test_that("variable is put in expected order with and without weights", {
25 | 	p <- PlotBarplot(dataFl = testData, myVar =  "job", weightNm = "weight")
26 | 	o1 <- names(rev(sort(xtabs(weight~job, data=testData))))
27 | 	o2 <- as.character(p$data[order(-count)][["job"]])
28 | 	expect_equal(o1, o2)
29 | 	
30 | 	p <- PlotBarplot(dataFl = testData, myVar =  "job", weightNm = NULL)
31 | 	o1 <- names(rev(sort(testData[, table(job)])))
32 | 	o2 <- rev(as.character(p$data[order(count)][["job"]]))
33 | 	expect_equal(o1, o2)
34 | })
35 | 
36 | test_that("global totals are calculated as expected", {
37 | 	p1 <- PlotBarplot(dataFl = testData, myVar =  "job", weightNm = "weight")
38 | 	expect_equal(as.numeric(p1$data[job=="retired"]$count), as.numeric(testData[job=="retired", sum(weight)]))
39 | 	p2 <- PlotBarplot(dataFl = testData, myVar =  "job", weightNm = NULL)
40 | 	expect_equal(as.numeric(p2$data[job=="entrepreneur"]$count), as.numeric(testData[job=="entrepreneur", .N]))
41 | })
42 | 
43 | 
44 | 
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/man/PlotBarplot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/categorical.R
 3 | \name{PlotBarplot}
 4 | \alias{PlotBarplot}
 5 | \title{Creates a bar plot for a discrete (or binary) variable}
 6 | \usage{
 7 | PlotBarplot(dataFl, myVar, weightNm = NULL)
 8 | }
 9 | \arguments{
10 | \item{dataFl}{A \code{data.table} of data; must be the output of the
11 | \code{\link{PrepData}} function.}
12 | 
13 | \item{myVar}{The name of the variable to be plotted}
14 | 
15 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 
16 | no weights (all rows receiving weight 1).}
17 | }
18 | \value{
19 | A \code{ggplot} object with a bar plot of \code{myVar} ordered by 
20 |   category frequency
21 | }
22 | \description{
23 | Creates a bar plot for a discrete (or binary) variable
24 | }
25 | \section{License}{
26 | 
27 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
28 | Version 2.0 (the "License"); you may not use this file except in compliance
29 | with the License. You may obtain a copy of the  License at
30 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
31 | or agreed to in writing, software distributed under the License is 
32 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
33 | KIND, either express or implied. See the License for the specific language 
34 | governing permissions and limitations under the License.
35 | }
36 | 
37 | \examples{
38 | data(bankData)
39 | bankData = PrepData(bankData, dateNm = "date", dateGp = "months", 
40 |                     dateGpBp = "quarters", weightNm = NULL)
41 | PlotBarplot(bankData, "job")
42 | 
43 | ## NA will be included as a category if any NA are present
44 | bankData[sample.int(.N)[1:1000], education := NA]
45 | PlotBarplot(bankData, "education")
46 | }
47 | \seealso{
48 | Functions that depend on this function:
49 |          \code{\link{PlotCatVar}}.
50 | 
51 | This function depends on:
52 |          \code{\link{PrepData}}.
53 | }
54 | 


--------------------------------------------------------------------------------
/man/PrepLabels.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/prep.R
 3 | \name{PrepLabels}
 4 | \alias{PrepLabels}
 5 | \title{Prepare variable labels}
 6 | \usage{
 7 | PrepLabels(labelFl, idx = 1:2)
 8 | }
 9 | \arguments{
10 | \item{labelFl}{Either the path of a dataset (a csv file) containing
11 | labels, an R object convertible to \code{data.table} (e.g., data frame) or 
12 | \code{NULL}. If \code{NULL}, no labels will be used. The label dataset must 
13 | contain at least 2 columns: \code{varCol} (variable names) and 
14 | \code{labelCol} (variable labels).}
15 | 
16 | \item{idx}{A vector of length 2, giving column index of variable names (first
17 | position) and labels (second position).}
18 | }
19 | \value{
20 | A data table formatted for use by the \code{\link{vlm}} function.
21 | }
22 | \description{
23 | This function prepares a dataset containing variable labels for use by 
24 | the main plotting function \code{\link{vlm}}. The input must contain 
25 | variables' names in the first column and labels in the second column. All other 
26 | columns will be dropped. Special characters will create errors and should 
27 | be stripped outside of R. All labels will be truncated at 145 characters.
28 | }
29 | \section{License}{
30 |  
31 | Copyright 2017 Capital One Services, LLC Licensed under the
32 | Apache License, Version 2.0 (the "License"); you may not use this file
33 | except in compliance with the License. You may obtain a copy of the 
34 | License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by
35 | applicable law or agreed to in writing, software distributed under the
36 | License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
37 | CONDITIONS OF ANY KIND, either express or implied. See the License for the
38 | specific language governing permissions and limitations under the License.
39 | }
40 | 
41 | \examples{
42 | data(bankLabels)
43 | bankLabels <- PrepLabels(bankLabels)
44 | }
45 | \seealso{
46 | Functions depend on this function:
47 |          \code{\link{PrintPlots}},
48 |          \code{\link{vlm}}.
49 | }
50 | 


--------------------------------------------------------------------------------
/man/CalcR2.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plots_order.R
 3 | \name{CalcR2}
 4 | \alias{CalcR2}
 5 | \title{Calculates R2 of a numerical variable using date as the predictor}
 6 | \usage{
 7 | CalcR2(myVar, dataFl, dateNm, weightNm = NULL, imputeValue = NULL)
 8 | }
 9 | \arguments{
10 | \item{myVar}{Name of variable to model.}
11 | 
12 | \item{dataFl}{A \code{data.table}, containing \code{myVar}, \code{dateNm}, 
13 | and \code{weightNm}.}
14 | 
15 | \item{dateNm}{Name of column containing the date variable (to be modeled as
16 | numeric); this date column must not have NA's.}
17 | 
18 | \item{weightNm}{Name of column containing row weights. If weights equal one, 
19 | then the \code{\link{lm.fit}} function will be called, otherwise the 
20 | \code{\link{lm.wfit}} will be called. The weights column must not have NA's.}
21 | 
22 | \item{imputeValue}{Either \code{NULL} or numeric. If \code{NULL}, model will
23 | be fit on only non-NA components of \code{myVar}. If numeric, missing cases
24 | of \code{myVar} will be imputed to \code{imputeValue}.}
25 | }
26 | \value{
27 | A numeric value of R2.
28 | }
29 | \description{
30 | Calculates weighted R2 of a univariate weighted linear model with
31 | \code{dateNm} as x and \code{myVar} as y using the workhorse \code{lm.fit}
32 | and \code{lm.wfit} functions.
33 | }
34 | \section{License}{
35 | 
36 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
37 | Version 2.0 (the "License"); you may not use this file except in compliance
38 | with the License. You may obtain a copy of the  License at
39 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
40 | or agreed to in writing, software distributed under the License is 
41 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
42 | KIND, either express or implied. See the License for the specific language 
43 | governing permissions and limitations under the License.
44 | }
45 | 
46 | \seealso{
47 | Functions depend on this function:
48 |          \code{\link{OrderByR2}}.
49 | 
50 | This function depends on:
51 |          \code{\link{PrepData}}.
52 | }
53 | 


--------------------------------------------------------------------------------
/tests/testthat/test_PlotRatesOverTime.R:
--------------------------------------------------------------------------------
 1 | library(otvPlots)
 2 | library(proto)
 3 | context("Plot trace plots of categories' proportions over time")
 4 | load("../testthat/testData.rda")
 5 | setDT(testData)
 6 | library(ggplot2)  # library() stops if the package is missing; require() would only return FALSE
 7 | suppressMessages(PrepData(testData, dateNm = "date", 
 8 | 				 dateGp = "weeks", dateGpBp = "weeks", weightNm = "weight"))
 9 | p <- PlotRatesOverTime(dataFl = testData, dateGp = "weeks", myVar = "job",
10 |     weightNm = "weight", newLevels = NULL)$p  # keep only the plot element `p` of the result
11 | test_that("expected plot elements are returned", {	# inspects layer 1, both axis labels and the first scale of `p`
12 |   expect_is(p$layers[[1]], "ggproto")
13 |   expect_is(p$layers[[1]]$geom, "GeomLine")
14 |   expect_is(p$layers[[1]]$stat, "StatIdentity")
15 |   expect_is(p$layers[[1]]$position, "PositionIdentity")
16 |   expect_identical(p$labels$x, "weeks")
17 |   expect_identical(p$labels$y, "")  # the y axis label is expected to be an empty string
18 |   expect_is(p$scales$scales[[1]], "ScaleContinuousDate")
19 | })
20 | 
21 | test_that("rates are calculated correctly normalized by time", {
22 |   plotDat <- p$data
23 |   # the per-week totals of `rate` must collapse to a single distinct value
24 |   plotDat[, sum := sum(rate), by = "weeks"]
25 |   weeklyTotals <- plotDat[, table(sum)]
26 |   expect_length(weeklyTotals, 1)
27 | 
28 |   # rebuild the 2008-06-03 rates by hand from the weighted test data
29 |   expected <- testData[weeks == "2008-06-03"]
30 |   expected[, rate1 := sum(weight), by = "job"]
31 |   expected[, rate0 := sum(weight)]
32 |   expected[, rate := rate1 / rate0]
33 | 
34 |   expected <- unique(expected[, .(job, weeks, rate)])
35 |   observed <- plotDat[weeks == "2008-06-03" & rate > 0, .(weeks, job, rate)]
36 |   observed[, job := as.character(job)]
37 |   setkey(observed, job)
38 |   setkey(expected, job)
39 |   expect_equal(observed[, rate], expected[, rate])
40 | })
41 | 
42 | test_that("rates are calculated correctly normalized by var", {
43 |   varPlot <- PlotRatesOverTime(dataFl = testData, dateGp = "weeks",
44 |     myVar = "job", weightNm = "weight", newLevels = NULL, normBy = "var")$p
45 |   plotDat <- varPlot$data
46 |   plotDat[, sum := sum(rate), by = "job"]
47 |   
48 |   # within each job the rates must add up to the same total, namely one
49 |   expect_length(plotDat[, table(sum)], 1)
50 |   expect_equal(plotDat[1, sum], 1)
51 |   
52 |   # rebuild the technician rates by hand; rows 1-4 are compared with plotted rows 2-5
53 |   expected <- testData[job == "technician"]
54 |   expected[, rate1 := sum(weight), by = "weeks"]
55 |   expected[, rate0 := sum(weight)]
56 |   expected[, rate := rate1 / rate0]
57 |   expected <- unique(expected[, .(job, weeks, rate)])
58 |   expect_equal(expected[1:4, rate], plotDat[job == "technician"][2:5, rate])
59 | })
60 | 
61 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
 1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC 
 2 | # SPDX-License-Identifier: Apache-2.0 
 3 | # Copyright 2017 Capital One Services, LLC 
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 
 6 | # you may not use this file except in compliance with the License. 
 7 | #
 8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software distributed 
11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
12 | # OF ANY KIND, either express or implied. 
13 | # 
14 | # See the License for the specific language governing permissions and limitations under the License. 
15 | 
16 | 
17 | ###########################################
18 | #           Utility Functions             #
19 | ###########################################
20 | 
21 | is.nmrcl <- function(x) { inherits(x, what = "nmrcl") }  # TRUE when x inherits from class "nmrcl"
22 | is.ctgrl <- function(x) { inherits(x, what = "ctgrl") }  # TRUE when x inherits from class "ctgrl"
23 | 
24 | wtd_quantile_NA <- function(x, weights, probs = c(.0, .25, .5, .75, 1),
25 |                             ...) { # was wtd.quantile_NA; weighted quantiles, all-NA on error
26 |   naFallback <- function(e) rep(NA_real_, length(probs))
27 |   tryCatch(as.double(Hmisc::wtd.quantile(x, weights, probs, normwt = TRUE,
28 |                                          na.rm = TRUE, ...)), error = naFallback)
29 | }
30 | 
31 | ## Color-blind friendly palette: a character vector of eight hex color codes
32 | ## Source: http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/#a-colorblind-friendly-palette
33 | cbbPalette <- c("#D55E00", "#009E73", "#0072B2", "#000000", "#E69F00", "#56B4E9",  "#F0E442",  "#CC79A7")
34 | 
35 | # # An example function for fuzzy label matching
36 | # # To be used as an input of the \code{\link{PlotVar}} function.
37 | # # If variables look like VAR_nameofvar, and the attribute dictionary contains
38 | # # definitions only for nameofvar, then a fuzzy matching function can be
39 | # # provided which would first attempt to match exactly, and then to attempt to
40 | # # match on the longest piece after splitting on the underscore:
41 | # 
42 | # Fuzzy = function(labelFl, myVar){
43 | #    ll = labelFl[varCol == myVar, labelCol] # exact match
44 | #    if (ll == ""){
45 | #        # split on "_", search for exact match of longest piece
46 | #        shortNm = names(which.max(sapply(strsplit(myVar, "_")[[1]], nchar)))
47 | #        ll = labelFl[varCol == shortNm, labelCol]
48 | #    }
49 | #    return(ll)
50 | #  }
51 | 


--------------------------------------------------------------------------------
/man/SummaryStats.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/numerical.R
 3 | \name{SummaryStats}
 4 | \alias{SummaryStats}
 5 | \title{Create summary statistics for a numerical variable}
 6 | \usage{
 7 | SummaryStats(myVar, dataFl, dateGp, weightNm = NULL)
 8 | }
 9 | \arguments{
10 | \item{myVar}{The name of the variable to be plotted}
11 | 
12 | \item{dataFl}{A \code{data.table} of data; must be the output of the
13 | \code{\link{PrepData}} function.}
14 | 
15 | \item{dateGp}{Name of the variable that the time series plots should be 
16 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 
17 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
18 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
19 | 
20 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 
21 | no weights (all rows receiving weight 1).}
22 | }
23 | \value{
24 | \item{meltdx}{A \code{data.table} for use by the plotting functions 
25 |     \code{\link{PlotMean}}, \code{\link{PlotQuantiles}}, and 
26 |     \code{\link{PlotRates}}.}
27 |   \item{numVarSummary}{A \code{data.table} of summary statistics.}
28 | }
29 | \description{
30 | Create summary statistics for a numerical variable
31 | }
32 | \section{License}{
33 | 
34 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
35 | Version 2.0 (the "License"); you may not use this file except in compliance
36 | with the License. You may obtain a copy of the  License at
37 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
38 | or agreed to in writing, software distributed under the License is 
39 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
40 | KIND, either express or implied. See the License for the specific language 
41 | governing permissions and limitations under the License.
42 | }
43 | 
44 | \examples{
45 | data(bankData)
46 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "quarters", 
47 |                     dateGpBp = "years")
48 | mdx <- SummaryStats(myVar = "age", dataFl = bankData, 
49 |                    dateGp = "quarters")$meltdx
50 | plot(PlotQuantiles(mdx[variable \%in\% c("p99", "p50", "p1", "p99_g", "p50_g",
51 |                    "p1_g")], "age", "quarters"))
52 | plot(PlotMean(mdx[variable \%in\% c("mean", "cl1", "cl2")], "age", "quarters"))
53 | plot(PlotRates(mdx, "age", "quarters"))
54 | }
55 | 


--------------------------------------------------------------------------------
/tests/testthat/test_PlotDist.R:
--------------------------------------------------------------------------------
 1 | library(otvPlots)
 2 | library(proto)
 3 | context("Plot Boxplots")
 4 | load("../testthat/testData.rda")  # expected to provide the `testData` object used below
 5 | setDT(testData)
 6 | suppressMessages(PrepData(dataFl = testData, dateNm = "date", dateGp = "weeks", dateGpBp = "weeks"))  # NOTE(review): result is discarded; presumably PrepData updates testData in place - confirm
 7 | 
 8 | test_that("Plot layers match expectations",{
 9 |   boxPlot <- PlotDist(dataFl = testData, myVar = "balance", dateGpBp = "weeks", weightNm = "weight")
10 |   expect_is(boxPlot$layers[[1]], "ggproto")
11 |   expect_is(boxPlot$layers[[1]]$geom, "GeomBoxplot")  # layer 1: boxplot geom and stat
12 |   expect_is(boxPlot$layers[[1]]$stat, "StatBoxplot")
13 |   expect_is(boxPlot$layers[[2]]$geom, "GeomRug")  # layer 2: rug geom with identity stat
14 |   expect_is(boxPlot$layers[[2]]$stat, "StatIdentity")
15 | })
16 | 
17 | 
18 | test_that("Mapping layer contains expected elements", {
19 |   p <- PlotDist(testData, myVar = "balance", dateGpBp = "weeks")  # plot-level mapping is checked first
20 |   expect_true("group" %in% names(p$mapping))
21 |   expect_true("x" %in% names(p$mapping))
22 |   expect_true("y" %in% names(p$mapping))
23 |   expect_length(setdiff(c("group", "x", "y"), names(p$mapping)), 0)
24 |   # the second layer must carry its own x and y aesthetics
25 |   expect_true("x" %in% names(p$layers[[2]]$mapping))
26 |   expect_true("y" %in% names(p$layers[[2]]$mapping))
27 |   expect_length(setdiff(c("x", "y"), names(p$layers[[2]]$mapping)), 0)  # was re-checking p$mapping
28 |  })
29 | 
30 | 
31 | test_that("Y axis is labeled 'balance' and X axis is labeled 'weeks'",{
32 |   axisLabels <- PlotDist(testData, "balance", "weeks")$labels  # only the labels are needed
33 |   expect_identical(axisLabels$x, "weeks")
34 |   expect_identical(axisLabels$y, "balance")
35 | })
36 | 
37 | test_that("invalid log transform returns message and untransformed plot", {
38 |   expect_message(PlotDist(dataFl = testData, myVar = "balance", dateGpBp = "weeks", skewOpt = 3),
39 |     "untransformed boxplot")
40 |   rawPlot <- PlotDist(dataFl = testData, myVar = "balance", dateGpBp = "weeks", skewOpt = 3)
41 |   expect_is(rawPlot$layers[[1]], "ggproto")
42 |   expect_is(rawPlot$layers[[1]]$geom, "GeomBoxplot")
43 |   expect_is(rawPlot$layers[[1]]$stat, "StatBoxplot")
44 |   expect_is(rawPlot$layers[[2]]$geom, "GeomRug")
45 |   expect_is(rawPlot$layers[[2]]$stat, "StatIdentity")
46 |   expect_equal(length(grep("log10", rawPlot$labels$y)), 0)  # y label must not mention log10
47 | })
48 | 
49 | test_that("valid log transform returns transformed scale",{
50 |   testData[, posBalance := ifelse(balance >= 0.1, balance, 0.1)]  # floor balance at 0.1
51 |   logPlot <- PlotDist(dataFl = testData, myVar = "posBalance", dateGpBp = "weeks", skewOpt = 3)
52 |   expect_message(PlotDist(dataFl = testData, myVar = "posBalance", dateGpBp = "weeks", skewOpt = 3),
53 |     "Scale for 'y' is already present")
54 |   expect_is(logPlot$layers[[1]], "ggproto")
55 |   expect_is(logPlot$layers[[1]]$geom, "GeomBoxplot")
56 |   expect_is(logPlot$layers[[1]]$stat, "StatBoxplot")
57 |   expect_is(logPlot$layers[[2]]$geom, "GeomRug")
58 |   expect_is(logPlot$layers[[2]]$stat, "StatIdentity")
59 |   expect_equal(grep("log10", logPlot$labels$y), 1)  # y label must mention log10
60 | })
61 | 


--------------------------------------------------------------------------------
/man/PlotDist.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/numerical.R
 3 | \name{PlotDist}
 4 | \alias{PlotDist}
 5 | \title{Side-by-side box plots, for a numerical variable,  grouped by \code{dateGpBp}}
 6 | \usage{
 7 | PlotDist(dataFl, myVar, dateGpBp, weightNm = NULL, skewOpt = NULL)
 8 | }
 9 | \arguments{
10 | \item{dataFl}{A \code{data.table} of data; must be the output of the
11 | \code{\link{PrepData}} function.}
12 | 
13 | \item{myVar}{The name of the variable to be plotted}
14 | 
15 | \item{dateGpBp}{Name of variable the boxplots should be grouped by. Same
16 | options as \code{dateGp}. If \code{NULL}, then \code{dateGp} will be used.}
17 | 
18 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 
19 | no weights (all rows receiving weight 1).}
20 | 
21 | \item{skewOpt}{Either a numeric constant or \code{NULL}. Default is 
22 | \code{NULL} (no transformation). If numeric, say 5, then all box plots of
23 | a variable whose skewness exceeds 5 will be on a log10 scale if possible.
24 | Negative input of \code{skewOpt} will be converted to 3.}
25 | }
26 | \value{
27 | A \code{ggplot2} object with a box plot of \code{myVar} grouped by 
28 |   \code{dateGpBp}
29 | }
30 | \description{
31 | If a variable is strictly positive (no zeros), has more than 50 distinct
32 | values, and is highly skewed, then all box plots can be plotted under the 
33 | log base 10 transformation. See the argument \code{skewOpt} for details.
34 | }
35 | \section{License}{
36 | 
37 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
38 | Version 2.0 (the "License"); you may not use this file except in compliance
39 | with the License. You may obtain a copy of the  License at
40 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
41 | or agreed to in writing, software distributed under the License is 
42 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
43 | KIND, either express or implied. See the License for the specific language 
44 | governing permissions and limitations under the License.
45 | }
46 | 
47 | \examples{
48 | data(bankData)
49 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 
50 |                      dateGpBp = "quarters")
51 | PlotDist(dataFl = bankData, myVar = "balance", dateGpBp = "quarters")
52 | ## The following attempt to log transform will fail due to negative values,
53 | ## and the untransformed version will be returned
54 | PlotDist(dataFl = bankData, myVar = "balance", dateGpBp = "quarters", 
55 |          skewOpt = 3)
56 | ## This attempt should succeed, as the skew exceeds 3 and there are no 
57 | ## negative values
58 | PlotDist(dataFl = bankData, myVar = "duration", dateGpBp = "quarters",
59 |          skewOpt = 3)
60 | }
61 | 


--------------------------------------------------------------------------------
/tests/testthat/test_CalcR2.R:
--------------------------------------------------------------------------------
 1 | library(otvPlots)
 2 | context("Calculate R-squared")
 3 | load("../testthat/testData.rda")
 4 | testData <- setDT(testData)
 5 | testData <- testData[, .(age, weight, date)]
 6 | # Each test compares CalcR2() with the r.squared reported by summary(lm()).
 7 | test_that("CalcR2 gives correct R2 with weight", {
 8 | 	test.R2 <- CalcR2("age", testData, "date", weightNm = "weight", imputeValue = NULL)
 9 | 	ans.R2 <- summary(lm(age~date, weights=weight, data=testData))$r.squared
10 | 	expect_equal(test.R2, ans.R2)
11 | })
12 | 
13 | 
14 | test_that("CalcR2 gives correct R2 without weight", {
15 | 	test.R2 <- CalcR2("age", testData, "date", weightNm = NULL, imputeValue = NULL)
16 | 	ans.R2 <- summary(lm(age~date, data=testData))$r.squared
17 | 	expect_equal(test.R2, ans.R2)
18 | })
19 | 
20 | # testData1 has missings in Y: 10 rows drawn at random from the first 100
21 | idx <- sample.int(100, 10)
22 | testData1 <- testData[idx, age:=NA]  # `:=` works by reference, so testData1 and testData are the same table
23 | 
24 | test_that("CalcR2 is correct with imputation in Y and weight", {
25 | 	test.R2 <- CalcR2("age", testData1, "date", weightNm = "weight", imputeValue = 0)
26 | 	ans.R2 <- summary(lm(age~date, data=testData1[is.na(age), age:=0], weights=weight))$r.squared
27 | 	expect_equal(test.R2, ans.R2)
28 | })
29 | # NOTE(review): the `:=` inside lm(data = ...) above imputed age in place, so no NA ages appear to remain below
30 | test_that("CalcR2 is correct with imputation in Y", {
31 | 	test.R2 <- CalcR2("age", testData1, "date", weightNm = NULL, imputeValue = 0)
32 | 	ans.R2 <- summary(lm(age~date, data=testData1[is.na(age), age:=0]))$r.squared
33 | 	expect_equal(test.R2, ans.R2)
34 | })
35 | 
36 | # testData2 has missings in weight and date, but not in Y (still the same table as testData)
37 | testData2 <- testData[sample.int(.N, 10), weight := NA]
38 | testData2 <- testData2[sample.int(.N, 10), date := NA]
39 | test_that("CalcR2 is correct with missing values in weight and date", {
40 | 	test.R2 <- CalcR2("age", testData2, "date", weightNm = "weight", imputeValue = NULL)
41 | 	ans.R2 <- summary(lm(age~date, data=testData2, weights=weight))$r.squared
42 | 	expect_equal(test.R2, ans.R2)
43 | })
44 | 
45 | # testData3 has missings in weight, date and Y (age is set to NA again on the rows in idx)
46 | testData3 <- testData2[idx, age := NA]
47 | test_that("CalcR2 is correct with missing values in weight and date and Y", {
48 | 	test.R2 <- CalcR2("age", testData3, "date", weightNm = "weight", imputeValue = NULL)
49 | 	ans.R2 <- summary(lm(age~date, data=testData3, weights=weight))$r.squared
50 | 	expect_equal(test.R2, ans.R2)
51 | })
52 | 
53 | 
54 | test_that("CalcR2 is correct with missing values in weight and date and Y and imputation", {
55 | 	test.R2 <- CalcR2("age", testData3, "date", weightNm = "weight", imputeValue = 0)
56 | 	ans.R2 <- summary(lm(age~date, data=testData3[is.na(age), age:=0], weights=weight))$r.squared
57 | 	expect_equal(test.R2, ans.R2)
58 | })
59 | # NOTE(review): age was imputed in place by the lm() call above, so the two tests below
60 | # appear to run without missing Y despite their names; pass a copy() if that is unintended.
61 | test_that("CalcR2 is correct with no weight and missing values in date and Y", {
62 | 	test.R2 <- CalcR2("age", testData3, "date", weightNm = NULL, imputeValue = NULL)
63 | 	ans.R2 <- summary(lm(age~date, data=testData3))$r.squared
64 | 	expect_equal(test.R2, ans.R2)
65 | })
66 | 
67 | 
68 | test_that("CalcR2 is correct with no weight and missing values in date and Y imputed", {
69 | 	test.R2 <- CalcR2("age", testData3, "date", weightNm = NULL, imputeValue = 0)
70 | 	ans.R2 <- summary(lm(age~date, data=testData3[is.na(age), age:=0]))$r.squared
71 | 	expect_equal(test.R2, ans.R2)
72 | })
73 | 
74 | 
75 | 
76 | 
77 | 
78 | 
79 | 
80 | 
81 | 


--------------------------------------------------------------------------------
/man/bankData.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{bankData}
 5 | \alias{bankData}
 6 | \title{Direct marketing campaigns of a Portuguese banking institution}
 7 | \format{A data frame with 45,211 rows and 19 variables:
 8 | \describe{
 9 |   \item{age}{Age of the client, numeric.}
10 |   \item{job}{Type of job, a categorical variable with the levels: 
11 |     \code{'admin.'}, \code{'blue-collar'}, \code{'entrepreneur'},
12 |     \code{'housemaid'}, \code{'management'}, \code{'retired'},
13 |     \code{'self-employed'}, \code{'services'}, \code{'student'},
14 |     \code{'technician'}, \code{'unemployed'}, and \code{'unknown'}.}
15 |   \item{marital}{Marital status, a categorical variable with levels: 
16 |     \code{'divorced'}, \code{'married'}, \code{'single'}, and \code{'unknown'}.
17 |     Note that \code{'divorced'} means either divorced or widowed.}
18 |   \item{education}{A categorical variable with levels: \code{'basic.4y'},
19 |     \code{'basic.6y'}, \code{'basic.9y'}, \code{'high.school'},
20 |     \code{'illiterate'}, \code{'professional.course'}, 
21 |     \code{'university.degree'}, and \code{'unknown'}.}
22 |   \item{default}{Whether credit is in default, a categorical variable with 
23 |     levels: \code{'no'}, \code{'yes'}, and \code{'unknown'}.}
24 |   \item{balance}{Account balance, numeric.}
25 |   \item{housing}{Whether the client has a housing loan, a categorical variable
26 |     with levels: \code{'no'}, \code{'yes'}, and \code{'unknown'}.}
27 |   \item{loan}{Whether the client has personal loan, a categorical variable
28 |     with levels: \code{'no'}, \code{'yes'}, and \code{'unknown'}.}
29 |   \item{contact}{Type of contact communication, a categorical variable
30 |     with levels: \code{'cellular'} and \code{'telephone'}.}
31 |   \item{duration}{Last contact duration in seconds, a numeric variable.}
32 |   \item{campaign}{Number of contacts performed during this campaign for 
33 |     this client, including the last contact; a numeric variable.}
34 |   \item{pdays}{Number of days that passed by after the client was last 
35 |     contacted from a previous campaign; a numeric variable, where \code{999} 
36 |     means that the client was not previously contacted.}
37 |   \item{previous}{Number of contacts performed before this campaign for this
38 |     client, a numeric variable.}
39 |   \item{poutcome}{Outcome of the previous marketing campaign, a categorical
40 |     variable with levels: \code{'failure'}, \code{'nonexistent'},
41 |     and \code{'success'}.}
42 |   \item{y}{Whether the client has subscribed a term deposit, a categorical
43 |     variable with levels: \code{'yes'} and \code{'no'}.}
44 |   \item{date}{Last contact date.}
45 | }}
46 | \source{
47 | \url{https://archive.ics.uci.edu/ml/datasets/Bank+Marketing}
48 | 
49 | \cite{Lichman, M. (2013). UCI Machine Learning Repository 
50 |   [\url{http://archive.ics.uci.edu/ml}]. Irvine, CA: University of California, 
51 |   School of Information and Computer Science.}
52 | 
53 | \cite{S. Moro, P. Cortez, and P. Rita. (2014) A Data-Driven Approach
54 |   to Predict the Success of Bank Telemarketing. Decision Support Systems, 
55 |   Elsevier, 62:22-31, June 2014.}
56 | }
57 | \usage{
58 | bankData
59 | }
60 | \description{
61 | The marketing campaigns were based on phone calls.
62 | Often, more than one contact to the same client was required, in order to 
63 | assess if the product (bank term deposit) would be ('yes') or not ('no') 
64 | subscribed. Records are ordered by date (from May 2008 to November 2010), 
65 | similar to data analyzed in Moro et al. [2014].
66 | }
67 | \keyword{datasets}
68 | 


--------------------------------------------------------------------------------
/man/PlotRatesOverTime.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/categorical.R
 3 | \name{PlotRatesOverTime}
 4 | \alias{PlotRatesOverTime}
 5 | \title{Creates trace plots of categories' proportions over time for a discrete (or
 6 | binary) variable}
 7 | \usage{
 8 | PlotRatesOverTime(dataFl, dateGp, myVar, normBy = "time", weightNm = NULL,
 9 |   newLevels = NULL, kCategories = 9)
10 | }
11 | \arguments{
12 | \item{dataFl}{A \code{data.table} of data; must be the output of the
13 | \code{\link{PrepData}} function.}
14 | 
15 | \item{dateGp}{Name of the variable that the time series plots should be 
16 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 
17 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
18 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
19 | 
20 | \item{myVar}{The name of the variable to be plotted}
21 | 
22 | \item{normBy}{The normalization factor for rate plots, can be \code{"time"}
23 | or \code{"var"}. If \code{"time"}, then for each time period of 
24 | \code{dateGp}, counts are normalized by the total counts over all 
25 | categories in that time period. This illustrates changes of categories' 
26 | proportions over time. If \code{"var"}, then for each category, its counts 
27 | are normalized by the total counts over time from only this category. This
28 | illustrates changes of categories' volumes over time.}
29 | 
30 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 
31 | no weights (all rows receiving weight 1).}
32 | 
33 | \item{newLevels}{categories of \code{myVar} in order of global frequency}
34 | 
35 | \item{kCategories}{If a categorical variable has more than \code{kCategories},
36 | trace plots of only the \code{kCategories} most prevalent categories are
37 | plotted.}
38 | }
39 | \value{
40 | A list:
41 |   \item{p}{\code{ggplot} object, trace plots of categories' proportions 
42 |     \code{myVar} over time.}
43 |   \item{catVarSummary}{A \code{data.table}, contains categories' proportions 
44 |     globally, and over-time in each time period in \code{dateGp}. Each row is
45 |     a category of the categorical (or binary) variable \code{myVar}. The row
46 |     whose \code{category == 'NA'} corresponds to missing. Categories are 
47 |     ordered by global prevalence in a descending order.}
48 | }
49 | \description{
50 | Creates trace plots of categories' proportions over time for a discrete (or
51 | binary) variable
52 | }
53 | \section{License}{
54 | 
55 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
56 | Version 2.0 (the "License"); you may not use this file except in compliance
57 | with the License. You may obtain a copy of the  License at
58 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
59 | or agreed to in writing, software distributed under the License is 
60 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
61 | KIND, either express or implied. See the License for the specific language 
62 | governing permissions and limitations under the License.
63 | }
64 | 
65 | \examples{
66 | data(bankData)
67 | bankData$weight = rpois(nrow(bankData), 5)
68 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 
69 |                      dateGpBp = "quarters", weightNm = "weight")
70 | PlotRatesOverTime(dataFl = bankData, dateGp = "months", weightNm = "weight",
71 |                   myVar = "job", newLevels = NULL, normBy = "time")
72 | 
73 | }
74 | \seealso{
75 | Functions depend on this function:
76 |          \code{\link{PlotCatVar}}.
77 | 
78 | This function depends on:
79 |          \code{\link{PrepData}}.
80 | }
81 | 


--------------------------------------------------------------------------------
/man/otvPlots.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/package_otvPlots.R
 3 | \docType{package}
 4 | \name{otvPlots}
 5 | \alias{otvPlots}
 6 | \alias{otvPlots-package}
 7 | \title{Over time variable plots for predictive modeling (otvPlots)}
 8 | \description{
 9 | The \code{otvPlots} package uses \code{data.table} and \code{ggplot2} 
10 | packages to efficiently plot time series aggregated from large datasets. 
11 | Plots of numerical variables are optionally returned ordered by correlation 
12 | with date -- a natural starting point for anomaly detection. Plots are 
13 | automatically labeled if a variable dictionary is provided.
14 | }
15 | \details{
16 | Output files include:
17 | \itemize{
18 |  \item A PDF file of plots saved as \code{outFl}.pdf, with each individual page 
19 |  on one variable. Variables are plotted in the order indicated in the argument
20 |  \code{sortVars} or \code{sortFn}. 
21 |  For each numerical variable, the output plots include 
22 |  \itemize{
23 |    \item side-by-side boxplots grouped by \code{dateGpBp} (left), 
24 |    \item a trace plot of p1, p50, and p99 percentiles, grouped by \code{dateGp}
25 |      (top right), 
26 |    \item a trace plot of mean and +-1 SD control limits, grouped by 
27 |      \code{dateGp} (middle right), and 
28 |    \item a trace plot of missing and zero rates, grouped by \code{dateGp} 
29 |      (bottom right).
30 |   }
31 |   For each categorical variable (including a numerical variable with no more 
32 |   than 2 unique levels not including NA), the output plots include 
33 |   \itemize{
34 |     \item a frequency bar plot (left), and 
35 |     \item a grid of trace plots on categories' proportions over time (right). 
36 |       If the variable contains more than \code{kCategories} number of 
37 |       categories, trace plots of only the largest \code{kCategories} will be 
38 |       plotted. If the variable contains only two categories, then only the 
39 |       trace plot of the less prevalent category will be plotted.
40 |   }
41 |   \item CSV file(s) on summary statistics of variables, both globally and over
42 |   time aggregated by \code{dateGp}. The order of variables in the CSV files
43 |   is the same as in the PDF file. 
44 |   \itemize{
45 |     \item For numerical variables, number of observations (counts), p1, p25, 
46 |     p50, p75, and p99 quantiles, mean, SD, missing and zero rates are saved
47 |     as \code{outFl}_numerical_summary.csv.
48 |     \item For categorical variables, number of observations (counts) and 
49 |     categories' proportions are saved as \code{outFl}_categorical_summary.csv. 
50 |     Each row is a category of a categorical (or binary) variable.
51 |     The row whose \code{category == 'NA'} corresponds to missing. Categories
52 |     among the same variable are ordered by global prevalence in a descending 
53 |     order.
54 |   }
55 | }
56 | }
57 | \section{License}{
58 | 
59 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
60 | Version 2.0 (the "License"); you may not use this file except in compliance
61 | with the License. You may obtain a copy of the  License at
62 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
63 | or agreed to in writing, software distributed under the License is 
64 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
65 | KIND, either express or implied. See the License for the specific language 
66 | governing permissions and limitations under the License.
67 | }
68 | 
69 | \seealso{
70 | Main function: \code{\link{vlm}}.
71 | 
72 | Selected supporting functions: 
73 |          \code{\link{PrepData}}, 
74 |          \code{\link{PrepLabels}},
75 |          \code{\link{OrderByR2}}.
76 | }
77 | 


--------------------------------------------------------------------------------
/tests/testthat/test_SummaryStats.R:
--------------------------------------------------------------------------------
 1 | library(otvPlots)
 2 | context("Summary stats for numerical variables")
 3 | load("../testthat/testData.rda")  # presumably provides the `testData` object used on the next line
 4 | setDT(testData)  # convert to a data.table in place
 5 | suppressMessages(PrepData(testData, dateNm = "date", dateGp = "weeks", dateGpBp = "weeks", weightNm = "weight"))  # result is not assigned; the tests below read a `weeks` column, so this presumably adds it to testData by reference -- TODO confirm
 6 | 
 7 | test_that("Numerical statistics are calculated correctly without weight", {  # compares SummaryStats output for `age` against base-R statistics
 8 | 	mdx  = SummaryStats(myVar = "age", dataFl = testData, dateGp = "weeks")$meltdx  # `meltdx` element of the result; filtered below by its `variable` column
 9 | 	Mean = mdx[variable=='Mean']  # NOTE(review): lines 9-13 assign locals that no expectation below references
10 | 	p1   = mdx[variable=='p1']
11 | 	p99  = mdx[variable=='p99']
12 | 	zerorate    = mdx[variable=='zerorate']
13 | 	missingrate = mdx[variable=='missingrate']
14 | 
15 | 	p99_g = unique(mdx[variable=='p99_g', value])  # global statistics; unique() presumably collapses a value repeated per period
16 | 	p1_g = unique(mdx[variable=='p1_g', value])
17 | 	cl1  = unique(mdx[variable=='cl1', value])  # expected below to equal mean + 1 SD over all rows
18 | 	cl2  = unique(mdx[variable=='cl2', value])  # expected below to equal mean - 1 SD over all rows
19 | 	
20 | 	expect_equivalent(p99_g, quantile(testData[, age], probs=.99))  # `probs` spelled out in full, no partial argument matching
21 | 	expect_equivalent(p1_g, quantile(testData[, age],  probs=.01))
22 | 	expect_equivalent(cl1,  mean(testData[, age]) + sd(testData[,age]))
23 | 	expect_equivalent(cl2,  mean(testData[, age]) - sd(testData[,age]))
24 | 	
25 | 	mdx2 = mdx[weeks == "2008-05-06" & variable%in%c("p99", "p50", "p1", "mean", "zerorate", "missingrate")]  # one weekly period, per-period statistics only
26 | 	
27 | 	expect_equivalent(mdx2[variable=="p99", value], quantile(testData[weeks==as.IDate("2008-05-06"),age], .99))
28 | 	expect_equivalent(mdx2[variable=="p50", value], quantile(testData[weeks==as.IDate("2008-05-06"),age], .5))
29 | 	expect_equivalent(mdx2[variable=="p1", value], quantile(testData[weeks==as.IDate("2008-05-06"),age], .01))
30 | 	expect_equivalent(mdx2[variable=="mean", value], mean(testData[weeks==as.IDate("2008-05-06"),age]))
31 | 	expect_equivalent(mdx2[variable=="zerorate", value], mean(testData[weeks==as.IDate("2008-05-06"),age]==0))
32 | 	expect_equivalent(mdx2[variable=="missingrate", value], mean(is.na(testData[weeks==as.IDate("2008-05-06"),age])))
33 | })
34 | 
35 | 
36 | test_that("Numerical statistics are calculated correctly with weight", {  # weighted counterpart: expectations use Hmisc weighted statistics
37 | 	mdx  = SummaryStats(myVar = "age", dataFl = testData, dateGp = "weeks", weightNm = "weight")$meltdx  # `meltdx` element of the result; filtered below by its `variable` column
38 | 	Mean = mdx[variable=='Mean']  # NOTE(review): lines 38-42 assign locals that no expectation below references
39 | 	p1   = mdx[variable=='p1']
40 | 	p99  = mdx[variable=='p99']
41 | 	zerorate    = mdx[variable=='zerorate']
42 | 	missingrate = mdx[variable=='missingrate']
43 | 
44 | 
45 | 	p99_g = unique(mdx[variable=='p99_g', value])  # global statistics; unique() presumably collapses a value repeated per period
46 | 	p1_g = unique(mdx[variable=='p1_g', value])
47 | 	cl1  = unique(mdx[variable=='cl1', value])  # expected below to equal weighted mean + sqrt(weighted variance)
48 | 	cl2  = unique(mdx[variable=='cl2', value])  # expected below to equal weighted mean - sqrt(weighted variance)
49 | 	
50 | 	expect_equivalent(p99_g, Hmisc::wtd.quantile(testData[, age], testData[, weight], probs=.99, normwt=TRUE))
51 | 	expect_equivalent(p1_g, Hmisc::wtd.quantile(testData[, age], testData[, weight], probs=.01, normwt=TRUE))
52 | 	expect_equivalent(cl2, Hmisc::wtd.mean(testData[, age], testData[,weight], na.rm=TRUE, normwt=TRUE) -	
53 | 					 sqrt(Hmisc::wtd.var(testData[,age], testData[,weight], na.rm=TRUE,normwt=TRUE)))
54 |     expect_equivalent(cl1, Hmisc::wtd.mean(testData[, age], testData[,weight], na.rm=TRUE, normwt=TRUE) +	
55 | 					 sqrt(Hmisc::wtd.var(testData[,age], testData[,weight], na.rm=TRUE,normwt=TRUE)))
56 | 					 
57 | 	mdx2 = mdx[weeks == "2008-05-06" & variable%in%c("p99", "p50", "p1", "mean", "zerorate", "missingrate")]  # one weekly period, per-period statistics only
58 | 	testData2 = testData[weeks==as.IDate("2008-05-06")]  # raw rows for the same week, used to build the expected values
59 | 	
60 | 	expect_equivalent(mdx2[variable=="p99", value], Hmisc::wtd.quantile(testData2[, age],testData2[, weight], .99, normwt=TRUE))
61 | 	expect_equivalent(mdx2[variable=="p50", value], Hmisc::wtd.quantile(testData2[, age],testData2[, weight], .5, normwt=TRUE))
62 | 	expect_equivalent(mdx2[variable=="p1", value], Hmisc::wtd.quantile(testData2[, age],testData2[, weight], .01, normwt=TRUE))
63 | 	expect_equivalent(mdx2[variable=="mean", value], Hmisc::wtd.mean(testData2[,age], testData2[,weight]))
64 | 	expect_equivalent(mdx2[variable=="zerorate", value], Hmisc::wtd.mean((testData2[,age]==0), testData2[,weight]))
65 | 	expect_equivalent(mdx2[variable=="missingrate", value], Hmisc::wtd.mean(is.na(testData2[,age]), testData2[,weight]))
66 | })
67 | 
68 | 


--------------------------------------------------------------------------------
/man/OrderByR2.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plots_order.R
 3 | \name{OrderByR2}
 4 | \alias{OrderByR2}
 5 | \title{Create numerical variable ranking using R2 between date and variable}
 6 | \usage{
 7 | OrderByR2(dataFl, dateNm, buildTm = NULL, weightNm = NULL,
 8 |   kSample = 50000)
 9 | }
10 | \arguments{
11 | \item{dataFl}{A \code{data.table} of data; must be the output of the
12 | \code{\link{PrepData}} function.}
13 | 
14 | \item{dateNm}{Name of column containing the date variable.}
15 | 
16 | \item{buildTm}{Vector identifying the time period for ranking/anomaly detection
17 | (most likely model build period). Allows for a subset of plotting time
18 | period to be used for anomaly detection.
19 | \itemize{
20 |      \item Must be a vector of dates and must be inclusive i.e. buildTm[1]
21 |        <= date <= buildTm[2] will define the time period.
22 |      \item Must be either \code{NULL}, a vector of length 2, or a vector of 
23 |        length 3. 
24 |      \item If \code{NULL}, the entire dataset will be used for 
25 |        ranking/anomaly detection. 
26 |      \item If a vector of length 2, the format of the dates must be
27 |        a character vector in default R date format (e.g. "2017-01-30"). 
28 |      \item If a vector of length 3, the first two columns must contain dates 
29 |        in any strptime format, while the 3rd column contains the strptime 
30 |        format (see \code{\link{strptime}}). 
31 |      \item The following are equivalent ways of selecting
32 |        all of 2014:
33 |      \itemize{
34 |        \item \code{c("2014-01-01","2014-12-31")}
35 |        \item \code{c("01JAN2014","31DEC2014", "\%d\%h\%Y")}
36 |      }
37 | }}
38 | 
39 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 
40 | no weights (all rows receiving weight 1).}
41 | 
42 | \item{kSample}{Either \code{NULL} or a positive integer. If an integer, 
43 | indicates the sample size for both drawing boxplots and ordering numerical
44 | graphs by \eqn{R^2}. When the data is large, setting \code{kSample} to a 
45 | reasonable value (default is 50K) dramatically improves processing speed. 
46 | Therefore, for larger datasets (e.g. > 10 percent system memory), this
47 | parameter should not be set to \code{NULL}, or boxplots may take a very
48 | long time to render. This setting has no impact on the accuracy of time 
49 | series plots on quantiles, mean, SD, and missing and zero rates.}
50 | }
51 | \value{
52 | A vector of variable names sorted by R2 of \code{lm} of the formula
53 |   \code{var} ~ \code{dateNm} (highest R2 to lowest)
54 | }
55 | \description{
56 | Calculates R2 of a linear model of the formula \code{var} ~ \code{dateNm} for
57 | each \code{var} of class \code{nmrcl} and returns a vector of
58 | variable names ordered by highest R2. The linear model can be calculated over
59 | a subset of dates, see details of parameter \code{buildTm}. Non-numerical
60 | variables are returned in alphabetical order after the sorted numerical
61 | variables.
62 | }
63 | \section{License}{
64 |  
65 | Copyright 2017 Capital One Services, LLC Licensed under the
66 | Apache License, Version 2.0 (the "License"); you may not use this file
67 | except in compliance with the License. You may obtain a copy of the 
68 | License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by
69 | applicable law or agreed to in writing, software distributed under the
70 | License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
71 | CONDITIONS OF ANY KIND, either express or implied. See the License for the
72 | specific language governing permissions and limitations under the License.
73 | }
74 | 
75 | \examples{
76 | data(bankData)
77 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 
78 |                      dateGpBp = "quarters")
79 | OrderByR2(bankData, dateNm = "date")
80 | }
81 | \seealso{
82 | Functions depend on this function:
83 |          \code{\link{vlm}}.
84 | 
85 | This function depends on:
86 |          \code{\link{CalcR2}},
87 |          \code{\link{PrepData}}.
88 | }
89 | 


--------------------------------------------------------------------------------
/man/PlotNumVar.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/numerical.R
 3 | \name{PlotNumVar}
 4 | \alias{PlotNumVar}
 5 | \title{Create plots and summary statistics for a numerical variable}
 6 | \usage{
 7 | PlotNumVar(myVar, dataFl, weightNm, dateGp, dateGpBp, skewOpt = NULL,
 8 |   kSample = 50000)
 9 | }
10 | \arguments{
11 | \item{myVar}{The name of the variable to be plotted}
12 | 
13 | \item{dataFl}{A \code{data.table} of data; must be the output of the
14 | \code{\link{PrepData}} function.}
15 | 
16 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 
17 | no weights (all rows receiving weight 1).}
18 | 
19 | \item{dateGp}{Name of the variable that the time series plots should be 
20 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 
21 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
22 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
23 | 
24 | \item{dateGpBp}{Name of variable the boxplots should be grouped by. Same
25 | options as \code{dateGp}. If \code{NULL}, then \code{dateGp} will be used.}
26 | 
27 | \item{skewOpt}{Either a numeric constant or \code{NULL}. Default is 
28 | \code{NULL} (no transformation). If numeric, say 5, then all box plots of
29 | a variable whose skewness exceeds 5 will be on a log10 scale if possible.
30 | Negative input of \code{skewOpt} will be converted to 3.}
31 | 
32 | \item{kSample}{Either \code{NULL} or a positive integer. If an integer, 
33 | indicates the sample size for both drawing boxplots and ordering numerical
34 | graphs by \eqn{R^2}. When the data is large, setting \code{kSample} to a 
35 | reasonable value (default is 50K) dramatically improves processing speed. 
36 | Therefore, for larger datasets (e.g. > 10 percent system memory), this
37 | parameter should not be set to \code{NULL}, or boxplots may take a very
38 | long time to render. This setting has no impact on the accuracy of time 
39 | series plots on quantiles, mean, SD, and missing and zero rates.}
40 | }
41 | \value{
42 | \item{p}{A \code{grob} (i.e., \code{ggplot} grid) object, including a 
43 |     side-by-side boxplot grouped by \code{dateGpBp}, a time series plot of p1,
44 |     p50 (median), and p99 grouped by \code{dateGp}, a time series plot of 
45 |     mean and +-1 SD control limits grouped by \code{dateGp}, and a time 
46 |     series plot of missing and zero rates grouped by \code{dateGp}.}
47 |   \item{numVarSummary}{A \code{data.table}, contains global and over time
48 |     summary statistics, including p1, p25, p50, p75, and p99 quantiles, mean 
49 |     and SD, missing and zero rates.}
50 | }
51 | \description{
52 | Output plots include a boxplot on the left, grouped by a coarser time scale 
53 | (\code{dateGpBp}), and three trace plots on the right, on p1, p50, 
54 | and p99 quantiles, mean and +-1 SD control limits, missing and zero rates,
55 | all grouped by a finer time scale as in \code{dateGp}. In addition to plots, 
56 | a \code{data.table} of summary statistics is generated, on global and
58 | }
59 | \section{License}{
60 | 
61 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
62 | Version 2.0 (the "License"); you may not use this file except in compliance
63 | with the License. You may obtain a copy of the  License at
64 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
65 | or agreed to in writing, software distributed under the License is 
66 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
67 | KIND, either express or implied. See the License for the specific language 
68 | governing permissions and limitations under the License.
69 | }
70 | 
71 | \examples{
72 | data(bankData)
73 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 
74 |                     dateGpBp = "years")
75 | plot(PlotNumVar("balance", bankData, NULL, "months", "years", 
76 |                  skewOpt = NULL, kSample = NULL)$p)
77 | }
78 | \seealso{
79 | Functions depend on this function:
80 |          \code{\link{PlotVar}}.
81 | 
82 | This function depends on:
83 |          \code{\link{SummaryStats}},
84 |          \code{\link{PlotDist}},
85 |          \code{\link{PlotQuantiles}},
86 |          \code{\link{PlotMean}},
87 |          \code{\link{PlotRates}},
88 |          \code{\link{PrepData}}.
89 | }
90 | 


--------------------------------------------------------------------------------
/man/PlotCatVar.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/categorical.R
 3 | \name{PlotCatVar}
 4 | \alias{PlotCatVar}
 5 | \title{Create plots and summary statistics for a categorical variable}
 6 | \usage{
 7 | PlotCatVar(myVar, dataFl, weightNm = NULL, dateNm, dateGp, kCategories = 9,
 8 |   normBy = "time")
 9 | }
10 | \arguments{
11 | \item{myVar}{The name of the variable to be plotted}
12 | 
13 | \item{dataFl}{A \code{data.table} of data; must be the output of the
14 | \code{\link{PrepData}} function.}
15 | 
16 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 
17 | no weights (all rows receiving weight 1).}
18 | 
19 | \item{dateNm}{Name of column containing the date variable.}
20 | 
21 | \item{dateGp}{Name of the variable that the time series plots should be 
22 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 
23 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
24 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
25 | 
26 | \item{kCategories}{If a categorical variable has more than \code{kCategories},
27 | trace plots of only the \code{kCategories} most prevalent categories are
28 | plotted.}
29 | 
30 | \item{normBy}{The normalization factor for rate plots, can be \code{"time"}
31 | or \code{"var"}. If \code{"time"}, then for each time period of 
32 | \code{dateGp}, counts are normalized by the total counts over all 
33 | categories in that time period. This illustrates changes of categories' 
34 | proportions over time. If \code{"var"}, then for each category, its counts 
35 | are normalized by the total counts over time from only this category. This
36 | illustrates changes of categories' volumes over time.}
37 | }
38 | \value{
39 | \item{p}{A \code{grob} (i.e., \code{ggplot} grid) object, including a 
40 |     bar plot, and trace plots of categories' proportions. If the number of 
41 |     categories is larger than \code{kCategories}, then trace plots of only the
42 |     \code{kCategories} most prevalent categories are plotted. For a binary
43 |     variable, only the trace plot of the less prevalent category is plotted.}
44 |   \item{catVarSummary}{A \code{data.table}, contains categories' proportions 
45 |     globally, and over-time in each time period in \code{dateGp}. Each row is
46 |     a category of the categorical (or binary) variable \code{myVar}. The row
47 |     whose \code{category == 'NA'} corresponds to missing. Categories are 
48 |     ordered by global prevalence in a descending order.}
49 | }
50 | \description{
51 | Output plots include a bar plot with categories ordered by global counts,
52 | and trace plots of categories' proportions over time. This function is also
53 | applicable to a binary variable, which is treated as categorical in this 
54 | package. In addition to plots, a \code{data.table} of summary statistics
55 | is generated, on global counts and proportions by category, and proportions 
56 | by category over time.
57 | }
58 | \section{License}{
59 | 
60 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
61 | Version 2.0 (the "License"); you may not use this file except in compliance
62 | with the License. You may obtain a copy of the  License at
63 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
64 | or agreed to in writing, software distributed under the License is 
65 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
66 | KIND, either express or implied. See the License for the specific language 
67 | governing permissions and limitations under the License.
68 | }
69 | 
70 | \examples{
71 | data(bankData)
72 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 
73 |                     dateGpBp = "quarters", weightNm = NULL)
74 | # Single histogram is plotted for job type since there are 12 categories
75 | plot(PlotCatVar(myVar = "job", dataFl = bankData, weightNm =  NULL, 
76 |                      dateNm = "date", dateGp = "months")$p)
77 |                      
78 | plot(PlotCatVar(myVar = "job", dataFl = bankData, weightNm = NULL, 
79 |                      dateNm = "date", dateGp = "months", kCategories = 12)$p)
80 | 
81 | 
82 | ## Binary data is treated as categorical,  and only the less frequent 
83 | ## category is plotted over time.
84 | plot(PlotCatVar(myVar = "default", dataFl = bankData, weightNm = NULL, 
85 |                      dateNm = "date", dateGp = "months")$p)
86 | }
87 | \seealso{
88 | Functions depend on this function:
89 |          \code{\link{PlotVar}},
90 |          \code{\link{PrintPlots}},
91 |          \code{\link{vlm}}.
92 | 
93 | This function depends on:
94 |          \code{\link{PlotBarplot}},
95 |          \code{\link{PlotRatesOverTime}},
96 |          \code{\link{PrepData}}.
97 | }
98 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
 1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC 
 2 | # SPDX-License-Identifier: Apache-2.0 
 3 | # Copyright 2017 Capital One Services, LLC 
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 
 6 | # you may not use this file except in compliance with the License. 
 7 | #
 8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software distributed 
11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
12 | # OF ANY KIND, either express or implied. 
13 | # 
14 | # See the License for the specific language governing permissions and limitations under the License. 
15 | 
16 | 
17 | #' Direct marketing campaigns of a Portuguese banking institution
18 | #'
19 | #' The marketing campaigns were based on phone calls.
20 | #' Often, more than one contact to the same client was required, in order to 
21 | #' assess if the product (bank term deposit) would be ('yes') or not ('no') 
22 | #' subscribed. Records are ordered by date (from May 2008 to November 2010), 
23 | #' similar to data analyzed in Moro et al. [2014].
24 | #'
25 | #'
26 | #' @format A data frame with 45,211 rows and 19 variables:
27 | #' \describe{
28 | #'   \item{age}{Age of the client, numeric.}
29 | #'   \item{job}{Type of job, a categorical variable with the levels: 
30 | #'     \code{'admin.'}, \code{'blue-collar'}, \code{'entrepreneur'},
31 | #'     \code{'housemaid'}, \code{'management'}, \code{'retired'},
32 | #'     \code{'self-employed'}, \code{'services'}, \code{'student'},
33 | #'     \code{'technician'}, \code{'unemployed'}, and \code{'unknown'}.}
34 | #'   \item{marital}{Marital status, a categorical variable with levels: 
35 | #'     \code{'divorced'}, \code{'married'}, \code{'single'}, and \code{'unknown'}.
36 | #'     Note that \code{'divorced'} means either divorced or widowed.}
37 | #'   \item{education}{A categorical variable with levels: \code{'basic.4y'},
38 | #'     \code{'basic.6y'}, \code{'basic.9y'}, \code{'high.school'},
39 | #'     \code{'illiterate'}, \code{'professional.course'}, 
40 | #'     \code{'university.degree'}, and \code{'unknown'}.}
41 | #'   \item{default}{Whether credit is in default, a categorical variable with 
42 | #'     levels: \code{'no'}, \code{'yes'}, and \code{'unknown'}.}
43 | #'   \item{balance}{Account balance, numeric.}
44 | #'   \item{housing}{Whether the client has a housing loan, a categorical variable
45 | #'     with levels: \code{'no'}, \code{'yes'}, and \code{'unknown'}.}
46 | #'   \item{loan}{Whether the client has a personal loan, a categorical variable
47 | #'     with levels: \code{'no'}, \code{'yes'}, and \code{'unknown'}.}
48 | #'   \item{contact}{Type of contact communication, a categorical variable
49 | #'     with levels: \code{'cellular'} and \code{'telephone'}.}
50 | #'   \item{duration}{Last contact duration in seconds, a numeric variable.}
51 | #'   \item{campaign}{Number of contacts performed during this campaign for 
52 | #'     this client, including the last contact; a numeric variable.}
53 | #'   \item{pdays}{Number of days that passed by after the client was last 
54 | #'     contacted from a previous campaign; a numeric variable, where \code{999} 
55 | #'     means that the client was not previously contacted.}
56 | #'   \item{previous}{Number of contacts performed before this campaign for this
57 | #'     client, a numeric variable.}
58 | #'   \item{poutcome}{Outcome of the previous marketing campaign, a categorical
59 | #'     variable with levels: \code{'failure'}, \code{'nonexistent'},
60 | #'     and \code{'success'}.}
61 | #'   \item{y}{Whether the client has subscribed a term deposit, a categorical
62 | #'     variable with levels: \code{'yes'} and \code{'no'}.}
63 | #'   \item{date}{Last contact date.}
64 | #' }
65 | #' @source \url{https://archive.ics.uci.edu/ml/datasets/Bank+Marketing}
66 | #' @source \cite{Lichman, M. (2013). UCI Machine Learning Repository 
67 | #'   [\url{http://archive.ics.uci.edu/ml}]. Irvine, CA: University of California, 
68 | #'   School of Information and Computer Science.}
69 | #' @source \cite{S. Moro, P. Cortez, and P. Rita. (2014) A Data-Driven Approach
70 | #'   to Predict the Success of Bank Telemarketing. Decision Support Systems, 
71 | #'   Elsevier, 62:22-31, June 2014.}
72 | "bankData"
73 | 
74 | #' Labels for bankData
75 | #'
76 | #' A dataset containing the attribute labels also found in \code{\link{bankData}}.
77 | #' This data set is used to illustrate the \code{\link{PrepLabels}} function and
78 | #' other label functionality in the \code{\link{otvPlots}} package in R.
79 | #'
80 | #' @format A data frame with 16 rows and 3 variables:
81 | #' \describe{
82 | #'   \item{V1}{Name of each variable in \code{\link{bankData}}.}
83 | #'   \item{V2}{Label of each variable in \code{\link{bankData}}.}
84 | #'   \item{V3}{A numeric variable, corresponding to the row number.}
85 | #' }
86 | "bankLabels"
87 | 


--------------------------------------------------------------------------------
/R/package_otvPlots.R:
--------------------------------------------------------------------------------
 1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC 
 2 | # SPDX-License-Identifier: Apache-2.0 
 3 | # Copyright 2017 Capital One Services, LLC 
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 
 6 | # you may not use this file except in compliance with the License. 
 7 | #
 8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software distributed 
11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
12 | # OF ANY KIND, either express or implied. 
13 | # 
14 | # See the License for the specific language governing permissions and limitations under the License. 
15 | 
16 | 
17 | #' Over time variable plots for predictive modeling (otvPlots)
18 | #'
19 | #' The \code{otvPlots} package uses \code{data.table} and \code{ggplot2} 
20 | #' packages to efficiently plot time series aggregated from large datasets. 
21 | #' Plots of numerical variables are optionally returned ordered by correlation 
22 | #' with date -- a natural starting point for anomaly detection. Plots are 
23 | #' automatically labeled if a variable dictionary is provided. 
24 | #' 
25 | #' Output files include:
26 | #' \itemize{
27 | #'  \item A PDF file of plots saved as \code{outFl}.pdf, with each individual page 
28 | #'  on one variable. Variables are plotted in the order indicated in the argument
29 | #'  \code{sortVars} or \code{sortFn}. 
30 | #'  For each numerical variable, the output plots include 
31 | #'  \itemize{
32 | #'    \item side-by-side boxplots grouped by \code{dateGpBp} (left), 
33 | #'    \item a trace plot of p1, p50, and p99 percentiles, grouped by \code{dateGp}
34 | #'      (top right), 
35 | #'    \item a trace plot of mean and +-1 SD control limits, grouped by 
36 | #'      \code{dateGp} (middle right), and 
37 | #'    \item a trace plot of missing and zero rates, grouped by \code{dateGp} 
38 | #'      (bottom right).
39 | #'   }
40 | #'   For each categorical variable (including a numerical variable with no more 
41 | #'   than 2 unique levels not including NA), the output plots include 
42 | #'   \itemize{
43 | #'     \item a frequency bar plot (left), and 
44 | #'     \item a grid of trace plots on categories' proportions over time (right). 
45 | #'       If the variable contains more than \code{kCategories} number of 
46 | #'       categories, trace plots of only the largest \code{kCategories} will be 
47 | #'       plotted. If the variable contains only two categories, then only the 
48 | #'       trace plot of the less prevalent category will be plotted.
49 | #'   }
50 | #'   \item CSV file(s) on summary statistics of variables, both globally and over
51 | #'   time aggregated by \code{dateGp}. The order of variables in the CSV files
52 | #'   is the same as in the PDF file. 
53 | #'   \itemize{
54 | #'     \item For numerical variables, number of observations (counts), p1, p25, 
55 | #'     p50, p75, and p99 quantiles, mean, SD, missing and zero rates are saved
56 | #'     as \code{outFl}_numerical_summary.csv.
57 | #'     \item For categorical variables, number of observations (counts) and 
58 | #'     categories' proportions are saved as \code{outFl}_categorical_summary.csv. 
59 | #'     Each row is a category of a categorical (or binary) variable.
60 | #'     The row whose \code{category == 'NA'} corresponds to missing. Categories
61 | #'     among the same variable are ordered by global prevalence in a descending 
62 | #'     order.
63 | #'   }
64 | #' }
65 | #'
66 | #' @seealso Main function: \code{\link{vlm}}.
67 | #' @seealso Selected supporting functions: 
68 | #'          \code{\link{PrepData}}, 
69 | #'          \code{\link{PrepLabels}},
70 | #'          \code{\link{OrderByR2}}.
71 | #'          
72 | #' @section License:
73 | #' Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
74 | #' Version 2.0 (the "License"); you may not use this file except in compliance
75 | #' with the License. You may obtain a copy of the  License at
76 | #' http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
77 | #' or agreed to in writing, software distributed under the License is 
78 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
79 | #' KIND, either express or implied. See the License for the specific language 
80 | #' governing permissions and limitations under the License.
81 | #' @docType package
82 | #' @name otvPlots
83 | #' @import data.table
84 | #' @import ggplot2
85 | #' @importFrom grid grid.draw grid.newpage unit unit.c textGrob gpar
86 | #' @importFrom gridExtra arrangeGrob
87 | #' @importFrom moments skewness
88 | #' @importFrom Hmisc wtd.quantile wtd.mean wtd.var
89 | #' @importFrom stringi stri_trans_general
90 | #' @importFrom scales hue_pal
91 | #' @importFrom grDevices cairo_pdf dev.off
92 | #' @importFrom graphics par
93 | #' @importFrom stats lm.fit lm.wfit quantile sd var
94 | #' @importFrom utils tail
95 | NULL
96 | 
97 | 
98 | 
99 | 


--------------------------------------------------------------------------------
/man/PrintPlots.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/plot_print.R
  3 | \name{PrintPlots}
  4 | \alias{PrintPlots}
  5 | \title{Create a pdf file with plots and compute summary statistics for all variables}
  6 | \usage{
  7 | PrintPlots(outFl, dataFl, sortVars, dateNm, dateGp, dateGpBp, weightNm = NULL,
  8 |   labelFl = NULL, genCSV = TRUE, highlightNms = NULL, skewOpt = NULL,
  9 |   kSample = 50000, fuzzyLabelFn = NULL, kCategories = 9)
 10 | }
 11 | \arguments{
 12 | \item{outFl}{Name of the output file, with no extension names (e.g., "bank"). 
 13 | A pdf file of plots ("bank.pdf"), and two csv files of summary statistics
 14 | ("bank_categorical_summary.csv" and "bank_numerical_summary.csv") will be
 15 | saved to your working directory, unless a path is included in \code{outFl}
 16 | (e.g. "../plots/bank").}
 17 | 
 18 | \item{dataFl}{A \code{data.table} containing at least the following columns:
 19 | \code{myVar}, \code{weightNm}, \code{dateGp}, \code{dateGpBp}; usually an
 20 | output of the \code{\link{PrepData}} function.}
 21 | 
 22 | \item{sortVars}{A character vector of variable names in the order they will
 23 | be plotted.}
 24 | 
 25 | \item{dateNm}{Name of column containing the date variable.}
 26 | 
 27 | \item{dateGp}{Name of the variable that the time series plots should be 
 28 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 
 29 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
 30 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
 31 | 
 32 | \item{dateGpBp}{Name of variable the boxplots should be grouped by. Same
 33 | options as \code{dateGp}. If \code{NULL}, then \code{dateGp} will be used.}
 34 | 
 35 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 
 36 | no weights (all rows receiving weight 1).}
 37 | 
 38 | \item{labelFl}{A \code{data.table} containing variable labels, or \code{NULL}
 39 | for no labels; usually an output of \code{\link{PrepLabels}}.}
 40 | 
 41 | \item{genCSV}{Logical, whether to generate the two csv files of summary
 42 | statistics for numerical and categorical variables.}
 43 | 
 44 | \item{highlightNms}{Either \code{NULL} or a character vector of variables to
 45 | receive a red label. Currently \code{NULL} means all variables will get a 
 46 | black legend. This argument is ignored if \code{labelFl == NULL}.}
 47 | 
 48 | \item{skewOpt}{Either a numeric constant or \code{NULL}. Default is 
 49 | \code{NULL} (no transformation). If numeric, say 5, then all box plots of
 50 | a variable whose skewness exceeds 5 will be on a log10 scale if possible.
 51 | Negative input of \code{skewOpt} will be converted to 3.}
 52 | 
 53 | \item{kSample}{Either \code{NULL} or a positive integer. If an integer, 
 54 | indicates the sample size for both drawing boxplots and ordering numerical
 55 | graphs by \eqn{R^2}. When the data is large, setting \code{kSample} to a 
 56 | reasonable value (default is 50K) dramatically improves processing speed. 
 57 | Therefore, for larger datasets (e.g. > 10 percent system memory), this
 58 | parameter should not be set to \code{NULL}, or boxplots may take a very
 59 | long time to render. This setting has no impact on the accuracy of time 
 60 | series plots on quantiles, mean, SD, and missing and zero rates.}
 61 | 
 62 | \item{fuzzyLabelFn}{Either \code{NULL} or a function of 2 parameters: A label
 63 | file in the format of an output by \code{\link{PrepLabels}} and a string
 64 | giving a variable name. The function should return the label corresponding
 65 | to the variable given by the second parameter. This function should 
 66 | describe how fuzzy matching should be performed to find labels (see example
 67 | below). If \code{NULL}, only exact matches will be returned.}
 68 | 
 69 | \item{kCategories}{If a categorical variable has more than \code{kCategories},
 70 | trace plots of only the \code{kCategories} most prevalent categories are
 71 | plotted.}
 72 | }
 73 | \value{
 74 | A pdf of plots saved to file \code{outFl}.pdf, and if the argument
 75 |   \code{genCSV == TRUE}, also two csv files of summary statistics for 
 76 |   numerical and categorical variables.
 77 | }
 78 | \description{
 79 | Creates plots and outputs results to a letter-sized pdf file, with each 
 80 | individual page containing plots on a single variable in the data. In 
 81 | addition, two summary statistics \code{data.table} are returned, one for
 82 | numerical variables, and one for categorical (and binary) ones.
 83 | }
 84 | \section{License}{
 85 | 
 86 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
 87 | Version 2.0 (the "License"); you may not use this file except in compliance
 88 | with the License. You may obtain a copy of the  License at
 89 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
 90 | or agreed to in writing, software distributed under the License is 
 91 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
 92 | KIND, either express or implied. See the License for the specific language 
 93 | governing permissions and limitations under the License.
 94 | }
 95 | 
 96 | \seealso{
 97 | Functions that depend on this function:
 98 |          \code{\link{vlm}}.
 99 | 
100 | This function depends on:
101 |          \code{\link{PlotVar}},
102 |          \code{\link{PrepData}}.
103 | }
104 | 


--------------------------------------------------------------------------------
/man/PrepData.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/prep.R
  3 | \name{PrepData}
  4 | \alias{PrepData}
  5 | \title{Prepare an input dataset for plotting}
  6 | \usage{
  7 | PrepData(dataFl, dateNm, selectCols = NULL, dropCols = NULL,
  8 |   dateFt = "\%d\%h\%Y", dateGp = NULL, dateGpBp = NULL, weightNm = NULL,
  9 |   varNms = NULL, dropConstants = FALSE, ...)
 10 | }
 11 | \arguments{
 12 | \item{dataFl}{Either the name of an object that can be converted using
 13 | \code{\link[data.table]{as.data.table}} (e.g., a data frame), or a 
 14 | character string containing the name of dataset that can be loaded using 
 15 | \code{\link[data.table]{fread}} (e.g., a csv file). If the dataset is not in 
 16 | your working directory then \code{dataFl} must include the (relative or 
 17 | absolute) path to the file.}
 18 | 
 19 | \item{dateNm}{Name of column containing the date variable.}
 20 | 
 21 | \item{selectCols}{Either \code{NULL}, or a vector of names or indices of 
 22 | variables to read into memory -- must include \code{dateNm}, 
 23 | \code{weightNm} (if not \code{NULL}) and all variables to be plotted. If
 24 | both \code{selectCols} and \code{dropCols} are \code{NULL}, then all
 25 | variables will be read in.}
 26 | 
 27 | \item{dropCols}{Either \code{NULL}, or a vector of variable names or indices
 28 | of variables not to read into memory. If both \code{selectCols} and 
 29 | \code{dropCols} are \code{NULL}, then all variables will be read in.}
 30 | 
 31 | \item{dateFt}{\code{\link{strptime}} format of date variable. The default is SAS
 32 | format \code{"\%d\%h\%Y"}. Input data in the R date format 
 33 | \code{"\%Y-\%m-\%d"} will also be detected. Both formats are
 34 | parsed automatically.}
 35 | 
 36 | \item{dateGp}{Name of the variable that the time series plots should be 
 37 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 
 38 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
 39 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
 40 | 
 41 | \item{dateGpBp}{Name of variable the boxplots should be grouped by. Same
 42 | options as \code{dateGp}. If \code{NULL}, then \code{dateGp} will be used.}
 43 | 
 44 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 
 45 | no weights (all rows receiving weight 1).}
 46 | 
 47 | \item{varNms}{Either \code{NULL} or a vector of names or indices of variables
 48 | to be plotted. If \code{NULL}, will default to all columns which are not 
 49 | \code{dateNm} or \code{weightNm}. Can also be a vector of indices of the 
 50 | column names, after \code{dropCols} or \code{selectCols} have been applied,
 51 | if applicable, and not including \code{dateGp}, \code{dateGpBp} 
 52 | (which will be added to the \code{dataFl} by the function 
 53 | \code{\link{PrepData}}).}
 54 | 
 55 | \item{dropConstants}{Logical, indicates whether or not constant (all
 56 | duplicated or NA) variables should be dropped from \code{dataFl} prior to
 57 | plotting.}
 58 | 
 59 | \item{...}{Additional parameters to be passed to 
 60 | \code{\link[data.table]{fread}}.}
 61 | }
 62 | \value{
 63 | A \code{data.table} object, formatted for use by all plotting 
 64 | functions in this package \code{\link{otvPlots}}, including the main function
 65 | \code{\link{vlm}}, and the individual variable plotting function 
 66 | \code{\link{PlotVar}}.
 67 | }
 68 | \description{
 69 | This function prepares an input dataset for use by all plotting functions
 70 | in this package, including the main function \code{\link{vlm}}. 
 71 | The input data \code{dataFl} must contain, at a minimum, a date column 
 72 | \code{dateNm} and a variable to be plotted. \code{dataFl} will be 
 73 | converted to a \code{data.table} class, and all changes are made to it by 
 74 | reference.
 75 | }
 76 | \details{
 77 | If weights (\code{weightNm}) are provided, they are normalized so that
 78 | their sum equals the total sample size, and the weights are used in all 
 79 | summary statistics calculations and plotting.
 80 | }
 81 | \section{License}{
 82 | 
 83 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
 84 | Version 2.0 (the "License"); you may not use this file except in compliance
 85 | with the License. You may obtain a copy of the  License at
 86 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
 87 | or agreed to in writing, software distributed under the License is 
 88 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
 89 | KIND, either express or implied. See the License for the specific language 
 90 | governing permissions and limitations under the License.
 91 | }
 92 | 
 93 | \examples{
 94 | ## Use the bankData dataset in this package
 95 | data(bankData)
 96 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 
 97 |                      dateGpBp = "quarters")
 98 | ## Columns have been assigned a plotting class (nmrcl/ctgrl)
 99 | str(bankData) 
100 | }
101 | \seealso{
102 | Functions that depend on this function:
103 |          \code{\link{PlotBarplot}},
104 |          \code{\link{PlotRatesOverTime}},
105 |          \code{\link{PlotCatVar}},
106 |          \code{\link{SummaryStats}},
107 |          \code{\link{PlotMean}},
108 |          \code{\link{PlotQuantiles}},
109 |          \code{\link{PlotRates}},
110 |          \code{\link{PlotDist}},
111 |          \code{\link{PlotNumVar}},
112 |          \code{\link{PlotVar}},
113 |          \code{\link{PrintPlots}},
114 |          \code{\link{CalcR2}},
115 |          \code{\link{OrderByR2}},
116 |          \code{\link{vlm}}.
117 | }
118 | 


--------------------------------------------------------------------------------
/tests/testthat/rawData_bigint.csv:
--------------------------------------------------------------------------------
  1 | age,job,marital,balance,default,weight,date,bigint
  2 | 32,blue-collar,single,23,0,0.005102041,6/5/08,2.3E+12
  3 | 46,management,single,-246,0,0.010204082,6/5/08,-2.46E+13
  4 | 32,admin.,married,0,0,0.010204082,6/5/08,0
  5 | 60,retired,married,100,0,0.010204082,6/5/08,1E+13
  6 | 60,admin.,married,39,0,0.010204082,7/5/08,3.9E+12
  7 | 58,retired,married,96,0,0.005102041,7/5/08,9.6E+12
  8 | 35,blue-collar,single,12223,0,0.005102041,7/5/08,1.2223E+15
  9 | 55,services,divorced,1,1,0.010204082,7/5/08,1E+11
 10 | 45,admin.,single,13,0,0.020408163,8/5/08,1.3E+12
 11 | 47,blue-collar,married,306,0,0.005102041,8/5/08,3.06E+13
 12 | 45,admin.,single,206,0,0.010204082,8/5/08,2.06E+13
 13 | 60,retired,married,81,0,0.005102041,8/5/08,8.1E+12
 14 | 28,management,single,447,0,0.015306122,9/5/08,4.47E+13
 15 | 47,blue-collar,married,1506,0,0.015306122,10/5/08,1.506E+14
 16 | 35,management,married,231,0,0.010204082,10/5/08,2.31E+13
 17 | 40,retired,married,0,0,0.015306122,10/5/08,0
 18 | 56,management,married,779,0,0.005102041,11/5/08,7.79E+13
 19 | 25,services,married,50,0,0.010204082,11/5/08,5E+12
 20 | 29,management,single,0,0,0.005102041,11/5/08,0
 21 | 36,admin.,divorced,506,0,0.015306122,12/5/08,5.06E+13
 22 | 55,technician,divorced,0,0,0.005102041,12/5/08,0
 23 | 57,blue-collar,married,52,0,0.015306122,13-05-2008,5.2E+12
 24 | 42,admin.,single,-76,0,0.010204082,13-05-2008,-7.6E+12
 25 | 24,technician,single,-103,0,0.005102041,13-05-2008,-1.03E+13
 26 | 53,technician,divorced,989,0,0.010204082,13-05-2008,9.89E+13
 27 | 59,admin.,married,2343,0,0.005102041,13-05-2008,2.343E+14
 28 | 51,blue-collar,married,173,0,0.005102041,13-05-2008,1.73E+13
 29 | 44,admin.,married,-372,0,0.015306122,14-05-2008,-3.72E+13
 30 | 55,services,divorced,91,0,0.010204082,14-05-2008,9.1E+12
 31 | 49,services,divorced,0,0,0.010204082,14-05-2008,0
 32 | 42,management,single,50,0,0.010204082,14-05-2008,5E+12
 33 | 58,retired,married,121,0,0.015306122,15-05-2008,1.21E+13
 34 | 36,technician,single,265,0,0.015306122,15-05-2008,2.65E+13
 35 | 49,management,married,378,0,0.015306122,15-05-2008,3.78E+13
 36 | 54,management,married,282,0,0.010204082,15-05-2008,2.82E+13
 37 | 44,blue-collar,married,582,0,0.005102041,15-05-2008,5.82E+13
 38 | 57,entrepreneur,divorced,-37,0,0.010204082,16-05-2008,-3.7E+12
 39 | 60,retired,married,60,0,0.005102041,17-05-2008,6E+12
 40 | 38,management,single,424,0,0.010204082,17-05-2008,4.24E+13
 41 | 40,blue-collar,single,24,0,0.015306122,17-05-2008,2.4E+12
 42 | 46,management,divorced,16,0,0.005102041,18-05-2008,1.6E+12
 43 | 46,management,married,229,0,0.015306122,18-05-2008,2.29E+13
 44 | 60,blue-collar,married,104,0,0.010204082,20-05-2008,1.04E+13
 45 | 46,services,married,179,0,0.010204082,20-05-2008,1.79E+13
 46 | 53,technician,married,6,0,0.015306122,21-05-2008,6E+11
 47 | 54,retired,married,529,0,0.010204082,21-05-2008,5.29E+13
 48 | 58,management,married,2143,0,0.005102041,22-05-2008,2.143E+14
 49 | 43,technician,single,593,0,0.005102041,22-05-2008,5.93E+13
 50 | 57,technician,divorced,63,0,0.005102041,22-05-2008,6.3E+12
 51 | 42,entrepreneur,divorced,2,1,0.010204082,23-05-2008,2E+11
 52 | 51,retired,married,229,0,0.005102041,23-05-2008,2.29E+13
 53 | 59,blue-collar,married,0,0,0.005102041,23-05-2008,0
 54 | 31,services,married,25,0,0.015306122,23-05-2008,2.5E+12
 55 | 55,blue-collar,married,383,0,0.010204082,23-05-2008,3.83E+13
 56 | 47,services,divorced,164,0,0.010204082,24-05-2008,1.64E+13
 57 | 46,self-employed,married,137,0,0.010204082,24-05-2008,1.37E+13
 58 | 48,management,divorced,-244,0,0.025510204,25-05-2008,-2.44E+13
 59 | 49,blue-collar,married,154,0,0.010204082,25-05-2008,1.54E+13
 60 | 59,management,divorced,59,0,0.005102041,25-05-2008,5.9E+12
 61 | 25,blue-collar,married,-7,0,0.010204082,26-05-2008,-7E+11
 62 | 50,management,married,49,0,0.010204082,26-05-2008,4.9E+12
 63 | 58,self-employed,married,-364,0,0.005102041,26-05-2008,-3.64E+13
 64 | 57,retired,married,486,0,0.015306122,26-05-2008,4.86E+13
 65 | 33,unknown,single,1,0,0.025510204,27-05-2008,1E+11
 66 | 57,services,married,162,0,0.020408163,27-05-2008,1.62E+13
 67 | 39,management,single,255,0,0.005102041,27-05-2008,2.55E+13
 68 | 57,technician,married,839,0,0.010204082,27-05-2008,8.39E+13
 69 | 54,blue-collar,married,1291,0,0.005102041,27-05-2008,1.291E+14
 70 | 32,management,married,0,0,0.010204082,27-05-2008,0
 71 | 55,blue-collar,married,23,0,0.005102041,27-05-2008,2.3E+12
 72 | 33,entrepreneur,married,2,0,0.005102041,28-05-2008,2E+11
 73 | 58,technician,married,71,0,0.015306122,28-05-2008,7.1E+12
 74 | 51,management,married,10635,0,0.005102041,28-05-2008,1.0635E+15
 75 | 36,admin.,single,-171,0,0.020408163,28-05-2008,-1.71E+13
 76 | 38,entrepreneur,single,243,0,0.010204082,28-05-2008,2.43E+13
 77 | 55,technician,married,1205,0,0.010204082,28-05-2008,1.205E+14
 78 | 41,admin.,divorced,270,0,0.005102041,29-05-2008,2.7E+13
 79 | 33,services,married,0,0,0.010204082,29-05-2008,0
 80 | 28,blue-collar,married,723,0,0.005102041,29-05-2008,7.23E+13
 81 | 57,blue-collar,married,5935,0,0.010204082,29-05-2008,5.935E+14
 82 | 44,services,divorced,2586,0,0.005102041,30-05-2008,2.586E+14
 83 | 56,admin.,married,45,0,0.010204082,30-05-2008,4.5E+12
 84 | 30,technician,married,152,0,0.015306122,30-05-2008,1.52E+13
 85 | 42,technician,single,690,0,0.010204082,31-05-2008,6.9E+13
 86 | 41,technician,married,1270,0,0.015306122,31-05-2008,1.27E+14
 87 | 36,management,married,101,0,0.005102041,31-05-2008,1.01E+13
 88 | 29,admin.,single,390,0,0.005102041,1/6/08,3.9E+13
 89 | 44,technician,married,0,0,0.015306122,1/6/08,0
 90 | 33,services,married,790,0,0.005102041,1/6/08,7.9E+13
 91 | 60,admin.,married,290,0,0.010204082,1/6/08,2.9E+13
 92 | 57,blue-collar,married,249,0,0.010204082,2/6/08,2.49E+13
 93 | 53,technician,married,384,0,0.005102041,2/6/08,3.84E+13
 94 | 60,blue-collar,married,54,0,0.005102041,2/6/08,5.4E+12
 95 | 37,admin.,single,0,0,0.010204082,3/6/08,0
 96 | 43,technician,married,1937,0,0.010204082,3/6/08,1.937E+14
 97 | 44,technician,single,29,0,0.005102041,4/6/08,2.9E+12
 98 | 52,entrepreneur,married,113,0,0.015306122,4/6/08,1.13E+13
 99 | 53,technician,married,-3,0,0.010204082,4/6/08,-3E+11
100 | 51,management,married,6530,0,0.005102041,4/6/08,6.53E+14
101 | 39,technician,married,0,0,0.015306122,4/6/08,0


--------------------------------------------------------------------------------
/tests/testthat/test_OrderByR2.R:
--------------------------------------------------------------------------------
  1 | library(otvPlots)
  2 | context("Order by R-squared")
  3 | # Shared fixture: load() provides the `testData` object prepared below.
  4 | load("../testthat/testData.rda")
  5 | testData <- PrepData(dataFl = testData, dateNm = "date", weightNm = "weight")
  6 | 
  7 | 
  8 | testOrder <- function(out, testData){
  9 | 	# Shared expectations on `out`, the vector of variable names returned by
 10 | 	# OrderByR2(), checked against the prepared data `testData`.
 11 | 	cntnsVars <- names(Filter(is.nmrcl, testData))
 12 | 	dscrtVars <- names(Filter(is.ctgrl, testData))
 13 | 	# number of variables in output equals number of classed variables in input
 14 | 	expect_equal(length(out), length(cntnsVars) + length(dscrtVars))
 15 | 	cntnsOrder <- match(cntnsVars, out)
 16 | 	dscrtOrder <- match(dscrtVars, out)
 17 | 	# all numeric variables appear before discrete ones
 18 | 	expect_lt(max(cntnsOrder), min(dscrtOrder))
 19 | 	
 20 | 	# discrete variables keep their data order; seq_along() stays empty when
 21 | 	# there are none, whereas 1:length(x) would yield c(1, 0)
 22 | 	expect_equal(order(dscrtOrder), seq_along(dscrtOrder))
 23 | }
 24 | 
 25 | 
 26 | 
 27 | test_that("OrderByR2 gives expected variable order", {
 28 | 	ordered <- OrderByR2(dataFl = testData, dateNm = "date", buildTm = NULL,
 29 | 		weightNm = "weight", kSample = NULL)
 30 | 	# categorical variables keep their order and follow the numeric ones
 31 | 	testOrder(ordered, testData)
 32 | 	# the first-ranked variable must have a larger R^2 than the second
 33 | 	r2First <- CalcR2(ordered[1], dataFl = testData, dateNm = "date",
 34 | 		weightNm = "weight", imputeValue = NULL)
 35 | 	r2Second <- CalcR2(ordered[2], dataFl = testData, dateNm = "date",
 36 | 		weightNm = "weight", imputeValue = NULL)
 37 | 	expect_gt(r2First, r2Second)
 38 | })
 39 | 
 40 | 
 41 | test_that("OrderByR2 works for buildTm in date range", {
 42 | 	buildTm <- range(testData[, date][30:70])
 43 | 	ordered <- OrderByR2(dataFl = testData, dateNm = "date", buildTm = buildTm,
 44 | 		weightNm = "weight", kSample = NULL)
 45 | 	# categorical variables keep their order and follow the numeric ones
 46 | 	testOrder(ordered, testData)
 47 | 	# R^2 is recomputed on the rows whose date falls inside the build window
 48 | 	inWindow <- testData[date >= buildTm[1] & date <= buildTm[2]]
 49 | 	r2First <- CalcR2(ordered[1], dataFl = inWindow, dateNm = "date",
 50 | 		weightNm = "weight", imputeValue = NULL)
 51 | 	r2Second <- CalcR2(ordered[2], dataFl = inWindow, dateNm = "date",
 52 | 		weightNm = "weight", imputeValue = NULL)
 53 | 	expect_gt(r2First, r2Second)
 54 | })
 55 | 
 56 | 
 57 | test_that("OrderByR2 works for buildTm outside date range", {
 58 | 	# window bounds are shifted 15 days past the dates they were taken from
 59 | 	buildTm <- range(testData[, date][30:100] + 15)
 60 | 	ordered <- OrderByR2(dataFl = testData, dateNm = "date", buildTm = buildTm,
 61 | 		weightNm = "weight", kSample = NULL)
 62 | 	# categorical variables keep their order and follow the numeric ones
 63 | 	testOrder(ordered, testData)
 64 | 	inWindow <- testData[date >= buildTm[1] & date <= buildTm[2]]
 65 | 	r2First <- CalcR2(ordered[1], dataFl = inWindow, dateNm = "date",
 66 | 		weightNm = "weight", imputeValue = NULL)
 67 | 	r2Second <- CalcR2(ordered[2], dataFl = inWindow, dateNm = "date",
 68 | 		weightNm = "weight", imputeValue = NULL)
 69 | 	expect_gt(r2First, r2Second)
 70 | })
 71 | 
 72 | 
 73 | test_that("OrderByR2 works for kSample < N, with R2 being calculated on reduced sample", {
 74 | 	set.seed(5555)
 75 | 	ordered <- OrderByR2(dataFl = testData, dateNm = "date", buildTm = NULL,
 76 | 		weightNm = "weight", kSample = 50)
 77 | 	# categorical variables keep their order and follow the numeric ones
 78 | 	testOrder(ordered, testData)
 79 | 	# re-seed before each call so both subsamples come from the same RNG state
 80 | 	set.seed(5555)
 81 | 	r2First <- CalcR2(ordered[1], dataFl = testData[sample(.N, min(.N, 50))],
 82 | 		dateNm = "date", weightNm = "weight", imputeValue = NULL)
 83 | 	set.seed(5555)
 84 | 	r2Second <- CalcR2(ordered[2], dataFl = testData[sample(.N, min(.N, 50))],
 85 | 		dateNm = "date", weightNm = "weight", imputeValue = NULL)
 86 | 	expect_gt(r2First, r2Second)
 87 | })
 88 | 
 89 | 
 90 | test_that("OrderByR2 works for kSample > N", {
 91 | 	ordered <- OrderByR2(dataFl = testData, dateNm = "date", buildTm = NULL,
 92 | 		weightNm = "weight", kSample = 200)
 93 | 	# categorical variables keep their order and follow the numeric ones
 94 | 	testOrder(ordered, testData)
 95 | 	# the first-ranked variable must have a larger R^2 than the second
 96 | 	r2First <- CalcR2(ordered[1], dataFl = testData, dateNm = "date",
 97 | 		weightNm = "weight", imputeValue = NULL)
 98 | 	r2Second <- CalcR2(ordered[2], dataFl = testData, dateNm = "date",
 99 | 		weightNm = "weight", imputeValue = NULL)
100 | 	expect_gt(r2First, r2Second)
101 | })
102 | 
103 | 
104 | test_that("OrderByR2 works when kSample is too small to calculate R2, with numeric variables returned in 
105 | 	 order as given", {
106 | 	out <- OrderByR2(dataFl = testData, dateNm = "date", buildTm = NULL,
107 | 		weightNm = "weight", kSample = 2)
108 | 	# testing order of categorical, and order of numeric relative to discrete
109 | 	testOrder(out, testData)
110 | 	
111 | 	# all continuous variables appear in data order; seq_along() is safe for an empty vector
112 | 	cntnsVars <- names(Filter(is.nmrcl, testData))
113 | 	cntnsOrder <- match(cntnsVars, out)
114 | 	expect_equal(order(cntnsOrder), seq_along(cntnsOrder))
115 | })
116 | 
117 | test_that("OrderByR2 works when weight is null", {
118 | 	ordered <- OrderByR2(dataFl = testData, dateNm = "date", buildTm = NULL,
119 | 		weightNm = NULL, kSample = NULL)
120 | 	# structural checks first, then the two top-ranked variables by unweighted R^2
121 | 	testOrder(ordered, testData)
122 | 	r2First <- CalcR2(ordered[1], dataFl = testData, dateNm = "date",
123 | 		weightNm = NULL, imputeValue = NULL)
124 | 	r2Second <- CalcR2(ordered[2], dataFl = testData, dateNm = "date",
125 | 		weightNm = NULL, imputeValue = NULL)
126 | 	expect_gt(r2First, r2Second)
127 | })
128 | 
129 | 
130 |  test_that("OrderByR2 gives warning when weight/date contains missing", {
131 | 	# copy first: `:=` updates by reference and would put NAs into the shared fixture
132 | 	naData <- data.table::copy(testData)
133 | 	set.seed(5555) # fixed seed so the injected NA positions are reproducible
134 | 	naData[sample(.N, min(.N, 10)), weight := NA]
135 | 	naData[sample(.N, min(.N, 10)), date := NA]
136 | 	# testing for warning that weight column contains missings
137 | 	expect_warning(OrderByR2(dataFl = naData, dateNm = "date", buildTm = NULL, 
138 | 		weightNm = "weight", kSample = NULL), "Weights column")
139 | 	# testing for warning that date column contains missings
140 | 	expect_warning(OrderByR2(dataFl = naData, dateNm = "date", buildTm = NULL, 
141 | 		weightNm = "weight", kSample = NULL), "Date column")
142 | })
143 | 
144 | 
145 | 
146 | 
147 | 


--------------------------------------------------------------------------------
/man/PlotVar.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/plot_print.R
  3 | \name{PlotVar}
  4 | \alias{PlotVar}
  5 | \title{Create over time variable plots and summary statistics for one variable}
  6 | \usage{
  7 | PlotVar(dataFl, myVar, weightNm, dateNm, dateGp, dateGpBp = NULL,
  8 |   labelFl = NULL, highlightNms = NULL, skewOpt = NULL, kSample = 50000,
  9 |   fuzzyLabelFn = NULL, kCategories = 9)
 10 | }
 11 | \arguments{
 12 | \item{dataFl}{A \code{data.table} containing at least the following columns:
 13 | \code{myVar}, \code{weightNm}, \code{dateGp}, \code{dateGpBp}; usually an
 14 | output of the \code{\link{PrepData}} function.}
 15 | 
 16 | \item{myVar}{Name of the variable to be plotted.}
 17 | 
 18 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 
 19 | no weights (all rows receiving weight 1).}
 20 | 
 21 | \item{dateNm}{Name of column containing the date variable.}
 22 | 
 23 | \item{dateGp}{Name of the variable that the time series plots should be 
 24 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 
 25 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
 26 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
 27 | 
 28 | \item{dateGpBp}{Name of variable the boxplots should be grouped by. Same
 29 | options as \code{dateGp}. If \code{NULL}, then \code{dateGp} will be used.}
 30 | 
 31 | \item{labelFl}{A \code{data.table} containing variable labels, or \code{NULL}
 32 | for no labels; usually an output of \code{\link{PrepLabels}}.}
 33 | 
 34 | \item{highlightNms}{Either \code{NULL} or a character vector of variables to
 35 | receive a red label. Currently \code{NULL} means all variables will get a 
 36 | black legend. This argument is ignored if \code{labelFl == NULL}.}
 37 | 
 38 | \item{skewOpt}{Either a numeric constant or \code{NULL}. Default is 
 39 | \code{NULL} (no transformation). If numeric, say 5, then all box plots of
 40 | a variable whose skewness exceeds 5 will be on a log10 scale if possible.
 41 | Negative input of \code{skewOpt} will be converted to 3.}
 42 | 
 43 | \item{kSample}{Either \code{NULL} or a positive integer. If an integer, 
 44 | indicates the sample size for both drawing boxplots and ordering numerical
 45 | graphs by \eqn{R^2}. When the data is large, setting \code{kSample} to a 
 46 | reasonable value (default is 50K) dramatically improves processing speed. 
 47 | Therefore, for larger datasets (e.g. > 10 percent system memory), this
 48 | parameter should not be set to \code{NULL}, or boxplots may take a very
 49 | long time to render. This setting has no impact on the accuracy of time 
 50 | series plots on quantiles, mean, SD, and missing and zero rates.}
 51 | 
 52 | \item{fuzzyLabelFn}{Either \code{NULL} or a function of 2 parameters: A label
 53 | file in the format of an output by \code{\link{PrepLabels}} and a string
 54 | giving a variable name. The function should return the label corresponding
 55 | to the variable given by the second parameter. This function should 
 56 | describe how fuzzy matching should be performed to find labels (see example
 57 | below). If \code{NULL}, only exact matches will be returned.}
 58 | 
 59 | \item{kCategories}{If a categorical variable has more than \code{kCategories},
 60 | trace plots of only the \code{kCategories} most prevalent categories are
 61 | plotted.}
 62 | }
 63 | \value{
 64 | \item{p}{A \code{grob} (i.e., \code{ggplot} grid) object. See the output
 65 |     \code{p} of the function \code{\link{PlotNumVar}} or
 66 |     \code{\link{PlotCatVar}} for details.}
 67 |   \item{varSummary}{A \code{data.table} of summary statistics. See the output
 68 |     \code{numVarSummary} of the function \code{\link{PlotNumVar}}, or the 
 69 |     output \code{catVarSummary} of the function \code{\link{PlotCatVar}} for 
 70 |     details.}
 71 |   \item{varType}{Indicator of the variable's type, either \code{"nmrcl"} or 
 72 |     \code{"ctgrl"}.}
 73 | }
 74 | \description{
 75 | For a numerical variable, the output includes 
 76 | \itemize{
 77 | \item side-by-side boxplots grouped by \code{dateGpBp} (left), 
 78 | \item a trace plot of p1, p50, and p99 percentiles, grouped by \code{dateGp}
 79 |   (top right), 
 80 | \item a trace plot of mean and +-1 SD control limits, grouped by 
 81 |   \code{dateGp} (middle right), and 
 82 | \item a trace plot of missing and zero rates, grouped by \code{dateGp} 
 83 |   (bottom right).
 84 | }
 85 | For a categorical variable (including a numerical variable with no more than 2
 86 | unique levels not including NA), the output includes 
 87 | \itemize{
 88 | \item a frequency bar plot (left), and 
 89 | \item a grid of trace plots on categories' proportions over time (right). 
 90 | If the variable contains more than \code{kCategories} number of categories, 
 91 | trace plots of only the largest \code{kCategories} will be plotted. 
 92 | }
 93 | In addition to plots, a \code{data.table} of summary statistics is generated,
 94 | covering both global and over-time summary statistics.
 95 | }
 96 | \section{License}{
 97 |  Copyright 2017 Capital One Services, LLC Licensed under the
 98 | Apache License, Version 2.0 (the "License"); you may not use this file
 99 | except in compliance with the License. You may obtain a copy of the  License
100 | at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable
101 | law or agreed to in writing, software distributed under the License is
102 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
103 | KIND, either express or implied. See the License for the specific language
104 | governing permissions and limitations under the License.
105 | }
106 | 
107 | \examples{
108 | data(bankData)
109 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 
110 |                      dateGpBp = "quarters")
111 | data(bankLabels)
112 | bankLabels <- PrepLabels(bankLabels)
113 | 
114 | ## PlotVar will treat numerical and categorical data differently. 
115 | ## Binary data is always treated as categorical.
116 | plot(PlotVar(bankData, myVar = "duration", weightNm = NULL, dateNm = "date", 
117 |      dateGp = "months", dateGpBp =  "quarters", labelFl = bankLabels)$p)
118 | plot(PlotVar(bankData, myVar = "job", weightNm = NULL, dateNm = "date", 
119 |      dateGp = "months", dateGpBp =  "quarters", labelFl = bankLabels)$p)
120 | plot(PlotVar(bankData, myVar = "loan", weightNm = NULL, dateNm = "date", 
121 |      dateGp = "months", dateGpBp =  "quarters", labelFl = bankLabels)$p)
122 | 
123 | }
124 | \seealso{
125 | Functions that depend on this function:
126 |          \code{\link{PrintPlots}}.
127 | 
128 | This function depends on:
129 |          \code{\link{PlotCatVar}},
130 |          \code{\link{PlotNumVar}},
131 |          \code{\link{PrepData}}.
132 | }
133 | 


--------------------------------------------------------------------------------
/tests/testthat/rawData.csv:
--------------------------------------------------------------------------------
  1 | "age","job","marital","balance","default","weight","date"
  2 | 32,"blue-collar","single",23,0,0.00510204081632653,"06-05-2008"
  3 | 46,"management","single",-246,0,0.0102040816326531,"06-05-2008"
  4 | 32,"admin.","married",0,0,0.0102040816326531,"06-05-2008"
  5 | 60,"retired","married",100,0,0.0102040816326531,"06-05-2008"
  6 | 60,"admin.","married",39,0,0.0102040816326531,"07-05-2008"
  7 | 58,"retired","married",96,0,0.00510204081632653,"07-05-2008"
  8 | 35,"blue-collar","single",12223,0,0.00510204081632653,"07-05-2008"
  9 | 55,"services","divorced",1,1,0.0102040816326531,"07-05-2008"
 10 | 45,"admin.","single",13,0,0.0204081632653061,"08-05-2008"
 11 | 47,"blue-collar","married",306,0,0.00510204081632653,"08-05-2008"
 12 | 45,"admin.","single",206,0,0.0102040816326531,"08-05-2008"
 13 | 60,"retired","married",81,0,0.00510204081632653,"08-05-2008"
 14 | 28,"management","single",447,0,0.0153061224489796,"09-05-2008"
 15 | 47,"blue-collar","married",1506,0,0.0153061224489796,"10-05-2008"
 16 | 35,"management","married",231,0,0.0102040816326531,"10-05-2008"
 17 | 40,"retired","married",0,0,0.0153061224489796,"10-05-2008"
 18 | 56,"management","married",779,0,0.00510204081632653,"11-05-2008"
 19 | 25,"services","married",50,0,0.0102040816326531,"11-05-2008"
 20 | 29,"management","single",0,0,0.00510204081632653,"11-05-2008"
 21 | 36,"admin.","divorced",506,0,0.0153061224489796,"12-05-2008"
 22 | 55,"technician","divorced",0,0,0.00510204081632653,"12-05-2008"
 23 | 57,"blue-collar","married",52,0,0.0153061224489796,"13-05-2008"
 24 | 42,"admin.","single",-76,0,0.0102040816326531,"13-05-2008"
 25 | 24,"technician","single",-103,0,0.00510204081632653,"13-05-2008"
 26 | 53,"technician","divorced",989,0,0.0102040816326531,"13-05-2008"
 27 | 59,"admin.","married",2343,0,0.00510204081632653,"13-05-2008"
 28 | 51,"blue-collar","married",173,0,0.00510204081632653,"13-05-2008"
 29 | 44,"admin.","married",-372,0,0.0153061224489796,"14-05-2008"
 30 | 55,"services","divorced",91,0,0.0102040816326531,"14-05-2008"
 31 | 49,"services","divorced",0,0,0.0102040816326531,"14-05-2008"
 32 | 42,"management","single",50,0,0.0102040816326531,"14-05-2008"
 33 | 58,"retired","married",121,0,0.0153061224489796,"15-05-2008"
 34 | 36,"technician","single",265,0,0.0153061224489796,"15-05-2008"
 35 | 49,"management","married",378,0,0.0153061224489796,"15-05-2008"
 36 | 54,"management","married",282,0,0.0102040816326531,"15-05-2008"
 37 | 44,"blue-collar","married",582,0,0.00510204081632653,"15-05-2008"
 38 | 57,"entrepreneur","divorced",-37,0,0.0102040816326531,"16-05-2008"
 39 | 60,"retired","married",60,0,0.00510204081632653,"17-05-2008"
 40 | 38,"management","single",424,0,0.0102040816326531,"17-05-2008"
 41 | 40,"blue-collar","single",24,0,0.0153061224489796,"17-05-2008"
 42 | 46,"management","divorced",16,0,0.00510204081632653,"18-05-2008"
 43 | 46,"management","married",229,0,0.0153061224489796,"18-05-2008"
 44 | 60,"blue-collar","married",104,0,0.0102040816326531,"20-05-2008"
 45 | 46,"services","married",179,0,0.0102040816326531,"20-05-2008"
 46 | 53,"technician","married",6,0,0.0153061224489796,"21-05-2008"
 47 | 54,"retired","married",529,0,0.0102040816326531,"21-05-2008"
 48 | 58,"management","married",2143,0,0.00510204081632653,"22-05-2008"
 49 | 43,"technician","single",593,0,0.00510204081632653,"22-05-2008"
 50 | 57,"technician","divorced",63,0,0.00510204081632653,"22-05-2008"
 51 | 42,"entrepreneur","divorced",2,1,0.0102040816326531,"23-05-2008"
 52 | 51,"retired","married",229,0,0.00510204081632653,"23-05-2008"
 53 | 59,"blue-collar","married",0,0,0.00510204081632653,"23-05-2008"
 54 | 31,"services","married",25,0,0.0153061224489796,"23-05-2008"
 55 | 55,"blue-collar","married",383,0,0.0102040816326531,"23-05-2008"
 56 | 47,"services","divorced",164,0,0.0102040816326531,"24-05-2008"
 57 | 46,"self-employed","married",137,0,0.0102040816326531,"24-05-2008"
 58 | 48,"management","divorced",-244,0,0.0255102040816327,"25-05-2008"
 59 | 49,"blue-collar","married",154,0,0.0102040816326531,"25-05-2008"
 60 | 59,"management","divorced",59,0,0.00510204081632653,"25-05-2008"
 61 | 25,"blue-collar","married",-7,0,0.0102040816326531,"26-05-2008"
 62 | 50,"management","married",49,0,0.0102040816326531,"26-05-2008"
 63 | 58,"self-employed","married",-364,0,0.00510204081632653,"26-05-2008"
 64 | 57,"retired","married",486,0,0.0153061224489796,"26-05-2008"
 65 | 33,"unknown","single",1,0,0.0255102040816327,"27-05-2008"
 66 | 57,"services","married",162,0,0.0204081632653061,"27-05-2008"
 67 | 39,"management","single",255,0,0.00510204081632653,"27-05-2008"
 68 | 57,"technician","married",839,0,0.0102040816326531,"27-05-2008"
 69 | 54,"blue-collar","married",1291,0,0.00510204081632653,"27-05-2008"
 70 | 32,"management","married",0,0,0.0102040816326531,"27-05-2008"
 71 | 55,"blue-collar","married",23,0,0.00510204081632653,"27-05-2008"
 72 | 33,"entrepreneur","married",2,0,0.00510204081632653,"28-05-2008"
 73 | 58,"technician","married",71,0,0.0153061224489796,"28-05-2008"
 74 | 51,"management","married",10635,0,0.00510204081632653,"28-05-2008"
 75 | 36,"admin.","single",-171,0,0.0204081632653061,"28-05-2008"
 76 | 38,"entrepreneur","single",243,0,0.0102040816326531,"28-05-2008"
 77 | 55,"technician","married",1205,0,0.0102040816326531,"28-05-2008"
 78 | 41,"admin.","divorced",270,0,0.00510204081632653,"29-05-2008"
 79 | 33,"services","married",0,0,0.0102040816326531,"29-05-2008"
 80 | 28,"blue-collar","married",723,0,0.00510204081632653,"29-05-2008"
 81 | 57,"blue-collar","married",5935,0,0.0102040816326531,"29-05-2008"
 82 | 44,"services","divorced",2586,0,0.00510204081632653,"30-05-2008"
 83 | 56,"admin.","married",45,0,0.0102040816326531,"30-05-2008"
 84 | 30,"technician","married",152,0,0.0153061224489796,"30-05-2008"
 85 | 42,"technician","single",690,0,0.0102040816326531,"31-05-2008"
 86 | 41,"technician","married",1270,0,0.0153061224489796,"31-05-2008"
 87 | 36,"management","married",101,0,0.00510204081632653,"31-05-2008"
 88 | 29,"admin.","single",390,0,0.00510204081632653,"01-06-2008"
 89 | 44,"technician","married",0,0,0.0153061224489796,"01-06-2008"
 90 | 33,"services","married",790,0,0.00510204081632653,"01-06-2008"
 91 | 60,"admin.","married",290,0,0.0102040816326531,"01-06-2008"
 92 | 57,"blue-collar","married",249,0,0.0102040816326531,"02-06-2008"
 93 | 53,"technician","married",384,0,0.00510204081632653,"02-06-2008"
 94 | 60,"blue-collar","married",54,0,0.00510204081632653,"02-06-2008"
 95 | 37,"admin.","single",0,0,0.0102040816326531,"03-06-2008"
 96 | 43,"technician","married",1937,0,0.0102040816326531,"03-06-2008"
 97 | 44,"technician","single",29,0,0.00510204081632653,"04-06-2008"
 98 | 52,"entrepreneur","married",113,0,0.0153061224489796,"04-06-2008"
 99 | 53,"technician","married",-3,0,0.0102040816326531,"04-06-2008"
100 | 51,"management","married",6530,0,0.00510204081632653,"04-06-2008"
101 | 39,"technician","married",0,0,0.0153061224489796,"04-06-2008"
102 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Due to changes in priorities, this project is currently not being supported. The project is archived as of 3/14/24 and will be available in a read-only state. Please note, since archival, the project is not maintained or reviewed.
  2 | 
  3 | # R Package for Variable Level Monitoring
  4 | 
  5 | [![](http://cranlogs.r-pkg.org/badges/otvPlots)](http://cran.rstudio.com/web/packages/otvPlots/index.html)
  6 | 
  7 | An important part of model building is the "proc eyeball" sanity check. It can
  8 | also be a painful part of the process, when you are the data scientist tasked
  9 | with creating and checking 10,000 or more near-identical plots. The `otvPlots`
 10 | package is designed to streamline this process. `otvPlots` is
 11 | an R package which takes a csv file as input and provides a pdf of VLM plots
 12 | and csv files of summary statistics as output, optionally ordered so
 13 | that any severely abnormal time series will be at the top of the pdf. The only
 14 | strict requirement of the data scientist is to specify which column of the input
 15 | data file contains the date variable.
 16 | 
 17 | `otvPlots` is efficiently implemented using `data.table` and `ggplot2` packages in R.
 18 | Plots are automatically labeled if a variable dictionary is provided. Important
 19 | variables can be given a highlighted label. A custom fuzzy matching algorithm
 20 | can be provided by the user.
 21 | 
 22 | Discrete and numeric variables are handled automatically and given separate
 23 | treatment. All binary variables are treated as categorical.
 24 | 
 25 | ## Output files generated by this package
 26 | 
 27 | ### A PDF file of plots, with each individual page on one variable.
 28 | 
 29 | For each numerical variable, the output plots include
 30 | * side-by-side boxplots (left),
 31 | * a trace plot of p1, p50, and p99 percentiles,
 32 | * a trace plot of mean and +-1 SD control limits, and
 33 | * a trace plot of missing and zero rates (bottom right).
 34 | 
 35 | #### Here is an example page of plots for a numerical variable
 36 | <img src="https://github.com/capitalone/otvPlots/blob/master/figures/sample_plots_numerical.png"
 37 |      alt="numerical plot"
 38 |    width="770"
 39 |    height="560">
 40 | 
 41 | For each categorical variable (including a numerical variable with no more
 42 |   than 2 unique levels not including NA), the output plots include
 43 | * a frequency bar plot (left), and
 44 | * a grid of trace plots on categories' proportions over time (right).
 45 | 
 46 | #### Here is an example page of plots for a categorical variable
 47 | <img src="https://github.com/capitalone/otvPlots/blob/master/figures/sample_plots_categorical.png"
 48 |      alt="categorical plot"
 49 |    width="770"
 50 |    height="560">
 51 | 
 52 | ### CSV file(s) on summary statistics of variables, both globally and over time.
 53 | 
 54 | The order of variables in the CSV files is the same as in the PDF file.
 55 | * A CSV file for numerical variables, including the number of observations
 56 |      (counts), p1, p25, p50, p75, and p99 quantiles, mean, SD, missing and
 57 |      zero rates.
 58 | * A CSV file for categorical variables, including the number of observations
 59 |      (counts) and categories' proportions. Each row is a category of a
 60 |      categorical (or binary) variable. The row whose `category == 'NA'`
 61 |      corresponds to missing. Categories among the same variable are ordered by
 62 |      global prevalence in a descending order.
 63 | 
 64 | # Installation
 65 | Open an R (or RStudio) console and install the package from CRAN
 66 | 
 67 | ```
 68 | install.packages("otvPlots")
 69 | ```
 70 | 
 71 | Alternatively, if you prefer to install from GitHub:
 72 | 
 73 | 1. Install the `devtools` package if it is not yet installed. You only need to do this once, so
 74 | feel free to skip this step if `devtools` is already installed. You will be
 75 | asked to select a CRAN mirror.
 76 | 
 77 | ```
 78 | install.packages("devtools")
 79 | ```
 80 | 
 81 | 2. Install the `otvPlots` package
 82 | ```
 83 | devtools::install_github("capitalone/otvPlots")
 84 | ```
 85 | 
 86 | You can also build the package yourself by cloning the repo, setting your
 87 | working directory to the otvPlots folder and running `devtools::build()`
 88 | in R, after installing the `devtools` package.
 89 | 
 90 | Note that otvPlots does depend on R and several R packages to run. You can
 91 | see a complete and up to date list of dependencies in the Imports field in
 92 | the DESCRIPTION file.
 93 | 
 94 | 
 95 | # Getting Started
 96 | 
 97 | ## Load the package
 98 | Open an R console (or RStudio). Load the `otvPlots` package first (all its
 99 | dependent packages should be loaded automatically).
100 | 
101 | ```
102 | library(otvPlots)
103 | ```
104 | 
105 | The main function of the package is `vlm`. Before executing this function,
106 | the input data need to be prepared using the `PrepData` function.
107 | **Please check out the help files to see all options and many usage examples
108 | (highly recommended!)**
109 | 
110 | ```
111 | help(vlm)
112 | help(PrepData)
113 | ```
114 | 
115 | ## Examples
116 | 
117 | The data `bankData` and its labels `bankLabels` are built-in datasets in the
118 | `otvPlots` package.
119 | 
120 | ### The first example
121 | After running the following code, a pdf file named "bank.pdf" and two csv files
122 | named "bank_numerical_summary.csv" and "bank_categorical_summary.csv" will be
123 | generated in the current working directory.
124 | 
125 | ```
126 | ## Load the datasets
127 | data(bankData)
128 | data(bankLabels)
129 | 
130 | ## Prepare data and labels
131 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months",
132 |                      dateGpBp = "quarters")
133 | bankLabels <- PrepLabels(bankLabels)
134 | 
135 | ## Generate a pdf file of vlm plots, and csv files of summary statistics
136 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
137 |     sortFn = "OrderByR2", dateGp = "months", dateGpBp = "quarters", outFl = "bank")
138 | ```
139 | 
140 | ### More examples on the `bankData` data
141 | The `PrepData` function only needs to be run once on a dataset. After that `vlm`
142 | can be run directly with the argument `dataNeedPrep = FALSE` (the default).
143 | 
144 | * If csv files of summary statistics are not needed, set `genCSV = FALSE`.
145 | 
146 | ```
147 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, genCSV = FALSE,
148 |     sortFn = "OrderByR2", dateGp = "months", dateGpBp = "quarters", outFl = "bank2")
149 | ```
150 | * If weights are provided, they will be used in all statistical calculations
151 | 
152 | ```
153 | bankData[, weight := rnorm(.N, 1, .1)]
154 | bankData[, weight := weight / mean(weight)]
155 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
156 |     dateGp = "months", dateGpBp = "quarters", weightNm = "weight", outFl = "bank3")
157 | ```
158 | 
159 | * Customize plotting order by passing a vector of variable names to argument
160 | `sortVars`, but the `"date"` column must be excluded from `sortVars`
161 | 
162 | ```
163 | sortVars <- sort(bankLabels[varCol!="date", varCol])
164 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
165 |     dateGp = "months", dateGpBp = "quarters", outFl = "bank4",
166 |     sortVars = sortVars)
167 | ```
168 | 
169 | * Create plots for a specific variable using the `varNms` argument
170 | 
171 | ```
172 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
173 |     dateGp = "months", dateGpBp = "quarters", outFl = "bank5",
174 |     varNms = "age", sortVars = NULL)
175 | ```
176 | 
177 | ## Citations
178 | 
179 | All examples for this package come from the
180 | [Bank Marketing dataset](https://archive.ics.uci.edu/ml/datasets/Bank+Marketing)
181 | available at the UCI Machine Learning Repository. The UCI repository maintains
182 | a free collection of datasets for researchers at its
183 | [website](http://archive.ics.uci.edu/ml).
184 | 
185 | Moro et al., S. Moro, P. Cortez, and P. Rita (2014). A Data-Driven Approach to
186 | Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier,
187 | 62:22-31, June 2014
188 | 
189 | Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.
190 | 
191 | ## Copyright 2017 Capital One Services, LLC
192 | 
193 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and limitations under the License.
201 | 
202 | ## External Contributors
203 | Contributors: We welcome your interest in Capital One’s Open Source Projects (the “Project”).
204 | 
205 | Any Contributor to the project must accept and sign a CLA indicating agreement to the license terms. Except for the license granted in this CLA to Capital One and to recipients of software distributed by Capital One, you reserve all right, title, and interest in and to your contributions; this CLA does not impact your rights to use your own contributions for any other purpose.
206 | 
207 | [Link to Individual CLA](https://docs.google.com/forms/d/19LpBBjykHPox18vrZvBbZUcK6gQTj7qv1O5hCduAZFU/viewform)
208 | 
209 | [Link to Corporate CLA ](https://docs.google.com/forms/d/e/1FAIpQLSeAbobIPLCVZD_ccgtMWBDAcN68oqbAJBQyDTSAQ1AkYuCp_g/viewform)
210 | 
211 | This project adheres to the
212 | [Open Source Code of Conduct](https://developer.capitalone.com/single/code-of-conduct/).
213 | By participating, you are expected to honor this code.
214 | 
215 | 
216 | 


--------------------------------------------------------------------------------
/R/plots_order.R:
--------------------------------------------------------------------------------
  1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC 
  2 | # SPDX-License-Identifier: Apache-2.0 
  3 | # Copyright 2017 Capital One Services, LLC 
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License"); 
  6 | # you may not use this file except in compliance with the License. 
  7 | #
  8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software distributed 
 11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
 12 | # OF ANY KIND, either express or implied. 
 13 | # 
 14 | # See the License for the specific language governing permissions and limitations under the License. 
 15 | 
 16 | 
 17 | ###########################################
 18 | #          Order By R2                    #
 19 | ###########################################
 20 | 
 21 | #' Rank numerical variables by the R2 between the date and each variable
 22 | #' 
 23 | #' Calculates R2 of a linear model of the formula \code{var} ~ \code{dateNm} for
 24 | #' each \code{var} of class \code{nmrcl} and returns a vector of
 25 | #' variable names ordered by highest R2. The linear model can be calculated over
 26 | #' a subset of dates, see details of parameter \code{buildTm}. Non-numerical
 27 | #' variables are returned in alphabetical order after the sorted numerical
 28 | #' variables.
 29 | #'
 30 | #' @inheritParams PrepData
 31 | #' @inheritParams PlotNumVar
 32 | #' @param dataFl A \code{data.table} of data; must be the output of the
 33 | #'   \code{\link{PrepData}} function. 
 34 | #' @param buildTm Vector identifying the time period for ranking/anomaly detection
 35 | #' (most likely model build period). Allows for a subset of plotting time
 36 | #' period to be used for anomaly detection.
 37 | #' \itemize{
 38 | #'      \item Must be a vector of dates and must be inclusive i.e. buildTm[1]
 39 | #'        <= date <= buildTm[2] will define the time period.
 40 | #'      \item Must be either \code{NULL}, a vector of length 2, or a vector of 
 41 | #'        length 3. 
 42 | #'      \item If \code{NULL}, the entire dataset will be used for 
 43 | #'        ranking/anomaly detection. 
 44 | #'      \item If a vector of length 2, the format of the dates must be
 45 | #'        a character vector in default R date format (e.g. "2017-01-30"). 
 46 | #'      \item If a vector of length 3, the first two elements must contain dates 
 47 | #'        in any strptime format, while the 3rd element contains the strptime 
 48 | #'        format (see \code{\link{strptime}}). 
 49 | #'      \item The following are equivalent ways of selecting
 50 | #'        all of 2014:
 51 | #'      \itemize{
 52 | #'        \item \code{c("2014-01-01","2014-12-31")}
 53 | #'        \item \code{c("01JAN2014","31DEC2014", "\%d\%h\%Y")}
 54 | #'      }
 55 | #' }
 56 | #' @export
 57 | #' 
 58 | #' @seealso Functions depend on this function:
 59 | #'          \code{\link{vlm}}.
 60 | #' @seealso This function depends on:
 61 | #'          \code{\link{CalcR2}},
 62 | #'          \code{\link{PrepData}}.
 63 | #'          
 64 | #' @return A vector of variable names sorted by R2 of \code{lm} of the formula
 65 | #'   \code{var} ~ \code{dateNm} (highest R2 to lowest)
 66 | #' @section License: 
 67 | #' Copyright 2017 Capital One Services, LLC Licensed under the
 68 | #' Apache License, Version 2.0 (the "License"); you may not use this file
 69 | #' except in compliance with the License. You may obtain a copy of the 
 70 | #' License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by
 71 | #' applicable law or agreed to in writing, software distributed under the
 72 | #' License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 73 | #' CONDITIONS OF ANY KIND, either express or implied. See the License for the
 74 | #' specific language governing permissions and limitations under the License.
 75 | #' @examples
 76 | #' data(bankData)
 77 | #' bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 
 78 | #'                      dateGpBp = "quarters")
 79 | #' OrderByR2(bankData, dateNm = "date")
 80 | 
 81 | OrderByR2 <- function(dataFl, dateNm, buildTm = NULL, weightNm = NULL,
 82 |                       kSample = 50000) {
 83 |   
 84 |   ## Make sure no NAs in weights and dates
 85 |   if (!is.null(weightNm)) {
 86 |     if (any(is.na(dataFl[[weightNm]]))) {
 87 |       warning("Weights column contains NAs--will be deleted casewise")
 88 |     }
 89 |   }
 90 |   if (any(is.na(dataFl[[dateNm]]))) {
 91 |     warning("Date column contains NAs--will be deleted casewise")
 92 |   }
 93 |   
 94 |   ## Convert buildTm to IDate format
 95 |   ## If the length of input buildTm is not 2 or 3, then use start and end time in dateNm
 96 |   buildTm <- switch(as.character(length(buildTm)), "2" = as.IDate(buildTm),
 97 |                     "3" = as.IDate(buildTm[1:2], buildTm[3]),
 98 |                     # avoid inheritence as list using [[]]
 99 |                     dataFl[c(1, .N), dateNm, with = FALSE][[1]])
100 |   
101 |   num_vars <- names(Filter(is.nmrcl, dataFl))
102 |   cat_vars <- names(Filter(is.ctgrl, dataFl))
103 |   
104 |   ## Sorting by R2 only works for numeric variables.
105 |   if (length(num_vars > 0)) {
106 |     
107 |     # Using sample directly in dataFl parameter for brevity,
108 |     # which reorders the input to CalcR2 but does not change output
109 |     r2 <- vapply(num_vars, CalcR2,
110 |                  dataFl = dataFl[buildTm[1] <= get(dateNm) &
111 |                                    get(dateNm) <= buildTm[2], ][
112 |                                      sample(.N, min(.N, kSample))],
113 |                  dateNm = dateNm, weightNm = weightNm, imputeValue = NULL,
114 |                  numeric(1))
115 |     sortVars <- c(num_vars[order(r2, decreasing = TRUE)], cat_vars)
116 |   } else {
117 |     sortVars <- cat_vars
118 |   }
119 |   
120 |   return(sortVars)
121 | }
122 | 
123 | 
124 | ###########################################
125 | #           CalcR2 Function               #
126 | ###########################################
127 | 
128 | #' Calculates R2 of a numerical variable using date as the predictor
129 | #'
130 | #' Calculates weighted R2 of a univariate weighted linear model with
131 | #' \code{dateNm} as x and \code{myVar} as y using the workhorse \code{lm.fit}
132 | #' and \code{lm.wfit} functions.
133 | #'
134 | #' @param myVar Name of variable to model. 
135 | #' @param dataFl A \code{data.table}, containing \code{myVar}, \code{dateNm}, 
136 | #'   and \code{weightNm}.
137 | #' @param dateNm Name of column containing the date variable (to be modeled as
138 | #'   numeric); this date column must not have NA's. 
139 | #' @param weightNm Name of column containing row weights. If \code{NULL}, 
140 | #'   then the \code{\link{lm.fit}} function will be called, otherwise the 
141 | #'   \code{\link{lm.wfit}} will be called. The weights column must not have NA's.
142 | #' @param imputeValue Either \code{NULL} or numeric. If \code{NULL}, model will
143 | #'   be fit on only non-NA components of \code{myVar}. If numeric, missing cases
144 | #'   of \code{myVar} will be imputed to \code{imputeValue}.
145 | #' @return A numeric value of R2.
146 | #' @export
147 | #'   
148 | #' @seealso Functions depend on this function:
149 | #'          \code{\link{OrderByR2}}.
150 | #' @seealso This function depends on:
151 | #'          \code{\link{PrepData}}.
152 | #'   
153 | #' @section License:
154 | #' Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
155 | #' Version 2.0 (the "License"); you may not use this file except in compliance
156 | #' with the License. You may obtain a copy of the  License at
157 | #' http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
158 | #' or agreed to in writing, software distributed under the License is 
159 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
160 | #' KIND, either express or implied. See the License for the specific language 
161 | #' governing permissions and limitations under the License.
162 | 
CalcR2 <- function(myVar, dataFl, dateNm, weightNm = NULL, imputeValue = NULL) {
  # Fits myVar ~ intercept + as.numeric(date) with lm.fit (or lm.wfit when
  # weightNm is given) and returns the (weighted) R2 as a single number.
  #
  # myVar        name of the response column in dataFl.
  # dataFl       table containing myVar, dateNm and (optionally) weightNm.
  # dateNm       name of the date column, used as the numeric predictor.
  # weightNm     name of the weight column, or NULL for an unweighted fit.
  # imputeValue  NULL to drop rows where myVar is NA, or a number to
  #              substitute for those NAs.
  #
  # Returns Inf when fewer than two usable observations are available, since
  # R2 cannot be computed in that case.

  message("Calculating R2 of ", myVar)

  y <- dataFl[[myVar]]

  ## After subsampling the column may be (almost) entirely missing; with fewer
  ## than 2 observed values R2 cannot be calculated
  if (sum(!is.na(y)) < 2) {
    return(Inf)
  }

  ## If imputeValue is available, we impute everywhere y is missing
  if (!is.null(imputeValue)) {
    y[is.na(y)] <- imputeValue
  }

  ## Date as a numeric predictor
  x <- as.numeric(dataFl[[dateNm]])

  ## Casewise deletion anywhere x, y or w (if supplied) is missing
  keep <- !is.na(y) & !is.na(x)
  if (!is.null(weightNm)) {
    w <- dataFl[[weightNm]]
    keep <- keep & !is.na(w)
  }

  ## Dropping rows with missing dates/weights can leave too few cases even
  ## though y itself had enough observed values
  if (sum(keep) < 2) {
    return(Inf)
  }

  y <- y[keep]
  ## Design matrix: a column of ones (intercept) plus the date; cbind() keeps
  ## this a two-column matrix however many rows remain
  design <- cbind(1, x[keep])

  ## Compute R2 or weighted R2
  if (is.null(weightNm)) {
    mod <- lm.fit(x = design, y = y)
    r2 <- 1 - sum(mod$residuals ^ 2) / sum((y - mean(y)) ^ 2)
  } else {
    w <- w[keep]
    mod <- lm.wfit(x = design, y = y, w = w)
    r2 <- 1 - sum(w * mod$residuals ^ 2) /
      sum(w * (y - stats::weighted.mean(y, w)) ^ 2)
  }
  r2
}
215 | 


--------------------------------------------------------------------------------
/tests/testthat/test_PrepData.R:
--------------------------------------------------------------------------------
  1 | library(otvPlots)
  2 | context("Prepare Data")
  3 | data(bankData);  setDT(bankData)
  4 | is.cntns <- function(x)  inherits(x, "nmrcl") #!#previous name: "cntns"
  5 | is.dscrt <- function(x)  inherits(x, "ctgrl") #!# previous name: "dscrt"
  6 | is.IDate <- function(x)  inherits(x, "IDate")
  7 | is.binary <- function(x) uniqueN(na.omit(x)) == 2
  8 | 
  9 | test_that("Names of the variables are transformed correctly", {
 10 |   out <- PrepData(dataFl = "../testthat/drugRDate.csv", dateNm = "date",
 11 |                	  dateGp = "months", dateGpBp = "quarters")
 12 |   expect_equal(names(out)[6], "Residence.City")
 13 | })
 14 | 
 15 | test_that("Parse SAS (eg. 07Apr2017) default date format correctly", {
 16 |   out <- PrepData(dataFl = "../testthat/drugSASDate.csv", dateNm = "date", 
 17 |                   dateGp = "months", dateGpBp = "quarters")
 18 |   expect_false(all(is.na(out[, "date"])), 'Fail to parse SAS date format')
 19 |   }
 20 | )
 21 | 
 22 | test_that("Parse R (eg. 2017-04-17) default date format correctly", {
 23 |   out <- PrepData(dataFl = "../testthat/drugRDate.csv", dateNm = "date", 
 24 |                 dateGp   = "months", dateGpBp = "quarters")
 25 |   expect_false(all(is.na(out[, "date"])), 'Fail to parse R date format')
 26 |   }
 27 | )
 28 | 
 29 | test_that("Incorrect date format creates warnings with csv input file", {
 30 |   expect_warning(
 31 |   	PrepData("../testthat/rawData.csv", dateNm = "date", weightNm ="weight", 
 32 |   		dateGp = "weeks", dateGpBp = "weeks"), "Formatting date as ")
 33 |   }
 34 | )
 35 | 
 36 | test_that("Incorrect date format creates warnings with Rdata input file", {
 37 |   expect_warning(
 38 |   	PrepData("../testthat/rawData.rda", dateNm = "date", weightNm ="weight", 
 39 |   		dateGp = "weeks", dateGpBp = "weeks"), "Formatting date as ")
 40 |   }
 41 | )
 42 | 
 43 | out <- suppressMessages(PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight", 
 44 | 			    dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y"))
 45 | 
 46 | test_that("All columns have exactly 2 classes, except date and weight", {
 47 |  	cntnsVars = Filter(is.cntns, out)
 48 |  	dscrtVars = Filter(is.dscrt, out)
 49 |  	dateVars  = Filter(is.IDate, out)
 50 | 	expect_equal(length(cntnsVars), 2)
 51 | 	expect_equal(length(dscrtVars), 3)
 52 | 	expect_equal(length(dateVars),  2)
 53 | 	expect_equal(length(class(out[, weight])), 1)
 54 | 	expect_equal(length(cntnsVars) + length(dscrtVars) + length(dateVars) + 1, ncol(out))
 55 | })
 56 | 
 57 | test_that("Variables are assigned to appropriate data type", {
 58 |  	cntnsVars = Filter(is.cntns, out)
 59 |  	
 60 | 	# test that all cntns variables are numeric
 61 | 	expect_equal(length(Filter(Negate(is.numeric), cntnsVars)), 0)
 62 | 	
 63 | 	# test that no cntns variables are binary
 64 | 	expect_equal(length(Filter(is.binary, cntnsVars)), 0)
 65 | 	
 66 | 	# test that all discrete variables are binary, character, or factor
 67 | 	dscrtVars = Filter(is.dscrt, out)
 68 |  	binVars = Filter(is.binary, dscrtVars)
 69 |  	charVars = Filter(Negate(is.binary), dscrtVars)
 70 |  	charClasses = unique(sapply(charVars, function(x) class(x)[1]))
 71 |  	expect_equal(length(setdiff(charClasses, c("character", "factor"))), 0)
 72 |  	
 73 |  	# test that all remaining variables are IDate, except weight
 74 |  	dateVars = Filter(is.IDate, out)
 75 | 
 76 | 	expect_equal(length(names(dateVars)) + length(names(binVars)) + length(names(charVars)) 
 77 | 		+ length(names(cntnsVars)) + 1, length(names(out)))	
 78 | })
 79 | 
 80 | test_that("varNms parameter works", {
 81 | 	out <- PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight", 
 82 |  		   dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y", varNms = c("age", "balance"))
 83 | 	cntnsVars = Filter(is.cntns, out)
 84 | 	dscrtVars = Filter(is.dscrt, out)
 85 |  	dateVars  = Filter(is.IDate, out)
 86 | 	expect_equal(length(cntnsVars), 2)
 87 | 	expect_equal(length(dscrtVars), 0)
 88 | 	expect_equal(length(dateVars), 2)
 89 | 
 90 | 	out <- PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight", 
 91 |  		   dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y", varNms = c(1, 4))
 92 | 	cntnsVars = Filter(is.cntns, out)
 93 | 	dscrtVars = Filter(is.dscrt, out)
 94 |  	dateVars  = Filter(is.IDate, out)
 95 | 	expect_equal(length(cntnsVars), 2)
 96 | 	expect_equal(length(dscrtVars), 0)
 97 | 	expect_equal(length(dateVars), 2)
 98 | })
 99 | 
test_that("selectCols and dropCols work as expected for csv file", {
  # Every case below should leave two numerical columns, no categorical
  # columns and two date columns
  expect_numeric_and_dates <- function(dt) {
    expect_equal(length(Filter(is.cntns, dt)), 2)
    expect_equal(length(Filter(is.dscrt, dt)), 0)
    expect_equal(length(Filter(is.IDate, dt)), 2)
  }

  # selectCols alone, by name
  selected_by_name <- PrepData(
    "../testthat/rawData.csv",
    dateNm = "date", weightNm = "weight",
    dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
    selectCols = c("age", "balance", "date", "weight")
  )
  expect_numeric_and_dates(selected_by_name)

  # selectCols alone, by position
  selected_by_position <- PrepData(
    "../testthat/rawData.csv",
    dateNm = "date", weightNm = "weight",
    dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
    selectCols = c(1, 4, 7, 6)
  )
  expect_numeric_and_dates(selected_by_position)

  # dropCols alone, by name
  dropped_by_name <- PrepData(
    "../testthat/rawData.csv",
    dateNm = "date", weightNm = "weight",
    dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
    dropCols = c("job", "marital", "default")
  )
  expect_numeric_and_dates(dropped_by_name)

  # dropCols alone, by position
  dropped_by_position <- PrepData(
    "../testthat/rawData.csv",
    dateNm = "date", weightNm = "weight",
    dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
    dropCols = c(2:3, 5)
  )
  expect_numeric_and_dates(dropped_by_position)
})
144 | 
test_that("selectCols and dropCols work as expected for RData file", {
  # Every case below should leave two numerical columns, no categorical
  # columns and two date columns
  expect_numeric_and_dates <- function(dt) {
    expect_equal(length(Filter(is.cntns, dt)), 2)
    expect_equal(length(Filter(is.dscrt, dt)), 0)
    expect_equal(length(Filter(is.IDate, dt)), 2)
  }

  # selectCols alone, by name
  selected_by_name <- PrepData(
    "../testthat/rawData.rda",
    dateNm = "date", weightNm = "weight",
    dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
    selectCols = c("age", "balance", "date", "weight")
  )
  expect_numeric_and_dates(selected_by_name)

  # selectCols alone, by position
  selected_by_position <- PrepData(
    "../testthat/rawData.rda",
    dateNm = "date", weightNm = "weight",
    dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
    selectCols = c(1, 4, 7, 6)
  )
  expect_numeric_and_dates(selected_by_position)

  # dropCols alone, by name
  dropped_by_name <- PrepData(
    "../testthat/rawData.rda",
    dateNm = "date", weightNm = "weight",
    dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
    dropCols = c("job", "marital", "default")
  )
  expect_numeric_and_dates(dropped_by_name)

  # dropCols alone, by position
  dropped_by_position <- PrepData(
    "../testthat/rawData.rda",
    dateNm = "date", weightNm = "weight",
    dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
    dropCols = c(2:3, 5)
  )
  expect_numeric_and_dates(dropped_by_position)
})
189 | 
190 | test_that("dropConstants works as expected", {
191 |   # Grouping at too coarse a level (quarters) is the scenario under test; only
192 |   # the dropConstants flag differs between the calls below.
193 |   prepByQuarter <- function(dropConstants) {
194 |     PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
195 |              dateGp = "weeks", dateGpBp = "quarters", dateFt = "%d-%m-%Y",
196 |              dropConstants = dropConstants)
197 |   }
198 | 
199 |   # dropConstants = TRUE: the grouping variable is dropped, with a warning
200 |   dropped <- suppressWarnings(prepByQuarter(TRUE))
201 |   expect_warning(prepByQuarter(TRUE),
202 |                  "The following variables have no variability")
203 |   expect_null(dropped[["quarters"]])
204 | 
205 |   # dropConstants = FALSE: the constant grouping variable is kept, with a warning
206 |   kept <- suppressWarnings(prepByQuarter(FALSE))
207 |   expect_warning(prepByQuarter(FALSE), "variability in grouping")
208 |   expect_equal(length(unique(kept[["quarters"]])), 1)
209 | })
210 | 	
211 | test_that("integer64 data doesn't cause problems", {
212 |   skip_if_not_installed("bit64")  # skip cleanly instead of failing later when bit64 is absent
213 |   library(bit64)                  # unlike require(), library() errors if attaching fails
214 |   out <- suppressWarnings(PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
215 |                                    dateGp = "weeks", dateGpBp = "months", dateFt = "%d-%m-%Y"))
216 |   out[, balance := as.integer64(balance)]  # force an integer64 column into already-prepared data
217 |   PrepData(out, dateNm = "date", weightNm = "weight",
218 |            dateGp = "weeks", dateGpBp = "months", dateFt = "%d-%m-%Y")
219 |   expect_false(is.integer64(out[, balance]))  # NOTE(review): relies on PrepData converting `out` by reference - confirm
220 |   out <- suppressWarnings(PrepData("../testthat/rawData_bigint.csv", dateNm = "date", weightNm = "weight",
221 |                                    dateGp = "weeks", dateGpBp = "months", dateFt = "%d-%m-%Y"))
222 |   expect_false(is.integer64(out[, bigint]))  # values read from file must not stay integer64 either
223 | })
224 | 
225 | test_that("Incorrect data input file generates error", {
226 |   expect_error(PrepData("../testthat/PlotHistogram.RDS", dateNm = "date"))  # the error must come from PrepData itself, not from an undefined function name
227 | })


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | 
  2 | Apache License
  3 | Version 2.0, January 2004
  4 | http://www.apache.org/licenses/
  5 |   
  6 |   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 | 1. Definitions.
  9 | 
 10 | "License" shall mean the terms and conditions for use, reproduction,
 11 | and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 | "Licensor" shall mean the copyright owner or entity authorized by
 14 | the copyright owner that is granting the License.
 15 | 
 16 | "Legal Entity" shall mean the union of the acting entity and all
 17 | other entities that control, are controlled by, or are under common
 18 | control with that entity. For the purposes of this definition,
 19 | "control" means (i) the power, direct or indirect, to cause the
 20 | direction or management of such entity, whether by contract or
 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 | outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 | "You" (or "Your") shall mean an individual or Legal Entity
 25 | exercising permissions granted by this License.
 26 | 
 27 | "Source" form shall mean the preferred form for making modifications,
 28 | including but not limited to software source code, documentation
 29 | source, and configuration files.
 30 | 
 31 | "Object" form shall mean any form resulting from mechanical
 32 | transformation or translation of a Source form, including but
 33 | not limited to compiled object code, generated documentation,
 34 | and conversions to other media types.
 35 | 
 36 | "Work" shall mean the work of authorship, whether in Source or
 37 | Object form, made available under the License, as indicated by a
 38 | copyright notice that is included in or attached to the work
 39 | (an example is provided in the Appendix below).
 40 | 
 41 | "Derivative Works" shall mean any work, whether in Source or Object
 42 | form, that is based on (or derived from) the Work and for which the
 43 | editorial revisions, annotations, elaborations, or other modifications
 44 | represent, as a whole, an original work of authorship. For the purposes
 45 | of this License, Derivative Works shall not include works that remain
 46 | separable from, or merely link (or bind by name) to the interfaces of,
 47 | the Work and Derivative Works thereof.
 48 | 
 49 | "Contribution" shall mean any work of authorship, including
 50 | the original version of the Work and any modifications or additions
 51 | to that Work or Derivative Works thereof, that is intentionally
 52 | submitted to Licensor for inclusion in the Work by the copyright owner
 53 | or by an individual or Legal Entity authorized to submit on behalf of
 54 | the copyright owner. For the purposes of this definition, "submitted"
 55 | means any form of electronic, verbal, or written communication sent
 56 | to the Licensor or its representatives, including but not limited to
 57 | communication on electronic mailing lists, source code control systems,
 58 | and issue tracking systems that are managed by, or on behalf of, the
 59 | Licensor for the purpose of discussing and improving the Work, but
 60 | excluding communication that is conspicuously marked or otherwise
 61 | designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 | "Contributor" shall mean Licensor and any individual or Legal Entity
 64 | on behalf of whom a Contribution has been received by Licensor and
 65 | subsequently incorporated within the Work.
 66 | 
 67 | 2. Grant of Copyright License. Subject to the terms and conditions of
 68 | this License, each Contributor hereby grants to You a perpetual,
 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 | copyright license to reproduce, prepare Derivative Works of,
 71 | publicly display, publicly perform, sublicense, and distribute the
 72 | Work and such Derivative Works in Source or Object form.
 73 | 
 74 | 3. Grant of Patent License. Subject to the terms and conditions of
 75 | this License, each Contributor hereby grants to You a perpetual,
 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 | (except as stated in this section) patent license to make, have made,
 78 | use, offer to sell, sell, import, and otherwise transfer the Work,
 79 | where such license applies only to those patent claims licensable
 80 | by such Contributor that are necessarily infringed by their
 81 | Contribution(s) alone or by combination of their Contribution(s)
 82 | with the Work to which such Contribution(s) was submitted. If You
 83 | institute patent litigation against any entity (including a
 84 |                                                 cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 | or a Contribution incorporated within the Work constitutes direct
 86 | or contributory patent infringement, then any patent licenses
 87 | granted to You under this License for that Work shall terminate
 88 | as of the date such litigation is filed.
 89 | 
 90 | 4. Redistribution. You may reproduce and distribute copies of the
 91 | Work or Derivative Works thereof in any medium, with or without
 92 | modifications, and in Source or Object form, provided that You
 93 | meet the following conditions:
 94 |   
 95 |   (a) You must give any other recipients of the Work or
 96 | Derivative Works a copy of this License; and
 97 | 
 98 | (b) You must cause any modified files to carry prominent notices
 99 | stating that You changed the files; and
100 | 
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 | 
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 | 
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 | 
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 | 
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 |                                                   Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 | 
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 |                                    negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 | 
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 | 
177 | END OF TERMS AND CONDITIONS
178 | 
179 | APPENDIX: How to apply the Apache License to your work.
180 | 
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.


--------------------------------------------------------------------------------
/R/vlm.R:
--------------------------------------------------------------------------------
  1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC 
  2 | # SPDX-License-Identifier: Apache-2.0 
  3 | # Copyright 2017 Capital One Services, LLC 
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License"); 
  6 | # you may not use this file except in compliance with the License. 
  7 | #
  8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software distributed 
 11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
 12 | # OF ANY KIND, either express or implied. 
 13 | # 
 14 | # See the License for the specific language governing permissions and limitations under the License. 
 15 | 
 16 | 
 17 | ###########################################
 18 | #          The Main Function              #
 19 | ###########################################
 20 | 
 21 | #' Create over time variable plots and summary statistics for variable level monitoring
 22 | #' 
 23 | #' Sorts variables according to either user input or correlation with time 
 24 | #' (among numerical variables only), and create output files including:
 25 | #' \itemize{
 26 | #'  \item A PDF file of plots saved as \code{outFl}.pdf, with each individual page 
 27 | #'  on one variable. Variables are plotted in the order indicated in the argument
 28 | #'  \code{sortVars} or \code{sortFn}. 
 29 | #'  For each numerical variable, the output plots include 
 30 | #'  \itemize{
 31 | #'    \item side-by-side boxplots grouped by \code{dateGpBp} (left), 
 32 | #'    \item a trace plot of p1, p50, and p99 percentiles, grouped by \code{dateGp}
 33 | #'      (top right), 
 34 | #'    \item a trace plot of mean and +-1 SD control limits, grouped by 
 35 | #'      \code{dateGp}(middle right), and 
 36 | #'    \item a trace plot of missing and zero rates, grouped by \code{dateGp} 
 37 | #'      (bottom right).
 38 | #'   }
 39 | #'   For each categorical variable (including a numerical variable with no more 
 40 | #'   than 2 unique levels not including NA), the output plots include 
 41 | #'   \itemize{
 42 | #'     \item a frequency bar plot (left), and 
 43 | #'     \item a grid of trace plots on categories' proportions over time (right). 
 44 | #'       If the variable contains more than \code{kCategories} number of 
 45 | #'       categories, trace plots of only the largest \code{kCategories} will be 
 46 | #'       plotted. If the variable contains only two categories, then only the 
 47 | #'       trace plot of the less prevalent category will be plotted.
 48 | #'   }
 49 | #'   \item CSV file(s) on summary statistics of variable, both globally and over
 50 | #'   time aggregated by \code{dateGp}. The order of variables in the CSV files
 51 | #'   are the same as in the PDF file. 
 52 | #'   \itemize{
 53 | #'     \item For numerical variables, number of observations (counts), p1, p25, 
 54 | #'     p50, p75, and p99 quantiles, mean, SD, missing and zero rates are saved
 55 | #'     as \code{outFl}_numerical_summary.csv.
 56 | #'     \item For categorical variables, number of observations (counts) and 
 57 | #'     categories' proportions are saved as \code{outFl}_categorical_summary.csv. 
 58 | #'     Each row is a category of a categorical (or binary) variable.
 59 | #'     The row whose \code{category == 'NA'} corresponds to missing. Categories
 60 | #'     among the same variable are ordered by global prevalence in a descending 
 61 | #'     order.
 62 | #'   }
 63 | #' }
 64 | #' 
 65 | #' If the argument \code{dataNeedPrep} is set to \code{FALSE}, then 
 66 | #' \itemize{
 67 | #' \item \code{dataFl} must be a \code{data.table} containing variables 
 68 | #'   \code{weightNm}, \code{dateNm}, \code{dateGp}, and \code{dateGpBp}, and 
 69 | #'   names of these variables must be the same as the corresponding arguments
 70 | #'   of the \code{\link{vlm}} function.
 71 | #' \item the arguments \code{selectCols}, \code{dropCols}, \code{dateFt}, 
 72 | #'   \code{dropConstants} will be ignored by the \code{\link{vlm}} function.
 73 | #' \item When analyzing a dataset for the first time, it is recommended to first
 74 | #'   run the \code{\link{PrepData}} function on it, and then apply the 
 75 | #'   \code{\link{vlm}} function with the argument \code{dataNeedPrep = FALSE}.
 76 | #'   Please see the examples for details. 
 77 | #' }
 78 | #' 
 79 | #' @inheritParams PrepData
 80 | #' @inheritParams PrepLabels
 81 | #' @inheritParams OrderByR2
 82 | #' @inheritParams PrintPlots
 83 | #' @param sortVars Determines which variables are plotted and their order. 
 84 | #'   Either a character vector of variable names (to plot variables in the same
 85 | #'   order as in the \code{sortVars} argument), or \code{NULL} to keep the 
 86 | #'   original ordering, with numerical variables being plotted before 
 87 | #'   categorical and binary ones. \code{sortVars} should be \code{NULL} when the
 88 | #'   \code{sortFn} argument is used.
 89 | #' @param sortFn A sorting function which returns \code{sortVars} as an output. 
 90 | #'   The function may take the following variables as input: \code{dataFl}, 
 91 | #'   \code{dateNm}, \code{buildTm}, \code{weightNm}, \code{kSample}. Currently, 
 92 | #'   the only built-in sorting function is \code{\link{OrderByR2}}, which sorts
 93 | #'   numerical variables in the order of strength of linear association with date,
 94 | #'   and adds categorical (and binary) variables sorted in alphabetical order
 95 | #'   after the numerical ones. 
 96 | #' @param dataNeedPrep Logical, indicates if data should be run through the 
 97 | #'   \code{\link{PrepData}} function. This should be set to \code{TRUE} unless 
 98 | #'   the \code{\link{PrepData}} function has been applied to the input data 
 99 | #'   \code{dataFl}. 
100 | #' @export
101 | #' 
102 | #' @seealso This function depends on:
103 | #'          \code{\link{PrintPlots}},
104 | #'          \code{\link{OrderByR2}},
105 | #'          \code{\link{PrepData}},
106 | #'          \code{\link{PrepLabels}}.
107 | #'          
108 | #' @section License: Copyright 2017 Capital One Services, LLC Licensed under the
109 | #' Apache License, Version 2.0 (the "License"); you may not use this file 
110 | #' except in compliance with the License. You may obtain a copy of the License
111 | #' at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable
112 | #' law or agreed to in writing, software distributed under the License is
113 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
114 | #' KIND, either express or implied. See the License for the specific language
115 | #' governing permissions and limitations under the License.
116 | #' @examples
117 | #' ## Load the data and its label
118 | #' data(bankData)
119 | #' data(bankLabels)
120 | #' 
121 | #' ## The PrepData function should only need to be run once on a dataset, 
122 | #' ## after that vlm can be run with the argument dataNeedPrep = FALSE
123 | #' bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 
124 | #'                     dateGpBp = "quarters")
125 | #' bankLabels <- PrepLabels(bankLabels)
126 | #'
127 | #'\dontrun{ 
128 | #' vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, 
129 | #'     sortFn = "OrderByR2", dateGp = "months", dateGpBp = "quarters", 
130 | #'     outFl = "bank")
131 | #'     
132 | #' ## If csv files of summary statistics are not needed, set genCSV = FALSE
133 | #' vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, genCSV = FALSE,
134 | #'     sortFn = "OrderByR2", dateGp = "months", dateGpBp = "quarters", 
135 | #'     outFl = "bank")
136 | #'     
137 | #' ## If weights are provided, they will be used in all statistical calculations
138 | #' bankData[, weight := rnorm(.N, 1, .1)]
139 | #' vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
140 | #'     dateGp = "months", dateGpBp = "quarters", weightNm = "weight", 
141 | #'     outFl = "bank")
142 | #'
143 | #' ## Customize plotting order by passing a vector of variable names to 
144 | #' ## sortVars, but the "date" column must be excluded from sortVars
145 | #' sortVars <- sort(bankLabels[varCol!="date", varCol])
146 | #' vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, 
147 | #'     dateGp = "months", dateGpBp = "quarters", outFl = "bank", 
148 | #'     sortVars = sortVars)
149 | #'             
150 | #' ## Create plots for a specific variable using the varNms parameter
151 | #' vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, 
152 | #'     dateGp = "months", dateGpBp = "quarters", outFl = "bank", 
153 | #'     varNms = "age", sortVars = NULL)
154 | #'}
155 | 
156 | vlm <- function(dataFl, dateNm, labelFl = NULL, outFl = "otvplots", 
157 |                 genCSV = TRUE, dataNeedPrep = FALSE, dateGp = NULL, 
158 |                 dateGpBp = NULL, weightNm = NULL, varNms = NULL, 
159 |                 sortVars = NULL, sortFn = NULL, selectCols = NULL, 
160 |                 dropCols = NULL, dateFt = "%d%h%Y", buildTm = NULL, 
161 |                 highlightNms = NULL, skewOpt = NULL, kSample = 50000, 
162 |                 fuzzyLabelFn = NULL, dropConstants = FALSE, kCategories = 9, ...) {
163 |   ## Workflow: validate inputs, prepare data and labels, resolve the plotting order, then call PrintPlots
164 |   ## Input checks use `&&`: each operand is a single TRUE/FALSE, so the scalar short-circuit operator applies
165 |   if (!is.null(sortVars) && !is.null(sortFn)) {
166 |     stop ("Please choose between sortVars (predetermined order of plotting) and
167 |           sortFn (function to determine plotting order)")}
168 |   
169 |   if (!is.null(sortVars) && !is.null(varNms) &&
170 |       !all(varNms %in% sortVars)) {
171 |     stop ("Please make certain that varNms is a subset of sortVars")
172 |   }
173 |   
174 |   if (!is.null(selectCols) && !is.null(dropCols)) {
175 |     stop("Please choose between selectCols or dropCols.")
176 |   }
177 | 
178 |   ## Run PrepData when dataFl has not been prepared yet
179 |   if (dataNeedPrep) { 
180 |     # Raw input: PrepData receives the column-selection and date-format arguments
181 |     dataFl <- PrepData(dataFl = dataFl, dateNm = dateNm,
182 |                        selectCols = selectCols, dropCols = dropCols,
183 |                        dateFt = dateFt, dateGp = dateGp, dateGpBp = dateGpBp,
184 |                        weightNm = weightNm, varNms = varNms,
185 |                        dropConstants = dropConstants, ...)
186 |   } else {
187 |     stopifnot(is.data.table(dataFl) &&
188 |                 all(c(weightNm, dateNm, dateGp, dateGpBp) %in% names(dataFl)))
189 |     ## Convert integer64 columns to numeric; `:=` updates dataFl by reference
190 |     for (var in names(dataFl)) {
191 |       if (inherits(dataFl[[var]], "integer64")) {
192 |         dataFl[, (var) := as.numeric(get(var))]
193 |       }
194 |     }
195 |   }
196 | 
197 |   ## Apply the PrepLabels function 
198 |   labelFl <- PrepLabels(labelFl)
199 | 
200 |   ## Resolve the plotting order; sortFn may be a function name or a function object
201 |   if (is.character(sortFn) || is.function(sortFn)) {
202 |     sortVars <- do.call(sortFn, list(dataFl = dataFl, dateNm = dateNm,
203 |                                      buildTm = buildTm, weightNm = weightNm,
204 |                                      kSample = kSample))
205 |   } else {
206 |     if (is.null(sortVars)) {  # default order: "nmrcl" columns first, then "ctgrl" columns
207 |       num_vars <- names(dataFl)[vapply(dataFl, inherits, logical(1), what = "nmrcl")]
208 |       cat_vars <- names(dataFl)[vapply(dataFl, inherits, logical(1), what = "ctgrl")]
209 |       sortVars <- c(num_vars, cat_vars)
210 |     }
211 |   }
212 |   
213 |   ## Create the plots; with varNms, only those variables plus the date/weight columns are passed on
214 |   if (!is.null(varNms)) {
215 |     PrintPlots(outFl = outFl,
216 |                dataFl = dataFl[, c(varNms, dateNm, dateGp, dateGpBp, weightNm),
217 |                                with = FALSE],
218 |                sortVars = sortVars[sortVars %in% varNms], dateNm = dateNm,
219 |                dateGp = dateGp, dateGpBp = dateGpBp, weightNm = weightNm,
220 |                labelFl = labelFl, genCSV = genCSV, highlightNms = highlightNms,
221 |                skewOpt = skewOpt, kSample = kSample,
222 |                fuzzyLabelFn = fuzzyLabelFn, kCategories = kCategories)
223 |   } else {
224 |     PrintPlots(outFl = outFl, dataFl = dataFl, sortVars = sortVars,
225 |                dateNm = dateNm, dateGp = dateGp, dateGpBp = dateGpBp,
226 |                weightNm = weightNm, labelFl = labelFl, genCSV = genCSV,
227 |                highlightNms = highlightNms, skewOpt = skewOpt,
228 |                kSample = kSample, fuzzyLabelFn = fuzzyLabelFn,
229 |                kCategories = kCategories)
230 |   }
231 | }
232 | 
233 | 


--------------------------------------------------------------------------------
/R/plot_print.R:
--------------------------------------------------------------------------------
  1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC 
  2 | # SPDX-License-Identifier: Apache-2.0 
  3 | # Copyright 2017 Capital One Services, LLC 
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License"); 
  6 | # you may not use this file except in compliance with the License. 
  7 | #
  8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software distributed 
 11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
 12 | # OF ANY KIND, either express or implied. 
 13 | # 
 14 | # See the License for the specific language governing permissions and limitations under the License. 
 15 | 
 16 | 
 17 | ###########################################
 18 | #           Create output                 #
 19 | ###########################################
 20 | 
 21 | #' Create a pdf file with plots and compute summary statistics for all variables 
 22 | #'
 23 | #' Creates plots and outputs results to a letter-sized pdf file, with each 
 24 | #' individual page containing plots on a single variable in the data. In 
 25 | #' addition, two summary statistics \code{data.table} are returned, one for
 26 | #' numerical variables, and one for categorical (and binary) ones. 
 27 | #' 
 28 | #' @inheritParams PlotVar
 29 | #' @param outFl Name of the output file, with no extension names (e.g., "bank"). 
 30 | #'   A pdf file of plots ("bank.pdf"), and two csv files of summary statistics
 31 | #'   ("bank_categorical_summary.csv" and "bank_numerical_summary.csv") will be
 32 | #'   saved to your working directory, unless a path is included in \code{outFl}
 33 | #'   (e.g. "../plots/bank").
 34 | #' @param genCSV Logical, whether to generate the two csv files of summary
 35 | #'   statistics for numerical and categorical variables.    
 36 | #' @param sortVars A character vector of variable names in the order they will
 37 | #'   be plotted. 
 38 | #' @return A pdf of plots saved to file \code{outFl}.pdf, and if the argument
 39 | #'   \code{genCSV == TRUE}, also two csv files of summary statistics for 
 40 | #'   numerical and categorical variables. 
 41 | #' 
 42 | #' @seealso Functions that depend on this function:
 43 | #'          \code{\link{vlm}}.
 44 | #' @seealso This function depends on:
 45 | #'          \code{\link{PlotVar}},
 46 | #'          \code{\link{PrepData}}.
 47 | #'          
 48 | #' @section License:
 49 | #' Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
 50 | #' Version 2.0 (the "License"); you may not use this file except in compliance
 51 | #' with the License. You may obtain a copy of the  License at
 52 | #' http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
 53 | #' or agreed to in writing, software distributed under the License is 
 54 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
 55 | #' KIND, either express or implied. See the License for the specific language 
 56 | #' governing permissions and limitations under the License.
 57 | #' @export
 58 | PrintPlots <- function(outFl, dataFl, sortVars, dateNm, dateGp, dateGpBp, 
 59 |                        weightNm = NULL, labelFl = NULL, genCSV = TRUE, 
 60 |                        highlightNms = NULL, skewOpt = NULL, kSample = 50000, 
 61 |                        fuzzyLabelFn = NULL, kCategories = 9) {
 62 |   
 63 |   catSummary <- NULL
 64 |   numSummary <- NULL
 65 |   . <- NULL
 66 |   
 67 |   plotList <-
 68 |     lapply(sortVars, PlotVar,
 69 |            dataFl = dataFl, weightNm = weightNm, dateNm = dateNm,
 70 |            dateGp = dateGp, dateGpBp = dateGpBp, labelFl = labelFl,
 71 |            highlightNms = highlightNms, skewOpt = skewOpt,
 72 |            fuzzyLabelFn = fuzzyLabelFn, kCategories = kCategories)
 73 |   
 74 |   grDevices::pdf(file = paste(outFl, '.pdf', sep = ''),  width = 11, height = 8,
 75 |                  pointsize = 12, onefile = TRUE)
 76 |   
 77 |   for (x in plotList)  {
 78 |     grid::grid.newpage()
 79 |     grid::grid.draw(x$p)
 80 |     
 81 |     if(genCSV == TRUE){
 82 |       if(x$varType == "ctgrl")
 83 |         catSummary = rbind(catSummary, x$varSummary) 
 84 |       if(x$varType == "nmrcl")
 85 |         numSummary = rbind(numSummary, x$varSummary) 
 86 |     }  
 87 |   }
 88 |   dev.off()
 89 |   
 90 |   ## Generate CSV files
 91 |   if(genCSV == TRUE){
 92 |     ## Compute counts in each time
 93 |     if (is.null(weightNm)){  
 94 |       total_counts = dataFl[, list(count = .N), by = dateGp]
 95 |     } else{
 96 |       total_counts = dataFl[, list(count = sum(get(weightNm))), by = dateGp]
 97 |     }
 98 |     names(total_counts)[1] = "date_group"
 99 |     total_counts = dcast(total_counts, . ~ date_group, value.var = 'count')
100 |     total_counts[, . := NULL]
101 |   
102 |     ## For numerical variables
103 |     if(!is.null(numSummary)){
104 |       ## Add a row of counts at the begining of numSummary
105 |       numSummary = rbind(as.list(rep(NA, ncol(numSummary))), numSummary)
106 |       numSummary[1, 1:2] = list('ALL_DATA', 'COUNTS')
107 |       numSummary[1, 3] = sum(total_counts)
108 |       numSummary[1, names(numSummary)[-(1:3)] := total_counts];  
109 |       ## Write the csv file
110 |       fwrite(numSummary, file = paste(outFl, '_numerical_summary.csv', sep = ''))
111 |     }
112 |     
113 |     ## For categorical variables
114 |     if(!is.null(catSummary)){
115 |       ## Add a row of counts at the begining of catSummary
116 |       catSummary = rbind(as.list(rep(NA, ncol(catSummary))), catSummary)
117 |       catSummary[1, 1:2] = list('ALL_DATA', 'COUNTS')
118 |       catSummary[1, 3:4] = list(sum(total_counts), 1)
119 |       catSummary[1, names(catSummary)[-(1:4)] := total_counts];  
120 |       ## Write the csv file
121 |       fwrite(catSummary, file = paste(outFl, '_categorical_summary.csv', sep = ''))
122 |     }  
123 |   }
124 | }
125 | 
126 | ###############################################
127 | #   Main Plot Function for a single variable  #
128 | ###############################################
129 | 
130 | #' Create over time variable plots and summary statistics for one variable
131 | #' 
132 | #' For a numerical variable, the output includes 
133 | #' \itemize{
134 | #' \item side-by-side boxplots grouped by \code{dateGpBp} (left), 
135 | #' \item a trace plot of p1, p50, and p99 percentiles, grouped by \code{dateGp}
136 | #'   (top right), 
137 | #' \item a trace plot of mean and +-1 SD control limits, grouped by 
138 | #'   \code{dateGp}(middle right), and 
139 | #' \item a trace plot of missing and zero rates, grouped by \code{dateGp} 
140 | #'   (bottom right).
141 | #' }
142 | #' For a categorical variable (including a numerical variable with no more than 2
143 | #' unique levels not including NA), the output includes 
144 | #' \itemize{
145 | #' \item a frequency bar plot (left), and 
146 | #' \item a grid of trace plots on categories' proportions over time (right). 
147 | #' If the variable contains more than \code{kCategories} number of categories, 
148 | #' trace plots of only the largest \code{kCategories} will be plotted. 
149 | #' }
150 | #' In addition to plots, a \code{data.table} of summary statistics is
151 | #' generated, containing both global and over time summary statistics. 
152 | #'
153 | #' @inheritParams PlotCatVar
154 | #' @inheritParams PlotNumVar
155 | #' @inheritParams OrderByR2
156 | #' @param dataFl A \code{data.table} containing at least the following columns:
157 | #'   \code{myVar}, \code{weightNm}, \code{dateGp}, \code{dateGpBp}; usually an
158 | #'   output of the \code{\link{PrepData}} function.
159 | #' @param myVar Name of the variable to be plotted.
160 | #' @param labelFl A \code{data.table} containing variable labels, or \code{NULL}
161 | #'   for no labels; usually an output of \code{\link{PrepLabels}}.
162 | #' @param highlightNms Either \code{NULL} or a character vector of variables to
163 | #'   receive a red label. Currently \code{NULL} means all variables will get a 
164 | #'   black legend. This argument is ignored if \code{labelFl == NULL}.
165 | #' @param fuzzyLabelFn Either \code{NULL} or a function of 2 parameters: A label
166 | #'   file in the format of an output by \code{\link{PrepLabels}} and a string
167 | #'   giving a variable name. The function should return the label corresponding
168 | #'   to the variable given by the second parameter. This function should 
169 | #'   describe how fuzzy matching should be performed to find labels (see example
170 | #'   below). If \code{NULL}, only exact matches will be returned.
171 | #' @return
172 | #'   \item{p}{A \code{grob} (i.e., \code{ggplot} grid) object. See the output
173 | #'     \code{p} of the function \code{\link{PlotNumVar}} or
174 | #'     \code{\link{PlotCatVar}} for details.}
175 | #'   \item{varSummary}{A \code{data.table} of summary statistics. See the output
176 | #'     \code{numVarSummary} of the function \code{\link{PlotNumVar}}, or the 
177 | #'     output \code{catVarSummary} of the function \code{\link{PlotCatVar}} for 
178 | #'     details.}
179 | #'   \item{varType}{Indicator of the variable's type, either \code{"nmrcl"} or 
180 | #'     \code{"ctgrl"}.}
181 | #' @export
182 | #' 
183 | #' @seealso Functions depend on this function:
184 | #'          \code{\link{PrintPlots}}.
185 | #' @seealso This function depends on:
186 | #'          \code{\link{PlotCatVar}},
187 | #'          \code{\link{PlotNumVar}},
188 | #'          \code{\link{PrepData}}.
189 | #'          
190 | #' @section License: Copyright 2017 Capital One Services, LLC Licensed under the
191 | #' Apache License, Version 2.0 (the "License"); you may not use this file
192 | #' except in compliance with the License. You may obtain a copy of the  License
193 | #' at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable
194 | #' law or agreed to in writing, software distributed under the License is
195 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
196 | #' KIND, either express or implied. See the License for the specific language
197 | #' governing permissions and limitations under the License.
198 | #' @examples
199 | #' data(bankData)
200 | #' bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 
201 | #'                      dateGpBp = "quarters")
202 | #' data(bankLabels)
203 | #' bankLabels <- PrepLabels(bankLabels)
204 | #'
205 | #' ## PlotVar will treat numerical and categorical data differently. 
206 | #' ## Binary data is always treated as categorical.
207 | #' plot(PlotVar(bankData, myVar = "duration", weightNm = NULL, dateNm = "date", 
208 | #'      dateGp = "months", dateGpBp =  "quarters", labelFl = bankLabels)$p)
209 | #' plot(PlotVar(bankData, myVar = "job", weightNm = NULL, dateNm = "date", 
210 | #'      dateGp = "months", dateGpBp =  "quarters", labelFl = bankLabels)$p)
211 | #' plot(PlotVar(bankData, myVar = "loan", weightNm = NULL, dateNm = "date", 
212 | #'      dateGp = "months", dateGpBp =  "quarters", labelFl = bankLabels)$p)
213 | #'
PlotVar <- function(dataFl, myVar, weightNm, dateNm, dateGp, dateGpBp = NULL,
                    labelFl = NULL, highlightNms = NULL, skewOpt = NULL,
                    kSample = 50000, fuzzyLabelFn = NULL, kCategories = 9) {

  ## Bound to NULL here because they are used as column names of labelFl below
  varCol <- labelCol <- NULL
  message(paste("Plotting ", myVar))

  ## Class vector of the column; may have length > 1 (e.g. an ordered factor)
  varClass <- unlist(dataFl[, class(get(myVar))])

  ## Make sure that myVar is not a date type
  if (any(varClass %in% c("Date", "IDate"))) {
    stop("Cannot plot dates")
  }

  ## Label myVar type to be "nmrcl" or "ctgrl" if not labeled yet. The check is
  ## made on the column itself (not on the name string myVar), matching the
  ## dispatch below. any() keeps the condition scalar for multi-class columns,
  ## which `||` requires in R >= 4.3.
  isLabelled <- inherits(dataFl[[myVar]], "ctgrl") ||
    inherits(dataFl[[myVar]], "nmrcl")
  if (!isLabelled) {
    ## NOTE(review): setattr() is applied to the object returned by
    ## dataFl[, get(myVar)]; depending on the data.table version this may be a
    ## copy of the column, in which case the label is not stored in dataFl --
    ## confirm.
    if (any(varClass %in% c("character", "factor")) ||
        dataFl[, length(unique(stats::na.omit(get(myVar))))] == 2) {
      setattr(dataFl[, get(myVar)], "class", "ctgrl")
    } else {
      setattr(dataFl[, get(myVar)], "class", "nmrcl")
    }
  }

  ## Generate a grid of plots
  if (inherits(dataFl[[myVar]], "ctgrl")) {
    p_all <- PlotCatVar(myVar, dataFl, weightNm, dateNm, dateGp, kCategories)
    p <- p_all$p
    varSummary <- p_all$catVarSummary
    varType <- "ctgrl"
  } else if (inherits(dataFl[[myVar]], "nmrcl")) {
    p_all <- PlotNumVar(myVar, dataFl, weightNm, dateGp, dateGpBp, skewOpt,
                        kSample)
    p <- p_all$p
    varSummary <- p_all$numVarSummary
    varType <- "nmrcl"
  } else {
    ## Previously this case surfaced later as "object 'p' not found"
    stop("Cannot determine whether '", myVar,
         "' is categorical (\"ctgrl\") or numerical (\"nmrcl\")")
  }

  ## If no fuzzy matching functions are provided, provide exact matches on the
  ## first column, otherwise use logic defined in fuzzyLabelFn
  ll <- myVar
  if (!is.null(labelFl)) {
    if (is.null(fuzzyLabelFn)) {
      ll <- paste0(labelFl[varCol == myVar, labelCol])
    } else {
      ll <- fuzzyLabelFn(labelFl, myVar)
    }
    ll <- paste0(myVar, " (", ll, ")", "\n")
  }

  ## Label color: red only when myVar appears in the cleaned highlightNms
  subCol <- "black"
  if (!is.null(highlightNms)) {
    ## Strip slashes, hyphens, double quotes and whitespace before matching
    highlightNms <- gsub("/|\\-|\"|\\s", "", highlightNms)
    if (myVar %in% highlightNms) {
      # should add other ways to trigger red labels
      subCol <- "red"
    }
  }

  ## Add the page title as myVar and its label above the grid of plots
  subText <- grid::textGrob(ll, gp = grid::gpar(col = subCol,
                                                fontface = "bold"))
  p <- gridExtra::arrangeGrob(p, top = subText)

  list(p = p, varSummary = varSummary, varType = varType)
}
281 | 


--------------------------------------------------------------------------------
/man/vlm.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/vlm.R
  3 | \name{vlm}
  4 | \alias{vlm}
  5 | \title{Create over time variable plots and summary statistics for variable level monitoring}
  6 | \usage{
  7 | vlm(dataFl, dateNm, labelFl = NULL, outFl = "otvplots", genCSV = TRUE,
  8 |   dataNeedPrep = FALSE, dateGp = NULL, dateGpBp = NULL, weightNm = NULL,
  9 |   varNms = NULL, sortVars = NULL, sortFn = NULL, selectCols = NULL,
 10 |   dropCols = NULL, dateFt = "\%d\%h\%Y", buildTm = NULL,
 11 |   highlightNms = NULL, skewOpt = NULL, kSample = 50000,
 12 |   fuzzyLabelFn = NULL, dropConstants = FALSE, kCategories = 9, ...)
 13 | }
 14 | \arguments{
 15 | \item{dataFl}{Either the name of an object that can be converted using
 16 | \code{\link[data.table]{as.data.table}} (e.g., a data frame), or a 
 17 | character string containing the name of dataset that can be loaded using 
 18 | \code{\link[data.table]{fread}} (e.g., a csv file). If the dataset is not in 
 19 | your working directory then \code{dataFl} must include (relative or 
 20 | absolute) path to file.}
 21 | 
 22 | \item{dateNm}{Name of column containing the date variable.}
 23 | 
 24 | \item{labelFl}{Either the path of a dataset (a csv file) containing
 25 | labels, an R object convertible to \code{data.table} (e.g., data frame) or 
 26 | \code{NULL}. If \code{NULL}, no labels will be used. The label dataset must 
 27 | contain at least 2 columns: \code{varCol} (variable names) and 
 28 | \code{labelCol} (variable labels).}
 29 | 
 30 | \item{outFl}{Name of the output file, with no extension names (e.g., "bank"). 
 31 | A pdf file of plots ("bank.pdf"), and two csv files of summary statistics
 32 | ("bank_categorical_summary.csv" and "bank_numerical_summary.csv") will be
 33 | saved to your working directory, unless a path is included in \code{outFl}
 34 | (e.g. "../plots/bank").}
 35 | 
 36 | \item{genCSV}{Logical, whether to generate the two csv files of summary
 37 | statistics for numerical and categorical variables.}
 38 | 
 39 | \item{dataNeedPrep}{Logical, indicates if data should be run through the 
 40 | \code{\link{PrepData}} function. This should be set to \code{TRUE} unless 
 41 | the \code{\link{PrepData}} function has been applied to the input data 
 42 | \code{dataFl}.}
 43 | 
 44 | \item{dateGp}{Name of the variable that the time series plots should be 
 45 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"}, 
 46 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
 47 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
 48 | 
 49 | \item{dateGpBp}{Name of variable the boxplots should be grouped by. Same
 50 | options as \code{dateGp}. If \code{NULL}, then \code{dateGp} will be used.}
 51 | 
 52 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for 
 53 | no weights (all rows receiving weight 1).}
 54 | 
 55 | \item{varNms}{Either \code{NULL} or a vector of names or indices of variables
 56 | to be plotted. If \code{NULL}, will default to all columns which are not 
 57 | \code{dateNm} or \code{weightNm}. Can also be a vector of indices of the 
 58 | column names, after \code{dropCols} or \code{selectCols} have been applied,
 59 | if applicable, and not including \code{dateGp}, \code{dateGpBp} 
 60 | (which will be added to the \code{dataFl} by the function 
 61 | \code{\link{PrepData}}).}
 62 | 
 63 | \item{sortVars}{Determines which variables to be plotted and their order. 
 64 | Either a character vector of variable names to plot variables in the same
 65 | order as in the \code{sortVars} argument), or \code{NULL} to keep the 
 66 | original ordering, with numerical variables will being plotted before 
 67 | categorical and binary ones. \code{sortVars} should be \code{NULL} when the
 68 | \code{sortFn} argument is used.}
 69 | 
 70 | \item{sortFn}{A sorting function which returns \code{sortVars} as an output. 
 71 | The function may take the following variables as input: \code{dataFl}, 
 72 | \code{dateNm}, \code{buildTm}, \code{weightNm}, \code{kSample}. Currently, 
 73 | the only build-in sorting function is \code{\link{OrderByR2}}, which sorts
 74 | numerical variables in the order of strength of linear association with date,
 75 | and adds categorical (and binary) variables sorted in alphabetical order
 76 | after the numerical ones.}
 77 | 
 78 | \item{selectCols}{Either \code{NULL}, or a vector of names or indices of 
 79 | variables to read into memory -- must include \code{dateNm}, 
 80 | \code{weightNm} (if not \code{NULL}) and all variables to be plotted. If
 81 | both \code{selectCols} and \code{dropCols} are \code{NULL}, then all
 82 | variables will be read in.}
 83 | 
 84 | \item{dropCols}{Either \code{NULL}, or a vector of variables names or indices
 85 | of variables not to read into memory. If both \code{selectCols} and 
 86 | \code{dropCols} are \code{NULL}, then all variables will be read in.}
 87 | 
 88 | \item{dateFt}{\code{\link{strptime}} format of date variable. The default is SAS
 89 | format \code{"\%d\%h\%Y"}. But input data with R date format 
 90 | \code{"\%Y-\%m-\%d"} will also be detected. Both of two formats can be
 91 | parsed automatically.}
 92 | 
 93 | \item{buildTm}{Vector identify time period for ranking/anomaly detection
 94 | (most likely model build period). Allows for a subset of plotting time
 95 | period to be used for anomaly detection.
 96 | \itemize{
 97 |      \item Must be a vector of dates and must be inclusive i.e. buildTm[1]
 98 |        <= date <= buildTm[2] will define the time period.
 99 |      \item Must be either \code{NULL}, a vector of length 2, or a vector of 
100 |        length 3. 
101 |      \item If \code{NULL}, the entire dataset will be used for 
102 |        ranking/anomaly detection. 
103 |      \item If a vector of length 2, the format of the dates must be
104 |        a character vector in default R date format (e.g. "2017-01-30"). 
105 |      \item If a vector of length 3, the first two columns must contain dates 
106 |        in any strptime format, while the 3rd column contains the strptime 
107 |        format (see \code{\link{strptime}}). 
108 |      \item The following are equivalent ways of selecting
109 |        all of 2014:
110 |      \itemize{
111 |        \item \code{c("2014-01-01","2014-12-31")}
112 |        \item \code{c("01JAN2014","31DEC2014", "\%d\%h\%Y")}
113 |      }
114 | }}
115 | 
116 | \item{highlightNms}{Either \code{NULL} or a character vector of variables to
117 | recieve red label. Currently \code{NULL} means all variables will get a 
118 | black legend. Ignored this argument if \code{labelFl == NULL}.}
119 | 
120 | \item{skewOpt}{Either a numeric constant or \code{NULL}. Default is 
121 | \code{NULL} (no transformation). If numeric, say 5, then all box plots of
122 | a variable whose skewness exceeds 5 will be on a log10 scale if possible.
123 | Negative input of \code{skewOpt} will be converted to 3.}
124 | 
125 | \item{kSample}{Either \code{NULL} or a positive integer. If an integer, 
126 | indicates the sample size for both drawing boxplots and ordering numerical
127 | graphs by \eqn{R^2}. When the data is large, setting \code{kSample} to a 
128 | reasonable value (default is 50K) dramatically improves processing speed. 
129 | Therefore, for larger datasets (e.g. > 10 percent system memory), this
130 | parameter should not be set to \code{NULL}, or boxplots may take a very
131 | long time to render. This setting has no impact on the accuracy of time 
132 | series plots on quantiles, mean, SD, and missing and zero rates.}
133 | 
134 | \item{fuzzyLabelFn}{Either \code{NULL} or a function of 2 parameters: A label
135 | file in the format of an output by \code{\link{PrepLabels}} and a string
136 | giving a variable name. The function should return the label corresponding
137 | to the variable given by the second parameter. This function should 
138 | describe how fuzzy matching should be performed to find labels (see example
139 | below). If \code{NULL}, only exact matches will be retuned.}
140 | 
141 | \item{dropConstants}{Logical, indicates whether or not constant (all
142 | duplicated or NA) variables should be dropped from \code{dataFl} prior to
143 | plotting.}
144 | 
145 | \item{kCategories}{If a categorical variable has more than \code{kCategories},
146 | trace plots of only the \code{kCategories} most prevalent categories are
147 | plotted.}
148 | 
149 | \item{...}{Additional parameters to be passed to 
150 | \code{\link[data.table]{fread}}.}
151 | }
152 | \description{
153 | Sorts variables according to either user input or correlation with time 
154 | (among numerical variables only), and create output files including:
155 | \itemize{
156 |  \item A PDF file of plots saved as \code{outFl}.pdf, with each indivual page 
157 |  on one variable. Variables are plotted in the order indicated in the argument
158 |  \code{sortVars} or \code{sortFn}. 
159 |  For each numerical variable, the output plots include 
160 |  \itemize{
161 |    \item side-by-side boxplots grouped by \code{dateGpBp} (left), 
162 |    \item a trace plot of p1, p50, and p99 percentiles, grouped by \code{dateGp}
163 |      (top right), 
164 |    \item a trace plot of mean and +-1 SD control limits, grouped by 
165 |      \code{dateGp}(middle right), and 
166 |    \item a trace plot of missing and zerorates, grouped by \code{dateGp} 
167 |      (bottom right).
168 |   }
169 |   For each categorical variable (including a numerical variable with no more 
170 |   than 2 unique levels not including NA), the output plots include 
171 |   \itemize{
172 |     \item a frequency bar plot (left), and 
173 |     \item a grid of trace plots on categories' proportions over time (right). 
174 |       If the variable contains more than \code{kCategories} number of 
175 |       categories, trace plots of only the largest \code{kCategories} will be 
176 |       plotted. If the variable contains only two categories, then only the 
177 |       trace plot of the less prevalent cateogy will be plotted.
178 |   }
179 |   \item CSV file(s) on summary statistics of variable, both globally and over
180 |   time aggregated by \code{dateGp}. The order of variables in the CSV files
181 |   are the same as in the PDF file. 
182 |   \itemize{
183 |     \item For numerical varaibles, number of observations (counts), p1, p25, 
184 |     p50, p75, and p99 qunatiles, mean, SD, missing and zerorates are saved
185 |     as \code{outFl}_numerical_summary.csv.
186 |     \item For categorical varaibles, number of observations (counts) and 
187 |     categories' proportions are saved as \code{outFl}_categorical_summary.csv. 
188 |     Each row is a category of a categorical (or binary) variable.
189 |     The row whose \code{category == 'NA'} corresponds to missing. Categories
190 |     among the same variable are ordered by global prevalence in a descending 
191 |     order.
192 |   }
193 | }
194 | }
195 | \details{
196 | If the argument \code{dataNeedPrep} is set to \code{FALSE}, then 
197 | \itemize{
198 | \item \code{dataFl} must be a \code{data.table} containing variables 
199 |   \code{weightNm}, \code{dateNm}, \code{dateGp}, and \code{dateGpBp}, and 
200 |   names of these variables must be the same as the corresponding arguments
201 |   of the \code{\link{vlm}} function.
202 | \item the arguments \code{selectCols}, \code{dropCols}, \code{dateFt}, 
203 |   \code{dropConstants} will be ignored by the \code{\link{vlm}} function.
204 | \item When analyzing a dataset for the first time, it is recommended to first
205 |   run the \code{\link{PrepData}} function on it, and then apply the 
206 |   \code{\link{vlm}} function with the argument \code{dataNeedPrep = FALSE}.
207 |   Please see the examples for details. 
208 | }
209 | }
210 | \section{License}{
211 |  Copyright 2017 Capital One Services, LLC Licensed under the
212 | Apache License, Version 2.0 (the "License"); you may not use this file 
213 | except in compliance with the License. You may obtain a copy of the License
214 | at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable
215 | law or agreed to in writing, software distributed under the License is
216 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
217 | KIND, either express or implied. See the License for the specific language
218 | governing permissions and limitations under the License.
219 | }
220 | 
221 | \examples{
222 | ## Load the data and its label
223 | data(bankData)
224 | data(bankLabels)
225 | 
226 | ## The PrepData function should only need to be run once on a dataset, 
227 | ## after that vlm can be run with the argument dataNeedPrep = FALSE
228 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 
229 |                     dateGpBp = "quarters")
230 | bankLabels <- PrepLabels(bankLabels)
231 | 
232 | \dontrun{ 
233 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, 
234 |     sortFn = "OrderByR2", dateGp = "months", dateGpBp = "quarters", 
235 |     outFl = "bank")
236 |     
237 | ## If csv files of summary statistics are not need, set genCSV = FALSE
238 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, genCSV = FALSE,
239 |     sortFn = "OrderByR2", dateGp = "months", dateGpBp = "quarters", 
240 |     outFl = "bank")
241 |     
242 | ## If weights are provided, they will be used in all statistical calculations
243 | bankData[, weight := rnorm(.N, 1, .1)]
244 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
245 |     dateGp = "months", dateGpBp = "quarters", weightNm = "weight", 
246 |     outFl = "bank")
247 | 
248 | ## Customize plotting order by passing a vector of variable names to 
249 | ## sortVars, but the "date" column must be excluded from sortVars
250 | sortVars <- sort(bankLabels[varCol!="date", varCol])
251 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, 
252 |     dateGp = "months", dateGpBp = "quarters", outFl = "bank", 
253 |     sortVars = sortVars)
254 |             
255 | ## Create plots for a specific variable using the varNms parameter
256 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, 
257 |     dateGp = "months", dateGpBp = "quarters", outFl = "bank", 
258 |     varNms = "age", sortVars = NULL)
259 | }
260 | }
261 | \seealso{
262 | This function depends on:
263 |          \code{\link{PrintPlots}},
264 |          \code{\link{OrderByR2}},
265 |          \code{\link{PrepData}},
266 |          \code{\link{PrepLabels}}.
267 | }
268 | 


--------------------------------------------------------------------------------
/R/categorical.R:
--------------------------------------------------------------------------------
  1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC 
  2 | # SPDX-License-Identifier: Apache-2.0 
  3 | # Copyright 2017 Capital One Services, LLC 
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License"); 
  6 | # you may not use this file except in compliance with the License. 
  7 | #
  8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software distributed 
 11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
 12 | # OF ANY KIND, either express or implied. 
 13 | # 
 14 | # See the License for the specific language governing permissions and limitations under the License. 
 15 | 
 16 | 
 17 | ###########################################
 18 | #      Plots for Categorical Data         #
 19 | ###########################################
 20 | #' Create plots and summary statistics for a categorical variable
 21 | #' 
 22 | #' Output plots include a bar plot with categories ordered by global counts,
 23 | #' and trace plots of categories' proportions over time. This function is also
 24 | #' applicable to a binary variable, which is treated as categorical in this 
 25 | #' package. In addition to plots, a \code{data.table} of summary statistics
 26 | #' is generated, on global counts and proportions by category, and proportions 
 27 | #' by category over time. 
 28 | #' 
 29 | #' @inheritParams PrepData
 30 | #' @param dataFl A \code{data.table} of data; must be the output of the
 31 | #'   \code{\link{PrepData}} function. 
 32 | #' @param myVar The name of the variable to be plotted
 33 | #' @param kCategories If a categorical variable has more than \code{kCategories},
 34 | #'   trace plots of only the \code{kCategories} most prevalent categories are
 35 | #'   plotted.  
 36 | #' @param normBy The normalization factor for rate plots, can be \code{"time"}
 37 | #'   or \code{"var"}. If \code{"time"}, then for each time period of 
 38 | #'   \code{dateGp}, counts are normalized by the total counts over all 
 39 | #'   categories in that time period. This illustrates changes of categories' 
 40 | #'   proportions over time. If \code{"var"}, then for each category, its counts 
 41 | #'   are normalized by the total counts over time from only this category. This
 42 | #'   illustrates changes of categories' volumes over time.
 43 | #' @export
 44 | #' @return 
 45 | #'   \item{p}{A \code{grob} (i.e., \code{ggplot} grid) object, including a 
 46 | #'     bar plot, and trace plots of categories' proportions. If the number of 
 47 | #'     categories is larger than \code{kCategories}, then trace plots of only the
 48 | #'     \code{kCategories} most prevalent categories are plotted. For a binary
 49 | #'     variable, only the trace plot of the less prevalent category is plotted.}
 50 | #'   \item{catVarSummary}{A \code{data.table}, contains categories' proportions 
 51 | #'     globally, and over-time in each time period in \code{dateGp}. Each row is
 52 | #'     a category of the categorical (or binary) variable \code{myVar}. The row
 53 | #'     whose \code{category == 'NA'} corresponds to missing. Categories are 
 54 | #'     ordered by global prevalence in a descending order.}
 55 | #'     
 56 | #' @seealso Functions depend on this function:
 57 | #'          \code{\link{PlotVar}},
 58 | #'          \code{\link{PrintPlots}},
 59 | #'          \code{\link{vlm}}.
 60 | #' @seealso This function depends on:
 61 | #'          \code{\link{PlotBarplot}},
 62 | #'          \code{\link{PlotRatesOverTime}},
 63 | #'          \code{\link{PrepData}}.
 64 | #'          
 65 | #' @section License:
 66 | #' Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
 67 | #' Version 2.0 (the "License"); you may not use this file except in compliance
 68 | #' with the License. You may obtain a copy of the  License at
 69 | #' http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
 70 | #' or agreed to in writing, software distributed under the License is 
 71 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
 72 | #' KIND, either express or implied. See the License for the specific language 
 73 | #' governing permissions and limitations under the License.
 74 | #' @examples
 75 | #' data(bankData)
 76 | #' bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 
 77 | #'                     dateGpBp = "quarters", weightNm = NULL)
 78 | #' # Single histogram is plotted for job type since there are 12 categories
 79 | #' plot(PlotCatVar(myVar = "job", dataFl = bankData, weightNm =  NULL, 
 80 | #'                      dateNm = "date", dateGp = "months")$p)
 81 | #'                      
 82 | #' plot(PlotCatVar(myVar = "job", dataFl = bankData, weightNm = NULL, 
 83 | #'                      dateNm = "date", dateGp = "months", kCategories = 12)$p)
 84 | #'
 85 | #'
 86 | #' ## Binary data is treated as categorical,  and only the less frequent 
 87 | #' ## category is plotted over time.
 88 | #' plot(PlotCatVar(myVar = "default", dataFl = bankData, weightNm = NULL, 
 89 | #'                      dateNm = "date", dateGp = "months")$p)
 90 | 
 91 | PlotCatVar <- function(myVar, dataFl, weightNm = NULL, dateNm, dateGp,
 92 |                             kCategories = 9, normBy = "time") { #!# previous name: PlotDiscreteVar
 93 |   count <- NULL
 94 |   
 95 |   p <- PlotBarplot(dataFl = dataFl, myVar = myVar, weightNm = weightNm)
 96 |   newLevels <- as.character(p$data[order(-count)][[myVar]])
 97 |   
 98 |   p2 <- PlotRatesOverTime(dataFl = dataFl, dateGp = dateGp, weightNm = weightNm,
 99 |                           myVar = myVar, newLevels = newLevels, normBy = normBy,
100 |                           kCategories = kCategories)
101 |   
102 |   p  <- gridExtra::arrangeGrob(ggplot2::ggplotGrob(p), p2$p, widths = c(1, 2))
103 |   
104 |   return(list(p = p, catVarSummary = p2$catVarSummary))
105 | }
106 | 
107 | ###########################################
108 | #       Discrete Plotting Functions       #
109 | ###########################################
110 | #' Creates a bar plot for a discrete (or binary) variable
111 | #'
112 | #' @inheritParams PlotCatVar
113 | #' @export
#' @return A \code{ggplot} object with a bar plot of \code{myVar}, with 
#'   categories ordered by frequency
116 | #'   
#' @seealso Functions that depend on this function:
#'          \code{\link{PlotCatVar}}.
119 | #' @seealso This function depends on:
120 | #'          \code{\link{PrepData}}.
121 | #'          
122 | #' @section License:
123 | #' Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
124 | #' Version 2.0 (the "License"); you may not use this file except in compliance
125 | #' with the License. You may obtain a copy of the  License at
126 | #' http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
127 | #' or agreed to in writing, software distributed under the License is 
128 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
129 | #' KIND, either express or implied. See the License for the specific language 
130 | #' governing permissions and limitations under the License.
131 | #' @examples
132 | #' data(bankData)
133 | #' bankData = PrepData(bankData, dateNm = "date", dateGp = "months", 
134 | #'                     dateGpBp = "quarters", weightNm = NULL)
135 | #' PlotBarplot(bankData, "job")
136 | #' 
137 | #' ## NA will be included as a category if any NA are present
138 | #' bankData[sample.int(.N)[1:1000], education := NA]
139 | #' PlotBarplot(bankData, "education")
140 | 
PlotBarplot <- function(dataFl, myVar, weightNm = NULL) {
  # Formerly named PlotHistogram.
  # Builds a bar plot of category counts (or summed weights when `weightNm`
  # is given) for the column named by `myVar`.

  # Bound locally so data.table column references pass package checks
  count <- NULL

  # Work on just the needed columns so the caller's table is left untouched
  workDt <- dataFl[, c(myVar, weightNm), with = FALSE]

  # Missing values and empty strings become the literal category "NA"
  workDt[is.na(get(myVar)) | get(myVar) == "", (myVar) := "NA"]

  # Frequency table: row counts when unweighted, otherwise summed weights
  freqTable <- if (is.null(weightNm)) {
    workDt[, list(count = .N), by = myVar]
  } else {
    workDt[, list(count = sum(get(weightNm))), by = myVar]
  }

  # Category names from most to least frequent; used as the factor levels so
  # the bars are drawn in that order
  newLevels <- unlist(freqTable[order(-count), myVar, with = FALSE])
  freqTable[, (myVar) := factor(get(myVar), levels = newLevels)]

  barAes <- ggplot2::aes_string(x = myVar, y = "count", group = myVar)
  barPlot <- ggplot2::ggplot(freqTable, barAes) +
    ggplot2::geom_bar(stat = "identity") +
    ggplot2::scale_x_discrete(labels = abbreviate, breaks = newLevels) +
    ggplot2::theme(text = ggplot2::element_text(size = 10))

  barPlot
}
169 | 
170 | 
171 | #' Creates trace plots of categories' proportions over time for a discrete (or
172 | #' binary) variable
173 | #'
174 | #' @inheritParams PlotCatVar
175 | #' @param newLevels categories of \code{myVar} in order of global frequency
176 | #' @export
177 | #' @return A list:
#'   \item{p}{\code{ggplot} object, trace plots of categories' proportions 
#'     of \code{myVar} over time.}
180 | #'   \item{catVarSummary}{A \code{data.table}, contains categories' proportions 
181 | #'     globally, and over-time in each time period in \code{dateGp}. Each row is
182 | #'     a category of the categorical (or binary) variable \code{myVar}. The row
183 | #'     whose \code{category == 'NA'} corresponds to missing. Categories are 
184 | #'     ordered by global prevalence in a descending order.}
185 | #'     
#' @seealso Functions that depend on this function:
#'          \code{\link{PlotCatVar}}.
188 | #' @seealso This function depends on:
189 | #'          \code{\link{PrepData}}.
190 | #'          
191 | #' @section License:
192 | #' Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
193 | #' Version 2.0 (the "License"); you may not use this file except in compliance
194 | #' with the License. You may obtain a copy of the  License at
195 | #' http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
196 | #' or agreed to in writing, software distributed under the License is 
197 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
198 | #' KIND, either express or implied. See the License for the specific language 
199 | #' governing permissions and limitations under the License.
200 | #' @examples
201 | #' data(bankData)
202 | #' bankData$weight = rpois(nrow(bankData), 5)
203 | #' bankData <- PrepData(bankData, dateNm = "date", dateGp = "months", 
204 | #'                      dateGpBp = "quarters", weightNm = "weight")
205 | #' PlotRatesOverTime(dataFl = bankData, dateGp = "months", weightNm = "weight",
206 | #'                   myVar = "job", newLevels = NULL, normBy = "time")
207 | #' 
PlotRatesOverTime <- function(dataFl, dateGp, myVar, normBy = "time",
                              weightNm = NULL, newLevels = NULL,
                              kCategories = 9) {
  # Formerly named PlotHistOverTime.
  # Computes, for each category of `myVar` and each period in `dateGp`, the
  # category's rate (count or summed weight divided by a normalising total),
  # and returns both trace plots of those rates and a wide summary table.
  #
  # normBy: "time" divides by the total of each time period; "var" divides by
  #   the total of each category.
  # newLevels: category names in the desired order; computed here from global
  #   counts (descending) when NULL.
  # kCategories: at most this many leading categories are plotted.
  # Returns list(p = <ggplot>, catVarSummary = <data.table>).

  # Fail early with a clear message: any other value would leave the
  # normalising table undefined and surface later as an obscure
  # "object not found" error.
  if (!is.character(normBy) || length(normBy) != 1L ||
      !(normBy %in% c("time", "var"))) {
    stop("normBy must be either \"time\" or \"var\"", call. = FALSE)
  }

  # Bound locally so data.table column references pass package checks
  N.x <- NULL
  N.y <- NULL
  rate <- NULL
  N <- NULL
  count <- NULL
  global_count <- NULL
  global_rate <- NULL
  variable <- NULL

  ## A subset dataset to work on
  dataSub <- dataFl[, c(dateGp, myVar, weightNm), with = FALSE]
  ## NA (and empty string) is converted to the character "NA", i.e., treated
  ## as its own category
  dataSub[is.na(get(myVar)) | get(myVar) == "", (myVar) := "NA"]

  ## Create newLevels, a vector of category names in descending order of
  ## global counts, unless the caller supplied one
  if (is.null(newLevels)) {
    if (is.null(weightNm)) {
      glbTotals <- dataSub[, list(count = .N), by = myVar]
    } else {
      glbTotals <- dataSub[, list(count = sum(get(weightNm))), by = myVar]
    }
    newLevels <- glbTotals[order(-count), myVar, with = FALSE][[myVar]]
  }

  ## Compute counts by category and time (countData, the numerator) and the
  ## normalising totals (countBy, the denominator)
  if (is.null(weightNm)) {
    countData <- dataSub[, .N, by = c(myVar, dateGp)]
    if (normBy == "time") {
      countBy <- dataSub[, .N, by = c(dateGp)]
    } else {
      countBy <- dataSub[, .N, by = c(myVar)]
    }
  } else {
    countData <- dataSub[, list(N = sum(get(weightNm))),
                         by = c(myVar, dateGp)]
    if (normBy == "time") {
      countBy <- dataSub[, list(N = sum(get(weightNm))), by = c(dateGp)]
    } else {
      countBy <- dataSub[, list(N = sum(get(weightNm))), by = c(myVar)]
    }
  }

  ## Make sure countData contains all categories and all times; combinations
  ## that never occur get a count of zero
  crossLevels <- CJ(unique(countData[[dateGp]]), unique(countData[[myVar]]))
  ## Rename positionally so this does not rely on CJ's auto-generated names
  setnames(crossLevels, c(dateGp, myVar))
  countData <- merge(crossLevels, countData, all.x = TRUE,
                     by = c(dateGp, myVar))
  countData[is.na(N), N := 0]
  countData[, (myVar) := factor(get(myVar), levels = newLevels)]

  ## Combine countData (numerator) and countBy (denominator) as rateBy.
  ## Both tables carry a column N, so the merge yields N.x and N.y.
  if (normBy == "time") {
    rateBy <- merge(countData, countBy, by = dateGp)
  } else {
    rateBy <- merge(countData, countBy, by = myVar)
  }

  ## Compute the rates: N.x is the category's count in a period, N.y is the
  ## normalising total
  rateBy[, rate := N.x / N.y]
  rateBy[, (myVar) := factor(get(myVar), levels = newLevels)]

  ## Compute summary statistics in a wide format
  cbytime <- copy(rateBy)
  names(cbytime)[names(cbytime) == myVar] <- "category"
  names(cbytime)[names(cbytime) == dateGp] <- "date_group"
  ## Compute global counts and rates
  cglobal <- cbytime[, list(global_count = sum(N.x)), by = "category"]
  cglobal[, global_rate := global_count / sum(global_count)]
  ## Change cbytime into the wide format: one row per category, one column
  ## per time period
  cbytime <- dcast(cbytime[, c("date_group", "category", "rate")],
                   category ~ date_group, value.var = "rate")
  ## Combine cglobal into cbytime
  cbytime <- merge(cglobal, cbytime, by = "category")
  ## Add a column holding the variable name and move it to the front
  cbytime[, variable := myVar]
  setcolorder(cbytime, c(ncol(cbytime), 1:(ncol(cbytime) - 1)))
  ## Add an all-zero row for the "NA" category when nothing is missing
  if (!("NA" %in% cbytime$category)) {
    cbytime <- rbind(cbytime, as.list(rep(NA, ncol(cbytime))))
    cbytime[nrow(cbytime), 1:2] <- list(myVar, "NA")
    cbytime[nrow(cbytime), 3:(ncol(cbytime))] <- 0
  }

  ## Plot the less frequent category only for a binary variable.
  ## This helps when there is a large class imbalance, because the range of
  ## the y-axis is the same for all trace plots.
  if (length(newLevels) == 2) {
    rateBy <- rateBy[get(myVar) == newLevels[2]]
  }

  ## Restrict the plot to the first kCategories categories when there are more
  if (length(newLevels) <= kCategories) {
    p <- ggplot2::ggplot(rateBy,
                         ggplot2::aes_string(x = dateGp, y = "rate"))
  } else {
    p <- ggplot2::ggplot(rateBy[get(myVar) %in% newLevels[1:kCategories]],
                         ggplot2::aes_string(x = dateGp, y = "rate"))
  }

  p <- p +
    ggplot2::geom_line(stat = "identity") +
    ggplot2::facet_wrap(stats::as.formula(paste("~", myVar))) +
    ggplot2::ylab("") +
    ggplot2::scale_x_date() +
    ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 30,
                                                       hjust = 1)) +
    ggplot2::scale_y_continuous(labels = scales::percent)

  return(list(p = p, catVarSummary = cbytime))
}
325 | 


--------------------------------------------------------------------------------