├── data
├── bankData.rda
└── bankLabels.rda
├── tests
├── testthat.R
└── testthat
│ ├── rawData.rda
│ ├── drugLabel.rda
│ ├── testData.rda
│ ├── PlotHistogram.RDS
│ ├── test_PlotCatVar.R
│ ├── test_PlotNumVar.R
│ ├── drugLabel.csv
│ ├── test_vlm.R
│ ├── test_PlotRates.R
│ ├── test_PlotMean.R
│ ├── test_PlotQuantiles.R
│ ├── test_PlotBarplot.R
│ ├── test_PlotRatesOverTime.R
│ ├── test_PlotDist.R
│ ├── test_CalcR2.R
│ ├── test_SummaryStats.R
│ ├── rawData_bigint.csv
│ ├── test_OrderByR2.R
│ ├── rawData.csv
│ └── test_PrepData.R
├── .Rbuildignore
├── figures
├── sample_plots_numerical.png
└── sample_plots_categorical.png
├── .travis.yml
├── cran-comments.md
├── CODEOWNERS
├── .gitignore
├── man
├── bankLabels.Rd
├── PlotRates.Rd
├── PlotQuantiles.Rd
├── PlotMean.Rd
├── PlotBarplot.Rd
├── PrepLabels.Rd
├── CalcR2.Rd
├── SummaryStats.Rd
├── PlotDist.Rd
├── bankData.Rd
├── PlotRatesOverTime.Rd
├── otvPlots.Rd
├── OrderByR2.Rd
├── PlotNumVar.Rd
├── PlotCatVar.Rd
├── PrintPlots.Rd
├── PrepData.Rd
├── PlotVar.Rd
└── vlm.Rd
├── NAMESPACE
├── DESCRIPTION
├── R
├── utils.R
├── data.R
├── package_otvPlots.R
├── plots_order.R
├── vlm.R
├── plot_print.R
└── categorical.R
├── README.md
└── LICENSE
/data/bankData.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/data/bankData.rda
--------------------------------------------------------------------------------
/data/bankLabels.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/data/bankLabels.rda
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(otvPlots)
3 |
4 | test_check("otvPlots")  # entry point used by R CMD check: runs the package's testthat test files
5 |
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^\.travis\.yml$
2 | ^figures$
3 | ^cran-comments\.md$
4 | ^\.whitesource$
5 | ^CODEOWNERS$
6 |
--------------------------------------------------------------------------------
/tests/testthat/rawData.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/tests/testthat/rawData.rda
--------------------------------------------------------------------------------
/tests/testthat/drugLabel.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/tests/testthat/drugLabel.rda
--------------------------------------------------------------------------------
/tests/testthat/testData.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/tests/testthat/testData.rda
--------------------------------------------------------------------------------
/figures/sample_plots_numerical.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/figures/sample_plots_numerical.png
--------------------------------------------------------------------------------
/tests/testthat/PlotHistogram.RDS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/tests/testthat/PlotHistogram.RDS
--------------------------------------------------------------------------------
/figures/sample_plots_categorical.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capitalone/otvPlots/HEAD/figures/sample_plots_categorical.png
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r
2 |
3 | language: R
4 | sudo: false
5 | cache: packages
6 |
--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
1 | ## Resubmission
2 | This is a resubmission. In this version I have:
3 |
4 | * Removed the VignetteBuilder field from DESCRIPTION.
5 |
6 | * Modified the Description field in DESCRIPTION by removing “this package” at the beginning.
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # This is a comment.
2 | # Each line is a file pattern followed by one or more owners.
3 |
4 | # These owners will be the default owners for everything in
5 | # the repo. Unless a later match takes precedence,
6 | # @yingboli and @Yingru will be requested for
7 | # review when someone opens a pull request.
8 | * @yingboli @Yingru
9 |
--------------------------------------------------------------------------------
/tests/testthat/test_PlotCatVar.R:
--------------------------------------------------------------------------------
1 | library(otvPlots)
2 | library(proto)
3 | context("Plot categorical variable")
4 | load("../testthat/testData.rda")  # fixture file; the following lines use the object `testData`
5 | setDT(testData)  # convert to data.table by reference
6 |
7 | test_that("PlotCatVar returns a gtable", {
8 | PrepData(testData, dateNm = "date", dateGp = "weeks", dateGpBp = "months")  # return value is discarded; presumably PrepData updates testData by reference -- confirm
9 | p <- PlotCatVar("marital", testData, NULL, "weeks", "months")$p  # keep only the `p` component of the result
10 | expect_is(p, "gtable")
11 | })
12 |
13 |
14 |
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Capital One Services, LLC
2 | # Licensed under the Apache License, Version 2.0 (the "License"); you may not use
3 | # this file except in compliance with the License. You may obtain a copy of the
4 | # License at http://www.apache.org/licenses/LICENSE-2.0
5 | # Unless required by applicable law or agreed to in writing, software
6 | # distributed under the License is distributed on an "AS IS" BASIS,
7 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 | # See the License for the specific language governing permissions and limitations under the License.
9 | .Rproj.user
10 | .Rhistory
11 | .RData
12 | inst/doc
13 | *.pdf
14 |
--------------------------------------------------------------------------------
/tests/testthat/test_PlotNumVar.R:
--------------------------------------------------------------------------------
1 | library(otvPlots)
2 | library(proto)
3 | context("Plot Continuous Variable")
4 | load("../testthat/testData.rda")  # fixture file; the following lines use the object `testData`
5 | setDT(testData)  # convert to data.table by reference
6 |
7 | test_that("PlotNumVar returns a gtable", {
8 | PrepData(testData, dateNm = "date", dateGp = "weeks", dateGpBp = "months")  # return value is discarded; presumably PrepData updates testData by reference -- confirm
9 | p <- PlotNumVar("age", testData, NULL, "weeks", "months",
10 | skewOpt = 3, kSample = NULL)$p  # numeric skewOpt; keep only the `p` component of the result
11 | expect_is(p, "gtable")
12 | })
13 |
14 | test_that("Incorrect skewOpt creates error", {
15 | PrepData(testData, dateNm = "date", dateGp = "weeks", dateGpBp = "months")
16 | expect_error(PlotNumVar("age", testData, NULL, "weeks", "months",
17 | skewOpt = "test", kSample = NULL)$p)  # same call as above except skewOpt is a character string
18 | })
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/man/bankLabels.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{bankLabels}
5 | \alias{bankLabels}
6 | \title{Labels for bankData}
7 | \format{A data frame with 16 rows and 3 variables:
8 | \describe{
9 | \item{V1}{Name of each variable in \code{\link{bankData}}.}
10 | \item{V2}{Label of each variable in \code{\link{bankData}}.}
11 | \item{V3}{A numeric variable, corresponding to the row number.}
12 | }}
13 | \usage{
14 | bankLabels
15 | }
16 | \description{
17 | A dataset containing the attribute labels also found in \code{\link{bankData}}.
18 | This data set is used to illustrate the \code{\link{PrepLabels}} function and
19 | other label functionality in the \code{\link{otvPlots}} package in R.
20 | }
21 | \keyword{datasets}
22 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(CalcR2)
4 | export(OrderByR2)
5 | export(PlotBarplot)
6 | export(PlotCatVar)
7 | export(PlotDist)
8 | export(PlotMean)
9 | export(PlotNumVar)
10 | export(PlotQuantiles)
11 | export(PlotRates)
12 | export(PlotRatesOverTime)
13 | export(PlotVar)
14 | export(PrepData)
15 | export(PrepLabels)
16 | export(PrintPlots)
17 | export(SummaryStats)
18 | export(vlm)
19 | import(data.table)
20 | import(ggplot2)
21 | importFrom(Hmisc,wtd.mean)
22 | importFrom(Hmisc,wtd.quantile)
23 | importFrom(Hmisc,wtd.var)
24 | importFrom(grDevices,cairo_pdf)
25 | importFrom(grDevices,dev.off)
26 | importFrom(graphics,par)
27 | importFrom(grid,gpar)
28 | importFrom(grid,grid.draw)
29 | importFrom(grid,grid.newpage)
30 | importFrom(grid,textGrob)
31 | importFrom(grid,unit)
32 | importFrom(grid,unit.c)
33 | importFrom(gridExtra,arrangeGrob)
34 | importFrom(moments,skewness)
35 | importFrom(scales,hue_pal)
36 | importFrom(stats,lm.fit)
37 | importFrom(stats,lm.wfit)
38 | importFrom(stats,quantile)
39 | importFrom(stats,sd)
40 | importFrom(stats,var)
41 | importFrom(stringi,stri_trans_general)
42 | importFrom(utils,tail)
43 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: otvPlots
2 | Title: Over Time Variable Plots
3 | Version: 0.2.1
4 | Authors@R: c(
5 | person("Rebecca", "Payne", role = "aut"),
6 | person("Zoey", "Zhu", role = c("aut")),
7 | person("Yingbo", "Li", email = "yingbo.li@capitalone.com", role = c("aut", "cre")),
8 | person("Capital One", role = "cph"))
9 | Description: Enables automated visualization of variable
10 | distribution and changes over time for predictive model building.
11 | Computes summary statistics aggregated by time for
12 | large datasets, and creates plots for variable level monitoring.
13 | Depends:
14 | R (>= 3.2.0)
15 | Imports:
16 | data.table (>= 1.9.6),
17 | ggplot2 (>= 2.1.0),
18 | grid (>= 3.2.0),
19 | gridExtra (>= 2.2.1),
20 | Hmisc (>= 3.17-4),
21 | moments,
22 | quantreg (>= 5.33),
23 | scales (>= 0.4.0),
24 | stringi (>= 1.1.1)
25 | License: Apache License 2.0 | file LICENSE
26 | LazyData: true
27 | Suggests:
28 | bit64,
29 | knitr,
30 | proto,
31 | testthat
32 | URL: https://github.com/capitalone/otvPlots
33 | BugReports: https://github.com/capitalone/otvPlots/issues
34 | RoxygenNote: 6.0.1
35 |
--------------------------------------------------------------------------------
/tests/testthat/drugLabel.csv:
--------------------------------------------------------------------------------
1 | col1,col2,,,,,
2 | CaseNumber,The case number,,,,,
3 | date,Date of the test,,,,,
4 | Sex,Gender of the patient,,,,,
5 | Race,Race of the patient,,,,,
6 | Age,Age of the patient,,,,,
7 | "Re""side-nce .City",,,,,,
8 | Residence State,,,,,,
9 | Residence County,,,,,,
10 | Death City,,,,,,
11 | ,Wrong result,,,,,
12 | Death State,,,,,,
13 | Death County,,,,,,
14 | Location,,,,,,
15 | DescriptionofInjury,The kind of injury the patient has,,,,,
16 | InjuryPlace,The place the injury exists,,,,,
17 | ImmediateCauseA,The cause of the injury,,,,,
18 | Heroin,Level of heroin used,,,,,
19 | Cocaine,Level of Cocaine used,,,,,
20 | Fentanyl,Level of Fentanyl used,,,,,
21 | Oxycodone,Level of Oxycodone used,,,,,
22 | Oxymorphone,Level of Oxymorphone used,,,,,
23 | EtOH,Level of EtOH used,,,,,
24 | Hydro-codeine,Level of Hydro-codeine used,,,,,
25 | Benzodiazepine,Level of Benzodiazepine used,,,,,
26 | Methadone,Level of Methadone used,,,,,
27 | Amphet,Level of Amphet used,,,,,
28 | Tramad,Level of Tramad used,,,,,
29 | Morphine_not_heroin,Morphine not heroin,,,,,
30 | Other,Other things,,,,,
31 | Any Opioid,Whether there is opioid,,,,,
32 | MannerofDeath,Manner of death,,,,,
33 | DeathLoc,The death location,,,,,
--------------------------------------------------------------------------------
/tests/testthat/test_vlm.R:
--------------------------------------------------------------------------------
1 | library(otvPlots)
2 | context("Run the main function: vlm")
3 | drugSASDate <- read.csv("../testthat/drugSASDate.csv")  # in-memory copy of the fixture; passed as dataFl in the second test
4 |
5 | test_that("At most one of sortVars and sortFn is passed in", {
6 | expect_error(vlm(dataFl = "../testthat/drugSASDate.csv",
7 | dateNm = "date", sortVars = c("age", "residencecity")))  # NOTE(review): no sortFn is supplied, so the error may not come from the sortVars/sortFn conflict named in the title -- confirm
8 | })
9 |
10 | test_that("varNms is a subset of sortVars", {
11 | expect_error(vlm(dataFl = drugSASDate, dateNm = "date",
12 | sortVars = c("age", "residencecity"), varNms = c("age")))  # sortVars contains "residencecity", which is not in varNms; NOTE(review): title reads inverted relative to these inputs -- confirm
13 | })
14 |
15 | test_that("Incorrect file input when prepData is False", {
16 | expect_error(vlm(dataFl = "../testthat/drugRDate.csv", dateNm = "date",
17 | prepData = FALSE))  # dataFl is a path string while prepData = FALSE
18 | })
19 |
20 | test_that("selectCols and dropCols together give an error", {
21 | expect_error(vlm("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",  # csv input
22 | dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
23 | selectCols = c("age", "balance", "date", "weight"),
24 | dropCols = c("default"), varNms = c("age")))  # both selectCols and dropCols supplied
25 |
26 | expect_error(vlm("../testthat/rawData.rda", dateNm = "date", weightNm = "weight",  # same check with an .rda input and no varNms
27 | dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
28 | selectCols = c("age", "balance", "date", "weight"),
29 | dropCols = c("default")))
30 | })
31 |
--------------------------------------------------------------------------------
/tests/testthat/test_PlotRates.R:
--------------------------------------------------------------------------------
1 | library(otvPlots)
2 | library(proto)
3 | context("Plot Continuous Rates over Time")
4 | load("../testthat/testData.rda")  # fixture file; the following lines use the object `testData`
5 | testData <- setDT(testData)
6 | testData <- testData[, .(balance, weight, date)]  # keep only the three columns selected here
7 | testData[, weeks := round(date, "weeks")]  # add the weekly grouping column derived from `date`
8 | testDT = testData[, {list("zerorate" = mean(balance == 0),  # per-week share of rows with balance == 0
9 | "missingrate" = mean(is.na(balance)))},  # per-week share of rows with NA balance
10 | by = "weeks"]
11 | testMT = melt(testDT, id.vars = "weeks",  # long format: one row per (weeks, rate name)
12 | measure.vars = c("zerorate", "missingrate"))
13 |
14 |
15 | test_that("Plot layers match expectations",{
16 | p <- PlotRates(testMT, "balance", "weeks")
17 | expect_is(p$layers[[1]], "ggproto")
18 | expect_is(p$layers[[1]]$geom, "GeomLine")  # first layer draws lines
19 | expect_is(p$layers[[1]]$stat, "StatIdentity")  # values are plotted as given
20 | })
21 |
22 | test_that("X axis is labelled 'weeks'",{
23 | p <- PlotRates(testMT, "balance", "weeks")
24 | expect_identical(p$labels$x, "weeks")
25 | expect_identical(p$labels$y, NULL)  # no y-axis label is set
26 | })
27 |
28 |
29 | test_that("Mapping layer contains expected elements", {
30 | p <- PlotRates(testMT, myVar = "balance", dateGp = "weeks")
31 | expect_true( "colour" %in% names(p$mapping))
32 | expect_true( "group" %in% names(p$mapping))
33 | expect_true( "x" %in% names(p$mapping))
34 | expect_true( "y" %in% names(p$mapping))
35 | expect_length(setdiff(c("colour", "group", "x", "y"), names(p$mapping)), 0)  # none of the four expected aesthetics is missing; extra aesthetics would not be caught
36 | })
37 |
38 |
--------------------------------------------------------------------------------
/tests/testthat/test_PlotMean.R:
--------------------------------------------------------------------------------
1 | library(otvPlots)
2 | library(proto)
3 | context("Plot Mean over Time")
4 | load("../testthat/testData.rda")  # fixture file; the following lines use the object `testData`
5 | testData <- setDT(testData)
6 | testData <- testData[, .(balance, weight, date)]  # keep only the three columns selected here
7 | testData[, weeks := round(date, "weeks")]  # add the weekly grouping column derived from `date`
8 |
9 | testDT = testData[, .(Mean = mean(balance)), by = "weeks"]  # per-week mean of balance
10 | cl = testData[, c(mean(balance), sd(balance))]  # overall mean and SD of balance
11 | cl = cl %*% matrix(c(1, 1, 1, -1), byrow = TRUE, nrow = 2) # mean +- 1 SD, i.e. c(mean + sd, mean - sd)
12 | testDT[, c("cl1", "cl2") := list(cl[1], cl[2]) ]  # the same two limits on every row
13 | testMT = melt(testDT, id.vars = "weeks",  # long format: one row per (weeks, series name)
14 | measure.vars = c("Mean", "cl1", "cl2"))
15 |
16 | test_that("Plot layers match expectations",{
17 | p <- PlotMean(testMT, "balance", "weeks")
18 | expect_is(p$layers[[1]], "ggproto")
19 | expect_is(p$layers[[1]]$geom, "GeomLine")  # first layer draws lines
20 | expect_is(p$layers[[1]]$stat, "StatIdentity")  # values are plotted as given
21 | })
22 |
23 | test_that("X axis is labelled 'weeks'",{
24 | p <- PlotMean(testMT, "balance", "weeks")
25 | expect_identical(p$labels$x, "weeks")
26 | expect_identical(p$labels$y, NULL)  # no y-axis label is set
27 | })
28 |
29 | test_that("Scale is discrete",{
30 | p <- PlotMean(testMT, "balance", "weeks")
31 | expect_is(p$scales$scales[[1]], "ScaleDiscrete")  # checks only the first scale attached to the plot
32 | })
33 |
34 | test_that("Mapping layer contains expected elements",{
35 | p <- PlotMean(testMT, "balance", "weeks")
36 | expect_true( "group" %in% names(p$mapping))
37 | expect_true("linetype" %in% names(p$mapping))
38 | expect_true( "x" %in% names(p$mapping))
39 | expect_true( "y" %in% names(p$mapping))
40 | expect_length(setdiff(c("group", "linetype", "x", "y"), names(p$mapping)), 0)  # none of the four expected aesthetics is missing; extra aesthetics would not be caught
41 | })
--------------------------------------------------------------------------------
/man/PlotRates.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/numerical.R
3 | \name{PlotRates}
4 | \alias{PlotRates}
5 | \title{Plot zero and missing rates for a numerical variable}
6 | \usage{
7 | PlotRates(meltdx, myVar, dateGp)
8 | }
9 | \arguments{
10 | \item{meltdx}{A \code{data.table} with missing rate and zero rate in long
11 | format, produced by \code{\link{SummaryStats}}}
12 |
13 | \item{myVar}{The name of the variable to be plotted}
14 |
15 | \item{dateGp}{Name of the variable that the time series plots should be
16 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"},
17 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
18 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
19 | }
20 | \value{
21 | A \code{ggplot2} object with a \code{missingrate} and
22 | \code{zerorate} grouped by \code{dateGp}.
23 | }
24 | \description{
25 | Plot zero and missing rates for a numerical variable
26 | }
27 | \section{License}{
28 |
29 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
30 | Version 2.0 (the "License"); you may not use this file except in compliance
31 | with the License. You may obtain a copy of the License at
32 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
33 | or agreed to in writing, software distributed under the License is
34 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
35 | KIND, either express or implied. See the License for the specific language
36 | governing permissions and limitations under the License.
37 | }
38 |
39 |
--------------------------------------------------------------------------------
/tests/testthat/test_PlotQuantiles.R:
--------------------------------------------------------------------------------
1 | library(otvPlots)
2 | library(proto)
3 | context("Plot Quantiles over Time")
4 | load("../testthat/testData.rda")  # fixture file; the following lines use the object `testData`
5 | setDT(testData)  # convert to data.table by reference
6 |
7 | testData[, weeks := round(date, "weeks")]  # add the weekly grouping column derived from `date`
8 | testDT = testData[, {
9 | tmp1 = quantile(balance, p = c(.01, .5, .99));  # `p` partially matches quantile()'s `probs` argument
10 | list("p1" = tmp1[1] ,
11 | "p50" = tmp1[2] ,
12 | "p99" = tmp1[3]
13 | )}, by = "weeks"]  # per-week 1st, 50th and 99th percentiles of balance
14 |
15 | testMT = melt(testDT, id.vars = "weeks",  # long format: one row per (weeks, percentile name)
16 | measure.vars = c("p99", "p50","p1"))
17 | globalPct = testData[ , quantile(balance, p = c(.01, .5, .99) ) ]  # the same three percentiles over all rows
18 | globalDT = data.table("weeks" = rep(testMT[variable == "p99", "weeks",  # the list of weeks, repeated three times
19 | with = FALSE][[1]], 3))
20 | globalDT[, c("variable", "value") := list(rep(c("p1_g", "p50_g", "p99_g"),  # one block of rows per global percentile label
21 | each = .N/3),
22 | rep(globalPct, each = .N/3))]  # each global value repeated across all weeks
23 | testMT = rbindlist(list( testMT, globalDT))  # append the global rows below the per-week rows
24 |
25 |
26 | test_that("Plot layers match expectations",{
27 | p <- PlotQuantiles(testMT, myVar = "balance", dateGp = "weeks")
28 | expect_is(p$layers[[1]], "ggproto")
29 | expect_is(p$layers[[1]]$geom, "GeomLine")  # first layer draws lines
30 | expect_is(p$layers[[1]]$stat, "StatIdentity")  # values are plotted as given
31 | })
32 |
33 | test_that("Mapping layer contains expected elements", {
34 | p <- PlotQuantiles(testMT, myVar = "balance", dateGp = "weeks")
35 | expect_true( "colour" %in% names(p$mapping))
36 | expect_true( "linetype" %in% names(p$mapping))
37 | expect_true( "group" %in% names(p$mapping))
38 | expect_true( "x" %in% names(p$mapping))
39 | expect_true( "y" %in% names(p$mapping))
40 | expect_length(setdiff(c("colour", "linetype", "group", "x", "y"), names(p$mapping)), 0)  # none of the five expected aesthetics is missing; extra aesthetics would not be caught
41 | })
42 |
43 |
--------------------------------------------------------------------------------
/man/PlotQuantiles.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/numerical.R
3 | \name{PlotQuantiles}
4 | \alias{PlotQuantiles}
5 | \title{Plot the 1st, 50th, and 99th percentiles of a numerical variable}
6 | \usage{
7 | PlotQuantiles(meltdx, myVar, dateGp)
8 | }
9 | \arguments{
10 | \item{meltdx}{A data.table with p1, p50, and p99 in long format, produced by
11 | \code{\link{SummaryStats}}}
12 |
13 | \item{myVar}{The name of the variable to be plotted}
14 |
15 | \item{dateGp}{Name of the variable that the time series plots should be
16 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"},
17 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
18 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
19 | }
20 | \value{
21 | A \code{ggplot2} object with \code{dateGp} on the x axis,
22 | \code{value} on the y axis, and variables \code{p1}, \code{p50}, and
23 | \code{p99} plotted on the same graph, with grouped and global percentiles
24 | differentiated by line type.
25 | }
26 | \description{
27 | Plot the 1st, 50th, and 99th percentiles of a numerical variable
28 | }
29 | \section{License}{
30 |
31 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
32 | Version 2.0 (the "License"); you may not use this file except in compliance
33 | with the License. You may obtain a copy of the License at
34 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
35 | or agreed to in writing, software distributed under the License is
36 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
37 | KIND, either express or implied. See the License for the specific language
38 | governing permissions and limitations under the License.
39 | }
40 |
41 |
--------------------------------------------------------------------------------
/man/PlotMean.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/numerical.R
3 | \name{PlotMean}
4 | \alias{PlotMean}
5 | \title{Plot mean with {Mean +- 1SD} control limits for a numerical variable}
6 | \usage{
7 | PlotMean(meltdx, myVar, dateGp)
8 | }
9 | \arguments{
10 | \item{meltdx}{A \code{data.table} with Mean and 1SD control limits in long format,
11 | produced by \code{\link{SummaryStats}}}
12 |
13 | \item{myVar}{The name of the variable to be plotted}
14 |
15 | \item{dateGp}{Name of the variable that the time series plots should be
16 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"},
17 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
18 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
19 | }
20 | \value{
21 | A \code{ggplot2} object with \code{dateGp} on the x axis,
22 | \code{value} on the y axis, and variables \code{Mean}, \code{cl1}, and
23 | \code{cl2} plotted on the same graph, with mean and control limits
24 | differentiated by line type.
25 | }
26 | \description{
27 | Plot mean with {Mean +- 1SD} control limits for a numerical variable
28 | }
29 | \section{License}{
30 |
31 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
32 | Version 2.0 (the "License"); you may not use this file except in compliance
33 | with the License. You may obtain a copy of the License at
34 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
35 | or agreed to in writing, software distributed under the License is
36 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
37 | KIND, either express or implied. See the License for the specific language
38 | governing permissions and limitations under the License.
39 | }
40 |
41 |
--------------------------------------------------------------------------------
/tests/testthat/test_PlotBarplot.R:
--------------------------------------------------------------------------------
1 | library(otvPlots)
2 | library(proto)
3 | context("Plot bar plot")
4 | load("../testthat/testData.rda")  # fixture file; the following lines use the object `testData`
5 | setDT(testData)  # convert to data.table by reference
6 | suppressMessages(PrepData(testData, dateNm = "date",  # return value is discarded; presumably PrepData updates testData by reference -- confirm
7 | dateGp = "weeks", dateGpBp = "weeks", weightNm = "weight"))
8 |
9 | test_that("expected plot elements are returned", {
10 | p <- PlotBarplot(dataFl = testData, myVar = "job", weightNm = "weight")
11 |
12 | expect_is(p$layers[[1]], "ggproto")
13 | expect_is(p$layers[[1]]$geom, "GeomBar")  # first layer draws bars
14 | expect_is(p$layers[[1]]$stat, "StatIdentity")  # bar heights are plotted as given
15 | expect_identical(p$labels$x, "job")
16 | expect_identical(p$labels$y, "count")
17 | expect_is(p$scales$scales[[1]], "ScaleDiscrete")  # checks only the first scale attached to the plot
18 | expect_true( "group" %in% names(p$mapping))
19 | expect_true( "x" %in% names(p$mapping))
20 | expect_true( "y" %in% names(p$mapping))
21 | expect_length(setdiff(c("group", "x", "y"), names(p$mapping)), 0)  # none of the three expected aesthetics is missing; extra aesthetics would not be caught
22 | })
23 |
24 | test_that("variable is put in expected order with and without weights", {
25 | p <- PlotBarplot(dataFl = testData, myVar = "job", weightNm = "weight")
26 | o1 <- names(rev(sort(xtabs(weight~job, data=testData))))  # job categories by decreasing total weight
27 | o2 <- as.character(p$data[order(-count)][["job"]])  # job categories in the plot data by decreasing count
28 | expect_equal(o1, o2)
29 |
30 | p <- PlotBarplot(dataFl = testData, myVar = "job", weightNm = NULL)
31 | o1 <- names(rev(sort(testData[, table(job)])))  # job categories by decreasing row count
32 | o2 <- rev(as.character(p$data[order(count)][["job"]]))  # ascending order reversed, mirroring rev(sort(...)) above
33 | expect_equal(o1, o2)
34 | })
35 |
36 | test_that("global totals are calculated as expected", {
37 | p1 <- PlotBarplot(dataFl = testData, myVar = "job", weightNm = "weight")
38 | expect_equal(as.numeric(p1$data[job=="retired"]$count), as.numeric(testData[job=="retired", sum(weight)]))  # weighted: count equals the sum of weights
39 | p2 <- PlotBarplot(dataFl = testData, myVar = "job", weightNm = NULL)
40 | expect_equal(as.numeric(p2$data[job=="entrepreneur"]$count), as.numeric(testData[job=="entrepreneur", .N]))  # unweighted: count equals the number of rows
41 | })
42 |
43 |
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/man/PlotBarplot.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/categorical.R
3 | \name{PlotBarplot}
4 | \alias{PlotBarplot}
5 | \title{Creates a bar plot for a discrete (or binary) variable}
6 | \usage{
7 | PlotBarplot(dataFl, myVar, weightNm = NULL)
8 | }
9 | \arguments{
10 | \item{dataFl}{A \code{data.table} of data; must be the output of the
11 | \code{\link{PrepData}} function.}
12 |
13 | \item{myVar}{The name of the variable to be plotted}
14 |
15 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for
16 | no weights (all rows receiving weight 1).}
17 | }
18 | \value{
19 | A \code{ggplot} object with a bar plot of \code{myVar} ordered by
20 | category frequency
21 | }
22 | \description{
23 | Creates a bar plot for a discrete (or binary) variable
24 | }
25 | \section{License}{
26 |
27 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
28 | Version 2.0 (the "License"); you may not use this file except in compliance
29 | with the License. You may obtain a copy of the License at
30 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
31 | or agreed to in writing, software distributed under the License is
32 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
33 | KIND, either express or implied. See the License for the specific language
34 | governing permissions and limitations under the License.
35 | }
36 |
37 | \examples{
38 | data(bankData)
39 | bankData = PrepData(bankData, dateNm = "date", dateGp = "months",
40 | dateGpBp = "quarters", weightNm = NULL)
41 | PlotBarplot(bankData, "job")
42 |
43 | ## NA will be included as a category if any NA are present
44 | bankData[sample.int(.N)[1:1000], education := NA]
45 | PlotBarplot(bankData, "education")
46 | }
47 | \seealso{
48 | Functions that depend on this function:
49 | \code{\link{PlotCatVar}}.
50 |
51 | This function depends on:
52 | \code{\link{PrepData}}.
53 | }
54 |
--------------------------------------------------------------------------------
/man/PrepLabels.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/prep.R
3 | \name{PrepLabels}
4 | \alias{PrepLabels}
5 | \title{Prepare variable labels}
6 | \usage{
7 | PrepLabels(labelFl, idx = 1:2)
8 | }
9 | \arguments{
10 | \item{labelFl}{Either the path of a dataset (a csv file) containing
11 | labels, an R object convertible to \code{data.table} (e.g., data frame) or
12 | \code{NULL}. If \code{NULL}, no labels will be used. The label dataset must
13 | contain at least 2 columns: \code{varCol} (variable names) and
14 | \code{labelCol} (variable labels).}
15 |
16 | \item{idx}{A vector of length 2, giving column index of variable names (first
17 | position) and labels (second position).}
18 | }
19 | \value{
20 | A data table formatted for use by the \code{\link{vlm}} function.
21 | }
22 | \description{
23 | This function prepares a dataset containing variable labels for use by
24 | the main plotting function \code{\link{vlm}}. The input must contain
25 | variables' names in the first column and labels in the second column. All other
26 | columns will be dropped. Special characters will create errors and should
27 | be stripped outside of R. All labels will be truncated at 145 characters.
28 | }
29 | \section{License}{
30 |
31 | Copyright 2017 Capital One Services, LLC Licensed under the
32 | Apache License, Version 2.0 (the "License"); you may not use this file
33 | except in compliance with the License. You may obtain a copy of the
34 | License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by
35 | applicable law or agreed to in writing, software distributed under the
36 | License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
37 | CONDITIONS OF ANY KIND, either express or implied. See the License for the
38 | specific language governing permissions and limitations under the License.
39 | }
40 |
41 | \examples{
42 | data(bankLabels)
43 | bankLabels <- PrepLabels(bankLabels)
44 | }
45 | \seealso{
46 | Functions that depend on this function:
47 | \code{\link{PrintPlots}},
48 | \code{\link{vlm}}.
49 | }
50 |
--------------------------------------------------------------------------------
/man/CalcR2.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plots_order.R
3 | \name{CalcR2}
4 | \alias{CalcR2}
5 | \title{Calculates R2 of a numerical variable using date as the predictor}
6 | \usage{
7 | CalcR2(myVar, dataFl, dateNm, weightNm = NULL, imputeValue = NULL)
8 | }
9 | \arguments{
10 | \item{myVar}{Name of variable to model.}
11 |
12 | \item{dataFl}{A \code{data.table}, containing \code{myVar}, \code{dateNm},
13 | and \code{weightNm}.}
14 |
15 | \item{dateNm}{Name of column containing the date variable (to be modeled as
16 | numeric); this date column must not have NA's.}
17 |
18 | \item{weightNm}{Name of column containing row weights. If weights equal one,
19 | then the \code{\link{lm.fit}} function will be called, otherwise the
20 | \code{\link{lm.wfit}} will be called. The weights column must not have NA's.}
21 |
22 | \item{imputeValue}{Either \code{NULL} or numeric. If \code{NULL}, model will
23 | be fit on only non-NA components of \code{myVar}. If numeric, missing cases
24 | of \code{myVar} will be imputed to \code{imputeValue}.}
25 | }
26 | \value{
27 | A numeric value of R2.
28 | }
29 | \description{
30 | Calculates weighted R2 of a univariate weighted linear model with
31 | \code{dateNm} as x and \code{myVar} as y using the workhorse \code{lm.fit}
32 | and \code{lm.wfit} functions.
33 | }
34 | \section{License}{
35 |
36 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
37 | Version 2.0 (the "License"); you may not use this file except in compliance
38 | with the License. You may obtain a copy of the License at
39 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
40 | or agreed to in writing, software distributed under the License is
41 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
42 | KIND, either express or implied. See the License for the specific language
43 | governing permissions and limitations under the License.
44 | }
45 |
46 | \seealso{
47 | Functions depend on this function:
48 | \code{\link{OrderByR2}}.
49 |
50 | This function depends on:
51 | \code{\link{PrepData}}.
52 | }
53 |
--------------------------------------------------------------------------------
/tests/testthat/test_PlotRatesOverTime.R:
--------------------------------------------------------------------------------
1 | library(otvPlots)
2 | library(proto)
3 | context("Plot trace plots of categories' proportions over time")
4 | load("../testthat/testData.rda")  # provides the testData object used below
5 | setDT(testData)
6 | library(ggplot2)  # library() stops if ggplot2 is missing; require() would only return FALSE
7 | suppressMessages(PrepData(testData, dateNm = "date",
8 |                           dateGp = "weeks", dateGpBp = "weeks", weightNm = "weight"))
9 | p <- PlotRatesOverTime(dataFl = testData, dateGp = "weeks", myVar = "job",  # plot shared by the first two tests
10 |                        weightNm = "weight", newLevels = NULL)$p
11 | test_that("expected plot elements are returned", {
12 | expect_is(p$layers[[1]], "ggproto")
13 | expect_is(p$layers[[1]]$geom, "GeomLine")
14 | expect_is(p$layers[[1]]$stat, "StatIdentity")
15 | expect_is(p$layers[[1]]$position, "PositionIdentity")
16 | expect_identical(p$labels$x, "weeks")
17 | expect_identical(p$labels$y, "")
18 | expect_is(p$scales$scales[[1]], "ScaleContinuousDate")
19 | })
20 |
21 | test_that("rates are calculated correctly normalized by time", {
22 | dat = p$data
23 | # check that all weeks sum to 1
24 | dat[, sum := sum(rate), by = "weeks"]
25 | dat[, table(sum)]
26 | expect_length(dat[, table(sum)], 1)
27 |
28 | # check that 2008-06-03 is correctly calculated
29 | tmpData = testData[weeks == "2008-06-03"]
30 | tmpData[, rate1 := sum(weight), by = "job"]
31 | tmpData[, rate0 := sum(weight)]
32 | tmpData[, rate := rate1/rate0]
33 |
34 | tmpData = unique(tmpData[, .(job, weeks, rate)])
35 | dat = dat[weeks == "2008-06-03" & rate > 0, .(weeks, job, rate)]
36 | dat[, job := as.character(job)]
37 | setkey(dat, job)
38 | setkey(tmpData, job)
39 | expect_equal(dat[, rate], tmpData[, rate])
40 | })
41 |
42 | test_that("rates are calculated correctly normalized by var", {
43 | p <- PlotRatesOverTime(dataFl = testData, dateGp = "weeks", myVar = "job",
44 | weightNm = "weight", newLevels = NULL, normBy = "var")$p
45 | dat = p$data
46 | dat[, sum := sum(rate), by = "job"]
47 |
48 | #check all var rates sum to one
49 | expect_length(dat[, table(sum)], 1)
50 | expect_equal(dat[1, sum], 1)
51 |
52 | # check that rates are correctly calculated for technician
53 | tmpData = testData[job == "technician"]
54 | tmpData[, rate1:=sum(weight), by = "weeks"]
55 | tmpData[, rate0:= sum(weight)]
56 | tmpData[, rate := rate1/rate0]
57 | tmpData = unique(tmpData[, .(job, weeks, rate)])
58 | expect_equal(tmpData[1:4, rate], dat[job=="technician"][2:5, rate])
59 | })
60 |
61 |
--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC
2 | # SPDX-License-Identifier: Apache-2.0
3 | # Copyright 2017 Capital One Services, LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | #
8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software distributed
11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
12 | # OF ANY KIND, either express or implied.
13 | #
14 | # See the License for the specific language governing permissions and limitations under the License.
15 |
16 |
17 | ###########################################
18 | # Utility Functions #
19 | ###########################################
20 |
21 | is.nmrcl <- function(x) { inherits(x, what = "nmrcl") }  # does x inherit from class "nmrcl"?
22 | is.ctgrl <- function(x) { inherits(x, what = "ctgrl") }  # does x inherit from class "ctgrl"?
23 |
24 | wtd_quantile_NA <- function(x, weights, probs = c(.0, .25, .5, .75, 1), ...) {
25 |   # Formerly wtd.quantile_NA. Any error yields one NA_real_ per requested prob.
26 |   na_fallback <- function(e) rep(NA_real_, length(probs))
27 |   tryCatch(as.double(Hmisc::wtd.quantile(x, weights, probs, normwt = TRUE,
28 |                                          na.rm = TRUE, ...)), error = na_fallback)
29 | }
30 |
31 | ## The color-blind friendly color palette
32 | ## Source: http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/#a-colorblind-friendly-palette
33 | cbbPalette <- c("#D55E00", "#009E73", "#0072B2", "#000000", "#E69F00", "#56B4E9", "#F0E442", "#CC79A7")
34 |
35 | # # An example function for fuzzy label matching
36 | # # To be used as an input of the \code{\link{PlotVar}} function.
37 | # # If variables look like VAR_nameofvar, and the attribute dictionary contains
38 | # # definitions only for nameofvar, then a fuzzy matching function can be
39 | # # provided which would first attempt to match exactly, and then attempt to
40 | # # match on the longest piece after splitting on the underscore:
41 | #
42 | # Fuzzy = function(labelFl, myVar){
43 | # ll = labelFl[varCol == myVar, labelCol] # exact match
44 | # if (ll == ""){
45 | # # split on "_", search for exact match of longest piece
46 | # shortNm = names(which.max(sapply(strsplit(myVar, "_")[[1]], nchar)))
47 | # ll = labelFl[varCol == shortNm, labelCol]
48 | # }
49 | # return(ll)
50 | # }
51 |
--------------------------------------------------------------------------------
/man/SummaryStats.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/numerical.R
3 | \name{SummaryStats}
4 | \alias{SummaryStats}
5 | \title{Create summary statistics for a numerical variable}
6 | \usage{
7 | SummaryStats(myVar, dataFl, dateGp, weightNm = NULL)
8 | }
9 | \arguments{
10 | \item{myVar}{The name of the variable to be plotted}
11 |
12 | \item{dataFl}{A \code{data.table} of data; must be the output of the
13 | \code{\link{PrepData}} function.}
14 |
15 | \item{dateGp}{Name of the variable that the time series plots should be
16 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"},
17 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
18 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
19 |
20 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for
21 | no weights (all rows receiving weight 1).}
22 | }
23 | \value{
24 | \item{meltdx}{A \code{data.table} for use by the plotting functions
25 | \code{\link{PlotMean}}, \code{\link{PlotQuantiles}}, and
26 | \code{\link{PlotRates}}.}
27 | \item{numVarSummary}{A \code{data.table} of summary statistics.}
28 | }
29 | \description{
30 | Create summary statistics for a numerical variable
31 | }
32 | \section{License}{
33 |
34 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
35 | Version 2.0 (the "License"); you may not use this file except in compliance
36 | with the License. You may obtain a copy of the License at
37 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
38 | or agreed to in writing, software distributed under the License is
39 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
40 | KIND, either express or implied. See the License for the specific language
41 | governing permissions and limitations under the License.
42 | }
43 |
44 | \examples{
45 | data(bankData)
46 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "quarters",
47 | dateGpBp = "years")
48 | mdx <- SummaryStats(myVar = "age", dataFl = bankData,
49 | dateGp = "quarters")$meltdx
50 | plot(PlotQuantiles(mdx[variable \%in\% c("p99", "p50", "p1", "p99_g", "p50_g",
51 | "p1_g")], "age", "quarters"))
52 | plot(PlotMean(mdx[variable \%in\% c("mean", "cl1", "cl2")], "age", "quarters"))
53 | plot(PlotRates(mdx, "age", "quarters"))
54 | }
55 |
--------------------------------------------------------------------------------
/tests/testthat/test_PlotDist.R:
--------------------------------------------------------------------------------
1 | library(otvPlots)
2 | library(proto)
3 | context("Plot Boxplots")
4 | load("../testthat/testData.rda")
5 | setDT(testData)
6 | suppressMessages(PrepData(dataFl = testData, dateNm = "date", dateGp = "weeks", dateGpBp = "weeks"))
7 |
8 | test_that("Plot layers match expectations",{
9 | p <- PlotDist(dataFl = testData, myVar = "balance", dateGpBp = "weeks", weightNm = "weight")
10 | expect_is(p$layers[[1]], "ggproto")
11 | expect_is(p$layers[[1]]$geom, "GeomBoxplot")
12 | expect_is(p$layers[[1]]$stat, "StatBoxplot")
13 | expect_is(p$layers[[2]]$geom, "GeomRug")
14 | expect_is(p$layers[[2]]$stat, "StatIdentity")
15 | })
16 |
17 |
18 | test_that("Mapping layer contains expected elements", {
19 |   p <- PlotDist(testData, myVar = "balance", dateGpBp = "weeks")
20 |   expect_true("group" %in% names(p$mapping))  # plot-level aesthetics: group, x, y
21 |   expect_true("x" %in% names(p$mapping))
22 |   expect_true("y" %in% names(p$mapping))
23 |   expect_length(setdiff(c("group", "x", "y"), names(p$mapping)), 0)
24 |   # Second layer: its own mapping (not the plot-level one) must provide x and y.
25 |   expect_true("x" %in% names(p$layers[[2]]$mapping))
26 |   expect_true("y" %in% names(p$layers[[2]]$mapping))
27 |   expect_length(setdiff(c("x", "y"), names(p$layers[[2]]$mapping)), 0)
28 | })
29 |
30 |
31 | test_that("Y axis is labeled 'balance' and X axis is labeled 'weeks'",{
32 | p <- PlotDist(testData, "balance", "weeks")
33 | expect_identical(p$labels$x, "weeks")
34 | expect_identical(p$labels$y, "balance")
35 | })
36 |
37 | test_that("invalid log transform returns message and untransformed plot", {
38 | expect_message(PlotDist(dataFl = testData, myVar = "balance", dateGpBp = "weeks", skewOpt = 3),
39 | "untransformed boxplot")
40 | p <- PlotDist(dataFl = testData, myVar = "balance", dateGpBp = "weeks", skewOpt = 3)
41 | expect_is(p$layers[[1]], "ggproto")
42 | expect_is(p$layers[[1]]$geom, "GeomBoxplot")
43 | expect_is(p$layers[[1]]$stat, "StatBoxplot")
44 | expect_is(p$layers[[2]]$geom, "GeomRug")
45 | expect_is(p$layers[[2]]$stat, "StatIdentity")
46 | expect_equal(length(grep("log10", p$labels$y)), 0)
47 | })
48 |
49 | test_that("valid log transform returns transformed scale",{
50 | testData[, posBalance := ifelse(balance >= 0.1, balance, 0.1)]
51 | p <- PlotDist(dataFl = testData, myVar = "posBalance", dateGpBp = "weeks", skewOpt = 3)
52 | expect_message(PlotDist(dataFl = testData, myVar = "posBalance", dateGpBp = "weeks", skewOpt = 3),
53 | "Scale for 'y' is already present")
54 | expect_is(p$layers[[1]], "ggproto")
55 | expect_is(p$layers[[1]]$geom, "GeomBoxplot")
56 | expect_is(p$layers[[1]]$stat, "StatBoxplot")
57 | expect_is(p$layers[[2]]$geom, "GeomRug")
58 | expect_is(p$layers[[2]]$stat, "StatIdentity")
59 | expect_equal(grep("log10", p$labels$y), 1)
60 | })
61 |
--------------------------------------------------------------------------------
/man/PlotDist.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/numerical.R
3 | \name{PlotDist}
4 | \alias{PlotDist}
5 | \title{Side-by-side box plots, for a numerical variable, grouped by \code{dateGpBp}}
6 | \usage{
7 | PlotDist(dataFl, myVar, dateGpBp, weightNm = NULL, skewOpt = NULL)
8 | }
9 | \arguments{
10 | \item{dataFl}{A \code{data.table} of data; must be the output of the
11 | \code{\link{PrepData}} function.}
12 |
13 | \item{myVar}{The name of the variable to be plotted}
14 |
15 | \item{dateGpBp}{Name of variable the boxplots should be grouped by. Same
16 | options as \code{dateGp}. If \code{NULL}, then \code{dateGp} will be used.}
17 |
18 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for
19 | no weights (all rows receiving weight 1).}
20 |
21 | \item{skewOpt}{Either a numeric constant or \code{NULL}. Default is
22 | \code{NULL} (no transformation). If numeric, say 5, then all box plots of
23 | a variable whose skewness exceeds 5 will be on a log10 scale if possible.
24 | Negative input of \code{skewOpt} will be converted to 3.}
25 | }
26 | \value{
27 | A \code{ggplot2} object with a box plot of \code{myVar} grouped by
28 | \code{dateGpBp}
29 | }
30 | \description{
31 | For a variable that is all positive (no zeros) and has more than 50
32 | distinct values, all box plots can be drawn on a log base 10 scale if the
33 | variable is highly skewed. See the argument \code{skewOpt} for details.
34 | }
35 | \section{License}{
36 |
37 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
38 | Version 2.0 (the "License"); you may not use this file except in compliance
39 | with the License. You may obtain a copy of the License at
40 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
41 | or agreed to in writing, software distributed under the License is
42 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
43 | KIND, either express or implied. See the License for the specific language
44 | governing permissions and limitations under the License.
45 | }
46 |
47 | \examples{
48 | data(bankData)
49 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months",
50 | dateGpBp = "quarters")
51 | PlotDist(dataFl = bankData, myVar = "balance", dateGpBp = "quarters")
52 | ## The following attempt to log transform will fail due to negative values,
53 | ## and the untransformed version will be returned
54 | PlotDist(dataFl = bankData, myVar = "balance", dateGpBp = "quarters",
55 | skewOpt = 3)
56 | ## This attempt should succeed, as the skew exceeds 3 and there are no
57 | ## negative values
58 | PlotDist(dataFl = bankData, myVar = "duration", dateGpBp = "quarters",
59 | skewOpt = 3)
60 | }
61 |
--------------------------------------------------------------------------------
/tests/testthat/test_CalcR2.R:
--------------------------------------------------------------------------------
1 | library(otvPlots)
2 | context("Calculate R-squared")
3 | load("../testthat/testData.rda")
4 | testData <- setDT(testData)
5 | testData <- testData[, .(age, weight, date)]
6 |
7 | test_that("CalcR2 gives correct R2 with weight", {
8 | test.R2 <- CalcR2("age", testData, "date", weightNm = "weight", imputeValue = NULL)
9 | ans.R2 <- summary(lm(age~date, weight=weight, data=testData))$r.squared
10 | expect_equal(test.R2, ans.R2)
11 | })
12 |
13 |
14 | test_that("CalcR2 gives correct R2 without weight", {
15 | test.R2 <- CalcR2("age", testData, "date", weightNm = NULL, imputeValue = NULL)
16 | ans.R2 <- summary(lm(age~date, data=testData))$r.squared
17 | expect_equal(test.R2, ans.R2)
18 | })
19 |
20 | # testData1 has missings in Y (`:=` assigns by reference, so testData itself is modified too)
21 | idx = sample.int(100, 10)
22 | testData1 = testData[idx, age:=NA]
23 |
24 | test_that("CalcR2 is correct with imputation in Y and weight", {
25 | test.R2 <- CalcR2("age", testData1, "date", weightNm = "weight", imputeValue = 0)
26 | ans.R2 <- summary(lm(age~date, data=testData1[is.na(age), age:=0], weight=weight))$r.squared
27 | expect_equal(test.R2, ans.R2)
28 | })
29 |
30 | test_that("CalcR2 is correct with imputation in Y", {
31 | test.R2 <- CalcR2("age", testData1, "date", weightNm = NULL, imputeValue = 0)
32 | ans.R2 <- summary(lm(age~date, data=testData1[is.na(age), age:=0]))$r.squared
33 | expect_equal(test.R2, ans.R2)
34 | })
35 |
36 | #testData2 has missings in weight and date, but not in Y
37 | testData2 = testData[sample.int(.N, 10), weight := NA]
38 | testData2 = testData2[sample.int(.N, 10), date := NA]
39 | test_that("CalcR2 is correct with missing values in weight and date", {
40 | test.R2 <- CalcR2("age", testData2, "date", weightNm = "weight", imputeValue = NULL)
41 | ans.R2 <- summary(lm(age~date, data=testData2, weight=weight))$r.squared
42 | expect_equal(test.R2, ans.R2)
43 | })
44 |
45 | #testData3 has missing in weight, date and Y
46 | testData3 = testData2[idx, age := NA]
47 | test_that("CalcR2 is correct with missing values in weight and date and Y", {
48 | test.R2 <- CalcR2("age", testData3, "date", weightNm = "weight", imputeValue = NULL)
49 | ans.R2 <- summary(lm(age~date, data=testData3, weight=weight))$r.squared
50 | expect_equal(test.R2, ans.R2)
51 | })
52 |
53 |
54 | test_that("CalcR2 is correct with missing values in weight and date and Y and imputation", {
55 | test.R2 <- CalcR2("age", testData3, "date", weightNm = "weight", imputeValue = 0)
56 | ans.R2 <- summary(lm(age~date, data=testData3[is.na(age), age:=0], weight=weight))$r.squared
57 | expect_equal(test.R2, ans.R2)
58 | })
59 |
60 |
61 | test_that("CalcR2 is correct with no weight and missing values in date and Y", {
62 | test.R2 <- CalcR2("age", testData3, "date", weightNm = NULL, imputeValue = NULL)
63 | ans.R2 <- summary(lm(age~date, data=testData3))$r.squared
64 | expect_equal(test.R2, ans.R2)
65 | })
66 |
67 |
68 | test_that("CalcR2 is correct with no weight and missing values in date and Y imputed", {
69 | test.R2 <- CalcR2("age", testData3, "date", weightNm = NULL, imputeValue = 0)
70 | ans.R2 <- summary(lm(age~date, data=testData3[is.na(age), age:=0]))$r.squared
71 | expect_equal(test.R2, ans.R2)
72 | })
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
--------------------------------------------------------------------------------
/man/bankData.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{bankData}
5 | \alias{bankData}
6 | \title{Direct marketing campaigns of a Portuguese banking institution}
7 | \format{A data frame with 45,211 rows and 19 variables:
8 | \describe{
9 | \item{age}{Age of the client, numeric.}
10 | \item{job}{Type of job, a categorical variable with the levels:
11 | \code{'admin.'}, \code{'blue-collar'}, \code{'entrepreneur'},
12 | \code{'housemaid'}, \code{'management'}, \code{'retired'},
13 | \code{'self-employed'}, \code{'services'}, \code{'student'},
14 | \code{'technician'}, \code{'unemployed'}, and \code{'unknown'}.}
15 | \item{marital}{Marital status, a categorical variable with levels:
16 | \code{'divorced'}, \code{'married'}, \code{'single'}, and \code{'unknown'}.
17 | Note that \code{'divorced'} means either divorced or widowed.}
18 | \item{education}{A categorical variable with levels: \code{'basic.4y'},
19 | \code{'basic.6y'}, \code{'basic.9y'}, \code{'high.school'},
20 | \code{'illiterate'}, \code{'professional.course'},
21 | \code{'university.degree'}, and \code{'unknown'}.}
22 | \item{default}{Whether credit is in default, a categorical variable with
23 | levels: \code{'no'}, \code{'yes'}, and \code{'unknown'}.}
24 | \item{balance}{Account balance, numeric.}
25 | \item{housing}{Whether the client has a housing loan, a categorical variable
26 | with levels: \code{'no'}, \code{'yes'}, and \code{'unknown'}.}
27 | \item{loan}{Whether the client has a personal loan, a categorical variable
28 | with levels: \code{'no'}, \code{'yes'}, and \code{'unknown'}.}
29 | \item{contact}{Type of contact communication, a categorical variable
30 | with levels: \code{'cellular'} and \code{'telephone'}.}
31 | \item{duration}{Last contact duration in seconds, a numeric variable.}
32 | \item{campaign}{Number of contacts performed during this campaign for
33 | this client, including the last contact; a numeric variable.}
34 | \item{pdays}{Number of days that passed after the client was last
35 | contacted in a previous campaign; a numeric variable, where \code{999}
36 | means that the client was not previously contacted.}
37 | \item{previous}{Number of contacts performed before this campaign for this
38 | client, a numeric variable.}
39 | \item{poutcome}{Outcome of the previous marketing campaign, a categorical
40 | variable with levels: \code{'failure'}, \code{'nonexistent'},
41 | and \code{'success'}.}
42 | \item{y}{Whether the client has subscribed a term deposit, a categorical
43 | variable with levels: \code{'yes'} and \code{'no'}.}
44 | \item{date}{Last contact date.}
45 | }}
46 | \source{
47 | \url{https://archive.ics.uci.edu/ml/datasets/Bank+Marketing}
48 |
49 | \cite{Lichman, M. (2013). UCI Machine Learning Repository
50 | [\url{http://archive.ics.uci.edu/ml}]. Irvine, CA: University of California,
51 | School of Information and Computer Science.}
52 |
53 | \cite{S. Moro, P. Cortez, and P. Rita. (2014) A Data-Driven Approach
54 | to Predict the Success of Bank Telemarketing. Decision Support Systems,
55 | Elsevier, 62:22-31, June 2014.}
56 | }
57 | \usage{
58 | bankData
59 | }
60 | \description{
61 | The marketing campaigns were based on phone calls.
62 | Often, more than one contact to the same client was required, in order to
63 | assess whether the product (bank term deposit) would be ('yes') or not ('no')
64 | subscribed. Records are ordered by date (from May 2008 to November 2010),
65 | similar to data analyzed in Moro et al. [2014].
66 | }
67 | \keyword{datasets}
68 |
--------------------------------------------------------------------------------
/man/PlotRatesOverTime.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/categorical.R
3 | \name{PlotRatesOverTime}
4 | \alias{PlotRatesOverTime}
5 | \title{Creates trace plots of categories' proportions over time for a discrete (or
6 | binary) variable}
7 | \usage{
8 | PlotRatesOverTime(dataFl, dateGp, myVar, normBy = "time", weightNm = NULL,
9 | newLevels = NULL, kCategories = 9)
10 | }
11 | \arguments{
12 | \item{dataFl}{A \code{data.table} of data; must be the output of the
13 | \code{\link{PrepData}} function.}
14 |
15 | \item{dateGp}{Name of the variable that the time series plots should be
16 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"},
17 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
18 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
19 |
20 | \item{myVar}{The name of the variable to be plotted}
21 |
22 | \item{normBy}{The normalization factor for rate plots, can be \code{"time"}
23 | or \code{"var"}. If \code{"time"}, then for each time period of
24 | \code{dateGp}, counts are normalized by the total counts over all
25 | categories in that time period. This illustrates changes of categories'
26 | proportions over time. If \code{"var"}, then for each category, its counts
27 | are normalized by the total counts over time from only this category. This
28 | illustrates changes of categories' volumes over time.}
29 |
30 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for
31 | no weights (all rows receiving weight 1).}
32 |
33 | \item{newLevels}{categories of \code{myVar} in order of global frequency}
34 |
35 | \item{kCategories}{If a categorical variable has more than \code{kCategories},
36 | trace plots of only the \code{kCategories} most prevalent categories are
37 | plotted.}
38 | }
39 | \value{
40 | A list:
41 | \item{p}{\code{ggplot} object, trace plots of categories' proportions
42 | \code{myVar} over time.}
43 | \item{catVarSummary}{A \code{data.table}, contains categories' proportions
44 | globally, and over-time in each time period in \code{dateGp}. Each row is
45 | a category of the categorical (or binary) variable \code{myVar}. The row
46 | whose \code{category == 'NA'} corresponds to missing. Categories are
47 | ordered by global prevalence in a descending order.}
48 | }
49 | \description{
50 | Creates trace plots of categories' proportions over time for a discrete (or
51 | binary) variable
52 | }
53 | \section{License}{
54 |
55 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
56 | Version 2.0 (the "License"); you may not use this file except in compliance
57 | with the License. You may obtain a copy of the License at
58 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
59 | or agreed to in writing, software distributed under the License is
60 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
61 | KIND, either express or implied. See the License for the specific language
62 | governing permissions and limitations under the License.
63 | }
64 |
65 | \examples{
66 | data(bankData)
67 | bankData$weight = rpois(nrow(bankData), 5)
68 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months",
69 | dateGpBp = "quarters", weightNm = "weight")
70 | PlotRatesOverTime(dataFl = bankData, dateGp = "months", weightNm = "weight",
71 | myVar = "job", newLevels = NULL, normBy = "time")
72 |
73 | }
74 | \seealso{
75 | Functions depend on this function:
76 | \code{\link{PlotCatVar}}.
77 |
78 | This function depends on:
79 | \code{\link{PrepData}}.
80 | }
81 |
--------------------------------------------------------------------------------
/man/otvPlots.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/package_otvPlots.R
3 | \docType{package}
4 | \name{otvPlots}
5 | \alias{otvPlots}
6 | \alias{otvPlots-package}
7 | \title{Over time variable plots for predictive modeling (otvPlots)}
8 | \description{
9 | The \code{otvPlots} package uses \code{data.table} and \code{ggplot2}
10 | packages to efficiently plot time series aggregated from large datasets.
11 | Plots of numerical variables are optionally returned ordered by correlation
12 | with date -- a natural starting point for anomaly detection. Plots are
13 | automatically labeled if a variable dictionary is provided.
14 | }
15 | \details{
16 | Output files include:
17 | \itemize{
18 | \item A PDF file of plots saved as \code{outFl}.pdf, with each individual page
19 | on one variable. Variables are plotted in the order indicated in the argument
20 | \code{sortVars} or \code{sortFn}.
21 | For each numerical variable, the output plots include
22 | \itemize{
23 | \item side-by-side boxplots grouped by \code{dateGpBp} (left),
24 | \item a trace plot of p1, p50, and p99 percentiles, grouped by \code{dateGp}
25 | (top right),
26 | \item a trace plot of mean and +-1 SD control limits, grouped by
27 | \code{dateGp} (middle right), and
28 | \item a trace plot of missing and zero rates, grouped by \code{dateGp}
29 | (bottom right).
30 | }
31 | For each categorical variable (including a numerical variable with no more
32 | than 2 unique levels not including NA), the output plots include
33 | \itemize{
34 | \item a frequency bar plot (left), and
35 | \item a grid of trace plots on categories' proportions over time (right).
36 | If the variable contains more than \code{kCategories} number of
37 | categories, trace plots of only the largest \code{kCategories} will be
38 | plotted. If the variable contains only two categories, then only the
39 | trace plot of the less prevalent category will be plotted.
40 | }
41 | \item CSV file(s) on summary statistics of variables, both globally and over
42 | time aggregated by \code{dateGp}. The order of variables in the CSV files
43 | is the same as in the PDF file.
44 | \itemize{
45 | \item For numerical variables, number of observations (counts), p1, p25,
46 | p50, p75, and p99 quantiles, mean, SD, missing and zero rates are saved
47 | as \code{outFl}_numerical_summary.csv.
48 | \item For categorical variables, number of observations (counts) and
49 | categories' proportions are saved as \code{outFl}_categorical_summary.csv.
50 | Each row is a category of a categorical (or binary) variable.
51 | The row whose \code{category == 'NA'} corresponds to missing. Categories
52 | among the same variable are ordered by global prevalence in a descending
53 | order.
54 | }
55 | }
56 | }
57 | \section{License}{
58 |
59 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
60 | Version 2.0 (the "License"); you may not use this file except in compliance
61 | with the License. You may obtain a copy of the License at
62 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
63 | or agreed to in writing, software distributed under the License is
64 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
65 | KIND, either express or implied. See the License for the specific language
66 | governing permissions and limitations under the License.
67 | }
68 |
69 | \seealso{
70 | Main function: \code{\link{vlm}}.
71 |
72 | Selected supporting functions:
73 | \code{\link{PrepData}},
74 | \code{\link{PrepLabels}},
75 | \code{\link{OrderByR2}}.
76 | }
77 |
--------------------------------------------------------------------------------
/tests/testthat/test_SummaryStats.R:
--------------------------------------------------------------------------------
1 | library(otvPlots)
2 | context("Summary stats for numerical variables")
3 | load("../testthat/testData.rda")
4 | setDT(testData)
5 | suppressMessages(PrepData(testData, dateNm = "date", dateGp = "weeks", dateGpBp = "weeks", weightNm = "weight"))
6 |
7 | test_that("Numerical statistics are calculated correctly without weight", {
8 |   mdx = SummaryStats(myVar = "age", dataFl = testData, dateGp = "weeks")$meltdx
9 | 
10 |   # Global statistics ("_g" quantiles and the cl1/cl2 control limits) are
11 |   # checked against values computed from all of testData, so each one is
12 |   # expected to collapse to a single number under unique().
13 |   p99_g = unique(mdx[variable=='p99_g', value])
14 |   p1_g = unique(mdx[variable=='p1_g', value])
15 |   cl1 = unique(mdx[variable=='cl1', value])
16 |   cl2 = unique(mdx[variable=='cl2', value])
17 | 
18 |   # `probs` is spelled out instead of relying on partial matching of `p`.
19 |   expect_equivalent(p99_g, quantile(testData[, age], probs=.99))
20 |   expect_equivalent(p1_g, quantile(testData[, age], probs=.01))
21 |   expect_equivalent(cl1, mean(testData[, age]) + sd(testData[,age]))
22 |   expect_equivalent(cl2, mean(testData[, age]) - sd(testData[,age]))
23 | 
24 |   # Per-period statistics for the single week labelled 2008-05-06.
25 |   mdx2 = mdx[weeks == "2008-05-06" & variable%in%c("p99", "p50", "p1", "mean", "zerorate", "missingrate")]
26 | 
27 |   expect_equivalent(mdx2[variable=="p99", value], quantile(testData[weeks==as.IDate("2008-05-06"),age], .99))
28 |   expect_equivalent(mdx2[variable=="p50", value], quantile(testData[weeks==as.IDate("2008-05-06"),age], .5))
29 |   expect_equivalent(mdx2[variable=="p1", value], quantile(testData[weeks==as.IDate("2008-05-06"),age], .01))
30 |   expect_equivalent(mdx2[variable=="mean", value], mean(testData[weeks==as.IDate("2008-05-06"),age]))
31 |   expect_equivalent(mdx2[variable=="zerorate", value], mean(testData[weeks==as.IDate("2008-05-06"),age]==0))
32 |   expect_equivalent(mdx2[variable=="missingrate", value], mean(is.na(testData[weeks==as.IDate("2008-05-06"),age])))
33 | })
34 |
35 |
36 | test_that("Numerical statistics are calculated correctly with weight", {
37 |   # `meltdx` is a long table: `variable` names the statistic, `value` holds it.
38 |   mdx = SummaryStats(myVar = "age", dataFl = testData, dateGp = "weeks", weightNm = "weight")$meltdx
39 |
40 |   # Global statistics; unique() collapses a value repeated across rows.
41 |   p99_g = unique(mdx[variable=='p99_g', value])
42 |   p1_g = unique(mdx[variable=='p1_g', value])
43 |   cl1 = unique(mdx[variable=='cl1', value])
44 |   cl2 = unique(mdx[variable=='cl2', value])
45 |
46 |   # Global weighted quantiles and weighted mean +/- 1 SD control limits,
47 |   # checked against the Hmisc weighted estimators.
48 |   expect_equivalent(p99_g, Hmisc::wtd.quantile(testData[, age], testData[, weight], probs=.99, normwt=TRUE))
49 |   expect_equivalent(p1_g, Hmisc::wtd.quantile(testData[, age], testData[, weight], probs=.01, normwt=TRUE))
50 |   expect_equivalent(cl1, Hmisc::wtd.mean(testData[, age], testData[,weight], na.rm=TRUE, normwt=TRUE) +
51 |                       sqrt(Hmisc::wtd.var(testData[,age], testData[,weight], na.rm=TRUE,normwt=TRUE)))
52 |   expect_equivalent(cl2, Hmisc::wtd.mean(testData[, age], testData[,weight], na.rm=TRUE, normwt=TRUE) -
53 |                       sqrt(Hmisc::wtd.var(testData[,age], testData[,weight], na.rm=TRUE,normwt=TRUE)))
54 |
55 |   # Restrict both the summary table and the raw data to the week 2008-05-06.
56 |   mdx2 = mdx[weeks == "2008-05-06" & variable%in%c("p99", "p50", "p1", "mean", "zerorate", "missingrate")]
57 |   testData2 = testData[weeks==as.IDate("2008-05-06")]
58 |
59 |   # Per-period weighted statistics versus Hmisc on that week's rows.
60 |   expect_equivalent(mdx2[variable=="p99", value], Hmisc::wtd.quantile(testData2[, age],testData2[, weight], probs=.99, normwt=TRUE))
61 |   expect_equivalent(mdx2[variable=="p50", value], Hmisc::wtd.quantile(testData2[, age],testData2[, weight], probs=.5, normwt=TRUE))
62 |   expect_equivalent(mdx2[variable=="p1", value], Hmisc::wtd.quantile(testData2[, age],testData2[, weight], probs=.01, normwt=TRUE))
63 |   expect_equivalent(mdx2[variable=="mean", value], Hmisc::wtd.mean(testData2[,age], testData2[,weight]))
64 |   expect_equivalent(mdx2[variable=="zerorate", value], Hmisc::wtd.mean((testData2[,age]==0), testData2[,weight]))
65 |   expect_equivalent(mdx2[variable=="missingrate", value], Hmisc::wtd.mean(is.na(testData2[,age]), testData2[,weight]))
66 | })
67 |
68 |
--------------------------------------------------------------------------------
/man/OrderByR2.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plots_order.R
3 | \name{OrderByR2}
4 | \alias{OrderByR2}
5 | \title{Create numerical variable ranking using R2 between date and variable}
6 | \usage{
7 | OrderByR2(dataFl, dateNm, buildTm = NULL, weightNm = NULL,
8 | kSample = 50000)
9 | }
10 | \arguments{
11 | \item{dataFl}{A \code{data.table} of data; must be the output of the
12 | \code{\link{PrepData}} function.}
13 |
14 | \item{dateNm}{Name of column containing the date variable.}
15 |
16 | \item{buildTm}{Vector identifying the time period for ranking/anomaly detection
17 | (most likely model build period). Allows for a subset of plotting time
18 | period to be used for anomaly detection.
19 | \itemize{
20 | \item Must be a vector of dates and must be inclusive i.e. buildTm[1]
21 | <= date <= buildTm[2] will define the time period.
22 | \item Must be either \code{NULL}, a vector of length 2, or a vector of
23 | length 3.
24 | \item If \code{NULL}, the entire dataset will be used for
25 | ranking/anomaly detection.
26 | \item If a vector of length 2, the format of the dates must be
27 | a character vector in default R date format (e.g. "2017-01-30").
28 | \item If a vector of length 3, the first two elements must contain dates
29 | in any strptime format, while the 3rd element contains the strptime
30 | format (see \code{\link{strptime}}).
31 | \item The following are equivalent ways of selecting
32 | all of 2014:
33 | \itemize{
34 | \item \code{c("2014-01-01","2014-12-31")}
35 | \item \code{c("01JAN2014","31DEC2014", "\%d\%h\%Y")}
36 | }
37 | }}
38 |
39 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for
40 | no weights (all rows receiving weight 1).}
41 |
42 | \item{kSample}{Either \code{NULL} or a positive integer. If an integer,
43 | indicates the sample size for both drawing boxplots and ordering numerical
44 | graphs by \eqn{R^2}. When the data is large, setting \code{kSample} to a
45 | reasonable value (default is 50K) dramatically improves processing speed.
46 | Therefore, for larger datasets (e.g. > 10 percent system memory), this
47 | parameter should not be set to \code{NULL}, or boxplots may take a very
48 | long time to render. This setting has no impact on the accuracy of time
49 | series plots on quantiles, mean, SD, and missing and zero rates.}
50 | }
51 | \value{
52 | A vector of variable names sorted by R2 of \code{lm} of the formula
53 | \code{var} ~ \code{dateNm} (highest R2 to lowest)
54 | }
55 | \description{
56 | Calculates R2 of a linear model of the formula \code{var} ~ \code{dateNm} for
57 | each \code{var} of class \code{nmrcl} and returns a vector of
58 | variable names ordered by highest R2. The linear model can be calculated over
59 | a subset of dates, see details of parameter \code{buildTm}. Non-numerical
60 | variables are returned in alphabetical order after the sorted numerical
61 | variables.
62 | }
63 | \section{License}{
64 |
65 | Copyright 2017 Capital One Services, LLC Licensed under the
66 | Apache License, Version 2.0 (the "License"); you may not use this file
67 | except in compliance with the License. You may obtain a copy of the
68 | License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by
69 | applicable law or agreed to in writing, software distributed under the
70 | License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
71 | CONDITIONS OF ANY KIND, either express or implied. See the License for the
72 | specific language governing permissions and limitations under the License.
73 | }
74 |
75 | \examples{
76 | data(bankData)
77 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months",
78 | dateGpBp = "quarters")
79 | OrderByR2(bankData, dateNm = "date")
80 | }
81 | \seealso{
82 | Functions depend on this function:
83 | \code{\link{vlm}}.
84 |
85 | This function depends on:
86 | \code{\link{CalcR2}},
87 | \code{\link{PrepData}}.
88 | }
89 |
--------------------------------------------------------------------------------
/man/PlotNumVar.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/numerical.R
3 | \name{PlotNumVar}
4 | \alias{PlotNumVar}
5 | \title{Create plots and summary statistics for a numerical variable}
6 | \usage{
7 | PlotNumVar(myVar, dataFl, weightNm, dateGp, dateGpBp, skewOpt = NULL,
8 | kSample = 50000)
9 | }
10 | \arguments{
11 | \item{myVar}{The name of the variable to be plotted}
12 |
13 | \item{dataFl}{A \code{data.table} of data; must be the output of the
14 | \code{\link{PrepData}} function.}
15 |
16 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for
17 | no weights (all rows receiving weight 1).}
18 |
19 | \item{dateGp}{Name of the variable that the time series plots should be
20 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"},
21 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
22 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
23 |
24 | \item{dateGpBp}{Name of variable the boxplots should be grouped by. Same
25 | options as \code{dateGp}. If \code{NULL}, then \code{dateGp} will be used.}
26 |
27 | \item{skewOpt}{Either a numeric constant or \code{NULL}. Default is
28 | \code{NULL} (no transformation). If numeric, say 5, then all box plots of
29 | a variable whose skewness exceeds 5 will be on a log10 scale if possible.
30 | Negative input of \code{skewOpt} will be converted to 3.}
31 |
32 | \item{kSample}{Either \code{NULL} or a positive integer. If an integer,
33 | indicates the sample size for both drawing boxplots and ordering numerical
34 | graphs by \eqn{R^2}. When the data is large, setting \code{kSample} to a
35 | reasonable value (default is 50K) dramatically improves processing speed.
36 | Therefore, for larger datasets (e.g. > 10 percent system memory), this
37 | parameter should not be set to \code{NULL}, or boxplots may take a very
38 | long time to render. This setting has no impact on the accuracy of time
39 | series plots on quantiles, mean, SD, and missing and zero rates.}
40 | }
41 | \value{
42 | \item{p}{A \code{grob} (i.e., \code{ggplot} grid) object, including a
43 | side-by-side boxplot grouped by \code{dateGpBp}, a time series plot of p1,
44 | p50 (median), and p99 grouped by \code{dateGp}, a time series plot of
45 | mean and +-1 SD control limits grouped by \code{dateGp}, and a time
46 | series plot of missing and zerorates grouped by \code{dateGp}.}
47 | \item{numVarSummary}{A \code{data.table}, contains global and over time
48 | summary statistics, including p1, p25, p50, p75, and p99 quantiles, mean
49 | and SD, missing and zero rates.}
50 | }
51 | \description{
52 | Output plots include a boxplot on the left, grouped by a coarser time scale
53 | (\code{dateGpBp}), and three trace plots on the right, on p1, p50,
54 | and p99 quantiles, mean and +-1 SD control limits, missing and zerorates,
55 | all grouped by a finer time scale as in \code{dateGp}. In addition to plots,
56 | a \code{data.table} of summary statistics is generated, covering both global
57 | and over-time summary statistics.
58 | }
59 | \section{License}{
60 |
61 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
62 | Version 2.0 (the "License"); you may not use this file except in compliance
63 | with the License. You may obtain a copy of the License at
64 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
65 | or agreed to in writing, software distributed under the License is
66 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
67 | KIND, either express or implied. See the License for the specific language
68 | governing permissions and limitations under the License.
69 | }
70 |
71 | \examples{
72 | data(bankData)
73 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months",
74 | dateGpBp = "years")
75 | plot(PlotNumVar("balance", bankData, NULL, "months", "years",
76 | skewOpt = NULL, kSample = NULL)$p)
77 | }
78 | \seealso{
79 | Functions depend on this function:
80 | \code{\link{PlotVar}}.
81 |
82 | This function depends on:
83 | \code{\link{SummaryStats}},
84 | \code{\link{PlotDist}},
85 | \code{\link{PlotQuantiles}},
86 | \code{\link{PlotMean}},
87 | \code{\link{PlotRates}},
88 | \code{\link{PrepData}}.
89 | }
90 |
--------------------------------------------------------------------------------
/man/PlotCatVar.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/categorical.R
3 | \name{PlotCatVar}
4 | \alias{PlotCatVar}
5 | \title{Create plots and summary statistics for a categorical variable}
6 | \usage{
7 | PlotCatVar(myVar, dataFl, weightNm = NULL, dateNm, dateGp, kCategories = 9,
8 | normBy = "time")
9 | }
10 | \arguments{
11 | \item{myVar}{The name of the variable to be plotted}
12 |
13 | \item{dataFl}{A \code{data.table} of data; must be the output of the
14 | \code{\link{PrepData}} function.}
15 |
16 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for
17 | no weights (all rows receiving weight 1).}
18 |
19 | \item{dateNm}{Name of column containing the date variable.}
20 |
21 | \item{dateGp}{Name of the variable that the time series plots should be
22 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"},
23 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
24 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
25 |
26 | \item{kCategories}{If a categorical variable has more than \code{kCategories},
27 | trace plots of only the \code{kCategories} most prevalent categories are
28 | plotted.}
29 |
30 | \item{normBy}{The normalization factor for rate plots, can be \code{"time"}
31 | or \code{"var"}. If \code{"time"}, then for each time period of
32 | \code{dateGp}, counts are normalized by the total counts over all
33 | categories in that time period. This illustrates changes of categories'
34 | proportions over time. If \code{"var"}, then for each category, its counts
35 | are normalized by the total counts over time from only this category. This
36 | illustrates changes of categories' volumes over time.}
37 | }
38 | \value{
39 | \item{p}{A \code{grob} (i.e., \code{ggplot} grid) object, including a
40 | bar plot, and trace plots of categories' proportions. If the number of
41 | categories is larger than \code{kCategories}, then trace plots of only the
42 | \code{kCategories} most prevalent categories are plotted. For a binary
43 | variable, only the trace plot of the less prevalent category is plotted.}
44 | \item{catVarSummary}{A \code{data.table}, contains categories' proportions
45 | globally, and over-time in each time period in \code{dateGp}. Each row is
46 | a category of the categorical (or binary) variable \code{myVar}. The row
47 | whose \code{category == 'NA'} corresponds to missing. Categories are
48 | ordered by global prevalence in a descending order.}
49 | }
50 | \description{
51 | Output plots include a bar plot with categories ordered by global counts,
52 | and trace plots of categories' proportions over time. This function is also
53 | applicable to a binary variable, which is treated as categorical in this
54 | package. In addition to plots, a \code{data.table} of summary statistics
55 | is generated, on global counts and proportions by category, and proportions
56 | by category over time.
57 | }
58 | \section{License}{
59 |
60 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
61 | Version 2.0 (the "License"); you may not use this file except in compliance
62 | with the License. You may obtain a copy of the License at
63 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
64 | or agreed to in writing, software distributed under the License is
65 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
66 | KIND, either express or implied. See the License for the specific language
67 | governing permissions and limitations under the License.
68 | }
69 |
70 | \examples{
71 | data(bankData)
72 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months",
73 | dateGpBp = "quarters", weightNm = NULL)
74 | # Single histogram is plotted for job type since there are 12 categories
75 | plot(PlotCatVar(myVar = "job", dataFl = bankData, weightNm = NULL,
76 | dateNm = "date", dateGp = "months")$p)
77 |
78 | plot(PlotCatVar(myVar = "job", dataFl = bankData, weightNm = NULL,
79 | dateNm = "date", dateGp = "months", kCategories = 12)$p)
80 |
81 |
82 | ## Binary data is treated as categorical, and only the less frequent
83 | ## category is plotted over time.
84 | plot(PlotCatVar(myVar = "default", dataFl = bankData, weightNm = NULL,
85 | dateNm = "date", dateGp = "months")$p)
86 | }
87 | \seealso{
88 | Functions depend on this function:
89 | \code{\link{PlotVar}},
90 | \code{\link{PrintPlots}},
91 | \code{\link{vlm}}.
92 |
93 | This function depends on:
94 | \code{\link{PlotBarplot}},
95 | \code{\link{PlotRatesOverTime}},
96 | \code{\link{PrepData}}.
97 | }
98 |
--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC
2 | # SPDX-License-Identifier: Apache-2.0
3 | # Copyright 2017 Capital One Services, LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | #
8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software distributed
11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
12 | # OF ANY KIND, either express or implied.
13 | #
14 | # See the License for the specific language governing permissions and limitations under the License.
15 |
16 |
17 | #' Direct marketing campaigns of a Portuguese banking institution
18 | #'
19 | #' The marketing campaigns were based on phone calls.
20 | #' Often, more than one contact to the same client was required, in order to
21 | #' assess if the product (bank term deposit) would be ('yes') or not ('no')
22 | #' subscribed. Records are ordered by date (from May 2008 to November 2010),
23 | #' similar to data analyzed in Moro et al. [2014].
24 | #'
25 | #'
26 | #' @format A data frame with 45,211 rows and 19 variables:
27 | #' \describe{
28 | #' \item{age}{Age of the client, numeric.}
29 | #' \item{job}{Type of job, a categorical variable with the levels:
30 | #' \code{'admin.'}, \code{'blue-collar'}, \code{'entrepreneur'},
31 | #' \code{'housemaid'}, \code{'management'}, \code{'retired'},
32 | #' \code{'self-employed'}, \code{'services'}, \code{'student'},
33 | #' \code{'technician'}, \code{'unemployed'}, and \code{'unknown'}.}
34 | #' \item{marital}{Marital status, a categorical variable with levels:
35 | #' \code{'divorced'}, \code{'married'}, \code{'single'}, and \code{'unknown'}.
36 | #' Note that \code{'divorced'} means either divorced or widowed.}
37 | #' \item{education}{A categorical variable with levels: \code{'basic.4y'},
38 | #' \code{'basic.6y'}, \code{'basic.9y'}, \code{'high.school'},
39 | #' \code{'illiterate'}, \code{'professional.course'},
40 | #' \code{'university.degree'}, and \code{'unknown'}.}
41 | #' \item{default}{Whether credit is in default, a categorical variable with
42 | #' levels: \code{'no'}, \code{'yes'}, and \code{'unknown'}.}
43 | #' \item{balance}{Account balance, numeric.}
44 | #' \item{housing}{Whether the client has a housing loan, a categorical variable
45 | #' with levels: \code{'no'}, \code{'yes'}, and \code{'unknown'}.}
46 | #' \item{loan}{Whether the client has personal loan, a categorical variable
47 | #' with levels: \code{'no'}, \code{'yes'}, and \code{'unknown'}.}
48 | #' \item{contact}{Type of contact communication, a categorical variable
49 | #' with levels: \code{'cellular'} and \code{'telephone'}.}
50 | #' \item{duration}{Last contact duration in seconds, a numeric variable.}
51 | #' \item{campaign}{Number of contacts performed during this campaign for
52 | #' this client, including the last contact; a numeric variable.}
53 | #' \item{pdays}{Number of days that passed by after the client was last
54 | #' contacted from a previous campaign; a numeric variable, where \code{999}
55 | #' means that the client was not previously contacted.}
56 | #' \item{previous}{Number of contacts performed before this campaign for this
57 | #' client, a numeric variable.}
58 | #' \item{poutcome}{Outcome of the previous marketing campaign, a categorical
59 | #' variable with levels: \code{'failure'}, \code{'nonexistent'},
60 | #' and \code{'success'}.}
61 | #' \item{y}{Whether the client has subscribed a term deposit, a categorical
62 | #' variable with levels: \code{'yes'} and \code{'no'}.}
63 | #' \item{date}{Last contact date.}
64 | #' }
65 | #' @source \url{https://archive.ics.uci.edu/ml/datasets/Bank+Marketing}
66 | #' @source \cite{Lichman, M. (2013). UCI Machine Learning Repository
67 | #' [\url{http://archive.ics.uci.edu/ml}]. Irvine, CA: University of California,
68 | #' School of Information and Computer Science.}
69 | #' @source \cite{S. Moro, P. Cortez, and P. Rita. (2014) A Data-Driven Approach
70 | #' to Predict the Success of Bank Telemarketing. Decision Support Systems,
71 | #' Elsevier, 62:22-31, June 2014.}
72 | "bankData"
73 |
74 | #' Labels for bankData
75 | #'
76 | #' A dataset containing the attribute labels also found in \code{\link{bankData}}.
77 | #' This data set is used to illustrate the \code{\link{PrepLabels}} function and
78 | #' other label functionality in the \code{\link{otvPlots}} package in R.
79 | #'
80 | #' @format A data frame with 16 rows and 3 variables:
81 | #' \describe{
82 | #' \item{V1}{Name of each variable in \code{\link{bankData}}.}
83 | #' \item{V2}{Label of each variable in \code{\link{bankData}}.}
84 | #' \item{V3}{A numeric variable, corresponding to the row number.}
85 | #' }
86 | "bankLabels"
87 |
--------------------------------------------------------------------------------
/R/package_otvPlots.R:
--------------------------------------------------------------------------------
1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC
2 | # SPDX-License-Identifier: Apache-2.0
3 | # Copyright 2017 Capital One Services, LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | #
8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software distributed
11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
12 | # OF ANY KIND, either express or implied.
13 | #
14 | # See the License for the specific language governing permissions and limitations under the License.
15 |
16 |
17 | #' Over time variable plots for predictive modeling (otvPlots)
18 | #'
19 | #' The \code{otvPlots} package uses \code{data.table} and \code{ggplot2}
20 | #' packages to efficiently plot time series aggregated from large datasets.
21 | #' Plots of numerical variables are optionally returned ordered by correlation
22 | #' with date -- a natural starting point for anomaly detection. Plots are
23 | #' automatically labeled if a variable dictionary is provided.
24 | #'
25 | #' Output files include:
26 | #' \itemize{
27 | #' \item A PDF file of plots saved as \code{outFl}.pdf, with each individual page
28 | #' on one variable. Variables are plotted in the order indicated in the argument
29 | #' \code{sortVars} or \code{sortFn}.
30 | #' For each numerical variable, the output plots include
31 | #' \itemize{
32 | #' \item side-by-side boxplots grouped by \code{dateGpBp} (left),
33 | #' \item a trace plot of p1, p50, and p99 percentiles, grouped by \code{dateGp}
34 | #' (top right),
35 | #' \item a trace plot of mean and +-1 SD control limits, grouped by
36 | #' \code{dateGp} (middle right), and
37 | #' \item a trace plot of missing and zero rates, grouped by \code{dateGp}
38 | #' (bottom right).
39 | #' }
40 | #' For each categorical variable (including a numerical variable with no more
41 | #' than 2 unique levels not including NA), the output plots include
42 | #' \itemize{
43 | #' \item a frequency bar plot (left), and
44 | #' \item a grid of trace plots on categories' proportions over time (right).
45 | #' If the variable contains more than \code{kCategories} number of
46 | #' categories, trace plots of only the largest \code{kCategories} will be
47 | #' plotted. If the variable contains only two categories, then only the
48 | #' trace plot of the less prevalent category will be plotted.
49 | #' }
50 | #' \item CSV file(s) on summary statistics of variables, both globally and over
51 | #' time aggregated by \code{dateGp}. The order of variables in the CSV files
52 | #' is the same as in the PDF file.
53 | #' \itemize{
54 | #' \item For numerical variables, number of observations (counts), p1, p25,
55 | #' p50, p75, and p99 quantiles, mean, SD, missing and zerorates are saved
56 | #' as \code{outFl}_numerical_summary.csv.
57 | #' \item For categorical variables, number of observations (counts) and
58 | #' categories' proportions are saved as \code{outFl}_categorical_summary.csv.
59 | #' Each row is a category of a categorical (or binary) variable.
60 | #' The row whose \code{category == 'NA'} corresponds to missing. Categories
61 | #' among the same variable are ordered by global prevalence in a descending
62 | #' order.
63 | #' }
64 | #' }
65 | #'
66 | #' @seealso Main function: \code{\link{vlm}}.
67 | #' @seealso Selected supporting functions:
68 | #' \code{\link{PrepData}},
69 | #' \code{\link{PrepLabels}},
70 | #' \code{\link{OrderByR2}}.
71 | #'
72 | #' @section License:
73 | #' Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
74 | #' Version 2.0 (the "License"); you may not use this file except in compliance
75 | #' with the License. You may obtain a copy of the License at
76 | #' http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
77 | #' or agreed to in writing, software distributed under the License is
78 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
79 | #' KIND, either express or implied. See the License for the specific language
80 | #' governing permissions and limitations under the License.
81 | #' @docType package
82 | #' @name otvPlots
83 | #' @import data.table
84 | #' @import ggplot2
85 | #' @importFrom grid grid.draw grid.newpage unit unit.c textGrob gpar
86 | #' @importFrom gridExtra arrangeGrob
87 | #' @importFrom moments skewness
88 | #' @importFrom Hmisc wtd.quantile wtd.mean wtd.var
89 | #' @importFrom stringi stri_trans_general
90 | #' @importFrom scales hue_pal
91 | #' @importFrom grDevices cairo_pdf dev.off
92 | #' @importFrom graphics par
93 | #' @importFrom stats lm.fit lm.wfit quantile sd var
94 | #' @importFrom utils tail
95 | NULL
96 |
97 |
98 |
99 |
--------------------------------------------------------------------------------
/man/PrintPlots.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plot_print.R
3 | \name{PrintPlots}
4 | \alias{PrintPlots}
5 | \title{Create a pdf file with plots and compute summary statistics for all variables}
6 | \usage{
7 | PrintPlots(outFl, dataFl, sortVars, dateNm, dateGp, dateGpBp, weightNm = NULL,
8 | labelFl = NULL, genCSV = TRUE, highlightNms = NULL, skewOpt = NULL,
9 | kSample = 50000, fuzzyLabelFn = NULL, kCategories = 9)
10 | }
11 | \arguments{
12 | \item{outFl}{Name of the output file, with no extension names (e.g., "bank").
13 | A pdf file of plots ("bank.pdf"), and two csv files of summary statistics
14 | ("bank_categorical_summary.csv" and "bank_numerical_summary.csv") will be
15 | saved to your working directory, unless a path is included in \code{outFl}
16 | (e.g. "../plots/bank").}
17 |
18 | \item{dataFl}{A \code{data.table} containing at least the following columns:
19 | \code{myVar}, \code{weightNm}, \code{dateGp}, \code{dateGpBp}; usually an
20 | output of the \code{\link{PrepData}} function.}
21 |
22 | \item{sortVars}{A character vector of variable names in the order they will
23 | be plotted.}
24 |
25 | \item{dateNm}{Name of column containing the date variable.}
26 |
27 | \item{dateGp}{Name of the variable that the time series plots should be
28 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"},
29 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
30 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
31 |
32 | \item{dateGpBp}{Name of variable the boxplots should be grouped by. Same
33 | options as \code{dateGp}. If \code{NULL}, then \code{dateGp} will be used.}
34 |
35 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for
36 | no weights (all rows receiving weight 1).}
37 |
38 | \item{labelFl}{A \code{data.table} containing variable labels, or \code{NULL}
39 | for no labels; usually an output of \code{\link{PrepLabels}}.}
40 |
41 | \item{genCSV}{Logical, whether to generate the two csv files of summary
42 | statistics for numerical and categorical variables.}
43 |
44 | \item{highlightNms}{Either \code{NULL} or a character vector of variables to
45 | receive a red label. Currently \code{NULL} means all variables will get a
46 | black legend. This argument is ignored if \code{labelFl == NULL}.}
47 |
48 | \item{skewOpt}{Either a numeric constant or \code{NULL}. Default is
49 | \code{NULL} (no transformation). If numeric, say 5, then all box plots of
50 | a variable whose skewness exceeds 5 will be on a log10 scale if possible.
51 | Negative input of \code{skewOpt} will be converted to 3.}
52 |
53 | \item{kSample}{Either \code{NULL} or a positive integer. If an integer,
54 | indicates the sample size for both drawing boxplots and ordering numerical
55 | graphs by \eqn{R^2}. When the data is large, setting \code{kSample} to a
56 | reasonable value (default is 50K) dramatically improves processing speed.
57 | Therefore, for larger datasets (e.g. > 10 percent system memory), this
58 | parameter should not be set to \code{NULL}, or boxplots may take a very
59 | long time to render. This setting has no impact on the accuracy of time
60 | series plots on quantiles, mean, SD, and missing and zero rates.}
61 |
62 | \item{fuzzyLabelFn}{Either \code{NULL} or a function of 2 parameters: A label
63 | file in the format of an output by \code{\link{PrepLabels}} and a string
64 | giving a variable name. The function should return the label corresponding
65 | to the variable given by the second parameter. This function should
66 | describe how fuzzy matching should be performed to find labels (see example
67 | below). If \code{NULL}, only exact matches will be returned.}
68 |
69 | \item{kCategories}{If a categorical variable has more than \code{kCategories},
70 | trace plots of only the \code{kCategories} most prevalent categories are
71 | plotted.}
72 | }
73 | \value{
74 | A pdf of plots saved to file \code{outFl}.pdf, and if the argument
75 | \code{genCSV == TRUE}, also two csv files of summary statistics for
76 | numerical and categorical variables.
77 | }
78 | \description{
79 | Creates plots and outputs results to a letter-sized pdf file, with each
80 | individual page containing plots on a single variable in the data. In
81 | addition, two summary statistics \code{data.table} are returned, one for
82 | numerical variables, and one for categorical (and binary) ones.
83 | }
84 | \section{License}{
85 |
86 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
87 | Version 2.0 (the "License"); you may not use this file except in compliance
88 | with the License. You may obtain a copy of the License at
89 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
90 | or agreed to in writing, software distributed under the License is
91 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
92 | KIND, either express or implied. See the License for the specific language
93 | governing permissions and limitations under the License.
94 | }
95 |
96 | \seealso{
97 | Functions depend on this function:
98 | \code{\link{vlm}}.
99 |
100 | This function depends on:
101 | \code{\link{PlotVar}},
102 | \code{\link{PrepData}}.
103 | }
104 |
--------------------------------------------------------------------------------
/man/PrepData.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/prep.R
3 | \name{PrepData}
4 | \alias{PrepData}
5 | \title{Prepare an input dataset for plotting}
6 | \usage{
7 | PrepData(dataFl, dateNm, selectCols = NULL, dropCols = NULL,
8 | dateFt = "\%d\%h\%Y", dateGp = NULL, dateGpBp = NULL, weightNm = NULL,
9 | varNms = NULL, dropConstants = FALSE, ...)
10 | }
11 | \arguments{
12 | \item{dataFl}{Either the name of an object that can be converted using
13 | \code{\link[data.table]{as.data.table}} (e.g., a data frame), or a
14 | character string containing the name of a dataset that can be loaded using
15 | \code{\link[data.table]{fread}} (e.g., a csv file). If the dataset is not in
16 | your working directory then \code{dataFl} must include the (relative or
17 | absolute) path to the file.}
18 |
19 | \item{dateNm}{Name of column containing the date variable.}
20 |
21 | \item{selectCols}{Either \code{NULL}, or a vector of names or indices of
22 | variables to read into memory -- must include \code{dateNm},
23 | \code{weightNm} (if not \code{NULL}) and all variables to be plotted. If
24 | both \code{selectCols} and \code{dropCols} are \code{NULL}, then all
25 | variables will be read in.}
26 |
27 | \item{dropCols}{Either \code{NULL}, or a vector of variable names or indices
28 | of variables not to read into memory. If both \code{selectCols} and
29 | \code{dropCols} are \code{NULL}, then all variables will be read in.}
30 |
31 | \item{dateFt}{\code{\link{strptime}} format of the date variable. The default
32 | is the SAS format \code{"\%d\%h\%Y"}. Input data in the R date format
33 | \code{"\%Y-\%m-\%d"} will also be detected; both formats are parsed
34 | automatically.}
35 |
36 | \item{dateGp}{Name of the variable that the time series plots should be
37 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"},
38 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
39 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
40 |
41 | \item{dateGpBp}{Name of variable the boxplots should be grouped by. Same
42 | options as \code{dateGp}. If \code{NULL}, then \code{dateGp} will be used.}
43 |
44 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for
45 | no weights (all rows receiving weight 1).}
46 |
47 | \item{varNms}{Either \code{NULL} or a vector of names or indices of variables
48 | to be plotted. If \code{NULL}, will default to all columns which are not
49 | \code{dateNm} or \code{weightNm}. Can also be a vector of indices of the
50 | column names, after \code{dropCols} or \code{selectCols} have been applied,
51 | if applicable, and not including \code{dateGp}, \code{dateGpBp}
52 | (which will be added to the \code{dataFl} by the function
53 | \code{\link{PrepData}}).}
54 |
55 | \item{dropConstants}{Logical, indicates whether or not constant (all
56 | duplicated or NA) variables should be dropped from \code{dataFl} prior to
57 | plotting.}
58 |
59 | \item{...}{Additional parameters to be passed to
60 | \code{\link[data.table]{fread}}.}
61 | }
62 | \value{
63 | A \code{data.table} object, formatted for use by all plotting
64 | functions in this package \code{\link{otvPlots}}, including the main function
65 | \code{\link{vlm}}, and the individual variable plotting function
66 | \code{\link{PlotVar}}.
67 | }
68 | \description{
69 | This function prepares an input dataset for use by all plotting functions
70 | in this package, including the main function \code{\link{vlm}}.
71 | The input data \code{dataFl} must contain, at a minimum, a date column
72 | \code{dateNm} and a variable to be plotted. \code{dataFl} will be
73 | converted to a \code{data.table} class, and all changes are made to it by
74 | reference.
75 | }
76 | \details{
77 | If weights (\code{weightNm}) are provided, then they are normalized so that
78 | they sum to the total sample size, and the weights are used in all
79 | summary statistics calculations and plotting.
80 | }
81 | \section{License}{
82 |
83 | Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
84 | Version 2.0 (the "License"); you may not use this file except in compliance
85 | with the License. You may obtain a copy of the License at
86 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
87 | or agreed to in writing, software distributed under the License is
88 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
89 | KIND, either express or implied. See the License for the specific language
90 | governing permissions and limitations under the License.
91 | }
92 |
93 | \examples{
94 | ## Use the bankData dataset in this package
95 | data(bankData)
96 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months",
97 | dateGpBp = "quarters")
98 | ## Columns have been assigned a plotting class (nmrcl/ctgrl)
99 | str(bankData)
100 | }
101 | \seealso{
102 | Functions that depend on this function:
103 | \code{\link{PlotBarplot}},
104 | \code{\link{PlotRatesOverTime}},
105 | \code{\link{PlotCatVar}},
106 | \code{\link{SummaryStats}},
107 | \code{\link{PlotMean}},
108 | \code{\link{PlotQuantiles}},
109 | \code{\link{PlotRates}},
110 | \code{\link{PlotDist}},
111 | \code{\link{PlotNumVar}},
112 | \code{\link{PlotVar}},
113 | \code{\link{PrintPlots}},
114 | \code{\link{CalcR2}},
115 | \code{\link{OrderByR2}},
116 | \code{\link{vlm}}.
117 | }
118 |
--------------------------------------------------------------------------------
/tests/testthat/rawData_bigint.csv:
--------------------------------------------------------------------------------
1 | age,job,marital,balance,default,weight,date,bigint
2 | 32,blue-collar,single,23,0,0.005102041,6/5/08,2.3E+12
3 | 46,management,single,-246,0,0.010204082,6/5/08,-2.46E+13
4 | 32,admin.,married,0,0,0.010204082,6/5/08,0
5 | 60,retired,married,100,0,0.010204082,6/5/08,1E+13
6 | 60,admin.,married,39,0,0.010204082,7/5/08,3.9E+12
7 | 58,retired,married,96,0,0.005102041,7/5/08,9.6E+12
8 | 35,blue-collar,single,12223,0,0.005102041,7/5/08,1.2223E+15
9 | 55,services,divorced,1,1,0.010204082,7/5/08,1E+11
10 | 45,admin.,single,13,0,0.020408163,8/5/08,1.3E+12
11 | 47,blue-collar,married,306,0,0.005102041,8/5/08,3.06E+13
12 | 45,admin.,single,206,0,0.010204082,8/5/08,2.06E+13
13 | 60,retired,married,81,0,0.005102041,8/5/08,8.1E+12
14 | 28,management,single,447,0,0.015306122,9/5/08,4.47E+13
15 | 47,blue-collar,married,1506,0,0.015306122,10/5/08,1.506E+14
16 | 35,management,married,231,0,0.010204082,10/5/08,2.31E+13
17 | 40,retired,married,0,0,0.015306122,10/5/08,0
18 | 56,management,married,779,0,0.005102041,11/5/08,7.79E+13
19 | 25,services,married,50,0,0.010204082,11/5/08,5E+12
20 | 29,management,single,0,0,0.005102041,11/5/08,0
21 | 36,admin.,divorced,506,0,0.015306122,12/5/08,5.06E+13
22 | 55,technician,divorced,0,0,0.005102041,12/5/08,0
23 | 57,blue-collar,married,52,0,0.015306122,13-05-2008,5.2E+12
24 | 42,admin.,single,-76,0,0.010204082,13-05-2008,-7.6E+12
25 | 24,technician,single,-103,0,0.005102041,13-05-2008,-1.03E+13
26 | 53,technician,divorced,989,0,0.010204082,13-05-2008,9.89E+13
27 | 59,admin.,married,2343,0,0.005102041,13-05-2008,2.343E+14
28 | 51,blue-collar,married,173,0,0.005102041,13-05-2008,1.73E+13
29 | 44,admin.,married,-372,0,0.015306122,14-05-2008,-3.72E+13
30 | 55,services,divorced,91,0,0.010204082,14-05-2008,9.1E+12
31 | 49,services,divorced,0,0,0.010204082,14-05-2008,0
32 | 42,management,single,50,0,0.010204082,14-05-2008,5E+12
33 | 58,retired,married,121,0,0.015306122,15-05-2008,1.21E+13
34 | 36,technician,single,265,0,0.015306122,15-05-2008,2.65E+13
35 | 49,management,married,378,0,0.015306122,15-05-2008,3.78E+13
36 | 54,management,married,282,0,0.010204082,15-05-2008,2.82E+13
37 | 44,blue-collar,married,582,0,0.005102041,15-05-2008,5.82E+13
38 | 57,entrepreneur,divorced,-37,0,0.010204082,16-05-2008,-3.7E+12
39 | 60,retired,married,60,0,0.005102041,17-05-2008,6E+12
40 | 38,management,single,424,0,0.010204082,17-05-2008,4.24E+13
41 | 40,blue-collar,single,24,0,0.015306122,17-05-2008,2.4E+12
42 | 46,management,divorced,16,0,0.005102041,18-05-2008,1.6E+12
43 | 46,management,married,229,0,0.015306122,18-05-2008,2.29E+13
44 | 60,blue-collar,married,104,0,0.010204082,20-05-2008,1.04E+13
45 | 46,services,married,179,0,0.010204082,20-05-2008,1.79E+13
46 | 53,technician,married,6,0,0.015306122,21-05-2008,6E+11
47 | 54,retired,married,529,0,0.010204082,21-05-2008,5.29E+13
48 | 58,management,married,2143,0,0.005102041,22-05-2008,2.143E+14
49 | 43,technician,single,593,0,0.005102041,22-05-2008,5.93E+13
50 | 57,technician,divorced,63,0,0.005102041,22-05-2008,6.3E+12
51 | 42,entrepreneur,divorced,2,1,0.010204082,23-05-2008,2E+11
52 | 51,retired,married,229,0,0.005102041,23-05-2008,2.29E+13
53 | 59,blue-collar,married,0,0,0.005102041,23-05-2008,0
54 | 31,services,married,25,0,0.015306122,23-05-2008,2.5E+12
55 | 55,blue-collar,married,383,0,0.010204082,23-05-2008,3.83E+13
56 | 47,services,divorced,164,0,0.010204082,24-05-2008,1.64E+13
57 | 46,self-employed,married,137,0,0.010204082,24-05-2008,1.37E+13
58 | 48,management,divorced,-244,0,0.025510204,25-05-2008,-2.44E+13
59 | 49,blue-collar,married,154,0,0.010204082,25-05-2008,1.54E+13
60 | 59,management,divorced,59,0,0.005102041,25-05-2008,5.9E+12
61 | 25,blue-collar,married,-7,0,0.010204082,26-05-2008,-7E+11
62 | 50,management,married,49,0,0.010204082,26-05-2008,4.9E+12
63 | 58,self-employed,married,-364,0,0.005102041,26-05-2008,-3.64E+13
64 | 57,retired,married,486,0,0.015306122,26-05-2008,4.86E+13
65 | 33,unknown,single,1,0,0.025510204,27-05-2008,1E+11
66 | 57,services,married,162,0,0.020408163,27-05-2008,1.62E+13
67 | 39,management,single,255,0,0.005102041,27-05-2008,2.55E+13
68 | 57,technician,married,839,0,0.010204082,27-05-2008,8.39E+13
69 | 54,blue-collar,married,1291,0,0.005102041,27-05-2008,1.291E+14
70 | 32,management,married,0,0,0.010204082,27-05-2008,0
71 | 55,blue-collar,married,23,0,0.005102041,27-05-2008,2.3E+12
72 | 33,entrepreneur,married,2,0,0.005102041,28-05-2008,2E+11
73 | 58,technician,married,71,0,0.015306122,28-05-2008,7.1E+12
74 | 51,management,married,10635,0,0.005102041,28-05-2008,1.0635E+15
75 | 36,admin.,single,-171,0,0.020408163,28-05-2008,-1.71E+13
76 | 38,entrepreneur,single,243,0,0.010204082,28-05-2008,2.43E+13
77 | 55,technician,married,1205,0,0.010204082,28-05-2008,1.205E+14
78 | 41,admin.,divorced,270,0,0.005102041,29-05-2008,2.7E+13
79 | 33,services,married,0,0,0.010204082,29-05-2008,0
80 | 28,blue-collar,married,723,0,0.005102041,29-05-2008,7.23E+13
81 | 57,blue-collar,married,5935,0,0.010204082,29-05-2008,5.935E+14
82 | 44,services,divorced,2586,0,0.005102041,30-05-2008,2.586E+14
83 | 56,admin.,married,45,0,0.010204082,30-05-2008,4.5E+12
84 | 30,technician,married,152,0,0.015306122,30-05-2008,1.52E+13
85 | 42,technician,single,690,0,0.010204082,31-05-2008,6.9E+13
86 | 41,technician,married,1270,0,0.015306122,31-05-2008,1.27E+14
87 | 36,management,married,101,0,0.005102041,31-05-2008,1.01E+13
88 | 29,admin.,single,390,0,0.005102041,1/6/08,3.9E+13
89 | 44,technician,married,0,0,0.015306122,1/6/08,0
90 | 33,services,married,790,0,0.005102041,1/6/08,7.9E+13
91 | 60,admin.,married,290,0,0.010204082,1/6/08,2.9E+13
92 | 57,blue-collar,married,249,0,0.010204082,2/6/08,2.49E+13
93 | 53,technician,married,384,0,0.005102041,2/6/08,3.84E+13
94 | 60,blue-collar,married,54,0,0.005102041,2/6/08,5.4E+12
95 | 37,admin.,single,0,0,0.010204082,3/6/08,0
96 | 43,technician,married,1937,0,0.010204082,3/6/08,1.937E+14
97 | 44,technician,single,29,0,0.005102041,4/6/08,2.9E+12
98 | 52,entrepreneur,married,113,0,0.015306122,4/6/08,1.13E+13
99 | 53,technician,married,-3,0,0.010204082,4/6/08,-3E+11
100 | 51,management,married,6530,0,0.005102041,4/6/08,6.53E+14
101 | 39,technician,married,0,0,0.015306122,4/6/08,0
--------------------------------------------------------------------------------
/tests/testthat/test_OrderByR2.R:
--------------------------------------------------------------------------------
1 | library(otvPlots)
2 | context("Order by R-squared")
3 | # Shared fixture: load the raw test data, then prepare it once for the tests below.
4 | load("../testthat/testData.rda")
5 | testData <- PrepData(testData, dateNm = "date", weightNm = "weight")
6 |
7 |
8 | # Checks an OrderByR2() result `out` against the classed columns of `testData`:
9 | # all are counted, numerical precede categorical, categorical keep data order.
10 | testOrder <- function(out, testData) {
11 |   cntnsVars <- names(Filter(is.nmrcl, testData))
12 |   dscrtVars <- names(Filter(is.ctgrl, testData))
13 |   cntnsOrder <- match(cntnsVars, out)
14 |   dscrtOrder <- match(dscrtVars, out)
15 |
16 |   # output length equals the number of classed (nmrcl/ctgrl) input columns
17 |   expect_equal(length(out), length(cntnsVars) + length(dscrtVars))
18 |   # every numerical variable is positioned before every categorical one
19 |   expect_lt(max(cntnsOrder), min(dscrtOrder))
20 |
21 |   # categorical variables keep their data order; seq_along() stays correct
22 |   expect_equal(order(dscrtOrder), seq_along(dscrtOrder))
23 | }
24 |
25 |
26 |
27 | test_that("OrderByR2 gives expected variable order", {
28 |   varOrder <- OrderByR2(dataFl = testData, dateNm = "date", buildTm = NULL,
29 |                         weightNm = "weight", kSample = NULL)
30 |
31 |   # categorical variables keep their order and follow all numerical ones
32 |   testOrder(varOrder, testData)
33 |
34 |   # the variable ranked first must have a larger R2 than the one ranked second
35 |   r2Of <- function(v) CalcR2(v, dataFl = testData, dateNm = "date",
36 |                              weightNm = "weight", imputeValue = NULL)
37 |   expect_gt(r2Of(varOrder[1]), r2Of(varOrder[2]))
38 | })
39 |
40 |
41 | test_that("OrderByR2 works for buildTm in date range", {
42 |   buildTm <- range(testData[, date][30:70])
43 |   varOrder <- OrderByR2(dataFl = testData, dateNm = "date", buildTm = buildTm,
44 |                         weightNm = "weight", kSample = NULL)
45 |
46 |   # categorical variables keep their order and follow all numerical ones
47 |   testOrder(varOrder, testData)
48 |
49 |   # R2 is recomputed on the rows inside the build window; first must beat second
50 |   inWindow <- testData[date >= buildTm[1] & date <= buildTm[2]]
51 |   r2Of <- function(v) CalcR2(v, dataFl = inWindow, dateNm = "date",
52 |                              weightNm = "weight", imputeValue = NULL)
53 |   expect_gt(r2Of(varOrder[1]), r2Of(varOrder[2]))
54 | })
55 |
56 |
57 | test_that("OrderByR2 works for buildTm outside date range", {
58 |   buildTm <- range(testData[, date][30:100] + 15)
59 |   varOrder <- OrderByR2(dataFl = testData, dateNm = "date", buildTm = buildTm,
60 |                         weightNm = "weight", kSample = NULL)
61 |
62 |   # categorical variables keep their order and follow all numerical ones
63 |   testOrder(varOrder, testData)
64 |
65 |   # R2 is recomputed on the rows inside the build window; first must beat second
66 |   inWindow <- testData[date >= buildTm[1] & date <= buildTm[2]]
67 |   r2Of <- function(v) CalcR2(v, dataFl = inWindow, dateNm = "date",
68 |                              weightNm = "weight", imputeValue = NULL)
69 |   expect_gt(r2Of(varOrder[1]), r2Of(varOrder[2]))
70 | })
71 |
72 |
73 | test_that("OrderByR2 works for kSample < N, with R2 being calculated on reduced sample", {
74 |   set.seed(5555)
75 |   varOrder <- OrderByR2(dataFl = testData, dateNm = "date", buildTm = NULL,
76 |                         weightNm = "weight", kSample = 50)
77 |   # categorical variables keep their order and follow all numerical ones
78 |   testOrder(varOrder, testData)
79 |
80 |   # re-seed before each R2 computation so both sample() calls start from the same seed
81 |   r2Sampled <- function(v) {
82 |     set.seed(5555)
83 |     CalcR2(v, dataFl = testData[sample(.N, min(.N, 50))], dateNm = "date",
84 |            weightNm = "weight", imputeValue = NULL)
85 |   }
86 |   expect_gt(r2Sampled(varOrder[1]), r2Sampled(varOrder[2]))
87 | })
88 |
89 |
90 | test_that("OrderByR2 works for kSample > N", {
91 |   varOrder <- OrderByR2(dataFl = testData, dateNm = "date", buildTm = NULL,
92 |                         weightNm = "weight", kSample = 200)
93 |
94 |   # categorical variables keep their order and follow all numerical ones
95 |   testOrder(varOrder, testData)
96 |
97 |   # the variable ranked first must have a larger R2 than the one ranked second
98 |   r2Of <- function(v) CalcR2(v, dataFl = testData, dateNm = "date",
99 |                              weightNm = "weight", imputeValue = NULL)
100 |   expect_gt(r2Of(varOrder[1]), r2Of(varOrder[2]))
101 | })
102 |
103 |
104 | test_that("OrderByR2 works when kSample is too small to calculate R2, with numeric variables returned in 
105 | order as given", {
106 |   out <- OrderByR2(dataFl = testData, dateNm = "date", buildTm = NULL, weightNm = "weight", kSample = 2)
107 |
108 |   # categorical variables keep their order and follow all numerical ones
109 |   testOrder(out, testData)
110 |
111 |   # all continuous variables appear in data order; seq_along() is safe when there are none
112 |   cntnsVars <- names(Filter(is.nmrcl, testData))
113 |   cntnsOrder <- match(cntnsVars, out)
114 |   expect_equal(order(cntnsOrder), seq_along(cntnsOrder))
115 | })
116 |
117 | test_that("OrderByR2 works when weight is null", {
118 |   varOrder <- OrderByR2(dataFl = testData, dateNm = "date", buildTm = NULL,
119 |                         weightNm = NULL, kSample = NULL)
120 |
121 |   # categorical variables keep their order and follow all numerical ones
122 |   testOrder(varOrder, testData)
123 |   # unweighted R2 of the first-ranked variable must exceed that of the second
124 |   r2Of <- function(v) CalcR2(v, dataFl = testData, dateNm = "date",
125 |                              weightNm = NULL, imputeValue = NULL)
126 |   expect_gt(r2Of(varOrder[1]), r2Of(varOrder[2]))
127 | })
128 |
129 |
130 | test_that("OrderByR2 gives warning when weight/date contains missing", {
131 |   # work on a copy: `:=` updates by reference and would alter the shared fixture
132 |   dirty <- data.table::copy(testData)
133 |   dirty[sample(.N, 10), weight := NA]
134 |   dirty[sample(.N, 10), date := NA]
135 |
136 |   # testing for warning that weight column contains missings
137 |   expect_warning(OrderByR2(dataFl = dirty, dateNm = "date", buildTm = NULL,
138 |                            weightNm = "weight", kSample = NULL), "Weights column")
139 |   # testing for warning that date column contains missings
140 |   expect_warning(OrderByR2(dataFl = dirty, dateNm = "date", buildTm = NULL,
141 |                            weightNm = "weight", kSample = NULL), "Date column")
142 | })
143 |
144 |
145 |
146 |
147 |
--------------------------------------------------------------------------------
/man/PlotVar.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plot_print.R
3 | \name{PlotVar}
4 | \alias{PlotVar}
5 | \title{Create over time variable plots and summary statistics for one variable}
6 | \usage{
7 | PlotVar(dataFl, myVar, weightNm, dateNm, dateGp, dateGpBp = NULL,
8 | labelFl = NULL, highlightNms = NULL, skewOpt = NULL, kSample = 50000,
9 | fuzzyLabelFn = NULL, kCategories = 9)
10 | }
11 | \arguments{
12 | \item{dataFl}{A \code{data.table} containing at least the following columns:
13 | \code{myVar}, \code{weightNm}, \code{dateGp}, \code{dateGpBp}; usually an
14 | output of the \code{\link{PrepData}} function.}
15 |
16 | \item{myVar}{Name of the variable to be plotted.}
17 |
18 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for
19 | no weights (all rows receiving weight 1).}
20 |
21 | \item{dateNm}{Name of column containing the date variable.}
22 |
23 | \item{dateGp}{Name of the variable that the time series plots should be
24 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"},
25 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
26 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
27 |
28 | \item{dateGpBp}{Name of variable the boxplots should be grouped by. Same
29 | options as \code{dateGp}. If \code{NULL}, then \code{dateGp} will be used.}
30 |
31 | \item{labelFl}{A \code{data.table} containing variable labels, or \code{NULL}
32 | for no labels; usually an output of \code{\link{PrepLabels}}.}
33 |
34 | \item{highlightNms}{Either \code{NULL} or a character vector of variables to
35 | receive a red label. Currently \code{NULL} means all variables will get a
36 | black legend. This argument is ignored if \code{labelFl == NULL}.}
37 |
38 | \item{skewOpt}{Either a numeric constant or \code{NULL}. Default is
39 | \code{NULL} (no transformation). If numeric, say 5, then all box plots of
40 | a variable whose skewness exceeds 5 will be on a log10 scale if possible.
41 | Negative input of \code{skewOpt} will be converted to 3.}
42 |
43 | \item{kSample}{Either \code{NULL} or a positive integer. If an integer,
44 | indicates the sample size for both drawing boxplots and ordering numerical
45 | graphs by \eqn{R^2}. When the data is large, setting \code{kSample} to a
46 | reasonable value (default is 50K) dramatically improves processing speed.
47 | Therefore, for larger datasets (e.g. > 10 percent system memory), this
48 | parameter should not be set to \code{NULL}, or boxplots may take a very
49 | long time to render. This setting has no impact on the accuracy of time
50 | series plots on quantiles, mean, SD, and missing and zero rates.}
51 |
52 | \item{fuzzyLabelFn}{Either \code{NULL} or a function of 2 parameters: A label
53 | file in the format of an output by \code{\link{PrepLabels}} and a string
54 | giving a variable name. The function should return the label corresponding
55 | to the variable given by the second parameter. This function should
56 | describe how fuzzy matching should be performed to find labels (see example
57 | below). If \code{NULL}, only exact matches will be returned.}
58 |
59 | \item{kCategories}{If a categorical variable has more than \code{kCategories},
60 | trace plots of only the \code{kCategories} most prevalent categories are
61 | plotted.}
62 | }
63 | \value{
64 | \item{p}{A \code{grob} (i.e., \code{ggplot} grid) object. See the output
65 | \code{p} of the function \code{\link{PlotNumVar}} or
66 | \code{\link{PlotCatVar}} for details.}
67 | \item{varSummary}{A \code{data.table} of summary statistics. See the output
68 | \code{numVarSummary} of the function \code{\link{PlotNumVar}}, or the
69 | output \code{catVarSummary} of the function \code{\link{PlotCatVar}} for
70 | details.}
71 | \item{varType}{Indicator of the variable's type, either \code{"nmrcl"} or
72 | \code{"ctgrl"}.}
73 | }
74 | \description{
75 | For a numerical variable, the output includes
76 | \itemize{
77 | \item side-by-side boxplots grouped by \code{dateGpBp} (left),
78 | \item a trace plot of p1, p50, and p99 percentiles, grouped by \code{dateGp}
79 | (top right),
80 | \item a trace plot of mean and +-1 SD control limits, grouped by
81 | \code{dateGp} (middle right), and
82 | \item a trace plot of missing and zero rates, grouped by \code{dateGp}
83 | (bottom right).
84 | }
85 | For a categorical variable (including a numerical variable with no more than 2
86 | unique levels not including NA), the output includes
87 | \itemize{
88 | \item a frequency bar plot (left), and
89 | \item a grid of trace plots on categories' proportions over time (right).
90 | If the variable contains more than \code{kCategories} number of categories,
91 | trace plots of only the largest \code{kCategories} will be plotted.
92 | }
93 | In addition to plots, a \code{data.table} of summary statistics is generated,
94 | covering both global and over-time summary statistics.
95 | }
96 | \section{License}{
97 | Copyright 2017 Capital One Services, LLC Licensed under the
98 | Apache License, Version 2.0 (the "License"); you may not use this file
99 | except in compliance with the License. You may obtain a copy of the License
100 | at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable
101 | law or agreed to in writing, software distributed under the License is
102 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
103 | KIND, either express or implied. See the License for the specific language
104 | governing permissions and limitations under the License.
105 | }
106 |
107 | \examples{
108 | data(bankData)
109 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months",
110 | dateGpBp = "quarters")
111 | data(bankLabels)
112 | bankLabels <- PrepLabels(bankLabels)
113 |
114 | ## PlotVar will treat numerical and categorical data differently.
115 | ## Binary data is always treated as categorical.
116 | plot(PlotVar(bankData, myVar = "duration", weightNm = NULL, dateNm = "date",
117 | dateGp = "months", dateGpBp = "quarters", labelFl = bankLabels)$p)
118 | plot(PlotVar(bankData, myVar = "job", weightNm = NULL, dateNm = "date",
119 | dateGp = "months", dateGpBp = "quarters", labelFl = bankLabels)$p)
120 | plot(PlotVar(bankData, myVar = "loan", weightNm = NULL, dateNm = "date",
121 | dateGp = "months", dateGpBp = "quarters", labelFl = bankLabels)$p)
122 |
123 | }
124 | \seealso{
125 | Functions that depend on this function:
126 | \code{\link{PrintPlots}}.
127 |
128 | This function depends on:
129 | \code{\link{PlotCatVar}},
130 | \code{\link{PlotNumVar}},
131 | \code{\link{PrepData}}.
132 | }
133 |
--------------------------------------------------------------------------------
/tests/testthat/rawData.csv:
--------------------------------------------------------------------------------
1 | "age","job","marital","balance","default","weight","date"
2 | 32,"blue-collar","single",23,0,0.00510204081632653,"06-05-2008"
3 | 46,"management","single",-246,0,0.0102040816326531,"06-05-2008"
4 | 32,"admin.","married",0,0,0.0102040816326531,"06-05-2008"
5 | 60,"retired","married",100,0,0.0102040816326531,"06-05-2008"
6 | 60,"admin.","married",39,0,0.0102040816326531,"07-05-2008"
7 | 58,"retired","married",96,0,0.00510204081632653,"07-05-2008"
8 | 35,"blue-collar","single",12223,0,0.00510204081632653,"07-05-2008"
9 | 55,"services","divorced",1,1,0.0102040816326531,"07-05-2008"
10 | 45,"admin.","single",13,0,0.0204081632653061,"08-05-2008"
11 | 47,"blue-collar","married",306,0,0.00510204081632653,"08-05-2008"
12 | 45,"admin.","single",206,0,0.0102040816326531,"08-05-2008"
13 | 60,"retired","married",81,0,0.00510204081632653,"08-05-2008"
14 | 28,"management","single",447,0,0.0153061224489796,"09-05-2008"
15 | 47,"blue-collar","married",1506,0,0.0153061224489796,"10-05-2008"
16 | 35,"management","married",231,0,0.0102040816326531,"10-05-2008"
17 | 40,"retired","married",0,0,0.0153061224489796,"10-05-2008"
18 | 56,"management","married",779,0,0.00510204081632653,"11-05-2008"
19 | 25,"services","married",50,0,0.0102040816326531,"11-05-2008"
20 | 29,"management","single",0,0,0.00510204081632653,"11-05-2008"
21 | 36,"admin.","divorced",506,0,0.0153061224489796,"12-05-2008"
22 | 55,"technician","divorced",0,0,0.00510204081632653,"12-05-2008"
23 | 57,"blue-collar","married",52,0,0.0153061224489796,"13-05-2008"
24 | 42,"admin.","single",-76,0,0.0102040816326531,"13-05-2008"
25 | 24,"technician","single",-103,0,0.00510204081632653,"13-05-2008"
26 | 53,"technician","divorced",989,0,0.0102040816326531,"13-05-2008"
27 | 59,"admin.","married",2343,0,0.00510204081632653,"13-05-2008"
28 | 51,"blue-collar","married",173,0,0.00510204081632653,"13-05-2008"
29 | 44,"admin.","married",-372,0,0.0153061224489796,"14-05-2008"
30 | 55,"services","divorced",91,0,0.0102040816326531,"14-05-2008"
31 | 49,"services","divorced",0,0,0.0102040816326531,"14-05-2008"
32 | 42,"management","single",50,0,0.0102040816326531,"14-05-2008"
33 | 58,"retired","married",121,0,0.0153061224489796,"15-05-2008"
34 | 36,"technician","single",265,0,0.0153061224489796,"15-05-2008"
35 | 49,"management","married",378,0,0.0153061224489796,"15-05-2008"
36 | 54,"management","married",282,0,0.0102040816326531,"15-05-2008"
37 | 44,"blue-collar","married",582,0,0.00510204081632653,"15-05-2008"
38 | 57,"entrepreneur","divorced",-37,0,0.0102040816326531,"16-05-2008"
39 | 60,"retired","married",60,0,0.00510204081632653,"17-05-2008"
40 | 38,"management","single",424,0,0.0102040816326531,"17-05-2008"
41 | 40,"blue-collar","single",24,0,0.0153061224489796,"17-05-2008"
42 | 46,"management","divorced",16,0,0.00510204081632653,"18-05-2008"
43 | 46,"management","married",229,0,0.0153061224489796,"18-05-2008"
44 | 60,"blue-collar","married",104,0,0.0102040816326531,"20-05-2008"
45 | 46,"services","married",179,0,0.0102040816326531,"20-05-2008"
46 | 53,"technician","married",6,0,0.0153061224489796,"21-05-2008"
47 | 54,"retired","married",529,0,0.0102040816326531,"21-05-2008"
48 | 58,"management","married",2143,0,0.00510204081632653,"22-05-2008"
49 | 43,"technician","single",593,0,0.00510204081632653,"22-05-2008"
50 | 57,"technician","divorced",63,0,0.00510204081632653,"22-05-2008"
51 | 42,"entrepreneur","divorced",2,1,0.0102040816326531,"23-05-2008"
52 | 51,"retired","married",229,0,0.00510204081632653,"23-05-2008"
53 | 59,"blue-collar","married",0,0,0.00510204081632653,"23-05-2008"
54 | 31,"services","married",25,0,0.0153061224489796,"23-05-2008"
55 | 55,"blue-collar","married",383,0,0.0102040816326531,"23-05-2008"
56 | 47,"services","divorced",164,0,0.0102040816326531,"24-05-2008"
57 | 46,"self-employed","married",137,0,0.0102040816326531,"24-05-2008"
58 | 48,"management","divorced",-244,0,0.0255102040816327,"25-05-2008"
59 | 49,"blue-collar","married",154,0,0.0102040816326531,"25-05-2008"
60 | 59,"management","divorced",59,0,0.00510204081632653,"25-05-2008"
61 | 25,"blue-collar","married",-7,0,0.0102040816326531,"26-05-2008"
62 | 50,"management","married",49,0,0.0102040816326531,"26-05-2008"
63 | 58,"self-employed","married",-364,0,0.00510204081632653,"26-05-2008"
64 | 57,"retired","married",486,0,0.0153061224489796,"26-05-2008"
65 | 33,"unknown","single",1,0,0.0255102040816327,"27-05-2008"
66 | 57,"services","married",162,0,0.0204081632653061,"27-05-2008"
67 | 39,"management","single",255,0,0.00510204081632653,"27-05-2008"
68 | 57,"technician","married",839,0,0.0102040816326531,"27-05-2008"
69 | 54,"blue-collar","married",1291,0,0.00510204081632653,"27-05-2008"
70 | 32,"management","married",0,0,0.0102040816326531,"27-05-2008"
71 | 55,"blue-collar","married",23,0,0.00510204081632653,"27-05-2008"
72 | 33,"entrepreneur","married",2,0,0.00510204081632653,"28-05-2008"
73 | 58,"technician","married",71,0,0.0153061224489796,"28-05-2008"
74 | 51,"management","married",10635,0,0.00510204081632653,"28-05-2008"
75 | 36,"admin.","single",-171,0,0.0204081632653061,"28-05-2008"
76 | 38,"entrepreneur","single",243,0,0.0102040816326531,"28-05-2008"
77 | 55,"technician","married",1205,0,0.0102040816326531,"28-05-2008"
78 | 41,"admin.","divorced",270,0,0.00510204081632653,"29-05-2008"
79 | 33,"services","married",0,0,0.0102040816326531,"29-05-2008"
80 | 28,"blue-collar","married",723,0,0.00510204081632653,"29-05-2008"
81 | 57,"blue-collar","married",5935,0,0.0102040816326531,"29-05-2008"
82 | 44,"services","divorced",2586,0,0.00510204081632653,"30-05-2008"
83 | 56,"admin.","married",45,0,0.0102040816326531,"30-05-2008"
84 | 30,"technician","married",152,0,0.0153061224489796,"30-05-2008"
85 | 42,"technician","single",690,0,0.0102040816326531,"31-05-2008"
86 | 41,"technician","married",1270,0,0.0153061224489796,"31-05-2008"
87 | 36,"management","married",101,0,0.00510204081632653,"31-05-2008"
88 | 29,"admin.","single",390,0,0.00510204081632653,"01-06-2008"
89 | 44,"technician","married",0,0,0.0153061224489796,"01-06-2008"
90 | 33,"services","married",790,0,0.00510204081632653,"01-06-2008"
91 | 60,"admin.","married",290,0,0.0102040816326531,"01-06-2008"
92 | 57,"blue-collar","married",249,0,0.0102040816326531,"02-06-2008"
93 | 53,"technician","married",384,0,0.00510204081632653,"02-06-2008"
94 | 60,"blue-collar","married",54,0,0.00510204081632653,"02-06-2008"
95 | 37,"admin.","single",0,0,0.0102040816326531,"03-06-2008"
96 | 43,"technician","married",1937,0,0.0102040816326531,"03-06-2008"
97 | 44,"technician","single",29,0,0.00510204081632653,"04-06-2008"
98 | 52,"entrepreneur","married",113,0,0.0153061224489796,"04-06-2008"
99 | 53,"technician","married",-3,0,0.0102040816326531,"04-06-2008"
100 | 51,"management","married",6530,0,0.00510204081632653,"04-06-2008"
101 | 39,"technician","married",0,0,0.0153061224489796,"04-06-2008"
102 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Due to changes in priorities, this project is currently not being supported. The project is archived as of 3/14/24 and will be available in a read-only state. Please note, since archival, the project is not maintained or reviewed.
2 |
3 | # R Package for Variable Level Monitoring
4 |
5 | [](http://cran.rstudio.com/web/packages/otvPlots/index.html)
6 |
7 | An important part of model building is the "proc eyeball" sanity check. It can
8 | also be a painful part of the process, when you are the data scientist tasked
9 | with creating and checking 10,000 or more near-identical plots. The `otvPlots`
10 | package is designed to streamline this process. `otvPlots` is
11 | an R package which takes a csv file as input and provides a pdf of VLM plots
12 | and csv files of summary statistics as output, optionally ordered so
13 | that any severely abnormal time series will be at the top of the pdf. The only
14 | strict requirement of the data scientist is to specify which column of the input
15 | data file contains the date variable.
16 |
17 | `otvPlots` is efficiently implemented using `data.table` and `ggplot2` packages in R.
18 | Plots are automatically labeled if a variable dictionary is provided. Important
19 | variables can be given a highlighted label. A custom fuzzy matching algorithm
20 | can be provided by the user.
21 |
22 | Discrete and numeric variables are handled automatically and given separate
23 | treatment. All binary variables are treated as categorical.
24 |
25 | ## Output files generated by this package
26 |
27 | ### A PDF file of plots, with each individual page on one variable.
28 |
29 | For each numerical variable, the output plots include
30 | * side-by-side boxplots (left),
31 | * a trace plot of p1, p50, and p99 percentiles,
32 | * a trace plot of mean and +-1 SD control limits, and
33 | * a trace plot of missing and zero rates (bottom right).
34 |
35 | #### Here is an example page of plots for a numerical variable
36 |
40 |
41 | For each categorical variable (including a numerical variable with no more
42 | than 2 unique levels not including NA), the output plots include
43 | * a frequency bar plot (left), and
44 | * a grid of trace plots on categories' proportions over time (right).
45 |
46 | #### Here is an example page of plots for a categorical variable
47 |
51 |
52 | ### CSV file(s) on summary statistics of variables, both globally and over time.
53 |
54 | The order of variables in the CSV files is the same as in the PDF file.
55 | * A CSV file for numerical variables, including the number of observations
56 | (counts), p1, p25, p50, p75, and p99 quantiles, mean, SD, missing and
57 | zero rates.
58 | * A CSV file for categorical variables, including the number of observations
59 | (counts) and categories' proportions. Each row is a category of a
60 | categorical (or binary) variable. The row whose `category == 'NA'`
61 | corresponds to missing. Categories among the same variable are ordered by
62 | global prevalence in a descending order.
63 |
64 | # Installation
65 | Open an R (or RStudio) console and install the package from CRAN
66 |
67 | ```
68 | install.packages("otvPlots")
69 | ```
70 |
71 | Alternatively, if you prefer to install from GitHub:
72 |
73 | 1. Install the `devtools` package if you have not done so yet. You only need to do this once, so
74 | feel free to skip this step if `devtools` is already installed. You will be
75 | asked to select a CRAN mirror.
76 |
77 | ```
78 | install.packages("devtools")
79 | ```
80 |
81 | 2. Install the `otvPlots` package
82 | ```
83 | devtools::install_github("capitalone/otvPlots")
84 | ```
85 |
86 | You can also build the package yourself by cloning the repo, setting your
87 | working directory to the otvPlots folder and running `devtools::build()`
88 | in R, after installing the `devtools` package.
89 |
90 | Note that otvPlots does depend on R and several R packages to run. You can
91 | see a complete and up to date list of dependencies in the Imports field in
92 | the DESCRIPTION file.
93 |
94 |
95 | # Getting Started
96 |
97 | ## Load the package
98 | Open an R console (or RStudio). Load the `otvPlots` package first (all its
99 | dependent packages should be loaded automatically).
100 |
101 | ```
102 | library(otvPlots)
103 | ```
104 |
105 | The main function of the package is `vlm`. Before executing this function,
106 | input data need to be prepared using the `PrepData` function.
107 | **Please check out the help files to see all options and many usage examples
108 | (highly recommended!)**
109 |
110 | ```
111 | help(vlm)
112 | help(PrepData)
113 | ```
114 |
115 | ## Examples
116 |
117 | The data `bankData` and its labels `bankLabels` are built-in datasets in the
118 | `otvPlots` package.
119 |
120 | ### The first example
121 | After running the following code, a pdf file named "bank.pdf" and two csv files
122 | named "bank_numerical_summary.csv" and "bank_categorical_summary.csv" will be
123 | generated in the current working directory.
124 |
125 | ```
126 | ## Load the datasets
127 | data(bankData)
128 | data(bankLabels)
129 |
130 | ## Prepare data and labels
131 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months",
132 | dateGpBp = "quarters")
133 | bankLabels <- PrepLabels(bankLabels)
134 |
135 | ## Generate a pdf file of vlm plots, and csv files of summary statistics
136 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
137 | sortFn = "OrderByR2", dateGp = "months", dateGpBp = "quarters", outFl = "bank")
138 | ```
139 |
140 | ### More examples on the `bankData` data
141 | The `PrepData` function only needs to be run once on a dataset. After that `vlm`
142 | can be run directly with the argument `dataNeedPrep = FALSE` (the default).
143 |
144 | * If csv files of summary statistics are not needed, set `genCSV = FALSE`.
145 |
146 | ```
147 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, genCSV = FALSE,
148 | sortFn = "OrderByR2", dateGp = "months", dateGpBp = "quarters", outFl = "bank2")
149 | ```
150 | * If weights are provided, they will be used in all statistical calculations
151 |
152 | ```
153 | bankData[, weight := rnorm(.N, 1, .1)]
154 | bankData[, weight := weight / mean(weight)]
155 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
156 | dateGp = "months", dateGpBp = "quarters", weightNm = "weight", outFl = "bank3")
157 | ```
158 |
159 | * Customize plotting order by passing a vector of variable names to argument
160 | `sortVars`, but the `"date"` column must be excluded from `sortVars`
161 |
162 | ```
163 | sortVars <- sort(bankLabels[varCol!="date", varCol])
164 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
165 | dateGp = "months", dateGpBp = "quarters", outFl = "bank4",
166 | sortVars = sortVars)
167 | ```
168 |
169 | * Create plots for a specific variable using the `varNms` argument
170 |
171 | ```
172 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
173 | dateGp = "months", dateGpBp = "quarters", outFl = "bank5",
174 | varNms = "age", sortVars = NULL)
175 | ```
176 |
177 | ## Citations
178 |
179 | All examples for this package come from the
180 | [Bank Marketing dataset](https://archive.ics.uci.edu/ml/datasets/Bank+Marketing)
181 | available at the UCI Machine Learning Repository. The UCI repository maintains
182 | a free collection of datasets for researchers at its
183 | [website](http://archive.ics.uci.edu/ml).
184 |
185 | S. Moro, P. Cortez, and P. Rita (2014). A Data-Driven Approach to
186 | Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier,
187 | 62:22-31, June 2014
188 |
189 | Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.
190 |
191 | ## Copyright 2017 Capital One Services, LLC
192 |
193 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and limitations under the License.
201 |
202 | ## External Contributors
203 | Contributors: We welcome your interest in Capital One’s Open Source Projects (the “Project”).
204 |
205 | Any Contributor to the project must accept and sign a CLA indicating agreement to the license terms. Except for the license granted in this CLA to Capital One and to recipients of software distributed by Capital One, you reserve all right, title, and interest in and to your contributions; this CLA does not impact your rights to use your own contributions for any other purpose.
206 |
207 | [Link to Individual CLA](https://docs.google.com/forms/d/19LpBBjykHPox18vrZvBbZUcK6gQTj7qv1O5hCduAZFU/viewform)
208 |
209 | [Link to Corporate CLA ](https://docs.google.com/forms/d/e/1FAIpQLSeAbobIPLCVZD_ccgtMWBDAcN68oqbAJBQyDTSAQ1AkYuCp_g/viewform)
210 |
211 | This project adheres to the
212 | [Open Source Code of Conduct](https://developer.capitalone.com/single/code-of-conduct/).
213 | By participating, you are expected to honor this code.
214 |
215 |
216 |
--------------------------------------------------------------------------------
/R/plots_order.R:
--------------------------------------------------------------------------------
1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC
2 | # SPDX-License-Identifier: Apache-2.0
3 | # Copyright 2017 Capital One Services, LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | #
8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software distributed
11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
12 | # OF ANY KIND, either express or implied.
13 | #
14 | # See the License for the specific language governing permissions and limitations under the License.
15 |
16 |
17 | ###########################################
18 | # Order By R2 #
19 | ###########################################
20 |
21 | #' Create numerical variable ranking using R2 between date and variable
22 | #'
23 | #' Calculates R2 of a linear model of the formula \code{var} ~ \code{dateNm} for
24 | #' each \code{var} of class \code{nmrcl} and returns a vector of
25 | #' variable names ordered by highest R2. The linear model can be calculated over
26 | #' a subset of dates, see details of parameter \code{buildTm}. Non-numerical
27 | #' variables are returned in alphabetical order after the sorted numerical
28 | #' variables.
29 | #'
30 | #' @inheritParams PrepData
31 | #' @inheritParams PlotNumVar
32 | #' @param dataFl A \code{data.table} of data; must be the output of the
33 | #' \code{\link{PrepData}} function.
34 | #' @param buildTm Vector identifying the time period for ranking/anomaly detection
35 | #' (most likely model build period). Allows for a subset of plotting time
36 | #' period to be used for anomaly detection.
37 | #' \itemize{
38 | #' \item Must be a vector of dates and must be inclusive i.e. buildTm[1]
39 | #' <= date <= buildTm[2] will define the time period.
40 | #' \item Must be either \code{NULL}, a vector of length 2, or a vector of
41 | #' length 3.
42 | #' \item If \code{NULL}, the entire dataset will be used for
43 | #' ranking/anomaly detection.
44 | #' \item If a vector of length 2, the format of the dates must be
45 | #' a character vector in default R date format (e.g. "2017-01-30").
46 | #' \item If a vector of length 3, the first two elements must contain dates
47 | #' in any strptime format, while the 3rd element contains the strptime
48 | #' format (see \code{\link{strptime}}).
49 | #' \item The following are equivalent ways of selecting
50 | #' all of 2014:
51 | #' \itemize{
52 | #' \item \code{c("2014-01-01","2014-12-31")}
53 | #' \item \code{c("01JAN2014","31DEC2014", "\%d\%h\%Y")}
54 | #' }
55 | #' }
56 | #' @export
57 | #'
58 | #' @seealso Functions that depend on this function:
59 | #' \code{\link{vlm}}.
60 | #' @seealso This function depends on:
61 | #' \code{\link{CalcR2}},
62 | #' \code{\link{PrepData}}.
63 | #'
64 | #' @return A vector of variable names sorted by R2 of \code{lm} of the formula
65 | #' \code{var} ~ \code{dateNm} (highest R2 to lowest)
66 | #' @section License:
67 | #' Copyright 2017 Capital One Services, LLC Licensed under the
68 | #' Apache License, Version 2.0 (the "License"); you may not use this file
69 | #' except in compliance with the License. You may obtain a copy of the
70 | #' License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by
71 | #' applicable law or agreed to in writing, software distributed under the
72 | #' License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
73 | #' CONDITIONS OF ANY KIND, either express or implied. See the License for the
74 | #' specific language governing permissions and limitations under the License.
75 | #' @examples
76 | #' data(bankData)
77 | #' bankData <- PrepData(bankData, dateNm = "date", dateGp = "months",
78 | #' dateGpBp = "quarters")
79 | #' OrderByR2(bankData, dateNm = "date")
80 |
81 | OrderByR2 <- function(dataFl, dateNm, buildTm = NULL, weightNm = NULL,
82 |                       kSample = 50000) {
83 |
84 |   ## NAs in weights or dates only raise a warning here; CalcR2 drops those rows
85 |   if (!is.null(weightNm)) {
86 |     if (any(is.na(dataFl[[weightNm]]))) {
87 |       warning("Weights column contains NAs--will be deleted casewise")
88 |     }
89 |   }
90 |   if (any(is.na(dataFl[[dateNm]]))) {
91 |     warning("Date column contains NAs--will be deleted casewise")
92 |   }
93 |
94 |   ## Convert buildTm to IDate format. If its length is not 2 or 3 (e.g. NULL),
95 |   ## use the earliest and latest non-missing dates so the whole dataset is
96 |   ## covered even when rows are unsorted or the first/last date is NA
97 |   buildTm <- switch(as.character(length(buildTm)), "2" = as.IDate(buildTm),
98 |                     "3" = as.IDate(buildTm[1:2], buildTm[3]),
99 |                     range(dataFl[[dateNm]], na.rm = TRUE))
100 |
101 |   num_vars <- names(Filter(is.nmrcl, dataFl))
102 |   cat_vars <- names(Filter(is.ctgrl, dataFl))
103 |
104 |   ## Sorting by R2 only works for numeric variables.
105 |   if (length(num_vars) > 0) {
106 |
107 |     # Rows inside the build period are subsampled (at most kSample rows)
108 |     # directly in the dataFl argument; every variable sees the same sample
109 |     r2 <- vapply(num_vars, CalcR2,
110 |                  dataFl = dataFl[buildTm[1] <= get(dateNm) &
111 |                                    get(dateNm) <= buildTm[2], ][
112 |                                      sample(.N, min(.N, kSample))],
113 |                  dateNm = dateNm, weightNm = weightNm, imputeValue = NULL,
114 |                  numeric(1))
115 |     sortVars <- c(num_vars[order(r2, decreasing = TRUE)], cat_vars)
116 |   } else {
117 |     sortVars <- cat_vars
118 |   }
119 |
120 |   return(sortVars)
121 | }
122 |
123 |
124 | ###########################################
125 | # CalcR2 Function #
126 | ###########################################
127 |
128 | #' Calculates R2 of a numerical variable using date as the predictor
129 | #'
130 | #' Calculates weighted R2 of a univariate weighted linear model with
131 | #' \code{dateNm} as x and \code{myVar} as y using the workhorse \code{lm.fit}
132 | #' and \code{lm.wfit} functions.
133 | #'
134 | #' @param myVar Name of variable to model.
135 | #' @param dataFl A \code{data.table}, containing \code{myVar}, \code{dateNm},
136 | #' and \code{weightNm}.
137 | #' @param dateNm Name of column containing the date variable (to be modeled as
138 | #' numeric); this date column must not have NA's.
139 | #' @param weightNm Name of column containing row weights. If weights equal one,
140 | #' then the \code{\link{lm.fit}} function will be called, otherwise the
141 | #' \code{\link{lm.wfit}} will be called. The weights column must not have NA's.
142 | #' @param imputeValue Either \code{NULL} or numeric. If \code{NULL}, model will
143 | #' be fit on only non-NA components of \code{myVar}. If numeric, missing cases
144 | #' of \code{myVar} will be imputed to \code{imputeValue}.
145 | #' @return A numeric value of R2.
146 | #' @export
147 | #'
148 | #' @seealso Functions that depend on this function:
149 | #' \code{\link{OrderByR2}}.
150 | #' @seealso This function depends on:
151 | #' \code{\link{PrepData}}.
152 | #'
153 | #' @section License:
154 | #' Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
155 | #' Version 2.0 (the "License"); you may not use this file except in compliance
156 | #' with the License. You may obtain a copy of the License at
157 | #' http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
158 | #' or agreed to in writing, software distributed under the License is
159 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
160 | #' KIND, either express or implied. See the License for the specific language
161 | #' governing permissions and limitations under the License.
162 |
163 | CalcR2 <- function(myVar, dataFl, dateNm, weightNm = NULL, imputeValue = NULL) {
164 |
165 |   ## Regress myVar on the date column (converted to numeric) with lm.fit, or
166 |   ## with lm.wfit when weightNm is given, and return the (weighted) R2
167 |   message("Calculating R2 of ", myVar)
168 |
169 |   if (sum(!is.na(dataFl[[myVar]])) < 2) {
170 |     ## Fewer than 2 observed values (counted before any imputation), e.g.
171 |     ## after the caller subsampled rows: R2 is undefined, Inf is returned
172 |     return(Inf)
173 |   }
174 |
175 |   y <- dataFl[[myVar]]
176 |
177 |   ## If imputeValue is available, we impute everywhere Y is missing
178 |   if (!is.null(imputeValue)) {
179 |     y[is.na(y)] <- imputeValue
180 |   }
181 |
182 |   ## Convert x from date to numeric, plus a column of ones as the intercept
183 |   x <- cbind(1, as.matrix(as.numeric(dataFl[[dateNm]]), ncol = 1))
184 |
185 |   ## Casewise deletion: keep rows where X, Y and W (if not null) are present
186 |   keep <- !is.na(y) & !is.na(x[, 2])
187 |   if (!is.null(weightNm)) {
188 |     w <- dataFl[[weightNm]]
189 |     keep <- keep & !is.na(w)
190 |     w <- w[keep]
191 |   }
192 |   y <- y[keep]
193 |   ## drop = FALSE keeps x a matrix even if a single row is left
194 |   x <- x[keep, , drop = FALSE]
195 |
196 |   ## Deletion can leave too few complete cases to fit a line; answer as in
197 |   ## the guard above instead of failing inside lm.fit / lm.wfit
198 |   if (length(y) < 2) {
199 |     return(Inf)
200 |   }
201 |
202 |   ## Compute R2 or weighted R2
203 |   if (is.null(weightNm)) {
204 |     mod <- lm.fit(x = x, y = y)
205 |     r2 <- 1 - sum(mod$residuals ^ 2) / sum((y - mean(y)) ^ 2)
206 |   } else {
207 |     mod <- lm.wfit(x = x, y = y, w = w)
208 |     ## Weighted total sum of squares is taken about the weighted mean of y
209 |     wMean <- Hmisc::wtd.mean(y, w, normwt = TRUE)
210 |     r2 <- 1 - sum(w * mod$residuals ^ 2) / sum(w * (y - wMean) ^ 2)
211 |   }
212 |
213 |   return(r2)
214 | }
215 |
--------------------------------------------------------------------------------
/tests/testthat/test_PrepData.R:
--------------------------------------------------------------------------------
1 | library(otvPlots)
2 | context("Prepare Data")
3 | data(bankData); setDT(bankData)
4 | is.cntns <- function(x) inherits(x, "nmrcl") # TRUE for objects of class "nmrcl" (class formerly named "cntns")
5 | is.dscrt <- function(x) inherits(x, "ctgrl") # TRUE for objects of class "ctgrl" (class formerly named "dscrt")
6 | is.IDate <- function(x) inherits(x, "IDate") # TRUE for objects of class "IDate"
7 | is.binary <- function(x) uniqueN(na.omit(x)) == 2 # TRUE when x has exactly two distinct non-NA values
8 |
9 | test_that("Names of the variables are transformed correctly", {
10 |   prepped <- PrepData(dataFl = "../testthat/drugRDate.csv",
11 |                       dateNm = "date", dateGp = "months", dateGpBp = "quarters")
12 |   expect_equal(names(prepped)[6], "Residence.City")  # sixth column name after preparation
13 | })
14 |
15 | test_that("Parse SAS (eg. 07Apr2017) default date format correctly", {
16 |   sasDates <- PrepData(dataFl = "../testthat/drugSASDate.csv",
17 |                        dateNm = "date", dateGp = "months", dateGpBp = "quarters")
18 |   # the info text is reported if every prepared date is NA
19 |   expect_false(all(is.na(sasDates[, "date"])), 'Fail to parse SAS date format')
20 | })
21 |
22 | test_that("Parse R (eg. 2017-04-17) default date format correctly", {
23 |   rDates <- PrepData(dataFl = "../testthat/drugRDate.csv",
24 |                      dateNm = "date", dateGp = "months", dateGpBp = "quarters")
25 |   # the info text is reported if every prepared date is NA
26 |   expect_false(all(is.na(rDates[, "date"])), 'Fail to parse R date format')
27 | })
28 |
29 | test_that("Incorrect date format creates warnings with csv input file", {
30 |   # dateFt is not passed here; a warning matching the pattern is expected
31 |   expect_warning(PrepData("../testthat/rawData.csv", dateNm = "date",
32 |                           weightNm = "weight", dateGp = "weeks", dateGpBp = "weeks"),
33 |                  regexp = "Formatting date as ")
34 | })
35 |
36 | test_that("Incorrect date format creates warnings with Rdata input file", {
37 |   # dateFt is not passed here; a warning matching the pattern is expected
38 |   expect_warning(PrepData("../testthat/rawData.rda", dateNm = "date",
39 |                           weightNm = "weight", dateGp = "weeks", dateGpBp = "weeks"),
40 |                  regexp = "Formatting date as ")
41 | })
42 |
43 | out <- suppressMessages(PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
44 | dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y")) # file-level fixture read by the two tests that follow
45 |
46 | test_that("All columns have exactly 2 classes, except date and weight", {
47 |   numCols <- Filter(is.cntns, out)
48 |   catCols <- Filter(is.dscrt, out)
49 |   dateCols <- Filter(is.IDate, out)
50 |   expect_length(numCols, 2)
51 |   expect_length(catCols, 3)
52 |   expect_length(dateCols, 2)
53 |   expect_length(class(out[, weight]), 1)
54 |   expect_equal(length(numCols) + length(catCols) + length(dateCols) + 1, ncol(out))
55 | })
56 |
57 | test_that("Variables are assigned to appropriate data type", {
58 |   numCols <- Filter(is.cntns, out)
59 |
60 |   # every "nmrcl" column must be numeric
61 |   expect_length(Filter(Negate(is.numeric), numCols), 0)
62 |
63 |   # ... and none of them may have exactly two distinct non-NA values
64 |   expect_length(Filter(is.binary, numCols), 0)
65 |
66 |   # "ctgrl" columns are either binary or have character/factor as first class
67 |   catCols <- Filter(is.dscrt, out)
68 |   twoLevelCols <- Filter(is.binary, catCols)
69 |   multiLevelCols <- Filter(Negate(is.binary), catCols)
70 |   firstClasses <- unique(vapply(multiLevelCols, function(x) class(x)[1], character(1)))
71 |   expect_length(setdiff(firstClasses, c("character", "factor")), 0)
72 |
73 |   # these groups, the IDate columns and the one weight column cover all columns
74 |   dateCols <- Filter(is.IDate, out)
75 |   nGrouped <- length(names(dateCols)) + length(names(twoLevelCols)) +
76 |     length(names(multiLevelCols)) + length(names(numCols))
77 |   expect_equal(nGrouped + 1, length(names(out)))
78 | })
79 |
80 | test_that("varNms parameter works", {
81 |   prepped <- PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
82 |                       dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y", varNms = c("age", "balance"))
83 |   numCols <- Filter(is.cntns, prepped)
84 |   catCols <- Filter(is.dscrt, prepped)
85 |   dateCols <- Filter(is.IDate, prepped)
86 |   expect_length(numCols, 2)
87 |   expect_length(catCols, 0)
88 |   expect_length(dateCols, 2)
89 |   # varNms given as column positions instead of column names
90 |   prepped <- PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
91 |                       dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y", varNms = c(1, 4))
92 |   numCols <- Filter(is.cntns, prepped)
93 |   catCols <- Filter(is.dscrt, prepped)
94 |   dateCols <- Filter(is.IDate, prepped)
95 |   expect_length(numCols, 2)
96 |   expect_length(catCols, 0)
97 |   expect_length(dateCols, 2)
98 | })
99 |
100 | test_that("selectCols and dropCols work as expected for csv file", {
101 |
102 |   # selectCols used on its own, columns given by name
103 |   prepped <- PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
104 |                       dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
105 |                       selectCols = c("age", "balance", "date", "weight"))
106 |   numCols <- Filter(is.cntns, prepped)
107 |   catCols <- Filter(is.dscrt, prepped)
108 |   dateCols <- Filter(is.IDate, prepped)
109 |   expect_length(numCols, 2)
110 |   expect_length(catCols, 0)
111 |   expect_length(dateCols, 2)
112 |
113 |   prepped <- PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
114 |                       dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
115 |                       selectCols = c(1, 4, 7, 6))  # by position
116 |   numCols <- Filter(is.cntns, prepped)
117 |   catCols <- Filter(is.dscrt, prepped)
118 |   dateCols <- Filter(is.IDate, prepped)
119 |   expect_length(numCols, 2)
120 |   expect_length(catCols, 0)
121 |   expect_length(dateCols, 2)
122 |
123 |   # dropCols used on its own, columns given by name
124 |   prepped <- PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
125 |                       dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
126 |                       dropCols = c("job", "marital", "default"))
127 |   numCols <- Filter(is.cntns, prepped)
128 |   catCols <- Filter(is.dscrt, prepped)
129 |   dateCols <- Filter(is.IDate, prepped)
130 |   expect_length(numCols, 2)
131 |   expect_length(catCols, 0)
132 |   expect_length(dateCols, 2)
133 |
134 |   prepped <- PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
135 |                       dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
136 |                       dropCols = c(2:3, 5))  # by position
137 |   numCols <- Filter(is.cntns, prepped)
138 |   catCols <- Filter(is.dscrt, prepped)
139 |   dateCols <- Filter(is.IDate, prepped)
140 |   expect_length(numCols, 2)
141 |   expect_length(catCols, 0)
142 |   expect_length(dateCols, 2)
143 | })
144 |
145 | test_that("selectCols and dropCols work as expected for RData file", {
146 |
147 |   # selectCols used on its own, columns given by name
148 |   prepped <- PrepData("../testthat/rawData.rda", dateNm = "date", weightNm = "weight",
149 |                       dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
150 |                       selectCols = c("age", "balance", "date", "weight"))
151 |   numCols <- Filter(is.cntns, prepped)
152 |   catCols <- Filter(is.dscrt, prepped)
153 |   dateCols <- Filter(is.IDate, prepped)
154 |   expect_length(numCols, 2)
155 |   expect_length(catCols, 0)
156 |   expect_length(dateCols, 2)
157 |
158 |   prepped <- PrepData("../testthat/rawData.rda", dateNm = "date", weightNm = "weight",
159 |                       dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
160 |                       selectCols = c(1, 4, 7, 6))  # by position
161 |   numCols <- Filter(is.cntns, prepped)
162 |   catCols <- Filter(is.dscrt, prepped)
163 |   dateCols <- Filter(is.IDate, prepped)
164 |   expect_length(numCols, 2)
165 |   expect_length(catCols, 0)
166 |   expect_length(dateCols, 2)
167 |
168 |   # dropCols used on its own, columns given by name
169 |   prepped <- PrepData("../testthat/rawData.rda", dateNm = "date", weightNm = "weight",
170 |                       dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
171 |                       dropCols = c("job", "marital", "default"))
172 |   numCols <- Filter(is.cntns, prepped)
173 |   catCols <- Filter(is.dscrt, prepped)
174 |   dateCols <- Filter(is.IDate, prepped)
175 |   expect_length(numCols, 2)
176 |   expect_length(catCols, 0)
177 |   expect_length(dateCols, 2)
178 |
179 |   prepped <- PrepData("../testthat/rawData.rda", dateNm = "date", weightNm = "weight",
180 |                       dateGp = "weeks", dateGpBp = "weeks", dateFt = "%d-%m-%Y",
181 |                       dropCols = c(2:3, 5))  # by position
182 |   numCols <- Filter(is.cntns, prepped)
183 |   catCols <- Filter(is.dscrt, prepped)
184 |   dateCols <- Filter(is.IDate, prepped)
185 |   expect_length(numCols, 2)
186 |   expect_length(catCols, 0)
187 |   expect_length(dateCols, 2)
188 | })
189 |
190 | test_that("dropConstants works as expected", {
191 |
192 |   # grouping at too coarse a level should get the grouping variable dropped
193 |   dropped <- suppressWarnings(PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
194 |                                        dateGp = "weeks", dateGpBp = "quarters", dateFt = "%d-%m-%Y", dropConstants = TRUE))
195 |   expect_warning(PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
196 |                           dateGp = "weeks", dateGpBp = "quarters", dateFt = "%d-%m-%Y", dropConstants = TRUE),
197 |                  regexp = "The following variables have no variability")
198 |   expect_null(dropped[["quarters"]])
199 |
200 |
201 |   # with dropConstants = FALSE the constant grouping variable is retained, with a warning
202 |   kept <- suppressWarnings(PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
203 |                                     dateGp = "weeks", dateGpBp = "quarters", dateFt = "%d-%m-%Y", dropConstants = FALSE))
204 |   expect_warning(PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
205 |                           dateGp = "weeks", dateGpBp = "quarters", dateFt = "%d-%m-%Y", dropConstants = FALSE),
206 |                  regexp = "variability in grouping")
207 |   expect_length(unique(kept[["quarters"]]), 1)
208 |
209 | })
210 |
211 | test_that("integer64 data doesn't cause problems", {
212 |   skip_if_not_installed("bit64")  # skip cleanly when bit64 is unavailable
213 |   library(bit64)                  # provides as.integer64() / is.integer64()
214 |   out <- suppressWarnings(PrepData("../testthat/rawData.csv", dateNm = "date", weightNm = "weight",
215 |                                    dateGp = "weeks", dateGpBp = "months", dateFt = "%d-%m-%Y"))
216 |   out[ , balance := as.integer64(balance)]
217 |   PrepData(out, dateNm = "date", weightNm = "weight",
218 |            dateGp = "weeks", dateGpBp = "months", dateFt = "%d-%m-%Y")  # result unused; `out` itself is checked
219 |   expect_false(is.integer64(out[, balance]))
220 |   out <- suppressWarnings(PrepData("../testthat/rawData_bigint.csv", dateNm = "date", weightNm = "weight",
221 |                                    dateGp = "weeks", dateGpBp = "months", dateFt = "%d-%m-%Y"))
222 |   expect_false(is.integer64(out[,bigint]))
223 | })
224 |
225 | test_that("Incorrect data input file generates error", {
226 |   expect_error(PrepData("../testthat/PlotHistogram.RDS", dateNm = "date"))  # call the real function, not undefined PrepD()
227 | })
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
--------------------------------------------------------------------------------
/R/vlm.R:
--------------------------------------------------------------------------------
1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC
2 | # SPDX-License-Identifier: Apache-2.0
3 | # Copyright 2017 Capital One Services, LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | #
8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software distributed
11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
12 | # OF ANY KIND, either express or implied.
13 | #
14 | # See the License for the specific language governing permissions and limitations under the License.
15 |
16 |
17 | ###########################################
18 | # The Main Function #
19 | ###########################################
20 |
21 | #' Create over time variable plots and summary statistics for variable level monitoring
22 | #'
23 | #' Sorts variables according to either user input or correlation with time
24 | #' (among numerical variables only), and create output files including:
25 | #' \itemize{
26 | #' \item A PDF file of plots saved as \code{outFl}.pdf, with each individual page
27 | #' on one variable. Variables are plotted in the order indicated in the argument
28 | #' \code{sortVars} or \code{sortFn}.
29 | #' For each numerical variable, the output plots include
30 | #' \itemize{
31 | #' \item side-by-side boxplots grouped by \code{dateGpBp} (left),
32 | #' \item a trace plot of p1, p50, and p99 percentiles, grouped by \code{dateGp}
33 | #' (top right),
34 | #' \item a trace plot of mean and +-1 SD control limits, grouped by
35 | #' \code{dateGp}(middle right), and
36 | #' \item a trace plot of missing and zero rates, grouped by \code{dateGp}
37 | #' (bottom right).
38 | #' }
39 | #' For each categorical variable (including a numerical variable with no more
40 | #' than 2 unique levels not including NA), the output plots include
41 | #' \itemize{
42 | #' \item a frequency bar plot (left), and
43 | #' \item a grid of trace plots on categories' proportions over time (right).
44 | #' If the variable contains more than \code{kCategories} number of
45 | #' categories, trace plots of only the largest \code{kCategories} will be
46 | #' plotted. If the variable contains only two categories, then only the
47 | #' trace plot of the less prevalent category will be plotted.
48 | #' }
49 | #' \item CSV file(s) on summary statistics of variable, both globally and over
50 | #' time aggregated by \code{dateGp}. The order of variables in the CSV files
51 | #' are the same as in the PDF file.
52 | #' \itemize{
53 | #' \item For numerical variables, number of observations (counts), p1, p25,
54 | #' p50, p75, and p99 quantiles, mean, SD, missing and zero rates are saved
55 | #' as \code{outFl}_numerical_summary.csv.
56 | #' \item For categorical variables, number of observations (counts) and
57 | #' categories' proportions are saved as \code{outFl}_categorical_summary.csv.
58 | #' Each row is a category of a categorical (or binary) variable.
59 | #' The row whose \code{category == 'NA'} corresponds to missing. Categories
60 | #' among the same variable are ordered by global prevalence in a descending
61 | #' order.
62 | #' }
63 | #' }
64 | #'
65 | #' If the argument \code{dataNeedPrep} is set to \code{FALSE}, then
66 | #' \itemize{
67 | #' \item \code{dataFl} must be a \code{data.table} containing variables
68 | #' \code{weightNm}, \code{dateNm}, \code{dateGp}, and \code{dateGpBp}, and
69 | #' names of these variables must be the same as the corresponding arguments
70 | #' of the \code{\link{vlm}} function.
71 | #' \item the arguments \code{selectCols}, \code{dropCols}, \code{dateFt},
72 | #' \code{dropConstants} will be ignored by the \code{\link{vlm}} function.
73 | #' \item When analyzing a dataset for the first time, it is recommended to first
74 | #' run the \code{\link{PrepData}} function on it, and then apply the
75 | #' \code{\link{vlm}} function with the argument \code{dataNeedPrep = FALSE}.
76 | #' Please see the examples for details.
77 | #' }
78 | #'
79 | #' @inheritParams PrepData
80 | #' @inheritParams PrepLabels
81 | #' @inheritParams OrderByR2
82 | #' @inheritParams PrintPlots
83 | #' @param sortVars Determines which variables to be plotted and their order.
84 | #' Either a character vector of variable names (to plot variables in the same
85 | #' order as in the \code{sortVars} argument), or \code{NULL} to keep the
86 | #' original ordering, with numerical variables being plotted before
87 | #' categorical and binary ones. \code{sortVars} should be \code{NULL} when the
88 | #' \code{sortFn} argument is used.
89 | #' @param sortFn A sorting function which returns \code{sortVars} as an output.
90 | #' The function may take the following variables as input: \code{dataFl},
91 | #' \code{dateNm}, \code{buildTm}, \code{weightNm}, \code{kSample}. Currently,
92 | #' the only built-in sorting function is \code{\link{OrderByR2}}, which sorts
93 | #' numerical variables in the order of strength of linear association with date,
94 | #' and adds categorical (and binary) variables sorted in alphabetical order
95 | #' after the numerical ones.
96 | #' @param dataNeedPrep Logical, indicates if data should be run through the
97 | #' \code{\link{PrepData}} function. This should be set to \code{TRUE} unless
98 | #' the \code{\link{PrepData}} function has been applied to the input data
99 | #' \code{dataFl}.
100 | #' @export
101 | #'
102 | #' @seealso This function depends on:
103 | #' \code{\link{PrintPlots}},
104 | #' \code{\link{OrderByR2}},
105 | #' \code{\link{PrepData}},
106 | #' \code{\link{PrepLabels}}.
107 | #'
108 | #' @section License: Copyright 2017 Capital One Services, LLC Licensed under the
109 | #' Apache License, Version 2.0 (the "License"); you may not use this file
110 | #' except in compliance with the License. You may obtain a copy of the License
111 | #' at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable
112 | #' law or agreed to in writing, software distributed under the License is
113 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
114 | #' KIND, either express or implied. See the License for the specific language
115 | #' governing permissions and limitations under the License.
116 | #' @examples
117 | #' ## Load the data and its label
118 | #' data(bankData)
119 | #' data(bankLabels)
120 | #'
121 | #' ## The PrepData function should only need to be run once on a dataset,
122 | #' ## after that vlm can be run with the argument dataNeedPrep = FALSE
123 | #' bankData <- PrepData(bankData, dateNm = "date", dateGp = "months",
124 | #' dateGpBp = "quarters")
125 | #' bankLabels <- PrepLabels(bankLabels)
126 | #'
127 | #'\dontrun{
128 | #' vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
129 | #' sortFn = "OrderByR2", dateGp = "months", dateGpBp = "quarters",
130 | #' outFl = "bank")
131 | #'
132 | #' ## If csv files of summary statistics are not needed, set genCSV = FALSE
133 | #' vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, genCSV = FALSE,
134 | #' sortFn = "OrderByR2", dateGp = "months", dateGpBp = "quarters",
135 | #' outFl = "bank")
136 | #'
137 | #' ## If weights are provided, they will be used in all statistical calculations
138 | #' bankData[, weight := rnorm(.N, 1, .1)]
139 | #' vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
140 | #' dateGp = "months", dateGpBp = "quarters", weightNm = "weight",
141 | #' outFl = "bank")
142 | #'
143 | #' ## Customize plotting order by passing a vector of variable names to
144 | #' ## sortVars, but the "date" column must be excluded from sortVars
145 | #' sortVars <- sort(bankLabels[varCol!="date", varCol])
146 | #' vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
147 | #' dateGp = "months", dateGpBp = "quarters", outFl = "bank",
148 | #' sortVars = sortVars)
149 | #'
150 | #' ## Create plots for a specific variable using the varNms parameter
151 | #' vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
152 | #' dateGp = "months", dateGpBp = "quarters", outFl = "bank",
153 | #' varNms = "age", sortVars = NULL)
154 | #'}
155 |
156 | vlm <- function(dataFl, dateNm, labelFl = NULL, outFl = "otvplots",
157 |                 genCSV = TRUE, dataNeedPrep = FALSE, dateGp = NULL,
158 |                 dateGpBp = NULL, weightNm = NULL, varNms = NULL,
159 |                 sortVars = NULL, sortFn = NULL, selectCols = NULL,
160 |                 dropCols = NULL, dateFt = "%d%h%Y", buildTm = NULL,
161 |                 highlightNms = NULL, skewOpt = NULL, kSample = 50000,
162 |                 fuzzyLabelFn = NULL, dropConstants = FALSE, kCategories = 9, ...) {
163 |
164 |   ## Reject contradictory argument combinations before doing any work
165 |   if (!is.null(sortVars) && !is.null(sortFn)) {
166 |     stop ("Please choose between sortVars (predetermined order of plotting) and
167 | sortFn (function to determine plotting order)")}
168 |
169 |   if (!is.null(sortVars) && !is.null(varNms) &&
170 |       !all(varNms %in% sortVars)) {
171 |     stop ("Please make certain that varNms is a subset of sortVars")
172 |   }
173 |
174 |   if (!is.null(selectCols) && !is.null(dropCols)) {
175 |     stop("Please choose between selectCols or dropCols.")
176 |   }
177 |
178 |   ## sortFn must be a function or a function name (do.call accepts either)
179 |   if (!is.null(sortFn) && !is.function(sortFn) &&
180 |       !(is.character(sortFn) && length(sortFn) == 1L)) {
181 |     stop("sortFn must be a function or the name of a function.")
182 |   }
183 |
184 |   ## Apply the PrepData function if not previously on dataFl
185 |   if (dataNeedPrep) {
186 |     dataFl <- PrepData(dataFl = dataFl, dateNm = dateNm,
187 |                        selectCols = selectCols, dropCols = dropCols,
188 |                        dateFt = dateFt, dateGp = dateGp, dateGpBp = dateGpBp,
189 |                        weightNm = weightNm, varNms = varNms,
190 |                        dropConstants = dropConstants, ...)
191 |   } else {
192 |     stopifnot(is.data.table(dataFl) &&
193 |                 all(c(weightNm, dateNm, dateGp, dateGpBp) %in% names(dataFl)))
194 |     ## Change integer64 data type to numeric (":=" updates dataFl in place)
195 |     for (var in names(dataFl)) {
196 |       if (inherits(dataFl[[var]], "integer64")) {
197 |         dataFl[, (var) := as.numeric(get(var))]
198 |       }
199 |     }
200 |   }
201 |
202 |   ## Apply the PrepLabels function
203 |   labelFl <- PrepLabels(labelFl)
204 |
205 |   ## Determine the plotting order: sortFn wins, then a user-supplied sortVars,
206 |   ## otherwise numerical variables followed by categorical ones
207 |   if (!is.null(sortFn)) {
208 |     sortVars <- do.call(sortFn, list(dataFl = dataFl, dateNm = dateNm,
209 |                                      buildTm = buildTm, weightNm = weightNm,
210 |                                      kSample = kSample))
211 |   } else if (is.null(sortVars)) {
212 |     isNum <- vapply(dataFl, inherits, logical(1), what = "nmrcl")
213 |     isCat <- vapply(dataFl, inherits, logical(1), what = "ctgrl")
214 |     sortVars <- c(names(dataFl)[isNum], names(dataFl)[isCat])
215 |   }
216 |
217 |   ## When varNms is given, keep only those variables plus the bookkeeping
218 |   ## columns; unique() guards against selecting the same column twice
219 |   if (!is.null(varNms)) {
220 |     keepCols <- unique(c(varNms, dateNm, dateGp, dateGpBp, weightNm))
221 |     dataFl <- dataFl[, keepCols, with = FALSE]
222 |     sortVars <- sortVars[sortVars %in% varNms]
223 |   }
224 |
225 |   ## Create the plots
226 |   PrintPlots(outFl = outFl, dataFl = dataFl, sortVars = sortVars,
227 |              dateNm = dateNm, dateGp = dateGp, dateGpBp = dateGpBp,
228 |              weightNm = weightNm, labelFl = labelFl, genCSV = genCSV,
229 |              highlightNms = highlightNms, skewOpt = skewOpt, kSample = kSample,
230 |              fuzzyLabelFn = fuzzyLabelFn, kCategories = kCategories)
231 | }
232 |
233 |
--------------------------------------------------------------------------------
/R/plot_print.R:
--------------------------------------------------------------------------------
1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC
2 | # SPDX-License-Identifier: Apache-2.0
3 | # Copyright 2017 Capital One Services, LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | #
8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software distributed
11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
12 | # OF ANY KIND, either express or implied.
13 | #
14 | # See the License for the specific language governing permissions and limitations under the License.
15 |
16 |
17 | ###########################################
18 | # Create output #
19 | ###########################################
20 |
21 | #' Create a pdf file with plots and compute summary statistics for all variables
22 | #'
23 | #' Creates plots and outputs results to a letter-sized pdf file, with each
24 | #' individual page containing plots on a single variable in the data. In
25 | #' addition, two summary statistics \code{data.table} are returned, one for
26 | #' numerical variables, and one for categorical (and binary) ones.
27 | #'
28 | #' @inheritParams PlotVar
29 | #' @param outFl Name of the output file, with no extension names (e.g., "bank").
30 | #' A pdf file of plots ("bank.pdf"), and two csv files of summary statistics
31 | #' ("bank_categorical_summary.csv" and "bank_numerical_summary.csv") will be
32 | #' saved to your working directory, unless a path is included in \code{outFl}
33 | #' (e.g. "../plots/bank").
34 | #' @param genCSV Logical, whether to generate the two csv files of summary
35 | #' statistics for numerical and categorical variables.
36 | #' @param sortVars A character vector of variable names in the order they will
37 | #' be plotted.
38 | #' @return A pdf of plots saved to file \code{outFl}.pdf, and if the argument
39 | #' \code{genCSV == TRUE}, also two csv files of summary statistics for
40 | #' numerical and categorical variables.
41 | #'
42 | #' @seealso Functions depend on this function:
43 | #' \code{\link{vlm}}.
44 | #' @seealso This function depends on:
45 | #' \code{\link{PlotVar}},
46 | #' \code{\link{PrepData}}.
47 | #'
48 | #' @section License:
49 | #' Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
50 | #' Version 2.0 (the "License"); you may not use this file except in compliance
51 | #' with the License. You may obtain a copy of the License at
52 | #' http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
53 | #' or agreed to in writing, software distributed under the License is
54 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
55 | #' KIND, either express or implied. See the License for the specific language
56 | #' governing permissions and limitations under the License.
57 | #' @export
58 | PrintPlots <- function(outFl, dataFl, sortVars, dateNm, dateGp, dateGpBp,
59 |                        weightNm = NULL, labelFl = NULL, genCSV = TRUE,
60 |                        highlightNms = NULL, skewOpt = NULL, kSample = 50000,
61 |                        fuzzyLabelFn = NULL, kCategories = 9) {
62 |
63 |   catSummary <- NULL
64 |   numSummary <- NULL
65 |   . <- NULL  # presumably declared for R CMD check; "." is a column name below
66 |
67 |   ## Build every page first; kSample is forwarded so the caller's value is
68 |   ## honoured instead of PlotVar silently falling back to its default
69 |   plotList <-
70 |     lapply(sortVars, PlotVar,
71 |            dataFl = dataFl, weightNm = weightNm, dateNm = dateNm,
72 |            dateGp = dateGp, dateGpBp = dateGpBp, labelFl = labelFl,
73 |            highlightNms = highlightNms, skewOpt = skewOpt, kSample = kSample,
74 |            fuzzyLabelFn = fuzzyLabelFn, kCategories = kCategories)
75 |
76 |   grDevices::pdf(file = paste0(outFl, '.pdf'), width = 11, height = 8,
77 |                  pointsize = 12, onefile = TRUE)
78 |   ## Close this device even if drawing fails, so no PDF device is left open
79 |   pdfDev <- grDevices::dev.cur()
80 |   on.exit(grDevices::dev.off(which = pdfDev), add = TRUE)
81 |
82 |   ## Evaluate the flag once; NA or non-scalar values count as FALSE
83 |   writeCSV <- isTRUE(as.logical(genCSV))
84 |   for (x in plotList) {
85 |     grid::grid.newpage()
86 |     grid::grid.draw(x$p)
87 |
88 |     if (writeCSV) {
89 |       if (x$varType == "ctgrl") catSummary <- rbind(catSummary, x$varSummary)
90 |       if (x$varType == "nmrcl") numSummary <- rbind(numSummary, x$varSummary)
91 |     }
92 |   }
93 |
94 |   ## Generate CSV files
95 |   if (writeCSV) {
96 |     ## Compute counts (or summed weights) in each dateGp period
97 |     if (is.null(weightNm)) {
98 |       total_counts <- dataFl[, list(count = .N), by = dateGp]
99 |     } else {
100 |       total_counts <- dataFl[, list(count = sum(get(weightNm))), by = dateGp]
101 |     }
102 |     names(total_counts)[1] <- "date_group"
103 |     total_counts <- dcast(total_counts, . ~ date_group, value.var = 'count')
104 |     total_counts[, . := NULL]
105 |
106 |     ## Numerical variables: prepend a row of counts, then write the csv file
107 |     if (!is.null(numSummary)) {
108 |       numSummary <- rbind(as.list(rep(NA, ncol(numSummary))), numSummary)
109 |       numSummary[1, 1:2] <- list('ALL_DATA', 'COUNTS')
110 |       numSummary[1, 3] <- sum(total_counts)
111 |       numSummary[1, names(numSummary)[-(1:3)] := total_counts]
112 |       fwrite(numSummary, file = paste0(outFl, '_numerical_summary.csv'))
113 |     }
114 |
115 |     ## Categorical variables: prepend a row of counts, then write the csv file
116 |     if (!is.null(catSummary)) {
117 |       catSummary <- rbind(as.list(rep(NA, ncol(catSummary))), catSummary)
118 |       catSummary[1, 1:2] <- list('ALL_DATA', 'COUNTS')
119 |       catSummary[1, 3:4] <- list(sum(total_counts), 1)
120 |       catSummary[1, names(catSummary)[-(1:4)] := total_counts]
121 |       fwrite(catSummary, file = paste0(outFl, '_categorical_summary.csv'))
122 |     }
123 |   }
124 | }
125 |
126 | ###############################################
127 | # Main Plot Function for a single variable #
128 | ###############################################
129 |
130 | #' Create over time variable plots and summary statistics for one variable
131 | #'
132 | #' For a numerical variable, the output includes
133 | #' \itemize{
134 | #' \item side-by-side boxplots grouped by \code{dateGpBp} (left),
135 | #' \item a trace plot of p1, p50, and p99 percentiles, grouped by \code{dateGp}
136 | #' (top right),
137 | #' \item a trace plot of mean and +-1 SD control limits, grouped by
138 | #' \code{dateGp}(middle right), and
139 | #' \item a trace plot of missing and zero rates, grouped by \code{dateGp}
140 | #' (bottom right).
141 | #' }
142 | #' For a categorical variable (including a numerical variable with no more than 2
143 | #' unique levels not including NA), the output includes
144 | #' \itemize{
145 | #' \item a frequency bar plot (left), and
146 | #' \item a grid of trace plots on categories' proportions over time (right).
147 | #' If the variable contains more than \code{kCategories} number of categories,
148 | #' trace plots of only the largest \code{kCategories} will be plotted.
149 | #' }
150 | #' In addition to plots, a \code{data.table} of summary statistics are generated,
151 | #' on global and over time summary statistics.
152 | #'
153 | #' @inheritParams PlotCatVar
154 | #' @inheritParams PlotNumVar
155 | #' @inheritParams OrderByR2
156 | #' @param dataFl A \code{data.table} containing at least the following columns:
157 | #' \code{myVar}, \code{weightNm}, \code{dateGp}, \code{dateGpBp}; usually an
158 | #' output of the \code{\link{PrepData}} function.
159 | #' @param myVar Name of the variable to be plotted.
160 | #' @param labelFl A \code{data.table} containing variable labels, or \code{NULL}
161 | #' for no labels; usually an output of \code{\link{PrepLabels}}.
162 | #' @param highlightNms Either \code{NULL} or a character vector of variables to
163 | #' receive a red label. Currently \code{NULL} means all variables will get a
164 | #' black legend. This argument is ignored if \code{labelFl == NULL}.
165 | #' @param fuzzyLabelFn Either \code{NULL} or a function of 2 parameters: A label
166 | #' file in the format of an output by \code{\link{PrepLabels}} and a string
167 | #' giving a variable name. The function should return the label corresponding
168 | #' to the variable given by the second parameter. This function should
169 | #' describe how fuzzy matching should be performed to find labels (see example
170 | #' below). If \code{NULL}, only exact matches will be returned.
171 | #' @return
172 | #' \item{p}{A \code{grob} (i.e., \code{ggplot} grid) object. See the output
173 | #' \code{p} of the function \code{\link{PlotNumVar}} or
174 | #' \code{\link{PlotCatVar}} for details.}
175 | #' \item{varSummary}{A \code{data.table} of summary statistics. See the output
176 | #' \code{numVarSummary} of the function \code{\link{PlotNumVar}}, or the
177 | #' output \code{catVarSummary} of the function \code{\link{PlotCatVar}} for
178 | #' details.}
179 | #' \item{varType}{Indicator of the variable's type, either \code{"nmrcl"} or
180 | #' \code{"ctgrl"}.}
181 | #' @export
182 | #'
183 | #' @seealso Functions depend on this function:
184 | #' \code{\link{PrintPlots}}.
185 | #' @seealso This function depends on:
186 | #' \code{\link{PlotCatVar}},
187 | #' \code{\link{PlotNumVar}},
188 | #' \code{\link{PrepData}}.
189 | #'
190 | #' @section License: Copyright 2017 Capital One Services, LLC Licensed under the
191 | #' Apache License, Version 2.0 (the "License"); you may not use this file
192 | #' except in compliance with the License. You may obtain a copy of the License
193 | #' at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable
194 | #' law or agreed to in writing, software distributed under the License is
195 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
196 | #' KIND, either express or implied. See the License for the specific language
197 | #' governing permissions and limitations under the License.
198 | #' @examples
199 | #' data(bankData)
200 | #' bankData <- PrepData(bankData, dateNm = "date", dateGp = "months",
201 | #' dateGpBp = "quarters")
202 | #' data(bankLabels)
203 | #' bankLabels <- PrepLabels(bankLabels)
204 | #'
205 | #' ## PlotVar will treat numerical and categorical data differently.
206 | #' ## Binary data is always treated as categorical.
207 | #' plot(PlotVar(bankData, myVar = "duration", weightNm = NULL, dateNm = "date",
208 | #' dateGp = "months", dateGpBp = "quarters", labelFl = bankLabels)$p)
209 | #' plot(PlotVar(bankData, myVar = "job", weightNm = NULL, dateNm = "date",
210 | #' dateGp = "months", dateGpBp = "quarters", labelFl = bankLabels)$p)
211 | #' plot(PlotVar(bankData, myVar = "loan", weightNm = NULL, dateNm = "date",
212 | #' dateGp = "months", dateGpBp = "quarters", labelFl = bankLabels)$p)
213 | #'
214 | PlotVar <- function(dataFl, myVar, weightNm, dateNm, dateGp, dateGpBp = NULL,
215 |                     labelFl = NULL, highlightNms = NULL, skewOpt = NULL,
216 |                     kSample = 50000, fuzzyLabelFn = NULL, kCategories = 9) {
217 |
218 |   varCol <- labelCol <- NULL
219 |   message(paste("Plotting ", myVar))
220 |
221 |   ## myVar must name an existing column, and that column must not be a date
222 |   if (!myVar %in% names(dataFl)) stop(myVar, " is not a column of dataFl")
223 |   if (inherits(dataFl[[myVar]], c("Date", "IDate"))) {
224 |     stop("Cannot plot dates")
225 |   }
226 |
227 |   ## Label myVar type to be "nmrcl" or "ctgrl" if not labeled yet. The test
228 |   ## inspects the column itself (not the name string held in myVar), and the
229 |   ## class is set on dataFl[[myVar]] so that setattr acts on that column.
230 |   if (!inherits(dataFl[[myVar]], c("ctgrl", "nmrcl"))) {
231 |     nLevels <- length(unique(stats::na.omit(dataFl[[myVar]])))
232 |     if (inherits(dataFl[[myVar]], c("character", "factor")) || nLevels == 2) {
233 |       setattr(dataFl[[myVar]], "class", "ctgrl")
234 |     } else {
235 |       setattr(dataFl[[myVar]], "class", "nmrcl")
236 |     }
237 |   }
238 |
239 |   ## Generate a grid of plots
240 |   if (inherits(dataFl[[myVar]], "ctgrl")) {
241 |     p_all <- PlotCatVar(myVar, dataFl, weightNm, dateNm, dateGp, kCategories)
242 |     p <- p_all$p
243 |     varSummary <- p_all$catVarSummary
244 |     varType <- "ctgrl"
245 |   } else if (inherits(dataFl[[myVar]], "nmrcl")) {
246 |     p_all <- PlotNumVar(myVar, dataFl, weightNm, dateGp, dateGpBp, skewOpt,
247 |                         kSample)
248 |     p <- p_all$p
249 |     varSummary <- p_all$numVarSummary
250 |     varType <- "nmrcl"
251 |   }
252 |
253 |   ## If no fuzzy matching functions are provided, provide exact matches on the
254 |   ## first column, otherwise use logic defined in fuzzyLabelFn
255 |   ll <- myVar
256 |   if (!is.null(labelFl)) {
257 |     if (is.null(fuzzyLabelFn)) {
258 |       ll <- paste0(labelFl[varCol == myVar, labelCol])
259 |     } else {
260 |       ll <- fuzzyLabelFn(labelFl, myVar)
261 |     }
262 |     ll <- paste0(myVar, " (", ll, ")", "\n")
263 |   }
264 |
265 |   ## Label color
266 |   subCol <- "black"
267 |   if (!is.null(highlightNms)) {
268 |     highlightNms <- gsub("/|\\-|\"|\\s", "", highlightNms)
269 |     if (myVar %in% highlightNms) {
270 |       # should add other ways to trigger red labels
271 |       subCol <- "red"
272 |     }
273 |   }
274 |
275 |   ## Add the page title as myVar and its label above the grid of plots
276 |   subText <- grid::textGrob(ll, gp = grid::gpar(col = subCol, fontface="bold"))
277 |   p <- gridExtra::arrangeGrob(p, top = subText)
278 |
279 |   list(p = p, varSummary = varSummary, varType = varType)
280 | }
281 |
--------------------------------------------------------------------------------
/man/vlm.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/vlm.R
3 | \name{vlm}
4 | \alias{vlm}
5 | \title{Create over time variable plots and summary statistics for variable level monitoring}
6 | \usage{
7 | vlm(dataFl, dateNm, labelFl = NULL, outFl = "otvplots", genCSV = TRUE,
8 | dataNeedPrep = FALSE, dateGp = NULL, dateGpBp = NULL, weightNm = NULL,
9 | varNms = NULL, sortVars = NULL, sortFn = NULL, selectCols = NULL,
10 | dropCols = NULL, dateFt = "\%d\%h\%Y", buildTm = NULL,
11 | highlightNms = NULL, skewOpt = NULL, kSample = 50000,
12 | fuzzyLabelFn = NULL, dropConstants = FALSE, kCategories = 9, ...)
13 | }
14 | \arguments{
15 | \item{dataFl}{Either the name of an object that can be converted using
16 | \code{\link[data.table]{as.data.table}} (e.g., a data frame), or a
17 | character string containing the name of dataset that can be loaded using
18 | \code{\link[data.table]{fread}} (e.g., a csv file). If the dataset is not in
19 | your working directory then \code{dataFl} must include (relative or
20 | absolute) path to file.}
21 |
22 | \item{dateNm}{Name of column containing the date variable.}
23 |
24 | \item{labelFl}{Either the path of a dataset (a csv file) containing
25 | labels, an R object convertible to \code{data.table} (e.g., data frame) or
26 | \code{NULL}. If \code{NULL}, no labels will be used. The label dataset must
27 | contain at least 2 columns: \code{varCol} (variable names) and
28 | \code{labelCol} (variable labels).}
29 |
30 | \item{outFl}{Name of the output file, with no extension names (e.g., "bank").
31 | A pdf file of plots ("bank.pdf"), and two csv files of summary statistics
32 | ("bank_categorical_summary.csv" and "bank_numerical_summary.csv") will be
33 | saved to your working directory, unless a path is included in \code{outFl}
34 | (e.g. "../plots/bank").}
35 |
36 | \item{genCSV}{Logical, whether to generate the two csv files of summary
37 | statistics for numerical and categorical variables.}
38 |
39 | \item{dataNeedPrep}{Logical, indicates if data should be run through the
40 | \code{\link{PrepData}} function. This should be set to \code{TRUE} unless
41 | the \code{\link{PrepData}} function has been applied to the input data
42 | \code{dataFl}.}
43 |
44 | \item{dateGp}{Name of the variable that the time series plots should be
45 | grouped by. Options are \code{NULL}, \code{"weeks"}, \code{"months"},
46 | \code{"quarters"}, \code{"years"}. See \code{\link[data.table]{IDate}} for
47 | details. If \code{NULL}, then \code{dateNm} will be used as \code{dateGp}.}
48 |
49 | \item{dateGpBp}{Name of variable the boxplots should be grouped by. Same
50 | options as \code{dateGp}. If \code{NULL}, then \code{dateGp} will be used.}
51 |
52 | \item{weightNm}{Name of the variable containing row weights, or \code{NULL} for
53 | no weights (all rows receiving weight 1).}
54 |
55 | \item{varNms}{Either \code{NULL} or a vector of names or indices of variables
56 | to be plotted. If \code{NULL}, will default to all columns which are not
57 | \code{dateNm} or \code{weightNm}. Can also be a vector of indices of the
58 | column names, after \code{dropCols} or \code{selectCols} have been applied,
59 | if applicable, and not including \code{dateGp}, \code{dateGpBp}
60 | (which will be added to the \code{dataFl} by the function
61 | \code{\link{PrepData}}).}
62 |
63 | \item{sortVars}{Determines which variables to be plotted and their order.
64 | Either a character vector of variable names (variables are plotted in the
65 | same order as in the \code{sortVars} argument), or \code{NULL} to keep the
66 | original ordering, with numerical variables being plotted before
67 | categorical and binary ones. \code{sortVars} should be \code{NULL} when the
68 | \code{sortFn} argument is used.}
69 |
70 | \item{sortFn}{A sorting function which returns \code{sortVars} as an output.
71 | The function may take the following variables as input: \code{dataFl},
72 | \code{dateNm}, \code{buildTm}, \code{weightNm}, \code{kSample}. Currently,
73 | the only built-in sorting function is \code{\link{OrderByR2}}, which sorts
74 | numerical variables in the order of strength of linear association with date,
75 | and adds categorical (and binary) variables sorted in alphabetical order
76 | after the numerical ones.}
77 |
78 | \item{selectCols}{Either \code{NULL}, or a vector of names or indices of
79 | variables to read into memory -- must include \code{dateNm},
80 | \code{weightNm} (if not \code{NULL}) and all variables to be plotted. If
81 | both \code{selectCols} and \code{dropCols} are \code{NULL}, then all
82 | variables will be read in.}
83 |
84 | \item{dropCols}{Either \code{NULL}, or a vector of variables names or indices
85 | of variables not to read into memory. If both \code{selectCols} and
86 | \code{dropCols} are \code{NULL}, then all variables will be read in.}
87 |
88 | \item{dateFt}{\code{\link{strptime}} format of date variable. The default is SAS
89 | format \code{"\%d\%h\%Y"}. But input data with R date format
90 | \code{"\%Y-\%m-\%d"} will also be detected. Both of two formats can be
91 | parsed automatically.}
92 |
93 | \item{buildTm}{Vector identifying the time period for ranking/anomaly detection
94 | (most likely model build period). Allows for a subset of plotting time
95 | period to be used for anomaly detection.
96 | \itemize{
97 | \item Must be a vector of dates and must be inclusive i.e. buildTm[1]
98 | <= date <= buildTm[2] will define the time period.
99 | \item Must be either \code{NULL}, a vector of length 2, or a vector of
100 | length 3.
101 | \item If \code{NULL}, the entire dataset will be used for
102 | ranking/anomaly detection.
103 | \item If a vector of length 2, the format of the dates must be
104 | a character vector in default R date format (e.g. "2017-01-30").
105 | \item If a vector of length 3, the first two columns must contain dates
106 | in any strptime format, while the 3rd column contains the strptime
107 | format (see \code{\link{strptime}}).
108 | \item The following are equivalent ways of selecting
109 | all of 2014:
110 | \itemize{
111 | \item \code{c("2014-01-01","2014-12-31")}
112 | \item \code{c("01JAN2014","31DEC2014", "\%d\%h\%Y")}
113 | }
114 | }}
115 |
116 | \item{highlightNms}{Either \code{NULL} or a character vector of variables to
117 | receive a red label. Currently \code{NULL} means all variables will get a
118 | black legend. This argument is ignored if \code{labelFl == NULL}.}
119 |
120 | \item{skewOpt}{Either a numeric constant or \code{NULL}. Default is
121 | \code{NULL} (no transformation). If numeric, say 5, then all box plots of
122 | a variable whose skewness exceeds 5 will be on a log10 scale if possible.
123 | Negative input of \code{skewOpt} will be converted to 3.}
124 |
125 | \item{kSample}{Either \code{NULL} or a positive integer. If an integer,
126 | indicates the sample size for both drawing boxplots and ordering numerical
127 | graphs by \eqn{R^2}. When the data is large, setting \code{kSample} to a
128 | reasonable value (default is 50K) dramatically improves processing speed.
129 | Therefore, for larger datasets (e.g. > 10 percent system memory), this
130 | parameter should not be set to \code{NULL}, or boxplots may take a very
131 | long time to render. This setting has no impact on the accuracy of time
132 | series plots on quantiles, mean, SD, and missing and zero rates.}
133 |
134 | \item{fuzzyLabelFn}{Either \code{NULL} or a function of 2 parameters: A label
135 | file in the format of an output by \code{\link{PrepLabels}} and a string
136 | giving a variable name. The function should return the label corresponding
137 | to the variable given by the second parameter. This function should
138 | describe how fuzzy matching should be performed to find labels (see example
139 | below). If \code{NULL}, only exact matches will be returned.}
140 |
141 | \item{dropConstants}{Logical, indicates whether or not constant (all
142 | duplicated or NA) variables should be dropped from \code{dataFl} prior to
143 | plotting.}
144 |
145 | \item{kCategories}{If a categorical variable has more than \code{kCategories},
146 | trace plots of only the \code{kCategories} most prevalent categories are
147 | plotted.}
148 |
149 | \item{...}{Additional parameters to be passed to
150 | \code{\link[data.table]{fread}}.}
151 | }
152 | \description{
153 | Sorts variables according to either user input or correlation with time
154 | (among numerical variables only), and create output files including:
155 | \itemize{
156 | \item A PDF file of plots saved as \code{outFl}.pdf, with each individual page
157 | on one variable. Variables are plotted in the order indicated in the argument
158 | \code{sortVars} or \code{sortFn}.
159 | For each numerical variable, the output plots include
160 | \itemize{
161 | \item side-by-side boxplots grouped by \code{dateGpBp} (left),
162 | \item a trace plot of p1, p50, and p99 percentiles, grouped by \code{dateGp}
163 | (top right),
164 | \item a trace plot of mean and +-1 SD control limits, grouped by
165 | \code{dateGp}(middle right), and
166 | \item a trace plot of missing and zero rates, grouped by \code{dateGp}
167 | (bottom right).
168 | }
169 | For each categorical variable (including a numerical variable with no more
170 | than 2 unique levels not including NA), the output plots include
171 | \itemize{
172 | \item a frequency bar plot (left), and
173 | \item a grid of trace plots on categories' proportions over time (right).
174 | If the variable contains more than \code{kCategories} number of
175 | categories, trace plots of only the largest \code{kCategories} will be
176 | plotted. If the variable contains only two categories, then only the
177 | trace plot of the less prevalent category will be plotted.
178 | }
179 | \item CSV file(s) on summary statistics of variable, both globally and over
180 | time aggregated by \code{dateGp}. The order of variables in the CSV files
181 | are the same as in the PDF file.
182 | \itemize{
183 | \item For numerical variables, number of observations (counts), p1, p25,
184 | p50, p75, and p99 quantiles, mean, SD, missing and zero rates are saved
185 | as \code{outFl}_numerical_summary.csv.
186 | \item For categorical variables, number of observations (counts) and
187 | categories' proportions are saved as \code{outFl}_categorical_summary.csv.
188 | Each row is a category of a categorical (or binary) variable.
189 | The row whose \code{category == 'NA'} corresponds to missing. Categories
190 | among the same variable are ordered by global prevalence in a descending
191 | order.
192 | }
193 | }
194 | }
195 | \details{
196 | If the argument \code{dataNeedPrep} is set to \code{FALSE}, then
197 | \itemize{
198 | \item \code{dataFl} must be a \code{data.table} containing variables
199 | \code{weightNm}, \code{dateNm}, \code{dateGp}, and \code{dateGpBp}, and
200 | names of these variables must be the same as the corresponding arguments
201 | of the \code{\link{vlm}} function.
202 | \item the arguments \code{selectCols}, \code{dropCols}, \code{dateFt},
203 | \code{dropConstants} will be ignored by the \code{\link{vlm}} function.
204 | \item When analyzing a dataset for the first time, it is recommended to first
205 | run the \code{\link{PrepData}} function on it, and then apply the
206 | \code{\link{vlm}} function with the argument \code{dataNeedPrep = FALSE}.
207 | Please see the examples for details.
208 | }
209 | }
210 | \section{License}{
211 | Copyright 2017 Capital One Services, LLC Licensed under the
212 | Apache License, Version 2.0 (the "License"); you may not use this file
213 | except in compliance with the License. You may obtain a copy of the License
214 | at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable
215 | law or agreed to in writing, software distributed under the License is
216 | distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
217 | KIND, either express or implied. See the License for the specific language
218 | governing permissions and limitations under the License.
219 | }
220 |
221 | \examples{
222 | ## Load the data and its label
223 | data(bankData)
224 | data(bankLabels)
225 |
226 | ## The PrepData function should only need to be run once on a dataset,
227 | ## after that vlm can be run with the argument dataNeedPrep = FALSE
228 | bankData <- PrepData(bankData, dateNm = "date", dateGp = "months",
229 | dateGpBp = "quarters")
230 | bankLabels <- PrepLabels(bankLabels)
231 |
232 | \dontrun{
233 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
234 | sortFn = "OrderByR2", dateGp = "months", dateGpBp = "quarters",
235 | outFl = "bank")
236 |
237 | ## If csv files of summary statistics are not needed, set genCSV = FALSE
238 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels, genCSV = FALSE,
239 | sortFn = "OrderByR2", dateGp = "months", dateGpBp = "quarters",
240 | outFl = "bank")
241 |
242 | ## If weights are provided, they will be used in all statistical calculations
243 | bankData[, weight := rnorm(.N, 1, .1)]
244 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
245 | dateGp = "months", dateGpBp = "quarters", weightNm = "weight",
246 | outFl = "bank")
247 |
248 | ## Customize plotting order by passing a vector of variable names to
249 | ## sortVars, but the "date" column must be excluded from sortVars
250 | sortVars <- sort(bankLabels[varCol!="date", varCol])
251 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
252 | dateGp = "months", dateGpBp = "quarters", outFl = "bank",
253 | sortVars = sortVars)
254 |
255 | ## Create plots for a specific variable using the varNms parameter
256 | vlm(dataFl = bankData, dateNm = "date", labelFl = bankLabels,
257 | dateGp = "months", dateGpBp = "quarters", outFl = "bank",
258 | varNms = "age", sortVars = NULL)
259 | }
260 | }
261 | \seealso{
262 | This function depends on:
263 | \code{\link{PrintPlots}},
264 | \code{\link{OrderByR2}},
265 | \code{\link{PrepData}},
266 | \code{\link{PrepLabels}}.
267 | }
268 |
--------------------------------------------------------------------------------
/R/categorical.R:
--------------------------------------------------------------------------------
1 | # SPDX-Copyright: Copyright (c) Capital One Services, LLC
2 | # SPDX-License-Identifier: Apache-2.0
3 | # Copyright 2017 Capital One Services, LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | #
8 | # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software distributed
11 | # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
12 | # OF ANY KIND, either express or implied.
13 | #
14 | # See the License for the specific language governing permissions and limitations under the License.
15 |
16 |
17 | ###########################################
18 | # Plots for Categorical Data #
19 | ###########################################
20 | #' Create plots and summary statistics for a categorical variable
21 | #'
22 | #' Output plots include a bar plot with categories ordered by global counts,
23 | #' and trace plots of categories' proportions over time. This function is also
24 | #' applicable to a binary variable, which is treated as categorical in this
25 | #' package. In addition to plots, a \code{data.table} of summary statistics
26 | #' is generated, on global counts and proportions by category, and proportions
27 | #' by category over time.
28 | #'
29 | #' @inheritParams PrepData
30 | #' @param dataFl A \code{data.table} of data; must be the output of the
31 | #' \code{\link{PrepData}} function.
32 | #' @param myVar The name of the variable to be plotted
33 | #' @param kCategories If a categorical variable has more than \code{kCategories},
34 | #' trace plots of only the \code{kCategories} most prevalent categories are
35 | #' plotted.
36 | #' @param normBy The normalization factor for rate plots, can be \code{"time"}
37 | #' or \code{"var"}. If \code{"time"}, then for each time period of
38 | #' \code{dateGp}, counts are normalized by the total counts over all
39 | #' categories in that time period. This illustrates changes of categories'
40 | #' proportions over time. If \code{"var"}, then for each category, its counts
41 | #' are normalized by the total counts over time from only this category. This
42 | #' illustrates changes of categories' volumes over time.
43 | #' @export
44 | #' @return
45 | #' \item{p}{A \code{grob} (i.e., \code{ggplot} grid) object, including a
46 | #' bar plot, and trace plots of categories' proportions. If the number of
47 | #' categories is larger than \code{kCategories}, then trace plots of only the
48 | #' \code{kCategories} most prevalent categories are plotted. For a binary
49 | #' variable, only the trace plot of the less prevalent category is plotted.}
50 | #' \item{catVarSummary}{A \code{data.table}, contains categories' proportions
51 | #' globally, and over-time in each time period in \code{dateGp}. Each row is
52 | #' a category of the categorical (or binary) variable \code{myVar}. The row
53 | #' whose \code{category == 'NA'} corresponds to missing. Categories are
54 | #' ordered by global prevalence in a descending order.}
55 | #'
56 | #' @seealso Functions depend on this function:
57 | #' \code{\link{PlotVar}},
58 | #' \code{\link{PrintPlots}},
59 | #' \code{\link{vlm}}.
60 | #' @seealso This function depends on:
61 | #' \code{\link{PlotBarplot}},
62 | #' \code{\link{PlotRatesOverTime}},
63 | #' \code{\link{PrepData}}.
64 | #'
65 | #' @section License:
66 | #' Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
67 | #' Version 2.0 (the "License"); you may not use this file except in compliance
68 | #' with the License. You may obtain a copy of the License at
69 | #' http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
70 | #' or agreed to in writing, software distributed under the License is
71 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
72 | #' KIND, either express or implied. See the License for the specific language
73 | #' governing permissions and limitations under the License.
74 | #' @examples
75 | #' data(bankData)
76 | #' bankData <- PrepData(bankData, dateNm = "date", dateGp = "months",
77 | #' dateGpBp = "quarters", weightNm = NULL)
78 | #' # Single histogram is plotted for job type since there are 12 categories
79 | #' plot(PlotCatVar(myVar = "job", dataFl = bankData, weightNm = NULL,
80 | #' dateNm = "date", dateGp = "months")$p)
81 | #'
82 | #' plot(PlotCatVar(myVar = "job", dataFl = bankData, weightNm = NULL,
83 | #' dateNm = "date", dateGp = "months", kCategories = 12)$p)
84 | #'
85 | #'
86 | #' ## Binary data is treated as categorical, and only the less frequent
87 | #' ## category is plotted over time.
88 | #' plot(PlotCatVar(myVar = "default", dataFl = bankData, weightNm = NULL,
89 | #' dateNm = "date", dateGp = "months")$p)
90 |
PlotCatVar <- function(myVar, dataFl, weightNm = NULL, dateNm, dateGp,
                       kCategories = 9, normBy = "time") { #!# previous name: PlotDiscreteVar
  ## Combines the two categorical views of `myVar` into one grob: a frequency
  ## bar plot on the left and the per-category rate trace plots on the right
  ## (the right panel is given twice the width of the left one).
  ## Returns list(p = <grob>, catVarSummary = <summary from PlotRatesOverTime>).
  ## `dateNm` is accepted for interface compatibility but is not used here.

  ## `count` is a column of the bar plot's data; bind it locally so that
  ## R CMD check does not report it as an undefined global.
  count <- NULL

  ## Left panel: global frequency of every category
  barPlot <- PlotBarplot(dataFl = dataFl, myVar = myVar, weightNm = weightNm)

  ## Category names in descending order of global count, read back from the
  ## frequency table stored in the bar plot
  freqTable <- barPlot$data
  newLevels <- as.character(freqTable[order(-count)][[myVar]])

  ## Right panel: rate of each category over time, plus the summary table
  ratesOverTime <- PlotRatesOverTime(dataFl = dataFl, dateGp = dateGp,
                                     weightNm = weightNm, myVar = myVar,
                                     newLevels = newLevels, normBy = normBy,
                                     kCategories = kCategories)

  combined <- gridExtra::arrangeGrob(ggplot2::ggplotGrob(barPlot),
                                     ratesOverTime$p,
                                     widths = c(1, 2))

  return(list(p = combined, catVarSummary = ratesOverTime$catVarSummary))
}
106 |
107 | ###########################################
108 | # Discrete Plotting Functions #
109 | ###########################################
110 | #' Creates a bar plot for a discrete (or binary) variable
111 | #'
112 | #' @inheritParams PlotCatVar
113 | #' @export
114 | #' @return A \code{ggplot} object with a histogram of \code{myVar} ordered by
115 | #' category frequency
116 | #'
117 | #' @seealso Functions depend on this function:
118 | #' \code{\link{PlotCatVar}}.
119 | #' @seealso This function depends on:
120 | #' \code{\link{PrepData}}.
121 | #'
122 | #' @section License:
123 | #' Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
124 | #' Version 2.0 (the "License"); you may not use this file except in compliance
125 | #' with the License. You may obtain a copy of the License at
126 | #' http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
127 | #' or agreed to in writing, software distributed under the License is
128 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
129 | #' KIND, either express or implied. See the License for the specific language
130 | #' governing permissions and limitations under the License.
131 | #' @examples
132 | #' data(bankData)
133 | #' bankData = PrepData(bankData, dateNm = "date", dateGp = "months",
134 | #' dateGpBp = "quarters", weightNm = NULL)
135 | #' PlotBarplot(bankData, "job")
136 | #'
137 | #' ## NA will be included as a category if any NA are present
138 | #' bankData[sample.int(.N)[1:1000], education := NA]
139 | #' PlotBarplot(bankData, "education")
140 |
PlotBarplot <- function(dataFl, myVar, weightNm = NULL) { #!# previous name: PlotHistogram
  ## Draws a bar plot of the global frequency of each category of `myVar`,
  ## with categories ordered by descending count. Counts are row counts, or
  ## sums of the `weightNm` column when a weight is given. Missing values and
  ## empty strings are shown as a separate category named "NA".
  ## Returns a ggplot object whose data is the frequency table.

  ## `count` is a column created below; bind it locally so that R CMD check
  ## does not report it as an undefined global.
  count <- NULL

  ## A subset dataset to work on
  dataSub <- dataFl[, c(myVar, weightNm), with = FALSE]
  ## Store the categories as character first: assigning the string "NA" into
  ## a numeric column (e.g. a 0/1 binary variable) would be coerced back to a
  ## missing value, and the missing category would be lost.
  dataSub[, (myVar) := as.character(get(myVar))]
  ## NA is converted to a character, i.e., treated as a new category
  dataSub[is.na(get(myVar)) | get(myVar) == "", (myVar) := "NA"]

  ## Create glbTotals, a frequency table of myVar
  if (is.null(weightNm)) {
    glbTotals <- dataSub[, list(count = .N), by = myVar]
  } else {
    glbTotals <- dataSub[, list(count = sum(get(weightNm))), by = myVar]
  }

  ## Create newLevels, a vector of category names, in descending order of counts
  newLevels <- unlist(glbTotals[order(-count), myVar, with = FALSE])
  glbTotals[, (myVar) := factor(get(myVar), levels = newLevels)]

  ## Axis labels are shortened with abbreviate() to keep long names readable
  p <- ggplot2::ggplot(glbTotals, ggplot2::aes_string(x = myVar,
                                                      y = "count",
                                                      group = myVar)) +
    ggplot2::geom_bar(stat = "identity") +
    ggplot2::scale_x_discrete(labels = abbreviate, breaks = newLevels) +
    ggplot2::theme(text = ggplot2::element_text(size = 10))
  return(p)
}
169 |
170 |
171 | #' Creates trace plots of categories' proportions over time for a discrete (or
172 | #' binary) variable
173 | #'
174 | #' @inheritParams PlotCatVar
175 | #' @param newLevels categories of \code{myVar} in order of global frequency
176 | #' @export
177 | #' @return A list:
178 | #' \item{p}{\code{ggplot} object, trace plots of categories' proportions
179 | #' \code{myVar} over time.}
180 | #' \item{catVarSummary}{A \code{data.table}, contains categories' proportions
181 | #' globally, and over-time in each time period in \code{dateGp}. Each row is
182 | #' a category of the categorical (or binary) variable \code{myVar}. The row
183 | #' whose \code{category == 'NA'} corresponds to missing. Categories are
184 | #' ordered by global prevalence in a descending order.}
185 | #'
186 | #' @seealso Functions depend on this function:
187 | #' \code{\link{PlotCatVar}}.
188 | #' @seealso This function depends on:
189 | #' \code{\link{PrepData}}.
190 | #'
191 | #' @section License:
192 | #' Copyright 2017 Capital One Services, LLC Licensed under the Apache License,
193 | #' Version 2.0 (the "License"); you may not use this file except in compliance
194 | #' with the License. You may obtain a copy of the License at
195 | #' http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
196 | #' or agreed to in writing, software distributed under the License is
197 | #' distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
198 | #' KIND, either express or implied. See the License for the specific language
199 | #' governing permissions and limitations under the License.
200 | #' @examples
201 | #' data(bankData)
202 | #' bankData$weight = rpois(nrow(bankData), 5)
203 | #' bankData <- PrepData(bankData, dateNm = "date", dateGp = "months",
204 | #' dateGpBp = "quarters", weightNm = "weight")
205 | #' PlotRatesOverTime(dataFl = bankData, dateGp = "months", weightNm = "weight",
206 | #' myVar = "job", newLevels = NULL, normBy = "time")
207 | #'
PlotRatesOverTime <- function(dataFl, dateGp, myVar, normBy = "time",
                              weightNm = NULL, newLevels = NULL,
                              kCategories = 9) { #!# previous name: PlotHistOverTime
  ## Computes, for every category of `myVar` and every period of `dateGp`,
  ## the (optionally weighted) count and its rate, and draws one trace plot
  ## of the rate per category.
  ##   normBy = "time": counts are divided by the total of the time period.
  ##   normBy = "var" : counts are divided by the total of the category.
  ## Returns list(p = <ggplot>, catVarSummary = <wide table with one row per
  ## category: variable, category, global_count, global_rate and one rate
  ## column per period>).

  ## Names used as data.table columns below; bound locally so that R CMD
  ## check does not report them as undefined globals.
  N.x <- N.y <- rate <- N <- count <- NULL
  global_count <- global_rate <- variable <- NULL

  ## Reject an unsupported normBy up front; otherwise the function would fail
  ## further down with an obscure "object 'countBy' not found" error.
  if (!(is.character(normBy) && length(normBy) == 1L &&
        normBy %in% c("time", "var"))) {
    stop("normBy must be either \"time\" or \"var\"")
  }
  ## Column whose totals are the denominator of the rates
  normCol <- if (normBy == "time") dateGp else myVar

  ## A subset dataset to work on
  dataSub <- dataFl[, c(dateGp, myVar, weightNm), with = FALSE]
  ## Store the categories as character first: assigning the string "NA" into
  ## a numeric column (e.g. a 0/1 binary variable) would be coerced back to a
  ## missing value, and the missing category would be lost.
  dataSub[, (myVar) := as.character(get(myVar))]
  ## NA is converted to a character, i.e., treated as a new category
  dataSub[is.na(get(myVar)) | get(myVar) == "", (myVar) := "NA"]

  ## Create glbTotals, a frequency table of myVar
  ## Create newLevels, a vector of category names, in descending order of counts
  if (is.null(newLevels)) {
    if (is.null(weightNm)) {
      glbTotals <- dataSub[, list(count = .N), by = myVar]
    } else {
      glbTotals <- dataSub[, list(count = sum(get(weightNm))), by = myVar]
    }
    newLevels <- glbTotals[order(-count)][[myVar]]
  }

  ## Compute counts by category and time (numerator) and the totals used for
  ## normalization (denominator)
  if (is.null(weightNm)) {
    countData <- dataSub[, .N, by = c(myVar, dateGp)]
    countBy <- dataSub[, .N, by = c(normCol)]
  } else {
    countData <- dataSub[, list(N = sum(get(weightNm))), by = c(myVar, dateGp)]
    countBy <- dataSub[, list(N = sum(get(weightNm))), by = c(normCol)]
  }

  ## Make sure countData contains all categories and all times: combinations
  ## that never occur get a count of zero
  crossLevels <- CJ(unique(countData[[dateGp]]), unique(countData[[myVar]]))
  ## Rename by position so the result does not depend on how CJ names its
  ## output columns
  setnames(crossLevels, c(dateGp, myVar))
  countData <- merge(crossLevels, countData, all.x = TRUE,
                     by = c(dateGp, myVar))
  countData[is.na(N), N := 0]
  countData[, (myVar) := factor(get(myVar), levels = newLevels)]

  ## Combine countData (numerator) and countBy (denominator) as rateBy.
  ## Both tables have a column N, which merge() renames to N.x and N.y.
  rateBy <- merge(countData, countBy, by = normCol)

  ## Compute the rates:
  ## N.x is the count of the category in the period, N.y is the total count
  ## it is normalized by
  rateBy[, rate := N.x / N.y]
  rateBy[, (myVar) := factor(get(myVar), levels = newLevels)]

  ## Compute summary statistics in a wide format
  cbytime <- copy(rateBy)
  names(cbytime)[names(cbytime) == myVar] <- "category"
  names(cbytime)[names(cbytime) == dateGp] <- "date_group"
  ## Compute global counts and rates
  cglobal <- cbytime[, list(global_count = sum(N.x)), by = "category"]
  cglobal[, global_rate := global_count / sum(global_count)]
  ## Change cbytime into the wide format
  cbytime <- dcast(cbytime[, c("date_group", "category", "rate")],
                   category ~ date_group, value.var = "rate")
  ## Combine cglobal into cbytime
  cbytime <- merge(cglobal, cbytime, by = "category")
  ## Add a column: variable, and move it to the first position
  cbytime[, variable := myVar]
  setcolorder(cbytime, c(ncol(cbytime), seq_len(ncol(cbytime) - 1)))
  ## Add a row of NA being all zero, if no missing
  if (!("NA" %in% cbytime$category)) {
    cbytime <- rbind(cbytime, as.list(rep(NA, ncol(cbytime))))
    cbytime[nrow(cbytime), 1:2] <- list(myVar, "NA")
    cbytime[nrow(cbytime), 3:(ncol(cbytime))] <- 0
  }

  ## Plot less frequent category only for a binary variable.
  ## This helps when there is a large class imbalance, because the range of
  ## y-axis for all trace plots is the same.
  if (length(newLevels) == 2) {
    rateBy <- rateBy[get(myVar) == newLevels[2]]
  }

  ## Keep only the kCategories most prevalent categories when there are more
  if (length(newLevels) <= kCategories) {
    plotData <- rateBy
  } else {
    plotData <- rateBy[get(myVar) %in% newLevels[seq_len(kCategories)]]
  }

  p <- ggplot2::ggplot(plotData,
                       ggplot2::aes_string(x = dateGp, y = "rate")) +
    ggplot2::geom_line(stat = "identity") +
    ggplot2::facet_wrap(stats::as.formula(paste("~", myVar))) +
    ggplot2::ylab("") +
    ggplot2::scale_x_date() +
    ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 30,
                                                       hjust = 1)) +
    ggplot2::scale_y_continuous(labels = scales::percent)

  return(list(p = p, catVarSummary = cbytime))
}
325 |
--------------------------------------------------------------------------------