├── vignettes ├── .gitignore └── intro.Rmd ├── LICENSE ├── R ├── .DS_Store ├── sysdata.rda ├── grizbayr-package.R ├── sysdata.R ├── calculate_total_cm.R ├── is_winner_max.R ├── validate_data_values.R ├── calculate_multi_rev_per_session.R ├── validate_posterior_samples.R ├── validate_wrt_option.R ├── is_prior_valid.R ├── validate_priors.R ├── impute_missing_options.R ├── find_best_option.R ├── estimate_win_prob.R ├── rdirichlet.R ├── sample_response_rate.R ├── update_beta.R ├── estimate_win_prob_vs_baseline.R ├── estimate_win_prob_vs_baseline_given_posterior.R ├── sample_ctr.R ├── validate_input_column.R ├── sample_cpc.R ├── validate_input_df.R ├── sample_conv_rate.R ├── estimate_win_prob_given_posterior.R ├── estimate_lift_vs_baseline.R ├── sample_session_duration.R ├── update_gamma.R ├── update_dirichlet.R ├── estimate_value_remaining.R ├── sample_page_views_per_session.R ├── estimate_loss.R ├── estimate_lift.R ├── sample_from_posterior.R ├── sample_cpa.R ├── sample_rev_per_session.R ├── sample_cm_per_click.R ├── estimate_all_values.R ├── sample_multi_rev_per_session.R └── sample_total_cm.R ├── tests ├── testthat.R ├── spelling.R └── testthat │ ├── test-validate_posterior_samples.R │ ├── test-is_winner_max.R │ ├── test-validate_wrt_option.R │ ├── test-sample_cpc.R │ ├── test-calculate_total_cm.R │ ├── test-validate_priors.R │ ├── test-sample_conv_rate.R │ ├── test-sample_session_duration.R │ ├── test-calculate_multi_rev_per_session.R │ ├── test-sample_cpa.R │ ├── test-sample_ctr.R │ ├── test-sample_page_views_per_session.R │ ├── test-sample_response_rate.R │ ├── test-estimate_all_values.R │ ├── test-sample_rev_per_session.R │ ├── test-sample_cm_per_click.R │ ├── test-sample_total_cm.R │ ├── test-estimate_value_remaining.R │ ├── test-sample_multi_rev_per_session.R │ ├── test-update_beta.R │ ├── test-update_dirichlet.R │ ├── test-validate_data_values.R │ ├── test-estimate_lift_vs_baseline.R │ ├── test-estimate_win_prob_vs_baseline.R │ ├── test-rdirichlet.R │ ├── test-validate_input_df.R │ ├── test-estimate_win_prob.R │ ├── test-find_best_option.R │ ├── test-impute_missing_options.R │ ├── test-estimate_win_prob_vs_baseline_given_posterior.R │ ├── test-is_prior_valid.R │ ├── test-update_gamma.R │ ├── test-validate_input_column.R │ ├── test-estimate_win_prob_given_posterior.R │ ├── test-sample_from_posterior.R │ ├── test-estimate_loss.R │ └── test-estimate_lift.R ├── CRAN-SUBMISSION ├── .Rbuildignore ├── inst └── WORDLIST ├── man ├── validate_data_values.Rd ├── is_winner_max.Rd ├── validate_posterior_samples.Rd ├── is_prior_valid.Rd ├── calculate_total_cm.Rd ├── impute_missing_options.Rd ├── validate_wrt_option.Rd ├── validate_priors.Rd ├── calculate_multi_rev_per_session.Rd ├── rdirichlet.Rd ├── validate_input_column.Rd ├── update_beta.Rd ├── find_best_option.Rd ├── grizbayr-package.Rd ├── validate_input_df.Rd ├── estimate_win_prob.Rd ├── sample_cpc.Rd ├── sample_conv_rate.Rd ├── estimate_win_prob_given_posterior.Rd ├── sample_from_posterior.Rd ├── estimate_loss.Rd ├── update_gamma.Rd ├── sample_response_rate.Rd ├── estimate_win_prob_vs_baseline_given_posterior.Rd ├── sample_ctr.Rd ├── estimate_lift.Rd ├── estimate_win_prob_vs_baseline.Rd ├── update_dirichlet.Rd ├── sample_session_duration.Rd ├── sample_cm_per_click.Rd ├── sample_multi_rev_per_session.Rd ├── sample_page_views_per_session.Rd ├── estimate_lift_vs_baseline.Rd ├── sample_cpa.Rd ├── estimate_value_remaining.Rd ├── sample_total_cm.Rd ├── sample_rev_per_session.Rd └── estimate_all_values.Rd ├── grizbayr.Rproj ├── .gitignore ├── LICENSE.md ├── DESCRIPTION ├── NEWS.md ├── NAMESPACE └── README.md /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2020 2 | COPYRIGHT HOLDER: Ryan Angi 3 | -------------------------------------------------------------------------------- /R/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-angi/grizbayr/HEAD/R/.DS_Store -------------------------------------------------------------------------------- /R/sysdata.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-angi/grizbayr/HEAD/R/sysdata.rda -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(grizbayr) 3 | 4 | test_check("grizbayr") 5 | -------------------------------------------------------------------------------- /CRAN-SUBMISSION: -------------------------------------------------------------------------------- 1 | Version: 1.3.3 2 | Date: 2022-12-22 19:55:22 UTC 3 | SHA: f4e8ecd3b054d1ef13bff84d0e4752527f11057e 4 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.git$ 4 | ^LICENSE\.md$ 5 | ^doc$ 6 | ^Meta$ 7 | ^CRAN-RELEASE$ 8 | ^cran-comments\.md$ 9 | ^CRAN-SUBMISSION$ 10 | -------------------------------------------------------------------------------- /tests/spelling.R: -------------------------------------------------------------------------------- 1 | if(requireNamespace('spelling', quietly = TRUE)) 2 | spelling::spell_check_test(vignettes = TRUE, error = FALSE, 3 | skip_on_cran = TRUE) 4 | -------------------------------------------------------------------------------- /R/grizbayr-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | ## usethis namespace: start 5 | #' @importFrom dplyr %>% 6 | #' @importFrom rlang .data 7 | ## usethis namespace: end 8 | NULL 9 | -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | Bool 2 | Bugfix 3 | CPC 4 | DataFrame 5 | KaTeX 6 | README 7 | Rmd 8 | Stucchio 9 | Tibble 10 | VWO 11 | bayesr 12 | cdot 13 | chr 14 | coercian 15 | conv 16 | cpc 17 | df 18 | dirichlet 19 | dplyr 20 | dplyr's 21 | griz 22 | params 23 | roxygen 24 | str 25 | tibble 26 | tidyr 27 | wp 28 | -------------------------------------------------------------------------------- /tests/testthat/test-validate_posterior_samples.R: -------------------------------------------------------------------------------- 1 | context("Validate Posterior Samples") 2 | 3 | test_that("validate_posterior_samples fails if the input is not a dataframe", { 4 | input_list <- list(option_name = c("A", "B", "C"), samples = c(0,1,2)) 5 | expect_error({ 6 | validate_posterior_samples(posterior_samples = input_list) 7 | }) 8 | }) 9 | -------------------------------------------------------------------------------- /R/sysdata.R: -------------------------------------------------------------------------------- 1 | #' A Mapping from distribution names (inputs) to columns. This allows the 2 | #' package to dynamically select column names for different distribution types. 3 | #' 4 | #' @docType data 5 | #' @name distribution_column_mapping 6 | #' @usage distribution_column_mapping 7 | #' @format A data frame with 11 rows and 12 columns 8 | #' 9 | #' "distribution_column_mapping" 10 | 11 | -------------------------------------------------------------------------------- /tests/testthat/test-is_winner_max.R: -------------------------------------------------------------------------------- 1 | context("Is Winner Max") 2 | 3 | test_that("is_winner_max returns FALSE when CPC or CPA is input", { 4 | expect_false(is_winner_max("cpc")) 5 | expect_false(is_winner_max("cpa")) 6 | }) 7 | 8 | test_that("is_winner_max returns TRUE when anything else is input", { 9 | expect_true(is_winner_max("conversion_rate")) 10 | expect_true(is_winner_max("something_random")) 11 | }) 12 | -------------------------------------------------------------------------------- /man/validate_data_values.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validate_data_values.R 3 | \name{validate_data_values} 4 | \alias{validate_data_values} 5 | \title{Validate Data Values} 6 | \usage{ 7 | validate_data_values(data_values) 8 | } 9 | \arguments{ 10 | \item{data_values}{List of named data values} 11 | } 12 | \value{ 13 | None 14 | } 15 | \description{ 16 | Validates data values are all greater than 0. 17 | } 18 | -------------------------------------------------------------------------------- /R/calculate_total_cm.R: -------------------------------------------------------------------------------- 1 | #' Calculate Total CM 2 | #' 3 | #' @param rev_per_click vector of rev per click samples 4 | #' @param cost_per_click vector of cost per click (cpc) samples 5 | #' @param expected_clicks vector of expected clicks (expected CTR * fixed impressions) 6 | #' 7 | #' @return vector of CM estimates (dbl) 8 | #' 9 | calculate_total_cm <- function(rev_per_click, cost_per_click, expected_clicks){ 10 | (rev_per_click - cost_per_click) * expected_clicks 11 | } 12 | -------------------------------------------------------------------------------- /R/is_winner_max.R: -------------------------------------------------------------------------------- 1 | #' Is Winner Max 2 | #' 3 | #' Determines if the max or min function should be used for win probability. 4 | #' If CPA or CPC distribution, lower is better, else higher number is better. 5 | #' 6 | #' @param distribution String: the name of the distribution 7 | #' 8 | #' @return Boolean TRUE/FALSE 9 | #' 10 | is_winner_max <- function(distribution){ 11 | switch(distribution, 12 | "cpc" = , 13 | "cpa" = FALSE, 14 | TRUE) 15 | } 16 | -------------------------------------------------------------------------------- /grizbayr.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageCheckArgs: --as-cran 22 | -------------------------------------------------------------------------------- /man/is_winner_max.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/is_winner_max.R 3 | \name{is_winner_max} 4 | \alias{is_winner_max} 5 | \title{Is Winner Max} 6 | \usage{ 7 | is_winner_max(distribution) 8 | } 9 | \arguments{ 10 | \item{distribution}{String: the name of the distribution} 11 | } 12 | \value{ 13 | Boolean TRUE/FALSE 14 | } 15 | \description{ 16 | Determines if the max or min function should be used for win probability. 17 | If CPA or CPC distribution, lower is better, else higher number is better. 18 | } 19 | -------------------------------------------------------------------------------- /R/validate_data_values.R: -------------------------------------------------------------------------------- 1 | #' Validate Data Values 2 | #' 3 | #' Validates data values are all greater than 0. 4 | #' 5 | #' @param data_values List of named data values 6 | #' 7 | #' @return None 8 | #' 9 | #' @importFrom purrr walk 10 | #' 11 | validate_data_values <- function(data_values){ 12 | if (length(data_values) == 0) { 13 | stop("No Data Values available to validate.") 14 | } 15 | purrr::walk(data_values, ~ if (.x < 0) { 16 | stop(paste(names(which(data_values == .x)), 17 | "is less than 0. Cannot update distribution.")) 18 | }) 19 | } 20 | -------------------------------------------------------------------------------- /man/validate_posterior_samples.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validate_posterior_samples.R 3 | \name{validate_posterior_samples} 4 | \alias{validate_posterior_samples} 5 | \title{Validate Posterior Samples Dataframe} 6 | \usage{ 7 | validate_posterior_samples(posterior_samples) 8 | } 9 | \arguments{ 10 | \item{posterior_samples}{Tibble of data in long form with 2 columns 11 | `option_name` and `samples`} 12 | } 13 | \value{ 14 | None 15 | } 16 | \description{ 17 | Function fails if posterior is not shaped correctly. 18 | } 19 | -------------------------------------------------------------------------------- /R/calculate_multi_rev_per_session.R: -------------------------------------------------------------------------------- 1 | #' Calculate Multi Rev Per Session 2 | #' 3 | #' @param conv_rates Dirichlet samples containing a tibble with columns alpha_1, alpha_2, and alpha_0 4 | #' @param inverse_rev_A Vector of inverse revenue samples from A conversion type 5 | #' @param inverse_rev_B Vector of inverse revenue samples from B conversion type 6 | #' 7 | #' @return Vector of samples (dbl) 8 | #' 9 | calculate_multi_rev_per_session <- function(conv_rates, inverse_rev_A, inverse_rev_B){ 10 | (conv_rates$alpha_1 / inverse_rev_A) + (conv_rates$alpha_2 / inverse_rev_B) 11 | } 12 | -------------------------------------------------------------------------------- /man/is_prior_valid.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/is_prior_valid.R 3 | \name{is_prior_valid} 4 | \alias{is_prior_valid} 5 | \title{Is Prior Valid} 6 | \usage{ 7 | is_prior_valid(priors_list, valid_prior) 8 | } 9 | \arguments{ 10 | \item{priors_list}{A list of valid priors} 11 | 12 | \item{valid_prior}{A character string} 13 | } 14 | \value{ 15 | Boolean (TRUE/FALSE) 16 | } 17 | \description{ 18 | Checks if a single valid prior name is in the list of prior values and if 19 | that prior value from the list is greater than 0. 20 | } 21 | -------------------------------------------------------------------------------- /tests/testthat/test-validate_wrt_option.R: -------------------------------------------------------------------------------- 1 | context("Validate With Respect To Option") 2 | 3 | test_that("validate_wrt_option fails when not in option name", { 4 | post_samples_example <- tibble::tibble(option_name = c("X", "Y", "Z")) 5 | expect_error({ 6 | validate_wrt_option(wrt_option = "B", posterior_samples = post_samples_example) 7 | }) 8 | }) 9 | 10 | 11 | test_that("validate_wrt_option passes when in option name", { 12 | post_samples_example <- tibble::tibble(option_name = c("A", "B", "C")) 13 | expect_invisible({ 14 | validate_wrt_option(wrt_option = "B", posterior_samples = post_samples_example) 15 | }) 16 | }) 17 | -------------------------------------------------------------------------------- /R/validate_posterior_samples.R: -------------------------------------------------------------------------------- 1 | #' Validate Posterior Samples Dataframe 2 | #' 3 | #' Function fails if posterior is not shaped correctly. 4 | #' 5 | #' @param posterior_samples Tibble of data in long form with 2 columns 6 | #' `option_name` and `samples` 7 | #' 8 | #' @return None 9 | #' 10 | validate_posterior_samples <- function(posterior_samples){ 11 | if(!is.data.frame(posterior_samples)){ 12 | stop("Posterior samples input is not a dataframe object.") 13 | } 14 | required_columns <- c("option_name", "samples", "sample_id") 15 | purrr::walk(required_columns, ~validate_input_column(.x, posterior_samples, greater_than_zero = FALSE)) 16 | } 17 | -------------------------------------------------------------------------------- /man/calculate_total_cm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculate_total_cm.R 3 | \name{calculate_total_cm} 4 | \alias{calculate_total_cm} 5 | \title{Calculate Total CM} 6 | \usage{ 7 | calculate_total_cm(rev_per_click, cost_per_click, expected_clicks) 8 | } 9 | \arguments{ 10 | \item{rev_per_click}{vector of rev per click samples} 11 | 12 | \item{cost_per_click}{vector of cost per click (cpc) samples} 13 | 14 | \item{expected_clicks}{vector of expected clicks (expected CTR * fixed impressions)} 15 | } 16 | \value{ 17 | vector of CM estimates (dbl) 18 | } 19 | \description{ 20 | Calculate Total CM 21 | } 22 | -------------------------------------------------------------------------------- /R/validate_wrt_option.R: -------------------------------------------------------------------------------- 1 | #' Validate With Respect To Option 2 | #' 3 | #' Verify that the option provided is in the poster_samples dataframe `option_name` column. 4 | #' Raises error if not TRUE 5 | #' 6 | #' @param wrt_option string name of the option 7 | #' @param posterior_samples Tibble returned from sample_from_posterior with 3 columns 8 | #' `option_name`, `samples`, and `sample_id`. 9 | #' 10 | #' @return None 11 | validate_wrt_option <- function(wrt_option, posterior_samples){ 12 | if(!(wrt_option %in% posterior_samples$option_name)){ 13 | stop(paste(wrt_option, "is an invalid wrt_option. Not one of in the posterior_samples options.")) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /man/impute_missing_options.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/impute_missing_options.R 3 | \name{impute_missing_options} 4 | \alias{impute_missing_options} 5 | \title{Impute Missing Options} 6 | \usage{ 7 | impute_missing_options(posterior_samples, wp_raw) 8 | } 9 | \arguments{ 10 | \item{posterior_samples}{Tibble of data in long form with 2 columns 11 | `option_name` and `samples`} 12 | 13 | \item{wp_raw}{Tibble of win probabilities with the columns: 14 | `option_name` and `win_prob_raw`} 15 | } 16 | \value{ 17 | wp_raw table with new rows if option names were missing. 18 | } 19 | \description{ 20 | When win probability is calculated 21 | } 22 | -------------------------------------------------------------------------------- /man/validate_wrt_option.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validate_wrt_option.R 3 | \name{validate_wrt_option} 4 | \alias{validate_wrt_option} 5 | \title{Validate With Respect To Option} 6 | \usage{ 7 | validate_wrt_option(wrt_option, posterior_samples) 8 | } 9 | \arguments{ 10 | \item{wrt_option}{string name of the option} 11 | 12 | \item{posterior_samples}{Tibble returned from sample_from_posterior with 3 columns 13 | `option_name`, `samples`, and `sample_id`.} 14 | } 15 | \value{ 16 | None 17 | } 18 | \description{ 19 | Verify that the option provided is in the poster_samples dataframe `option_name` column. 20 | Raises error if not TRUE 21 | } 22 | -------------------------------------------------------------------------------- /man/validate_priors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validate_priors.R 3 | \name{validate_priors} 4 | \alias{validate_priors} 5 | \title{Validate Priors} 6 | \usage{ 7 | validate_priors(priors, valid_priors, default_priors) 8 | } 9 | \arguments{ 10 | \item{priors}{List of named priors with double values.} 11 | 12 | \item{valid_priors}{A character vector of valid prior names.} 13 | 14 | \item{default_priors}{A list of default priors for the distribution.} 15 | } 16 | \value{ 17 | A named list of valid priors for the distribution. 18 | } 19 | \description{ 20 | Validates list of priors against a vector of valid priors and if 21 | the values are not valid, default priors are returned. 22 | } 23 | -------------------------------------------------------------------------------- /R/is_prior_valid.R: -------------------------------------------------------------------------------- 1 | #' Is Prior Valid 2 | #' 3 | #' Checks if a single valid prior name is in the list of prior values and if 4 | #' that prior value from the list is greater than 0. 5 | #' 6 | #' @param priors_list A list of valid priors 7 | #' @param valid_prior A character string 8 | #' 9 | #' @return Boolean (TRUE/FALSE) 10 | #' 11 | 12 | is_prior_valid <- function(priors_list, valid_prior) { 13 | if (length(priors_list) == 0) { 14 | return(FALSE) 15 | } 16 | if (!valid_prior %in% names(priors_list)) { 17 | warning(paste(valid_prior, "is not in priors list.")) 18 | return(FALSE) 19 | } 20 | if (priors_list[[valid_prior]] <= 0) { 21 | warning(paste(valid_prior, "prior is not greater than 0.")) 22 | return(FALSE) 23 | } 24 | TRUE 25 | } 26 | -------------------------------------------------------------------------------- /man/calculate_multi_rev_per_session.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculate_multi_rev_per_session.R 3 | \name{calculate_multi_rev_per_session} 4 | \alias{calculate_multi_rev_per_session} 5 | \title{Calculate Multi Rev Per Session} 6 | \usage{ 7 | calculate_multi_rev_per_session(conv_rates, inverse_rev_A, inverse_rev_B) 8 | } 9 | \arguments{ 10 | \item{conv_rates}{Dirichlet samples containing a tibble with columns alpha_1, alpha_2, and alpha_0} 11 | 12 | \item{inverse_rev_A}{Vector of inverse revenue samples from A conversion type} 13 | 14 | \item{inverse_rev_B}{Vector of inverse revenue samples from B conversion type} 15 | } 16 | \value{ 17 | Vector of samples (dbl) 18 | } 19 | \description{ 20 | Calculate Multi Rev Per Session 21 | } 22 | -------------------------------------------------------------------------------- /tests/testthat/test-sample_cpc.R: -------------------------------------------------------------------------------- 1 | context("Sample CPC") 2 | 3 | test_that("sample_cpc returns correct shape", { 4 | input_df <- tibble::tibble( 5 | option_name = c("A", "B", "C"), 6 | sum_clicks = c(1000, 1000, 1000), 7 | sum_cost = c(10, 50, 30), 8 | ) 9 | n_options <- length(unique(input_df$option_name)) 10 | n_samples <- 150 11 | expected_col_names <- c(colnames(input_df), "gamma_params", "samples") 12 | output <- sample_cpc(input_df, priors = list(), n_samples = n_samples) 13 | expect_true(is.data.frame(output)) 14 | expect_true(all(c("option_name", "samples") %in% colnames(output))) 15 | expect_length(output$samples, n_options) 16 | purrr::walk(output$samples, ~ expect_length(.x, n_samples)) 17 | expect_equal(colnames(output), expected_col_names) 18 | }) 19 | -------------------------------------------------------------------------------- /tests/testthat/test-calculate_total_cm.R: -------------------------------------------------------------------------------- 1 | context("Calculate Total CM") 2 | 3 | test_that("calculate_total_cm returns correct cm", { 4 | expected_ouput <- 15 5 | output <- calculate_total_cm(rev_per_click = 10, cost_per_click = 5, expected_clicks = 3) 6 | expect_equal(output, expected_ouput) 7 | }) 8 | 9 | test_that("calculate_total_cm returns negative value", { 10 | expected_ouput <- -30 11 | output <- calculate_total_cm(rev_per_click = 10, cost_per_click = 20, expected_clicks = 3) 12 | expect_equal(output, expected_ouput) 13 | }) 14 | 15 | test_that("calculate_total_cm returns 0 when there are no clicks", { 16 | expected_ouput <- 0 17 | output <- calculate_total_cm(rev_per_click = 10, cost_per_click = 5, expected_clicks = 0) 18 | expect_equal(output, expected_ouput) 19 | }) 20 | -------------------------------------------------------------------------------- /tests/testthat/test-validate_priors.R: -------------------------------------------------------------------------------- 1 | context("Validate Priors") 2 | 3 | test_that("validate_priors selects subset of valid priors", { 4 | expected_output <- list(b = 2, c = 3) 5 | priors <- list(a = 1, b = 2, c = 3) 6 | valid_priors <- c("b", "c") 7 | default_priors <- list(b = 1, c = 1) 8 | output <- validate_priors(priors, valid_priors, default_priors) 9 | testthat::expect_equal(output, expected_output) 10 | }) 11 | 12 | test_that("validate_priors returns default priors when a single parameter is invalid", { 13 | priors <- list(a = 1, b = 2, c = -4) 14 | valid_priors <- c("b", "c") 15 | default_priors <- list(b = 1, c = 1) 16 | testthat::expect_warning({ 17 | output <- validate_priors(priors, valid_priors, default_priors) 18 | }) 19 | testthat::expect_equal(output, default_priors) 20 | }) 21 | -------------------------------------------------------------------------------- /man/rdirichlet.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rdirichlet.R 3 | \name{rdirichlet} 4 | \alias{rdirichlet} 5 | \title{Random Dirichlet} 6 | \usage{ 7 | rdirichlet(n, alphas_list) 8 | } 9 | \arguments{ 10 | \item{n}{integer, the number of samples} 11 | 12 | \item{alphas_list}{Named List of Integers: parameters of the dirichlet, 13 | interpreted as the number of success of each outcome} 14 | } 15 | \value{ 16 | n x length(alphas) named tibble representing the probability of observing each outcome 17 | } 18 | \description{ 19 | Randomly samples a vector of length n from a dirichlet distribution parameterized by a vector of alphas 20 | PDF of Gamma with scale = 1 : f(x)= 1/(Gamma(a)) x^(a-1) e^-(x) 21 | } 22 | \examples{ 23 | rdirichlet(100, list(a = 20, b = 15, c = 60)) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /tests/testthat/test-sample_conv_rate.R: -------------------------------------------------------------------------------- 1 | context("Sample Conversion Rate") 2 | 3 | test_that("sample_conv_rate returns correct shape", { 4 | input_df <- tibble::tibble( 5 | option_name = c("A", "B", "C"), 6 | sum_clicks = c(1000, 1000, 1000), 7 | sum_conversions = c(100, 120, 110) 8 | ) 9 | n_options <- length(unique(input_df$option_name)) 10 | n_samples <- 150 11 | expected_col_names <- c(colnames(input_df), "beta_params", "samples") 12 | output <- sample_conv_rate(input_df, priors = list(), n_samples = n_samples) 13 | expect_true(is.data.frame(output)) 14 | expect_true(all(c("option_name", "samples") %in% colnames(output))) 15 | expect_length(output$samples, n_options) 16 | purrr::walk(output$samples, ~ expect_length(.x, n_samples)) 17 | expect_equal(colnames(output), expected_col_names) 18 | }) 19 | -------------------------------------------------------------------------------- /tests/testthat/test-sample_session_duration.R: -------------------------------------------------------------------------------- 1 | context("Sample Session Duration") 2 | 3 | test_that("sample_session_duration returns correct shape", { 4 | input_df <- tibble::tibble( 5 | option_name = c("A", "B", "C"), 6 | sum_sessions = c(1000, 500, 1000), 7 | sum_duration = c(50000, 60000, 35000), 8 | ) 9 | n_options <- length(unique(input_df$option_name)) 10 | n_samples <- 150 11 | expected_col_names <- c(colnames(input_df), "gamma_params", "samples") 12 | output <- sample_session_duration(input_df, priors = list(), n_samples = n_samples) 13 | expect_true(is.data.frame(output)) 14 | expect_true(all(c("option_name", "samples") %in% colnames(output))) 15 | expect_length(output$samples, n_options) 16 | purrr::walk(output$samples, ~ expect_length(.x, n_samples)) 17 | expect_equal(colnames(output), expected_col_names) 18 | }) 19 | -------------------------------------------------------------------------------- /man/validate_input_column.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validate_input_column.R 3 | \name{validate_input_column} 4 | \alias{validate_input_column} 5 | \title{Validate Input Column} 6 | \usage{ 7 | validate_input_column(column_name, input_df, greater_than_zero = TRUE) 8 | } 9 | \arguments{ 10 | \item{column_name}{String value of the column name} 11 | 12 | \item{input_df}{Dataframe containing option_name (str) and various other columns 13 | depending on the distribution type. See vignette for more details.} 14 | 15 | \item{greater_than_zero}{Boolean: Do all values in the column have to be greater than zero?} 16 | } 17 | \value{ 18 | None 19 | } 20 | \description{ 21 | Validates the input column exists in the dataframe, is of the correct type, 22 | and that all values are greater than or equal to 0. 23 | } 24 | -------------------------------------------------------------------------------- /tests/testthat/test-calculate_multi_rev_per_session.R: -------------------------------------------------------------------------------- 1 | context("Calculate Mult Rev Per Session") 2 | 3 | test_that("calculate_multi_rev_per_session returns correct calculations", { 4 | expected_ouput <- 45 5 | output <- calculate_multi_rev_per_session(conv_rates = list(alpha_1 = 0.5, alpha_2 = 0.2), 6 | inverse_rev_A = 0.1, 7 | inverse_rev_B = 0.005) 8 | expect_equal(output, expected_ouput) 9 | }) 10 | 11 | test_that("calculate_multi_rev_per_session returns 0 when rates are 0", { 12 | expected_ouput <- 0 13 | output <- calculate_multi_rev_per_session(conv_rates = list(alpha_1 = 0, alpha_2 = 0), 14 | inverse_rev_A = 0.1, 15 | inverse_rev_B = 0.005) 16 | expect_equal(output, expected_ouput) 17 | }) 18 | -------------------------------------------------------------------------------- /tests/testthat/test-sample_cpa.R: -------------------------------------------------------------------------------- 1 | context("Sample CPA") 2 | 3 | test_that("sample_cpa returns correct shape", { 4 | input_df <- tibble::tibble( 5 | option_name = c("A", "B", "C"), 6 | sum_clicks = c(1000, 1000, 1000), 7 | sum_conversions = c(100, 120, 110), 8 | sum_cost = c(10, 50, 30), 9 | ) 10 | n_options <- length(unique(input_df$option_name)) 11 | n_samples <- 150 12 | expected_col_names <- c(colnames(input_df), 13 | "beta_params", "gamma_params", "samples") 14 | output <- sample_cpa(input_df, priors = list(), n_samples = n_samples) 15 | expect_true(is.data.frame(output)) 16 | expect_true(all(c("option_name", "samples") %in% colnames(output))) 17 | expect_length(output$samples, n_options) 18 | purrr::walk(output$samples, ~ expect_length(.x, n_samples)) 19 | expect_equal(colnames(output), expected_col_names) 20 | }) 21 | -------------------------------------------------------------------------------- /tests/testthat/test-sample_ctr.R: -------------------------------------------------------------------------------- 1 | context("Sample CTR") 2 | 3 | test_that("sample_ctr returns correct shape", { 4 | input_df <- tibble::tibble( 5 | option_name = c("A", "B", "C"), 6 | sum_impressions = c(10000, 10000, 10000), 7 | sum_clicks = c(1000, 950, 1050) 8 | ) 9 | n_options <- length(unique(input_df$option_name)) 10 | n_samples <- 150 11 | expected_col_names <- c("option_name", "sum_impressions", "sum_clicks", 12 | "sum_conversions", "beta_params", "samples") 13 | output <- sample_ctr(input_df, priors = list(), n_samples = n_samples) 14 | expect_true(is.data.frame(output)) 15 | expect_true(all(c("option_name", "samples") %in% colnames(output))) 16 | expect_length(output$samples, n_options) 17 | purrr::walk(output$samples, ~ expect_length(.x, n_samples)) 18 | expect_equal(colnames(output), expected_col_names) 19 | }) 20 | -------------------------------------------------------------------------------- /tests/testthat/test-sample_page_views_per_session.R: -------------------------------------------------------------------------------- 1 | context("Sample Page Views Per Session") 2 | 3 | test_that("sample_page_views_per_session returns correct shape", { 4 | input_df <- tibble::tibble( 5 | option_name = c("A", "B", "C"), 6 | sum_sessions = c(1000, 500, 1000), 7 | sum_page_views = c(2000, 1500, 3465), 8 | ) 9 | n_options <- length(unique(input_df$option_name)) 10 | n_samples <- 150 11 | expected_col_names <- c(colnames(input_df), "gamma_params", "samples") 12 | output <- sample_page_views_per_session(input_df, priors = list(), n_samples = n_samples) 13 | expect_true(is.data.frame(output)) 14 | expect_true(all(c("option_name", "samples") %in% colnames(output))) 15 | expect_length(output$samples, n_options) 16 | purrr::walk(output$samples, ~ expect_length(.x, n_samples)) 17 | expect_equal(colnames(output), expected_col_names) 18 | }) 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | # History files 3 | .Rhistory 4 | .Rapp.history 5 | 6 | # Session Data files 7 | .RData 8 | 9 | # User-specific files 10 | .Ruserdata 11 | 12 | # Example code in package build process 13 | *-Ex.R 14 | 15 | # Output files from R CMD build 16 | /*.tar.gz 17 | 18 | # Output files from R CMD check 19 | /*.Rcheck/ 20 | 21 | # RStudio files 22 | .Rproj.user/ 23 | 24 | # produced vignettes 25 | vignettes/*.html 26 | vignettes/*.pdf 27 | 28 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 29 | .httr-oauth 30 | 31 | # knitr and R markdown default cache directories 32 | *_cache/ 33 | /cache/ 34 | 35 | # Temporary files created by R markdown 36 | *.utf8.md 37 | *.knit.md 38 | 39 | # R Environment Variables 40 | .Renviron 41 | 42 | # pkgdown site 43 | docs/ 44 | inst/doc 45 | doc 46 | Meta 47 | 48 | # CRAN comments 49 | cran-comments.md 50 | -------------------------------------------------------------------------------- /tests/testthat/test-sample_response_rate.R: -------------------------------------------------------------------------------- 1 | context("Sample Response Rate") 2 | 3 | test_that("sample_response_rate returns correct shape", { 4 | input_df <- tibble::tibble( 5 | option_name = c("A", "B", "C"), 6 | sum_clicks = c(1000, 1000, 1000), 7 | sum_sessions = c(1000, 1000, 1000), 8 | sum_conversions = c(100, 120, 110) 9 | ) 10 | n_options <- length(unique(input_df$option_name)) 11 | n_samples <- 150 12 | expected_col_names <- c(colnames(input_df), "beta_params", "samples") 13 | output <- sample_response_rate(input_df, priors = list(), n_samples = n_samples) 14 | expect_true(is.data.frame(output)) 15 | expect_true(all(c("option_name", "samples") %in% colnames(output))) 16 | expect_length(output$samples, n_options) 17 | purrr::walk(output$samples, ~ expect_length(.x, n_samples)) 18 | expect_equal(colnames(output), expected_col_names) 19 | }) 20 | -------------------------------------------------------------------------------- /R/validate_priors.R: -------------------------------------------------------------------------------- 1 | #' Validate Priors 2 | #' 3 | #' Validates list of priors against a vector of valid priors and if 4 | #' the values are not valid, default priors are returned. 5 | #' 6 | #' @param priors List of named priors with double values. 7 | #' @param valid_priors A character vector of valid prior names. 8 | #' @param default_priors A list of default priors for the distribution. 9 | #' 10 | #' @return A named list of valid priors for the distribution. 11 | #' 12 | #' @importFrom purrr map 13 | #' @importFrom magrittr %>% 14 | #' 15 | #' 16 | validate_priors <- function(priors, valid_priors, default_priors) { 17 | are_priors_valid <- purrr::map(valid_priors, ~ is_prior_valid(priors, .x)) %>% 18 | unlist() 19 | if (all(are_priors_valid)) { 20 | priors[valid_priors] %>% as.list() 21 | } else{ 22 | message("Using default priors.") 23 | default_priors 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /tests/testthat/test-estimate_all_values.R: -------------------------------------------------------------------------------- 1 | context("Estimate All Values") 2 | 3 | test_that("Estimate All Values returns correct types", { 4 | expected_output <- 5 | input_df <- tibble::tibble( 6 | option_name = c("A", "B", "C"), 7 | sum_clicks = c(1000, 1000, 1000), 8 | sum_conversions = c(100, 120, 110) 9 | ) 10 | output <- estimate_all_values(input_df, distribution = "conversion_rate", wrt_option_lift = "A") 11 | expect_length(output, 4) 12 | 13 | is_wp_output_tibble <- is.data.frame(output[["Win Probability"]]) 14 | expect_true(is_wp_output_tibble) 15 | 16 | is_wpb_output_tibble <- is.data.frame(output[["Win Probability vs Baseline"]]) 17 | expect_true(is_wpb_output_tibble) 18 | 19 | is_vr_double <- is.double(output[["Value Remaining"]]) 20 | expect_true(is_vr_double) 21 | 22 | is_lift_double <- is.double(output[["Lift vs Baseline"]]) 23 | expect_true(is_lift_double) 24 | }) 25 | -------------------------------------------------------------------------------- /man/update_beta.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/update_beta.R 3 | \name{update_beta} 4 | \alias{update_beta} 5 | \title{Update Beta} 6 | \usage{ 7 | update_beta(alpha, beta, priors = list()) 8 | } 9 | \arguments{ 10 | \item{alpha}{Double value for alpha (count of successes). Must be 0 or greater.} 11 | 12 | \item{beta}{Double value for beta (count of failures). Must be 0 or greater.} 13 | 14 | \item{priors}{An optional list object that contains alpha0 and 15 | beta0. Otherwise the function with use Beta(1,1) as the prior distribution.} 16 | } 17 | \value{ 18 | A tibble object that contains `alpha` and `beta` 19 | } 20 | \description{ 21 | Updates Beta Distribution with the Beta-Bernoulli 22 | conjugate prior update rule 23 | } 24 | \examples{ 25 | update_beta(alpha = 1, beta = 5, priors = list(alpha0 = 2, beta0 = 2)) 26 | update_beta(alpha = 20000, beta = 50000) 27 | 28 | } 29 | -------------------------------------------------------------------------------- /R/impute_missing_options.R: -------------------------------------------------------------------------------- 1 | #' Impute Missing Options 2 | #' 3 | #' When win probability is calculated 4 | #' 5 | #' @param posterior_samples Tibble of data in long form with 2 columns 6 | #' `option_name` and `samples` 7 | #' @param wp_raw Tibble of win probabilities with the columns: 8 | #' `option_name` and `win_prob_raw` 9 | #' 10 | #' @return wp_raw table with new rows if option names were missing. 11 | #' 12 | #' @importFrom dplyr bind_rows 13 | #' @importFrom tibble tibble 14 | 15 | impute_missing_options <- function(posterior_samples, wp_raw) { 16 | all_option_names <- unique(posterior_samples$option_name) 17 | missing_option_names <- all_option_names[!all_option_names %in% wp_raw$option_name] 18 | if(length(missing_option_names > 0)){ 19 | wp_raw %>% 20 | dplyr::bind_rows( 21 | tibble::tibble(option_name = missing_option_names, win_prob_raw = 0) 22 | ) 23 | }else{ 24 | wp_raw 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /man/find_best_option.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/find_best_option.R 3 | \name{find_best_option} 4 | \alias{find_best_option} 5 | \title{Find Best Option} 6 | \usage{ 7 | find_best_option(posterior_samples, distribution) 8 | } 9 | \arguments{ 10 | \item{posterior_samples}{Tibble returned from sample_from_posterior with 3 columns 11 | `option_name`, `samples`, and `sample_id`.} 12 | 13 | \item{distribution}{String: name of the distribution} 14 | } 15 | \value{ 16 | String: the best option name 17 | } 18 | \description{ 19 | Samples from posterior, calculates win probability, and selects the best option. 20 | Note: this can be inefficient if you already have the win probability dataframe. 21 | Only use this if that has not already been calculated. 22 | } 23 | \examples{ 24 | # Requires posterior distribution 25 | \dontrun{ 26 | find_best_option(posterior_samples = posterior_samples, distribution = "conversion_rate") 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /tests/testthat/test-sample_rev_per_session.R: -------------------------------------------------------------------------------- 1 | context("Sample Rev Per Session") 2 | 3 | test_that("sample_rev_per_session returns correct shape", { 4 | input_df_cm_per_click <- tibble::tibble( 5 | option_name = c("A", "B", "C"), 6 | sum_sessions = c(1000, 1000, 1000), 7 | sum_conversions = c(100, 120, 110), 8 | sum_revenue = c(900, 1200, 1150) 9 | ) 10 | n_options <- length(unique(input_df_cm_per_click$option_name)) 11 | n_samples <- 150 12 | expected_col_names <- c(colnames(input_df_cm_per_click), 13 | "beta_params", "gamma_params", "samples") 14 | output <- sample_rev_per_session(input_df_cm_per_click, priors = list(), n_samples = n_samples) 15 | expect_true(is.data.frame(output)) 16 | expect_true(all(c("option_name", "samples") %in% colnames(output))) 17 | expect_length(output$samples, n_options) 18 | purrr::walk(output$samples, ~ expect_length(.x, n_samples)) 19 | expect_equal(colnames(output), expected_col_names) 20 | }) 21 | -------------------------------------------------------------------------------- /man/grizbayr-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/grizbayr-package.R 3 | \docType{package} 4 | \name{grizbayr-package} 5 | \alias{grizbayr} 6 | \alias{grizbayr-package} 7 | \title{grizbayr: Bayesian Inference for A|B and Bandit Marketing Tests} 8 | \description{ 9 | Uses simple Bayesian conjugate prior update rules to calculate the win probability of each option, value remaining in the test, and percent lift over the baseline for various marketing objectives. References: Fink, Daniel (1997) "A Compendium of Conjugate Priors" \url{https://www.johndcook.com/CompendiumOfConjugatePriors.pdf}. Stucchio, Chris (2015) "Bayesian A/B Testing at VWO" \url{https://vwo.com/downloads/VWO_SmartStats_technical_whitepaper.pdf}. 10 | } 11 | \seealso{ 12 | Useful links: 13 | \itemize{ 14 | \item \url{https://github.com/rangi513/grizbayr} 15 | \item Report bugs at \url{https://github.com/rangi513/grizbayr/issues} 16 | } 17 | 18 | } 19 | \keyword{internal} 20 | -------------------------------------------------------------------------------- /tests/testthat/test-sample_cm_per_click.R: -------------------------------------------------------------------------------- 1 | context("Sample CM Per Click") 2 | 3 | test_that("sample_cm_per_click returns correct shape", { 4 | input_df_cm_per_click <- tibble::tibble( 5 | option_name = c("A", "B", "C"), 6 | sum_clicks = c(1000, 1000, 1000), 7 | sum_conversions = c(100, 120, 110), 8 | sum_revenue = c(900, 1200, 1150), 9 | sum_cost = c(10, 50, 30), 10 | ) 11 | n_options <- length(unique(input_df_cm_per_click$option_name)) 12 | n_samples <- 150 13 | expected_col_names <- c(colnames(input_df_cm_per_click), 14 | "beta_params", "gamma_params_rev", "gamma_params_cost", "samples") 15 | output <- sample_cm_per_click(input_df_cm_per_click, priors = list(), n_samples = n_samples) 16 | expect_true(is.data.frame(output)) 17 | expect_true(all(c("option_name", "samples") %in% colnames(output))) 18 | expect_length(output$samples, n_options) 19 | purrr::walk(output$samples, ~ expect_length(.x, n_samples)) 20 | expect_equal(colnames(output), expected_col_names) 21 | }) 22 | -------------------------------------------------------------------------------- /tests/testthat/test-sample_total_cm.R: -------------------------------------------------------------------------------- 1 | context("Sample Total CM") 2 | 3 | test_that("sample_total_cm returns correct shape", { 4 | input_df_cm <- tibble::tibble( 5 | option_name = c("A", "B", "C"), 6 | sum_impressions = c(10000, 10000, 10000), 7 | sum_clicks = c(1000, 1000, 1000), 8 | sum_conversions = c(100, 120, 110), 9 | sum_revenue = c(900, 1200, 1150), 10 | sum_cost = c(10, 50, 30), 11 | ) 12 | n_options <- length(unique(input_df_cm$option_name)) 13 | n_samples <- 150 14 | expected_col_names <- c(colnames(input_df_cm), "beta_params_conv", "beta_params_ctr", 15 | "gamma_params_rev", "gamma_params_cost", "samples") 16 | output <- sample_total_cm(input_df_cm, priors = list(), n_samples = n_samples) 17 | expect_true(is.data.frame(output)) 18 | expect_true(all(c("option_name", "samples") %in% colnames(output))) 19 | expect_length(output$samples, n_options) 20 | purrr::walk(output$samples, ~ expect_length(.x, n_samples)) 21 | expect_equal(colnames(output), expected_col_names) 22 | }) 23 | -------------------------------------------------------------------------------- /man/validate_input_df.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validate_input_df.R 3 | \name{validate_input_df} 4 | \alias{validate_input_df} 5 | \title{Validate Input DataFrame} 6 | \usage{ 7 | validate_input_df(input_df, distribution) 8 | } 9 | \arguments{ 10 | \item{input_df}{Dataframe containing option_name (str) and various other columns 11 | depending on the distribution type. See vignette for more details.} 12 | 13 | \item{distribution}{String of the distribution name} 14 | } 15 | \value{ 16 | Bool TRUE if all checks pass. 17 | } 18 | \description{ 19 | Validates the input dataframe has the correct type, correct required column names, 20 | that the distribution is valid, that the column types are correct, and that the 21 | column values are greater than or equal to 0 when they are numeric. 22 | } 23 | \examples{ 24 | input_df <- tibble::tibble( 25 | option_name = c("A", "B"), 26 | sum_clicks = c(1000, 1000), 27 | sum_conversions = c(100, 120) 28 | ) 29 | validate_input_df(input_df, "conversion_rate") 30 | 31 | } 32 | -------------------------------------------------------------------------------- /tests/testthat/test-estimate_value_remaining.R: -------------------------------------------------------------------------------- 1 | context("Estimate Value Remaining") 2 | 3 | 4 | test_that("estimate_value_remaining returns single value around 0.13", { 5 | input_df <- tibble::tibble( 6 | option_name = c("A", "B", "C"), 7 | sum_clicks = c(1000, 1000, 1000), 8 | sum_conversions = c(100, 120, 110) 9 | ) 10 | output <- estimate_value_remaining(input_df, distribution = "conversion_rate") 11 | expect_length(output, 1) 12 | expect_true(is.double(output)) 13 | expect_lt(output, 0.18) 14 | expect_gt(output, 0.10) 15 | }) 16 | 17 | test_that("estimate_lift_vs_baseline returns single value around .22 when threshold is 0.99", { 18 | input_df <- tibble::tibble( 19 | option_name = c("A", "B", "C"), 20 | sum_clicks = c(1000, 1000, 1000), 21 | sum_conversions = c(100, 120, 110) 22 | ) 23 | output <- estimate_value_remaining(input_df, distribution = "conversion_rate", threshold = 0.99) 24 | expect_length(output, 1) 25 | expect_true(is.double(output)) 26 | expect_lt(output, 0.26) 27 | expect_gt(output, 0.15) 28 | }) 29 | -------------------------------------------------------------------------------- /man/estimate_win_prob.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/estimate_win_prob.R 3 | \name{estimate_win_prob} 4 | \alias{estimate_win_prob} 5 | \title{Estimate Win Probability} 6 | \usage{ 7 | estimate_win_prob(input_df, distribution, priors = list()) 8 | } 9 | \arguments{ 10 | \item{input_df}{Dataframe containing option_name (str) and various other columns 11 | depending on the distribution type. See vignette for more details.} 12 | 13 | \item{distribution}{String of the distribution name} 14 | 15 | \item{priors}{Optional list of priors. Defaults will be use otherwise.} 16 | } 17 | \value{ 18 | tibble object with 2 columns: `option_name` 19 | and `win_probability` formatted as a percent 20 | } 21 | \description{ 22 | Creates a tibble of win probabilities for each option based on the data observed. 23 | } 24 | \examples{ 25 | input_df <- tibble::tibble( 26 | option_name = c("A", "B"), 27 | sum_clicks = c(1000, 1000), 28 | sum_conversions = c(100, 120) 29 | ) 30 | estimate_win_prob(input_df, "conversion_rate") 31 | 32 | } 33 | -------------------------------------------------------------------------------- /tests/testthat/test-sample_multi_rev_per_session.R: -------------------------------------------------------------------------------- 1 | context("Sample Multi Rev Per Session") 2 | 3 | test_that("sample_multi_rev_per_session returns correct shape", { 4 | input_df <- tibble::tibble( 5 | option_name = c("A", "B", "C"), 6 | sum_sessions = c(1000, 1000, 1000), 7 | sum_conversions = c(100, 120, 110), 8 | sum_revenue = c(900, 1200, 1150), 9 | sum_conversions_2 = c(10, 8, 20), 10 | sum_revenue_2 = c(10, 16, 15) 11 | ) 12 | n_options <- length(unique(input_df$option_name)) 13 | n_samples <- 150 14 | expected_col_names <- c(colnames(input_df), "dirichlet_params", 15 | "gamma_params_A", "gamma_params_B", "samples") 16 | output <- sample_multi_rev_per_session(input_df, priors = list(), n_samples = n_samples) 17 | expect_true(is.data.frame(output)) 18 | expect_true(all(c("option_name", "samples") %in% colnames(output))) 19 | expect_length(output$samples, n_options) 20 | purrr::walk(output$samples, ~ expect_length(.x, n_samples)) 21 | expect_equal(sort(colnames(output)), sort(expected_col_names)) 22 | }) 23 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2020 Ryan Angi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: grizbayr 2 | Type: Package 3 | Title: Bayesian Inference for A|B and Bandit Marketing Tests 4 | Version: 1.3.5 5 | Author: Ryan Angi 6 | Maintainer: Ryan Angi 7 | Description: Uses simple Bayesian conjugate prior update rules to calculate 8 | the win probability of each option, value remaining in the test, and 9 | percent lift over the baseline for various marketing objectives. 10 | References: 11 | Fink, Daniel (1997) "A Compendium of Conjugate Priors" . 12 | Stucchio, Chris (2015) "Bayesian A/B Testing at VWO" . 13 | Depends: R (>= 2.10) 14 | License: MIT + file LICENSE 15 | Encoding: UTF-8 16 | RoxygenNote: 7.2.3 17 | Imports: 18 | purrr, 19 | dplyr, 20 | tidyr (>= 1.0.0), 21 | magrittr, 22 | tibble, 23 | rlang 24 | Suggests: 25 | spelling, 26 | knitr, 27 | testthat (>= 2.1.0), 28 | rmarkdown 29 | Language: en-US 30 | VignetteBuilder: knitr 31 | URL: https://github.com/rangi513/grizbayr 32 | BugReports: https://github.com/rangi513/grizbayr/issues 33 | -------------------------------------------------------------------------------- /tests/testthat/test-update_beta.R: -------------------------------------------------------------------------------- 1 | context("Update Beta") 2 | 3 | test_that("update_beta updates with priors", { 4 | expected_output <- tibble::tibble(alpha = 3, beta = 7) 5 | output <- update_beta(alpha = 1, beta = 5, priors = list(alpha0 = 2, beta0 = 2)) 6 | testthat::expect_equal(output, expected_output) 7 | }) 8 | 9 | test_that("update_beta updates without priors", { 10 | expected_output <- tibble::tibble(alpha = 2, beta = 6) 11 | output <- update_beta(alpha = 1, beta = 5) 12 | testthat::expect_equal(output, expected_output) 13 | }) 14 | 15 | test_that("update_beta updates but warns with invalid priors", { 16 | expected_output <- tibble::tibble(alpha = 2, beta = 6) 17 | invalid_priors <- list(alpha0 = -1, beta0 = 3) 18 | testthat::expect_warning({ 19 | output <- update_beta(alpha = 1, beta = 5, priors = invalid_priors) 20 | }) 21 | testthat::expect_equal(output, expected_output) 22 | }) 23 | 24 | test_that("update_beta updates without priors large numbers", { 25 | expected_output <- tibble::tibble(alpha = 20001, beta = 50001) 26 | output <- update_beta(alpha = 20000, beta = 50000) 27 | testthat::expect_equal(output, expected_output) 28 | }) 29 | -------------------------------------------------------------------------------- /R/find_best_option.R: -------------------------------------------------------------------------------- 1 | #' Find Best Option 2 | #' 3 | #' Samples from posterior, calculates win probability, and selects the best option. 4 | #' Note: this can be inefficient if you already have the win probability dataframe. 5 | #' Only use this if that has not already been calculated. 6 | #' 7 | #' @param posterior_samples Tibble returned from sample_from_posterior with 3 columns 8 | #' `option_name`, `samples`, and `sample_id`. 9 | #' @param distribution String: name of the distribution 10 | #' 11 | #' @return String: the best option name 12 | #' 13 | #' @importFrom dplyr filter 14 | #' @importFrom magrittr extract2 %>% 15 | #' @importFrom rlang .data 16 | #' @export 17 | #' 18 | #' @examples 19 | #' # Requires posterior distribution 20 | #' \dontrun{ 21 | #' find_best_option(posterior_samples = posterior_samples, distribution = "conversion_rate") 22 | #' } 23 | find_best_option <- function(posterior_samples, distribution){ 24 | estimate_win_prob_given_posterior(posterior_samples = posterior_samples, 25 | winner_is_max = is_winner_max(distribution)) %>% 26 | dplyr::filter(.data$win_prob_raw == max(.data$win_prob_raw)) %>% 27 | magrittr::extract2("option_name") 28 | } 29 | -------------------------------------------------------------------------------- /man/sample_cpc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sample_cpc.R 3 | \name{sample_cpc} 4 | \alias{sample_cpc} 5 | \title{Sample Cost Per Click} 6 | \usage{ 7 | sample_cpc(input_df, priors, n_samples = 50000) 8 | } 9 | \arguments{ 10 | \item{input_df}{Dataframe containing option_name (str), sum_clicks (dbl), sum_cost (dbl).} 11 | 12 | \item{priors}{Optional list of priors {k0, theta0} for Gamma. 13 | Default \eqn{Gamma(1, 250)} will be use otherwise.} 14 | 15 | \item{n_samples}{Optional integer value. Defaults to 50,000 samples.} 16 | } 17 | \value{ 18 | input_df with 2 new nested columns `gamma_params` and `samples` 19 | } 20 | \description{ 21 | Adds 2 new nested columns to the input_df: `gamma_params` and `samples` 22 | `gamma_params` in each row should be a tibble of length 2 (\eqn{k} 23 | and \eqn{\theta} parameters) 24 | `samples` in each row should be a tibble of length `n_samples` 25 | } 26 | \details{ 27 | See update_rules vignette for a mathematical representation. 28 | \deqn{cpc_i ~ Exponential(\lambda)} 29 | \deqn{\lambda ~ Gamma(k, \theta)} 30 | Average CPC is sampled from a Gamma distribution with an Exponential likelihood 31 | of an individual cost. 32 | } 33 | -------------------------------------------------------------------------------- /tests/testthat/test-update_dirichlet.R: -------------------------------------------------------------------------------- 1 | context("Update Dirichlet") 2 | 3 | test_that("update_dirichlet adds 1 when default priors are used", { 4 | expected_output <- tibble::tibble(alpha_0 = 21, alpha_1 = 6, alpha_2 = 3) 5 | output <- update_dirichlet(alpha_0 = 20, alpha_1 = 5, alpha_2 = 2) 6 | expect_true(is.data.frame(output)) 7 | expect_equal(output, expected_output) 8 | }) 9 | 10 | test_that("update_dirichlet adds non default priors", { 11 | sample_priors_list <- list(alpha00 = 2, alpha01 = 3, alpha02 = 5) 12 | expected_output <- tibble::tibble(alpha_0 = 22, alpha_1 = 8, alpha_2 = 7) 13 | output <- update_dirichlet(alpha_0 = 20, alpha_1 = 5, alpha_2 = 2, priors = sample_priors_list) 14 | expect_true(is.data.frame(output)) 15 | expect_equal(output, expected_output) 16 | }) 17 | 18 | test_that("update_dirichlet adds 1 when default incorrect priors are used", { 19 | incorrect_priors_list <- list(beta00 = 2, beta01 = 3, beta02 = 5) 20 | expected_output <- tibble::tibble(alpha_0 = 21, alpha_1 = 6, alpha_2 = 3) 21 | expect_warning({ 22 | output <- update_dirichlet(alpha_0 = 20, alpha_1 = 5, alpha_2 = 2, priors = incorrect_priors_list) 23 | }) 24 | expect_true(is.data.frame(output)) 25 | expect_equal(output, expected_output) 26 | }) 27 | -------------------------------------------------------------------------------- /man/sample_conv_rate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sample_conv_rate.R 3 | \name{sample_conv_rate} 4 | \alias{sample_conv_rate} 5 | \title{Sample Conversion Rate} 6 | \usage{ 7 | sample_conv_rate(input_df, priors, n_samples = 50000) 8 | } 9 | \arguments{ 10 | \item{input_df}{Dataframe containing option_name (str), 11 | sum_conversions (dbl), and sum_clicks (dbl).} 12 | 13 | \item{priors}{Optional list of priors alpha0 and beta0. 14 | Default \eqn{Beta(1,1)} will be use otherwise.} 15 | 16 | \item{n_samples}{Optional integer value. Defaults to 50,000 samples.} 17 | } 18 | \value{ 19 | input_df with 2 new nested columns `beta_params` and `samples` 20 | } 21 | \description{ 22 | Adds 2 new nested columns to the input_df: `beta_params` and `samples` 23 | `beta_params` in each row should be a tibble of length 2 (\eqn{\alpha} 24 | and \eqn{\beta} parameters) 25 | `samples` in each row should be a tibble of length `n_samples` 26 | } 27 | \details{ 28 | See update_rules vignette for a mathematical representation. 29 | \deqn{conversion_i ~ Bernoulli(\phi)} 30 | \deqn{\phi ~ Beta(\alpha, \beta)} 31 | Conversion Rate is sampled from a Beta distribution with a Binomial likelihood 32 | of an individual converting. 33 | } 34 | -------------------------------------------------------------------------------- /R/estimate_win_prob.R: -------------------------------------------------------------------------------- 1 | #' Estimate Win Probability 2 | #' 3 | #' Creates a tibble of win probabilities for each option based on the data observed. 4 | #' 5 | #' @param input_df Dataframe containing option_name (str) and various other columns 6 | #' depending on the distribution type. See vignette for more details. 7 | #' @param distribution String of the distribution name 8 | #' @param priors Optional list of priors. Defaults will be use otherwise. 9 | #' 10 | #' @return tibble object with 2 columns: `option_name` 11 | #' and `win_probability` formatted as a percent 12 | #' @export 13 | #' 14 | #' @examples 15 | #' input_df <- tibble::tibble( 16 | #' option_name = c("A", "B"), 17 | #' sum_clicks = c(1000, 1000), 18 | #' sum_conversions = c(100, 120) 19 | #' ) 20 | #' estimate_win_prob(input_df, "conversion_rate") 21 | #' 22 | estimate_win_prob <- function(input_df, distribution, priors = list()){ 23 | validate_input_df(input_df, distribution) 24 | 25 | # Sample from posterior distribution 26 | posterior_samples <- sample_from_posterior(input_df, distribution, priors) 27 | 28 | # Calculate Win Probability 29 | estimate_win_prob_given_posterior(posterior_samples = posterior_samples, 30 | winner_is_max = is_winner_max(distribution)) 31 | } 32 | -------------------------------------------------------------------------------- /tests/testthat/test-validate_data_values.R: -------------------------------------------------------------------------------- 1 | context("Validate Data Values") 2 | 3 | test_that("validate_data_values returns same list of 2 when values are valid.", { 4 | data_values <- list(successes = 22, failures = 100) 5 | output <- validate_data_values(data_values) 6 | testthat::expect_equal(output, expected = data_values) 7 | }) 8 | 9 | test_that("validate_data_values returns same list of 4 when values are valid.", { 10 | data_values <- list(successes = 22, failures = 100, revenue = 5678, cost = 1234) 11 | output <- validate_data_values(data_values) 12 | testthat::expect_equal(output, expected = data_values) 13 | }) 14 | 15 | test_that("validate_data_values fails when list is empty.", { 16 | empty_data_values <- list() 17 | testthat::expect_error(validate_data_values(empty_data_values)) 18 | }) 19 | 20 | test_that("validate_data_values fails when one value is less than 0.", { 21 | data_values <- list(successes = -10, failures = 100) 22 | testthat::expect_error(validate_data_values(data_values)) 23 | }) 24 | 25 | test_that("validate_data_values returns the same list when one value equals 0.", { 26 | data_values <- list(successes = 0, failures = 100) 27 | output <- validate_data_values(data_values) 28 | testthat::expect_equal(output, expected = data_values) 29 | }) 30 | -------------------------------------------------------------------------------- /R/rdirichlet.R: -------------------------------------------------------------------------------- 1 | #' Random Dirichlet 2 | #' 3 | #' Randomly samples a vector of length n from a dirichlet distribution parameterized by a vector of alphas 4 | #' PDF of Gamma with scale = 1 : f(x)= 1/(Gamma(a)) x^(a-1) e^-(x) 5 | #' 6 | #' @param alphas_list Named List of Integers: parameters of the dirichlet, 7 | #' interpreted as the number of success of each outcome 8 | #' @param n integer, the number of samples 9 | #' 10 | #' @importFrom magrittr set_colnames %>% 11 | #' @importFrom tibble as_tibble 12 | #' @importFrom stats rgamma 13 | #' @export 14 | #' 15 | #' @return n x length(alphas) named tibble representing the probability of observing each outcome 16 | #' 17 | #' @examples 18 | #' rdirichlet(100, list(a = 20, b = 15, c = 60)) 19 | #' 20 | rdirichlet <- function(n, alphas_list) { 21 | alphas <- unlist(alphas_list) 22 | dimensions <- length(alphas) 23 | 24 | # generate a n x length_alphas matrix of samples from a gamma with shape = alpha_i & scale = 1 25 | gamma_samples <- matrix(stats::rgamma(n * dimensions, alphas), ncol = dimensions, byrow = TRUE) %>% 26 | magrittr::set_colnames(names(alphas)) 27 | 28 | # standardize each sample, so that each of n samples is a simplex 29 | scaled_samples <- gamma_samples / apply(gamma_samples, 1, sum) 30 | tibble::as_tibble(scaled_samples) 31 | } 32 | -------------------------------------------------------------------------------- /man/estimate_win_prob_given_posterior.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/estimate_win_prob_given_posterior.R 3 | \name{estimate_win_prob_given_posterior} 4 | \alias{estimate_win_prob_given_posterior} 5 | \title{Estimate Win Probability Given Posterior Distribution} 6 | \usage{ 7 | estimate_win_prob_given_posterior(posterior_samples, winner_is_max = TRUE) 8 | } 9 | \arguments{ 10 | \item{posterior_samples}{Tibble of data in long form with 2 columns 11 | `option_name` and `samples`} 12 | 13 | \item{winner_is_max}{Boolean. This should almost always be TRUE. If a larger number is better 14 | then this should be TRUE. This should be FALSE for metrics such as CPA or CPC where a higher cost 15 | is not necessarily better.} 16 | } 17 | \value{ 18 | Tibble of each option_name and the win probability expressed as a percentage and a decimal `raw` 19 | } 20 | \description{ 21 | Estimate Win Probability Given Posterior Distribution 22 | } 23 | \examples{ 24 | # Requires posterior_samples dataframe. See `sample_from_posterior()` 25 | # for an example. 26 | \dontrun{ 27 | estimate_win_prob_given_posterior(posterior_samples = posterior_samples) 28 | estimate_win_prob_given_posterior( 29 | posterior_samples = posterior_samples, 30 | winner_is_max = TRUE 31 | ) 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /man/sample_from_posterior.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sample_from_posterior.R 3 | \name{sample_from_posterior} 4 | \alias{sample_from_posterior} 5 | \title{Sample From Posterior} 6 | \usage{ 7 | sample_from_posterior( 8 | input_df, 9 | distribution, 10 | priors = list(), 11 | n_samples = 50000 12 | ) 13 | } 14 | \arguments{ 15 | \item{input_df}{Dataframe containing option_name (str) and various other columns 16 | depending on the distribution type. See vignette for more details.} 17 | 18 | \item{distribution}{String of the distribution name} 19 | 20 | \item{priors}{Optional list of priors. Defaults will be use otherwise.} 21 | 22 | \item{n_samples}{Optional integer value. Defaults to 50,000 samples.} 23 | } 24 | \value{ 25 | A tibble with 2 columns: option_name (chr) and samples (dbl) [long form data]. 26 | } 27 | \description{ 28 | Selects which function to use to sample from the posterior distribution 29 | } 30 | \examples{ 31 | input_df <- tibble::tibble( 32 | option_name = c("A", "B"), 33 | sum_clicks = c(1000, 1000), 34 | sum_conversions = c(100, 120), 35 | sum_sessions = c(1000, 1000), 36 | sum_revenue = c(1000, 1500) 37 | ) 38 | sample_from_posterior(input_df, "conversion_rate") 39 | sample_from_posterior(input_df, "rev_per_session") 40 | 41 | } 42 | -------------------------------------------------------------------------------- /man/estimate_loss.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/estimate_loss.R 3 | \name{estimate_loss} 4 | \alias{estimate_loss} 5 | \title{Estimate Loss} 6 | \usage{ 7 | estimate_loss( 8 | posterior_samples, 9 | distribution, 10 | wrt_option = NULL, 11 | metric = c("absolute", "lift", "relative_risk") 12 | ) 13 | } 14 | \arguments{ 15 | \item{posterior_samples}{Tibble: returned from sample_from_posterior with 3 columns 16 | `option_name`, `samples`, and `sample_id`.} 17 | 18 | \item{distribution}{String: the name of the distribution} 19 | 20 | \item{wrt_option}{String: the option loss is calculated with respect to (wrt). If NULL, the best option will be chosen.} 21 | 22 | \item{metric}{String: the type of loss. 23 | absolute will be the difference, on the outcome scale. 0 when best = wrt_option 24 | lift will be the (best - wrt_option) / wrt_option, 0 when best = wrt_option 25 | relative_risk will be the ratio best/wrt_option, 1 when best = wrt_option} 26 | } 27 | \value{ 28 | numeric, the loss distribution 29 | } 30 | \description{ 31 | Estimate Loss 32 | } 33 | \examples{ 34 | # Requires posterior_samples dataframe. See `sample_from_posterior()` 35 | # for an example. 36 | 37 | \dontrun{ 38 | estimate_loss(posterior_samples = posterior_samples, distribution = "conversion_rate") 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /man/update_gamma.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/update_gamma.R 3 | \name{update_gamma} 4 | \alias{update_gamma} 5 | \title{Update Gamma} 6 | \usage{ 7 | update_gamma(k, theta, priors = list(), alternate_priors = FALSE) 8 | } 9 | \arguments{ 10 | \item{k}{Double value for \eqn{k} (total revenue generating events). Must be 0 or greater.} 11 | 12 | \item{theta}{Double value for \eqn{\theta} (sum of revenue). Must be 0 or greater.} 13 | 14 | \item{priors}{An optional list object that contains k0 and 15 | theta0. Otherwise the function will use \eqn{Gamma(1,250)} as the prior distribution. 16 | If a second gamma distribution is used k01 and theta01 can be defined as separate priors 17 | when alternate_priors is set to TRUE.} 18 | 19 | \item{alternate_priors}{Boolean Defaults to FALSE. Allows a user to specify alternate 20 | prior names so the same prior isn't required when multiple gamma distributions are used.} 21 | } 22 | \value{ 23 | A list object that contains `k` and `theta` 24 | } 25 | \description{ 26 | Updates Gamma Distribution with the Gamma-Exponential 27 | conjugate prior update rule. Parameterized by \eqn{k} and \eqn{\theta} (not \eqn{\alpha, \beta}) 28 | } 29 | \examples{ 30 | update_gamma(k = 1, theta = 100, priors = list(k0 = 2, theta0 = 1000)) 31 | update_gamma(k = 10, theta = 200) 32 | 33 | } 34 | -------------------------------------------------------------------------------- /man/sample_response_rate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sample_response_rate.R 3 | \name{sample_response_rate} 4 | \alias{sample_response_rate} 5 | \title{Sample Response Rate} 6 | \usage{ 7 | sample_response_rate(input_df, priors, n_samples = 50000) 8 | } 9 | \arguments{ 10 | \item{input_df}{Dataframe containing option_name (str), 11 | sum_conversions (dbl), and sum_sessions (dbl).} 12 | 13 | \item{priors}{Optional list of priors alpha0 and beta0. 14 | Default \eqn{Beta(1,1)} will be use otherwise.} 15 | 16 | \item{n_samples}{Optional integer value. Defaults to 50,000 samples.} 17 | } 18 | \value{ 19 | input_df with 2 new nested columns `beta_params` and `samples` 20 | } 21 | \description{ 22 | This is an alias for sample_conv_rate with a different input column. 23 | Adds 2 new nested columns to the input_df: `beta_params` and `samples` 24 | `beta_params` in each row should be a tibble of length 2 (\eqn{\alpha} 25 | and \eqn{\beta} parameters) 26 | `samples` in each row should be a tibble of length `n_samples` 27 | } 28 | \details{ 29 | See update_rules vignette for a mathematical representation. 30 | \deqn{conversion_i ~ Bernoulli(\phi)} 31 | \deqn{\phi ~ Beta(\alpha, \beta)} 32 | Response Rate is sampled from a Beta distribution with a Binomial likelihood 33 | of an individual converting. 34 | } 35 | -------------------------------------------------------------------------------- /man/estimate_win_prob_vs_baseline_given_posterior.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/estimate_win_prob_vs_baseline_given_posterior.R 3 | \name{estimate_win_prob_vs_baseline_given_posterior} 4 | \alias{estimate_win_prob_vs_baseline_given_posterior} 5 | \title{Estimate Win Probability vs. Baseline Given Posterior} 6 | \usage{ 7 | estimate_win_prob_vs_baseline_given_posterior( 8 | posterior_samples, 9 | distribution, 10 | wrt_option 11 | ) 12 | } 13 | \arguments{ 14 | \item{posterior_samples}{Tibble returned from sample_from_posterior with 3 columns 15 | `option_name`, `samples`, and `sample_id`.} 16 | 17 | \item{distribution}{String: the distribution name} 18 | 19 | \item{wrt_option}{String: the option to compare against the best option.} 20 | } 21 | \value{ 22 | Tibble of each option_name and the win probability expressed as a percentage and a decimal `raw` 23 | } 24 | \description{ 25 | Calculates the win probability of the best option compared to a single other option 26 | given a posterior distribution. 27 | } 28 | \examples{ 29 | # Requires posterior_samples dataframe. See `sample_from_posterior()` 30 | # for an example. 31 | \dontrun{ 32 | estimate_win_prob_vs_baseline_given_posterior( 33 | posterior_samples = posterior_samples, 34 | distribution = "conversion_rate", 35 | wrt_option = "A") 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # grizbayr 1.3.5 2 | 3 | ## 1.3.5 4 | 5 | - Update package documentation due to roxygen2 breaking change 6 | 7 | ## 1.3.3 8 | 9 | - Update maintainer email 10 | - Update url in DESCRIPTION to referenced paper 11 | - Fix test in `test-sample_from_posterior.R` 12 | - Add CRAN badges to README 13 | 14 | ## 1.3.2 15 | 16 | - Fix backslash in KaTeX in sample_session_duration.R roxygen2 documentation 17 | - Added roxygen2 documentation for lazy loaded `sysdata.rda` 18 | 19 | ## 1.3.1 20 | 21 | - Bugfix in sample_total_cm distribution where the CM distribution is no longer incorrectly bimodal. 22 | 23 | ## 1.3.0 24 | 25 | - Add 2 new distributions: **Page Views Per Session** and **Session Duration** 26 | 27 | ## 1.2.3 28 | 29 | - Remove `add = FALSE` argument to a group_by since default is already FALSE and dplyr 1.0.0 throws deprecation warning. 30 | - Add hard requirement for tidyr >= 1.0.0 to use pivot_wider and pivot_longer functions. 31 | 32 | ## 1.2.2 33 | 34 | - Bugfix - Fixed Win Probability Vs. Baseline to divide by samples per option, not total samples. 35 | - Fixed documentation example on `estimate_all_values()` to make sure option_name column returned a string not a factor. 36 | - Changed `estimate_lift()` metric argument to default to `"lift"` and provided clearer error messaging when there is an invalid argument. 37 | 38 | ## 1.2.1 39 | 40 | - Initial release. 41 | -------------------------------------------------------------------------------- /man/sample_ctr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sample_ctr.R 3 | \name{sample_ctr} 4 | \alias{sample_ctr} 5 | \title{Sample Click Through Rate} 6 | \usage{ 7 | sample_ctr(input_df, priors, n_samples = 50000) 8 | } 9 | \arguments{ 10 | \item{input_df}{Dataframe containing option_name (str), 11 | sum_clicks (dbl), and sum_impressions (dbl).} 12 | 13 | \item{priors}{Optional list of priors alpha0 and beta0. 14 | Default \eqn{Beta(1,1)} will be use otherwise.} 15 | 16 | \item{n_samples}{Optional integer value. Defaults to 50,000 samples.} 17 | } 18 | \value{ 19 | input_df with 2 new nested columns `beta_params` and `samples` 20 | } 21 | \description{ 22 | This is an alias for sample_conv_rate with 2 different input 23 | columns. This function calculates posterior samples of 24 | \eqn{CTR = clicks/impressions}. Adds 2 new nested columns to 25 | the input_df: `beta_params` and `samples`. 26 | `beta_params` in each row should be a tibble of length 2 (\eqn{\alpha} 27 | and \eqn{\beta} parameters) 28 | `samples` in each row should be a tibble of length `n_samples` 29 | } 30 | \details{ 31 | See update_rules vignette for a mathematical representation. 32 | \deqn{click_i ~ Bernoulli(\phi)} 33 | \deqn{\phi ~ Beta(\alpha, \beta)} 34 | Click Through Rate is sampled from a Beta distribution with a Binomial 35 | likelihood of an individual Clicking 36 | } 37 | -------------------------------------------------------------------------------- /R/sample_response_rate.R: -------------------------------------------------------------------------------- 1 | #' Sample Response Rate 2 | #' 3 | #' This is an alias for sample_conv_rate with a different input column. 4 | #' Adds 2 new nested columns to the input_df: `beta_params` and `samples` 5 | #' `beta_params` in each row should be a tibble of length 2 (\eqn{\alpha} 6 | #' and \eqn{\beta} parameters) 7 | #' `samples` in each row should be a tibble of length `n_samples` 8 | #' 9 | #' See update_rules vignette for a mathematical representation. 10 | #' \deqn{conversion_i ~ Bernoulli(\phi)} 11 | #' \deqn{\phi ~ Beta(\alpha, \beta)} 12 | #' Response Rate is sampled from a Beta distribution with a Binomial likelihood 13 | #' of an individual converting. 14 | #' 15 | #' @param input_df Dataframe containing option_name (str), 16 | #' sum_conversions (dbl), and sum_sessions (dbl). 17 | #' @param priors Optional list of priors alpha0 and beta0. 18 | #' Default \eqn{Beta(1,1)} will be use otherwise. 19 | #' @param n_samples Optional integer value. Defaults to 50,000 samples. 20 | #' 21 | #' @importFrom dplyr rename 22 | #' @importFrom rlang .data 23 | #' 24 | #' @return input_df with 2 new nested columns `beta_params` and `samples` 25 | #' 26 | sample_response_rate <- function(input_df, priors, n_samples = 5e4){ 27 | renamed_input_df <- dplyr::mutate(input_df, sum_clicks = .data$sum_sessions) 28 | sample_conv_rate(renamed_input_df, priors, n_samples) 29 | } 30 | 31 | -------------------------------------------------------------------------------- /man/estimate_lift.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/estimate_lift.R 3 | \name{estimate_lift} 4 | \alias{estimate_lift} 5 | \title{Estimate Lift Distribution} 6 | \usage{ 7 | estimate_lift(posterior_samples, distribution, wrt_option, metric = "lift") 8 | } 9 | \arguments{ 10 | \item{posterior_samples}{Tibble returned from sample_from_posterior with 3 columns 11 | `option_name`, `samples`, and `sample_id`.} 12 | 13 | \item{distribution}{String of the distribution name} 14 | 15 | \item{wrt_option}{string the option lift is calculated with respect to (wrt). Required.} 16 | 17 | \item{metric}{string the type of lift. 18 | `absolute`` will be the difference, on the outcome scale. 0 when best = wrt_option 19 | `lift`` will be the (best - wrt_option) / wrt_option, 0 when best = wrt_option 20 | `relative_risk`` will be the ratio best/wrt_option, 1 when best = wrt_option} 21 | } 22 | \value{ 23 | numeric, the lift distribution 24 | } 25 | \description{ 26 | Estimates lift distribution vector from posterior samples. 27 | } 28 | \examples{ 29 | # Requires posterior_samples dataframe. See `sample_from_posterior()` 30 | # for an example. 31 | 32 | \dontrun{ 33 | estimate_lift(posterior_samples = posterior_samples, 34 | distribution = "conversion_rate", 35 | wrt_option = "A", 36 | metric = "lift") 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /man/estimate_win_prob_vs_baseline.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/estimate_win_prob_vs_baseline.R 3 | \name{estimate_win_prob_vs_baseline} 4 | \alias{estimate_win_prob_vs_baseline} 5 | \title{Estimate Win Probability vs. Baseline} 6 | \usage{ 7 | estimate_win_prob_vs_baseline( 8 | input_df, 9 | distribution, 10 | priors = list(), 11 | wrt_option 12 | ) 13 | } 14 | \arguments{ 15 | \item{input_df}{Dataframe containing option_name (str) and various other columns 16 | depending on the distribution type. See vignette for more details.} 17 | 18 | \item{distribution}{String of the distribution name} 19 | 20 | \item{priors}{Optional list of priors. Defaults will be use otherwise.} 21 | 22 | \item{wrt_option}{string the option win prob is calculated with respect to (wrt). Required.} 23 | } 24 | \value{ 25 | Tibble of each option_name and the win probability expressed as a percentage and a decimal `raw` 26 | } 27 | \description{ 28 | Calculates the win probability of the best option compared to a single other option 29 | given an input_df 30 | } 31 | \examples{ 32 | input_df <- tibble::tibble( 33 | option_name = c("A", "B", "C"), 34 | sum_clicks = c(1000, 1000, 1000), 35 | sum_conversions = c(100, 120, 110) 36 | ) 37 | estimate_win_prob_vs_baseline(input_df = input_df, 38 | distribution = "conversion_rate", 39 | wrt_option = "B") 40 | 41 | } 42 | -------------------------------------------------------------------------------- /tests/testthat/test-estimate_lift_vs_baseline.R: -------------------------------------------------------------------------------- 1 | context("Estimate Lift Vs Baseline") 2 | 3 | test_that("estimate_lift_vs_baseline returns single value around .12", { 4 | input_df <- tibble::tibble( 5 | option_name = c("A", "B", "C"), 6 | sum_clicks = c(1000, 1000, 1000), 7 | sum_conversions = c(100, 120, 110) 8 | ) 9 | output <- estimate_lift_vs_baseline(input_df, 10 | distribution = "conversion_rate", 11 | wrt_option = "A", 12 | metric = "lift", 13 | threshold = 0.7) 14 | expect_length(output, 1) 15 | expect_lt(output, 0.18) 16 | expect_gt(output, 0.06) 17 | }) 18 | 19 | test_that("estimate_lift_vs_baseline returns single value around .20 when threshold is 0.5", { 20 | input_df <- tibble::tibble( 21 | option_name = c("A", "B", "C"), 22 | sum_clicks = c(1000, 1000, 1000), 23 | sum_conversions = c(100, 120, 110) 24 | ) 25 | output <- estimate_lift_vs_baseline(input_df, 26 | distribution = "conversion_rate", 27 | wrt_option = "A", 28 | metric = "lift", 29 | threshold = 0.5) 30 | expect_length(output, 1) 31 | expect_lt(output, 0.25) 32 | expect_gt(output, 0.15) 33 | }) 34 | -------------------------------------------------------------------------------- /R/update_beta.R: -------------------------------------------------------------------------------- 1 | #' Update Beta 2 | #' 3 | #' Updates Beta Distribution with the Beta-Bernoulli 4 | #' conjugate prior update rule 5 | #' 6 | #' @param alpha Double value for alpha (count of successes). Must be 0 or greater. 7 | #' @param beta Double value for beta (count of failures). Must be 0 or greater. 8 | #' @param priors An optional list object that contains alpha0 and 9 | #' beta0. Otherwise the function with use Beta(1,1) as the prior distribution. 10 | #' 11 | #' 12 | #' @return A tibble object that contains `alpha` and `beta` 13 | #' @export 14 | #' @importFrom tibble tibble 15 | #' 16 | #' @examples 17 | #' update_beta(alpha = 1, beta = 5, priors = list(alpha0 = 2, beta0 = 2)) 18 | #' update_beta(alpha = 20000, beta = 50000) 19 | #' 20 | update_beta <- function(alpha, beta, priors = list()) { 21 | validate_data_values(data_values = list(alpha = alpha, beta = beta)) 22 | 23 | # Set Attributes 24 | valid_beta_params <- c("alpha0", "beta0") 25 | default_beta_priors <- list(alpha0 = 1, beta0 = 1) 26 | 27 | # Validate Priors 28 | validated_priors <- validate_priors(priors = priors, 29 | valid_priors = valid_beta_params, 30 | default_priors = default_beta_priors) 31 | alpha0 <- validated_priors$alpha0 32 | beta0 <- validated_priors$beta0 33 | 34 | tibble::tibble(alpha = alpha0 + alpha, beta = beta0 + beta) 35 | } 36 | -------------------------------------------------------------------------------- /man/update_dirichlet.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/update_dirichlet.R 3 | \name{update_dirichlet} 4 | \alias{update_dirichlet} 5 | \title{Update Dirichlet Distribution} 6 | \usage{ 7 | update_dirichlet(alpha_0, alpha_1, alpha_2, priors = list()) 8 | } 9 | \arguments{ 10 | \item{alpha_0}{Double value for alpha_0 (count of failures). Must be 0 or greater.} 11 | 12 | \item{alpha_1}{Double value for alpha_1 (count of successes side 1). Must be 0 or greater.} 13 | 14 | \item{alpha_2}{Double value for alpha_2 (count of successes side 2). Must be 0 or greater.} 15 | 16 | \item{priors}{An optional list object that contains alpha00, alpha01, and alpha02. 17 | Otherwise the function with use \eqn{Dirichlet(1,1,1)} as the prior distribution.} 18 | } 19 | \value{ 20 | tibble with columns alpha_0, alpha_1, and alpha_2 21 | } 22 | \description{ 23 | This function updates the Dirichlet distribution with the 24 | Dirichlet-Multinomial conjugate prior update rule. 25 | } 26 | \details{ 27 | TODO: This function currently only works in 3 dimensions. 28 | Should be extended into N dimensions in the future. Can use ... notation. 29 | } 30 | \examples{ 31 | update_dirichlet(alpha_0 = 20, alpha_1 = 5, alpha_2 = 2) 32 | sample_priors_list <- list(alpha00 = 2, alpha01 = 3, alpha02 = 5) 33 | update_dirichlet(alpha_0 = 20, alpha_1 = 5, alpha_2 = 2, priors = sample_priors_list) 34 | 35 | } 36 | -------------------------------------------------------------------------------- /tests/testthat/test-estimate_win_prob_vs_baseline.R: -------------------------------------------------------------------------------- 1 | context("Estimate Win Probability vs Baseline") 2 | 3 | test_that("estimate_win_prob_vs_baseline returns only 2 options", { 4 | input_df <- tibble::tibble( 5 | option_name = c("A", "B", "C"), 6 | sum_clicks = c(1000, 1000, 1000), 7 | sum_conversions = c(100, 120, 110) 8 | ) 9 | all_option_names <- unique(input_df$option_name) 10 | output <- estimate_win_prob_vs_baseline(input_df = input_df, 11 | distribution = "conversion_rate", 12 | wrt_option = "A") 13 | # Subset exists 14 | expect_true(all(output$option_name %in% all_option_names)) 15 | expect_true(nrow(output) == 2) 16 | }) 17 | 18 | 19 | test_that("estimate_win_prob_vs_baseline_given_posterior handles when wrt_option is the best", { 20 | # Can't be better than yourself 21 | input_df <- tibble::tibble( 22 | option_name = c("A", "B", "C"), 23 | sum_clicks = c(1000, 1000, 1000), 24 | sum_conversions = c(100, 120, 110) 25 | ) 26 | all_option_names <- unique(input_df$option_name) 27 | output <- estimate_win_prob_vs_baseline(input_df = input_df, 28 | distribution = "conversion_rate", 29 | wrt_option = "B") 30 | expect_true(all(output$option_name %in% all_option_names)) 31 | expect_true(nrow(output) == 1) 32 | }) 33 | -------------------------------------------------------------------------------- /man/sample_session_duration.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sample_session_duration.R 3 | \name{sample_session_duration} 4 | \alias{sample_session_duration} 5 | \title{Sample Session Duration} 6 | \usage{ 7 | sample_session_duration(input_df, priors, n_samples = 50000) 8 | } 9 | \arguments{ 10 | \item{input_df}{Dataframe containing option_name (str), 11 | sum_sessions (dbl), and sum_duration (dbl).} 12 | 13 | \item{priors}{Optional list of priors k0 and theta0. 14 | Default \eqn{Gamma(1, 250)} will be use otherwise.} 15 | 16 | \item{n_samples}{Optional integer value. Defaults to 50,000 samples.} 17 | } 18 | \value{ 19 | input_df with 2 new nested columns `gamma_params` and `samples` 20 | } 21 | \description{ 22 | Adds 2 new nested columns to the input_df: `gamma_params` and `samples` 23 | `gamma_params` in each row should be a tibble of length 2 (\eqn{k} 24 | and \eqn{\theta} parameters) 25 | `samples` in each row should be a tibble of length `n_samples` 26 | } 27 | \details{ 28 | See update_rules vignette for a mathematical representation. 29 | \deqn{duration_i ~ Exponential(\lambda)} 30 | \deqn{\lambda ~ Gamma(k, \theta)} 31 | Session Duration is sampled from a Gamma distribution with a Exponential likelihood 32 | of an individual leaving the site or ending a session at time t. 33 | 34 | This is not always the case, so verify your data follows the shape of 35 | an exponential distribution before using this. 36 | } 37 | -------------------------------------------------------------------------------- /tests/testthat/test-rdirichlet.R: -------------------------------------------------------------------------------- 1 | context("Random Dirichlet") 2 | 3 | test_that("rdirichlet returns correct size df", { 4 | n_samples <- 100 5 | output <- rdirichlet(n_samples, list(a = 20, b = 15, c = 60)) 6 | expect_true(is.data.frame(output)) 7 | expect_true(nrow(output) == n_samples) 8 | }) 9 | 10 | test_that("rdirichlet returns empty df when 0 samples are requested", { 11 | n_samples <- 0 12 | output <- rdirichlet(n_samples, list(a = 20, b = 15, c = 60)) 13 | expect_true(is.data.frame(output)) 14 | expect_true(nrow(output) == n_samples) 15 | }) 16 | 17 | test_that("rdirichlet returns simplex for each row", { 18 | n_samples <- 100 19 | output <- rdirichlet(n_samples, list(a = 20, b = 15, c = 60)) 20 | expect_true({ 21 | purrr::pmap(output, function(...) round(sum(...), 2)) %>% 22 | purrr::every( ~ .x == 1) 23 | }) 24 | }) 25 | 26 | test_that("rdirichlet returns simplex for each row long list", { 27 | n_samples <- 100 28 | output <- rdirichlet(n_samples, list(a = 20, b = 15, c = 60, d = 30, f = 60, g = 22, h = 1, i = 0, k = 6)) 29 | expect_true({ 30 | purrr::pmap(output, function(...) round(sum(...), 2)) %>% 31 | purrr::every( ~ .x == 1) 32 | }) 33 | }) 34 | 35 | test_that("rdirichlet names match input", { 36 | n_samples <- 100 37 | input_list <- list(a = 20, b = 15, c = 60, d = 30, f = 60, g = 22, h = 1, i = 0, k = 6) 38 | output <- rdirichlet(n_samples, input_list) 39 | expect_true(all(names(output) == names(input_list))) 40 | }) 41 | -------------------------------------------------------------------------------- /R/estimate_win_prob_vs_baseline.R: -------------------------------------------------------------------------------- 1 | #' Estimate Win Probability vs. Baseline 2 | #' 3 | #' Calculates the win probability of the best option compared to a single other option 4 | #' given an input_df 5 | #' 6 | #' @param input_df Dataframe containing option_name (str) and various other columns 7 | #' depending on the distribution type. See vignette for more details. 8 | #' @param distribution String of the distribution name 9 | #' @param priors Optional list of priors. Defaults will be use otherwise. 10 | #' @param wrt_option string the option win prob is calculated with respect to (wrt). Required. 11 | #' 12 | #' @return Tibble of each option_name and the win probability expressed as a percentage and a decimal `raw` 13 | #' @export 14 | #' 15 | #' @examples 16 | #' input_df <- tibble::tibble( 17 | #' option_name = c("A", "B", "C"), 18 | #' sum_clicks = c(1000, 1000, 1000), 19 | #' sum_conversions = c(100, 120, 110) 20 | #' ) 21 | #' estimate_win_prob_vs_baseline(input_df = input_df, 22 | #' distribution = "conversion_rate", 23 | #' wrt_option = "B") 24 | #' 25 | estimate_win_prob_vs_baseline <- function(input_df, distribution, priors = list(), wrt_option){ 26 | validate_input_df(input_df, distribution) 27 | 28 | # Sample from posterior distribution 29 | posterior_samples <- sample_from_posterior(input_df, distribution, priors) 30 | 31 | # Calculate Win Prob vs Baseline 32 | estimate_win_prob_vs_baseline_given_posterior(posterior_samples, distribution, wrt_option) 33 | } 34 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(estimate_all_values) 4 | export(estimate_lift) 5 | export(estimate_lift_vs_baseline) 6 | export(estimate_loss) 7 | export(estimate_value_remaining) 8 | export(estimate_win_prob) 9 | export(estimate_win_prob_given_posterior) 10 | export(estimate_win_prob_vs_baseline) 11 | export(estimate_win_prob_vs_baseline_given_posterior) 12 | export(find_best_option) 13 | export(rdirichlet) 14 | export(sample_from_posterior) 15 | export(update_beta) 16 | export(update_dirichlet) 17 | export(update_gamma) 18 | export(validate_input_df) 19 | importFrom(dplyr,"%>%") 20 | importFrom(dplyr,arrange) 21 | importFrom(dplyr,bind_rows) 22 | importFrom(dplyr,desc) 23 | importFrom(dplyr,filter) 24 | importFrom(dplyr,group_by) 25 | importFrom(dplyr,mutate) 26 | importFrom(dplyr,rename) 27 | importFrom(dplyr,row_number) 28 | importFrom(dplyr,select) 29 | importFrom(dplyr,select_if) 30 | importFrom(dplyr,summarise) 31 | importFrom(magrittr,"%>%") 32 | importFrom(magrittr,extract2) 33 | importFrom(magrittr,set_colnames) 34 | importFrom(magrittr,set_names) 35 | importFrom(magrittr,use_series) 36 | importFrom(purrr,map) 37 | importFrom(purrr,map2) 38 | importFrom(purrr,pmap) 39 | importFrom(purrr,some) 40 | importFrom(purrr,walk) 41 | importFrom(rlang,.data) 42 | importFrom(stats,quantile) 43 | importFrom(stats,rbeta) 44 | importFrom(stats,rgamma) 45 | importFrom(tibble,as_tibble) 46 | importFrom(tibble,tibble) 47 | importFrom(tidyr,pivot_wider) 48 | importFrom(tidyr,unnest) 49 | -------------------------------------------------------------------------------- /man/sample_cm_per_click.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sample_cm_per_click.R 3 | \name{sample_cm_per_click} 4 | \alias{sample_cm_per_click} 5 | \title{Sample CM Per Click} 6 | \usage{ 7 | sample_cm_per_click(input_df, priors, n_samples = 50000) 8 | } 9 | \arguments{ 10 | \item{input_df}{Dataframe containing option_name (str), sum_conversions (dbl), sum_revenue (dbl), 11 | and sum_clicks (dbl).} 12 | 13 | \item{priors}{Optional list of priors {alpha0, beta0} for Beta, {k0, theta0} for Gamma Inverse Revenue, 14 | and {k01, theta01} for Gamma Cost (uses alternate priors so they can be different from Revenue). 15 | Default \eqn{Beta(1,1)} and \eqn{Gamma(1, 250)} will be use otherwise.} 16 | 17 | \item{n_samples}{Optional integer value. Defaults to 50,000 samples.} 18 | } 19 | \value{ 20 | input_df with 4 new nested columns `beta_params`, `gamma_params_rev`, 21 | `gamma_params_cost`, and `samples` 22 | } 23 | \description{ 24 | Adds 4 new nested columns to the input_df: `beta_params`, 25 | `gamma_params_rev`, `gamma_params_cost`and `samples` 26 | } 27 | \details{ 28 | `beta_params` and `gamma_params_rev` in each row should be a 29 | tibble of length 2 (\eqn{\alpha} and \eqn{\beta} parameters 30 | and \eqn{k} and \eqn{\theta} parameters) 31 | `samples` in each row should be a tibble of length `n_samples` 32 | 33 | See update_rules vignette for a mathematical representation. 34 | \deqn{CMPerClick = ConversionsPerClick * RevPerConversion - CostPerClick} 35 | } 36 | -------------------------------------------------------------------------------- /R/estimate_win_prob_vs_baseline_given_posterior.R: -------------------------------------------------------------------------------- 1 | #' Estimate Win Probability vs. Baseline Given Posterior 2 | #' 3 | #' Calculates the win probability of the best option compared to a single other option 4 | #' given a posterior distribution. 5 | #' 6 | #' @param posterior_samples Tibble returned from sample_from_posterior with 3 columns 7 | #' `option_name`, `samples`, and `sample_id`. 8 | #' @param distribution String: the distribution name 9 | #' @param wrt_option String: the option to compare against the best option. 10 | #' 11 | #' @return Tibble of each option_name and the win probability expressed as a percentage and a decimal `raw` 12 | #' @export 13 | #' @importFrom rlang .data 14 | #' @importFrom dplyr filter 15 | #' 16 | #' @examples 17 | #' # Requires posterior_samples dataframe. See `sample_from_posterior()` 18 | #' # for an example. 19 | #' \dontrun{ 20 | #' estimate_win_prob_vs_baseline_given_posterior( 21 | #' posterior_samples = posterior_samples, 22 | #' distribution = "conversion_rate", 23 | #' wrt_option = "A") 24 | #'} 25 | #' 26 | estimate_win_prob_vs_baseline_given_posterior <- function(posterior_samples, distribution, wrt_option){ 27 | validate_wrt_option(wrt_option, posterior_samples) 28 | best_option <- find_best_option(posterior_samples, distribution) 29 | 30 | posterior_samples_subset <- posterior_samples %>% 31 | dplyr::filter(.data$option_name %in% c(wrt_option, best_option)) 32 | 33 | estimate_win_prob_given_posterior(posterior_samples_subset, is_winner_max(distribution)) 34 | } 35 | 36 | -------------------------------------------------------------------------------- /man/sample_multi_rev_per_session.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sample_multi_rev_per_session.R 3 | \name{sample_multi_rev_per_session} 4 | \alias{sample_multi_rev_per_session} 5 | \title{Sample Multiple Revenue Per Session} 6 | \usage{ 7 | sample_multi_rev_per_session(input_df, priors, n_samples = 50000) 8 | } 9 | \arguments{ 10 | \item{input_df}{Dataframe containing option_name (str), 11 | sum_conversions (dbl), sum_sessions (dbl), sum_revenue (dbl), 12 | sum_conversion_2 (dbl), sum_sessions_2 (dbl), sum_revenue_2 (dbl).} 13 | 14 | \item{priors}{Optional list of priors alpha0 and beta0. 15 | Default \eqn{Beta(1,1)} will be use otherwise.} 16 | 17 | \item{n_samples}{Optional integer value. Defaults to 50,000 samples.} 18 | } 19 | \value{ 20 | input_df with 4 new nested columns `dirichlet_params`, 21 | `gamma_params_A`, `gamma_params_B`, and `samples`. 22 | `samples` in each row should be a tibble of length `n_samples`. 23 | } 24 | \description{ 25 | Adds 5 new nested columns to the input_df: `dirichlet_params`, 26 | `gamma_params_A`, `gamma_params_B`, and `samples`. 27 | This samples from multiple revenue per session distributions 28 | at once. 29 | } 30 | \details{ 31 | See update_rules vignette for a mathematical representation. 32 | 33 | \deqn{conversion_i ~ MultiNomial(\phi_1, \phi_2, ..., \phi_k)} 34 | \deqn{\phi_k ~ Dirichlet(\alpha, \beta)} 35 | Conversion Rate is sampled from a Dirichlet distribution with a Multinomial likelihood 36 | of an individual converting. 37 | } 38 | -------------------------------------------------------------------------------- /R/sample_ctr.R: -------------------------------------------------------------------------------- 1 | #' Sample Click Through Rate 2 | #' 3 | #' This is an alias for sample_conv_rate with 2 different input 4 | #' columns. This function calculates posterior samples of 5 | #' \eqn{CTR = clicks/impressions}. Adds 2 new nested columns to 6 | #' the input_df: `beta_params` and `samples`. 7 | #' `beta_params` in each row should be a tibble of length 2 (\eqn{\alpha} 8 | #' and \eqn{\beta} parameters) 9 | #' `samples` in each row should be a tibble of length `n_samples` 10 | #' 11 | #' See update_rules vignette for a mathematical representation. 12 | #' \deqn{click_i ~ Bernoulli(\phi)} 13 | #' \deqn{\phi ~ Beta(\alpha, \beta)} 14 | #' Click Through Rate is sampled from a Beta distribution with a Binomial 15 | #' likelihood of an individual Clicking 16 | #' 17 | #' @param input_df Dataframe containing option_name (str), 18 | #' sum_clicks (dbl), and sum_impressions (dbl). 19 | #' @param priors Optional list of priors alpha0 and beta0. 20 | #' Default \eqn{Beta(1,1)} will be use otherwise. 21 | #' @param n_samples Optional integer value. Defaults to 50,000 samples. 22 | #' 23 | #' @importFrom dplyr mutate 24 | #' @importFrom rlang .data 25 | #' 26 | #' @return input_df with 2 new nested columns `beta_params` and `samples` 27 | #' 28 | sample_ctr <- function(input_df, priors, n_samples = 5e4){ 29 | renamed_input_df <- dplyr::mutate(input_df, 30 | sum_conversions = .data$sum_clicks, 31 | sum_clicks = .data$sum_impressions) 32 | sample_conv_rate(renamed_input_df, priors, n_samples) 33 | } 34 | 35 | -------------------------------------------------------------------------------- /R/validate_input_column.R: -------------------------------------------------------------------------------- 1 | #' Validate Input Column 2 | #' 3 | #' Validates the input column exists in the dataframe, is of the correct type, 4 | #' and that all values are greater than or equal to 0. 5 | #' 6 | #' @param column_name String value of the column name 7 | #' @param input_df Dataframe containing option_name (str) and various other columns 8 | #' depending on the distribution type. See vignette for more details. 9 | #' @param greater_than_zero Boolean: Do all values in the column have to be greater than zero? 10 | #' 11 | #' @return None 12 | #' 13 | #' @importFrom purrr some 14 | #' 15 | validate_input_column <- function(column_name, input_df, greater_than_zero = TRUE){ 16 | # Ensure All Columns Exist 17 | if(!column_name %in% colnames(input_df)){ 18 | stop(paste(column_name, "is a required column for this distribution type and is not found in the input_df.")) 19 | } 20 | 21 | # Ensure Column Types are correct 22 | if(column_name == "option_name"){ 23 | if(!is.character(input_df[["option_name"]])){ 24 | stop("option_name column is not a character string") 25 | } 26 | }else{ 27 | if(!is.numeric(input_df[[column_name]])){ 28 | stop(paste(column_name, "is not a numeric column.")) 29 | } 30 | } 31 | 32 | if(greater_than_zero){ 33 | # Ensure all values are greater than or equal to 0. 34 | if(purrr::some(input_df[[column_name]], ~ .x < 0)){ 35 | stop(paste("All values in column `", 36 | column_name, 37 | "` must be greater than or equal to zero.")) 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /tests/testthat/test-validate_input_df.R: -------------------------------------------------------------------------------- 1 | test_that("validate_input_df returns TRUE when df is valid for conversion rate", { 2 | input_df <- tibble::tibble( 3 | option_name = c("A", "B"), 4 | sum_clicks = c(1000, 1000), 5 | sum_conversions = c(100, 120) 6 | ) 7 | expect_true(validate_input_df(input_df, "conversion_rate")) 8 | }) 9 | 10 | test_that("validate_input_df fails when df is not valid for rev per session", { 11 | input_df <- tibble::tibble( 12 | option_name = c("A", "B"), 13 | sum_clicks = c(1000, 1000), 14 | sum_conversions = c(100, 120) 15 | ) 16 | expect_error(validate_input_df(input_df, "rev_per_session")) 17 | }) 18 | 19 | test_that("validate_input_df fails when df is not valid for rev per session", { 20 | input_df <- tibble::tibble( 21 | option_name = c("A", "B"), 22 | sum_clicks = c(1000, 1000), 23 | sum_conversions = c(100, 120) 24 | ) 25 | expect_error(validate_input_df(input_df, "rev_per_session")) 26 | }) 27 | 28 | test_that("validate_input_df fails when input is not a dataframe", { 29 | input_df <- list( 30 | option_name = c("A", "B"), 31 | sum_clicks = c(1000, 1000), 32 | sum_conversions = c(100, 120) 33 | ) 34 | expect_error(validate_input_df(input_df, "conversion_rate")) 35 | }) 36 | 37 | test_that("validate_input_df fails if distribution name is invalid", { 38 | input_df <- tibble::tibble( 39 | option_name = c("A", "B"), 40 | sum_clicks = c(1000, 1000), 41 | sum_conversions = c(100, 120) 42 | ) 43 | expect_error(validate_input_df(input_df, "invalid_distribution")) 44 | }) 45 | -------------------------------------------------------------------------------- /tests/testthat/test-estimate_win_prob.R: -------------------------------------------------------------------------------- 1 | context("Estimate Win Probability") 2 | 3 | test_that("estimate_win_prob returns tibble with correct values", { 4 | input_df <- tibble::tibble( 5 | option_name = c("A", "B", "C"), 6 | sum_clicks = c(1000, 1000, 1000), 7 | sum_conversions = c(100, 120, 110) 8 | ) 9 | output <- estimate_win_prob(input_df, "conversion_rate") 10 | all_option_names <- input_df$option_name 11 | expected_column_names <- c("option_name", "win_prob_raw", "win_prob") 12 | expect_true(is.data.frame(output)) 13 | expect_true(all(expected_column_names %in% colnames(output))) 14 | expect_true(all(output$option_name %in% all_option_names)) 15 | expect_true(is.double(output$win_prob_raw)) 16 | expect_true(is.character(output$win_prob)) 17 | expect_equal(nrow(output), length(all_option_names)) 18 | }) 19 | 20 | test_that("estimate_win_prob returns tibble with correct values when win prob is 0", { 21 | input_df <- tibble::tibble( 22 | option_name = c("A", "B", "C"), 23 | sum_clicks = c(1000, 1000, 1000), 24 | sum_conversions = c(1, 120, 2) 25 | ) 26 | output <- estimate_win_prob(input_df, "conversion_rate") 27 | all_option_names <- input_df$option_name 28 | expected_column_names <- c("option_name", "win_prob_raw", "win_prob") 29 | expect_true(is.data.frame(output)) 30 | expect_true(all(expected_column_names %in% colnames(output))) 31 | expect_true(all(output$option_name %in% all_option_names)) 32 | expect_true(is.double(output$win_prob_raw)) 33 | expect_true(is.character(output$win_prob)) 34 | expect_equal(nrow(output), length(all_option_names)) 35 | }) 36 | -------------------------------------------------------------------------------- /man/sample_page_views_per_session.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sample_page_views_per_session.R 3 | \name{sample_page_views_per_session} 4 | \alias{sample_page_views_per_session} 5 | \title{Sample Page Views Per Session (Visit)} 6 | \usage{ 7 | sample_page_views_per_session(input_df, priors, n_samples = 50000) 8 | } 9 | \arguments{ 10 | \item{input_df}{Dataframe containing option_name (str), 11 | sum_sessions (dbl), and sum_page_views_per_session (dbl).} 12 | 13 | \item{priors}{Optional list of priors k0 and theta0. 14 | Default \eqn{Gamma(1, 250)} will be use otherwise. 15 | \eqn{Gamma(1, 1)} might also be a good choice for this distribution 16 | if you only have a few page views per session.} 17 | 18 | \item{n_samples}{Optional integer value. Defaults to 50,000 samples.} 19 | } 20 | \value{ 21 | input_df with 2 new nested columns `gamma_params` and `samples` 22 | } 23 | \description{ 24 | Adds 2 new nested columns to the input_df: `gamma_params` and `samples` 25 | `gamma_params` in each row should be a tibble of length 2 (\eqn{k} 26 | and \eqn{\theta} parameters) 27 | `samples` in each row should be a tibble of length `n_samples` 28 | } 29 | \details{ 30 | See update_rules vignette for a mathematical representation. 31 | \deqn{page_views_i ~ Poisson(\lambda)} 32 | \deqn{\lambda ~ Gamma(k, \theta)} 33 | Page Views Per Visit is sampled from a Gamma distribution with a Poisson likelihood 34 | of an individual having n page views by the end of their session. 35 | 36 | This is not always the case, so verify your data follows the shape of 37 | an Poisson distribution before using this. 38 | } 39 | -------------------------------------------------------------------------------- /man/estimate_lift_vs_baseline.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/estimate_lift_vs_baseline.R 3 | \name{estimate_lift_vs_baseline} 4 | \alias{estimate_lift_vs_baseline} 5 | \title{Estimate Lift vs Baseline} 6 | \usage{ 7 | estimate_lift_vs_baseline( 8 | input_df, 9 | distribution, 10 | priors = list(), 11 | wrt_option, 12 | metric = "lift", 13 | threshold = 0.7 14 | ) 15 | } 16 | \arguments{ 17 | \item{input_df}{Dataframe containing option_name (str) and various other columns 18 | depending on the distribution type. See vignette for more details.} 19 | 20 | \item{distribution}{String of the distribution name} 21 | 22 | \item{priors}{Optional list of priors. Defaults will be use otherwise.} 23 | 24 | \item{wrt_option}{string the option loss is calculated with respect to (wrt). Required.} 25 | 26 | \item{metric}{string the type of loss. 27 | absolute will be the difference, on the outcome scale. 0 when best = wrt_option 28 | lift will be the (best - wrt_option) / wrt_option, 0 when best = wrt_option 29 | relative_risk will be the ratio best/wrt_option, 1 when best = wrt_option} 30 | 31 | \item{threshold}{Lift percentage threshold between 0 and 1. (0.7 32 | threshold is "at least 70\% lift"). Defaults to 0.7.} 33 | } 34 | \value{ 35 | numeric value remaining at the specified threshold 36 | } 37 | \description{ 38 | Estimate Lift vs Baseline 39 | } 40 | \examples{ 41 | input_df <- tibble::tibble(option_name = c("A", "B", "C"), 42 | sum_clicks = c(1000, 1000, 1000), 43 | sum_conversions = c(100, 120, 110)) 44 | estimate_lift_vs_baseline(input_df, distribution = "conversion_rate", wrt_option = "A") 45 | 46 | } 47 | -------------------------------------------------------------------------------- /tests/testthat/test-find_best_option.R: -------------------------------------------------------------------------------- 1 | context("Find Best Option") 2 | 3 | test_that("find_best_option returns the correct best option when max is best", { 4 | expected_output <- "B" 5 | input_df <- tibble::tibble( 6 | option_name = c("A", "B", "C"), 7 | sum_clicks = c(1000, 1000, 1000), 8 | sum_conversions = c(100, 120, 110) 9 | ) 10 | posterior_samples <- sample_from_posterior(input_df, "conversion_rate") 11 | output <- find_best_option(posterior_samples = posterior_samples, 12 | distribution = "conversion_rate") 13 | expect_equal(output, expected_output) 14 | }) 15 | 16 | test_that("find_best_option returns the correct best option when min is best", { 17 | expected_output <- "C" 18 | input_df <- tibble::tibble( 19 | option_name = c("A", "B", "C"), 20 | sum_clicks = c(1000, 1000, 1000), 21 | sum_conversions = c(100, 120, 110), 22 | sum_cost = c(150, 200, 100), 23 | ) 24 | posterior_samples <- sample_from_posterior(input_df, "cpa") 25 | output <- find_best_option(posterior_samples = posterior_samples, 26 | distribution = "cpa") 27 | expect_equal(output, expected_output) 28 | }) 29 | 30 | 31 | test_that("find_best_option returns when 2 options are equal", { 32 | expected_output <- c("B","C") 33 | input_df <- tibble::tibble( 34 | option_name = c("A", "B", "C"), 35 | sum_clicks = c(1000, 1000, 1000), 36 | sum_conversions = c(100, 120, 120), 37 | ) 38 | posterior_samples <- sample_from_posterior(input_df, "conversion_rate") 39 | output <- find_best_option(posterior_samples = posterior_samples, 40 | distribution = "conversion_rate") 41 | expect_true(all(output %in% expected_output)) 42 | }) 43 | -------------------------------------------------------------------------------- /tests/testthat/test-impute_missing_options.R: -------------------------------------------------------------------------------- 1 | context("Impute Missing Options") 2 | 3 | test_that("impute_missing_options adds a row if it doesn't exist", { 4 | expected_output <- tibble::tibble(option_name = c("B", "C", "A"), win_prob_raw = c(0.4, 0.6, 0.0)) 5 | post_sample_example <- tibble::tibble(option_name = c("A", "B", "C")) 6 | wp_raw <- tibble::tibble(option_name = c("B", "C"), 7 | win_prob_raw = c(0.4, 0.6)) 8 | output <- impute_missing_options(posterior_samples = post_sample_example, 9 | wp_raw = wp_raw) 10 | expect_true(is.data.frame(output)) 11 | expect_equal(output, expected_output) 12 | }) 13 | 14 | test_that("impute_missing_options adds multiple rows if they don't exist", { 15 | expected_output <- tibble::tibble(option_name = c("C", "A", "B"), win_prob_raw = c(1.0, 0.0, 0.0)) 16 | post_sample_example <- tibble::tibble(option_name = c("A", "B", "C")) 17 | wp_raw <- tibble::tibble(option_name = c("C"), 18 | win_prob_raw = c(1.0)) 19 | output <- impute_missing_options(posterior_samples = post_sample_example, 20 | wp_raw = wp_raw) 21 | expect_true(is.data.frame(output)) 22 | expect_equal(output, expected_output) 23 | }) 24 | 25 | test_that("impute_missing_options doesn't add anything if all exist", { 26 | expected_output <- tibble::tibble(option_name = c("A", "B", "C"), win_prob_raw = c(0.3, 0.4, 0.3)) 27 | post_sample_example <- tibble::tibble(option_name = c("A", "B", "C")) 28 | output <- impute_missing_options(posterior_samples = post_sample_example, 29 | wp_raw = expected_output) 30 | expect_true(is.data.frame(output)) 31 | expect_equal(output, expected_output) 32 | }) 33 | 34 | -------------------------------------------------------------------------------- /tests/testthat/test-estimate_win_prob_vs_baseline_given_posterior.R: -------------------------------------------------------------------------------- 1 | context("Estimate Win Probability vs Baseline Given Posterior") 2 | 3 | test_that("estimate_win_prob_vs_baseline_given_posterior returns only 2 options", { 4 | input_df <- tibble::tibble( 5 | option_name = c("A", "B", "C"), 6 | sum_clicks = c(1000, 1000, 1000), 7 | sum_conversions = c(100, 120, 110) 8 | ) 9 | posterior_samples <- sample_from_posterior(input_df, "conversion_rate", priors = list()) 10 | all_option_names <- unique(posterior_samples$option_name) 11 | output <- estimate_win_prob_vs_baseline_given_posterior(posterior_samples = posterior_samples, 12 | "conversion_rate", 13 | "A") 14 | # Subset exists 15 | expect_true(all(output$option_name %in% all_option_names)) 16 | expect_true(nrow(output) == 2) 17 | }) 18 | 19 | 20 | test_that("estimate_win_prob_vs_baseline_given_posterior handles when wrt_option is the best", { 21 | # Can't be better than yourself 22 | input_df <- tibble::tibble( 23 | option_name = c("A", "B", "C"), 24 | sum_clicks = c(1000, 1000, 1000), 25 | sum_conversions = c(100, 120, 110) 26 | ) 27 | posterior_samples <- sample_from_posterior(input_df, "conversion_rate", priors = list()) 28 | all_option_names <- unique(posterior_samples$option_name) 29 | output <- estimate_win_prob_vs_baseline_given_posterior(posterior_samples = posterior_samples, 30 | distribution = "conversion_rate", 31 | wrt_option = "B") 32 | expect_true(all(output$option_name %in% all_option_names)) 33 | expect_true(nrow(output) == 1) 34 | }) 35 | -------------------------------------------------------------------------------- /man/sample_cpa.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sample_cpa.R 3 | \name{sample_cpa} 4 | \alias{sample_cpa} 5 | \title{Sample Cost Per Activation (CPA)} 6 | \usage{ 7 | sample_cpa(input_df, priors, n_samples = 50000) 8 | } 9 | \arguments{ 10 | \item{input_df}{Dataframe containing option_name (str), sum_conversions (dbl), 11 | sum_cost (dbl), and sum_clicks (dbl).} 12 | 13 | \item{priors}{Optional list of priors {alpha0, beta0} for Beta and {k0, theta0} 14 | for Gamma. 15 | Default \eqn{Beta(1,1)} and \eqn{Gamma(1, 250)} will be use otherwise.} 16 | 17 | \item{n_samples}{Optional integer value. Defaults to 50,000 samples.} 18 | } 19 | \value{ 20 | input_df with 3 new nested columns `beta_params`, `gamma_params`, and `samples` 21 | } 22 | \description{ 23 | Adds 3 new nested columns to the input_df: `beta_params`, `gamma_params`, and `samples` 24 | `beta_params` and `gamma_params` in each row should be a tibble of length 2 (\eqn{\alpha} 25 | and \eqn{\beta} parameters and \eqn{k} and \eqn{\theta} parameters) 26 | `samples` in each row should be a tibble of length `n_samples` 27 | } 28 | \details{ 29 | See update_rules vignette for a mathematical representation. 30 | This is a combination of a Beta-Bernoulli update and a Gamma-Exponential update. 31 | 32 | \deqn{conversion_i ~ Bernoulli(\phi)} 33 | \deqn{cpc_i ~ Exponential(\lambda)} 34 | \deqn{\phi ~ Beta(\alpha, \beta)} 35 | \deqn{\lambda ~ Gamma(k, \theta)} 36 | 37 | \deqn{cpa_i ~ 1/ (Bernoulli(\phi) * Exponential(\lambda))} 38 | \deqn{averageCPA ~ 1/(\phi\lambda)} 39 | 40 | Conversion Rate is sampled from a Beta distribution with a Binomial likelihood 41 | of an individual converting. 42 | 43 | Average CPC is sampled from a Gamma distribution with an Exponential likelihood 44 | of an individual cost. 45 | } 46 | -------------------------------------------------------------------------------- /tests/testthat/test-is_prior_valid.R: -------------------------------------------------------------------------------- 1 | context("Is Prior Valid") 2 | 3 | test_that("is_prior_valid returns TRUE when valid value exists.", { 4 | example_prior_list <- list(x = 1, y = 10, z = 15) 5 | output <- is_prior_valid(priors_list = example_prior_list, valid_prior = "y") 6 | testthat::expect_equal(output, expected = TRUE) 7 | }) 8 | 9 | test_that("is_prior_valid returns TRUE when valid value exists and list is length 1.", { 10 | example_prior_list <- list(y = 10) 11 | output <- is_prior_valid(priors_list = example_prior_list, valid_prior = "y") 12 | testthat::expect_equal(output, expected = TRUE) 13 | }) 14 | 15 | test_that("is_prior_valid returns FALSE when empty list is passed in.", { 16 | example_prior_list <- list() 17 | output <- is_prior_valid(priors_list = example_prior_list, valid_prior = "y") 18 | testthat::expect_equal(output, expected = FALSE) 19 | }) 20 | 21 | test_that("is_prior_valid returns FALSE when valid value is not in list.", { 22 | example_prior_list <- list(x = 1, y = 10, z = 15) 23 | expect_warning({ 24 | output <- is_prior_valid(priors_list = example_prior_list, valid_prior = "a") 25 | }) 26 | testthat::expect_equal(output, expected = FALSE) 27 | }) 28 | 29 | test_that("is_prior_valid returns FALSE when negative value is in list.", { 30 | example_prior_list <- list(x = 1, y = -8, z = 15) 31 | expect_warning({ 32 | output <- is_prior_valid(priors_list = example_prior_list, valid_prior = "y") 33 | }) 34 | testthat::expect_equal(output, expected = FALSE) 35 | }) 36 | 37 | test_that("is_prior_valid returns FALSE value in list is 0.", { 38 | example_prior_list <- list(x = 1, y = 0, z = 15) 39 | expect_warning({ 40 | output <- is_prior_valid(priors_list = example_prior_list, valid_prior = "y") 41 | }) 42 | testthat::expect_equal(output, expected = FALSE) 43 | }) 44 | -------------------------------------------------------------------------------- /R/sample_cpc.R: -------------------------------------------------------------------------------- 1 | #' Sample Cost Per Click 2 | #' 3 | #' Adds 2 new nested columns to the input_df: `gamma_params` and `samples` 4 | #' `gamma_params` in each row should be a tibble of length 2 (\eqn{k} 5 | #' and \eqn{\theta} parameters) 6 | #' `samples` in each row should be a tibble of length `n_samples` 7 | #' 8 | #' See update_rules vignette for a mathematical representation. 9 | #' \deqn{cpc_i ~ Exponential(\lambda)} 10 | #' \deqn{\lambda ~ Gamma(k, \theta)} 11 | #' Average CPC is sampled from a Gamma distribution with an Exponential likelihood 12 | #' of an individual cost. 13 | #' 14 | #' @param input_df Dataframe containing option_name (str), sum_clicks (dbl), sum_cost (dbl). 15 | #' @param priors Optional list of priors {k0, theta0} for Gamma. 16 | #' Default \eqn{Gamma(1, 250)} will be use otherwise. 17 | #' @param n_samples Optional integer value. Defaults to 50,000 samples. 18 | #' 19 | #' @importFrom purrr map map2 20 | #' @importFrom dplyr mutate %>% 21 | #' @importFrom stats rgamma 22 | #' @importFrom rlang .data 23 | #' 24 | #' @return input_df with 2 new nested columns `gamma_params` and `samples` 25 | #' 26 | sample_cpc <- function(input_df, priors, n_samples = 5e4){ 27 | input_df %>% 28 | dplyr::mutate( 29 | gamma_params = purrr::map2(.x = .data$sum_clicks, 30 | .y = .data$sum_cost, 31 | ~ update_gamma(k = .x, 32 | theta = .y, 33 | priors = priors) 34 | ), 35 | samples = purrr::map(.x = .data$gamma_params, 36 | ~ 1 / stats::rgamma(n_samples, 37 | shape = .x$k, 38 | scale = .x$theta) 39 | ) 40 | ) 41 | } 42 | -------------------------------------------------------------------------------- /R/validate_input_df.R: -------------------------------------------------------------------------------- 1 | #' Validate Input DataFrame 2 | #' 3 | #' Validates the input dataframe has the correct type, correct required column names, 4 | #' that the distribution is valid, that the column types are correct, and that the 5 | #' column values are greater than or equal to 0 when they are numeric. 6 | #' 7 | #' @param input_df Dataframe containing option_name (str) and various other columns 8 | #' depending on the distribution type. See vignette for more details. 9 | #' @param distribution String of the distribution name 10 | #' 11 | #' @return Bool TRUE if all checks pass. 12 | #' 13 | #' @importFrom dplyr select select_if filter %>% 14 | #' @importFrom purrr walk 15 | #' @importFrom rlang .data 16 | #' @export 17 | #' 18 | #' @examples 19 | #' input_df <- tibble::tibble( 20 | #' option_name = c("A", "B"), 21 | #' sum_clicks = c(1000, 1000), 22 | #' sum_conversions = c(100, 120) 23 | #' ) 24 | #' validate_input_df(input_df, "conversion_rate") 25 | #' 26 | validate_input_df <- function(input_df, distribution){ 27 | if(!is.data.frame(input_df)){ 28 | stop("input_df is not of type data frame.") 29 | } 30 | 31 | valid_distribution_names <- unique(distribution_column_mapping$distribution_type) 32 | if(!distribution %in% valid_distribution_names){ 33 | comma_sep_names<- paste(valid_distribution_names, collapse = ", ") 34 | stop(paste(distribution), 35 | "is an invalid distribution. Select a distribution from the following:", 36 | comma_sep_names) 37 | } 38 | 39 | required_column_names <- distribution_column_mapping %>% 40 | dplyr::filter(.data$distribution_type == distribution) %>% 41 | dplyr::select(-"distribution_type") %>% 42 | dplyr::select_if(~ sum(.) == 1) %>% 43 | colnames() 44 | 45 | purrr::walk(required_column_names, ~validate_input_column(.x, input_df)) 46 | TRUE 47 | } 48 | 49 | -------------------------------------------------------------------------------- /tests/testthat/test-update_gamma.R: -------------------------------------------------------------------------------- 1 | context("Update Gamma") 2 | 3 | test_that("update_gamma updates with priors", { 4 | expected_output <- tibble::tibble(k = 3, theta = 1000/(1 + 1000 * 100)) 5 | output <- update_gamma(k = 1, theta = 100, priors = list(k0 = 2, theta0 = 1000)) 6 | testthat::expect_equal(output, expected_output) 7 | }) 8 | 9 | test_that("update_gamma updates without priors", { 10 | expected_output <- tibble::tibble(k = 11, theta = 250/(1 + 250 * 200)) 11 | output <- update_gamma(k = 10, theta = 200) 12 | testthat::expect_equal(output, expected_output) 13 | }) 14 | 15 | test_that("update_gamma updates but warns with invalid priors", { 16 | expected_output <- tibble::tibble(k = 11, theta = 250/(1 + 250 * 200)) 17 | invalid_priors <- list(k0 = -2, theta0 = 1000) 18 | testthat::expect_warning({ 19 | output <- update_gamma(k = 10, theta = 200, priors = invalid_priors) 20 | }) 21 | testthat::expect_equal(output, expected_output) 22 | }) 23 | 24 | test_that("update_gamma updates with alternate priors", { 25 | expected_output <- tibble::tibble(k = 3, theta = 1000/(1 + 1000 * 100)) 26 | output <- update_gamma(k = 1, theta = 100, priors = list(k01 = 2, theta01 = 1000), alternate_priors = TRUE) 27 | testthat::expect_equal(output, expected_output) 28 | }) 29 | 30 | test_that("update_gamma uses default priors with only alternate priors when set to FALSE", { 31 | expected_output <- tibble::tibble(k = 2, theta = 250/(1 + 250 * 100)) 32 | testthat::expect_warning(output <- update_gamma(k = 1, 33 | theta = 100, 34 | priors = list(k01 = 2, theta01 = 1000), 35 | alternate_priors = FALSE) 36 | ) 37 | testthat::expect_equal(output, expected_output) 38 | }) 39 | -------------------------------------------------------------------------------- /R/sample_conv_rate.R: -------------------------------------------------------------------------------- 1 | #' Sample Conversion Rate 2 | #' 3 | #' Adds 2 new nested columns to the input_df: `beta_params` and `samples` 4 | #' `beta_params` in each row should be a tibble of length 2 (\eqn{\alpha} 5 | #' and \eqn{\beta} parameters) 6 | #' `samples` in each row should be a tibble of length `n_samples` 7 | #' 8 | #' See update_rules vignette for a mathematical representation. 9 | #' \deqn{conversion_i ~ Bernoulli(\phi)} 10 | #' \deqn{\phi ~ Beta(\alpha, \beta)} 11 | #' Conversion Rate is sampled from a Beta distribution with a Binomial likelihood 12 | #' of an individual converting. 13 | #' 14 | #' @param input_df Dataframe containing option_name (str), 15 | #' sum_conversions (dbl), and sum_clicks (dbl). 16 | #' @param priors Optional list of priors alpha0 and beta0. 17 | #' Default \eqn{Beta(1,1)} will be use otherwise. 18 | #' @param n_samples Optional integer value. Defaults to 50,000 samples. 19 | #' 20 | #' @importFrom purrr map map2 21 | #' @importFrom dplyr mutate %>% 22 | #' @importFrom rlang .data 23 | #' 24 | #' @return input_df with 2 new nested columns `beta_params` and `samples` 25 | #' 26 | sample_conv_rate <- function(input_df, priors, n_samples = 5e4){ 27 | input_df %>% 28 | dplyr::mutate( 29 | beta_params = purrr::map2(.x = .data$sum_conversions, 30 | .y = .data$sum_clicks, 31 | ~ update_beta(alpha = .x, 32 | beta = .y - .x, 33 | priors = priors) 34 | ), 35 | samples = purrr::map(.x = .data$beta_params, 36 | ~ rbeta(n_samples, 37 | shape1 = .x$alpha, 38 | shape2 = .x$beta) 39 | ) 40 | ) 41 | } 42 | -------------------------------------------------------------------------------- /R/estimate_win_prob_given_posterior.R: -------------------------------------------------------------------------------- 1 | #' Estimate Win Probability Given Posterior Distribution 2 | #' 3 | #' @param posterior_samples Tibble of data in long form with 2 columns 4 | #' `option_name` and `samples` 5 | #' @param winner_is_max Boolean. This should almost always be TRUE. If a larger number is better 6 | #' then this should be TRUE. This should be FALSE for metrics such as CPA or CPC where a higher cost 7 | #' is not necessarily better. 8 | #' 9 | #' @return Tibble of each option_name and the win probability expressed as a percentage and a decimal `raw` 10 | #' @export 11 | #' @importFrom dplyr %>% group_by filter summarise mutate arrange desc 12 | #' 13 | #' @examples 14 | #' # Requires posterior_samples dataframe. See `sample_from_posterior()` 15 | #' # for an example. 16 | #' \dontrun{ 17 | #' estimate_win_prob_given_posterior(posterior_samples = posterior_samples) 18 | #' estimate_win_prob_given_posterior( 19 | #' posterior_samples = posterior_samples, 20 | #' winner_is_max = TRUE 21 | #' ) 22 | #' } 23 | #' 24 | estimate_win_prob_given_posterior <- function(posterior_samples, winner_is_max = TRUE){ 25 | validate_posterior_samples(posterior_samples) 26 | n_unique_options <- length(unique(posterior_samples$option_name)) 27 | samples_per_option <- nrow(posterior_samples)/n_unique_options 28 | wp_raw <- posterior_samples %>% 29 | dplyr::group_by(.data$sample_id) %>% 30 | dplyr::filter(.data$samples == if(winner_is_max) max(.data$samples) else min(.data$samples)) %>% 31 | dplyr::group_by(.data$option_name) %>% 32 | dplyr::summarise(win_prob_raw = dplyr::n()/samples_per_option) 33 | 34 | wp_raw_imputed <- impute_missing_options(posterior_samples, wp_raw) 35 | 36 | wp_raw_imputed %>% 37 | dplyr::mutate(win_prob = paste0(round(.data$win_prob_raw * 100, 2), "%")) %>% 38 | dplyr::arrange(dplyr::desc(.data$win_prob_raw)) 39 | } 40 | -------------------------------------------------------------------------------- /R/estimate_lift_vs_baseline.R: -------------------------------------------------------------------------------- 1 | #' Estimate Lift vs Baseline 2 | #' 3 | #' @param input_df Dataframe containing option_name (str) and various other columns 4 | #' depending on the distribution type. See vignette for more details. 5 | #' @param distribution String of the distribution name 6 | #' @param priors Optional list of priors. Defaults will be use otherwise. 7 | #' @param wrt_option string the option loss is calculated with respect to (wrt). Required. 8 | #' @param metric string the type of loss. 9 | #' absolute will be the difference, on the outcome scale. 0 when best = wrt_option 10 | #' lift will be the (best - wrt_option) / wrt_option, 0 when best = wrt_option 11 | #' relative_risk will be the ratio best/wrt_option, 1 when best = wrt_option 12 | #' @param threshold Lift percentage threshold between 0 and 1. (0.7 13 | #' threshold is "at least 70\% lift"). Defaults to 0.7. 14 | #' 15 | #' @return numeric value remaining at the specified threshold 16 | #' @export 17 | #' 18 | #' @importFrom stats quantile 19 | #' 20 | #' @examples 21 | #' input_df <- tibble::tibble(option_name = c("A", "B", "C"), 22 | #' sum_clicks = c(1000, 1000, 1000), 23 | #' sum_conversions = c(100, 120, 110)) 24 | #' estimate_lift_vs_baseline(input_df, distribution = "conversion_rate", wrt_option = "A") 25 | #' 26 | estimate_lift_vs_baseline <- function(input_df, distribution, priors = list(), 27 | wrt_option, metric = "lift", threshold = 0.7){ 28 | validate_input_df(input_df, distribution) 29 | 30 | # Sample from posterior distribution 31 | posterior_samples <- sample_from_posterior(input_df, distribution, priors) 32 | 33 | # Calculate Lift Distribution 34 | estimate_lift(posterior_samples = posterior_samples, 35 | distribution = distribution, 36 | wrt_option = wrt_option, 37 | metric = metric) %>% 38 | # Select a single point 39 | stats::quantile(probs = 1 - threshold, na.rm = TRUE) 40 | } 41 | -------------------------------------------------------------------------------- /man/estimate_value_remaining.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/estimate_value_remaining.R 3 | \name{estimate_value_remaining} 4 | \alias{estimate_value_remaining} 5 | \title{Estimate Value Remaining} 6 | \usage{ 7 | estimate_value_remaining( 8 | input_df, 9 | distribution, 10 | priors = list(), 11 | wrt_option = NULL, 12 | metric = "lift", 13 | threshold = 0.95 14 | ) 15 | } 16 | \arguments{ 17 | \item{input_df}{Dataframe containing option_name (str) and various other columns 18 | depending on the distribution type. See vignette for more details.} 19 | 20 | \item{distribution}{String of the distribution name} 21 | 22 | \item{priors}{Optional list of priors. Defaults will be use otherwise.} 23 | 24 | \item{wrt_option}{string the option loss is calculated with respect to (wrt). If NULL, the best option will be chosen.} 25 | 26 | \item{metric}{string the type of loss. 27 | absolute will be the difference, on the outcome scale. 0 when best = wrt_option 28 | lift will be the (best - wrt_option) / wrt_option, 0 when best = wrt_option 29 | relative_risk will be the ratio best/wrt_option, 1 when best = wrt_option} 30 | 31 | \item{threshold}{The confidence interval specifying what the "worst case scenario should be. 32 | Defaults to 95\%. (optional)} 33 | } 34 | \value{ 35 | numeric value remaining at the specified threshold 36 | } 37 | \description{ 38 | Estimates value remaining or loss (in terms of percent lift, absolute, or relative). 39 | } 40 | \examples{ 41 | input_df <- tibble::tibble(option_name = c("A", "B", "C"), 42 | sum_clicks = c(1000, 1000, 1000), 43 | sum_conversions = c(100, 120, 110)) 44 | estimate_value_remaining(input_df, distribution = "conversion_rate") 45 | estimate_value_remaining(input_df, 46 | distribution = "conversion_rate", 47 | threshold = 0.99) 48 | estimate_value_remaining(input_df, 49 | distribution = "conversion_rate", 50 | wrt_option = "A", 51 | metric = "absolute") 52 | 53 | } 54 | -------------------------------------------------------------------------------- /man/sample_total_cm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sample_total_cm.R 3 | \name{sample_total_cm} 4 | \alias{sample_total_cm} 5 | \title{Sample Total CM (Given Impression Count)} 6 | \usage{ 7 | sample_total_cm(input_df, priors, n_samples = 50000) 8 | } 9 | \arguments{ 10 | \item{input_df}{Dataframe containing option_name (str), 11 | sum_conversions (dbl), sum_revenue (dbl), and sum_clicks (dbl).} 12 | 13 | \item{priors}{Optional list of priors {alpha0, beta0} for Beta, 14 | {k0, theta0} for Gamma Inverse Revenue, and {k01, theta01} for 15 | Gamma Cost (uses alternate priors so they can be different from Revenue). 16 | Default \eqn{Beta(1,1)} and \eqn{Gamma(1, 250)} will be use otherwise.} 17 | 18 | \item{n_samples}{Optional integer value. Defaults to 50,000 samples.} 19 | } 20 | \value{ 21 | input_df with 5 new nested columns `beta_params_conv`, 22 | `beta_params_ctr`, `gamma_params_rev`,`gamma_params_cost`, 23 | and `samples` 24 | } 25 | \description{ 26 | Adds 4 new nested columns to the input_df: `beta_params_ctr`, 27 | `beta_params_conv`,`gamma_params_rev`, `gamma_params_cost` 28 | and `samples`. 29 | } 30 | \details{ 31 | `beta_params` and `gamma_params` in each row should be a tibble of length 2 32 | (\eqn{\alpha} and \eqn{\beta} params and \eqn{k} and \eqn{\theta} params). 33 | `samples` in each row should be a tibble of length `n_samples`. 34 | 35 | One assumption in this model is that sum_impressions is not stochastic. 36 | This assumes that Clicks are stochastically generated from a set number 37 | of Impressions. It does not require that the number of impressions are 38 | equal on either side. Generally this assumption holds true in marketing 39 | tests where traffic is split 50/50 and very little variance is observed 40 | in the number of impressions on either side. 41 | 42 | 43 | See update_rules vignette for a mathematical representation. 44 | 45 | \deqn{TotalCM = Impr * ExpectedCTR * (RevPerOrder * OrdersPerClick - ExpectedCPC)} 46 | } 47 | -------------------------------------------------------------------------------- /R/sample_session_duration.R: -------------------------------------------------------------------------------- 1 | #' Sample Session Duration 2 | #' 3 | #' Adds 2 new nested columns to the input_df: `gamma_params` and `samples` 4 | #' `gamma_params` in each row should be a tibble of length 2 (\eqn{k} 5 | #' and \eqn{\theta} parameters) 6 | #' `samples` in each row should be a tibble of length `n_samples` 7 | #' 8 | #' See update_rules vignette for a mathematical representation. 9 | #' \deqn{duration_i ~ Exponential(\lambda)} 10 | #' \deqn{\lambda ~ Gamma(k, \theta)} 11 | #' Session Duration is sampled from a Gamma distribution with a Exponential likelihood 12 | #' of an individual leaving the site or ending a session at time t. 13 | #' 14 | #' This is not always the case, so verify your data follows the shape of 15 | #' an exponential distribution before using this. 16 | #' 17 | #' @param input_df Dataframe containing option_name (str), 18 | #' sum_sessions (dbl), and sum_duration (dbl). 19 | #' @param priors Optional list of priors k0 and theta0. 20 | #' Default \eqn{Gamma(1, 250)} will be use otherwise. 21 | #' @param n_samples Optional integer value. Defaults to 50,000 samples. 22 | #' 23 | #' @importFrom purrr map map2 24 | #' @importFrom dplyr mutate %>% 25 | #' @importFrom rlang .data 26 | #' 27 | #' @return input_df with 2 new nested columns `gamma_params` and `samples` 28 | #' 29 | sample_session_duration <- function(input_df, priors, n_samples = 5e4){ 30 | input_df %>% 31 | dplyr::mutate( 32 | gamma_params = purrr::map2(.x = .data$sum_sessions, 33 | .y = .data$sum_duration, 34 | ~ update_gamma(k = .x, 35 | theta = .y, 36 | priors = priors) 37 | ), 38 | samples = purrr::map(.x = .data$gamma_params, 39 | ~ 1 / stats::rgamma(n_samples, 40 | shape = .x$k, 41 | scale = .x$theta) 42 | ) 43 | ) 44 | } 45 | -------------------------------------------------------------------------------- /R/update_gamma.R: -------------------------------------------------------------------------------- 1 | #' Update Gamma 2 | #' 3 | #' Updates Gamma Distribution with the Gamma-Exponential 4 | #' conjugate prior update rule. Parameterized by \eqn{k} and \eqn{\theta} (not \eqn{\alpha, \beta}) 5 | #' 6 | #' @param k Double value for \eqn{k} (total revenue generating events). Must be 0 or greater. 7 | #' @param theta Double value for \eqn{\theta} (sum of revenue). Must be 0 or greater. 8 | #' @param priors An optional list object that contains k0 and 9 | #' theta0. Otherwise the function will use \eqn{Gamma(1,250)} as the prior distribution. 10 | #' If a second gamma distribution is used k01 and theta01 can be defined as separate priors 11 | #' when alternate_priors is set to TRUE. 12 | #' @param alternate_priors Boolean Defaults to FALSE. Allows a user to specify alternate 13 | #' prior names so the same prior isn't required when multiple gamma distributions are used. 14 | #' 15 | #' @return A list object that contains `k` and `theta` 16 | #' @export 17 | #' @importFrom tibble tibble 18 | #' @importFrom magrittr set_names 19 | #' 20 | #' @examples 21 | #' update_gamma(k = 1, theta = 100, priors = list(k0 = 2, theta0 = 1000)) 22 | #' update_gamma(k = 10, theta = 200) 23 | #' 24 | update_gamma <- function(k, theta, priors = list(), alternate_priors = FALSE) { 25 | validate_data_values(data_values = list(k = k, theta = theta)) 26 | 27 | # Set Attributes 28 | valid_gamma_params <- if(alternate_priors) c("k01", "theta01") else c("k0", "theta0") 29 | default_gamma_priors <- list(k0 = 1, theta0 = 250) %>% 30 | magrittr::set_names(valid_gamma_params) 31 | 32 | # Validate Priors 33 | validated_priors <- validate_priors(priors = priors, 34 | valid_priors = valid_gamma_params, 35 | default_priors = default_gamma_priors) 36 | 37 | k0 <- if(alternate_priors) validated_priors$k01 else validated_priors$k0 38 | theta0 <- if(alternate_priors) validated_priors$theta01 else validated_priors$theta0 39 | 40 | tibble::tibble(k = k0 + k, theta = theta0/(1 + theta0 * theta)) 41 | } 42 | -------------------------------------------------------------------------------- /man/sample_rev_per_session.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sample_rev_per_session.R 3 | \name{sample_rev_per_session} 4 | \alias{sample_rev_per_session} 5 | \title{Sample Rev Per Session} 6 | \usage{ 7 | sample_rev_per_session(input_df, priors, n_samples = 50000) 8 | } 9 | \arguments{ 10 | \item{input_df}{Dataframe containing option_name (str), 11 | sum_conversions (dbl), sum_revenue (dbl), and sum_clicks (dbl).} 12 | 13 | \item{priors}{Optional list of priors {alpha0, beta0} for Beta 14 | and {k0, theta0} for Gamma. Default \eqn{Beta(1,1)} 15 | and \eqn{Gamma(1, 250)} will be use otherwise.} 16 | 17 | \item{n_samples}{Optional integer value. Defaults to 50,000 samples.} 18 | } 19 | \value{ 20 | input_df with 3 new nested columns `beta_params`, `gamma_params`, and `samples` 21 | } 22 | \description{ 23 | Adds 3 new nested columns to the input_df: `beta_params`, `gamma_params`, and `samples` 24 | `beta_params` and `gamma_params` in each row should be a tibble of length 2 (\eqn{\alpha} 25 | and \eqn{\beta} parameters and \eqn{k} and \eqn{\theta} parameters) 26 | `samples` in each row should be a tibble of length `n_samples` 27 | } 28 | \details{ 29 | See update_rules vignette for a mathematical representation. 30 | 31 | \deqn{RevPerSession = RevPerOrder * OrdersPerClick} 32 | This is a combination of a Beta-Bernoulli update and a Gamma-Exponential update. 33 | 34 | \deqn{conversion_i ~ Bernoulli(\phi)} 35 | \deqn{revenue_i ~ Exponential(\lambda)} 36 | \deqn{\phi ~ Beta(\alpha, \beta)} 37 | \deqn{\lambda ~ Gamma(k, \theta)} 38 | 39 | \deqn{revenue_i ~ Bernoulli(\phi) * Exponential(\lambda)^-1)} 40 | \deqn{Rev Per Session ~ \phi / \lambda} 41 | 42 | Conversion Rate is sampled from a Beta distribution with a Binomial likelihood 43 | of an individual converting. 44 | 45 | Average Rev Per Order is sampled from a Gamma distribution with an Exponential likelihood 46 | of Revenue from an individual order. 47 | This function makes sense to use if there is a distribution of possible revenue values 48 | that can be produced from a single order or conversion. 49 | } 50 | -------------------------------------------------------------------------------- /tests/testthat/test-validate_input_column.R: -------------------------------------------------------------------------------- 1 | context("Validate Input Column") 2 | 3 | test_that("validate_input_column returns nothing with valid data", { 4 | input_df <- tibble::tibble( 5 | option_name = c("A", "B"), 6 | sum_clicks = c(1000, 1000), 7 | sum_conversions = c(100, 120) 8 | ) 9 | expect_invisible(validate_input_column("option_name", input_df)) 10 | expect_invisible(validate_input_column("sum_clicks", input_df)) 11 | expect_invisible(validate_input_column("sum_conversions", input_df)) 12 | }) 13 | 14 | test_that("validate_input_column fails when required column is not in the df", { 15 | input_df <- tibble::tibble( 16 | option_name = c("A", "B"), 17 | sum_clicks = c(1000, 1000), 18 | sum_conversions = c(100, 120) 19 | ) 20 | expect_error(validate_input_column("sum_revenue", input_df)) 21 | }) 22 | 23 | test_that("validate_input_column fails when option_name is not a character string column", { 24 | input_df <- tibble::tibble( 25 | option_name = as.double(c(0, 2)), 26 | sum_clicks = c(1000, 1000), 27 | sum_conversions = c(100, 120) 28 | ) 29 | expect_error(validate_input_column("option_name", input_df)) 30 | }) 31 | 32 | test_that("validate_input_column fails when sum_clicks is not a double column", { 33 | input_df <- tibble::tibble( 34 | option_name = c("A", "B"), 35 | sum_clicks = c("something wrong", "another wrong thing"), 36 | sum_conversions = c(100, 120) 37 | ) 38 | expect_error(validate_input_column("sum_clicks", input_df)) 39 | }) 40 | 41 | test_that("validate_input_column fails when a value is less than 0", { 42 | input_df <- tibble::tibble( 43 | option_name = as.double(c(0, 2)), 44 | sum_clicks = c(1000, 1000), 45 | sum_conversions = c(-2, 120) 46 | ) 47 | expect_error(validate_input_column("sum_conversions", input_df)) 48 | }) 49 | 50 | test_that("validate_input_column returns nothing when a value is equal to 0", { 51 | input_df <- tibble::tibble( 52 | option_name = as.double(c(0, 2)), 53 | sum_clicks = c(1000, 0), 54 | sum_conversions = c(0, 120) 55 | ) 56 | expect_invisible(validate_input_column("sum_conversions", input_df)) 57 | }) 58 | -------------------------------------------------------------------------------- /tests/testthat/test-estimate_win_prob_given_posterior.R: -------------------------------------------------------------------------------- 1 | context("Estimate Win Probability Given Posterior") 2 | 3 | test_that("estimate_win_prob_given_posterior returns correct results", { 4 | input_df <- tibble::tibble( 5 | option_name = c("A", "B", "C"), 6 | sum_clicks = c(1000, 1000, 1000), 7 | sum_conversions = c(100, 120, 110) 8 | ) 9 | posterior_samples <- sample_from_posterior(input_df, "conversion_rate", priors = list()) 10 | all_option_names <- unique(posterior_samples$option_name) 11 | 12 | output <- estimate_win_prob_given_posterior(posterior_samples = posterior_samples, winner_is_max = TRUE) 13 | expected_column_names <- c("option_name", "win_prob_raw", "win_prob") 14 | 15 | expect_true(is.data.frame(output)) 16 | expect_true(all(expected_column_names %in% colnames(output))) 17 | expect_true(all(output$option_name %in% all_option_names)) 18 | expect_true(is.double(output$win_prob_raw)) 19 | expect_true(is.character(output$win_prob)) 20 | expect_equal(nrow(output), length(all_option_names)) 21 | }) 22 | 23 | test_that("estimate_win_prob_given_posterior returns correct results when winner_is_max = FALSE", { 24 | input_df <- tibble::tibble( 25 | option_name = c("A", "B", "C"), 26 | sum_clicks = c(1000, 1000, 1000), 27 | sum_conversions = c(100, 120, 110) 28 | ) 29 | posterior_samples <- sample_from_posterior(input_df, "conversion_rate", priors = list()) 30 | all_option_names <- unique(posterior_samples$option_name) 31 | 32 | output <- estimate_win_prob_given_posterior(posterior_samples = posterior_samples, winner_is_max = FALSE) 33 | expected_column_names <- c("option_name", "win_prob_raw", "win_prob") 34 | win_prob_of_lowest <- as.double(output[output$option_name == "A","win_prob_raw"][1]) 35 | 36 | expect_equal(win_prob_of_lowest, max(output$win_prob_raw)) 37 | 38 | expect_true(is.data.frame(output)) 39 | expect_true(all(expected_column_names %in% colnames(output))) 40 | expect_true(all(output$option_name %in% all_option_names)) 41 | expect_true(is.double(output$win_prob_raw)) 42 | expect_true(is.character(output$win_prob)) 43 | expect_equal(nrow(output), length(all_option_names)) 44 | }) 45 | -------------------------------------------------------------------------------- /R/update_dirichlet.R: -------------------------------------------------------------------------------- 1 | #' Update Dirichlet Distribution 2 | #' 3 | #' This function updates the Dirichlet distribution with the 4 | #' Dirichlet-Multinomial conjugate prior update rule. 5 | #' 6 | #' TODO: This function currently only works in 3 dimensions. 7 | #' Should be extended into N dimensions in the future. Can use ... notation. 8 | #' 9 | #' @param alpha_0 Double value for alpha_0 (count of failures). Must be 0 or greater. 10 | #' @param alpha_1 Double value for alpha_1 (count of successes side 1). Must be 0 or greater. 11 | #' @param alpha_2 Double value for alpha_2 (count of successes side 2). Must be 0 or greater. 12 | #' @param priors An optional list object that contains alpha00, alpha01, and alpha02. 13 | #' Otherwise the function with use \eqn{Dirichlet(1,1,1)} as the prior distribution. 14 | #' 15 | #' @return tibble with columns alpha_0, alpha_1, and alpha_2 16 | #' @export 17 | #' @importFrom tibble tibble 18 | #' 19 | #' @examples 20 | #' update_dirichlet(alpha_0 = 20, alpha_1 = 5, alpha_2 = 2) 21 | #' sample_priors_list <- list(alpha00 = 2, alpha01 = 3, alpha02 = 5) 22 | #' update_dirichlet(alpha_0 = 20, alpha_1 = 5, alpha_2 = 2, priors = sample_priors_list) 23 | #' 24 | update_dirichlet <- function(alpha_0, alpha_1, alpha_2, priors = list()) { 25 | validate_data_values(data_values = list(alpha_0 = alpha_0, # None 26 | alpha_1 = alpha_1, # sum_conversions 27 | alpha_2 = alpha_2)) # sum_conversions_2 28 | 29 | # Set Attributes 30 | valid_dirichlet_params <- c("alpha00", "alpha01", "alpha02") 31 | default_dirichlet_priors <- list(alpha00 = 1, alpha01 = 1, alpha02 = 1) 32 | 33 | # Validate Priors 34 | validated_priors <- validate_priors(priors = priors, 35 | valid_priors = valid_dirichlet_params, 36 | default_priors = default_dirichlet_priors) 37 | alpha00 <- validated_priors$alpha00 38 | alpha01 <- validated_priors$alpha01 39 | alpha02 <- validated_priors$alpha02 40 | 41 | tibble::tibble(alpha_0 = alpha_0 + alpha00, 42 | alpha_1 = alpha_1 + alpha01, 43 | alpha_2 = alpha_2 + alpha02) 44 | } 45 | -------------------------------------------------------------------------------- /R/estimate_value_remaining.R: -------------------------------------------------------------------------------- 1 | #' Estimate Value Remaining 2 | #' 3 | #' Estimates value remaining or loss (in terms of percent lift, absolute, or relative). 4 | #' 5 | #' @param input_df Dataframe containing option_name (str) and various other columns 6 | #' depending on the distribution type. See vignette for more details. 7 | #' @param distribution String of the distribution name 8 | #' @param priors Optional list of priors. Defaults will be use otherwise. 9 | #' @param wrt_option string the option loss is calculated with respect to (wrt). If NULL, the best option will be chosen. 10 | #' @param metric string the type of loss. 11 | #' absolute will be the difference, on the outcome scale. 0 when best = wrt_option 12 | #' lift will be the (best - wrt_option) / wrt_option, 0 when best = wrt_option 13 | #' relative_risk will be the ratio best/wrt_option, 1 when best = wrt_option 14 | #' @param threshold The confidence interval specifying what the "worst case scenario should be. 15 | #' Defaults to 95\%. (optional) 16 | #' 17 | #' @return numeric value remaining at the specified threshold 18 | #' @export 19 | #' @importFrom stats quantile 20 | #' 21 | #' @examples 22 | #' input_df <- tibble::tibble(option_name = c("A", "B", "C"), 23 | #' sum_clicks = c(1000, 1000, 1000), 24 | #' sum_conversions = c(100, 120, 110)) 25 | #' estimate_value_remaining(input_df, distribution = "conversion_rate") 26 | #' estimate_value_remaining(input_df, 27 | #' distribution = "conversion_rate", 28 | #' threshold = 0.99) 29 | #' estimate_value_remaining(input_df, 30 | #' distribution = "conversion_rate", 31 | #' wrt_option = "A", 32 | #' metric = "absolute") 33 | #' 34 | estimate_value_remaining <- function(input_df, distribution, priors = list(), 35 | wrt_option = NULL, metric = "lift", threshold = 0.95){ 36 | validate_input_df(input_df, distribution) 37 | 38 | # Sample from posterior distribution 39 | posterior_samples <- sample_from_posterior(input_df, distribution, priors) 40 | 41 | # Calculate Loss Distribution 42 | estimate_loss(posterior_samples = posterior_samples, 43 | distribution = distribution, 44 | wrt_option = wrt_option, 45 | metric = metric) %>% 46 | stats::quantile(probs = threshold, na.rm = TRUE) 47 | } 48 | -------------------------------------------------------------------------------- /R/sample_page_views_per_session.R: -------------------------------------------------------------------------------- 1 | #' Sample Page Views Per Session (Visit) 2 | #' 3 | #' Adds 2 new nested columns to the input_df: `gamma_params` and `samples` 4 | #' `gamma_params` in each row should be a tibble of length 2 (\eqn{k} 5 | #' and \eqn{\theta} parameters) 6 | #' `samples` in each row should be a tibble of length `n_samples` 7 | #' 8 | #' See update_rules vignette for a mathematical representation. 9 | #' \deqn{page_views_i ~ Poisson(\lambda)} 10 | #' \deqn{\lambda ~ Gamma(k, \theta)} 11 | #' Page Views Per Visit is sampled from a Gamma distribution with a Poisson likelihood 12 | #' of an individual having n page views by the end of their session. 13 | #' 14 | #' This is not always the case, so verify your data follows the shape of 15 | #' an Poisson distribution before using this. 16 | #' 17 | #' @param input_df Dataframe containing option_name (str), 18 | #' sum_sessions (dbl), and sum_page_views_per_session (dbl). 19 | #' @param priors Optional list of priors k0 and theta0. 20 | #' Default \eqn{Gamma(1, 250)} will be use otherwise. 21 | #' \eqn{Gamma(1, 1)} might also be a good choice for this distribution 22 | #' if you only have a few page views per session. 23 | #' @param n_samples Optional integer value. Defaults to 50,000 samples. 24 | #' 25 | #' @importFrom purrr map map2 26 | #' @importFrom dplyr mutate %>% 27 | #' @importFrom rlang .data 28 | #' 29 | #' @return input_df with 2 new nested columns `gamma_params` and `samples` 30 | #' 31 | sample_page_views_per_session <- function(input_df, priors, n_samples = 5e4){ 32 | input_df %>% 33 | dplyr::mutate( 34 | gamma_params = purrr::map2(.x = .data$sum_page_views, 35 | .y = .data$sum_sessions, 36 | ~ update_gamma(k = .x - .y, # Page Views offset since every session starts with 1 PV. Add 1 to likelihood observations 37 | theta = .y, 38 | priors = priors) 39 | ), 40 | samples = purrr::map(.x = .data$gamma_params, 41 | ~ stats::rgamma(n_samples, 42 | shape = .x$k, 43 | scale = .x$theta) + 1 # Adding the offset from above back in 44 | ) 45 | ) 46 | } 47 | -------------------------------------------------------------------------------- /tests/testthat/test-sample_from_posterior.R: -------------------------------------------------------------------------------- 1 | context("Sample From Posterior") 2 | 3 | test_that("sample_from_posterior returns correct dataframe shape: conversion_rate", { 4 | input_df <- tibble::tibble( 5 | option_name = c("A", "B", "C"), 6 | sum_clicks = c(1000, 1000, 1000), 7 | sum_conversions = c(100, 120, 110) 8 | ) 9 | n_samples <- 150 10 | n_unique_options <- length(unique(input_df$option_name)) 11 | expected_col_names <- c("option_name", "samples", "sample_id") 12 | output <- sample_from_posterior(input_df, "conversion_rate", n_samples = n_samples) 13 | expect_true(is.data.frame(output)) 14 | expect_length(output, n_unique_options) 15 | expect_equal(nrow(output), n_unique_options * n_samples) 16 | expect_true(all(colnames(output) == expected_col_names)) 17 | }) 18 | 19 | test_that("sample_from_posterior returns correct dataframe shape for all types", { 20 | input_df_all <- tibble::tibble( 21 | option_name = c("A", "B", "C"), 22 | sum_impressions = c(10000, 9000, 11000), 23 | sum_sessions = c(1000, 1000, 1000), 24 | sum_conversions = c(100, 120, 110), 25 | sum_revenue = c(900, 1200, 1150), 26 | sum_cost = c(10, 50, 30), 27 | sum_conversions_2 = c(10, 8, 20), 28 | sum_revenue_2 = c(10, 16, 15), 29 | sum_duration = c(5000, 3000, 4000), 30 | sum_page_views = c(3000, 2000, 1345) 31 | ) %>% 32 | dplyr::mutate(sum_clicks = sum_sessions) 33 | n_samples <- 150 34 | n_unique_options <- length(unique(input_df_all$option_name)) 35 | expected_col_names <- c("option_name", "samples", "sample_id") 36 | 37 | output <- purrr::map(distribution_column_mapping$distribution_type, 38 | ~ sample_from_posterior(input_df_all, 39 | .x, 40 | n_samples = n_samples)) 41 | 42 | expect_true(purrr::every(output, ~ is.data.frame(.x))) 43 | purrr::walk(output, ~ expect_length(.x, n_unique_options)) 44 | purrr::walk(output, ~ expect_equal(nrow(.x), n_unique_options * n_samples)) 45 | expect_true(purrr::every(output, ~ all(colnames(.x) == expected_col_names))) 46 | }) 47 | 48 | test_that("sample_from_posterior fails when incorrect distribution is input", { 49 | input_df <- tibble::tibble( 50 | option_name = c("A", "B", "C"), 51 | sum_clicks = c(1000, 1000, 1000), 52 | sum_conversions = c(100, 120, 110) 53 | ) 54 | expect_error(sample_from_posterior(input_df, "bad_input", n_samples = n_samples)) 55 | }) 56 | -------------------------------------------------------------------------------- /man/estimate_all_values.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/estimate_all_values.R 3 | \name{estimate_all_values} 4 | \alias{estimate_all_values} 5 | \title{Estimate All Values} 6 | \usage{ 7 | estimate_all_values( 8 | input_df, 9 | distribution, 10 | wrt_option_lift, 11 | priors = list(), 12 | wrt_option_vr = NULL, 13 | loss_threshold = 0.95, 14 | lift_threshold = 0.7, 15 | metric = "lift" 16 | ) 17 | } 18 | \arguments{ 19 | \item{input_df}{Dataframe containing option_name (str) and various other columns 20 | depending on the distribution type. See vignette for more details.} 21 | 22 | \item{distribution}{String of the distribution name} 23 | 24 | \item{wrt_option_lift}{String: the option lift and win probability is calculated 25 | with respect to (wrt). Required.} 26 | 27 | \item{priors}{Optional list of priors. Defaults will be use otherwise.} 28 | 29 | \item{wrt_option_vr}{String: the option against which loss (value remaining) 30 | is calculated. If NULL the best option will be used. (optional)} 31 | 32 | \item{loss_threshold}{The confidence interval specifying what the "worst case scenario" should be. 33 | Defaults to 95\%. (optional)} 34 | 35 | \item{lift_threshold}{The confidence interval specifying how likely the lift is to be true. 36 | Defaults to 70\%. (optional)} 37 | 38 | \item{metric}{string the type of loss. 39 | absolute will be the difference, on the outcome scale. 0 when best = wrt_option 40 | lift will be the (best - wrt_option) / wrt_option, 0 when best = wrt_option 41 | relative_risk will be the ratio best/wrt_option, 1 when best = wrt_option} 42 | } 43 | \value{ 44 | A list with 4 named items: Win Probability, Value Remaining, 45 | Lift vs Baseline, and Win Probability vs Baseline. 46 | } 47 | \description{ 48 | Efficiently estimates all values at once so the posterior only need to be 49 | sampled one time. This function will return as a list win probability, 50 | value remaining, estimated percent lift with respect to the provided option, 51 | and the win probability of the best option vs the provided option. 52 | } 53 | \details{ 54 | TODO: Add high density credible intervals to this output for each option. 55 | } 56 | \examples{ 57 | \dontrun{ 58 | input_df <- data.frame(option_name = c("A", "B", "C"), 59 | sum_clicks = c(1000, 1000, 1000), 60 | sum_conversions = c(100, 120, 110), stringsAsFactors = FALSE) 61 | estimate_all_values(input_df, distribution = "conversion_rate", wrt_option_lift = "A") 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /tests/testthat/test-estimate_loss.R: -------------------------------------------------------------------------------- 1 | context("Estimate Loss Distribution") 2 | 3 | test_that("estimate_loss returns double vector the same length as the rows in posterior_samples input", { 4 | distribution_type <- "conversion_rate" 5 | input_df <- tibble::tibble( 6 | option_name = c("A", "B", "C"), 7 | sum_clicks = c(1000, 1000, 1000), 8 | sum_conversions = c(100, 120, 110) 9 | ) 10 | posterior_samples <- sample_from_posterior(input_df, distribution_type, priors = list()) 11 | count_unique_options <- length(unique(posterior_samples$option_name)) 12 | posterior_samples_rows <- nrow(posterior_samples)/count_unique_options 13 | 14 | output <- estimate_loss(posterior_samples = posterior_samples, 15 | distribution = distribution_type) 16 | expect_length(output, posterior_samples_rows) 17 | expect_true(is.double(output)) 18 | }) 19 | 20 | 21 | test_that("estimate_loss returns double vector when wrt_option is set", { 22 | distribution_type <- "conversion_rate" 23 | input_df <- tibble::tibble( 24 | option_name = c("A", "B", "C"), 25 | sum_clicks = c(1000, 1000, 1000), 26 | sum_conversions = c(100, 120, 110) 27 | ) 28 | posterior_samples <- sample_from_posterior(input_df, distribution_type, priors = list()) 29 | count_unique_options <- length(unique(posterior_samples$option_name)) 30 | posterior_samples_rows <- nrow(posterior_samples)/count_unique_options 31 | 32 | output <- estimate_loss(posterior_samples = posterior_samples, 33 | distribution = distribution_type, 34 | wrt_option = "C") 35 | expect_length(output, posterior_samples_rows) 36 | expect_true(is.double(output)) 37 | }) 38 | 39 | test_that("estimate_loss returns double vector the same length 40 | as the rows in posterior_samples input 41 | when winner_is_min", { 42 | distribution_type <- "cpa" 43 | input_df <- tibble::tibble( 44 | option_name = c("A", "B", "C"), 45 | sum_clicks = c(1000, 1000, 1000), 46 | sum_conversions = c(100, 120, 110), 47 | sum_cost = c(50, 100, 150), 48 | ) 49 | posterior_samples <- sample_from_posterior(input_df, distribution_type, priors = list()) 50 | count_unique_options <- length(unique(posterior_samples$option_name)) 51 | posterior_samples_rows <- nrow(posterior_samples)/count_unique_options 52 | 53 | output <- estimate_loss(posterior_samples = posterior_samples, 54 | distribution = distribution_type) 55 | expect_length(output, posterior_samples_rows) 56 | expect_true(is.double(output)) 57 | }) 58 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # grizbayr 2 | 3 | 4 | [![CRAN status](https://www.r-pkg.org/badges/version/grizbayr)](https://CRAN.R-project.org/package=grizbayr) 5 | [![](https://cranlogs.r-pkg.org/badges/grizbayr)](https://cran.r-project.org/package=grizbayr) 6 | 7 | 8 | ## A Bayesian Inference Package for A|B and Bandit Marketing Tests 9 | 10 | ### Description: 11 | 12 | Uses simple Bayesian conjugate prior update rules to calculate the following metrics for various marketing objectives: 13 | 14 | 1. Win Probability of each option 15 | 2. Value Remaining in the Test 16 | 3. Percent Lift Over the Baseline 17 | 18 | This allows a user to implement Bayesian Inference methods when analyzing the results of a split test or Bandit experiment. 19 | 20 | ## Examples 21 | 22 | See the `intro` vignette for examples to get started. 23 | 24 | ## Marketing objectives supported: 25 | 26 | - Conversion Rate 27 | - Response Rate 28 | - Click Through Rate (CTR) 29 | - Revenue Per Session 30 | - Multi Revenue Per Session 31 | - Cost Per Activation (CPA) 32 | - Total Contribution Margin (CM) 33 | - CM Per Click 34 | - Cost Per Click (CPC) 35 | - Session Duration (seconds) 36 | - Page Views Per Session 37 | 38 | 39 | ## Contributing 40 | 41 | ### New Posterior Distributions 42 | 43 | To add a new posterior distribution you must complete the following: 44 | 45 | 1. Create a new function called `sample_...(input_df, priors, n_samples)`. Use the internal helper functions update_gamma, update_beta, etc. included in this package or you can create a new one. 46 | 1. This function (and the name) must be added to the switch statement in `sample_from_posterior()` 47 | 1. A new row must be added to the internal data object `distribution_column_mapping`. 48 | - Select this object from the package 49 | - Add a new row with a 1 for every column that is required for this distribution (this is for data validation and clear alerting for the end user) 50 | - Save the updated tibble object using `use_data(new_tibble, internal = TRUE, overwrite = TRUE)` and it will be saved as `sysdata.rda` in the package for internal use. 51 | - Update the intro.Rmd markdown table to include which columns are required for your function. 52 | 53 | 1. Create a PR for review. 54 | 55 | ### New Features Ideas (TODO) 56 | 57 | - High Density Credible Intervals with each option 58 | - Conjugate Prior Update Rules vignette deriving each marketing objective `update_rules` 59 | 60 | --- 61 | 62 | #### Package Name 63 | 64 | The name is a play on Bayes with an added r (bayesr). The added griz (or Grizzly Bear) creates a unique name that is searchable due to too many similarly named packages. 65 | -------------------------------------------------------------------------------- /R/estimate_loss.R: -------------------------------------------------------------------------------- 1 | #' Estimate Loss 2 | #' 3 | #' @param posterior_samples Tibble: returned from sample_from_posterior with 3 columns 4 | #' `option_name`, `samples`, and `sample_id`. 5 | #' @param distribution String: the name of the distribution 6 | #' @param wrt_option String: the option loss is calculated with respect to (wrt). If NULL, the best option will be chosen. 7 | #' @param metric String: the type of loss. 8 | #' absolute will be the difference, on the outcome scale. 0 when best = wrt_option 9 | #' lift will be the (best - wrt_option) / wrt_option, 0 when best = wrt_option 10 | #' relative_risk will be the ratio best/wrt_option, 1 when best = wrt_option 11 | #' 12 | #' @return numeric, the loss distribution 13 | #' 14 | #' @importFrom dplyr filter select 15 | #' @importFrom magrittr use_series %>% 16 | #' @importFrom tidyr pivot_wider 17 | #' @importFrom rlang .data 18 | #' 19 | #' @export 20 | #' 21 | #' @examples 22 | #' # Requires posterior_samples dataframe. See `sample_from_posterior()` 23 | #' # for an example. 24 | #' 25 | #' \dontrun{ 26 | #' estimate_loss(posterior_samples = posterior_samples, distribution = "conversion_rate") 27 | #' } 28 | estimate_loss <- function(posterior_samples, distribution, wrt_option = NULL, metric = c("absolute", "lift", "relative_risk")) { 29 | metric <- match.arg(metric) 30 | 31 | # estimate 'best' option if no wrt option is provided 32 | if (is.null(wrt_option)) { 33 | wrt_option <- find_best_option(posterior_samples, distribution) 34 | } else { 35 | validate_wrt_option(wrt_option, posterior_samples) 36 | } 37 | 38 | posterior_samples_wide <- posterior_samples %>% 39 | tidyr::pivot_wider(names_from = "option_name", values_from = "samples") %>% 40 | dplyr::select(-"sample_id") 41 | 42 | theta_star <- posterior_samples_wide[[wrt_option]] 43 | 44 | # Need to Flip Loss Distributions if Lower is Better 45 | if(is_winner_max(distribution)){ 46 | theta_max <- as.matrix(posterior_samples_wide) %>% 47 | apply(1, max) 48 | 49 | loss_distribution <- switch(metric, 50 | absolute = theta_max - theta_star, 51 | lift = (theta_max - theta_star) / theta_star, 52 | relative_risk = theta_max / theta_star 53 | ) 54 | }else{ 55 | theta_min <- as.matrix(posterior_samples_wide) %>% 56 | apply(1, min) 57 | 58 | loss_distribution <- switch(metric, 59 | absolute = theta_star - theta_min, 60 | lift = ( theta_star - theta_min ) / theta_star, 61 | relative_risk = theta_min / theta_star 62 | ) 63 | } 64 | loss_distribution 65 | } 66 | -------------------------------------------------------------------------------- /R/estimate_lift.R: -------------------------------------------------------------------------------- 1 | #' Estimate Lift Distribution 2 | #' 3 | #' Estimates lift distribution vector from posterior samples. 4 | #' 5 | #' @param posterior_samples Tibble returned from sample_from_posterior with 3 columns 6 | #' `option_name`, `samples`, and `sample_id`. 7 | #' @param distribution String of the distribution name 8 | #' @param wrt_option string the option lift is calculated with respect to (wrt). Required. 9 | #' @param metric string the type of lift. 10 | #' `absolute`` will be the difference, on the outcome scale. 0 when best = wrt_option 11 | #' `lift`` will be the (best - wrt_option) / wrt_option, 0 when best = wrt_option 12 | #' `relative_risk`` will be the ratio best/wrt_option, 1 when best = wrt_option 13 | #' 14 | #' @return numeric, the lift distribution 15 | #' @export 16 | #' @importFrom rlang .data 17 | #' @importFrom tidyr pivot_wider 18 | #' @importFrom dplyr select 19 | #' 20 | #' @examples 21 | #' # Requires posterior_samples dataframe. See `sample_from_posterior()` 22 | #' # for an example. 23 | #' 24 | #' \dontrun{ 25 | #' estimate_lift(posterior_samples = posterior_samples, 26 | #' distribution = "conversion_rate", 27 | #' wrt_option = "A", 28 | #' metric = "lift") 29 | #'} 30 | #' 31 | estimate_lift <- function(posterior_samples, distribution, wrt_option, metric = "lift"){ 32 | if(!metric %in% c("lift", "relative_risk", "absolute")){ 33 | stop("Invalid argument. `metric` must be one of `lift`, `absolute`, or `relative_risk`.") 34 | } 35 | validate_wrt_option(wrt_option, posterior_samples) 36 | 37 | best_option <- find_best_option(posterior_samples, distribution) 38 | 39 | # Format samples wider so matrices can be used 40 | posterior_samples_wide <- posterior_samples %>% 41 | tidyr::pivot_wider(names_from = "option_name", values_from = "samples") %>% 42 | dplyr::select(-"sample_id") 43 | 44 | theta_best <- posterior_samples_wide[[best_option]] 45 | theta_control <- posterior_samples_wide[[wrt_option]] 46 | 47 | # Need to Flip Loss Distributions if Lower is Better 48 | if(is_winner_max(distribution)){ 49 | lift_distribution <- switch(metric, 50 | absolute = theta_best - theta_control, 51 | lift = (theta_best - theta_control) / theta_control, 52 | relative_risk = theta_best / theta_control 53 | ) 54 | }else{ 55 | lift_distribution <- switch(metric, 56 | absolute = theta_control - theta_best, 57 | lift = ( theta_control - theta_best ) / theta_control, 58 | relative_risk = theta_best / theta_control 59 | ) 60 | } 61 | lift_distribution 62 | } 63 | -------------------------------------------------------------------------------- /tests/testthat/test-estimate_lift.R: -------------------------------------------------------------------------------- 1 | context("Estimated Lift Distribution") 2 | 3 | test_that("estimate_lift returns double vector the same length as the rows in posterior_samples input", { 4 | distribution_type <- "conversion_rate" 5 | input_df <- tibble::tibble( 6 | option_name = c("A", "B", "C"), 7 | sum_clicks = c(1000, 1000, 1000), 8 | sum_conversions = c(100, 120, 110) 9 | ) 10 | posterior_samples <- sample_from_posterior(input_df, distribution_type, priors = list()) 11 | count_unique_options <- length(unique(posterior_samples$option_name)) 12 | posterior_samples_rows <- nrow(posterior_samples)/count_unique_options 13 | 14 | output <- estimate_lift(posterior_samples = posterior_samples, 15 | distribution = distribution_type, 16 | wrt_option = "A", 17 | metric = "lift") 18 | expect_length(output, posterior_samples_rows) 19 | expect_true(is.double(output)) 20 | }) 21 | 22 | 23 | test_that("estimate_lift returns double vector the same length 24 | as the rows in posterior_samples input when lower is better", { 25 | distribution_type <- "cpa" 26 | input_df <- tibble::tibble( 27 | option_name = c("A", "B", "C"), 28 | sum_clicks = c(1000, 1000, 1000), 29 | sum_conversions = c(100, 120, 110), 30 | sum_cost = c(50, 100, 150), 31 | ) 32 | posterior_samples <- sample_from_posterior(input_df, distribution_type, priors = list()) 33 | count_unique_options <- length(unique(posterior_samples$option_name)) 34 | posterior_samples_rows <- nrow(posterior_samples)/count_unique_options 35 | 36 | output <- estimate_lift(posterior_samples = posterior_samples, 37 | distribution = distribution_type, 38 | wrt_option = "A", 39 | metric = "lift") 40 | expect_length(output, posterior_samples_rows) 41 | expect_true(is.double(output)) 42 | }) 43 | 44 | 45 | test_that("estimate_lift errors when invalid metric is passed", { 46 | distribution_type <- "cpa" 47 | input_df <- tibble::tibble( 48 | option_name = c("A", "B", "C"), 49 | sum_clicks = c(1000, 1000, 1000), 50 | sum_conversions = c(100, 120, 110), 51 | sum_cost = c(50, 100, 150), 52 | ) 53 | posterior_samples <- sample_from_posterior(input_df, distribution_type, priors = list()) 54 | count_unique_options <- length(unique(posterior_samples$option_name)) 55 | posterior_samples_rows <- nrow(posterior_samples)/count_unique_options 56 | 57 | testthat::expect_error({ 58 | estimate_lift(posterior_samples = posterior_samples, 59 | distribution = distribution_type, 60 | wrt_option = "A", 61 | metric = "invalid") 62 | }) 63 | }) 64 | -------------------------------------------------------------------------------- /R/sample_from_posterior.R: -------------------------------------------------------------------------------- 1 | #' Sample From Posterior 2 | #' 3 | #' Selects which function to use to sample from the posterior distribution 4 | #' 5 | #' @param input_df Dataframe containing option_name (str) and various other columns 6 | #' depending on the distribution type. See vignette for more details. 7 | #' @param distribution String of the distribution name 8 | #' @param priors Optional list of priors. Defaults will be use otherwise. 9 | #' @param n_samples Optional integer value. Defaults to 50,000 samples. 10 | #' 11 | #' @return A tibble with 2 columns: option_name (chr) and samples (dbl) [long form data]. 12 | #' @export 13 | #' @importFrom dplyr select mutate row_number %>% 14 | #' @importFrom tidyr unnest 15 | #' @importFrom purrr map 16 | #' @importFrom tibble tibble 17 | #' @importFrom rlang .data 18 | #' 19 | #' @examples 20 | #' input_df <- tibble::tibble( 21 | #' option_name = c("A", "B"), 22 | #' sum_clicks = c(1000, 1000), 23 | #' sum_conversions = c(100, 120), 24 | #' sum_sessions = c(1000, 1000), 25 | #' sum_revenue = c(1000, 1500) 26 | #' ) 27 | #' sample_from_posterior(input_df, "conversion_rate") 28 | #' sample_from_posterior(input_df, "rev_per_session") 29 | #' 30 | sample_from_posterior <- function(input_df, distribution, priors = list(), n_samples = 5e4){ 31 | samples_tibble <- switch(distribution, 32 | "conversion_rate" = sample_conv_rate(input_df, priors, n_samples), 33 | "response_rate" = sample_response_rate(input_df, priors, n_samples), 34 | "ctr" = sample_ctr(input_df, priors, n_samples), 35 | "rev_per_session" = sample_rev_per_session(input_df, priors, n_samples), 36 | "multi_rev_per_session" = sample_multi_rev_per_session(input_df, priors, n_samples), 37 | "cpa" = sample_cpa(input_df, priors, n_samples), 38 | "total_cm" = sample_total_cm(input_df, priors, n_samples), 39 | "cm_per_click" = sample_cm_per_click(input_df, priors, n_samples), 40 | "cpc" = sample_cpc(input_df, priors, n_samples), 41 | "session_duration" = sample_session_duration(input_df, priors, n_samples), 42 | "page_views_per_session" = sample_page_views_per_session(input_df, priors, n_samples), 43 | stop( 44 | paste(distribution, 45 | "is an invalid distribution type. Select from one of the following:", 46 | paste(distribution_column_mapping$distribution_type, collapse = ", ")) 47 | ) 48 | ) 49 | # Clean tibble into expected 2 dimensional output with added sample_id 50 | samples_tibble %>% 51 | dplyr::mutate(samples = purrr::map(.x = .data$samples, 52 | ~ tibble::tibble(samples = .x) %>% 53 | dplyr::mutate(sample_id = dplyr::row_number()) 54 | ) 55 | ) %>% 56 | dplyr::select("option_name", "samples") %>% 57 | tidyr::unnest(cols = "samples") 58 | } 59 | -------------------------------------------------------------------------------- /R/sample_cpa.R: -------------------------------------------------------------------------------- 1 | #' Sample Cost Per Activation (CPA) 2 | #' 3 | #' Adds 3 new nested columns to the input_df: `beta_params`, `gamma_params`, and `samples` 4 | #' `beta_params` and `gamma_params` in each row should be a tibble of length 2 (\eqn{\alpha} 5 | #' and \eqn{\beta} parameters and \eqn{k} and \eqn{\theta} parameters) 6 | #' `samples` in each row should be a tibble of length `n_samples` 7 | #' 8 | #' 9 | #' See update_rules vignette for a mathematical representation. 10 | #' This is a combination of a Beta-Bernoulli update and a Gamma-Exponential update. 11 | #' 12 | #' \deqn{conversion_i ~ Bernoulli(\phi)} 13 | #' \deqn{cpc_i ~ Exponential(\lambda)} 14 | #' \deqn{\phi ~ Beta(\alpha, \beta)} 15 | #' \deqn{\lambda ~ Gamma(k, \theta)} 16 | #' 17 | #' \deqn{cpa_i ~ 1/ (Bernoulli(\phi) * Exponential(\lambda))} 18 | #' \deqn{averageCPA ~ 1/(\phi\lambda)} 19 | #' 20 | #' Conversion Rate is sampled from a Beta distribution with a Binomial likelihood 21 | #' of an individual converting. 22 | #' 23 | #' Average CPC is sampled from a Gamma distribution with an Exponential likelihood 24 | #' of an individual cost. 25 | #' 26 | #' 27 | #' @param input_df Dataframe containing option_name (str), sum_conversions (dbl), 28 | #' sum_cost (dbl), and sum_clicks (dbl). 29 | #' @param priors Optional list of priors {alpha0, beta0} for Beta and {k0, theta0} 30 | #' for Gamma. 31 | #' Default \eqn{Beta(1,1)} and \eqn{Gamma(1, 250)} will be use otherwise. 32 | #' @param n_samples Optional integer value. Defaults to 50,000 samples. 33 | #' 34 | #' @importFrom purrr map2 35 | #' @importFrom dplyr mutate %>% 36 | #' @importFrom stats rgamma rbeta 37 | #' @importFrom rlang .data 38 | #' 39 | #' @return input_df with 3 new nested columns `beta_params`, `gamma_params`, and `samples` 40 | #' 41 | sample_cpa <- function(input_df, priors, n_samples = 5e4){ 42 | input_df %>% 43 | dplyr::mutate( 44 | beta_params = purrr::map2(.x = .data$sum_conversions, 45 | .y = .data$sum_clicks, 46 | ~ update_beta(alpha = .x, 47 | beta = .y - .x, 48 | priors = priors) 49 | ), 50 | gamma_params = purrr::map2(.x = .data$sum_clicks, 51 | .y = .data$sum_cost, 52 | ~ update_gamma(k = .x, 53 | theta = .y, 54 | priors = priors) 55 | ), 56 | samples = purrr::map2(.x = .data$beta_params, 57 | .y = .data$gamma_params, 58 | ~ 1 /( stats::rgamma(n_samples, 59 | shape = .y$k, 60 | scale = .y$theta) * 61 | stats::rbeta(n_samples, 62 | shape1 = .x$alpha, 63 | shape2 = .x$beta) ) 64 | ) 65 | ) 66 | } 67 | -------------------------------------------------------------------------------- /R/sample_rev_per_session.R: -------------------------------------------------------------------------------- 1 | #' Sample Rev Per Session 2 | #' 3 | #' Adds 3 new nested columns to the input_df: `beta_params`, `gamma_params`, and `samples` 4 | #' `beta_params` and `gamma_params` in each row should be a tibble of length 2 (\eqn{\alpha} 5 | #' and \eqn{\beta} parameters and \eqn{k} and \eqn{\theta} parameters) 6 | #' `samples` in each row should be a tibble of length `n_samples` 7 | #' 8 | #' 9 | #' See update_rules vignette for a mathematical representation. 10 | #' 11 | #' \deqn{RevPerSession = RevPerOrder * OrdersPerClick} 12 | #' This is a combination of a Beta-Bernoulli update and a Gamma-Exponential update. 13 | #' 14 | #' \deqn{conversion_i ~ Bernoulli(\phi)} 15 | #' \deqn{revenue_i ~ Exponential(\lambda)} 16 | #' \deqn{\phi ~ Beta(\alpha, \beta)} 17 | #' \deqn{\lambda ~ Gamma(k, \theta)} 18 | #' 19 | #' \deqn{revenue_i ~ Bernoulli(\phi) * Exponential(\lambda)^-1)} 20 | #' \deqn{Rev Per Session ~ \phi / \lambda} 21 | #' 22 | #' Conversion Rate is sampled from a Beta distribution with a Binomial likelihood 23 | #' of an individual converting. 24 | #' 25 | #' Average Rev Per Order is sampled from a Gamma distribution with an Exponential likelihood 26 | #' of Revenue from an individual order. 27 | #' This function makes sense to use if there is a distribution of possible revenue values 28 | #' that can be produced from a single order or conversion. 29 | #' 30 | #' @param input_df Dataframe containing option_name (str), 31 | #' sum_conversions (dbl), sum_revenue (dbl), and sum_clicks (dbl). 32 | #' @param priors Optional list of priors {alpha0, beta0} for Beta 33 | #' and {k0, theta0} for Gamma. Default \eqn{Beta(1,1)} 34 | #' and \eqn{Gamma(1, 250)} will be use otherwise. 35 | #' @param n_samples Optional integer value. Defaults to 50,000 samples. 36 | #' 37 | #' @importFrom purrr map2 38 | #' @importFrom dplyr mutate %>% 39 | #' @importFrom stats rgamma rbeta 40 | #' @importFrom rlang .data 41 | #' 42 | #' @return input_df with 3 new nested columns `beta_params`, `gamma_params`, and `samples` 43 | #' 44 | sample_rev_per_session <- function(input_df, priors, n_samples = 5e4){ 45 | input_df %>% 46 | dplyr::mutate( 47 | beta_params = purrr::map2(.x = .data$sum_conversions, 48 | .y = .data$sum_sessions, 49 | ~ update_beta(alpha = .x, 50 | beta = .y - .x, 51 | priors = priors) 52 | ), 53 | gamma_params = purrr::map2(.x = .data$sum_conversions, 54 | .y = .data$sum_revenue, 55 | ~ update_gamma(k = .x, 56 | theta = .y, 57 | priors = priors) 58 | ), 59 | samples = purrr::map2(.x = .data$beta_params, 60 | .y = .data$gamma_params, 61 | ~ stats::rbeta(n_samples, 62 | shape1 = .x$alpha, 63 | shape2 = .x$beta) / 64 | stats::rgamma(n_samples, 65 | shape = .y$k, 66 | scale = .y$theta) 67 | ) 68 | ) 69 | } 70 | -------------------------------------------------------------------------------- /R/sample_cm_per_click.R: -------------------------------------------------------------------------------- 1 | #' Sample CM Per Click 2 | #' 3 | #' @description 4 | #' Adds 4 new nested columns to the input_df: `beta_params`, 5 | #' `gamma_params_rev`, `gamma_params_cost`and `samples` 6 | #' 7 | #' @details 8 | #'`beta_params` and `gamma_params_rev` in each row should be a 9 | #' tibble of length 2 (\eqn{\alpha} and \eqn{\beta} parameters 10 | #' and \eqn{k} and \eqn{\theta} parameters) 11 | #'`samples` in each row should be a tibble of length `n_samples` 12 | #' 13 | #' See update_rules vignette for a mathematical representation. 14 | #' \deqn{CMPerClick = ConversionsPerClick * RevPerConversion - CostPerClick} 15 | #' 16 | #' @param input_df Dataframe containing option_name (str), sum_conversions (dbl), sum_revenue (dbl), 17 | #' and sum_clicks (dbl). 18 | #' @param priors Optional list of priors {alpha0, beta0} for Beta, {k0, theta0} for Gamma Inverse Revenue, 19 | #' and {k01, theta01} for Gamma Cost (uses alternate priors so they can be different from Revenue). 20 | #' Default \eqn{Beta(1,1)} and \eqn{Gamma(1, 250)} will be use otherwise. 21 | #' @param n_samples Optional integer value. Defaults to 50,000 samples. 22 | #' 23 | #' @importFrom purrr pmap map2 24 | #' @importFrom dplyr mutate %>% 25 | #' @importFrom stats rbeta rgamma 26 | #' @importFrom rlang .data 27 | #' 28 | #' @return input_df with 4 new nested columns `beta_params`, `gamma_params_rev`, 29 | #' `gamma_params_cost`, and `samples` 30 | #' 31 | sample_cm_per_click <- function(input_df, priors, n_samples = 5e4){ 32 | input_df %>% 33 | dplyr::mutate( 34 | beta_params = purrr::map2(.x = .data$sum_conversions, 35 | .y = .data$sum_clicks, 36 | ~ update_beta(alpha = .x, 37 | beta = .y - .x, 38 | priors = priors) 39 | ), 40 | gamma_params_rev = purrr::map2(.x = .data$sum_conversions, 41 | .y = .data$sum_revenue, 42 | ~ update_gamma(k = .x, 43 | theta = .y, 44 | priors = priors) 45 | ), 46 | gamma_params_cost = purrr::map2(.x = .data$sum_clicks, 47 | .y = .data$sum_cost, 48 | ~ update_gamma(k = .x, 49 | theta = .y, 50 | priors = priors, 51 | alternate_priors = TRUE) 52 | ), 53 | samples = purrr::pmap(list(.data$beta_params, 54 | .data$gamma_params_rev, 55 | .data$gamma_params_cost), 56 | ~ ( # Rev Per Click 57 | stats::rbeta(n_samples, 58 | shape1 = ..1$alpha, 59 | shape2 = ..1$beta) / 60 | stats::rgamma(n_samples, 61 | shape = ..2$k, 62 | scale = ..2$theta) ) - 63 | # Minus Variable Cost Per Click 64 | 1 / stats::rgamma(n_samples, 65 | shape = ..3$k, 66 | scale = ..3$theta) 67 | ) 68 | ) 69 | } 70 | -------------------------------------------------------------------------------- /R/estimate_all_values.R: -------------------------------------------------------------------------------- 1 | #' Estimate All Values 2 | #' 3 | #' Efficiently estimates all values at once so the posterior only need to be 4 | #' sampled one time. This function will return as a list win probability, 5 | #' value remaining, estimated percent lift with respect to the provided option, 6 | #' and the win probability of the best option vs the provided option. 7 | #' 8 | #' TODO: Add high density credible intervals to this output for each option. 9 | #' 10 | #' @param input_df Dataframe containing option_name (str) and various other columns 11 | #' depending on the distribution type. See vignette for more details. 12 | #' @param distribution String of the distribution name 13 | #' @param wrt_option_lift String: the option lift and win probability is calculated 14 | #' with respect to (wrt). Required. 15 | #' @param priors Optional list of priors. Defaults will be use otherwise. 16 | #' @param wrt_option_vr String: the option against which loss (value remaining) 17 | #' is calculated. If NULL the best option will be used. (optional) 18 | #' @param loss_threshold The confidence interval specifying what the "worst case scenario" should be. 19 | #' Defaults to 95\%. (optional) 20 | #' @param lift_threshold The confidence interval specifying how likely the lift is to be true. 21 | #' Defaults to 70\%. (optional) 22 | #' @param metric string the type of loss. 23 | #' absolute will be the difference, on the outcome scale. 0 when best = wrt_option 24 | #' lift will be the (best - wrt_option) / wrt_option, 0 when best = wrt_option 25 | #' relative_risk will be the ratio best/wrt_option, 1 when best = wrt_option 26 | #' 27 | #' 28 | #' @return A list with 4 named items: Win Probability, Value Remaining, 29 | #' Lift vs Baseline, and Win Probability vs Baseline. 30 | #' @export 31 | #' 32 | #' @importFrom stats quantile 33 | #' 34 | #' @examples 35 | #' \dontrun{ 36 | #' input_df <- data.frame(option_name = c("A", "B", "C"), 37 | #' sum_clicks = c(1000, 1000, 1000), 38 | #' sum_conversions = c(100, 120, 110), stringsAsFactors = FALSE) 39 | #' estimate_all_values(input_df, distribution = "conversion_rate", wrt_option_lift = "A") 40 | #' } 41 | #' 42 | estimate_all_values <- function(input_df, distribution, wrt_option_lift, priors = list(), 43 | wrt_option_vr = NULL, loss_threshold = 0.95, lift_threshold = 0.7, 44 | metric = "lift"){ 45 | validate_input_df(input_df, distribution) 46 | 47 | # Sample from posterior distribution 48 | posterior_samples <- sample_from_posterior(input_df, distribution, priors) 49 | 50 | # Calculate Win Probability 51 | win_prob <- estimate_win_prob_given_posterior(posterior_samples = posterior_samples, 52 | winner_is_max = is_winner_max(distribution)) 53 | 54 | # Calculate Value Remaining 55 | vr <- estimate_loss(posterior_samples = posterior_samples, 56 | distribution = distribution, 57 | wrt_option = wrt_option_vr, 58 | metric = metric) %>% 59 | stats::quantile(probs = loss_threshold, na.rm = TRUE) 60 | 61 | # Calculate Lift Relative to Baseline 62 | lift <- estimate_lift(posterior_samples = posterior_samples, 63 | distribution = distribution, 64 | wrt_option = wrt_option_lift, 65 | metric = metric) %>% 66 | stats::quantile(probs = 1 - lift_threshold, na.rm = TRUE) 67 | 68 | # Calculate Win Prob vs Baseline 69 | win_prob_vs_base <- estimate_win_prob_vs_baseline_given_posterior(posterior_samples, 70 | distribution, 71 | wrt_option_lift) 72 | list(`Win Probability` = win_prob, 73 | `Value Remaining` = vr, 74 | `Lift vs Baseline` = lift, 75 | `Win Probability vs Baseline` = win_prob_vs_base) 76 | } 77 | -------------------------------------------------------------------------------- /R/sample_multi_rev_per_session.R: -------------------------------------------------------------------------------- 1 | #' Sample Multiple Revenue Per Session 2 | #' 3 | #' Adds 5 new nested columns to the input_df: `dirichlet_params`, 4 | #' `gamma_params_A`, `gamma_params_B`, and `samples`. 5 | #' This samples from multiple revenue per session distributions 6 | #' at once. 7 | #' 8 | #' 9 | #' See update_rules vignette for a mathematical representation. 10 | #' 11 | #' \deqn{conversion_i ~ MultiNomial(\phi_1, \phi_2, ..., \phi_k)} 12 | #' \deqn{\phi_k ~ Dirichlet(\alpha, \beta)} 13 | #' Conversion Rate is sampled from a Dirichlet distribution with a Multinomial likelihood 14 | #' of an individual converting. 15 | #' 16 | #' @param input_df Dataframe containing option_name (str), 17 | #' sum_conversions (dbl), sum_sessions (dbl), sum_revenue (dbl), 18 | #' sum_conversion_2 (dbl), sum_sessions_2 (dbl), sum_revenue_2 (dbl). 19 | #' @param priors Optional list of priors alpha0 and beta0. 20 | #' Default \eqn{Beta(1,1)} will be use otherwise. 21 | #' @param n_samples Optional integer value. Defaults to 50,000 samples. 22 | #' 23 | #' @importFrom purrr map map2 pmap 24 | #' @importFrom dplyr mutate select %>% 25 | #' @importFrom stats rgamma 26 | #' @importFrom rlang .data 27 | #' 28 | #' @return input_df with 4 new nested columns `dirichlet_params`, 29 | #' `gamma_params_A`, `gamma_params_B`, and `samples`. 30 | #' `samples` in each row should be a tibble of length `n_samples`. 31 | #' 32 | sample_multi_rev_per_session <- function(input_df, priors, n_samples = 5e4){ 33 | input_df %>% 34 | dplyr::mutate( 35 | no_clicks = .data$sum_sessions - .data$sum_conversions - .data$sum_conversions_2, 36 | dirichlet_params = purrr::pmap(.l = list(alpha_0 = .data$no_clicks, 37 | alpha_1 = .data$sum_conversions, 38 | alpha_2 = .data$sum_conversions_2), 39 | ~ update_dirichlet(..., 40 | priors = priors) 41 | ), 42 | gamma_params_A = purrr::map2(.x = .data$sum_conversions, 43 | .y = .data$sum_revenue, 44 | ~ update_gamma(k = .x, 45 | theta = .y, 46 | priors = priors) 47 | ), 48 | gamma_params_B = purrr::map2(.x = .data$sum_conversions_2, 49 | .y = .data$sum_revenue_2, 50 | ~ update_gamma(k = .x, 51 | theta = .y, 52 | priors = priors) 53 | ), 54 | dirichlet_samples = purrr::map(.data$dirichlet_params, 55 | ~ rdirichlet(n_samples, alphas_list = .x) 56 | ), 57 | gamma_samples_A = purrr::map(.data$gamma_params_A, 58 | ~ stats::rgamma(n_samples, 59 | shape = .x$k, 60 | scale = .x$theta) 61 | ), 62 | gamma_samples_B = purrr::map(.data$gamma_params_B, 63 | ~ stats::rgamma(n_samples, 64 | shape = .x$k, 65 | scale = .x$theta) 66 | ), 67 | samples = purrr::pmap(.l = list(conv_rates = .data$dirichlet_samples, 68 | inverse_rev_A = .data$gamma_samples_A, 69 | inverse_rev_B = .data$gamma_samples_B), 70 | ~ calculate_multi_rev_per_session(...) 71 | ) 72 | ) %>% 73 | select( 74 | "option_name", 75 | "sum_sessions", 76 | "sum_conversions", 77 | "sum_conversions_2", 78 | "sum_revenue", 79 | "sum_revenue_2", 80 | "dirichlet_params", 81 | "gamma_params_A", 82 | "gamma_params_B", 83 | "samples", 84 | ) 85 | } 86 | -------------------------------------------------------------------------------- /R/sample_total_cm.R: -------------------------------------------------------------------------------- 1 | #' Sample Total CM (Given Impression Count) 2 | #' 3 | #' @description 4 | #' Adds 4 new nested columns to the input_df: `beta_params_ctr`, 5 | #' `beta_params_conv`,`gamma_params_rev`, `gamma_params_cost` 6 | #' and `samples`. 7 | #' 8 | #' @details 9 | #'`beta_params` and `gamma_params` in each row should be a tibble of length 2 10 | #' (\eqn{\alpha} and \eqn{\beta} params and \eqn{k} and \eqn{\theta} params). 11 | #'`samples` in each row should be a tibble of length `n_samples`. 12 | #' 13 | #' One assumption in this model is that sum_impressions is not stochastic. 14 | #' This assumes that Clicks are stochastically generated from a set number 15 | #' of Impressions. It does not require that the number of impressions are 16 | #' equal on either side. Generally this assumption holds true in marketing 17 | #' tests where traffic is split 50/50 and very little variance is observed 18 | #' in the number of impressions on either side. 19 | #' 20 | #' 21 | #' See update_rules vignette for a mathematical representation. 22 | #' 23 | #' \deqn{TotalCM = Impr * ExpectedCTR * (RevPerOrder * OrdersPerClick - ExpectedCPC)} 24 | #' 25 | #' 26 | #' @param input_df Dataframe containing option_name (str), 27 | #' sum_conversions (dbl), sum_revenue (dbl), and sum_clicks (dbl). 28 | #' @param priors Optional list of priors {alpha0, beta0} for Beta, 29 | #' {k0, theta0} for Gamma Inverse Revenue, and {k01, theta01} for 30 | #' Gamma Cost (uses alternate priors so they can be different from Revenue). 31 | #' Default \eqn{Beta(1,1)} and \eqn{Gamma(1, 250)} will be use otherwise. 32 | #' @param n_samples Optional integer value. Defaults to 50,000 samples. 33 | #' 34 | #' @importFrom purrr pmap map2 35 | #' @importFrom dplyr mutate select %>% 36 | #' @importFrom stats rgamma rbeta 37 | #' @importFrom rlang .data 38 | #' 39 | #' @return input_df with 5 new nested columns `beta_params_conv`, 40 | #' `beta_params_ctr`, `gamma_params_rev`,`gamma_params_cost`, 41 | #' and `samples` 42 | #' 43 | sample_total_cm <- function(input_df, priors, n_samples = 5e4){ 44 | input_df %>% 45 | dplyr::mutate( 46 | beta_params_conv = purrr::map2(.x = .data$sum_conversions, 47 | .y = .data$sum_clicks, 48 | ~ update_beta(alpha = .x, 49 | beta = .y - .x, 50 | priors = priors) 51 | ), 52 | beta_params_ctr = purrr::map2(.x = .data$sum_clicks, 53 | .y = .data$sum_impressions, 54 | ~ update_beta(alpha = .x, 55 | beta = .y - .x, 56 | priors = priors) 57 | ), 58 | gamma_params_rev = purrr::map2(.x = .data$sum_conversions, 59 | .y = .data$sum_revenue, 60 | ~ update_gamma(k = .x, 61 | theta = .y, 62 | priors = priors) 63 | ), 64 | gamma_params_cost = purrr::map2(.x = .data$sum_clicks, 65 | .y = .data$sum_cost, 66 | ~ update_gamma(k = .x, 67 | theta = .y, 68 | priors = priors, 69 | alternate_priors = TRUE) 70 | ), 71 | rev_per_click_samples = purrr::map2(.x = .data$beta_params_conv, 72 | .y = .data$gamma_params_rev, 73 | ~ stats::rbeta(n_samples, 74 | shape1 = .x$alpha, 75 | shape2 = .x$beta) / 76 | stats::rgamma(n_samples, 77 | shape = .y$k, 78 | scale = .y$theta) 79 | ), 80 | cost_per_click_samples = purrr::map(.x = .data$gamma_params_cost, 81 | ~ 1 / stats::rgamma(n_samples, 82 | shape = .x$k, 83 | scale = .x$theta) 84 | ), 85 | expected_clicks_rates = purrr::map(.x = .data$beta_params_ctr, 86 | ~ stats::rbeta(n_samples, 87 | shape1 = .x$alpha, 88 | shape2 = .x$beta) 89 | ), 90 | # Expected CTR samples Times Fixed Impressions 91 | expected_clicks_samples = purrr::map2(.x = .data$expected_clicks_rates, 92 | .y = .data$sum_impressions, 93 | ~ .x * .y), 94 | samples = purrr::pmap(list(rev_per_click = .data$rev_per_click_samples, 95 | cost_per_click = .data$cost_per_click_samples, 96 | expected_clicks = .data$expected_clicks_samples), 97 | ~ calculate_total_cm(...) 98 | ) 99 | ) %>% 100 | dplyr::select( 101 | -"rev_per_click_samples", 102 | -"cost_per_click_samples", 103 | -"expected_clicks_rates", 104 | -"expected_clicks_samples", 105 | ) 106 | } 107 | 108 | 109 | -------------------------------------------------------------------------------- /vignettes/intro.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Intro Examples to grizbayr" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{start} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | \usepackage[utf8]{inputenc} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>" 14 | ) 15 | ``` 16 | 17 | ```{r setup, warning=FALSE, message=FALSE} 18 | library(grizbayr) 19 | library(dplyr) 20 | ``` 21 | 22 | ## About the Package 23 | 24 | Bayesian Inference is a method of statistical inference that can be used in the analysis of observed data from marketing tests. Bayesian updates start with a prior distribution (prior probable information about the environment) and a likelihood function (an expected distribution from which the samples are drawn). Then, given some observed data, the prior can be multiplied by the likelihood of the data to produce a posterior distribution of probabilities. At the core of all of this is Bayes' Rule. 25 | 26 | $$ P(A\ |\ Data) \sim P(Data\ |\ A) \cdot P(A)$$ 27 | This package is intended to abstract the math of the conjugate prior update rules to provide 3 pieces of information for a user: 28 | 29 | 1. Win Probability (overall and vs baseline) 30 | 1. Value Remaining 31 | 1. Lift vs. Control 32 | 33 | ## Usage 34 | 35 | Select which piece of information you would like to calculate. 36 | 37 | | Metric | Function Call | 38 | |------------------------------|-----------------------------------| 39 | | All Below Metrics | `calculate_all_metrics()` | 40 | | Win Probability | `estimate_win_prob()` | 41 | | Value Remaining | `estimate_value_remaining()` | 42 | | Lift vs. Control | `estimate_lift_vs_baseline()` | 43 | | Win Probability vs. Baseline | `estimate_win_prob_vs_baseline()` | 44 | 45 | If you would like to calculate all the metrics then use `calculate_all_metrics()`. This is a slightly more efficient implementation since it only needs to sample from the posterior once for all 4 calculations instead of once for each metric. 46 | 47 | ### Create an Input Dataframe or Tibble 48 | 49 | All of these functions require a very specific tibble format. However, the same tibble can be used in all metric calculations. A tibble is used here because it has the additional check that all column lengths are the same. A tibble of this format can also conveniently be created using dplyr's `group_by() %>% summarise()` sequence of functions. 50 | 51 | The columns in the following table are required if there is an `X` in the box for the distribution. (Int columns can also be dbl due to R coercian) 52 | 53 | | Distribution Type | option_name (char) | sum_impressions (int) | sum_clicks (int) | sum_sessions (int) | sum_conversions (dbl) | sum_revenue (dbl) | sum_cost (dbl) | sum_conversions_2 (dbl) | sum_revenue_2 (dbl) | sum_duration (dbl) | sum_page_views (int) | 54 | |---------------------------|:------------------:|:---------------------:|:----------------:|:------------------:|:---------------------:|:-----------------:|:--------------:|:-----------------------:|:-------------------:|:------------------:|:--------------------:| 55 | | Conversion Rate | X | | X | | X | | | | | | | 56 | | Response Rate | X | | | X | X | | | | | | | 57 | | Click Through Rate (CTR) | X | X | X | | | | | | | | | 58 | | Revenue Per Session | X | | | X | X | X | | | | | | 59 | | Multi Revenue Per Session | X | | | X | X | X | | X | X | | | 60 | | Cost Per Activation (CPA) | X | | X | | X | | X | | | | | 61 | | Total CM | X | X | X | | X | X | X | | | | | 62 | | CM Per Click | X | | X | | X | X | X | | | | | 63 | | Cost Per Click (CPC) | X | | X | | | | X | | | | | 64 | | Session Duration | X | | | X | | | | | | X | | 65 | | Page Views Per Session | X | | | X | | | | | | | X | 66 | 67 | #### Example: 68 | We will use the Conversion Rate distribution for this example so we need the columns **option_name**, **sum_clicks**, and **sum_conversions**. 69 | 70 | ```{r} 71 | raw_data_long_format <- tibble::tribble( 72 | ~option_name, ~clicks, ~conversions, 73 | "A", 6, 3, 74 | "A", 1, 0, 75 | "B", 2, 1, 76 | "A", 2, 0, 77 | "A", 1, 0, 78 | "B", 5, 2, 79 | "A", 1, 0, 80 | "B", 1, 1, 81 | "B", 1, 0, 82 | "A", 3, 1, 83 | "B", 1, 0, 84 | "A", 1, 1 85 | ) 86 | 87 | raw_data_long_format %>% 88 | dplyr::group_by(option_name) %>% 89 | dplyr::summarise(sum_clicks = sum(clicks), 90 | sum_conversions = sum(conversions)) 91 | ``` 92 | 93 | This input dataframe can also be created manually if the aggregations are already done in an external program. 94 | 95 | ```{r} 96 | # Since this is a stochastic process with a random number generator, 97 | # it is worth setting the seed to get consistent results. 98 | set.seed(1776) 99 | 100 | input_df <- tibble::tibble( 101 | option_name = c("A", "B", "C"), 102 | sum_clicks = c(1000, 1000, 1000), 103 | sum_conversions = c(100, 120, 110) 104 | ) 105 | input_df 106 | ``` 107 | 108 | One note: clicks or sessions must be greater than or equal to the number of conversions (this is a rate bound between 0 and 1). 109 | 110 | `input_df` is used in the following examples. 111 | 112 | ### Estimate All Metrics 113 | 114 | This function wraps all the below functions into one call. 115 | 116 | ```{r} 117 | estimate_all_values(input_df, distribution = "conversion_rate", wrt_option_lift = "A") 118 | ``` 119 | 120 | 121 | ### Win Probability 122 | 123 | This produces a tibble with all the option names, the `win_prob_raw` so this can be used as a double, and a cleaned string `win_prob` where the decimal is represented as a percent. 124 | 125 | ```{r} 126 | estimate_win_prob(input_df, distribution = "conversion_rate") 127 | ``` 128 | 129 | ### Value Remaining (Loss) 130 | 131 | Value Remaining is a measure of loss. If B is selected as the current best option, we can estimate with 95% confidence (default), that an alternative option is not more than X% worse than the current expected best option. 132 | 133 | ```{r} 134 | estimate_value_remaining(input_df, distribution = "conversion_rate") 135 | ``` 136 | 137 | This number can also be framed in absolute dollar terms (or percentage points in the case of a rate metric). 138 | 139 | ```{r} 140 | estimate_value_remaining(input_df, distribution = "conversion_rate", metric = "absolute") 141 | ``` 142 | 143 | ### Estimate Lift 144 | 145 | The `metric` argument defaults to `lift` which produces a percent lift vs the baseline. Sometimes we may want to understand this lift in absolute terms (especially when samples from the posteriors could be negative, such as Contribution Margin (CM).) 146 | 147 | ```{r} 148 | estimate_lift_vs_baseline(input_df, distribution = "conversion_rate", wrt_option = "A") 149 | ``` 150 | 151 | ```{r} 152 | estimate_lift_vs_baseline(input_df, distribution = "conversion_rate", wrt_option = "A", metric = "absolute") 153 | ``` 154 | 155 | ### Win Probability vs. Baseline 156 | 157 | This function is used to compare an individual option to the best option as opposed to the win probability of each option overall. 158 | 159 | ```{r} 160 | estimate_win_prob_vs_baseline(input_df, distribution = "conversion_rate", wrt_option = "A") 161 | ``` 162 | 163 | ### Sample From the Posterior 164 | 165 | Samples can be directly collected from the posterior with the following function. 166 | 167 | ```{r} 168 | sample_from_posterior(input_df, distribution = "conversion_rate") 169 | ``` 170 | 171 | ## Alternate Distribution Type (Rev Per Session) 172 | ```{r} 173 | (input_df_rps <- tibble::tibble( 174 | option_name = c("A", "B", "C"), 175 | sum_sessions = c(1000, 1000, 1000), 176 | sum_conversions = c(100, 120, 110), 177 | sum_revenue = c(900, 1200, 1150) 178 | )) 179 | 180 | estimate_all_values(input_df_rps, distribution = "rev_per_session", wrt_option_lift = "A") 181 | ``` 182 | 183 | 184 | 185 | ## Valid Posteriors 186 | 187 | You may want to pass alternate priors to a distribution. 188 | Only do this if you are making an informed decision. 189 | 190 | ``` 191 | Beta - alpha0, beta0 192 | Gamma - k0, theta0 (k01, theta01 if alternate Gamma priors are required) 193 | Dirichlet - alpha_00 (none), alpha_01 (first conversion type), alpha_02 (alternate conversion type) 194 | ``` 195 | 196 | ```{r} 197 | # You can also pass priors for just the Beta distribution and not the Gamma distribution. 198 | new_priors <- list(alpha0 = 2, beta0 = 10, k0 = 3, theta0 = 10000) 199 | estimate_all_values(input_df_rps, distribution = "rev_per_session", wrt_option_lift = "A", priors = new_priors) 200 | ``` 201 | 202 | ## Looping Over All Distributions 203 | 204 | You may want to evaluate the results of a test in multiple different distributions. 205 | 206 | ```{r} 207 | (input_df_all <- tibble::tibble( 208 | option_name = c("A", "B", "C"), 209 | sum_impressions = c(10000, 9000, 11000), 210 | sum_sessions = c(1000, 1000, 1000), 211 | sum_conversions = c(100, 120, 110), 212 | sum_revenue = c(900, 1200, 1150), 213 | sum_cost = c(10, 50, 30), 214 | sum_conversions_2 = c(10, 8, 20), 215 | sum_revenue_2 = c(10, 16, 15) 216 | ) %>% 217 | dplyr::mutate(sum_clicks = sum_sessions)) # Clicks are the same as Sessions 218 | 219 | distributions <- c("conversion_rate", "response_rate", "ctr", "rev_per_session", "multi_rev_per_session", "cpa", "total_cm", "cm_per_click", "cpc") 220 | 221 | # Purrr map allows us to apply a function to each element of a list. (Similar to a for loop) 222 | purrr::map(distributions, 223 | ~ estimate_all_values(input_df_all, 224 | distribution = .x, 225 | wrt_option_lift = "A", 226 | metric = "absolute") 227 | ) 228 | ``` 229 | 230 | --------------------------------------------------------------------------------