├── .DS_Store ├── .Rbuildignore ├── .github └── CODEOWNERS ├── .gitignore ├── Archive ├── .DS_Store ├── MeterRevAct.R ├── RevActUser.R ├── RevAct_Q.R ├── old_version_RevActUser.R ├── summarySE2.R └── summarySE3.R ├── DESCRIPTION ├── Data ├── AirlineRating.RData ├── SegData.RData ├── df_encirca.RData ├── df_mon.RData ├── df_phi.RData ├── links.RData ├── nodes.RData ├── sim1_da1.RData ├── sim1_da2.RData ├── sim1_da3.RData └── tweets.RData ├── NAMESPACE ├── NetlifyDS.Rproj ├── R ├── .DS_Store ├── ClusterAnalysis.R ├── MeterRevActUserProduct.R ├── NoAcctDateType.R ├── Outliers.R ├── RevAct.R ├── RevActUserProduct.R ├── RevCntAct.R ├── cv-glasso.R ├── fitglasso.R ├── impute-dat.R ├── mrrAtDateType.R ├── multiplot.R ├── pairwise_ks_test.R ├── plot.cv_glasso.R ├── plot.rocTest.R ├── plot.tune_cutoff.R ├── predict.fitglasso.R ├── rocTest.R ├── subclass_eff_est.R └── tune-cutoff.R ├── README.md └── man ├── .DS_Store ├── MeterRevActUserProduct.Rd ├── RevAct.Rd ├── RevActUserProduct.Rd ├── RevCntAct.Rd ├── clust_ana.Rd ├── cv_glasso.Rd ├── fitglasso.Rd ├── impute_dat.Rd ├── mrrAtDateType.Rd ├── multiplot.Rd ├── out_mad.Rd ├── pairwise_ks_test.Rd ├── plot.cv_glasso.Rd ├── plot.rocTest.Rd ├── plot.tune.cutoff.Rd ├── predict_glasso.Rd ├── rocTest.Rd ├── subclass_eff_est.Rd └── tune_cutoff.Rd /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/.DS_Store -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ # An Rmarkdown file used to generate README.md 4 | ^Archive$ 5 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * 
@netlify/datascience 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /Archive/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Archive/.DS_Store -------------------------------------------------------------------------------- /Archive/MeterRevAct.R: --------------------------------------------------------------------------------
#' Calculate metered revenue accounting matrix
#'
#' @description Calculates the metered revenue accounting matrix for the given
#'   dates and subscription data. It returns one row per date, i.e. all
#'   products of the requested type are summed together.
#' @param dates a date vector (must be of class Date)
#' @param datall_revenue subscription data; required columns: user_id, Product,
#'   Effective_Start, one_time_charge
#' @param type product type(s) to keep (matched against the Product column)
#' @return A data frame (revenue accounting matrix) with one row per date and
#'   columns New, Retain, Resurrected, Expansion, Contraction, Churn, dates
#'   and total_one_time_charge.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' dates <- seq(as.Date("2018-02-01"),
#'              lubridate::ceiling_date(Sys.Date(), unit = "month"),
#'              "months") - days(1)
#' metered_growth_matrix <- MeterRevAct(datall_revenue = meter_revenue,
#'                                      type = one_time_charge_product,
#'                                      dates = dates)
#' }
#' @export

MeterRevAct <- function(datall_revenue, type, dates) {

  dat <- datall_revenue %>%
    ## change this filter to get results for different products
    filter(Product %in% type) %>%
    transmute(user_id = user_id,
              one_time_charge = one_time_charge,
              Effective_Start = as.Date(Effective_Start))

  ## helpers: treat NA the same as "no revenue"
  has_rev <- function(x) !is.na(x) & x > 0
  no_rev <- function(x) is.na(x) | x <= 0

  ## total one-time charge per user for starts inside a time window
  charge_by_user <- function(window) {
    dat %>%
      filter(Effective_Start %within% window) %>%
      group_by(user_id) %>%
      summarise(one_time_charge = round(sum(one_time_charge), 2))
  }

  ## set up the matrix and fill in values
  cnam <- c("New", "Retain", "Resurrected", "Expansion", "Contraction", "Churn")
  grow_matrix <- matrix(0, nrow = length(dates), ncol = length(cnam)) %>%
    data.frame()
  names(grow_matrix) <- cnam

  for (i in seq_along(dates)) {
    ## current month: from the 1st day of the month up to dates[i]
    current <- interval(floor_date(dates[i], unit = "month"), dates[i])
    active_current <- charge_by_user(current) %>%
      select(user_id, one_time_charge_current = one_time_charge)

    ## previous calendar month
    int_start_previous <- floor_date(dates[i], unit = "month") %m-% months(1)
    int_end_previous <- ceiling_date(int_start_previous, unit = "month") - days(1)
    active_last_month <- charge_by_user(interval(int_start_previous, int_end_previous)) %>%
      select(user_id, one_time_charge_last_month = one_time_charge)

    ## everything before the previous month
    int_start_past <- dates[i] %m-% months(12 * 100)
    int_end_past <- floor_date(dates[i], unit = "month") %m-% months(1) - days(1)
    active_past <- charge_by_user(interval(int_start_past, int_end_past)) %>%
      select(user_id, one_time_charge_past = one_time_charge)

    alltable <- merge(active_current, active_last_month, all = TRUE) %>%
      merge(active_past, all = TRUE)

    ## new: revenue from users with no revenue before this month
    new <- alltable %>%
      filter(no_rev(one_time_charge_past) & no_rev(one_time_charge_last_month)) %>%
      summarise(v = sum(one_time_charge_current, na.rm = TRUE)) %>%
      pull(v)

    ## resurrected: active now, inactive last month, active some time in the past
    resurrected <- alltable %>%
      filter(has_rev(one_time_charge_current),
             no_rev(one_time_charge_last_month),
             has_rev(one_time_charge_past)) %>%
      summarise(v = sum(one_time_charge_current, na.rm = TRUE)) %>%
      pull(v)

    ## retain: for users active in both months, the smaller of the current and
    ## last month's charge (equivalent to the original retain1 + retain2 split)
    retain <- alltable %>%
      filter(has_rev(one_time_charge_current),
             has_rev(one_time_charge_last_month)) %>%
      summarise(v = sum(pmin(one_time_charge_current, one_time_charge_last_month))) %>%
      pull(v)

    ## expansion: growth for users active in both months
    expansion <- alltable %>%
      filter(has_rev(one_time_charge_current),
             has_rev(one_time_charge_last_month),
             one_time_charge_current > one_time_charge_last_month) %>%
      summarise(v = sum(one_time_charge_current - one_time_charge_last_month)) %>%
      pull(v)

    ## contraction: shrinkage for users active in both months
    contraction <- alltable %>%
      filter(has_rev(one_time_charge_current),
             has_rev(one_time_charge_last_month),
             one_time_charge_current < one_time_charge_last_month) %>%
      summarise(v = sum(one_time_charge_last_month - one_time_charge_current)) %>%
      pull(v)

    ## churn: revenue lost from users active last month but not now
    churn <- alltable %>%
      filter(no_rev(one_time_charge_current),
             has_rev(one_time_charge_last_month)) %>%
      summarise(v = sum(one_time_charge_last_month, na.rm = TRUE)) %>%
      pull(v)

    ## order must match cnam
    grow_matrix[i, ] <- c(new, retain, resurrected, expansion, contraction, churn)
  }

  grow_matrix$dates <- dates

  grow_matrix <- grow_matrix %>%
    mutate(total_one_time_charge = New + Retain + Resurrected + Expansion)

  return(grow_matrix)
}
-------------------------------------------------------------------------------- /Archive/RevActUser.R: --------------------------------------------------------------------------------
#' Calculate revenue accounting matrix for each day and user
#'
#' @description Calculates the revenue accounting matrix given dates and
#'   subscription data, broken out by user.
#' @param dates a date vector (must be of class Date)
#' @param datall_revenue subscription data; required columns: user_id,
#'   Effective_Start, Effective_End, MRR
#' @return A data frame with one row per user and date and the columns new,
#'   resurrected, retain, expansion, contraction and churn; rows that are
#'   zero in every category are dropped.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' res <- RevActUser(total_revenue, dates)
#' }
#' @export

RevActUser <- function(datall_revenue, dates) {

  ## fixed: the previous version read the global `total_revenue` instead of
  ## the `datall_revenue` argument
  dat <- datall_revenue %>%
    transmute(user_id = user_id,
              MRR = MRR,
              Effective_Start = as.Date(Effective_Start),
              Effective_End = as.Date(Effective_End))

  ## helpers: treat NA the same as "no revenue"
  has_rev <- function(x) !is.na(x) & x > 0
  no_rev <- function(x) is.na(x) | x <= 0

  ## MRR per user from subscriptions active as of `as_of`
  mrr_as_of <- function(as_of) {
    dat %>%
      filter(Effective_Start <= as_of) %>%
      filter(is.na(Effective_End) | Effective_End > as_of) %>%
      group_by(user_id) %>%
      summarise(MRR = round(sum(MRR), 2))
  }

  res_test <- do.call("rbind", lapply(dates, function(time_point) {

    ## active subscriptions as of time_point
    active_current <- mrr_as_of(time_point) %>%
      select(user_id, MRR_current = MRR)

    ## active subscriptions as of the end of the previous month
    prev_start <- floor_date(time_point, unit = "month") %m-% months(1)
    prev_end <- ceiling_date(prev_start, unit = "month") - days(1)
    active_last_month <- mrr_as_of(prev_end) %>%
      select(user_id, MRR_last_month = MRR)

    ## active subscriptions as of the end of the month before the previous one
    past_end <- floor_date(time_point, unit = "month") %m-% months(1) - days(1)
    active_past <- mrr_as_of(past_end) %>%
      select(user_id, MRR_past = MRR)

    alltable <- merge(active_current, active_last_month, all = TRUE) %>%
      merge(active_past, all = TRUE)

    ########################## Break out the revenue ###########################

    ## new: revenue from users with no revenue before this month
    new <- alltable %>%
      filter(no_rev(MRR_past) & no_rev(MRR_last_month)) %>%
      select(user_id, new = MRR_current)

    ## resurrected: active now, inactive last month, active in the past
    resurrected <- alltable %>%
      filter(has_rev(MRR_current), no_rev(MRR_last_month), has_rev(MRR_past)) %>%
      select(user_id, resurrected = MRR_current)

    ## retain is the smaller of current and last month's MRR, assembled from
    ## the two cases (current >= last month / current < last month)
    retain1 <- alltable %>%
      filter(has_rev(MRR_current), has_rev(MRR_last_month)) %>%
      filter(MRR_current >= MRR_last_month) %>%
      select(user_id, retain1 = MRR_last_month)

    retain2 <- alltable %>%
      filter(has_rev(MRR_current), has_rev(MRR_last_month)) %>%
      filter(MRR_current < MRR_last_month) %>%
      select(user_id, retain2 = MRR_current)

    retain <- merge(retain1, retain2, all = TRUE) %>%
      impute0() %>%
      transmute(user_id = user_id, retain = retain1 + retain2)

    ## expansion: growth for users active in both months
    expansion <- alltable %>%
      filter(has_rev(MRR_current), has_rev(MRR_last_month)) %>%
      filter(MRR_current > MRR_last_month) %>%
      impute0() %>%
      transmute(user_id = user_id, expansion = MRR_current - MRR_last_month)

    ## contraction: shrinkage for users active in both months
    contraction <- alltable %>%
      filter(has_rev(MRR_current), has_rev(MRR_last_month)) %>%
      filter(MRR_current < MRR_last_month) %>%
      impute0() %>%
      transmute(user_id = user_id, contraction = MRR_last_month - MRR_current)

    ## churn: revenue lost from users active last month but not now
    churn <- alltable %>%
      filter(no_rev(MRR_current), has_rev(MRR_last_month)) %>%
      transmute(user_id = user_id, churn = MRR_last_month)

    res0 <- merge(new, resurrected, all = TRUE)
    res0 <- merge(res0, retain, all = TRUE)
    res0 <- merge(res0, expansion, all = TRUE)
    res0 <- merge(res0, contraction, all = TRUE)
    res0 <- merge(res0, churn, all = TRUE)
    res0 <- impute0(res0)

    res0$date <- time_point
    ## drop rows that are zero in every category
    res0 <- res0 %>%
      filter(!(new == 0 & resurrected == 0 & retain == 0 &
                 expansion == 0 & contraction == 0 & churn == 0))

    return(res0)
  }))

  return(res_test)
}
-------------------------------------------------------------------------------- /Archive/RevAct_Q.R: --------------------------------------------------------------------------------
#' Calculate revenue accounting matrix for each quarter
#'
#' @description Calculates the accounting matrix given dates, subscription
#'   data, and product type. It returns one row per quarter-end date, i.e. all
#'   products of the requested type are summed together.
#' @param dates a date vector of quarter-end dates (must be of class Date)
#' @param datall_revenue subscription data; required columns: user_id, Product,
#'   Effective_Start, Effective_End, MRR
#' @param type product type(s) to keep (matched against the Product column)
#' @return A data frame with one row per date and columns New, Retain,
#'   Resurrected, Expansion, Contraction, Churn, dates, MRR, ARR and
#'   QuickRatio.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @export

RevAct_Q <- function(datall_revenue, type, dates) {

  dat <- datall_revenue %>%
    ## change this filter to get results for different products
    filter(Product %in% type) %>%
    transmute(user_id = user_id,
              MRR = MRR,
              Effective_Start = as.Date(Effective_Start),
              Effective_End = as.Date(Effective_End))

  ## helpers: treat NA the same as "no revenue"
  has_rev <- function(x) !is.na(x) & x > 0
  no_rev <- function(x) is.na(x) | x <= 0

  ## MRR per user from subscriptions active as of `as_of`
  mrr_as_of <- function(as_of) {
    dat %>%
      filter(Effective_Start <= as_of) %>%
      filter(is.na(Effective_End) | Effective_End > as_of) %>%
      group_by(user_id) %>%
      summarise(MRR = round(sum(MRR), 2))
  }

  cnam <- c("New", "Retain", "Resurrected", "Expansion", "Contraction", "Churn")
  grow_matrix <- matrix(0, nrow = length(dates), ncol = length(cnam)) %>%
    data.frame()
  names(grow_matrix) <- cnam

  for (i in seq_along(dates)) {
    ## current quarter: MRR as of the quarter-end date itself
    active_current <- mrr_as_of(dates[i]) %>%
      select(user_id, MRR_current = MRR)

    ## previous quarter: MRR as of the day before the current quarter starts
    prev_end <- floor_date(dates[i], unit = "month") %m-% months(2) - days(1)
    active_last_month <- mrr_as_of(prev_end) %>%
      select(user_id, MRR_last_month = MRR)

    ## past: MRR as of six months before the quarter-end date
    ## NOTE(review): this can overlap the previous quarter by a few days when
    ## the current quarter does not start exactly six months before dates[i];
    ## kept as in the original — confirm intent before changing.
    past_end <- dates[i] %m-% months(6)
    active_past <- mrr_as_of(past_end) %>%
      select(user_id, MRR_past = MRR)

    alltable <- merge(active_current, active_last_month, all = TRUE) %>%
      merge(active_past, all = TRUE)

    ## new: revenue from users with no revenue in earlier quarters
    new <- alltable %>%
      filter(no_rev(MRR_past) & no_rev(MRR_last_month)) %>%
      summarise(v = sum(MRR_current, na.rm = TRUE)) %>%
      pull(v)

    ## resurrected: active now, inactive last quarter, active in the past
    resurrected <- alltable %>%
      filter(has_rev(MRR_current), no_rev(MRR_last_month), has_rev(MRR_past)) %>%
      summarise(v = sum(MRR_current, na.rm = TRUE)) %>%
      pull(v)

    ## retain: for users active in both quarters, the smaller of current and
    ## previous quarter MRR (equivalent to the original retain1 + retain2)
    retain <- alltable %>%
      filter(has_rev(MRR_current), has_rev(MRR_last_month)) %>%
      summarise(v = sum(pmin(MRR_current, MRR_last_month))) %>%
      pull(v)

    ## expansion: growth for users active in both quarters
    expansion <- alltable %>%
      filter(has_rev(MRR_current), has_rev(MRR_last_month),
             MRR_current > MRR_last_month) %>%
      summarise(v = sum(MRR_current - MRR_last_month)) %>%
      pull(v)

    ## contraction: shrinkage for users active in both quarters
    contraction <- alltable %>%
      filter(has_rev(MRR_current), has_rev(MRR_last_month),
             MRR_current < MRR_last_month) %>%
      summarise(v = sum(MRR_last_month - MRR_current)) %>%
      pull(v)

    ## churn: revenue lost from users active last quarter but not now
    churn <- alltable %>%
      filter(no_rev(MRR_current), has_rev(MRR_last_month)) %>%
      summarise(v = sum(MRR_last_month, na.rm = TRUE)) %>%
      pull(v)

    ## order must match cnam
    grow_matrix[i, ] <- c(new, retain, resurrected, expansion, contraction, churn)
  }

  grow_matrix$dates <- dates

  grow_matrix <- grow_matrix %>%
    mutate(MRR = New + Retain + Resurrected + Expansion,
           ARR = MRR * 12) %>%
    mutate(QuickRatio = round((New + Resurrected + Expansion) /
                                (Churn + Contraction), 2))

  return(grow_matrix)
}
-------------------------------------------------------------------------------- /Archive/old_version_RevActUser.R: --------------------------------------------------------------------------------
## This is the old version of RevActUser, kept in the archive for reference;
## the newer implementation lives in Archive/RevActUser.R.
## It builds the revenue accounting matrix at the user level.
## Columns required: user_id, MRR, Effective_Start, Effective_End
## Behavioral differences from the newer version: subscriptions ending exactly
## on the reference date still count as active (Effective_End >= date), the
## NA-only category splits ignore zero/negative MRR, MRR sums are not rounded,
## and all-zero rows are not dropped.

RevActUser <- function(datall_revenue, dates){

  ## fixed: this used to read the global `total_revenue` instead of the argument
  dat <- datall_revenue %>%
    transmute(user_id = user_id,
              MRR = MRR,
              Effective_Start = as.Date(Effective_Start),
              Effective_End = as.Date(Effective_End))

  res <- NULL
  for (i in seq_along(dates)) {
    ## active subscriptions as of dates[i]
    active_current <- dat %>%
      filter(Effective_Start <= dates[i]) %>%
      filter(is.na(Effective_End) | Effective_End >= dates[i]) %>%
      group_by(user_id) %>%
      summarise(MRR = sum(MRR)) %>%
      select(user_id, MRR_current = MRR)

    ## active subscriptions as of the end of the previous month
    int_start <- floor_date(dates[i], unit = "month") %m-% months(1)
    int_end <- ceiling_date(int_start, unit = "month") - days(1)
    active_last_month <- dat %>%
      filter(Effective_Start <= int_end) %>%
      filter(is.na(Effective_End) | Effective_End >= int_end) %>%
      group_by(user_id) %>%
      summarise(MRR = sum(MRR)) %>%
      select(user_id, MRR_last_month = MRR)

    ## active subscriptions as of the end of the month before the previous one
    int_end <- floor_date(dates[i], unit = "month") %m-% months(1) - days(1)
    active_past <- dat %>%
      filter(Effective_Start <= int_end) %>%
      filter(is.na(Effective_End) | Effective_End >= int_end) %>%
      group_by(user_id) %>%
      summarise(MRR = sum(MRR)) %>%
      select(user_id, MRR_past = MRR)

    alltable <- merge(active_current, active_last_month, all = TRUE) %>%
      merge(active_past, all = TRUE)

    ## note: these splits test only for NA, not for zero or negative MRR
    new <- alltable %>%
      filter(is.na(MRR_past) & is.na(MRR_last_month)) %>%
      select(user_id, new = MRR_current)

    resurrected <- alltable %>%
      filter(!is.na(MRR_current)) %>%
      filter(is.na(MRR_last_month)) %>%
      filter(!is.na(MRR_past)) %>%
      select(user_id, resurrected = MRR_current)

    ## retain is the smaller of current and last month's MRR, assembled from
    ## the two cases
    retain1 <- alltable %>%
      filter(!is.na(MRR_current)) %>%
      filter(!is.na(MRR_last_month)) %>%
      filter(MRR_current >= MRR_last_month) %>%
      select(user_id, retain1 = MRR_last_month)

    retain2 <- alltable %>%
      filter(!is.na(MRR_current)) %>%
      filter(!is.na(MRR_last_month)) %>%
      filter(MRR_current < MRR_last_month) %>%
      select(user_id, retain2 = MRR_current)

    retain <- merge(retain1, retain2, all = TRUE) %>%
      impute0() %>%
      transmute(user_id = user_id, retain = retain1 + retain2)

    expansion <- alltable %>%
      filter(!is.na(MRR_current)) %>%
      filter(!is.na(MRR_last_month)) %>%
      filter(MRR_current > MRR_last_month) %>%
      impute0() %>%
      transmute(user_id = user_id, expansion = MRR_current - MRR_last_month)

    contraction <- alltable %>%
      filter(!is.na(MRR_current)) %>%
      filter(!is.na(MRR_last_month)) %>%
      filter(MRR_current < MRR_last_month) %>%
      impute0() %>%
      transmute(user_id = user_id, contraction = MRR_last_month - MRR_current)

    churn <- alltable %>%
      filter(is.na(MRR_current)) %>%
      filter(!is.na(MRR_last_month)) %>%
      transmute(user_id = user_id, churn = MRR_last_month)

    res0 <- merge(new, resurrected, all = TRUE)
    res0 <- merge(res0, retain, all = TRUE)
    res0 <- merge(res0, expansion, all = TRUE)
    res0 <- merge(res0, contraction, all = TRUE)
    res0 <- merge(res0, churn, all = TRUE)
    res0 <- impute0(res0)

    res0$date <- dates[i]

    ## growing with rbind() is quadratic but acceptable for this archived version
    res <- rbind(res, res0)
  }

  return(res)
}
-------------------------------------------------------------------------------- /Archive/summarySE2.R: -------------------------------------------------------------------------------- 1 | ## Summarizes data. 2 | ## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%). 3 | ## data: a data frame.
## measurevar: the name of the column that contains the variable to be summarized
## groupvars: a vector of names of columns that contain grouping variables
## na.rm: a boolean that indicates whether to ignore NA's
## conf.interval: the percent range of the confidence interval (default is 95%)
## pce_less_than_zero: kept for backward compatibility; both branches of the
##   original computed the same statistics, so the flag has no effect here
summarySE2 <- function(data = NULL, measurevar, groupvars = NULL, na.rm = FALSE,
                       conf.interval = .95, .drop = TRUE,
                       pce_less_than_zero = FALSE) {

  require(plyr)

  ## length() that can handle NA's: if na.rm is TRUE, don't count them
  length2 <- function(x, na.rm = FALSE) {
    if (na.rm) sum(!is.na(x)) else length(x)
  }

  ## empirical quantile via sorting (kept from the original implementation)
  qx <- function(x, q) {
    xs <- sort(na.omit(x))
    xs[round(length(xs) * q, 0)]
  }

  ## Summary statistics for one group. The original selected between two
  ## functions with an `if` wrapped around `.fun=` inside the ddply() call,
  ## which is not valid argument syntax; the two branches also computed
  ## identical statistics, so a single function is used here.
  stat_fun <- function(xx, col, na.rm) {
    c(N = length2(xx[, col], na.rm = na.rm),
      mean = mean(xx[, col], na.rm = na.rm),
      pct = mean(xx[, col] > 0, na.rm = na.rm),
      sd = sd(xx[, col], na.rm = na.rm))
  }

  datac <- ddply(data, groupvars, .drop = .drop, .fun = stat_fun,
                 measurevar, na.rm)

  ## Rename the "mean" column to the measured variable's name
  datac <- rename(datac, c("mean" = measurevar))
  datac$se <- datac$sd / sqrt(datac$N)  # standard error of the mean

  ## Confidence interval multiplier for the standard error:
  ## e.g. for conf.interval = .95 use the .975 quantile of t with df = N - 1
  ciMult <- qt(conf.interval / 2 + .5, datac$N - 1)
  datac$ci <- datac$se * ciMult
  return(datac)
}
-------------------------------------------------------------------------------- /Archive/summarySE3.R: --------------------------------------------------------------------------------
## Summarizes data.
## Gives count, mean, standard deviation, quantiles, standard error of the
## mean, and confidence interval (default 95%).
## data: a data frame
## measurevar: the name of the column that contains the variable to be summarized
## groupvars: a vector of names of columns that contain grouping variables
## na.rm: a boolean that indicates whether to ignore NA's
## conf.interval: the percent range of the confidence interval (default is 95%)
## pce_less_than_zero: if TRUE, also report pce_down, the proportion of values
##   below zero, and order the quantile columns differently
summarySE3 <- function(data = NULL, measurevar, groupvars = NULL, na.rm = FALSE,
                       conf.interval = .95, .drop = TRUE,
                       pce_less_than_zero = FALSE) {
  require(plyr)

  ## length() that can handle NA's: if na.rm is TRUE, don't count them
  length2 <- function(x, na.rm = FALSE) {
    if (na.rm) sum(!is.na(x)) else length(x)
  }

  ## empirical quantile via sorting (kept from the original implementation)
  qx <- function(x, q) {
    xs <- sort(na.omit(x))
    xs[round(length(xs) * q, 0)]
  }

  ## Choose the per-group summary up front. The original selected between the
  ## two functions with an `if` wrapped around `.fun=` inside the ddply() call
  ## itself, which is not valid argument syntax.
  if (pce_less_than_zero) {
    stat_fun <- function(xx, col, na.rm) {
      c(N = length2(xx[, col], na.rm = na.rm),
        mean = mean(xx[, col], na.rm = na.rm),
        sd = sd(xx[, col], na.rm = na.rm),
        q25 = qx(xx[, col], 0.25),
        q75 = qx(xx[, col], 0.75),
        q5 = qx(xx[, col], 0.05),
        q95 = qx(xx[, col], 0.95),
        q15 = qx(xx[, col], 0.15),
        q85 = qx(xx[, col], 0.85),
        pce_down = mean(xx[, col] < 0))
    }
  } else {
    stat_fun <- function(xx, col, na.rm) {
      c(N = length2(xx[, col], na.rm = na.rm),
        mean = mean(xx[, col], na.rm = na.rm),
        sd = sd(xx[, col], na.rm = na.rm),
        q5 = qx(xx[, col], 0.05),
        q15 = qx(xx[, col], 0.15),
        q25 = qx(xx[, col], 0.25),
        q75 = qx(xx[, col], 0.75),
        q85 = qx(xx[, col], 0.85),
        q95 = qx(xx[, col], 0.95))
    }
  }

  datac <- ddply(data, groupvars, .drop = .drop, .fun = stat_fun,
                 measurevar, na.rm)

  ## Rename the "mean" column to the measured variable's name
  datac <- rename(datac, c("mean" = measurevar))
  datac$se <- datac$sd / sqrt(datac$N)  # standard error of the mean

  ## Confidence interval multiplier for the standard error:
  ## e.g. for conf.interval = .95 use the .975 quantile of t with df = N - 1
  ciMult <- qt(conf.interval / 2 + .5, datac$N - 1)
  datac$ci <- datac$se * ciMult
  return(datac)
}
-------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: NetlifyDS 2 | Title: Netlify Data Science Using R 3 | Version: 0.0.0.9000 4 | Authors@R: person("Hui", "Lin", email = "longqiman@gmail.com", role = c("aut", "cre")) 5 | Description: The `NetlifyDS` package contains functions to streamline Netlify's data science project. It covers data pipeline, analytics, and modeling.
6 | Depends: R (>= 3.4.0) 7 | License: MIT 8 | Suggests: 9 | stringr, 10 | dplyr, 11 | tidyr, 12 | tibble (>= 1.4.2), 13 | magrittr (>= 1.5) 14 | Encoding: UTF-8 15 | LazyData: true 16 | RoxygenNote: 6.1.1 17 | VignetteBuilder: knitr 18 | -------------------------------------------------------------------------------- /Data/AirlineRating.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/AirlineRating.RData -------------------------------------------------------------------------------- /Data/SegData.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/SegData.RData -------------------------------------------------------------------------------- /Data/df_encirca.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/df_encirca.RData -------------------------------------------------------------------------------- /Data/df_mon.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/df_mon.RData -------------------------------------------------------------------------------- /Data/df_phi.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/df_phi.RData -------------------------------------------------------------------------------- /Data/links.RData: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/links.RData -------------------------------------------------------------------------------- /Data/nodes.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/nodes.RData -------------------------------------------------------------------------------- /Data/sim1_da1.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/sim1_da1.RData -------------------------------------------------------------------------------- /Data/sim1_da2.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/sim1_da2.RData -------------------------------------------------------------------------------- /Data/sim1_da3.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/sim1_da3.RData -------------------------------------------------------------------------------- /Data/tweets.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/tweets.RData -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(plot,cv_glasso) 4 | S3method(plot,rocTest) 5 | S3method(plot,tune.cutoff) 6 | export(MeterRevActUserProduct) 7 | export(RevAct) 8 | export(RevActUserProduct) 9 | 
export(RevCntAct) 10 | export(clust_ana) 11 | export(cv_glasso) 12 | export(fitglasso) 13 | export(impute_dat) 14 | export(mrrAtDateType) 15 | export(multiplot) 16 | export(out_mad) 17 | export(pairwise_ks_test) 18 | export(predict_glasso) 19 | export(rocTest) 20 | export(subclass_eff_est) 21 | export(tune_cutoff) 22 | -------------------------------------------------------------------------------- /NetlifyDS.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /R/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/R/.DS_Store -------------------------------------------------------------------------------- /R/ClusterAnalysis.R: -------------------------------------------------------------------------------- 1 | #' Cluster analysis 2 | #' @description Does cluster analysis and report an index of cluster 3 | #' @param ans data matrix 4 | #' @param k number of clusters 5 | #' @param distm Dissimilarity index, partial match to "\code{manhattan}", "\code{euclidean}"(default), "\code{canberra}", "\code{bray}", "\code{kulczynski}", "\code{jaccard}", "\code{gower}", "\code{altGower}", "\code{morisita}", "\code{horn}", "\code{mountford}", "\code{raup}" , "\code{binomial}", "\code{chao}", "\code{cao}" or "\code{mahalanobis}". 
#' @param hclustm the agglomeration method to be used. This should be (an unambiguous abbreviation of) one of "\code{ward.D}"(default), "\code{ward.D2}", "\code{single}", "\code{complete}", "\code{average}" (= UPGMA), "\code{mcquitty}" (= WPGMA), "\code{median}" (= WPGMC) or "\code{centroid}" (= UPGMC).
#' @param na.rm Pairwise deletion of missing observations when computing dissimilarities.
#' @param main title for the dendrogram plot. The default keeps the historical title.
#' @return A vector of cluster memberships from \code{cutree}; a dendrogram is plotted as a side effect.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' data("SegData")
#' library(dplyr)
#' library(vegan)
#' cind <- clust_ana(SegData, k = 4)
#' }

#' @export
clust_ana <- function(ans, k, distm = "euclidean", hclustm = "ward.D",
                      na.rm = FALSE,
                      main = "Cluster Dendrogram of Segment Survey") {
  # vegan is only needed for the dissimilarity matrix; fail with a clear
  # message instead of attaching the whole package via library().
  if (!requireNamespace("vegan", quietly = TRUE)) {
    stop("Package 'vegan' is required for clust_ana().", call. = FALSE)
  }
  # Dissimilarity matrix -> hierarchical clustering -> cut into k groups.
  dist_t <- vegan::vegdist(ans, method = distm, na.rm = na.rm)
  clust <- hclust(dist_t, method = hclustm)
  plot(clust, main = main)
  return(cutree(clust, k = k))
}

--------------------------------------------------------------------------------
/R/MeterRevActUserProduct.R:
--------------------------------------------------------------------------------
#' Calculate metered revenue accounting matrix per day, user and product
#' @description It calculates the accounting matrix for each user and product given date, subscription data and product category.
#' @param dates a date vector (have to be date type)
#' @param datall_revenue subscription data, need to have columns: user_id, Product, Effective_Start, one_time_charge
#' @param type a vector of product names
#' @return A metered revenue accounting matrix is returned for each day, user and product
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @export

MeterRevActUserProduct <- function(datall_revenue, dates, type){

  # Keep only the requested products and the columns the accounting needs.
  dat <- datall_revenue %>%
    ## change this filter to get result for different products
    filter(Product %in% type)%>%
    transmute(user_id=user_id,
              one_time_charge = one_time_charge,
              Effective_Start = as.Date(Effective_Start))

  # function to impute 0 to missing value (all columns except user_id)
  impute0 = function(dat) {
    # ROBUSTNESS FIX: seq_len(ncol(dat))[-1] is empty for a 1-column frame,
    # unlike 2:ncol(dat) which would wrongly iterate over c(2, 1).
    for (i in seq_len(ncol(dat))[-1]){
      idx = which(is.na(dat[,i]))

      if(length(idx)>0){
        dat[idx,i] = 0
      }
    }
    return(dat)
  }

  res = NULL
  for (i in 1:length(dates)){
    # current month is from the 1st day of the month to the reference date
    int_start = floor_date(dates[i], unit = "month")
    int_end = dates[i]
    current = interval(int_start, int_end) # current month

    # charges whose Effective_Start falls in the current month, per user
    active_current <- dat %>%
      filter(Effective_Start %within% current) %>%
      group_by(user_id) %>%
      summarise(one_time_charge = round(sum(one_time_charge), 2)) %>%
      select(user_id, one_time_charge_current = one_time_charge)

    # previous month
    int_start = floor_date(dates[i], unit = "month") %m-% months(1)
    int_end = ceiling_date(int_start, unit = "month") - days(1)
    previous = interval(int_start, int_end)

    active_last_month <- dat %>%
      filter(Effective_Start %within% previous) %>%
      group_by(user_id) %>%
      summarise(one_time_charge = round(sum(one_time_charge), 2)) %>%
      select(user_id, one_time_charge_last_month = one_time_charge)

    # all before until previous month
    # NOTE(review): 1200 months back is used as "the beginning of time" --
    # effectively all history before the previous month.
    int_start = dates[i] %m-% months(12*100)
    int_end = floor_date(dates[i], unit = "month") %m-% months(1)
    int_end = int_end - days(1)
    past = interval(int_start, int_end)

    active_past <- dat %>%
      filter(Effective_Start %within% past) %>%
      group_by(user_id) %>%
      summarise(one_time_charge = round(sum(one_time_charge), 2)) %>%
      select(user_id, one_time_charge_past = one_time_charge)

    # outer-join the three periods per user; absent periods become NA
    alltable <- merge(active_current, active_last_month, all = T) %>%
      merge(active_past, all=T)

    ########################## Break out the revenue ###########################

    # new: charged this month, never charged before
    new <- alltable %>%
      filter( (is.na(one_time_charge_past) | one_time_charge_past <= 0) & (is.na(one_time_charge_last_month) | one_time_charge_last_month <= 0 ) ) %>%
      select(user_id, new = one_time_charge_current )

    # resurrected: charged this month, nothing last month, but charged in the past
    resurrected <- alltable %>%
      filter( (!is.na(one_time_charge_current)) & one_time_charge_current > 0 )%>%
      filter( is.na(one_time_charge_last_month) | one_time_charge_last_month <= 0) %>%
      filter( (!is.na(one_time_charge_past)) & one_time_charge_past > 0 ) %>%
      select(user_id,resurrected = one_time_charge_current)

    # retain is the overlap: the smaller of current and last-month charge.
    # an alternative way is to add up the smaller number from MRR_current and MRR_last_month
    retain1 <- alltable %>%
      filter( (!is.na(one_time_charge_current)) & one_time_charge_current > 0 ) %>%
      filter( (!is.na(one_time_charge_last_month)) & one_time_charge_last_month > 0 )%>%
      filter(one_time_charge_current >= one_time_charge_last_month) %>%
      select(user_id,retain1 = one_time_charge_last_month)

    retain2 <- alltable %>%
      filter( (!is.na(one_time_charge_current)) & one_time_charge_current > 0 ) %>%
      filter( (!is.na(one_time_charge_last_month)) & one_time_charge_last_month > 0 )%>%
      filter(one_time_charge_current < one_time_charge_last_month) %>%
      select(user_id,retain2 = one_time_charge_current)

    retain = merge(retain1, retain2, all=T) %>%
      impute0()%>%
      transmute(user_id = user_id, retain = retain1 + retain2)

    # expansion: positive month-over-month increase for users active both months
    expansion <- alltable %>%
      filter( (!is.na(one_time_charge_current)) & one_time_charge_current > 0 ) %>%
      filter( (!is.na(one_time_charge_last_month)) & one_time_charge_last_month > 0 )%>%
      filter(one_time_charge_current > one_time_charge_last_month) %>%
      impute0()%>%
      transmute(user_id = user_id, expansion = one_time_charge_current-one_time_charge_last_month)

    # contraction: month-over-month decrease for users active both months
    contraction <- alltable %>%
      filter( (!is.na(one_time_charge_current)) & one_time_charge_current > 0 )%>%
      filter( (!is.na(one_time_charge_last_month)) & one_time_charge_last_month > 0 ) %>%
      impute0()%>%
      filter(one_time_charge_current < one_time_charge_last_month) %>%
      transmute(user_id = user_id, contraction =one_time_charge_last_month - one_time_charge_current)

    # churn: charged last month, nothing this month
    churn <- alltable %>%
      filter( is.na(one_time_charge_current) | one_time_charge_current <= 0 ) %>%
      filter( (!is.na(one_time_charge_last_month)) & one_time_charge_last_month > 0 ) %>%
      transmute(user_id = user_id, churn =one_time_charge_last_month)

    # outer-join the six components per user and fill the gaps with 0
    res0 = merge(new, resurrected, all = T)
    res0 = merge(res0, retain, all = T)
    res0 = merge(res0, expansion, all = T)
    res0 = merge(res0, contraction, all = T)
    res0 = merge(res0, churn, all = T)
    res0 = impute0(res0)

    res0$date = dates[i]

    res = rbind(res, res0)
  }
  # PERF FIX: drop all-zero rows once, after the loop. The original ran this
  # filter on the cumulative result inside the loop, re-filtering every
  # previously processed date on each iteration (same output, quadratic work).
  res = res %>% filter( !(new == 0 & resurrected == 0 & retain == 0 & expansion == 0 & contraction == 0 & churn == 0) )
  return(res)
}

--------------------------------------------------------------------------------
/R/NoAcctDateType.R:
--------------------------------------------------------------------------------
## function to calculate number of accounts
NoAcctDateType <- function(time_point, subs, type) {
  time_point <- as.Date(time_point)
  # time_point: time of mrr
  # subs: subscription data
  # Accounts active at time_point: started on/before it, not yet ended, MRR > 0.
  actives <- subs %>%
    filter(Product %in% type) %>%
    filter(Effective_Start <= time_point) %>%
    filter( is.na(Effective_End) | Effective_End > time_point) %>%
    # only keep those account with positive mrr
    filter(MRR > 0)
  # return the number of accounts
  return(length(unique(actives$account_id)))
}
# --------------------------------------------------------------------------------
# /R/Outliers.R:
# --------------------------------------------------------------------------------
#' Identify outliers using MAD
#'
#' @param x a vector
#' @return a vector of indices of the identified outliers (may be empty)
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' x<-c(seq(1:1000),20000)
#' out_mad(x)
#' }

#' @export
########################outliers
out_mad<-function(x){
  # Deviation of each point from the sample median (NAs dropped for the median).
  v2<-x-median(na.omit(x))
  # Median absolute deviation (unscaled).
  mad<-median(na.omit(abs(v2)))
  # Modified z-score rule (Iglewicz & Hoaglin): flag |0.6745*(x - median)/MAD| > 3.5.
  # BUG FIX: the original used x - median(v2); median(v2) is 0 by construction,
  # so it scored the raw values x rather than their deviations from the median,
  # and without abs() it could never flag negative outliers.
  idx<-which(abs(0.6745*v2/mad)>3.5)
  return(idx)
}
# --------------------------------------------------------------------------------
# /R/RevAct.R:
# --------------------------------------------------------------------------------
#' Calculate revenue accounting matrix
#' @description It calculates the accounting matrix given date and daily revenue data. It only returns one row for each date, i.e. it will sum up all the products.
#' @param dates a date vector (have to be date type)
#' @param datall_revenue daily revenue data, need to have columns: user_id, Product, active_date, MRR
#' @param type product type
#' @return A revenue accounting matrix is returned for each date.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' dates <- seq(as.Date("2018-02-01"), lubridate::ceiling_date(Sys.Date(), unit = "month"), "months") - days(1)
#' dates
#' # need to get daily revenue from subscription data if you don't have that already
#' base_monthly = do.call("rbind", lapply(dates, function(time_point)
#' {NetlifyDS::mrrAtDateType(time_point, subs = subscription_data, type = NULL)}))
#' growth_matrix_identity = RevAct(datall_revenue = base_monthly, type = c("Identity"), dates = dates)
#' }
#' @export


RevAct <- function(datall_revenue, type, dates){


  # Per-user daily MRR for the requested products, rounded to cents.
  dat <- datall_revenue %>%
    ungroup()%>%
    ## change this filter to get result for different products
    filter(Product %in% type) %>%
    transmute(user_id = user_id,
              MRR= MRR,
              active_date = as.Date(active_date)) %>%
    group_by(user_id, active_date) %>%
    summarise(MRR = round(sum(MRR), 2)) %>%
    ungroup()

  # One row per date, one column per accounting component.
  cnam <- c("New", "Retain", "Resurrected","Expansion", "Contraction", "Churn")
  grow_matrix <- matrix(0,nrow=length(dates), ncol= length(cnam)) %>%
    data.frame()
  names(grow_matrix) <- cnam

  for (i in 1:length(dates)){
    # the current month
    cmon = dates[i]

    # the previous month
    pmon = floor_date(dates[i], unit = "month")- days(1)

    # the legacy month
    # NOTE(review): "legacy" is built only from the supplied `dates` vector,
    # so any month not passed in is invisible to resurrection logic --
    # confirm callers pass a complete monthly sequence.
    lmon = dates[dates < pmon]

    # current mrr
    cmrr = dat %>%
      filter(active_date == cmon) %>%
      select(user_id, cmrr = MRR) %>%
      filter(cmrr > 0)

    # previous mrr
    pmrr = dat %>%
      filter(active_date == pmon) %>%
      select(user_id, pmrr = MRR) %>%
      filter(pmrr > 0)

    # legacy mrr, get the maximum number
    lmrr = dat %>%
      filter(active_date %in% lmon) %>%
      group_by(user_id) %>%
      summarise(lmrr = max(MRR)) %>%
      filter(lmrr > 0)


    # join all together (outer join per user; missing periods imputed to 0)
    alltable <- merge(cmrr, pmrr, all = T) %>%
      merge(lmrr, all=T) %>%
      NetlifyDS::impute_dat(method = "zero")

    # Get new: active now, never active before
    new = alltable %>%
      filter( cmrr > 0 & pmrr == 0 & lmrr == 0) %>%
      summarise(MRR = sum(cmrr, na.rm = T))

    # Get resurrected: active now, inactive last month, active in the past
    resurrected <- alltable %>%
      filter(cmrr > 0 & pmrr == 0 & lmrr > 0) %>%
      summarise(MRR = sum(cmrr, na.rm = T))

    # Get Retain: the overlapping revenue -- per user, the smaller of
    # current and previous MRR, summed over the two cases below.
    retain1 <- alltable %>%
      filter(cmrr > 0 & pmrr > 0) %>%
      filter(cmrr >= pmrr) %>%
      summarise(MRR = sum(pmrr, na.rm = T))

    retain2 <- alltable %>%
      filter(cmrr > 0 & pmrr > 0) %>%
      filter(cmrr < pmrr) %>%
      summarise(MRR = sum(cmrr, na.rm = T))

    retain = retain1 + retain2

    # Get expansion: month-over-month increase for users active both months
    expansion <- alltable %>%
      filter(cmrr > 0 & pmrr >0) %>%
      filter(cmrr > pmrr) %>%
      summarise(MRR = sum(cmrr, na.rm = T) - sum(pmrr, na.rm = T) )

    # Get contraction: month-over-month decrease for users active both months
    contraction <- alltable %>%
      filter(cmrr > 0 & pmrr >0) %>%
      filter(cmrr < pmrr) %>%
      summarise(MRR = sum(pmrr, na.rm = T)-sum(cmrr, na.rm = T))

    # Get churn: active last month, inactive now
    churn <- alltable %>%
      filter(cmrr == 0 & pmrr > 0) %>%
      summarise(MRR = sum(pmrr, na.rm = T))

    grow_matrix[i, "New"] <- new
    grow_matrix[i, "Retain"] <- retain
    grow_matrix[i, "Resurrected"] <- resurrected
    grow_matrix[i, "Expansion"] <- expansion
    grow_matrix[i, "Contraction"] <- contraction
    grow_matrix[i, "Churn"] <- churn
  }
  grow_matrix$dates <- dates

  # Totals and the SaaS quick ratio.
  # NOTE(review): QuickRatio is Inf (or NaN) for months with no churn and
  # no contraction -- confirm downstream consumers handle that.
  grow_matrix <- grow_matrix %>%
    mutate(MRR = New + Retain + Resurrected + Expansion,
           ARR = MRR*12) %>%
    mutate(QuickRatio = round((New + Resurrected+ Expansion)/(Churn+Contraction),2))

  return(grow_matrix)
}

--------------------------------------------------------------------------------
/R/RevActUserProduct.R:
--------------------------------------------------------------------------------
#' Calculate revenue accounting matrix per day, user and product
#'
@description It calculates the accounting matrix for each user and product given date, daily revenue data and product category.
#' @param dates a date vector (have to be date type)
#' @param dates_full a full range of dates
#' @param datall_revenue daily revenue data, need to have columns: user_id, Product, active_date, MRR
#' @param type a vector of product names
#' @return A revenue accounting matrix is returned for each day, user and product
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' team <- c("netlify_team_premium_monthly","Teams","netlify_team_plus_monthly","netlify_team_plus_yearly")
#' res_0 = RevActUserProduct(total_revenue, dates, team)
#' }
#' @export
#'

RevActUserProduct <- function(datall_revenue, dates, dates_full, type){

  # Per-user daily MRR for the requested products, rounded to cents.
  dat <- datall_revenue %>%
    ungroup()%>%
    ## change this filter to get result for different products
    dplyr::filter(Product %in% type) %>%
    dplyr::transmute(user_id = user_id,
                     MRR= MRR,
                     active_date = as.Date(active_date)) %>%
    dplyr::group_by(user_id, active_date) %>%
    dplyr::summarise(MRR = round(sum(MRR), 2)) %>%
    ungroup()

  ######################## BEGIN OF TEM_FUN #############################
  # Per-date worker: builds the per-user accounting rows for one time point.
  tem_fun = function(time_point, dat, dates_full){

    cmon = time_point

    # the previous month
    # pmon = floor_date(time_point, unit = "month")- days(1)
    # NOTE(review): if time_point is the FIRST element of dates_full, the
    # index below is 0 and pmon is empty, so every user looks "new" for that
    # date -- confirm callers never pass the first dates_full element in
    # `dates`. Also assumes dates_full is sorted; verify at call sites.
    pmon = dates_full[which(dates_full == cmon) - 1]

    # the legacy month
    lmon = dates_full[dates_full < pmon]

    # current mrr
    cmrr = dat %>%
      dplyr::filter(active_date == cmon) %>%
      dplyr::select(user_id, cmrr = MRR) %>%
      dplyr::filter(cmrr > 0)

    # previous mrr
    pmrr = dat %>%
      dplyr::filter(active_date == pmon) %>%
      dplyr::select(user_id, pmrr = MRR) %>%
      dplyr::filter(pmrr > 0)

    # legacy mrr, get the maximum number
    lmrr = dat %>%
      dplyr::filter(active_date %in% lmon) %>%
      dplyr::group_by(user_id) %>%
      dplyr::summarise(lmrr = max(MRR)) %>%
      dplyr::filter(lmrr > 0)


    # join all together (outer join per user; missing periods imputed to 0)
    alltable <- merge(cmrr, pmrr, all = T) %>%
      merge(lmrr, all=T) %>%
      NetlifyDS::impute_dat(method = "zero")

    # Get new: active now, never active before
    new = alltable %>%
      dplyr::filter( cmrr > 0 & pmrr == 0 & lmrr == 0) %>%
      dplyr::select(user_id, new = cmrr )

    # Get resurrected: active now, inactive last period, active in the past
    resurrected <- alltable %>%
      dplyr::filter(cmrr > 0 & pmrr == 0 & lmrr > 0) %>%
      dplyr::select(user_id,resurrected = cmrr)

    # Get Retain: per user, the smaller of current and previous MRR
    retain1 <- alltable %>%
      dplyr::filter(cmrr > 0 & pmrr > 0) %>%
      dplyr::filter(cmrr >= pmrr) %>%
      dplyr::select(user_id,retain1 = pmrr)

    retain2 <- alltable %>%
      dplyr::filter(cmrr > 0 & pmrr > 0) %>%
      dplyr::filter(cmrr < pmrr) %>%
      dplyr::select(user_id,retain2 = cmrr)

    retain = merge(retain1, retain2, all=T) %>%
      NetlifyDS::impute_dat(method = "zero")%>%
      dplyr::transmute(user_id = user_id, retain = retain1 + retain2)

    # Get expansion: period-over-period increase for users active both periods
    expansion <- alltable %>%
      dplyr::filter(cmrr > 0 & pmrr >0) %>%
      dplyr::filter(cmrr > pmrr) %>%
      dplyr::transmute(user_id = user_id, expansion = cmrr -pmrr)

    # Get contraction: period-over-period decrease for users active both periods
    contraction <- alltable %>%
      dplyr::filter(cmrr > 0 & pmrr >0) %>%
      dplyr::filter(cmrr < pmrr) %>%
      dplyr::transmute(user_id = user_id, contraction =pmrr-cmrr)

    # Get churn: active last period, inactive now
    churn <- alltable %>%
      dplyr::filter(cmrr == 0 & pmrr > 0) %>%
      dplyr::transmute(user_id = user_id, churn =pmrr)

    # outer-join the six components per user and fill gaps with 0
    res0 = merge(new, resurrected, all = T)
    res0 = merge(res0, retain, all = T)
    res0 = merge(res0, expansion, all = T)
    res0 = merge(res0, contraction, all = T)
    res0 = merge(res0, churn, all = T)
    res0 = NetlifyDS::impute_dat(res0, method = "zero")

    res0$date = time_point
    return(res0)
  }

  ######################## END OF TEM_FUN #############################

  # Stack the per-date results, then drop rows with no activity at all.
  res = do.call("rbind", lapply(dates, function(time_point){tem_fun(time_point, dat = dat, dates_full = dates_full)}))
  res = res %>% dplyr::filter( !(new == 0 & resurrected == 0 & retain == 0 & expansion == 0 & contraction == 0 & churn == 0) )

  return(res)
}

--------------------------------------------------------------------------------
/R/RevCntAct.R:
--------------------------------------------------------------------------------
#' Calculate revenue accounting matrix by counts
#' @description It calculates the accounting matrix by counts given date and daily revenue data. It only returns one row for each date, i.e. it will sum up all the products.
#' @param dates a date vector (have to be date type)
#' @param datall_revenue daily revenue data, need to have columns: user_id, Product, active_date, MRR
#' @param type product type
#' @return A revenue accounting matrix by count is returned for each date.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' dates <- seq(as.Date("2018-02-01"), lubridate::ceiling_date(Sys.Date(), unit = "month"), "months") - days(1)
#' dates
#' # need to get daily revenue from subscription data if you don't have that already
#' base_monthly = do.call("rbind", lapply(dates, function(time_point)
#' {NetlifyDS::mrrAtDateType(time_point, subs = subscription_data, type = NULL)}))
#' growth_matrix_identity = RevCntAct(datall_revenue = total_revenue, type = c("Identity"), dates = dates)
#' }
#' @export

RevCntAct <- function(datall_revenue, type, dates){

  # Per-user daily MRR for the requested products, rounded to cents.
  dat <- datall_revenue %>%
    ungroup()%>%
    ## change this filter to get result for different products
    filter(Product %in% type) %>%
    transmute(user_id = user_id,
              MRR= MRR,
              active_date = as.Date(active_date)) %>%
    group_by(user_id, active_date) %>%
    summarise(MRR = round(sum(MRR), 2)) %>%
    ungroup()

  # One row per date; each cell holds a COUNT of users, not revenue.
  cnam <- c("New", "Retain", "Resurrected","Expansion", "Contraction", "Churn")
  grow_matrix <- matrix(0,nrow=length(dates), ncol= length(cnam)) %>%
    data.frame()
  names(grow_matrix) <- cnam

  for (i in 1:length(dates)){
    # the current month
    cmon = dates[i]

    # the previous month
    pmon = floor_date(dates[i], unit = "month")- days(1)

    # the legacy month
    # NOTE(review): "legacy" is built only from the supplied `dates` vector --
    # confirm callers pass a complete monthly sequence.
    lmon = dates[dates < pmon]

    # current mrr
    cmrr = dat %>%
      filter(active_date == cmon) %>%
      select(user_id, cmrr = MRR) %>%
      filter(cmrr > 0)

    # previous mrr
    pmrr = dat %>%
      filter(active_date == pmon) %>%
      select(user_id, pmrr = MRR) %>%
      filter(pmrr > 0)

    # legacy mrr, get the maximum number
    lmrr = dat %>%
      filter(active_date %in% lmon) %>%
      group_by(user_id) %>%
      summarise(lmrr = max(MRR)) %>%
      filter(lmrr > 0)


    # join all together (outer join per user; missing periods imputed to 0)
    alltable <- merge(cmrr, pmrr, all = T) %>%
      merge(lmrr, all=T) %>%
      NetlifyDS::impute_dat(method = "zero")

    # Get new: users active now who were never active before
    new = alltable %>%
      filter( cmrr > 0 & pmrr == 0 & lmrr == 0) %>%
      summarise(user_cnt = length(unique(user_id)))

    # Get resurrected: users active now, inactive last month, active in the past
    resurrected <- alltable %>%
      filter(cmrr > 0 & pmrr == 0 & lmrr > 0) %>%
      summarise(user_cnt = length(unique(user_id)))

    # Get Retain: users active in both months (retain1 + retain2 covers the
    # two disjoint cases, so the sum counts each retained user once)
    retain1 <- alltable %>%
      filter(cmrr > 0 & pmrr > 0) %>%
      filter(cmrr >= pmrr) %>%
      summarise(user_cnt = length(unique(user_id)))

    retain2 <- alltable %>%
      filter(cmrr > 0 & pmrr > 0) %>%
      filter(cmrr < pmrr) %>%
      summarise(user_cnt = length(unique(user_id)))

    retain = retain1 + retain2

    # Get expansion: count of users whose MRR grew month-over-month
    expansion <- alltable %>%
      filter(cmrr > 0 & pmrr >0) %>%
      filter(cmrr > pmrr) %>%
      summarise(user_cnt = length(unique(user_id)) )

    # Get contraction: count of users whose MRR shrank month-over-month
    contraction <- alltable %>%
      filter(cmrr > 0 & pmrr >0) %>%
      filter(cmrr < pmrr) %>%
      summarise(user_cnt = length(unique(user_id)))

    # Get churn: count of users active last month with no MRR now
    churn <- alltable %>%
      filter(cmrr == 0 & pmrr > 0) %>%
      summarise(user_cnt = length(unique(user_id)))

    grow_matrix[i, "New"] <- new
    grow_matrix[i, "Retain"] <- retain
    grow_matrix[i, "Resurrected"] <- resurrected
    grow_matrix[i, "Expansion"] <- expansion
    grow_matrix[i, "Contraction"] <- contraction
    grow_matrix[i, "Churn"] <- churn
  }
  grow_matrix$dates <- dates

  return(grow_matrix)
}

--------------------------------------------------------------------------------
/R/cv-glasso.R:
--------------------------------------------------------------------------------
#' Cross-validation for group lasso logistic regression
#' @description Does k-fold cross-validation for group lasso logistic regression and returns a list object
#' @param trainx a data frame
where samples are in rows and features are in columns
#' @param trainy a numeric or factor vector containing the outcome for each sample
#' @param nlam number of lambda values. The default is 100
#' @param type the type of prediction. \code{type = 'link'} is on the scale of linear predictors (default), whereas \code{type = 'response'} is on the scale of the response variable, i.e. \code{type = 'response'} applies the inverse link function to the linear predictors.
#' @param na_action function determining what should be done with missing values when predicting new data during cross validation. The default is to predict NA.
#' @param kfold number of folds - default is 10. Although nfolds can be as large as the sample size (leave-one-out CV), it is not recommended for large datasets. The default is 10.
#' @param index group membership for each column of \code{trainx}. The default derives the group from the column-name prefix before the first ".", matching the documented example.
#' @return
#' an object of class "\code{cv_glasso}" is returned, which is a list with the ingredients of the cross-validation fit.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' data("sim1_da1")
#' trainx = dplyr::select(sim1_da1, -y)
#' trainy = sim1_da1$y
#' # index of the group
#' index <- gsub("\\..*", "", names(trainx))
#' # nlam is the number of values of tuning variable
#' nlam <- 10
#' # type of prediction
#' type = "link"
#' # number of cross-validation folds
#' kfold <- 10
#' cv_fit <- cv_glasso(trainx, trainy, nlam = nlam, kfold = kfold)
#' str(cv_fit)
#' }

#' @export
#'
cv_glasso <- function(trainx, trainy, nlam = 100, type = "link", kfold = 10,
    na_action = na.pass,
    index = gsub("\\..*", "", names(trainx))) {
    # BUG FIX: `index` used to be a free variable resolved from the caller's
    # global environment; it is now a parameter whose default reproduces the
    # grouping used in the documented example.

    library(grplasso)
    library(caret)
    library(pROC)
    n <- nrow(trainx)
    # Do grouplasso logistic regression: design matrix with an unpenalized
    # intercept (group NA) prepended.
    x <- cbind(1, as.matrix(trainx))
    colnames(x) <- c("Intercept", colnames(trainx))
    index <- c(NA, as.factor(index))
    ################################# Get a vector of tuning parameters
    # Geometric grid downward from the smallest lambda with an all-zero fit.
    lambda <- lambdamax(x, y = trainy, index = index, penscale = sqrt,
        model = LogReg()) * 0.96^(1:nlam)

    index0 <- c(1:n)
    y0 <- NULL
    pre <- NULL

    for (i in 1:kfold) {
        # Sample the hold-out rows from the not-yet-used pool. The min()
        # guard keeps the last folds from requesting more rows than remain
        # (rounding can make kfold * round(n/kfold) exceed n).
        idx <- sample(index0, min(round(n/kfold, 0), length(index0)), replace = F)
        test <- trainx[idx, ]
        train <- trainx[-idx, ]
        y0 <- c(y0, trainy[idx])
        xtest <- cbind(1, as.matrix(test))
        xtrain <- cbind(1, as.matrix(train))
        colnames(xtrain) <- c("Intercept", colnames(train))
        colnames(xtest) <- c("Intercept", colnames(test))
        fit <- grplasso(xtrain, trainy[-idx], index = index, lambda = lambda,
            model = LogReg(), penscale = sqrt)
        pre0 <- predict(fit, xtest, type = type, na.action = na_action)
        pre0 <- data.frame(pre0)
        pre <- rbind(pre, pre0)
        # BUG FIX: the original did `index0 <- index0[-idx]`, removing by
        # POSITION although idx holds sampled row VALUES, so folds after the
        # first could overlap. Remove the used rows by value instead.
        index0 <- setdiff(index0, idx)
    }

    ############################## Maximize AUC over the lambda grid
    auc <- rep(0, ncol(pre))
    for (i in 1:ncol(pre)) {
        auc[i] <- as.numeric(auc(y0, pre[, i]))
    }
    pmsure = auc
    lambda.max.auc = c(lambda = lambda[which(pmsure == max(pmsure))], auc = max(pmsure))
    se.auc = max(pmsure) - sd(pmsure)
    lambda.1se.auc = c(lambda[min(which(pmsure >= se.auc))], se.auc = se.auc)
    ############################## Maximize log-likelihood
    loglike <- rep(0, ncol(pre))
    for (i in 1:ncol(pre)) {
        loglike[i] <- sum((pre[, i] - log(1 + exp(pre[, i]))) * y0 + (-log(1 +
            exp(pre[, i]))) * (1 - y0))
    }
    pmsure = loglike
    lambda.max.loglike = c(lambda = lambda[which(pmsure == max(pmsure))], loglike = max(pmsure))
    se.loglike = max(pmsure) - sd(pmsure)
    lambda.1se.loglike = c(lambda = lambda[min(which(pmsure >= se.loglike))], se.loglike = se.loglike)
    ############################## Maximize correlation between thresholded
    ############################## predictions and the observed outcome
    co <- pre
    maxco <- rep(0, ncol(pre))
    for (i in 1:ncol(pre)) {
        s <- sort(pre[, i])
        for (j in 1:nrow(pre)) {
            yhat <- as.numeric(pre[, i] >= s[j])
            co[j, i] <- cor(yhat, y0)
        }
        maxco[i] <- max(na.omit(co[, i]))
    }
    pmsure = maxco
    lambda.max.maxco = c(lambda = lambda[which(pmsure == max(pmsure))], maxco = max(pmsure))
    se.maxco = max(pmsure) - sd(pmsure)
    lambda.1se.maxco = c(lambda = lambda[min(which(pmsure >= se.maxco))], se.maxco = se.maxco)
    ###############
    res <- list(lambda = lambda, pred = pre, auc = auc, log_likelihood = loglike,
        maxrho = maxco, lambda.max.auc = lambda.max.auc, lambda.1se.auc = lambda.1se.auc,
        lambda.max.loglike = lambda.max.loglike, lambda.1se.loglike = lambda.1se.loglike,
        lambda.max.maxco = lambda.max.maxco, lambda.1se.maxco = lambda.1se.maxco)
    class(res) <- "cv_glasso"
    # (the original had an unreachable invisible(res) after this return)
    return(res)
}

--------------------------------------------------------------------------------
/R/fitglasso.R:
--------------------------------------------------------------------------------
#' Fit group lasso logistic regression
#' @description Fit group lasso logistic regression and returns a list object
#' @param trainx a data frame where samples are in rows and features are in columns
#' @param trainy a numeric or factor vector containing the outcome for each sample
#' @param lambda value of tuning parameter lambda
#'
#' @return
#' A \code{grplasso} object is returned, for which \code{coef}, \code{print}, \code{plot} and \code{predict} methods exist.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' data("sim1_da1")
#' trainx = dplyr::select(sim1_da1, -y)
#' trainy = sim1_da1$y
#' # index of the group
#' index <- gsub("\\..*", "", names(trainx))
#' # nlam is the number of values of tuning variable
#' nlam <- 20
#' # type of prediction
#' type = "link"
#' # number of cross-validation folds
#' kfold <- 10
#' cv_fit <- cv_glasso(trainx, trainy, nlam = nlam, kfold = kfold)
#' str(cv_fit)
#'
#' fitgl <- fitglasso(trainx = trainx, trainy= trainy, lambda = cv_fit$lambda.max.auc[1])
#' }
#' @param index optional grouping indicator for the columns of \code{trainx};
#'   columns sharing a value form one penalized group. Defaults to the
#'   column-name prefix before the first "." (e.g. "grp.x1" -> "grp"),
#'   matching the workflow shown in the examples.
#' @export
#'

fitglasso <- function(trainx, trainy, lambda, na_action = na.pass,
                      index = NULL) {
  library(grplasso)
  # BUG FIX: `index` was referenced below but never defined in this function
  # (it only existed in the example script), so every call failed with
  # "object 'index' not found". Derive a default from the column names.
  if (is.null(index)) {
    index <- gsub("\\..*", "", colnames(trainx))
  }
  # Design matrix with an explicit intercept column, as grplasso expects.
  x <- cbind(1, as.matrix(trainx))
  colnames(x) <- c("Intercept", colnames(trainx))
  # grplasso codes groups as integers; NA marks the unpenalized intercept.
  grp <- c(NA, as.numeric(as.factor(index)))
  # NOTE(review): `na_action` is accepted for backward compatibility but is
  # not consumed by grplasso::grplasso() here -- confirm intended behavior.
  grplasso(x, trainy, index = grp, lambda = lambda)
}

# -------- R/impute-dat.R --------
#' Impute missing values
#'
#' @param dat a data frame.
#' @param method imputation method.
#' Need to be one of these: \code{zero}, \code{mean}, \code{Inf0}, \code{code99}
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' library(DataScienceR)
#' data("SegData")
#' impdat <- SegData[, c("income", "house")]
#' # summary(SegData)
#' impdat = impute_dat(impdat, method = "mean")
#' summary(impdat)
#' }

#' @export
impute_dat <- function(dat, method) {
  # Replace values column by column according to `method`:
  #   zero   -- NA -> 0
  #   mean   -- NA -> column mean over the non-missing values
  #   Inf0   -- +/-Inf -> 0
  #   code99 -- the sentinel value 99 -> 0
  if (method == "zero") {
    for (i in seq_len(ncol(dat))) {
      idx <- which(is.na(dat[, i]))
      if (length(idx) > 0) dat[, i][idx] <- 0
    }
  } else if (method == "mean") {
    for (i in seq_len(ncol(dat))) {
      idx <- which(is.na(dat[, i]))
      if (length(idx) > 0) dat[, i][idx] <- mean(na.omit(dat[, i]))
    }
  } else if (method == "Inf0") {
    for (i in seq_len(ncol(dat))) {
      idx <- which(abs(dat[, i]) == Inf)
      if (length(idx) > 0) dat[, i][idx] <- 0
    }
  } else if (method == "code99") {
    for (i in seq_len(ncol(dat))) {
      idx <- which(dat[, i] == 99)
      if (length(idx) > 0) dat[, i][idx] <- 0
    }
  } else {
    # BUG FIX: an unknown method previously cat()'d a note and returned NULL
    # invisibly; callers now get a real error instead of a silent NULL.
    stop("Argument \"method\" has to be one of: zero, mean, Inf0, code99",
         call. = FALSE)
  }
  dat
}

# -------- R/mrrAtDateType.R --------
#' Calculate MRR for a specific date and product type
#' @description It calculates the MRR given date, subscription data, and product type
#' @param time_point a date value (have to be date type)
#' @param subs subscription data, need to have columns: account_id,
#' user_id, Product, Effective_Start, Effective_End, MRR, rate_plan_name
#' @param type product type, if "NULL", it will return MRR for each product
#' @return
#' a vector of MRR is returned
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' add_ons <- c("Forms", "Functions", "Identity", "Domains", "Large Media")
#' dates <- seq(as.Date("2018-02-01"), lubridate::ceiling_date(Sys.Date(), unit = "month"), "months") - days(1)
#' dates
#' add_ons_mrr <- sapply(dates, function(time_point) {mrrAtDateType(time_point, datall_revenue, add_ons)})
#' }
#' @export
#'

mrrAtDateType <- function(time_point, subs, type = NULL) {
  # Coerce so character dates ("2018-01-31") are accepted as well.
  time_point <- as.Date(time_point)

  if (is.null(type)) {
    # Active at `time_point`: started on/before it and either open-ended
    # (NA end) or ending strictly after it.
    actives <- subs %>%
      dplyr::filter(Effective_Start <= time_point) %>%
      dplyr::filter(is.na(Effective_End) | Effective_End > time_point)

    res <- actives %>%
      dplyr::group_by(account_id, user_id, Product, rate_plan_name) %>%
      # round it in case we have some magic number again
      dplyr::summarise(MRR = round(sum(MRR, na.rm = TRUE), 10))

    res$Date <- time_point
    return(res)
  } else {
    actives <- subs %>%
      dplyr::filter(Product %in% type) %>%
      dplyr::filter(Effective_Start <= time_point) %>%
      dplyr::filter(is.na(Effective_End) | Effective_End > time_point)
    return(sum(actives$MRR, na.rm = TRUE))
  }
}

# -------- R/multiplot.R --------
#' Multiple plot function
#' @description make multiple plot for ggplot objects.
#' @param ... Pass ggplot objects or plotlist (as a list of ggplot objects)
#' @param cols Number of columns in layout
#' @param layout A matrix specifying the layout. If present, 'cols' is ignored.
#' If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE), then plot 1 will go in the upper left, 2 will go in the upper right, and 3 will go all the way across the bottom.
#'
#' @export
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  # library() fails fast if grid is missing; require() only returned FALSE.
  library(grid)
  # NOTE(review): `file` is accepted but never used anywhere in the body;
  # kept so existing calls that pass it keep working.

  # Collect plots handed in via ... and/or plotlist into one list.
  plots <- c(list(...), plotlist)
  numPlots <- length(plots)

  # If layout is NULL, use 'cols' to determine the panel layout:
  # ncol = requested columns, nrow = as many rows as needed.
  if (is.null(layout)) {
    layout <- matrix(seq_len(cols * ceiling(numPlots / cols)),
                     ncol = cols, nrow = ceiling(numPlots / cols))
  }

  if (numPlots == 1) {
    print(plots[[1]])
  } else {
    # Set up the page, then draw each plot in its assigned viewport cell.
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    for (i in seq_len(numPlots)) {
      # Matrix position(s) of the region that contains subplot i.
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

# -------- R/pairwise_ks_test.R --------
#' Pairwise Kolmogorov-Smirnov Test
#' @description Perform Pairwise Multiple Comparisons Using Kolmogorov-Smirnov Test
#' @param value a numeric vector of data values
#' @param group a group indicator
#' @param n_min The minimum number of observations in each group. Group(s) with observation less than \code{n_min} will be removed. "\code{n_min=50}"(default)
#' @param warning sets the handling of warning messages.
If \code{warning} is negative all warnings are ignored. If \code{warning} is zero (the default) warnings are stored until the top–level function returns. If 10 or fewer warnings were signalled they will be printed otherwise a message saying how many were signalled. An object called last.warning is created and can be printed through the function warnings. If \code{warning} is one, warnings are printed as they occur. If warn is two or larger all warnings are turned into errors. 7 | #' @param alternative indicates the alternative hypothesis and must be one of "\code{two.sided}" (default), "\code{less}", or "\code{greater}". You can specify just the initial letter of the value, but the argument name must be give in full. See ‘Details’ for the meanings of the possible values. 8 | #' @details Missing values are silently omitted from x and (in the two-sample case) y. 9 | #' 10 | #' The possible values "\code{two.sided}", "\code{less}" and "\code{greater}" of alternative specify the null hypothesis that the true distribution function of x is equal to, not less than or not greater than the hypothesized distribution function (one-sample case) or the distribution function of y (two-sample case), respectively. This is a comparison of cumulative distribution functions, and the test statistic is the maximum difference in value, with the statistic in the "greater" alternative being \eqn{D^+ = max[F_x(u) - F_y(u)]}. Thus in the two-sample case alternative = "greater" includes distributions for which x is stochastically smaller than y (the CDF of x lies above and hence to the left of that for y), in contrast to \code{t.test} or \code{wilcox.test}. 
#'
#' @return Pairwise Kolmogorov-Smirnov Test p-value Matrix
#'
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' data("iris")
#' value<-iris$Sepal.Length
#' group<-iris$Species
#' pairwise_ks_test(value,group,warning = -1)
#' }
#' @export

pairwise_ks_test <- function(value, group, n_min = 50, warning = 0,
                             alternative = "two.sided") {
  # One vector of observations per group level.
  lev <- unique(group)
  lst <- lapply(seq_along(lev), function(i) value[group == lev[i]])
  names(lst) <- lev

  # Drop groups that carry fewer than n_min observations.
  small <- lengths(lst) < n_min
  if (any(small)) {
    lst <- lst[!small]
  }

  # Two-sample KS p-value with the warning level temporarily set to
  # `warning` (see ?options, "warn"). BUG FIX: the previous setting was not
  # restored when ks.test() errored; on.exit() restores it in every case.
  ks_p <- function(x, y) {
    old_warn <- getOption("warn")
    on.exit(options(warn = old_warn), add = TRUE)
    options(warn = warning)
    ks.test(x, y, alternative = alternative, exact = FALSE)$p.value
  }

  # All pairwise comparisons, folded into a square p-value matrix.
  res <- lapply(lst, function(x) lapply(lst, function(y) ks_p(x, y)))
  res <- matrix(unlist(res), nrow = length(lst), ncol = length(lst),
                byrow = TRUE)
  row.names(res) <- colnames(res) <- names(lst)
  cat("Pairwise Kolmogorov-Smirnov Test p-value Matrix", "\n", "\n")
  res
}

# -------- R/plot.cv_glasso.R --------
#' plot the cross-validation curve produced by "\code{cv_glasso}"
#' @description Plots the cross-validation curve as a function of the lambda values used.
#' @param x fitted "\code{cv_glasso}" object
#' @param type.measure criteria to use for cross-validation. Currently three options. The default is \code{type.measure = "auc"} which gives area under the ROC curve. \code{type.measure = "loglike"} computes the log-likelihood score in Meier et al. 2008. \code{type.measure = "maxco"} computes the maximum correlation coefficient in Yeo and Burge.
#' @return A plot is produced, and nothing is returned.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @references L. Meier, S. van de Geer, and P. Buhlmann, The group lasso for logistic regression, J. R. Stat. Soc. Ser. B Stat. Methodol. 70 (2008), pp. 53-71.
#' @references G.W. Yeo and C.B. Burge, Maximum entropy modeling of short sequence motifs with applications to RNA splicing signals, J. Computnl Biol. 11 (2004), pp. 475-494.
#'
#' @examples
#' \dontrun{
#' data("sim1_da1")
#' trainx = dplyr::select(sim1_da1, -y)
#' trainy = sim1_da1$y
#' # index of the group
#' index <- gsub("\\..*", "", names(trainx))
#' # nlam is the number of values of tuning variable
#' nlam <- 10
#' # type of prediction
#' type = "link"
#' # number of cross-validation folds
#' kfold <- 10
#' cv.fit <- cv_glasso(trainx, trainy, nlam = nlam, kfold = kfold)
#' plot.cv_glasso(cv.fit)
#' }
#'
#' @export
plot.cv_glasso <- function(x, type.measure = "auc", ...) {
  cvobj <- x
  # The cross-validated metric values, one per lambda.
  metric <- cvobj[[type.measure]]
  plot.args <- list(x = cvobj$lambda, y = metric,
                    ylim = range(metric) + c(-sd(metric), sd(metric)),
                    xlab = "Lambda", ylab = type.measure, type = "n")
  # Caller-supplied graphics parameters override the defaults above.
  new.args <- list(...)
  if (length(new.args) > 0) {
    plot.args[names(new.args)] <- new.args
  }
  do.call("plot", plot.args)
  points(cvobj$lambda, metric, pch = 20, col = "red")
  # Dotted verticals at the metric-maximizing and 1-SE lambda choices.
  abline(v = cvobj[[paste("lambda.max", type.measure, sep = ".")]][1], lty = 3)
  abline(v = cvobj[[paste("lambda.1se", type.measure, sep = ".")]][1], lty = 3)
  invisible()
}

# -------- R/plot.rocTest.R --------
#' Plot ROC curve
#' @description Plot ROC curve based on rocTest object
#' @param x rocTest object
#' @author Hui Lin, \email{longqiman@gmail.com}

#' @export
plot.rocTest <- function(x, auto.legend = TRUE, ...) {
  # Empty unit-square canvas with minimal axes.
  plot(c(0, 1), c(0, 1), type = "n", ann = FALSE, asp = 0,
       xaxs = "i", yaxs = "i", xaxt = "n", yaxt = "n")
  axis(1, c(0, .5, 1))
  axis(2, c(0, .5, 1))
  # One ROC curve (1 - specificity vs sensitivity) per test, distinct lty.
  for (j in seq_along(x$sens)) {
    lines(I(1 - x$spec[[j]]), x$sens[[j]], lty = j)
  }
  # Chance diagonal; hoisted out of the loop (it was redrawn each iteration).
  abline(0, 1, lty = 6)
  if (auto.legend) {
    # BUG FIX: labels were hard-coded as 'Group Lasso'/'Expert Opinion'/
    # 'Diagonal' regardless of input; use the curve names carried on the
    # rocTest object so the legend matches any set of tests.
    legend(0.6, 0.3, lty = c(seq_along(x$sens), 6),
           legend = c(names(x$sens), "Diagonal"), bty = "n", cex = 1)
  }
}

# -------- R/plot.tune_cutoff.R --------
#' Plot cutoff tuning process from "\code{tune_cutoff}"
#' @description Plots cutoff tuning process
#' @param x "\code{tune_cutoff}" object
#' @param pch type of cutoff points, default is 20
#' @param cex size of cutoff points, default is 1.5
#' @param ... other parameters in "\code{plot()}" function
#' @param col color of cutoff points, default is \code{col = "red"}
#' @author Hui Lin, \email{longqiman@gmail.com}
#'
#' @examples
#' \dontrun{
#' data("sim1_da1")
#' trainx = sim1_da1[,1:50]
#' trainy = sim1_da1$y
#' library(glmnet)
#' fit <- cv.glmnet(as.matrix(trainx), trainy, family = "binomial")
#' test <- predict(fit, as.matrix(trainx), type = "link", s = "lambda.min")
#' test <- as.vector(test)
#' summary(test)
#' likelihood <- c(0.2, 0.5, 0.8)
#' y <- trainy
#' x <- tune_cutoff(test = test, y = y, likelihood = likelihood)
#' str(x)
#' plot.tune.cutoff(x)
#' }
#'
#' @export
plot.tune.cutoff <- function(x, pch = 20, cex = 1.5, col = "red", ...) {
  dat <- x$cutoff.tune
  # PPV/NPV trade-off curve over all candidate cutoffs.
  plot(dat$PPV, dat$NPV, type = "l",
       xlab = "True Positive", ylab = "True Negative",
       main = paste("Likelihood groups:",
                    paste(x$likelihood, collapse = ", ")), ...)
  # Highlight and label the tuned cutoff positions.
  points(dat$PPV[x$cutoffs], dat$NPV[x$cutoffs], pch = pch, cex = cex, col = col)
  text(dat$PPV[x$cutoffs], dat$NPV[x$cutoffs] + 0.1, x$cutoffs)
}

# -------- R/predict.fitglasso.R --------
#' Predict method for grplasso objects
#' @description Obtains predictions from a \code{grplasso} object.
#' @param object a \code{grplasso} object
#' @param newdata data.frame or design matrix of new observations
#' @param type the type of prediction. \code{type = "link"} is on the scale of linear predictors, whereas \code{type = "response"} is on the scale of the response variable, i.e. \code{type = "response"} applies the inverse link function to the linear predictors.
#' @param na.action function determining what should be done with missing values in newdata. The default is to predict NA.
#' @param ... other options to be passed to the predict function.
#' @export
#'
predict_glasso <- function(object, newdata, type = c("link", "response"),
                           na.action = na.pass, ...) {
  library(grplasso)
  # Prepend the intercept column to mirror the design matrix built in
  # fitglasso(); the unused `n <- nrow(newdata)` local was removed.
  x <- cbind(1, as.matrix(newdata))
  predict(object, x, type = type, na.action = na.action, ...)
}

# -------- R/rocTest.R --------
#' Compute the area under the ROC curve and compare different AUC's
#' @description This function compares the AUC of two correlated (or paired) or uncorrelated (unpaired) ROC curves.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @param y response
#' @param x prediction
#' @param L list to assign the comparisons

#' @export
rocTest <- function(y, x, L = NULL) {

  # Trapezoidal area under a curve whose x-range is anchored to [0, 1].
  trapezarea <- function(x, y) {
    if (x[1] > x[length(x)]) {
      x <- rev(x)
      y <- rev(y)
    }
    if (length(x) != length(y))
      stop("length x must equal length y")
    if (length(unique(x)) < 2)
      return(NA)
    # Interpolated curve heights at x = 0 and x = 1 (ties resolved upward).
    ya <- approx(x, y, 0, ties = max, rule = 2)$y
    yb <- approx(x, y, 1, ties = max, rule = 2)$y
    x <- c(0, x, 1)
    y <- c(ya, y, yb)
    h <- diff(x)
    0.5 * sum(h * (y[-1] + y[-length(x)]))
  }

  # Empirical ROC (sensitivity/specificity) and AUC for each prediction set.
  n_tests <- length(x)
  th <- rep(NA_real_, n_tests)
  sens <- vector("list", n_tests)
  spec <- vector("list", n_tests)
  for (j in seq_len(n_tests)) {
    # Negated scores so table rows run from highest to lowest score.
    DD <- table(-x[[j]], y)
    sens[[j]] <- c(0, cumsum(DD[, 2]) / sum(DD[, 2]))
    spec[[j]] <- c(1, 1 - cumsum(DD[, 1]) / sum(DD[, 1]))
    th[j] <- trapezarea(1 - spec[[j]], sens[[j]])
  }

  if (!is.null(names(x))) {
    names(sens) <- names(x)
    names(spec) <- names(x)
  } else {
    names(sens) <- paste("Test", LETTERS[seq_len(n_tests)])
    names(spec) <- paste("Test", LETTERS[seq_len(n_tests)])
  }

  if (!is.null(L)) {
    # DeLong-style placement values: V10 over diseased (y == 1) subjects,
    # V01 over non-diseased (y == 0) subjects.
    n1 <- length(y[y == 1])
    n0 <- length(y[y == 0])
    V10 <- matrix(NA, nrow = n1, ncol = n_tests)
    V01 <- matrix(NA, nrow = n0, ncol = n_tests)

    for (j in seq_len(n_tests)) {
      x.s <- split(x[[j]], y)
      for (i in seq_along(x.s$"1"))
        V10[i, j] <- (length(x.s$"0"[x.s$"0" < x.s$"1"[i]]) +
                        0.5 * length(x.s$"0"[x.s$"0" == x.s$"1"[i]])) / n0
      for (i in seq_along(x.s$"0"))
        V01[i, j] <- (length(x.s$"1"[x.s$"0"[i] < x.s$"1"]) +
                        0.5 * length(x.s$"1"[x.s$"1" == x.s$"0"[i]])) / n1
    }

    # Covariance components and the chi-square contrast test on the AUCs.
    S10 <- (t(V10) %*% V10 - n1 * th %*% t(th)) / (n1 - 1)
    S01 <- (t(V01) %*% V01 - n0 * th %*% t(th)) / (n0 - 1)
    S <- S10 / n1 + S01 / n0

    contr <- L %*% th
    # (a dead local `se` that was computed but never used has been removed)

    test <- t(th) %*% t(L) %*%
      solve(L %*% (S10 / n1 + S01 / n0) %*% t(L), t(t(th) %*% t(L)))

    p.value <- pchisq(test, df = qr(L %*% S %*% t(L))$rank, lower.tail = FALSE)
  } else {
    S <- NULL
    p.value <- NULL
    contr <- NULL
  }

  names(th) <- names(x)
  res <- list(th = th, sens = sens, spec = spec, contr = contr, S = S,
              p.value = p.value)
  class(res) <- "rocTest"
  invisible(res)
}

# -------- R/subclass_eff_est.R --------
#' Estimation ATT and ATE after stratification
#' @description After stratification, estimation the weighted mean difference. Return estimates of ATT and ATE
#' @param object The output object from matchit. This is a required input.
#' @param y outcome variable. This is a required input.
#' @return
#' Function returns a list with the ATT and ATE estimates
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' subclass_eff_est(m2.out, match.data(m2.out)$CORN_UNITS_CHG)
#' }

#' @export
#'
#'
subclass_eff_est <- function(object, y) {

  # One row per matched unit: stratum id, treatment flag, and outcome.
  mdat_str = tibble(subclass = object$subclass,
                    treatment = object$treat,
                    y = y
  )

  # Per-stratum effect: difference of treated vs control outcome means.
  TAOs = mdat_str %>%
    group_by(subclass, treatment) %>%
    summarise(ybar = mean(y)) %>%
    ungroup() %>%
    group_by(subclass) %>%
    summarise(taos = diff(ybar)) %>%
    ungroup()

  # Stratum weights: overall share for ATE, treated share for ATT.
  Ns <- mdat_str %>%
    group_by(subclass) %>%
    summarise(Ns = n(), Ns1 = sum(treatment)) %>%
    ungroup() %>%
    mutate(wt_ate = Ns / sum(Ns), wt_att = Ns1 / sum(mdat_str$treatment))

  tao_att = sum(TAOs$taos * Ns$wt_att)
  tao_ate = sum(TAOs$taos * Ns$wt_ate)

  ###############
  # (an unreachable `invisible(res)` after return() has been removed)
  res <- list(tao_att = tao_att,
              tao_ate = tao_ate)
  return(res)
}

# -------- R/tune-cutoff.R --------
#' Tune the cutoffs for different likelihood groups
#' @description Tune the cutoffs for different likelihood groups given predicted score, response and a vector of likelihood values
#' @param test a vector of predicted score from model
#' @param y a vector of response, \code{y} need to be the same length of \code{test}
#' @param likelihood a vector of likelihood values
#'
#' @return
#' an object of class "\code{tune_cutoff}" is returned, which is a list with the ingredients of the tuning process.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' data("sim1_da1")
#' trainx = sim1_da1[,1:50]
#' trainy = sim1_da1$y
#' library(glmnet)
#' fit <- cv.glmnet(as.matrix(trainx), trainy, family = "binomial")
#' test <- predict(fit, as.matrix(trainx), type = "link", s = "lambda.min")
#' test <- as.vector(test)
#' summary(test)
#' likelihood <- c(0.2, 0.5, 0.8)
#' y <- trainy
#' x <- tune_cutoff(test = test, y = y, likelihood = likelihood)
#' str(x)
#' plot.tune.cutoff(x)
#' }
#'
#' @export
#'
tune_cutoff <- function(test, y, likelihood) {

  # PPV/NPV at every observed score used as a candidate cutoff.
  n <- length(y)
  ppv <- rep(-1, n)
  npv <- rep(-1, n)
  sorted_test <- sort(test)
  for (i in seq_len(n)) {
    t <- sorted_test[i]
    np <- sum(test >= t)             # predicted positive at this cutoff
    nn <- n - np                     # predicted negative
    tp <- sum((test >= t) * y)       # true positives
    tn <- sum((test < t) * (1 - y))  # true negatives
    ppv[i] <- tp / np
    npv[i] <- tn / nn
  }

  ta <- data.frame(PPV = ppv, NPV = npv, cutoff = sorted_test)
  # Responses re-ordered to follow the sorted scores.
  ResPre <- data.frame(cbind(cutoff = test, testClasses = y))
  ResPre <- ResPre[order(ResPre$cutoff), ]
  ta$testClasses <- ResPre$testClasses

  # ------------------------------------------------------------------
  # tune the cutoffs: reorder the targets so the lowest group is tuned
  # first, the highest second, then the middle ones in ascending order.
  # NOTE(review): the reordering below assumes at least 3 likelihood values;
  # with exactly 2 the rk indexing wraps around -- confirm intended usage.
  likelihood <- sort(likelihood)
  rk <- seq_along(likelihood)
  rk <- c(1, length(rk), rk[2:(length(rk) - 1)])
  likelihood <- likelihood[rk]

  ids <- seq_len(length(likelihood) - 1)

  for (i in seq_along(ids)) {
    # BUG FIX: a leftover debug statement `i = 2` overwrote the loop index on
    # every iteration, so only the second branch ever ran and groups 1 and 3+
    # were never tuned. It has been removed.
    p <- likelihood[i]
    id1s <- NULL

    ## ------------------ the 1st group: largest prefix with event rate <= p
    if (i == 1) {
      for (j in seq_along(ta$testClasses)) {
        if (mean(ta$testClasses[1:j], na.rm = TRUE) <= p) {
          id1s <- c(id1s, j)
        }
      }
      ids[i] <- max(id1s, na.rm = TRUE)
    }

    ## ------------------ the 2nd group: earliest suffix with event rate >= p
    if (i == 2) {
      for (j in ids[1]:length(ta$testClasses)) {
        if (mean(ta$testClasses[j:length(ta$testClasses)], na.rm = TRUE) >= p) {
          id1s <- c(id1s, j)
        }
      }
      ids[i] <- min(id1s, na.rm = TRUE)
    }

    ## ------------------ the 3rd group
    if (i == 3) {
      for (j in (ids[1] + 1):(ids[2] - 1)) {
        if (mean(ta$testClasses[(ids[1] + 1):j], na.rm = TRUE) <= p) {
          id1s <- c(id1s, j)
        }
      }
      ids[i] <- max(id1s, na.rm = TRUE)
    }

    ## ------------------ the rest groups
    if (i > 3) {
      for (j in (ids[i - 1] + 1):(ids[2] - 1)) {
        if (mean(ta$testClasses[(ids[i - 1] + 1):j], na.rm = TRUE) <= p) {
          id1s <- c(id1s, j)
        }
      }
      ids[i] <- max(id1s, na.rm = TRUE)
    }
  }

  res <- list(cutoff.tune = ta,
              cutoffs = sort(ids),
              likelihood = sort(likelihood)
  )

  class(res) <- "tune_cutoff"
  res
}

# -------- README.md --------
# The `NetlifyDS` package contains functions to streamline Netlify's data science project. It espouses the philosophy: Anything that can be automated should be automated. Do as much as possible with functions.
#
# Functions in this package are for the following purposes:
#
# - Build data pipeline
# - Predictive analytics (complementary to caret package)
# - Market research analysis (psychometric models and segmentation)
# - Unstructured data analysis (text mining, web scraping)
#
# `NetlifyDS` utilizes a number of R packages. It loads packages as needed and assumes that they are installed.
Install the package using: 11 | 12 | ```r 13 | # if you haven't installed devtools 14 | # install.packages("devtools") 15 | devtools::install_github("netlify/NetlifyDS") 16 | library("NetlifyDS") 17 | ``` 18 | -------------------------------------------------------------------------------- /man/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/man/.DS_Store -------------------------------------------------------------------------------- /man/MeterRevActUserProduct.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/MeterRevActUserProduct.R 3 | \name{MeterRevActUserProduct} 4 | \alias{MeterRevActUserProduct} 5 | \title{Calculate metered revenue accounting matrix per day, user and product} 6 | \usage{ 7 | MeterRevActUserProduct(datall_revenue, dates, type) 8 | } 9 | \arguments{ 10 | \item{datall_revenue}{subscription data, need to have columns: user_id, Product, Effective_Start, one_time_charge} 11 | 12 | \item{dates}{a date vector (have to be date type)} 13 | 14 | \item{type}{a vector of product names} 15 | } 16 | \value{ 17 | A metered revenue accounting matrix is returned for each day, user and product 18 | } 19 | \description{ 20 | It calculates the accounting matrix for each user and product given date, subscription data and product category. 
21 | } 22 | \author{ 23 | Hui Lin, \email{longqiman@gmail.com} 24 | } 25 | -------------------------------------------------------------------------------- /man/RevAct.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RevAct.R 3 | \name{RevAct} 4 | \alias{RevAct} 5 | \title{Calculate revenue accounting matrix} 6 | \usage{ 7 | RevAct(datall_revenue, type, dates) 8 | } 9 | \arguments{ 10 | \item{datall_revenue}{daily revenue data, need to have columns: user_id, Product, active_date, MRR} 11 | 12 | \item{type}{product type} 13 | 14 | \item{dates}{a date vector (have to be date type)} 15 | } 16 | \value{ 17 | A revenue accounting matrix is returned for each date. 18 | } 19 | \description{ 20 | It calculates the accounting matrix given date and daily revenue data. It only returns one row for each date, i.e. it will sum up all the products. 21 | } 22 | \examples{ 23 | \dontrun{ 24 | dates <- seq(as.Date("2018-02-01"), lubridate::ceiling_date(Sys.Date(), unit = "month"), "months") - days(1) 25 | dates 26 | # need to get daily revenue from subscription data if you don't have that already 27 | base_monthly = do.call("rbind", lapply(dates, function(time_point) 28 | growth_matrix_identity = RevAct(datall_revenue = base_monthly, type = c("Identity"), dates = dates) 29 | } 30 | } 31 | \author{ 32 | Hui Lin, \email{longqiman@gmail.com} 33 | } 34 | -------------------------------------------------------------------------------- /man/RevActUserProduct.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RevActUserProduct.R 3 | \name{RevActUserProduct} 4 | \alias{RevActUserProduct} 5 | \title{Calculate revenue accounting matrix per day, user and product} 6 | \usage{ 7 | RevActUserProduct(datall_revenue, dates, dates_full, type) 8 | } 9 | 
\arguments{ 10 | \item{datall_revenue}{daily revenue data, need to have columns: user_id, Product, active_date, MRR} 11 | 12 | \item{dates}{a date vector (have to be date type)} 13 | 14 | \item{dates_full}{a full range of dates} 15 | 16 | \item{type}{a vector of product names} 17 | } 18 | \value{ 19 | A revenue accounting matrix is returned for each day, user and product 20 | } 21 | \description{ 22 | It calculates the accounting matrix for each user and product given date, subscription data and product category. 23 | } 24 | \examples{ 25 | \dontrun{ 26 | team <- c("netlify_team_premium_monthly","Teams","netlify_team_plus_monthly","netlify_team_plus_yearly") 27 | res_0 = RevActUserProduct(total_revenue, dates, team) 28 | } 29 | } 30 | \author{ 31 | Hui Lin, \email{longqiman@gmail.com} 32 | } 33 | -------------------------------------------------------------------------------- /man/RevCntAct.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RevCntAct.R 3 | \name{RevCntAct} 4 | \alias{RevCntAct} 5 | \title{Calculate revenue accounting matrix by counts} 6 | \usage{ 7 | RevCntAct(datall_revenue, type, dates) 8 | } 9 | \arguments{ 10 | \item{datall_revenue}{subscription data, need to have columns: user_id, Product, Effective_Start, Effective_End, MRR} 11 | 12 | \item{type}{product type} 13 | 14 | \item{dates}{a date vector (have to be date type)} 15 | } 16 | \value{ 17 | A revenue accounting matrix by count is returned for each date. 18 | } 19 | \description{ 20 | It calculates the accounting matrix by counts given date and subscription data. It only returns one row for each date, i.e. it will sum up all the products. 
21 | } 22 | \examples{ 23 | \dontrun{ 24 | dates <- seq(as.Date("2018-02-01"), lubridate::ceiling_date(Sys.Date(), unit = "month"), "months") - days(1) 25 | dates 26 | # need to get daily revenue from subscription data if you don't have that already 27 | base_monthly = do.call("rbind", lapply(dates, function(time_point) 28 | {NetlifyDS::mrrAtDateType(time_point, subs = base_monthly, type = NULL)})) 29 | growth_matrix_identity = RevCntAct(datall_revenue = total_revenue, type = c("Identity"), dates = dates) 30 | } 31 | } 32 | \author{ 33 | Hui Lin, \email{longqiman@gmail.com} 34 | } 35 | -------------------------------------------------------------------------------- /man/clust_ana.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ClusterAnalysis.R 3 | \name{clust_ana} 4 | \alias{clust_ana} 5 | \title{Cluster analysis} 6 | \usage{ 7 | clust_ana(ans, k, distm = "euclidean", hclustm = "ward.D", 8 | na.rm = FALSE) 9 | } 10 | \arguments{ 11 | \item{ans}{data matrix} 12 | 13 | \item{k}{number of clusters} 14 | 15 | \item{distm}{Dissimilarity index, partial match to "\code{manhattan}", "\code{euclidean}"(default), "\code{canberra}", "\code{bray}", "\code{kulczynski}", "\code{jaccard}", "\code{gower}", "\code{altGower}", "\code{morisita}", "\code{horn}", "\code{mountford}", "\code{raup}" , "\code{binomial}", "\code{chao}", "\code{cao}" or "\code{mahalanobis}".} 16 | 17 | \item{hclustm}{the agglomeration method to be used. 
This should be (an unambiguous abbreviation of) one of "\code{ward.D}"(default), "\code{ward.D2}", "\code{single}", "\code{complete}", "\code{average}" (= UPGMA), "\code{mcquitty}" (= WPGMA), "\code{median}" (= WPGMC) or "\code{centroid}" (= UPGMC).} 18 | 19 | \item{na.rm}{Pairwise deletion of missing observations when computing dissimilarities.} 20 | } 21 | \description{ 22 | Does cluster analysis and reports a cluster index 23 | } 24 | \examples{ 25 | \dontrun{ 26 | data("SegData") 27 | library(dplyr) 28 | library(vegan) 29 | cind <- clust_ana(SegData, k=4) 30 | } 31 | } 32 | \author{ 33 | Hui Lin, \email{longqiman@gmail.com} 34 | } 35 | -------------------------------------------------------------------------------- /man/cv_glasso.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cv-glasso.R 3 | \name{cv_glasso} 4 | \alias{cv_glasso} 5 | \title{Cross-validation for group lasso logistic regression} 6 | \usage{ 7 | cv_glasso(trainx, trainy, nlam = 100, type = "link", kfold = 10, 8 | na_action = na.pass) 9 | } 10 | \arguments{ 11 | \item{trainx}{a data frame where samples are in rows and features are in columns} 12 | 13 | \item{trainy}{a numeric or factor vector containing the outcome for each sample} 14 | 15 | \item{nlam}{number of lambda values. The default is 100} 16 | 17 | \item{type}{the type of prediction. \code{type = 'link'} is on the scale of linear predictors (default), whereas \code{type = 'response'} is on the scale of the response variable, i.e. \code{type = 'response'} applies the inverse link function to the linear predictors.} 18 | 19 | \item{kfold}{number of folds - default is 10. Although nfolds can be as large as the sample size (leave-one-out CV), it is not recommended for large datasets. 
The default is 10.} 20 | 21 | \item{na_action}{function determining what should be done with missing values when predicting new data during cross validation. The default is to predict NA.} 22 | } 23 | \value{ 24 | an object of class "\code{cv_glasso}" is returned, which is a list with the ingredients of the cross-validation fit. 25 | } 26 | \description{ 27 | Does k-fold cross-validation for group lasso logistic regression and returns a list object 28 | } 29 | \examples{ 30 | \dontrun{ 31 | data("sim1_da1") 32 | trainx = dplyr::select(sim1_da1, -y) 33 | trainy = sim1_da1$y 34 | # index of the group 35 | index <- gsub("\\\\..*", "", names(trainx)) 36 | # nlam is the number of values of tuning variable 37 | nlam <- 10 38 | # type of prediction 39 | type = "link" 40 | # number of cross-validation folds 41 | kfold <- 10 42 | cv_fit <- cv_glasso(trainx, trainy, nlam = nlam, kfold = kfold) 43 | str(cv_fit) 44 | } 45 | } 46 | \author{ 47 | Hui Lin, \email{longqiman@gmail.com} 48 | } 49 | -------------------------------------------------------------------------------- /man/fitglasso.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fitglasso.R 3 | \name{fitglasso} 4 | \alias{fitglasso} 5 | \title{Fit group lasso logistic regression} 6 | \usage{ 7 | fitglasso(trainx, trainy, lambda, na_action = na.pass) 8 | } 9 | \arguments{ 10 | \item{trainx}{a data frame where samples are in rows and features are in columns} 11 | 12 | \item{trainy}{a numeric or factor vector containing the outcome for each sample} 13 | 14 | \item{lambda}{value of tuning parameter lambda} 15 | } 16 | \value{ 17 | A \code{grplasso} object is returned, for which \code{coef}, \code{print}, \code{plot} and \code{predict} methods exist. 
18 | } 19 | \description{ 20 | Fit group lasso logistic regression and returns a list object 21 | } 22 | \examples{ 23 | \dontrun{ 24 | data("sim1_da1") 25 | trainx = dplyr::select(sim1_da1, -y) 26 | trainy = sim1_da1$y 27 | # index of the group 28 | index <- gsub("\\\\..*", "", names(trainx)) 29 | # nlam is the number of values of tuning variable 30 | nlam <- 20 31 | # type of prediction 32 | type = "link" 33 | # number of cross-validation folds 34 | kfold <- 10 35 | cv_fit <- cv_glasso(trainx, trainy, nlam = nlam, kfold = kfold) 36 | str(cv_fit) 37 | 38 | fitgl <- fitglasso(trainx = trainx, trainy= trainy, lambda = cv_fit$lambda.max.auc[1]) 39 | } 40 | } 41 | \author{ 42 | Hui Lin, \email{longqiman@gmail.com} 43 | } 44 | -------------------------------------------------------------------------------- /man/impute_dat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/impute-dat.R 3 | \name{impute_dat} 4 | \alias{impute_dat} 5 | \title{Impute missing values} 6 | \usage{ 7 | impute_dat(dat, method) 8 | } 9 | \arguments{ 10 | \item{dat}{a data frame.} 11 | 12 | \item{method}{imputation method. 
Need to be one of these: \code{zero}, \code{mean}, \code{Inf0}, \code{code99}} 13 | } 14 | \description{ 15 | Impute missing values 16 | } 17 | \examples{ 18 | \dontrun{ 19 | library(DataScienceR) 20 | data("SegData") 21 | impdat <- SegData[, c("income", "house")] 22 | # summary(SegData) 23 | impdat = impute_dat(impdat, method = "mean") 24 | summary(impdat) 25 | } 26 | } 27 | \author{ 28 | Hui Lin, \email{longqiman@gmail.com} 29 | } 30 | -------------------------------------------------------------------------------- /man/mrrAtDateType.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mrrAtDateType.R 3 | \name{mrrAtDateType} 4 | \alias{mrrAtDateType} 5 | \title{Calculate MRR for a specific date and product type} 6 | \usage{ 7 | mrrAtDateType(time_point, subs, type = NULL) 8 | } 9 | \arguments{ 10 | \item{time_point}{a date value (have to be date type)} 11 | 12 | \item{subs}{subscription data, need to have columns: account_id, user_id, Product, Effective_Start, Effective_End, MRR} 13 | 14 | \item{type}{product type, if "NULL", it will return MRR for each product} 15 | } 16 | \value{ 17 | a vector of MRR is returned 18 | } 19 | \description{ 20 | It calculates the MRR given date, subscription data, and product type 21 | } 22 | \examples{ 23 | \dontrun{ 24 | add_ons <- c("Forms", "Functions", "Identity", "Domains", "Large Media") 25 | dates <- seq(as.Date("2018-02-01"), lubridate::ceiling_date(Sys.Date(), unit = "month"), "months") - days(1) 26 | dates 27 | add_ons_mrr <- sapply(dates, function(time_point) {mrrAtDateType(time_point, datall_revenue, add_ons)}) 28 | } 29 | } 30 | \author{ 31 | Hui Lin, \email{longqiman@gmail.com} 32 | } 33 | -------------------------------------------------------------------------------- /man/multiplot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do 
not edit by hand 2 | % Please edit documentation in R/multiplot.R 3 | \name{multiplot} 4 | \alias{multiplot} 5 | \title{Multiple plot function} 6 | \usage{ 7 | multiplot(..., plotlist = NULL, file, cols = 1, layout = NULL) 8 | } 9 | \arguments{ 10 | \item{...}{Pass ggplot objects or plotlist (as a list of ggplot objects)} 11 | 12 | \item{cols}{Number of columns in layout} 13 | 14 | \item{layout}{A matrix specifying the layout. If present, 'cols' is ignored. If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE), then plot 1 will go in the upper left, 2 will go in the upper right, and 3 will go all the way across the bottom.} 15 | } 16 | \description{ 17 | make multiple plot for ggplot objects. 18 | } 19 | -------------------------------------------------------------------------------- /man/out_mad.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Outliers.R 3 | \name{out_mad} 4 | \alias{out_mad} 5 | \title{Identify outliers using MAD} 6 | \usage{ 7 | out_mad(x) 8 | } 9 | \arguments{ 10 | \item{x}{a vector} 11 | } 12 | \value{ 13 | a vector indicating identified outliers 14 | } 15 | \description{ 16 | Identify outliers using MAD 17 | } 18 | \examples{ 19 | \dontrun{ 20 | x<-c(seq(1:1000),20000) 21 | out_mad(x) 22 | } 23 | } 24 | \author{ 25 | Hui Lin, \email{longqiman@gmail.com} 26 | } 27 | -------------------------------------------------------------------------------- /man/pairwise_ks_test.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pairwise_ks_test.R 3 | \name{pairwise_ks_test} 4 | \alias{pairwise_ks_test} 5 | \title{Pairwise Kolmogorov-Smirnov Test} 6 | \usage{ 7 | pairwise_ks_test(value, group, n_min = 50, warning = 0, 8 | alternative = "two.sided") 9 | } 10 | \arguments{ 11 | \item{value}{a numeric 
vector of data values} 12 | 13 | \item{group}{a group indicator} 14 | 15 | \item{n_min}{The minimum number of observations in each group. Group(s) with fewer observations than \code{n_min} will be removed. "\code{n_min=50}"(default)} 16 | 17 | \item{warning}{sets the handling of warning messages. If \code{warning} is negative all warnings are ignored. If \code{warning} is zero (the default) warnings are stored until the top-level function returns. If 10 or fewer warnings were signalled they will be printed, otherwise a message saying how many were signalled. An object called last.warning is created and can be printed through the function warnings. If \code{warning} is one, warnings are printed as they occur. If \code{warning} is two or larger all warnings are turned into errors.} 18 | 19 | \item{alternative}{indicates the alternative hypothesis and must be one of "\code{two.sided}" (default), "\code{less}", or "\code{greater}". You can specify just the initial letter of the value, but the argument name must be given in full. See ‘Details’ for the meanings of the possible values.} 20 | } 21 | \value{ 22 | Pairwise Kolmogorov-Smirnov Test p-value Matrix 23 | } 24 | \description{ 25 | Perform Pairwise Multiple Comparisons Using Kolmogorov-Smirnov Test 26 | } 27 | \details{ 28 | Missing values are silently omitted from x and (in the two-sample case) y. 29 | 30 | The possible values "\code{two.sided}", "\code{less}" and "\code{greater}" of alternative specify the null hypothesis that the true distribution function of x is equal to, not less than or not greater than the hypothesized distribution function (one-sample case) or the distribution function of y (two-sample case), respectively. This is a comparison of cumulative distribution functions, and the test statistic is the maximum difference in value, with the statistic in the "greater" alternative being \eqn{D^+ = max[F_x(u) - F_y(u)]}. 
Thus in the two-sample case alternative = "greater" includes distributions for which x is stochastically smaller than y (the CDF of x lies above and hence to the left of that for y), in contrast to \code{t.test} or \code{wilcox.test}. 31 | } 32 | \examples{ 33 | \dontrun{ 34 | data("iris") 35 | value<-iris$Sepal.Length 36 | group<-iris$Species 37 | pairwise_ks_test(value,group,warning = -1) 38 | } 39 | } 40 | \author{ 41 | Hui Lin, \email{longqiman@gmail.com} 42 | } 43 | -------------------------------------------------------------------------------- /man/plot.cv_glasso.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.cv_glasso.R 3 | \name{plot.cv_glasso} 4 | \alias{plot.cv_glasso} 5 | \title{plot the cross-validation curve produced by "\code{cv_glasso}"} 6 | \usage{ 7 | \method{plot}{cv_glasso}(x, type.measure = "auc", ...) 8 | } 9 | \arguments{ 10 | \item{x}{fitted "\code{cv_glasso}" object} 11 | 12 | \item{type.measure}{criteria to use for cross-validation. Currently three options. The default is \code{type.measure = "auc"} which gives area under the ROC curve. \code{type.measure = "loglike"} computes the log-likelihood score in Meier et al2008. \code{type.measure = "maxco"} computes the maximum correlation coefficient in Yeo and Burge.} 13 | } 14 | \value{ 15 | A plot is produced, and nothing is returned. 16 | } 17 | \description{ 18 | Plots the cross-validation curve as a function of the lambda values used. 
19 | } 20 | \examples{ 21 | \dontrun{ 22 | data("sim1_da1") 23 | trainx = dplyr::select(sim1_da1, -y) 24 | trainy = sim1_da1$y 25 | # index of the group 26 | index <- gsub("\\\\..*", "", names(trainx)) 27 | # nlam is the number of values of tuning variable 28 | nlam <- 10 29 | # type of prediction 30 | type = "link" 31 | # number of cross-validation folds 32 | kfold <- 10 33 | cv.fit <- cv_glasso(trainx, trainy, nlam = nlam, kfold = kfold) 34 | plot.cv_glasso(cv.fit) 35 | } 36 | 37 | } 38 | \references{ 39 | L. Meier, S. van de Geer, and P. Buhlmann, The group lasso for logistic regression, J. R. Stat. Soc. Ser. B Stat. Methodol. 70 (2008), pp. 53-71. 40 | 41 | G.W. Yeo and C.B. Burge, Maximum entropy modeling of short sequence motifs with applications to RNA splicing signals, J. Computnl Biol. 11 (2004), pp. 475-494. 42 | } 43 | \author{ 44 | Hui Lin, \email{longqiman@gmail.com} 45 | } 46 | -------------------------------------------------------------------------------- /man/plot.rocTest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.rocTest.R 3 | \name{plot.rocTest} 4 | \alias{plot.rocTest} 5 | \title{Plot ROC curve} 6 | \usage{ 7 | \method{plot}{rocTest}(x, auto.legend = TRUE, ...) 
8 | } 9 | \arguments{ 10 | \item{x}{rocTest object} 11 | } 12 | \description{ 13 | Plot ROC curve based on rocTest object 14 | } 15 | \author{ 16 | Hui Lin, \email{longqiman@gmail.com} 17 | } 18 | -------------------------------------------------------------------------------- /man/plot.tune.cutoff.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.tune_cutoff.R 3 | \name{plot.tune.cutoff} 4 | \alias{plot.tune.cutoff} 5 | \title{Plot cutoff tuning process from "\code{tune_cutoff}"} 6 | \usage{ 7 | \method{plot}{tune.cutoff}(x, pch = 20, cex = 1.5, col = "red", ...) 8 | } 9 | \arguments{ 10 | \item{x}{"\code{tune_cutoff}" object} 11 | 12 | \item{pch}{type of cutoff points, default is 20} 13 | 14 | \item{cex}{size of cutoff points, default is 1.5, col = "red"} 15 | 16 | \item{col}{color of cutoff points, default is \code{col = "red"}} 17 | 18 | \item{...}{other parameters in "\code{plot()}" function} 19 | } 20 | \description{ 21 | Plots cutoff tuning process 22 | } 23 | \examples{ 24 | \dontrun{ 25 | data("sim1_da1") 26 | trainx = sim1_da1[,1:50] 27 | trainy = sim1_da1$y 28 | library(glmnet) 29 | fit <- cv.glmnet(as.matrix(trainx), trainy, family = "binomial") 30 | test <- predict(fit, as.matrix(trainx), type = "link", s = "lambda.min") 31 | test <- as.vector(test) 32 | summary(test) 33 | likelihood <- c(0.2, 0.5, 0.8) 34 | y <- trainy 35 | x <- tune_cutoff(test = test, y = y, likelihood = likelihood) 36 | str(x) 37 | plot.tune.cutoff(x) 38 | } 39 | 40 | } 41 | \author{ 42 | Hui Lin, \email{longqiman@gmail.com} 43 | } 44 | -------------------------------------------------------------------------------- /man/predict_glasso.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/predict.fitglasso.R 3 | \name{predict_glasso} 4 | 
\alias{predict_glasso} 5 | \title{Predict method for grplasso objects} 6 | \usage{ 7 | predict_glasso(object, newdata, type = c("link", "response"), 8 | na.action = na.pass, ...) 9 | } 10 | \arguments{ 11 | \item{object}{a \code{grplasso} object} 12 | 13 | \item{newdata}{data.frame or design matrix of new observations} 14 | 15 | \item{type}{the type of prediction. \code{type = "link"} is on the scale of linear predictors, whereas \code{type = "response"} is on the scale of the response variable, i.e. \code{type = "response"} applies the inverse link function to the linear predictors. 16 | na.action function determining what should be done with missing values in newdata. The default is to predict NA.} 17 | 18 | \item{...}{other options to be passed to the predict function.} 19 | } 20 | \description{ 21 | Obtains predictions from a \code{grplasso} object. 22 | } 23 | -------------------------------------------------------------------------------- /man/rocTest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rocTest.R 3 | \name{rocTest} 4 | \alias{rocTest} 5 | \title{Compute the area under the ROC curve and compare different AUC's} 6 | \usage{ 7 | rocTest(y, x, L = NULL) 8 | } 9 | \arguments{ 10 | \item{y}{response} 11 | 12 | \item{x}{prediction} 13 | 14 | \item{L}{list to assign the comparisons} 15 | } 16 | \description{ 17 | This function compares the AUC of two correlated (or paired) or uncorrelated (un- paired) ROC curves. 
18 | } 19 | \author{ 20 | Hui Lin, \email{longqiman@gmail.com} 21 | } 22 | -------------------------------------------------------------------------------- /man/subclass_eff_est.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/subclass_eff_est.R 3 | \name{subclass_eff_est} 4 | \alias{subclass_eff_est} 5 | \title{Estimate ATT and ATE after stratification} 6 | \usage{ 7 | subclass_eff_est(object, y) 8 | } 9 | \arguments{ 10 | \item{object}{The output object from matchit. This is a required input.} 11 | 12 | \item{y}{outcome variable. This is a required input.} 13 | } 14 | \value{ 15 | Function returns a list with the ATT and ATE estimates 16 | } 17 | \description{ 18 | After stratification, estimate the weighted mean difference. Returns estimates of ATT and ATE 19 | } 20 | \examples{ 21 | \dontrun{ 22 | subclass_eff_est(m2.out, match.data(m2.out)$CORN_UNITS_CHG) 23 | } 24 | } 25 | \author{ 26 | Hui Lin, \email{longqiman@gmail.com} 27 | } 28 | -------------------------------------------------------------------------------- /man/tune_cutoff.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tune-cutoff.R 3 | \name{tune_cutoff} 4 | \alias{tune_cutoff} 5 | \title{Tune the cutoffs for different likelihood groups} 6 | \usage{ 7 | tune_cutoff(test, y, likelihood) 8 | } 9 | \arguments{ 10 | \item{test}{a vector of predicted score from model} 11 | 12 | \item{y}{a vector of response, \code{y} needs to be the same length as \code{test}} 13 | 14 | \item{likelihood}{a vector of likelihood values} 15 | } 16 | \value{ 17 | an object of class "\code{tune_cutoff}" is returned, which is a list with the ingredients of the tuning process. 
18 | } 19 | \description{ 20 | Tune the cutoffs for different likelihood groups given predicted score, response and a vector of likelihood values 21 | } 22 | \examples{ 23 | \dontrun{ 24 | data("sim1_da1") 25 | trainx = sim1_da1[,1:50] 26 | trainy = sim1_da1$y 27 | library(glmnet) 28 | fit <- cv.glmnet(as.matrix(trainx), trainy, family = "binomial") 29 | test <- predict(fit, as.matrix(trainx), type = "link", s = "lambda.min") 30 | test <- as.vector(test) 31 | summary(test) 32 | likelihood <- c(0.2, 0.5, 0.8) 33 | y <- trainy 34 | x <- tune_cutoff(test = test, y = y, likelihood = likelihood) 35 | str(x) 36 | plot.tune.cutoff(x) 37 | } 38 | 39 | } 40 | \author{ 41 | Hui Lin, \email{longqiman@gmail.com} 42 | } 43 | --------------------------------------------------------------------------------