├── .DS_Store ├── .Rbuildignore ├── .github └── CODEOWNERS ├── .gitignore ├── Archive ├── .DS_Store ├── MeterRevAct.R ├── RevActUser.R ├── RevAct_Q.R ├── old_version_RevActUser.R ├── summarySE2.R └── summarySE3.R ├── DESCRIPTION ├── Data ├── AirlineRating.RData ├── SegData.RData ├── df_encirca.RData ├── df_mon.RData ├── df_phi.RData ├── links.RData ├── nodes.RData ├── sim1_da1.RData ├── sim1_da2.RData ├── sim1_da3.RData └── tweets.RData ├── NAMESPACE ├── NetlifyDS.Rproj ├── R ├── .DS_Store ├── ClusterAnalysis.R ├── MeterRevActUserProduct.R ├── NoAcctDateType.R ├── Outliers.R ├── RevAct.R ├── RevActUserProduct.R ├── RevCntAct.R ├── cv-glasso.R ├── fitglasso.R ├── impute-dat.R ├── mrrAtDateType.R ├── multiplot.R ├── pairwise_ks_test.R ├── plot.cv_glasso.R ├── plot.rocTest.R ├── plot.tune_cutoff.R ├── predict.fitglasso.R ├── rocTest.R ├── subclass_eff_est.R └── tune-cutoff.R ├── README.md └── man ├── .DS_Store ├── MeterRevActUserProduct.Rd ├── RevAct.Rd ├── RevActUserProduct.Rd ├── RevCntAct.Rd ├── clust_ana.Rd ├── cv_glasso.Rd ├── fitglasso.Rd ├── impute_dat.Rd ├── mrrAtDateType.Rd ├── multiplot.Rd ├── out_mad.Rd ├── pairwise_ks_test.Rd ├── plot.cv_glasso.Rd ├── plot.rocTest.Rd ├── plot.tune.cutoff.Rd ├── predict_glasso.Rd ├── rocTest.Rd ├── subclass_eff_est.Rd └── tune_cutoff.Rd /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/.DS_Store -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ # An Rmarkdown file used to generate README.md 4 | ^Archive$ 5 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * 
@netlify/datascience 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /Archive/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Archive/.DS_Store -------------------------------------------------------------------------------- /Archive/MeterRevAct.R: --------------------------------------------------------------------------------
#' Calculate metered revenue accounting matrix
#'
#' @description Calculates the metered revenue accounting matrix for the given
#'   dates and subscription data. It returns one row per date, i.e. all
#'   products of the requested type are summed together.
#' @param dates a date vector (must be of class Date)
#' @param datall_revenue subscription data; required columns: user_id, Product,
#'   Effective_Start, one_time_charge
#' @param type product type(s) to keep (matched against the Product column)
#' @return A data frame (revenue accounting matrix) with one row per date and
#'   columns New, Retain, Resurrected, Expansion, Contraction, Churn, dates
#'   and total_one_time_charge.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' dates <- seq(as.Date("2018-02-01"),
#'              lubridate::ceiling_date(Sys.Date(), unit = "month"),
#'              "months") - days(1)
#' metered_growth_matrix <- MeterRevAct(datall_revenue = meter_revenue,
#'                                      type = one_time_charge_product,
#'                                      dates = dates)
#' }
#' @export

MeterRevAct <- function(datall_revenue, type, dates) {

  dat <- datall_revenue %>%
    ## change this filter to get results for different products
    filter(Product %in% type) %>%
    transmute(user_id = user_id,
              one_time_charge = one_time_charge,
              Effective_Start = as.Date(Effective_Start))

  ## helpers: treat NA the same as "no revenue"
  has_rev <- function(x) !is.na(x) & x > 0
  no_rev <- function(x) is.na(x) | x <= 0

  ## total one-time charge per user for starts inside a time window
  charge_by_user <- function(window) {
    dat %>%
      filter(Effective_Start %within% window) %>%
      group_by(user_id) %>%
      summarise(one_time_charge = round(sum(one_time_charge), 2))
  }

  ## set up the matrix and fill in values
  cnam <- c("New", "Retain", "Resurrected", "Expansion", "Contraction", "Churn")
  grow_matrix <- matrix(0, nrow = length(dates), ncol = length(cnam)) %>%
    data.frame()
  names(grow_matrix) <- cnam

  for (i in seq_along(dates)) {
    ## current month: from the 1st day of the month up to dates[i]
    current <- interval(floor_date(dates[i], unit = "month"), dates[i])
    active_current <- charge_by_user(current) %>%
      select(user_id, one_time_charge_current = one_time_charge)

    ## previous calendar month
    int_start_previous <- floor_date(dates[i], unit = "month") %m-% months(1)
    int_end_previous <- ceiling_date(int_start_previous, unit = "month") - days(1)
    active_last_month <- charge_by_user(interval(int_start_previous, int_end_previous)) %>%
      select(user_id, one_time_charge_last_month = one_time_charge)

    ## everything before the previous month
    int_start_past <- dates[i] %m-% months(12 * 100)
    int_end_past <- floor_date(dates[i], unit = "month") %m-% months(1) - days(1)
    active_past <- charge_by_user(interval(int_start_past, int_end_past)) %>%
      select(user_id, one_time_charge_past = one_time_charge)

    alltable <- merge(active_current, active_last_month, all = TRUE) %>%
      merge(active_past, all = TRUE)

    ## new: revenue from users with no revenue before this month
    new <- alltable %>%
      filter(no_rev(one_time_charge_past) & no_rev(one_time_charge_last_month)) %>%
      summarise(v = sum(one_time_charge_current, na.rm = TRUE)) %>%
      pull(v)

    ## resurrected: active now, inactive last month, active some time in the past
    resurrected <- alltable %>%
      filter(has_rev(one_time_charge_current),
             no_rev(one_time_charge_last_month),
             has_rev(one_time_charge_past)) %>%
      summarise(v = sum(one_time_charge_current, na.rm = TRUE)) %>%
      pull(v)

    ## retain: for users active in both months, the smaller of the current and
    ## last month's charge (equivalent to the original retain1 + retain2 split)
    retain <- alltable %>%
      filter(has_rev(one_time_charge_current),
             has_rev(one_time_charge_last_month)) %>%
      summarise(v = sum(pmin(one_time_charge_current, one_time_charge_last_month))) %>%
      pull(v)

    ## expansion: growth for users active in both months
    expansion <- alltable %>%
      filter(has_rev(one_time_charge_current),
             has_rev(one_time_charge_last_month),
             one_time_charge_current > one_time_charge_last_month) %>%
      summarise(v = sum(one_time_charge_current - one_time_charge_last_month)) %>%
      pull(v)

    ## contraction: shrinkage for users active in both months
    contraction <- alltable %>%
      filter(has_rev(one_time_charge_current),
             has_rev(one_time_charge_last_month),
             one_time_charge_current < one_time_charge_last_month) %>%
      summarise(v = sum(one_time_charge_last_month - one_time_charge_current)) %>%
      pull(v)

    ## churn: revenue lost from users active last month but not now
    churn <- alltable %>%
      filter(no_rev(one_time_charge_current),
             has_rev(one_time_charge_last_month)) %>%
      summarise(v = sum(one_time_charge_last_month, na.rm = TRUE)) %>%
      pull(v)

    ## order must match cnam
    grow_matrix[i, ] <- c(new, retain, resurrected, expansion, contraction, churn)
  }

  grow_matrix$dates <- dates

  grow_matrix <- grow_matrix %>%
    mutate(total_one_time_charge = New + Retain + Resurrected + Expansion)

  return(grow_matrix)
}
-------------------------------------------------------------------------------- /Archive/RevActUser.R: --------------------------------------------------------------------------------
#' Calculate revenue accounting matrix for each day and user
#'
#' @description Calculates the revenue accounting matrix given dates and
#'   subscription data, broken out by user.
#' @param dates a date vector (must be of class Date)
#' @param datall_revenue subscription data; required columns: user_id,
#'   Effective_Start, Effective_End, MRR
#' @return A data frame with one row per user and date and the columns new,
#'   resurrected, retain, expansion, contraction and churn; rows that are
#'   zero in every category are dropped.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' res <- RevActUser(total_revenue, dates)
#' }
#' @export

RevActUser <- function(datall_revenue, dates) {

  ## fixed: the previous version read the global `total_revenue` instead of
  ## the `datall_revenue` argument
  dat <- datall_revenue %>%
    transmute(user_id = user_id,
              MRR = MRR,
              Effective_Start = as.Date(Effective_Start),
              Effective_End = as.Date(Effective_End))

  ## helpers: treat NA the same as "no revenue"
  has_rev <- function(x) !is.na(x) & x > 0
  no_rev <- function(x) is.na(x) | x <= 0

  ## MRR per user from subscriptions active as of `as_of`
  mrr_as_of <- function(as_of) {
    dat %>%
      filter(Effective_Start <= as_of) %>%
      filter(is.na(Effective_End) | Effective_End > as_of) %>%
      group_by(user_id) %>%
      summarise(MRR = round(sum(MRR), 2))
  }

  res_test <- do.call("rbind", lapply(dates, function(time_point) {

    ## active subscriptions as of time_point
    active_current <- mrr_as_of(time_point) %>%
      select(user_id, MRR_current = MRR)

    ## active subscriptions as of the end of the previous month
    prev_start <- floor_date(time_point, unit = "month") %m-% months(1)
    prev_end <- ceiling_date(prev_start, unit = "month") - days(1)
    active_last_month <- mrr_as_of(prev_end) %>%
      select(user_id, MRR_last_month = MRR)

    ## active subscriptions as of the end of the month before the previous one
    past_end <- floor_date(time_point, unit = "month") %m-% months(1) - days(1)
    active_past <- mrr_as_of(past_end) %>%
      select(user_id, MRR_past = MRR)

    alltable <- merge(active_current, active_last_month, all = TRUE) %>%
      merge(active_past, all = TRUE)

    ########################## Break out the revenue ###########################

    ## new: revenue from users with no revenue before this month
    new <- alltable %>%
      filter(no_rev(MRR_past) & no_rev(MRR_last_month)) %>%
      select(user_id, new = MRR_current)

    ## resurrected: active now, inactive last month, active in the past
    resurrected <- alltable %>%
      filter(has_rev(MRR_current), no_rev(MRR_last_month), has_rev(MRR_past)) %>%
      select(user_id, resurrected = MRR_current)

    ## retain is the smaller of current and last month's MRR, assembled from
    ## the two cases (current >= last month / current < last month)
    retain1 <- alltable %>%
      filter(has_rev(MRR_current), has_rev(MRR_last_month)) %>%
      filter(MRR_current >= MRR_last_month) %>%
      select(user_id, retain1 = MRR_last_month)

    retain2 <- alltable %>%
      filter(has_rev(MRR_current), has_rev(MRR_last_month)) %>%
      filter(MRR_current < MRR_last_month) %>%
      select(user_id, retain2 = MRR_current)

    retain <- merge(retain1, retain2, all = TRUE) %>%
      impute0() %>%
      transmute(user_id = user_id, retain = retain1 + retain2)

    ## expansion: growth for users active in both months
    expansion <- alltable %>%
      filter(has_rev(MRR_current), has_rev(MRR_last_month)) %>%
      filter(MRR_current > MRR_last_month) %>%
      impute0() %>%
      transmute(user_id = user_id, expansion = MRR_current - MRR_last_month)

    ## contraction: shrinkage for users active in both months
    contraction <- alltable %>%
      filter(has_rev(MRR_current), has_rev(MRR_last_month)) %>%
      filter(MRR_current < MRR_last_month) %>%
      impute0() %>%
      transmute(user_id = user_id, contraction = MRR_last_month - MRR_current)

    ## churn: revenue lost from users active last month but not now
    churn <- alltable %>%
      filter(no_rev(MRR_current), has_rev(MRR_last_month)) %>%
      transmute(user_id = user_id, churn = MRR_last_month)

    res0 <- merge(new, resurrected, all = TRUE)
    res0 <- merge(res0, retain, all = TRUE)
    res0 <- merge(res0, expansion, all = TRUE)
    res0 <- merge(res0, contraction, all = TRUE)
    res0 <- merge(res0, churn, all = TRUE)
    res0 <- impute0(res0)

    res0$date <- time_point
    ## drop rows that are zero in every category
    res0 <- res0 %>%
      filter(!(new == 0 & resurrected == 0 & retain == 0 &
                 expansion == 0 & contraction == 0 & churn == 0))

    return(res0)
  }))

  return(res_test)
}
-------------------------------------------------------------------------------- /Archive/RevAct_Q.R: --------------------------------------------------------------------------------
#' Calculate revenue accounting matrix for each quarter
#'
#' @description Calculates the accounting matrix given dates, subscription
#'   data, and product type. It returns one row per quarter-end date, i.e. all
#'   products of the requested type are summed together.
#' @param dates a date vector of quarter-end dates (must be of class Date)
#' @param datall_revenue subscription data; required columns: user_id, Product,
#'   Effective_Start, Effective_End, MRR
#' @param type product type(s) to keep (matched against the Product column)
#' @return A data frame with one row per date and columns New, Retain,
#'   Resurrected, Expansion, Contraction, Churn, dates, MRR, ARR and
#'   QuickRatio.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @export

RevAct_Q <- function(datall_revenue, type, dates) {

  dat <- datall_revenue %>%
    ## change this filter to get results for different products
    filter(Product %in% type) %>%
    transmute(user_id = user_id,
              MRR = MRR,
              Effective_Start = as.Date(Effective_Start),
              Effective_End = as.Date(Effective_End))

  ## helpers: treat NA the same as "no revenue"
  has_rev <- function(x) !is.na(x) & x > 0
  no_rev <- function(x) is.na(x) | x <= 0

  ## MRR per user from subscriptions active as of `as_of`
  mrr_as_of <- function(as_of) {
    dat %>%
      filter(Effective_Start <= as_of) %>%
      filter(is.na(Effective_End) | Effective_End > as_of) %>%
      group_by(user_id) %>%
      summarise(MRR = round(sum(MRR), 2))
  }

  cnam <- c("New", "Retain", "Resurrected", "Expansion", "Contraction", "Churn")
  grow_matrix <- matrix(0, nrow = length(dates), ncol = length(cnam)) %>%
    data.frame()
  names(grow_matrix) <- cnam

  for (i in seq_along(dates)) {
    ## current quarter: MRR as of the quarter-end date itself
    active_current <- mrr_as_of(dates[i]) %>%
      select(user_id, MRR_current = MRR)

    ## previous quarter: MRR as of the day before the current quarter starts
    prev_end <- floor_date(dates[i], unit = "month") %m-% months(2) - days(1)
    active_last_month <- mrr_as_of(prev_end) %>%
      select(user_id, MRR_last_month = MRR)

    ## past: MRR as of six months before the quarter-end date
    ## NOTE(review): this can overlap the previous quarter by a few days when
    ## the current quarter does not start exactly six months before dates[i];
    ## kept as in the original — confirm intent before changing.
    past_end <- dates[i] %m-% months(6)
    active_past <- mrr_as_of(past_end) %>%
      select(user_id, MRR_past = MRR)

    alltable <- merge(active_current, active_last_month, all = TRUE) %>%
      merge(active_past, all = TRUE)

    ## new: revenue from users with no revenue in earlier quarters
    new <- alltable %>%
      filter(no_rev(MRR_past) & no_rev(MRR_last_month)) %>%
      summarise(v = sum(MRR_current, na.rm = TRUE)) %>%
      pull(v)

    ## resurrected: active now, inactive last quarter, active in the past
    resurrected <- alltable %>%
      filter(has_rev(MRR_current), no_rev(MRR_last_month), has_rev(MRR_past)) %>%
      summarise(v = sum(MRR_current, na.rm = TRUE)) %>%
      pull(v)

    ## retain: for users active in both quarters, the smaller of current and
    ## previous quarter MRR (equivalent to the original retain1 + retain2)
    retain <- alltable %>%
      filter(has_rev(MRR_current), has_rev(MRR_last_month)) %>%
      summarise(v = sum(pmin(MRR_current, MRR_last_month))) %>%
      pull(v)

    ## expansion: growth for users active in both quarters
    expansion <- alltable %>%
      filter(has_rev(MRR_current), has_rev(MRR_last_month),
             MRR_current > MRR_last_month) %>%
      summarise(v = sum(MRR_current - MRR_last_month)) %>%
      pull(v)

    ## contraction: shrinkage for users active in both quarters
    contraction <- alltable %>%
      filter(has_rev(MRR_current), has_rev(MRR_last_month),
             MRR_current < MRR_last_month) %>%
      summarise(v = sum(MRR_last_month - MRR_current)) %>%
      pull(v)

    ## churn: revenue lost from users active last quarter but not now
    churn <- alltable %>%
      filter(no_rev(MRR_current), has_rev(MRR_last_month)) %>%
      summarise(v = sum(MRR_last_month, na.rm = TRUE)) %>%
      pull(v)

    ## order must match cnam
    grow_matrix[i, ] <- c(new, retain, resurrected, expansion, contraction, churn)
  }

  grow_matrix$dates <- dates

  grow_matrix <- grow_matrix %>%
    mutate(MRR = New + Retain + Resurrected + Expansion,
           ARR = MRR * 12) %>%
    mutate(QuickRatio = round((New + Resurrected + Expansion) /
                                (Churn + Contraction), 2))

  return(grow_matrix)
}
-------------------------------------------------------------------------------- /Archive/old_version_RevActUser.R: --------------------------------------------------------------------------------
## This is the old version of RevActUser, kept in the archive for reference;
## the newer implementation lives in Archive/RevActUser.R.
## It builds the revenue accounting matrix at the user level.
## Columns required: user_id, MRR, Effective_Start, Effective_End
## Behavioral differences from the newer version: subscriptions ending exactly
## on the reference date still count as active (Effective_End >= date), the
## NA-only category splits ignore zero/negative MRR, MRR sums are not rounded,
## and all-zero rows are not dropped.

RevActUser <- function(datall_revenue, dates){

  ## fixed: this used to read the global `total_revenue` instead of the argument
  dat <- datall_revenue %>%
    transmute(user_id = user_id,
              MRR = MRR,
              Effective_Start = as.Date(Effective_Start),
              Effective_End = as.Date(Effective_End))

  res <- NULL
  for (i in seq_along(dates)) {
    ## active subscriptions as of dates[i]
    active_current <- dat %>%
      filter(Effective_Start <= dates[i]) %>%
      filter(is.na(Effective_End) | Effective_End >= dates[i]) %>%
      group_by(user_id) %>%
      summarise(MRR = sum(MRR)) %>%
      select(user_id, MRR_current = MRR)

    ## active subscriptions as of the end of the previous month
    int_start <- floor_date(dates[i], unit = "month") %m-% months(1)
    int_end <- ceiling_date(int_start, unit = "month") - days(1)
    active_last_month <- dat %>%
      filter(Effective_Start <= int_end) %>%
      filter(is.na(Effective_End) | Effective_End >= int_end) %>%
      group_by(user_id) %>%
      summarise(MRR = sum(MRR)) %>%
      select(user_id, MRR_last_month = MRR)

    ## active subscriptions as of the end of the month before the previous one
    int_end <- floor_date(dates[i], unit = "month") %m-% months(1) - days(1)
    active_past <- dat %>%
      filter(Effective_Start <= int_end) %>%
      filter(is.na(Effective_End) | Effective_End >= int_end) %>%
      group_by(user_id) %>%
      summarise(MRR = sum(MRR)) %>%
      select(user_id, MRR_past = MRR)

    alltable <- merge(active_current, active_last_month, all = TRUE) %>%
      merge(active_past, all = TRUE)

    ## note: these splits test only for NA, not for zero or negative MRR
    new <- alltable %>%
      filter(is.na(MRR_past) & is.na(MRR_last_month)) %>%
      select(user_id, new = MRR_current)

    resurrected <- alltable %>%
      filter(!is.na(MRR_current)) %>%
      filter(is.na(MRR_last_month)) %>%
      filter(!is.na(MRR_past)) %>%
      select(user_id, resurrected = MRR_current)

    ## retain is the smaller of current and last month's MRR, assembled from
    ## the two cases
    retain1 <- alltable %>%
      filter(!is.na(MRR_current)) %>%
      filter(!is.na(MRR_last_month)) %>%
      filter(MRR_current >= MRR_last_month) %>%
      select(user_id, retain1 = MRR_last_month)

    retain2 <- alltable %>%
      filter(!is.na(MRR_current)) %>%
      filter(!is.na(MRR_last_month)) %>%
      filter(MRR_current < MRR_last_month) %>%
      select(user_id, retain2 = MRR_current)

    retain <- merge(retain1, retain2, all = TRUE) %>%
      impute0() %>%
      transmute(user_id = user_id, retain = retain1 + retain2)

    expansion <- alltable %>%
      filter(!is.na(MRR_current)) %>%
      filter(!is.na(MRR_last_month)) %>%
      filter(MRR_current > MRR_last_month) %>%
      impute0() %>%
      transmute(user_id = user_id, expansion = MRR_current - MRR_last_month)

    contraction <- alltable %>%
      filter(!is.na(MRR_current)) %>%
      filter(!is.na(MRR_last_month)) %>%
      filter(MRR_current < MRR_last_month) %>%
      impute0() %>%
      transmute(user_id = user_id, contraction = MRR_last_month - MRR_current)

    churn <- alltable %>%
      filter(is.na(MRR_current)) %>%
      filter(!is.na(MRR_last_month)) %>%
      transmute(user_id = user_id, churn = MRR_last_month)

    res0 <- merge(new, resurrected, all = TRUE)
    res0 <- merge(res0, retain, all = TRUE)
    res0 <- merge(res0, expansion, all = TRUE)
    res0 <- merge(res0, contraction, all = TRUE)
    res0 <- merge(res0, churn, all = TRUE)
    res0 <- impute0(res0)

    res0$date <- dates[i]

    ## growing with rbind() is quadratic but acceptable for this archived version
    res <- rbind(res, res0)
  }

  return(res)
}
-------------------------------------------------------------------------------- /Archive/summarySE2.R: -------------------------------------------------------------------------------- 1 | ## Summarizes data. 2 | ## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%). 3 | ## data: a data frame.
## measurevar: the name of the column that contains the variable to be summarized
## groupvars: a vector of names of columns that contain grouping variables
## na.rm: a boolean that indicates whether to ignore NA's
## conf.interval: the percent range of the confidence interval (default is 95%)
## pce_less_than_zero: kept for backward compatibility; both branches of the
##   original computed the same statistics, so the flag has no effect here
summarySE2 <- function(data = NULL, measurevar, groupvars = NULL, na.rm = FALSE,
                       conf.interval = .95, .drop = TRUE,
                       pce_less_than_zero = FALSE) {

  require(plyr)

  ## length() that can handle NA's: if na.rm is TRUE, don't count them
  length2 <- function(x, na.rm = FALSE) {
    if (na.rm) sum(!is.na(x)) else length(x)
  }

  ## empirical quantile via sorting (kept from the original implementation)
  qx <- function(x, q) {
    xs <- sort(na.omit(x))
    xs[round(length(xs) * q, 0)]
  }

  ## Summary statistics for one group. The original selected between two
  ## functions with an `if` wrapped around `.fun=` inside the ddply() call,
  ## which is not valid argument syntax; the two branches also computed
  ## identical statistics, so a single function is used here.
  stat_fun <- function(xx, col, na.rm) {
    c(N = length2(xx[, col], na.rm = na.rm),
      mean = mean(xx[, col], na.rm = na.rm),
      pct = mean(xx[, col] > 0, na.rm = na.rm),
      sd = sd(xx[, col], na.rm = na.rm))
  }

  datac <- ddply(data, groupvars, .drop = .drop, .fun = stat_fun,
                 measurevar, na.rm)

  ## Rename the "mean" column to the measured variable's name
  datac <- rename(datac, c("mean" = measurevar))
  datac$se <- datac$sd / sqrt(datac$N)  # standard error of the mean

  ## Confidence interval multiplier for the standard error:
  ## e.g. for conf.interval = .95 use the .975 quantile of t with df = N - 1
  ciMult <- qt(conf.interval / 2 + .5, datac$N - 1)
  datac$ci <- datac$se * ciMult
  return(datac)
}
-------------------------------------------------------------------------------- /Archive/summarySE3.R: --------------------------------------------------------------------------------
## Summarizes data.
## Gives count, mean, standard deviation, quantiles, standard error of the
## mean, and confidence interval (default 95%).
## data: a data frame
## measurevar: the name of the column that contains the variable to be summarized
## groupvars: a vector of names of columns that contain grouping variables
## na.rm: a boolean that indicates whether to ignore NA's
## conf.interval: the percent range of the confidence interval (default is 95%)
## pce_less_than_zero: if TRUE, also report pce_down, the proportion of values
##   below zero, and order the quantile columns differently
summarySE3 <- function(data = NULL, measurevar, groupvars = NULL, na.rm = FALSE,
                       conf.interval = .95, .drop = TRUE,
                       pce_less_than_zero = FALSE) {
  require(plyr)

  ## length() that can handle NA's: if na.rm is TRUE, don't count them
  length2 <- function(x, na.rm = FALSE) {
    if (na.rm) sum(!is.na(x)) else length(x)
  }

  ## empirical quantile via sorting (kept from the original implementation)
  qx <- function(x, q) {
    xs <- sort(na.omit(x))
    xs[round(length(xs) * q, 0)]
  }

  ## Choose the per-group summary up front. The original selected between the
  ## two functions with an `if` wrapped around `.fun=` inside the ddply() call
  ## itself, which is not valid argument syntax.
  if (pce_less_than_zero) {
    stat_fun <- function(xx, col, na.rm) {
      c(N = length2(xx[, col], na.rm = na.rm),
        mean = mean(xx[, col], na.rm = na.rm),
        sd = sd(xx[, col], na.rm = na.rm),
        q25 = qx(xx[, col], 0.25),
        q75 = qx(xx[, col], 0.75),
        q5 = qx(xx[, col], 0.05),
        q95 = qx(xx[, col], 0.95),
        q15 = qx(xx[, col], 0.15),
        q85 = qx(xx[, col], 0.85),
        pce_down = mean(xx[, col] < 0))
    }
  } else {
    stat_fun <- function(xx, col, na.rm) {
      c(N = length2(xx[, col], na.rm = na.rm),
        mean = mean(xx[, col], na.rm = na.rm),
        sd = sd(xx[, col], na.rm = na.rm),
        q5 = qx(xx[, col], 0.05),
        q15 = qx(xx[, col], 0.15),
        q25 = qx(xx[, col], 0.25),
        q75 = qx(xx[, col], 0.75),
        q85 = qx(xx[, col], 0.85),
        q95 = qx(xx[, col], 0.95))
    }
  }

  datac <- ddply(data, groupvars, .drop = .drop, .fun = stat_fun,
                 measurevar, na.rm)

  ## Rename the "mean" column to the measured variable's name
  datac <- rename(datac, c("mean" = measurevar))
  datac$se <- datac$sd / sqrt(datac$N)  # standard error of the mean

  ## Confidence interval multiplier for the standard error:
  ## e.g. for conf.interval = .95 use the .975 quantile of t with df = N - 1
  ciMult <- qt(conf.interval / 2 + .5, datac$N - 1)
  datac$ci <- datac$se * ciMult
  return(datac)
}
-------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: NetlifyDS 2 | Title: Netlify Data Science Using R 3 | Version: 0.0.0.9000 4 | Authors@R: person("Hui", "Lin", email = "longqiman@gmail.com", role = c("aut", "cre")) 5 | Description: The `NetlifyDS` package contains functions to streamline Netlify's data science project. It covers data pipeline, analytics, and modeling.
6 | Depends: R (>= 3.4.0) 7 | License: MIT 8 | Suggests: 9 | stringr, 10 | dplyr, 11 | tidyr, 12 | tibble (>= 1.4.2), 13 | magrittr (>= 1.5) 14 | Encoding: UTF-8 15 | LazyData: true 16 | RoxygenNote: 6.1.1 17 | VignetteBuilder: knitr 18 | -------------------------------------------------------------------------------- /Data/AirlineRating.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/AirlineRating.RData -------------------------------------------------------------------------------- /Data/SegData.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/SegData.RData -------------------------------------------------------------------------------- /Data/df_encirca.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/df_encirca.RData -------------------------------------------------------------------------------- /Data/df_mon.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/df_mon.RData -------------------------------------------------------------------------------- /Data/df_phi.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/df_phi.RData -------------------------------------------------------------------------------- /Data/links.RData: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/links.RData -------------------------------------------------------------------------------- /Data/nodes.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/nodes.RData -------------------------------------------------------------------------------- /Data/sim1_da1.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/sim1_da1.RData -------------------------------------------------------------------------------- /Data/sim1_da2.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/sim1_da2.RData -------------------------------------------------------------------------------- /Data/sim1_da3.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/sim1_da3.RData -------------------------------------------------------------------------------- /Data/tweets.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/Data/tweets.RData -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(plot,cv_glasso) 4 | S3method(plot,rocTest) 5 | S3method(plot,tune.cutoff) 6 | export(MeterRevActUserProduct) 7 | export(RevAct) 8 | export(RevActUserProduct) 9 | 
export(RevCntAct) 10 | export(clust_ana) 11 | export(cv_glasso) 12 | export(fitglasso) 13 | export(impute_dat) 14 | export(mrrAtDateType) 15 | export(multiplot) 16 | export(out_mad) 17 | export(pairwise_ks_test) 18 | export(predict_glasso) 19 | export(rocTest) 20 | export(subclass_eff_est) 21 | export(tune_cutoff) 22 | -------------------------------------------------------------------------------- /NetlifyDS.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /R/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/R/.DS_Store -------------------------------------------------------------------------------- /R/ClusterAnalysis.R: -------------------------------------------------------------------------------- 1 | #' Cluster analysis 2 | #' @description Does cluster analysis and report an index of cluster 3 | #' @param ans data matrix 4 | #' @param k number of clusters 5 | #' @param distm Dissimilarity index, partial match to "\code{manhattan}", "\code{euclidean}"(default), "\code{canberra}", "\code{bray}", "\code{kulczynski}", "\code{jaccard}", "\code{gower}", "\code{altGower}", "\code{morisita}", "\code{horn}", "\code{mountford}", "\code{raup}" , "\code{binomial}", "\code{chao}", "\code{cao}" or "\code{mahalanobis}". 
#' @param hclustm the agglomeration method to be used. This should be (an unambiguous abbreviation of) one of "\code{ward.D}"(default), "\code{ward.D2}", "\code{single}", "\code{complete}", "\code{average}" (= UPGMA), "\code{mcquitty}" (= WPGMA), "\code{median}" (= WPGMC) or "\code{centroid}" (= UPGMC).
#' @param na.rm Pairwise deletion of missing observations when computing dissimilarities.
#' @param main title for the dendrogram plot. The default keeps the historical title.
#' @return A vector of cluster memberships from \code{cutree}; a dendrogram is plotted as a side effect.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' data("SegData")
#' library(dplyr)
#' library(vegan)
#' cind <- clust_ana(SegData, k = 4)
#' }

#' @export
clust_ana <- function(ans, k, distm = "euclidean", hclustm = "ward.D",
                      na.rm = FALSE,
                      main = "Cluster Dendrogram of Segment Survey") {
  # vegan is only needed for the dissimilarity matrix; fail with a clear
  # message instead of attaching the whole package via library().
  if (!requireNamespace("vegan", quietly = TRUE)) {
    stop("Package 'vegan' is required for clust_ana().", call. = FALSE)
  }
  # Dissimilarity matrix -> hierarchical clustering -> cut into k groups.
  dist_t <- vegan::vegdist(ans, method = distm, na.rm = na.rm)
  clust <- hclust(dist_t, method = hclustm)
  plot(clust, main = main)
  return(cutree(clust, k = k))
}

--------------------------------------------------------------------------------
/R/MeterRevActUserProduct.R:
--------------------------------------------------------------------------------
#' Calculate metered revenue accounting matrix per day, user and product
#' @description It calculates the accounting matrix for each user and product given date, subscription data and product category.
#' @param dates a date vector (have to be date type)
#' @param datall_revenue subscription data, need to have columns: user_id, Product, Effective_Start, one_time_charge
#' @param type a vector of product names
#' @return A metered revenue accounting matrix is returned for each day, user and product
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @export

MeterRevActUserProduct <- function(datall_revenue, dates, type){

  # Keep only the requested products and the columns the accounting needs.
  dat <- datall_revenue %>%
    ## change this filter to get result for different products
    filter(Product %in% type)%>%
    transmute(user_id=user_id,
              one_time_charge = one_time_charge,
              Effective_Start = as.Date(Effective_Start))

  # function to impute 0 to missing value (all columns except user_id)
  impute0 = function(dat) {
    # ROBUSTNESS FIX: seq_len(ncol(dat))[-1] is empty for a 1-column frame,
    # unlike 2:ncol(dat) which would wrongly iterate over c(2, 1).
    for (i in seq_len(ncol(dat))[-1]){
      idx = which(is.na(dat[,i]))

      if(length(idx)>0){
        dat[idx,i] = 0
      }
    }
    return(dat)
  }

  res = NULL
  for (i in 1:length(dates)){
    # current month is from the 1st day of the month to the reference date
    int_start = floor_date(dates[i], unit = "month")
    int_end = dates[i]
    current = interval(int_start, int_end) # current month

    # charges whose Effective_Start falls in the current month, per user
    active_current <- dat %>%
      filter(Effective_Start %within% current) %>%
      group_by(user_id) %>%
      summarise(one_time_charge = round(sum(one_time_charge), 2)) %>%
      select(user_id, one_time_charge_current = one_time_charge)

    # previous month
    int_start = floor_date(dates[i], unit = "month") %m-% months(1)
    int_end = ceiling_date(int_start, unit = "month") - days(1)
    previous = interval(int_start, int_end)

    active_last_month <- dat %>%
      filter(Effective_Start %within% previous) %>%
      group_by(user_id) %>%
      summarise(one_time_charge = round(sum(one_time_charge), 2)) %>%
      select(user_id, one_time_charge_last_month = one_time_charge)

    # all before until previous month
    # NOTE(review): 1200 months back is used as "the beginning of time" --
    # effectively all history before the previous month.
    int_start = dates[i] %m-% months(12*100)
    int_end = floor_date(dates[i], unit = "month") %m-% months(1)
    int_end = int_end - days(1)
    past = interval(int_start, int_end)

    active_past <- dat %>%
      filter(Effective_Start %within% past) %>%
      group_by(user_id) %>%
      summarise(one_time_charge = round(sum(one_time_charge), 2)) %>%
      select(user_id, one_time_charge_past = one_time_charge)

    # outer-join the three periods per user; absent periods become NA
    alltable <- merge(active_current, active_last_month, all = T) %>%
      merge(active_past, all=T)

    ########################## Break out the revenue ###########################

    # new: charged this month, never charged before
    new <- alltable %>%
      filter( (is.na(one_time_charge_past) | one_time_charge_past <= 0) & (is.na(one_time_charge_last_month) | one_time_charge_last_month <= 0 ) ) %>%
      select(user_id, new = one_time_charge_current )

    # resurrected: charged this month, nothing last month, but charged in the past
    resurrected <- alltable %>%
      filter( (!is.na(one_time_charge_current)) & one_time_charge_current > 0 )%>%
      filter( is.na(one_time_charge_last_month) | one_time_charge_last_month <= 0) %>%
      filter( (!is.na(one_time_charge_past)) & one_time_charge_past > 0 ) %>%
      select(user_id,resurrected = one_time_charge_current)

    # retain is the overlap: the smaller of current and last-month charge.
    # an alternative way is to add up the smaller number from MRR_current and MRR_last_month
    retain1 <- alltable %>%
      filter( (!is.na(one_time_charge_current)) & one_time_charge_current > 0 ) %>%
      filter( (!is.na(one_time_charge_last_month)) & one_time_charge_last_month > 0 )%>%
      filter(one_time_charge_current >= one_time_charge_last_month) %>%
      select(user_id,retain1 = one_time_charge_last_month)

    retain2 <- alltable %>%
      filter( (!is.na(one_time_charge_current)) & one_time_charge_current > 0 ) %>%
      filter( (!is.na(one_time_charge_last_month)) & one_time_charge_last_month > 0 )%>%
      filter(one_time_charge_current < one_time_charge_last_month) %>%
      select(user_id,retain2 = one_time_charge_current)

    retain = merge(retain1, retain2, all=T) %>%
      impute0()%>%
      transmute(user_id = user_id, retain = retain1 + retain2)

    # expansion: positive month-over-month increase for users active both months
    expansion <- alltable %>%
      filter( (!is.na(one_time_charge_current)) & one_time_charge_current > 0 ) %>%
      filter( (!is.na(one_time_charge_last_month)) & one_time_charge_last_month > 0 )%>%
      filter(one_time_charge_current > one_time_charge_last_month) %>%
      impute0()%>%
      transmute(user_id = user_id, expansion = one_time_charge_current-one_time_charge_last_month)

    # contraction: month-over-month decrease for users active both months
    contraction <- alltable %>%
      filter( (!is.na(one_time_charge_current)) & one_time_charge_current > 0 )%>%
      filter( (!is.na(one_time_charge_last_month)) & one_time_charge_last_month > 0 ) %>%
      impute0()%>%
      filter(one_time_charge_current < one_time_charge_last_month) %>%
      transmute(user_id = user_id, contraction =one_time_charge_last_month - one_time_charge_current)

    # churn: charged last month, nothing this month
    churn <- alltable %>%
      filter( is.na(one_time_charge_current) | one_time_charge_current <= 0 ) %>%
      filter( (!is.na(one_time_charge_last_month)) & one_time_charge_last_month > 0 ) %>%
      transmute(user_id = user_id, churn =one_time_charge_last_month)

    # outer-join the six components per user and fill the gaps with 0
    res0 = merge(new, resurrected, all = T)
    res0 = merge(res0, retain, all = T)
    res0 = merge(res0, expansion, all = T)
    res0 = merge(res0, contraction, all = T)
    res0 = merge(res0, churn, all = T)
    res0 = impute0(res0)

    res0$date = dates[i]

    res = rbind(res, res0)
  }
  # PERF FIX: drop all-zero rows once, after the loop. The original ran this
  # filter on the cumulative result inside the loop, re-filtering every
  # previously processed date on each iteration (same output, quadratic work).
  res = res %>% filter( !(new == 0 & resurrected == 0 & retain == 0 & expansion == 0 & contraction == 0 & churn == 0) )
  return(res)
}

--------------------------------------------------------------------------------
/R/NoAcctDateType.R:
--------------------------------------------------------------------------------
## function to calculate number of accounts
NoAcctDateType <- function(time_point, subs, type) {
  time_point <- as.Date(time_point)
  # time_point: time of mrr
  # subs: subscription data
  # Accounts active at time_point: started on/before it, not yet ended, MRR > 0.
  actives <- subs %>%
    filter(Product %in% type) %>%
    filter(Effective_Start <= time_point) %>%
    filter( is.na(Effective_End) | Effective_End > time_point) %>%
    # only keep those account with positive mrr
    filter(MRR > 0)
  # return the number of accounts
  return(length(unique(actives$account_id)))
}
# --------------------------------------------------------------------------------
# /R/Outliers.R:
# --------------------------------------------------------------------------------
#' Identify outliers using MAD
#'
#' @param x a vector
#' @return a vector of indices of the identified outliers (may be empty)
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' x<-c(seq(1:1000),20000)
#' out_mad(x)
#' }

#' @export
########################outliers
out_mad<-function(x){
  # Deviation of each point from the sample median (NAs dropped for the median).
  v2<-x-median(na.omit(x))
  # Median absolute deviation (unscaled).
  mad<-median(na.omit(abs(v2)))
  # Modified z-score rule (Iglewicz & Hoaglin): flag |0.6745*(x - median)/MAD| > 3.5.
  # BUG FIX: the original used x - median(v2); median(v2) is 0 by construction,
  # so it scored the raw values x rather than their deviations from the median,
  # and without abs() it could never flag negative outliers.
  idx<-which(abs(0.6745*v2/mad)>3.5)
  return(idx)
}
# --------------------------------------------------------------------------------
# /R/RevAct.R:
# --------------------------------------------------------------------------------
#' Calculate revenue accounting matrix
#' @description It calculates the accounting matrix given date and daily revenue data. It only returns one row for each date, i.e. it will sum up all the products.
#' @param dates a date vector (have to be date type)
#' @param datall_revenue daily revenue data, need to have columns: user_id, Product, active_date, MRR
#' @param type product type
#' @return A revenue accounting matrix is returned for each date.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' dates <- seq(as.Date("2018-02-01"), lubridate::ceiling_date(Sys.Date(), unit = "month"), "months") - days(1)
#' dates
#' # need to get daily revenue from subscription data if you don't have that already
#' base_monthly = do.call("rbind", lapply(dates, function(time_point)
#' {NetlifyDS::mrrAtDateType(time_point, subs = subscription_data, type = NULL)}))
#' growth_matrix_identity = RevAct(datall_revenue = base_monthly, type = c("Identity"), dates = dates)
#' }
#' @export


RevAct <- function(datall_revenue, type, dates){


  # Per-user daily MRR for the requested products, rounded to cents.
  dat <- datall_revenue %>%
    ungroup()%>%
    ## change this filter to get result for different products
    filter(Product %in% type) %>%
    transmute(user_id = user_id,
              MRR= MRR,
              active_date = as.Date(active_date)) %>%
    group_by(user_id, active_date) %>%
    summarise(MRR = round(sum(MRR), 2)) %>%
    ungroup()

  # One row per date, one column per accounting component.
  cnam <- c("New", "Retain", "Resurrected","Expansion", "Contraction", "Churn")
  grow_matrix <- matrix(0,nrow=length(dates), ncol= length(cnam)) %>%
    data.frame()
  names(grow_matrix) <- cnam

  for (i in 1:length(dates)){
    # the current month
    cmon = dates[i]

    # the previous month
    pmon = floor_date(dates[i], unit = "month")- days(1)

    # the legacy month
    # NOTE(review): "legacy" is built only from the supplied `dates` vector,
    # so any month not passed in is invisible to resurrection logic --
    # confirm callers pass a complete monthly sequence.
    lmon = dates[dates < pmon]

    # current mrr
    cmrr = dat %>%
      filter(active_date == cmon) %>%
      select(user_id, cmrr = MRR) %>%
      filter(cmrr > 0)

    # previous mrr
    pmrr = dat %>%
      filter(active_date == pmon) %>%
      select(user_id, pmrr = MRR) %>%
      filter(pmrr > 0)

    # legacy mrr, get the maximum number
    lmrr = dat %>%
      filter(active_date %in% lmon) %>%
      group_by(user_id) %>%
      summarise(lmrr = max(MRR)) %>%
      filter(lmrr > 0)


    # join all together (outer join per user; missing periods imputed to 0)
    alltable <- merge(cmrr, pmrr, all = T) %>%
      merge(lmrr, all=T) %>%
      NetlifyDS::impute_dat(method = "zero")

    # Get new: active now, never active before
    new = alltable %>%
      filter( cmrr > 0 & pmrr == 0 & lmrr == 0) %>%
      summarise(MRR = sum(cmrr, na.rm = T))

    # Get resurrected: active now, inactive last month, active in the past
    resurrected <- alltable %>%
      filter(cmrr > 0 & pmrr == 0 & lmrr > 0) %>%
      summarise(MRR = sum(cmrr, na.rm = T))

    # Get Retain: the overlapping revenue -- per user, the smaller of
    # current and previous MRR, summed over the two cases below.
    retain1 <- alltable %>%
      filter(cmrr > 0 & pmrr > 0) %>%
      filter(cmrr >= pmrr) %>%
      summarise(MRR = sum(pmrr, na.rm = T))

    retain2 <- alltable %>%
      filter(cmrr > 0 & pmrr > 0) %>%
      filter(cmrr < pmrr) %>%
      summarise(MRR = sum(cmrr, na.rm = T))

    retain = retain1 + retain2

    # Get expansion: month-over-month increase for users active both months
    expansion <- alltable %>%
      filter(cmrr > 0 & pmrr >0) %>%
      filter(cmrr > pmrr) %>%
      summarise(MRR = sum(cmrr, na.rm = T) - sum(pmrr, na.rm = T) )

    # Get contraction: month-over-month decrease for users active both months
    contraction <- alltable %>%
      filter(cmrr > 0 & pmrr >0) %>%
      filter(cmrr < pmrr) %>%
      summarise(MRR = sum(pmrr, na.rm = T)-sum(cmrr, na.rm = T))

    # Get churn: active last month, inactive now
    churn <- alltable %>%
      filter(cmrr == 0 & pmrr > 0) %>%
      summarise(MRR = sum(pmrr, na.rm = T))

    grow_matrix[i, "New"] <- new
    grow_matrix[i, "Retain"] <- retain
    grow_matrix[i, "Resurrected"] <- resurrected
    grow_matrix[i, "Expansion"] <- expansion
    grow_matrix[i, "Contraction"] <- contraction
    grow_matrix[i, "Churn"] <- churn
  }
  grow_matrix$dates <- dates

  # Totals and the SaaS quick ratio.
  # NOTE(review): QuickRatio is Inf (or NaN) for months with no churn and
  # no contraction -- confirm downstream consumers handle that.
  grow_matrix <- grow_matrix %>%
    mutate(MRR = New + Retain + Resurrected + Expansion,
           ARR = MRR*12) %>%
    mutate(QuickRatio = round((New + Resurrected+ Expansion)/(Churn+Contraction),2))

  return(grow_matrix)
}

--------------------------------------------------------------------------------
/R/RevActUserProduct.R:
--------------------------------------------------------------------------------
#' Calculate revenue accounting matrix per day, user and product
#'
@description It calculates the accounting matrix for each user and product given date, daily revenue data and product category.
#' @param dates a date vector (have to be date type)
#' @param dates_full a full range of dates
#' @param datall_revenue daily revenue data, need to have columns: user_id, Product, active_date, MRR
#' @param type a vector of product names
#' @return A revenue accounting matrix is returned for each day, user and product
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' team <- c("netlify_team_premium_monthly","Teams","netlify_team_plus_monthly","netlify_team_plus_yearly")
#' res_0 = RevActUserProduct(total_revenue, dates, team)
#' }
#' @export
#'

RevActUserProduct <- function(datall_revenue, dates, dates_full, type){

  # Per-user daily MRR for the requested products, rounded to cents.
  dat <- datall_revenue %>%
    ungroup()%>%
    ## change this filter to get result for different products
    dplyr::filter(Product %in% type) %>%
    dplyr::transmute(user_id = user_id,
                     MRR= MRR,
                     active_date = as.Date(active_date)) %>%
    dplyr::group_by(user_id, active_date) %>%
    dplyr::summarise(MRR = round(sum(MRR), 2)) %>%
    ungroup()

  ######################## BEGIN OF TEM_FUN #############################
  # Per-date worker: builds the per-user accounting rows for one time point.
  tem_fun = function(time_point, dat, dates_full){

    cmon = time_point

    # the previous month
    # pmon = floor_date(time_point, unit = "month")- days(1)
    # NOTE(review): if time_point is the FIRST element of dates_full, the
    # index below is 0 and pmon is empty, so every user looks "new" for that
    # date -- confirm callers never pass the first dates_full element in
    # `dates`. Also assumes dates_full is sorted; verify at call sites.
    pmon = dates_full[which(dates_full == cmon) - 1]

    # the legacy month
    lmon = dates_full[dates_full < pmon]

    # current mrr
    cmrr = dat %>%
      dplyr::filter(active_date == cmon) %>%
      dplyr::select(user_id, cmrr = MRR) %>%
      dplyr::filter(cmrr > 0)

    # previous mrr
    pmrr = dat %>%
      dplyr::filter(active_date == pmon) %>%
      dplyr::select(user_id, pmrr = MRR) %>%
      dplyr::filter(pmrr > 0)

    # legacy mrr, get the maximum number
    lmrr = dat %>%
      dplyr::filter(active_date %in% lmon) %>%
      dplyr::group_by(user_id) %>%
      dplyr::summarise(lmrr = max(MRR)) %>%
      dplyr::filter(lmrr > 0)


    # join all together (outer join per user; missing periods imputed to 0)
    alltable <- merge(cmrr, pmrr, all = T) %>%
      merge(lmrr, all=T) %>%
      NetlifyDS::impute_dat(method = "zero")

    # Get new: active now, never active before
    new = alltable %>%
      dplyr::filter( cmrr > 0 & pmrr == 0 & lmrr == 0) %>%
      dplyr::select(user_id, new = cmrr )

    # Get resurrected: active now, inactive last period, active in the past
    resurrected <- alltable %>%
      dplyr::filter(cmrr > 0 & pmrr == 0 & lmrr > 0) %>%
      dplyr::select(user_id,resurrected = cmrr)

    # Get Retain: per user, the smaller of current and previous MRR
    retain1 <- alltable %>%
      dplyr::filter(cmrr > 0 & pmrr > 0) %>%
      dplyr::filter(cmrr >= pmrr) %>%
      dplyr::select(user_id,retain1 = pmrr)

    retain2 <- alltable %>%
      dplyr::filter(cmrr > 0 & pmrr > 0) %>%
      dplyr::filter(cmrr < pmrr) %>%
      dplyr::select(user_id,retain2 = cmrr)

    retain = merge(retain1, retain2, all=T) %>%
      NetlifyDS::impute_dat(method = "zero")%>%
      dplyr::transmute(user_id = user_id, retain = retain1 + retain2)

    # Get expansion: period-over-period increase for users active both periods
    expansion <- alltable %>%
      dplyr::filter(cmrr > 0 & pmrr >0) %>%
      dplyr::filter(cmrr > pmrr) %>%
      dplyr::transmute(user_id = user_id, expansion = cmrr -pmrr)

    # Get contraction: period-over-period decrease for users active both periods
    contraction <- alltable %>%
      dplyr::filter(cmrr > 0 & pmrr >0) %>%
      dplyr::filter(cmrr < pmrr) %>%
      dplyr::transmute(user_id = user_id, contraction =pmrr-cmrr)

    # Get churn: active last period, inactive now
    churn <- alltable %>%
      dplyr::filter(cmrr == 0 & pmrr > 0) %>%
      dplyr::transmute(user_id = user_id, churn =pmrr)

    # outer-join the six components per user and fill gaps with 0
    res0 = merge(new, resurrected, all = T)
    res0 = merge(res0, retain, all = T)
    res0 = merge(res0, expansion, all = T)
    res0 = merge(res0, contraction, all = T)
    res0 = merge(res0, churn, all = T)
    res0 = NetlifyDS::impute_dat(res0, method = "zero")

    res0$date = time_point
    return(res0)
  }

  ######################## END OF TEM_FUN #############################

  # Stack the per-date results, then drop rows with no activity at all.
  res = do.call("rbind", lapply(dates, function(time_point){tem_fun(time_point, dat = dat, dates_full = dates_full)}))
  res = res %>% dplyr::filter( !(new == 0 & resurrected == 0 & retain == 0 & expansion == 0 & contraction == 0 & churn == 0) )

  return(res)
}

--------------------------------------------------------------------------------
/R/RevCntAct.R:
--------------------------------------------------------------------------------
#' Calculate revenue accounting matrix by counts
#' @description It calculates the accounting matrix by counts given date and daily revenue data. It only returns one row for each date, i.e. it will sum up all the products.
#' @param dates a date vector (have to be date type)
#' @param datall_revenue daily revenue data, need to have columns: user_id, Product, active_date, MRR
#' @param type product type
#' @return A revenue accounting matrix by count is returned for each date.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' dates <- seq(as.Date("2018-02-01"), lubridate::ceiling_date(Sys.Date(), unit = "month"), "months") - days(1)
#' dates
#' # need to get daily revenue from subscription data if you don't have that already
#' base_monthly = do.call("rbind", lapply(dates, function(time_point)
#' {NetlifyDS::mrrAtDateType(time_point, subs = subscription_data, type = NULL)}))
#' growth_matrix_identity = RevCntAct(datall_revenue = total_revenue, type = c("Identity"), dates = dates)
#' }
#' @export

RevCntAct <- function(datall_revenue, type, dates){

  # Per-user daily MRR for the requested products, rounded to cents.
  dat <- datall_revenue %>%
    ungroup()%>%
    ## change this filter to get result for different products
    filter(Product %in% type) %>%
    transmute(user_id = user_id,
              MRR= MRR,
              active_date = as.Date(active_date)) %>%
    group_by(user_id, active_date) %>%
    summarise(MRR = round(sum(MRR), 2)) %>%
    ungroup()

  # One row per date; each cell holds a COUNT of users, not revenue.
  cnam <- c("New", "Retain", "Resurrected","Expansion", "Contraction", "Churn")
  grow_matrix <- matrix(0,nrow=length(dates), ncol= length(cnam)) %>%
    data.frame()
  names(grow_matrix) <- cnam

  for (i in 1:length(dates)){
    # the current month
    cmon = dates[i]

    # the previous month
    pmon = floor_date(dates[i], unit = "month")- days(1)

    # the legacy month
    # NOTE(review): "legacy" is built only from the supplied `dates` vector --
    # confirm callers pass a complete monthly sequence.
    lmon = dates[dates < pmon]

    # current mrr
    cmrr = dat %>%
      filter(active_date == cmon) %>%
      select(user_id, cmrr = MRR) %>%
      filter(cmrr > 0)

    # previous mrr
    pmrr = dat %>%
      filter(active_date == pmon) %>%
      select(user_id, pmrr = MRR) %>%
      filter(pmrr > 0)

    # legacy mrr, get the maximum number
    lmrr = dat %>%
      filter(active_date %in% lmon) %>%
      group_by(user_id) %>%
      summarise(lmrr = max(MRR)) %>%
      filter(lmrr > 0)


    # join all together (outer join per user; missing periods imputed to 0)
    alltable <- merge(cmrr, pmrr, all = T) %>%
      merge(lmrr, all=T) %>%
      NetlifyDS::impute_dat(method = "zero")

    # Get new: users active now who were never active before
    new = alltable %>%
      filter( cmrr > 0 & pmrr == 0 & lmrr == 0) %>%
      summarise(user_cnt = length(unique(user_id)))

    # Get resurrected: users active now, inactive last month, active in the past
    resurrected <- alltable %>%
      filter(cmrr > 0 & pmrr == 0 & lmrr > 0) %>%
      summarise(user_cnt = length(unique(user_id)))

    # Get Retain: users active in both months (retain1 + retain2 covers the
    # two disjoint cases, so the sum counts each retained user once)
    retain1 <- alltable %>%
      filter(cmrr > 0 & pmrr > 0) %>%
      filter(cmrr >= pmrr) %>%
      summarise(user_cnt = length(unique(user_id)))

    retain2 <- alltable %>%
      filter(cmrr > 0 & pmrr > 0) %>%
      filter(cmrr < pmrr) %>%
      summarise(user_cnt = length(unique(user_id)))

    retain = retain1 + retain2

    # Get expansion: count of users whose MRR grew month-over-month
    expansion <- alltable %>%
      filter(cmrr > 0 & pmrr >0) %>%
      filter(cmrr > pmrr) %>%
      summarise(user_cnt = length(unique(user_id)) )

    # Get contraction: count of users whose MRR shrank month-over-month
    contraction <- alltable %>%
      filter(cmrr > 0 & pmrr >0) %>%
      filter(cmrr < pmrr) %>%
      summarise(user_cnt = length(unique(user_id)))

    # Get churn: count of users active last month with no MRR now
    churn <- alltable %>%
      filter(cmrr == 0 & pmrr > 0) %>%
      summarise(user_cnt = length(unique(user_id)))

    grow_matrix[i, "New"] <- new
    grow_matrix[i, "Retain"] <- retain
    grow_matrix[i, "Resurrected"] <- resurrected
    grow_matrix[i, "Expansion"] <- expansion
    grow_matrix[i, "Contraction"] <- contraction
    grow_matrix[i, "Churn"] <- churn
  }
  grow_matrix$dates <- dates

  return(grow_matrix)
}

--------------------------------------------------------------------------------
/R/cv-glasso.R:
--------------------------------------------------------------------------------
#' Cross-validation for group lasso logistic regression
#' @description Does k-fold cross-validation for group lasso logistic regression and returns a list object
#' @param trainx a data frame
where samples are in rows and features are in columns
#' @param trainy a numeric or factor vector containing the outcome for each sample
#' @param nlam number of lambda values. The default is 100
#' @param type the type of prediction. \code{type = 'link'} is on the scale of linear predictors (default), whereas \code{type = 'response'} is on the scale of the response variable, i.e. \code{type = 'response'} applies the inverse link function to the linear predictors.
#' @param na_action function determining what should be done with missing values when predicting new data during cross validation. The default is to predict NA.
#' @param kfold number of folds - default is 10. Although nfolds can be as large as the sample size (leave-one-out CV), it is not recommended for large datasets. The default is 10.
#' @param index group membership for each column of \code{trainx}. The default derives the group from the column-name prefix before the first ".", matching the documented example.
#' @return
#' an object of class "\code{cv_glasso}" is returned, which is a list with the ingredients of the cross-validation fit.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' data("sim1_da1")
#' trainx = dplyr::select(sim1_da1, -y)
#' trainy = sim1_da1$y
#' # index of the group
#' index <- gsub("\\..*", "", names(trainx))
#' # nlam is the number of values of tuning variable
#' nlam <- 10
#' # type of prediction
#' type = "link"
#' # number of cross-validation folds
#' kfold <- 10
#' cv_fit <- cv_glasso(trainx, trainy, nlam = nlam, kfold = kfold)
#' str(cv_fit)
#' }

#' @export
#'
cv_glasso <- function(trainx, trainy, nlam = 100, type = "link", kfold = 10,
    na_action = na.pass,
    index = gsub("\\..*", "", names(trainx))) {
    # BUG FIX: `index` used to be a free variable resolved from the caller's
    # global environment; it is now a parameter whose default reproduces the
    # grouping used in the documented example.

    library(grplasso)
    library(caret)
    library(pROC)
    n <- nrow(trainx)
    # Do grouplasso logistic regression: design matrix with an unpenalized
    # intercept (group NA) prepended.
    x <- cbind(1, as.matrix(trainx))
    colnames(x) <- c("Intercept", colnames(trainx))
    index <- c(NA, as.factor(index))
    ################################# Get a vector of tuning parameters
    # Geometric grid downward from the smallest lambda with an all-zero fit.
    lambda <- lambdamax(x, y = trainy, index = index, penscale = sqrt,
        model = LogReg()) * 0.96^(1:nlam)

    index0 <- c(1:n)
    y0 <- NULL
    pre <- NULL

    for (i in 1:kfold) {
        # Sample the hold-out rows from the not-yet-used pool. The min()
        # guard keeps the last folds from requesting more rows than remain
        # (rounding can make kfold * round(n/kfold) exceed n).
        idx <- sample(index0, min(round(n/kfold, 0), length(index0)), replace = F)
        test <- trainx[idx, ]
        train <- trainx[-idx, ]
        y0 <- c(y0, trainy[idx])
        xtest <- cbind(1, as.matrix(test))
        xtrain <- cbind(1, as.matrix(train))
        colnames(xtrain) <- c("Intercept", colnames(train))
        colnames(xtest) <- c("Intercept", colnames(test))
        fit <- grplasso(xtrain, trainy[-idx], index = index, lambda = lambda,
            model = LogReg(), penscale = sqrt)
        pre0 <- predict(fit, xtest, type = type, na.action = na_action)
        pre0 <- data.frame(pre0)
        pre <- rbind(pre, pre0)
        # BUG FIX: the original did `index0 <- index0[-idx]`, removing by
        # POSITION although idx holds sampled row VALUES, so folds after the
        # first could overlap. Remove the used rows by value instead.
        index0 <- setdiff(index0, idx)
    }

    ############################## Maximize AUC over the lambda grid
    auc <- rep(0, ncol(pre))
    for (i in 1:ncol(pre)) {
        auc[i] <- as.numeric(auc(y0, pre[, i]))
    }
    pmsure = auc
    lambda.max.auc = c(lambda = lambda[which(pmsure == max(pmsure))], auc = max(pmsure))
    se.auc = max(pmsure) - sd(pmsure)
    lambda.1se.auc = c(lambda[min(which(pmsure >= se.auc))], se.auc = se.auc)
    ############################## Maximize log-likelihood
    loglike <- rep(0, ncol(pre))
    for (i in 1:ncol(pre)) {
        loglike[i] <- sum((pre[, i] - log(1 + exp(pre[, i]))) * y0 + (-log(1 +
            exp(pre[, i]))) * (1 - y0))
    }
    pmsure = loglike
    lambda.max.loglike = c(lambda = lambda[which(pmsure == max(pmsure))], loglike = max(pmsure))
    se.loglike = max(pmsure) - sd(pmsure)
    lambda.1se.loglike = c(lambda = lambda[min(which(pmsure >= se.loglike))], se.loglike = se.loglike)
    ############################## Maximize correlation between thresholded
    ############################## predictions and the observed outcome
    co <- pre
    maxco <- rep(0, ncol(pre))
    for (i in 1:ncol(pre)) {
        s <- sort(pre[, i])
        for (j in 1:nrow(pre)) {
            yhat <- as.numeric(pre[, i] >= s[j])
            co[j, i] <- cor(yhat, y0)
        }
        maxco[i] <- max(na.omit(co[, i]))
    }
    pmsure = maxco
    lambda.max.maxco = c(lambda = lambda[which(pmsure == max(pmsure))], maxco = max(pmsure))
    se.maxco = max(pmsure) - sd(pmsure)
    lambda.1se.maxco = c(lambda = lambda[min(which(pmsure >= se.maxco))], se.maxco = se.maxco)
    ###############
    res <- list(lambda = lambda, pred = pre, auc = auc, log_likelihood = loglike,
        maxrho = maxco, lambda.max.auc = lambda.max.auc, lambda.1se.auc = lambda.1se.auc,
        lambda.max.loglike = lambda.max.loglike, lambda.1se.loglike = lambda.1se.loglike,
        lambda.max.maxco = lambda.max.maxco, lambda.1se.maxco = lambda.1se.maxco)
    class(res) <- "cv_glasso"
    # (the original had an unreachable invisible(res) after this return)
    return(res)
}

--------------------------------------------------------------------------------
/R/fitglasso.R:
--------------------------------------------------------------------------------
#' Fit group lasso logistic regression
#' @description Fit group lasso logistic regression and returns a list object
#' @param trainx a data frame where samples are in rows and features are in columns
#' @param trainy a numeric or factor vector containing the outcome for each sample
#' @param lambda value of tuning parameter lambda
#'
#' @return
#' A \code{grplasso} object is returned, for which \code{coef}, \code{print}, \code{plot} and \code{predict} methods exist.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' data("sim1_da1")
#' trainx = dplyr::select(sim1_da1, -y)
#' trainy = sim1_da1$y
#' # index of the group
#' index <- gsub("\\..*", "", names(trainx))
#' # nlam is the number of values of tuning variable
#' nlam <- 20
#' # type of prediction
#' type = "link"
#' # number of cross-validation folds
#' kfold <- 10
#' cv_fit <- cv_glasso(trainx, trainy, nlam = nlam, kfold = kfold)
#' str(cv_fit)
#'
#' fitgl <- fitglasso(trainx = trainx, trainy= trainy, lambda = cv_fit$lambda.max.auc[1])
#' }
#' @param index optional grouping indicator for the columns of \code{trainx};
#'   columns sharing a value form one penalized group. Defaults to the
#'   column-name prefix before the first "." (e.g. "grp.x1" -> "grp"),
#'   matching the workflow shown in the examples.
#' @export
#'

fitglasso <- function(trainx, trainy, lambda, na_action = na.pass,
                      index = NULL) {
  library(grplasso)
  # BUG FIX: `index` was referenced below but never defined in this function
  # (it only existed in the example script), so every call failed with
  # "object 'index' not found". Derive a default from the column names.
  if (is.null(index)) {
    index <- gsub("\\..*", "", colnames(trainx))
  }
  # Design matrix with an explicit intercept column, as grplasso expects.
  x <- cbind(1, as.matrix(trainx))
  colnames(x) <- c("Intercept", colnames(trainx))
  # grplasso codes groups as integers; NA marks the unpenalized intercept.
  grp <- c(NA, as.numeric(as.factor(index)))
  # NOTE(review): `na_action` is accepted for backward compatibility but is
  # not consumed by grplasso::grplasso() here -- confirm intended behavior.
  grplasso(x, trainy, index = grp, lambda = lambda)
}

# -------- R/impute-dat.R --------
#' Impute missing values
#'
#' @param dat a data frame.
#' @param method imputation method.
#' Need to be one of these: \code{zero}, \code{mean}, \code{Inf0}, \code{code99}
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' library(DataScienceR)
#' data("SegData")
#' impdat <- SegData[, c("income", "house")]
#' # summary(SegData)
#' impdat = impute_dat(impdat, method = "mean")
#' summary(impdat)
#' }

#' @export
impute_dat <- function(dat, method) {
  # Replace values column by column according to `method`:
  #   zero   -- NA -> 0
  #   mean   -- NA -> column mean over the non-missing values
  #   Inf0   -- +/-Inf -> 0
  #   code99 -- the sentinel value 99 -> 0
  if (method == "zero") {
    for (i in seq_len(ncol(dat))) {
      idx <- which(is.na(dat[, i]))
      if (length(idx) > 0) dat[, i][idx] <- 0
    }
  } else if (method == "mean") {
    for (i in seq_len(ncol(dat))) {
      idx <- which(is.na(dat[, i]))
      if (length(idx) > 0) dat[, i][idx] <- mean(na.omit(dat[, i]))
    }
  } else if (method == "Inf0") {
    for (i in seq_len(ncol(dat))) {
      idx <- which(abs(dat[, i]) == Inf)
      if (length(idx) > 0) dat[, i][idx] <- 0
    }
  } else if (method == "code99") {
    for (i in seq_len(ncol(dat))) {
      idx <- which(dat[, i] == 99)
      if (length(idx) > 0) dat[, i][idx] <- 0
    }
  } else {
    # BUG FIX: an unknown method previously cat()'d a note and returned NULL
    # invisibly; callers now get a real error instead of a silent NULL.
    stop("Argument \"method\" has to be one of: zero, mean, Inf0, code99",
         call. = FALSE)
  }
  dat
}

# -------- R/mrrAtDateType.R --------
#' Calculate MRR for a specific date and product type
#' @description It calculates the MRR given date, subscription data, and product type
#' @param time_point a date value (have to be date type)
#' @param subs subscription data, need to have columns: account_id,
#' user_id, Product, Effective_Start, Effective_End, MRR, rate_plan_name
#' @param type product type, if "NULL", it will return MRR for each product
#' @return
#' a vector of MRR is returned
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' add_ons <- c("Forms", "Functions", "Identity", "Domains", "Large Media")
#' dates <- seq(as.Date("2018-02-01"), lubridate::ceiling_date(Sys.Date(), unit = "month"), "months") - days(1)
#' dates
#' add_ons_mrr <- sapply(dates, function(time_point) {mrrAtDateType(time_point, datall_revenue, add_ons)})
#' }
#' @export
#'

mrrAtDateType <- function(time_point, subs, type = NULL) {
  # Coerce so character dates ("2018-01-31") are accepted as well.
  time_point <- as.Date(time_point)

  if (is.null(type)) {
    # Active at `time_point`: started on/before it and either open-ended
    # (NA end) or ending strictly after it.
    actives <- subs %>%
      dplyr::filter(Effective_Start <= time_point) %>%
      dplyr::filter(is.na(Effective_End) | Effective_End > time_point)

    res <- actives %>%
      dplyr::group_by(account_id, user_id, Product, rate_plan_name) %>%
      # round it in case we have some magic number again
      dplyr::summarise(MRR = round(sum(MRR, na.rm = TRUE), 10))

    res$Date <- time_point
    return(res)
  } else {
    actives <- subs %>%
      dplyr::filter(Product %in% type) %>%
      dplyr::filter(Effective_Start <= time_point) %>%
      dplyr::filter(is.na(Effective_End) | Effective_End > time_point)
    return(sum(actives$MRR, na.rm = TRUE))
  }
}

# -------- R/multiplot.R --------
#' Multiple plot function
#' @description make multiple plot for ggplot objects.
#' @param ... Pass ggplot objects or plotlist (as a list of ggplot objects)
#' @param cols Number of columns in layout
#' @param layout A matrix specifying the layout. If present, 'cols' is ignored.
#' If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE), then plot 1 will go in the upper left, 2 will go in the upper right, and 3 will go all the way across the bottom.
#'
#' @export
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  # library() fails fast if grid is missing; require() only returned FALSE.
  library(grid)
  # NOTE(review): `file` is accepted but never used anywhere in the body;
  # kept so existing calls that pass it keep working.

  # Collect plots handed in via ... and/or plotlist into one list.
  plots <- c(list(...), plotlist)
  numPlots <- length(plots)

  # If layout is NULL, use 'cols' to determine the panel layout:
  # ncol = requested columns, nrow = as many rows as needed.
  if (is.null(layout)) {
    layout <- matrix(seq_len(cols * ceiling(numPlots / cols)),
                     ncol = cols, nrow = ceiling(numPlots / cols))
  }

  if (numPlots == 1) {
    print(plots[[1]])
  } else {
    # Set up the page, then draw each plot in its assigned viewport cell.
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    for (i in seq_len(numPlots)) {
      # Matrix position(s) of the region that contains subplot i.
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

# -------- R/pairwise_ks_test.R --------
#' Pairwise Kolmogorov-Smirnov Test
#' @description Perform Pairwise Multiple Comparisons Using Kolmogorov-Smirnov Test
#' @param value a numeric vector of data values
#' @param group a group indicator
#' @param n_min The minimum number of observations in each group. Group(s) with observation less than \code{n_min} will be removed. "\code{n_min=50}"(default)
#' @param warning sets the handling of warning messages.
If \code{warning} is negative all warnings are ignored. If \code{warning} is zero (the default) warnings are stored until the top–level function returns. If 10 or fewer warnings were signalled they will be printed otherwise a message saying how many were signalled. An object called last.warning is created and can be printed through the function warnings. If \code{warning} is one, warnings are printed as they occur. If warn is two or larger all warnings are turned into errors. 7 | #' @param alternative indicates the alternative hypothesis and must be one of "\code{two.sided}" (default), "\code{less}", or "\code{greater}". You can specify just the initial letter of the value, but the argument name must be give in full. See ‘Details’ for the meanings of the possible values. 8 | #' @details Missing values are silently omitted from x and (in the two-sample case) y. 9 | #' 10 | #' The possible values "\code{two.sided}", "\code{less}" and "\code{greater}" of alternative specify the null hypothesis that the true distribution function of x is equal to, not less than or not greater than the hypothesized distribution function (one-sample case) or the distribution function of y (two-sample case), respectively. This is a comparison of cumulative distribution functions, and the test statistic is the maximum difference in value, with the statistic in the "greater" alternative being \eqn{D^+ = max[F_x(u) - F_y(u)]}. Thus in the two-sample case alternative = "greater" includes distributions for which x is stochastically smaller than y (the CDF of x lies above and hence to the left of that for y), in contrast to \code{t.test} or \code{wilcox.test}. 
#'
#' @return Pairwise Kolmogorov-Smirnov Test p-value Matrix
#'
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' data("iris")
#' value<-iris$Sepal.Length
#' group<-iris$Species
#' pairwise_ks_test(value,group,warning = -1)
#' }
#' @export

pairwise_ks_test <- function(value, group, n_min = 50, warning = 0,
                             alternative = "two.sided") {
  # One vector of observations per group level.
  lev <- unique(group)
  lst <- lapply(seq_along(lev), function(i) value[group == lev[i]])
  names(lst) <- lev

  # Drop groups that carry fewer than n_min observations.
  small <- lengths(lst) < n_min
  if (any(small)) {
    lst <- lst[!small]
  }

  # Two-sample KS p-value with the warning level temporarily set to
  # `warning` (see ?options, "warn"). BUG FIX: the previous setting was not
  # restored when ks.test() errored; on.exit() restores it in every case.
  ks_p <- function(x, y) {
    old_warn <- getOption("warn")
    on.exit(options(warn = old_warn), add = TRUE)
    options(warn = warning)
    ks.test(x, y, alternative = alternative, exact = FALSE)$p.value
  }

  # All pairwise comparisons, folded into a square p-value matrix.
  res <- lapply(lst, function(x) lapply(lst, function(y) ks_p(x, y)))
  res <- matrix(unlist(res), nrow = length(lst), ncol = length(lst),
                byrow = TRUE)
  row.names(res) <- colnames(res) <- names(lst)
  cat("Pairwise Kolmogorov-Smirnov Test p-value Matrix", "\n", "\n")
  res
}

# -------- R/plot.cv_glasso.R --------
#' plot the cross-validation curve produced by "\code{cv_glasso}"
#' @description Plots the cross-validation curve as a function of the lambda values used.
#' @param x fitted "\code{cv_glasso}" object
#' @param type.measure criteria to use for cross-validation. Currently three options. The default is \code{type.measure = "auc"} which gives area under the ROC curve. \code{type.measure = "loglike"} computes the log-likelihood score in Meier et al. 2008. \code{type.measure = "maxco"} computes the maximum correlation coefficient in Yeo and Burge.
#' @return A plot is produced, and nothing is returned.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @references L. Meier, S. van de Geer, and P. Buhlmann, The group lasso for logistic regression, J. R. Stat. Soc. Ser. B Stat. Methodol. 70 (2008), pp. 53-71.
#' @references G.W. Yeo and C.B. Burge, Maximum entropy modeling of short sequence motifs with applications to RNA splicing signals, J. Computnl Biol. 11 (2004), pp. 475-494.
#'
#' @examples
#' \dontrun{
#' data("sim1_da1")
#' trainx = dplyr::select(sim1_da1, -y)
#' trainy = sim1_da1$y
#' # index of the group
#' index <- gsub("\\..*", "", names(trainx))
#' # nlam is the number of values of tuning variable
#' nlam <- 10
#' # type of prediction
#' type = "link"
#' # number of cross-validation folds
#' kfold <- 10
#' cv.fit <- cv_glasso(trainx, trainy, nlam = nlam, kfold = kfold)
#' plot.cv_glasso(cv.fit)
#' }
#'
#' @export
plot.cv_glasso <- function(x, type.measure = "auc", ...) {
  cvobj <- x
  # The cross-validated metric values, one per lambda.
  metric <- cvobj[[type.measure]]
  plot.args <- list(x = cvobj$lambda, y = metric,
                    ylim = range(metric) + c(-sd(metric), sd(metric)),
                    xlab = "Lambda", ylab = type.measure, type = "n")
  # Caller-supplied graphics parameters override the defaults above.
  new.args <- list(...)
  if (length(new.args) > 0) {
    plot.args[names(new.args)] <- new.args
  }
  do.call("plot", plot.args)
  points(cvobj$lambda, metric, pch = 20, col = "red")
  # Dotted verticals at the metric-maximizing and 1-SE lambda choices.
  abline(v = cvobj[[paste("lambda.max", type.measure, sep = ".")]][1], lty = 3)
  abline(v = cvobj[[paste("lambda.1se", type.measure, sep = ".")]][1], lty = 3)
  invisible()
}

# -------- R/plot.rocTest.R --------
#' Plot ROC curve
#' @description Plot ROC curve based on rocTest object
#' @param x rocTest object
#' @author Hui Lin, \email{longqiman@gmail.com}

#' @export
plot.rocTest <- function(x, auto.legend = TRUE, ...) {
  # Empty unit-square canvas with minimal axes.
  plot(c(0, 1), c(0, 1), type = "n", ann = FALSE, asp = 0,
       xaxs = "i", yaxs = "i", xaxt = "n", yaxt = "n")
  axis(1, c(0, .5, 1))
  axis(2, c(0, .5, 1))
  # One ROC curve (1 - specificity vs sensitivity) per test, distinct lty.
  for (j in seq_along(x$sens)) {
    lines(I(1 - x$spec[[j]]), x$sens[[j]], lty = j)
  }
  # Chance diagonal; hoisted out of the loop (it was redrawn each iteration).
  abline(0, 1, lty = 6)
  if (auto.legend) {
    # BUG FIX: labels were hard-coded as 'Group Lasso'/'Expert Opinion'/
    # 'Diagonal' regardless of input; use the curve names carried on the
    # rocTest object so the legend matches any set of tests.
    legend(0.6, 0.3, lty = c(seq_along(x$sens), 6),
           legend = c(names(x$sens), "Diagonal"), bty = "n", cex = 1)
  }
}

# -------- R/plot.tune_cutoff.R --------
#' Plot cutoff tuning process from "\code{tune_cutoff}"
#' @description Plots cutoff tuning process
#' @param x "\code{tune_cutoff}" object
#' @param pch type of cutoff points, default is 20
#' @param cex size of cutoff points, default is 1.5
#' @param ... other parameters in "\code{plot()}" function
#' @param col color of cutoff points, default is \code{col = "red"}
#' @author Hui Lin, \email{longqiman@gmail.com}
#'
#' @examples
#' \dontrun{
#' data("sim1_da1")
#' trainx = sim1_da1[,1:50]
#' trainy = sim1_da1$y
#' library(glmnet)
#' fit <- cv.glmnet(as.matrix(trainx), trainy, family = "binomial")
#' test <- predict(fit, as.matrix(trainx), type = "link", s = "lambda.min")
#' test <- as.vector(test)
#' summary(test)
#' likelihood <- c(0.2, 0.5, 0.8)
#' y <- trainy
#' x <- tune_cutoff(test = test, y = y, likelihood = likelihood)
#' str(x)
#' plot.tune.cutoff(x)
#' }
#'
#' @export
plot.tune.cutoff <- function(x, pch = 20, cex = 1.5, col = "red", ...) {
  dat <- x$cutoff.tune
  # PPV/NPV trade-off curve over all candidate cutoffs.
  plot(dat$PPV, dat$NPV, type = "l",
       xlab = "True Positive", ylab = "True Negative",
       main = paste("Likelihood groups:",
                    paste(x$likelihood, collapse = ", ")), ...)
  # Highlight and label the tuned cutoff positions.
  points(dat$PPV[x$cutoffs], dat$NPV[x$cutoffs], pch = pch, cex = cex, col = col)
  text(dat$PPV[x$cutoffs], dat$NPV[x$cutoffs] + 0.1, x$cutoffs)
}

# -------- R/predict.fitglasso.R --------
#' Predict method for grplasso objects
#' @description Obtains predictions from a \code{grplasso} object.
#' @param object a \code{grplasso} object
#' @param newdata data.frame or design matrix of new observations
#' @param type the type of prediction. \code{type = "link"} is on the scale of linear predictors, whereas \code{type = "response"} is on the scale of the response variable, i.e. \code{type = "response"} applies the inverse link function to the linear predictors.
#' @param na.action function determining what should be done with missing values in newdata. The default is to predict NA.
#' @param ... other options to be passed to the predict function.
#' @export
#'
predict_glasso <- function(object, newdata, type = c("link", "response"),
                           na.action = na.pass, ...) {
  library(grplasso)
  # Prepend the intercept column to mirror the design matrix built in
  # fitglasso(); the unused `n <- nrow(newdata)` local was removed.
  x <- cbind(1, as.matrix(newdata))
  predict(object, x, type = type, na.action = na.action, ...)
}

# -------- R/rocTest.R --------
#' Compute the area under the ROC curve and compare different AUC's
#' @description This function compares the AUC of two correlated (or paired) or uncorrelated (unpaired) ROC curves.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @param y response
#' @param x prediction
#' @param L list to assign the comparisons

#' @export
rocTest <- function(y, x, L = NULL) {

  # Trapezoidal area under a curve whose x-range is anchored to [0, 1].
  trapezarea <- function(x, y) {
    if (x[1] > x[length(x)]) {
      x <- rev(x)
      y <- rev(y)
    }
    if (length(x) != length(y))
      stop("length x must equal length y")
    if (length(unique(x)) < 2)
      return(NA)
    # Interpolated curve heights at x = 0 and x = 1 (ties resolved upward).
    ya <- approx(x, y, 0, ties = max, rule = 2)$y
    yb <- approx(x, y, 1, ties = max, rule = 2)$y
    x <- c(0, x, 1)
    y <- c(ya, y, yb)
    h <- diff(x)
    0.5 * sum(h * (y[-1] + y[-length(x)]))
  }

  # Empirical ROC (sensitivity/specificity) and AUC for each prediction set.
  n_tests <- length(x)
  th <- rep(NA_real_, n_tests)
  sens <- vector("list", n_tests)
  spec <- vector("list", n_tests)
  for (j in seq_len(n_tests)) {
    # Negated scores so table rows run from highest to lowest score.
    DD <- table(-x[[j]], y)
    sens[[j]] <- c(0, cumsum(DD[, 2]) / sum(DD[, 2]))
    spec[[j]] <- c(1, 1 - cumsum(DD[, 1]) / sum(DD[, 1]))
    th[j] <- trapezarea(1 - spec[[j]], sens[[j]])
  }

  if (!is.null(names(x))) {
    names(sens) <- names(x)
    names(spec) <- names(x)
  } else {
    names(sens) <- paste("Test", LETTERS[seq_len(n_tests)])
    names(spec) <- paste("Test", LETTERS[seq_len(n_tests)])
  }

  if (!is.null(L)) {
    # DeLong-style placement values: V10 over diseased (y == 1) subjects,
    # V01 over non-diseased (y == 0) subjects.
    n1 <- length(y[y == 1])
    n0 <- length(y[y == 0])
    V10 <- matrix(NA, nrow = n1, ncol = n_tests)
    V01 <- matrix(NA, nrow = n0, ncol = n_tests)

    for (j in seq_len(n_tests)) {
      x.s <- split(x[[j]], y)
      for (i in seq_along(x.s$"1"))
        V10[i, j] <- (length(x.s$"0"[x.s$"0" < x.s$"1"[i]]) +
                        0.5 * length(x.s$"0"[x.s$"0" == x.s$"1"[i]])) / n0
      for (i in seq_along(x.s$"0"))
        V01[i, j] <- (length(x.s$"1"[x.s$"0"[i] < x.s$"1"]) +
                        0.5 * length(x.s$"1"[x.s$"1" == x.s$"0"[i]])) / n1
    }

    # Covariance components and the chi-square contrast test on the AUCs.
    S10 <- (t(V10) %*% V10 - n1 * th %*% t(th)) / (n1 - 1)
    S01 <- (t(V01) %*% V01 - n0 * th %*% t(th)) / (n0 - 1)
    S <- S10 / n1 + S01 / n0

    contr <- L %*% th
    # (a dead local `se` that was computed but never used has been removed)

    test <- t(th) %*% t(L) %*%
      solve(L %*% (S10 / n1 + S01 / n0) %*% t(L), t(t(th) %*% t(L)))

    p.value <- pchisq(test, df = qr(L %*% S %*% t(L))$rank, lower.tail = FALSE)
  } else {
    S <- NULL
    p.value <- NULL
    contr <- NULL
  }

  names(th) <- names(x)
  res <- list(th = th, sens = sens, spec = spec, contr = contr, S = S,
              p.value = p.value)
  class(res) <- "rocTest"
  invisible(res)
}

# -------- R/subclass_eff_est.R --------
#' Estimation ATT and ATE after stratification
#' @description After stratification, estimation the weighted mean difference. Return estimates of ATT and ATE
#' @param object The output object from matchit. This is a required input.
#' @param y outcome variable. This is a required input.
#' @return
#' Function returns a list with the ATT and ATE estimates
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' subclass_eff_est(m2.out, match.data(m2.out)$CORN_UNITS_CHG)
#' }

#' @export
#'
#'
subclass_eff_est <- function(object, y) {

  # One row per matched unit: stratum id, treatment flag, and outcome.
  mdat_str = tibble(subclass = object$subclass,
                    treatment = object$treat,
                    y = y
  )

  # Per-stratum effect: difference of treated vs control outcome means.
  TAOs = mdat_str %>%
    group_by(subclass, treatment) %>%
    summarise(ybar = mean(y)) %>%
    ungroup() %>%
    group_by(subclass) %>%
    summarise(taos = diff(ybar)) %>%
    ungroup()

  # Stratum weights: overall share for ATE, treated share for ATT.
  Ns <- mdat_str %>%
    group_by(subclass) %>%
    summarise(Ns = n(), Ns1 = sum(treatment)) %>%
    ungroup() %>%
    mutate(wt_ate = Ns / sum(Ns), wt_att = Ns1 / sum(mdat_str$treatment))

  tao_att = sum(TAOs$taos * Ns$wt_att)
  tao_ate = sum(TAOs$taos * Ns$wt_ate)

  ###############
  # (an unreachable `invisible(res)` after return() has been removed)
  res <- list(tao_att = tao_att,
              tao_ate = tao_ate)
  return(res)
}

# -------- R/tune-cutoff.R --------
#' Tune the cutoffs for different likelihood groups
#' @description Tune the cutoffs for different likelihood groups given predicted score, response and a vector of likelihood values
#' @param test a vector of predicted score from model
#' @param y a vector of response, \code{y} need to be the same length of \code{test}
#' @param likelihood a vector of likelihood values
#'
#' @return
#' an object of class "\code{tune_cutoff}" is returned, which is a list with the ingredients of the tuning process.
#' @author Hui Lin, \email{longqiman@gmail.com}
#' @examples
#' \dontrun{
#' data("sim1_da1")
#' trainx = sim1_da1[,1:50]
#' trainy = sim1_da1$y
#' library(glmnet)
#' fit <- cv.glmnet(as.matrix(trainx), trainy, family = "binomial")
#' test <- predict(fit, as.matrix(trainx), type = "link", s = "lambda.min")
#' test <- as.vector(test)
#' summary(test)
#' likelihood <- c(0.2, 0.5, 0.8)
#' y <- trainy
#' x <- tune_cutoff(test = test, y = y, likelihood = likelihood)
#' str(x)
#' plot.tune.cutoff(x)
#' }
#'
#' @export
#'
tune_cutoff <- function(test, y, likelihood) {

  # PPV/NPV at every observed score used as a candidate cutoff.
  n <- length(y)
  ppv <- rep(-1, n)
  npv <- rep(-1, n)
  sorted_test <- sort(test)
  for (i in seq_len(n)) {
    t <- sorted_test[i]
    np <- sum(test >= t)             # predicted positive at this cutoff
    nn <- n - np                     # predicted negative
    tp <- sum((test >= t) * y)       # true positives
    tn <- sum((test < t) * (1 - y))  # true negatives
    ppv[i] <- tp / np
    npv[i] <- tn / nn
  }

  ta <- data.frame(PPV = ppv, NPV = npv, cutoff = sorted_test)
  # Responses re-ordered to follow the sorted scores.
  ResPre <- data.frame(cbind(cutoff = test, testClasses = y))
  ResPre <- ResPre[order(ResPre$cutoff), ]
  ta$testClasses <- ResPre$testClasses

  # ------------------------------------------------------------------
  # tune the cutoffs: reorder the targets so the lowest group is tuned
  # first, the highest second, then the middle ones in ascending order.
  # NOTE(review): the reordering below assumes at least 3 likelihood values;
  # with exactly 2 the rk indexing wraps around -- confirm intended usage.
  likelihood <- sort(likelihood)
  rk <- seq_along(likelihood)
  rk <- c(1, length(rk), rk[2:(length(rk) - 1)])
  likelihood <- likelihood[rk]

  ids <- seq_len(length(likelihood) - 1)

  for (i in seq_along(ids)) {
    # BUG FIX: a leftover debug statement `i = 2` overwrote the loop index on
    # every iteration, so only the second branch ever ran and groups 1 and 3+
    # were never tuned. It has been removed.
    p <- likelihood[i]
    id1s <- NULL

    ## ------------------ the 1st group: largest prefix with event rate <= p
    if (i == 1) {
      for (j in seq_along(ta$testClasses)) {
        if (mean(ta$testClasses[1:j], na.rm = TRUE) <= p) {
          id1s <- c(id1s, j)
        }
      }
      ids[i] <- max(id1s, na.rm = TRUE)
    }

    ## ------------------ the 2nd group: earliest suffix with event rate >= p
    if (i == 2) {
      for (j in ids[1]:length(ta$testClasses)) {
        if (mean(ta$testClasses[j:length(ta$testClasses)], na.rm = TRUE) >= p) {
          id1s <- c(id1s, j)
        }
      }
      ids[i] <- min(id1s, na.rm = TRUE)
    }

    ## ------------------ the 3rd group
    if (i == 3) {
      for (j in (ids[1] + 1):(ids[2] - 1)) {
        if (mean(ta$testClasses[(ids[1] + 1):j], na.rm = TRUE) <= p) {
          id1s <- c(id1s, j)
        }
      }
      ids[i] <- max(id1s, na.rm = TRUE)
    }

    ## ------------------ the rest groups
    if (i > 3) {
      for (j in (ids[i - 1] + 1):(ids[2] - 1)) {
        if (mean(ta$testClasses[(ids[i - 1] + 1):j], na.rm = TRUE) <= p) {
          id1s <- c(id1s, j)
        }
      }
      ids[i] <- max(id1s, na.rm = TRUE)
    }
  }

  res <- list(cutoff.tune = ta,
              cutoffs = sort(ids),
              likelihood = sort(likelihood)
  )

  class(res) <- "tune_cutoff"
  res
}

# -------- README.md --------
# The `NetlifyDS` package contains functions to streamline Netlify's data science project. It espouses the philosophy: Anything that can be automated should be automated. Do as much as possible with functions.
#
# Functions in this package are for the following purposes:
#
# - Build data pipeline
# - Predictive analytics (complementary to caret package)
# - Market research analysis (psychometric models and segmentation)
# - Unstructured data analysis (text mining, web scraping)
#
# `NetlifyDS` utilizes a number of R packages. It loads packages as needed and assumes that they are installed.
Install the package using: 11 | 12 | ```r 13 | # if you haven't installed devtools 14 | # install.packages("devtools") 15 | devtools::install_github("netlify/NetlifyDS") 16 | library("NetlifyDS") 17 | ``` 18 | -------------------------------------------------------------------------------- /man/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netlify/NetlifyDS/613822924a26ebd1b37715f7f6d135732ad89d04/man/.DS_Store -------------------------------------------------------------------------------- /man/MeterRevActUserProduct.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/MeterRevActUserProduct.R 3 | \name{MeterRevActUserProduct} 4 | \alias{MeterRevActUserProduct} 5 | \title{Calculate metered revenue accounting matrix per day, user and product} 6 | \usage{ 7 | MeterRevActUserProduct(datall_revenue, dates, type) 8 | } 9 | \arguments{ 10 | \item{datall_revenue}{subscription data, need to have columns: user_id, Product, Effective_Start, one_time_charge} 11 | 12 | \item{dates}{a date vector (have to be date type)} 13 | 14 | \item{type}{a vector of product names} 15 | } 16 | \value{ 17 | A metered revenue accounting matrix is returned for each day, user and product 18 | } 19 | \description{ 20 | It calculates the accounting matrix for each user and product given date, subscription data and product category. 
21 | } 22 | \author{ 23 | Hui Lin, \email{longqiman@gmail.com} 24 | } 25 | -------------------------------------------------------------------------------- /man/RevAct.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RevAct.R 3 | \name{RevAct} 4 | \alias{RevAct} 5 | \title{Calculate revenue accounting matrix} 6 | \usage{ 7 | RevAct(datall_revenue, type, dates) 8 | } 9 | \arguments{ 10 | \item{datall_revenue}{daily revenue data, need to have columns: user_id, Product, active_date, MRR} 11 | 12 | \item{type}{product type} 13 | 14 | \item{dates}{a date vector (have to be date type)} 15 | } 16 | \value{ 17 | A revenue accounting matrix is returned for each date. 18 | } 19 | \description{ 20 | It calculates the accounting matrix given date and daily revenue data. It only returns one row for each date, i.e. it will sum up all the products. 21 | } 22 | \examples{ 23 | \dontrun{ 24 | dates <- seq(as.Date("2018-02-01"), lubridate::ceiling_date(Sys.Date(), unit = "month"), "months") - days(1) 25 | dates 26 | # need to get daily revenue from subscription data if you don't have that already 27 | base_monthly = do.call("rbind", lapply(dates, function(time_point) 28 | growth_matrix_identity = RevAct(datall_revenue = base_monthly, type = c("Identity"), dates = dates) 29 | } 30 | } 31 | \author{ 32 | Hui Lin, \email{longqiman@gmail.com} 33 | } 34 | -------------------------------------------------------------------------------- /man/RevActUserProduct.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RevActUserProduct.R 3 | \name{RevActUserProduct} 4 | \alias{RevActUserProduct} 5 | \title{Calculate revenue accounting matrix per day, user and product} 6 | \usage{ 7 | RevActUserProduct(datall_revenue, dates, dates_full, type) 8 | } 9 | 
\arguments{ 10 | \item{datall_revenue}{daily revenue data, need to have columns: user_id, Product, active_date, MRR} 11 | 12 | \item{dates}{a date vector (have to be date type)} 13 | 14 | \item{dates_full}{a full range of dates} 15 | 16 | \item{type}{a vector of product names} 17 | } 18 | \value{ 19 | A revenue accounting matrix is returned for each day, user and product 20 | } 21 | \description{ 22 | It calculates the accounting matrix for each user and product given date, subscription data and product category. 23 | } 24 | \examples{ 25 | \dontrun{ 26 | team <- c("netlify_team_premium_monthly","Teams","netlify_team_plus_monthly","netlify_team_plus_yearly") 27 | res_0 = RevActUserProduct(total_revenue, dates, team) 28 | } 29 | } 30 | \author{ 31 | Hui Lin, \email{longqiman@gmail.com} 32 | } 33 | -------------------------------------------------------------------------------- /man/RevCntAct.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RevCntAct.R 3 | \name{RevCntAct} 4 | \alias{RevCntAct} 5 | \title{Calculate revenue accounting matrix by counts} 6 | \usage{ 7 | RevCntAct(datall_revenue, type, dates) 8 | } 9 | \arguments{ 10 | \item{datall_revenue}{subscription data, need to have columns: user_id, Product, Effective_Start, Effective_End, MRR} 11 | 12 | \item{type}{product type} 13 | 14 | \item{dates}{a date vector (have to be date type)} 15 | } 16 | \value{ 17 | A revenue accounting matrix by count is returned for each date. 18 | } 19 | \description{ 20 | It calculates the accounting matrix by counts given date and subscription data. It only returns one row for each date, i.e. it will sum up all the products. 
21 | } 22 | \examples{ 23 | \dontrun{ 24 | dates <- seq(as.Date("2018-02-01"), lubridate::ceiling_date(Sys.Date(), unit = "month"), "months") - days(1) 25 | dates 26 | # need to get daily revenue from subscription data if you don't have that already 27 | base_monthly = do.call("rbind", lapply(dates, function(time_point) 28 | {NetlifyDS::mrrAtDateType(time_point, subs = base_monthly, type = NULL)})) 29 | growth_matrix_identity = RevCntAct(datall_revenue = total_revenue, type = c("Identity"), dates = dates) 30 | } 31 | } 32 | \author{ 33 | Hui Lin, \email{longqiman@gmail.com} 34 | } 35 | -------------------------------------------------------------------------------- /man/clust_ana.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ClusterAnalysis.R 3 | \name{clust_ana} 4 | \alias{clust_ana} 5 | \title{Cluster analysis} 6 | \usage{ 7 | clust_ana(ans, k, distm = "euclidean", hclustm = "ward.D", 8 | na.rm = FALSE) 9 | } 10 | \arguments{ 11 | \item{ans}{data matrix} 12 | 13 | \item{k}{number of clusters} 14 | 15 | \item{distm}{Dissimilarity index, partial match to "\code{manhattan}", "\code{euclidean}"(default), "\code{canberra}", "\code{bray}", "\code{kulczynski}", "\code{jaccard}", "\code{gower}", "\code{altGower}", "\code{morisita}", "\code{horn}", "\code{mountford}", "\code{raup}" , "\code{binomial}", "\code{chao}", "\code{cao}" or "\code{mahalanobis}".} 16 | 17 | \item{hclustm}{the agglomeration method to be used. 
This should be (an unambiguous abbreviation of) one of "\code{ward.D}"(default), "\code{ward.D2}", "\code{single}", "\code{complete}", "\code{average}" (= UPGMA), "\code{mcquitty}" (= WPGMA), "\code{median}" (= WPGMC) or "\code{centroid}" (= UPGMC).} 18 | 19 | \item{na.rm}{Pairwise deletion of missing observations when computing dissimilarities.} 20 | } 21 | \description{ 22 | Does cluster analysis and reports a cluster index 23 | } 24 | \examples{ 25 | \dontrun{ 26 | data("SegData") 27 | library(dplyr) 28 | library(vegan) 29 | cind <- clust_ana(SegData, k=4) 30 | } 31 | } 32 | \author{ 33 | Hui Lin, \email{longqiman@gmail.com} 34 | } 35 | -------------------------------------------------------------------------------- /man/cv_glasso.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cv-glasso.R 3 | \name{cv_glasso} 4 | \alias{cv_glasso} 5 | \title{Cross-validation for group lasso logistic regression} 6 | \usage{ 7 | cv_glasso(trainx, trainy, nlam = 100, type = "link", kfold = 10, 8 | na_action = na.pass) 9 | } 10 | \arguments{ 11 | \item{trainx}{a data frame where samples are in rows and features are in columns} 12 | 13 | \item{trainy}{a numeric or factor vector containing the outcome for each sample} 14 | 15 | \item{nlam}{number of lambda values. The default is 100} 16 | 17 | \item{type}{the type of prediction. \code{type = 'link'} is on the scale of linear predictors (default), whereas \code{type = 'response'} is on the scale of the response variable, i.e. \code{type = 'response'} applies the inverse link function to the linear predictors.} 18 | 19 | \item{kfold}{number of folds - default is 10. Although nfolds can be as large as the sample size (leave-one-out CV), it is not recommended for large datasets. 
The default is 10.} 20 | 21 | \item{na_action}{function determining what should be done with missing values when predicting new data during cross validation. The default is to predict NA.} 22 | } 23 | \value{ 24 | an object of class "\code{cv_glasso}" is returned, which is a list with the ingredients of the cross-validation fit. 25 | } 26 | \description{ 27 | Does k-fold cross-validation for group lasso logistic regression and returns a list object 28 | } 29 | \examples{ 30 | \dontrun{ 31 | data("sim1_da1") 32 | trainx = dplyr::select(sim1_da1, -y) 33 | trainy = sim1_da1$y 34 | # index of the group 35 | index <- gsub("\\\\..*", "", names(trainx)) 36 | # nlam is the number of values of tuning variable 37 | nlam <- 10 38 | # type of prediction 39 | type = "link" 40 | # number of cross-validation folds 41 | kfold <- 10 42 | cv_fit <- cv_glasso(trainx, trainy, nlam = nlam, kfold = kfold) 43 | str(cv_fit) 44 | } 45 | } 46 | \author{ 47 | Hui Lin, \email{longqiman@gmail.com} 48 | } 49 | -------------------------------------------------------------------------------- /man/fitglasso.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fitglasso.R 3 | \name{fitglasso} 4 | \alias{fitglasso} 5 | \title{Fit group lasso logistic regression} 6 | \usage{ 7 | fitglasso(trainx, trainy, lambda, na_action = na.pass) 8 | } 9 | \arguments{ 10 | \item{trainx}{a data frame where samples are in rows and features are in columns} 11 | 12 | \item{trainy}{a numeric or factor vector containing the outcome for each sample} 13 | 14 | \item{lambda}{value of tuning parameter lambda} 15 | } 16 | \value{ 17 | A \code{grplasso} object is returned, for which \code{coef}, \code{print}, \code{plot} and \code{predict} methods exist. 
18 | } 19 | \description{ 20 | Fit group lasso logistic regression and returns a list object 21 | } 22 | \examples{ 23 | \dontrun{ 24 | data("sim1_da1") 25 | trainx = dplyr::select(sim1_da1, -y) 26 | trainy = sim1_da1$y 27 | # index of the group 28 | index <- gsub("\\\\..*", "", names(trainx)) 29 | # nlam is the number of values of tuning variable 30 | nlam <- 20 31 | # type of prediction 32 | type = "link" 33 | # number of cross-validation folds 34 | kfold <- 10 35 | cv_fit <- cv_glasso(trainx, trainy, nlam = nlam, kfold = kfold) 36 | str(cv_fit) 37 | 38 | fitgl <- fitglasso(trainx = trainx, trainy= trainy, lambda = cv_fit$lambda.max.auc[1]) 39 | } 40 | } 41 | \author{ 42 | Hui Lin, \email{longqiman@gmail.com} 43 | } 44 | -------------------------------------------------------------------------------- /man/impute_dat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/impute-dat.R 3 | \name{impute_dat} 4 | \alias{impute_dat} 5 | \title{Impute missing values} 6 | \usage{ 7 | impute_dat(dat, method) 8 | } 9 | \arguments{ 10 | \item{dat}{a data frame.} 11 | 12 | \item{method}{imputation method. 
Need to be one of these: \code{zero}, \code{mean}, \code{Inf0}, \code{code99}} 13 | } 14 | \description{ 15 | Impute missing values 16 | } 17 | \examples{ 18 | \dontrun{ 19 | library(DataScienceR) 20 | data("SegData") 21 | impdat <- SegData[, c("income", "house")] 22 | # summary(SegData) 23 | impdat = impute_dat(impdat, method = "mean") 24 | summary(impdat) 25 | } 26 | } 27 | \author{ 28 | Hui Lin, \email{longqiman@gmail.com} 29 | } 30 | -------------------------------------------------------------------------------- /man/mrrAtDateType.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mrrAtDateType.R 3 | \name{mrrAtDateType} 4 | \alias{mrrAtDateType} 5 | \title{Calculate MRR for a specific date and product type} 6 | \usage{ 7 | mrrAtDateType(time_point, subs, type = NULL) 8 | } 9 | \arguments{ 10 | \item{time_point}{a date value (have to be date type)} 11 | 12 | \item{subs}{subscription data, need to have columns: account_id, user_id, Product, Effective_Start, Effective_End, MRR} 13 | 14 | \item{type}{product type, if "NULL", it will return MRR for each product} 15 | } 16 | \value{ 17 | a vector of MRR is returned 18 | } 19 | \description{ 20 | It calculates the MRR given date, subscription data, and product type 21 | } 22 | \examples{ 23 | \dontrun{ 24 | add_ons <- c("Forms", "Functions", "Identity", "Domains", "Large Media") 25 | dates <- seq(as.Date("2018-02-01"), lubridate::ceiling_date(Sys.Date(), unit = "month"), "months") - days(1) 26 | dates 27 | add_ons_mrr <- sapply(dates, function(time_point) {mrrAtDateType(time_point, datall_revenue, add_ons)}) 28 | } 29 | } 30 | \author{ 31 | Hui Lin, \email{longqiman@gmail.com} 32 | } 33 | -------------------------------------------------------------------------------- /man/multiplot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do 
not edit by hand 2 | % Please edit documentation in R/multiplot.R 3 | \name{multiplot} 4 | \alias{multiplot} 5 | \title{Multiple plot function} 6 | \usage{ 7 | multiplot(..., plotlist = NULL, file, cols = 1, layout = NULL) 8 | } 9 | \arguments{ 10 | \item{...}{Pass ggplot objects or plotlist (as a list of ggplot objects)} 11 | 12 | \item{cols}{Number of columns in layout} 13 | 14 | \item{layout}{A matrix specifying the layout. If present, 'cols' is ignored. If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE), then plot 1 will go in the upper left, 2 will go in the upper right, and 3 will go all the way across the bottom.} 15 | } 16 | \description{ 17 | make multiple plot for ggplot objects. 18 | } 19 | -------------------------------------------------------------------------------- /man/out_mad.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Outliers.R 3 | \name{out_mad} 4 | \alias{out_mad} 5 | \title{Identify outliers using MAD} 6 | \usage{ 7 | out_mad(x) 8 | } 9 | \arguments{ 10 | \item{x}{a vector} 11 | } 12 | \value{ 13 | a vector indicating identified outliers 14 | } 15 | \description{ 16 | Identify outliers using MAD 17 | } 18 | \examples{ 19 | \dontrun{ 20 | x<-c(seq(1:1000),20000) 21 | out_mad(x) 22 | } 23 | } 24 | \author{ 25 | Hui Lin, \email{longqiman@gmail.com} 26 | } 27 | -------------------------------------------------------------------------------- /man/pairwise_ks_test.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pairwise_ks_test.R 3 | \name{pairwise_ks_test} 4 | \alias{pairwise_ks_test} 5 | \title{Pairwise Kolmogorov-Smirnov Test} 6 | \usage{ 7 | pairwise_ks_test(value, group, n_min = 50, warning = 0, 8 | alternative = "two.sided") 9 | } 10 | \arguments{ 11 | \item{value}{a numeric 
vector of data values} 12 | 13 | \item{group}{a group indicator} 14 | 15 | \item{n_min}{The minimum number of observations in each group. Group(s) with fewer observations than \code{n_min} will be removed. "\code{n_min=50}"(default)} 16 | 17 | \item{warning}{sets the handling of warning messages. If \code{warning} is negative all warnings are ignored. If \code{warning} is zero (the default) warnings are stored until the top-level function returns. If 10 or fewer warnings were signalled they will be printed, otherwise a message saying how many were signalled. An object called last.warning is created and can be printed through the function warnings. If \code{warning} is one, warnings are printed as they occur. If \code{warning} is two or larger all warnings are turned into errors.} 18 | 19 | \item{alternative}{indicates the alternative hypothesis and must be one of "\code{two.sided}" (default), "\code{less}", or "\code{greater}". You can specify just the initial letter of the value, but the argument name must be given in full. See ‘Details’ for the meanings of the possible values.} 20 | } 21 | \value{ 22 | Pairwise Kolmogorov-Smirnov Test p-value Matrix 23 | } 24 | \description{ 25 | Perform Pairwise Multiple Comparisons Using Kolmogorov-Smirnov Test 26 | } 27 | \details{ 28 | Missing values are silently omitted from x and (in the two-sample case) y. 29 | 30 | The possible values "\code{two.sided}", "\code{less}" and "\code{greater}" of alternative specify the null hypothesis that the true distribution function of x is equal to, not less than or not greater than the hypothesized distribution function (one-sample case) or the distribution function of y (two-sample case), respectively. This is a comparison of cumulative distribution functions, and the test statistic is the maximum difference in value, with the statistic in the "greater" alternative being \eqn{D^+ = max[F_x(u) - F_y(u)]}. 
Thus in the two-sample case alternative = "greater" includes distributions for which x is stochastically smaller than y (the CDF of x lies above and hence to the left of that for y), in contrast to \code{t.test} or \code{wilcox.test}. 31 | } 32 | \examples{ 33 | \dontrun{ 34 | data("iris") 35 | value<-iris$Sepal.Length 36 | group<-iris$Species 37 | pairwise_ks_test(value,group,warning = -1) 38 | } 39 | } 40 | \author{ 41 | Hui Lin, \email{longqiman@gmail.com} 42 | } 43 | -------------------------------------------------------------------------------- /man/plot.cv_glasso.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.cv_glasso.R 3 | \name{plot.cv_glasso} 4 | \alias{plot.cv_glasso} 5 | \title{plot the cross-validation curve produced by "\code{cv_glasso}"} 6 | \usage{ 7 | \method{plot}{cv_glasso}(x, type.measure = "auc", ...) 8 | } 9 | \arguments{ 10 | \item{x}{fitted "\code{cv_glasso}" object} 11 | 12 | \item{type.measure}{criteria to use for cross-validation. Currently three options. The default is \code{type.measure = "auc"} which gives area under the ROC curve. \code{type.measure = "loglike"} computes the log-likelihood score in Meier et al2008. \code{type.measure = "maxco"} computes the maximum correlation coefficient in Yeo and Burge.} 13 | } 14 | \value{ 15 | A plot is produced, and nothing is returned. 16 | } 17 | \description{ 18 | Plots the cross-validation curve as a function of the lambda values used. 
19 | } 20 | \examples{ 21 | \dontrun{ 22 | data("sim1_da1") 23 | trainx = dplyr::select(sim1_da1, -y) 24 | trainy = sim1_da1$y 25 | # index of the group 26 | index <- gsub("\\\\..*", "", names(trainx)) 27 | # nlam is the number of values of tuning variable 28 | nlam <- 10 29 | # type of prediction 30 | type = "link" 31 | # number of cross-validation folds 32 | kfold <- 10 33 | cv.fit <- cv_glasso(trainx, trainy, nlam = nlam, kfold = kfold) 34 | plot.cv_glasso(cv.fit) 35 | } 36 | 37 | } 38 | \references{ 39 | L. Meier, S. van de Geer, and P. Buhlmann, The group lasso for logistic regression, J. R. Stat. Soc. Ser. B Stat. Methodol. 70 (2008), pp. 53-71. 40 | 41 | G.W. Yeo and C.B. Burge, Maximum entropy modeling of short sequence motifs with applications to RNA splicing signals, J. Computnl Biol. 11 (2004), pp. 475-494. 42 | } 43 | \author{ 44 | Hui Lin, \email{longqiman@gmail.com} 45 | } 46 | -------------------------------------------------------------------------------- /man/plot.rocTest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.rocTest.R 3 | \name{plot.rocTest} 4 | \alias{plot.rocTest} 5 | \title{Plot ROC curve} 6 | \usage{ 7 | \method{plot}{rocTest}(x, auto.legend = TRUE, ...) 
8 | } 9 | \arguments{ 10 | \item{x}{rocTest object} 11 | } 12 | \description{ 13 | Plot ROC curve based on rocTest object 14 | } 15 | \author{ 16 | Hui Lin, \email{longqiman@gmail.com} 17 | } 18 | -------------------------------------------------------------------------------- /man/plot.tune.cutoff.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.tune_cutoff.R 3 | \name{plot.tune.cutoff} 4 | \alias{plot.tune.cutoff} 5 | \title{Plot cutoff tuning process from "\code{tune_cutoff}"} 6 | \usage{ 7 | \method{plot}{tune.cutoff}(x, pch = 20, cex = 1.5, col = "red", ...) 8 | } 9 | \arguments{ 10 | \item{x}{"\code{tune_cutoff}" object} 11 | 12 | \item{pch}{type of cutoff points, default is 20} 13 | 14 | \item{cex}{size of cutoff points, default is 1.5, col = "red"} 15 | 16 | \item{col}{color of cutoff points, default is \code{col = "red"}} 17 | 18 | \item{...}{other parameters in "\code{plot()}" function} 19 | } 20 | \description{ 21 | Plots cutoff tuning process 22 | } 23 | \examples{ 24 | \dontrun{ 25 | data("sim1_da1") 26 | trainx = sim1_da1[,1:50] 27 | trainy = sim1_da1$y 28 | library(glmnet) 29 | fit <- cv.glmnet(as.matrix(trainx), trainy, family = "binomial") 30 | test <- predict(fit, as.matrix(trainx), type = "link", s = "lambda.min") 31 | test <- as.vector(test) 32 | summary(test) 33 | likelihood <- c(0.2, 0.5, 0.8) 34 | y <- trainy 35 | x <- tune_cutoff(test = test, y = y, likelihood = likelihood) 36 | str(x) 37 | plot.tune.cutoff(x) 38 | } 39 | 40 | } 41 | \author{ 42 | Hui Lin, \email{longqiman@gmail.com} 43 | } 44 | -------------------------------------------------------------------------------- /man/predict_glasso.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/predict.fitglasso.R 3 | \name{predict_glasso} 4 | 
\alias{predict_glasso} 5 | \title{Predict method for grplasso objects} 6 | \usage{ 7 | predict_glasso(object, newdata, type = c("link", "response"), 8 | na.action = na.pass, ...) 9 | } 10 | \arguments{ 11 | \item{object}{a \code{grplasso} object} 12 | 13 | \item{newdata}{data.frame or design matrix of new observations} 14 | 15 | \item{type}{the type of prediction. \code{type = "link"} is on the scale of linear predictors, whereas \code{type = "response"} is on the scale of the response variable, i.e. \code{type = "response"} applies the inverse link function to the linear predictors. 16 | na.action function determining what should be done with missing values in newdata. The default is to predict NA.} 17 | 18 | \item{...}{other options to be passed to the predict function.} 19 | } 20 | \description{ 21 | Obtains predictions from a \code{grplasso} object. 22 | } 23 | -------------------------------------------------------------------------------- /man/rocTest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rocTest.R 3 | \name{rocTest} 4 | \alias{rocTest} 5 | \title{Compute the area under the ROC curve and compare different AUC's} 6 | \usage{ 7 | rocTest(y, x, L = NULL) 8 | } 9 | \arguments{ 10 | \item{y}{response} 11 | 12 | \item{x}{prediction} 13 | 14 | \item{L}{list to assign the comparisons} 15 | } 16 | \description{ 17 | This function compares the AUC of two correlated (or paired) or uncorrelated (un- paired) ROC curves. 
18 | } 19 | \author{ 20 | Hui Lin, \email{longqiman@gmail.com} 21 | } 22 | -------------------------------------------------------------------------------- /man/subclass_eff_est.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/subclass_eff_est.R 3 | \name{subclass_eff_est} 4 | \alias{subclass_eff_est} 5 | \title{Estimate ATT and ATE after stratification} 6 | \usage{ 7 | subclass_eff_est(object, y) 8 | } 9 | \arguments{ 10 | \item{object}{The output object from matchit. This is a required input.} 11 | 12 | \item{y}{outcome variable. This is a required input.} 13 | } 14 | \value{ 15 | Function returns a list with the ATT and ATE estimates 16 | } 17 | \description{ 18 | After stratification, estimate the weighted mean difference. Returns estimates of ATT and ATE 19 | } 20 | \examples{ 21 | \dontrun{ 22 | subclass_eff_est(m2.out, match.data(m2.out)$CORN_UNITS_CHG) 23 | } 24 | } 25 | \author{ 26 | Hui Lin, \email{longqiman@gmail.com} 27 | } 28 | -------------------------------------------------------------------------------- /man/tune_cutoff.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tune-cutoff.R 3 | \name{tune_cutoff} 4 | \alias{tune_cutoff} 5 | \title{Tune the cutoffs for different likelihood groups} 6 | \usage{ 7 | tune_cutoff(test, y, likelihood) 8 | } 9 | \arguments{ 10 | \item{test}{a vector of predicted score from model} 11 | 12 | \item{y}{a vector of response, \code{y} needs to be the same length as \code{test}} 13 | 14 | \item{likelihood}{a vector of likelihood values} 15 | } 16 | \value{ 17 | an object of class "\code{tune_cutoff}" is returned, which is a list with the ingredients of the tuning process. 
18 | } 19 | \description{ 20 | Tune the cutoffs for different likelihood groups given predicted score, response and a vector of likelihood values 21 | } 22 | \examples{ 23 | \dontrun{ 24 | data("sim1_da1") 25 | trainx = sim1_da1[,1:50] 26 | trainy = sim1_da1$y 27 | library(glmnet) 28 | fit <- cv.glmnet(as.matrix(trainx), trainy, family = "binomial") 29 | test <- predict(fit, as.matrix(trainx), type = "link", s = "lambda.min") 30 | test <- as.vector(test) 31 | summary(test) 32 | likelihood <- c(0.2, 0.5, 0.8) 33 | y <- trainy 34 | x <- tune_cutoff(test = test, y = y, likelihood = likelihood) 35 | str(x) 36 | plot.tune.cutoff(x) 37 | } 38 | 39 | } 40 | \author{ 41 | Hui Lin, \email{longqiman@gmail.com} 42 | } 43 | --------------------------------------------------------------------------------