├── README.md ├── simple-marketing-mix-modelling-in-r.R └── wrangling-messy-data-full-code.R /README.md: -------------------------------------------------------------------------------- 1 | # marketing-mix-modelling 2 | Discussing process and experimenting with marketing mix modelling (MMM) - with code examples 3 | 4 |

Read the articles 👇

5 | 6 |

1. Cleaning and preparing marketing data in R prior to ML and analysis

7 |

A basic, step-by-step guide on cleaning typically messy marketing data in R.

8 | 📙 https://towardsdatascience.com/cleaning-and-preparing-marketing-data-in-r-prior-to-machine-learning-or-analysis-ec1a12079f1 9 |
10 |
11 |

Code Examples:

12 | 📌 Full R Code: https://github.com/Practical-ML/marketing-mix-modelling/blob/main/wrangling-messy-data-full-code.R 13 |
14 | 15 | ----- 16 |

2. Building a marketing mix model in R

17 |

We review the difference between marketing mix modelling (MMM) and multi-touch attribution (MTA), and then go on to build a simple MMM in R!

18 | 📙 https://towardsdatascience.com/building-a-marketing-mix-model-in-r-3a7004d21239 19 |
20 |
21 |

Code Examples:

22 | 📌 Full R Code: https://github.com/Practical-ML/marketing-mix-modelling/blob/main/simple-marketing-mix-modelling-in-r.R 23 |
24 |
25 | -------------------------------------------------------------------------------- /simple-marketing-mix-modelling-in-r.R: --------------------------------------------------------------------------------
###################################################
# BUILDING A BASIC MARKETING MIX MODEL USING R
#
# VIEW THE POST ON MEDIUM:
# https://towardsdatascience.com/building-a-marketing-mix-model-in-r-3a7004d21239
###################################################



##################################################
# LOAD DATASET
##################################################
library(datarium)

# Load marketing data from the datarium package and assign it to a data frame
# called "sampledf".
data(marketing)
sampledf <- marketing

# Inspect it. View() needs a data viewer, so it is only called in an
# interactive session; str() always prints the structure.
str(sampledf)
if (interactive()) {
  View(sampledf)
}

##################################
# Checking correlation
##################################

library(PerformanceAnalytics)
chart.Correlation(sampledf, histogram = TRUE, pch = 19)


##################################
# DEFINING ADSTOCK
# Applying Gabriel Mohanna's maximum period decay method
##################################

# Decay weights rate^0, rate^1, ..., rate^memory for one channel.
adstock_weights <- function(rate, memory) {
  rate^(0:memory)
}

# Adstocked version of a spend series.
# The series is zero-padded at the start (one zero per carry-over period),
# convolved with the decay weights, and the NA edge values produced by the
# filter are dropped, leaving one adstocked value per original observation.
apply_adstock <- function(spend, weights) {
  memory <- length(weights) - 1
  padded <- c(rep(0, memory), spend)
  adstocked <- stats::filter(padded, weights, method = "convolution")
  adstocked[!is.na(adstocked)]
}

# Raw spend as vertical bars with the adstocked series overlaid in blue.
plot_adstock <- function(spend, adstocked, channel) {
  plot(seq_along(spend), spend, type = "h",
       main = paste("Adstocked", channel),
       xlab = "Time (Weeks)", ylab = channel,
       ylim = c(0, max(c(spend, adstocked))),
       frame.plot = FALSE)
  lines(adstocked, col = "blue")
}

# Number of carry-over periods, shared by all three channels
set_memory <- 2

# set adstock fb rate
set_rate_fb <- 0.1
get_adstock_fb <- adstock_weights(set_rate_fb, set_memory)

# set adstock youtube rate
set_rate_yt <- 0.15
get_adstock_youtube <- adstock_weights(set_rate_yt, set_memory)

# set adstock news rate
set_rate_news <- 0.25
get_adstock_news <- adstock_weights(set_rate_news, set_memory)


# adstocked fb
ads_fb <- apply_adstock(sampledf$facebook, get_adstock_fb)
plot_adstock(sampledf$facebook, ads_fb, "Facebook")


# adstocked youtube
ads_youtube <- apply_adstock(sampledf$youtube, get_adstock_youtube)
plot_adstock(sampledf$youtube, ads_youtube, "Youtube")


# adstocked newspaper
ads_news <- apply_adstock(sampledf$newspaper, get_adstock_news)
plot_adstock(sampledf$newspaper, ads_news, "Newspaper")

##################################
# BUILD MARKETING MIX MODEL USING MULTIPLE REGRESSION
##################################

# We are specifying sales as the dependent variable in the lm() function;
# the three adstocked series are the predictors.
mmm_1 <- lm(sampledf$sales ~ ads_youtube + ads_fb + ads_news)
summary(mmm_1)

# Check for multicollinearity using VIFs
library(mctest)
imcdiag(mmm_1, method = "VIF")

# or use jtools
# library(jtools)
# summ(mmm_1, vifs=TRUE)

# Check for heteroscedasticity.
# First, plot the model and review the Residuals vs Fitted plot and the
# Scale-Location plot.
par(mfrow = c(2, 2)) # put all 4 charts into 1 page
plot(mmm_1)

# Confirm with an objective test for heteroscedasticity using the
# Breusch-Pagan test and the NCV test
library(lmtest)
lmtest::bptest(mmm_1)

library(car)
car::ncvTest(mmm_1)

# H0: the variance in the model is homoskedastic (what we want).
# Both returned p-values higher than significance level 0.05,
# so we can't reject the null and can say that there are no major issues
# with heteroscedasticity.

##################################
# BUILD MARKETING MIX MODEL WITH TREND AND SEASONALITY USING TIMESERIES
##################################

# Create timeseries
library(forecast)

# Frequency is 52 to denote weekly data, as there are about 52 weeks in a year.
# decompose() below needs a minimum of 2 full periods (52 x 2 = 104 weeks);
# our data has observations from 200 weeks so this should be sufficient.
ts_sales <- ts(sampledf$sales, start = 1, frequency = 52)

# check class. should state "ts"
class(ts_sales)

# decompose to get the individual components for trend, seasonality, etc
ts_sales_comp <- decompose(ts_sales)

# plot out
plot(ts_sales_comp)

# We use tslm() for our regression.
# This is just a timeseries wrapper for lm() but allows trend and seasons on
# the fly from the data.
# https://www.rdocumentation.org/packages/forecast/versions/8.16/topics/tslm
# Just specify "trend" and "season" and tslm() will automatically generate
# values based on the ts() object you have specified.

# fit the model
mmm_2 <- tslm(ts_sales ~ trend + season + ads_youtube + ads_fb + ads_news)
summary(mmm_2)


# We want to forecast using our model,
# i.e. if we were to spend x1 in youtube, x2 in facebook and 0 in newspaper
# for the next period, what would this look like?
# We first need to create a data frame containing the new figures. Its column
# names must match the predictor names used in the model formula.

# Start from the (adstocked) newspaper spend ...
news_spend <- data.frame(ads_news = ads_news)

# ... and give 40% of it to Youtube.
# This is added to the current youtube spend budget (assuming the youtube
# budget for the next period is the same as it has been for the previous one).
yt_spend <- data.frame(ads_youtube = ads_youtube + news_spend$ads_news * 0.4)
yt_spend


# Give the remaining 60% of newspaper spend to facebook.
# This is added to the current fb spend budget (assuming the fb budget for the
# next period is the same as it has been for the previous one).
fb_spend <- data.frame(ads_fb = ads_fb + news_spend$ads_news * 0.6)
fb_spend


# Leave nothing for newspapers (we are switching it off for the next period)
final_news_spend <- data.frame(ads_news = news_spend$ads_news * 0)
final_news_spend

# Now put these new values all into one data frame.
# We'll use the model to predict sales for the next period based on these new
# budget allocation values.
new_spends <- cbind(yt_spend, fb_spend, final_news_spend)
new_spends


library(ggfortify)
par(mfrow = c(1, 1)) # reset to 1 chart per page
set.seed(9999)

# What performance looks like with no change.
# NOTE(review): no newdata is passed here, so the adstock regressors are
# presumably resolved from the workspace vectors (200 values each), which is
# why h = 200 - confirm against forecast::forecast.lm.
forecast_unchanged <- forecast(mmm_2, h = 200)
ggplot2::autoplot(
  forecast_unchanged,
  ts.colour = "black", size = 0.7,
  predict.size = 0.7, predict.colour = "red",
  conf.int = TRUE, conf.int.fill = "red",
  main = "Forecasted", predict.linetype = "dashed"
)

# Forecast with budget changes
forecast_new_spends <- forecast(mmm_2, newdata = new_spends)
ggplot2::autoplot(
  forecast_new_spends,
  ts.colour = "black", size = 0.7,
  predict.size = 0.7, predict.colour = "blue",
  conf.int = TRUE, conf.int.fill = "blue",
  main = "Forecasted"
)


# Overlay the two forecasts using autolayer(); forecast_unchanged from above
# is reused rather than recomputed.
ggplot2::autoplot(
  forecast_unchanged,
  ts.colour = "black", size = 0.7,
  predict.size = 0.7, predict.colour = "red",
  conf.int = TRUE, conf.int.fill = "red",
  main = "Forecasted", predict.linetype = "dashed"
) +
  forecast::autolayer(forecast_new_spends, col = "blue")

# Get fitted values
# Finally, you can access fitted values from the model by querying
# forecast_new_spends$fitted



##################################
# THE END!!
211 | ################################## 212 | -------------------------------------------------------------------------------- /wrangling-messy-data-full-code.R: --------------------------------------------------------------------------------
###################################################
# CLEANING AND PREPARING MESSY MARKETING DATA
# BEFORE PERFORMING MARKETING MIX MODELLING
#
# VIEW THE POST ON MEDIUM:
# https://towardsdatascience.com/cleaning-and-preparing-marketing-data-in-r-prior-to-machine-learning-or-analysis-ec1a12079f1
###################################################




##################################################
# IMPORT DATA
##################################################

# Get marketing data (first attempt, with read.csv()'s default "," separator)
marketing_df <- read.csv(
  "MarketingReportCoreCSV.csv",
  fileEncoding = "UTF-8-BOM",
  header = TRUE
)

# Let's look at it. View() needs a data viewer, so only call it interactively.
if (interactive()) {
  View(marketing_df)
}

# Yuck. Needs semicolon separators, so read the file again with sep = ";"
marketing_df <- read.csv(
  "MarketingReportCoreCSV.csv",
  sep = ";",
  fileEncoding = "UTF-8-BOM",
  header = TRUE
)

# Check again
if (interactive()) {
  View(marketing_df)
}


# K.
# Now do the same for transactions
orders_df <- read.csv(
  "WebTransactionsCSV.csv",
  sep = ";",
  fileEncoding = "UTF-8-BOM",
  header = TRUE
)

# Open a data frame in the data viewer, but only in an interactive session:
# a bare View() call can fail where no viewer is available (e.g. Rscript on a
# headless machine). Returns its argument invisibly.
view_if_interactive <- function(x) {
  if (interactive()) {
    View(x, title = deparse(substitute(x)))
  }
  invisible(x)
}

# Check again
view_if_interactive(orders_df)


##################################################
# PROCESS MARKETING DATA
##################################################

# Keep only the columns we care about (by position in the raw report)
desired_columns <- c(1, 4, 7)
marketing_df_clean <- marketing_df[desired_columns]

# Check
view_if_interactive(marketing_df_clean)

# check class is still data frame
class(marketing_df_clean)

# rename column names
colnames(marketing_df_clean) <- c("date", "channel", "spend")

# Check
str(marketing_df_clean)


# force lowercase channel character column
marketing_df_clean$channel <- tolower(marketing_df_clean$channel)

# check all unique channel names specified in the channel column
unique(marketing_df_clean$channel)

# Rename channels, applied in this order:
#   "not tracked"                -> "direct"
#   "unpaid"                     -> "organic"
#   "silverpop"                  -> "email"
#   "facebookbusinessadsmanager" -> "facebook" (there is no other FB activity
#                                   in here)
# The patterns are literal text, hence fixed = TRUE.
marketing_channel_renames <- c(
  "not tracked" = "direct",
  "unpaid" = "organic",
  "silverpop" = "email",
  "facebookbusinessadsmanager" = "facebook"
)
for (old_name in names(marketing_channel_renames)) {
  marketing_df_clean$channel <- gsub(
    old_name,
    marketing_channel_renames[[old_name]],
    marketing_df_clean$channel,
    fixed = TRUE
  )
}

# check
view_if_interactive(marketing_df_clean)


# set dates to year-month-day using ymd() from the lubridate library
library(lubridate)
marketing_df_clean$date <- ymd(marketing_df_clean$date)

# now check class. must be Date
class(marketing_df_clean$date)

# check all column formats
str(marketing_df_clean)

# The spend column is a string for some reason.
# It uses commas as the decimal mark, which must become "." before conversion.
marketing_df_clean$spend <- as.numeric(
  gsub(",", ".", marketing_df_clean$spend, fixed = TRUE)
)

# change all other integer columns to numeric for consistency
# marketing_df_clean[,4:7] <- sapply(marketing_df_clean[,4:7], as.numeric)

# check column formats
str(marketing_df_clean)

# check again.
view_if_interactive(marketing_df_clean)

# Looks like lots of duplicates.
# Some of these rows have been segmented by the columns we've dropped
# previously (e.g. segment channel by mobile and desktop),
# so we just need to remember to sum spend by date and group by channel.
marketing_df_clean <- aggregate(
  spend ~ date + channel,
  data = marketing_df_clean,
  sum
)



##################################################
# PROCESS ORDERS DATA
##################################################

# we've loaded this previously. remind ourselves what it looks like
view_if_interactive(orders_df)

# Keep only the columns we care about (by position in the raw export)
needed_columns <- c(2, 4, 18)
orders_df_clean <- orders_df[needed_columns]

# check class is still data frame
class(orders_df_clean)

# check
view_if_interactive(orders_df_clean)

# rename column names
colnames(orders_df_clean) <- c("date", "channel", "revenue")

# force lowercase channel character column
orders_df_clean$channel <- tolower(orders_df_clean$channel)

# check all unique channel names specified in the channel column
unique(orders_df_clean$channel)

# rename "notset" to "direct"
orders_df_clean$channel <- gsub(
  "notset", "direct", orders_df_clean$channel, fixed = TRUE
)

# rename "silverpop" to "email"
orders_df_clean$channel <- gsub(
  "silverpop", "email", orders_df_clean$channel, fixed = TRUE
)

# check values are correct
head(orders_df_clean)

# Date values have "T00:00:00" appended. This is unnecessary, so strip it.
orders_df_clean$date <- gsub(
  "T00:00:00", "", orders_df_clean$date, fixed = TRUE
)

# now set the date column to ymd
orders_df_clean$date <- ymd(orders_df_clean$date)

# check format is correct
str(orders_df_clean)

# Revenue needs to be numeric for consistency.
# NOTE(review): unlike spend, no comma-to-decimal conversion is applied here;
# presumably revenue already uses "." decimals - confirm against the CSV.
orders_df_clean$revenue <- as.numeric(orders_df_clean$revenue)

# check format is correct
str(orders_df_clean)

# check the data
view_if_interactive(orders_df_clean)

# Looks like lots of duplicates.
# Some of these rows have been segmented by the columns we've dropped
# previously (e.g. segment channel by mobile and desktop),
# so we just need to remember to sum revenue by date and group by channel.
orders_df_clean <- aggregate(
  revenue ~ date + channel,
  data = orders_df_clean,
  sum
)

# check the data
view_if_interactive(orders_df_clean)



##################################################
# TURN DAILY DATA INTO WEEKLY
##################################################

# Sorting out the marketing data first

# Create a new column called week holding a year-qualified week key (YYYYWW).
# e.g. 2020-03-15 falls on week 11 of year 2020, giving the key 202011.
# https://www.timeanddate.com/date/weeknumber.html
# The year is part of the key because lubridate::week() on its own restarts
# every year, which would sum the same week number of different years together.
marketing_df_clean$week <- lubridate::year(marketing_df_clean$date) * 100 +
  lubridate::week(marketing_df_clean$date)

# create new column called monthyear, derived from date as "MM-YYYY"
marketing_df_clean$monthyear <- format(
  as.Date(marketing_df_clean$date), "%m-%Y"
)

# save to a new df that we will prepare for weekly data,
# with the columns reordered for sanity
marketing_df_weekly <- marketing_df_clean[
  , c("week", "monthyear", "date", "channel", "spend")
]


# view it
view_if_interactive(marketing_df_weekly)


# Now prep the orders data

# Same year-qualified week key (YYYYWW) as for the marketing data,
# e.g. 2020-03-15 gives 202011.
# https://www.timeanddate.com/date/weeknumber.html
orders_df_clean$week <- lubridate::year(orders_df_clean$date) * 100 +
  lubridate::week(orders_df_clean$date)

# create new column called monthyear, derived from date as "MM-YYYY"
orders_df_clean$monthyear <- format(as.Date(orders_df_clean$date), "%m-%Y")

# save to a new df that we will prepare for weekly data,
# with the columns reordered for sanity
orders_df_weekly <- orders_df_clean[
  , c("week", "monthyear", "date", "channel", "revenue")
]

# check figures
view_if_interactive(orders_df_weekly)


# NOW MERGE!
# Joins on the columns both tables share. merge() keeps only the rows whose
# key appears in both tables (all = FALSE is the default).
# NOTE(review): date/channel pairs with spend but no revenue (or vice versa)
# are therefore dropped - confirm that is intended.
weekly_df <- merge(
  marketing_df_weekly,
  orders_df_weekly,
  by = c("week", "monthyear", "date", "channel")
)
view_if_interactive(weekly_df)


# get spend per week by channel and save it in its own dataframe
weekly_spend_df <- aggregate(spend ~ week + channel, data = weekly_df, sum)
colnames(weekly_spend_df)[3] <- "weeklyspend"
view_if_interactive(weekly_spend_df)

# get revenue per week and save it in its own dataframe
weekly_rev_df <- aggregate(revenue ~ week, data = weekly_df, sum)
# rename column names
colnames(weekly_rev_df)[2] <- "totalrevenueforweek"
view_if_interactive(weekly_rev_df)


# just want week and date, saved to a data frame called weekly_df_dates
weekly_df_dates <- weekly_df[c("week", "date")]
view_if_interactive(weekly_df_dates)

# remove duplicate rows: keep the first row seen for each week
weekly_df_dates <- weekly_df_dates[!duplicated(weekly_df_dates[c("week")]), ]
colnames(weekly_df_dates)[2] <- "weekdatestart"
view_if_interactive(weekly_df_dates)


# begin merge
# merge() only allows joining 2 dataframes at a time.

# so here's the first merge
weekly_df_updated <- merge(weekly_spend_df, weekly_rev_df, by = "week")
view_if_interactive(weekly_df_updated)

# now the 2nd merge
weekly_df_updated <- merge(weekly_df_dates, weekly_df_updated, by = "week")

# drop week column. we no longer need it
weekly_df_updated <- weekly_df_updated[
  , setdiff(names(weekly_df_updated), "week")
]
view_if_interactive(weekly_df_updated)

str(weekly_df_updated)

# now view it
view_if_interactive(weekly_df_updated)


# finally turn long data to wide: one weeklyspend column per channel
weekly_reshaped_channel <- reshape(
  weekly_df_updated,
  idvar = c("weekdatestart", "totalrevenueforweek"),
  timevar = "channel",
  direction = "wide"
)
view_if_interactive(weekly_reshaped_channel)


# view the data and check for NA or missing values
print(weekly_reshaped_channel)

# if found, replace any NA with 0
weekly_reshaped_channel[is.na(weekly_reshaped_channel)] <- 0

str(weekly_reshaped_channel)
view_if_interactive(weekly_reshaped_channel)

# data is now FINALLY READY!!

282 | --------------------------------------------------------------------------------