├── README.md
├── simple-marketing-mix-modelling-in-r.R
└── wrangling-messy-data-full-code.R
/README.md:
--------------------------------------------------------------------------------
1 | # marketing-mix-modelling
2 | Discussing the process of marketing mix modelling (MMM) and experimenting with it - with code examples
3 |
4 |
Read the articles 👇
5 |
6 | 1. Cleaning and preparing marketing data in R prior to ML and analysis
7 | A basic, step-by-step guide on cleaning typically messy marketing data in R.
8 | 📙 https://towardsdatascience.com/cleaning-and-preparing-marketing-data-in-r-prior-to-machine-learning-or-analysis-ec1a12079f1
9 |
10 |
11 | Code Examples:
12 | 📌 Full R Code: https://github.com/Practical-ML/marketing-mix-modelling/blob/main/wrangling-messy-data-full-code.R
13 |
14 |
15 | -----
16 | 2. Building a marketing mix model in R
17 | We review the difference between marketing mix modelling (MMM) and multi-touch attribution (MTA), and then build a simple marketing mix model in R!
18 | 📙 https://towardsdatascience.com/building-a-marketing-mix-model-in-r-3a7004d21239
19 |
20 |
21 | Code Examples:
22 | 📌 Full R Code: https://github.com/Practical-ML/marketing-mix-modelling/blob/main/simple-marketing-mix-modelling-in-r.R
23 |
24 |
25 |
--------------------------------------------------------------------------------
/simple-marketing-mix-modelling-in-r.R:
--------------------------------------------------------------------------------
1 | ###################################################
2 | #BUILDING A BASIC MARKETING MIX MODEL USING R
3 |
4 | #VIEW THE POST ON MEDIUM:
5 | # https://towardsdatascience.com/building-a-marketing-mix-model-in-r-3a7004d21239
6 | ###################################################
7 |
8 |
9 |
10 | ##################################################
11 | #LOAD DATASET
12 | ##################################################
13 | library(datarium)
14 |
15 | #Load the "marketing" dataset from the datarium package and copy it into a dataframe called "sampledf".
16 | data(marketing)
17 | sampledf <- marketing
18 |
19 | #Inspect its structure in the console, then open it in the data viewer
20 | str(sampledf)
21 | View(sampledf)
22 |
23 | ##################################
24 | #Checking correlation
25 | ##################################
26 | #Correlation chart across all columns of sampledf, with histograms switched on
27 | library(PerformanceAnalytics)
28 | chart.Correlation(sampledf, histogram = TRUE, pch=19)
29 |
30 |
31 | ##################################
32 | #DEFINING ADSTOCK
33 | #Applying Gabriel Mohanna's maximum period decay method
34 | ##################################
35 |
36 | #Adstock settings: one decay rate per channel (facebook, youtube, newspaper)
37 | set_rate_fb <- 0.1
38 | set_rate_yt <- 0.15
39 | set_rate_news <- 0.25
40 |
41 | #memory = number of previous periods that still carry over (same for all channels)
42 | set_memory <- 2
43 |
44 | #decay weights rate^0, rate^1, ..., rate^set_memory for each channel
45 | #(the scalar rate is recycled across the vector of exponents)
46 | get_adstock_fb <- set_rate_fb ^ (0:set_memory)
47 | get_adstock_youtube <- set_rate_yt ^ (0:set_memory)
48 | get_adstock_news <- set_rate_news ^ (0:set_memory)
49 |
50 |
51 |
52 | #adstocked fb: pad the spend with set_memory leading zeros and convolve it with the decay weights
53 | ads_fb <- stats::filter(c(numeric(set_memory), sampledf$facebook),
54 |                         filter = get_adstock_fb, method = "convolution")
55 | ads_fb <- ads_fb[!is.na(ads_fb)] #keep only the non-NA values
56 | plot(seq_along(sampledf$facebook), sampledf$facebook, type = "h",
57 |      main = "Adstocked Facebook",
58 |      xlab = "Time (Weeks)", ylab = "Facebook",
59 |      ylim = c(0, max(sampledf$facebook, ads_fb)),
60 |      frame.plot = FALSE)
61 | lines(ads_fb, col = "blue") #adstocked series drawn over the raw spend bars
62 |
63 |
64 | #adstocked youtube: same padding and convolution, using the youtube decay weights
65 | ads_youtube <- stats::filter(c(numeric(set_memory), sampledf$youtube),
66 |                              filter = get_adstock_youtube, method = "convolution")
67 | ads_youtube <- ads_youtube[!is.na(ads_youtube)] #keep only the non-NA values
68 | plot(seq_along(sampledf$youtube), sampledf$youtube, type = "h",
69 |      main = "Adstocked Youtube",
70 |      xlab = "Time (Weeks)", ylab = "Youtube",
71 |      ylim = c(0, max(sampledf$youtube, ads_youtube)),
72 |      frame.plot = FALSE)
73 | lines(ads_youtube, col = "blue") #adstocked series drawn over the raw spend bars
74 |
75 |
76 | #adstocked newspaper: same padding and convolution, using the newspaper decay weights
77 | ads_news <- stats::filter(c(numeric(set_memory), sampledf$newspaper),
78 |                           filter = get_adstock_news, method = "convolution")
79 | ads_news <- ads_news[!is.na(ads_news)] #keep only the non-NA values
80 | plot(seq_along(sampledf$newspaper), sampledf$newspaper, type = "h",
81 |      main = "Adstocked Newspaper",
82 |      xlab = "Time (Weeks)", ylab = "Newspaper",
83 |      ylim = c(0, max(sampledf$newspaper, ads_news)),
84 |      frame.plot = FALSE)
85 | lines(ads_news, col = "blue") #adstocked series drawn over the raw spend bars
86 |
87 | ##################################
88 | #BUILD MARKETING MIX MODEL USING MULTIPLE REGRESSION
89 | ##################################
90 |
91 | #Regress sales (the dependent variable) on the three adstocked channel series using lm()
92 | mmm_1 <- lm(sampledf$sales ~ ads_youtube + ads_fb + ads_news)
93 | summary(mmm_1)
94 |
95 | #Check for multicollinearity using variance inflation factors (VIFs)
96 | library(mctest)
97 | imcdiag(mmm_1, method = "VIF")
98 |
99 | #or use jtools
100 | #library(jtools)
101 | #summ(mmm_1, vifs=TRUE)
102 |
103 | #check for heteroscedasticity
104 | #first, plot the model and review the Residuals vs Fitted plot and the Scale-Location plot
105 | par(mfrow=c(2,2)) # put all 4 charts into 1 page
106 | plot(mmm_1)
107 |
108 | #Confirm with objective tests for heteroscedasticity: the Breusch-Pagan test and the NCV test
109 | library(lmtest)
110 | lmtest::bptest(mmm_1)
111 |
112 | library(car)
113 | car::ncvTest(mmm_1)
114 |
115 | #h0: the variance in the model is homoskedastic (what we want).
116 | #Both returned p-values higher than significance level 0.05,
117 | #so we can't reject the null and can say that there are no major issues with heteroscedasticity.
118 |
119 | ##################################
120 | #BUILD MARKETING MIX MODEL WITH TREND AND SEASONALITY USING TIMESERIES
121 | ##################################
122 |
123 | #Create timeseries
124 | library(forecast)
125 |
126 | #frequency is 52 to denote weekly data, as there are about 52 weeks in a year.
127 | #decompose() (used below) needs at least 2 full periods (52 x 2 = 104 weeks);
128 | #our data has 200 weekly observations, so this should be sufficient
129 | ts_sales <- ts(sampledf$sales, start = 1, frequency = 52)
130 |
131 | #check class. should state "ts"
132 | class(ts_sales)
133 |
134 | #decompose to get the individual components (trend, seasonality, etc.)
135 | ts_sales_comp <- decompose(ts_sales)
136 |
137 | #plot the decomposed components
138 | plot(ts_sales_comp)
139 |
140 | #we use tslm() for our regression.
141 | #this is a timeseries wrapper for lm() that can also generate trend and season terms on the fly from the data
142 | #https://www.rdocumentation.org/packages/forecast/versions/8.16/topics/tslm
143 | #just specify "trend" and "season" in the formula and tslm() will automatically generate their values from the ts() object used as the response.
144 |
145 | #fit the model: sales explained by trend, season and the three adstocked channel series
146 | mmm_2 <- tslm(ts_sales ~ trend + season + ads_youtube + ads_fb + ads_news)
147 | summary(mmm_2)
148 |
149 |
150 | #Scenario to forecast: newspaper is switched off for the next period and its
151 | #(adstocked) spend is moved to the other channels - 40% to youtube, 60% to facebook.
152 | #The model needs these what-if values supplied as a dataframe, built up below.
153 |
154 | #the newspaper spend that is being reallocated
155 | news_spend <- data.frame(ads_news = ads_news)
156 |
157 |
158 | #youtube: current spend plus 40% of the newspaper spend
159 | #(assuming the youtube budget for the next period is otherwise the same
160 | #as it has been for the previous period)
161 | yt_spend <- data.frame(
162 |   ads_youtube = ads_youtube + 0.4 * news_spend$ads_news
163 | )
164 | yt_spend
165 |
166 |
167 | #facebook: current spend plus the remaining 60% of the newspaper spend
168 | #(assuming the fb budget for the next period is otherwise the same
169 | #as it has been for the previous period)
170 | fb_spend <- data.frame(
171 |   ads_fb = ads_fb + 0.6 * news_spend$ads_news
172 | )
173 | fb_spend
174 |
175 |
176 | #newspaper: nothing left (we are switching it off for the next period),
177 | #so every value becomes 0 while the column keeps the name ads_news
178 | final_news_spend <- news_spend * 0
179 | final_news_spend
180 |
181 | #now put these new values side by side in one dataframe.
182 | #We'll use the model to predict sales for the next period based on these new budget allocation values
183 | new_spends <- cbind(yt_spend, fb_spend, final_news_spend)
184 | new_spends
185 |
186 |
187 | library(ggfortify)
188 | par(mfrow=c(1,1)) # reset to 1 chart per page
189 | set.seed(9999) # NOTE(review): no random number generation is visible in the statements below - confirm the seed is needed
190 |
191 | #what performance looks like with no change (forecast horizon h = 200 periods)
192 | forecast_unchanged <- forecast(mmm_2, h=200)
193 | ggplot2::autoplot(forecast_unchanged, ts.colour = 'black', size= 0.7, predict.size = 0.7, predict.colour = 'red', conf.int = TRUE, conf.int.fill = 'red', main = "Forecasted", predict.linetype='dashed')
194 |
195 | #forecast with budget changes: the new spend values are passed to the model as newdata
196 | forecast_new_spends <- forecast(mmm_2, newdata=new_spends)
197 | ggplot2::autoplot(forecast_new_spends, ts.colour = 'black', size= 0.7, predict.size = 0.7, predict.colour = 'blue', conf.int = TRUE, conf.int.fill = 'blue', main = "Forecasted")
198 |
199 | #NOTE(review): the forecast() call below repeats the one on line 192 unchanged; the existing forecast_unchanged could be reused
200 | #overlaying them together using autolayer()
201 | forecast_unchanged <- forecast(mmm_2, h=200)
202 | ggplot2::autoplot(forecast_unchanged, ts.colour = 'black', size= 0.7, predict.size = 0.7, predict.colour = 'red', conf.int = TRUE, conf.int.fill = 'red', main = "Forecasted", predict.linetype='dashed') + forecast::autolayer(forecast_new_spends, col = 'blue')
203 |
204 | #Get fitted values
205 | #Finally, you can access fitted values from the model by querying forecast_new_spends$fitted
206 |
207 |
208 |
209 | ##################################
210 | #THE END!!
211 | ##################################
212 |
--------------------------------------------------------------------------------
/wrangling-messy-data-full-code.R:
--------------------------------------------------------------------------------
1 | ###################################################
2 | #CLEANING AND PREPARING MESSY MARKETING DATA
3 | #BEFORE PERFORMING MARKETING MIX MODELLING
4 |
5 | #VIEW THE POST ON MEDIUM:
6 | # https://towardsdatascience.com/cleaning-and-preparing-marketing-data-in-r-prior-to-machine-learning-or-analysis-ec1a12079f1
7 | ###################################################
8 |
9 |
10 |
11 |
12 | ##################################################
13 | #IMPORT DATA
14 | ##################################################
15 |
16 | #First attempt at reading the marketing data with read.csv()'s default separator, then look at it
17 | marketing_df <- read.csv("MarketingReportCoreCSV.csv", header = TRUE, fileEncoding = "UTF-8-BOM")
18 | View(marketing_df)
19 |
20 | #Yuck. The file needs semicolon separators, so read it again and re-check
21 | marketing_df <- read.csv(
22 |   "MarketingReportCoreCSV.csv",
23 |   header = TRUE, sep = ";", fileEncoding = "UTF-8-BOM"
24 | )
25 | View(marketing_df)
26 |
27 |
28 | #K. Now do the same for transactions, and check the result
29 | orders_df <- read.csv(
30 |   "WebTransactionsCSV.csv",
31 |   header = TRUE, sep = ";", fileEncoding = "UTF-8-BOM"
32 | )
33 | View(orders_df)
34 |
35 |
36 | ##################################################
37 | #PROCESS MARKETING DATA
38 | ##################################################
39 |
40 | #Keep only the columns we care about: columns 1, 4 and 7 by position
41 | #(they are renamed to date, channel and spend further down)
42 | desired_columns <-c(1, 4, 7)
43 | marketing_df_clean <- marketing_df[desired_columns]
44 |
45 | #Check
46 | View(marketing_df_clean)
47 |
48 | #check class is still data frame
49 | class(marketing_df_clean)
50 |
51 | #rename the three columns
52 | colnames(marketing_df_clean) <- c("date", "channel", "spend")
53 |
54 | #Check
55 | str(marketing_df_clean)
56 |
57 | #Normalise channel names. gsub() replaces the matched text wherever it occurs inside a channel value.
58 | #force lowercase channel character column
59 | marketing_df_clean$channel <- tolower(marketing_df_clean$channel)
60 |
61 | #check all unique channel names specified in the channel column
62 | unique(marketing_df_clean$channel)
63 |
64 | #rename "not tracked" to "direct"
65 | marketing_df_clean$channel <- gsub("not tracked","direct",marketing_df_clean$channel)
66 |
67 | #rename "unpaid" to "organic"
68 | marketing_df_clean$channel <- gsub("unpaid","organic",marketing_df_clean$channel)
69 |
70 | #rename "silverpop" to "email"
71 | marketing_df_clean$channel <- gsub("silverpop","email",marketing_df_clean$channel)
72 |
73 | #shorten facebookbusinessadsmanager to just "facebook" as there is no other FB activity in here
74 | marketing_df_clean$channel <- gsub("facebookbusinessadsmanager","facebook",marketing_df_clean$channel)
75 |
76 | #check
77 | View(marketing_df_clean)
78 |
79 |
80 | #parse the dates as year-month-day using ymd() from the lubridate library
81 | library(lubridate)
82 | marketing_df_clean$date <- ymd(marketing_df_clean$date)
83 |
84 | #now check class. must be Date
85 | class(marketing_df_clean$date)
86 |
87 | # check all column formats
88 | str(marketing_df_clean)
89 |
90 | #spend column is a string for some reason.
91 | #It uses commas as the decimal mark, so swap them for "." before converting to numeric.
92 | marketing_df_clean$spend <- as.numeric(gsub("," , ".", marketing_df_clean$spend))
93 |
94 | #(disabled) change all other integer columns to numeric for consistency; only 3 columns were kept above, so columns 4:7 are not in this data frame
95 | #marketing_df_clean[,4:7] <- sapply(marketing_df_clean[,4:7], as.numeric)
96 |
97 | # check column formats
98 | str(marketing_df_clean)
99 |
100 | #check again.
101 | View(marketing_df_clean)
102 |
103 | #looks like lots of duplicates.
104 | #Some of these rows have been segmented by the columns we've dropped previously (e.g. segment channel by mobile and desktop).
105 | #so we sum spend for each date and channel combination, leaving one row per date and channel
106 | #NOTE(review): aggregate()'s formula interface omits rows containing NA by default - confirm no spend values failed to convert
107 | marketing_df_clean <- aggregate(spend ~ date + channel, data = marketing_df_clean, sum)
108 |
109 |
110 |
111 | ##################################################
112 | #PROCESS ORDERS DATA
113 | ##################################################
114 |
115 | #we've loaded this previously. remind ourselves what it looks like
116 | View(orders_df)
117 |
118 | #Keep only the columns we care about: columns 2, 4 and 18 by position
119 | #(they are renamed to date, channel and revenue further down)
120 | needed_columns <-c(2, 4, 18)
121 | orders_df_clean <- orders_df[needed_columns]
122 |
123 | #check class is still data frame
124 | class(orders_df_clean)
125 |
126 | #check
127 | View(orders_df_clean)
128 |
129 | #rename the three columns
130 | colnames(orders_df_clean) <- c("date", "channel", "revenue")
131 |
132 | #force lowercase channel character column
133 | orders_df_clean$channel <- tolower(orders_df_clean$channel)
134 |
135 | #check all unique channel names specified in the channel column
136 | unique(orders_df_clean$channel)
137 |
138 | #rename "notset" to "direct"
139 | orders_df_clean$channel <- gsub("notset", "direct", orders_df_clean$channel)
140 |
141 | #rename "silverpop" to "email"
142 | orders_df_clean$channel <- gsub("silverpop", "email", orders_df_clean$channel)
143 |
144 | #check values are correct
145 | head(orders_df_clean)
146 |
147 | #date values have "T00:00:00" appended. This is unnecessary, so let's strip it out.
148 | orders_df_clean$date <- gsub("T00:00:00" , "", orders_df_clean$date)
149 |
150 | #now parse the date column as year-month-day
151 | orders_df_clean$date <- ymd(orders_df_clean$date)
152 |
153 | #check format is correct
154 | str(orders_df_clean)
155 | #NOTE(review): unlike spend in the marketing data, no comma-to-decimal replacement is applied here - confirm revenue has no comma decimals
156 | #revenue needs to be numeric for consistency
157 | orders_df_clean$revenue <- as.numeric(orders_df_clean$revenue)
158 |
159 | #check format is correct
160 | str(orders_df_clean)
161 |
162 | #check the data
163 | View(orders_df_clean)
164 |
165 | #looks like lots of duplicates.
166 | #Some of these rows have been segmented by the columns we've dropped previously (e.g. segment channel by mobile and desktop).
167 | #so we sum revenue for each date and channel combination, leaving one row per date and channel
168 | orders_df_clean <- aggregate(revenue ~ date + channel, data = orders_df_clean, sum)
169 |
170 | #check the data
171 | View(orders_df_clean)
172 |
173 |
174 |
175 | ##################################################
176 | #TURN DAILY DATA INTO WEEKLY
177 | ##################################################
178 |
179 | #Marketing data first
180 | #NOTE(review): week() numbers weeks within a year only; if the data spans more than one year,
181 | #the weekly aggregation further down will pool equal week numbers across years - confirm the date range.
182 | #week of the year, e.g. 2020-03-15 falls on week 11 of year 2020. https://www.timeanddate.com/date/weeknumber.html
183 | marketing_df_clean[["week"]] <- lubridate::week(marketing_df_clean[["date"]])
184 |
185 | #month-year label derived from the date, in "mm-yyyy" form
186 | marketing_df_clean[["monthyear"]] <- format(as.Date(marketing_df_clean[["date"]]), "%m-%Y")
187 |
188 | #work on a separate df that we will prepare for weekly data
189 | marketing_df_weekly <- marketing_df_clean
190 |
191 | #put the time columns first for sanity
192 | marketing_df_weekly <- marketing_df_weekly[, c("week", "monthyear", "date", "channel", "spend")]
193 |
194 |
195 | #view it
196 | View(marketing_df_weekly)
197 |
198 |
199 | #Now the orders data, prepared the same way
200 |
201 | #week of the year, numbered as for the marketing data above
202 | #e.g. 2020-03-15 falls on week 11 of year 2020. https://www.timeanddate.com/date/weeknumber.html
203 | orders_df_clean[["week"]] <- lubridate::week(orders_df_clean[["date"]])
204 |
205 | #month-year label derived from the date, in "mm-yyyy" form
206 | orders_df_clean[["monthyear"]] <- format(as.Date(orders_df_clean[["date"]]), "%m-%Y")
207 |
208 | #work on a separate df that we will prepare for weekly data
209 | orders_df_weekly <- orders_df_clean
210 |
211 | #put the time columns first for sanity
212 | orders_df_weekly <- orders_df_weekly[, c("week", "monthyear", "date", "channel", "revenue")]
213 |
214 | #check figures
215 | View(orders_df_weekly)
216 |
217 |
218 | #NOW MERGE! No "by" is given, so merge() joins on every column name the two frames share and keeps only rows found in both.
219 | weekly_df <- merge(marketing_df_weekly,orders_df_weekly)
220 | View(weekly_df)
221 |
222 |
223 | #get spend per week by channel and save it in its own dataframe
224 | weekly_spend_df <- aggregate(spend ~ week + channel, data = weekly_df, sum)
225 | colnames(weekly_spend_df)[3] <- "weeklyspend"
226 | View(weekly_spend_df)
227 |
228 | #get total revenue per week (summed over all channels) and save it in its own dataframe
229 | weekly_rev_df <- aggregate(revenue ~ week, data = weekly_df, sum)
230 | #rename the revenue column
231 | colnames(weekly_rev_df)[2] <- "totalrevenueforweek"
232 | View(weekly_rev_df)
233 |
234 | #NOTE(review): columns 1 and 3 are selected by position - presumably week and date; confirm against View(weekly_df)
235 | #just want dates and save to data frame called weekly_df_dates
236 | keep_dates <- c(1,3)
237 | weekly_df_dates <- weekly_df[keep_dates]
238 | View(weekly_df_dates)
239 |
240 | #keep only the first row for each week, then rename the second column to "weekdatestart"
241 | weekly_df_dates <- weekly_df_dates[!duplicated(weekly_df_dates[c("week")]),]
242 | colnames(weekly_df_dates)[2] <- "weekdatestart"
243 | View(weekly_df_dates)
244 |
245 |
246 | #begin merge
247 | #merge() only allows joining 2 dataframes at a time.
248 |
249 | #so here's the first merge: weekly spend per channel with total weekly revenue
250 | weekly_df_updated <- merge(weekly_spend_df, weekly_rev_df, by="week" )
251 | View(weekly_df_updated)
252 |
253 | #now the 2nd merge: add the weekdatestart column for each week
254 | weekly_df_updated <- merge(weekly_df_dates, weekly_df_updated, by="week" )
255 |
256 | #drop the first column (week). we no longer need it
257 | weekly_df_updated <-weekly_df_updated[,-1]
258 | View(weekly_df_updated)
259 |
260 | str(weekly_df_updated)
261 |
262 | #now view it
263 | View(weekly_df_updated)
264 |
265 |
266 | #finally turn long data to wide: one row per week and revenue total, one weeklyspend column per channel
267 | weekly_reshaped_channel <- reshape(
268 |   weekly_df_updated,
269 |   direction = "wide",
270 |   timevar = "channel",
271 |   idvar = c("weekdatestart", "totalrevenueforweek")
272 | )
273 | View(weekly_reshaped_channel)
274 | #print the data and check for NA or missing values
275 | print(weekly_reshaped_channel)
276 | #if found, replace any NA with 0
277 | weekly_reshaped_channel[is.na(weekly_reshaped_channel)] <- 0
278 | str(weekly_reshaped_channel)
279 | View(weekly_reshaped_channel)
280 | #data is now FINALLY READY!!
281 |
282 |
--------------------------------------------------------------------------------