#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Multi Channel Attribution Based On Markov Graph and Removal Effects

Repository layout:
    README.md
    advertising_attribution_spark.py

Created on Thu Feb 9 13:50:01 2017

@author: zhouyonglong
"""
import numpy as np
import pandas as pd
from collections import defaultdict
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
import re


from pyspark import SparkContext,SparkConf
from pyspark.sql.functions import unix_timestamp
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType,StructType,IntegerType,TimestampType
from pyspark.sql.functions import UserDefinedFunction


def create_path(element):
    """Build the state path ['start', <channels...>, outcome] for one key.

    Parameters
    ----------
    element : sequence
        A ``(key, visits)`` pair; only ``element[1]`` is read.  Every visit
        must be indexable: ``visit[1]`` is appended to the path as the
        channel, ``visit[2]`` is summed to decide whether a conversion
        happened, and ``visit[3]`` is the sort key (presumably the visit
        timestamp -- confirm against the caller).

    Returns
    -------
    list
        ``'start'``, then the channel of every visit in ascending
        ``visit[3]`` order, then ``'conversion'`` if the summed
        ``visit[2]`` is positive, otherwise ``'null'``.
    """
    visits = sorted(element[1], key=lambda x: x[3])

    path = ['start']
    path.extend(x[1] for x in visits)

    converted = sum(x[2] for x in visits) > 0
    path.append('conversion' if converted else 'null')

    return path


def create_path_with_value(element):
    """Build the same path as ``create_path`` with the total value appended.

    Parameters
    ----------
    element : sequence
        Same ``(key, visits)`` pair accepted by ``create_path``; in addition
        ``visit[4]`` is summed over all visits as the value of the path.

    Returns
    -------
    list
        The ``create_path`` result followed by ``str(total_value)`` as the
        last item, so the whole path stays a flat list.
    """
    # Reuse create_path instead of repeating the sorting/outcome logic.
    path = create_path(element)

    value = sum(x[4] for x in element[1])
    path.append(str(value))

    return path

| 71 | 72 | def create_last_click_stats_pair(element): 73 | visits = element[1] 74 | visits = sorted(visits,key=lambda x:x[3],reverse=True) 75 | 76 | conversion = sum(x[2] for x in visits)>0 77 | if conversion: 78 | result = [] 79 | channel = visits[0][1] 80 | value = sum(x[4] for x in visits) 81 | result.append((channel,value)) 82 | return result 83 | else: 84 | return [] 85 | 86 | 87 | def create_linear_click_stats_pair(element): 88 | visits = element[1] 89 | visits = sorted(visits,key=lambda x:x[3]) 90 | 91 | conversion = sum(x[2] for x in visits)>0 92 | if conversion: 93 | result = [] 94 | value = sum(x[4] for x in visits) 95 | for channel in [x[1] for x in visits]: 96 | result.append((channel,value/len(visits))) 97 | return result 98 | else: 99 | return [] 100 | 101 | 102 | 103 | def create_first_order_states_pair(path): 104 | if len(path)<3: 105 | return [] 106 | else: 107 | result = [] 108 | 109 | for i in range(0,len(path)-1): 110 | from_state = path[i] 111 | to_state = path[i+1] 112 | result.append(((from_state,to_state),1)) 113 | 114 | return result 115 | 116 | 117 | def create_first_order_states_pair_with_value(path): 118 | if len(path)<3: 119 | return [] 120 | else: 121 | result = [] 122 | value = float(path[-1]) 123 | for i in range(0,len(path)-2): 124 | from_state = path[i] 125 | to_state = path[i+1] 126 | result.append(((from_state,to_state),value)) 127 | 128 | return result 129 | 130 | 131 | 132 | channels = ["banner","text","keyword","link","video","mobile",'unknown'] 133 | 134 | channels_dict = {'01':'banner', 135 | '02':'text', 136 | '06':'keyword', 137 | '07':'link', 138 | '08':'video', 139 | '09':'mobile', 140 | '10':'mobile', 141 | '11':'mobile'} 142 | 143 | columns = channels.copy() 144 | columns.insert(0,"start") 145 | columns.append("null") 146 | columns.append("conversion") 147 | 148 | 149 | 150 | def create_first_order_markov_model(states_pair): 151 | states = defaultdict() 152 | states["start"]=0 153 | 154 | channel_index=0 155 | for 
channel in channels: 156 | channel_index=channel_index+1 157 | states[channel]=channel_index 158 | 159 | states["null"]=len(channels)+1 160 | states["conversion"]=len(channels)+2 161 | num_states = len(states) 162 | 163 | table = pd.DataFrame(np.zeros((num_states,num_states)), 164 | columns=columns, 165 | index=columns) 166 | 167 | for item in states_pair: 168 | from_state = item[0][0] 169 | to_state = item[0][1] 170 | value = item[1] 171 | 172 | row = states[from_state] 173 | column = states[to_state] 174 | table.iloc[row][column] = value 175 | 176 | # remove same-state transitions 177 | for r in range(0,num_states): 178 | prior_state = columns[r] 179 | for c in range(0,num_states): 180 | current_state = columns[c] 181 | if prior_state in channels and prior_state!=current_state and current_state!='start': 182 | table.iloc[r][c] = table.iloc[r][c] 183 | elif prior_state=='start' and current_state in channels: 184 | table.iloc[r][c] = table.iloc[r][c] 185 | else: 186 | table.iloc[r][c] = 0 187 | 188 | #normalize 189 | for r in range(0,num_states): 190 | prior_state = columns[r] 191 | if prior_state in channels or prior_state=='start': 192 | row_sum = sum([x for x in table.iloc[r]]) 193 | for c in range(0,num_states): 194 | table.iloc[r][c] = table.iloc[r][c] / row_sum 195 | 196 | table = table.fillna(0) 197 | 198 | return table 199 | 200 | 201 | # calculate orders count 202 | get_orders = UserDefinedFunction(lambda x:0 if x=='' else int(str.replace(x,',','')), 203 | IntegerType()) 204 | 205 | # calculate orders count 206 | get_revenue = UserDefinedFunction(lambda x:0 if x=='' else int(str.replace(x,',','')), 207 | IntegerType()) 208 | 209 | 210 | def get_channel(campaign): 211 | channel_type = campaign[-2:] 212 | if channel_type in channels_dict.keys(): 213 | return channels_dict[channel_type] 214 | else: 215 | return 'unknown' 216 | 217 | map_campaign = UserDefinedFunction(lambda x:get_channel(x), 218 | StringType()) 219 | 220 | 221 | 222 | conf = SparkConf() 223 | 
# Local import: explicit aggregate expressions are used in the groupBy below.
from pyspark.sql import functions as F

conf.set("spark.app.name", "channel_attribution_spark")
#conf.set("spark.master", "spark://192.168.42.141:7077")
conf.set("spark.master", "local[4]")


session = SparkSession.builder.config(conf=conf).getOrCreate()


# Every input column is read as a nullable string; the numeric columns are
# parsed further down with the get_orders / get_revenue UDFs.
schema = StructType()
for field_name in ("TrackingID", "IdentityNumber", "VisitNumber",
                   "HitTimestring", "Campaign", "Type", "PhoneNumber",
                   "Email", "ReferrerDomain", "Orders", "Revenue"):
    schema = schema.add(field_name, StringType(), True)


# clicks_test_users.txt
# raw_clicks.csv
raw_clicks = session.read.csv("./web/*.txt",
                              schema=schema,
                              sep='\t')

# Drop rows whose key fields carry the 'None' placeholder or are unusable.
raw_clicks = raw_clicks.filter(raw_clicks.TrackingID != 'None')

raw_clicks = raw_clicks.filter(raw_clicks.HitTimestring != 'None')
raw_clicks = raw_clicks.filter(raw_clicks.HitTimestring != '0')
raw_clicks = raw_clicks.dropna(subset=['HitTimestring'])

raw_clicks = raw_clicks.filter(raw_clicks.ReferrerDomain != 'None')
raw_clicks = raw_clicks.filter(raw_clicks.ReferrerDomain != 'Internal Domain')

raw_clicks = raw_clicks.filter(raw_clicks.Revenue != 'None')
raw_clicks = raw_clicks.filter(raw_clicks.Campaign != 'None')


clicks = raw_clicks.select('TrackingID','VisitNumber','HitTimestring',
                           'Campaign','Orders','Revenue')

# Each select below replaces one column in place and keeps the others.
clicks = clicks.select(*[map_campaign(column).alias('Campaign')
                         if column == 'Campaign' else column
                         for column in clicks.columns])

clicks = clicks.select(*[unix_timestamp(column,'yyyy-MM-dd HH:mm:ss.SSS').alias('VisitDatetime')
                         if column == 'HitTimestring' else column
                         for column in clicks.columns])

clicks = clicks.select(*[get_orders(column).alias('Orders')
                         if column == 'Orders' else column
                         for column in clicks.columns])

clicks = clicks.select(*[get_revenue(column).alias('Revenue')
                         if column == 'Revenue' else column
                         for column in clicks.columns])


# The rows are read by POSITION below, so the aggregates are listed
# explicitly: a dict passed to agg() does not guarantee the order of the
# resulting columns.
visits = clicks.groupBy('TrackingID','VisitNumber','Campaign')\
                .agg(F.max('Orders'), F.max('VisitDatetime'), F.sum('Revenue'))

# key: TrackingID
# value: (VisitNumber, Campaign, max Orders, max VisitDatetime, sum Revenue)
visits = visits.rdd.map(lambda x:((x[0]),(x[1],x[2],x[3],x[4],x[5])))

visits = visits.groupByKey().mapValues(list)


def _to_channel_shares(value_pairs):
    """Turn (channel, value) pairs into (channel, share) pairs.

    Shares are value / total; when the total is 0 every share is 0 instead
    of raising ZeroDivisionError.  Channels from ``channels`` that are
    missing get a share of 0, and the result is sorted by channel name.
    """
    total = sum(value for _, value in value_pairs)
    if total:
        shares = [(name, value / total) for name, value in value_pairs]
    else:
        shares = [(name, 0) for name, _ in value_pairs]

    present = {name for name, _ in shares}
    shares.extend((channel, 0) for channel in channels
                  if channel not in present)
    shares.sort(key=lambda x: x[0])
    return shares


def _conversion_probability(graph, source='start', target='conversion'):
    """Sum, over all simple source->target paths, the product of edge weights.

    Returns 0.0 when either endpoint is not a node of ``graph`` (for example
    because no edge ever reached it).
    """
    if source not in graph or target not in graph:
        return 0.0

    total = 0
    for simple_path in nx.all_simple_paths(graph, source=source,
                                           target=target):
        probability = 1
        for antecessor, successor in zip(simple_path, simple_path[1:]):
            # graph[u][v] works on every NetworkX version; the old
            # ``graph.edge`` attribute was removed in NetworkX 2.0.
            probability = probability * graph[antecessor][successor]['weight']
        total = total + probability
    return total


# ---- last-click attribution ----
last_click_states_pair = visits.flatMap(lambda x:create_last_click_stats_pair(x))\
                                .reduceByKey(lambda x,y:x+y).collect()

last_click_conversion_rate = _to_channel_shares(last_click_states_pair)


# ---- linear attribution ----
linear_states_pair = visits.flatMap(lambda x:create_linear_click_stats_pair(x))\
                            .reduceByKey(lambda x,y:x+y).collect()

linear_click_conversion_rate = _to_channel_shares(linear_states_pair)


# ---- first-order Markov attribution ----
# Count-based variant (every transition weighted 1), kept for reference:
#path = visits.map(lambda x:create_path(x))

#first_order_states_pair = path.flatMap(lambda x:create_first_order_states_pair(x))\
#    .reduceByKey(lambda x,y:x+y)

path = visits.map(lambda x:create_path_with_value(x))

first_order_states_pair = path.flatMap(
        lambda x:create_first_order_states_pair_with_value(x))\
        .reduceByKey(lambda x,y:x+y)

first_order_markov_model = \
    create_first_order_markov_model(first_order_states_pair.collect())


# Directed graph with one edge per non-zero transition probability.
DG=nx.DiGraph()

for prior in columns:
    for current in columns:
        weight = first_order_markov_model.loc[prior, current]
        if weight > 0:
            DG.add_edge(prior,current,weight=weight)


pos = nx.spring_layout(DG)

nx.draw_networkx_edges(DG,pos,edgelist=DG.edges(),width=1,arrows=True)
nx.draw_networkx_labels(DG,pos,font_size=20,font_family='sans-serif')

edge_labels = {(u, v): round(d['weight'], 5)
               for u, v, d in DG.edges(data=True)}
nx.draw_networkx_edge_labels(DG,pos,label_pos=0.3,edge_labels=edge_labels)


# Conversion probability of the full graph.
total_conversion = _conversion_probability(DG)


# Removal effect of a channel: the share of conversion probability that is
# lost when the channel's node is removed from the graph.
nodes = DG.nodes()
removal_effects = {}

for node in nodes:
    if node in channels:
        graph = DG.copy()
        graph.remove_node(node)

        conversion_after_removal = _conversion_probability(graph)

        if total_conversion:
            removal_effect = 1 - conversion_after_removal / total_conversion
        else:
            # No start->conversion path at all: nothing can be lost.
            removal_effect = 0
        removal_effects[node] = removal_effect


total_removal_effects = sum(removal_effects.values())

first_order_markov_model_conversion_rate = \
    _to_channel_shares(list(removal_effects.items()))


# ---- combine the three models ----
result = pd.DataFrame(last_click_conversion_rate,
                      columns=['channel','last_conversion_rate'])

result = result.merge(pd.DataFrame(linear_click_conversion_rate,
                                   columns=['channel','linear_conversion_rate']),
                      on='channel')

result = result.merge(pd.DataFrame(first_order_markov_model_conversion_rate,
                                   columns=['channel','markov_conversion_rate']),
                      on='channel')


# ---- grouped bar chart ----
fig = plt.figure()
ax = fig.add_subplot(111)
index = np.arange(len(channels))
width = 0.3

# align='edge' makes each bar start at its x position, so the three bars of
# a group span [index, index + 3*width] and the tick at index + 1.5*width
# below sits in the middle of the group regardless of matplotlib's default.
rects1 = ax.bar(index, result['last_conversion_rate'], width,
                color='red', align='edge',
                error_kw=dict(elinewidth=2,ecolor='red'))

rects2 = ax.bar(index+width, result['linear_conversion_rate'], width,
                color='black', align='edge',
                error_kw=dict(elinewidth=2,ecolor='black'))

rects3 = ax.bar(index+2*width, result['markov_conversion_rate'], width,
                color='blue', align='edge',
                error_kw=dict(elinewidth=2,ecolor='blue'))

# axes and labels
ax.set_xlim(-width,len(index)+width)
ax.set_ylim(0,1)
ax.set_ylabel('conversion rate')
ax.set_title('conversion rate by channel')

ax.set_xticks(index+1.5*width)
xtickNames = ax.set_xticklabels(result['channel'])
plt.setp(xtickNames, rotation=0, fontsize=10)

ax.legend((rects1[0],rects2[0],rects3[0]),
          ('last_click','linear_click','markov_first_order'))