├── README.md
├── README.txt
├── word2vec.py
├── association_rules.py
├── basic_recsys.py
├── apriori.py
├── svdRec.py
└── next_basket.py

/README.md:
--------------------------------------------------------------------------------
# Next-Basket-Analysis-Ta-Feng-Dataset
##### Ta-Feng is a grocery shopping dataset released by ACM RecSys. It covers products ranging from food and office supplies to furniture.
##### The dataset contains users' transaction data over 4 months, from November 2000 to February 2001. There are 817,741 transaction records in total, belonging to 32,266 users and 23,812 products.
See http://recsyswiki.com/wiki/Grocery_shopping_datasets for more details.
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
OVERVIEW
------------------------------------------------------------------
next_basket.py, association_rules.py, basic_recsys.py, word2vec.py
are the polished pieces of code for the Ta-Feng project. Each file
can be run directly as a script.

next_basket.py
------------------------------------------------------------------
Reads files from the data folder, performs feature engineering and
trains a model for "the next basket prediction" problem.

association_rules.py
------------------------------------------------------------------
Identifies the association rules between subclasses and between
products.

basic_recsys.py
------------------------------------------------------------------
Builds a recommendation system using collaborative filtering and
evaluates its performance.

word2vec.py
------------------------------------------------------------------
Creates word2vec representations of product ids.
Because the products within a basket have no meaningful order, the
window size is set to the largest basket size in the training set.
--------------------------------------------------------------------------------
/word2vec.py:
--------------------------------------------------------------------------------
"""Compute the word2vec representation for products.
"""


import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
import os
#from numba import jit
from next_basket import read_file, add_transaction_id
from association_rules import item_in_one_basket

import gensim

from datetime import date as dt
from datetime import datetime

from sklearn.decomposition import PCA

dataFolder = 'Data\\'

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """Scatter plot of labelled 2-d embeddings. Adapted from TensorFlow's word2vec tutorial.
    """
    plt.figure(figsize=(18, 18))
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                     ha='right', va='bottom')

    plt.show()


if __name__ == "__main__":

    print('Reading files ...\n')
    df = read_file(dataFolder)

    # one basket per customer per day
    df['trans_id'] = df['customer_id'].astype(str) + df['date_time'].astype(str)
    baskets = df.groupby("trans_id").apply(lambda order: order['product_id'].astype(str).tolist())
    longest = np.max(baskets.apply(len))
    baskets = baskets.values

    # I choose the window size as the largest basket size.
    # Since the products within an order have no inherent sequence, the training window
    # should be large enough to cover all the products of a basket together.
    # Note: size / wv.syn0 follow the pre-4.0 gensim API used in this project.
    model = gensim.models.Word2Vec(baskets, size=100, window=longest, min_count=2, workers=4)

    # product ids kept in the vocabulary (min_count filtered), ordered to match the embedding rows
    vocab = model.wv.index2word

    # reduce dimension using PCA with 2 components
    pca = PCA(n_components=2)
    embeddings_2d = pca.fit_transform(model.wv.syn0)

    # plot 2d word2vec
    plot_with_labels(embeddings_2d, vocab)

--------------------------------------------------------------------------------
/association_rules.py:
--------------------------------------------------------------------------------
"""Find the association rules between subclasses and between products.
"""


import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
import os
#from numba import jit
from next_basket import read_file, add_transaction_id

from datetime import date as dt
from datetime import datetime

import apriori

dataFolder = 'Data\\'

def item_in_one_basket(df: pd.DataFrame, item_tp: str) -> list:
    """Group the items of the same basket into a list.

    Args:
        df: product purchasing information.
        item_tp: "subclass" or "product_id"

    Returns:
        basket_list: a list of baskets. Each sub-list holds the distinct items of one basket.

    """

    df['trans_id'] = df['customer_id'].astype(str) + df['date_time'].astype(str)  # assume each customer makes at most one transaction per day
    df = df.sort_values(['trans_id'])
    df[item_tp] = df[item_tp].astype(str)  # use the column selected by item_tp, not only 'subclass'
    trans_id_buf = df['trans_id'].values[0]
    basket_list = []
    item_list = []
    for j in tqdm(range(df.shape[0])):
        if df['trans_id'].values[j] == trans_id_buf:
            item_list.append(df[item_tp].values[j])
        else:
            item_list = list(set(item_list))
            basket_list.append(item_list[:])
            item_list = [df[item_tp].values[j]]
            trans_id_buf = df['trans_id'].values[j]
    basket_list.append(list(set(item_list)))  # do not drop the last basket

    return basket_list


if __name__ == "__main__":

    print('Reading files ...\n')
    df = read_file(dataFolder)

    # identify the association rules between the various subclasses
    basket_list = item_in_one_basket(df, 'subclass')
    L, suppData = apriori.apriori(basket_list, minSupport=0.01)  # minimum support 0.01 --> itemsets appearing in more than 1% of baskets
    rules = apriori.generateRules(L, suppData, minConf=0.3)  # conditional probability >= 0.3
    print('Association rules in Subclass: \n', rules)

    # identify the association rules between the various products
    basket_list = item_in_one_basket(df, 'product_id')
    L, suppData = apriori.apriori(basket_list, minSupport=0.005)  # minimum support 0.005 --> itemsets appearing in more than 0.5% of baskets
    rules = apriori.generateRules(L, suppData, minConf=0.3)  # conditional probability >= 0.3
    print('Association rules in Products: \n', rules)  # print the rules, not the raw frequent itemsets


--------------------------------------------------------------------------------
/basic_recsys.py:
--------------------------------------------------------------------------------
"""A simple recommendation model based on collaborative filtering.
Performance is evaluated by randomly removing 1000 scores from the table
and comparing the estimates of these scores with the actual ones.
4 | """ 5 | 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from matplotlib import pyplot as plt 10 | from tqdm import tqdm 11 | import os 12 | 13 | from next_basket import read_file, add_transaction_id 14 | 15 | from datetime import date as dt 16 | from datetime import datetime 17 | 18 | from numpy import mat 19 | import random 20 | import svdRec 21 | 22 | dataFolder = 'Data\\' 23 | 24 | if __name__ == "__main__": 25 | 26 | print('Reading files ...\n') 27 | df = read_file(dataFolder) 28 | 29 | print('Adding transaction numbers ...\n') 30 | df = add_transaction_id(df) 31 | 32 | # the number of time that a customer purchassed a certain product 33 | df_cust_row = df.groupby(['customer_id','product_id'],as_index = False).agg({'trans_id':pd.Series.nunique}) 34 | # Number of transactions made by each customer 35 | df_cust_num_trans = df.groupby(['customer_id'],as_index = False).agg({'trans_id':pd.Series.nunique}) 36 | df_cust_num_trans.columns = ['customer_id','num_trans'] 37 | # create a dataframe with columns: customer id, product id, number of times this product has been purchased, number of trans made by this customer 38 | df_rs = pd.merge(df_cust_row, df_cust_num_trans, on = 'customer_id', how = 'left') 39 | # compute score 40 | df_rs['score'] = df_rs['trans_id'] / df_rs['num_trans'] 41 | 42 | # initialise the dataframe to store all the score information 43 | df_rs_sys = pd.DataFrame(np.zeros((df_rs['customer_id'].nunique(), df_rs.product_id.nunique()))) 44 | # each column represesnts a product 45 | df_rs_sys.columns = list(df_rs.product_id.unique()) 46 | # each row represents a customer 47 | df_rs_sys.index = list(df_rs['customer_id'].unique()) 48 | 49 | # efficiently put the scores into df_rs_sys 50 | cust_id = df_rs.customer_id.values[0] 51 | start = 0 52 | print('Creating recsys table ...\n') 53 | for i in tqdm(range(df_rs.shape[0])): 54 | if df_rs.customer_id.values[i] == cust_id: 55 | continue 56 | df_buf = df_rs.iloc[start:i,:].copy() 57 | for j in range(df_buf.shape[0]): 58 | prod_id = df_buf['product_id'].values[j] 59 | score = df_buf['score'].values[j] 60 | df_rs_sys.loc[df_rs_sys.index == cust_id,prod_id] = score 61 | start = i 62 | cust_id = df_rs.customer_id.values[i] 63 | 64 | 65 | # randomly remove 1000 scores from the table(df_rs_sys) 66 | c_level = random.sample(range(32266), 1000) # record the rows of the removed scores, 32266 is the number of customers 67 | p_level = [] # resord the columns of the removed scores 68 | real = [] # actual score lists 69 | esti = [] # estimated score lists 70 | for i in tqdm(c_level): 71 | for j in range(23812): # 23812 is the number of product 72 | if df_rs_sys.iloc[i,j]!=0: 73 | p_level.append(j) 74 | real.append(df_rs_sys.iloc[i,j]) 75 | df_rs_sys.iloc[i,j] = 0 76 | c = i 77 | L = [j] 78 | esti.append(svdRec.recommend(mat(df_rs_sys.values), c, L,len(L))[0][1]) 79 | df_rs_sys.iloc[i,j] = real[-1] 80 | break 81 | 82 | abs_err = np.abs(esti - np.array(real)).mean() # absolute error of the estimated recommendation scores 83 | avg_score = df_rs['score'].mean() 84 | 85 | print('The average actual score of the recommendation system is {:.3f}.\n'.format(avg_score)) 86 | print('The aboluate error of the recommendation system is {:.3f}.\n'.format(abs_error)) 87 | 88 | -------------------------------------------------------------------------------- /apriori.py: -------------------------------------------------------------------------------- 1 | """ 2 | From "Machine Learning for Action" 3 | """ 4 | 5 | from numpy import * 6 | 7 | 8 | def 
createC1(dataSet): 9 | C1 = [] 10 | for transaction in dataSet: 11 | for item in transaction: 12 | if not [item] in C1: 13 | C1.append([item]) 14 | 15 | C1.sort() 16 | return list(map(frozenset, C1))#use frozen set so we 17 | #can use it as a key in a dict 18 | 19 | def scanD(D, Ck, minSupport): 20 | ssCnt = {} 21 | for tid in D: 22 | for can in Ck: 23 | if can.issubset(tid): 24 | if can not in ssCnt: ssCnt[can]=1 25 | else: ssCnt[can] += 1 26 | numItems = float(len(D)) 27 | retList = [] 28 | supportData = {} 29 | for key in ssCnt: 30 | support = ssCnt[key]/numItems 31 | if support >= minSupport: 32 | retList.insert(0,key) 33 | supportData[key] = support 34 | return retList, supportData 35 | 36 | def aprioriGen(Lk, k): #creates Ck 37 | retList = [] 38 | lenLk = len(Lk) 39 | for i in range(lenLk): 40 | for j in range(i+1, lenLk): 41 | L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2] 42 | L1.sort(); L2.sort() 43 | if L1==L2: #if first k-2 elements are equal 44 | retList.append(Lk[i] | Lk[j]) #set union 45 | return retList 46 | 47 | def apriori(dataSet, minSupport = 0.5): 48 | C1 = createC1(dataSet) 49 | D = list(map(set, dataSet)) 50 | L1, supportData = scanD(D, C1, minSupport) 51 | L = [L1] 52 | k = 2 53 | while (len(L[k-2]) > 0): 54 | Ck = aprioriGen(L[k-2], k) 55 | Lk, supK = scanD(D, Ck, minSupport)#scan DB to get Lk 56 | supportData.update(supK) 57 | L.append(Lk) 58 | k += 1 59 | return L, supportData 60 | 61 | def generateRules(L, supportData, minConf=0.7): #supportData is a dict coming from scanD 62 | bigRuleList = [] 63 | for i in range(1, len(L)):#only get the sets with two or more items 64 | for freqSet in L[i]: 65 | H1 = [frozenset([item]) for item in freqSet] 66 | if (i > 1): 67 | rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf) 68 | else: 69 | calcConf(freqSet, H1, supportData, bigRuleList, minConf) 70 | return bigRuleList 71 | 72 | def calcConf(freqSet, H, supportData, brl, minConf=0.7): 73 | prunedH = [] #create new list to return 74 | for conseq in H: 75 | conf = supportData[freqSet]/supportData[freqSet-conseq] #calc confidence 76 | if conf >= minConf: 77 | print(freqSet-conseq,'-->',conseq,'conf:',conf) 78 | brl.append((freqSet-conseq, conseq, conf)) 79 | prunedH.append(conseq) 80 | return prunedH 81 | 82 | def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7): 83 | m = len(H[0]) 84 | if (len(freqSet) > (m + 1)): #try further merging 85 | Hmp1 = aprioriGen(H, m+1)#create Hm+1 new candidates 86 | Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf) 87 | if (len(Hmp1) > 1): #need at least two sets to merge 88 | rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf) 89 | 90 | def pntRules(ruleList, itemMeaning): 91 | for ruleTup in ruleList: 92 | for item in ruleTup[0]: 93 | print(itemMeaning[item]) 94 | print(" -------->") 95 | for item in ruleTup[1]: 96 | print(itemMeaning[item]) 97 | print("confidence: %f" % ruleTup[2]) 98 | print('\n') #print a blank line 99 | 100 | 101 | -------------------------------------------------------------------------------- /svdRec.py: -------------------------------------------------------------------------------- 1 | """ 2 | From "Machine Learning for Action" 3 | """ 4 | 5 | 6 | from numpy import * 7 | from numpy import linalg as la 8 | from tqdm import tqdm 9 | 10 | 11 | def ecludSim(inA, inB): 12 | """ 13 | ecludean distance 14 | """ 15 | 16 | return 1.0/(1.0 + la.norm(inA - inB)) 17 | 18 | def pearsSim(inA, inB): 19 | """ 20 | pearson correlation as similarity metric 21 | """ 22 | 23 | if len(inA) < 3 : return 
1.0 24 | return 0.5+0.5*corrcoef(inA, inB, rowvar = 0)[0][1] 25 | 26 | def cosSim(inA,inB): 27 | """ 28 | cosine distance. 29 | """ 30 | num = float(inA.T*inB) 31 | denom = la.norm(inA)*la.norm(inB) 32 | return 0.5+0.5*(num/denom) 33 | 34 | def standEst(dataMat, user, simMeas, item): 35 | n = shape(dataMat)[1] 36 | simTotal = 0.0; ratSimTotal = 0.0 37 | for j in range(n): 38 | userRating = dataMat[user,j] 39 | if userRating == 0: continue 40 | overLap = nonzero(logical_and(dataMat[:,item].A>0, \ 41 | dataMat[:,j].A>0))[0] 42 | if len(overLap) == 0: similarity = 0 43 | else: similarity = simMeas(dataMat[overLap,item], \ 44 | dataMat[overLap,j]) 45 | #print('the %d and %d similarity is: %f' % (item, j, similarity)) 46 | simTotal += similarity 47 | ratSimTotal += similarity * userRating 48 | if simTotal == 0: return 0 49 | else: return ratSimTotal/simTotal 50 | 51 | def svdEst(dataMat, user, simMeas, item): 52 | n = shape(dataMat)[1] 53 | simTotal = 0.0; ratSimTotal = 0.0 54 | U,Sigma,VT = la.svd(dataMat) 55 | Sig4 = mat(eye(4)*Sigma[:4]) #arrange Sig4 into a diagonal matrix 56 | xformedItems = dataMat.T * U[:,:4] * Sig4.I #create transformed items 57 | for j in range(n): 58 | userRating = dataMat[user,j] 59 | if userRating == 0 or j==item: continue 60 | similarity = simMeas(xformedItems[item,:].T,\ 61 | xformedItems[j,:].T) 62 | print('the %d and %d similarity is: %f' % (item, j, similarity)) 63 | simTotal += similarity 64 | ratSimTotal += similarity * userRating 65 | if simTotal == 0: return 0 66 | else: return ratSimTotal/simTotal 67 | 68 | def recommend(dataMat, user, L, N=3, simMeas=cosSim, estMethod=standEst): 69 | unratedItems = nonzero(dataMat[user,:]==0)[1]#find unrated items 70 | if len(unratedItems) == 0: return 'you rated everything' 71 | itemScores = [] 72 | unratedItems = L 73 | for item in unratedItems[:10]: 74 | estimatedScore = estMethod(dataMat, user, simMeas, item) 75 | itemScores.append((item, estimatedScore)) 76 | return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[:N] 77 | 78 | def printMat(inMat, thresh=0.8): 79 | for i in range(32): 80 | for k in range(32): 81 | if float(inMat[i,k]) > thresh: 82 | print(1,) 83 | else: print(0,) 84 | print('') 85 | 86 | def imgCompress(numSV=3, thresh=0.8): 87 | myl = [] 88 | for line in open('0_5.txt').readlines(): 89 | newRow = [] 90 | for i in range(32): 91 | newRow.append(int(line[i])) 92 | myl.append(newRow) 93 | myMat = mat(myl) 94 | print("****original matrix******") 95 | printMat(myMat, thresh) 96 | U,Sigma,VT = la.svd(myMat) 97 | SigRecon = mat(zeros((numSV, numSV))) 98 | for k in range(numSV):#construct diagonal matrix from vector 99 | SigRecon[k,k] = Sigma[k] 100 | reconMat = U[:,:numSV]*SigRecon*VT[:numSV,:] 101 | print("****reconstructed matrix using %d singular values******" % numSV) 102 | printMat(reconMat, thresh) -------------------------------------------------------------------------------- /next_basket.py: -------------------------------------------------------------------------------- 1 | """The model for the next basket predictions. 
2 | """ 3 | 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from matplotlib import pyplot as plt 8 | from tqdm import tqdm 9 | import os 10 | from numba import jit 11 | 12 | from datetime import date as dt 13 | from datetime import datetime 14 | 15 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder 16 | from sklearn import metrics 17 | from sklearn.metrics import confusion_matrix 18 | from sklearn.metrics import recall_score, precision_score, f1_score 19 | from xgboost import XGBClassifier 20 | 21 | dataFolder = 'Data\\' 22 | 23 | def read_file(data_folder: str) -> pd.DataFrame: 24 | """Read csv files from the data folder. 25 | 26 | Args: 27 | data_folder: the name of the folder which stores all the csv files. 28 | 29 | Returns: 30 | Transactions in pd.DataFrame. 31 | """ 32 | 33 | dataFiles = [os.path.join(data_folder, f) for f in os.listdir(data_folder)] 34 | container = [] 35 | for name in dataFiles: 36 | df = pd.read_csv(name, encoding="ISO-8859-1", sep=';') 37 | container.append(df) 38 | df = pd.concat(container) 39 | df.columns = ['date_time', 'customer_id', 'age', 'area', 'subclass', 'product_id', 40 | 'amount', 'asset', 'sale_price'] 41 | 42 | df['profit'] = df['sale_price'] - df['asset'] 43 | df['date_time'] = df['date_time'].apply(lambda x: datetime.strptime(x[:10], '%Y-%m-%d')) 44 | df['weekday'] = df['date_time'].apply(lambda x: x.weekday()) 45 | df['month'] = df.date_time.astype(str).apply(lambda x:x[5:7]) 46 | 47 | return df 48 | 49 | 50 | @jit 51 | def add_transaction_id(df: pd.DataFrame) -> pd.DataFrame: 52 | """Create transaction id for each product purchased (row in df). Products in the same basket have the same transaction id. 53 | 54 | Args: 55 | df: Dataframe which includes all the product purchased. 56 | 57 | Returns: 58 | df: Add transaction id to the input df. 59 | 60 | """ 61 | df['trans_id'] = df['customer_id'].astype(str) + df['date_time'].astype(str) # assume each customer only make at most one transaction everyday 62 | df = df.sort_values(['trans_id']) # sort before finding the products in the same basket 63 | cust_id = df.customer_id.values[0] 64 | trans_buf_id = df.trans_id.values[0] 65 | trans = 0 66 | trans_id_list = [] 67 | for i in tqdm(range(df.shape[0])): 68 | if df.customer_id.values[i] == cust_id: 69 | if df.trans_id.values[i] == trans_buf_id: 70 | trans_id_list.append(trans) 71 | else: 72 | trans += 1 73 | trans_buf_id = df.trans_id.values[i] 74 | trans_id_list.append(trans) 75 | else: 76 | cust_id = df.customer_id.values[i] 77 | trans_buf_id = df.trans_id.values[i] 78 | trans = 0 79 | trans_id_list.append(trans) 80 | df['trans_id'] = trans_id_list 81 | 82 | return df 83 | 84 | 85 | def create_feature_for_one_customer(df_cust: pd.DataFrame, features: list, targets: list, train_test: list): 86 | """Create modeling features for each cutomer. Uses each transaction's previous two transactions information. 87 | 88 | Args: 89 | df_cust: Dataframe which includes all the transactions made by a single customer. 90 | features: a list of lists; 91 | day of the week of the transactions, difference between the current transaction and the previous two transactions in days, 92 | amount of all products, amount of the certain product, item prices, month of the transactions, product information, 93 | customer information. 94 | targets: a list of 0, 1. 0 indicates a product is not purchased in the current transaction. 95 | 1 indicates a product is purchased in the current transaction. 96 | train_test: a list of 0, 1. 
1 indicates the corresponding feature will be usesd for training. 97 | 0 indicates the corresponding feature will be used for tessting. 98 | 99 | """ 100 | 101 | if df_cust.trans_id.max() < 4: # exclude customers with less than 4 transactions 102 | return 103 | 104 | for i in range(2, df_cust.trans_id.max() + 1): # start from the third transaction of each cutomer 105 | if i == df_cust.trans_id.max(): # if it is the last order 106 | train_test = 0 107 | else: 108 | train_test = 1 109 | df_prev = df_cust.loc[df_cust.trans_id.isin([i - 1, i - 2])].copy() 110 | df_prev_1 = df_prev.loc[df_prev.trans_id == i - 1] 111 | df_prev_2 = df_prev.loc[df_prev.trans_id == i - 2] 112 | curr_product = set(df_cust.loc[df_cust.trans_id == i,'product_id']) 113 | curr_date = df_cust.loc[df_cust.trans_id == i,'date_time'].values[0] 114 | feature = [] 115 | last_weekday = df_prev_1['weekday'].values[0] 116 | last_weekday_2 = df_prev_2['weekday'].values[0] 117 | this_weekday = df_cust.loc[df_cust.trans_id == i, 'weekday'].values[0] 118 | diff_last_day = (curr_date - df_prev_1['date_time'].values[0]) / np.timedelta64(1, 'D') 119 | diff_last_day_2 = (curr_date - df_prev_2['date_time'].values[0]) / np.timedelta64(1, 'D') 120 | last_all_amount = df_prev_1['amount'].sum() 121 | last_all_amount_2 = df_prev_2['amount'].sum() 122 | cust_area = df_cust['area'].values[0] 123 | cust_age = df_cust['age'].values[0] 124 | last_month = str(df_prev_1.date_time.values[0])[5:7] 125 | last_month_2 = str(df_prev_2.date_time.values[0])[5:7] 126 | cust_id = df_cust['customer_id'].values[0] 127 | for prod_id in df_prev.product_id.unique(): 128 | if prod_id in curr_product: 129 | targets.append(1) 130 | else: 131 | targets.append(0) 132 | dfdf_last = df_prev_1.loc[(df_prev_1.product_id == prod_id)].copy() 133 | dfdf_last_2 = df_prev_2.loc[(df_prev_2.product_id == prod_id)].copy() 134 | last_amount = 0 135 | last_amount_2 = 0 136 | last_price = 0 137 | last_price_2 = 0 138 | 139 | try: 140 | last_amount = dfdf_last['amount'].values[0] 141 | last_price = dfdf_last['sale_price'].values[0] / last_amount 142 | except: 143 | None 144 | 145 | try: 146 | last_amount_2 = dfdf_last_2['amount'].values[0] 147 | last_price_2 = dfdf_last_2['sale_price'].values[0] / last_amount_2 148 | except: 149 | None 150 | 151 | subcls = df_prev.loc[df_prev.product_id == prod_id,'subclass'].values[0] 152 | feature = [last_weekday, last_weekday_2, this_weekday, diff_last_day, diff_last_day_2, last_all_amount, last_all_amount_2, 153 | last_amount, last_amount_2, last_price, last_price_2, last_month, last_month_2, int(subcls), int(prod_id), cust_area, 154 | cust_age, cust_id, train_test] 155 | features.append(feature) 156 | 157 | return 158 | 159 | 160 | @jit 161 | def create_feature(df: pd.DataFrame) -> pd.DataFrame: 162 | """Create features and targets from the transaction dataframe. 163 | 164 | Args: 165 | df: Dataframe which includes all transactions. 166 | 167 | Returns: 168 | df_feature_target: a dataframe includes features and targets. 169 | df_feature_target['Y'] are the targets. 
170 | 171 | """ 172 | 173 | cust_id = df.customer_id.values[0] 174 | start = 0 175 | features = [] # initial feature sets 176 | targets = [] # initial targets 177 | train_test = [] # initial train_test indicator 178 | for i in tqdm(range(df.shape[0])): # find transactions of each customer 179 | if df['customer_id'].values[i] == cust_id: 180 | continue 181 | df_cust = df.iloc[start:i,:].copy() 182 | create_feature_for_one_customer(df_cust, features, targets, train_test) 183 | cust_id = df.customer_id.values[i] 184 | start = i 185 | 186 | df_feature_target = pd.DataFrame(features) 187 | df_feature_target['Y'] = targets 188 | 189 | return df_feature_target 190 | 191 | 192 | def train_test_split(df_feaure_target: pd.DataFrame) -> (np.array, np.array, np.array, np.array): 193 | """Onehot encoding plus train test split. 194 | 195 | Args: 196 | df_feaure_target: Dataframe with raw features and targets. 197 | 198 | Returns: 199 | X_train: 200 | X_test: 201 | y_train: 202 | y_test: 203 | 204 | """ 205 | # onehot encoding customer area information 206 | label_encoder = LabelEncoder() 207 | lb_f = label_encoder.fit_transform(df_feaure_target[15]).reshape(-1,1) 208 | one_hot = OneHotEncoder(sparse=False) 209 | oh_f = one_hot.fit_transform(lb_f) 210 | 211 | # onehot encoding customer age information 212 | label_encoder = LabelEncoder() 213 | lb_f = label_encoder.fit_transform(df_feaure_target[16]).reshape(-1,1) 214 | one_hot = OneHotEncoder(sparse=False) 215 | oh_f_1 = one_hot.fit_transform(lb_f) 216 | 217 | bools = df_feaure_target[18] == 1 # identify training set 218 | del df_feaure_target[15], df_feaure_target[16], df_feaure_target[18] 219 | 220 | X = df_feaure_target.iloc[:,:-1].values 221 | X = np.concatenate((X, oh_f), axis=1) 222 | X = np.concatenate((X, oh_f_1), axis=1) 223 | 224 | Y = df_feaure_target.iloc[:,-1].values 225 | 226 | X_train, X_test, y_train, y_test = X[bools, :], X[~bools, :], Y[bools], Y[~bools] 227 | 228 | return X_train, X_test, y_train, y_test 229 | 230 | 231 | 232 | def plt_auc(model, X_test, Y_test, test_start): 233 | """Plot ROC area and return roc score. 234 | """ 235 | probs = model.predict_proba(X_test) 236 | 237 | preds = probs[:,1] 238 | fpr, tpr, threshold = metrics.roc_curve(Y_test, preds) 239 | roc_auc = metrics.auc(fpr, tpr) 240 | #y_pred = xgb1.predict(X[train_len:]) 241 | 242 | plt.title(str(test_start) + 'Receiver Operating Characteristic') 243 | plt.plot(fpr, tpr, 'b', label = 'AUC = %0.3f' % roc_auc) 244 | plt.legend(loc = 'lower right') 245 | plt.plot([0, 1], [0, 1],'r--') 246 | plt.xlim([0, 1]) 247 | plt.ylim([0, 1]) 248 | plt.ylabel('True Positive Rate') 249 | plt.xlabel('False Positive Rate') 250 | plt.show() 251 | return roc_auc 252 | 253 | 254 | 255 | if __name__ == "__main__": 256 | 257 | print('Reading files ...\n') 258 | df = read_file(dataFolder) 259 | 260 | print('Adding transaction numbers ...\n') 261 | df = add_transaction_id(df) 262 | 263 | 264 | # create features and targets 265 | print('Creating features ...\n') 266 | df_feature_target = create_feature(df) 267 | 268 | 269 | X_train, X_test, y_train, y_test = train_test_split(df_feature_target) 270 | 271 | num_of_eval = int(len(y_test) * 0.5) 272 | model = XGBClassifier(n_estimators=1000, max_depth=7, colsample_bytree=0.7, nthread=-1) 273 | eval_set = [(X_test[:num_of_eval, :], y_test[:num_of_eval])] 274 | 275 | # I use early stopping here to prevent model from overfitting to the training set. 276 | # Cross validation is a method to avoid overfitting when tunning the model parameters. 
    model.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="auc", eval_set=eval_set, verbose=True)

    # retrain with the number of trees reached before the model starts to overfit the evaluation set
    num_tree = 200
    model = XGBClassifier(n_estimators=num_tree, max_depth=7, colsample_bytree=0.7, nthread=-1)
    model.fit(X_train, y_train)

    # plot roc curve
    roc_auc = plt_auc(model, X_test[num_of_eval:,:], y_test[num_of_eval:], 'test')


    y_pred = model.predict_proba(X_test[num_of_eval:,:])[:,1]
    y_pred = [1 if i > 0.18 else 0 for i in y_pred] # choose a different threshold to meet different precision/recall requirements
    print("Recall: ", recall_score(y_test[num_of_eval:], y_pred))
    print("Precision: ", precision_score(y_test[num_of_eval:], y_pred))
    print("Accuracy: ", metrics.accuracy_score(y_test[num_of_eval:], y_pred))  # accuracy via the sklearn.metrics module imported above
    print("F1: ", f1_score(y_test[num_of_eval:], y_pred))

--------------------------------------------------------------------------------
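USAGE NOTE
------------------------------------------------------------------
Beyond the PCA plot, a quick sanity check of the embeddings trained in
word2vec.py is to ask the model for the nearest neighbours of a product,
i.e. the products that tend to appear in the same baskets. A minimal
sketch, assuming the model and vocab variables from word2vec.py (the
index 0 below is only an arbitrary example, not a specific product):

    some_product = vocab[0]  # any product id present in the vocabulary
    # five products whose embeddings are closest to some_product
    print(model.wv.most_similar(positive=[some_product], topn=5))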