├── README.md
├── README.txt
├── word2vec.py
├── association_rules.py
├── basic_recsys.py
├── apriori.py
├── svdRec.py
└── next_basket.py

/README.md:
--------------------------------------------------------------------------------
# Next-Basket-Analysis-Ta-Feng-Dataset
##### Ta-Feng is a grocery shopping dataset released by ACM RecSys. It covers products ranging from food and office supplies to furniture.
##### The dataset contains users' transaction data over 4 months, from November 2000 to February 2001. There are 817,741 transaction records in total, belonging to 32,266 users and 23,812 products.
See http://recsyswiki.com/wiki/Grocery_shopping_datasets for more details.
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
OVERVIEW
------------------------------------------------------------------
next_basket.py, association_rules.py, basic_recsys.py, word2vec.py
are the polished pieces of code for the Ta-Feng project. Each file
can be run directly as a script.

next_basket.py
------------------------------------------------------------------
Reads files from the data folder, performs feature engineering and
trains a model for "the next basket prediction" problem.

association_rules.py
------------------------------------------------------------------
Identifies the association rules between subclasses and between
products.

basic_recsys.py
------------------------------------------------------------------
Builds a recommendation system using collaborative filtering and
evaluates its performance.

word2vec.py
------------------------------------------------------------------
Creates word2vec representations of product ids.
Because the products within a basket have no meaningful order, the
window size is set to the largest basket size in the training set.
--------------------------------------------------------------------------------
/word2vec.py:
--------------------------------------------------------------------------------
"""Compute the word2vec representation for products.
"""


import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
import os
#from numba import jit
from next_basket import read_file, add_transaction_id
from association_rules import item_in_one_basket

import gensim

from datetime import date as dt
from datetime import datetime

from sklearn.decomposition import PCA

dataFolder = 'Data\\'

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """Scatter plot of labelled 2-d embeddings. Adapted from TensorFlow's word2vec tutorial.
    """
    plt.figure(figsize=(18, 18))
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                     ha='right', va='bottom')

    plt.show()


if __name__ == "__main__":

    print('Reading files ...\n')
    df = read_file(dataFolder)

    # one basket per customer per day
    df['trans_id'] = df['customer_id'].astype(str) + df['date_time'].astype(str)
    baskets = df.groupby("trans_id").apply(lambda order: order['product_id'].astype(str).tolist())
    longest = np.max(baskets.apply(len))
    baskets = baskets.values

    # I choose the window size as the largest basket size.
    # Since the products within an order have no inherent sequence, the training window
    # should be large enough to cover all the products of a basket together.
    # Note: size / wv.syn0 follow the pre-4.0 gensim API used in this project.
    model = gensim.models.Word2Vec(baskets, size=100, window=longest, min_count=2, workers=4)

    # product ids kept in the vocabulary (min_count filtered), ordered to match the embedding rows
    vocab = model.wv.index2word

    # reduce dimension using PCA with 2 components
    pca = PCA(n_components=2)
    embeddings_2d = pca.fit_transform(model.wv.syn0)

    # plot 2d word2vec
    plot_with_labels(embeddings_2d, vocab)

--------------------------------------------------------------------------------
/association_rules.py:
--------------------------------------------------------------------------------
"""Find the association rules between subclasses and between products.
"""


import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
import os
#from numba import jit
from next_basket import read_file, add_transaction_id

from datetime import date as dt
from datetime import datetime

import apriori

dataFolder = 'Data\\'

def item_in_one_basket(df: pd.DataFrame, item_tp: str) -> list:
    """Group the items of the same basket into a list.

    Args:
        df: product purchasing information.
        item_tp: "subclass" or "product_id"

    Returns:
        basket_list: a list of baskets. Each sub-list holds the distinct items of one basket.

    """

    df['trans_id'] = df['customer_id'].astype(str) + df['date_time'].astype(str)  # assume each customer makes at most one transaction per day
    df = df.sort_values(['trans_id'])
    df[item_tp] = df[item_tp].astype(str)  # use the column selected by item_tp, not only 'subclass'
    trans_id_buf = df['trans_id'].values[0]
    basket_list = []
    item_list = []
    for j in tqdm(range(df.shape[0])):
        if df['trans_id'].values[j] == trans_id_buf:
            item_list.append(df[item_tp].values[j])
        else:
            item_list = list(set(item_list))
            basket_list.append(item_list[:])
            item_list = [df[item_tp].values[j]]
            trans_id_buf = df['trans_id'].values[j]
    basket_list.append(list(set(item_list)))  # do not drop the last basket

    return basket_list


if __name__ == "__main__":

    print('Reading files ...\n')
    df = read_file(dataFolder)

    # identify the association rules between the various subclasses
    basket_list = item_in_one_basket(df, 'subclass')
    L, suppData = apriori.apriori(basket_list, minSupport=0.01)  # minimum support 0.01 --> itemsets appearing in more than 1% of baskets
    rules = apriori.generateRules(L, suppData, minConf=0.3)  # conditional probability >= 0.3
    print('Association rules in Subclass: \n', rules)

    # identify the association rules between the various products
    basket_list = item_in_one_basket(df, 'product_id')
    L, suppData = apriori.apriori(basket_list, minSupport=0.005)  # minimum support 0.005 --> itemsets appearing in more than 0.5% of baskets
    rules = apriori.generateRules(L, suppData, minConf=0.3)  # conditional probability >= 0.3
    print('Association rules in Products: \n', rules)  # print the rules, not the raw frequent itemsets


--------------------------------------------------------------------------------
/basic_recsys.py:
--------------------------------------------------------------------------------
"""A simple recommendation model based on collaborative filtering.
Performance is evaluated by randomly removing 1000 scores from the table
and comparing the estimates of these scores with the actual ones.
4 | """ 5 | 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from matplotlib import pyplot as plt 10 | from tqdm import tqdm 11 | import os 12 | 13 | from next_basket import read_file, add_transaction_id 14 | 15 | from datetime import date as dt 16 | from datetime import datetime 17 | 18 | from numpy import mat 19 | import random 20 | import svdRec 21 | 22 | dataFolder = 'Data\\' 23 | 24 | if __name__ == "__main__": 25 | 26 | print('Reading files ...\n') 27 | df = read_file(dataFolder) 28 | 29 | print('Adding transaction numbers ...\n') 30 | df = add_transaction_id(df) 31 | 32 | # the number of time that a customer purchassed a certain product 33 | df_cust_row = df.groupby(['customer_id','product_id'],as_index = False).agg({'trans_id':pd.Series.nunique}) 34 | # Number of transactions made by each customer 35 | df_cust_num_trans = df.groupby(['customer_id'],as_index = False).agg({'trans_id':pd.Series.nunique}) 36 | df_cust_num_trans.columns = ['customer_id','num_trans'] 37 | # create a dataframe with columns: customer id, product id, number of times this product has been purchased, number of trans made by this customer 38 | df_rs = pd.merge(df_cust_row, df_cust_num_trans, on = 'customer_id', how = 'left') 39 | # compute score 40 | df_rs['score'] = df_rs['trans_id'] / df_rs['num_trans'] 41 | 42 | # initialise the dataframe to store all the score information 43 | df_rs_sys = pd.DataFrame(np.zeros((df_rs['customer_id'].nunique(), df_rs.product_id.nunique()))) 44 | # each column represesnts a product 45 | df_rs_sys.columns = list(df_rs.product_id.unique()) 46 | # each row represents a customer 47 | df_rs_sys.index = list(df_rs['customer_id'].unique()) 48 | 49 | # efficiently put the scores into df_rs_sys 50 | cust_id = df_rs.customer_id.values[0] 51 | start = 0 52 | print('Creating recsys table ...\n') 53 | for i in tqdm(range(df_rs.shape[0])): 54 | if df_rs.customer_id.values[i] == cust_id: 55 | continue 56 | df_buf = df_rs.iloc[start:i,:].copy() 57 | for j in range(df_buf.shape[0]): 58 | prod_id = df_buf['product_id'].values[j] 59 | score = df_buf['score'].values[j] 60 | df_rs_sys.loc[df_rs_sys.index == cust_id,prod_id] = score 61 | start = i 62 | cust_id = df_rs.customer_id.values[i] 63 | 64 | 65 | # randomly remove 1000 scores from the table(df_rs_sys) 66 | c_level = random.sample(range(32266), 1000) # record the rows of the removed scores, 32266 is the number of customers 67 | p_level = [] # resord the columns of the removed scores 68 | real = [] # actual score lists 69 | esti = [] # estimated score lists 70 | for i in tqdm(c_level): 71 | for j in range(23812): # 23812 is the number of product 72 | if df_rs_sys.iloc[i,j]!=0: 73 | p_level.append(j) 74 | real.append(df_rs_sys.iloc[i,j]) 75 | df_rs_sys.iloc[i,j] = 0 76 | c = i 77 | L = [j] 78 | esti.append(svdRec.recommend(mat(df_rs_sys.values), c, L,len(L))[0][1]) 79 | df_rs_sys.iloc[i,j] = real[-1] 80 | break 81 | 82 | abs_err = np.abs(esti - np.array(real)).mean() # absolute error of the estimated recommendation scores 83 | avg_score = df_rs['score'].mean() 84 | 85 | print('The average actual score of the recommendation system is {:.3f}.\n'.format(avg_score)) 86 | print('The aboluate error of the recommendation system is {:.3f}.\n'.format(abs_error)) 87 | 88 | -------------------------------------------------------------------------------- /apriori.py: -------------------------------------------------------------------------------- 1 | """ 2 | From "Machine Learning for Action" 3 | """ 4 | 5 | from numpy import * 6 | 7 | 8 | def 
createC1(dataSet): 9 | C1 = [] 10 | for transaction in dataSet: 11 | for item in transaction: 12 | if not [item] in C1: 13 | C1.append([item]) 14 | 15 | C1.sort() 16 | return list(map(frozenset, C1))#use frozen set so we 17 | #can use it as a key in a dict 18 | 19 | def scanD(D, Ck, minSupport): 20 | ssCnt = {} 21 | for tid in D: 22 | for can in Ck: 23 | if can.issubset(tid): 24 | if can not in ssCnt: ssCnt[can]=1 25 | else: ssCnt[can] += 1 26 | numItems = float(len(D)) 27 | retList = [] 28 | supportData = {} 29 | for key in ssCnt: 30 | support = ssCnt[key]/numItems 31 | if support >= minSupport: 32 | retList.insert(0,key) 33 | supportData[key] = support 34 | return retList, supportData 35 | 36 | def aprioriGen(Lk, k): #creates Ck 37 | retList = [] 38 | lenLk = len(Lk) 39 | for i in range(lenLk): 40 | for j in range(i+1, lenLk): 41 | L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2] 42 | L1.sort(); L2.sort() 43 | if L1==L2: #if first k-2 elements are equal 44 | retList.append(Lk[i] | Lk[j]) #set union 45 | return retList 46 | 47 | def apriori(dataSet, minSupport = 0.5): 48 | C1 = createC1(dataSet) 49 | D = list(map(set, dataSet)) 50 | L1, supportData = scanD(D, C1, minSupport) 51 | L = [L1] 52 | k = 2 53 | while (len(L[k-2]) > 0): 54 | Ck = aprioriGen(L[k-2], k) 55 | Lk, supK = scanD(D, Ck, minSupport)#scan DB to get Lk 56 | supportData.update(supK) 57 | L.append(Lk) 58 | k += 1 59 | return L, supportData 60 | 61 | def generateRules(L, supportData, minConf=0.7): #supportData is a dict coming from scanD 62 | bigRuleList = [] 63 | for i in range(1, len(L)):#only get the sets with two or more items 64 | for freqSet in L[i]: 65 | H1 = [frozenset([item]) for item in freqSet] 66 | if (i > 1): 67 | rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf) 68 | else: 69 | calcConf(freqSet, H1, supportData, bigRuleList, minConf) 70 | return bigRuleList 71 | 72 | def calcConf(freqSet, H, supportData, brl, minConf=0.7): 73 | prunedH = [] #create new list to return 74 | for conseq in H: 75 | conf = supportData[freqSet]/supportData[freqSet-conseq] #calc confidence 76 | if conf >= minConf: 77 | print(freqSet-conseq,'-->',conseq,'conf:',conf) 78 | brl.append((freqSet-conseq, conseq, conf)) 79 | prunedH.append(conseq) 80 | return prunedH 81 | 82 | def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7): 83 | m = len(H[0]) 84 | if (len(freqSet) > (m + 1)): #try further merging 85 | Hmp1 = aprioriGen(H, m+1)#create Hm+1 new candidates 86 | Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf) 87 | if (len(Hmp1) > 1): #need at least two sets to merge 88 | rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf) 89 | 90 | def pntRules(ruleList, itemMeaning): 91 | for ruleTup in ruleList: 92 | for item in ruleTup[0]: 93 | print(itemMeaning[item]) 94 | print(" -------->") 95 | for item in ruleTup[1]: 96 | print(itemMeaning[item]) 97 | print("confidence: %f" % ruleTup[2]) 98 | print('\n') #print a blank line 99 | 100 | 101 | -------------------------------------------------------------------------------- /svdRec.py: -------------------------------------------------------------------------------- 1 | """ 2 | From "Machine Learning for Action" 3 | """ 4 | 5 | 6 | from numpy import * 7 | from numpy import linalg as la 8 | from tqdm import tqdm 9 | 10 | 11 | def ecludSim(inA, inB): 12 | """ 13 | ecludean distance 14 | """ 15 | 16 | return 1.0/(1.0 + la.norm(inA - inB)) 17 | 18 | def pearsSim(inA, inB): 19 | """ 20 | pearson correlation as similarity metric 21 | """ 22 | 23 | if len(inA) < 3 : return 
1.0 24 | return 0.5+0.5*corrcoef(inA, inB, rowvar = 0)[0][1] 25 | 26 | def cosSim(inA,inB): 27 | """ 28 | cosine distance. 29 | """ 30 | num = float(inA.T*inB) 31 | denom = la.norm(inA)*la.norm(inB) 32 | return 0.5+0.5*(num/denom) 33 | 34 | def standEst(dataMat, user, simMeas, item): 35 | n = shape(dataMat)[1] 36 | simTotal = 0.0; ratSimTotal = 0.0 37 | for j in range(n): 38 | userRating = dataMat[user,j] 39 | if userRating == 0: continue 40 | overLap = nonzero(logical_and(dataMat[:,item].A>0, \ 41 | dataMat[:,j].A>0))[0] 42 | if len(overLap) == 0: similarity = 0 43 | else: similarity = simMeas(dataMat[overLap,item], \ 44 | dataMat[overLap,j]) 45 | #print('the %d and %d similarity is: %f' % (item, j, similarity)) 46 | simTotal += similarity 47 | ratSimTotal += similarity * userRating 48 | if simTotal == 0: return 0 49 | else: return ratSimTotal/simTotal 50 | 51 | def svdEst(dataMat, user, simMeas, item): 52 | n = shape(dataMat)[1] 53 | simTotal = 0.0; ratSimTotal = 0.0 54 | U,Sigma,VT = la.svd(dataMat) 55 | Sig4 = mat(eye(4)*Sigma[:4]) #arrange Sig4 into a diagonal matrix 56 | xformedItems = dataMat.T * U[:,:4] * Sig4.I #create transformed items 57 | for j in range(n): 58 | userRating = dataMat[user,j] 59 | if userRating == 0 or j==item: continue 60 | similarity = simMeas(xformedItems[item,:].T,\ 61 | xformedItems[j,:].T) 62 | print('the %d and %d similarity is: %f' % (item, j, similarity)) 63 | simTotal += similarity 64 | ratSimTotal += similarity * userRating 65 | if simTotal == 0: return 0 66 | else: return ratSimTotal/simTotal 67 | 68 | def recommend(dataMat, user, L, N=3, simMeas=cosSim, estMethod=standEst): 69 | unratedItems = nonzero(dataMat[user,:]==0)[1]#find unrated items 70 | if len(unratedItems) == 0: return 'you rated everything' 71 | itemScores = [] 72 | unratedItems = L 73 | for item in unratedItems[:10]: 74 | estimatedScore = estMethod(dataMat, user, simMeas, item) 75 | itemScores.append((item, estimatedScore)) 76 | return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[:N] 77 | 78 | def printMat(inMat, thresh=0.8): 79 | for i in range(32): 80 | for k in range(32): 81 | if float(inMat[i,k]) > thresh: 82 | print(1,) 83 | else: print(0,) 84 | print('') 85 | 86 | def imgCompress(numSV=3, thresh=0.8): 87 | myl = [] 88 | for line in open('0_5.txt').readlines(): 89 | newRow = [] 90 | for i in range(32): 91 | newRow.append(int(line[i])) 92 | myl.append(newRow) 93 | myMat = mat(myl) 94 | print("****original matrix******") 95 | printMat(myMat, thresh) 96 | U,Sigma,VT = la.svd(myMat) 97 | SigRecon = mat(zeros((numSV, numSV))) 98 | for k in range(numSV):#construct diagonal matrix from vector 99 | SigRecon[k,k] = Sigma[k] 100 | reconMat = U[:,:numSV]*SigRecon*VT[:numSV,:] 101 | print("****reconstructed matrix using %d singular values******" % numSV) 102 | printMat(reconMat, thresh) -------------------------------------------------------------------------------- /next_basket.py: -------------------------------------------------------------------------------- 1 | """The model for the next basket predictions. 
2 | """ 3 | 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from matplotlib import pyplot as plt 8 | from tqdm import tqdm 9 | import os 10 | from numba import jit 11 | 12 | from datetime import date as dt 13 | from datetime import datetime 14 | 15 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder 16 | from sklearn import metrics 17 | from sklearn.metrics import confusion_matrix 18 | from sklearn.metrics import recall_score, precision_score, f1_score 19 | from xgboost import XGBClassifier 20 | 21 | dataFolder = 'Data\\' 22 | 23 | def read_file(data_folder: str) -> pd.DataFrame: 24 | """Read csv files from the data folder. 25 | 26 | Args: 27 | data_folder: the name of the folder which stores all the csv files. 28 | 29 | Returns: 30 | Transactions in pd.DataFrame. 31 | """ 32 | 33 | dataFiles = [os.path.join(data_folder, f) for f in os.listdir(data_folder)] 34 | container = [] 35 | for name in dataFiles: 36 | df = pd.read_csv(name, encoding="ISO-8859-1", sep=';') 37 | container.append(df) 38 | df = pd.concat(container) 39 | df.columns = ['date_time', 'customer_id', 'age', 'area', 'subclass', 'product_id', 40 | 'amount', 'asset', 'sale_price'] 41 | 42 | df['profit'] = df['sale_price'] - df['asset'] 43 | df['date_time'] = df['date_time'].apply(lambda x: datetime.strptime(x[:10], '%Y-%m-%d')) 44 | df['weekday'] = df['date_time'].apply(lambda x: x.weekday()) 45 | df['month'] = df.date_time.astype(str).apply(lambda x:x[5:7]) 46 | 47 | return df 48 | 49 | 50 | @jit 51 | def add_transaction_id(df: pd.DataFrame) -> pd.DataFrame: 52 | """Create transaction id for each product purchased (row in df). Products in the same basket have the same transaction id. 53 | 54 | Args: 55 | df: Dataframe which includes all the product purchased. 56 | 57 | Returns: 58 | df: Add transaction id to the input df. 59 | 60 | """ 61 | df['trans_id'] = df['customer_id'].astype(str) + df['date_time'].astype(str) # assume each customer only make at most one transaction everyday 62 | df = df.sort_values(['trans_id']) # sort before finding the products in the same basket 63 | cust_id = df.customer_id.values[0] 64 | trans_buf_id = df.trans_id.values[0] 65 | trans = 0 66 | trans_id_list = [] 67 | for i in tqdm(range(df.shape[0])): 68 | if df.customer_id.values[i] == cust_id: 69 | if df.trans_id.values[i] == trans_buf_id: 70 | trans_id_list.append(trans) 71 | else: 72 | trans += 1 73 | trans_buf_id = df.trans_id.values[i] 74 | trans_id_list.append(trans) 75 | else: 76 | cust_id = df.customer_id.values[i] 77 | trans_buf_id = df.trans_id.values[i] 78 | trans = 0 79 | trans_id_list.append(trans) 80 | df['trans_id'] = trans_id_list 81 | 82 | return df 83 | 84 | 85 | def create_feature_for_one_customer(df_cust: pd.DataFrame, features: list, targets: list, train_test: list): 86 | """Create modeling features for each cutomer. Uses each transaction's previous two transactions information. 87 | 88 | Args: 89 | df_cust: Dataframe which includes all the transactions made by a single customer. 90 | features: a list of lists; 91 | day of the week of the transactions, difference between the current transaction and the previous two transactions in days, 92 | amount of all products, amount of the certain product, item prices, month of the transactions, product information, 93 | customer information. 94 | targets: a list of 0, 1. 0 indicates a product is not purchased in the current transaction. 95 | 1 indicates a product is purchased in the current transaction. 96 | train_test: a list of 0, 1. 
1 indicates the corresponding feature will be usesd for training. 97 | 0 indicates the corresponding feature will be used for tessting. 98 | 99 | """ 100 | 101 | if df_cust.trans_id.max() < 4: # exclude customers with less than 4 transactions 102 | return 103 | 104 | for i in range(2, df_cust.trans_id.max() + 1): # start from the third transaction of each cutomer 105 | if i == df_cust.trans_id.max(): # if it is the last order 106 | train_test = 0 107 | else: 108 | train_test = 1 109 | df_prev = df_cust.loc[df_cust.trans_id.isin([i - 1, i - 2])].copy() 110 | df_prev_1 = df_prev.loc[df_prev.trans_id == i - 1] 111 | df_prev_2 = df_prev.loc[df_prev.trans_id == i - 2] 112 | curr_product = set(df_cust.loc[df_cust.trans_id == i,'product_id']) 113 | curr_date = df_cust.loc[df_cust.trans_id == i,'date_time'].values[0] 114 | feature = [] 115 | last_weekday = df_prev_1['weekday'].values[0] 116 | last_weekday_2 = df_prev_2['weekday'].values[0] 117 | this_weekday = df_cust.loc[df_cust.trans_id == i, 'weekday'].values[0] 118 | diff_last_day = (curr_date - df_prev_1['date_time'].values[0]) / np.timedelta64(1, 'D') 119 | diff_last_day_2 = (curr_date - df_prev_2['date_time'].values[0]) / np.timedelta64(1, 'D') 120 | last_all_amount = df_prev_1['amount'].sum() 121 | last_all_amount_2 = df_prev_2['amount'].sum() 122 | cust_area = df_cust['area'].values[0] 123 | cust_age = df_cust['age'].values[0] 124 | last_month = str(df_prev_1.date_time.values[0])[5:7] 125 | last_month_2 = str(df_prev_2.date_time.values[0])[5:7] 126 | cust_id = df_cust['customer_id'].values[0] 127 | for prod_id in df_prev.product_id.unique(): 128 | if prod_id in curr_product: 129 | targets.append(1) 130 | else: 131 | targets.append(0) 132 | dfdf_last = df_prev_1.loc[(df_prev_1.product_id == prod_id)].copy() 133 | dfdf_last_2 = df_prev_2.loc[(df_prev_2.product_id == prod_id)].copy() 134 | last_amount = 0 135 | last_amount_2 = 0 136 | last_price = 0 137 | last_price_2 = 0 138 | 139 | try: 140 | last_amount = dfdf_last['amount'].values[0] 141 | last_price = dfdf_last['sale_price'].values[0] / last_amount 142 | except: 143 | None 144 | 145 | try: 146 | last_amount_2 = dfdf_last_2['amount'].values[0] 147 | last_price_2 = dfdf_last_2['sale_price'].values[0] / last_amount_2 148 | except: 149 | None 150 | 151 | subcls = df_prev.loc[df_prev.product_id == prod_id,'subclass'].values[0] 152 | feature = [last_weekday, last_weekday_2, this_weekday, diff_last_day, diff_last_day_2, last_all_amount, last_all_amount_2, 153 | last_amount, last_amount_2, last_price, last_price_2, last_month, last_month_2, int(subcls), int(prod_id), cust_area, 154 | cust_age, cust_id, train_test] 155 | features.append(feature) 156 | 157 | return 158 | 159 | 160 | @jit 161 | def create_feature(df: pd.DataFrame) -> pd.DataFrame: 162 | """Create features and targets from the transaction dataframe. 163 | 164 | Args: 165 | df: Dataframe which includes all transactions. 166 | 167 | Returns: 168 | df_feature_target: a dataframe includes features and targets. 169 | df_feature_target['Y'] are the targets. 
170 | 171 | """ 172 | 173 | cust_id = df.customer_id.values[0] 174 | start = 0 175 | features = [] # initial feature sets 176 | targets = [] # initial targets 177 | train_test = [] # initial train_test indicator 178 | for i in tqdm(range(df.shape[0])): # find transactions of each customer 179 | if df['customer_id'].values[i] == cust_id: 180 | continue 181 | df_cust = df.iloc[start:i,:].copy() 182 | create_feature_for_one_customer(df_cust, features, targets, train_test) 183 | cust_id = df.customer_id.values[i] 184 | start = i 185 | 186 | df_feature_target = pd.DataFrame(features) 187 | df_feature_target['Y'] = targets 188 | 189 | return df_feature_target 190 | 191 | 192 | def train_test_split(df_feaure_target: pd.DataFrame) -> (np.array, np.array, np.array, np.array): 193 | """Onehot encoding plus train test split. 194 | 195 | Args: 196 | df_feaure_target: Dataframe with raw features and targets. 197 | 198 | Returns: 199 | X_train: 200 | X_test: 201 | y_train: 202 | y_test: 203 | 204 | """ 205 | # onehot encoding customer area information 206 | label_encoder = LabelEncoder() 207 | lb_f = label_encoder.fit_transform(df_feaure_target[15]).reshape(-1,1) 208 | one_hot = OneHotEncoder(sparse=False) 209 | oh_f = one_hot.fit_transform(lb_f) 210 | 211 | # onehot encoding customer age information 212 | label_encoder = LabelEncoder() 213 | lb_f = label_encoder.fit_transform(df_feaure_target[16]).reshape(-1,1) 214 | one_hot = OneHotEncoder(sparse=False) 215 | oh_f_1 = one_hot.fit_transform(lb_f) 216 | 217 | bools = df_feaure_target[18] == 1 # identify training set 218 | del df_feaure_target[15], df_feaure_target[16], df_feaure_target[18] 219 | 220 | X = df_feaure_target.iloc[:,:-1].values 221 | X = np.concatenate((X, oh_f), axis=1) 222 | X = np.concatenate((X, oh_f_1), axis=1) 223 | 224 | Y = df_feaure_target.iloc[:,-1].values 225 | 226 | X_train, X_test, y_train, y_test = X[bools, :], X[~bools, :], Y[bools], Y[~bools] 227 | 228 | return X_train, X_test, y_train, y_test 229 | 230 | 231 | 232 | def plt_auc(model, X_test, Y_test, test_start): 233 | """Plot ROC area and return roc score. 234 | """ 235 | probs = model.predict_proba(X_test) 236 | 237 | preds = probs[:,1] 238 | fpr, tpr, threshold = metrics.roc_curve(Y_test, preds) 239 | roc_auc = metrics.auc(fpr, tpr) 240 | #y_pred = xgb1.predict(X[train_len:]) 241 | 242 | plt.title(str(test_start) + 'Receiver Operating Characteristic') 243 | plt.plot(fpr, tpr, 'b', label = 'AUC = %0.3f' % roc_auc) 244 | plt.legend(loc = 'lower right') 245 | plt.plot([0, 1], [0, 1],'r--') 246 | plt.xlim([0, 1]) 247 | plt.ylim([0, 1]) 248 | plt.ylabel('True Positive Rate') 249 | plt.xlabel('False Positive Rate') 250 | plt.show() 251 | return roc_auc 252 | 253 | 254 | 255 | if __name__ == "__main__": 256 | 257 | print('Reading files ...\n') 258 | df = read_file(dataFolder) 259 | 260 | print('Adding transaction numbers ...\n') 261 | df = add_transaction_id(df) 262 | 263 | 264 | # create features and targets 265 | print('Creating features ...\n') 266 | df_feature_target = create_feature(df) 267 | 268 | 269 | X_train, X_test, y_train, y_test = train_test_split(df_feature_target) 270 | 271 | num_of_eval = int(len(y_test) * 0.5) 272 | model = XGBClassifier(n_estimators=1000, max_depth=7, colsample_bytree=0.7, nthread=-1) 273 | eval_set = [(X_test[:num_of_eval, :], y_test[:num_of_eval])] 274 | 275 | # I use early stopping here to prevent model from overfitting to the training set. 276 | # Cross validation is a method to avoid overfitting when tunning the model parameters. 
    model.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="auc", eval_set=eval_set, verbose=True)

    # retrain with the number of trees reached before the model starts to overfit the evaluation set
    num_tree = 200
    model = XGBClassifier(n_estimators=num_tree, max_depth=7, colsample_bytree=0.7, nthread=-1)
    model.fit(X_train, y_train)

    # plot roc curve
    roc_auc = plt_auc(model, X_test[num_of_eval:,:], y_test[num_of_eval:], 'test')


    y_pred = model.predict_proba(X_test[num_of_eval:,:])[:,1]
    y_pred = [1 if i > 0.18 else 0 for i in y_pred] # choose a different threshold to meet different precision/recall requirements
    print("Recall: ", recall_score(y_test[num_of_eval:], y_pred))
    print("Precision: ", precision_score(y_test[num_of_eval:], y_pred))
    print("Accuracy: ", metrics.accuracy_score(y_test[num_of_eval:], y_pred))  # accuracy via the sklearn.metrics module imported above
    print("F1: ", f1_score(y_test[num_of_eval:], y_pred))

--------------------------------------------------------------------------------
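USAGE NOTE
------------------------------------------------------------------
Beyond the PCA plot, a quick sanity check of the embeddings trained in
word2vec.py is to ask the model for the nearest neighbours of a product,
i.e. the products that tend to appear in the same baskets. A minimal
sketch, assuming the model and vocab variables from word2vec.py (the
index 0 below is only an arbitrary example, not a specific product):

    some_product = vocab[0]  # any product id present in the vocabulary
    # five products whose embeddings are closest to some_product
    print(model.wv.most_similar(positive=[some_product], topn=5))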