├── README.md ├── chapter-4-data-mining ├── Data Standardization.py └── DiscreteByEntropy.py ├── chapter-7-Area-hot-rec └── RecBasedAH.py ├── chapter-8-CTR ├── GBDT_LR.py ├── LRbased.py └── GBDT_based.py ├── chapter-9-Code-Start └── SignUpInfo.py ├── chapter-2-First-Rec-Sys └── first_Rec.py ├── chapter-5-User-based-rec ├── LFM based rec.py └── content based.py └── chapter-6-Tag-based-rec └── RecBasedTag.py /README.md: -------------------------------------------------------------------------------- 1 | # recommand-system-book-project 2 | codes from the book “推荐系统开发实战” 3 | -------------------------------------------------------------------------------- /chapter-4-data-mining/Data Standardization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | 4 | 5 | """ 6 | 代码4-1 数据标准化 7 | """ 8 | 9 | 10 | class DataNorm: 11 | def __init__(self): 12 | self.arr = [1,2,3,4,5,6,7,8,9] 13 | self.x_max = max(self.arr) 14 | self.x_min = min(self.arr) 15 | self.x_mean = sum(self.arr) / len(self.arr) 16 | self.x_std = np.std(self.arr) # 标准差 17 | 18 | def Min_Max(self): 19 | arr_ = list() 20 | for x in self.arr: 21 | _x = (x-self.x_min)/(self.x_max - self.x_min) 22 | arr_.append(round(_x,4)) 23 | return arr_ 24 | 25 | def Z_Score(self): 26 | arr_ = list() 27 | for x in self.arr: 28 | arr_.append(round((x - self.x_mean) / self.x_std, 4)) 29 | return arr_ 30 | 31 | def DecimalScaling(self): # 小数定标标准化 32 | arr_ = list() 33 | j = 1 34 | x_max = max([abs(one) for one in self.arr]) 35 | while x_max /10 >= 1.0: 36 | j += 1 37 | x_max = x_max / 10 38 | for x in self.arr: 39 | arr_.append(round(x / math.pow(10,j), 4)) 40 | return arr_ 41 | 42 | def Mean(self): # 均值归一化 43 | arr_ = list() 44 | for x in self.arr: 45 | arr_.append(round((x - self.x_mean) / (self.x_max - self.x_min), 4)) 46 | return arr_ 47 | 48 | def Vector(self): # 向量归一化 49 | arr_ = list() 50 | for x in self.arr: 51 | arr_.append(round(x / sum(self.arr), 4)) 52 | return arr_ 53 | 54 | 55 | if __name__ == "__main__": 56 | nor = DataNorm() 57 | min_max = nor.Min_Max() 58 | print("{}".format(min_max)) -------------------------------------------------------------------------------- /chapter-7-Area-hot-rec/RecBasedAH.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | """ 4 | 基于地域和热度的酒店推荐 5 | """ 6 | class RecBasedAH: 7 | def __init__(self, path=None, addr="朝阳区", type="score", k=10, sort=False): 8 | self.path = path 9 | self.addr = addr 10 | self.type = type 11 | self.k = k 12 | self.sort = sort 13 | self.data = self.load_mess() 14 | 15 | def load_mess(self): 16 | data = pd.read_csv(self.path, header=0, sep=",",encoding='GBK') 17 | return data[data["addr"] == self.addr] 18 | 19 | def recommend(self): 20 | if self.type in ["score","comment_num","lowest_price","decoration_time","open_time"]: 21 | data = self.data.sort_values(by=[self.type, "lowest_price"], ascending=self.sort)[:self.k] 22 | return dict(data.filter(items=["name", self.type]).values) 23 | elif self.type == "combine": 24 | data = self.data.filter(items=["name","score","comment_num","decoration_time","open_time","lowest_price"]) 25 | data["decoration_time"] = data["decoration_time"].apply(lambda x:int(x)-2018) 26 | data["open_time"] = data["open_time"].apply(lambda x:2018-int(x)) 27 | for col in data.keys(): 28 | if col != "name": 29 | data[col] = (data[col] - data[col].min()) / (data[col].max() - data[col].min()) 30 | data[self.type] = 1 * data["score"] + 2 * data["comment_num"] + 0.5 * data["decoration_time"] +\ 31 | 0.5 * data["open_time"] + 1.5 * data["lowest_price"] 32 | data = data.sort_values(by=self.type, ascending=self.sort)[:self.k] 33 | return dict(data.filter(items=["name",self.type]).values) 34 | 35 | 36 | if __name__ == "__main__": 37 | path = "../hotel-mess/hotel-mess.csv" 38 | rbah = RecBasedAH(path,type="combine",k=10,sort=False) 39 | print(rbah.recommend()) -------------------------------------------------------------------------------- /chapter-8-CTR/GBDT_LR.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics 2 | from sklearn.metrics import mean_squared_error 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.ensemble import GradientBoostingClassifier 5 | from sklearn.linear_model import LogisticRegression 6 | import pandas as pd 7 | from sklearn.preprocessing import OneHotEncoder 8 | 9 | class GBDTWithLR: 10 | def __init__(self): 11 | self.file = "../telecom-churn/new_churn.csv" 12 | self.data = self.load_data() 13 | self.train, self.test = self.split() 14 | 15 | def load_data(self): 16 | return pd.read_csv(self.file) 17 | 18 | def split(self): 19 | train, test = train_test_split(self.data, test_size=0.1, random_state=40) 20 | return train, test 21 | 22 | def train_model(self): 23 | print("training") 24 | label = "Churn" 25 | ID = "customerID" 26 | x_columns = [x for x in self.train.columns if x not in [ID, label]] 27 | x_train = self.train[x_columns] 28 | y_train = self.train[label] 29 | 30 | gbdt = GradientBoostingClassifier() 31 | gbdt.fit(x_train,y_train) 32 | 33 | gbdt_lr = LogisticRegression(max_iter=3000) 34 | enc = OneHotEncoder() 35 | enc.fit(gbdt.apply(x_train).reshape(-1,100)) 36 | 37 | gbdt_lr.fit(enc.transform(gbdt.apply(x_train).reshape(-1,100)),y_train) 38 | return enc, gbdt, gbdt_lr 39 | 40 | def evaluate(self,enc,gbdt,gbdt_lr): 41 | print("evaluating") 42 | label = "Churn" 43 | ID = "customerID" 44 | x_columns = [x for x in self.train.columns if x not in [ID, label]] 45 | x_test = self.test[x_columns] 46 | y_test = self.test[label] 47 | 48 | gbdt_pred = gbdt.predict(x_test) 49 | print("GBDT accuracy: %.4g" % metrics.accuracy_score(y_test.values, gbdt_pred)) 50 | 51 | gbdt_lr_pred = gbdt_lr.predict(enc.transform(gbdt.apply(x_test).reshape(-1,100))) 52 | print("GBDT_LR accuracy: %.4g" % metrics.accuracy_score(y_test.values, gbdt_lr_pred)) 53 | 54 | 55 | if __name__ == "__main__": 56 | new_model= GBDTWithLR() 57 | enc, gbdt, gbdt_lr = new_model.train_model() 58 | new_model.evaluate(enc, gbdt, gbdt_lr) -------------------------------------------------------------------------------- /chapter-8-CTR/LRbased.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import LogisticRegression 2 | from sklearn.model_selection import train_test_split 3 | from sklearn import metrics 4 | from sklearn.metrics import mean_squared_error 5 | import pandas as pd 6 | 7 | 8 | class ChurnPredWithLR: 9 | def __init__(self): 10 | self.file = "../telecom-churn/new_churn.csv" 11 | self.data = self.load_data() 12 | self.train, self.test = self.split() 13 | 14 | def load_data(self): 15 | data = pd.read_csv(self.file) 16 | labels = list(data.keys()) 17 | fDict = dict() 18 | for f in labels: 19 | if f not in ['customerID','tenure','MonthlyCharges','TotalCharges','Churn']: 20 | fDict[f] = sorted(list(data.get(f).unique())) 21 | fw = open("../telecom-churn/one_hot_churn.csv","w") 22 | fw.write("customerID,") 23 | for i in range(1,47): 24 | fw.write('f_%s,' % i) 25 | fw.write("Churn\n") 26 | for line in data.values: 27 | list_line = list(line) 28 | list_result = list() 29 | for i in range(0, list_line.__len__()): 30 | if labels[i] in ['customerID','tenure','MonthlyCharges','TotalCharges','Churn']: 31 | list_result.append(list_line[i]) 32 | else: 33 | arr = [0] * fDict[labels[i]].__len__() 34 | ind = fDict[labels[i]].index(list_line[i]) 35 | arr[ind] = 1 36 | for one in arr: 37 | list_result.append(one) 38 | #list_result.append(arr) 39 | fw.write(",".join([str(f) for f in list_result]) + "\n") 40 | fw.close() 41 | return pd.read_csv("../telecom-churn/one_hot_churn.csv") 42 | 43 | def split(self): 44 | train, test = train_test_split(self.data, test_size=0.1, random_state=40) 45 | return train, test 46 | 47 | def train_model(self): 48 | print("Start Train Model ...") 49 | label = "Churn" 50 | ID = "customerID" 51 | x_columns = [x for x in self.train.columns if x not in [ID, label]] 52 | x_train = self.train[x_columns] 53 | y_train = self.train[label] 54 | lr = LogisticRegression(penalty='l2', tol=1e-4, fit_intercept=True) 55 | lr.fit(x_train, y_train) 56 | return lr 57 | 58 | def evaluate(self,lr,type): 59 | label = "Churn" 60 | ID = "customerID" 61 | x_columns = [x for x in self.train.columns if x not in [ID, label]] 62 | x_test = self.test[x_columns] 63 | y_test = self.test[label] 64 | if type == 1: 65 | y_pred = lr.predict(x_test) 66 | new_y_pre = y_pred 67 | elif type == 2: 68 | y_pred = lr.predict_proba(x_test) 69 | new_y_pre = list() 70 | for y in y_pred: 71 | new_y_pre.append(1 if y[1]>0.5 else 0) 72 | accuracy = metrics.accuracy_score(y_test.values, new_y_pre) 73 | print(accuracy) 74 | 75 | 76 | if __name__ == "__main__": 77 | pred = ChurnPredWithLR() 78 | lr = pred.train_model() 79 | pred.evaluate(lr,type=1) -------------------------------------------------------------------------------- /chapter-4-data-mining/DiscreteByEntropy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | 4 | """ 5 | 代码4-2 基于信息熵的数据离散化 6 | """ 7 | 8 | 9 | class DiscreteByEntropy: 10 | def __init__(self, group, threshold): 11 | self.maxGroup = group # 最大分组数 12 | self.minInfoThreshold = threshold # 停止划分的最小熵 13 | self.result = dict() # 保存划分结果 14 | 15 | def loadData(self): 16 | data = np.array( 17 | [ 18 | [56, 1], [87, 1], [129, 0], [23, 0], [342, 1], 19 | [641, 1], [63, 0], [2764, 1], [2323, 0], [453, 1], 20 | [10, 1], [9, 0], [88, 1], [222, 0], [97, 0], 21 | [2398, 1], [592, 1], [561, 1], [764, 0], [121, 1] 22 | ] 23 | ) 24 | return data 25 | 26 | def calEntropy(self,data): # 计算熵 27 | numData = len(data) 28 | labelCounts = {} 29 | for feature in data: 30 | oneLabel = feature[-1] 31 | labelCounts.setdefault(oneLabel,0) 32 | labelCounts[oneLabel] += 1 33 | shannonEnt = 0.0 34 | for key in labelCounts: 35 | prob = float(labelCounts[key]) / numData 36 | shannonEnt -= prob * math.log(prob, 2) 37 | return shannonEnt 38 | 39 | def split(self, data): 40 | # inf为正无穷大 41 | minEntropy = np.inf 42 | # 记录最终分割索引 43 | index = -1 44 | sortData = data[np.argsort(data[:, 0])] 45 | lastE1, lastE2 = -1, -1 46 | S1 = dict() 47 | S2 = dict() 48 | for i in range(len(sortData)): 49 | splitData1, splitData2 = sortData[: i+1], sortData[i+1 :] 50 | entropy1, entropy2 = ( 51 | self.calEntropy(splitData1), 52 | self.calEntropy(splitData2) 53 | ) 54 | entropy = entropy1 * len(splitData1) / len(sortData) + \ 55 | entropy2 * len(splitData2) / len(sortData) 56 | if entropy < minEntropy: 57 | minEntropy = entropy 58 | index = i 59 | lastE1 = entropy1 60 | lastE2 = entropy2 61 | S1["entropy"] = lastE1 62 | S1["data"] = sortData[: index + 1] 63 | S2["entropy"] = lastE2 64 | S2["data"] = sortData[index + 1 :] 65 | return S1, S2, minEntropy 66 | 67 | def train(self, data): # 离散化处理 68 | needSplitKey = [0] 69 | self.result.setdefault(0,{}) 70 | self.result[0]["entropy"] = np.inf 71 | self.result[0]["data"] = data 72 | group = 1 73 | for key in needSplitKey: 74 | S1, S2, entropy = self.split(self.result[key]["data"]) 75 | if entropy > self.minInfoThreshold and group < self.maxGroup: 76 | self.result[key] = S1 77 | newKey = max(self.result.keys()) + 1 78 | self.result[newKey] = S2 79 | needSplitKey.extend([key]) 80 | needSplitKey.extend([newKey]) 81 | group += 1 82 | else: 83 | break 84 | 85 | 86 | if __name__ == "__main__": 87 | dbe = DiscreteByEntropy(group=6, threshold=0.5) 88 | data = dbe.loadData() 89 | dbe.train(data) 90 | print("result is {}".format(dbe.result)) -------------------------------------------------------------------------------- /chapter-9-Code-Start/SignUpInfo.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | 5 | class UserShow: 6 | def __init__(self): 7 | self.file_user = "../bookcrossings/BX-Users.csv" 8 | self.file_book = "../bookcrossings/BX-Books.csv" 9 | self.file_rate = "../bookcrossings/BX-Book-Ratings.csv" 10 | self.user_mess = self.loadUserData() 11 | self.book_mess = self.loadBookMess() 12 | self.user_book = self.loadUserBook() 13 | 14 | def loadUserData(self): 15 | user_mess = dict() 16 | for line in open(self.file_user,"r",encoding="ISO-8859-1"): 17 | if line.startswith("\"User-ID\""): 18 | continue 19 | if len(line.split(";")) != 3: 20 | continue 21 | line = line.strip().replace(" ","") 22 | userid, addr, age = [one.replace("\"","") for one in line.split(";")] 23 | if age == "NULL" or int(age) not in range(1,120): 24 | continue 25 | user_mess.setdefault(userid,{}) 26 | user_mess[userid]["age"] = int(age) 27 | if len(addr.split(",")) < 3: 28 | continue 29 | city, province, country = addr.split(",")[-3:] 30 | user_mess[userid]["country"] = country 31 | user_mess[userid]["province"] = province 32 | user_mess[userid]["city"] = city 33 | return user_mess 34 | 35 | def loadBookMess(self): 36 | book_mess = dict() 37 | for line in open(self.file_book,"r",encoding="ISO-8859-1"): 38 | if line.startswith("\"ISBN\""): 39 | continue 40 | isbn, book_name = line.replace("\"","").split(";")[:2] 41 | book_mess[isbn] = book_name 42 | return book_mess 43 | 44 | def loadUserBook(self): 45 | user_book = dict() 46 | for line in open(self.file_rate,"r",encoding="ISO-8859-1"): 47 | if line.startswith("\"User-ID\""): 48 | continue 49 | user_id, isbn, rating = line.replace("\"","").split(";")[:3] 50 | user_book.setdefault(user_id,list()) 51 | if int(rating) > 5: 52 | user_book[user_id].append(isbn) 53 | return user_book 54 | 55 | def show(self, X, Y, X_label, Y_label="数目"): 56 | plt.xlabel(X_label) 57 | plt.ylabel(Y_label) 58 | plt.xticks(np.arange(len(X)),X, rotation = 90) 59 | for a, b in zip(np.arange(len(X)),Y): 60 | plt.text(a, b, b, rotation = 45) 61 | plt.bar(np.arange(len(X)),Y) 62 | plt.show() 63 | 64 | def diffAge(self): 65 | age_user = dict() 66 | for key in self.user_mess.keys(): 67 | age_split = int(int(self.user_mess[key]["age"]) / 10) 68 | age_user.setdefault(age_split,0) 69 | age_user[age_split] += 1 70 | age_user_sort = sorted(age_user.items(), key=lambda x:x[0], reverse=False) 71 | X = [x[0] for x in age_user_sort] 72 | Y = [x[1] for x in age_user_sort] 73 | print(age_user_sort) 74 | self.show(X,Y, X_label="用户年龄段") 75 | 76 | def diffpro(self): 77 | pro_user = dict() 78 | for key in self.user_mess.keys(): 79 | if "province" in self.user_mess[key].keys() and self.user_mess[key]["province"] != "n/a": 80 | pro_user.setdefault(self.user_mess[key]["province"], 0) 81 | pro_user[self.user_mess[key]]["province"] += 1 82 | pro_user_sort = sorted(pro_user.items(), key=lambda x:x[1],reverse=True)[:20] 83 | X = [x[0] for x in pro_user_sort] 84 | Y = [x[1] for x in pro_user_sort] 85 | print(pro_user_sort) 86 | self.show(X, Y, X_label="用户所在州") 87 | 88 | def diffUserAge(self): 89 | age_books = dict() 90 | age_books.setdefault(1,dict()) 91 | age_books.setdefault(2,dict()) 92 | for key in self.user_mess.keys(): 93 | if "country" not in self.user_mess[key].keys(): 94 | continue 95 | if key not in self.user_book.keys(): 96 | continue 97 | if int(self.user_mess[key]["age"]) in range(0,30): 98 | for book in self.user_book[key]: 99 | if book not in self.book_mess.keys(): 100 | continue 101 | age_books[1].setdefault(book,0) 102 | age_books[1][book] += 1 103 | if int(self.user_mess[key]["age"]) in range(50,120): 104 | for book in self.user_book[key]: 105 | if book not in self.book_mess.keys(): 106 | continue 107 | age_books[2].setdefault(book,0) 108 | age_books[2][book] += 1 109 | print("年龄在30岁以下的用户偏好共性图书top10") 110 | for one in sorted(age_books[1].items(), key=lambda x:x[1], reverse=True)[:10]: 111 | print(self.book_mess[one[0]]) 112 | print("年龄在50岁以上的用户偏好共性图书top10") 113 | for one in sorted(age_books[2].items(), key=lambda x:x[1], reverse=True)[:10]: 114 | print(self.book_mess[one[0]]) 115 | 116 | 117 | if __name__ == "__main__": 118 | ushow = UserShow() 119 | ushow.diffUserAge() -------------------------------------------------------------------------------- /chapter-2-First-Rec-Sys/first_Rec.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import random 4 | import math 5 | 6 | """ 7 | 第二章内容,搭建第一个推荐系统 8 | """ 9 | 10 | 11 | class FirstRec: 12 | """ 13 | k 近邻用户个数 14 | n_items 为每个用户推荐的电影数 15 | """ 16 | def __init__(self,file_path,seed,k,n_items): 17 | self.file_path = file_path 18 | self.users_1000 = self.__select_1000_users() 19 | self.seed = seed 20 | self.k = k 21 | self.n_items = n_items 22 | self.train,self.test = self._load_and_split_data() 23 | 24 | def __select_1000_users(self): 25 | print("随机选取1000个用户!") 26 | if os.path.exists("../data/train.json") and os.path.exists("../data/test.json"): 27 | return list() 28 | else: 29 | users = set() 30 | for file in os.listdir(self.file_path): 31 | one_path = "{}/{}".format(self.file_path,file) 32 | print("{}".format(one_path)) 33 | with open(one_path, "r") as fp: 34 | for line in fp.readlines(): 35 | if line.strip().endswith(":"): 36 | continue 37 | userID, _, _ = line.split(",") 38 | users.add(userID) 39 | users_1000 = random.sample(list(users),1000) 40 | print(users_1000) 41 | return users_1000 42 | 43 | def _load_and_split_data(self): 44 | train = dict() 45 | test = dict() 46 | if os.path.exists("../data/train.json") and os.path.exists("../data/test.json"): 47 | print("从文件中加载训练集和测试集") 48 | train = json.load(open("../data/train.json")) 49 | test = json.load(open("../data/test.json")) 50 | print("从文件中加载数据完成") 51 | else: 52 | random.seed(self.seed) 53 | for file in os.listdir(self.file_path): 54 | one_path = "{}/{}".format(self.file_path,file) 55 | print("{}".format(one_path)) 56 | with open(one_path,"r") as fp: 57 | movieID = fp.readline().split(":")[0] 58 | for line in fp.readlines(): 59 | if line.strip().endswith(":"): 60 | movieID = line.split(":")[0] 61 | continue 62 | userID, rate, _ = line.split(",") 63 | if userID in self.users_1000: 64 | if random.randint(1,50) == 1: 65 | test.setdefault(userID, {})[movieID] = int(rate) 66 | else: 67 | train.setdefault(userID, {})[movieID] = int(rate) 68 | print("加载数据到 ../data/train.json 和 ../data/test.json") 69 | json.dump(train, open("../data/train.json", "w")) 70 | json.dump(test, open("../data/test.json", "w")) 71 | print("加载完成") 72 | return train, test 73 | 74 | def pearson(self,rating1,rating2): 75 | sum_xy = 0 76 | sum_x = 0 77 | sum_y = 0 78 | sum_x2 = 0 79 | sum_y2 = 0 80 | num = 0 81 | for key in rating1.keys(): 82 | if key in rating2.keys(): 83 | num += 1 84 | x = rating1[key] 85 | y = rating2[key] 86 | sum_xy += x * y 87 | sum_x += x 88 | sum_y += y 89 | sum_x2 += math.pow(x,2) 90 | sum_y2 += math.pow(y,2) 91 | if num == 0: 92 | return 0 93 | de = math.sqrt(sum_x2 - math.pow(sum_x,2)/num) * math.sqrt(sum_y2 - math.pow(sum_y,2)/num) 94 | if de == 0: 95 | return 0 96 | else: 97 | return (sum_xy - (sum_x * sum_y) / num) / de 98 | 99 | def recommend(self,userID): 100 | neighborUser = dict() 101 | for user in self.train.keys(): 102 | if user != userID: 103 | distance = self.pearson(self.train[user],self.train[userID]) 104 | neighborUser[user] = distance 105 | newNU = sorted(neighborUser.items(), key=lambda m: m[1], reverse= True) 106 | movies = dict() 107 | for (sim_user,sim) in newNU[:self.k]: 108 | for movieID in self.train[sim_user].keys(): 109 | movies.setdefault(movieID,0) 110 | movies[movieID] += sim * self.train[sim_user][movieID] 111 | newMovies = sorted(movies.items(),key = lambda m:m[1],reverse=True) 112 | return newMovies 113 | 114 | def evaluate(self,num=30): 115 | print("开始计算准确率") 116 | precisions = list() 117 | random.seed(10) 118 | for userID in random.sample(self.test.keys(),num): 119 | hit = 0 120 | result = self.recommend(userID)[:self.n_items] 121 | for (item,rate) in result: 122 | if item in self.test[userID]: 123 | hit += 1 124 | precisions.append(hit/self.n_items) 125 | return sum(precisions) / precisions.__len__() 126 | 127 | 128 | if __name__ == "__main__": 129 | file_path = "../data/training_set" 130 | seed = 30 131 | k = 15 132 | n_items = 20 133 | f_rec = FirstRec(file_path,seed,k,n_items) 134 | print("算法的推荐准确率{}".format(f_rec.evaluate())) -------------------------------------------------------------------------------- /chapter-5-User-based-rec/LFM based rec.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pickle 3 | import os 4 | import random 5 | import numpy as np 6 | from math import exp 7 | import time 8 | 9 | class DataProcessing: 10 | def __init__(self): 11 | pass 12 | 13 | def get_pos_neg_item(self, file_path = "../ml-1m/ratings.csv"): 14 | if not os.path.exists("../ml-1m/lfm_items.dict"): 15 | self.items_dict_path = "../ml-1m/lfm_items.dict" 16 | self.uiscores = pd.read_csv(file_path) 17 | self.user_ids = set(self.uiscores["UserID"].values) 18 | self.item_ids = set(self.uiscores["MovieID"].values) 19 | self.items_dict = {user_id: self.get_one(user_id) for user_id in list(self.user_ids)} 20 | fw = open(self.items_dict_path, "wb") 21 | pickle.dump(self.items_dict, fw) 22 | fw.close() 23 | 24 | def get_one(self, user_id): 25 | print('为用户%s准备正向和负向数据...' % user_id) 26 | pos_item_ids = set(self.uiscores[self.uiscores['UserID'] == user_id]['MovieID']) 27 | neg_item_ids = self.item_ids ^ pos_item_ids # “^”为异或符号 28 | neg_item_ids = list(neg_item_ids)[:len([pos_item_ids])] 29 | item_dict = {} 30 | for item in pos_item_ids: 31 | item_dict[item] = 1 32 | for item in neg_item_ids: 33 | item_dict[item] = 0 34 | return item_dict 35 | 36 | 37 | class LFM: 38 | def __init__(self): 39 | self.class_count = 5 40 | self.iter_count = 5 41 | self.lr = 0.02 42 | self.lam = 0.01 43 | self._init_model() 44 | 45 | def _init_model(self): 46 | file_path = '../ml-1m/ratings.csv' 47 | pos_neg_path = '../ml-1m/lfm_items.dict' 48 | self.uiscores = pd.read_csv(file_path) 49 | self.user_ids = set(self.uiscores['UserID'].values) 50 | self.item_ids = set(self.uiscores['MovieID'].values) 51 | self.items_dict = pickle.load(open(pos_neg_path,'rb')) 52 | 53 | array_p = np.random.randn(len(self.user_ids),self.class_count) 54 | array_q = np.random.randn(len(self.item_ids),self.class_count) 55 | self.p = pd.DataFrame(array_p, columns=range(0, self.class_count), index=list(self.user_ids)) 56 | self.q = pd.DataFrame(array_q, columns=range(0, self.class_count), index=list(self.item_ids)) 57 | 58 | def _predict(self, user_id, item_id): 59 | p = np.mat(self.p.ix[user_id].values) 60 | q = np.mat(self.q.ix[item_id].values).T 61 | r = (p * q).sum() 62 | logit = 1.0 / (1 + exp(-r)) 63 | return logit 64 | 65 | def _loss(self, user_id, item_id, y, step): 66 | e = y - self._predict(user_id, item_id) 67 | return e 68 | 69 | def _optimize(self, user_id, item_id, e): 70 | gradient_p = -e * self.q.ix[item_id].values 71 | l2_p = self.lam * self.p.ix[user_id].values 72 | delta_p = self.lr * (gradient_p + l2_p) 73 | 74 | gradient_q = -e * self.p.ix[user_id].values 75 | l2_q = self.lam * self.q.ix[item_id].values 76 | delta_q = self.lr * (gradient_q + l2_q) 77 | 78 | self.p.loc[user_id] -= delta_p 79 | self.q.loc[item_id] -= delta_q 80 | 81 | def train(self): 82 | for step in range(0,self.iter_count): 83 | time.sleep(30) 84 | for user_id, item_dict in self.items_dict.items(): 85 | print('step: {}, user_id: {}'.format(step, user_id)) 86 | item_ids = list(item_dict.keys()) 87 | random.shuffle(item_ids) 88 | for item_id in item_ids: 89 | e = self._loss(user_id, item_id, item_dict[item_id], step) 90 | self._optimize(user_id, item_id, e) 91 | self.lr *= 0.9 92 | self.save() 93 | 94 | def save(self): 95 | f = open('../ml-1m/lfm.model','wb') 96 | pickle.dump((self.p, self.q), f) 97 | f.close() 98 | 99 | def load(self): 100 | f = open('../ml-1m/lfm.model','rb') 101 | self.p, self.q = pickle.load(f) 102 | f.close() 103 | 104 | def predict(self, user_id, top_n = 10): 105 | self.load() 106 | user_item_ids = set(self.uiscores[self.uiscores['UserID'] == user_id]['MovieID']) 107 | other_item_ids = self.item_ids ^ user_item_ids 108 | interest_list = [self._predict(user_id, item_id) for item_id in other_item_ids] 109 | candidates = sorted(zip(list(other_item_ids), interest_list), key= lambda x:x[1], reverse=True) 110 | return candidates[:top_n] 111 | 112 | def evaluate(self): 113 | self.load() 114 | users = random.sample(self.user_ids, 10) 115 | user_dict = {} 116 | for user in users: 117 | user_item_ids = set(self.uiscores[self.uiscores['UserID'] == user]['MovieID']) 118 | _sum = 0.0 119 | for item_id in user_item_ids: 120 | p = np.mat(self.p.ix[user].values) 121 | q = np.mat(self.q.ix[item_id].values).T 122 | _r = (p * q).sum() 123 | r = self.uiscores[(self.uiscores['UserID'] == user)&(self.uiscores['MovieID']==item_id)]["Rating"].values[0] 124 | _sum += abs(r - _r) 125 | user_dict[user] = _sum/len(user_item_ids) 126 | print("user: {} AI: {}".format(user,user_dict[user])) 127 | return sum(user_dict.values()) / len(user_dict.keys()) 128 | 129 | 130 | if __name__ == '__main__': 131 | # dp = DataProcessing() 132 | # dp.get_pos_neg_item() 133 | lfm = LFM() 134 | #lfm.train() 135 | #print(lfm.predict(6027,10)) 136 | print(lfm.evaluate()) -------------------------------------------------------------------------------- /chapter-5-User-based-rec/content based.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | import numpy as np 4 | import math 5 | import random 6 | 7 | 8 | class DataProcessing: 9 | def __init__(self): 10 | pass 11 | 12 | def process(self): 13 | print('开始转换用户数据(users.dat)...') 14 | self.process_user_data() 15 | print('开始转换电影数据(movies.dat)...') 16 | self.process_movie_data() 17 | print('开始转换用户对电影评分数据(ratings.dat)') 18 | self.process_rating_data() 19 | print('Over!') 20 | 21 | def process_user_data(self, file='../ml-1m/users.dat'): 22 | fp = pd.read_table(file, sep='::', engine='python', names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']) 23 | fp.to_csv('../ml-1m/users.csv', index=False) 24 | 25 | def process_rating_data(self, file='../ml-1m/ratings.dat'): 26 | fp = pd.read_table(file, sep='::', engine='python', names=['UserID', 'MovieID', 'Rating', 'Timestamp']) 27 | fp.to_csv('../ml-1m/ratings.csv', index=False) 28 | 29 | def process_movie_data(self, file='../ml-1m/movies.dat'): 30 | fp = pd.read_table(file, sep='::', engine='python', names=['MovieID', 'Title', 'Genres']) 31 | fp.to_csv('../ml-1m/movies.csv', index=False) 32 | 33 | def prepare_item_profile(self,file='../ml-1m/movies.csv'): # 计算电影特征信息矩阵 34 | items = pd.read_csv(file) 35 | item_ids = set(items["MovieID"].values) 36 | self.item_dict = {} 37 | genres_all = list() 38 | # 将每个电影的类型放在item_dict中 39 | for item in item_ids: 40 | genres = items[items["MovieID"] == item]["Genres"].values[0].split("|") 41 | self.item_dict.setdefault(item, []).extend(genres) 42 | genres_all.extend(genres) 43 | self.genres_all = set(genres_all) 44 | # 将每个电影特征信息矩阵存放在self.item_matrix中 45 | self.item_matrix = {} 46 | for item in self.item_dict.keys(): 47 | self.item_matrix[str(item)] = [0] * len(set(self.genres_all)) 48 | for genre in self.item_dict[item]: 49 | index = list(set(genres_all)).index(genre) 50 | self.item_matrix[str(item)][index] = 1 51 | json.dump(self.item_matrix, open('../ml-1m/item_profile.json', 'w')) 52 | print("item信息计算完成,保存路径为'../ml-1m/item_profile.json'") 53 | 54 | def prepare_user_profile(self,file='../ml-1m/ratings.csv'): # 计算用户偏好矩阵 55 | users = pd.read_csv(file) 56 | user_ids = set(users["UserID"].values) 57 | # 将user信息转换成dict 58 | users_rating_dict = {} 59 | for user in user_ids: 60 | users_rating_dict.setdefault(str(user),{}) 61 | with open(file,"r") as fr: 62 | for line in fr.readlines(): 63 | if not line.startswith("UserID"): 64 | (user, item, rate) = line.split(",")[:3] 65 | users_rating_dict[user][item] = int(rate) 66 | # 获取用户对每个类型下的哪些电影进行了评分 67 | self.user_matrix = {} 68 | for user in users_rating_dict.keys(): 69 | score_list = users_rating_dict[user].values() 70 | avg = sum(score_list)/len(score_list) 71 | self.user_matrix[user] = [] 72 | for genre in self.genres_all: 73 | score_all = 0.0 74 | score_len = 0 75 | for item in users_rating_dict[user].keys(): 76 | if genre in self.item_dict[int(item)]: 77 | score_all += (users_rating_dict[user][item]-avg) 78 | score_len += 1 79 | if score_len == 0: 80 | self.user_matrix[user].append(0.0) 81 | else: 82 | self.user_matrix[user].append(score_all/score_len) 83 | json.dump(self.user_matrix,open('../ml-1m/user_profile.json','w')) 84 | print("user信息计算完成,保存路径'../ml-1m/user_profile.json'") 85 | 86 | 87 | class CBRecommend: 88 | def __init__(self,k): 89 | self.k = k # 给用户推荐的item个数 90 | self.item_profile = json.load(open("../ml-1m/item_profile.json","r")) 91 | self.user_profile = json.load(open("../ml-1m/user_profile.json","r")) 92 | 93 | def get_none_score_item(self,user): 94 | items = pd.read_csv("../ml-1m/movies.csv")["MovieID"].values 95 | data = pd.read_csv("../ml-1m/ratings.csv") 96 | have_score_items = data[data["UserID"]==user]["MovieID"].values 97 | none_score_items = set(items)-set(have_score_items) 98 | return none_score_items 99 | 100 | def cosUI(self,user,item): 101 | Uia = sum(np.array(self.user_profile[str(user)]) * np.array(self.item_profile[str(item)])) 102 | Ua = math.sqrt(sum([math.pow(one,2) for one in self.user_profile[str(user)]])) 103 | Ia = math.sqrt(sum([math.pow(one,2) for one in self.item_profile[str(item)]])) 104 | return Uia / (Ua * Ia) 105 | 106 | def recommend(self,user): 107 | user_result = {} 108 | item_list = self.get_none_score_item(user) 109 | for item in item_list: 110 | user_result[item] = self.cosUI(user, item) 111 | if self.k is None: 112 | result = sorted(user_result.items(), key=lambda a:a[1], reverse=True) 113 | else: 114 | result = sorted(user_result.items(), key=lambda a:a[1], reverse=True)[:self.k] 115 | print(result) 116 | 117 | def evaluate(self): 118 | evas = [] 119 | data = pd.read_csv("../ml-1m/ratings.csv") 120 | for user in random.sample([one for one in range(1,6041)],20): 121 | have_score_items = data 122 | 123 | 124 | 125 | if __name__ == '__main__': 126 | # dp = DataProcessing() 127 | # dp.process() 128 | # dp.prepare_item_profile() 129 | # dp.prepare_user_profile() 130 | cb = CBRecommend(k=10) 131 | cb.recommend(1) 132 | -------------------------------------------------------------------------------- /chapter-6-Tag-based-rec/RecBasedTag.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import math 3 | 4 | 5 | class RecBasedTag: 6 | def __init__(self): 7 | # 用户听过艺术家次数的文件 8 | self.user_rate_file = "../lastfm-2k/user_artists.dat" 9 | # 用户打标信息 10 | self.user_tag_file = "../lastfm-2k/user_taggedartists.dat" 11 | 12 | # 获取所有艺术家ID 13 | self.artistsAll = list(pd.read_table("../lastfm-2k/artists.dat", delimiter="\t")["id"].values) 14 | # 用户对艺术家的评分 15 | self.userRateDict = self.getUserRate() 16 | # 艺术家和标签的相关度 17 | self.artistsTagsDict = self.getArtistsTags() 18 | # 用户对每个标签打标的次数统计和每个标签被所有用户打标的次数统计 19 | self.userTagDict, self.tagUserDict = self.getUserTagNum() 20 | # 用户最终对每个标签的喜好程度 21 | self.userTagPre = self.getUserTagPre() 22 | 23 | 24 | def getUserRate(self): 25 | userRateDict = dict() 26 | fr = open(self.user_rate_file, "r", encoding="utf-8") 27 | for line in fr.readlines(): 28 | if not line.startswith("userID"): 29 | userID, artistID, weight = line.split("\t") 30 | userRateDict.setdefault(int(userID),{}) 31 | userRateDict[int(userID)][int(artistID)] = float(weight) / 10000 32 | return userRateDict 33 | 34 | def getUserTagNum(self): 35 | userTagDict = dict() 36 | tagUserDict = dict() 37 | for line in open(self.user_tag_file, "r", encoding="utf-8"): 38 | if not line.startswith("userID"): 39 | userID, artistID, tagID = line.strip().split("\t")[:3] 40 | if int(tagID) in tagUserDict.keys(): 41 | tagUserDict[int(tagID)] += 1 42 | else: 43 | tagUserDict[int(tagID)] = 1 44 | userTagDict.setdefault(int(userID),{}) 45 | if int(tagID) in userTagDict[int(userID)].keys(): 46 | userTagDict[int(userID)][int(tagID)] += 1 47 | else: 48 | userTagDict[int(userID)][int(tagID)] = 1 49 | return userTagDict, tagUserDict 50 | 51 | def getArtistsTags(self): # 标签基因 52 | artistsTagsdict = dict() 53 | for line in open(self.user_tag_file, "r", encoding="utf-8"): 54 | if not line.startswith("userID"): 55 | artistID, tagID = line.strip().split("\t")[1:3] 56 | artistsTagsdict.setdefault(int(artistID),{}) 57 | artistsTagsdict[int(artistID)][int(tagID)] = 1 58 | return artistsTagsdict 59 | 60 | def getUserTagPre(self): # 用户对标签最终兴趣度 61 | userTagPre = dict() 62 | userTagCount = dict() 63 | Num = len(open(self.user_tag_file, "r", encoding="utf-8").readlines()) 64 | for line in open(self.user_tag_file, "r", encoding="utf-8").readlines(): 65 | if not line.startswith("userID"): 66 | userID, artistID, tagID = line.strip().split("\t")[:3] 67 | userTagPre.setdefault(int(userID),{}) 68 | userTagCount.setdefault(int(userID),{}) 69 | rate_ui = (self.userRateDict[int(userID)][int(artistID)] if int(artistID) in self.userRateDict[int(userID)].keys() 70 | else 0) 71 | if int(tagID) not in userTagPre[int(userID)].keys(): 72 | userTagPre[int(userID)][int(tagID)] = rate_ui * self.artistsTagsDict[int(artistID)][int(tagID)] 73 | userTagCount[int(userID)][int(tagID)] = 1 74 | else: 75 | userTagPre[int(userID)][int(tagID)] += rate_ui * self.artistsTagsDict[int(artistID)][int(tagID)] 76 | userTagCount[int(userID)][int(tagID)] += 1 77 | for userID in userTagPre.keys(): 78 | for tagID in userTagPre[userID].keys(): 79 | tf_ut = self.userTagDict[int(userID)][int(tagID)]/sum(self.userTagDict[int(userID)].values()) 80 | idf_ut = math.log(Num * 1.0/(self.tagUserDict[int(tagID)]+1)) 81 | userTagPre[userID][tagID] = userTagPre[userID][tagID]/userTagCount[userID][tagID] * tf_ut * idf_ut 82 | return userTagPre 83 | 84 | def recommendForUser(self, user, K, flag=True): 85 | userArtistPreDict = dict() 86 | for artist in self.artistsAll: 87 | if int(artist) in self.artistsTagsDict.keys(): 88 | for tag in self.userTagPre[int(user)].keys(): 89 | rate_ut = self.userTagPre[int(user)][int(tag)] 90 | rel_it = (0 if tag not in self.artistsTagsDict[int(artist)].keys() 91 | else self.artistsTagsDict[int(artist)][tag]) 92 | if artist in userArtistPreDict.keys(): 93 | userArtistPreDict[int(artist)] += rate_ut * rel_it 94 | else: 95 | userArtistPreDict[int(artist)] = rate_ut * rel_it 96 | newUserArtistPreDict = dict() 97 | if flag: 98 | for artist in userArtistPreDict.keys(): 99 | if artist not in self.userRateDict[int(user)].keys(): 100 | newUserArtistPreDict[artist] = userArtistPreDict[int(artist)] 101 | return sorted(newUserArtistPreDict.items(), key = lambda y:y[1], reverse=True)[:K] 102 | else: 103 | # 用来效果评估 104 | return sorted(userArtistPreDict.items(), key = lambda y:y[1], reverse=True)[:K] 105 | 106 | def evaluate(self, user): 107 | K = len(self.userRateDict[int(user)]) 108 | recResult = self.recommendForUser(user, K=K, flag=False) 109 | count = 0 110 | for (artist, pre) in recResult: 111 | if artist in self.userRateDict[int(user)]: 112 | count += 1 113 | return count * 1.0 / K 114 | 115 | 116 | if __name__ == "__main__": 117 | rbt = RecBasedTag() 118 | # print(rbt.recommendForUser("2",K=20)) 119 | print(rbt.evaluate("2")) 120 | 121 | -------------------------------------------------------------------------------- /chapter-8-CTR/GBDT_based.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split 2 | from sklearn.ensemble import GradientBoostingClassifier 3 | from sklearn import metrics 4 | from sklearn.metrics import mean_squared_error 5 | import pandas as pd 6 | import os 7 | 8 | 9 | class ChurnPredWithGBDT: 10 | def __init__(self): 11 | self.file = "../telecom-churn/telecom-churn-prediction-data.csv" 12 | self.data = self.feature_transform() 13 | self.train, self.test = self.split_data() 14 | 15 | def isNone(self, value): 16 | if value == " " or value is None: 17 | return "0.0" 18 | else: 19 | return value 20 | 21 | def feature_transform(self): 22 | if not os.path.exists("../telecom-churn/new_churn.csv"): 23 | print("Start feature transform ...") 24 | feature_dict = { 25 | "gender": {"Male":"1","Female":"0"}, 26 | "Partner": {"Yes":"1","No":"0"}, 27 | "Dependents": {"Yes":"1","No":"0"}, 28 | "PhoneService": {"Yes":"1","No":"0"}, 29 | "MultipleLines": {"Yes":"1","No":"0","No phone service":"2"}, 30 | "InternetService": {"DSL":"1","Fiber optic":"2","No":"0"}, 31 | "OnlineSecurity": {"Yes":"1","No":"0","No internet service":"2"}, 32 | "OnlineBackup": {"Yes":"1","No":"0","No internet service":"2"}, 33 | "DeviceProtection": {"Yes":"1","No":"0","No internet service":"2"}, 34 | "TechSupport": {"Yes":"1","No":"0","No internet service":"2"}, 35 | "StreamingTV": {"Yes":"1","No":"0","No internet service":"2"}, 36 | "StreamingMovies": {"Yes":"1","No":"0","No internet service":"2"}, 37 | "Contract": {"Month-to-month":"0","One year":"1","Two year":"2"}, 38 | "PaperlessBilling": {"Yes":"1","No":"0"}, 39 | "PaymentMethod":{ 40 | "Electronic check":"0", 41 | "Mailed check": "1", 42 | "Bank transfer (automatic)": "2", 43 | "Credit card (automatic)":"3", 44 | }, 45 | "Churn": {"Yes":"1","No":"0"}, 46 | } 47 | fw = open("../telecom-churn/new_churn.csv","w") 48 | fw.write( 49 | "customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines," 50 | "InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV," 51 | "StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn\n" 52 | ) 53 | for line in open(self.file, "r").readlines(): 54 | if line.startswith("customerID"): 55 | continue 56 | customerID, gender, SeniorCitizen, Partner, Dependents, tenure, PhoneService, MultipleLines,\ 57 | InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV,\ 58 | StreamingMovies, Contract, PaperlessBilling, PaymentMethod, MonthlyCharges, TotalCharges, Churn \ 59 | = line.strip().split(",") 60 | _list = list() 61 | _list.append(customerID) 62 | _list.append(self.isNone(feature_dict["gender"][gender])) 63 | _list.append(self.isNone(SeniorCitizen)) 64 | _list.append(self.isNone(feature_dict["Partner"][Partner])) 65 | _list.append(self.isNone(feature_dict["Dependents"][Dependents])) 66 | _list.append(self.isNone(tenure)) 67 | _list.append(self.isNone(feature_dict["PhoneService"][PhoneService])) 68 | _list.append(self.isNone(feature_dict["MultipleLines"][MultipleLines])) 69 | _list.append(self.isNone(feature_dict["InternetService"][InternetService])) 70 | _list.append(self.isNone(feature_dict["OnlineSecurity"][OnlineSecurity])) 71 | _list.append(self.isNone(feature_dict["OnlineBackup"][OnlineBackup])) 72 | _list.append(self.isNone(feature_dict["DeviceProtection"][DeviceProtection])) 73 | _list.append(self.isNone(feature_dict["TechSupport"][TechSupport])) 74 | _list.append(self.isNone(feature_dict["StreamingTV"][StreamingTV])) 75 | _list.append(self.isNone(feature_dict["StreamingMovies"][StreamingMovies])) 76 | _list.append(self.isNone(feature_dict["Contract"][Contract])) 77 | _list.append(self.isNone(feature_dict["PaperlessBilling"][PaperlessBilling])) 78 | _list.append(self.isNone(feature_dict["PaymentMethod"][PaymentMethod])) 79 | _list.append(self.isNone(MonthlyCharges)) 80 | _list.append(self.isNone(TotalCharges)) 81 | _list.append(self.isNone(feature_dict["Churn"][Churn])) 82 | fw.write(",".join(_list)) 83 | fw.write("\n") 84 | return pd.read_csv("../telecom-churn/new_churn.csv") 85 | else: 86 | return pd.read_csv("../telecom-churn/new_churn.csv") 87 | 88 | def split_data(self): 89 | train, test = train_test_split(self.data, test_size=0.1, random_state=40) 90 | return train, test 91 | 92 | def train_model(self): 93 | print("Start Train Model ... ") 94 | label = "Churn" 95 | ID = "customerID" 96 | x_columns = [x for x in self.train.columns if x not in [label, ID]] 97 | x_train = self.train[x_columns] 98 | y_train = self.train[label] 99 | gbdt = GradientBoostingClassifier(learning_rate=0.1, n_estimators=200, max_depth=5) 100 | gbdt.fit(x_train, y_train) 101 | return gbdt 102 | 103 | def evaluate(self, gbdt): 104 | label = "Churn" 105 | ID = "customerID" 106 | x_columns = [x for x in self.train.columns if x not in [label, ID]] 107 | x_test = self.test[x_columns] 108 | y_test = self.test[label] 109 | y_pre = gbdt.predict_proba(x_test) 110 | new_y_pre = list() 111 | for y in y_pre: 112 | new_y_pre.append(1 if y[1] > 0.5 else 0) 113 | mse = mean_squared_error(y_test, new_y_pre) 114 | print("MSE: %.4f" % mse) 115 | accuracy = metrics.accuracy_score(y_test.values, new_y_pre) 116 | print("Accuracy: %.4g" % accuracy) 117 | 118 | 119 | if __name__ == "__main__": 120 | pred = ChurnPredWithGBDT() 121 | gbdt = pred.train_model() 122 | pred.evaluate(gbdt) --------------------------------------------------------------------------------