├── README.md
├── chapter-4-data-mining
    ├── Data Standardization.py
    └── DiscreteByEntropy.py
├── chapter-7-Area-hot-rec
    └── RecBasedAH.py
├── chapter-8-CTR
    ├── GBDT_LR.py
    ├── LRbased.py
    └── GBDT_based.py
├── chapter-9-Code-Start
    └── SignUpInfo.py
├── chapter-2-First-Rec-Sys
    └── first_Rec.py
├── chapter-5-User-based-rec
    ├── LFM based rec.py
    └── content based.py
└── chapter-6-Tag-based-rec
    └── RecBasedTag.py


/README.md:
--------------------------------------------------------------------------------
1 | # recommand-system-book-project
2 | codes from the book “推荐系统开发实战”
3 | 


--------------------------------------------------------------------------------
/chapter-4-data-mining/Data Standardization.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import math
 3 | 
 4 | 
 5 | """
 6 | 代码4-1 数据标准化
 7 | """
 8 | 
 9 | 
10 | class DataNorm:
11 |     def __init__(self):
12 |         self.arr = [1,2,3,4,5,6,7,8,9]
13 |         self.x_max = max(self.arr)
14 |         self.x_min = min(self.arr)
15 |         self.x_mean = sum(self.arr) / len(self.arr)
16 |         self.x_std = np.std(self.arr)  # 标准差
17 | 
18 |     def Min_Max(self):
19 |         arr_ = list()
20 |         for x in self.arr:
21 |             _x = (x-self.x_min)/(self.x_max - self.x_min)
22 |             arr_.append(round(_x,4))
23 |         return arr_
24 | 
25 |     def Z_Score(self):
26 |         arr_ = list()
27 |         for x in self.arr:
28 |             arr_.append(round((x - self.x_mean) / self.x_std, 4))
29 |         return arr_
30 | 
31 |     def DecimalScaling(self):  # 小数定标标准化
32 |         arr_ = list()
33 |         j = 1
34 |         x_max = max([abs(one) for one in self.arr])
35 |         while x_max /10 >= 1.0:
36 |             j += 1
37 |             x_max = x_max / 10
38 |         for x in self.arr:
39 |             arr_.append(round(x / math.pow(10,j), 4))
40 |         return arr_
41 | 
42 |     def Mean(self):  # 均值归一化
43 |         arr_ = list()
44 |         for x in self.arr:
45 |             arr_.append(round((x - self.x_mean) / (self.x_max - self.x_min), 4))
46 |         return arr_
47 | 
48 |     def Vector(self):  # 向量归一化
49 |         arr_ = list()
50 |         for x in self.arr:
51 |             arr_.append(round(x / sum(self.arr), 4))
52 |         return arr_
53 | 
54 | 
55 | if __name__ == "__main__":
56 |     nor = DataNorm()
57 |     min_max = nor.Min_Max()
58 |     print("{}".format(min_max))


--------------------------------------------------------------------------------
/chapter-7-Area-hot-rec/RecBasedAH.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | """
 4 | 基于地域和热度的酒店推荐
 5 | """
 6 | class RecBasedAH:
 7 |     def __init__(self, path=None, addr="朝阳区", type="score", k=10, sort=False):
 8 |         self.path = path
 9 |         self.addr = addr
10 |         self.type = type
11 |         self.k = k
12 |         self.sort = sort
13 |         self.data = self.load_mess()
14 | 
15 |     def load_mess(self):
16 |         data = pd.read_csv(self.path, header=0, sep=",",encoding='GBK')
17 |         return data[data["addr"] == self.addr]
18 | 
19 |     def recommend(self):
20 |         if self.type in ["score","comment_num","lowest_price","decoration_time","open_time"]:
21 |             data = self.data.sort_values(by=[self.type, "lowest_price"], ascending=self.sort)[:self.k]
22 |             return dict(data.filter(items=["name", self.type]).values)
23 |         elif self.type == "combine":
24 |             data = self.data.filter(items=["name","score","comment_num","decoration_time","open_time","lowest_price"])
25 |             data["decoration_time"] = data["decoration_time"].apply(lambda x:int(x)-2018)
26 |             data["open_time"] = data["open_time"].apply(lambda x:2018-int(x))
27 |             for col in data.keys():
28 |                 if col != "name":
29 |                     data[col] = (data[col] - data[col].min()) / (data[col].max() - data[col].min())
30 |                 data[self.type] = 1 * data["score"] + 2 * data["comment_num"] + 0.5 * data["decoration_time"] +\
31 |                     0.5 * data["open_time"] + 1.5 * data["lowest_price"]
32 |                 data = data.sort_values(by=self.type, ascending=self.sort)[:self.k]
33 |                 return dict(data.filter(items=["name",self.type]).values)
34 | 
35 | 
36 | if __name__ == "__main__":
37 |     path = "../hotel-mess/hotel-mess.csv"
38 |     rbah = RecBasedAH(path,type="combine",k=10,sort=False)
39 |     print(rbah.recommend())


--------------------------------------------------------------------------------
/chapter-8-CTR/GBDT_LR.py:
--------------------------------------------------------------------------------
 1 | from sklearn import metrics
 2 | from sklearn.metrics import mean_squared_error
 3 | from sklearn.model_selection import train_test_split
 4 | from sklearn.ensemble import GradientBoostingClassifier
 5 | from sklearn.linear_model import LogisticRegression
 6 | import pandas as pd
 7 | from sklearn.preprocessing import OneHotEncoder
 8 | 
 9 | class GBDTWithLR:
10 |     def __init__(self):
11 |         self.file = "../telecom-churn/new_churn.csv"
12 |         self.data = self.load_data()
13 |         self.train, self.test = self.split()
14 | 
15 |     def load_data(self):
16 |         return pd.read_csv(self.file)
17 | 
18 |     def split(self):
19 |         train, test = train_test_split(self.data, test_size=0.1, random_state=40)
20 |         return train, test
21 | 
22 |     def train_model(self):
23 |         print("training")
24 |         label = "Churn"
25 |         ID = "customerID"
26 |         x_columns = [x for x in self.train.columns if x not in [ID, label]]
27 |         x_train = self.train[x_columns]
28 |         y_train = self.train[label]
29 | 
30 |         gbdt = GradientBoostingClassifier()
31 |         gbdt.fit(x_train,y_train)
32 | 
33 |         gbdt_lr = LogisticRegression(max_iter=3000)
34 |         enc = OneHotEncoder()
35 |         enc.fit(gbdt.apply(x_train).reshape(-1,100))
36 | 
37 |         gbdt_lr.fit(enc.transform(gbdt.apply(x_train).reshape(-1,100)),y_train)
38 |         return enc, gbdt, gbdt_lr
39 | 
40 |     def evaluate(self,enc,gbdt,gbdt_lr):
41 |         print("evaluating")
42 |         label = "Churn"
43 |         ID = "customerID"
44 |         x_columns = [x for x in self.train.columns if x not in [ID, label]]
45 |         x_test = self.test[x_columns]
46 |         y_test = self.test[label]
47 | 
48 |         gbdt_pred = gbdt.predict(x_test)
49 |         print("GBDT accuracy: %.4g" % metrics.accuracy_score(y_test.values, gbdt_pred))
50 | 
51 |         gbdt_lr_pred = gbdt_lr.predict(enc.transform(gbdt.apply(x_test).reshape(-1,100)))
52 |         print("GBDT_LR accuracy: %.4g" % metrics.accuracy_score(y_test.values, gbdt_lr_pred))
53 | 
54 | 
55 | if __name__ == "__main__":
56 |     new_model= GBDTWithLR()
57 |     enc, gbdt, gbdt_lr = new_model.train_model()
58 |     new_model.evaluate(enc, gbdt, gbdt_lr)


--------------------------------------------------------------------------------
/chapter-8-CTR/LRbased.py:
--------------------------------------------------------------------------------
 1 | from sklearn.linear_model import LogisticRegression
 2 | from sklearn.model_selection import train_test_split
 3 | from sklearn import metrics
 4 | from sklearn.metrics import mean_squared_error
 5 | import pandas as pd
 6 | 
 7 | 
 8 | class ChurnPredWithLR:
 9 |     def __init__(self):
10 |         self.file = "../telecom-churn/new_churn.csv"
11 |         self.data = self.load_data()
12 |         self.train, self.test = self.split()
13 | 
14 |     def load_data(self):
15 |         data = pd.read_csv(self.file)
16 |         labels = list(data.keys())
17 |         fDict = dict()
18 |         for f in labels:
19 |             if f not in ['customerID','tenure','MonthlyCharges','TotalCharges','Churn']:
20 |                 fDict[f] = sorted(list(data.get(f).unique()))
21 |         fw = open("../telecom-churn/one_hot_churn.csv","w")
22 |         fw.write("customerID,")
23 |         for i in range(1,47):
24 |             fw.write('f_%s,' % i)
25 |         fw.write("Churn\n")
26 |         for line in data.values:
27 |             list_line = list(line)
28 |             list_result = list()
29 |             for i in range(0, list_line.__len__()):
30 |                 if labels[i] in ['customerID','tenure','MonthlyCharges','TotalCharges','Churn']:
31 |                     list_result.append(list_line[i])
32 |                 else:
33 |                     arr = [0] * fDict[labels[i]].__len__()
34 |                     ind = fDict[labels[i]].index(list_line[i])
35 |                     arr[ind] = 1
36 |                     for one in arr:
37 |                         list_result.append(one)
38 |                     #list_result.append(arr)
39 |             fw.write(",".join([str(f) for f in list_result]) + "\n")
40 |         fw.close()
41 |         return pd.read_csv("../telecom-churn/one_hot_churn.csv")
42 | 
43 |     def split(self):
44 |         train, test = train_test_split(self.data, test_size=0.1, random_state=40)
45 |         return train, test
46 | 
47 |     def train_model(self):
48 |         print("Start Train Model ...")
49 |         label = "Churn"
50 |         ID = "customerID"
51 |         x_columns = [x for x in self.train.columns if x not in [ID, label]]
52 |         x_train = self.train[x_columns]
53 |         y_train = self.train[label]
54 |         lr = LogisticRegression(penalty='l2', tol=1e-4, fit_intercept=True)
55 |         lr.fit(x_train, y_train)
56 |         return lr
57 | 
58 |     def evaluate(self,lr,type):
59 |         label = "Churn"
60 |         ID = "customerID"
61 |         x_columns = [x for x in self.train.columns if x not in [ID, label]]
62 |         x_test = self.test[x_columns]
63 |         y_test = self.test[label]
64 |         if type == 1:
65 |             y_pred = lr.predict(x_test)
66 |             new_y_pre = y_pred
67 |         elif type == 2:
68 |             y_pred = lr.predict_proba(x_test)
69 |             new_y_pre = list()
70 |             for y in y_pred:
71 |                 new_y_pre.append(1 if y[1]>0.5 else 0)
72 |         accuracy = metrics.accuracy_score(y_test.values, new_y_pre)
73 |         print(accuracy)
74 | 
75 | 
76 | if __name__ == "__main__":
77 |     pred = ChurnPredWithLR()
78 |     lr = pred.train_model()
79 |     pred.evaluate(lr,type=1)


--------------------------------------------------------------------------------
/chapter-4-data-mining/DiscreteByEntropy.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import math
 3 | 
 4 | """
 5 | 代码4-2 基于信息熵的数据离散化
 6 | """
 7 | 
 8 | 
 9 | class DiscreteByEntropy:
10 |     def __init__(self, group, threshold):
11 |         self.maxGroup = group  # 最大分组数
12 |         self.minInfoThreshold = threshold  # 停止划分的最小熵
13 |         self.result = dict()  # 保存划分结果
14 | 
15 |     def loadData(self):
16 |         data = np.array(
17 |             [
18 |                 [56, 1], [87, 1], [129, 0], [23, 0], [342, 1],
19 |                 [641, 1], [63, 0], [2764, 1], [2323, 0], [453, 1],
20 |                 [10, 1], [9, 0], [88, 1], [222, 0], [97, 0],
21 |                 [2398, 1], [592, 1], [561, 1], [764, 0], [121, 1]
22 |             ]
23 |         )
24 |         return data
25 | 
26 |     def calEntropy(self,data):  # 计算熵
27 |         numData = len(data)
28 |         labelCounts = {}
29 |         for feature in data:
30 |             oneLabel = feature[-1]
31 |             labelCounts.setdefault(oneLabel,0)
32 |             labelCounts[oneLabel] += 1
33 |         shannonEnt = 0.0
34 |         for key in labelCounts:
35 |             prob = float(labelCounts[key]) / numData
36 |             shannonEnt -= prob * math.log(prob, 2)
37 |         return shannonEnt
38 | 
39 |     def split(self, data):
40 |         # inf为正无穷大
41 |         minEntropy = np.inf
42 |         # 记录最终分割索引
43 |         index = -1
44 |         sortData = data[np.argsort(data[:, 0])]
45 |         lastE1, lastE2 = -1, -1
46 |         S1 = dict()
47 |         S2 = dict()
48 |         for i in range(len(sortData)):
49 |             splitData1, splitData2 = sortData[: i+1], sortData[i+1 :]
50 |             entropy1, entropy2 = (
51 |                 self.calEntropy(splitData1),
52 |                 self.calEntropy(splitData2)
53 |             )
54 |             entropy = entropy1 * len(splitData1) / len(sortData) + \
55 |                 entropy2 * len(splitData2) / len(sortData)
56 |             if entropy < minEntropy:
57 |                 minEntropy = entropy
58 |                 index = i
59 |                 lastE1 = entropy1
60 |                 lastE2 = entropy2
61 |         S1["entropy"] = lastE1
62 |         S1["data"] = sortData[: index + 1]
63 |         S2["entropy"] = lastE2
64 |         S2["data"] = sortData[index + 1 :]
65 |         return S1, S2, minEntropy
66 | 
67 |     def train(self, data):  # 离散化处理
68 |         needSplitKey = [0]
69 |         self.result.setdefault(0,{})
70 |         self.result[0]["entropy"] = np.inf
71 |         self.result[0]["data"] = data
72 |         group = 1
73 |         for key in needSplitKey:
74 |             S1, S2, entropy = self.split(self.result[key]["data"])
75 |             if entropy > self.minInfoThreshold and group < self.maxGroup:
76 |                 self.result[key] = S1
77 |                 newKey = max(self.result.keys()) + 1
78 |                 self.result[newKey] = S2
79 |                 needSplitKey.extend([key])
80 |                 needSplitKey.extend([newKey])
81 |                 group += 1
82 |             else:
83 |                 break
84 | 
85 | 
86 | if __name__ == "__main__":
87 |     dbe = DiscreteByEntropy(group=6, threshold=0.5)
88 |     data = dbe.loadData()
89 |     dbe.train(data)
90 |     print("result is {}".format(dbe.result))


--------------------------------------------------------------------------------
/chapter-9-Code-Start/SignUpInfo.py:
--------------------------------------------------------------------------------
  1 | import matplotlib.pyplot as plt
  2 | import numpy as np
  3 | 
  4 | 
  5 | class UserShow:
  6 |     def __init__(self):
  7 |         self.file_user = "../bookcrossings/BX-Users.csv"
  8 |         self.file_book = "../bookcrossings/BX-Books.csv"
  9 |         self.file_rate = "../bookcrossings/BX-Book-Ratings.csv"
 10 |         self.user_mess = self.loadUserData()
 11 |         self.book_mess = self.loadBookMess()
 12 |         self.user_book = self.loadUserBook()
 13 | 
 14 |     def loadUserData(self):
 15 |         user_mess = dict()
 16 |         for line in open(self.file_user,"r",encoding="ISO-8859-1"):
 17 |             if line.startswith("\"User-ID\""):
 18 |                 continue
 19 |             if len(line.split(";")) != 3:
 20 |                 continue
 21 |             line = line.strip().replace(" ","")
 22 |             userid, addr, age = [one.replace("\"","") for one in line.split(";")]
 23 |             if age == "NULL" or int(age) not in range(1,120):
 24 |                 continue
 25 |             user_mess.setdefault(userid,{})
 26 |             user_mess[userid]["age"] = int(age)
 27 |             if len(addr.split(",")) < 3:
 28 |                 continue
 29 |             city, province, country = addr.split(",")[-3:]
 30 |             user_mess[userid]["country"] = country
 31 |             user_mess[userid]["province"] = province
 32 |             user_mess[userid]["city"] = city
 33 |         return user_mess
 34 | 
 35 |     def loadBookMess(self):
 36 |         book_mess = dict()
 37 |         for line in open(self.file_book,"r",encoding="ISO-8859-1"):
 38 |             if line.startswith("\"ISBN\""):
 39 |                 continue
 40 |             isbn, book_name = line.replace("\"","").split(";")[:2]
 41 |             book_mess[isbn] = book_name
 42 |         return book_mess
 43 | 
 44 |     def loadUserBook(self):
 45 |         user_book = dict()
 46 |         for line in open(self.file_rate,"r",encoding="ISO-8859-1"):
 47 |             if line.startswith("\"User-ID\""):
 48 |                 continue
 49 |             user_id, isbn, rating = line.replace("\"","").split(";")[:3]
 50 |             user_book.setdefault(user_id,list())
 51 |             if int(rating) > 5:
 52 |                 user_book[user_id].append(isbn)
 53 |         return user_book
 54 | 
 55 |     def show(self, X, Y, X_label, Y_label="数目"):
 56 |         plt.xlabel(X_label)
 57 |         plt.ylabel(Y_label)
 58 |         plt.xticks(np.arange(len(X)),X, rotation = 90)
 59 |         for a, b in zip(np.arange(len(X)),Y):
 60 |             plt.text(a, b, b, rotation = 45)
 61 |         plt.bar(np.arange(len(X)),Y)
 62 |         plt.show()
 63 | 
 64 |     def diffAge(self):
 65 |         age_user = dict()
 66 |         for key in self.user_mess.keys():
 67 |             age_split = int(int(self.user_mess[key]["age"]) / 10)
 68 |             age_user.setdefault(age_split,0)
 69 |             age_user[age_split] += 1
 70 |         age_user_sort = sorted(age_user.items(), key=lambda x:x[0], reverse=False)
 71 |         X = [x[0] for x in age_user_sort]
 72 |         Y = [x[1] for x in age_user_sort]
 73 |         print(age_user_sort)
 74 |         self.show(X,Y, X_label="用户年龄段")
 75 | 
 76 |     def diffpro(self):
 77 |         pro_user = dict()
 78 |         for key in self.user_mess.keys():
 79 |             if "province" in self.user_mess[key].keys() and self.user_mess[key]["province"] != "n/a":
 80 |                 pro_user.setdefault(self.user_mess[key]["province"], 0)
 81 |                 pro_user[self.user_mess[key]]["province"] += 1
 82 |             pro_user_sort = sorted(pro_user.items(), key=lambda x:x[1],reverse=True)[:20]
 83 |             X = [x[0] for x in pro_user_sort]
 84 |             Y = [x[1] for x in pro_user_sort]
 85 |             print(pro_user_sort)
 86 |             self.show(X, Y, X_label="用户所在州")
 87 | 
 88 |     def diffUserAge(self):
 89 |         age_books = dict()
 90 |         age_books.setdefault(1,dict())
 91 |         age_books.setdefault(2,dict())
 92 |         for key in self.user_mess.keys():
 93 |             if "country" not in self.user_mess[key].keys():
 94 |                 continue
 95 |             if key not in self.user_book.keys():
 96 |                 continue
 97 |             if int(self.user_mess[key]["age"]) in range(0,30):
 98 |                 for book in self.user_book[key]:
 99 |                     if book not in self.book_mess.keys():
100 |                         continue
101 |                     age_books[1].setdefault(book,0)
102 |                     age_books[1][book] += 1
103 |             if int(self.user_mess[key]["age"]) in range(50,120):
104 |                 for book in self.user_book[key]:
105 |                     if book not in self.book_mess.keys():
106 |                         continue
107 |                     age_books[2].setdefault(book,0)
108 |                     age_books[2][book] += 1
109 |         print("年龄在30岁以下的用户偏好共性图书top10")
110 |         for one in sorted(age_books[1].items(), key=lambda x:x[1], reverse=True)[:10]:
111 |             print(self.book_mess[one[0]])
112 |         print("年龄在50岁以上的用户偏好共性图书top10")
113 |         for one in sorted(age_books[2].items(), key=lambda x:x[1], reverse=True)[:10]:
114 |             print(self.book_mess[one[0]])
115 | 
116 | 
117 | if __name__ == "__main__":
118 |     ushow = UserShow()
119 |     ushow.diffUserAge()


--------------------------------------------------------------------------------
/chapter-2-First-Rec-Sys/first_Rec.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import random
  4 | import math
  5 | 
  6 | """
  7 | 第二章内容，搭建第一个推荐系统
  8 | """
  9 | 
 10 | 
 11 | class FirstRec:
 12 |     """
 13 |     k 近邻用户个数
 14 |     n_items 为每个用户推荐的电影数
 15 |     """
 16 |     def __init__(self,file_path,seed,k,n_items):
 17 |         self.file_path = file_path
 18 |         self.users_1000 = self.__select_1000_users()
 19 |         self.seed = seed
 20 |         self.k = k
 21 |         self.n_items = n_items
 22 |         self.train,self.test = self._load_and_split_data()
 23 | 
 24 |     def __select_1000_users(self):
 25 |         print("随机选取1000个用户！")
 26 |         if os.path.exists("../data/train.json") and os.path.exists("../data/test.json"):
 27 |             return list()
 28 |         else:
 29 |             users = set()
 30 |             for file in os.listdir(self.file_path):
 31 |                 one_path = "{}/{}".format(self.file_path,file)
 32 |                 print("{}".format(one_path))
 33 |                 with open(one_path, "r") as fp:
 34 |                     for line in fp.readlines():
 35 |                         if line.strip().endswith(":"):
 36 |                             continue
 37 |                         userID, _, _ = line.split(",")
 38 |                         users.add(userID)
 39 |             users_1000 = random.sample(list(users),1000)
 40 |             print(users_1000)
 41 |             return users_1000
 42 | 
 43 |     def _load_and_split_data(self):
 44 |         train = dict()
 45 |         test = dict()
 46 |         if os.path.exists("../data/train.json") and os.path.exists("../data/test.json"):
 47 |             print("从文件中加载训练集和测试集")
 48 |             train = json.load(open("../data/train.json"))
 49 |             test = json.load(open("../data/test.json"))
 50 |             print("从文件中加载数据完成")
 51 |         else:
 52 |             random.seed(self.seed)
 53 |             for file in os.listdir(self.file_path):
 54 |                 one_path = "{}/{}".format(self.file_path,file)
 55 |                 print("{}".format(one_path))
 56 |                 with open(one_path,"r") as fp:
 57 |                     movieID = fp.readline().split(":")[0]
 58 |                     for line in fp.readlines():
 59 |                         if line.strip().endswith(":"):
 60 |                             movieID = line.split(":")[0]
 61 |                             continue
 62 |                         userID, rate, _ = line.split(",")
 63 |                         if userID in self.users_1000:
 64 |                             if random.randint(1,50) == 1:
 65 |                                 test.setdefault(userID, {})[movieID] = int(rate)
 66 |                             else:
 67 |                                 train.setdefault(userID, {})[movieID] = int(rate)
 68 |             print("加载数据到 ../data/train.json 和 ../data/test.json")
 69 |             json.dump(train, open("../data/train.json", "w"))
 70 |             json.dump(test, open("../data/test.json", "w"))
 71 |             print("加载完成")
 72 |         return train, test
 73 | 
 74 |     def pearson(self,rating1,rating2):
 75 |         sum_xy = 0
 76 |         sum_x = 0
 77 |         sum_y = 0
 78 |         sum_x2 = 0
 79 |         sum_y2 = 0
 80 |         num = 0
 81 |         for key in rating1.keys():
 82 |             if key in rating2.keys():
 83 |                 num += 1
 84 |                 x = rating1[key]
 85 |                 y = rating2[key]
 86 |                 sum_xy += x * y
 87 |                 sum_x += x
 88 |                 sum_y += y
 89 |                 sum_x2 += math.pow(x,2)
 90 |                 sum_y2 += math.pow(y,2)
 91 |         if num == 0:
 92 |             return 0
 93 |         de = math.sqrt(sum_x2 - math.pow(sum_x,2)/num) * math.sqrt(sum_y2 - math.pow(sum_y,2)/num)
 94 |         if de == 0:
 95 |             return 0
 96 |         else:
 97 |             return (sum_xy - (sum_x * sum_y) / num) / de
 98 | 
 99 |     def recommend(self,userID):
100 |         neighborUser = dict()
101 |         for user in self.train.keys():
102 |             if user != userID:
103 |                 distance = self.pearson(self.train[user],self.train[userID])
104 |                 neighborUser[user] = distance
105 |         newNU = sorted(neighborUser.items(), key=lambda m: m[1], reverse= True)
106 |         movies = dict()
107 |         for (sim_user,sim) in newNU[:self.k]:
108 |             for movieID in self.train[sim_user].keys():
109 |                 movies.setdefault(movieID,0)
110 |                 movies[movieID] += sim * self.train[sim_user][movieID]
111 |         newMovies = sorted(movies.items(),key = lambda m:m[1],reverse=True)
112 |         return newMovies
113 | 
114 |     def evaluate(self,num=30):
115 |         print("开始计算准确率")
116 |         precisions = list()
117 |         random.seed(10)
118 |         for userID in random.sample(self.test.keys(),num):
119 |             hit = 0
120 |             result = self.recommend(userID)[:self.n_items]
121 |             for (item,rate) in result:
122 |                 if item in self.test[userID]:
123 |                     hit += 1
124 |             precisions.append(hit/self.n_items)
125 |         return sum(precisions) / precisions.__len__()
126 | 
127 | 
128 | if __name__ == "__main__":
129 |     file_path = "../data/training_set"
130 |     seed = 30
131 |     k = 15
132 |     n_items = 20
133 |     f_rec = FirstRec(file_path,seed,k,n_items)
134 |     print("算法的推荐准确率{}".format(f_rec.evaluate()))


--------------------------------------------------------------------------------
/chapter-5-User-based-rec/LFM based rec.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import pickle
  3 | import os
  4 | import random
  5 | import numpy as np
  6 | from math import exp
  7 | import time
  8 | 
  9 | class DataProcessing:
 10 |     def __init__(self):
 11 |         pass
 12 | 
 13 |     def get_pos_neg_item(self, file_path = "../ml-1m/ratings.csv"):
 14 |         if not os.path.exists("../ml-1m/lfm_items.dict"):
 15 |             self.items_dict_path = "../ml-1m/lfm_items.dict"
 16 |             self.uiscores = pd.read_csv(file_path)
 17 |             self.user_ids = set(self.uiscores["UserID"].values)
 18 |             self.item_ids = set(self.uiscores["MovieID"].values)
 19 |             self.items_dict = {user_id: self.get_one(user_id) for user_id in list(self.user_ids)}
 20 |             fw = open(self.items_dict_path, "wb")
 21 |             pickle.dump(self.items_dict, fw)
 22 |             fw.close()
 23 | 
 24 |     def get_one(self, user_id):
 25 |         print('为用户%s准备正向和负向数据...' % user_id)
 26 |         pos_item_ids = set(self.uiscores[self.uiscores['UserID'] == user_id]['MovieID'])
 27 |         neg_item_ids = self.item_ids ^ pos_item_ids  # “^”为异或符号
 28 |         neg_item_ids = list(neg_item_ids)[:len([pos_item_ids])]
 29 |         item_dict = {}
 30 |         for item in pos_item_ids:
 31 |             item_dict[item] = 1
 32 |         for item in neg_item_ids:
 33 |             item_dict[item] = 0
 34 |         return item_dict
 35 | 
 36 | 
 37 | class LFM:
 38 |     def __init__(self):
 39 |         self.class_count = 5
 40 |         self.iter_count = 5
 41 |         self.lr = 0.02
 42 |         self.lam = 0.01
 43 |         self._init_model()
 44 | 
 45 |     def _init_model(self):
 46 |         file_path = '../ml-1m/ratings.csv'
 47 |         pos_neg_path = '../ml-1m/lfm_items.dict'
 48 |         self.uiscores = pd.read_csv(file_path)
 49 |         self.user_ids = set(self.uiscores['UserID'].values)
 50 |         self.item_ids = set(self.uiscores['MovieID'].values)
 51 |         self.items_dict = pickle.load(open(pos_neg_path,'rb'))
 52 | 
 53 |         array_p = np.random.randn(len(self.user_ids),self.class_count)
 54 |         array_q = np.random.randn(len(self.item_ids),self.class_count)
 55 |         self.p = pd.DataFrame(array_p, columns=range(0, self.class_count), index=list(self.user_ids))
 56 |         self.q = pd.DataFrame(array_q, columns=range(0, self.class_count), index=list(self.item_ids))
 57 | 
 58 |     def _predict(self, user_id, item_id):
 59 |         p = np.mat(self.p.ix[user_id].values)
 60 |         q = np.mat(self.q.ix[item_id].values).T
 61 |         r = (p * q).sum()
 62 |         logit = 1.0 / (1 + exp(-r))
 63 |         return logit
 64 | 
 65 |     def _loss(self, user_id, item_id, y, step):
 66 |         e = y - self._predict(user_id, item_id)
 67 |         return e
 68 | 
 69 |     def _optimize(self, user_id, item_id, e):
 70 |         gradient_p = -e * self.q.ix[item_id].values
 71 |         l2_p = self.lam * self.p.ix[user_id].values
 72 |         delta_p = self.lr * (gradient_p + l2_p)
 73 | 
 74 |         gradient_q = -e * self.p.ix[user_id].values
 75 |         l2_q = self.lam * self.q.ix[item_id].values
 76 |         delta_q = self.lr * (gradient_q + l2_q)
 77 | 
 78 |         self.p.loc[user_id] -= delta_p
 79 |         self.q.loc[item_id] -= delta_q
 80 | 
 81 |     def train(self):
 82 |         for step in range(0,self.iter_count):
 83 |             time.sleep(30)
 84 |             for user_id, item_dict in self.items_dict.items():
 85 |                 print('step: {}, user_id: {}'.format(step, user_id))
 86 |                 item_ids = list(item_dict.keys())
 87 |                 random.shuffle(item_ids)
 88 |                 for item_id in item_ids:
 89 |                     e = self._loss(user_id, item_id, item_dict[item_id], step)
 90 |                     self._optimize(user_id, item_id, e)
 91 |             self.lr *= 0.9
 92 |         self.save()
 93 | 
 94 |     def save(self):
 95 |         f = open('../ml-1m/lfm.model','wb')
 96 |         pickle.dump((self.p, self.q), f)
 97 |         f.close()
 98 | 
 99 |     def load(self):
100 |         f = open('../ml-1m/lfm.model','rb')
101 |         self.p, self.q = pickle.load(f)
102 |         f.close()
103 | 
104 |     def predict(self, user_id, top_n = 10):
105 |         self.load()
106 |         user_item_ids = set(self.uiscores[self.uiscores['UserID'] == user_id]['MovieID'])
107 |         other_item_ids = self.item_ids ^ user_item_ids
108 |         interest_list = [self._predict(user_id, item_id) for item_id in other_item_ids]
109 |         candidates = sorted(zip(list(other_item_ids), interest_list), key= lambda x:x[1], reverse=True)
110 |         return candidates[:top_n]
111 | 
112 |     def evaluate(self):
113 |         self.load()
114 |         users = random.sample(self.user_ids, 10)
115 |         user_dict = {}
116 |         for user in users:
117 |             user_item_ids = set(self.uiscores[self.uiscores['UserID'] == user]['MovieID'])
118 |             _sum = 0.0
119 |             for item_id in user_item_ids:
120 |                 p = np.mat(self.p.ix[user].values)
121 |                 q = np.mat(self.q.ix[item_id].values).T
122 |                 _r = (p * q).sum()
123 |                 r = self.uiscores[(self.uiscores['UserID'] == user)&(self.uiscores['MovieID']==item_id)]["Rating"].values[0]
124 |                 _sum += abs(r - _r)
125 |             user_dict[user] =  _sum/len(user_item_ids)
126 |             print("user: {} AI: {}".format(user,user_dict[user]))
127 |         return sum(user_dict.values()) / len(user_dict.keys())
128 | 
129 | 
130 | if __name__ == '__main__':
131 |     # dp = DataProcessing()
132 |     # dp.get_pos_neg_item()
133 |     lfm = LFM()
134 |     #lfm.train()
135 |     #print(lfm.predict(6027,10))
136 |     print(lfm.evaluate())


--------------------------------------------------------------------------------
/chapter-5-User-based-rec/content based.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import json
  3 | import numpy as np
  4 | import math
  5 | import random
  6 | 
  7 | 
  8 | class DataProcessing:
  9 |     def __init__(self):
 10 |         pass
 11 | 
 12 |     def process(self):
 13 |         print('开始转换用户数据(users.dat)...')
 14 |         self.process_user_data()
 15 |         print('开始转换电影数据(movies.dat)...')
 16 |         self.process_movie_data()
 17 |         print('开始转换用户对电影评分数据(ratings.dat)')
 18 |         self.process_rating_data()
 19 |         print('Over!')
 20 | 
 21 |     def process_user_data(self, file='../ml-1m/users.dat'):
 22 |         fp = pd.read_table(file, sep='::', engine='python', names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])
 23 |         fp.to_csv('../ml-1m/users.csv', index=False)
 24 | 
 25 |     def process_rating_data(self, file='../ml-1m/ratings.dat'):
 26 |         fp = pd.read_table(file, sep='::', engine='python', names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
 27 |         fp.to_csv('../ml-1m/ratings.csv', index=False)
 28 | 
 29 |     def process_movie_data(self, file='../ml-1m/movies.dat'):
 30 |         fp = pd.read_table(file, sep='::', engine='python', names=['MovieID', 'Title', 'Genres'])
 31 |         fp.to_csv('../ml-1m/movies.csv', index=False)
 32 | 
 33 |     def prepare_item_profile(self,file='../ml-1m/movies.csv'):  # 计算电影特征信息矩阵
 34 |         items = pd.read_csv(file)
 35 |         item_ids = set(items["MovieID"].values)
 36 |         self.item_dict = {}
 37 |         genres_all = list()
 38 |         # 将每个电影的类型放在item_dict中
 39 |         for item in item_ids:
 40 |             genres = items[items["MovieID"] == item]["Genres"].values[0].split("|")
 41 |             self.item_dict.setdefault(item, []).extend(genres)
 42 |             genres_all.extend(genres)
 43 |         self.genres_all = set(genres_all)
 44 |         # 将每个电影特征信息矩阵存放在self.item_matrix中
 45 |         self.item_matrix = {}
 46 |         for item in self.item_dict.keys():
 47 |             self.item_matrix[str(item)] = [0] * len(set(self.genres_all))
 48 |             for genre in self.item_dict[item]:
 49 |                 index = list(set(genres_all)).index(genre)
 50 |                 self.item_matrix[str(item)][index] = 1
 51 |         json.dump(self.item_matrix, open('../ml-1m/item_profile.json', 'w'))
 52 |         print("item信息计算完成，保存路径为'../ml-1m/item_profile.json'")
 53 | 
 54 |     def prepare_user_profile(self,file='../ml-1m/ratings.csv'):  # 计算用户偏好矩阵
 55 |         users = pd.read_csv(file)
 56 |         user_ids = set(users["UserID"].values)
 57 |         # 将user信息转换成dict
 58 |         users_rating_dict = {}
 59 |         for user in user_ids:
 60 |             users_rating_dict.setdefault(str(user),{})
 61 |         with open(file,"r") as fr:
 62 |             for line in fr.readlines():
 63 |                 if not line.startswith("UserID"):
 64 |                     (user, item, rate) = line.split(",")[:3]
 65 |                     users_rating_dict[user][item] = int(rate)
 66 |         # 获取用户对每个类型下的哪些电影进行了评分
 67 |         self.user_matrix = {}
 68 |         for user in users_rating_dict.keys():
 69 |             score_list = users_rating_dict[user].values()
 70 |             avg = sum(score_list)/len(score_list)
 71 |             self.user_matrix[user] = []
 72 |             for genre in self.genres_all:
 73 |                 score_all = 0.0
 74 |                 score_len = 0
 75 |                 for item in users_rating_dict[user].keys():
 76 |                     if genre in self.item_dict[int(item)]:
 77 |                         score_all += (users_rating_dict[user][item]-avg)
 78 |                         score_len += 1
 79 |                 if score_len == 0:
 80 |                     self.user_matrix[user].append(0.0)
 81 |                 else:
 82 |                     self.user_matrix[user].append(score_all/score_len)
 83 |         json.dump(self.user_matrix,open('../ml-1m/user_profile.json','w'))
 84 |         print("user信息计算完成，保存路径'../ml-1m/user_profile.json'")
 85 | 
 86 | 
 87 | class CBRecommend:
 88 |     def __init__(self,k):
 89 |         self.k = k  # 给用户推荐的item个数
 90 |         self.item_profile = json.load(open("../ml-1m/item_profile.json","r"))
 91 |         self.user_profile = json.load(open("../ml-1m/user_profile.json","r"))
 92 | 
 93 |     def get_none_score_item(self,user):
 94 |         items = pd.read_csv("../ml-1m/movies.csv")["MovieID"].values
 95 |         data = pd.read_csv("../ml-1m/ratings.csv")
 96 |         have_score_items = data[data["UserID"]==user]["MovieID"].values
 97 |         none_score_items = set(items)-set(have_score_items)
 98 |         return none_score_items
 99 | 
100 |     def cosUI(self,user,item):
101 |         Uia = sum(np.array(self.user_profile[str(user)]) * np.array(self.item_profile[str(item)]))
102 |         Ua = math.sqrt(sum([math.pow(one,2) for one in self.user_profile[str(user)]]))
103 |         Ia = math.sqrt(sum([math.pow(one,2) for one in self.item_profile[str(item)]]))
104 |         return Uia / (Ua * Ia)
105 | 
106 |     def recommend(self,user):
107 |         user_result = {}
108 |         item_list = self.get_none_score_item(user)
109 |         for item in item_list:
110 |             user_result[item] = self.cosUI(user, item)
111 |         if self.k is None:
112 |             result = sorted(user_result.items(), key=lambda a:a[1], reverse=True)
113 |         else:
114 |             result = sorted(user_result.items(), key=lambda a:a[1], reverse=True)[:self.k]
115 |         print(result)
116 | 
117 |     def evaluate(self):
118 |         evas = []
119 |         data = pd.read_csv("../ml-1m/ratings.csv")
120 |         for user in random.sample([one for one in range(1,6041)],20):
121 |             have_score_items = data
122 | 
123 | 
124 | 
125 | if __name__ == '__main__':
126 |     # dp = DataProcessing()
127 |     # dp.process()
128 |     # dp.prepare_item_profile()
129 |     # dp.prepare_user_profile()
130 |     cb = CBRecommend(k=10)
131 |     cb.recommend(1)
132 | 


--------------------------------------------------------------------------------
/chapter-6-Tag-based-rec/RecBasedTag.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import math
  3 | 
  4 | 
  5 | class RecBasedTag:
  6 |     def __init__(self):
  7 |         # 用户听过艺术家次数的文件
  8 |         self.user_rate_file = "../lastfm-2k/user_artists.dat"
  9 |         # 用户打标信息
 10 |         self.user_tag_file = "../lastfm-2k/user_taggedartists.dat"
 11 | 
 12 |         # 获取所有艺术家ID
 13 |         self.artistsAll = list(pd.read_table("../lastfm-2k/artists.dat", delimiter="\t")["id"].values)
 14 |         # 用户对艺术家的评分
 15 |         self.userRateDict = self.getUserRate()
 16 |         # 艺术家和标签的相关度
 17 |         self.artistsTagsDict = self.getArtistsTags()
 18 |         # 用户对每个标签打标的次数统计和每个标签被所有用户打标的次数统计
 19 |         self.userTagDict, self.tagUserDict = self.getUserTagNum()
 20 |         # 用户最终对每个标签的喜好程度
 21 |         self.userTagPre = self.getUserTagPre()
 22 | 
 23 | 
 24 |     def getUserRate(self):
 25 |         userRateDict = dict()
 26 |         fr = open(self.user_rate_file, "r", encoding="utf-8")
 27 |         for line in fr.readlines():
 28 |             if not line.startswith("userID"):
 29 |                 userID, artistID, weight = line.split("\t")
 30 |                 userRateDict.setdefault(int(userID),{})
 31 |                 userRateDict[int(userID)][int(artistID)] = float(weight) / 10000
 32 |         return userRateDict
 33 | 
 34 |     def getUserTagNum(self):
 35 |         userTagDict = dict()
 36 |         tagUserDict = dict()
 37 |         for line in open(self.user_tag_file, "r", encoding="utf-8"):
 38 |             if not line.startswith("userID"):
 39 |                 userID, artistID, tagID = line.strip().split("\t")[:3]
 40 |                 if int(tagID) in tagUserDict.keys():
 41 |                     tagUserDict[int(tagID)] += 1
 42 |                 else:
 43 |                     tagUserDict[int(tagID)] = 1
 44 |                 userTagDict.setdefault(int(userID),{})
 45 |                 if int(tagID) in userTagDict[int(userID)].keys():
 46 |                     userTagDict[int(userID)][int(tagID)] += 1
 47 |                 else:
 48 |                     userTagDict[int(userID)][int(tagID)] = 1
 49 |         return userTagDict, tagUserDict
 50 | 
 51 |     def getArtistsTags(self):  # 标签基因
 52 |         artistsTagsdict = dict()
 53 |         for line in open(self.user_tag_file, "r", encoding="utf-8"):
 54 |             if not line.startswith("userID"):
 55 |                 artistID, tagID = line.strip().split("\t")[1:3]
 56 |                 artistsTagsdict.setdefault(int(artistID),{})
 57 |                 artistsTagsdict[int(artistID)][int(tagID)] = 1
 58 |         return artistsTagsdict
 59 | 
 60 |     def getUserTagPre(self):  # 用户对标签最终兴趣度
 61 |         userTagPre = dict()
 62 |         userTagCount = dict()
 63 |         Num = len(open(self.user_tag_file, "r", encoding="utf-8").readlines())
 64 |         for line in open(self.user_tag_file, "r", encoding="utf-8").readlines():
 65 |             if not line.startswith("userID"):
 66 |                 userID, artistID, tagID = line.strip().split("\t")[:3]
 67 |                 userTagPre.setdefault(int(userID),{})
 68 |                 userTagCount.setdefault(int(userID),{})
 69 |                 rate_ui = (self.userRateDict[int(userID)][int(artistID)] if int(artistID) in self.userRateDict[int(userID)].keys()
 70 |                            else 0)
 71 |                 if int(tagID) not in userTagPre[int(userID)].keys():
 72 |                     userTagPre[int(userID)][int(tagID)] = rate_ui * self.artistsTagsDict[int(artistID)][int(tagID)]
 73 |                     userTagCount[int(userID)][int(tagID)] = 1
 74 |                 else:
 75 |                     userTagPre[int(userID)][int(tagID)] += rate_ui * self.artistsTagsDict[int(artistID)][int(tagID)]
 76 |                     userTagCount[int(userID)][int(tagID)] += 1
 77 |         for userID in userTagPre.keys():
 78 |             for tagID in userTagPre[userID].keys():
 79 |                 tf_ut = self.userTagDict[int(userID)][int(tagID)]/sum(self.userTagDict[int(userID)].values())
 80 |                 idf_ut = math.log(Num * 1.0/(self.tagUserDict[int(tagID)]+1))
 81 |                 userTagPre[userID][tagID] = userTagPre[userID][tagID]/userTagCount[userID][tagID] * tf_ut * idf_ut
 82 |         return userTagPre
 83 | 
 84 |     def recommendForUser(self, user, K, flag=True):
 85 |         userArtistPreDict = dict()
 86 |         for artist in self.artistsAll:
 87 |             if int(artist) in self.artistsTagsDict.keys():
 88 |                 for tag in self.userTagPre[int(user)].keys():
 89 |                     rate_ut = self.userTagPre[int(user)][int(tag)]
 90 |                     rel_it = (0 if tag not in self.artistsTagsDict[int(artist)].keys()
 91 |                               else self.artistsTagsDict[int(artist)][tag])
 92 |                     if artist in userArtistPreDict.keys():
 93 |                         userArtistPreDict[int(artist)] += rate_ut * rel_it
 94 |                     else:
 95 |                         userArtistPreDict[int(artist)] = rate_ut * rel_it
 96 |         newUserArtistPreDict = dict()
 97 |         if flag:
 98 |             for artist in userArtistPreDict.keys():
 99 |                 if artist not in self.userRateDict[int(user)].keys():
100 |                     newUserArtistPreDict[artist] = userArtistPreDict[int(artist)]
101 |             return sorted(newUserArtistPreDict.items(), key = lambda y:y[1], reverse=True)[:K]
102 |         else:
103 |             # 用来效果评估
104 |             return sorted(userArtistPreDict.items(), key = lambda y:y[1], reverse=True)[:K]
105 | 
106 |     def evaluate(self, user):
107 |         K = len(self.userRateDict[int(user)])
108 |         recResult = self.recommendForUser(user, K=K, flag=False)
109 |         count = 0
110 |         for (artist, pre) in recResult:
111 |             if artist in self.userRateDict[int(user)]:
112 |                 count += 1
113 |         return count * 1.0 / K
114 | 
115 | 
116 | if __name__ == "__main__":
117 |     rbt = RecBasedTag()
118 |     # print(rbt.recommendForUser("2",K=20))
119 |     print(rbt.evaluate("2"))
120 | 
121 | 


--------------------------------------------------------------------------------
/chapter-8-CTR/GBDT_based.py:
--------------------------------------------------------------------------------
  1 | from sklearn.model_selection import train_test_split
  2 | from sklearn.ensemble import GradientBoostingClassifier
  3 | from sklearn import metrics
  4 | from sklearn.metrics import mean_squared_error
  5 | import pandas as pd
  6 | import os
  7 | 
  8 | 
  9 | class ChurnPredWithGBDT:
 10 |     def __init__(self):
 11 |         self.file = "../telecom-churn/telecom-churn-prediction-data.csv"
 12 |         self.data = self.feature_transform()
 13 |         self.train, self.test = self.split_data()
 14 | 
 15 |     def isNone(self, value):
 16 |         if value == " " or value is None:
 17 |             return "0.0"
 18 |         else:
 19 |             return value
 20 | 
 21 |     def feature_transform(self):
 22 |         if not os.path.exists("../telecom-churn/new_churn.csv"):
 23 |             print("Start feature transform ...")
 24 |             feature_dict = {
 25 |                 "gender": {"Male":"1","Female":"0"},
 26 |                 "Partner": {"Yes":"1","No":"0"},
 27 |                 "Dependents": {"Yes":"1","No":"0"},
 28 |                 "PhoneService": {"Yes":"1","No":"0"},
 29 |                 "MultipleLines": {"Yes":"1","No":"0","No phone service":"2"},
 30 |                 "InternetService": {"DSL":"1","Fiber optic":"2","No":"0"},
 31 |                 "OnlineSecurity": {"Yes":"1","No":"0","No internet service":"2"},
 32 |                 "OnlineBackup": {"Yes":"1","No":"0","No internet service":"2"},
 33 |                 "DeviceProtection": {"Yes":"1","No":"0","No internet service":"2"},
 34 |                 "TechSupport": {"Yes":"1","No":"0","No internet service":"2"},
 35 |                 "StreamingTV": {"Yes":"1","No":"0","No internet service":"2"},
 36 |                 "StreamingMovies": {"Yes":"1","No":"0","No internet service":"2"},
 37 |                 "Contract": {"Month-to-month":"0","One year":"1","Two year":"2"},
 38 |                 "PaperlessBilling":  {"Yes":"1","No":"0"},
 39 |                 "PaymentMethod":{
 40 |                     "Electronic check":"0",
 41 |                     "Mailed check": "1",
 42 |                     "Bank transfer (automatic)": "2",
 43 |                     "Credit card (automatic)":"3",
 44 |                 },
 45 |                 "Churn": {"Yes":"1","No":"0"},
 46 |             }
 47 |             fw = open("../telecom-churn/new_churn.csv","w")
 48 |             fw.write(
 49 |                 "customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,"
 50 |                 "InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,"
 51 |                 "StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn\n"
 52 |             )
 53 |             for line in open(self.file, "r").readlines():
 54 |                 if line.startswith("customerID"):
 55 |                     continue
 56 |                 customerID, gender, SeniorCitizen, Partner, Dependents, tenure, PhoneService, MultipleLines,\
 57 |                 InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV,\
 58 |                 StreamingMovies, Contract, PaperlessBilling, PaymentMethod, MonthlyCharges, TotalCharges, Churn \
 59 |                 = line.strip().split(",")
 60 |                 _list = list()
 61 |                 _list.append(customerID)
 62 |                 _list.append(self.isNone(feature_dict["gender"][gender]))
 63 |                 _list.append(self.isNone(SeniorCitizen))
 64 |                 _list.append(self.isNone(feature_dict["Partner"][Partner]))
 65 |                 _list.append(self.isNone(feature_dict["Dependents"][Dependents]))
 66 |                 _list.append(self.isNone(tenure))
 67 |                 _list.append(self.isNone(feature_dict["PhoneService"][PhoneService]))
 68 |                 _list.append(self.isNone(feature_dict["MultipleLines"][MultipleLines]))
 69 |                 _list.append(self.isNone(feature_dict["InternetService"][InternetService]))
 70 |                 _list.append(self.isNone(feature_dict["OnlineSecurity"][OnlineSecurity]))
 71 |                 _list.append(self.isNone(feature_dict["OnlineBackup"][OnlineBackup]))
 72 |                 _list.append(self.isNone(feature_dict["DeviceProtection"][DeviceProtection]))
 73 |                 _list.append(self.isNone(feature_dict["TechSupport"][TechSupport]))
 74 |                 _list.append(self.isNone(feature_dict["StreamingTV"][StreamingTV]))
 75 |                 _list.append(self.isNone(feature_dict["StreamingMovies"][StreamingMovies]))
 76 |                 _list.append(self.isNone(feature_dict["Contract"][Contract]))
 77 |                 _list.append(self.isNone(feature_dict["PaperlessBilling"][PaperlessBilling]))
 78 |                 _list.append(self.isNone(feature_dict["PaymentMethod"][PaymentMethod]))
 79 |                 _list.append(self.isNone(MonthlyCharges))
 80 |                 _list.append(self.isNone(TotalCharges))
 81 |                 _list.append(self.isNone(feature_dict["Churn"][Churn]))
 82 |                 fw.write(",".join(_list))
 83 |                 fw.write("\n")
 84 |             return pd.read_csv("../telecom-churn/new_churn.csv")
 85 |         else:
 86 |             return pd.read_csv("../telecom-churn/new_churn.csv")
 87 | 
 88 |     def split_data(self):
 89 |         train, test = train_test_split(self.data, test_size=0.1, random_state=40)
 90 |         return train, test
 91 | 
 92 |     def train_model(self):
 93 |         print("Start Train Model ... ")
 94 |         label = "Churn"
 95 |         ID = "customerID"
 96 |         x_columns = [x for x in self.train.columns if x not in [label, ID]]
 97 |         x_train = self.train[x_columns]
 98 |         y_train = self.train[label]
 99 |         gbdt = GradientBoostingClassifier(learning_rate=0.1, n_estimators=200, max_depth=5)
100 |         gbdt.fit(x_train, y_train)
101 |         return gbdt
102 | 
103 |     def evaluate(self, gbdt):
104 |         label = "Churn"
105 |         ID = "customerID"
106 |         x_columns = [x for x in self.train.columns if x not in [label, ID]]
107 |         x_test = self.test[x_columns]
108 |         y_test = self.test[label]
109 |         y_pre = gbdt.predict_proba(x_test)
110 |         new_y_pre = list()
111 |         for y in y_pre:
112 |             new_y_pre.append(1 if y[1] > 0.5 else 0)
113 |         mse = mean_squared_error(y_test, new_y_pre)
114 |         print("MSE: %.4f" % mse)
115 |         accuracy = metrics.accuracy_score(y_test.values, new_y_pre)
116 |         print("Accuracy: %.4g" % accuracy)
117 | 
118 | 
119 | if __name__ == "__main__":
120 |     pred = ChurnPredWithGBDT()
121 |     gbdt = pred.train_model()
122 |     pred.evaluate(gbdt)


--------------------------------------------------------------------------------