├── Factorization_Machine
│   ├── FM.py
│   ├── all.wmf
│   ├── data
│   │   ├── diabetes_test.txt
│   │   └── diabetes_train.txt
│   ├── rr.gif
│   └── rr2.gif
├── Inverted_index
│   └── invert_indexx.py
├── README.md
├── collaborative_filtering
│   ├── ItemCF
│   │   ├── item_book.txt
│   │   └── main.py
│   └── UserCF
│       ├── cf.gif
│       └── cf.py
├── ctr_fm_ffm
│   ├── FFM.py
│   ├── FM.py
│   └── LR.py
├── deepfm_recomend
│   ├── __pycache__
│   │   ├── deepfm.cpython-38.pyc
│   │   ├── feature.cpython-38.pyc
│   │   ├── feature_column.cpython-38.pyc
│   │   ├── inputs.cpython-38.pyc
│   │   └── inputs1.cpython-38.pyc
│   ├── activation.py
│   ├── core.py
│   ├── criteo_sample.txt
│   ├── deepfm.png
│   ├── deepfm_main.py
│   ├── feature.py
│   ├── inputs.py
│   ├── inputs1.py
│   ├── interaction.py
│   ├── layers
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-38.pyc
│   │   │   ├── activation.cpython-38.pyc
│   │   │   ├── core.cpython-38.pyc
│   │   │   ├── interaction.cpython-38.pyc
│   │   │   ├── normalization.cpython-38.pyc
│   │   │   ├── sequence.cpython-38.pyc
│   │   │   └── utils.cpython-38.pyc
│   │   ├── activation.py
│   │   ├── contrib
│   │   │   ├── __init__.py
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-38.pyc
│   │   │   │   ├── rnn_v2.cpython-38.pyc
│   │   │   │   └── utils.cpython-38.pyc
│   │   │   ├── rnn.py
│   │   │   ├── rnn_v2.py
│   │   │   └── utils.py
│   │   ├── core.py
│   │   ├── interaction.py
│   │   ├── normalization.py
│   │   ├── sequence.py
│   │   ├── untitled17.py
│   │   └── utils.py
│   ├── run_classification_criteo.py
│   ├── temp
│   │   └── deepfm.py
│   ├── xdeepfm.png
│   └── xdeepfm_main.py
├── ffm
│   ├── .gitignore
│   ├── ffm.py
│   ├── ffm_test.py
│   ├── logistic.py
│   ├── readme.md
│   ├── singleton.py
│   ├── train.txt
│   └── valid.txt
├── gbdt_source
│   ├── GBDTReg.py
│   ├── README.txt
│   ├── __pycache__
│   │   └── GBDTReg.cpython-37.pyc
│   ├── gbdt_demo.py
│   └── testGBDT.py
├── item_book.txt
├── logstic
│   ├── lf1000.gif
│   ├── logstic.py
│   └── testSet.txt
├── main.py
├── other
│   ├── DeepFM-Keras-master
│   │   ├── README.md
│   │   ├── data
│   │   │   └── README.md
│   │   └── keras_FM.py
│   ├── deep_and_wide_keras
│   │   └── wide_and_deep_keras.py
│   ├── deepfm
│   │   ├── DeepFM.py
│   │   ├── data
│   │   │   ├── README.md
│   │   │   ├── test.csv
│   │   │   └── train.csv
│   │   └── 广告预估CTR系列--DeepFM模型架构图--实现篇.jpg
│   └── svd
│       ├── README.md
│       ├── algorithm
│       │   ├── __init__.py
│       │   ├── __pycache__
│       │   │   ├── __init__.cpython-37.pyc
│       │   │   └── estimator.cpython-37.pyc
│       │   ├── dnn
│       │   │   ├── __init__.py
│       │   │   ├── __pycache__
│       │   │   │   ├── __init__.cpython-37.pyc
│       │   │   │   └── neumf.cpython-37.pyc
│       │   │   └── neumf.py
│       │   ├── estimator.py
│       │   ├── mf
│       │   │   ├── __init__.py
│       │   │   ├── __pycache__
│       │   │   │   ├── __init__.cpython-37.pyc
│       │   │   │   ├── baseline.cpython-37.pyc
│       │   │   │   ├── explicit_als.cpython-37.pyc
│       │   │   │   ├── implicit_als.cpython-37.pyc
│       │   │   │   ├── svd.cpython-37.pyc
│       │   │   │   └── svdpp.cpython-37.pyc
│       │   │   ├── baseline.py
│       │   │   ├── explicit_als.py
│       │   │   ├── implicit_als.py
│       │   │   ├── svd.py
│       │   │   └── svdpp.py
│       │   └── neighborhood
│       │       ├── __init__.py
│       │       ├── __init__.pyc
│       │       ├── __pycache__
│       │       │   ├── __init__.cpython-37.pyc
│       │       │   ├── itemcf.cpython-37.pyc
│       │       │   └── slop_one.cpython-37.pyc
│       │       ├── itemcf.py
│       │       ├── itemcf.pyc
│       │       ├── slop_one.py
│       │       └── slop_one.pyc
│       ├── data
│       │   └── ml-100k
│       │       └── u.data
│       ├── main.py
│       ├── tests
│       │   ├── __init__.py
│       │   └── algorithm_test.py
│       └── util
│           ├── __init__.py
│           ├── __pycache__
│           │   ├── __init__.cpython-37.pyc
│           │   ├── databuilder.cpython-37.pyc
│           │   ├── matrix.cpython-37.pyc
│           │   ├── measure.cpython-37.pyc
│           │   └── tools.cpython-37.pyc
│           ├── databuilder.py
│           ├── dnn_util.py
│           ├── matrix.py
│           ├── measure.py
│           └── tools.py
├── svd
│   ├── svd.gif
│   ├── untitled1.py
│   └── 满秩.gif
└── wide-and-deep-learning-keras
    ├── README.md
    ├── adult.data
    ├── adult.test
    ├── model.png
    ├── wide_and_deep.h5
    ├── wide_and_deep.png
    └── wide_and_deep.py

/Factorization_Machine/FM.py:
--------------------------------------------------------------------------------
# coding:UTF-8

from math import exp
from numpy import shape, zeros, mat
from datetime import datetime
import pandas as pd
import numpy as np

trainData = 'data/diabetes_train.txt'  # change to your own file path
testData = 'data/diabetes_test.txt'

def preprocessData(data):
    feature = np.array(data.iloc[:, :-1])  # features
    label = data.iloc[:, -1].map(lambda x: 1 if x == 1 else -1)  # labels, mapped to +1/-1
    # min-max normalise each feature column
    zmax, zmin = feature.max(axis=0), feature.min(axis=0)
    feature = (feature - zmin) / (zmax - zmin)
    label = np.array(label)

    return feature, label

def sigmoid(inx):
    # clip the argument to avoid overflow in exp() for large |inx|
    return 1.0 / (1.0 + exp(-max(min(inx, 15.0), -15.0)))

def SGD_FM(dataMatrix, classLabels, k, iter):
    '''
    :param dataMatrix: feature matrix (numpy mat)
    :param classLabels: labels in {+1, -1}
    :param k: size of the auxiliary (latent) vectors
    :param iter: number of iterations
    :return: w_0, w, v
    '''
    m, n = shape(dataMatrix)  # number of samples and number of features
    alpha = 0.01
    # initialise the parameters
    w = zeros((n, 1))  # first-order feature weights
    w_0 = 0.
    # latent vectors for the second-order cross terms; the original used
    # normalvariate(0, 0.2) * ones((n, k)), which makes every entry identical
    # and the factors degenerate -- independent draws keep them distinct
    v = np.random.normal(0, 0.2, (n, k))

    for it in range(iter):
        for x in range(m):  # stochastic optimisation: one sample at a time
            xx = np.array(dataMatrix[x])
            # interaction term via 0.5 * (sum of (x x^T) ∘ (v v^T) minus its trace)
            e = (xx.T @ xx) * (v @ v.T)
            interaction = 0.5 * (e.sum() - e.trace())
            p = w_0 + xx @ w + interaction
            # gradient coefficient of the logit loss: 1 - sigmoid(y * p)
            loss = 1 - sigmoid(classLabels[x] * p[0, 0])
            w_0 = w_0 + alpha * loss * classLabels[x]

            for i in range(n):
                if dataMatrix[x, i] != 0:
                    w[i, 0] = w[i, 0] + alpha * loss * classLabels[x] * xx[0, i]
                    for j in range(k):
                        # <x, v_:,j> appears in the gradient of the cross term
                        dot = float(xx @ v[:, j])
                        v[i, j] = v[i, j] + alpha * loss * classLabels[x] * xx[0, i] * (dot - v[i, j] * xx[0, i])
        print("iteration {}: last-sample residual 1 - sigmoid(y*p) = {}".format(it, loss))

    return w_0, w, v


def getAccuracy(dataMatrix, classLabels, w_0, w, v):
    # returns the classification *error rate* on (dataMatrix, classLabels)
    m, n = shape(dataMatrix)
    allItem = 0
    error = 0
    result = []
    for x in range(m):  # score every sample
        allItem += 1
        xx = np.array(dataMatrix[x])
        e = (xx.T @ xx) * (v @ v.T)
        interaction = 0.5 * (e.sum() - e.trace())
        p = w_0 + xx @ w + interaction

        pre = sigmoid(p[0, 0])
        result.append(pre)

        if pre < 0.5 and classLabels[x] == 1.0:
            error += 1
        elif pre >= 0.5 and classLabels[x] == -1.0:
            error += 1
        else:
            continue

    return float(error) / allItem


if __name__ == '__main__':
    train = pd.read_csv(trainData)
    test = pd.read_csv(testData)
    dataTrain, labelTrain = preprocessData(train)
    dataTest, labelTest = preprocessData(test)
    date_startTrain = datetime.now()
    print("start training")
    w_0, w, v = SGD_FM(mat(dataTrain), labelTrain, 20, 30)
    print("training accuracy: %f" % (1 - getAccuracy(mat(dataTrain), labelTrain, w_0, w, v)))
    date_endTrain = datetime.now()
    print("training time: %s" % (date_endTrain - date_startTrain))
    print("start testing")
    print("test accuracy: %f" % (1 - getAccuracy(mat(dataTest), labelTest, w_0, w, v)))
--------------------------------------------------------------------------------
/Factorization_Machine/all.wmf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/Factorization_Machine/all.wmf -------------------------------------------------------------------------------- /Factorization_Machine/data/diabetes_test.txt: -------------------------------------------------------------------------------- 1 | 2,117,90,19,71,25.2,0.313,21,0 2 | 3,84,72,32,0,37.2,0.267,28,0 3 | 6,0,68,41,0,39.0,0.727,41,1 4 | 7,94,64,25,79,33.3,0.738,41,0 5 | 3,96,78,39,0,37.3,0.238,40,0 6 | 10,75,82,0,0,33.3,0.263,38,0 7 | 0,180,90,26,90,36.5,0.314,35,1 8 | 1,130,60,23,170,28.6,0.692,21,0 9 | 2,84,50,23,76,30.4,0.968,21,0 10 | 8,120,78,0,0,25.0,0.409,64,0 11 | 12,84,72,31,0,29.7,0.297,46,1 12 | 0,139,62,17,210,22.1,0.207,21,0 13 | 9,91,68,0,0,24.2,0.200,58,0 14 | 2,91,62,0,0,27.3,0.525,22,0 15 | 3,99,54,19,86,25.6,0.154,24,0 16 | 3,163,70,18,105,31.6,0.268,28,1 17 | 9,145,88,34,165,30.3,0.771,53,1 18 | 7,125,86,0,0,37.6,0.304,51,0 19 | 13,76,60,0,0,32.8,0.180,41,0 20 | 6,129,90,7,326,19.6,0.582,60,0 21 | 2,68,70,32,66,25.0,0.187,25,0 22 | 3,124,80,33,130,33.2,0.305,26,0 23 | 6,114,0,0,0,0.0,0.189,26,0 24 | 9,130,70,0,0,34.2,0.652,45,1 25 | 3,125,58,0,0,31.6,0.151,24,0 26 | 3,87,60,18,0,21.8,0.444,21,0 27 | 1,97,64,19,82,18.2,0.299,21,0 28 | 3,116,74,15,105,26.3,0.107,24,0 29 | 0,117,66,31,188,30.8,0.493,22,0 30 | 0,111,65,0,0,24.6,0.660,31,0 31 | 2,122,60,18,106,29.8,0.717,22,0 32 | 0,107,76,0,0,45.3,0.686,24,0 33 | 1,86,66,52,65,41.3,0.917,29,0 34 | 6,91,0,0,0,29.8,0.501,31,0 35 | 1,77,56,30,56,33.3,1.251,24,0 36 | 4,132,0,0,0,32.9,0.302,23,1 37 | 0,105,90,0,0,29.6,0.197,46,0 38 | 0,57,60,0,0,21.7,0.735,67,0 39 | 0,127,80,37,210,36.3,0.804,23,0 40 | 3,129,92,49,155,36.4,0.968,32,1 41 | 8,100,74,40,215,39.4,0.661,43,1 42 | 3,128,72,25,190,32.4,0.549,27,1 43 | 10,90,85,32,0,34.9,0.825,56,1 44 | 4,84,90,23,56,39.5,0.159,25,0 45 | 1,88,78,29,76,32.0,0.365,29,0 46 | 8,186,90,35,225,34.5,0.423,37,1 47 | 5,187,76,27,207,43.6,1.034,53,1 48 | 4,131,68,21,166,33.1,0.160,28,0 49 | 1,164,82,43,67,32.8,0.341,50,0 50 | 4,189,110,31,0,28.5,0.680,37,0 51 | 1,116,70,28,0,27.4,0.204,21,0 52 | 3,84,68,30,106,31.9,0.591,25,0 53 | 6,114,88,0,0,27.8,0.247,66,0 54 | 1,88,62,24,44,29.9,0.422,23,0 55 | 1,84,64,23,115,36.9,0.471,28,0 56 | 7,124,70,33,215,25.5,0.161,37,0 57 | 1,97,70,40,0,38.1,0.218,30,0 58 | 8,110,76,0,0,27.8,0.237,58,0 59 | 11,103,68,40,0,46.2,0.126,42,0 60 | 11,85,74,0,0,30.1,0.300,35,0 61 | 6,125,76,0,0,33.8,0.121,54,1 62 | 0,198,66,32,274,41.3,0.502,28,1 63 | 1,87,68,34,77,37.6,0.401,24,0 64 | 6,99,60,19,54,26.9,0.497,32,0 65 | 0,91,80,0,0,32.4,0.601,27,0 66 | 2,95,54,14,88,26.1,0.748,22,0 67 | 1,99,72,30,18,38.6,0.412,21,0 68 | 6,92,62,32,126,32.0,0.085,46,0 69 | 4,154,72,29,126,31.3,0.338,37,0 70 | 0,121,66,30,165,34.3,0.203,33,1 71 | 3,78,70,0,0,32.5,0.270,39,0 72 | 2,130,96,0,0,22.6,0.268,21,0 73 | 3,111,58,31,44,29.5,0.430,22,0 74 | 2,98,60,17,120,34.7,0.198,22,0 75 | 1,143,86,30,330,30.1,0.892,23,0 76 | 1,119,44,47,63,35.5,0.280,25,0 77 | 6,108,44,20,130,24.0,0.813,35,0 78 | 2,118,80,0,0,42.9,0.693,21,1 79 | 10,133,68,0,0,27.0,0.245,36,0 80 | 2,197,70,99,0,34.7,0.575,62,1 81 | 0,151,90,46,0,42.1,0.371,21,1 82 | 6,109,60,27,0,25.0,0.206,27,0 83 | 12,121,78,17,0,26.5,0.259,62,0 84 | 8,100,76,0,0,38.7,0.190,42,0 85 | 8,124,76,24,600,28.7,0.687,52,1 86 | 1,93,56,11,0,22.5,0.417,22,0 87 | 8,143,66,0,0,34.9,0.129,41,1 88 | 6,103,66,0,0,24.3,0.249,29,0 89 | 3,176,86,27,156,33.3,1.154,52,1 90 | 0,73,0,0,0,21.1,0.342,25,0 91 | 11,111,84,40,0,46.8,0.925,45,1 92 | 
2,112,78,50,140,39.4,0.175,24,0 93 | 3,132,80,0,0,34.4,0.402,44,1 94 | 2,82,52,22,115,28.5,1.699,25,0 95 | 6,123,72,45,230,33.6,0.733,34,0 96 | 0,188,82,14,185,32.0,0.682,22,1 97 | 0,67,76,0,0,45.3,0.194,46,0 98 | 1,89,24,19,25,27.8,0.559,21,0 99 | 1,173,74,0,0,36.8,0.088,38,1 100 | 1,109,38,18,120,23.1,0.407,26,0 101 | 1,108,88,19,0,27.1,0.400,24,0 102 | 6,96,0,0,0,23.7,0.190,28,0 103 | 1,124,74,36,0,27.8,0.100,30,0 104 | 7,150,78,29,126,35.2,0.692,54,1 105 | 4,183,0,0,0,28.4,0.212,36,1 106 | 1,124,60,32,0,35.8,0.514,21,0 107 | 1,181,78,42,293,40.0,1.258,22,1 108 | 1,92,62,25,41,19.5,0.482,25,0 109 | 0,152,82,39,272,41.5,0.270,27,0 110 | 1,111,62,13,182,24.0,0.138,23,0 111 | 3,106,54,21,158,30.9,0.292,24,0 112 | 3,174,58,22,194,32.9,0.593,36,1 113 | 7,168,88,42,321,38.2,0.787,40,1 114 | 6,105,80,28,0,32.5,0.878,26,0 115 | 11,138,74,26,144,36.1,0.557,50,1 116 | 3,106,72,0,0,25.8,0.207,27,0 117 | 6,117,96,0,0,28.7,0.157,30,0 118 | 2,68,62,13,15,20.1,0.257,23,0 119 | 9,112,82,24,0,28.2,1.282,50,1 120 | 0,119,0,0,0,32.4,0.141,24,1 121 | 2,112,86,42,160,38.4,0.246,28,0 122 | 2,92,76,20,0,24.2,1.698,28,0 123 | 6,183,94,0,0,40.8,1.461,45,0 124 | 0,94,70,27,115,43.5,0.347,21,0 125 | 2,108,64,0,0,30.8,0.158,21,0 126 | 4,90,88,47,54,37.7,0.362,29,0 127 | 0,125,68,0,0,24.7,0.206,21,0 128 | 0,132,78,0,0,32.4,0.393,21,0 129 | 5,128,80,0,0,34.6,0.144,45,0 130 | 4,94,65,22,0,24.7,0.148,21,0 131 | 7,114,64,0,0,27.4,0.732,34,1 132 | 0,102,78,40,90,34.5,0.238,24,0 133 | 2,111,60,0,0,26.2,0.343,23,0 134 | 1,128,82,17,183,27.5,0.115,22,0 135 | 10,92,62,0,0,25.9,0.167,31,0 136 | 13,104,72,0,0,31.2,0.465,38,1 137 | 5,104,74,0,0,28.8,0.153,48,0 138 | 2,94,76,18,66,31.6,0.649,23,0 139 | 7,97,76,32,91,40.9,0.871,32,1 140 | 1,100,74,12,46,19.5,0.149,28,0 141 | 0,102,86,17,105,29.3,0.695,27,0 142 | 4,128,70,0,0,34.3,0.303,24,0 143 | 6,147,80,0,0,29.5,0.178,50,1 144 | 4,90,0,0,0,28.0,0.610,31,0 145 | 3,103,72,30,152,27.6,0.730,27,0 146 | 2,157,74,35,440,39.4,0.134,30,0 147 | 1,167,74,17,144,23.4,0.447,33,1 148 | 0,179,50,36,159,37.8,0.455,22,1 149 | 11,136,84,35,130,28.3,0.260,42,1 150 | 0,107,60,25,0,26.4,0.133,23,0 151 | 1,91,54,25,100,25.2,0.234,23,0 152 | 1,117,60,23,106,33.8,0.466,27,0 153 | 5,123,74,40,77,34.1,0.269,28,0 154 | 2,120,54,0,0,26.8,0.455,27,0 155 | 1,106,70,28,135,34.2,0.142,22,0 156 | 2,155,52,27,540,38.7,0.240,25,1 157 | 2,101,58,35,90,21.8,0.155,22,0 158 | 1,120,80,48,200,38.9,1.162,41,0 159 | 11,127,106,0,0,39.0,0.190,51,0 160 | 3,80,82,31,70,34.2,1.292,27,1 161 | 10,162,84,0,0,27.7,0.182,54,0 162 | 1,199,76,43,0,42.9,1.394,22,1 163 | 8,167,106,46,231,37.6,0.165,43,1 164 | 9,145,80,46,130,37.9,0.637,40,1 165 | 6,115,60,39,0,33.7,0.245,40,1 166 | 1,112,80,45,132,34.8,0.217,24,0 167 | 4,145,82,18,0,32.5,0.235,70,1 168 | 10,111,70,27,0,27.5,0.141,40,1 169 | 6,98,58,33,190,34.0,0.430,43,0 170 | 9,154,78,30,100,30.9,0.164,45,0 171 | 6,165,68,26,168,33.6,0.631,49,0 172 | 1,99,58,10,0,25.4,0.551,21,0 173 | 10,68,106,23,49,35.5,0.285,47,0 174 | 3,123,100,35,240,57.3,0.880,22,0 175 | 8,91,82,0,0,35.6,0.587,68,0 176 | 6,195,70,0,0,30.9,0.328,31,1 177 | 9,156,86,0,0,24.8,0.230,53,1 178 | 0,93,60,0,0,35.3,0.263,25,0 179 | 3,121,52,0,0,36.0,0.127,25,1 180 | 2,101,58,17,265,24.2,0.614,23,0 181 | 2,56,56,28,45,24.2,0.332,22,0 182 | 0,162,76,36,0,49.6,0.364,26,1 183 | 0,95,64,39,105,44.6,0.366,22,0 184 | 4,125,80,0,0,32.3,0.536,27,1 185 | 5,136,82,0,0,0.0,0.640,69,0 186 | 2,129,74,26,205,33.2,0.591,25,0 187 | 3,130,64,0,0,23.1,0.314,22,0 188 | 1,107,50,19,0,28.3,0.181,29,0 189 | 
1,140,74,26,180,24.1,0.828,23,0 190 | 1,144,82,46,180,46.1,0.335,46,1 191 | 8,107,80,0,0,24.6,0.856,34,0 192 | 13,158,114,0,0,42.3,0.257,44,1 193 | 2,121,70,32,95,39.1,0.886,23,0 194 | 7,129,68,49,125,38.5,0.439,43,1 195 | 2,90,60,0,0,23.5,0.191,25,0 196 | 7,142,90,24,480,30.4,0.128,43,1 197 | 3,169,74,19,125,29.9,0.268,31,1 198 | 0,99,0,0,0,25.0,0.253,22,0 199 | 4,127,88,11,155,34.5,0.598,28,0 200 | 4,118,70,0,0,44.5,0.904,26,0 201 | 2,122,76,27,200,35.9,0.483,26,0 202 | 6,125,78,31,0,27.6,0.565,49,1 203 | 1,168,88,29,0,35.0,0.905,52,1 204 | 2,129,0,0,0,38.5,0.304,41,0 205 | 4,110,76,20,100,28.4,0.118,27,0 206 | 6,80,80,36,0,39.8,0.177,28,0 207 | 10,115,0,0,0,0.0,0.261,30,1 208 | 2,127,46,21,335,34.4,0.176,22,0 209 | 9,164,78,0,0,32.8,0.148,45,1 210 | 2,93,64,32,160,38.0,0.674,23,1 211 | 3,158,64,13,387,31.2,0.295,24,0 212 | 5,126,78,27,22,29.6,0.439,40,0 213 | 10,129,62,36,0,41.2,0.441,38,1 214 | 0,134,58,20,291,26.4,0.352,21,0 215 | 3,102,74,0,0,29.5,0.121,32,0 216 | 7,187,50,33,392,33.9,0.826,34,1 217 | 3,173,78,39,185,33.8,0.970,31,1 218 | 10,94,72,18,0,23.1,0.595,56,0 219 | 1,108,60,46,178,35.5,0.415,24,0 220 | 5,97,76,27,0,35.6,0.378,52,1 221 | 4,83,86,19,0,29.3,0.317,34,0 222 | 1,114,66,36,200,38.1,0.289,21,0 223 | 1,149,68,29,127,29.3,0.349,42,1 224 | 5,117,86,30,105,39.1,0.251,42,0 225 | 1,111,94,0,0,32.8,0.265,45,0 226 | 4,112,78,40,0,39.4,0.236,38,0 227 | 1,116,78,29,180,36.1,0.496,25,0 228 | 0,141,84,26,0,32.4,0.433,22,0 229 | 2,175,88,0,0,22.9,0.326,22,0 230 | 2,92,52,0,0,30.1,0.141,22,0 231 | 3,130,78,23,79,28.4,0.323,34,1 232 | 8,120,86,0,0,28.4,0.259,22,1 233 | 2,174,88,37,120,44.5,0.646,24,1 234 | 2,106,56,27,165,29.0,0.426,22,0 235 | 2,105,75,0,0,23.3,0.560,53,0 236 | 4,95,60,32,0,35.4,0.284,28,0 237 | 0,126,86,27,120,27.4,0.515,21,0 238 | 8,65,72,23,0,32.0,0.600,42,0 239 | 2,99,60,17,160,36.6,0.453,21,0 240 | 1,102,74,0,0,39.5,0.293,42,1 241 | 11,120,80,37,150,42.3,0.785,48,1 242 | 3,102,44,20,94,30.8,0.400,26,0 243 | 1,109,58,18,116,28.5,0.219,22,0 244 | 9,140,94,0,0,32.7,0.734,45,1 245 | 13,153,88,37,140,40.6,1.174,39,0 246 | 12,100,84,33,105,30.0,0.488,46,0 247 | 1,147,94,41,0,49.3,0.358,27,1 248 | 1,81,74,41,57,46.3,1.096,32,0 249 | 3,187,70,22,200,36.4,0.408,36,1 250 | 6,162,62,0,0,24.3,0.178,50,1 251 | 4,136,70,0,0,31.2,1.182,22,1 252 | 1,121,78,39,74,39.0,0.261,28,0 253 | 3,108,62,24,0,26.0,0.223,25,0 254 | 0,181,88,44,510,43.3,0.222,26,1 255 | 8,154,78,32,0,32.4,0.443,45,1 256 | 1,128,88,39,110,36.5,1.057,37,1 257 | 7,137,90,41,0,32.0,0.391,39,0 258 | 0,123,72,0,0,36.3,0.258,52,1 259 | 1,106,76,0,0,37.5,0.197,26,0 260 | 6,190,92,0,0,35.5,0.278,66,1 261 | 2,88,58,26,16,28.4,0.766,22,0 262 | 9,170,74,31,0,44.0,0.403,43,1 263 | 9,89,62,0,0,22.5,0.142,33,0 264 | 10,101,76,48,180,32.9,0.171,63,0 265 | 2,122,70,27,0,36.8,0.340,27,0 266 | 5,121,72,23,112,26.2,0.245,30,0 267 | 1,126,60,0,0,30.1,0.349,47,1 268 | 1,93,70,31,0,30.4,0.315,23,0 -------------------------------------------------------------------------------- /Factorization_Machine/rr.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/Factorization_Machine/rr.gif -------------------------------------------------------------------------------- /Factorization_Machine/rr2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/Factorization_Machine/rr2.gif 
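
Interaction-term note for /Factorization_Machine/FM.py: the code computes the second-order term as 0.5*(e.sum() - e.trace()) with e = (x x^T) ∘ (v v^T), which is O(n^2 * k) per sample. A minimal numpy sketch (not part of the repo; fm_interaction is a hypothetical name) of the equivalent O(n*k) form 0.5 * sum_f ((x·v_f)^2 - sum_i x_i^2 v_if^2):

import numpy as np

def fm_interaction(x, v):
    # pairwise FM term sum_{i<j} <v_i, v_j> x_i x_j without the n x n matrices
    xv = x @ v                                        # shape (k,)
    return 0.5 * float(np.sum(xv ** 2 - (x ** 2) @ (v ** 2)))

# agreement check against the trace form used in FM.py
rng = np.random.default_rng(0)
x, v = rng.random(8), rng.normal(0, 0.2, (8, 4))
e = np.outer(x, x) * (v @ v.T)
assert np.isclose(fm_interaction(x, v), 0.5 * (e.sum() - e.trace()))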
--------------------------------------------------------------------------------
/Inverted_index/invert_indexx.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun May 5 13:15:25 2019

@author: lg
"""

# import jieba  # for Chinese text, use jieba.cut instead of str.split

docu_set = {'d1': 'i love shanghai',
            'd2': 'i am from shanghai now i study in tongji university',
            'd3': 'i am from lanzhou now i study in lanzhou university of science and technology'}

# collect the vocabulary of all documents
all_words = []
for text in docu_set.values():
    all_words.extend(text.split())

set_all_words = set(all_words)
print(set_all_words)

# inverted index: word -> list of ids of the documents containing it
invert_index = dict()
for word in set_all_words:
    postings = []
    for doc_id, text in docu_set.items():
        if word in text.split():
            postings.append(doc_id)
    invert_index[word] = postings

print(invert_index)
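
A usage sketch for the index above (and_query is a hypothetical helper, not in the repo): the documents matching every term of a boolean AND query are the intersection of the per-term posting lists.

def and_query(index, terms):
    postings = [set(index.get(t, [])) for t in terms]
    return set.intersection(*postings) if postings else set()

print(and_query(invert_index, ['shanghai', 'study']))  # -> {'d2'}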
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# recommend_sys
Various recommendation algorithms
--------------------------------------------------------------------------------
/collaborative_filtering/ItemCF/item_book.txt:
--------------------------------------------------------------------------------
Liu Yi,3,1001
Chen Er,4,1001
Zhang San,3,1001
Li Si,3,1001
Liu Yi,3,1002
Li Si,4,1002
Liu Yi,4,1003
Zhang San,5,1003
Li Si,5,1003
Liu Yi,4,1004
Zhang San,3,1004
Liu Yi,5,1005
--------------------------------------------------------------------------------
/collaborative_filtering/ItemCF/main.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#-*-coding:utf-8-*-

import math

class ItemBasedCF:
    def __init__(self, train_file):
        self.train_file = train_file
        self.readData()

    def readData(self):
        # read the file ("user,score,item" per line) and build the
        # user -> {item: score} rating table
        self.train = dict()
        for line in open(self.train_file):
            user, score, item = line.strip().split(",")
            self.train.setdefault(user, {})
            self.train[user][item] = int(float(score))

    def ItemSimilarity(self):
        # build the item-item co-occurrence matrix
        cooccur = dict()  # cooccur[i][j]: number of users who rated both i and j
        buy = dict()      # buy[i]: number of distinct users who rated item i
        for user, items in self.train.items():
            for i in items.keys():
                buy.setdefault(i, 0)
                buy[i] += 1
                cooccur.setdefault(i, {})
                for j in items.keys():
                    if i == j: continue
                    cooccur[i].setdefault(j, 0)
                    cooccur[i][j] += 1
        # cosine similarity: sim(i, j) = cooccur[i][j] / sqrt(buy[i] * buy[j])
        self.similar = dict()
        for i, related_items in cooccur.items():
            self.similar.setdefault(i, {})
            for j, cij in related_items.items():
                self.similar[i][j] = cij / (math.sqrt(buy[i] * buy[j]))
        return self.similar

    # recommend for `user`: K most similar items per rated item, top-N results
    def Recommend(self, user, K=3, N=10):
        rank = dict()
        action_item = self.train[user]  # items the user has rated, with scores
        for item, score in action_item.items():
            sortedItems = sorted(self.similar[item].items(), key=lambda x: x[1], reverse=True)[0:K]
            for j, wj in sortedItems:
                if j in action_item.keys():
                    continue  # skip items the user has already rated
                rank.setdefault(j, 0)
                rank[j] += score * wj
        return dict(sorted(rank.items(), key=lambda x: x[1], reverse=True)[0:N])

# build an ItemBasedCF object and recommend for user "Li Si"
item = ItemBasedCF("item_book.txt")
item.ItemSimilarity()
recommedDict = item.Recommend("Li Si")
for k, v in recommedDict.items():
    print(k, "\t", v)
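
A worked check of the ItemSimilarity formula above, with numbers taken from item_book.txt: items 1001 and 1003 share 3 raters (Liu Yi, Zhang San, Li Si), 1001 has 4 raters and 1003 has 3, so sim(1001, 1003) = 3 / sqrt(4 * 3) ≈ 0.866.

import math
print(3 / math.sqrt(4 * 3))  # 0.8660..., the cij / sqrt(buy[i] * buy[j]) value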
--------------------------------------------------------------------------------
/collaborative_filtering/UserCF/cf.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/collaborative_filtering/UserCF/cf.gif
--------------------------------------------------------------------------------
/collaborative_filtering/UserCF/cf.py:
--------------------------------------------------------------------------------
# coding:UTF-8
'''
Date: 20180624
@author: luogan
'''

import numpy as np
import pandas
from numpy import mat


def fetch_data():
    # user-item rating matrix; 0 means "not interacted"
    dat = mat([[4., 3., 0., 5., 0.],
               [5., 0., 4., 4., 0.],
               [4., 0., 5., 0., 3.],
               [2., 3., 0., 1., 0.],
               [0., 4., 2., 0., 5.]])
    return dat


def cos_sim(x, y):
    '''Cosine similarity.
    input:  x (mat): a row vector, either a user or an item
            y (mat): a row vector, either a user or an item
    output: the cosine similarity between x and y
    '''
    numerator = x * y.T  # inner product of x and y
    denominator = np.sqrt(x * x.T) * np.sqrt(y * y.T)
    return (numerator / denominator)[0, 0]


def similarity(data):
    '''Compute the similarity between any two rows of a matrix.
    input:  data (mat): an arbitrary matrix
    output: w (mat): pairwise row similarities
    '''
    m = np.shape(data)[0]  # number of users
    # initialise the similarity matrix
    w = np.mat(np.zeros((m, m)))

    for i in range(m):
        for j in range(i, m):
            if j != i:
                # similarity between rows i and j
                w[i, j] = cos_sim(data[i, ], data[j, ])
                w[j, i] = w[i, j]
            else:
                w[i, j] = 0
    return w


def user_based_recommend(data, w, user):
    '''Recommend items to `user` based on user similarity.
    input:  data (mat): the user-item matrix
            w (mat): user-user similarities
            user (int): index of the user
    output: predict (dict): item index -> predicted score
    '''
    m, n = np.shape(data)
    interaction = data[user, ]  # the user's row of item interactions

    # 1. find the items the user has not interacted with
    not_inter = []
    for i in range(n):
        if interaction[0, i] == 0:
            not_inter.append(i)

    # 2. predict a score for each such item:
    #    predict[i] = sum over users u of w[user, u] * data[u, i]
    predict = {}
    dd = np.array(data)
    ww = np.array(w)
    for i in not_inter:
        predict[i] = ww[user] @ dd[:, i].T
    return predict


def top_k(predict, k):
    '''Return the top-k items for the user.
    input:  predict (dict): item index -> predicted score
            k (int): number of items to recommend
    output: the k highest-scoring items
    '''
    pp = pandas.Series(predict)
    pp1 = pp.sort_values(ascending=False)
    # slicing handles both cases (the original branched on len(predict) and
    # returned the unsliced series when k was smaller, which defeated top-k)
    return pp1.iloc[:k]


def normalize(w):
    '''L1-normalise each row of the similarity matrix.'''
    w = np.array(w)
    ww = []
    for row in w:
        s = sum(abs(v) for v in row)
        ww.append([v / s for v in row])
    return mat(ww)


data = fetch_data()

print('collaborative filtering only')
w_initial = similarity(data)
# 3. recommend using the user-user similarities
predict = user_based_recommend(data, w_initial, 1)
# 4. keep the top-k recommendations
top_recom = top_k(predict, 1)
print('top_recom=', top_recom)

print('collaborative filtering + row-normalised similarity matrix')
w_initial = similarity(data)
w_initial_normal = normalize(w_initial)
predict = user_based_recommend(data, w_initial_normal, 1)
top_recom = top_k(predict, 1)
print('top_recom=', top_recom)
--------------------------------------------------------------------------------
/ctr_fm_ffm/FFM.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


class MyLayer(tf.keras.layers.Layer):
    """Field-aware pairwise interaction: each feature i keeps one latent
    vector per field, and pair (i, j) uses <w_{i, field(j)}, w_{j, field(i)}>."""

    def __init__(self, field_dict, field_dim, input_dim, output_dim=30, **kwargs):
        self.field_dict = field_dict
        self.field_dim = field_dim
        self.input_dim = input_dim
        self.output_dim = output_dim
        super(MyLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.kernel = self.add_weight(name='kernel',
                                      shape=(self.input_dim, self.field_dim, self.output_dim),
                                      initializer='glorot_uniform',
                                      trainable=True)
        super(MyLayer, self).build(input_shape)

    def call(self, x):
        # accumulate the per-sample cross term in a plain tensor (the original
        # created a K.variable inside call, which TF2 dislikes when tracing)
        field_cross = tf.zeros_like(x[:, 0])
        for i in range(self.input_dim):
            for j in range(i + 1, self.input_dim):
                weight = tf.math.reduce_sum(
                    tf.math.multiply(self.kernel[i, self.field_dict[j]],
                                     self.kernel[j, self.field_dict[i]]))
                field_cross += tf.math.multiply(weight, tf.math.multiply(x[:, i], x[:, j]))
        return field_cross

    def compute_output_shape(self, input_shape):
        return (input_shape[0], 1)

def FFM(feature_dim, field_dict, field_dim, output_dim=30):
    inputs = tf.keras.Input((feature_dim,))
    liner = tf.keras.layers.Dense(1)(inputs)  # first-order (linear) term
    cross = MyLayer(field_dict, field_dim, feature_dim, output_dim)(inputs)
    cross = tf.keras.layers.Reshape((1,))(cross)
    add = tf.keras.layers.Add()([liner, cross])
    predictions = tf.keras.layers.Activation('sigmoid')(add)
    model = tf.keras.Model(inputs=inputs, outputs=predictions)
    model.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(0.001),  # tf.train.AdamOptimizer in the original TF1.x code
                  metrics=['binary_accuracy'])
    return model

def train():
    # 30 features grouped into 6 fields of 5 consecutive features each
    field_dict = {i: i // 5 for i in range(30)}
    ffm = FFM(30, field_dict, 6, 30)
    data = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2,
                                                        random_state=27, stratify=data.target)
    ffm.fit(X_train, y_train, epochs=3, batch_size=16, validation_data=(X_test, y_test))
    return ffm


if __name__ == '__main__':
    ffm = train()
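
A numpy sketch of the field-aware term computed in MyLayer.call above (ffm_cross is a hypothetical name; it assumes dense inputs and the same field_dict convention): each feature keeps one latent vector per field, and pair (i, j) is weighted by <w_{i, field(j)}, w_{j, field(i)}>.

import numpy as np

def ffm_cross(x, kernel, field_dict):
    # kernel has shape (n_features, n_fields, k)
    total = 0.0
    for i in range(len(x)):
        for j in range(i + 1, len(x)):
            w = kernel[i, field_dict[j]] @ kernel[j, field_dict[i]]
            total += w * x[i] * x[j]
    return total

rng = np.random.default_rng(0)
x = rng.random(6)
kernel = rng.normal(size=(6, 2, 4))
print(ffm_cross(x, kernel, {i: i // 3 for i in range(6)}))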
--------------------------------------------------------------------------------
/ctr_fm_ffm/FM.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
K = tf.keras.backend


class MyLayer(tf.keras.layers.Layer):
    """Second-order FM term via the identity
    sum_{i<j} <v_i, v_j> x_i x_j = 0.5 * ((xV)^2 - x^2 V^2) reduced over factors."""

    def __init__(self, input_dim, output_dim=30, **kwargs):
        self.input_dim = input_dim
        self.output_dim = output_dim
        super(MyLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.kernel = self.add_weight(name='kernel',
                                      shape=(self.input_dim, self.output_dim),
                                      initializer='glorot_uniform',
                                      trainable=True)
        super(MyLayer, self).build(input_shape)

    def call(self, x):
        a = K.pow(K.dot(x, self.kernel), 2)
        b = K.dot(K.pow(x, 2), K.pow(self.kernel, 2))
        # mean over the k latent factors (the classical FM uses a sum;
        # the mean only rescales the term by 1/k)
        return K.mean(a - b, 1, keepdims=True) * 0.5

    def compute_output_shape(self, input_shape):
        # call() reduces over the factor axis, so the output is (batch, 1)
        # (the original returned (batch, output_dim), which was wrong)
        return (input_shape[0], 1)

def FM(feature_dim):
    inputs = tf.keras.Input((feature_dim,))
    liner = tf.keras.layers.Dense(units=1,
                                  bias_regularizer=tf.keras.regularizers.l2(0.01),
                                  kernel_regularizer=tf.keras.regularizers.l1(0.02),
                                  )(inputs)
    cross = MyLayer(feature_dim)(inputs)
    add = tf.keras.layers.Add()([liner, cross])
    predictions = tf.keras.layers.Activation('sigmoid')(add)
    model = tf.keras.Model(inputs=inputs, outputs=predictions)
    model.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(0.001),  # tf.train.AdamOptimizer in the original TF1.x code
                  metrics=['binary_accuracy'])
    return model

def train():
    fm = FM(30)
    data = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2,
                                                        random_state=27, stratify=data.target)
    fm.fit(X_train, y_train, epochs=3, batch_size=16, validation_data=(X_test, y_test))
    return fm


if __name__ == '__main__':
    fm = train()
--------------------------------------------------------------------------------
/ctr_fm_ffm/LR.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


def lr_model():
    # logistic regression: a single sigmoid unit over the 30 input features
    inputs = tf.keras.Input((30,))
    pred = tf.keras.layers.Dense(units=1,
                                 bias_regularizer=tf.keras.regularizers.l2(0.01),
                                 kernel_regularizer=tf.keras.regularizers.l1(0.02),
                                 activation=tf.nn.sigmoid)(inputs)
    lr = tf.keras.Model(inputs, pred)
    lr.compile(loss='binary_crossentropy',
               optimizer=tf.keras.optimizers.Adam(0.001),  # tf.train.AdamOptimizer in the original TF1.x code
               metrics=['binary_accuracy'])
    return lr

def train():
    lr = lr_model()
    data = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2,
                                                        random_state=27, stratify=data.target)
    lr.fit(X_train, y_train, epochs=3, batch_size=16, validation_data=(X_test, y_test))
    return lr

if __name__ == '__main__':
    lr = train()
--------------------------------------------------------------------------------
/deepfm_recomend/__pycache__/deepfm.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/__pycache__/deepfm.cpython-38.pyc
--------------------------------------------------------------------------------
/deepfm_recomend/__pycache__/feature.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/__pycache__/feature.cpython-38.pyc
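
The three scripts in ctr_fm_ffm/ train on the same breast-cancer split, so their validation metrics are directly comparable (LR: linear terms only; FM: plus shared second-order factors; FFM: plus field-aware factors). A sketch, assuming it is run from inside ctr_fm_ffm/:

import LR, FM, FFM

for module in (LR, FM, FFM):
    print('---', module.__name__, '---')
    module.train()  # each train() prints per-epoch validation metrics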
-------------------------------------------------------------------------------- /deepfm_recomend/__pycache__/feature_column.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/__pycache__/feature_column.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/__pycache__/inputs.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/__pycache__/inputs.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/__pycache__/inputs1.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/__pycache__/inputs1.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/activation.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | 4 | Author: 5 | Weichen Shen,wcshen1994@163.com 6 | 7 | """ 8 | 9 | import tensorflow as tf 10 | from tensorflow.python.keras.initializers import Zeros 11 | from tensorflow.python.keras.layers import Layer 12 | 13 | try: 14 | unicode 15 | except NameError: 16 | unicode = str 17 | 18 | 19 | class Dice(Layer): 20 | """The Data Adaptive Activation Function in DIN,which can be viewed as a generalization of PReLu and can adaptively adjust the rectified point according to distribution of input data. 21 | 22 | Input shape 23 | - Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model. 24 | 25 | Output shape 26 | - Same shape as the input. 27 | 28 | Arguments 29 | - **axis** : Integer, the axis that should be used to compute data distribution (typically the features axis). 30 | 31 | - **epsilon** : Small float added to variance to avoid dividing by zero. 32 | 33 | References 34 | - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf) 35 | """ 36 | 37 | def __init__(self, axis=-1, epsilon=1e-9, **kwargs): 38 | self.axis = axis 39 | self.epsilon = epsilon 40 | super(Dice, self).__init__(**kwargs) 41 | 42 | def build(self, input_shape): 43 | self.bn = tf.keras.layers.BatchNormalization( 44 | axis=self.axis, epsilon=self.epsilon, center=False, scale=False) 45 | self.alphas = self.add_weight(shape=(input_shape[-1],), initializer=Zeros( 46 | ), dtype=tf.float32, name='dice_alpha') # name='alpha_'+self.name 47 | super(Dice, self).build(input_shape) # Be sure to call this somewhere! 
48 | self.uses_learning_phase = True 49 | 50 | def call(self, inputs, training=None, **kwargs): 51 | inputs_normed = self.bn(inputs, training=training) 52 | # tf.layers.batch_normalization( 53 | # inputs, axis=self.axis, epsilon=self.epsilon, center=False, scale=False) 54 | x_p = tf.sigmoid(inputs_normed) 55 | return self.alphas * (1.0 - x_p) * inputs + x_p * inputs 56 | 57 | def compute_output_shape(self, input_shape): 58 | return input_shape 59 | 60 | def get_config(self, ): 61 | config = {'axis': self.axis, 'epsilon': self.epsilon} 62 | base_config = super(Dice, self).get_config() 63 | return dict(list(base_config.items()) + list(config.items())) 64 | 65 | 66 | def activation_layer(activation): 67 | if activation in ("dice", "Dice"): 68 | act_layer = Dice() 69 | elif isinstance(activation, (str, unicode)): 70 | act_layer = tf.keras.layers.Activation(activation) 71 | elif issubclass(activation, Layer): 72 | act_layer = activation() 73 | else: 74 | raise ValueError( 75 | "Invalid activation,found %s.You should use a str or a Activation Layer Class." % (activation)) 76 | return act_layer 77 | -------------------------------------------------------------------------------- /deepfm_recomend/core.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | 4 | Author: 5 | Weichen Shen,wcshen1994@163.com 6 | 7 | """ 8 | 9 | import tensorflow as tf 10 | from tensorflow.python.keras import backend as K 11 | from tensorflow.python.keras.initializers import Zeros, glorot_normal 12 | from tensorflow.python.keras.layers import Layer 13 | from tensorflow.python.keras.regularizers import l2 14 | 15 | from .activation import activation_layer 16 | 17 | 18 | class LocalActivationUnit(Layer): 19 | """The LocalActivationUnit used in DIN with which the representation of 20 | user interests varies adaptively given different candidate items. 21 | 22 | Input shape 23 | - A list of two 3D tensor with shape: ``(batch_size, 1, embedding_size)`` and ``(batch_size, T, embedding_size)`` 24 | 25 | Output shape 26 | - 3D tensor with shape: ``(batch_size, T, 1)``. 27 | 28 | Arguments 29 | - **hidden_units**:list of positive integer, the attention net layer number and units in each layer. 30 | 31 | - **activation**: Activation function to use in attention net. 32 | 33 | - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix of attention net. 34 | 35 | - **dropout_rate**: float in [0,1). Fraction of the units to dropout in attention net. 36 | 37 | - **use_bn**: bool. Whether use BatchNormalization before activation or not in attention net. 38 | 39 | - **seed**: A Python integer to use as random seed. 40 | 41 | References 42 | - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 
ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf) 43 | """ 44 | 45 | def __init__(self, hidden_units=(64, 32), activation='sigmoid', l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, 46 | **kwargs): 47 | self.hidden_units = hidden_units 48 | self.activation = activation 49 | self.l2_reg = l2_reg 50 | self.dropout_rate = dropout_rate 51 | self.use_bn = use_bn 52 | self.seed = seed 53 | super(LocalActivationUnit, self).__init__(**kwargs) 54 | self.supports_masking = True 55 | 56 | def build(self, input_shape): 57 | 58 | if not isinstance(input_shape, list) or len(input_shape) != 2: 59 | raise ValueError('A `LocalActivationUnit` layer should be called ' 60 | 'on a list of 2 inputs') 61 | 62 | if len(input_shape[0]) != 3 or len(input_shape[1]) != 3: 63 | raise ValueError("Unexpected inputs dimensions %d and %d, expect to be 3 dimensions" % ( 64 | len(input_shape[0]), len(input_shape[1]))) 65 | 66 | if input_shape[0][-1] != input_shape[1][-1] or input_shape[0][1] != 1: 67 | raise ValueError('A `LocalActivationUnit` layer requires ' 68 | 'inputs of a two inputs with shape (None,1,embedding_size) and (None,T,embedding_size)' 69 | 'Got different shapes: %s,%s' % (input_shape[0], input_shape[1])) 70 | size = 4 * \ 71 | int(input_shape[0][-1] 72 | ) if len(self.hidden_units) == 0 else self.hidden_units[-1] 73 | self.kernel = self.add_weight(shape=(size, 1), 74 | initializer=glorot_normal( 75 | seed=self.seed), 76 | name="kernel") 77 | self.bias = self.add_weight( 78 | shape=(1,), initializer=Zeros(), name="bias") 79 | self.dnn = DNN(self.hidden_units, self.activation, self.l2_reg, 80 | self.dropout_rate, self.use_bn, seed=self.seed) 81 | 82 | self.dense = tf.keras.layers.Lambda(lambda x: tf.nn.bias_add(tf.tensordot( 83 | x[0], x[1], axes=(-1, 0)), x[2])) 84 | 85 | super(LocalActivationUnit, self).build( 86 | input_shape) # Be sure to call this somewhere! 87 | 88 | def call(self, inputs, training=None, **kwargs): 89 | 90 | query, keys = inputs 91 | 92 | keys_len = keys.get_shape()[1] 93 | queries = K.repeat_elements(query, keys_len, 1) 94 | 95 | att_input = tf.concat( 96 | [queries, keys, queries - keys, queries * keys], axis=-1) 97 | 98 | att_out = self.dnn(att_input, training=training) 99 | 100 | attention_score = self.dense([att_out, self.kernel, self.bias]) 101 | 102 | return attention_score 103 | 104 | def compute_output_shape(self, input_shape): 105 | return input_shape[1][:2] + (1,) 106 | 107 | def compute_mask(self, inputs, mask): 108 | return mask 109 | 110 | def get_config(self, ): 111 | config = {'activation': self.activation, 'hidden_units': self.hidden_units, 112 | 'l2_reg': self.l2_reg, 'dropout_rate': self.dropout_rate, 'use_bn': self.use_bn, 'seed': self.seed} 113 | base_config = super(LocalActivationUnit, self).get_config() 114 | return dict(list(base_config.items()) + list(config.items())) 115 | 116 | 117 | class DNN(Layer): 118 | """The Multi Layer Percetron 119 | 120 | Input shape 121 | - nD tensor with shape: ``(batch_size, ..., input_dim)``. The most common situation would be a 2D input with shape ``(batch_size, input_dim)``. 122 | 123 | Output shape 124 | - nD tensor with shape: ``(batch_size, ..., hidden_size[-1])``. For instance, for a 2D input with shape ``(batch_size, input_dim)``, the output would have shape ``(batch_size, hidden_size[-1])``. 125 | 126 | Arguments 127 | - **hidden_units**:list of positive integer, the layer number and units in each layer. 128 | 129 | - **activation**: Activation function to use. 130 | 131 | - **l2_reg**: float between 0 and 1. 
L2 regularizer strength applied to the kernel weights matrix. 132 | 133 | - **dropout_rate**: float in [0,1). Fraction of the units to dropout. 134 | 135 | - **use_bn**: bool. Whether use BatchNormalization before activation or not. 136 | 137 | - **seed**: A Python integer to use as random seed. 138 | """ 139 | 140 | def __init__(self, hidden_units, activation='relu', l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, **kwargs): 141 | self.hidden_units = hidden_units 142 | self.activation = activation 143 | self.dropout_rate = dropout_rate 144 | self.seed = seed 145 | self.l2_reg = l2_reg 146 | self.use_bn = use_bn 147 | super(DNN, self).__init__(**kwargs) 148 | 149 | def build(self, input_shape): 150 | # if len(self.hidden_units) == 0: 151 | # raise ValueError("hidden_units is empty") 152 | input_size = input_shape[-1] 153 | hidden_units = [int(input_size)] + list(self.hidden_units) 154 | self.kernels = [self.add_weight(name='kernel' + str(i), 155 | shape=( 156 | hidden_units[i], hidden_units[i + 1]), 157 | initializer=glorot_normal( 158 | seed=self.seed), 159 | regularizer=l2(self.l2_reg), 160 | trainable=True) for i in range(len(self.hidden_units))] 161 | self.bias = [self.add_weight(name='bias' + str(i), 162 | shape=(self.hidden_units[i],), 163 | initializer=Zeros(), 164 | trainable=True) for i in range(len(self.hidden_units))] 165 | if self.use_bn: 166 | self.bn_layers = [tf.keras.layers.BatchNormalization() for _ in range(len(self.hidden_units))] 167 | 168 | self.dropout_layers = [tf.keras.layers.Dropout(self.dropout_rate, seed=self.seed + i) for i in 169 | range(len(self.hidden_units))] 170 | 171 | self.activation_layers = [activation_layer(self.activation) for _ in range(len(self.hidden_units))] 172 | 173 | super(DNN, self).build(input_shape) # Be sure to call this somewhere! 174 | 175 | def call(self, inputs, training=None, **kwargs): 176 | 177 | deep_input = inputs 178 | 179 | for i in range(len(self.hidden_units)): 180 | fc = tf.nn.bias_add(tf.tensordot( 181 | deep_input, self.kernels[i], axes=(-1, 0)), self.bias[i]) 182 | # fc = Dense(self.hidden_size[i], activation=None, \ 183 | # kernel_initializer=glorot_normal(seed=self.seed), \ 184 | # kernel_regularizer=l2(self.l2_reg))(deep_input) 185 | if self.use_bn: 186 | fc = self.bn_layers[i](fc, training=training) 187 | 188 | fc = self.activation_layers[i](fc) 189 | 190 | fc = self.dropout_layers[i](fc, training=training) 191 | deep_input = fc 192 | 193 | return deep_input 194 | 195 | def compute_output_shape(self, input_shape): 196 | if len(self.hidden_units) > 0: 197 | shape = input_shape[:-1] + (self.hidden_units[-1],) 198 | else: 199 | shape = input_shape 200 | 201 | return tuple(shape) 202 | 203 | def get_config(self, ): 204 | config = {'activation': self.activation, 'hidden_units': self.hidden_units, 205 | 'l2_reg': self.l2_reg, 'use_bn': self.use_bn, 'dropout_rate': self.dropout_rate, 'seed': self.seed} 206 | base_config = super(DNN, self).get_config() 207 | return dict(list(base_config.items()) + list(config.items())) 208 | 209 | 210 | class PredictionLayer(Layer): 211 | """ 212 | Arguments 213 | - **task**: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss 214 | 215 | - **use_bias**: bool.Whether add bias term or not. 
216 | """ 217 | 218 | def __init__(self, task='binary', use_bias=True, **kwargs): 219 | if task not in ["binary", "multiclass", "regression"]: 220 | raise ValueError("task must be binary,multiclass or regression") 221 | self.task = task 222 | self.use_bias = use_bias 223 | super(PredictionLayer, self).__init__(**kwargs) 224 | 225 | def build(self, input_shape): 226 | 227 | if self.use_bias: 228 | self.global_bias = self.add_weight( 229 | shape=(1,), initializer=Zeros(), name="global_bias") 230 | 231 | # Be sure to call this somewhere! 232 | super(PredictionLayer, self).build(input_shape) 233 | 234 | def call(self, inputs, **kwargs): 235 | x = inputs 236 | if self.use_bias: 237 | x = tf.nn.bias_add(x, self.global_bias, data_format='NHWC') 238 | if self.task == "binary": 239 | x = tf.sigmoid(x) 240 | 241 | output = tf.reshape(x, (-1, 1)) 242 | 243 | return output 244 | 245 | def compute_output_shape(self, input_shape): 246 | return (None, 1) 247 | 248 | def get_config(self, ): 249 | config = {'task': self.task, 'use_bias': self.use_bias} 250 | base_config = super(PredictionLayer, self).get_config() 251 | return dict(list(base_config.items()) + list(config.items())) 252 | 253 | 254 | -------------------------------------------------------------------------------- /deepfm_recomend/deepfm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/deepfm.png -------------------------------------------------------------------------------- /deepfm_recomend/deepfm_main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Oct 13 19:50:43 2020 5 | 6 | @author: ledi 7 | """ 8 | 9 | 10 | 11 | import pandas as pd 12 | from sklearn.metrics import log_loss, roc_auc_score 13 | from sklearn.model_selection import train_test_split 14 | from sklearn.preprocessing import LabelEncoder, MinMaxScaler 15 | # from feature_column import build_input_features, get_linear_logit, DEFAULT_GROUP_NAME, input_from_feature_columns 16 | from layers.core import PredictionLayer, DNN 17 | from layers.interaction import FM 18 | from layers.utils import concat_func, add_func, combined_dnn_input 19 | # from deepfm import DeepFM 20 | 21 | from keras.layers import Dense 22 | # from feature_column import SparseFeat, DenseFeat, get_feature_names 23 | 24 | # if __name__ == "__main__": 25 | data = pd.read_csv('./criteo_sample.txt') 26 | 27 | 28 | #离散的特征名称 29 | sparse_features = ['C' + str(i) for i in range(1, 27)] 30 | 31 | #数值的特征名称 32 | dense_features = ['I' + str(i) for i in range(1, 14)] 33 | 34 | #对缺失的特征进行填充 35 | data[sparse_features] = data[sparse_features].fillna('-1', ) 36 | data[dense_features] = data[dense_features].fillna(0, ) 37 | target = ['label'] 38 | 39 | 40 | #数据预处理 41 | # 1.Label Encoding for sparse features,and do simple Transformation for dense features 42 | #对离散特征进行编码 43 | for feat in sparse_features: 44 | lbe = LabelEncoder() 45 | data[feat] = lbe.fit_transform(data[feat]) 46 | #数值特征进行最大最小归一化 47 | mms = MinMaxScaler(feature_range=(0, 1)) 48 | data[dense_features] = mms.fit_transform(data[dense_features]) 49 | 50 | 51 | 52 | #feature 是特征处理模块 53 | from feature import Operate_Feat1,get_feature_names 54 | 55 | 56 | d=Operate_Feat1() 57 | 58 | 59 | 60 | sparse_list=[] 61 | for p in sparse_features: 62 | d1=d.operate_sparse(data[p], p) 63 | sparse_list.append(d1.copy()) 64 
| 65 | dense_list=[] 66 | for q in dense_features: 67 | d2=d.operate_dense(q) 68 | print(d2) 69 | dense_list.append(d2.copy()) 70 | 71 | 72 | # fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=4 ) 73 | # for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,) 74 | # for feat in dense_features] 75 | 76 | merge_list=sparse_list+dense_list 77 | dnn_feature_columns = merge_list 78 | linear_feature_columns = merge_list 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | from feature import DEFAULT_GROUP_NAME,build_input_features 88 | 89 | 90 | 91 | def DeepFM(linear_feature_columns, dnn_feature_columns, fm_group=[DEFAULT_GROUP_NAME], dnn_hidden_units=(128, 128), 92 | l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0, seed=1024, dnn_dropout=0, 93 | dnn_activation='relu', dnn_use_bn=False, task='binary'): 94 | 95 | #构建模型的输入张量 96 | features = build_input_features( 97 | merge_list) 98 | 99 | print("#"*10) 100 | print(features) 101 | inputs_list = list(features.values()) 102 | 103 | from feature import get_linear_logit 104 | linear_logit = get_linear_logit(features, linear_feature_columns, seed=seed, prefix='linear', 105 | l2_reg=l2_reg_linear) 106 | 107 | 108 | from feature import input_from_feature_columns 109 | group_embedding_dict, dense_value_list = input_from_feature_columns(features, dnn_feature_columns, l2_reg_embedding, 110 | seed, support_group=True) 111 | 112 | ######################################################################################################### 113 | 114 | print('group_embedding_dict',group_embedding_dict) 115 | print('dense_value_list',dense_value_list) 116 | 117 | # cc=[] 118 | # for k in group_embedding_dict: 119 | # cc.append(k) 120 | cc1=concat_func(group_embedding_dict, axis=1) 121 | 122 | cc2=FM()(cc1) 123 | 124 | # cc=[FM()(concat_func(v, axis=1)) 125 | # for k, v in group_embedding_dict.items() if k in fm_group] 126 | fm_logit = add_func([cc2]) 127 | 128 | dnn_input = combined_dnn_input(group_embedding_dict, dense_value_list) 129 | 130 | dnn_hidden_units=(128, 32) 131 | dnn_output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, 132 | dnn_use_bn, seed)(dnn_input) 133 | 134 | # dnn_input= Dense(64, activation='relu')(dnn_input) 135 | 136 | # dnn_output= Dense(28, activation='relu')(dnn_input) 137 | 138 | import keras 139 | import tensorflow as tf 140 | dnn_logit = tf.keras.layers.Dense( 141 | 1, use_bias=False, kernel_initializer=tf.keras.initializers.glorot_normal(seed=seed))(dnn_output) 142 | 143 | final_logit = add_func([linear_logit, fm_logit, dnn_logit]) 144 | 145 | output = PredictionLayer(task)(final_logit) 146 | model = tf.keras.models.Model(inputs=inputs_list, outputs=output) 147 | 148 | return model 149 | 150 | model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary') 151 | 152 | feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) 153 | 154 | # 3.generate input data for model 155 | 156 | train, test = train_test_split(data, test_size=0.2, random_state=2020) 157 | train_model_input = {name:train[name] for name in feature_names} 158 | test_model_input = {name:test[name] for name in feature_names} 159 | 160 | # 4.Define Model,train,predict and evaluate 161 | # model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary') 162 | model.compile("adam", "binary_crossentropy", 163 | metrics=['binary_crossentropy'], ) 164 | 165 | history = model.fit(train_model_input, train[target].values, 166 | batch_size=256, epochs=10, 
verbose=2, validation_split=0.2, ) 167 | pred_ans = model.predict(test_model_input, batch_size=256) 168 | print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4)) 169 | print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4)) 170 | -------------------------------------------------------------------------------- /deepfm_recomend/feature.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Oct 14 19:49:32 2020 5 | 6 | @author: ledi 7 | """ 8 | DEFAULT_GROUP_NAME = "default_group" 9 | from collections import namedtuple 10 | from tensorflow.python.keras.initializers import RandomNormal, Zeros 11 | import pandas as pd 12 | from collections import OrderedDict 13 | from tensorflow.python.keras.layers import Input 14 | from layers import Linear 15 | from layers.utils import concat_func, add_func 16 | 17 | 18 | class Operate_Feat1(): 19 | def __init__(self): 20 | 21 | #这里是类别特征的内置配置 22 | self.sparse_dict={ 'embedding_dim':4, 'use_hash':False,'dtype':"int32", 23 | 24 | 'feat_cat':'sparse', 25 | 'embeddings_initializer':RandomNormal(mean=0.0, stddev=0.0001, seed=2020), 26 | 27 | 'embedding_name':None,'group_name':"default_group", 'trainable':True} 28 | #这里是数值特征的内置配置 29 | self.dense_dict={'dimension':1, 'dtype':"float32", 'feat_cat':'dense',} 30 | 31 | 32 | #结果都以字典的形式输出 33 | def operate_sparse(self,some_data,name): 34 | sparse_dict1=self.sparse_dict 35 | sparse_dict1['vocabulary_size']=some_data.nunique() 36 | sparse_dict1['embedding_name'] =name 37 | return pd.Series(sparse_dict1) 38 | def operate_dense(self,dense_name): 39 | dense_dict1=self.dense_dict 40 | dense_dict1['name']=dense_name 41 | 42 | return pd.Series(dense_dict1) 43 | 44 | # 构建输入层 45 | def build_input_features(feature_columns, prefix=''): 46 | input_features = OrderedDict() 47 | for fc in feature_columns: 48 | if fc['feat_cat'] == 'sparse': 49 | input_features[fc['embedding_name']] = Input( 50 | shape=(1,), name=prefix + fc['embedding_name'], dtype=fc['dtype']) 51 | elif fc['feat_cat'] == 'dense': 52 | input_features[fc['name']] = Input( 53 | shape=(fc['dimension'],), name=prefix + fc['name'], dtype=fc['dtype']) 54 | 55 | 56 | else: 57 | raise TypeError("Invalid feature column type,got", type(fc)) 58 | 59 | return input_features 60 | 61 | def get_feature_names(feature_columns): 62 | features = build_input_features(feature_columns) 63 | 64 | print('features==============',features) 65 | return list(features.keys()) 66 | 67 | 68 | def get_linear_logit(features, linear_feature_columns, units=1, use_bias=False, seed=1024, prefix='linear', 69 | l2_reg=0): 70 | 71 | features=features 72 | linear_feature_columns=linear_feature_columns 73 | units=1 74 | use_bias=False 75 | seed=1024 76 | prefix='linear' 77 | l2_reg=0 78 | 79 | 80 | 81 | for i in range(len(linear_feature_columns)): 82 | if linear_feature_columns[i]['feat_cat']=='sparse': 83 | linear_feature_columns[i]['embedding_dim']=3 84 | linear_feature_columns[i]['embeddings_initializer']=Zeros() 85 | 86 | 87 | 88 | linear_emb_list = [input_from_feature_columns(features, linear_feature_columns, l2_reg, seed,prefix=prefix + str(i))[0] for i in range(units)] 89 | _, dense_input_list = input_from_feature_columns(features, linear_feature_columns, l2_reg, seed, prefix=prefix) 90 | 91 | linear_logit_list = [] 92 | for i in range(units): 93 | 94 | if len(linear_emb_list[i]) > 0 and len(dense_input_list) > 0: 95 | sparse_input = 
concat_func(linear_emb_list[i]) 96 | dense_input = concat_func(dense_input_list) 97 | linear_logit = Linear(l2_reg, mode=2, use_bias=use_bias, seed=seed)([sparse_input, dense_input]) 98 | elif len(linear_emb_list[i]) > 0: 99 | sparse_input = concat_func(linear_emb_list[i]) 100 | linear_logit = Linear(l2_reg, mode=0, use_bias=use_bias, seed=seed)(sparse_input) 101 | elif len(dense_input_list) > 0: 102 | dense_input = concat_func(dense_input_list) 103 | linear_logit = Linear(l2_reg, mode=1, use_bias=use_bias, seed=seed)(dense_input) 104 | else: 105 | # raise NotImplementedError 106 | return add_func([]) 107 | linear_logit_list.append(linear_logit) 108 | 109 | return concat_func(linear_logit_list) 110 | 111 | 112 | 113 | def input_from_feature_columns(features, feature_columns, l2_reg, seed, prefix='', seq_mask_zero=True, 114 | support_dense=True, support_group=False): 115 | # feature_columns=linear_feature_columns 116 | # seq_mask_zero=True 117 | # support_dense=True 118 | # support_group=False 119 | 120 | print('KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK') 121 | 122 | print('prefix=',prefix) 123 | sparse_feature_columns=[] 124 | for fc in feature_columns: 125 | if fc['feat_cat'] == 'sparse': 126 | print(fc['feat_cat']) 127 | sparse_feature_columns.append(fc) 128 | 129 | # varlen_sparse_feature_columns = list( 130 | # filter(lambda x: isinstance(x, VarLenSparseFeat), feature_columns)) if feature_columns else [] 131 | 132 | 133 | ''' 134 | {'C1': , 135 | 'C2': } 136 | ''' 137 | 138 | from inputs1 import create_embedding_dict,create_embedding_matrix,get_dense_input 139 | 140 | #embedding_matrix_dict是一个字典,key 是特征的名称,values 是某个特征的Embedding 141 | embedding_matrix_dict = create_embedding_matrix(feature_columns, l2_reg, seed, prefix=prefix, 142 | seq_mask_zero=seq_mask_zero) 143 | from inputs1 import embedding_lookup 144 | #group_sparse_embedding_dict 是每个特征从input层到embedding 层的映射 , 145 | #这是一个列表 146 | group_sparse_embedding_dict = embedding_lookup(embedding_matrix_dict, features, sparse_feature_columns) 147 | 148 | 149 | 150 | #获得dense的输入 151 | dense_value_list = get_dense_input(features, feature_columns) 152 | 153 | 154 | print('TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT') 155 | return group_sparse_embedding_dict, dense_value_list 156 | -------------------------------------------------------------------------------- /deepfm_recomend/inputs.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | 4 | Author: 5 | Weichen Shen,wcshen1994@163.com 6 | 7 | """ 8 | 9 | from collections import defaultdict 10 | from itertools import chain 11 | 12 | from tensorflow.python.keras.layers import Embedding 13 | from tensorflow.python.keras.regularizers import l2 14 | 15 | from layers.sequence import SequencePoolingLayer, WeightedSequenceLayer 16 | from layers.utils import Hash 17 | 18 | 19 | def get_inputs_list(inputs): 20 | return list(chain(*list(map(lambda x: x.values(), filter(lambda x: x is not None, inputs))))) 21 | 22 | 23 | def create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, seed, l2_reg, 24 | prefix='sparse_', seq_mask_zero=True): 25 | sparse_embedding = {} 26 | for feat in sparse_feature_columns: 27 | emb = Embedding(feat.vocabulary_size, feat.embedding_dim, 28 | embeddings_initializer=feat.embeddings_initializer, 29 | embeddings_regularizer=l2(l2_reg), 30 | name=prefix + '_emb_' + feat.embedding_name) 31 | emb.trainable = feat.trainable 32 | 
sparse_embedding[feat.embedding_name] = emb 33 | 34 | if varlen_sparse_feature_columns and len(varlen_sparse_feature_columns) > 0: 35 | for feat in varlen_sparse_feature_columns: 36 | # if feat.name not in sparse_embedding: 37 | emb = Embedding(feat.vocabulary_size, feat.embedding_dim, 38 | embeddings_initializer=feat.embeddings_initializer, 39 | embeddings_regularizer=l2( 40 | l2_reg), 41 | name=prefix + '_seq_emb_' + feat.name, 42 | mask_zero=seq_mask_zero) 43 | emb.trainable = feat.trainable 44 | sparse_embedding[feat.embedding_name] = emb 45 | return sparse_embedding 46 | 47 | 48 | def get_embedding_vec_list(embedding_dict, input_dict, sparse_feature_columns, return_feat_list=(), mask_feat_list=()): 49 | embedding_vec_list = [] 50 | for fg in sparse_feature_columns: 51 | feat_name = fg.name 52 | if len(return_feat_list) == 0 or feat_name in return_feat_list: 53 | if fg.use_hash: 54 | lookup_idx = Hash(fg.vocabulary_size, mask_zero=(feat_name in mask_feat_list))(input_dict[feat_name]) 55 | else: 56 | lookup_idx = input_dict[feat_name] 57 | 58 | embedding_vec_list.append(embedding_dict[feat_name](lookup_idx)) 59 | 60 | return embedding_vec_list 61 | 62 | 63 | def create_embedding_matrix(feature_columns, l2_reg, seed, prefix="", seq_mask_zero=True): 64 | import feature_column as fc_lib 65 | 66 | sparse_feature_columns = list( 67 | filter(lambda x: isinstance(x, fc_lib.SparseFeat), feature_columns)) if feature_columns else [] 68 | varlen_sparse_feature_columns = list( 69 | filter(lambda x: isinstance(x, fc_lib.VarLenSparseFeat), feature_columns)) if feature_columns else [] 70 | sparse_emb_dict = create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, seed, 71 | l2_reg, prefix=prefix + 'sparse', seq_mask_zero=seq_mask_zero) 72 | return sparse_emb_dict 73 | 74 | 75 | def embedding_lookup(sparse_embedding_dict, sparse_input_dict, sparse_feature_columns, return_feat_list=(), 76 | mask_feat_list=(), to_list=False): 77 | group_embedding_dict = defaultdict(list) 78 | for fc in sparse_feature_columns: 79 | feature_name = fc.name 80 | embedding_name = fc.embedding_name 81 | if (len(return_feat_list) == 0 or feature_name in return_feat_list): 82 | if fc.use_hash: 83 | lookup_idx = Hash(fc.vocabulary_size, mask_zero=(feature_name in mask_feat_list))( 84 | sparse_input_dict[feature_name]) 85 | else: 86 | lookup_idx = sparse_input_dict[feature_name] 87 | 88 | group_embedding_dict[fc.group_name].append(sparse_embedding_dict[embedding_name](lookup_idx)) 89 | if to_list: 90 | return list(chain.from_iterable(group_embedding_dict.values())) 91 | return group_embedding_dict 92 | 93 | 94 | def varlen_embedding_lookup(embedding_dict, sequence_input_dict, varlen_sparse_feature_columns): 95 | varlen_embedding_vec_dict = {} 96 | for fc in varlen_sparse_feature_columns: 97 | feature_name = fc.name 98 | embedding_name = fc.embedding_name 99 | if fc.use_hash: 100 | lookup_idx = Hash(fc.vocabulary_size, mask_zero=True)(sequence_input_dict[feature_name]) 101 | else: 102 | lookup_idx = sequence_input_dict[feature_name] 103 | varlen_embedding_vec_dict[feature_name] = embedding_dict[embedding_name](lookup_idx) 104 | return varlen_embedding_vec_dict 105 | 106 | 107 | def get_varlen_pooling_list(embedding_dict, features, varlen_sparse_feature_columns, to_list=False): 108 | pooling_vec_list = defaultdict(list) 109 | for fc in varlen_sparse_feature_columns: 110 | feature_name = fc.name 111 | combiner = fc.combiner 112 | feature_length_name = fc.length_name 113 | if feature_length_name is not None: 114 | 
if fc.weight_name is not None: 115 | seq_input = WeightedSequenceLayer(weight_normalization=fc.weight_norm)( 116 | [embedding_dict[feature_name], features[feature_length_name], features[fc.weight_name]]) 117 | else: 118 | seq_input = embedding_dict[feature_name] 119 | vec = SequencePoolingLayer(combiner, supports_masking=False)( 120 | [seq_input, features[feature_length_name]]) 121 | else: 122 | if fc.weight_name is not None: 123 | seq_input = WeightedSequenceLayer(weight_normalization=fc.weight_norm, supports_masking=True)( 124 | [embedding_dict[feature_name], features[fc.weight_name]]) 125 | else: 126 | seq_input = embedding_dict[feature_name] 127 | vec = SequencePoolingLayer(combiner, supports_masking=True)( 128 | seq_input) 129 | pooling_vec_list[fc.group_name].append(vec) 130 | if to_list: 131 | return chain.from_iterable(pooling_vec_list.values()) 132 | return pooling_vec_list 133 | 134 | 135 | def get_dense_input(features, feature_columns): 136 | import feature_column as fc_lib 137 | dense_feature_columns = list( 138 | filter(lambda x: isinstance(x, fc_lib.DenseFeat), feature_columns)) if feature_columns else [] 139 | dense_input_list = [] 140 | for fc in dense_feature_columns: 141 | dense_input_list.append(features[fc.name]) 142 | return dense_input_list 143 | 144 | 145 | def mergeDict(a, b): 146 | c = defaultdict(list) 147 | for k, v in a.items(): 148 | c[k].extend(v) 149 | for k, v in b.items(): 150 | c[k].extend(v) 151 | return c 152 | -------------------------------------------------------------------------------- /deepfm_recomend/inputs1.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | 4 | Author: 5 | Weichen Shen,wcshen1994@163.com 6 | 7 | """ 8 | 9 | from collections import defaultdict 10 | from itertools import chain 11 | 12 | from tensorflow.python.keras.layers import Embedding 13 | from tensorflow.python.keras.regularizers import l2 14 | 15 | from layers.sequence import SequencePoolingLayer, WeightedSequenceLayer 16 | from layers.utils import Hash 17 | 18 | 19 | 20 | from keras.layers import Embedding 21 | from tensorflow.python.keras.regularizers import l2 22 | def create_embedding_dict(sparse_feature_columns ,seed, l2_reg, 23 | prefix='sparse_', seq_mask_zero=True): 24 | 25 | #将特征进行embedding ,输入维度是某个特征的种类数 26 | sparse_embedding = {} 27 | for feat in sparse_feature_columns: 28 | emb = Embedding(feat.vocabulary_size, feat.embedding_dim, 29 | embeddings_initializer=feat.embeddings_initializer, 30 | embeddings_regularizer=l2(l2_reg), 31 | name=prefix + '_emb_' + feat.embedding_name) 32 | emb.trainable = feat.trainable 33 | sparse_embedding[feat.embedding_name] = emb 34 | 35 | return sparse_embedding 36 | 37 | 38 | 39 | def get_dense_input(features, feature_columns): 40 | # import feature_column as fc_lib 41 | dense_feature_columns=[] 42 | for fc in feature_columns: 43 | if fc['feat_cat'] == 'dense': 44 | dense_feature_columns.append(fc) 45 | dense_input_list = [] 46 | for fc in dense_feature_columns: 47 | dense_input_list.append(features[fc['name']]) 48 | return dense_input_list 49 | 50 | 51 | def create_embedding_matrix(feature_columns, l2_reg, seed, prefix="", seq_mask_zero=True): 52 | 53 | 54 | sparse_feature_columns=[] 55 | for fc in feature_columns: 56 | if fc['feat_cat'] == 'sparse': 57 | sparse_feature_columns.append(fc) 58 | 59 | sparse_emb_dict = create_embedding_dict(sparse_feature_columns, seed, 60 | l2_reg, prefix=prefix + 'sparse', seq_mask_zero=seq_mask_zero) 61 | return sparse_emb_dict 
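# A minimal usage sketch for the two helpers above (hypothetical feature
# spec, not data shipped with this repo). Each feature column is a
# dict-like pandas Series, so attribute access such as
# `feat.vocabulary_size` works inside create_embedding_dict:
#
#   >>> import pandas as pd
#   >>> from tensorflow.python.keras.initializers import RandomNormal
#   >>> feat = pd.Series({'vocabulary_size': 1000, 'embedding_dim': 4,
#   ...                   'embeddings_initializer': RandomNormal(stddev=0.0001),
#   ...                   'embedding_name': 'C1', 'feat_cat': 'sparse',
#   ...                   'trainable': True})
#   >>> emb_dict = create_embedding_dict([feat], seed=1024, l2_reg=0.00001)
#   >>> emb_dict['C1']   # Embedding(1000, 4) named 'sparse__emb_C1'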
62 | 63 | 64 | def embedding_lookup(sparse_embedding_dict, sparse_input_dict, sparse_feature_columns, return_feat_list=(), 65 | mask_feat_list=(), to_list=False): 66 | 67 | # sparse_embedding_dict=embedding_matrix_dict 68 | # sparse_input_dict =features 69 | 70 | # =sparse_feature_columns 71 | group_embedding_dict = [] 72 | for fc in sparse_feature_columns: 73 | feature_name = fc.embedding_name 74 | embedding_name = fc.embedding_name 75 | # if (len(return_feat_list) == 0 or feature_name in return_feat_list): 76 | if fc.use_hash: 77 | lookup_idx = Hash(fc.vocabulary_size, mask_zero=(feature_name in mask_feat_list))( 78 | sparse_input_dict[feature_name]) 79 | else: 80 | 81 | # 模型输入层张量 82 | lookup_idx = sparse_input_dict[feature_name] 83 | # 从输入层到embedding 层的映射 84 | group_embedding_dict.append(sparse_embedding_dict[embedding_name](lookup_idx)) 85 | 86 | return group_embedding_dict 87 | #这里面是从input 到embedding 层的映射 88 | -------------------------------------------------------------------------------- /deepfm_recomend/layers/__init__.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from .activation import Dice 4 | from .core import DNN, LocalActivationUnit, PredictionLayer 5 | from .interaction import (CIN, FM, AFMLayer, BiInteractionPooling, CrossNet, 6 | InnerProductLayer, InteractingLayer, 7 | OutterProductLayer, FGCNNLayer, SENETLayer, BilinearInteraction, 8 | FieldWiseBiInteraction, FwFMLayer) 9 | from .normalization import LayerNormalization 10 | from .sequence import (AttentionSequencePoolingLayer, BiasEncoding, BiLSTM, 11 | KMaxPooling, SequencePoolingLayer,WeightedSequenceLayer, 12 | Transformer, DynamicGRU) 13 | from .utils import NoMask, Hash,Linear,Add,combined_dnn_input 14 | 15 | custom_objects = {'tf': tf, 16 | 'InnerProductLayer': InnerProductLayer, 17 | 'OutterProductLayer': OutterProductLayer, 18 | 'DNN': DNN, 19 | 'PredictionLayer': PredictionLayer, 20 | 'FM': FM, 21 | 'AFMLayer': AFMLayer, 22 | 'CrossNet': CrossNet, 23 | 'BiInteractionPooling': BiInteractionPooling, 24 | 'LocalActivationUnit': LocalActivationUnit, 25 | 'Dice': Dice, 26 | 'SequencePoolingLayer': SequencePoolingLayer, 27 | 'AttentionSequencePoolingLayer': AttentionSequencePoolingLayer, 28 | 'CIN': CIN, 29 | 'InteractingLayer': InteractingLayer, 30 | 'LayerNormalization': LayerNormalization, 31 | 'BiLSTM': BiLSTM, 32 | 'Transformer': Transformer, 33 | 'NoMask': NoMask, 34 | 'BiasEncoding': BiasEncoding, 35 | 'KMaxPooling': KMaxPooling, 36 | 'FGCNNLayer': FGCNNLayer, 37 | 'Hash': Hash, 38 | 'Linear':Linear, 39 | 'DynamicGRU': DynamicGRU, 40 | 'SENETLayer':SENETLayer, 41 | 'BilinearInteraction':BilinearInteraction, 42 | 'WeightedSequenceLayer':WeightedSequenceLayer, 43 | 'Add':Add, 44 | 'FieldWiseBiInteraction':FieldWiseBiInteraction, 45 | 'FwFMLayer': FwFMLayer 46 | } 47 | -------------------------------------------------------------------------------- /deepfm_recomend/layers/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/__pycache__/activation.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/__pycache__/activation.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/__pycache__/core.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/__pycache__/core.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/__pycache__/interaction.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/__pycache__/interaction.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/__pycache__/normalization.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/__pycache__/normalization.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/__pycache__/sequence.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/__pycache__/sequence.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/activation.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | 4 | Author: 5 | Weichen Shen,wcshen1994@163.com 6 | 7 | """ 8 | 9 | import tensorflow as tf 10 | from tensorflow.python.keras.initializers import Zeros 11 | from tensorflow.python.keras.layers import Layer 12 | 13 | try: 14 | unicode 15 | except NameError: 16 | unicode = str 17 | 18 | 19 | class Dice(Layer): 20 | """The Data Adaptive Activation Function in DIN,which can be viewed as a generalization of PReLu and can adaptively adjust the rectified point according to distribution of input data. 21 | 22 | Input shape 23 | - Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model. 24 | 25 | Output shape 26 | - Same shape as the input. 27 | 28 | Arguments 29 | - **axis** : Integer, the axis that should be used to compute data distribution (typically the features axis). 30 | 31 | - **epsilon** : Small float added to variance to avoid dividing by zero. 32 | 33 | References 34 | - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 
ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf) 35 | """ 36 | 37 | def __init__(self, axis=-1, epsilon=1e-9, **kwargs): 38 | self.axis = axis 39 | self.epsilon = epsilon 40 | super(Dice, self).__init__(**kwargs) 41 | 42 | def build(self, input_shape): 43 | self.bn = tf.keras.layers.BatchNormalization( 44 | axis=self.axis, epsilon=self.epsilon, center=False, scale=False) 45 | self.alphas = self.add_weight(shape=(input_shape[-1],), initializer=Zeros( 46 | ), dtype=tf.float32, name='dice_alpha') # name='alpha_'+self.name 47 | super(Dice, self).build(input_shape) # Be sure to call this somewhere! 48 | self.uses_learning_phase = True 49 | 50 | def call(self, inputs, training=None, **kwargs): 51 | inputs_normed = self.bn(inputs, training=training) 52 | # tf.layers.batch_normalization( 53 | # inputs, axis=self.axis, epsilon=self.epsilon, center=False, scale=False) 54 | x_p = tf.sigmoid(inputs_normed) 55 | return self.alphas * (1.0 - x_p) * inputs + x_p * inputs 56 | 57 | def compute_output_shape(self, input_shape): 58 | return input_shape 59 | 60 | def get_config(self, ): 61 | config = {'axis': self.axis, 'epsilon': self.epsilon} 62 | base_config = super(Dice, self).get_config() 63 | return dict(list(base_config.items()) + list(config.items())) 64 | 65 | 66 | def activation_layer(activation): 67 | if activation in ("dice", "Dice"): 68 | act_layer = Dice() 69 | elif isinstance(activation, (str, unicode)): 70 | act_layer = tf.keras.layers.Activation(activation) 71 | elif issubclass(activation, Layer): 72 | act_layer = activation() 73 | else: 74 | raise ValueError( 75 | "Invalid activation,found %s.You should use a str or a Activation Layer Class." % (activation)) 76 | return act_layer 77 | -------------------------------------------------------------------------------- /deepfm_recomend/layers/contrib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/contrib/__init__.py -------------------------------------------------------------------------------- /deepfm_recomend/layers/contrib/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/contrib/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/contrib/__pycache__/rnn_v2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/contrib/__pycache__/rnn_v2.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/contrib/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/contrib/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/contrib/utils.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.ops import array_ops 2 | from tensorflow.python.ops import init_ops 3 | 
from tensorflow.python.ops import math_ops 4 | from tensorflow.python.ops import nn_ops 5 | from tensorflow.python.ops import variable_scope as vs 6 | from tensorflow.python.ops.rnn_cell import * 7 | from tensorflow.python.util import nest 8 | 9 | _BIAS_VARIABLE_NAME = "bias" 10 | 11 | _WEIGHTS_VARIABLE_NAME = "kernel" 12 | 13 | 14 | class _Linear_(object): 15 | """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. 16 | 17 | 18 | 19 | Args: 20 | 21 | args: a 2D Tensor or a list of 2D, batch x n, Tensors. 22 | 23 | output_size: int, second dimension of weight variable. 24 | 25 | dtype: data type for variables. 26 | 27 | build_bias: boolean, whether to build a bias variable. 28 | 29 | bias_initializer: starting value to initialize the bias 30 | 31 | (default is all zeros). 32 | 33 | kernel_initializer: starting value to initialize the weight. 34 | 35 | 36 | 37 | Raises: 38 | 39 | ValueError: if inputs_shape is wrong. 40 | 41 | """ 42 | 43 | def __init__(self, 44 | 45 | args, 46 | 47 | output_size, 48 | 49 | build_bias, 50 | 51 | bias_initializer=None, 52 | 53 | kernel_initializer=None): 54 | 55 | self._build_bias = build_bias 56 | 57 | if args is None or (nest.is_sequence(args) and not args): 58 | raise ValueError("`args` must be specified") 59 | 60 | if not nest.is_sequence(args): 61 | 62 | args = [args] 63 | 64 | self._is_sequence = False 65 | 66 | else: 67 | 68 | self._is_sequence = True 69 | 70 | # Calculate the total size of arguments on dimension 1. 71 | 72 | total_arg_size = 0 73 | 74 | shapes = [a.get_shape() for a in args] 75 | 76 | for shape in shapes: 77 | 78 | if shape.ndims != 2: 79 | raise ValueError( 80 | "linear is expecting 2D arguments: %s" % shapes) 81 | 82 | if shape[1] is None: 83 | 84 | raise ValueError("linear expects shape[1] to be provided for shape %s, " 85 | 86 | "but saw %s" % (shape, shape[1])) 87 | 88 | else: 89 | 90 | total_arg_size += int(shape[1])#.value 91 | 92 | dtype = [a.dtype for a in args][0] 93 | 94 | scope = vs.get_variable_scope() 95 | 96 | with vs.variable_scope(scope) as outer_scope: 97 | 98 | self._weights = vs.get_variable( 99 | 100 | _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size], 101 | 102 | dtype=dtype, 103 | 104 | initializer=kernel_initializer) 105 | 106 | if build_bias: 107 | 108 | with vs.variable_scope(outer_scope) as inner_scope: 109 | 110 | inner_scope.set_partitioner(None) 111 | 112 | if bias_initializer is None: 113 | bias_initializer = init_ops.constant_initializer( 114 | 0.0, dtype=dtype) 115 | 116 | self._biases = vs.get_variable( 117 | 118 | _BIAS_VARIABLE_NAME, [output_size], 119 | 120 | dtype=dtype, 121 | 122 | initializer=bias_initializer) 123 | 124 | def __call__(self, args): 125 | 126 | if not self._is_sequence: 127 | args = [args] 128 | 129 | if len(args) == 1: 130 | 131 | res = math_ops.matmul(args[0], self._weights) 132 | 133 | else: 134 | 135 | res = math_ops.matmul(array_ops.concat(args, 1), self._weights) 136 | 137 | if self._build_bias: 138 | res = nn_ops.bias_add(res, self._biases) 139 | 140 | return res 141 | 142 | 143 | try: 144 | from tensorflow.python.ops.rnn_cell_impl import _Linear 145 | except: 146 | _Linear = _Linear_ 147 | 148 | 149 | class QAAttGRUCell(RNNCell): 150 | """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). 151 | 152 | Args: 153 | 154 | num_units: int, The number of units in the GRU cell. 155 | 156 | activation: Nonlinearity to use. Default: `tanh`. 157 | 158 | reuse: (optional) Python boolean describing whether to reuse variables 159 | 160 | in an existing scope. 
If not `True`, and the existing scope already has 161 | 162 | the given variables, an error is raised. 163 | 164 | kernel_initializer: (optional) The initializer to use for the weight and 165 | 166 | projection matrices. 167 | 168 | bias_initializer: (optional) The initializer to use for the bias. 169 | 170 | """ 171 | 172 | def __init__(self, 173 | 174 | num_units, 175 | 176 | activation=None, 177 | 178 | reuse=None, 179 | 180 | kernel_initializer=None, 181 | 182 | bias_initializer=None): 183 | 184 | super(QAAttGRUCell, self).__init__(_reuse=reuse) 185 | 186 | self._num_units = num_units 187 | 188 | self._activation = activation or math_ops.tanh 189 | 190 | self._kernel_initializer = kernel_initializer 191 | 192 | self._bias_initializer = bias_initializer 193 | 194 | self._gate_linear = None 195 | 196 | self._candidate_linear = None 197 | 198 | @property 199 | def state_size(self): 200 | 201 | return self._num_units 202 | 203 | @property 204 | def output_size(self): 205 | 206 | return self._num_units 207 | 208 | def __call__(self, inputs, state, att_score): 209 | 210 | return self.call(inputs, state, att_score) 211 | 212 | def call(self, inputs, state, att_score=None): 213 | """Gated recurrent unit (GRU) with nunits cells.""" 214 | 215 | if self._gate_linear is None: 216 | 217 | bias_ones = self._bias_initializer 218 | 219 | if self._bias_initializer is None: 220 | bias_ones = init_ops.constant_initializer( 221 | 1.0, dtype=inputs.dtype) 222 | 223 | with vs.variable_scope("gates"): # Reset gate and update gate. 224 | 225 | self._gate_linear = _Linear( 226 | 227 | [inputs, state], 228 | 229 | 2 * self._num_units, 230 | 231 | True, 232 | 233 | bias_initializer=bias_ones, 234 | 235 | kernel_initializer=self._kernel_initializer) 236 | 237 | value = math_ops.sigmoid(self._gate_linear([inputs, state])) 238 | 239 | r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1) 240 | 241 | r_state = r * state 242 | 243 | if self._candidate_linear is None: 244 | with vs.variable_scope("candidate"): 245 | self._candidate_linear = _Linear( 246 | 247 | [inputs, r_state], 248 | 249 | self._num_units, 250 | 251 | True, 252 | 253 | bias_initializer=self._bias_initializer, 254 | 255 | kernel_initializer=self._kernel_initializer) 256 | 257 | c = self._activation(self._candidate_linear([inputs, r_state])) 258 | 259 | new_h = (1. - att_score) * state + att_score * c 260 | 261 | return new_h, new_h 262 | 263 | 264 | class VecAttGRUCell(RNNCell): 265 | """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). 266 | 267 | Args: 268 | 269 | num_units: int, The number of units in the GRU cell. 270 | 271 | activation: Nonlinearity to use. Default: `tanh`. 272 | 273 | reuse: (optional) Python boolean describing whether to reuse variables 274 | 275 | in an existing scope. If not `True`, and the existing scope already has 276 | 277 | the given variables, an error is raised. 278 | 279 | kernel_initializer: (optional) The initializer to use for the weight and 280 | 281 | projection matrices. 282 | 283 | bias_initializer: (optional) The initializer to use for the bias. 
284 | 285 | """ 286 | 287 | def __init__(self, 288 | 289 | num_units, 290 | 291 | activation=None, 292 | 293 | reuse=None, 294 | 295 | kernel_initializer=None, 296 | 297 | bias_initializer=None): 298 | 299 | super(VecAttGRUCell, self).__init__(_reuse=reuse) 300 | 301 | self._num_units = num_units 302 | 303 | self._activation = activation or math_ops.tanh 304 | 305 | self._kernel_initializer = kernel_initializer 306 | 307 | self._bias_initializer = bias_initializer 308 | 309 | self._gate_linear = None 310 | 311 | self._candidate_linear = None 312 | 313 | @property 314 | def state_size(self): 315 | 316 | return self._num_units 317 | 318 | @property 319 | def output_size(self): 320 | 321 | return self._num_units 322 | 323 | def __call__(self, inputs, state, att_score): 324 | 325 | return self.call(inputs, state, att_score) 326 | 327 | def call(self, inputs, state, att_score=None): 328 | """Gated recurrent unit (GRU) with nunits cells.""" 329 | 330 | if self._gate_linear is None: 331 | 332 | bias_ones = self._bias_initializer 333 | 334 | if self._bias_initializer is None: 335 | bias_ones = init_ops.constant_initializer( 336 | 1.0, dtype=inputs.dtype) 337 | 338 | with vs.variable_scope("gates"): # Reset gate and update gate. 339 | 340 | self._gate_linear = _Linear( 341 | 342 | [inputs, state], 343 | 344 | 2 * self._num_units, 345 | 346 | True, 347 | 348 | bias_initializer=bias_ones, 349 | 350 | kernel_initializer=self._kernel_initializer) 351 | 352 | value = math_ops.sigmoid(self._gate_linear([inputs, state])) 353 | 354 | r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1) 355 | 356 | r_state = r * state 357 | 358 | if self._candidate_linear is None: 359 | with vs.variable_scope("candidate"): 360 | self._candidate_linear = _Linear( 361 | 362 | [inputs, r_state], 363 | 364 | self._num_units, 365 | 366 | True, 367 | 368 | bias_initializer=self._bias_initializer, 369 | 370 | kernel_initializer=self._kernel_initializer) 371 | 372 | c = self._activation(self._candidate_linear([inputs, r_state])) 373 | 374 | u = (1.0 - att_score) * u 375 | 376 | new_h = u * state + (1 - u) * c 377 | 378 | return new_h, new_h 379 | -------------------------------------------------------------------------------- /deepfm_recomend/layers/core.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | 4 | Author: 5 | Weichen Shen,wcshen1994@163.com 6 | 7 | """ 8 | 9 | import tensorflow as tf 10 | from tensorflow.python.keras import backend as K 11 | from tensorflow.python.keras.initializers import Zeros, glorot_normal 12 | from tensorflow.python.keras.layers import Layer 13 | from tensorflow.python.keras.regularizers import l2 14 | 15 | from .activation import activation_layer 16 | 17 | 18 | class LocalActivationUnit(Layer): 19 | """The LocalActivationUnit used in DIN with which the representation of 20 | user interests varies adaptively given different candidate items. 21 | 22 | Input shape 23 | - A list of two 3D tensor with shape: ``(batch_size, 1, embedding_size)`` and ``(batch_size, T, embedding_size)`` 24 | 25 | Output shape 26 | - 3D tensor with shape: ``(batch_size, T, 1)``. 27 | 28 | Arguments 29 | - **hidden_units**:list of positive integer, the attention net layer number and units in each layer. 30 | 31 | - **activation**: Activation function to use in attention net. 32 | 33 | - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix of attention net. 
34 | 35 | - **dropout_rate**: float in [0,1). Fraction of the units to dropout in attention net. 36 | 37 | - **use_bn**: bool. Whether use BatchNormalization before activation or not in attention net. 38 | 39 | - **seed**: A Python integer to use as random seed. 40 | 41 | References 42 | - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf) 43 | """ 44 | 45 | def __init__(self, hidden_units=(64, 32), activation='sigmoid', l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, 46 | **kwargs): 47 | self.hidden_units = hidden_units 48 | self.activation = activation 49 | self.l2_reg = l2_reg 50 | self.dropout_rate = dropout_rate 51 | self.use_bn = use_bn 52 | self.seed = seed 53 | super(LocalActivationUnit, self).__init__(**kwargs) 54 | self.supports_masking = True 55 | 56 | def build(self, input_shape): 57 | 58 | if not isinstance(input_shape, list) or len(input_shape) != 2: 59 | raise ValueError('A `LocalActivationUnit` layer should be called ' 60 | 'on a list of 2 inputs') 61 | 62 | if len(input_shape[0]) != 3 or len(input_shape[1]) != 3: 63 | raise ValueError("Unexpected inputs dimensions %d and %d, expect to be 3 dimensions" % ( 64 | len(input_shape[0]), len(input_shape[1]))) 65 | 66 | if input_shape[0][-1] != input_shape[1][-1] or input_shape[0][1] != 1: 67 | raise ValueError('A `LocalActivationUnit` layer requires ' 68 | 'inputs of a two inputs with shape (None,1,embedding_size) and (None,T,embedding_size)' 69 | 'Got different shapes: %s,%s' % (input_shape[0], input_shape[1])) 70 | size = 4 * \ 71 | int(input_shape[0][-1] 72 | ) if len(self.hidden_units) == 0 else self.hidden_units[-1] 73 | self.kernel = self.add_weight(shape=(size, 1), 74 | initializer=glorot_normal( 75 | seed=self.seed), 76 | name="kernel") 77 | self.bias = self.add_weight( 78 | shape=(1,), initializer=Zeros(), name="bias") 79 | self.dnn = DNN(self.hidden_units, self.activation, self.l2_reg, 80 | self.dropout_rate, self.use_bn, seed=self.seed) 81 | 82 | self.dense = tf.keras.layers.Lambda(lambda x: tf.nn.bias_add(tf.tensordot( 83 | x[0], x[1], axes=(-1, 0)), x[2])) 84 | 85 | super(LocalActivationUnit, self).build( 86 | input_shape) # Be sure to call this somewhere! 87 | 88 | def call(self, inputs, training=None, **kwargs): 89 | 90 | query, keys = inputs 91 | 92 | keys_len = keys.get_shape()[1] 93 | queries = K.repeat_elements(query, keys_len, 1) 94 | 95 | att_input = tf.concat( 96 | [queries, keys, queries - keys, queries * keys], axis=-1) 97 | 98 | att_out = self.dnn(att_input, training=training) 99 | 100 | attention_score = self.dense([att_out, self.kernel, self.bias]) 101 | 102 | return attention_score 103 | 104 | def compute_output_shape(self, input_shape): 105 | return input_shape[1][:2] + (1,) 106 | 107 | def compute_mask(self, inputs, mask): 108 | return mask 109 | 110 | def get_config(self, ): 111 | config = {'activation': self.activation, 'hidden_units': self.hidden_units, 112 | 'l2_reg': self.l2_reg, 'dropout_rate': self.dropout_rate, 'use_bn': self.use_bn, 'seed': self.seed} 113 | base_config = super(LocalActivationUnit, self).get_config() 114 | return dict(list(base_config.items()) + list(config.items())) 115 | 116 | 117 | class DNN(Layer): 118 | """The Multi Layer Percetron 119 | 120 | Input shape 121 | - nD tensor with shape: ``(batch_size, ..., input_dim)``. 
The most common situation would be a 2D input with shape ``(batch_size, input_dim)``. 122 | 123 | Output shape 124 | - nD tensor with shape: ``(batch_size, ..., hidden_size[-1])``. For instance, for a 2D input with shape ``(batch_size, input_dim)``, the output would have shape ``(batch_size, hidden_size[-1])``. 125 | 126 | Arguments 127 | - **hidden_units**:list of positive integer, the layer number and units in each layer. 128 | 129 | - **activation**: Activation function to use. 130 | 131 | - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix. 132 | 133 | - **dropout_rate**: float in [0,1). Fraction of the units to dropout. 134 | 135 | - **use_bn**: bool. Whether use BatchNormalization before activation or not. 136 | 137 | - **seed**: A Python integer to use as random seed. 138 | """ 139 | 140 | def __init__(self, hidden_units, activation='relu', l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, **kwargs): 141 | self.hidden_units = hidden_units 142 | self.activation = activation 143 | self.dropout_rate = dropout_rate 144 | self.seed = seed 145 | self.l2_reg = l2_reg 146 | self.use_bn = use_bn 147 | super(DNN, self).__init__(**kwargs) 148 | 149 | def build(self, input_shape): 150 | # if len(self.hidden_units) == 0: 151 | # raise ValueError("hidden_units is empty") 152 | input_size = input_shape[-1] 153 | hidden_units = [int(input_size)] + list(self.hidden_units) 154 | self.kernels = [self.add_weight(name='kernel' + str(i), 155 | shape=( 156 | hidden_units[i], hidden_units[i + 1]), 157 | initializer=glorot_normal( 158 | seed=self.seed), 159 | regularizer=l2(self.l2_reg), 160 | trainable=True) for i in range(len(self.hidden_units))] 161 | 162 | print(self.kernels) 163 | 164 | # return self.kernels 165 | self.bias = [self.add_weight(name='bias' + str(i), 166 | shape=(self.hidden_units[i],), 167 | initializer=Zeros(), 168 | trainable=True) for i in range(len(self.hidden_units))] 169 | if self.use_bn: 170 | self.bn_layers = [tf.keras.layers.BatchNormalization() for _ in range(len(self.hidden_units))] 171 | 172 | self.dropout_layers = [tf.keras.layers.Dropout(self.dropout_rate, seed=self.seed + i) for i in 173 | range(len(self.hidden_units))] 174 | 175 | self.activation_layers = [activation_layer(self.activation) for _ in range(len(self.hidden_units))] 176 | 177 | super(DNN, self).build(input_shape) # Be sure to call this somewhere! 
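# Worked shape example for build()/call() (numbers taken from the Criteo
# setup in run_classification_criteo.py, not hard-coded here): the DNN input
# is 26 sparse embeddings of size 4 plus 13 dense values, so
# input_dim = 26*4 + 13 = 117. With hidden_units=(128, 128), build() creates
#   kernels[0]: (117, 128), bias[0]: (128,)
#   kernels[1]: (128, 128), bias[1]: (128,)
# and call() below maps (batch, 117) -> (batch, 128) -> (batch, 128).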
178 | 179 | def call(self, inputs, training=None, **kwargs): 180 | 181 | deep_input = inputs 182 | 183 | print(self.kernels) 184 | for i in range(len(self.hidden_units)): 185 | 186 | 187 | fc = tf.nn.bias_add(tf.tensordot( 188 | # tf.tensordot 表示矩阵相乘 189 | #比如说 Amn *Bnp axes -1 表示 A的n ,0 表示B 的n 190 | deep_input, self.kernels[i], axes=(-1, 0)), self.bias[i]) 191 | 192 | 193 | # fc = Dense(self.hidden_size[i], activation=None, \ 194 | # kernel_initializer=glorot_normal(seed=self.seed), \ 195 | # kernel_regularizer=l2(self.l2_reg))(deep_input) 196 | if self.use_bn: 197 | fc = self.bn_layers[i](fc, training=training) 198 | 199 | fc = self.activation_layers[i](fc) 200 | 201 | fc = self.dropout_layers[i](fc, training=training) 202 | deep_input = fc 203 | 204 | return deep_input 205 | 206 | def compute_output_shape(self, input_shape): 207 | if len(self.hidden_units) > 0: 208 | shape = input_shape[:-1] + (self.hidden_units[-1],) 209 | else: 210 | shape = input_shape 211 | 212 | return tuple(shape) 213 | 214 | def get_config(self, ): 215 | config = {'activation': self.activation, 'hidden_units': self.hidden_units, 216 | 'l2_reg': self.l2_reg, 'use_bn': self.use_bn, 'dropout_rate': self.dropout_rate, 'seed': self.seed} 217 | base_config = super(DNN, self).get_config() 218 | return dict(list(base_config.items()) + list(config.items())) 219 | 220 | 221 | class PredictionLayer(Layer): 222 | """ 223 | Arguments 224 | - **task**: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss 225 | 226 | - **use_bias**: bool.Whether add bias term or not. 227 | """ 228 | 229 | def __init__(self, task='binary', use_bias=True, **kwargs): 230 | if task not in ["binary", "multiclass", "regression"]: 231 | raise ValueError("task must be binary,multiclass or regression") 232 | self.task = task 233 | self.use_bias = use_bias 234 | super(PredictionLayer, self).__init__(**kwargs) 235 | 236 | def build(self, input_shape): 237 | 238 | if self.use_bias: 239 | self.global_bias = self.add_weight( 240 | shape=(1,), initializer=Zeros(), name="global_bias") 241 | 242 | # Be sure to call this somewhere! 
243 | super(PredictionLayer, self).build(input_shape) 244 | 245 | def call(self, inputs, **kwargs): 246 | x = inputs 247 | if self.use_bias: 248 | x = tf.nn.bias_add(x, self.global_bias, data_format='NHWC') 249 | if self.task == "binary": 250 | x = tf.sigmoid(x) 251 | 252 | output = tf.reshape(x, (-1, 1)) 253 | 254 | return output 255 | 256 | def compute_output_shape(self, input_shape): 257 | return (None, 1) 258 | 259 | def get_config(self, ): 260 | config = {'task': self.task, 'use_bias': self.use_bias} 261 | base_config = super(PredictionLayer, self).get_config() 262 | return dict(list(base_config.items()) + list(config.items())) 263 | 264 | 265 | -------------------------------------------------------------------------------- /deepfm_recomend/layers/normalization.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | 4 | Author: 5 | Weichen Shen,wcshen1994@163.com 6 | 7 | """ 8 | 9 | from tensorflow.python.keras import backend as K 10 | from tensorflow.python.keras.initializers import Ones, Zeros 11 | from tensorflow.python.keras.layers import Layer 12 | 13 | 14 | class LayerNormalization(Layer): 15 | def __init__(self, axis=-1, eps=1e-9, center=True, 16 | scale=True, **kwargs): 17 | self.axis = axis 18 | self.eps = eps 19 | self.center = center 20 | self.scale = scale 21 | super(LayerNormalization, self).__init__(**kwargs) 22 | 23 | def build(self, input_shape): 24 | self.gamma = self.add_weight(name='gamma', shape=input_shape[-1:], 25 | initializer=Ones(), trainable=True) 26 | self.beta = self.add_weight(name='beta', shape=input_shape[-1:], 27 | initializer=Zeros(), trainable=True) 28 | super(LayerNormalization, self).build(input_shape) 29 | 30 | def call(self, inputs): 31 | mean = K.mean(inputs, axis=self.axis, keepdims=True) 32 | variance = K.mean(K.square(inputs - mean), axis=-1, keepdims=True) 33 | std = K.sqrt(variance + self.eps) 34 | outputs = (inputs - mean) / std 35 | if self.scale: 36 | outputs *= self.gamma 37 | if self.center: 38 | outputs += self.beta 39 | return outputs 40 | 41 | def compute_output_shape(self, input_shape): 42 | return input_shape 43 | 44 | def get_config(self, ): 45 | config = {'axis': self.axis, 'eps': self.eps, 'center': self.center, 'scale': self.scale} 46 | base_config = super(LayerNormalization, self).get_config() 47 | return dict(list(base_config.items()) + list(config.items())) 48 | -------------------------------------------------------------------------------- /deepfm_recomend/layers/untitled17.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Oct 16 16:54:00 2020 5 | 6 | @author: ledi 7 | """ 8 | 9 | import tensorflow as tf 10 | from tensorflow.python.keras import backend as K 11 | from tensorflow.python.keras.initializers import Zeros, glorot_normal 12 | from tensorflow.python.keras.layers import Layer 13 | from tensorflow.python.keras.regularizers import l2 14 | from keras.layers import Activation 15 | 16 | 17 | class my_dense(Layer): 18 | 19 | 20 | def __init__(self, hidden_units, activation='relu', l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, **kwargs): 21 | self.hidden_units = hidden_units 22 | self.activation = activation 23 | self.dropout_rate = dropout_rate 24 | self.seed = seed 25 | self.l2_reg = l2_reg 26 | self.use_bn = use_bn 27 | super().__init__(**kwargs) 28 | 29 | def build(self, input_shape): 30 | # if len(self.hidden_units) == 0: 31 | # 
raise ValueError("hidden_units is empty") 32 | input_size = input_shape[-1] 33 | # hidden_units = [int(input_size)] + list(self.hidden_units) 34 | self.kernels = self.add_weight(name='kernel' , 35 | shape=(input_size, hidden_units), 36 | initializer=glorot_normal( 37 | seed=self.seed), 38 | regularizer=l2(self.l2_reg), 39 | trainable=True ) 40 | 41 | print(self.kernels) 42 | 43 | # return self.kernels 44 | self.bias = [self.add_weight(name='bias' + str(i), 45 | shape=(self.hidden_units[i],), 46 | initializer=Zeros(), 47 | trainable=True) for i in range(len(self.hidden_units))] 48 | if self.use_bn: 49 | self.bn_layers = [tf.keras.layers.BatchNormalization() for _ in range(len(self.hidden_units))] 50 | 51 | self.dropout_layers = [tf.keras.layers.Dropout(self.dropout_rate, seed=self.seed + i) for i in 52 | range(len(self.hidden_units))] 53 | 54 | self.activation_layers = [activation_layer(self.activation) for _ in range(len(self.hidden_units))] 55 | 56 | super().build(input_shape) # Be sure to call this somewhere! 57 | 58 | def call(self, inputs, training=None, **kwargs): 59 | 60 | deep_input = inputs 61 | 62 | print(self.kernels) 63 | for i in range(len(self.hidden_units)): 64 | fc = tf.nn.bias_add(tf.tensordot( 65 | deep_input, self.kernels[i], axes=(-1, 0)), self.bias[i]) 66 | 67 | 68 | # fc = Dense(self.hidden_size[i], activation=None, \ 69 | # kernel_initializer=glorot_normal(seed=self.seed), \ 70 | # kernel_regularizer=l2(self.l2_reg))(deep_input) 71 | if self.use_bn: 72 | fc = self.bn_layers[i](fc, training=training) 73 | 74 | fc = self.activation_layers[i](fc) 75 | 76 | fc = self.dropout_layers[i](fc, training=training) 77 | deep_input = fc 78 | 79 | return deep_input 80 | 81 | def compute_output_shape(self, input_shape): 82 | if len(self.hidden_units) > 0: 83 | shape = input_shape[:-1] + (self.hidden_units[-1],) 84 | else: 85 | shape = input_shape 86 | 87 | return tuple(shape) 88 | 89 | def get_config(self, ): 90 | config = {'activation': self.activation, 'hidden_units': self.hidden_units, 91 | 'l2_reg': self.l2_reg, 'use_bn': self.use_bn, 'dropout_rate': self.dropout_rate, 'seed': self.seed} 92 | base_config = super(DNN, self).get_config() 93 | return dict(list(base_config.items()) + list(config.items())) 94 | -------------------------------------------------------------------------------- /deepfm_recomend/layers/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | 4 | Author: 5 | Weichen Shen,wcshen1994@163.com 6 | 7 | """ 8 | import tensorflow as tf 9 | from tensorflow.python.keras.layers import Flatten 10 | 11 | 12 | class NoMask(tf.keras.layers.Layer): 13 | def __init__(self, **kwargs): 14 | super(NoMask, self).__init__(**kwargs) 15 | 16 | def build(self, input_shape): 17 | # Be sure to call this somewhere! 18 | super(NoMask, self).build(input_shape) 19 | 20 | def call(self, x, mask=None, **kwargs): 21 | return x 22 | 23 | def compute_mask(self, inputs, mask): 24 | return None 25 | 26 | 27 | class Hash(tf.keras.layers.Layer): 28 | """ 29 | hash the input to [0,num_buckets) 30 | if mask_zero = True,0 or 0.0 will be set to 0,other value will be set in range[1,num_buckets) 31 | """ 32 | 33 | def __init__(self, num_buckets, mask_zero=False, **kwargs): 34 | self.num_buckets = num_buckets 35 | self.mask_zero = mask_zero 36 | super(Hash, self).__init__(**kwargs) 37 | 38 | def build(self, input_shape): 39 | # Be sure to call this somewhere! 
40 | super(Hash, self).build(input_shape) 41 | 42 | def call(self, x, mask=None, **kwargs): 43 | 44 | 45 | if x.dtype != tf.string: 46 | zero = tf.as_string(tf.zeros([1], dtype=x.dtype)) 47 | x = tf.as_string(x, ) 48 | else: 49 | zero = tf.as_string(tf.zeros([1], dtype='int32')) 50 | 51 | num_buckets = self.num_buckets if not self.mask_zero else self.num_buckets - 1 52 | try: 53 | hash_x = tf.string_to_hash_bucket_fast(x, num_buckets, 54 | name=None) # weak hash 55 | except: 56 | hash_x = tf.strings.to_hash_bucket_fast(x, num_buckets, 57 | name=None) # weak hash 58 | if self.mask_zero: 59 | mask = tf.cast(tf.not_equal(x, zero), dtype='int64') 60 | hash_x = (hash_x + 1) * mask 61 | 62 | return hash_x 63 | def get_config(self, ): 64 | config = {'num_buckets': self.num_buckets, 'mask_zero': self.mask_zero, } 65 | base_config = super(Hash, self).get_config() 66 | return dict(list(base_config.items()) + list(config.items())) 67 | 68 | 69 | class Linear(tf.keras.layers.Layer): 70 | 71 | def __init__(self, l2_reg=0.0, mode=0, use_bias=False, seed=1024, **kwargs): 72 | 73 | self.l2_reg = l2_reg 74 | # self.l2_reg = tf.contrib.layers.l2_regularizer(float(l2_reg_linear)) 75 | if mode not in [0, 1, 2]: 76 | raise ValueError("mode must be 0,1 or 2") 77 | self.mode = mode 78 | self.use_bias = use_bias 79 | self.seed = seed 80 | super(Linear, self).__init__(**kwargs) 81 | 82 | def build(self, input_shape): 83 | 84 | print('input_shape=',input_shape) 85 | if self.use_bias: 86 | self.bias = self.add_weight(name='linear_bias', 87 | shape=(1,), 88 | initializer=tf.keras.initializers.Zeros(), 89 | trainable=True) 90 | if self.mode == 1: 91 | self.kernel = self.add_weight( 92 | 'linear_kernel', 93 | shape=[int(input_shape[-1]), 1], 94 | initializer=tf.keras.initializers.glorot_normal(self.seed), 95 | regularizer=tf.keras.regularizers.l2(self.l2_reg), 96 | trainable=True) 97 | elif self.mode == 2: 98 | 99 | 100 | #在deepfm 中 101 | #模式二有两个输入[sparse_input, dense_input] 102 | #input_shape= [TensorShape([None, 1, 26]), TensorShape([None, 13])] 103 | #这里的 kernel的shape 是 13*1 104 | self.kernel = self.add_weight( 105 | 'linear_kernel', 106 | shape=[int(input_shape[1][-1]), 1], 107 | initializer=tf.keras.initializers.glorot_normal(self.seed), 108 | regularizer=tf.keras.regularizers.l2(self.l2_reg), 109 | trainable=True) 110 | 111 | super(Linear, self).build(input_shape) # Be sure to call this somewhere! 
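# Mode summary for call() below, using the shapes quoted in the DeepFM
# comment inside build() (sparse_input (None, 1, 26), dense_input (None, 13)):
#   mode 0: sparse only -> reduce_sum over the last axis, (None, 1, 26) -> (None, 1, 1)
#   mode 1: dense only  -> dense_input @ kernel, (None, 13) x (13, 1) -> (None, 1)
#   mode 2: both        -> matmul for the dense part plus reduce_sum for the
#                          sparse part, and the two partial logits are added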
112 | 113 | def call(self, inputs, **kwargs): 114 | if self.mode == 0: 115 | sparse_input = inputs 116 | linear_logit = reduce_sum(sparse_input, axis=-1, keep_dims=True) 117 | elif self.mode == 1: 118 | dense_input = inputs 119 | # fc = tf.tensordot(dense_input, self.kernel, axes=(-1, 0)) 120 | print(dense_input) 121 | 122 | fc =tf.matmul(dense_input,self.kernel) 123 | linear_logit = fc 124 | else: 125 | sparse_input, dense_input = inputs 126 | 127 | print('dense_input',dense_input) 128 | # fc = tf.tensordot(dense_input, self.kernel, axes=(-1, 0)) 129 | 130 | #相乘之后N*13的矩阵与13*1的矩阵相乘,变成N*1就是一个数值 131 | fc = tf.matmul(dense_input,self.kernel) 132 | print('fc=',fc) 133 | sum_sparse=reduce_sum(sparse_input, axis=-1, keep_dims=False) 134 | 135 | print('sum_sparse=',sum_sparse) 136 | #sum_sparse也是一个数值,两个数相加 137 | linear_logit = sum_sparse + fc 138 | if self.use_bias: 139 | linear_logit += self.bias 140 | 141 | return linear_logit 142 | 143 | def compute_output_shape(self, input_shape): 144 | return (None, 1) 145 | 146 | def compute_mask(self, inputs, mask): 147 | return None 148 | 149 | def get_config(self, ): 150 | config = {'mode': self.mode, 'l2_reg': self.l2_reg, 'use_bias': self.use_bias, 'seed': self.seed} 151 | base_config = super(Linear, self).get_config() 152 | return dict(list(base_config.items()) + list(config.items())) 153 | 154 | 155 | def concat_func(inputs, axis=-1, mask=False): 156 | if not mask: 157 | inputs = list(map(NoMask(), inputs)) 158 | if len(inputs) == 1: 159 | return inputs[0] 160 | else: 161 | return tf.keras.layers.Concatenate(axis=axis)(inputs) 162 | 163 | 164 | def reduce_mean(input_tensor, 165 | axis=None, 166 | keep_dims=False, 167 | name=None, 168 | reduction_indices=None): 169 | try: 170 | return tf.reduce_mean(input_tensor, 171 | axis=axis, 172 | keep_dims=keep_dims, 173 | name=name, 174 | reduction_indices=reduction_indices) 175 | except TypeError: 176 | return tf.reduce_mean(input_tensor, 177 | axis=axis, 178 | keepdims=keep_dims, 179 | name=name) 180 | 181 | 182 | def reduce_sum(input_tensor, 183 | axis=None, 184 | keep_dims=False, 185 | name=None, 186 | reduction_indices=None): 187 | try: 188 | return tf.reduce_sum(input_tensor, 189 | axis=axis, 190 | keep_dims=keep_dims, 191 | name=name, 192 | reduction_indices=reduction_indices) 193 | except TypeError: 194 | return tf.reduce_sum(input_tensor, 195 | axis=axis, 196 | keepdims=keep_dims, 197 | name=name) 198 | 199 | 200 | def reduce_max(input_tensor, 201 | axis=None, 202 | keep_dims=False, 203 | name=None, 204 | reduction_indices=None): 205 | try: 206 | return tf.reduce_max(input_tensor, 207 | axis=axis, 208 | keep_dims=keep_dims, 209 | name=name, 210 | reduction_indices=reduction_indices) 211 | except TypeError: 212 | return tf.reduce_max(input_tensor, 213 | axis=axis, 214 | keepdims=keep_dims, 215 | name=name) 216 | 217 | 218 | def div(x, y, name=None): 219 | try: 220 | return tf.div(x, y, name=name) 221 | except AttributeError: 222 | return tf.divide(x, y, name=name) 223 | 224 | 225 | def softmax(logits, dim=-1, name=None): 226 | try: 227 | return tf.nn.softmax(logits, dim=dim, name=name) 228 | except TypeError: 229 | return tf.nn.softmax(logits, axis=dim, name=name) 230 | 231 | 232 | class Add(tf.keras.layers.Layer): 233 | def __init__(self, **kwargs): 234 | super(Add, self).__init__(**kwargs) 235 | 236 | def build(self, input_shape): 237 | # Be sure to call this somewhere! 
238 | super(Add, self).build(input_shape) 239 | 240 | def call(self, inputs, **kwargs): 241 | if not isinstance(inputs, list): 242 | return inputs 243 | if len(inputs) == 1: 244 | return inputs[0] 245 | if len(inputs) == 0: 246 | return tf.constant([[0.0]]) 247 | 248 | return tf.keras.layers.add(inputs) 249 | 250 | 251 | def add_func(inputs): 252 | return Add()(inputs) 253 | 254 | 255 | def combined_dnn_input(sparse_embedding_list, dense_value_list): 256 | if len(sparse_embedding_list) > 0 and len(dense_value_list) > 0: 257 | sparse_dnn_input = Flatten()(concat_func(sparse_embedding_list)) 258 | dense_dnn_input = Flatten()(concat_func(dense_value_list)) 259 | return concat_func([sparse_dnn_input, dense_dnn_input]) 260 | elif len(sparse_embedding_list) > 0: 261 | return Flatten()(concat_func(sparse_embedding_list)) 262 | elif len(dense_value_list) > 0: 263 | return Flatten()(concat_func(dense_value_list)) 264 | else: 265 | raise NotImplementedError("dnn_feature_columns can not be empty list") 266 | -------------------------------------------------------------------------------- /deepfm_recomend/run_classification_criteo.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.metrics import log_loss, roc_auc_score 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.preprocessing import LabelEncoder, MinMaxScaler 5 | 6 | from deepfm import DeepFM 7 | from feature_column import SparseFeat, DenseFeat, get_feature_names 8 | 9 | # if __name__ == "__main__": 10 | data = pd.read_csv('./criteo_sample.txt') 11 | 12 | sparse_features = ['C' + str(i) for i in range(1, 27)] 13 | dense_features = ['I' + str(i) for i in range(1, 14)] 14 | 15 | data[sparse_features] = data[sparse_features].fillna('-1', ) 16 | data[dense_features] = data[dense_features].fillna(0, ) 17 | target = ['label'] 18 | 19 | # 1.Label Encoding for sparse features,and do simple Transformation for dense features 20 | for feat in sparse_features: 21 | lbe = LabelEncoder() 22 | data[feat] = lbe.fit_transform(data[feat]) 23 | mms = MinMaxScaler(feature_range=(0, 1)) 24 | data[dense_features] = mms.fit_transform(data[dense_features]) 25 | 26 | # 2.count #unique features for each sparse field,and record dense feature field name 27 | 28 | fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=4 ) 29 | for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,) 30 | for feat in dense_features] 31 | 32 | dnn_feature_columns = fixlen_feature_columns 33 | linear_feature_columns = fixlen_feature_columns 34 | 35 | feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) 36 | 37 | # 3.generate input data for model 38 | 39 | train, test = train_test_split(data, test_size=0.2, random_state=2020) 40 | train_model_input = {name:train[name] for name in feature_names} 41 | test_model_input = {name:test[name] for name in feature_names} 42 | 43 | # 4.Define Model,train,predict and evaluate 44 | model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary') 45 | model.compile("adam", "binary_crossentropy", 46 | metrics=['binary_crossentropy'], ) 47 | 48 | history = model.fit(train_model_input, train[target].values, 49 | batch_size=256, epochs=10, verbose=2, validation_split=0.2, ) 50 | pred_ans = model.predict(test_model_input, batch_size=256) 51 | print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4)) 52 | print("test AUC", round(roc_auc_score(test[target].values, 
pred_ans), 4)) 53 | -------------------------------------------------------------------------------- /deepfm_recomend/temp/deepfm.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | Author: 4 | Weichen Shen,wcshen1994@163.com 5 | 6 | Reference: 7 | [1] Guo H, Tang R, Ye Y, et al. Deepfm: a factorization-machine based neural network for ctr prediction[J]. arXiv preprint arXiv:1703.04247, 2017.(https://arxiv.org/abs/1703.04247) 8 | 9 | """ 10 | 11 | from itertools import chain 12 | 13 | import tensorflow as tf 14 | 15 | from feature_column import build_input_features, get_linear_logit, DEFAULT_GROUP_NAME, input_from_feature_columns 16 | from layers.core import PredictionLayer, DNN 17 | from layers.interaction import FM 18 | from layers.utils import concat_func, add_func, combined_dnn_input 19 | 20 | 21 | def DeepFM(linear_feature_columns, dnn_feature_columns, fm_group=[DEFAULT_GROUP_NAME], dnn_hidden_units=(128, 128), 22 | l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0, seed=1024, dnn_dropout=0, 23 | dnn_activation='relu', dnn_use_bn=False, task='binary'): 24 | """Instantiates the DeepFM Network architecture. 25 | 26 | :param linear_feature_columns: An iterable containing all the features used by linear part of the model. 27 | :param dnn_feature_columns: An iterable containing all the features used by deep part of the model. 28 | :param fm_group: list, group_name of features that will be used to do feature interactions. 29 | :param dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of DNN 30 | :param l2_reg_linear: float. L2 regularizer strength applied to linear part 31 | :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector 32 | :param l2_reg_dnn: float. L2 regularizer strength applied to DNN 33 | :param seed: integer ,to use as random seed. 34 | :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate. 35 | :param dnn_activation: Activation function to use in DNN 36 | :param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in DNN 37 | :param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss 38 | :return: A Keras model instance. 
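    The final logit is the sum of three parts (linear_logit + fm_logit +
    dnn_logit) and is passed through PredictionLayer, i.e. a sigmoid when
    task='binary'.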
39 | """ 40 | 41 | #构建模型的输入张量 42 | features = build_input_features( 43 | linear_feature_columns +dnn_feature_columns) 44 | 45 | print("#"*10) 46 | print(features) 47 | inputs_list = list(features.values()) 48 | 49 | 50 | # 构建线性张量 51 | linear_logit = get_linear_logit(features, linear_feature_columns, seed=seed, prefix='linear', 52 | l2_reg=l2_reg_linear) 53 | 54 | group_embedding_dict, dense_value_list = input_from_feature_columns(features, dnn_feature_columns, l2_reg_embedding, 55 | seed, support_group=True) 56 | 57 | 58 | print('group_embedding_dict',group_embedding_dict) 59 | print('dense_value_list',dense_value_list) 60 | fm_logit = add_func([FM()(concat_func(v, axis=1)) 61 | for k, v in group_embedding_dict.items() if k in fm_group]) 62 | 63 | dnn_input = combined_dnn_input(list(chain.from_iterable( 64 | group_embedding_dict.values())), dense_value_list) 65 | dnn_output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, 66 | dnn_use_bn, seed)(dnn_input) 67 | dnn_logit = tf.keras.layers.Dense( 68 | 1, use_bias=False, kernel_initializer=tf.keras.initializers.glorot_normal(seed=seed))(dnn_output) 69 | 70 | final_logit = add_func([linear_logit, fm_logit, dnn_logit]) 71 | 72 | output = PredictionLayer(task)(final_logit) 73 | model = tf.keras.models.Model(inputs=inputs_list, outputs=output) 74 | return model 75 | -------------------------------------------------------------------------------- /deepfm_recomend/xdeepfm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/xdeepfm.png -------------------------------------------------------------------------------- /deepfm_recomend/xdeepfm_main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Oct 13 19:50:43 2020 5 | 6 | @author: ledi 7 | """ 8 | 9 | 10 | 11 | import pandas as pd 12 | from sklearn.metrics import log_loss, roc_auc_score 13 | from sklearn.model_selection import train_test_split 14 | from sklearn.preprocessing import LabelEncoder, MinMaxScaler 15 | # from feature_column import build_input_features, get_linear_logit, DEFAULT_GROUP_NAME, input_from_feature_columns 16 | from layers.core import PredictionLayer, DNN 17 | from layers.interaction import FM,CIN 18 | from layers.utils import concat_func, add_func, combined_dnn_input 19 | # from deepfm import DeepFM 20 | 21 | from keras.layers import Dense 22 | # from feature_column import SparseFeat, DenseFeat, get_feature_names 23 | 24 | # if __name__ == "__main__": 25 | data = pd.read_csv('./criteo_sample.txt') 26 | 27 | 28 | #离散的特征名称 29 | sparse_features = ['C' + str(i) for i in range(1, 27)] 30 | 31 | #数值的特征名称 32 | dense_features = ['I' + str(i) for i in range(1, 14)] 33 | 34 | #对缺失的特征进行填充 35 | data[sparse_features] = data[sparse_features].fillna('-1', ) 36 | data[dense_features] = data[dense_features].fillna(0, ) 37 | target = ['label'] 38 | 39 | 40 | #数据预处理 41 | # 1.Label Encoding for sparse features,and do simple Transformation for dense features 42 | #对离散特征进行编码 43 | for feat in sparse_features: 44 | lbe = LabelEncoder() 45 | data[feat] = lbe.fit_transform(data[feat]) 46 | #数值特征进行最大最小归一化 47 | mms = MinMaxScaler(feature_range=(0, 1)) 48 | data[dense_features] = mms.fit_transform(data[dense_features]) 49 | 50 | 51 | 52 | #feature 是特征处理模块 53 | from feature import Operate_Feat1,get_feature_names 54 | 55 | 56 | 
d=Operate_Feat1() 57 | 58 | 59 | 60 | sparse_list=[] 61 | for p in sparse_features: 62 | d1=d.operate_sparse(data[p], p) 63 | sparse_list.append(d1.copy()) 64 | 65 | dense_list=[] 66 | for q in dense_features: 67 | d2=d.operate_dense(q) 68 | print(d2) 69 | dense_list.append(d2.copy()) 70 | 71 | 72 | # fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=4 ) 73 | # for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,) 74 | # for feat in dense_features] 75 | 76 | merge_list=sparse_list+dense_list 77 | dnn_feature_columns = merge_list 78 | linear_feature_columns = merge_list 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | from feature import DEFAULT_GROUP_NAME,build_input_features 88 | 89 | def xDeepFM(linear_feature_columns, dnn_feature_columns, dnn_hidden_units=(256, 256), 90 | cin_layer_size=(128, 128,), cin_split_half=True, cin_activation='relu', l2_reg_linear=0.00001, 91 | l2_reg_embedding=0.00001, l2_reg_dnn=0, l2_reg_cin=0, seed=1024, dnn_dropout=0, 92 | dnn_activation='relu', dnn_use_bn=False, task='binary'): 93 | 94 | 95 | 96 | # dnn_hidden_units=(256, 256) 97 | # cin_layer_size=(128, 128,) 98 | # cin_split_half=True 99 | # cin_activation='relu' 100 | # l2_reg_linear=0.00001 101 | # l2_reg_embedding=0.00001 102 | # l2_reg_dnn=0 103 | # l2_reg_cin=0 104 | # seed=1024 105 | # dnn_dropout=0 106 | # dnn_activation='relu' 107 | # dnn_use_bn=False 108 | # task='binary' 109 | 110 | features = build_input_features( 111 | linear_feature_columns + dnn_feature_columns) 112 | 113 | inputs_list = list(features.values()) 114 | from feature import get_linear_logit 115 | linear_logit = get_linear_logit(features, linear_feature_columns, seed=seed, prefix='linear', 116 | l2_reg=l2_reg_linear) 117 | from feature import input_from_feature_columns 118 | sparse_embedding_list, dense_value_list = input_from_feature_columns(features, dnn_feature_columns, 119 | l2_reg_embedding, seed) 120 | 121 | fm_input = concat_func(sparse_embedding_list, axis=1) 122 | 123 | dnn_input = combined_dnn_input(sparse_embedding_list, dense_value_list) 124 | dnn_output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, dnn_use_bn, seed=seed)(dnn_input) 125 | 126 | 127 | import keras 128 | import tensorflow as tf 129 | dnn_logit = tf.keras.layers.Dense( 130 | 1, use_bias=False, kernel_initializer=tf.keras.initializers.glorot_normal(seed))(dnn_output) 131 | 132 | final_logit = add_func([linear_logit, dnn_logit]) 133 | 134 | if len(cin_layer_size) > 0: 135 | exFM_out = CIN(cin_layer_size, cin_activation, 136 | cin_split_half, l2_reg_cin, seed)(fm_input) 137 | exFM_logit = tf.keras.layers.Dense(1, kernel_initializer=tf.keras.initializers.glorot_normal(seed))(exFM_out) 138 | final_logit = add_func([final_logit, exFM_logit]) 139 | 140 | output = PredictionLayer(task)(final_logit) 141 | 142 | model = tf.keras.models.Model(inputs=inputs_list, outputs=output) 143 | return model 144 | 145 | 146 | 147 | 148 | model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary') 149 | 150 | feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) 151 | 152 | # 3.generate input data for model 153 | 154 | train, test = train_test_split(data, test_size=0.2, random_state=2020) 155 | train_model_input = {name:train[name] for name in feature_names} 156 | test_model_input = {name:test[name] for name in feature_names} 157 | 158 | # 4.Define Model,train,predict and evaluate 159 | # model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary') 160 
| model.compile("adam", "binary_crossentropy", 161 | metrics=['binary_crossentropy'], ) 162 | 163 | history = model.fit(train_model_input, train[target].values, 164 | batch_size=256, epochs=10, verbose=2, validation_split=0.2, ) 165 | pred_ans = model.predict(test_model_input, batch_size=256) 166 | print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4)) 167 | print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4)) 168 | -------------------------------------------------------------------------------- /ffm/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.npy 3 | *.pyc -------------------------------------------------------------------------------- /ffm/ffm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Date : 3/2/18 3 | # @Author : zhangchaoyang 4 | 5 | import numpy as np 6 | 7 | np.random.seed(0) 8 | import math 9 | from logistic import Logistic 10 | 11 | 12 | class FFM_Node(object): 13 | ''' 14 | x is usually a high-dimensional sparse vector, so it is stored as a list of nodes; each node is a 3-tuple (j, f, v) 15 | ''' 16 | __slots__ = ['j', 'f', 'v'] # store the members tuple-style instead of in a per-instance dict 17 | 18 | def __init__(self, j, f, v): 19 | ''' 20 | :param j: Feature index (0 to n-1) 21 | :param f: Field index (0 to m-1) 22 | :param v: value 23 | ''' 24 | self.j = j 25 | self.f = f 26 | self.v = v 27 | 28 | 29 | class FFM(object): 30 | def __init__(self, m, n, k, eta, lambd): 31 | ''' 32 | :param m: Number of fields 33 | :param n: Number of features 34 | :param k: Number of latent factors 35 | :param eta: learning rate 36 | :param lambd: regularization coefficient 37 | ''' 38 | self.m = m 39 | self.n = n 40 | self.k = k 41 | # hyperparameters 42 | self.eta = eta 43 | self.lambd = lambd 44 | # initialize the 3-D weight tensor w ~ U(0, 1/sqrt(k)) 45 | self.w = np.random.rand(n, m, k) / math.sqrt(k) 46 | # initialize the accumulated squared gradients to 1; AdaGrad needs them, and this avoids division by zero 47 | self.G = np.ones(shape=(n, m, k), dtype=np.float64) 48 | self.log = Logistic() 49 | 50 | def phi(self, node_list): 51 | ''' 52 | weighted sum over all pairwise feature interactions 53 | :param node_list: the non-zero entries of x, stored as a node list 54 | :return: 55 | ''' 56 | z = 0.0 57 | for a in range(len(node_list)): 58 | node1 = node_list[a] 59 | j1 = node1.j 60 | f1 = node1.f 61 | v1 = node1.v 62 | for b in range(a + 1, len(node_list)): 63 | node2 = node_list[b] 64 | j2 = node2.j 65 | f2 = node2.f 66 | v2 = node2.v 67 | w1 = self.w[j1, f2] 68 | w2 = self.w[j2, f1] 69 | z += np.dot(w1, w2) * v1 * v2 70 | return z 71 | 
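# Worked example (annotation; the sample values come from ffm/train.txt below): for an x
# with two active nodes (j=0,f=0,v=2.9) and (j=4,f=1,v=12.4), phi() adds the single term
# np.dot(w[0,1], w[4,0]) * 2.9 * 12.4 -- each feature j1 is paired with the *field* of its
# partner, which is exactly what makes this FM field-aware.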
72 | def predict(self, node_list): 73 | ''' 74 | predict y for the input x 75 | :param node_list: the non-zero entries of x, stored as a node list 76 | :return: 77 | ''' 78 | z = self.phi(node_list) 79 | y = self.log.decide_by_tanh(z) 80 | return y 81 | 82 | def sgd(self, node_list, y): 83 | ''' 84 | update the model parameters from a single sample 85 | :param node_list: the non-zero entries of x, stored as a node list 86 | :param y: 1 for a positive sample, -1 for a negative one 87 | :return: 88 | ''' 89 | kappa = -y / (1 + math.exp(y * self.phi(node_list))) 90 | for a in range(len(node_list)): 91 | node1 = node_list[a] 92 | j1 = node1.j 93 | f1 = node1.f 94 | v1 = node1.v 95 | for b in range(a + 1, len(node_list)): 96 | node2 = node_list[b] 97 | j2 = node2.j 98 | f2 = node2.f 99 | v2 = node2.v 100 | c = kappa * v1 * v2 101 | # self.w[j1,f2] and self.w[j2,f1] are vectors, so g_j1_f2 and g_j2_f1 are vectors too 102 | g_j1_f2 = self.lambd * self.w[j1, f2] + c * self.w[j2, f1] 103 | g_j2_f1 = self.lambd * self.w[j2, f1] + c * self.w[j1, f2] 104 | # accumulate the squared gradient of every dimension 105 | self.G[j1, f2] += g_j1_f2 ** 2 # every G stays strictly positive, because G is initialized to all ones 106 | self.G[j2, f1] += g_j2_f1 ** 2 107 | # AdaGrad 108 | self.w[j1, f2] -= self.eta / np.sqrt(self.G[j1, f2]) * g_j1_f2 # sqrt(G) is the denominator, so G must stay positive 109 | self.w[j2, f1] -= self.eta / np.sqrt( 110 | self.G[j2, f1]) * g_j2_f1 # math.sqrt() only accepts a single number, while numpy.sqrt() accepts an array and takes the square root elementwise 111 | 112 | def train(self, sample_generator, max_echo, max_r2): 113 | ''' 114 | train the model from a stream of samples 115 | :param sample_generator: sample generator that yields (node_list, y), where node_list holds the non-zero entries of x; x should usually be normalized to unit length beforehand, which makes the fit slightly more accurate 116 | :param max_echo: maximum number of epochs 117 | :param max_r2: stop learning once the coefficient of determination r2 reaches this threshold 118 | :return: 119 | ''' 120 | for itr in range(max_echo): 121 | print("echo", itr) 122 | y_sum = 0.0 123 | y_square_sum = 0.0 124 | err_square_sum = 0.0 # sum of squared errors 125 | population = 0 # number of samples 126 | for node_list, y in sample_generator: 127 | y = 0.0 if y == -1 else y # the true y is in {-1,1} while the prediction lies in (0,1); unify them before measuring the fit 128 | self.sgd(node_list, y) 129 | y_hat = self.predict(node_list) 130 | y_sum += y 131 | y_square_sum += y ** 2 132 | err_square_sum += (y - y_hat) ** 2 133 | population += 1 134 | var_y = y_square_sum - y_sum * y_sum / population # (unnormalized) variance of y 135 | r2 = 1 - err_square_sum / var_y 136 | print ("r2=",r2) 137 | if r2 > max_r2: # the larger r2 is, the better the fit 138 | print('r2 has reached', r2) 139 | break 140 | 141 | def save_model(self, outfile): 142 | ''' 143 | serialize the model 144 | :param outfile: 145 | :return: 146 | ''' 147 | np.save(outfile, self.w) 148 | 149 | def load_model(self, infile): 150 | ''' 151 | load the model 152 | :param infile: 153 | :return: 154 | ''' 155 | self.w = np.load(infile) 156 | -------------------------------------------------------------------------------- /ffm/ffm_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Date : 3/2/18 3 | # @Author : zhangchaoyang 4 | 5 | import math 6 | from ffm import FFM_Node, FFM 7 | import re 8 | 9 | 10 | class Sample(object): 11 | def __init__(self, infile): 12 | self.infile = infile 13 | self.regex = re.compile("\\s+") 14 | 15 | def __iter__(self): 16 | with open(self.infile, 'r') as f_in: 17 | for line in f_in: 18 | arr = self.regex.split(line.strip()) 19 | if len(arr) >= 2: 20 | y = float(arr[0]) 21 | assert math.fabs(y) == 1 22 | node_list = [] 23 | square_sum = 0.0 24 | for i in range(1, len(arr)): 25 | brr = arr[i].split(",") 26 | if len(brr) == 3: 27 | j = int(brr[0]) 28 | f = int(brr[1]) 29 | v = float(brr[2]) 30 | square_sum += v * v 31 | node_list.append(FFM_Node(j, f, v)) 32 | if square_sum > 0: 33 | norm = math.sqrt(square_sum) 34 | # rescale the vector to unit length 35 | normed_node_list = [FFM_Node(ele.j, ele.f, ele.v / norm) for ele in node_list] 36 | yield (normed_node_list, y) 37 | 38 | 39 | if __name__ == '__main__': 40 | n = 5 41 | m = 2 42 | k = 2 43 | train_file = "train.txt" 44 | valid_file = "valid.txt" 45 | model_file = "ffm.npy" 46 | # hyperparameters 47 | eta = 0.01 48 | lambd = 1e-2 49 | max_echo = 30 50 | max_r2 = 0.9 51 | 52 | # train the model and save its parameters 53 | sample_generator = Sample(train_file) 54 | ffm = FFM(m, n, k, eta, lambd) 55 | ffm.train(sample_generator, max_echo, max_r2) 56 | ffm.save_model(model_file) 57 | 58 | # load the model and measure the fit on the validation set 59 | ffm.load_model(model_file) 60 | valid_generator = Sample(valid_file) 61 | y_sum = 0.0 62 | y_square_sum = 0.0 63 | err_square_sum = 0.0 # sum of squared errors 64 | population = 0 # number of samples 65 | for node_list, y in valid_generator: 66 | y = 0.0 if y == -1 else y # the true y is in {-1,1} while the prediction lies in (0,1); unify them before measuring the fit 67 | y_hat = ffm.predict(node_list) 68 | y_sum += y 69 | y_square_sum += y ** 2 70 | err_square_sum += (y - y_hat) ** 2 71 | population += 1 72 | var_y = y_square_sum - y_sum * y_sum / population # (unnormalized) variance of y 73 | r2 = 1 - err_square_sum / var_y 74 | print("r2 on validation set is", r2) 75 | 
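# Data format note (annotation, inferred from the Sample parser above and ffm/train.txt):
# each line is "<label> <j,f,v> <j,f,v> ...", label in {-1, 1}; e.g. "-1 0,0,2.9 4,1,12.4"
# means feature 0 in field 0 with value 2.9 and feature 4 in field 1 with value 12.4.
# Each vector is L2-normalized before being yielded.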
("r2 on validation set is", r2) 75 | -------------------------------------------------------------------------------- /ffm/logistic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Date : 3/2/18 3 | # @Author : zhangchaoyang 4 | 5 | import numpy as np 6 | import math 7 | from singleton import Singleton 8 | 9 | 10 | class Logistic(object): 11 | __metaclass__ = Singleton # 单例 12 | 13 | def __init__(self): 14 | exp_max = 10.0 15 | self.exp_scale = 0.001 16 | self.exp_intv = int(exp_max / self.exp_scale) 17 | self.exp_table = [0.0] * self.exp_intv 18 | for i in range(self.exp_intv): 19 | x = self.exp_scale * i 20 | exp = math.exp(x) 21 | self.exp_table[i] = exp / (1.0 + exp) 22 | 23 | def decide_by_table(self, x): 24 | '''查表获得logistic的函数值''' 25 | if x == 0: 26 | return 0.5 27 | i = int(np.nan_to_num(abs(x) / self.exp_scale)) 28 | y = self.exp_table[min(i, self.exp_intv - 1)] 29 | if x > 0: 30 | return y 31 | else: 32 | return 1.0 - y 33 | 34 | def decide_by_tanh(self, x): 35 | '''直接使用1.0 / (1.0 + np.exp(-x))容易发警告“RuntimeWarning: overflowencountered in exp”, 36 | 转换成如下等价形式后算法会更稳定 37 | ''' 38 | return 0.5 * (1 + np.tanh(0.5 * x)) 39 | 40 | def decide(self, x): 41 | '''原始的sigmoid函数''' 42 | return 1.0 / (1.0 + np.exp(-x)) 43 | 44 | 45 | if __name__ == '__main__': 46 | log = Logistic() 47 | for x in np.arange(-20, 20, 0.1): # xrange()中的step不能是小数,所以只好手numpy.arange() 48 | y = log.decide(x) 49 | print( x, y, log.decide_by_tanh(x) - y, log.decide_by_table(x) - y) 50 | -------------------------------------------------------------------------------- /ffm/readme.md: -------------------------------------------------------------------------------- 1 | # Field-aware Factorization Machines 2 | 公式推导见[http://www.cnblogs.com/zhangchaoyang/articles/8410719.html](http://www.cnblogs.com/zhangchaoyang/articles/8410719.html) -------------------------------------------------------------------------------- /ffm/singleton.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Date : 3/2/18 3 | # @Author : zhangchaoyang 4 | 5 | class Singleton(type): 6 | def __init__(cls, class_name,base_classes, attr_dict): 7 | cls.__instance = None 8 | super(Singleton, cls).__init__( class_name,base_classes, attr_dict) 9 | 10 | def __call__(cls, *args, **kwargs): 11 | if cls.__instance is None: 12 | cls.__instance = super(Singleton, cls).__call__(*args, **kwargs) 13 | return cls.__instance 14 | else: 15 | return cls.__instance 16 | -------------------------------------------------------------------------------- /ffm/train.txt: -------------------------------------------------------------------------------- 1 | -1 0,0,2.9 4,1,12.4 2 | 1 1,0,5.7 3,1,0.03 3 | -1 2,0,4.7 4,1,9.4 -------------------------------------------------------------------------------- /ffm/valid.txt: -------------------------------------------------------------------------------- 1 | 1 2,0,4 3,1,2.1 2 | 1 1,0,5.7 4,1,5 3 | -1 0,0,6 4,1,9.4 -------------------------------------------------------------------------------- /gbdt_source/GBDTReg.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | __author__ = 'luchi.lc' 3 | import numpy as np 4 | 5 | """ 6 | date:29/6/2017 7 | usage:构造GBDT树并用其生成数据新的特征向量 8 | """ 9 | class GBDT(object): 10 | 11 | def __init__(self,config): 12 | 13 | 14 | 15 | self.learningRate = config.learningRate #learning_rate 16 | 
class GBDT(object): 10 | 11 | def __init__(self,config): 12 | 13 | 14 | 15 | self.learningRate = config.learningRate #learning_rate 16 | self.maxTreeLength=config.maxTreeLength # maximum tree depth 17 | self.maxLeafCount=config.maxLeafCount # maximum number of leaves 18 | self.maxTreeNum=config.maxTreeNum # number of trees 19 | self.tree=[] 20 | 21 | # compute the square loss 22 | def calculateSquareLoss(self,residual): 23 | """ 24 | :param residual: the gradient residuals 25 | :return: the overall residual (square loss) 26 | """ 27 | 28 | # if all residuals in this batch are identical, the loss is 0 29 | mean = np.mean(residual) 30 | sumError = np.sum([(value-mean)**2 for value in residual]) 31 | return sumError 32 | 33 | def splitTree(self,x_train,residualGradient,treeHeight): 34 | """ 35 | 36 | :param x_train: the training data 37 | :param residualGradient: the gradient residuals to fit at this step 38 | :param treeHeight: the current tree height 39 | :return: the built GBDT (sub)tree 40 | """ 41 | size = len(x_train) # number of samples 42 | dim = len(x_train[0]) # feature dimension 43 | # convention: the left subtree is <=, the right subtree is > 44 | bestSplitPointDim=-1 45 | bestSplitPointValue=-1 46 | # the loss before this split 47 | curLoss = self.calculateSquareLoss(residualGradient) 48 | minLossValue=curLoss 49 | # stop recursing once the maximum depth is reached 50 | if treeHeight==self.maxTreeLength: 51 | 52 | return np.mean(residualGradient) # leaf value: the mean residual (returning the loss here would lose the sign) 53 | tree=dict([]) 54 | # iterate over every feature dimension 55 | for i in range(dim): 56 | # iterate over every sample 57 | for j in range(size): 58 | # take x_train[j,i] as the candidate split point 59 | splitNum = x_train[j,i] 60 | leftSubTree=[] 61 | rightSubTree=[] 62 | # split the data in two on feature i at splitNum 63 | for k in range(size): 64 | tmpNum=x_train[k,i] 65 | if tmpNum<=splitNum: 66 | leftSubTree.append(residualGradient[k]) 67 | else: 68 | rightSubTree.append(residualGradient[k]) 69 | sumLoss=0.0 70 | # sum the losses of the left and right children; minimizing this sum picks the split feature and value 71 | sumLoss+=self.calculateSquareLoss(np.array(leftSubTree)) 72 | sumLoss+=self.calculateSquareLoss(np.array(rightSubTree)) 73 | if sumLoss<minLossValue: 74 | bestSplitPointDim=i 75 | bestSplitPointValue=splitNum 76 | minLossValue=sumLoss 77 | 78 | # if no candidate split lowered the loss, make this node a leaf 79 | if minLossValue==curLoss: 80 | return np.mean(residualGradient) 81 | else: 82 | 83 | # split the data at the best split point found 84 | leftSplit=[(x_train[i],residualGradient[i]) for i in range(size) 85 | if x_train[i,bestSplitPointDim]<=bestSplitPointValue ] # left subtree 86 | 87 | rightSplit=[(x_train[i],residualGradient[i]) for i in range(size) 88 | if x_train[i,bestSplitPointDim]>bestSplitPointValue ] # right subtree 89 | 90 | # print(leftSplit) 91 | newLeftSubTree = list(zip(*leftSplit))[0] # X of the left subtree 92 | newLeftResidual = list(zip(*leftSplit))[1] # y (residuals) of the left subtree 93 | leftTree = self.splitTree(np.array(newLeftSubTree),newLeftResidual,treeHeight+1) 94 | 95 | newRightSubTree = list(zip(*rightSplit))[0] 96 | newRightResidual =list(zip(*rightSplit))[1] 97 | rightTree = self.splitTree(np.array(newRightSubTree),newRightResidual,treeHeight+1) 98 | 99 | tree[(bestSplitPointDim,bestSplitPointValue)]=[leftTree,rightTree] 100 | 101 | print(tree) 102 | return tree 103 | 104 | # count the leaf nodes of a tree 105 | def getTreeLeafNodeNum(self,tree): 106 | size=0 107 | if type(tree) is not dict: 108 | return 1 109 | for item in tree.items(): 110 | 111 | print(item) 112 | 113 | print('#'*10) 114 | subLeftTree,subRightTree=item[1] 115 | if type(subLeftTree) is dict: 116 | size+=self.getTreeLeafNodeNum(subLeftTree) 117 | else: 118 | size+=1 119 | 120 | if type(subRightTree) is dict: 121 | size+=self.getTreeLeafNodeNum(subRightTree) 122 | else: 123 | size+=1 124 | return size 125 | 126 | # find which leaf a sample falls into, and count the leaf nodes to its left 127 | def scanTree(self,curTree,singleX,treeLeafNodeNum): 128 | """ 129 | 130 | :param curTree: the current tree 131 | :param singleX: the sample fed into the decision tree 132 | :param treeLeafNodeNum: the number of leaf nodes of the tree 133 | :return: the value of the leaf this sample falls into, and its transformed feature vector for this tree 134 | """ 135 | 136 | self.xValue=0 137 | xFeature=[0]*treeLeafNodeNum 138 | self.leftZeroNum=0 139 | def scan(curTree,singleX): 140 | 141 | for item in curTree.items(): 142 | splitDim,splitValue=item[0] 143 | subLeftTree,subRightTree=item[1] 144 | if singleX[splitDim]<=splitValue: 145 | if type(subLeftTree) is dict: 146 | scan(subLeftTree,singleX) 147 | else: 148 | self.xValue=subLeftTree 149 | return 150 | else: 151 | self.leftZeroNum+=self.getTreeLeafNodeNum(subLeftTree) 152 | if type(subRightTree) is dict: 153 | scan(subRightTree,singleX) 154 | else: 155 | self.xValue=subRightTree 156 | return 157 | 
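# scan() walks this sample down the tree, leaving the reached leaf's value in
# self.xValue and the number of leaf nodes to its left in self.leftZeroNum; the
# one-hot write below then marks that leaf -- this per-tree indicator vector is
# the transformed feature handed to the LR.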
scan(curTree,singleX) 158 | xFeature[self.leftZeroNum]=1 159 | return self.xValue,xFeature 160 | 161 | # sigmoid function 162 | def sigmoid(self,x): 163 | return 1.0/(1+np.exp(-1*x)) 164 | # build the GBDT 165 | def buildGbdt(self,x_train,y_train): 166 | # number of samples 167 | size = len(x_train) 168 | dim = len(x_train[0]) 169 | x_train=np.array(x_train) 170 | y_train=np.array(y_train) 171 | x_train_feature=[] 172 | 173 | # initialize the first tree (all-zero predictions) 174 | treePreviousValue=0*y_train 175 | treeValues=[] 176 | treeValues.append(treePreviousValue) 177 | 178 | curValue = self.sigmoid(0*y_train) 179 | dataFeatures=[] 180 | for i in range(self.maxTreeNum): 181 | print("the tree %i-th"%i) 182 | residualGradient = -1*self.learningRate*(curValue-y_train) 183 | curTree = self.splitTree(x_train,residualGradient,1) 184 | self.tree.append(curTree) 185 | # print (curTree) 186 | # update the gradient residuals 187 | curTreeLeafNodeNum = self.getTreeLeafNodeNum(curTree) 188 | curTreeValue=[] 189 | for singleX in x_train: 190 | xValue,xFeature = self.scanTree(curTree,singleX,curTreeLeafNodeNum) 191 | curTreeValue.append(xValue) 192 | 193 | treePreviousValue=np.array(curTreeValue)+treePreviousValue 194 | curValue=self.sigmoid(treePreviousValue) 195 | # print (y_train) 196 | # print("curValue") 197 | # print( curValue) 198 | 199 | # build the feature vectors of the input data from the fitted trees 200 | def generateFeatures(self,x_train): 201 | dataFeatures=[] 202 | for curTree in self.tree: 203 | curFeatures=[] 204 | curTreeLeafNodeNum = self.getTreeLeafNodeNum(curTree) 205 | # print ("tree leaf node is %i"%(curTreeLeafNodeNum)) 206 | for singleX in x_train: 207 | _,xFeature = self.scanTree(curTree,singleX,curTreeLeafNodeNum) 208 | curFeatures.append(xFeature) 209 | 210 | if len(dataFeatures)==0: 211 | dataFeatures=np.array(curFeatures) 212 | 213 | else: 214 | dataFeatures=np.concatenate([dataFeatures,curFeatures],axis=1) 215 | 216 | # print('#'*100) 217 | # print(len(curFeatures[0]),len(dataFeatures[0])) 218 | # print('data_feature=',dataFeatures,len(dataFeatures),len(dataFeatures[0])) 219 | # print('curFeatures=',curFeatures) 220 | return dataFeatures 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | -------------------------------------------------------------------------------- /gbdt_source/README.txt: -------------------------------------------------------------------------------- 1 | GBDTReg.py is the GBDT model file 2 | gbdt_demo.py trains the GBDT, uses it to generate feature vectors, and feeds the transformed features to train and test an LR 3 | testGBDT.py tests how the number of trees affects the GBDT result -------------------------------------------------------------------------------- /gbdt_source/__pycache__/GBDTReg.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/gbdt_source/__pycache__/GBDTReg.cpython-37.pyc -------------------------------------------------------------------------------- /gbdt_source/gbdt_demo.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | __author__ = 'luchi.lc' 3 | 4 | """ 5 | date:29/6/2017 6 | usage: train the GBDT and use it to transform the data into new feature vectors for training a Logistic Regression 7 | """ 8 | 9 | from sklearn.datasets import make_classification 10 | from sklearn.model_selection import train_test_split 11 | from GBDTReg import GBDT 12 | from sklearn.linear_model import LogisticRegression 13 | import numpy as np 14 | 15 | class Config(object): 16 | learningRate=0.1 17 | maxTreeLength=5 18 | maxLeafCount=30 19 | maxTreeNum=50 20 | 21 | def generate_data(): 22 | X, y = make_classification(n_samples=1000) 23 | # generate train/test data 24 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) 25 | # for the training data, the first half trains the GBDT and the second half trains the LR 26 | X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train, y_train, test_size=0.5) 27 | return X_train, X_train_lr, y_train, y_train_lr,X_test, y_test 28 | 
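# A quick dimensionality check (annotation; it follows from GBDTReg above): with
# maxTreeNum=50 trees capped at depth maxTreeLength=5, each tree contributes
# getTreeLeafNodeNum(tree) <= 2**(maxTreeLength-1) = 16 one-hot columns, so
# generateFeatures() returns an (n_samples, sum-of-leaf-counts) 0/1 matrix with
# exactly one hot index per tree -- the LR input below.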
29 | 30 | 31 | 32 | def main(): 33 | X_train, X_train_lr, y_train, y_train_lr,X_test, y_test=generate_data() 34 | config=Config() 35 | gbdt=GBDT(config=config) 36 | gbdt.buildGbdt(X_train,y_train) 37 | trainDataFeatures=gbdt.generateFeatures(X_train_lr) 38 | testDataFeatures=gbdt.generateFeatures(X_test) 39 | print (len(trainDataFeatures[0])) 40 | lrModel = LogisticRegression() 41 | lrModel.fit(trainDataFeatures,y_train_lr) 42 | #test model 43 | testLabel = lrModel.predict(testDataFeatures) 44 | accuracy = np.sum((np.array(testLabel)==np.array(y_test)))*1.0/len(y_test) 45 | print ("the accuracy is %f"%accuracy) 46 | 47 | if __name__=='__main__': 48 | main() 49 | 50 | 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /gbdt_source/testGBDT.py: -------------------------------------------------------------------------------- 1 | __author__ = 'luchi.lc' 2 | 3 | """ 4 | date:29/6/2017 5 | usage: test how the number of trees affects the GBDT result 6 | """ 7 | from GBDTReg import GBDT 8 | class Config(object): 9 | learningRate=0.1 10 | maxTreeLength=4 11 | maxLeafCount=30 12 | maxTreeNum=50 13 | 14 | def test(): 15 | x=[[0.5,0.6,0.7],[0.4,0.5,0.5],[1.2,1.3,1.0],[1.4,1.5,0.8],[1.5,1.3,1.3]] 16 | y=[0,0,1,1,1] 17 | c=Config() 18 | gbdt=GBDT(config=c) 19 | gbdt.buildGbdt(x,y) 20 | data_features=gbdt.generateFeatures(x) 21 | print(len(data_features[0])) 22 | 23 | test() 24 | -------------------------------------------------------------------------------- /item_book.txt: -------------------------------------------------------------------------------- 1 | Liu Yi,3,1001 2 | Chen Er,4,1001 3 | Zhang San,3,1001 4 | Li Si,3,1001 5 | Liu Yi,3,1002 6 | Li Si,4,1002 7 | Liu Yi,4,1003 8 | Zhang San,5,1003 9 | Li Si,5,1003 10 | Liu Yi,4,1004 11 | Zhang San,3,1004 12 | Liu Yi,5,1005 13 | -------------------------------------------------------------------------------- /logstic/lf1000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/logstic/lf1000.gif -------------------------------------------------------------------------------- /logstic/logstic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 22 16:30:47 2019 4 | 5 | @author: luogantt 6 | """ 7 | 8 | ''' 9 | Created on Oct 27, 2010 10 | Logistic Regression Working Module 11 | @author: Peter 12 | ''' 13 | from numpy import * 14 | 15 | def loadDataSet(): 16 | dataMat = []; labelMat = [] 17 | fr = open('testSet.txt') 18 | for line in fr.readlines(): 19 | lineArr = line.strip().split() 20 | dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])]) 21 | labelMat.append(int(lineArr[2])) 22 | return dataMat,labelMat 23 | 24 | def sigmoid(inX): 25 | return 1.0/(1+exp(-inX)) 26 | 27 | def gradAscent(dataMatIn, classLabels): 28 | dataMatrix = mat(dataMatIn) #convert to NumPy matrix 29 | labelMat = mat(classLabels).transpose() #convert to NumPy matrix 30 | n,m = shape(dataMatrix) 31 | alpha = 0.001 32 | maxCycles = 5000 33 | weights = ones((m,1)) 34 | for k in range(maxCycles): #heavy on matrix operations 35 | h = sigmoid(dataMatrix*weights) #matrix mult 36
| error = (labelMat - h) #vector subtraction 37 | weights = weights + alpha * dataMatrix.transpose()* error #matrix mult 38 | return weights 39 | 40 | def plotBestFit(weights): 41 | import matplotlib.pyplot as plt 42 | dataMat,labelMat=loadDataSet() 43 | dataArr = array(dataMat) 44 | n = shape(dataArr)[0] 45 | xcord1 = []; ycord1 = [] 46 | xcord2 = []; ycord2 = [] 47 | for i in range(n): 48 | if int(labelMat[i])== 1: 49 | xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2]) 50 | else: 51 | xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2]) 52 | fig = plt.figure() 53 | ax = fig.add_subplot(111) 54 | ax.scatter(xcord1, ycord1, s=30, c='red', marker='s') 55 | ax.scatter(xcord2, ycord2, s=30, c='green') 56 | x = arange(-3.0, 3.0, 0.1) 57 | y = (-weights[0]-weights[1]*x)/weights[2] 58 | ax.plot(x, y) 59 | plt.xlabel('X1'); plt.ylabel('X2'); 60 | plt.show() 61 | 62 | #import logRegres 63 | 64 | dataArr,labelMat=loadDataSet() 65 | 66 | 67 | weights=gradAscent(dataArr,labelMat) 68 | plotBestFit(weights.getA()) 69 | -------------------------------------------------------------------------------- /logstic/testSet.txt: -------------------------------------------------------------------------------- 1 | -0.017612 14.053064 0 2 | -1.395634 4.662541 1 3 | -0.752157 6.538620 0 4 | -1.322371 7.152853 0 5 | 0.423363 11.054677 0 6 | 0.406704 7.067335 1 7 | 0.667394 12.741452 0 8 | -2.460150 6.866805 1 9 | 0.569411 9.548755 0 10 | -0.026632 10.427743 0 11 | 0.850433 6.920334 1 12 | 1.347183 13.175500 0 13 | 1.176813 3.167020 1 14 | -1.781871 9.097953 0 15 | -0.566606 5.749003 1 16 | 0.931635 1.589505 1 17 | -0.024205 6.151823 1 18 | -0.036453 2.690988 1 19 | -0.196949 0.444165 1 20 | 1.014459 5.754399 1 21 | 1.985298 3.230619 1 22 | -1.693453 -0.557540 1 23 | -0.576525 11.778922 0 24 | -0.346811 -1.678730 1 25 | -2.124484 2.672471 1 26 | 1.217916 9.597015 0 27 | -0.733928 9.098687 0 28 | -3.642001 -1.618087 1 29 | 0.315985 3.523953 1 30 | 1.416614 9.619232 0 31 | -0.386323 3.989286 1 32 | 0.556921 8.294984 1 33 | 1.224863 11.587360 0 34 | -1.347803 -2.406051 1 35 | 1.196604 4.951851 1 36 | 0.275221 9.543647 0 37 | 0.470575 9.332488 0 38 | -1.889567 9.542662 0 39 | -1.527893 12.150579 0 40 | -1.185247 11.309318 0 41 | -0.445678 3.297303 1 42 | 1.042222 6.105155 1 43 | -0.618787 10.320986 0 44 | 1.152083 0.548467 1 45 | 0.828534 2.676045 1 46 | -1.237728 10.549033 0 47 | -0.683565 -2.166125 1 48 | 0.229456 5.921938 1 49 | -0.959885 11.555336 0 50 | 0.492911 10.993324 0 51 | 0.184992 8.721488 0 52 | -0.355715 10.325976 0 53 | -0.397822 8.058397 0 54 | 0.824839 13.730343 0 55 | 1.507278 5.027866 1 56 | 0.099671 6.835839 1 57 | -0.344008 10.717485 0 58 | 1.785928 7.718645 1 59 | -0.918801 11.560217 0 60 | -0.364009 4.747300 1 61 | -0.841722 4.119083 1 62 | 0.490426 1.960539 1 63 | -0.007194 9.075792 0 64 | 0.356107 12.447863 0 65 | 0.342578 12.281162 0 66 | -0.810823 -1.466018 1 67 | 2.530777 6.476801 1 68 | 1.296683 11.607559 0 69 | 0.475487 12.040035 0 70 | -0.783277 11.009725 0 71 | 0.074798 11.023650 0 72 | -1.337472 0.468339 1 73 | -0.102781 13.763651 0 74 | -0.147324 2.874846 1 75 | 0.518389 9.887035 0 76 | 1.015399 7.571882 0 77 | -1.658086 -0.027255 1 78 | 1.319944 2.171228 1 79 | 2.056216 5.019981 1 80 | -0.851633 4.375691 1 81 | -1.510047 6.061992 0 82 | -1.076637 -3.181888 1 83 | 1.821096 10.283990 0 84 | 3.010150 8.401766 1 85 | -1.099458 1.688274 1 86 | -0.834872 -1.733869 1 87 | -0.846637 3.849075 1 88 | 1.400102 12.628781 0 89 | 1.752842 5.468166 1 90 | 0.078557 0.059736 1 91 | 0.089392 
-0.715300 1 92 | 1.825662 12.693808 0 93 | 0.197445 9.744638 0 94 | 0.126117 0.922311 1 95 | -0.679797 1.220530 1 96 | 0.677983 2.556666 1 97 | 0.761349 10.693862 0 98 | -2.168791 0.143632 1 99 | 1.388610 9.341997 0 100 | 0.317029 14.739025 0 101 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*-coding:utf-8-*- 3 | 4 | import math 5 | import pdb 6 | 7 | class ItemBasedCF: 8 | def __init__(self,train_file): 9 | self.train_file = train_file 10 | self.readData() 11 | 12 | def readData(self): 13 | # read the file and build the user-item rating table 14 | self.train = dict() 15 | # user-item rating table 16 | for line in open(self.train_file): 17 | user,score,item = line.strip().split(",") 18 | self.train.setdefault(user,{}) 19 | self.train[user][item] = int(float(score)) 20 | 21 | def ItemSimilarity(self): 22 | # build the item-item co-occurrence matrix 23 | cooccur = dict() # item-item co-occurrence counts 24 | buy = dict() # N: how many distinct users bought each item 25 | for user,items in self.train.items(): 26 | for i in items.keys(): 27 | buy.setdefault(i,0) 28 | buy[i] += 1 29 | cooccur.setdefault(i,{}) 30 | for j in items.keys(): 31 | if i == j : continue 32 | cooccur[i].setdefault(j,0) 33 | cooccur[i][j] += 1 34 | # compute the similarity matrix: cij / sqrt(N_i * N_j) 35 | self.similar = dict() 36 | for i,related_items in cooccur.items(): 37 | self.similar.setdefault(i,{}) 38 | for j,cij in related_items.items(): 39 | self.similar[i][j] = cij / (math.sqrt(buy[i] * buy[j])) 40 | return self.similar 41 | 42 | # recommend for a user: use the top-K most similar items, return the top-N recommendations 43 | def Recommend(self,user,K=3,N=10): 44 | rank = dict() 45 | action_item = self.train[user] 46 | # the items this user has interacted with, and their scores 47 | for item,score in action_item.items(): 48 | sortedItems = sorted(self.similar[item].items(),key=lambda x:x[1],reverse=True)[0:K] 49 | for j,wj in sortedItems: 50 | if j in action_item.keys(): 51 | continue 52 | rank.setdefault(j,0) 53 | rank[j] += score * wj 54 | return dict(sorted(rank.items(),key=lambda x:x[1],reverse=True)[0:N]) 55 | 56 | # instantiate an ItemBasedCF object 57 | item = ItemBasedCF("item_book.txt") 58 | item.ItemSimilarity() 59 | recommedDict = item.Recommend("Li Si") 60 | for k,v in recommedDict.items(): 61 | print(k,"\t",v) 62 | -------------------------------------------------------------------------------- /other/DeepFM-Keras-master/README.md: -------------------------------------------------------------------------------- 1 | # DeepFM-Keras 2 | 3 | DeepFM written in Keras [1], similar to the TensorFlow version by ChenglongChen "https://github.com/ChenglongChen/tensorflow-DeepFM" 4 | 5 | Usage: 6 | --- 7 | ### load data and divide into train and test 8 | dfTrain = pd.read_csv("data/train.csv") 9 | dfTrain = dfTrain.iloc[0:int(0.7*dfTrain.shape[0]),:] 10 | dfTest = dfTrain.iloc[int(0.7*dfTrain.shape[0]):,:] 11 | 12 | 13 | global_columns = dfTrain.columns.tolist() 14 | ### divide the columns by CATEGORICAL columns 15 | ID_columns = ["ps_reg_01", "ps_reg_02", "ps_reg_03", 16 | "ps_car_12", "ps_car_13", "ps_car_14", "ps_car_15",] 17 | 18 | qid_columns = ['id'] 19 | target_columns = ['target'] 20 | 21 | 22 | Example: 23 | --- 24 | Folder example includes an example usage of DeepFM models for Porto Seguro's Safe Driver Prediction competition on Kaggle. 25 | 26 | Please download the data from the competition website and put them into the example/data folder.
27 | 28 | To train the DeepFM model for this dataset, run 29 | 30 | $ python keras_FM.py 31 | 32 | Support: 33 | --- 34 | Supports the AUC and log_loss metrics 35 | 36 | 37 | 38 | 39 | [1] DeepFM: A Factorization-Machine based Neural Network for CTR Prediction, Huifeng Guo, Ruiming Tang, Yunming Ye, Zhenguo Li, Xiuqiang He. 40 | -------------------------------------------------------------------------------- /other/DeepFM-Keras-master/data/README.md: -------------------------------------------------------------------------------- 1 | 2 | Please download the data from the [competition website](https://www.kaggle.com/c/porto-seguro-safe-driver-prediction) and put them here. 3 | -------------------------------------------------------------------------------- /other/deepfm/data/README.md: -------------------------------------------------------------------------------- 1 | A simple hand-made toy dataset, so that while debugging you can inspect the value and shape of every variable. 2 | 3 | 4 | If you want to use real data, download it from kaggle: 5 | 6 | https://www.kaggle.com/c/porto-seguro-safe-driver-prediction -------------------------------------------------------------------------------- /other/deepfm/data/test.csv: -------------------------------------------------------------------------------- 1 | id,target,feat_cat_1,feat_cat_2,feat_num_1,feat_num_2 2 | 6,0,1,2,3.1,2.2 3 | 7,0,2,3,2.1,3.1 4 | 8,1,0,2,1.0,3.4 5 | 9,1,1,1,2.1,1.6 6 | 10,0,0,0,0.5,1.8 -------------------------------------------------------------------------------- /other/deepfm/data/train.csv: -------------------------------------------------------------------------------- 1 | id,target,feat_cat_1,feat_cat_2,feat_num_1,feat_num_2 2 | 1,0,1,2,3.1,2.2 3 | 2,0,2,3,2.1,3.1 4 | 3,1,0,2,1.0,3.4 5 | 4,1,1,1,2.1,1.6 6 | 5,0,0,0,0.5,1.8 -------------------------------------------------------------------------------- /other/deepfm/广告预估CTR系列--DeepFM模型架构图--实现篇.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/deepfm/广告预估CTR系列--DeepFM模型架构图--实现篇.jpg -------------------------------------------------------------------------------- /other/svd/README.md: -------------------------------------------------------------------------------- 1 | # Recsyspy 2 | Classic recommendation algorithms implementation 3 | 4 | ## Algorithm 5 | |DNN Model |RMSE|MAE 6 | | :-------- |:--------|:-------- | 7 | |NeuMF|0.9433|0.7485 8 | 9 | |MF Model | RMSE | MAE 10 | | :-------- | :-------- | :-------- | 11 | | Baseline | 0.946|0.742 12 | | SVD|0.931|0.731| 13 | | SVDPlusPlus|0.927|0.726 14 | | Explicit ALS |1.199|0.903 15 | | Implicit ALS |2.752|2.525 16 | 17 | |Neighborhood Model |RMSE|MAE 18 | | :-------- |:--------|:-------- | 19 | |Itemcf|1.029|0.802 20 | |WeightedSlopOne|1.043|0.835| 21 | 22 | ## Example 23 | ```python 24 | import os 25 | 26 | from util.databuilder import DataBuilder 27 | from algorithm.dnn.neumf import NeuMF 28 | 29 | file_name = os.path.abspath("data/ml-100k/u.data") 30 | data_builder = DataBuilder(file_name, just_test_one=True) 31 | 32 | 33 | data_builder.eval(NeuMF(epochs=2), k_folds=5) 34 | ``` 35 | 36 | 37 | ## Dataset 38 | * MovieLens 39 | 40 | ## Papers 41 | ### Dnn Algorithm 42 | * Neural Collaborative Filtering 43 | 44 | ### MF Algorithm 45 | * Yehuda Koren.
Factorization meets the neighborhood: a multifaceted collaborative filtering model 46 | * Matrix factorization techniques for recommender systems 47 | * Advances in Collaborative Filtering 48 | 49 | ### Neighborhood Algorithm 50 | * Slope one predictors for online rating-based collaborative filtering 51 | 52 | 53 | -------------------------------------------------------------------------------- /other/svd/algorithm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/__init__.py -------------------------------------------------------------------------------- /other/svd/algorithm/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/__pycache__/estimator.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/__pycache__/estimator.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/dnn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/dnn/__init__.py -------------------------------------------------------------------------------- /other/svd/algorithm/dnn/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/dnn/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/dnn/__pycache__/neumf.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/dnn/__pycache__/neumf.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/dnn/neumf.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """NeuMF model 3 | Paper: Neural Collaborative Filtering 4 | Apply dnn on MF 5 | """ 6 | 7 | from __future__ import division, print_function 8 | 9 | import numpy as np 10 | from algorithm.estimator import Estimator 11 | from keras.layers import Input, Embedding, Dense, Flatten,\ 12 | BatchNormalization, Dropout 13 | from keras.layers import multiply, concatenate 14 | from keras.models import Model 15 | 16 | class NeuMF(Estimator): 17 | """ 18 | mf_dim: Integer. 19 | MF dimension. 20 | mlp_dim: Integer. 
21 | MLP dimension 22 | epochs: Integer 23 | Number of epochs to train the model 24 | """ 25 | 26 | def __init__(self, mf_dim=12, mlp_dim=12, epochs=2): 27 | self.mf_dim = mf_dim 28 | self.mlp_dim = mlp_dim 29 | self.epochs = epochs 30 | 31 | def transform(self, dateset): 32 | X = {} 33 | u, i, r = dateset.all_ratings(axis=0) 34 | X['user_idx'] = u.reshape(-1, 1) 35 | X['item_idx'] = i.reshape(-1, 1) 36 | 37 | y = r.reshape(-1, 1) 38 | 39 | return X, y 40 | 41 | def get_neumf_model(self, user_num, item_num): 42 | user_input = Input(shape=[1], name="user_idx") 43 | item_input = Input(shape=[1], name="item_idx") 44 | 45 | mf_embedding_user = Embedding(user_num, self.mf_dim)(user_input) 46 | mf_embedding_item = Embedding(item_num, self.mf_dim)(item_input) 47 | 48 | gmf_layer = multiply([mf_embedding_user, mf_embedding_item]) 49 | 50 | mlp_embedding_user = Embedding(user_num, self.mlp_dim)(user_input) 51 | mlp_embedding_item = Embedding(item_num, self.mlp_dim)(item_input) 52 | 53 | mlp_layer = concatenate([mlp_embedding_user, mlp_embedding_item]) 54 | 55 | mlp_layer = BatchNormalization()(mlp_layer) 56 | mlp_layer = Dense(32)(mlp_layer) 57 | mlp_layer = Dense(16)(mlp_layer) 58 | mlp_layer = Dense(8)(mlp_layer) 59 | mlp_layer = Dense(4)(mlp_layer) 60 | mlp_layer = Dropout(0.5)(mlp_layer) 61 | 62 | neumf_layer = concatenate([gmf_layer, mlp_layer]) 63 | neumf_layer = Flatten()(neumf_layer) 64 | pred = Dense(1)(neumf_layer) 65 | 66 | model = Model(inputs=[user_input, item_input], outputs=pred) 67 | model.compile(optimizer='adam', loss='mse') 68 | 69 | return model 70 | 71 | 72 | def _train(self): 73 | user_num = self.train_dataset.matrix.shape[0] 74 | item_num = self.train_dataset.matrix.shape[1] 75 | X_train, y_train = self.transform(self.train_dataset) 76 | 77 | self.neumf_model = self.get_neumf_model(user_num, item_num) 78 | self.neumf_model.fit(X_train, y_train, epochs=self.epochs) 79 | 80 | def predict(self, u, i): 81 | #not batch but single pred 82 | X = {} 83 | X['user_idx'] = np.array([u]) 84 | X['item_idx'] = np.array([i]) 85 | 86 | return self.neumf_model.predict(X)[0, 0] -------------------------------------------------------------------------------- /other/svd/algorithm/estimator.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division, print_function 4 | 5 | import numpy as np 6 | import util.tools as tl 7 | import util.measure as ms 8 | 9 | 10 | class Estimator(object): 11 | """Basic Estimator 12 | """ 13 | 14 | def __init__(self): 15 | pass 16 | 17 | def train(self, train_dataset): 18 | self.train_dataset = train_dataset 19 | 20 | with tl.Timer() as t: 21 | self._train() 22 | 23 | print("{} algorithm train process cost {:.3f} sec". 24 | format(self.__class__.__name__, t.interval)) 25 | 26 | def _train(self): 27 | raise NotImplementedError() 28 | 29 | def predict(self, u, i): 30 | raise NotImplementedError() 31 | 32 | def estimate(self, raw_test_dataset, measures): 33 | with tl.Timer() as t: 34 | error = self._estimate(raw_test_dataset, measures) 35 | 36 | print("{} algorithm predict process cost {:.3f} sec". 
37 | format(self.__class__.__name__, t.interval)) 38 | return error 39 | 40 | def _estimate(self, raw_test_dataset, measures): 41 | users_mean = self.train_dataset.get_user_means() 42 | items_mean = self.train_dataset.get_item_means() 43 | 44 | all = len(raw_test_dataset) 45 | errors = [] 46 | cur = 0 47 | alg_count = 0 48 | 49 | for raw_u, raw_i, r, _ in raw_test_dataset: 50 | cur += 1 51 | has_raw_u = raw_u in self.train_dataset.uid_dict 52 | has_raw_i = raw_i in self.train_dataset.iid_dict 53 | 54 | if not has_raw_u and not has_raw_i: 55 | real, est = r, self.train_dataset.global_mean 56 | elif not has_raw_u: 57 | i = self.train_dataset.iid_dict[raw_i] 58 | real, est = r, items_mean[i] 59 | elif not has_raw_i: 60 | u = self.train_dataset.uid_dict[raw_u] 61 | real, est = r, users_mean[u] 62 | else: 63 | u = self.train_dataset.uid_dict[raw_u] 64 | i = self.train_dataset.iid_dict[raw_i] 65 | real, est = r, self.predict(u, i) 66 | alg_count += 1 67 | 68 | est = min(5, est) 69 | est = max(1, est) 70 | errors.append(real - est) 71 | 72 | self.progress(cur, all, 2000) 73 | 74 | fold_eval_result = [getattr(ms, measure)(errors) for measure in measures] 75 | return fold_eval_result 76 | 77 | @staticmethod 78 | def progress(cur, all, bin=50): 79 | if cur % bin == 0 or cur == all: 80 | progress = 100 * (cur / all) 81 | print("progress: {:.2f}%".format(progress)) 82 | 83 | 84 | class IterationEstimator(Estimator): 85 | """Iterator Estimator 86 | """ 87 | 88 | def _train(self): 89 | self._prepare() 90 | for current_epoch in range(self.n_epochs): 91 | print(" processing epoch {}".format(current_epoch)) 92 | self._iteration() 93 | print(" cur train rmse {}".format(self._eval())) 94 | 95 | def _prepare(self): 96 | """ 97 | do some prepare work 98 | """ 99 | 100 | raise NotImplementedError() 101 | 102 | def _iteration(self): 103 | """ 104 | core iteration 105 | """ 106 | 107 | raise NotImplementedError() 108 | 109 | def _pred(self): 110 | """ 111 | core pred process 112 | """ 113 | 114 | raise NotImplementedError() 115 | 116 | def _eval(self): 117 | """ 118 | eval on valid dateset 119 | """ 120 | 121 | pred_ratings = self._pred() 122 | real_ratings = self.train_dataset.matrix 123 | idx = real_ratings.nonzero() 124 | bias = np.asarray(pred_ratings[idx] - real_ratings[idx]) 125 | return np.sqrt(np.sum(bias ** 2) / real_ratings.count_nonzero()) -------------------------------------------------------------------------------- /other/svd/algorithm/mf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/mf/__init__.py -------------------------------------------------------------------------------- /other/svd/algorithm/mf/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/mf/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/mf/__pycache__/baseline.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/mf/__pycache__/baseline.cpython-37.pyc -------------------------------------------------------------------------------- 
/other/svd/algorithm/mf/__pycache__/explicit_als.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/mf/__pycache__/explicit_als.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/mf/__pycache__/implicit_als.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/mf/__pycache__/implicit_als.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/mf/__pycache__/svd.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/mf/__pycache__/svd.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/mf/__pycache__/svdpp.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/mf/__pycache__/svdpp.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/mf/baseline.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division, print_function 4 | 5 | import numpy as np 6 | from algorithm.estimator import IterationEstimator 7 | 8 | 9 | class Baseline(IterationEstimator): 10 | """ 11 | 虽然是baseline,不过整体表现比itemcf和slopOne还高, 12 | 也可以看出邻居模型的弊端了,缺少优化目标 13 | 14 | 属性 15 | --------- 16 | n_factors : 隐式因子数 17 | n_epochs : 迭代次数 18 | lr : 学习速率 19 | reg : 正则因子 20 | """ 21 | 22 | def __init__(self, n_factors=20, n_epochs=20, lr=0.007, reg=.002): 23 | self.n_factors = n_factors 24 | self.n_epochs = n_epochs 25 | self.lr = lr 26 | self.reg = reg 27 | 28 | def _prepare(self): 29 | self.user_num = self.train_dataset.matrix.shape[0] 30 | self.item_num = self.train_dataset.matrix.shape[1] 31 | 32 | self.global_mean = self.train_dataset.global_mean 33 | 34 | # user bias 35 | self.bu = np.zeros(self.user_num, np.double) 36 | 37 | # item bias 38 | self.bi = np.zeros(self.item_num, np.double) 39 | 40 | def _iteration(self): 41 | for u, i, r in self.train_dataset.all_ratings(): 42 | # 预测值 43 | rp = self.global_mean + self.bu[u] + self.bi[i] 44 | # 误差 45 | e_ui = r - rp 46 | 47 | self.bu[u] += self.lr * (e_ui - self.reg * self.bu[u]) 48 | self.bi[i] += self.lr * (e_ui - self.reg * self.bi[i]) 49 | 50 | def _pred(self): 51 | return self.global_mean + np.repeat(np.asmatrix(self.bu).T, self.item_num, axis=1) \ 52 | + np.repeat(np.asmatrix(self.bi), self.user_num, axis=0) 53 | 54 | def predict(self, u, i): 55 | est = self.global_mean + self.bu[u] + self.bi[i] 56 | return est -------------------------------------------------------------------------------- /other/svd/algorithm/mf/explicit_als.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division, print_function 4 | 5 | import numpy as np 6 | import scipy.sparse as sparse 7 | from algorithm.estimator import IterationEstimator 8 
| 9 | 10 | class ExplicitALS(IterationEstimator): 11 | """ 12 | 显式交替最小二乘,算法表现一般,从它的损失函数也可以看出,是最 13 | 简单的svd。只不过ALS相比SGD速度快一点, 一般10次迭代就能收敛 14 | 15 | 属性 16 | --------- 17 | n_factors : 隐式因子数 18 | n_epochs : 迭代次数 19 | reg : 正则因子 20 | """ 21 | 22 | def __init__(self, n_factors=20, n_epochs=10, reg=0.1): 23 | self.n_factors = n_factors 24 | self.n_epochs = n_epochs 25 | self.reg = reg 26 | 27 | #交替! 28 | def alternative(self, X, Y, is_user): 29 | reg_I = self.reg * sparse.eye(self.n_factors) 30 | uids = self.train_dataset.uids if is_user else self.train_dataset.iids 31 | 32 | for u in uids: 33 | if is_user: 34 | action_idx = self.train_dataset.get_user(u)[0] 35 | else: 36 | action_idx = self.train_dataset.get_item(u)[0] 37 | Y_u = Y[action_idx] 38 | 39 | if is_user: 40 | ru = self.train_dataset.matrix.A[u, action_idx] 41 | else: 42 | ru = self.train_dataset.matrix.A[action_idx, u].T 43 | 44 | X[u] = np.linalg.solve(np.dot(np.transpose(Y_u), Y_u) + reg_I, np.dot(Y_u.T, ru)) 45 | 46 | def _prepare(self): 47 | self.user_num = self.train_dataset.matrix.shape[0] 48 | self.item_num = self.train_dataset.matrix.shape[1] 49 | self.X = np.random.normal(size=(self.user_num, self.n_factors)) 50 | self.Y = np.random.normal(size=(self.item_num, self.n_factors)) 51 | 52 | def _iteration(self): 53 | self.alternative(self.X, self.Y, True) 54 | self.alternative(self.Y, self.X, False) 55 | 56 | def _pred(self): 57 | return np.dot(self.X, self.Y.T) 58 | 59 | def predict(self, u, i): 60 | est = np.dot(self.X[u,:], self.Y[i,:]) 61 | return est -------------------------------------------------------------------------------- /other/svd/algorithm/mf/implicit_als.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division, print_function 4 | 5 | import numpy as np 6 | import scipy.sparse as sparse 7 | from scipy.sparse.linalg import spsolve 8 | from algorithm.estimator import IterationEstimator 9 | 10 | 11 | class ImplicitALS(IterationEstimator): 12 | """ 13 | 隐式交替最小二乘,果然不适合显式数据,表现很离谱 14 | 15 | 属性 16 | --------- 17 | n_factors : 隐式因子数 18 | n_epochs : 迭代次数 19 | reg : 正则因子 20 | alpha : 隐式数据评分系数 21 | """ 22 | 23 | def __init__(self, n_factors=20, n_epochs=10, reg=0.1, alpha=40): 24 | self.n_factors = n_factors 25 | self.n_epochs = n_epochs 26 | self.reg = reg 27 | self.alpha = alpha 28 | 29 | def alternative(self, X, Y, is_user): 30 | reg_I = self.reg * sparse.eye(self.n_factors) 31 | YTY = Y.T.dot(Y) 32 | I = sparse.eye(Y.shape[0]) 33 | 34 | uids = self.train_dataset.uids if is_user else self.train_dataset.iids 35 | for u in uids: 36 | if is_user: 37 | ru = self.train_dataset.matrix.A[u] 38 | else: 39 | ru = self.train_dataset.matrix.A[:, u].T 40 | 41 | CuI = sparse.diags(ru * self.alpha, 0) 42 | Cu = CuI + I 43 | 44 | pu = ru.copy() 45 | pu[ru != 0] = 1.0 46 | 47 | YT_CuI_Y = Y.T.dot(CuI).dot(Y) 48 | YT_CuI_pu = Y.T.dot(Cu).dot(sparse.csr_matrix(pu).T) 49 | 50 | X[u] = spsolve(YTY + YT_CuI_Y + reg_I, YT_CuI_pu) 51 | 52 | def _prepare(self): 53 | self.user_num = self.train_dataset.matrix.shape[0] 54 | self.item_num = self.train_dataset.matrix.shape[1] 55 | self.X = sparse.csr_matrix(np.random.normal(size=(self.user_num, self.n_factors))) 56 | self.Y = sparse.csr_matrix(np.random.normal(size=(self.item_num, self.n_factors))) 57 | 58 | def _iteration(self): 59 | self.alternative(self.X, self.Y, True) 60 | self.alternative(self.Y, self.X, False) 61 | 62 | def _pred(self): 63 | return np.dot(self.X, self.Y.T) 64 | 65 | def 
predict(self, u, i): 66 | est = self.X[u].dot(self.Y[i].T)[0,0] 67 | return est -------------------------------------------------------------------------------- /other/svd/algorithm/mf/svd.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division, print_function 4 | 5 | import numpy as np 6 | from algorithm.estimator import IterationEstimator 7 | 8 | 9 | class SVD(IterationEstimator): 10 | """ 11 | 属性 12 | --------- 13 | n_factors : 隐式因子数 14 | n_epochs : 迭代次数 15 | lr : 学习速率 16 | reg : 正则因子 17 | """ 18 | 19 | def __init__(self, n_factors=20, n_epochs=20, lr=0.007, reg=.002): 20 | self.n_factors = n_factors 21 | self.n_epochs = n_epochs 22 | self.lr = lr 23 | self.reg = reg 24 | 25 | def _prepare(self): 26 | self.train_dataset = self.train_dataset 27 | self.user_num = self.train_dataset.matrix.shape[0] 28 | self.item_num = self.train_dataset.matrix.shape[1] 29 | 30 | self.global_mean = self.train_dataset.global_mean 31 | # user bias 32 | self.bu = np.zeros(self.user_num, np.double) 33 | 34 | # item bias 35 | self.bi = np.zeros(self.item_num, np.double) 36 | 37 | # user factor 38 | self.p = np.zeros((self.user_num, self.n_factors), np.double) + .1 39 | 40 | # item factor 41 | self.q = np.zeros((self.item_num, self.n_factors), np.double) + .1 42 | 43 | def _iteration(self): 44 | for u, i, r in self.train_dataset.all_ratings(): 45 | # 预测值 46 | rp = self.global_mean + self.bu[u] + self.bi[i] + np.dot(self.q[i], self.p[u]) 47 | # 误差 48 | e_ui = r - rp 49 | 50 | self.bu[u] += self.lr * (e_ui - self.reg * self.bu[u]) 51 | self.bi[i] += self.lr * (e_ui - self.reg * self.bi[i]) 52 | self.p[u] += self.lr * (e_ui * self.q[i] - self.reg * self.p[u]) 53 | self.q[i] += self.lr * (e_ui * self.p[u] - self.reg * self.q[i]) 54 | 55 | def _pred(self): 56 | return self.global_mean + np.repeat(np.asmatrix(self.bu).T, self.item_num, axis=1) \ 57 | + np.repeat(np.asmatrix(self.bi), self.user_num, axis=0) \ 58 | + np.dot(self.p, self.q.T) 59 | 60 | def predict(self, u, i): 61 | est = self.global_mean + self.bu[u] + self.bi[i] + np.dot(self.q[i], self.p[u]) 62 | return est 63 | 64 | -------------------------------------------------------------------------------- /other/svd/algorithm/mf/svdpp.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division, print_function 4 | 5 | import numpy as np 6 | from algorithm.estimator import Estimator 7 | 8 | 9 | class SVDPlusPlus(Estimator): 10 | """ 11 | 属性 12 | --------- 13 | n_factors : 隐式因子数 14 | n_epochs : 迭代次数 15 | lr : 学习速率 16 | reg : 正则因子 17 | """ 18 | 19 | def __init__(self, n_factors=20, n_epochs=20, lr=0.007, reg=.002): 20 | self.n_factors = n_factors 21 | self.n_epochs = n_epochs 22 | self.lr = lr 23 | self.reg = reg 24 | 25 | def train(self, train_dataset): 26 | user_num = train_dataset.matrix.shape[0] 27 | item_num = train_dataset.matrix.shape[1] 28 | self.train_dataset = train_dataset 29 | 30 | #global mean 31 | self.global_mean = train_dataset.global_mean 32 | 33 | #user bias 34 | self.bu = np.zeros(user_num, np.double) 35 | 36 | #item bias 37 | self.bi = np.zeros(item_num, np.double) 38 | 39 | #user factor 40 | self.p = np.zeros((user_num, self.n_factors), np.double) + .1 41 | 42 | #item factor 43 | self.q = np.zeros((item_num, self.n_factors), np.double) + .1 44 | 45 | #item preference facotor 46 | self.y = np.zeros((item_num, self.n_factors), np.double) + .1 47 | 48 | for 
current_epoch in range(self.n_epochs): 49 | print(" processing epoch {}".format(current_epoch)) 50 | for u, i, r in train_dataset.all_ratings(): 51 | #用户u点评的item集 52 | Nu = train_dataset.get_user(u)[0] 53 | I_Nu = len(Nu) 54 | sqrt_N_u = np.sqrt(I_Nu) 55 | 56 | #基于用户u点评的item集推测u的implicit偏好 57 | y_u = np.sum(self.y[Nu], axis=0) 58 | 59 | u_impl_prf = y_u / sqrt_N_u 60 | 61 | #预测值 62 | rp = self.global_mean + self.bu[u] + self.bi[i] + np.dot(self.q[i], self.p[u] + u_impl_prf) 63 | 64 | #误差 65 | e_ui = r - rp 66 | 67 | #sgd 68 | self.bu[u] += self.lr * (e_ui - self.reg * self.bu[u]) 69 | self.bi[i] += self.lr * (e_ui - self.reg * self.bi[i]) 70 | self.p[u] += self.lr * (e_ui * self.q[i] - self.reg * self.p[u]) 71 | self.q[i] += self.lr * (e_ui * (self.p[u] + u_impl_prf) - self.reg * self.q[i]) 72 | for j in Nu: 73 | self.y[j] += self.lr * (e_ui * self.q[j] / sqrt_N_u - self.reg * self.y[j]) 74 | 75 | def predict(self, u, i): 76 | Nu = self.train_dataset.get_user(u)[0] 77 | I_Nu = len(Nu) 78 | sqrt_N_u = np.sqrt(I_Nu) 79 | y_u = np.sum(self.y[Nu], axis=0) / sqrt_N_u 80 | 81 | est = self.global_mean + self.bu[u] + self.bi[i] + np.dot(self.q[i], self.p[u] + y_u) 82 | return est 83 | 84 | 85 | -------------------------------------------------------------------------------- /other/svd/algorithm/neighborhood/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__init__.py -------------------------------------------------------------------------------- /other/svd/algorithm/neighborhood/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__init__.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/neighborhood/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/neighborhood/__pycache__/itemcf.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__pycache__/itemcf.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/neighborhood/__pycache__/slop_one.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__pycache__/slop_one.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/neighborhood/itemcf.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division, print_function 4 | 5 | import numpy as np 6 | from scipy.sparse import lil_matrix 7 | from algorithm.estimator import Estimator 8 | from algorithm.mf.baseline import 
/other/svd/algorithm/neighborhood/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__init__.py
--------------------------------------------------------------------------------
/other/svd/algorithm/neighborhood/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__init__.pyc
--------------------------------------------------------------------------------
/other/svd/algorithm/neighborhood/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/other/svd/algorithm/neighborhood/__pycache__/itemcf.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__pycache__/itemcf.cpython-37.pyc
--------------------------------------------------------------------------------
/other/svd/algorithm/neighborhood/__pycache__/slop_one.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__pycache__/slop_one.cpython-37.pyc
--------------------------------------------------------------------------------
/other/svd/algorithm/neighborhood/itemcf.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

from __future__ import division, print_function

import numpy as np
from scipy.sparse import lil_matrix
from algorithm.estimator import Estimator
from algorithm.mf.baseline import Baseline


class Itemcf(Estimator):
    """
    Attributes
    ---------
    min : minimum number of co-ratings for an item pair to count
    topk : top-k cutoff for the similarity matrix
    use_baseline : whether to embed a Baseline model for the bias term
    """

    def __init__(self, min=2, topk=20, use_baseline=True):
        self.min = min
        self.topk = topk
        self.use_baseline = use_baseline

    def compute_cosine_similarity(self, user_num, item_num, users_ratings):
        sim = lil_matrix((item_num, item_num), dtype=np.double)

        # dot products
        dot = lil_matrix((item_num, item_num), dtype=np.double)

        # sum of squares of the left vector
        sql = lil_matrix((item_num, item_num), dtype=np.double)

        # sum of squares of the right vector
        sqr = lil_matrix((item_num, item_num), dtype=np.double)

        # co-occurrence counts
        coo = lil_matrix((item_num, item_num), dtype=np.double)

        cur = 1
        for u, (ii, rr) in users_ratings:
            cur = cur + 1
            # note: only adjacent pairs in each user's (id-sorted) item list are
            # accumulated here, not every co-rated pair
            for k in range(len(ii) - 1):
                k1, k2 = k, k + 1
                i1, i2 = ii[k1], ii[k2]
                if i1 > i2:
                    i1, i2 = i2, i1
                    k1, k2 = k2, k1
                dot[i1, i2] += rr[k1] * rr[k2]
                sql[i1, i2] += rr[k1] ** 2
                sqr[i1, i2] += rr[k2] ** 2
                coo[i1, i2] += 1
            self.progress(cur, user_num, 50)

        # lil_matrix is not suited to matrix arithmetic; convert to csc format
        dot = dot.tocsc()
        sql = sql.tocsc()
        sqr = sqr.tocsc()
        coo = coo.tocsc()

        # zero out every pair whose interaction count is below the limit
        dot.data[coo.data < self.min] = 0

        # product of the left and right sums of squares
        sql.data *= sqr.data

        # only nonzero dot products need to be considered
        row, col = dot.nonzero()

        # cosine similarity matrix
        sim[row, col] = dot[row, col] / np.sqrt(sql[row, col])
        sim[col, row] = sim[row, col]

        return sim.A

    def _train(self):
        if self.use_baseline:
            self.baseline = Baseline()
            self.baseline.train(self.train_dataset)

        user_num = self.train_dataset.matrix.shape[0]
        item_num = self.train_dataset.matrix.shape[1]
        self.sim = self.compute_cosine_similarity(user_num, item_num, self.train_dataset.get_users())
        self.item_means = self.train_dataset.get_item_means()
        self.user_means = self.train_dataset.get_user_means()

    def predict(self, u, i):
        ll, rr = self.train_dataset.get_user(u)
        neighbors = [(sim_i, self.sim[i, sim_i], sim_r) for sim_i, sim_r in zip(ll, rr)]

        neighbors = sorted(neighbors, key=lambda tple: tple[1], reverse=True)[0:self.topk]
        est = self.baseline.predict(u, i) if self.use_baseline else self.item_means[i]
        sum = 0
        divisor = 0

        for sim_i, sim, sim_r in neighbors:
            if not self.use_baseline:
                bias = sim_r - self.item_means[sim_i]
            else:
                bias = sim_r - self.baseline.predict(u, sim_i)

            sum += sim * bias
            divisor += sim

        if divisor != 0:
            est += sum / divisor
        return est
--------------------------------------------------------------------------------
/other/svd/algorithm/neighborhood/itemcf.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/itemcf.pyc
--------------------------------------------------------------------------------
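For reference, the same cosine-over-co-rated-users quantity can be written densely in a few lines. The sketch below uses a made-up 3x3 rating matrix and, unlike the sparse code above, sums over every item pair rather than only adjacent ones:

import numpy as np

# toy user-item rating matrix, 0 = unrated (illustrative only)
R = np.array([[5., 3., 0.],
              [4., 0., 1.],
              [1., 2., 5.]])
mask = R > 0

n_items = R.shape[1]
sim = np.zeros((n_items, n_items))
for i in range(n_items):
    for j in range(i + 1, n_items):
        both = mask[:, i] & mask[:, j]     # users who rated both items
        if both.sum() < 2:                 # same "min interactions" cutoff
            continue
        num = R[both, i].dot(R[both, j])
        den = np.sqrt((R[both, i] ** 2).sum() * (R[both, j] ** 2).sum())
        sim[i, j] = sim[j, i] = num / den
print(sim)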
/other/svd/algorithm/neighborhood/slop_one.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

from __future__ import division, print_function

import numpy as np
from scipy.sparse import lil_matrix
from algorithm.estimator import Estimator


class SlopOne(Estimator):
    """
    Attributes
    ---------
    is_weighted : plain Slope One or weighted Slope One
    """

    def __init__(self, is_weighted=False):
        self.is_weighted = is_weighted

    def _train(self):
        item_num = self.train_dataset.matrix.shape[1]

        # per-pair co-rating counts and accumulated rating deviations
        # (int32 counter: an int8 would overflow past 127 co-ratings)
        self.freq = lil_matrix((item_num, item_num), dtype=np.int32)
        self.dev = lil_matrix((item_num, item_num), dtype=np.double)
        user_num = self.train_dataset.matrix.shape[0]
        cur = 0
        for u, (ii, rr) in self.train_dataset.get_users():
            cur += 1
            # as in Itemcf, only adjacent pairs of each user's item list are counted
            for k in range(len(ii) - 1):
                k1, k2 = k, k + 1
                i1, i2 = ii[k1], ii[k2]
                if i1 > i2:
                    i1, i2 = i2, i1
                    k1, k2 = k2, k1
                self.freq[i1, i2] += 1
                self.dev[i1, i2] += rr[k1] - rr[k2]
            self.progress(cur, user_num, 50)

        # average deviation, mirrored so that dev[j, i] = -dev[i, j]
        nonzero_indices = self.freq.nonzero()
        self.dev[nonzero_indices] /= self.freq[nonzero_indices]

        self.dev[(nonzero_indices[1], nonzero_indices[0])] = -self.dev[nonzero_indices]
        self.freq[(nonzero_indices[1], nonzero_indices[0])] = self.freq[nonzero_indices]

        self.dev = self.dev.A
        self.freq = self.freq.A
        self.user_means = self.train_dataset.get_user_means()
        self.ratings = self.train_dataset.matrix.A

    def predict(self, u, i):
        N = [j for j in self.train_dataset.get_user(u)[0] if self.freq[i, j] > 0]
        est = self.user_means[u]

        if N:
            if self.is_weighted:
                est = sum([(self.ratings[u, j] + self.dev[i, j]) * self.freq[i, j] for j in N]) / \
                      sum([self.freq[i, j] for j in N])
            else:
                est += np.mean([self.dev[i, j] for j in N])
        return est
--------------------------------------------------------------------------------
/other/svd/algorithm/neighborhood/slop_one.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/slop_one.pyc
--------------------------------------------------------------------------------
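Slope One is simple enough to check by hand. A worked toy example of the deviation and the prediction it produces (the matrix below is made up; NaN marks an unrated cell):

import numpy as np

# toy ratings, NaN = unrated (illustrative)
R = np.array([[5., 3., 2.],
              [3., 4., np.nan],
              [np.nan, 2., 5.]])

# average deviation of item 0 over item 1, from users who rated both
both = ~np.isnan(R[:, 0]) & ~np.isnan(R[:, 1])
dev_01 = np.mean(R[both, 0] - R[both, 1])   # ((5-3) + (3-4)) / 2 = 0.5

# Slope One prediction of user 2's rating for item 0, via item 1
print(R[2, 1] + dev_01)                     # 2 + 0.5 = 2.5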
/other/svd/main.py:
--------------------------------------------------------------------------------
import os

from algorithm.mf.baseline import Baseline
from util.databuilder import DataBuilder

from algorithm.dnn.neumf import NeuMF

from algorithm.mf.explicit_als import ExplicitALS
from algorithm.mf.svd import SVD
from algorithm.mf.svdpp import SVDPlusPlus
from algorithm.mf.implicit_als import ImplicitALS

from algorithm.neighborhood.slop_one import SlopOne
from algorithm.neighborhood.itemcf import Itemcf

file_name = os.path.abspath("data/ml-100k/u.data")
data_builder = DataBuilder(file_name, just_test_one=True)


data_builder.eval(NeuMF(epochs=2))

data_builder.eval(Itemcf())

data_builder.eval(SlopOne())

data_builder.eval(Baseline())

data_builder.eval(SVD())

data_builder.eval(SVDPlusPlus())

data_builder.eval(ExplicitALS())

data_builder.eval(ImplicitALS())
--------------------------------------------------------------------------------
/other/svd/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/tests/__init__.py
--------------------------------------------------------------------------------
/other/svd/tests/algorithm_test.py:
--------------------------------------------------------------------------------
import os

from algorithm.mf.baseline import Baseline
from util.databuilder import DataBuilder

from algorithm.dnn.neumf import NeuMF

from algorithm.mf.explicit_als import ExplicitALS
from algorithm.mf.svd import SVD
from algorithm.mf.svdpp import SVDPlusPlus
from algorithm.mf.implicit_als import ImplicitALS

from algorithm.neighborhood.slop_one import SlopOne
from algorithm.neighborhood.itemcf import Itemcf

file_name = os.path.abspath("data/ml-100k/u.data")
data_builder = DataBuilder(file_name, just_test_one=True)


def test_neumf():
    data_builder.eval(NeuMF(epochs=2))


def test_itemcf():
    data_builder.eval(Itemcf())


def test_slopOne():
    data_builder.eval(SlopOne())


def test_baseline():
    data_builder.eval(Baseline())


def test_svd():
    data_builder.eval(SVD())


def test_svdpp():
    data_builder.eval(SVDPlusPlus())


def test_explicit_als():
    data_builder.eval(ExplicitALS())


def test_implicit_als():
    data_builder.eval(ImplicitALS())
--------------------------------------------------------------------------------
/other/svd/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/util/__init__.py
--------------------------------------------------------------------------------
/other/svd/util/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/util/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/other/svd/util/__pycache__/databuilder.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/util/__pycache__/databuilder.cpython-37.pyc
--------------------------------------------------------------------------------
/other/svd/util/__pycache__/matrix.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/util/__pycache__/matrix.cpython-37.pyc
--------------------------------------------------------------------------------
/other/svd/util/__pycache__/measure.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/util/__pycache__/measure.cpython-37.pyc
--------------------------------------------------------------------------------
/other/svd/util/__pycache__/tools.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/util/__pycache__/tools.cpython-37.pyc
--------------------------------------------------------------------------------
/other/svd/util/databuilder.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

import itertools
import os

import numpy as np
from scipy.sparse import csr_matrix
from util.matrix import Matrix
import util.tools as tl


class DataBuilder(object):
    """
    Builds the data model.

    Parameters
    ----------
    file_name : path to the data file; the GroupLens dataset is used here
    shuffle : whether to shuffle the data
    just_test_one : k-fold cross validation runs k times; run only one fold,
                    which is convenient for checking program correctness
    """

    def __init__(self, file_name, shuffle=True, just_test_one=False):
        self.file_name = file_name
        self.shuffle = shuffle
        self.just_test_one = just_test_one

    def read_ratings(self):
        """
        Read the raw ratings.
        """

        with open(os.path.expanduser(self.file_name)) as f:
            raw_ratings = [self.parse_line(line) for line in itertools.islice(f, 0, None)]
        return raw_ratings

    def parse_line(self, line):
        line = line.split("\t")
        uid, iid, r, timestamp = (line[i].strip() for i in range(4))
        return uid, iid, float(r), timestamp

    def cv(self, k_folds):
        raw_ratings = self.read_ratings()

        if self.shuffle:
            np.random.shuffle(raw_ratings)

        stop = 0
        raw_len = len(raw_ratings)
        offset = raw_len // k_folds
        left = raw_len % k_folds
        for fold_i in range(k_folds):
            print("current fold {}".format(fold_i + 1))
            start = stop
            stop += offset
            if fold_i < left:
                stop += 1

            # yield from a generator for efficiency
            yield self.mapping(raw_ratings[:start] + raw_ratings[stop:]), raw_ratings[start:stop]

    def mapping(self, raw_train_ratings):
        uid_dict = {}
        iid_dict = {}
        current_u_index = 0
        current_i_index = 0

        row = []
        col = []
        data = []
        for urid, irid, r, timestamp in raw_train_ratings:
            try:
                uid = uid_dict[urid]
            except KeyError:
                uid = current_u_index
                uid_dict[urid] = current_u_index
                current_u_index += 1
            try:
                iid = iid_dict[irid]
            except KeyError:
                iid = current_i_index
                iid_dict[irid] = current_i_index
                current_i_index += 1

            row.append(uid)
            col.append(iid)
            data.append(r)

        sparse_matrix = csr_matrix((data, (row, col)))

        return Matrix(sparse_matrix, uid_dict, iid_dict)

    def eval(self, algorithm, measures=["rmse", "mae"], k_folds=5):
        eval_results = []

        for train_dataset, test_dataset in self.cv(k_folds):
            algorithm.train(train_dataset)
            eval_results.append(algorithm.estimate(test_dataset, measures))
            if self.just_test_one:
                break

        tl.print_pretty(measures, eval_results)
--------------------------------------------------------------------------------
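The `mapping` step above re-indexes raw string ids into a dense 0-based index space before building the sparse matrix. The same idea in isolation, on made-up triples (the ids and ratings are illustrative):

from scipy.sparse import csr_matrix

raw = [("u9", "i7", 4.0), ("u3", "i7", 5.0), ("u9", "i2", 1.0)]  # toy (user, item, rating)
uid, iid = {}, {}
# setdefault assigns the next free index the first time an id is seen
row = [uid.setdefault(u, len(uid)) for u, _, _ in raw]
col = [iid.setdefault(i, len(iid)) for _, i, _ in raw]
m = csr_matrix(([r for _, _, r in raw], (row, col)))
print(m.toarray())  # 2x2 matrix in dense index space, not raw id space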
/other/svd/util/dnn_util.py:
--------------------------------------------------------------------------------
from keras.callbacks import Callback
from sklearn.metrics import mean_squared_error
import numpy as np


class RMSEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(RMSEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0, batch_size=2000)
            score = np.sqrt(mean_squared_error(self.y_val, y_pred))
            print("\n RMSE - epoch: %d - score: %.6f \n" % (epoch + 1, score))
--------------------------------------------------------------------------------
/other/svd/util/matrix.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

from __future__ import division, print_function

import numpy as np
import itertools


class Matrix(object):

    def __init__(self, sparse_matrix, uid_dict=None, iid_dict=None):
        self.matrix = sparse_matrix.tocsc()
        self._global_mean = None
        coo_matrix = sparse_matrix.tocoo()
        self.uids = set(coo_matrix.row)
        self.iids = set(coo_matrix.col)
        self.uid_dict = uid_dict
        self.iid_dict = iid_dict

    def get_item(self, i):
        """
        (i, (us, rs))
        """

        ratings = self.matrix.getcol(i).tocoo()
        return ratings.row, ratings.data

    def get_user(self, u):
        """
        (u, (is, rs))
        """

        ratings = self.matrix.getrow(u).tocoo()
        return ratings.col, ratings.data

    def get_users(self):
        """
        iterator(u, (is, rs))
        """

        for u in self.get_uids():
            yield u, self.get_user(u)

    def get_user_means(self):
        """
        dict of each user's mean rating
        """

        users_mean = {}
        for u in self.get_uids():
            users_mean[u] = np.mean(self.get_user(u)[1])
        return users_mean

    def get_item_means(self):
        """
        dict of each item's mean rating
        """

        item_means = {}
        for i in self.get_iids():
            item_means[i] = np.mean(self.get_item(i)[1])
        return item_means

    def all_ratings(self, axis=1):
        """
        row(u, i, r)
        or
        col(u, i, r)
        """
        coo_matrix = self.matrix.tocoo()

        if axis == 1:
            return zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
        else:
            return coo_matrix.row, coo_matrix.col, coo_matrix.data

    def get_uids(self):
        """
        set of all user ids
        """

        return np.unique(self.matrix.tocoo().row)

    def get_iids(self):
        """
        set of all item ids
        """
        return np.unique(self.matrix.tocoo().col)

    def has_user(self, u):
        """
        whether user u exists
        """

        return u in self.uids

    def has_item(self, i):
        """
        whether item i exists
        """

        return i in self.iids

    @property
    def global_mean(self):
        """
        global mean rating
        """

        if self._global_mean is None:
            self._global_mean = np.sum(self.matrix.data) / self.matrix.size
        return self._global_mean
--------------------------------------------------------------------------------
/other/svd/util/measure.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

from __future__ import division, print_function

import numpy as np


def rmse(errors):
    return np.sqrt(np.mean(np.power(errors, 2)))


def mae(errors):
    return np.mean(np.abs(errors))
--------------------------------------------------------------------------------
/other/svd/util/tools.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

from __future__ import division, print_function

import time
import numpy as np


def print_pretty(measures, eval_results):
    """
    Pretty-print the evaluation results.
    """

    pad = '{:<9}' * (len(measures) + 1)

    print(pad.format('', *measures))

    keep = lambda eval_result: ['{:.4f}'.format(single_eval) \
                                for single_eval in eval_result]
    for i, eval_result in enumerate(eval_results):
        print(pad.format('fold {}'.format(i), *keep(eval_result)))
    print(pad.format('avg', *keep(np.mean(eval_results, axis=0))))


class Timer(object):
    """
    time util
    """
    def __enter__(self):
        # time.clock() was removed in Python 3.8; perf_counter is its replacement
        self.start = time.perf_counter()
        return self

    def __exit__(self, *args):
        self.end = time.perf_counter()
        self.interval = self.end - self.start
--------------------------------------------------------------------------------
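A quick numeric check of the two measures in measure.py above, on a toy error vector (values are illustrative):

import numpy as np

errors = np.array([1.0, -2.0, 0.5])            # toy prediction errors
print(np.sqrt(np.mean(np.power(errors, 2))))   # rmse: sqrt(5.25/3) = 1.3229...
print(np.mean(np.abs(errors)))                 # mae: 3.5/3 = 1.1667...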
/svd/svd.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/svd/svd.gif
--------------------------------------------------------------------------------
/svd/untitled1.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8
__author__ = 'Scarlett'
# Matrix factorization is a mature, widely used technique in rating prediction systems
# from pylab import *
import matplotlib.pyplot as plt
from math import pow
import numpy


def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    Q = Q.T  # .T is the matrix transpose
    result = []
    for step in range(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - numpy.dot(P[i, :], Q[:, j])  # numpy.dot is the inner product
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        e = 0
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - numpy.dot(P[i, :], Q[:, j]), 2)
                    for k in range(K):
                        e = e + (beta / 2) * (pow(P[i][k], 2) + pow(Q[k][j], 2))
        result.append(e)
        if e < 0.001:
            break
    return P, Q.T, result

if __name__ == '__main__':
    # R = [
    #     [5, 3, 0, 1],
    #     [4, 0, 0, 1],
    #     [1, 1, 0, 5],
    #     [1, 0, 0, 4],
    #     [0, 1, 5, 4]
    # ]

    R = [[4., 3., 0., 5., 0.],
         [5., 0., 4., 4., 0.],
         [4., 0., 5., 0., 3.],
         [2., 3., 0., 1., 0.],
         [0., 4., 2., 0., 5.]]

    R = numpy.array(R)

    N = len(R)
    M = len(R[0])
    K = 2

    P = numpy.random.rand(N, K)  # random N x K matrix
    Q = numpy.random.rand(M, K)  # random M x K matrix

    nP, nQ, result = matrix_factorization(R, P, Q, K)
    print("The original rating matrix R:\n", R)
    R_MF = numpy.dot(nP, nQ.T)
    print("The rating matrix R_MF with the zero entries filled in by MF:\n", R_MF)

    # ------------- convergence curve of the loss function ---------------

    n = len(result)
    x = range(n)
    plt.plot(x, result, color='r', linewidth=3)
    plt.title("Convergence curve")
    plt.xlabel("generation")
    plt.ylabel("loss")
    plt.show()
--------------------------------------------------------------------------------
/svd/满秩.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/svd/满秩.gif
--------------------------------------------------------------------------------
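untitled1.py above minimizes the masked squared error with a per-entry triple loop, which is easy to follow but slow. The same objective can be handled with vectorized full-gradient steps; a sketch on the same R, where the step size, iteration count, and seed are illustrative choices rather than values from the script:

import numpy as np

R = np.array([[4., 3., 0., 5., 0.],
              [5., 0., 4., 4., 0.],
              [4., 0., 5., 0., 3.],
              [2., 3., 0., 1., 0.],
              [0., 4., 2., 0., 5.]])
mask = R > 0
K, alpha, beta = 2, 0.0002, 0.02

rng = np.random.default_rng(0)
P = rng.random((R.shape[0], K))
Q = rng.random((R.shape[1], K))

for step in range(5000):
    E = mask * (R - P @ Q.T)               # errors on observed entries only
    P += alpha * (2 * E @ Q - beta * P)    # full-gradient update for P
    Q += alpha * (2 * E.T @ P - beta * Q)  # full-gradient update for Q

print(np.round(P @ Q.T, 2))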
/wide-and-deep-learning-keras/README.md:
--------------------------------------------------------------------------------
# Wide and Deep Learning implemented with Keras

            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
                    Version 2, December 2004

 Copyright (C) 2004 Sam Hocevar

 Everyone is permitted to copy and distribute verbatim or modified
 copies of this license document, and changing it is allowed as long
 as the name is changed.

            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

  0. You just DO WHAT THE FUCK YOU WANT TO.

### Requirements
* Python >= 3.6
* TensorFlow >= 1.6
* Keras >= 2.0.0

### Model Plot
![Model Plot](https://github.com/kaitolucifer/wide-and-deep-learning-keras/blob/master/model.png)
The model is based on [Heng-Tze Cheng, *et al.* Wide & Deep Learning for Recommender Systems (2016)](https://arxiv.org/abs/1606.07792)
I used the [UCI Machine Learning Repository: Adult Data Set](https://archive.ics.uci.edu/ml/datasets/adult) as example data.
There are 8 categorical features, so I put each of them into its own embedding layer.
The remaining 5 continuous features go through a single dense layer, whose output is concatenated with all of the embedding layers.
A few more dense layers follow, and just before the sigmoid layer the deep output is concatenated with the input of the logistic (wide) part.
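In Keras terms, that description amounts to concatenating the deep tower's last hidden layer with the raw crossed-feature input right before the sigmoid. A minimal sketch of just that wiring; the vocabulary size, layer widths, and the 37-dim crossed vector are made-up placeholders, not values from this repo:

```python
from keras.layers import Input, Embedding, Flatten, Dense, concatenate
from keras.models import Model

cat_inputs = [Input(shape=(1,), dtype='int32') for _ in range(8)]
embeds = [Flatten()(Embedding(100, 4)(x)) for x in cat_inputs]  # toy vocab=100
conti = Input(shape=(5,))                                       # continuous features
deep = Dense(128, activation='relu')(concatenate([Dense(32)(conti)] + embeds))
wide = Input(shape=(37,))                                       # crossed categorical features
out = Dense(1, activation='sigmoid')(concatenate([deep, wide]))
model = Model(inputs=[conti] + cat_inputs + [wide], outputs=out)
model.compile(optimizer='adam', loss='binary_crossentropy')
```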

### Best Test Set Accuracy
85.8%
--------------------------------------------------------------------------------
/wide-and-deep-learning-keras/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/wide-and-deep-learning-keras/model.png
--------------------------------------------------------------------------------
/wide-and-deep-learning-keras/wide_and_deep.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/wide-and-deep-learning-keras/wide_and_deep.h5
--------------------------------------------------------------------------------
/wide-and-deep-learning-keras/wide_and_deep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/wide-and-deep-learning-keras/wide_and_deep.png
--------------------------------------------------------------------------------
/wide-and-deep-learning-keras/wide_and_deep.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from keras.layers import Input, Embedding, Dense, Flatten, Dropout, SpatialDropout1D, Activation, concatenate
from keras.optimizers import Adam, SGD
from keras.layers.advanced_activations import ReLU, PReLU, LeakyReLU, ELU
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Model
from tensorflow.keras.utils import plot_model


COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num", "marital_status",
    "occupation", "relationship", "race", "gender", "capital_gain", "capital_loss",
    "hours_per_week", "native_country", "income_bracket"
]

LABEL_COLUMN = "label"

CATEGORICAL_COLUMNS = [
    "workclass", "education", "marital_status", "occupation", "relationship",
    "race", "gender", "native_country"
]

CONTINUOUS_COLUMNS = [
    "age", "education_num", "capital_gain", "capital_loss", "hours_per_week"
]


def preprocessing():
    train_data = pd.read_csv('./adult.data', names=COLUMNS)
    train_data = train_data.dropna(how='any', axis=0)  # dropna returns a copy; assign it back
    test_data = pd.read_csv('./adult.test', skiprows=1, names=COLUMNS)
    test_data = test_data.dropna(how='any', axis=0)
    all_data = pd.concat([train_data, test_data])
    # binarize the label
    all_data[LABEL_COLUMN] = all_data['income_bracket'].apply(lambda x: ">50K" in x).astype(int)
    all_data.pop('income_bracket')
    y = all_data[LABEL_COLUMN].values
    all_data.pop(LABEL_COLUMN)
    for c in CATEGORICAL_COLUMNS:
        le = LabelEncoder()
        all_data[c] = le.fit_transform(all_data[c])
    train_size = len(train_data)
    x_train = all_data.iloc[:train_size]
    y_train = y[:train_size]
    x_test = all_data.iloc[train_size:]
    y_test = y[train_size:]
    x_train_categ = np.array(x_train[CATEGORICAL_COLUMNS])  # categorical data
    x_test_categ = np.array(x_test[CATEGORICAL_COLUMNS])
    x_train_conti = np.array(x_train[CONTINUOUS_COLUMNS], dtype='float64')  # continuous data
    x_test_conti = np.array(x_test[CONTINUOUS_COLUMNS], dtype='float64')
    scaler = StandardScaler()
    x_train_conti = scaler.fit_transform(x_train_conti)  # standardize with the training set's mean and std
    x_test_conti = scaler.transform(x_test_conti)
    return [x_train, y_train, x_test, y_test, x_train_categ, x_test_categ, x_train_conti, x_test_conti, all_data]


class Wide_and_Deep:
    def __init__(self, mode='wide and deep'):
        self.mode = mode
        x_train, y_train, x_test, y_test, x_train_categ, x_test_categ, x_train_conti, x_test_conti, all_data \
            = preprocessing()
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.x_train_categ = x_train_categ  # categorical data of the training set
        self.x_test_categ = x_test_categ  # categorical data of the test set
        self.x_train_conti = x_train_conti  # continuous data of the training set
        self.x_test_conti = x_test_conti  # continuous data of the test set
        self.all_data = all_data
        self.poly = PolynomialFeatures(degree=2, interaction_only=True)
        # cross-product transformation of the categorical data
        self.x_train_categ_poly = self.poly.fit_transform(x_train_categ)
        self.x_test_categ_poly = self.poly.transform(x_test_categ)
        self.categ_inputs = None
        self.conti_input = None
        self.deep_component_outlayer = None
        self.logistic_input = None
        self.model = None

    def deep_component(self):
        categ_inputs = []
        categ_embeds = []
        # one Input layer and one Embedding layer per categorical feature
        for i in range(len(CATEGORICAL_COLUMNS)):
            input_i = Input(shape=(1,), dtype='int32')
            dim = len(np.unique(self.all_data[CATEGORICAL_COLUMNS[i]]))
            embed_dim = int(np.ceil(dim ** 0.25))  # embedding size = 4th root of the category count
            embed_i = Embedding(dim, embed_dim, input_length=1)(input_i)
            flatten_i = Flatten()(embed_i)
            categ_inputs.append(input_i)
            categ_embeds.append(flatten_i)
        # the continuous features enter together through one dense layer
        conti_input = Input(shape=(len(CONTINUOUS_COLUMNS),))
        conti_dense = Dense(256, use_bias=False)(conti_input)
        # concatenate the dense output with every embedding output
        concat_embeds = concatenate([conti_dense] + categ_embeds)
        concat_embeds = Activation('relu')(concat_embeds)
        bn_concat = BatchNormalization()(concat_embeds)
        # stack three more fully connected layers
        fc1 = Dense(512, use_bias=False)(bn_concat)
        ac1 = ReLU()(fc1)
        bn1 = BatchNormalization()(ac1)
        fc2 = Dense(256, use_bias=False)(bn1)
        ac2 = ReLU()(fc2)
        bn2 = BatchNormalization()(ac2)
        fc3 = Dense(128)(bn2)
        ac3 = ReLU()(fc3)

        # keep the input layers and the last layer as members (used to build the model)
        self.categ_inputs = categ_inputs
        self.conti_input = conti_input
        self.deep_component_outlayer = ac3

    def wide_component(self):
        # only the categorical data goes into the linear model
        dim = self.x_train_categ_poly.shape[1]
        self.logistic_input = Input(shape=(dim,))

    def create_model(self):
        self.deep_component()
        self.wide_component()
        if self.mode == 'wide and deep':
            out_layer = concatenate([self.deep_component_outlayer, self.logistic_input])
            inputs = [self.conti_input] + self.categ_inputs + [self.logistic_input]
        elif self.mode == 'deep':
            out_layer = self.deep_component_outlayer
            inputs = [self.conti_input] + self.categ_inputs
        else:
            print('wrong mode')
            return

        output = Dense(1, activation='sigmoid')(out_layer)
        self.model = Model(inputs=inputs, outputs=output)

    def train_model(self, epochs=15, optimizer='adam', batch_size=128):
        if not self.model:
            print('You have to create model first')
            return

        if self.mode == 'wide and deep':
            input_data = [self.x_train_conti] +\
                         [self.x_train_categ[:, i] for i in range(self.x_train_categ.shape[1])] +\
                         [self.x_train_categ_poly]
        elif self.mode == 'deep':
            input_data = [self.x_train_conti] +\
                         [self.x_train_categ[:, i] for i in range(self.x_train_categ.shape[1])]
        else:
            print('wrong mode')
            return

        self.model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
        self.model.fit(input_data, self.y_train, epochs=epochs, batch_size=batch_size)

    def evaluate_model(self):
        if not self.model:
            print('You have to create model first')
            return

        if self.mode == 'wide and deep':
            input_data = [self.x_test_conti] +\
                         [self.x_test_categ[:, i] for i in range(self.x_test_categ.shape[1])] +\
                         [self.x_test_categ_poly]
        elif self.mode == 'deep':
            input_data = [self.x_test_conti] +\
                         [self.x_test_categ[:, i] for i in range(self.x_test_categ.shape[1])]
        else:
            print('wrong mode')
            return

        loss, acc = self.model.evaluate(input_data, self.y_test)
        print(f'test_loss: {loss} - test_acc: {acc}')

    def save_model(self, filename='wide_and_deep.h5'):
        self.model.save(filename)


if __name__ == '__main__':
    wide_deep_net = Wide_and_Deep()
    wide_deep_net.create_model()
    wide_deep_net.train_model()
    wide_deep_net.evaluate_model()
    wide_deep_net.save_model()
    plot_model(wide_deep_net.model, to_file='model.png', show_shapes=True, show_layer_names=False)
--------------------------------------------------------------------------------
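For comparison with the 85.8% wide-and-deep figure in the README, the script also supports a deep-only ablation through its mode switch; a short usage sketch, assuming the Wide_and_Deep class defined above:

# deep-only baseline, reusing the class above (illustrative usage)
deep_net = Wide_and_Deep(mode='deep')
deep_net.create_model()
deep_net.train_model(epochs=15)
deep_net.evaluate_model()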