├── Factorization_Machine
│   ├── FM.py
│   ├── all.wmf
│   ├── data
│   │   ├── diabetes_test.txt
│   │   └── diabetes_train.txt
│   ├── rr.gif
│   └── rr2.gif
├── Inverted_index
│   └── invert_indexx.py
├── README.md
├── collaborative_filtering
│   ├── ItemCF
│   │   ├── item_book.txt
│   │   └── main.py
│   └── UserCF
│       ├── cf.gif
│       └── cf.py
├── ctr_fm_ffm
│   ├── FFM.py
│   ├── FM.py
│   └── LR.py
├── deepfm_recomend
│   ├── __pycache__
│   │   ├── deepfm.cpython-38.pyc
│   │   ├── feature.cpython-38.pyc
│   │   ├── feature_column.cpython-38.pyc
│   │   ├── inputs.cpython-38.pyc
│   │   └── inputs1.cpython-38.pyc
│   ├── activation.py
│   ├── core.py
│   ├── criteo_sample.txt
│   ├── deepfm.png
│   ├── deepfm_main.py
│   ├── feature.py
│   ├── inputs.py
│   ├── inputs1.py
│   ├── interaction.py
│   ├── layers
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-38.pyc
│   │   │   ├── activation.cpython-38.pyc
│   │   │   ├── core.cpython-38.pyc
│   │   │   ├── interaction.cpython-38.pyc
│   │   │   ├── normalization.cpython-38.pyc
│   │   │   ├── sequence.cpython-38.pyc
│   │   │   └── utils.cpython-38.pyc
│   │   ├── activation.py
│   │   ├── contrib
│   │   │   ├── __init__.py
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-38.pyc
│   │   │   │   ├── rnn_v2.cpython-38.pyc
│   │   │   │   └── utils.cpython-38.pyc
│   │   │   ├── rnn.py
│   │   │   ├── rnn_v2.py
│   │   │   └── utils.py
│   │   ├── core.py
│   │   ├── interaction.py
│   │   ├── normalization.py
│   │   ├── sequence.py
│   │   ├── untitled17.py
│   │   └── utils.py
│   ├── run_classification_criteo.py
│   ├── temp
│   │   └── deepfm.py
│   ├── xdeepfm.png
│   └── xdeepfm_main.py
├── ffm
│   ├── .gitignore
│   ├── ffm.py
│   ├── ffm_test.py
│   ├── logistic.py
│   ├── readme.md
│   ├── singleton.py
│   ├── train.txt
│   └── valid.txt
├── gbdt_source
│   ├── GBDTReg.py
│   ├── README.txt
│   ├── __pycache__
│   │   └── GBDTReg.cpython-37.pyc
│   ├── gbdt_demo.py
│   └── testGBDT.py
├── item_book.txt
├── logstic
│   ├── lf1000.gif
│   ├── logstic.py
│   └── testSet.txt
├── main.py
├── other
│   ├── DeepFM-Keras-master
│   │   ├── README.md
│   │   ├── data
│   │   │   └── README.md
│   │   └── keras_FM.py
│   ├── deep_and_wide_keras
│   │   └── wide_and_deep_keras.py
│   ├── deepfm
│   │   ├── DeepFM.py
│   │   ├── data
│   │   │   ├── README.md
│   │   │   ├── test.csv
│   │   │   └── train.csv
│   │   └── 广告预估CTR系列--DeepFM模型架构图--实现篇.jpg
│   └── svd
│       ├── README.md
│       ├── algorithm
│       │   ├── __init__.py
│       │   ├── __pycache__
│       │   │   ├── __init__.cpython-37.pyc
│       │   │   └── estimator.cpython-37.pyc
│       │   ├── dnn
│       │   │   ├── __init__.py
│       │   │   ├── __pycache__
│       │   │   │   ├── __init__.cpython-37.pyc
│       │   │   │   └── neumf.cpython-37.pyc
│       │   │   └── neumf.py
│       │   ├── estimator.py
│       │   ├── mf
│       │   │   ├── __init__.py
│       │   │   ├── __pycache__
│       │   │   │   ├── __init__.cpython-37.pyc
│       │   │   │   ├── baseline.cpython-37.pyc
│       │   │   │   ├── explicit_als.cpython-37.pyc
│       │   │   │   ├── implicit_als.cpython-37.pyc
│       │   │   │   ├── svd.cpython-37.pyc
│       │   │   │   └── svdpp.cpython-37.pyc
│       │   │   ├── baseline.py
│       │   │   ├── explicit_als.py
│       │   │   ├── implicit_als.py
│       │   │   ├── svd.py
│       │   │   └── svdpp.py
│       │   └── neighborhood
│       │       ├── __init__.py
│       │       ├── __init__.pyc
│       │       ├── __pycache__
│       │       │   ├── __init__.cpython-37.pyc
│       │       │   ├── itemcf.cpython-37.pyc
│       │       │   └── slop_one.cpython-37.pyc
│       │       ├── itemcf.py
│       │       ├── itemcf.pyc
│       │       ├── slop_one.py
│       │       └── slop_one.pyc
│       ├── data
│       │   └── ml-100k
│       │       └── u.data
│       ├── main.py
│       ├── tests
│       │   ├── __init__.py
│       │   └── algorithm_test.py
│       └── util
│           ├── __init__.py
│           ├── __pycache__
│           │   ├── __init__.cpython-37.pyc
│           │   ├── databuilder.cpython-37.pyc
│           │   ├── matrix.cpython-37.pyc
│           │   ├── measure.cpython-37.pyc
│           │   └── tools.cpython-37.pyc
│           ├── databuilder.py
│           ├── dnn_util.py
│           ├── matrix.py
│           ├── measure.py
│           └── tools.py
├── svd
│   ├── svd.gif
│   ├── untitled1.py
│   └── 满秩.gif
└── wide-and-deep-learning-keras
    ├── README.md
    ├── adult.data
    ├── adult.test
    ├── model.png
    ├── wide_and_deep.h5
    ├── wide_and_deep.png
    └── wide_and_deep.py

/Factorization_Machine/FM.py:
--------------------------------------------------------------------------------
# coding:UTF-8

from math import exp
from numpy import shape, zeros, mat
from datetime import datetime
import pandas as pd
import numpy as np

trainData = 'data/diabetes_train.txt'  # change to your own file path
testData = 'data/diabetes_test.txt'

def preprocessData(data):
    feature = np.array(data.iloc[:, :-1])  # features
    label = data.iloc[:, -1].map(lambda x: 1 if x == 1 else -1)  # labels, mapped to +1/-1
    # min-max normalise each feature column
    zmax, zmin = feature.max(axis=0), feature.min(axis=0)
    feature = (feature - zmin) / (zmax - zmin)
    label = np.array(label)

    return feature, label

def sigmoid(inx):
    # clip the argument to avoid overflow in exp() for large |inx|
    return 1.0 / (1.0 + exp(-max(min(inx, 15.0), -15.0)))

def SGD_FM(dataMatrix, classLabels, k, iter):
    '''
    :param dataMatrix: feature matrix (numpy mat)
    :param classLabels: labels in {+1, -1}
    :param k: size of the auxiliary (latent) vectors
    :param iter: number of iterations
    :return: w_0, w, v
    '''
    m, n = shape(dataMatrix)  # number of samples and number of features
    alpha = 0.01
    # initialise the parameters
    w = zeros((n, 1))  # first-order feature weights
    w_0 = 0.
    # latent vectors for the second-order cross terms; the original used
    # normalvariate(0, 0.2) * ones((n, k)), which makes every entry identical
    # and the factors degenerate -- independent draws keep them distinct
    v = np.random.normal(0, 0.2, (n, k))

    for it in range(iter):
        for x in range(m):  # stochastic optimisation: one sample at a time
            xx = np.array(dataMatrix[x])
            # interaction term via 0.5 * (sum of (x x^T) ∘ (v v^T) minus its trace)
            e = (xx.T @ xx) * (v @ v.T)
            interaction = 0.5 * (e.sum() - e.trace())
            p = w_0 + xx @ w + interaction
            # gradient coefficient of the logit loss: 1 - sigmoid(y * p)
            loss = 1 - sigmoid(classLabels[x] * p[0, 0])
            w_0 = w_0 + alpha * loss * classLabels[x]

            for i in range(n):
                if dataMatrix[x, i] != 0:
                    w[i, 0] = w[i, 0] + alpha * loss * classLabels[x] * xx[0, i]
                    for j in range(k):
                        # <x, v_:,j> appears in the gradient of the cross term
                        dot = float(xx @ v[:, j])
                        v[i, j] = v[i, j] + alpha * loss * classLabels[x] * xx[0, i] * (dot - v[i, j] * xx[0, i])
        print("iteration {}: last-sample residual 1 - sigmoid(y*p) = {}".format(it, loss))

    return w_0, w, v


def getAccuracy(dataMatrix, classLabels, w_0, w, v):
    # returns the classification *error rate* on (dataMatrix, classLabels)
    m, n = shape(dataMatrix)
    allItem = 0
    error = 0
    result = []
    for x in range(m):  # score every sample
        allItem += 1
        xx = np.array(dataMatrix[x])
        e = (xx.T @ xx) * (v @ v.T)
        interaction = 0.5 * (e.sum() - e.trace())
        p = w_0 + xx @ w + interaction

        pre = sigmoid(p[0, 0])
        result.append(pre)

        if pre < 0.5 and classLabels[x] == 1.0:
            error += 1
        elif pre >= 0.5 and classLabels[x] == -1.0:
            error += 1
        else:
            continue

    return float(error) / allItem


if __name__ == '__main__':
    train = pd.read_csv(trainData)
    test = pd.read_csv(testData)
    dataTrain, labelTrain = preprocessData(train)
    dataTest, labelTest = preprocessData(test)
    date_startTrain = datetime.now()
    print("start training")
    w_0, w, v = SGD_FM(mat(dataTrain), labelTrain, 20, 30)
    print("training accuracy: %f" % (1 - getAccuracy(mat(dataTrain), labelTrain, w_0, w, v)))
    date_endTrain = datetime.now()
    print("training time: %s" % (date_endTrain - date_startTrain))
    print("start testing")
    print("test accuracy: %f" % (1 - getAccuracy(mat(dataTest), labelTest, w_0, w, v)))
--------------------------------------------------------------------------------
/Factorization_Machine/all.wmf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/Factorization_Machine/all.wmf -------------------------------------------------------------------------------- /Factorization_Machine/data/diabetes_test.txt: -------------------------------------------------------------------------------- 1 | 2,117,90,19,71,25.2,0.313,21,0 2 | 3,84,72,32,0,37.2,0.267,28,0 3 | 6,0,68,41,0,39.0,0.727,41,1 4 | 7,94,64,25,79,33.3,0.738,41,0 5 | 3,96,78,39,0,37.3,0.238,40,0 6 | 10,75,82,0,0,33.3,0.263,38,0 7 | 0,180,90,26,90,36.5,0.314,35,1 8 | 1,130,60,23,170,28.6,0.692,21,0 9 | 2,84,50,23,76,30.4,0.968,21,0 10 | 8,120,78,0,0,25.0,0.409,64,0 11 | 12,84,72,31,0,29.7,0.297,46,1 12 | 0,139,62,17,210,22.1,0.207,21,0 13 | 9,91,68,0,0,24.2,0.200,58,0 14 | 2,91,62,0,0,27.3,0.525,22,0 15 | 3,99,54,19,86,25.6,0.154,24,0 16 | 3,163,70,18,105,31.6,0.268,28,1 17 | 9,145,88,34,165,30.3,0.771,53,1 18 | 7,125,86,0,0,37.6,0.304,51,0 19 | 13,76,60,0,0,32.8,0.180,41,0 20 | 6,129,90,7,326,19.6,0.582,60,0 21 | 2,68,70,32,66,25.0,0.187,25,0 22 | 3,124,80,33,130,33.2,0.305,26,0 23 | 6,114,0,0,0,0.0,0.189,26,0 24 | 9,130,70,0,0,34.2,0.652,45,1 25 | 3,125,58,0,0,31.6,0.151,24,0 26 | 3,87,60,18,0,21.8,0.444,21,0 27 | 1,97,64,19,82,18.2,0.299,21,0 28 | 3,116,74,15,105,26.3,0.107,24,0 29 | 0,117,66,31,188,30.8,0.493,22,0 30 | 0,111,65,0,0,24.6,0.660,31,0 31 | 2,122,60,18,106,29.8,0.717,22,0 32 | 0,107,76,0,0,45.3,0.686,24,0 33 | 1,86,66,52,65,41.3,0.917,29,0 34 | 6,91,0,0,0,29.8,0.501,31,0 35 | 1,77,56,30,56,33.3,1.251,24,0 36 | 4,132,0,0,0,32.9,0.302,23,1 37 | 0,105,90,0,0,29.6,0.197,46,0 38 | 0,57,60,0,0,21.7,0.735,67,0 39 | 0,127,80,37,210,36.3,0.804,23,0 40 | 3,129,92,49,155,36.4,0.968,32,1 41 | 8,100,74,40,215,39.4,0.661,43,1 42 | 3,128,72,25,190,32.4,0.549,27,1 43 | 10,90,85,32,0,34.9,0.825,56,1 44 | 4,84,90,23,56,39.5,0.159,25,0 45 | 1,88,78,29,76,32.0,0.365,29,0 46 | 8,186,90,35,225,34.5,0.423,37,1 47 | 5,187,76,27,207,43.6,1.034,53,1 48 | 4,131,68,21,166,33.1,0.160,28,0 49 | 1,164,82,43,67,32.8,0.341,50,0 50 | 4,189,110,31,0,28.5,0.680,37,0 51 | 1,116,70,28,0,27.4,0.204,21,0 52 | 3,84,68,30,106,31.9,0.591,25,0 53 | 6,114,88,0,0,27.8,0.247,66,0 54 | 1,88,62,24,44,29.9,0.422,23,0 55 | 1,84,64,23,115,36.9,0.471,28,0 56 | 7,124,70,33,215,25.5,0.161,37,0 57 | 1,97,70,40,0,38.1,0.218,30,0 58 | 8,110,76,0,0,27.8,0.237,58,0 59 | 11,103,68,40,0,46.2,0.126,42,0 60 | 11,85,74,0,0,30.1,0.300,35,0 61 | 6,125,76,0,0,33.8,0.121,54,1 62 | 0,198,66,32,274,41.3,0.502,28,1 63 | 1,87,68,34,77,37.6,0.401,24,0 64 | 6,99,60,19,54,26.9,0.497,32,0 65 | 0,91,80,0,0,32.4,0.601,27,0 66 | 2,95,54,14,88,26.1,0.748,22,0 67 | 1,99,72,30,18,38.6,0.412,21,0 68 | 6,92,62,32,126,32.0,0.085,46,0 69 | 4,154,72,29,126,31.3,0.338,37,0 70 | 0,121,66,30,165,34.3,0.203,33,1 71 | 3,78,70,0,0,32.5,0.270,39,0 72 | 2,130,96,0,0,22.6,0.268,21,0 73 | 3,111,58,31,44,29.5,0.430,22,0 74 | 2,98,60,17,120,34.7,0.198,22,0 75 | 1,143,86,30,330,30.1,0.892,23,0 76 | 1,119,44,47,63,35.5,0.280,25,0 77 | 6,108,44,20,130,24.0,0.813,35,0 78 | 2,118,80,0,0,42.9,0.693,21,1 79 | 10,133,68,0,0,27.0,0.245,36,0 80 | 2,197,70,99,0,34.7,0.575,62,1 81 | 0,151,90,46,0,42.1,0.371,21,1 82 | 6,109,60,27,0,25.0,0.206,27,0 83 | 12,121,78,17,0,26.5,0.259,62,0 84 | 8,100,76,0,0,38.7,0.190,42,0 85 | 8,124,76,24,600,28.7,0.687,52,1 86 | 1,93,56,11,0,22.5,0.417,22,0 87 | 8,143,66,0,0,34.9,0.129,41,1 88 | 6,103,66,0,0,24.3,0.249,29,0 89 | 3,176,86,27,156,33.3,1.154,52,1 90 | 0,73,0,0,0,21.1,0.342,25,0 91 | 11,111,84,40,0,46.8,0.925,45,1 92 | 
2,112,78,50,140,39.4,0.175,24,0 93 | 3,132,80,0,0,34.4,0.402,44,1 94 | 2,82,52,22,115,28.5,1.699,25,0 95 | 6,123,72,45,230,33.6,0.733,34,0 96 | 0,188,82,14,185,32.0,0.682,22,1 97 | 0,67,76,0,0,45.3,0.194,46,0 98 | 1,89,24,19,25,27.8,0.559,21,0 99 | 1,173,74,0,0,36.8,0.088,38,1 100 | 1,109,38,18,120,23.1,0.407,26,0 101 | 1,108,88,19,0,27.1,0.400,24,0 102 | 6,96,0,0,0,23.7,0.190,28,0 103 | 1,124,74,36,0,27.8,0.100,30,0 104 | 7,150,78,29,126,35.2,0.692,54,1 105 | 4,183,0,0,0,28.4,0.212,36,1 106 | 1,124,60,32,0,35.8,0.514,21,0 107 | 1,181,78,42,293,40.0,1.258,22,1 108 | 1,92,62,25,41,19.5,0.482,25,0 109 | 0,152,82,39,272,41.5,0.270,27,0 110 | 1,111,62,13,182,24.0,0.138,23,0 111 | 3,106,54,21,158,30.9,0.292,24,0 112 | 3,174,58,22,194,32.9,0.593,36,1 113 | 7,168,88,42,321,38.2,0.787,40,1 114 | 6,105,80,28,0,32.5,0.878,26,0 115 | 11,138,74,26,144,36.1,0.557,50,1 116 | 3,106,72,0,0,25.8,0.207,27,0 117 | 6,117,96,0,0,28.7,0.157,30,0 118 | 2,68,62,13,15,20.1,0.257,23,0 119 | 9,112,82,24,0,28.2,1.282,50,1 120 | 0,119,0,0,0,32.4,0.141,24,1 121 | 2,112,86,42,160,38.4,0.246,28,0 122 | 2,92,76,20,0,24.2,1.698,28,0 123 | 6,183,94,0,0,40.8,1.461,45,0 124 | 0,94,70,27,115,43.5,0.347,21,0 125 | 2,108,64,0,0,30.8,0.158,21,0 126 | 4,90,88,47,54,37.7,0.362,29,0 127 | 0,125,68,0,0,24.7,0.206,21,0 128 | 0,132,78,0,0,32.4,0.393,21,0 129 | 5,128,80,0,0,34.6,0.144,45,0 130 | 4,94,65,22,0,24.7,0.148,21,0 131 | 7,114,64,0,0,27.4,0.732,34,1 132 | 0,102,78,40,90,34.5,0.238,24,0 133 | 2,111,60,0,0,26.2,0.343,23,0 134 | 1,128,82,17,183,27.5,0.115,22,0 135 | 10,92,62,0,0,25.9,0.167,31,0 136 | 13,104,72,0,0,31.2,0.465,38,1 137 | 5,104,74,0,0,28.8,0.153,48,0 138 | 2,94,76,18,66,31.6,0.649,23,0 139 | 7,97,76,32,91,40.9,0.871,32,1 140 | 1,100,74,12,46,19.5,0.149,28,0 141 | 0,102,86,17,105,29.3,0.695,27,0 142 | 4,128,70,0,0,34.3,0.303,24,0 143 | 6,147,80,0,0,29.5,0.178,50,1 144 | 4,90,0,0,0,28.0,0.610,31,0 145 | 3,103,72,30,152,27.6,0.730,27,0 146 | 2,157,74,35,440,39.4,0.134,30,0 147 | 1,167,74,17,144,23.4,0.447,33,1 148 | 0,179,50,36,159,37.8,0.455,22,1 149 | 11,136,84,35,130,28.3,0.260,42,1 150 | 0,107,60,25,0,26.4,0.133,23,0 151 | 1,91,54,25,100,25.2,0.234,23,0 152 | 1,117,60,23,106,33.8,0.466,27,0 153 | 5,123,74,40,77,34.1,0.269,28,0 154 | 2,120,54,0,0,26.8,0.455,27,0 155 | 1,106,70,28,135,34.2,0.142,22,0 156 | 2,155,52,27,540,38.7,0.240,25,1 157 | 2,101,58,35,90,21.8,0.155,22,0 158 | 1,120,80,48,200,38.9,1.162,41,0 159 | 11,127,106,0,0,39.0,0.190,51,0 160 | 3,80,82,31,70,34.2,1.292,27,1 161 | 10,162,84,0,0,27.7,0.182,54,0 162 | 1,199,76,43,0,42.9,1.394,22,1 163 | 8,167,106,46,231,37.6,0.165,43,1 164 | 9,145,80,46,130,37.9,0.637,40,1 165 | 6,115,60,39,0,33.7,0.245,40,1 166 | 1,112,80,45,132,34.8,0.217,24,0 167 | 4,145,82,18,0,32.5,0.235,70,1 168 | 10,111,70,27,0,27.5,0.141,40,1 169 | 6,98,58,33,190,34.0,0.430,43,0 170 | 9,154,78,30,100,30.9,0.164,45,0 171 | 6,165,68,26,168,33.6,0.631,49,0 172 | 1,99,58,10,0,25.4,0.551,21,0 173 | 10,68,106,23,49,35.5,0.285,47,0 174 | 3,123,100,35,240,57.3,0.880,22,0 175 | 8,91,82,0,0,35.6,0.587,68,0 176 | 6,195,70,0,0,30.9,0.328,31,1 177 | 9,156,86,0,0,24.8,0.230,53,1 178 | 0,93,60,0,0,35.3,0.263,25,0 179 | 3,121,52,0,0,36.0,0.127,25,1 180 | 2,101,58,17,265,24.2,0.614,23,0 181 | 2,56,56,28,45,24.2,0.332,22,0 182 | 0,162,76,36,0,49.6,0.364,26,1 183 | 0,95,64,39,105,44.6,0.366,22,0 184 | 4,125,80,0,0,32.3,0.536,27,1 185 | 5,136,82,0,0,0.0,0.640,69,0 186 | 2,129,74,26,205,33.2,0.591,25,0 187 | 3,130,64,0,0,23.1,0.314,22,0 188 | 1,107,50,19,0,28.3,0.181,29,0 189 | 
1,140,74,26,180,24.1,0.828,23,0 190 | 1,144,82,46,180,46.1,0.335,46,1 191 | 8,107,80,0,0,24.6,0.856,34,0 192 | 13,158,114,0,0,42.3,0.257,44,1 193 | 2,121,70,32,95,39.1,0.886,23,0 194 | 7,129,68,49,125,38.5,0.439,43,1 195 | 2,90,60,0,0,23.5,0.191,25,0 196 | 7,142,90,24,480,30.4,0.128,43,1 197 | 3,169,74,19,125,29.9,0.268,31,1 198 | 0,99,0,0,0,25.0,0.253,22,0 199 | 4,127,88,11,155,34.5,0.598,28,0 200 | 4,118,70,0,0,44.5,0.904,26,0 201 | 2,122,76,27,200,35.9,0.483,26,0 202 | 6,125,78,31,0,27.6,0.565,49,1 203 | 1,168,88,29,0,35.0,0.905,52,1 204 | 2,129,0,0,0,38.5,0.304,41,0 205 | 4,110,76,20,100,28.4,0.118,27,0 206 | 6,80,80,36,0,39.8,0.177,28,0 207 | 10,115,0,0,0,0.0,0.261,30,1 208 | 2,127,46,21,335,34.4,0.176,22,0 209 | 9,164,78,0,0,32.8,0.148,45,1 210 | 2,93,64,32,160,38.0,0.674,23,1 211 | 3,158,64,13,387,31.2,0.295,24,0 212 | 5,126,78,27,22,29.6,0.439,40,0 213 | 10,129,62,36,0,41.2,0.441,38,1 214 | 0,134,58,20,291,26.4,0.352,21,0 215 | 3,102,74,0,0,29.5,0.121,32,0 216 | 7,187,50,33,392,33.9,0.826,34,1 217 | 3,173,78,39,185,33.8,0.970,31,1 218 | 10,94,72,18,0,23.1,0.595,56,0 219 | 1,108,60,46,178,35.5,0.415,24,0 220 | 5,97,76,27,0,35.6,0.378,52,1 221 | 4,83,86,19,0,29.3,0.317,34,0 222 | 1,114,66,36,200,38.1,0.289,21,0 223 | 1,149,68,29,127,29.3,0.349,42,1 224 | 5,117,86,30,105,39.1,0.251,42,0 225 | 1,111,94,0,0,32.8,0.265,45,0 226 | 4,112,78,40,0,39.4,0.236,38,0 227 | 1,116,78,29,180,36.1,0.496,25,0 228 | 0,141,84,26,0,32.4,0.433,22,0 229 | 2,175,88,0,0,22.9,0.326,22,0 230 | 2,92,52,0,0,30.1,0.141,22,0 231 | 3,130,78,23,79,28.4,0.323,34,1 232 | 8,120,86,0,0,28.4,0.259,22,1 233 | 2,174,88,37,120,44.5,0.646,24,1 234 | 2,106,56,27,165,29.0,0.426,22,0 235 | 2,105,75,0,0,23.3,0.560,53,0 236 | 4,95,60,32,0,35.4,0.284,28,0 237 | 0,126,86,27,120,27.4,0.515,21,0 238 | 8,65,72,23,0,32.0,0.600,42,0 239 | 2,99,60,17,160,36.6,0.453,21,0 240 | 1,102,74,0,0,39.5,0.293,42,1 241 | 11,120,80,37,150,42.3,0.785,48,1 242 | 3,102,44,20,94,30.8,0.400,26,0 243 | 1,109,58,18,116,28.5,0.219,22,0 244 | 9,140,94,0,0,32.7,0.734,45,1 245 | 13,153,88,37,140,40.6,1.174,39,0 246 | 12,100,84,33,105,30.0,0.488,46,0 247 | 1,147,94,41,0,49.3,0.358,27,1 248 | 1,81,74,41,57,46.3,1.096,32,0 249 | 3,187,70,22,200,36.4,0.408,36,1 250 | 6,162,62,0,0,24.3,0.178,50,1 251 | 4,136,70,0,0,31.2,1.182,22,1 252 | 1,121,78,39,74,39.0,0.261,28,0 253 | 3,108,62,24,0,26.0,0.223,25,0 254 | 0,181,88,44,510,43.3,0.222,26,1 255 | 8,154,78,32,0,32.4,0.443,45,1 256 | 1,128,88,39,110,36.5,1.057,37,1 257 | 7,137,90,41,0,32.0,0.391,39,0 258 | 0,123,72,0,0,36.3,0.258,52,1 259 | 1,106,76,0,0,37.5,0.197,26,0 260 | 6,190,92,0,0,35.5,0.278,66,1 261 | 2,88,58,26,16,28.4,0.766,22,0 262 | 9,170,74,31,0,44.0,0.403,43,1 263 | 9,89,62,0,0,22.5,0.142,33,0 264 | 10,101,76,48,180,32.9,0.171,63,0 265 | 2,122,70,27,0,36.8,0.340,27,0 266 | 5,121,72,23,112,26.2,0.245,30,0 267 | 1,126,60,0,0,30.1,0.349,47,1 268 | 1,93,70,31,0,30.4,0.315,23,0 -------------------------------------------------------------------------------- /Factorization_Machine/rr.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/Factorization_Machine/rr.gif -------------------------------------------------------------------------------- /Factorization_Machine/rr2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/Factorization_Machine/rr2.gif 
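
Interaction-term note for /Factorization_Machine/FM.py: the code computes the second-order term as 0.5*(e.sum() - e.trace()) with e = (x x^T) ∘ (v v^T), which is O(n^2 * k) per sample. A minimal numpy sketch (not part of the repo; fm_interaction is a hypothetical name) of the equivalent O(n*k) form 0.5 * sum_f ((x·v_f)^2 - sum_i x_i^2 v_if^2):

import numpy as np

def fm_interaction(x, v):
    # pairwise FM term sum_{i<j} <v_i, v_j> x_i x_j without the n x n matrices
    xv = x @ v                                        # shape (k,)
    return 0.5 * float(np.sum(xv ** 2 - (x ** 2) @ (v ** 2)))

# agreement check against the trace form used in FM.py
rng = np.random.default_rng(0)
x, v = rng.random(8), rng.normal(0, 0.2, (8, 4))
e = np.outer(x, x) * (v @ v.T)
assert np.isclose(fm_interaction(x, v), 0.5 * (e.sum() - e.trace()))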
--------------------------------------------------------------------------------
/Inverted_index/invert_indexx.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun May 5 13:15:25 2019

@author: lg
"""

# import jieba  # for Chinese text, use jieba.cut instead of str.split

docu_set = {'d1': 'i love shanghai',
            'd2': 'i am from shanghai now i study in tongji university',
            'd3': 'i am from lanzhou now i study in lanzhou university of science and technology'}

# collect the vocabulary of all documents
all_words = []
for text in docu_set.values():
    all_words.extend(text.split())

set_all_words = set(all_words)
print(set_all_words)

# inverted index: word -> list of ids of the documents containing it
invert_index = dict()
for word in set_all_words:
    postings = []
    for doc_id, text in docu_set.items():
        if word in text.split():
            postings.append(doc_id)
    invert_index[word] = postings

print(invert_index)
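
A usage sketch for the index above (and_query is a hypothetical helper, not in the repo): the documents matching every term of a boolean AND query are the intersection of the per-term posting lists.

def and_query(index, terms):
    postings = [set(index.get(t, [])) for t in terms]
    return set.intersection(*postings) if postings else set()

print(and_query(invert_index, ['shanghai', 'study']))  # -> {'d2'}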
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# recommend_sys
Various recommendation algorithms
--------------------------------------------------------------------------------
/collaborative_filtering/ItemCF/item_book.txt:
--------------------------------------------------------------------------------
Liu Yi,3,1001
Chen Er,4,1001
Zhang San,3,1001
Li Si,3,1001
Liu Yi,3,1002
Li Si,4,1002
Liu Yi,4,1003
Zhang San,5,1003
Li Si,5,1003
Liu Yi,4,1004
Zhang San,3,1004
Liu Yi,5,1005
--------------------------------------------------------------------------------
/collaborative_filtering/ItemCF/main.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#-*-coding:utf-8-*-

import math

class ItemBasedCF:
    def __init__(self, train_file):
        self.train_file = train_file
        self.readData()

    def readData(self):
        # read the file ("user,score,item" per line) and build the
        # user -> {item: score} rating table
        self.train = dict()
        for line in open(self.train_file):
            user, score, item = line.strip().split(",")
            self.train.setdefault(user, {})
            self.train[user][item] = int(float(score))

    def ItemSimilarity(self):
        # build the item-item co-occurrence matrix
        cooccur = dict()  # cooccur[i][j]: number of users who rated both i and j
        buy = dict()      # buy[i]: number of distinct users who rated item i
        for user, items in self.train.items():
            for i in items.keys():
                buy.setdefault(i, 0)
                buy[i] += 1
                cooccur.setdefault(i, {})
                for j in items.keys():
                    if i == j: continue
                    cooccur[i].setdefault(j, 0)
                    cooccur[i][j] += 1
        # cosine similarity: sim(i, j) = cooccur[i][j] / sqrt(buy[i] * buy[j])
        self.similar = dict()
        for i, related_items in cooccur.items():
            self.similar.setdefault(i, {})
            for j, cij in related_items.items():
                self.similar[i][j] = cij / (math.sqrt(buy[i] * buy[j]))
        return self.similar

    # recommend for `user`: K most similar items per rated item, top-N results
    def Recommend(self, user, K=3, N=10):
        rank = dict()
        action_item = self.train[user]  # items the user has rated, with scores
        for item, score in action_item.items():
            sortedItems = sorted(self.similar[item].items(), key=lambda x: x[1], reverse=True)[0:K]
            for j, wj in sortedItems:
                if j in action_item.keys():
                    continue  # skip items the user has already rated
                rank.setdefault(j, 0)
                rank[j] += score * wj
        return dict(sorted(rank.items(), key=lambda x: x[1], reverse=True)[0:N])

# build an ItemBasedCF object and recommend for user "Li Si"
item = ItemBasedCF("item_book.txt")
item.ItemSimilarity()
recommedDict = item.Recommend("Li Si")
for k, v in recommedDict.items():
    print(k, "\t", v)
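
A worked check of the ItemSimilarity formula above, with numbers taken from item_book.txt: items 1001 and 1003 share 3 raters (Liu Yi, Zhang San, Li Si), 1001 has 4 raters and 1003 has 3, so sim(1001, 1003) = 3 / sqrt(4 * 3) ≈ 0.866.

import math
print(3 / math.sqrt(4 * 3))  # 0.8660..., the cij / sqrt(buy[i] * buy[j]) value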
--------------------------------------------------------------------------------
/collaborative_filtering/UserCF/cf.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/collaborative_filtering/UserCF/cf.gif
--------------------------------------------------------------------------------
/collaborative_filtering/UserCF/cf.py:
--------------------------------------------------------------------------------
# coding:UTF-8
'''
Date: 20180624
@author: luogan
'''

import numpy as np
import pandas
from numpy import mat


def fetch_data():
    # user-item rating matrix; 0 means "not interacted"
    dat = mat([[4., 3., 0., 5., 0.],
               [5., 0., 4., 4., 0.],
               [4., 0., 5., 0., 3.],
               [2., 3., 0., 1., 0.],
               [0., 4., 2., 0., 5.]])
    return dat


def cos_sim(x, y):
    '''Cosine similarity.
    input:  x (mat): a row vector, either a user or an item
            y (mat): a row vector, either a user or an item
    output: the cosine similarity between x and y
    '''
    numerator = x * y.T  # inner product of x and y
    denominator = np.sqrt(x * x.T) * np.sqrt(y * y.T)
    return (numerator / denominator)[0, 0]


def similarity(data):
    '''Compute the similarity between any two rows of a matrix.
    input:  data (mat): an arbitrary matrix
    output: w (mat): pairwise row similarities
    '''
    m = np.shape(data)[0]  # number of users
    # initialise the similarity matrix
    w = np.mat(np.zeros((m, m)))

    for i in range(m):
        for j in range(i, m):
            if j != i:
                # similarity between rows i and j
                w[i, j] = cos_sim(data[i, ], data[j, ])
                w[j, i] = w[i, j]
            else:
                w[i, j] = 0
    return w


def user_based_recommend(data, w, user):
    '''Recommend items to `user` based on user similarity.
    input:  data (mat): the user-item matrix
            w (mat): user-user similarities
            user (int): index of the user
    output: predict (dict): item index -> predicted score
    '''
    m, n = np.shape(data)
    interaction = data[user, ]  # the user's row of item interactions

    # 1. find the items the user has not interacted with
    not_inter = []
    for i in range(n):
        if interaction[0, i] == 0:
            not_inter.append(i)

    # 2. predict a score for each such item:
    #    predict[i] = sum over users u of w[user, u] * data[u, i]
    predict = {}
    dd = np.array(data)
    ww = np.array(w)
    for i in not_inter:
        predict[i] = ww[user] @ dd[:, i].T
    return predict


def top_k(predict, k):
    '''Return the top-k items for the user.
    input:  predict (dict): item index -> predicted score
            k (int): number of items to recommend
    output: the k highest-scoring items
    '''
    pp = pandas.Series(predict)
    pp1 = pp.sort_values(ascending=False)
    # slicing handles both cases (the original branched on len(predict) and
    # returned the unsliced series when k was smaller, which defeated top-k)
    return pp1.iloc[:k]


def normalize(w):
    '''L1-normalise each row of the similarity matrix.'''
    w = np.array(w)
    ww = []
    for row in w:
        s = sum(abs(v) for v in row)
        ww.append([v / s for v in row])
    return mat(ww)


data = fetch_data()

print('collaborative filtering only')
w_initial = similarity(data)
# 3. recommend using the user-user similarities
predict = user_based_recommend(data, w_initial, 1)
# 4. keep the top-k recommendations
top_recom = top_k(predict, 1)
print('top_recom=', top_recom)

print('collaborative filtering + row-normalised similarity matrix')
w_initial = similarity(data)
w_initial_normal = normalize(w_initial)
predict = user_based_recommend(data, w_initial_normal, 1)
top_recom = top_k(predict, 1)
print('top_recom=', top_recom)
--------------------------------------------------------------------------------
/ctr_fm_ffm/FFM.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


class MyLayer(tf.keras.layers.Layer):
    """Field-aware pairwise interaction: each feature i keeps one latent
    vector per field, and pair (i, j) uses <w_{i, field(j)}, w_{j, field(i)}>."""

    def __init__(self, field_dict, field_dim, input_dim, output_dim=30, **kwargs):
        self.field_dict = field_dict
        self.field_dim = field_dim
        self.input_dim = input_dim
        self.output_dim = output_dim
        super(MyLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.kernel = self.add_weight(name='kernel',
                                      shape=(self.input_dim, self.field_dim, self.output_dim),
                                      initializer='glorot_uniform',
                                      trainable=True)
        super(MyLayer, self).build(input_shape)

    def call(self, x):
        # accumulate the per-sample cross term in a plain tensor (the original
        # created a K.variable inside call, which TF2 dislikes when tracing)
        field_cross = tf.zeros_like(x[:, 0])
        for i in range(self.input_dim):
            for j in range(i + 1, self.input_dim):
                weight = tf.math.reduce_sum(
                    tf.math.multiply(self.kernel[i, self.field_dict[j]],
                                     self.kernel[j, self.field_dict[i]]))
                field_cross += tf.math.multiply(weight, tf.math.multiply(x[:, i], x[:, j]))
        return field_cross

    def compute_output_shape(self, input_shape):
        return (input_shape[0], 1)

def FFM(feature_dim, field_dict, field_dim, output_dim=30):
    inputs = tf.keras.Input((feature_dim,))
    liner = tf.keras.layers.Dense(1)(inputs)  # first-order (linear) term
    cross = MyLayer(field_dict, field_dim, feature_dim, output_dim)(inputs)
    cross = tf.keras.layers.Reshape((1,))(cross)
    add = tf.keras.layers.Add()([liner, cross])
    predictions = tf.keras.layers.Activation('sigmoid')(add)
    model = tf.keras.Model(inputs=inputs, outputs=predictions)
    model.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(0.001),  # tf.train.AdamOptimizer in the original TF1.x code
                  metrics=['binary_accuracy'])
    return model

def train():
    # 30 features grouped into 6 fields of 5 consecutive features each
    field_dict = {i: i // 5 for i in range(30)}
    ffm = FFM(30, field_dict, 6, 30)
    data = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2,
                                                        random_state=27, stratify=data.target)
    ffm.fit(X_train, y_train, epochs=3, batch_size=16, validation_data=(X_test, y_test))
    return ffm


if __name__ == '__main__':
    ffm = train()
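
A numpy sketch of the field-aware term computed in MyLayer.call above (ffm_cross is a hypothetical name; it assumes dense inputs and the same field_dict convention): each feature keeps one latent vector per field, and pair (i, j) is weighted by <w_{i, field(j)}, w_{j, field(i)}>.

import numpy as np

def ffm_cross(x, kernel, field_dict):
    # kernel has shape (n_features, n_fields, k)
    total = 0.0
    for i in range(len(x)):
        for j in range(i + 1, len(x)):
            w = kernel[i, field_dict[j]] @ kernel[j, field_dict[i]]
            total += w * x[i] * x[j]
    return total

rng = np.random.default_rng(0)
x = rng.random(6)
kernel = rng.normal(size=(6, 2, 4))
print(ffm_cross(x, kernel, {i: i // 3 for i in range(6)}))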
--------------------------------------------------------------------------------
/ctr_fm_ffm/FM.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
K = tf.keras.backend


class MyLayer(tf.keras.layers.Layer):
    """Second-order FM term via the identity
    sum_{i<j} <v_i, v_j> x_i x_j = 0.5 * ((xV)^2 - x^2 V^2) reduced over factors."""

    def __init__(self, input_dim, output_dim=30, **kwargs):
        self.input_dim = input_dim
        self.output_dim = output_dim
        super(MyLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.kernel = self.add_weight(name='kernel',
                                      shape=(self.input_dim, self.output_dim),
                                      initializer='glorot_uniform',
                                      trainable=True)
        super(MyLayer, self).build(input_shape)

    def call(self, x):
        a = K.pow(K.dot(x, self.kernel), 2)
        b = K.dot(K.pow(x, 2), K.pow(self.kernel, 2))
        # mean over the k latent factors (the classical FM uses a sum;
        # the mean only rescales the term by 1/k)
        return K.mean(a - b, 1, keepdims=True) * 0.5

    def compute_output_shape(self, input_shape):
        # call() reduces over the factor axis, so the output is (batch, 1)
        # (the original returned (batch, output_dim), which was wrong)
        return (input_shape[0], 1)

def FM(feature_dim):
    inputs = tf.keras.Input((feature_dim,))
    liner = tf.keras.layers.Dense(units=1,
                                  bias_regularizer=tf.keras.regularizers.l2(0.01),
                                  kernel_regularizer=tf.keras.regularizers.l1(0.02),
                                  )(inputs)
    cross = MyLayer(feature_dim)(inputs)
    add = tf.keras.layers.Add()([liner, cross])
    predictions = tf.keras.layers.Activation('sigmoid')(add)
    model = tf.keras.Model(inputs=inputs, outputs=predictions)
    model.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(0.001),  # tf.train.AdamOptimizer in the original TF1.x code
                  metrics=['binary_accuracy'])
    return model

def train():
    fm = FM(30)
    data = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2,
                                                        random_state=27, stratify=data.target)
    fm.fit(X_train, y_train, epochs=3, batch_size=16, validation_data=(X_test, y_test))
    return fm


if __name__ == '__main__':
    fm = train()
--------------------------------------------------------------------------------
/ctr_fm_ffm/LR.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


def lr_model():
    # logistic regression: a single sigmoid unit over the 30 input features
    inputs = tf.keras.Input((30,))
    pred = tf.keras.layers.Dense(units=1,
                                 bias_regularizer=tf.keras.regularizers.l2(0.01),
                                 kernel_regularizer=tf.keras.regularizers.l1(0.02),
                                 activation=tf.nn.sigmoid)(inputs)
    lr = tf.keras.Model(inputs, pred)
    lr.compile(loss='binary_crossentropy',
               optimizer=tf.keras.optimizers.Adam(0.001),  # tf.train.AdamOptimizer in the original TF1.x code
               metrics=['binary_accuracy'])
    return lr

def train():
    lr = lr_model()
    data = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2,
                                                        random_state=27, stratify=data.target)
    lr.fit(X_train, y_train, epochs=3, batch_size=16, validation_data=(X_test, y_test))
    return lr

if __name__ == '__main__':
    lr = train()
--------------------------------------------------------------------------------
/deepfm_recomend/__pycache__/deepfm.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/__pycache__/deepfm.cpython-38.pyc
--------------------------------------------------------------------------------
/deepfm_recomend/__pycache__/feature.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/__pycache__/feature.cpython-38.pyc
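
The three scripts in ctr_fm_ffm/ train on the same breast-cancer split, so their validation metrics are directly comparable (LR: linear terms only; FM: plus shared second-order factors; FFM: plus field-aware factors). A sketch, assuming it is run from inside ctr_fm_ffm/:

import LR, FM, FFM

for module in (LR, FM, FFM):
    print('---', module.__name__, '---')
    module.train()  # each train() prints per-epoch validation metrics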
-------------------------------------------------------------------------------- /deepfm_recomend/__pycache__/feature_column.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/__pycache__/feature_column.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/__pycache__/inputs.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/__pycache__/inputs.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/__pycache__/inputs1.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/__pycache__/inputs1.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/activation.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | 4 | Author: 5 | Weichen Shen,wcshen1994@163.com 6 | 7 | """ 8 | 9 | import tensorflow as tf 10 | from tensorflow.python.keras.initializers import Zeros 11 | from tensorflow.python.keras.layers import Layer 12 | 13 | try: 14 | unicode 15 | except NameError: 16 | unicode = str 17 | 18 | 19 | class Dice(Layer): 20 | """The Data Adaptive Activation Function in DIN,which can be viewed as a generalization of PReLu and can adaptively adjust the rectified point according to distribution of input data. 21 | 22 | Input shape 23 | - Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model. 24 | 25 | Output shape 26 | - Same shape as the input. 27 | 28 | Arguments 29 | - **axis** : Integer, the axis that should be used to compute data distribution (typically the features axis). 30 | 31 | - **epsilon** : Small float added to variance to avoid dividing by zero. 32 | 33 | References 34 | - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf) 35 | """ 36 | 37 | def __init__(self, axis=-1, epsilon=1e-9, **kwargs): 38 | self.axis = axis 39 | self.epsilon = epsilon 40 | super(Dice, self).__init__(**kwargs) 41 | 42 | def build(self, input_shape): 43 | self.bn = tf.keras.layers.BatchNormalization( 44 | axis=self.axis, epsilon=self.epsilon, center=False, scale=False) 45 | self.alphas = self.add_weight(shape=(input_shape[-1],), initializer=Zeros( 46 | ), dtype=tf.float32, name='dice_alpha') # name='alpha_'+self.name 47 | super(Dice, self).build(input_shape) # Be sure to call this somewhere! 
48 | self.uses_learning_phase = True 49 | 50 | def call(self, inputs, training=None, **kwargs): 51 | inputs_normed = self.bn(inputs, training=training) 52 | # tf.layers.batch_normalization( 53 | # inputs, axis=self.axis, epsilon=self.epsilon, center=False, scale=False) 54 | x_p = tf.sigmoid(inputs_normed) 55 | return self.alphas * (1.0 - x_p) * inputs + x_p * inputs 56 | 57 | def compute_output_shape(self, input_shape): 58 | return input_shape 59 | 60 | def get_config(self, ): 61 | config = {'axis': self.axis, 'epsilon': self.epsilon} 62 | base_config = super(Dice, self).get_config() 63 | return dict(list(base_config.items()) + list(config.items())) 64 | 65 | 66 | def activation_layer(activation): 67 | if activation in ("dice", "Dice"): 68 | act_layer = Dice() 69 | elif isinstance(activation, (str, unicode)): 70 | act_layer = tf.keras.layers.Activation(activation) 71 | elif issubclass(activation, Layer): 72 | act_layer = activation() 73 | else: 74 | raise ValueError( 75 | "Invalid activation,found %s.You should use a str or a Activation Layer Class." % (activation)) 76 | return act_layer 77 | -------------------------------------------------------------------------------- /deepfm_recomend/core.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | 4 | Author: 5 | Weichen Shen,wcshen1994@163.com 6 | 7 | """ 8 | 9 | import tensorflow as tf 10 | from tensorflow.python.keras import backend as K 11 | from tensorflow.python.keras.initializers import Zeros, glorot_normal 12 | from tensorflow.python.keras.layers import Layer 13 | from tensorflow.python.keras.regularizers import l2 14 | 15 | from .activation import activation_layer 16 | 17 | 18 | class LocalActivationUnit(Layer): 19 | """The LocalActivationUnit used in DIN with which the representation of 20 | user interests varies adaptively given different candidate items. 21 | 22 | Input shape 23 | - A list of two 3D tensor with shape: ``(batch_size, 1, embedding_size)`` and ``(batch_size, T, embedding_size)`` 24 | 25 | Output shape 26 | - 3D tensor with shape: ``(batch_size, T, 1)``. 27 | 28 | Arguments 29 | - **hidden_units**:list of positive integer, the attention net layer number and units in each layer. 30 | 31 | - **activation**: Activation function to use in attention net. 32 | 33 | - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix of attention net. 34 | 35 | - **dropout_rate**: float in [0,1). Fraction of the units to dropout in attention net. 36 | 37 | - **use_bn**: bool. Whether use BatchNormalization before activation or not in attention net. 38 | 39 | - **seed**: A Python integer to use as random seed. 40 | 41 | References 42 | - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 
ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf) 43 | """ 44 | 45 | def __init__(self, hidden_units=(64, 32), activation='sigmoid', l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, 46 | **kwargs): 47 | self.hidden_units = hidden_units 48 | self.activation = activation 49 | self.l2_reg = l2_reg 50 | self.dropout_rate = dropout_rate 51 | self.use_bn = use_bn 52 | self.seed = seed 53 | super(LocalActivationUnit, self).__init__(**kwargs) 54 | self.supports_masking = True 55 | 56 | def build(self, input_shape): 57 | 58 | if not isinstance(input_shape, list) or len(input_shape) != 2: 59 | raise ValueError('A `LocalActivationUnit` layer should be called ' 60 | 'on a list of 2 inputs') 61 | 62 | if len(input_shape[0]) != 3 or len(input_shape[1]) != 3: 63 | raise ValueError("Unexpected inputs dimensions %d and %d, expect to be 3 dimensions" % ( 64 | len(input_shape[0]), len(input_shape[1]))) 65 | 66 | if input_shape[0][-1] != input_shape[1][-1] or input_shape[0][1] != 1: 67 | raise ValueError('A `LocalActivationUnit` layer requires ' 68 | 'inputs of a two inputs with shape (None,1,embedding_size) and (None,T,embedding_size)' 69 | 'Got different shapes: %s,%s' % (input_shape[0], input_shape[1])) 70 | size = 4 * \ 71 | int(input_shape[0][-1] 72 | ) if len(self.hidden_units) == 0 else self.hidden_units[-1] 73 | self.kernel = self.add_weight(shape=(size, 1), 74 | initializer=glorot_normal( 75 | seed=self.seed), 76 | name="kernel") 77 | self.bias = self.add_weight( 78 | shape=(1,), initializer=Zeros(), name="bias") 79 | self.dnn = DNN(self.hidden_units, self.activation, self.l2_reg, 80 | self.dropout_rate, self.use_bn, seed=self.seed) 81 | 82 | self.dense = tf.keras.layers.Lambda(lambda x: tf.nn.bias_add(tf.tensordot( 83 | x[0], x[1], axes=(-1, 0)), x[2])) 84 | 85 | super(LocalActivationUnit, self).build( 86 | input_shape) # Be sure to call this somewhere! 87 | 88 | def call(self, inputs, training=None, **kwargs): 89 | 90 | query, keys = inputs 91 | 92 | keys_len = keys.get_shape()[1] 93 | queries = K.repeat_elements(query, keys_len, 1) 94 | 95 | att_input = tf.concat( 96 | [queries, keys, queries - keys, queries * keys], axis=-1) 97 | 98 | att_out = self.dnn(att_input, training=training) 99 | 100 | attention_score = self.dense([att_out, self.kernel, self.bias]) 101 | 102 | return attention_score 103 | 104 | def compute_output_shape(self, input_shape): 105 | return input_shape[1][:2] + (1,) 106 | 107 | def compute_mask(self, inputs, mask): 108 | return mask 109 | 110 | def get_config(self, ): 111 | config = {'activation': self.activation, 'hidden_units': self.hidden_units, 112 | 'l2_reg': self.l2_reg, 'dropout_rate': self.dropout_rate, 'use_bn': self.use_bn, 'seed': self.seed} 113 | base_config = super(LocalActivationUnit, self).get_config() 114 | return dict(list(base_config.items()) + list(config.items())) 115 | 116 | 117 | class DNN(Layer): 118 | """The Multi Layer Percetron 119 | 120 | Input shape 121 | - nD tensor with shape: ``(batch_size, ..., input_dim)``. The most common situation would be a 2D input with shape ``(batch_size, input_dim)``. 122 | 123 | Output shape 124 | - nD tensor with shape: ``(batch_size, ..., hidden_size[-1])``. For instance, for a 2D input with shape ``(batch_size, input_dim)``, the output would have shape ``(batch_size, hidden_size[-1])``. 125 | 126 | Arguments 127 | - **hidden_units**:list of positive integer, the layer number and units in each layer. 128 | 129 | - **activation**: Activation function to use. 130 | 131 | - **l2_reg**: float between 0 and 1. 
L2 regularizer strength applied to the kernel weights matrix. 132 | 133 | - **dropout_rate**: float in [0,1). Fraction of the units to dropout. 134 | 135 | - **use_bn**: bool. Whether use BatchNormalization before activation or not. 136 | 137 | - **seed**: A Python integer to use as random seed. 138 | """ 139 | 140 | def __init__(self, hidden_units, activation='relu', l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, **kwargs): 141 | self.hidden_units = hidden_units 142 | self.activation = activation 143 | self.dropout_rate = dropout_rate 144 | self.seed = seed 145 | self.l2_reg = l2_reg 146 | self.use_bn = use_bn 147 | super(DNN, self).__init__(**kwargs) 148 | 149 | def build(self, input_shape): 150 | # if len(self.hidden_units) == 0: 151 | # raise ValueError("hidden_units is empty") 152 | input_size = input_shape[-1] 153 | hidden_units = [int(input_size)] + list(self.hidden_units) 154 | self.kernels = [self.add_weight(name='kernel' + str(i), 155 | shape=( 156 | hidden_units[i], hidden_units[i + 1]), 157 | initializer=glorot_normal( 158 | seed=self.seed), 159 | regularizer=l2(self.l2_reg), 160 | trainable=True) for i in range(len(self.hidden_units))] 161 | self.bias = [self.add_weight(name='bias' + str(i), 162 | shape=(self.hidden_units[i],), 163 | initializer=Zeros(), 164 | trainable=True) for i in range(len(self.hidden_units))] 165 | if self.use_bn: 166 | self.bn_layers = [tf.keras.layers.BatchNormalization() for _ in range(len(self.hidden_units))] 167 | 168 | self.dropout_layers = [tf.keras.layers.Dropout(self.dropout_rate, seed=self.seed + i) for i in 169 | range(len(self.hidden_units))] 170 | 171 | self.activation_layers = [activation_layer(self.activation) for _ in range(len(self.hidden_units))] 172 | 173 | super(DNN, self).build(input_shape) # Be sure to call this somewhere! 174 | 175 | def call(self, inputs, training=None, **kwargs): 176 | 177 | deep_input = inputs 178 | 179 | for i in range(len(self.hidden_units)): 180 | fc = tf.nn.bias_add(tf.tensordot( 181 | deep_input, self.kernels[i], axes=(-1, 0)), self.bias[i]) 182 | # fc = Dense(self.hidden_size[i], activation=None, \ 183 | # kernel_initializer=glorot_normal(seed=self.seed), \ 184 | # kernel_regularizer=l2(self.l2_reg))(deep_input) 185 | if self.use_bn: 186 | fc = self.bn_layers[i](fc, training=training) 187 | 188 | fc = self.activation_layers[i](fc) 189 | 190 | fc = self.dropout_layers[i](fc, training=training) 191 | deep_input = fc 192 | 193 | return deep_input 194 | 195 | def compute_output_shape(self, input_shape): 196 | if len(self.hidden_units) > 0: 197 | shape = input_shape[:-1] + (self.hidden_units[-1],) 198 | else: 199 | shape = input_shape 200 | 201 | return tuple(shape) 202 | 203 | def get_config(self, ): 204 | config = {'activation': self.activation, 'hidden_units': self.hidden_units, 205 | 'l2_reg': self.l2_reg, 'use_bn': self.use_bn, 'dropout_rate': self.dropout_rate, 'seed': self.seed} 206 | base_config = super(DNN, self).get_config() 207 | return dict(list(base_config.items()) + list(config.items())) 208 | 209 | 210 | class PredictionLayer(Layer): 211 | """ 212 | Arguments 213 | - **task**: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss 214 | 215 | - **use_bias**: bool.Whether add bias term or not. 
216 | """ 217 | 218 | def __init__(self, task='binary', use_bias=True, **kwargs): 219 | if task not in ["binary", "multiclass", "regression"]: 220 | raise ValueError("task must be binary,multiclass or regression") 221 | self.task = task 222 | self.use_bias = use_bias 223 | super(PredictionLayer, self).__init__(**kwargs) 224 | 225 | def build(self, input_shape): 226 | 227 | if self.use_bias: 228 | self.global_bias = self.add_weight( 229 | shape=(1,), initializer=Zeros(), name="global_bias") 230 | 231 | # Be sure to call this somewhere! 232 | super(PredictionLayer, self).build(input_shape) 233 | 234 | def call(self, inputs, **kwargs): 235 | x = inputs 236 | if self.use_bias: 237 | x = tf.nn.bias_add(x, self.global_bias, data_format='NHWC') 238 | if self.task == "binary": 239 | x = tf.sigmoid(x) 240 | 241 | output = tf.reshape(x, (-1, 1)) 242 | 243 | return output 244 | 245 | def compute_output_shape(self, input_shape): 246 | return (None, 1) 247 | 248 | def get_config(self, ): 249 | config = {'task': self.task, 'use_bias': self.use_bias} 250 | base_config = super(PredictionLayer, self).get_config() 251 | return dict(list(base_config.items()) + list(config.items())) 252 | 253 | 254 | -------------------------------------------------------------------------------- /deepfm_recomend/deepfm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/deepfm.png -------------------------------------------------------------------------------- /deepfm_recomend/deepfm_main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Oct 13 19:50:43 2020 5 | 6 | @author: ledi 7 | """ 8 | 9 | 10 | 11 | import pandas as pd 12 | from sklearn.metrics import log_loss, roc_auc_score 13 | from sklearn.model_selection import train_test_split 14 | from sklearn.preprocessing import LabelEncoder, MinMaxScaler 15 | # from feature_column import build_input_features, get_linear_logit, DEFAULT_GROUP_NAME, input_from_feature_columns 16 | from layers.core import PredictionLayer, DNN 17 | from layers.interaction import FM 18 | from layers.utils import concat_func, add_func, combined_dnn_input 19 | # from deepfm import DeepFM 20 | 21 | from keras.layers import Dense 22 | # from feature_column import SparseFeat, DenseFeat, get_feature_names 23 | 24 | # if __name__ == "__main__": 25 | data = pd.read_csv('./criteo_sample.txt') 26 | 27 | 28 | #离散的特征名称 29 | sparse_features = ['C' + str(i) for i in range(1, 27)] 30 | 31 | #数值的特征名称 32 | dense_features = ['I' + str(i) for i in range(1, 14)] 33 | 34 | #对缺失的特征进行填充 35 | data[sparse_features] = data[sparse_features].fillna('-1', ) 36 | data[dense_features] = data[dense_features].fillna(0, ) 37 | target = ['label'] 38 | 39 | 40 | #数据预处理 41 | # 1.Label Encoding for sparse features,and do simple Transformation for dense features 42 | #对离散特征进行编码 43 | for feat in sparse_features: 44 | lbe = LabelEncoder() 45 | data[feat] = lbe.fit_transform(data[feat]) 46 | #数值特征进行最大最小归一化 47 | mms = MinMaxScaler(feature_range=(0, 1)) 48 | data[dense_features] = mms.fit_transform(data[dense_features]) 49 | 50 | 51 | 52 | #feature 是特征处理模块 53 | from feature import Operate_Feat1,get_feature_names 54 | 55 | 56 | d=Operate_Feat1() 57 | 58 | 59 | 60 | sparse_list=[] 61 | for p in sparse_features: 62 | d1=d.operate_sparse(data[p], p) 63 | sparse_list.append(d1.copy()) 64 
| 65 | dense_list=[] 66 | for q in dense_features: 67 | d2=d.operate_dense(q) 68 | print(d2) 69 | dense_list.append(d2.copy()) 70 | 71 | 72 | # fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=4 ) 73 | # for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,) 74 | # for feat in dense_features] 75 | 76 | merge_list=sparse_list+dense_list 77 | dnn_feature_columns = merge_list 78 | linear_feature_columns = merge_list 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | from feature import DEFAULT_GROUP_NAME,build_input_features 88 | 89 | 90 | 91 | def DeepFM(linear_feature_columns, dnn_feature_columns, fm_group=[DEFAULT_GROUP_NAME], dnn_hidden_units=(128, 128), 92 | l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0, seed=1024, dnn_dropout=0, 93 | dnn_activation='relu', dnn_use_bn=False, task='binary'): 94 | 95 | #构建模型的输入张量 96 | features = build_input_features( 97 | merge_list) 98 | 99 | print("#"*10) 100 | print(features) 101 | inputs_list = list(features.values()) 102 | 103 | from feature import get_linear_logit 104 | linear_logit = get_linear_logit(features, linear_feature_columns, seed=seed, prefix='linear', 105 | l2_reg=l2_reg_linear) 106 | 107 | 108 | from feature import input_from_feature_columns 109 | group_embedding_dict, dense_value_list = input_from_feature_columns(features, dnn_feature_columns, l2_reg_embedding, 110 | seed, support_group=True) 111 | 112 | ######################################################################################################### 113 | 114 | print('group_embedding_dict',group_embedding_dict) 115 | print('dense_value_list',dense_value_list) 116 | 117 | # cc=[] 118 | # for k in group_embedding_dict: 119 | # cc.append(k) 120 | cc1=concat_func(group_embedding_dict, axis=1) 121 | 122 | cc2=FM()(cc1) 123 | 124 | # cc=[FM()(concat_func(v, axis=1)) 125 | # for k, v in group_embedding_dict.items() if k in fm_group] 126 | fm_logit = add_func([cc2]) 127 | 128 | dnn_input = combined_dnn_input(group_embedding_dict, dense_value_list) 129 | 130 | dnn_hidden_units=(128, 32) 131 | dnn_output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, 132 | dnn_use_bn, seed)(dnn_input) 133 | 134 | # dnn_input= Dense(64, activation='relu')(dnn_input) 135 | 136 | # dnn_output= Dense(28, activation='relu')(dnn_input) 137 | 138 | import keras 139 | import tensorflow as tf 140 | dnn_logit = tf.keras.layers.Dense( 141 | 1, use_bias=False, kernel_initializer=tf.keras.initializers.glorot_normal(seed=seed))(dnn_output) 142 | 143 | final_logit = add_func([linear_logit, fm_logit, dnn_logit]) 144 | 145 | output = PredictionLayer(task)(final_logit) 146 | model = tf.keras.models.Model(inputs=inputs_list, outputs=output) 147 | 148 | return model 149 | 150 | model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary') 151 | 152 | feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) 153 | 154 | # 3.generate input data for model 155 | 156 | train, test = train_test_split(data, test_size=0.2, random_state=2020) 157 | train_model_input = {name:train[name] for name in feature_names} 158 | test_model_input = {name:test[name] for name in feature_names} 159 | 160 | # 4.Define Model,train,predict and evaluate 161 | # model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary') 162 | model.compile("adam", "binary_crossentropy", 163 | metrics=['binary_crossentropy'], ) 164 | 165 | history = model.fit(train_model_input, train[target].values, 166 | batch_size=256, epochs=10, 
verbose=2, validation_split=0.2, ) 167 | pred_ans = model.predict(test_model_input, batch_size=256) 168 | print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4)) 169 | print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4)) 170 | -------------------------------------------------------------------------------- /deepfm_recomend/feature.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Oct 14 19:49:32 2020 5 | 6 | @author: ledi 7 | """ 8 | DEFAULT_GROUP_NAME = "default_group" 9 | from collections import namedtuple 10 | from tensorflow.python.keras.initializers import RandomNormal, Zeros 11 | import pandas as pd 12 | from collections import OrderedDict 13 | from tensorflow.python.keras.layers import Input 14 | from layers import Linear 15 | from layers.utils import concat_func, add_func 16 | 17 | 18 | class Operate_Feat1(): 19 | def __init__(self): 20 | 21 | #这里是类别特征的内置配置 22 | self.sparse_dict={ 'embedding_dim':4, 'use_hash':False,'dtype':"int32", 23 | 24 | 'feat_cat':'sparse', 25 | 'embeddings_initializer':RandomNormal(mean=0.0, stddev=0.0001, seed=2020), 26 | 27 | 'embedding_name':None,'group_name':"default_group", 'trainable':True} 28 | #这里是数值特征的内置配置 29 | self.dense_dict={'dimension':1, 'dtype':"float32", 'feat_cat':'dense',} 30 | 31 | 32 | #结果都以字典的形式输出 33 | def operate_sparse(self,some_data,name): 34 | sparse_dict1=self.sparse_dict 35 | sparse_dict1['vocabulary_size']=some_data.nunique() 36 | sparse_dict1['embedding_name'] =name 37 | return pd.Series(sparse_dict1) 38 | def operate_dense(self,dense_name): 39 | dense_dict1=self.dense_dict 40 | dense_dict1['name']=dense_name 41 | 42 | return pd.Series(dense_dict1) 43 | 44 | # 构建输入层 45 | def build_input_features(feature_columns, prefix=''): 46 | input_features = OrderedDict() 47 | for fc in feature_columns: 48 | if fc['feat_cat'] == 'sparse': 49 | input_features[fc['embedding_name']] = Input( 50 | shape=(1,), name=prefix + fc['embedding_name'], dtype=fc['dtype']) 51 | elif fc['feat_cat'] == 'dense': 52 | input_features[fc['name']] = Input( 53 | shape=(fc['dimension'],), name=prefix + fc['name'], dtype=fc['dtype']) 54 | 55 | 56 | else: 57 | raise TypeError("Invalid feature column type,got", type(fc)) 58 | 59 | return input_features 60 | 61 | def get_feature_names(feature_columns): 62 | features = build_input_features(feature_columns) 63 | 64 | print('features==============',features) 65 | return list(features.keys()) 66 | 67 | 68 | def get_linear_logit(features, linear_feature_columns, units=1, use_bias=False, seed=1024, prefix='linear', 69 | l2_reg=0): 70 | 71 | features=features 72 | linear_feature_columns=linear_feature_columns 73 | units=1 74 | use_bias=False 75 | seed=1024 76 | prefix='linear' 77 | l2_reg=0 78 | 79 | 80 | 81 | for i in range(len(linear_feature_columns)): 82 | if linear_feature_columns[i]['feat_cat']=='sparse': 83 | linear_feature_columns[i]['embedding_dim']=3 84 | linear_feature_columns[i]['embeddings_initializer']=Zeros() 85 | 86 | 87 | 88 | linear_emb_list = [input_from_feature_columns(features, linear_feature_columns, l2_reg, seed,prefix=prefix + str(i))[0] for i in range(units)] 89 | _, dense_input_list = input_from_feature_columns(features, linear_feature_columns, l2_reg, seed, prefix=prefix) 90 | 91 | linear_logit_list = [] 92 | for i in range(units): 93 | 94 | if len(linear_emb_list[i]) > 0 and len(dense_input_list) > 0: 95 | sparse_input = 
concat_func(linear_emb_list[i]) 96 | dense_input = concat_func(dense_input_list) 97 | linear_logit = Linear(l2_reg, mode=2, use_bias=use_bias, seed=seed)([sparse_input, dense_input]) 98 | elif len(linear_emb_list[i]) > 0: 99 | sparse_input = concat_func(linear_emb_list[i]) 100 | linear_logit = Linear(l2_reg, mode=0, use_bias=use_bias, seed=seed)(sparse_input) 101 | elif len(dense_input_list) > 0: 102 | dense_input = concat_func(dense_input_list) 103 | linear_logit = Linear(l2_reg, mode=1, use_bias=use_bias, seed=seed)(dense_input) 104 | else: 105 | # raise NotImplementedError 106 | return add_func([]) 107 | linear_logit_list.append(linear_logit) 108 | 109 | return concat_func(linear_logit_list) 110 | 111 | 112 | 113 | def input_from_feature_columns(features, feature_columns, l2_reg, seed, prefix='', seq_mask_zero=True, 114 | support_dense=True, support_group=False): 115 | # feature_columns=linear_feature_columns 116 | # seq_mask_zero=True 117 | # support_dense=True 118 | # support_group=False 119 | 120 | print('KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK') 121 | 122 | print('prefix=',prefix) 123 | sparse_feature_columns=[] 124 | for fc in feature_columns: 125 | if fc['feat_cat'] == 'sparse': 126 | print(fc['feat_cat']) 127 | sparse_feature_columns.append(fc) 128 | 129 | # varlen_sparse_feature_columns = list( 130 | # filter(lambda x: isinstance(x, VarLenSparseFeat), feature_columns)) if feature_columns else [] 131 | 132 | 133 | ''' 134 | {'C1': , 135 | 'C2': } 136 | ''' 137 | 138 | from inputs1 import create_embedding_dict,create_embedding_matrix,get_dense_input 139 | 140 | #embedding_matrix_dict是一个字典,key 是特征的名称,values 是某个特征的Embedding 141 | embedding_matrix_dict = create_embedding_matrix(feature_columns, l2_reg, seed, prefix=prefix, 142 | seq_mask_zero=seq_mask_zero) 143 | from inputs1 import embedding_lookup 144 | #group_sparse_embedding_dict 是每个特征从input层到embedding 层的映射 , 145 | #这是一个列表 146 | group_sparse_embedding_dict = embedding_lookup(embedding_matrix_dict, features, sparse_feature_columns) 147 | 148 | 149 | 150 | #获得dense的输入 151 | dense_value_list = get_dense_input(features, feature_columns) 152 | 153 | 154 | print('TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT') 155 | return group_sparse_embedding_dict, dense_value_list 156 | -------------------------------------------------------------------------------- /deepfm_recomend/inputs.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | 4 | Author: 5 | Weichen Shen,wcshen1994@163.com 6 | 7 | """ 8 | 9 | from collections import defaultdict 10 | from itertools import chain 11 | 12 | from tensorflow.python.keras.layers import Embedding 13 | from tensorflow.python.keras.regularizers import l2 14 | 15 | from layers.sequence import SequencePoolingLayer, WeightedSequenceLayer 16 | from layers.utils import Hash 17 | 18 | 19 | def get_inputs_list(inputs): 20 | return list(chain(*list(map(lambda x: x.values(), filter(lambda x: x is not None, inputs))))) 21 | 22 | 23 | def create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, seed, l2_reg, 24 | prefix='sparse_', seq_mask_zero=True): 25 | sparse_embedding = {} 26 | for feat in sparse_feature_columns: 27 | emb = Embedding(feat.vocabulary_size, feat.embedding_dim, 28 | embeddings_initializer=feat.embeddings_initializer, 29 | embeddings_regularizer=l2(l2_reg), 30 | name=prefix + '_emb_' + feat.embedding_name) 31 | emb.trainable = feat.trainable 32 | 
sparse_embedding[feat.embedding_name] = emb 33 | 34 | if varlen_sparse_feature_columns and len(varlen_sparse_feature_columns) > 0: 35 | for feat in varlen_sparse_feature_columns: 36 | # if feat.name not in sparse_embedding: 37 | emb = Embedding(feat.vocabulary_size, feat.embedding_dim, 38 | embeddings_initializer=feat.embeddings_initializer, 39 | embeddings_regularizer=l2( 40 | l2_reg), 41 | name=prefix + '_seq_emb_' + feat.name, 42 | mask_zero=seq_mask_zero) 43 | emb.trainable = feat.trainable 44 | sparse_embedding[feat.embedding_name] = emb 45 | return sparse_embedding 46 | 47 | 48 | def get_embedding_vec_list(embedding_dict, input_dict, sparse_feature_columns, return_feat_list=(), mask_feat_list=()): 49 | embedding_vec_list = [] 50 | for fg in sparse_feature_columns: 51 | feat_name = fg.name 52 | if len(return_feat_list) == 0 or feat_name in return_feat_list: 53 | if fg.use_hash: 54 | lookup_idx = Hash(fg.vocabulary_size, mask_zero=(feat_name in mask_feat_list))(input_dict[feat_name]) 55 | else: 56 | lookup_idx = input_dict[feat_name] 57 | 58 | embedding_vec_list.append(embedding_dict[feat_name](lookup_idx)) 59 | 60 | return embedding_vec_list 61 | 62 | 63 | def create_embedding_matrix(feature_columns, l2_reg, seed, prefix="", seq_mask_zero=True): 64 | import feature_column as fc_lib 65 | 66 | sparse_feature_columns = list( 67 | filter(lambda x: isinstance(x, fc_lib.SparseFeat), feature_columns)) if feature_columns else [] 68 | varlen_sparse_feature_columns = list( 69 | filter(lambda x: isinstance(x, fc_lib.VarLenSparseFeat), feature_columns)) if feature_columns else [] 70 | sparse_emb_dict = create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, seed, 71 | l2_reg, prefix=prefix + 'sparse', seq_mask_zero=seq_mask_zero) 72 | return sparse_emb_dict 73 | 74 | 75 | def embedding_lookup(sparse_embedding_dict, sparse_input_dict, sparse_feature_columns, return_feat_list=(), 76 | mask_feat_list=(), to_list=False): 77 | group_embedding_dict = defaultdict(list) 78 | for fc in sparse_feature_columns: 79 | feature_name = fc.name 80 | embedding_name = fc.embedding_name 81 | if (len(return_feat_list) == 0 or feature_name in return_feat_list): 82 | if fc.use_hash: 83 | lookup_idx = Hash(fc.vocabulary_size, mask_zero=(feature_name in mask_feat_list))( 84 | sparse_input_dict[feature_name]) 85 | else: 86 | lookup_idx = sparse_input_dict[feature_name] 87 | 88 | group_embedding_dict[fc.group_name].append(sparse_embedding_dict[embedding_name](lookup_idx)) 89 | if to_list: 90 | return list(chain.from_iterable(group_embedding_dict.values())) 91 | return group_embedding_dict 92 | 93 | 94 | def varlen_embedding_lookup(embedding_dict, sequence_input_dict, varlen_sparse_feature_columns): 95 | varlen_embedding_vec_dict = {} 96 | for fc in varlen_sparse_feature_columns: 97 | feature_name = fc.name 98 | embedding_name = fc.embedding_name 99 | if fc.use_hash: 100 | lookup_idx = Hash(fc.vocabulary_size, mask_zero=True)(sequence_input_dict[feature_name]) 101 | else: 102 | lookup_idx = sequence_input_dict[feature_name] 103 | varlen_embedding_vec_dict[feature_name] = embedding_dict[embedding_name](lookup_idx) 104 | return varlen_embedding_vec_dict 105 | 106 | 107 | def get_varlen_pooling_list(embedding_dict, features, varlen_sparse_feature_columns, to_list=False): 108 | pooling_vec_list = defaultdict(list) 109 | for fc in varlen_sparse_feature_columns: 110 | feature_name = fc.name 111 | combiner = fc.combiner 112 | feature_length_name = fc.length_name 113 | if feature_length_name is not None: 114 | 
if fc.weight_name is not None: 115 | seq_input = WeightedSequenceLayer(weight_normalization=fc.weight_norm)( 116 | [embedding_dict[feature_name], features[feature_length_name], features[fc.weight_name]]) 117 | else: 118 | seq_input = embedding_dict[feature_name] 119 | vec = SequencePoolingLayer(combiner, supports_masking=False)( 120 | [seq_input, features[feature_length_name]]) 121 | else: 122 | if fc.weight_name is not None: 123 | seq_input = WeightedSequenceLayer(weight_normalization=fc.weight_norm, supports_masking=True)( 124 | [embedding_dict[feature_name], features[fc.weight_name]]) 125 | else: 126 | seq_input = embedding_dict[feature_name] 127 | vec = SequencePoolingLayer(combiner, supports_masking=True)( 128 | seq_input) 129 | pooling_vec_list[fc.group_name].append(vec) 130 | if to_list: 131 | return chain.from_iterable(pooling_vec_list.values()) 132 | return pooling_vec_list 133 | 134 | 135 | def get_dense_input(features, feature_columns): 136 | import feature_column as fc_lib 137 | dense_feature_columns = list( 138 | filter(lambda x: isinstance(x, fc_lib.DenseFeat), feature_columns)) if feature_columns else [] 139 | dense_input_list = [] 140 | for fc in dense_feature_columns: 141 | dense_input_list.append(features[fc.name]) 142 | return dense_input_list 143 | 144 | 145 | def mergeDict(a, b): 146 | c = defaultdict(list) 147 | for k, v in a.items(): 148 | c[k].extend(v) 149 | for k, v in b.items(): 150 | c[k].extend(v) 151 | return c 152 | -------------------------------------------------------------------------------- /deepfm_recomend/inputs1.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | 4 | Author: 5 | Weichen Shen,wcshen1994@163.com 6 | 7 | """ 8 | 9 | from collections import defaultdict 10 | from itertools import chain 11 | 12 | from tensorflow.python.keras.layers import Embedding 13 | from tensorflow.python.keras.regularizers import l2 14 | 15 | from layers.sequence import SequencePoolingLayer, WeightedSequenceLayer 16 | from layers.utils import Hash 17 | 18 | 19 | 20 | from keras.layers import Embedding 21 | from tensorflow.python.keras.regularizers import l2 22 | def create_embedding_dict(sparse_feature_columns ,seed, l2_reg, 23 | prefix='sparse_', seq_mask_zero=True): 24 | 25 | #将特征进行embedding ,输入维度是某个特征的种类数 26 | sparse_embedding = {} 27 | for feat in sparse_feature_columns: 28 | emb = Embedding(feat.vocabulary_size, feat.embedding_dim, 29 | embeddings_initializer=feat.embeddings_initializer, 30 | embeddings_regularizer=l2(l2_reg), 31 | name=prefix + '_emb_' + feat.embedding_name) 32 | emb.trainable = feat.trainable 33 | sparse_embedding[feat.embedding_name] = emb 34 | 35 | return sparse_embedding 36 | 37 | 38 | 39 | def get_dense_input(features, feature_columns): 40 | # import feature_column as fc_lib 41 | dense_feature_columns=[] 42 | for fc in feature_columns: 43 | if fc['feat_cat'] == 'dense': 44 | dense_feature_columns.append(fc) 45 | dense_input_list = [] 46 | for fc in dense_feature_columns: 47 | dense_input_list.append(features[fc['name']]) 48 | return dense_input_list 49 | 50 | 51 | def create_embedding_matrix(feature_columns, l2_reg, seed, prefix="", seq_mask_zero=True): 52 | 53 | 54 | sparse_feature_columns=[] 55 | for fc in feature_columns: 56 | if fc['feat_cat'] == 'sparse': 57 | sparse_feature_columns.append(fc) 58 | 59 | sparse_emb_dict = create_embedding_dict(sparse_feature_columns, seed, 60 | l2_reg, prefix=prefix + 'sparse', seq_mask_zero=seq_mask_zero) 61 | return sparse_emb_dict 
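# A minimal usage sketch for the two helpers above (hypothetical feature
# spec, not data shipped with this repo). Each feature column is a
# dict-like pandas Series, so attribute access such as
# `feat.vocabulary_size` works inside create_embedding_dict:
#
#   >>> import pandas as pd
#   >>> from tensorflow.python.keras.initializers import RandomNormal
#   >>> feat = pd.Series({'vocabulary_size': 1000, 'embedding_dim': 4,
#   ...                   'embeddings_initializer': RandomNormal(stddev=0.0001),
#   ...                   'embedding_name': 'C1', 'feat_cat': 'sparse',
#   ...                   'trainable': True})
#   >>> emb_dict = create_embedding_dict([feat], seed=1024, l2_reg=0.00001)
#   >>> emb_dict['C1']   # Embedding(1000, 4) named 'sparse__emb_C1'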
62 | 63 | 64 | def embedding_lookup(sparse_embedding_dict, sparse_input_dict, sparse_feature_columns, return_feat_list=(), 65 | mask_feat_list=(), to_list=False): 66 | 67 | # sparse_embedding_dict=embedding_matrix_dict 68 | # sparse_input_dict =features 69 | 70 | # =sparse_feature_columns 71 | group_embedding_dict = [] 72 | for fc in sparse_feature_columns: 73 | feature_name = fc.embedding_name 74 | embedding_name = fc.embedding_name 75 | # if (len(return_feat_list) == 0 or feature_name in return_feat_list): 76 | if fc.use_hash: 77 | lookup_idx = Hash(fc.vocabulary_size, mask_zero=(feature_name in mask_feat_list))( 78 | sparse_input_dict[feature_name]) 79 | else: 80 | 81 | # 模型输入层张量 82 | lookup_idx = sparse_input_dict[feature_name] 83 | # 从输入层到embedding 层的映射 84 | group_embedding_dict.append(sparse_embedding_dict[embedding_name](lookup_idx)) 85 | 86 | return group_embedding_dict 87 | #这里面是从input 到embedding 层的映射 88 | -------------------------------------------------------------------------------- /deepfm_recomend/layers/__init__.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from .activation import Dice 4 | from .core import DNN, LocalActivationUnit, PredictionLayer 5 | from .interaction import (CIN, FM, AFMLayer, BiInteractionPooling, CrossNet, 6 | InnerProductLayer, InteractingLayer, 7 | OutterProductLayer, FGCNNLayer, SENETLayer, BilinearInteraction, 8 | FieldWiseBiInteraction, FwFMLayer) 9 | from .normalization import LayerNormalization 10 | from .sequence import (AttentionSequencePoolingLayer, BiasEncoding, BiLSTM, 11 | KMaxPooling, SequencePoolingLayer,WeightedSequenceLayer, 12 | Transformer, DynamicGRU) 13 | from .utils import NoMask, Hash,Linear,Add,combined_dnn_input 14 | 15 | custom_objects = {'tf': tf, 16 | 'InnerProductLayer': InnerProductLayer, 17 | 'OutterProductLayer': OutterProductLayer, 18 | 'DNN': DNN, 19 | 'PredictionLayer': PredictionLayer, 20 | 'FM': FM, 21 | 'AFMLayer': AFMLayer, 22 | 'CrossNet': CrossNet, 23 | 'BiInteractionPooling': BiInteractionPooling, 24 | 'LocalActivationUnit': LocalActivationUnit, 25 | 'Dice': Dice, 26 | 'SequencePoolingLayer': SequencePoolingLayer, 27 | 'AttentionSequencePoolingLayer': AttentionSequencePoolingLayer, 28 | 'CIN': CIN, 29 | 'InteractingLayer': InteractingLayer, 30 | 'LayerNormalization': LayerNormalization, 31 | 'BiLSTM': BiLSTM, 32 | 'Transformer': Transformer, 33 | 'NoMask': NoMask, 34 | 'BiasEncoding': BiasEncoding, 35 | 'KMaxPooling': KMaxPooling, 36 | 'FGCNNLayer': FGCNNLayer, 37 | 'Hash': Hash, 38 | 'Linear':Linear, 39 | 'DynamicGRU': DynamicGRU, 40 | 'SENETLayer':SENETLayer, 41 | 'BilinearInteraction':BilinearInteraction, 42 | 'WeightedSequenceLayer':WeightedSequenceLayer, 43 | 'Add':Add, 44 | 'FieldWiseBiInteraction':FieldWiseBiInteraction, 45 | 'FwFMLayer': FwFMLayer 46 | } 47 | -------------------------------------------------------------------------------- /deepfm_recomend/layers/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/__pycache__/activation.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/__pycache__/activation.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/__pycache__/core.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/__pycache__/core.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/__pycache__/interaction.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/__pycache__/interaction.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/__pycache__/normalization.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/__pycache__/normalization.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/__pycache__/sequence.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/__pycache__/sequence.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/activation.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | 4 | Author: 5 | Weichen Shen,wcshen1994@163.com 6 | 7 | """ 8 | 9 | import tensorflow as tf 10 | from tensorflow.python.keras.initializers import Zeros 11 | from tensorflow.python.keras.layers import Layer 12 | 13 | try: 14 | unicode 15 | except NameError: 16 | unicode = str 17 | 18 | 19 | class Dice(Layer): 20 | """The Data Adaptive Activation Function in DIN,which can be viewed as a generalization of PReLu and can adaptively adjust the rectified point according to distribution of input data. 21 | 22 | Input shape 23 | - Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model. 24 | 25 | Output shape 26 | - Same shape as the input. 27 | 28 | Arguments 29 | - **axis** : Integer, the axis that should be used to compute data distribution (typically the features axis). 30 | 31 | - **epsilon** : Small float added to variance to avoid dividing by zero. 32 | 33 | References 34 | - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 
ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf) 35 | """ 36 | 37 | def __init__(self, axis=-1, epsilon=1e-9, **kwargs): 38 | self.axis = axis 39 | self.epsilon = epsilon 40 | super(Dice, self).__init__(**kwargs) 41 | 42 | def build(self, input_shape): 43 | self.bn = tf.keras.layers.BatchNormalization( 44 | axis=self.axis, epsilon=self.epsilon, center=False, scale=False) 45 | self.alphas = self.add_weight(shape=(input_shape[-1],), initializer=Zeros( 46 | ), dtype=tf.float32, name='dice_alpha') # name='alpha_'+self.name 47 | super(Dice, self).build(input_shape) # Be sure to call this somewhere! 48 | self.uses_learning_phase = True 49 | 50 | def call(self, inputs, training=None, **kwargs): 51 | inputs_normed = self.bn(inputs, training=training) 52 | # tf.layers.batch_normalization( 53 | # inputs, axis=self.axis, epsilon=self.epsilon, center=False, scale=False) 54 | x_p = tf.sigmoid(inputs_normed) 55 | return self.alphas * (1.0 - x_p) * inputs + x_p * inputs 56 | 57 | def compute_output_shape(self, input_shape): 58 | return input_shape 59 | 60 | def get_config(self, ): 61 | config = {'axis': self.axis, 'epsilon': self.epsilon} 62 | base_config = super(Dice, self).get_config() 63 | return dict(list(base_config.items()) + list(config.items())) 64 | 65 | 66 | def activation_layer(activation): 67 | if activation in ("dice", "Dice"): 68 | act_layer = Dice() 69 | elif isinstance(activation, (str, unicode)): 70 | act_layer = tf.keras.layers.Activation(activation) 71 | elif issubclass(activation, Layer): 72 | act_layer = activation() 73 | else: 74 | raise ValueError( 75 | "Invalid activation,found %s.You should use a str or a Activation Layer Class." % (activation)) 76 | return act_layer 77 | -------------------------------------------------------------------------------- /deepfm_recomend/layers/contrib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/contrib/__init__.py -------------------------------------------------------------------------------- /deepfm_recomend/layers/contrib/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/contrib/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/contrib/__pycache__/rnn_v2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/contrib/__pycache__/rnn_v2.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/contrib/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/layers/contrib/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /deepfm_recomend/layers/contrib/utils.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.ops import array_ops 2 | from tensorflow.python.ops import init_ops 3 | 
from tensorflow.python.ops import math_ops 4 | from tensorflow.python.ops import nn_ops 5 | from tensorflow.python.ops import variable_scope as vs 6 | from tensorflow.python.ops.rnn_cell import * 7 | from tensorflow.python.util import nest 8 | 9 | _BIAS_VARIABLE_NAME = "bias" 10 | 11 | _WEIGHTS_VARIABLE_NAME = "kernel" 12 | 13 | 14 | class _Linear_(object): 15 | """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. 16 | 17 | 18 | 19 | Args: 20 | 21 | args: a 2D Tensor or a list of 2D, batch x n, Tensors. 22 | 23 | output_size: int, second dimension of weight variable. 24 | 25 | dtype: data type for variables. 26 | 27 | build_bias: boolean, whether to build a bias variable. 28 | 29 | bias_initializer: starting value to initialize the bias 30 | 31 | (default is all zeros). 32 | 33 | kernel_initializer: starting value to initialize the weight. 34 | 35 | 36 | 37 | Raises: 38 | 39 | ValueError: if inputs_shape is wrong. 40 | 41 | """ 42 | 43 | def __init__(self, 44 | 45 | args, 46 | 47 | output_size, 48 | 49 | build_bias, 50 | 51 | bias_initializer=None, 52 | 53 | kernel_initializer=None): 54 | 55 | self._build_bias = build_bias 56 | 57 | if args is None or (nest.is_sequence(args) and not args): 58 | raise ValueError("`args` must be specified") 59 | 60 | if not nest.is_sequence(args): 61 | 62 | args = [args] 63 | 64 | self._is_sequence = False 65 | 66 | else: 67 | 68 | self._is_sequence = True 69 | 70 | # Calculate the total size of arguments on dimension 1. 71 | 72 | total_arg_size = 0 73 | 74 | shapes = [a.get_shape() for a in args] 75 | 76 | for shape in shapes: 77 | 78 | if shape.ndims != 2: 79 | raise ValueError( 80 | "linear is expecting 2D arguments: %s" % shapes) 81 | 82 | if shape[1] is None: 83 | 84 | raise ValueError("linear expects shape[1] to be provided for shape %s, " 85 | 86 | "but saw %s" % (shape, shape[1])) 87 | 88 | else: 89 | 90 | total_arg_size += int(shape[1])#.value 91 | 92 | dtype = [a.dtype for a in args][0] 93 | 94 | scope = vs.get_variable_scope() 95 | 96 | with vs.variable_scope(scope) as outer_scope: 97 | 98 | self._weights = vs.get_variable( 99 | 100 | _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size], 101 | 102 | dtype=dtype, 103 | 104 | initializer=kernel_initializer) 105 | 106 | if build_bias: 107 | 108 | with vs.variable_scope(outer_scope) as inner_scope: 109 | 110 | inner_scope.set_partitioner(None) 111 | 112 | if bias_initializer is None: 113 | bias_initializer = init_ops.constant_initializer( 114 | 0.0, dtype=dtype) 115 | 116 | self._biases = vs.get_variable( 117 | 118 | _BIAS_VARIABLE_NAME, [output_size], 119 | 120 | dtype=dtype, 121 | 122 | initializer=bias_initializer) 123 | 124 | def __call__(self, args): 125 | 126 | if not self._is_sequence: 127 | args = [args] 128 | 129 | if len(args) == 1: 130 | 131 | res = math_ops.matmul(args[0], self._weights) 132 | 133 | else: 134 | 135 | res = math_ops.matmul(array_ops.concat(args, 1), self._weights) 136 | 137 | if self._build_bias: 138 | res = nn_ops.bias_add(res, self._biases) 139 | 140 | return res 141 | 142 | 143 | try: 144 | from tensorflow.python.ops.rnn_cell_impl import _Linear 145 | except: 146 | _Linear = _Linear_ 147 | 148 | 149 | class QAAttGRUCell(RNNCell): 150 | """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). 151 | 152 | Args: 153 | 154 | num_units: int, The number of units in the GRU cell. 155 | 156 | activation: Nonlinearity to use. Default: `tanh`. 157 | 158 | reuse: (optional) Python boolean describing whether to reuse variables 159 | 160 | in an existing scope. 
If not `True`, and the existing scope already has 161 | 162 | the given variables, an error is raised. 163 | 164 | kernel_initializer: (optional) The initializer to use for the weight and 165 | 166 | projection matrices. 167 | 168 | bias_initializer: (optional) The initializer to use for the bias. 169 | 170 | """ 171 | 172 | def __init__(self, 173 | 174 | num_units, 175 | 176 | activation=None, 177 | 178 | reuse=None, 179 | 180 | kernel_initializer=None, 181 | 182 | bias_initializer=None): 183 | 184 | super(QAAttGRUCell, self).__init__(_reuse=reuse) 185 | 186 | self._num_units = num_units 187 | 188 | self._activation = activation or math_ops.tanh 189 | 190 | self._kernel_initializer = kernel_initializer 191 | 192 | self._bias_initializer = bias_initializer 193 | 194 | self._gate_linear = None 195 | 196 | self._candidate_linear = None 197 | 198 | @property 199 | def state_size(self): 200 | 201 | return self._num_units 202 | 203 | @property 204 | def output_size(self): 205 | 206 | return self._num_units 207 | 208 | def __call__(self, inputs, state, att_score): 209 | 210 | return self.call(inputs, state, att_score) 211 | 212 | def call(self, inputs, state, att_score=None): 213 | """Gated recurrent unit (GRU) with nunits cells.""" 214 | 215 | if self._gate_linear is None: 216 | 217 | bias_ones = self._bias_initializer 218 | 219 | if self._bias_initializer is None: 220 | bias_ones = init_ops.constant_initializer( 221 | 1.0, dtype=inputs.dtype) 222 | 223 | with vs.variable_scope("gates"): # Reset gate and update gate. 224 | 225 | self._gate_linear = _Linear( 226 | 227 | [inputs, state], 228 | 229 | 2 * self._num_units, 230 | 231 | True, 232 | 233 | bias_initializer=bias_ones, 234 | 235 | kernel_initializer=self._kernel_initializer) 236 | 237 | value = math_ops.sigmoid(self._gate_linear([inputs, state])) 238 | 239 | r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1) 240 | 241 | r_state = r * state 242 | 243 | if self._candidate_linear is None: 244 | with vs.variable_scope("candidate"): 245 | self._candidate_linear = _Linear( 246 | 247 | [inputs, r_state], 248 | 249 | self._num_units, 250 | 251 | True, 252 | 253 | bias_initializer=self._bias_initializer, 254 | 255 | kernel_initializer=self._kernel_initializer) 256 | 257 | c = self._activation(self._candidate_linear([inputs, r_state])) 258 | 259 | new_h = (1. - att_score) * state + att_score * c 260 | 261 | return new_h, new_h 262 | 263 | 264 | class VecAttGRUCell(RNNCell): 265 | """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). 266 | 267 | Args: 268 | 269 | num_units: int, The number of units in the GRU cell. 270 | 271 | activation: Nonlinearity to use. Default: `tanh`. 272 | 273 | reuse: (optional) Python boolean describing whether to reuse variables 274 | 275 | in an existing scope. If not `True`, and the existing scope already has 276 | 277 | the given variables, an error is raised. 278 | 279 | kernel_initializer: (optional) The initializer to use for the weight and 280 | 281 | projection matrices. 282 | 283 | bias_initializer: (optional) The initializer to use for the bias. 
284 | 285 | """ 286 | 287 | def __init__(self, 288 | 289 | num_units, 290 | 291 | activation=None, 292 | 293 | reuse=None, 294 | 295 | kernel_initializer=None, 296 | 297 | bias_initializer=None): 298 | 299 | super(VecAttGRUCell, self).__init__(_reuse=reuse) 300 | 301 | self._num_units = num_units 302 | 303 | self._activation = activation or math_ops.tanh 304 | 305 | self._kernel_initializer = kernel_initializer 306 | 307 | self._bias_initializer = bias_initializer 308 | 309 | self._gate_linear = None 310 | 311 | self._candidate_linear = None 312 | 313 | @property 314 | def state_size(self): 315 | 316 | return self._num_units 317 | 318 | @property 319 | def output_size(self): 320 | 321 | return self._num_units 322 | 323 | def __call__(self, inputs, state, att_score): 324 | 325 | return self.call(inputs, state, att_score) 326 | 327 | def call(self, inputs, state, att_score=None): 328 | """Gated recurrent unit (GRU) with nunits cells.""" 329 | 330 | if self._gate_linear is None: 331 | 332 | bias_ones = self._bias_initializer 333 | 334 | if self._bias_initializer is None: 335 | bias_ones = init_ops.constant_initializer( 336 | 1.0, dtype=inputs.dtype) 337 | 338 | with vs.variable_scope("gates"): # Reset gate and update gate. 339 | 340 | self._gate_linear = _Linear( 341 | 342 | [inputs, state], 343 | 344 | 2 * self._num_units, 345 | 346 | True, 347 | 348 | bias_initializer=bias_ones, 349 | 350 | kernel_initializer=self._kernel_initializer) 351 | 352 | value = math_ops.sigmoid(self._gate_linear([inputs, state])) 353 | 354 | r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1) 355 | 356 | r_state = r * state 357 | 358 | if self._candidate_linear is None: 359 | with vs.variable_scope("candidate"): 360 | self._candidate_linear = _Linear( 361 | 362 | [inputs, r_state], 363 | 364 | self._num_units, 365 | 366 | True, 367 | 368 | bias_initializer=self._bias_initializer, 369 | 370 | kernel_initializer=self._kernel_initializer) 371 | 372 | c = self._activation(self._candidate_linear([inputs, r_state])) 373 | 374 | u = (1.0 - att_score) * u 375 | 376 | new_h = u * state + (1 - u) * c 377 | 378 | return new_h, new_h 379 | -------------------------------------------------------------------------------- /deepfm_recomend/layers/core.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | 4 | Author: 5 | Weichen Shen,wcshen1994@163.com 6 | 7 | """ 8 | 9 | import tensorflow as tf 10 | from tensorflow.python.keras import backend as K 11 | from tensorflow.python.keras.initializers import Zeros, glorot_normal 12 | from tensorflow.python.keras.layers import Layer 13 | from tensorflow.python.keras.regularizers import l2 14 | 15 | from .activation import activation_layer 16 | 17 | 18 | class LocalActivationUnit(Layer): 19 | """The LocalActivationUnit used in DIN with which the representation of 20 | user interests varies adaptively given different candidate items. 21 | 22 | Input shape 23 | - A list of two 3D tensor with shape: ``(batch_size, 1, embedding_size)`` and ``(batch_size, T, embedding_size)`` 24 | 25 | Output shape 26 | - 3D tensor with shape: ``(batch_size, T, 1)``. 27 | 28 | Arguments 29 | - **hidden_units**:list of positive integer, the attention net layer number and units in each layer. 30 | 31 | - **activation**: Activation function to use in attention net. 32 | 33 | - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix of attention net. 
34 | 35 | - **dropout_rate**: float in [0,1). Fraction of the units to dropout in attention net. 36 | 37 | - **use_bn**: bool. Whether use BatchNormalization before activation or not in attention net. 38 | 39 | - **seed**: A Python integer to use as random seed. 40 | 41 | References 42 | - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf) 43 | """ 44 | 45 | def __init__(self, hidden_units=(64, 32), activation='sigmoid', l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, 46 | **kwargs): 47 | self.hidden_units = hidden_units 48 | self.activation = activation 49 | self.l2_reg = l2_reg 50 | self.dropout_rate = dropout_rate 51 | self.use_bn = use_bn 52 | self.seed = seed 53 | super(LocalActivationUnit, self).__init__(**kwargs) 54 | self.supports_masking = True 55 | 56 | def build(self, input_shape): 57 | 58 | if not isinstance(input_shape, list) or len(input_shape) != 2: 59 | raise ValueError('A `LocalActivationUnit` layer should be called ' 60 | 'on a list of 2 inputs') 61 | 62 | if len(input_shape[0]) != 3 or len(input_shape[1]) != 3: 63 | raise ValueError("Unexpected inputs dimensions %d and %d, expect to be 3 dimensions" % ( 64 | len(input_shape[0]), len(input_shape[1]))) 65 | 66 | if input_shape[0][-1] != input_shape[1][-1] or input_shape[0][1] != 1: 67 | raise ValueError('A `LocalActivationUnit` layer requires ' 68 | 'inputs of a two inputs with shape (None,1,embedding_size) and (None,T,embedding_size)' 69 | 'Got different shapes: %s,%s' % (input_shape[0], input_shape[1])) 70 | size = 4 * \ 71 | int(input_shape[0][-1] 72 | ) if len(self.hidden_units) == 0 else self.hidden_units[-1] 73 | self.kernel = self.add_weight(shape=(size, 1), 74 | initializer=glorot_normal( 75 | seed=self.seed), 76 | name="kernel") 77 | self.bias = self.add_weight( 78 | shape=(1,), initializer=Zeros(), name="bias") 79 | self.dnn = DNN(self.hidden_units, self.activation, self.l2_reg, 80 | self.dropout_rate, self.use_bn, seed=self.seed) 81 | 82 | self.dense = tf.keras.layers.Lambda(lambda x: tf.nn.bias_add(tf.tensordot( 83 | x[0], x[1], axes=(-1, 0)), x[2])) 84 | 85 | super(LocalActivationUnit, self).build( 86 | input_shape) # Be sure to call this somewhere! 87 | 88 | def call(self, inputs, training=None, **kwargs): 89 | 90 | query, keys = inputs 91 | 92 | keys_len = keys.get_shape()[1] 93 | queries = K.repeat_elements(query, keys_len, 1) 94 | 95 | att_input = tf.concat( 96 | [queries, keys, queries - keys, queries * keys], axis=-1) 97 | 98 | att_out = self.dnn(att_input, training=training) 99 | 100 | attention_score = self.dense([att_out, self.kernel, self.bias]) 101 | 102 | return attention_score 103 | 104 | def compute_output_shape(self, input_shape): 105 | return input_shape[1][:2] + (1,) 106 | 107 | def compute_mask(self, inputs, mask): 108 | return mask 109 | 110 | def get_config(self, ): 111 | config = {'activation': self.activation, 'hidden_units': self.hidden_units, 112 | 'l2_reg': self.l2_reg, 'dropout_rate': self.dropout_rate, 'use_bn': self.use_bn, 'seed': self.seed} 113 | base_config = super(LocalActivationUnit, self).get_config() 114 | return dict(list(base_config.items()) + list(config.items())) 115 | 116 | 117 | class DNN(Layer): 118 | """The Multi Layer Percetron 119 | 120 | Input shape 121 | - nD tensor with shape: ``(batch_size, ..., input_dim)``. 
The most common situation would be a 2D input with shape ``(batch_size, input_dim)``. 122 | 123 | Output shape 124 | - nD tensor with shape: ``(batch_size, ..., hidden_size[-1])``. For instance, for a 2D input with shape ``(batch_size, input_dim)``, the output would have shape ``(batch_size, hidden_size[-1])``. 125 | 126 | Arguments 127 | - **hidden_units**:list of positive integer, the layer number and units in each layer. 128 | 129 | - **activation**: Activation function to use. 130 | 131 | - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix. 132 | 133 | - **dropout_rate**: float in [0,1). Fraction of the units to dropout. 134 | 135 | - **use_bn**: bool. Whether use BatchNormalization before activation or not. 136 | 137 | - **seed**: A Python integer to use as random seed. 138 | """ 139 | 140 | def __init__(self, hidden_units, activation='relu', l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, **kwargs): 141 | self.hidden_units = hidden_units 142 | self.activation = activation 143 | self.dropout_rate = dropout_rate 144 | self.seed = seed 145 | self.l2_reg = l2_reg 146 | self.use_bn = use_bn 147 | super(DNN, self).__init__(**kwargs) 148 | 149 | def build(self, input_shape): 150 | # if len(self.hidden_units) == 0: 151 | # raise ValueError("hidden_units is empty") 152 | input_size = input_shape[-1] 153 | hidden_units = [int(input_size)] + list(self.hidden_units) 154 | self.kernels = [self.add_weight(name='kernel' + str(i), 155 | shape=( 156 | hidden_units[i], hidden_units[i + 1]), 157 | initializer=glorot_normal( 158 | seed=self.seed), 159 | regularizer=l2(self.l2_reg), 160 | trainable=True) for i in range(len(self.hidden_units))] 161 | 162 | print(self.kernels) 163 | 164 | # return self.kernels 165 | self.bias = [self.add_weight(name='bias' + str(i), 166 | shape=(self.hidden_units[i],), 167 | initializer=Zeros(), 168 | trainable=True) for i in range(len(self.hidden_units))] 169 | if self.use_bn: 170 | self.bn_layers = [tf.keras.layers.BatchNormalization() for _ in range(len(self.hidden_units))] 171 | 172 | self.dropout_layers = [tf.keras.layers.Dropout(self.dropout_rate, seed=self.seed + i) for i in 173 | range(len(self.hidden_units))] 174 | 175 | self.activation_layers = [activation_layer(self.activation) for _ in range(len(self.hidden_units))] 176 | 177 | super(DNN, self).build(input_shape) # Be sure to call this somewhere! 
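# Worked shape example for build()/call() (numbers taken from the Criteo
# setup in run_classification_criteo.py, not hard-coded here): the DNN input
# is 26 sparse embeddings of size 4 plus 13 dense values, so
# input_dim = 26*4 + 13 = 117. With hidden_units=(128, 128), build() creates
#   kernels[0]: (117, 128), bias[0]: (128,)
#   kernels[1]: (128, 128), bias[1]: (128,)
# and call() below maps (batch, 117) -> (batch, 128) -> (batch, 128).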
178 | 179 | def call(self, inputs, training=None, **kwargs): 180 | 181 | deep_input = inputs 182 | 183 | print(self.kernels) 184 | for i in range(len(self.hidden_units)): 185 | 186 | 187 | fc = tf.nn.bias_add(tf.tensordot( 188 | # tf.tensordot 表示矩阵相乘 189 | #比如说 Amn *Bnp axes -1 表示 A的n ,0 表示B 的n 190 | deep_input, self.kernels[i], axes=(-1, 0)), self.bias[i]) 191 | 192 | 193 | # fc = Dense(self.hidden_size[i], activation=None, \ 194 | # kernel_initializer=glorot_normal(seed=self.seed), \ 195 | # kernel_regularizer=l2(self.l2_reg))(deep_input) 196 | if self.use_bn: 197 | fc = self.bn_layers[i](fc, training=training) 198 | 199 | fc = self.activation_layers[i](fc) 200 | 201 | fc = self.dropout_layers[i](fc, training=training) 202 | deep_input = fc 203 | 204 | return deep_input 205 | 206 | def compute_output_shape(self, input_shape): 207 | if len(self.hidden_units) > 0: 208 | shape = input_shape[:-1] + (self.hidden_units[-1],) 209 | else: 210 | shape = input_shape 211 | 212 | return tuple(shape) 213 | 214 | def get_config(self, ): 215 | config = {'activation': self.activation, 'hidden_units': self.hidden_units, 216 | 'l2_reg': self.l2_reg, 'use_bn': self.use_bn, 'dropout_rate': self.dropout_rate, 'seed': self.seed} 217 | base_config = super(DNN, self).get_config() 218 | return dict(list(base_config.items()) + list(config.items())) 219 | 220 | 221 | class PredictionLayer(Layer): 222 | """ 223 | Arguments 224 | - **task**: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss 225 | 226 | - **use_bias**: bool.Whether add bias term or not. 227 | """ 228 | 229 | def __init__(self, task='binary', use_bias=True, **kwargs): 230 | if task not in ["binary", "multiclass", "regression"]: 231 | raise ValueError("task must be binary,multiclass or regression") 232 | self.task = task 233 | self.use_bias = use_bias 234 | super(PredictionLayer, self).__init__(**kwargs) 235 | 236 | def build(self, input_shape): 237 | 238 | if self.use_bias: 239 | self.global_bias = self.add_weight( 240 | shape=(1,), initializer=Zeros(), name="global_bias") 241 | 242 | # Be sure to call this somewhere! 
243 | super(PredictionLayer, self).build(input_shape) 244 | 245 | def call(self, inputs, **kwargs): 246 | x = inputs 247 | if self.use_bias: 248 | x = tf.nn.bias_add(x, self.global_bias, data_format='NHWC') 249 | if self.task == "binary": 250 | x = tf.sigmoid(x) 251 | 252 | output = tf.reshape(x, (-1, 1)) 253 | 254 | return output 255 | 256 | def compute_output_shape(self, input_shape): 257 | return (None, 1) 258 | 259 | def get_config(self, ): 260 | config = {'task': self.task, 'use_bias': self.use_bias} 261 | base_config = super(PredictionLayer, self).get_config() 262 | return dict(list(base_config.items()) + list(config.items())) 263 | 264 | 265 | -------------------------------------------------------------------------------- /deepfm_recomend/layers/normalization.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | 4 | Author: 5 | Weichen Shen,wcshen1994@163.com 6 | 7 | """ 8 | 9 | from tensorflow.python.keras import backend as K 10 | from tensorflow.python.keras.initializers import Ones, Zeros 11 | from tensorflow.python.keras.layers import Layer 12 | 13 | 14 | class LayerNormalization(Layer): 15 | def __init__(self, axis=-1, eps=1e-9, center=True, 16 | scale=True, **kwargs): 17 | self.axis = axis 18 | self.eps = eps 19 | self.center = center 20 | self.scale = scale 21 | super(LayerNormalization, self).__init__(**kwargs) 22 | 23 | def build(self, input_shape): 24 | self.gamma = self.add_weight(name='gamma', shape=input_shape[-1:], 25 | initializer=Ones(), trainable=True) 26 | self.beta = self.add_weight(name='beta', shape=input_shape[-1:], 27 | initializer=Zeros(), trainable=True) 28 | super(LayerNormalization, self).build(input_shape) 29 | 30 | def call(self, inputs): 31 | mean = K.mean(inputs, axis=self.axis, keepdims=True) 32 | variance = K.mean(K.square(inputs - mean), axis=-1, keepdims=True) 33 | std = K.sqrt(variance + self.eps) 34 | outputs = (inputs - mean) / std 35 | if self.scale: 36 | outputs *= self.gamma 37 | if self.center: 38 | outputs += self.beta 39 | return outputs 40 | 41 | def compute_output_shape(self, input_shape): 42 | return input_shape 43 | 44 | def get_config(self, ): 45 | config = {'axis': self.axis, 'eps': self.eps, 'center': self.center, 'scale': self.scale} 46 | base_config = super(LayerNormalization, self).get_config() 47 | return dict(list(base_config.items()) + list(config.items())) 48 | -------------------------------------------------------------------------------- /deepfm_recomend/layers/untitled17.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Oct 16 16:54:00 2020 5 | 6 | @author: ledi 7 | """ 8 | 9 | import tensorflow as tf 10 | from tensorflow.python.keras import backend as K 11 | from tensorflow.python.keras.initializers import Zeros, glorot_normal 12 | from tensorflow.python.keras.layers import Layer 13 | from tensorflow.python.keras.regularizers import l2 14 | from keras.layers import Activation 15 | 16 | 17 | class my_dense(Layer): 18 | 19 | 20 | def __init__(self, hidden_units, activation='relu', l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, **kwargs): 21 | self.hidden_units = hidden_units 22 | self.activation = activation 23 | self.dropout_rate = dropout_rate 24 | self.seed = seed 25 | self.l2_reg = l2_reg 26 | self.use_bn = use_bn 27 | super().__init__(**kwargs) 28 | 29 | def build(self, input_shape): 30 | # if len(self.hidden_units) == 0: 31 | # 
raise ValueError("hidden_units is empty") 32 | input_size = input_shape[-1] 33 | # hidden_units = [int(input_size)] + list(self.hidden_units) 34 | self.kernels = self.add_weight(name='kernel' , 35 | shape=(input_size, hidden_units), 36 | initializer=glorot_normal( 37 | seed=self.seed), 38 | regularizer=l2(self.l2_reg), 39 | trainable=True ) 40 | 41 | print(self.kernels) 42 | 43 | # return self.kernels 44 | self.bias = [self.add_weight(name='bias' + str(i), 45 | shape=(self.hidden_units[i],), 46 | initializer=Zeros(), 47 | trainable=True) for i in range(len(self.hidden_units))] 48 | if self.use_bn: 49 | self.bn_layers = [tf.keras.layers.BatchNormalization() for _ in range(len(self.hidden_units))] 50 | 51 | self.dropout_layers = [tf.keras.layers.Dropout(self.dropout_rate, seed=self.seed + i) for i in 52 | range(len(self.hidden_units))] 53 | 54 | self.activation_layers = [activation_layer(self.activation) for _ in range(len(self.hidden_units))] 55 | 56 | super().build(input_shape) # Be sure to call this somewhere! 57 | 58 | def call(self, inputs, training=None, **kwargs): 59 | 60 | deep_input = inputs 61 | 62 | print(self.kernels) 63 | for i in range(len(self.hidden_units)): 64 | fc = tf.nn.bias_add(tf.tensordot( 65 | deep_input, self.kernels[i], axes=(-1, 0)), self.bias[i]) 66 | 67 | 68 | # fc = Dense(self.hidden_size[i], activation=None, \ 69 | # kernel_initializer=glorot_normal(seed=self.seed), \ 70 | # kernel_regularizer=l2(self.l2_reg))(deep_input) 71 | if self.use_bn: 72 | fc = self.bn_layers[i](fc, training=training) 73 | 74 | fc = self.activation_layers[i](fc) 75 | 76 | fc = self.dropout_layers[i](fc, training=training) 77 | deep_input = fc 78 | 79 | return deep_input 80 | 81 | def compute_output_shape(self, input_shape): 82 | if len(self.hidden_units) > 0: 83 | shape = input_shape[:-1] + (self.hidden_units[-1],) 84 | else: 85 | shape = input_shape 86 | 87 | return tuple(shape) 88 | 89 | def get_config(self, ): 90 | config = {'activation': self.activation, 'hidden_units': self.hidden_units, 91 | 'l2_reg': self.l2_reg, 'use_bn': self.use_bn, 'dropout_rate': self.dropout_rate, 'seed': self.seed} 92 | base_config = super(DNN, self).get_config() 93 | return dict(list(base_config.items()) + list(config.items())) 94 | -------------------------------------------------------------------------------- /deepfm_recomend/layers/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | 4 | Author: 5 | Weichen Shen,wcshen1994@163.com 6 | 7 | """ 8 | import tensorflow as tf 9 | from tensorflow.python.keras.layers import Flatten 10 | 11 | 12 | class NoMask(tf.keras.layers.Layer): 13 | def __init__(self, **kwargs): 14 | super(NoMask, self).__init__(**kwargs) 15 | 16 | def build(self, input_shape): 17 | # Be sure to call this somewhere! 18 | super(NoMask, self).build(input_shape) 19 | 20 | def call(self, x, mask=None, **kwargs): 21 | return x 22 | 23 | def compute_mask(self, inputs, mask): 24 | return None 25 | 26 | 27 | class Hash(tf.keras.layers.Layer): 28 | """ 29 | hash the input to [0,num_buckets) 30 | if mask_zero = True,0 or 0.0 will be set to 0,other value will be set in range[1,num_buckets) 31 | """ 32 | 33 | def __init__(self, num_buckets, mask_zero=False, **kwargs): 34 | self.num_buckets = num_buckets 35 | self.mask_zero = mask_zero 36 | super(Hash, self).__init__(**kwargs) 37 | 38 | def build(self, input_shape): 39 | # Be sure to call this somewhere! 
40 | super(Hash, self).build(input_shape) 41 | 42 | def call(self, x, mask=None, **kwargs): 43 | 44 | 45 | if x.dtype != tf.string: 46 | zero = tf.as_string(tf.zeros([1], dtype=x.dtype)) 47 | x = tf.as_string(x, ) 48 | else: 49 | zero = tf.as_string(tf.zeros([1], dtype='int32')) 50 | 51 | num_buckets = self.num_buckets if not self.mask_zero else self.num_buckets - 1 52 | try: 53 | hash_x = tf.string_to_hash_bucket_fast(x, num_buckets, 54 | name=None) # weak hash 55 | except: 56 | hash_x = tf.strings.to_hash_bucket_fast(x, num_buckets, 57 | name=None) # weak hash 58 | if self.mask_zero: 59 | mask = tf.cast(tf.not_equal(x, zero), dtype='int64') 60 | hash_x = (hash_x + 1) * mask 61 | 62 | return hash_x 63 | def get_config(self, ): 64 | config = {'num_buckets': self.num_buckets, 'mask_zero': self.mask_zero, } 65 | base_config = super(Hash, self).get_config() 66 | return dict(list(base_config.items()) + list(config.items())) 67 | 68 | 69 | class Linear(tf.keras.layers.Layer): 70 | 71 | def __init__(self, l2_reg=0.0, mode=0, use_bias=False, seed=1024, **kwargs): 72 | 73 | self.l2_reg = l2_reg 74 | # self.l2_reg = tf.contrib.layers.l2_regularizer(float(l2_reg_linear)) 75 | if mode not in [0, 1, 2]: 76 | raise ValueError("mode must be 0,1 or 2") 77 | self.mode = mode 78 | self.use_bias = use_bias 79 | self.seed = seed 80 | super(Linear, self).__init__(**kwargs) 81 | 82 | def build(self, input_shape): 83 | 84 | print('input_shape=',input_shape) 85 | if self.use_bias: 86 | self.bias = self.add_weight(name='linear_bias', 87 | shape=(1,), 88 | initializer=tf.keras.initializers.Zeros(), 89 | trainable=True) 90 | if self.mode == 1: 91 | self.kernel = self.add_weight( 92 | 'linear_kernel', 93 | shape=[int(input_shape[-1]), 1], 94 | initializer=tf.keras.initializers.glorot_normal(self.seed), 95 | regularizer=tf.keras.regularizers.l2(self.l2_reg), 96 | trainable=True) 97 | elif self.mode == 2: 98 | 99 | 100 | #在deepfm 中 101 | #模式二有两个输入[sparse_input, dense_input] 102 | #input_shape= [TensorShape([None, 1, 26]), TensorShape([None, 13])] 103 | #这里的 kernel的shape 是 13*1 104 | self.kernel = self.add_weight( 105 | 'linear_kernel', 106 | shape=[int(input_shape[1][-1]), 1], 107 | initializer=tf.keras.initializers.glorot_normal(self.seed), 108 | regularizer=tf.keras.regularizers.l2(self.l2_reg), 109 | trainable=True) 110 | 111 | super(Linear, self).build(input_shape) # Be sure to call this somewhere! 
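# Mode summary for call() below, using the shapes quoted in the DeepFM
# comment inside build() (sparse_input (None, 1, 26), dense_input (None, 13)):
#   mode 0: sparse only -> reduce_sum over the last axis, (None, 1, 26) -> (None, 1, 1)
#   mode 1: dense only  -> dense_input @ kernel, (None, 13) x (13, 1) -> (None, 1)
#   mode 2: both        -> matmul for the dense part plus reduce_sum for the
#                          sparse part, and the two partial logits are added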
112 | 113 | def call(self, inputs, **kwargs): 114 | if self.mode == 0: 115 | sparse_input = inputs 116 | linear_logit = reduce_sum(sparse_input, axis=-1, keep_dims=True) 117 | elif self.mode == 1: 118 | dense_input = inputs 119 | # fc = tf.tensordot(dense_input, self.kernel, axes=(-1, 0)) 120 | print(dense_input) 121 | 122 | fc =tf.matmul(dense_input,self.kernel) 123 | linear_logit = fc 124 | else: 125 | sparse_input, dense_input = inputs 126 | 127 | print('dense_input',dense_input) 128 | # fc = tf.tensordot(dense_input, self.kernel, axes=(-1, 0)) 129 | 130 | #相乘之后N*13的矩阵与13*1的矩阵相乘,变成N*1就是一个数值 131 | fc = tf.matmul(dense_input,self.kernel) 132 | print('fc=',fc) 133 | sum_sparse=reduce_sum(sparse_input, axis=-1, keep_dims=False) 134 | 135 | print('sum_sparse=',sum_sparse) 136 | #sum_sparse也是一个数值,两个数相加 137 | linear_logit = sum_sparse + fc 138 | if self.use_bias: 139 | linear_logit += self.bias 140 | 141 | return linear_logit 142 | 143 | def compute_output_shape(self, input_shape): 144 | return (None, 1) 145 | 146 | def compute_mask(self, inputs, mask): 147 | return None 148 | 149 | def get_config(self, ): 150 | config = {'mode': self.mode, 'l2_reg': self.l2_reg, 'use_bias': self.use_bias, 'seed': self.seed} 151 | base_config = super(Linear, self).get_config() 152 | return dict(list(base_config.items()) + list(config.items())) 153 | 154 | 155 | def concat_func(inputs, axis=-1, mask=False): 156 | if not mask: 157 | inputs = list(map(NoMask(), inputs)) 158 | if len(inputs) == 1: 159 | return inputs[0] 160 | else: 161 | return tf.keras.layers.Concatenate(axis=axis)(inputs) 162 | 163 | 164 | def reduce_mean(input_tensor, 165 | axis=None, 166 | keep_dims=False, 167 | name=None, 168 | reduction_indices=None): 169 | try: 170 | return tf.reduce_mean(input_tensor, 171 | axis=axis, 172 | keep_dims=keep_dims, 173 | name=name, 174 | reduction_indices=reduction_indices) 175 | except TypeError: 176 | return tf.reduce_mean(input_tensor, 177 | axis=axis, 178 | keepdims=keep_dims, 179 | name=name) 180 | 181 | 182 | def reduce_sum(input_tensor, 183 | axis=None, 184 | keep_dims=False, 185 | name=None, 186 | reduction_indices=None): 187 | try: 188 | return tf.reduce_sum(input_tensor, 189 | axis=axis, 190 | keep_dims=keep_dims, 191 | name=name, 192 | reduction_indices=reduction_indices) 193 | except TypeError: 194 | return tf.reduce_sum(input_tensor, 195 | axis=axis, 196 | keepdims=keep_dims, 197 | name=name) 198 | 199 | 200 | def reduce_max(input_tensor, 201 | axis=None, 202 | keep_dims=False, 203 | name=None, 204 | reduction_indices=None): 205 | try: 206 | return tf.reduce_max(input_tensor, 207 | axis=axis, 208 | keep_dims=keep_dims, 209 | name=name, 210 | reduction_indices=reduction_indices) 211 | except TypeError: 212 | return tf.reduce_max(input_tensor, 213 | axis=axis, 214 | keepdims=keep_dims, 215 | name=name) 216 | 217 | 218 | def div(x, y, name=None): 219 | try: 220 | return tf.div(x, y, name=name) 221 | except AttributeError: 222 | return tf.divide(x, y, name=name) 223 | 224 | 225 | def softmax(logits, dim=-1, name=None): 226 | try: 227 | return tf.nn.softmax(logits, dim=dim, name=name) 228 | except TypeError: 229 | return tf.nn.softmax(logits, axis=dim, name=name) 230 | 231 | 232 | class Add(tf.keras.layers.Layer): 233 | def __init__(self, **kwargs): 234 | super(Add, self).__init__(**kwargs) 235 | 236 | def build(self, input_shape): 237 | # Be sure to call this somewhere! 
238 | super(Add, self).build(input_shape) 239 | 240 | def call(self, inputs, **kwargs): 241 | if not isinstance(inputs, list): 242 | return inputs 243 | if len(inputs) == 1: 244 | return inputs[0] 245 | if len(inputs) == 0: 246 | return tf.constant([[0.0]]) 247 | 248 | return tf.keras.layers.add(inputs) 249 | 250 | 251 | def add_func(inputs): 252 | return Add()(inputs) 253 | 254 | 255 | def combined_dnn_input(sparse_embedding_list, dense_value_list): 256 | if len(sparse_embedding_list) > 0 and len(dense_value_list) > 0: 257 | sparse_dnn_input = Flatten()(concat_func(sparse_embedding_list)) 258 | dense_dnn_input = Flatten()(concat_func(dense_value_list)) 259 | return concat_func([sparse_dnn_input, dense_dnn_input]) 260 | elif len(sparse_embedding_list) > 0: 261 | return Flatten()(concat_func(sparse_embedding_list)) 262 | elif len(dense_value_list) > 0: 263 | return Flatten()(concat_func(dense_value_list)) 264 | else: 265 | raise NotImplementedError("dnn_feature_columns can not be empty list") 266 | -------------------------------------------------------------------------------- /deepfm_recomend/run_classification_criteo.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.metrics import log_loss, roc_auc_score 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.preprocessing import LabelEncoder, MinMaxScaler 5 | 6 | from deepfm import DeepFM 7 | from feature_column import SparseFeat, DenseFeat, get_feature_names 8 | 9 | # if __name__ == "__main__": 10 | data = pd.read_csv('./criteo_sample.txt') 11 | 12 | sparse_features = ['C' + str(i) for i in range(1, 27)] 13 | dense_features = ['I' + str(i) for i in range(1, 14)] 14 | 15 | data[sparse_features] = data[sparse_features].fillna('-1', ) 16 | data[dense_features] = data[dense_features].fillna(0, ) 17 | target = ['label'] 18 | 19 | # 1.Label Encoding for sparse features,and do simple Transformation for dense features 20 | for feat in sparse_features: 21 | lbe = LabelEncoder() 22 | data[feat] = lbe.fit_transform(data[feat]) 23 | mms = MinMaxScaler(feature_range=(0, 1)) 24 | data[dense_features] = mms.fit_transform(data[dense_features]) 25 | 26 | # 2.count #unique features for each sparse field,and record dense feature field name 27 | 28 | fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=4 ) 29 | for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,) 30 | for feat in dense_features] 31 | 32 | dnn_feature_columns = fixlen_feature_columns 33 | linear_feature_columns = fixlen_feature_columns 34 | 35 | feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) 36 | 37 | # 3.generate input data for model 38 | 39 | train, test = train_test_split(data, test_size=0.2, random_state=2020) 40 | train_model_input = {name:train[name] for name in feature_names} 41 | test_model_input = {name:test[name] for name in feature_names} 42 | 43 | # 4.Define Model,train,predict and evaluate 44 | model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary') 45 | model.compile("adam", "binary_crossentropy", 46 | metrics=['binary_crossentropy'], ) 47 | 48 | history = model.fit(train_model_input, train[target].values, 49 | batch_size=256, epochs=10, verbose=2, validation_split=0.2, ) 50 | pred_ans = model.predict(test_model_input, batch_size=256) 51 | print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4)) 52 | print("test AUC", round(roc_auc_score(test[target].values, 
pred_ans), 4)) 53 | -------------------------------------------------------------------------------- /deepfm_recomend/temp/deepfm.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | Author: 4 | Weichen Shen,wcshen1994@163.com 5 | 6 | Reference: 7 | [1] Guo H, Tang R, Ye Y, et al. Deepfm: a factorization-machine based neural network for ctr prediction[J]. arXiv preprint arXiv:1703.04247, 2017.(https://arxiv.org/abs/1703.04247) 8 | 9 | """ 10 | 11 | from itertools import chain 12 | 13 | import tensorflow as tf 14 | 15 | from feature_column import build_input_features, get_linear_logit, DEFAULT_GROUP_NAME, input_from_feature_columns 16 | from layers.core import PredictionLayer, DNN 17 | from layers.interaction import FM 18 | from layers.utils import concat_func, add_func, combined_dnn_input 19 | 20 | 21 | def DeepFM(linear_feature_columns, dnn_feature_columns, fm_group=[DEFAULT_GROUP_NAME], dnn_hidden_units=(128, 128), 22 | l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0, seed=1024, dnn_dropout=0, 23 | dnn_activation='relu', dnn_use_bn=False, task='binary'): 24 | """Instantiates the DeepFM Network architecture. 25 | 26 | :param linear_feature_columns: An iterable containing all the features used by linear part of the model. 27 | :param dnn_feature_columns: An iterable containing all the features used by deep part of the model. 28 | :param fm_group: list, group_name of features that will be used to do feature interactions. 29 | :param dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of DNN 30 | :param l2_reg_linear: float. L2 regularizer strength applied to linear part 31 | :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector 32 | :param l2_reg_dnn: float. L2 regularizer strength applied to DNN 33 | :param seed: integer ,to use as random seed. 34 | :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate. 35 | :param dnn_activation: Activation function to use in DNN 36 | :param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in DNN 37 | :param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss 38 | :return: A Keras model instance. 
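    The final logit is the sum of three parts (linear_logit + fm_logit +
    dnn_logit) and is passed through PredictionLayer, i.e. a sigmoid when
    task='binary'.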
39 | """ 40 | 41 | #构建模型的输入张量 42 | features = build_input_features( 43 | linear_feature_columns +dnn_feature_columns) 44 | 45 | print("#"*10) 46 | print(features) 47 | inputs_list = list(features.values()) 48 | 49 | 50 | # 构建线性张量 51 | linear_logit = get_linear_logit(features, linear_feature_columns, seed=seed, prefix='linear', 52 | l2_reg=l2_reg_linear) 53 | 54 | group_embedding_dict, dense_value_list = input_from_feature_columns(features, dnn_feature_columns, l2_reg_embedding, 55 | seed, support_group=True) 56 | 57 | 58 | print('group_embedding_dict',group_embedding_dict) 59 | print('dense_value_list',dense_value_list) 60 | fm_logit = add_func([FM()(concat_func(v, axis=1)) 61 | for k, v in group_embedding_dict.items() if k in fm_group]) 62 | 63 | dnn_input = combined_dnn_input(list(chain.from_iterable( 64 | group_embedding_dict.values())), dense_value_list) 65 | dnn_output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, 66 | dnn_use_bn, seed)(dnn_input) 67 | dnn_logit = tf.keras.layers.Dense( 68 | 1, use_bias=False, kernel_initializer=tf.keras.initializers.glorot_normal(seed=seed))(dnn_output) 69 | 70 | final_logit = add_func([linear_logit, fm_logit, dnn_logit]) 71 | 72 | output = PredictionLayer(task)(final_logit) 73 | model = tf.keras.models.Model(inputs=inputs_list, outputs=output) 74 | return model 75 | -------------------------------------------------------------------------------- /deepfm_recomend/xdeepfm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/deepfm_recomend/xdeepfm.png -------------------------------------------------------------------------------- /deepfm_recomend/xdeepfm_main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Oct 13 19:50:43 2020 5 | 6 | @author: ledi 7 | """ 8 | 9 | 10 | 11 | import pandas as pd 12 | from sklearn.metrics import log_loss, roc_auc_score 13 | from sklearn.model_selection import train_test_split 14 | from sklearn.preprocessing import LabelEncoder, MinMaxScaler 15 | # from feature_column import build_input_features, get_linear_logit, DEFAULT_GROUP_NAME, input_from_feature_columns 16 | from layers.core import PredictionLayer, DNN 17 | from layers.interaction import FM,CIN 18 | from layers.utils import concat_func, add_func, combined_dnn_input 19 | # from deepfm import DeepFM 20 | 21 | from keras.layers import Dense 22 | # from feature_column import SparseFeat, DenseFeat, get_feature_names 23 | 24 | # if __name__ == "__main__": 25 | data = pd.read_csv('./criteo_sample.txt') 26 | 27 | 28 | #离散的特征名称 29 | sparse_features = ['C' + str(i) for i in range(1, 27)] 30 | 31 | #数值的特征名称 32 | dense_features = ['I' + str(i) for i in range(1, 14)] 33 | 34 | #对缺失的特征进行填充 35 | data[sparse_features] = data[sparse_features].fillna('-1', ) 36 | data[dense_features] = data[dense_features].fillna(0, ) 37 | target = ['label'] 38 | 39 | 40 | #数据预处理 41 | # 1.Label Encoding for sparse features,and do simple Transformation for dense features 42 | #对离散特征进行编码 43 | for feat in sparse_features: 44 | lbe = LabelEncoder() 45 | data[feat] = lbe.fit_transform(data[feat]) 46 | #数值特征进行最大最小归一化 47 | mms = MinMaxScaler(feature_range=(0, 1)) 48 | data[dense_features] = mms.fit_transform(data[dense_features]) 49 | 50 | 51 | 52 | #feature 是特征处理模块 53 | from feature import Operate_Feat1,get_feature_names 54 | 55 | 56 | 
d=Operate_Feat1() 57 | 58 | 59 | 60 | sparse_list=[] 61 | for p in sparse_features: 62 | d1=d.operate_sparse(data[p], p) 63 | sparse_list.append(d1.copy()) 64 | 65 | dense_list=[] 66 | for q in dense_features: 67 | d2=d.operate_dense(q) 68 | print(d2) 69 | dense_list.append(d2.copy()) 70 | 71 | 72 | # fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=4 ) 73 | # for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,) 74 | # for feat in dense_features] 75 | 76 | merge_list=sparse_list+dense_list 77 | dnn_feature_columns = merge_list 78 | linear_feature_columns = merge_list 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | from feature import DEFAULT_GROUP_NAME,build_input_features 88 | 89 | def xDeepFM(linear_feature_columns, dnn_feature_columns, dnn_hidden_units=(256, 256), 90 | cin_layer_size=(128, 128,), cin_split_half=True, cin_activation='relu', l2_reg_linear=0.00001, 91 | l2_reg_embedding=0.00001, l2_reg_dnn=0, l2_reg_cin=0, seed=1024, dnn_dropout=0, 92 | dnn_activation='relu', dnn_use_bn=False, task='binary'): 93 | 94 | 95 | 96 | # dnn_hidden_units=(256, 256) 97 | # cin_layer_size=(128, 128,) 98 | # cin_split_half=True 99 | # cin_activation='relu' 100 | # l2_reg_linear=0.00001 101 | # l2_reg_embedding=0.00001 102 | # l2_reg_dnn=0 103 | # l2_reg_cin=0 104 | # seed=1024 105 | # dnn_dropout=0 106 | # dnn_activation='relu' 107 | # dnn_use_bn=False 108 | # task='binary' 109 | 110 | features = build_input_features( 111 | linear_feature_columns + dnn_feature_columns) 112 | 113 | inputs_list = list(features.values()) 114 | from feature import get_linear_logit 115 | linear_logit = get_linear_logit(features, linear_feature_columns, seed=seed, prefix='linear', 116 | l2_reg=l2_reg_linear) 117 | from feature import input_from_feature_columns 118 | sparse_embedding_list, dense_value_list = input_from_feature_columns(features, dnn_feature_columns, 119 | l2_reg_embedding, seed) 120 | 121 | fm_input = concat_func(sparse_embedding_list, axis=1) 122 | 123 | dnn_input = combined_dnn_input(sparse_embedding_list, dense_value_list) 124 | dnn_output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, dnn_use_bn, seed=seed)(dnn_input) 125 | 126 | 127 | import keras 128 | import tensorflow as tf 129 | dnn_logit = tf.keras.layers.Dense( 130 | 1, use_bias=False, kernel_initializer=tf.keras.initializers.glorot_normal(seed))(dnn_output) 131 | 132 | final_logit = add_func([linear_logit, dnn_logit]) 133 | 134 | if len(cin_layer_size) > 0: 135 | exFM_out = CIN(cin_layer_size, cin_activation, 136 | cin_split_half, l2_reg_cin, seed)(fm_input) 137 | exFM_logit = tf.keras.layers.Dense(1, kernel_initializer=tf.keras.initializers.glorot_normal(seed))(exFM_out) 138 | final_logit = add_func([final_logit, exFM_logit]) 139 | 140 | output = PredictionLayer(task)(final_logit) 141 | 142 | model = tf.keras.models.Model(inputs=inputs_list, outputs=output) 143 | return model 144 | 145 | 146 | 147 | 148 | model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary') 149 | 150 | feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) 151 | 152 | # 3.generate input data for model 153 | 154 | train, test = train_test_split(data, test_size=0.2, random_state=2020) 155 | train_model_input = {name:train[name] for name in feature_names} 156 | test_model_input = {name:test[name] for name in feature_names} 157 | 158 | # 4.Define Model,train,predict and evaluate 159 | # model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary') 160 
| model.compile("adam", "binary_crossentropy", 161 | metrics=['binary_crossentropy'], ) 162 | 163 | history = model.fit(train_model_input, train[target].values, 164 | batch_size=256, epochs=10, verbose=2, validation_split=0.2, ) 165 | pred_ans = model.predict(test_model_input, batch_size=256) 166 | print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4)) 167 | print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4)) 168 | -------------------------------------------------------------------------------- /ffm/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.npy 3 | *.pyc -------------------------------------------------------------------------------- /ffm/ffm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Date : 3/2/18 3 | # @Author : zhangchaoyang 4 | 5 | import numpy as np 6 | 7 | np.random.seed(0) 8 | import math 9 | from logistic import Logistic 10 | 11 | 12 | class FFM_Node(object): 13 | ''' 14 | x is usually a high-dimensional sparse vector, so it is stored as a list of nodes; each node is a 3-tuple (j, f, v) 15 | ''' 16 | __slots__ = ['j', 'f', 'v'] # store the members tuple-style instead of in a per-instance dict 17 | 18 | def __init__(self, j, f, v): 19 | ''' 20 | :param j: Feature index (0 to n-1) 21 | :param f: Field index (0 to m-1) 22 | :param v: value 23 | ''' 24 | self.j = j 25 | self.f = f 26 | self.v = v 27 | 28 | 29 | class FFM(object): 30 | def __init__(self, m, n, k, eta, lambd): 31 | ''' 32 | :param m: Number of fields 33 | :param n: Number of features 34 | :param k: Number of latent factors 35 | :param eta: learning rate 36 | :param lambd: regularization coefficient 37 | ''' 38 | self.m = m 39 | self.n = n 40 | self.k = k 41 | # hyperparameters 42 | self.eta = eta 43 | self.lambd = lambd 44 | # initialize the 3-D weight tensor w ~ U(0, 1/sqrt(k)) 45 | self.w = np.random.rand(n, m, k) / math.sqrt(k) 46 | # initialize the accumulated squared gradients to 1; AdaGrad needs them, and this avoids division by zero 47 | self.G = np.ones(shape=(n, m, k), dtype=np.float64) 48 | self.log = Logistic() 49 | 50 | def phi(self, node_list): 51 | ''' 52 | weighted sum over all pairwise feature interactions 53 | :param node_list: the non-zero entries of x, stored as a node list 54 | :return: 55 | ''' 56 | z = 0.0 57 | for a in range(len(node_list)): 58 | node1 = node_list[a] 59 | j1 = node1.j 60 | f1 = node1.f 61 | v1 = node1.v 62 | for b in range(a + 1, len(node_list)): 63 | node2 = node_list[b] 64 | j2 = node2.j 65 | f2 = node2.f 66 | v2 = node2.v 67 | w1 = self.w[j1, f2] 68 | w2 = self.w[j2, f1] 69 | z += np.dot(w1, w2) * v1 * v2 70 | return z 71 | 
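# Worked example (annotation; the sample values come from ffm/train.txt below): for an x
# with two active nodes (j=0,f=0,v=2.9) and (j=4,f=1,v=12.4), phi() adds the single term
# np.dot(w[0,1], w[4,0]) * 2.9 * 12.4 -- each feature j1 is paired with the *field* of its
# partner, which is exactly what makes this FM field-aware.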
72 | def predict(self, node_list): 73 | ''' 74 | predict y for the input x 75 | :param node_list: the non-zero entries of x, stored as a node list 76 | :return: 77 | ''' 78 | z = self.phi(node_list) 79 | y = self.log.decide_by_tanh(z) 80 | return y 81 | 82 | def sgd(self, node_list, y): 83 | ''' 84 | update the model parameters from a single sample 85 | :param node_list: the non-zero entries of x, stored as a node list 86 | :param y: 1 for a positive sample, -1 for a negative one 87 | :return: 88 | ''' 89 | kappa = -y / (1 + math.exp(y * self.phi(node_list))) 90 | for a in range(len(node_list)): 91 | node1 = node_list[a] 92 | j1 = node1.j 93 | f1 = node1.f 94 | v1 = node1.v 95 | for b in range(a + 1, len(node_list)): 96 | node2 = node_list[b] 97 | j2 = node2.j 98 | f2 = node2.f 99 | v2 = node2.v 100 | c = kappa * v1 * v2 101 | # self.w[j1,f2] and self.w[j2,f1] are vectors, so g_j1_f2 and g_j2_f1 are vectors too 102 | g_j1_f2 = self.lambd * self.w[j1, f2] + c * self.w[j2, f1] 103 | g_j2_f1 = self.lambd * self.w[j2, f1] + c * self.w[j1, f2] 104 | # accumulate the squared gradient of every dimension 105 | self.G[j1, f2] += g_j1_f2 ** 2 # every G stays strictly positive, because G is initialized to all ones 106 | self.G[j2, f1] += g_j2_f1 ** 2 107 | # AdaGrad 108 | self.w[j1, f2] -= self.eta / np.sqrt(self.G[j1, f2]) * g_j1_f2 # sqrt(G) is the denominator, so G must stay positive 109 | self.w[j2, f1] -= self.eta / np.sqrt( 110 | self.G[j2, f1]) * g_j2_f1 # math.sqrt() only accepts a single number, while numpy.sqrt() accepts an array and takes the square root elementwise 111 | 112 | def train(self, sample_generator, max_echo, max_r2): 113 | ''' 114 | train the model from a stream of samples 115 | :param sample_generator: sample generator that yields (node_list, y), where node_list holds the non-zero entries of x; x should usually be normalized to unit length beforehand, which makes the fit slightly more accurate 116 | :param max_echo: maximum number of epochs 117 | :param max_r2: stop learning once the coefficient of determination r2 reaches this threshold 118 | :return: 119 | ''' 120 | for itr in range(max_echo): 121 | print("echo", itr) 122 | y_sum = 0.0 123 | y_square_sum = 0.0 124 | err_square_sum = 0.0 # sum of squared errors 125 | population = 0 # number of samples 126 | for node_list, y in sample_generator: 127 | y = 0.0 if y == -1 else y # the true y is in {-1,1} while the prediction lies in (0,1); unify them before measuring the fit 128 | self.sgd(node_list, y) 129 | y_hat = self.predict(node_list) 130 | y_sum += y 131 | y_square_sum += y ** 2 132 | err_square_sum += (y - y_hat) ** 2 133 | population += 1 134 | var_y = y_square_sum - y_sum * y_sum / population # (unnormalized) variance of y 135 | r2 = 1 - err_square_sum / var_y 136 | print ("r2=",r2) 137 | if r2 > max_r2: # the larger r2 is, the better the fit 138 | print('r2 has reached', r2) 139 | break 140 | 141 | def save_model(self, outfile): 142 | ''' 143 | serialize the model 144 | :param outfile: 145 | :return: 146 | ''' 147 | np.save(outfile, self.w) 148 | 149 | def load_model(self, infile): 150 | ''' 151 | load the model 152 | :param infile: 153 | :return: 154 | ''' 155 | self.w = np.load(infile) 156 | -------------------------------------------------------------------------------- /ffm/ffm_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Date : 3/2/18 3 | # @Author : zhangchaoyang 4 | 5 | import math 6 | from ffm import FFM_Node, FFM 7 | import re 8 | 9 | 10 | class Sample(object): 11 | def __init__(self, infile): 12 | self.infile = infile 13 | self.regex = re.compile("\\s+") 14 | 15 | def __iter__(self): 16 | with open(self.infile, 'r') as f_in: 17 | for line in f_in: 18 | arr = self.regex.split(line.strip()) 19 | if len(arr) >= 2: 20 | y = float(arr[0]) 21 | assert math.fabs(y) == 1 22 | node_list = [] 23 | square_sum = 0.0 24 | for i in range(1, len(arr)): 25 | brr = arr[i].split(",") 26 | if len(brr) == 3: 27 | j = int(brr[0]) 28 | f = int(brr[1]) 29 | v = float(brr[2]) 30 | square_sum += v * v 31 | node_list.append(FFM_Node(j, f, v)) 32 | if square_sum > 0: 33 | norm = math.sqrt(square_sum) 34 | # rescale the vector to unit length 35 | normed_node_list = [FFM_Node(ele.j, ele.f, ele.v / norm) for ele in node_list] 36 | yield (normed_node_list, y) 37 | 38 | 39 | if __name__ == '__main__': 40 | n = 5 41 | m = 2 42 | k = 2 43 | train_file = "train.txt" 44 | valid_file = "valid.txt" 45 | model_file = "ffm.npy" 46 | # hyperparameters 47 | eta = 0.01 48 | lambd = 1e-2 49 | max_echo = 30 50 | max_r2 = 0.9 51 | 52 | # train the model and save its parameters 53 | sample_generator = Sample(train_file) 54 | ffm = FFM(m, n, k, eta, lambd) 55 | ffm.train(sample_generator, max_echo, max_r2) 56 | ffm.save_model(model_file) 57 | 58 | # load the model and measure the fit on the validation set 59 | ffm.load_model(model_file) 60 | valid_generator = Sample(valid_file) 61 | y_sum = 0.0 62 | y_square_sum = 0.0 63 | err_square_sum = 0.0 # sum of squared errors 64 | population = 0 # number of samples 65 | for node_list, y in valid_generator: 66 | y = 0.0 if y == -1 else y # the true y is in {-1,1} while the prediction lies in (0,1); unify them before measuring the fit 67 | y_hat = ffm.predict(node_list) 68 | y_sum += y 69 | y_square_sum += y ** 2 70 | err_square_sum += (y - y_hat) ** 2 71 | population += 1 72 | var_y = y_square_sum - y_sum * y_sum / population # (unnormalized) variance of y 73 | r2 = 1 - err_square_sum / var_y 74 | print("r2 on validation set is", r2) 75 | 
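# Data format note (annotation, inferred from the Sample parser above and ffm/train.txt):
# each line is "<label> <j,f,v> <j,f,v> ...", label in {-1, 1}; e.g. "-1 0,0,2.9 4,1,12.4"
# means feature 0 in field 0 with value 2.9 and feature 4 in field 1 with value 12.4.
# Each vector is L2-normalized before being yielded.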
("r2 on validation set is", r2) 75 | -------------------------------------------------------------------------------- /ffm/logistic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Date : 3/2/18 3 | # @Author : zhangchaoyang 4 | 5 | import numpy as np 6 | import math 7 | from singleton import Singleton 8 | 9 | 10 | class Logistic(object): 11 | __metaclass__ = Singleton # 单例 12 | 13 | def __init__(self): 14 | exp_max = 10.0 15 | self.exp_scale = 0.001 16 | self.exp_intv = int(exp_max / self.exp_scale) 17 | self.exp_table = [0.0] * self.exp_intv 18 | for i in range(self.exp_intv): 19 | x = self.exp_scale * i 20 | exp = math.exp(x) 21 | self.exp_table[i] = exp / (1.0 + exp) 22 | 23 | def decide_by_table(self, x): 24 | '''查表获得logistic的函数值''' 25 | if x == 0: 26 | return 0.5 27 | i = int(np.nan_to_num(abs(x) / self.exp_scale)) 28 | y = self.exp_table[min(i, self.exp_intv - 1)] 29 | if x > 0: 30 | return y 31 | else: 32 | return 1.0 - y 33 | 34 | def decide_by_tanh(self, x): 35 | '''直接使用1.0 / (1.0 + np.exp(-x))容易发警告“RuntimeWarning: overflowencountered in exp”, 36 | 转换成如下等价形式后算法会更稳定 37 | ''' 38 | return 0.5 * (1 + np.tanh(0.5 * x)) 39 | 40 | def decide(self, x): 41 | '''原始的sigmoid函数''' 42 | return 1.0 / (1.0 + np.exp(-x)) 43 | 44 | 45 | if __name__ == '__main__': 46 | log = Logistic() 47 | for x in np.arange(-20, 20, 0.1): # xrange()中的step不能是小数,所以只好手numpy.arange() 48 | y = log.decide(x) 49 | print( x, y, log.decide_by_tanh(x) - y, log.decide_by_table(x) - y) 50 | -------------------------------------------------------------------------------- /ffm/readme.md: -------------------------------------------------------------------------------- 1 | # Field-aware Factorization Machines 2 | 公式推导见[http://www.cnblogs.com/zhangchaoyang/articles/8410719.html](http://www.cnblogs.com/zhangchaoyang/articles/8410719.html) -------------------------------------------------------------------------------- /ffm/singleton.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Date : 3/2/18 3 | # @Author : zhangchaoyang 4 | 5 | class Singleton(type): 6 | def __init__(cls, class_name,base_classes, attr_dict): 7 | cls.__instance = None 8 | super(Singleton, cls).__init__( class_name,base_classes, attr_dict) 9 | 10 | def __call__(cls, *args, **kwargs): 11 | if cls.__instance is None: 12 | cls.__instance = super(Singleton, cls).__call__(*args, **kwargs) 13 | return cls.__instance 14 | else: 15 | return cls.__instance 16 | -------------------------------------------------------------------------------- /ffm/train.txt: -------------------------------------------------------------------------------- 1 | -1 0,0,2.9 4,1,12.4 2 | 1 1,0,5.7 3,1,0.03 3 | -1 2,0,4.7 4,1,9.4 -------------------------------------------------------------------------------- /ffm/valid.txt: -------------------------------------------------------------------------------- 1 | 1 2,0,4 3,1,2.1 2 | 1 1,0,5.7 4,1,5 3 | -1 0,0,6 4,1,9.4 -------------------------------------------------------------------------------- /gbdt_source/GBDTReg.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | __author__ = 'luchi.lc' 3 | import numpy as np 4 | 5 | """ 6 | date:29/6/2017 7 | usage:构造GBDT树并用其生成数据新的特征向量 8 | """ 9 | class GBDT(object): 10 | 11 | def __init__(self,config): 12 | 13 | 14 | 15 | self.learningRate = config.learningRate #learning_rate 16 | 
class GBDT(object): 10 | 11 | def __init__(self,config): 12 | 13 | 14 | 15 | self.learningRate = config.learningRate #learning_rate 16 | self.maxTreeLength=config.maxTreeLength # maximum tree depth 17 | self.maxLeafCount=config.maxLeafCount # maximum number of leaves 18 | self.maxTreeNum=config.maxTreeNum # number of trees 19 | self.tree=[] 20 | 21 | # compute the square loss 22 | def calculateSquareLoss(self,residual): 23 | """ 24 | :param residual: the gradient residuals 25 | :return: the overall residual (square loss) 26 | """ 27 | 28 | # if all residuals in this batch are identical, the loss is 0 29 | mean = np.mean(residual) 30 | sumError = np.sum([(value-mean)**2 for value in residual]) 31 | return sumError 32 | 33 | def splitTree(self,x_train,residualGradient,treeHeight): 34 | """ 35 | 36 | :param x_train: the training data 37 | :param residualGradient: the gradient residuals to fit at this step 38 | :param treeHeight: the current tree height 39 | :return: the built GBDT (sub)tree 40 | """ 41 | size = len(x_train) # number of samples 42 | dim = len(x_train[0]) # feature dimension 43 | # convention: the left subtree is <=, the right subtree is > 44 | bestSplitPointDim=-1 45 | bestSplitPointValue=-1 46 | # the loss before this split 47 | curLoss = self.calculateSquareLoss(residualGradient) 48 | minLossValue=curLoss 49 | # stop recursing once the maximum depth is reached 50 | if treeHeight==self.maxTreeLength: 51 | 52 | return np.mean(residualGradient) # leaf value: the mean residual (returning the loss here would lose the sign) 53 | tree=dict([]) 54 | # iterate over every feature dimension 55 | for i in range(dim): 56 | # iterate over every sample 57 | for j in range(size): 58 | # take x_train[j,i] as the candidate split point 59 | splitNum = x_train[j,i] 60 | leftSubTree=[] 61 | rightSubTree=[] 62 | # split the data in two on feature i at splitNum 63 | for k in range(size): 64 | tmpNum=x_train[k,i] 65 | if tmpNum<=splitNum: 66 | leftSubTree.append(residualGradient[k]) 67 | else: 68 | rightSubTree.append(residualGradient[k]) 69 | sumLoss=0.0 70 | # sum the losses of the left and right children; minimizing this sum picks the split feature and value 71 | sumLoss+=self.calculateSquareLoss(np.array(leftSubTree)) 72 | sumLoss+=self.calculateSquareLoss(np.array(rightSubTree)) 73 | if sumLoss<minLossValue: 74 | bestSplitPointDim=i 75 | bestSplitPointValue=splitNum 76 | minLossValue=sumLoss 77 | 78 | # if no candidate split lowered the loss, make this node a leaf 79 | if minLossValue==curLoss: 80 | return np.mean(residualGradient) 81 | else: 82 | 83 | # split the data at the best split point found 84 | leftSplit=[(x_train[i],residualGradient[i]) for i in range(size) 85 | if x_train[i,bestSplitPointDim]<=bestSplitPointValue ] # left subtree 86 | 87 | rightSplit=[(x_train[i],residualGradient[i]) for i in range(size) 88 | if x_train[i,bestSplitPointDim]>bestSplitPointValue ] # right subtree 89 | 90 | # print(leftSplit) 91 | newLeftSubTree = list(zip(*leftSplit))[0] # X of the left subtree 92 | newLeftResidual = list(zip(*leftSplit))[1] # y (residuals) of the left subtree 93 | leftTree = self.splitTree(np.array(newLeftSubTree),newLeftResidual,treeHeight+1) 94 | 95 | newRightSubTree = list(zip(*rightSplit))[0] 96 | newRightResidual =list(zip(*rightSplit))[1] 97 | rightTree = self.splitTree(np.array(newRightSubTree),newRightResidual,treeHeight+1) 98 | 99 | tree[(bestSplitPointDim,bestSplitPointValue)]=[leftTree,rightTree] 100 | 101 | print(tree) 102 | return tree 103 | 104 | # count the leaf nodes of a tree 105 | def getTreeLeafNodeNum(self,tree): 106 | size=0 107 | if type(tree) is not dict: 108 | return 1 109 | for item in tree.items(): 110 | 111 | print(item) 112 | 113 | print('#'*10) 114 | subLeftTree,subRightTree=item[1] 115 | if type(subLeftTree) is dict: 116 | size+=self.getTreeLeafNodeNum(subLeftTree) 117 | else: 118 | size+=1 119 | 120 | if type(subRightTree) is dict: 121 | size+=self.getTreeLeafNodeNum(subRightTree) 122 | else: 123 | size+=1 124 | return size 125 | 126 | # find which leaf a sample falls into, and count the leaf nodes to its left 127 | def scanTree(self,curTree,singleX,treeLeafNodeNum): 128 | """ 129 | 130 | :param curTree: the current tree 131 | :param singleX: the sample fed into the decision tree 132 | :param treeLeafNodeNum: the number of leaf nodes of the tree 133 | :return: the value of the leaf this sample falls into, and its transformed feature vector for this tree 134 | """ 135 | 136 | self.xValue=0 137 | xFeature=[0]*treeLeafNodeNum 138 | self.leftZeroNum=0 139 | def scan(curTree,singleX): 140 | 141 | for item in curTree.items(): 142 | splitDim,splitValue=item[0] 143 | subLeftTree,subRightTree=item[1] 144 | if singleX[splitDim]<=splitValue: 145 | if type(subLeftTree) is dict: 146 | scan(subLeftTree,singleX) 147 | else: 148 | self.xValue=subLeftTree 149 | return 150 | else: 151 | self.leftZeroNum+=self.getTreeLeafNodeNum(subLeftTree) 152 | if type(subRightTree) is dict: 153 | scan(subRightTree,singleX) 154 | else: 155 | self.xValue=subRightTree 156 | return 157 | 
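# scan() walks this sample down the tree, leaving the reached leaf's value in
# self.xValue and the number of leaf nodes to its left in self.leftZeroNum; the
# one-hot write below then marks that leaf -- this per-tree indicator vector is
# the transformed feature handed to the LR.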
scan(curTree,singleX) 158 | xFeature[self.leftZeroNum]=1 159 | return self.xValue,xFeature 160 | 161 | # sigmoid function 162 | def sigmoid(self,x): 163 | return 1.0/(1+np.exp(-1*x)) 164 | # build the GBDT 165 | def buildGbdt(self,x_train,y_train): 166 | # number of samples 167 | size = len(x_train) 168 | dim = len(x_train[0]) 169 | x_train=np.array(x_train) 170 | y_train=np.array(y_train) 171 | x_train_feature=[] 172 | 173 | # initialize the first tree (all-zero predictions) 174 | treePreviousValue=0*y_train 175 | treeValues=[] 176 | treeValues.append(treePreviousValue) 177 | 178 | curValue = self.sigmoid(0*y_train) 179 | dataFeatures=[] 180 | for i in range(self.maxTreeNum): 181 | print("the tree %i-th"%i) 182 | residualGradient = -1*self.learningRate*(curValue-y_train) 183 | curTree = self.splitTree(x_train,residualGradient,1) 184 | self.tree.append(curTree) 185 | # print (curTree) 186 | # update the gradient residuals 187 | curTreeLeafNodeNum = self.getTreeLeafNodeNum(curTree) 188 | curTreeValue=[] 189 | for singleX in x_train: 190 | xValue,xFeature = self.scanTree(curTree,singleX,curTreeLeafNodeNum) 191 | curTreeValue.append(xValue) 192 | 193 | treePreviousValue=np.array(curTreeValue)+treePreviousValue 194 | curValue=self.sigmoid(treePreviousValue) 195 | # print (y_train) 196 | # print("curValue") 197 | # print( curValue) 198 | 199 | # build the feature vectors of the input data from the fitted trees 200 | def generateFeatures(self,x_train): 201 | dataFeatures=[] 202 | for curTree in self.tree: 203 | curFeatures=[] 204 | curTreeLeafNodeNum = self.getTreeLeafNodeNum(curTree) 205 | # print ("tree leaf node is %i"%(curTreeLeafNodeNum)) 206 | for singleX in x_train: 207 | _,xFeature = self.scanTree(curTree,singleX,curTreeLeafNodeNum) 208 | curFeatures.append(xFeature) 209 | 210 | if len(dataFeatures)==0: 211 | dataFeatures=np.array(curFeatures) 212 | 213 | else: 214 | dataFeatures=np.concatenate([dataFeatures,curFeatures],axis=1) 215 | 216 | # print('#'*100) 217 | # print(len(curFeatures[0]),len(dataFeatures[0])) 218 | # print('data_feature=',dataFeatures,len(dataFeatures),len(dataFeatures[0])) 219 | # print('curFeatures=',curFeatures) 220 | return dataFeatures 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | -------------------------------------------------------------------------------- /gbdt_source/README.txt: -------------------------------------------------------------------------------- 1 | GBDTReg.py is the GBDT model file 2 | gbdt_demo.py trains the GBDT, uses it to generate feature vectors, and feeds the transformed features to train and test an LR 3 | testGBDT.py tests how the number of trees affects the GBDT result -------------------------------------------------------------------------------- /gbdt_source/__pycache__/GBDTReg.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/gbdt_source/__pycache__/GBDTReg.cpython-37.pyc -------------------------------------------------------------------------------- /gbdt_source/gbdt_demo.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | __author__ = 'luchi.lc' 3 | 4 | """ 5 | date:29/6/2017 6 | usage: train the GBDT and use it to transform the data into new feature vectors for training a Logistic Regression 7 | """ 8 | 9 | from sklearn.datasets import make_classification 10 | from sklearn.model_selection import train_test_split 11 | from GBDTReg import GBDT 12 | from sklearn.linear_model import LogisticRegression 13 | import numpy as np 14 | 15 | class Config(object): 16 | learningRate=0.1 17 | maxTreeLength=5 18 | maxLeafCount=30 19 | maxTreeNum=50 20 | 21 | def generate_data(): 22 | X, y = make_classification(n_samples=1000) 23 | # generate train/test data 24 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) 25 | # for the training data, the first half trains the GBDT and the second half trains the LR 26 | X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train, y_train, test_size=0.5) 27 | return X_train, X_train_lr, y_train, y_train_lr,X_test, y_test 28 | 
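# A quick dimensionality check (annotation; it follows from GBDTReg above): with
# maxTreeNum=50 trees capped at depth maxTreeLength=5, each tree contributes
# getTreeLeafNodeNum(tree) <= 2**(maxTreeLength-1) = 16 one-hot columns, so
# generateFeatures() returns an (n_samples, sum-of-leaf-counts) 0/1 matrix with
# exactly one hot index per tree -- the LR input below.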
29 | 30 | 31 | 32 | def main(): 33 | X_train, X_train_lr, y_train, y_train_lr,X_test, y_test=generate_data() 34 | config=Config() 35 | gbdt=GBDT(config=config) 36 | gbdt.buildGbdt(X_train,y_train) 37 | trainDataFeatures=gbdt.generateFeatures(X_train_lr) 38 | testDataFeatures=gbdt.generateFeatures(X_test) 39 | print (len(trainDataFeatures[0])) 40 | lrModel = LogisticRegression() 41 | lrModel.fit(trainDataFeatures,y_train_lr) 42 | #test model 43 | testLabel = lrModel.predict(testDataFeatures) 44 | accuracy = np.sum((np.array(testLabel)==np.array(y_test)))*1.0/len(y_test) 45 | print ("the accuracy is %f"%accuracy) 46 | 47 | if __name__=='__main__': 48 | main() 49 | 50 | 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /gbdt_source/testGBDT.py: -------------------------------------------------------------------------------- 1 | __author__ = 'luchi.lc' 2 | 3 | """ 4 | date:29/6/2017 5 | usage: test how the number of trees affects the GBDT result 6 | """ 7 | from GBDTReg import GBDT 8 | class Config(object): 9 | learningRate=0.1 10 | maxTreeLength=4 11 | maxLeafCount=30 12 | maxTreeNum=50 13 | 14 | def test(): 15 | x=[[0.5,0.6,0.7],[0.4,0.5,0.5],[1.2,1.3,1.0],[1.4,1.5,0.8],[1.5,1.3,1.3]] 16 | y=[0,0,1,1,1] 17 | c=Config() 18 | gbdt=GBDT(config=c) 19 | gbdt.buildGbdt(x,y) 20 | data_features=gbdt.generateFeatures(x) 21 | print(len(data_features[0])) 22 | 23 | test() 24 | -------------------------------------------------------------------------------- /item_book.txt: -------------------------------------------------------------------------------- 1 | Liu Yi,3,1001 2 | Chen Er,4,1001 3 | Zhang San,3,1001 4 | Li Si,3,1001 5 | Liu Yi,3,1002 6 | Li Si,4,1002 7 | Liu Yi,4,1003 8 | Zhang San,5,1003 9 | Li Si,5,1003 10 | Liu Yi,4,1004 11 | Zhang San,3,1004 12 | Liu Yi,5,1005 13 | -------------------------------------------------------------------------------- /logstic/lf1000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/logstic/lf1000.gif -------------------------------------------------------------------------------- /logstic/logstic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 22 16:30:47 2019 4 | 5 | @author: luogantt 6 | """ 7 | 8 | ''' 9 | Created on Oct 27, 2010 10 | Logistic Regression Working Module 11 | @author: Peter 12 | ''' 13 | from numpy import * 14 | 15 | def loadDataSet(): 16 | dataMat = []; labelMat = [] 17 | fr = open('testSet.txt') 18 | for line in fr.readlines(): 19 | lineArr = line.strip().split() 20 | dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])]) 21 | labelMat.append(int(lineArr[2])) 22 | return dataMat,labelMat 23 | 24 | def sigmoid(inX): 25 | return 1.0/(1+exp(-inX)) 26 | 27 | def gradAscent(dataMatIn, classLabels): 28 | dataMatrix = mat(dataMatIn) #convert to NumPy matrix 29 | labelMat = mat(classLabels).transpose() #convert to NumPy matrix 30 | n,m = shape(dataMatrix) 31 | alpha = 0.001 32 | maxCycles = 5000 33 | weights = ones((m,1)) 34 | for k in range(maxCycles): #heavy on matrix operations 35 | h = sigmoid(dataMatrix*weights) #matrix mult 36
| error = (labelMat - h) #vector subtraction 37 | weights = weights + alpha * dataMatrix.transpose()* error #matrix mult 38 | return weights 39 | 40 | def plotBestFit(weights): 41 | import matplotlib.pyplot as plt 42 | dataMat,labelMat=loadDataSet() 43 | dataArr = array(dataMat) 44 | n = shape(dataArr)[0] 45 | xcord1 = []; ycord1 = [] 46 | xcord2 = []; ycord2 = [] 47 | for i in range(n): 48 | if int(labelMat[i])== 1: 49 | xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2]) 50 | else: 51 | xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2]) 52 | fig = plt.figure() 53 | ax = fig.add_subplot(111) 54 | ax.scatter(xcord1, ycord1, s=30, c='red', marker='s') 55 | ax.scatter(xcord2, ycord2, s=30, c='green') 56 | x = arange(-3.0, 3.0, 0.1) 57 | y = (-weights[0]-weights[1]*x)/weights[2] 58 | ax.plot(x, y) 59 | plt.xlabel('X1'); plt.ylabel('X2'); 60 | plt.show() 61 | 62 | #import logRegres 63 | 64 | dataArr,labelMat=loadDataSet() 65 | 66 | 67 | weights=gradAscent(dataArr,labelMat) 68 | plotBestFit(weights.getA()) 69 | -------------------------------------------------------------------------------- /logstic/testSet.txt: -------------------------------------------------------------------------------- 1 | -0.017612 14.053064 0 2 | -1.395634 4.662541 1 3 | -0.752157 6.538620 0 4 | -1.322371 7.152853 0 5 | 0.423363 11.054677 0 6 | 0.406704 7.067335 1 7 | 0.667394 12.741452 0 8 | -2.460150 6.866805 1 9 | 0.569411 9.548755 0 10 | -0.026632 10.427743 0 11 | 0.850433 6.920334 1 12 | 1.347183 13.175500 0 13 | 1.176813 3.167020 1 14 | -1.781871 9.097953 0 15 | -0.566606 5.749003 1 16 | 0.931635 1.589505 1 17 | -0.024205 6.151823 1 18 | -0.036453 2.690988 1 19 | -0.196949 0.444165 1 20 | 1.014459 5.754399 1 21 | 1.985298 3.230619 1 22 | -1.693453 -0.557540 1 23 | -0.576525 11.778922 0 24 | -0.346811 -1.678730 1 25 | -2.124484 2.672471 1 26 | 1.217916 9.597015 0 27 | -0.733928 9.098687 0 28 | -3.642001 -1.618087 1 29 | 0.315985 3.523953 1 30 | 1.416614 9.619232 0 31 | -0.386323 3.989286 1 32 | 0.556921 8.294984 1 33 | 1.224863 11.587360 0 34 | -1.347803 -2.406051 1 35 | 1.196604 4.951851 1 36 | 0.275221 9.543647 0 37 | 0.470575 9.332488 0 38 | -1.889567 9.542662 0 39 | -1.527893 12.150579 0 40 | -1.185247 11.309318 0 41 | -0.445678 3.297303 1 42 | 1.042222 6.105155 1 43 | -0.618787 10.320986 0 44 | 1.152083 0.548467 1 45 | 0.828534 2.676045 1 46 | -1.237728 10.549033 0 47 | -0.683565 -2.166125 1 48 | 0.229456 5.921938 1 49 | -0.959885 11.555336 0 50 | 0.492911 10.993324 0 51 | 0.184992 8.721488 0 52 | -0.355715 10.325976 0 53 | -0.397822 8.058397 0 54 | 0.824839 13.730343 0 55 | 1.507278 5.027866 1 56 | 0.099671 6.835839 1 57 | -0.344008 10.717485 0 58 | 1.785928 7.718645 1 59 | -0.918801 11.560217 0 60 | -0.364009 4.747300 1 61 | -0.841722 4.119083 1 62 | 0.490426 1.960539 1 63 | -0.007194 9.075792 0 64 | 0.356107 12.447863 0 65 | 0.342578 12.281162 0 66 | -0.810823 -1.466018 1 67 | 2.530777 6.476801 1 68 | 1.296683 11.607559 0 69 | 0.475487 12.040035 0 70 | -0.783277 11.009725 0 71 | 0.074798 11.023650 0 72 | -1.337472 0.468339 1 73 | -0.102781 13.763651 0 74 | -0.147324 2.874846 1 75 | 0.518389 9.887035 0 76 | 1.015399 7.571882 0 77 | -1.658086 -0.027255 1 78 | 1.319944 2.171228 1 79 | 2.056216 5.019981 1 80 | -0.851633 4.375691 1 81 | -1.510047 6.061992 0 82 | -1.076637 -3.181888 1 83 | 1.821096 10.283990 0 84 | 3.010150 8.401766 1 85 | -1.099458 1.688274 1 86 | -0.834872 -1.733869 1 87 | -0.846637 3.849075 1 88 | 1.400102 12.628781 0 89 | 1.752842 5.468166 1 90 | 0.078557 0.059736 1 91 | 0.089392 
-0.715300 1 92 | 1.825662 12.693808 0 93 | 0.197445 9.744638 0 94 | 0.126117 0.922311 1 95 | -0.679797 1.220530 1 96 | 0.677983 2.556666 1 97 | 0.761349 10.693862 0 98 | -2.168791 0.143632 1 99 | 1.388610 9.341997 0 100 | 0.317029 14.739025 0 101 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*-coding:utf-8-*- 3 | 4 | import math 5 | import pdb 6 | 7 | class ItemBasedCF: 8 | def __init__(self,train_file): 9 | self.train_file = train_file 10 | self.readData() 11 | 12 | def readData(self): 13 | # read the file and build the user-item rating table 14 | self.train = dict() 15 | # user-item rating table 16 | for line in open(self.train_file): 17 | user,score,item = line.strip().split(",") 18 | self.train.setdefault(user,{}) 19 | self.train[user][item] = int(float(score)) 20 | 21 | def ItemSimilarity(self): 22 | # build the item-item co-occurrence matrix 23 | cooccur = dict() # item-item co-occurrence counts 24 | buy = dict() # N: how many distinct users bought each item 25 | for user,items in self.train.items(): 26 | for i in items.keys(): 27 | buy.setdefault(i,0) 28 | buy[i] += 1 29 | cooccur.setdefault(i,{}) 30 | for j in items.keys(): 31 | if i == j : continue 32 | cooccur[i].setdefault(j,0) 33 | cooccur[i][j] += 1 34 | # compute the similarity matrix: cij / sqrt(N_i * N_j) 35 | self.similar = dict() 36 | for i,related_items in cooccur.items(): 37 | self.similar.setdefault(i,{}) 38 | for j,cij in related_items.items(): 39 | self.similar[i][j] = cij / (math.sqrt(buy[i] * buy[j])) 40 | return self.similar 41 | 42 | # recommend for a user: use the top-K most similar items, return the top-N recommendations 43 | def Recommend(self,user,K=3,N=10): 44 | rank = dict() 45 | action_item = self.train[user] 46 | # the items this user has interacted with, and their scores 47 | for item,score in action_item.items(): 48 | sortedItems = sorted(self.similar[item].items(),key=lambda x:x[1],reverse=True)[0:K] 49 | for j,wj in sortedItems: 50 | if j in action_item.keys(): 51 | continue 52 | rank.setdefault(j,0) 53 | rank[j] += score * wj 54 | return dict(sorted(rank.items(),key=lambda x:x[1],reverse=True)[0:N]) 55 | 56 | # instantiate an ItemBasedCF object 57 | item = ItemBasedCF("item_book.txt") 58 | item.ItemSimilarity() 59 | recommedDict = item.Recommend("Li Si") 60 | for k,v in recommedDict.items(): 61 | print(k,"\t",v) 62 | -------------------------------------------------------------------------------- /other/DeepFM-Keras-master/README.md: -------------------------------------------------------------------------------- 1 | # DeepFM-Keras 2 | 3 | DeepFM written in Keras [1], similar to the TensorFlow version by ChenglongChen "https://github.com/ChenglongChen/tensorflow-DeepFM" 4 | 5 | Usage: 6 | --- 7 | ### load data and divide into train and test 8 | dfTrain = pd.read_csv("data/train.csv") 9 | dfTrain = dfTrain.iloc[0:int(0.7*dfTrain.shape[0]),:] 10 | dfTest = dfTrain.iloc[int(0.7*dfTrain.shape[0]):,:] 11 | 12 | 13 | global_columns = dfTrain.columns.tolist() 14 | ### divide the columns by CATEGORICAL columns 15 | ID_columns = ["ps_reg_01", "ps_reg_02", "ps_reg_03", 16 | "ps_car_12", "ps_car_13", "ps_car_14", "ps_car_15",] 17 | 18 | qid_columns = ['id'] 19 | target_columns = ['target'] 20 | 21 | 22 | Example: 23 | --- 24 | Folder example includes an example usage of DeepFM models for Porto Seguro's Safe Driver Prediction competition on Kaggle. 25 | 26 | Please download the data from the competition website and put them into the example/data folder.
27 | 28 | To train the DeepFM model for this dataset, run 29 | 30 | $ python keras_FM.py 31 | 32 | Support: 33 | --- 34 | Supports the AUC and log_loss metrics 35 | 36 | 37 | 38 | 39 | [1] DeepFM: A Factorization-Machine based Neural Network for CTR Prediction, Huifeng Guo, Ruiming Tang, Yunming Ye, Zhenguo Li, Xiuqiang He. 40 | -------------------------------------------------------------------------------- /other/DeepFM-Keras-master/data/README.md: -------------------------------------------------------------------------------- 1 | 2 | Please download the data from the [competition website](https://www.kaggle.com/c/porto-seguro-safe-driver-prediction) and put them here. 3 | -------------------------------------------------------------------------------- /other/deepfm/data/README.md: -------------------------------------------------------------------------------- 1 | A simple hand-made toy dataset, so that while debugging you can inspect the value and shape of every variable. 2 | 3 | 4 | If you want to use real data, download it from kaggle: 5 | 6 | https://www.kaggle.com/c/porto-seguro-safe-driver-prediction -------------------------------------------------------------------------------- /other/deepfm/data/test.csv: -------------------------------------------------------------------------------- 1 | id,target,feat_cat_1,feat_cat_2,feat_num_1,feat_num_2 2 | 6,0,1,2,3.1,2.2 3 | 7,0,2,3,2.1,3.1 4 | 8,1,0,2,1.0,3.4 5 | 9,1,1,1,2.1,1.6 6 | 10,0,0,0,0.5,1.8 -------------------------------------------------------------------------------- /other/deepfm/data/train.csv: -------------------------------------------------------------------------------- 1 | id,target,feat_cat_1,feat_cat_2,feat_num_1,feat_num_2 2 | 1,0,1,2,3.1,2.2 3 | 2,0,2,3,2.1,3.1 4 | 3,1,0,2,1.0,3.4 5 | 4,1,1,1,2.1,1.6 6 | 5,0,0,0,0.5,1.8 -------------------------------------------------------------------------------- /other/deepfm/广告预估CTR系列--DeepFM模型架构图--实现篇.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/deepfm/广告预估CTR系列--DeepFM模型架构图--实现篇.jpg -------------------------------------------------------------------------------- /other/svd/README.md: -------------------------------------------------------------------------------- 1 | # Recsyspy 2 | Classic recommendation algorithms implementation 3 | 4 | ## Algorithm 5 | |DNN Model |RMSE|MAE 6 | | :-------- |:--------|:-------- | 7 | |NeuMF|0.9433|0.7485 8 | 9 | |MF Model | RMSE | MAE 10 | | :-------- | :-------- | :-------- | 11 | | Baseline | 0.946|0.742 12 | | SVD|0.931|0.731| 13 | | SVDPlusPlus|0.927|0.726 14 | | Explicit ALS |1.199|0.903 15 | | Implicit ALS |2.752|2.525 16 | 17 | |Neighborhood Model |RMSE|MAE 18 | | :-------- |:--------|:-------- | 19 | |Itemcf|1.029|0.802 20 | |WeightedSlopOne|1.043|0.835| 21 | 22 | ## Example 23 | ```python 24 | import os 25 | 26 | from util.databuilder import DataBuilder 27 | from algorithm.dnn.neumf import NeuMF 28 | 29 | file_name = os.path.abspath("data/ml-100k/u.data") 30 | data_builder = DataBuilder(file_name, just_test_one=True) 31 | 32 | 33 | data_builder.eval(NeuMF(epochs=2), k_folds=5) 34 | ``` 35 | 36 | 37 | ## Dataset 38 | * MovieLens 39 | 40 | ## Papers 41 | ### Dnn Algorithm 42 | * Neural Collaborative Filtering 43 | 44 | ### MF Algorithm 45 | * Yehuda Koren.
Factorization meets the neighborhood: a multifaceted collaborative filtering model 46 | * Matrix factorization techniques for recommender systems 47 | * Advances in Collaborative Filtering 48 | 49 | ### Neighborhood Algorithm 50 | * Slope one predictors for online rating-based collaborative filtering 51 | 52 | 53 | -------------------------------------------------------------------------------- /other/svd/algorithm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/__init__.py -------------------------------------------------------------------------------- /other/svd/algorithm/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/__pycache__/estimator.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/__pycache__/estimator.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/dnn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/dnn/__init__.py -------------------------------------------------------------------------------- /other/svd/algorithm/dnn/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/dnn/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/dnn/__pycache__/neumf.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/dnn/__pycache__/neumf.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/dnn/neumf.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """NeuMF model 3 | Paper: Neural Collaborative Filtering 4 | Apply dnn on MF 5 | """ 6 | 7 | from __future__ import division, print_function 8 | 9 | import numpy as np 10 | from algorithm.estimator import Estimator 11 | from keras.layers import Input, Embedding, Dense, Flatten,\ 12 | BatchNormalization, Dropout 13 | from keras.layers import multiply, concatenate 14 | from keras.models import Model 15 | 16 | class NeuMF(Estimator): 17 | """ 18 | mf_dim: Integer. 19 | MF dimension. 20 | mlp_dim: Integer. 
21 | MLP dimension 22 | epochs: Integer 23 | Number of epochs to train the model 24 | """ 25 | 26 | def __init__(self, mf_dim=12, mlp_dim=12, epochs=2): 27 | self.mf_dim = mf_dim 28 | self.mlp_dim = mlp_dim 29 | self.epochs = epochs 30 | 31 | def transform(self, dateset): 32 | X = {} 33 | u, i, r = dateset.all_ratings(axis=0) 34 | X['user_idx'] = u.reshape(-1, 1) 35 | X['item_idx'] = i.reshape(-1, 1) 36 | 37 | y = r.reshape(-1, 1) 38 | 39 | return X, y 40 | 41 | def get_neumf_model(self, user_num, item_num): 42 | user_input = Input(shape=[1], name="user_idx") 43 | item_input = Input(shape=[1], name="item_idx") 44 | 45 | mf_embedding_user = Embedding(user_num, self.mf_dim)(user_input) 46 | mf_embedding_item = Embedding(item_num, self.mf_dim)(item_input) 47 | 48 | gmf_layer = multiply([mf_embedding_user, mf_embedding_item]) 49 | 50 | mlp_embedding_user = Embedding(user_num, self.mlp_dim)(user_input) 51 | mlp_embedding_item = Embedding(item_num, self.mlp_dim)(item_input) 52 | 53 | mlp_layer = concatenate([mlp_embedding_user, mlp_embedding_item]) 54 | 55 | mlp_layer = BatchNormalization()(mlp_layer) 56 | mlp_layer = Dense(32)(mlp_layer) 57 | mlp_layer = Dense(16)(mlp_layer) 58 | mlp_layer = Dense(8)(mlp_layer) 59 | mlp_layer = Dense(4)(mlp_layer) 60 | mlp_layer = Dropout(0.5)(mlp_layer) 61 | 62 | neumf_layer = concatenate([gmf_layer, mlp_layer]) 63 | neumf_layer = Flatten()(neumf_layer) 64 | pred = Dense(1)(neumf_layer) 65 | 66 | model = Model(inputs=[user_input, item_input], outputs=pred) 67 | model.compile(optimizer='adam', loss='mse') 68 | 69 | return model 70 | 71 | 72 | def _train(self): 73 | user_num = self.train_dataset.matrix.shape[0] 74 | item_num = self.train_dataset.matrix.shape[1] 75 | X_train, y_train = self.transform(self.train_dataset) 76 | 77 | self.neumf_model = self.get_neumf_model(user_num, item_num) 78 | self.neumf_model.fit(X_train, y_train, epochs=self.epochs) 79 | 80 | def predict(self, u, i): 81 | #not batch but single pred 82 | X = {} 83 | X['user_idx'] = np.array([u]) 84 | X['item_idx'] = np.array([i]) 85 | 86 | return self.neumf_model.predict(X)[0, 0] -------------------------------------------------------------------------------- /other/svd/algorithm/estimator.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division, print_function 4 | 5 | import numpy as np 6 | import util.tools as tl 7 | import util.measure as ms 8 | 9 | 10 | class Estimator(object): 11 | """Basic Estimator 12 | """ 13 | 14 | def __init__(self): 15 | pass 16 | 17 | def train(self, train_dataset): 18 | self.train_dataset = train_dataset 19 | 20 | with tl.Timer() as t: 21 | self._train() 22 | 23 | print("{} algorithm train process cost {:.3f} sec". 24 | format(self.__class__.__name__, t.interval)) 25 | 26 | def _train(self): 27 | raise NotImplementedError() 28 | 29 | def predict(self, u, i): 30 | raise NotImplementedError() 31 | 32 | def estimate(self, raw_test_dataset, measures): 33 | with tl.Timer() as t: 34 | error = self._estimate(raw_test_dataset, measures) 35 | 36 | print("{} algorithm predict process cost {:.3f} sec". 
37 | format(self.__class__.__name__, t.interval)) 38 | return error 39 | 40 | def _estimate(self, raw_test_dataset, measures): 41 | users_mean = self.train_dataset.get_user_means() 42 | items_mean = self.train_dataset.get_item_means() 43 | 44 | all = len(raw_test_dataset) 45 | errors = [] 46 | cur = 0 47 | alg_count = 0 48 | 49 | for raw_u, raw_i, r, _ in raw_test_dataset: 50 | cur += 1 51 | has_raw_u = raw_u in self.train_dataset.uid_dict 52 | has_raw_i = raw_i in self.train_dataset.iid_dict 53 | 54 | if not has_raw_u and not has_raw_i: 55 | real, est = r, self.train_dataset.global_mean 56 | elif not has_raw_u: 57 | i = self.train_dataset.iid_dict[raw_i] 58 | real, est = r, items_mean[i] 59 | elif not has_raw_i: 60 | u = self.train_dataset.uid_dict[raw_u] 61 | real, est = r, users_mean[u] 62 | else: 63 | u = self.train_dataset.uid_dict[raw_u] 64 | i = self.train_dataset.iid_dict[raw_i] 65 | real, est = r, self.predict(u, i) 66 | alg_count += 1 67 | 68 | est = min(5, est) 69 | est = max(1, est) 70 | errors.append(real - est) 71 | 72 | self.progress(cur, all, 2000) 73 | 74 | fold_eval_result = [getattr(ms, measure)(errors) for measure in measures] 75 | return fold_eval_result 76 | 77 | @staticmethod 78 | def progress(cur, all, bin=50): 79 | if cur % bin == 0 or cur == all: 80 | progress = 100 * (cur / all) 81 | print("progress: {:.2f}%".format(progress)) 82 | 83 | 84 | class IterationEstimator(Estimator): 85 | """Iterator Estimator 86 | """ 87 | 88 | def _train(self): 89 | self._prepare() 90 | for current_epoch in range(self.n_epochs): 91 | print(" processing epoch {}".format(current_epoch)) 92 | self._iteration() 93 | print(" cur train rmse {}".format(self._eval())) 94 | 95 | def _prepare(self): 96 | """ 97 | do some prepare work 98 | """ 99 | 100 | raise NotImplementedError() 101 | 102 | def _iteration(self): 103 | """ 104 | core iteration 105 | """ 106 | 107 | raise NotImplementedError() 108 | 109 | def _pred(self): 110 | """ 111 | core pred process 112 | """ 113 | 114 | raise NotImplementedError() 115 | 116 | def _eval(self): 117 | """ 118 | eval on valid dateset 119 | """ 120 | 121 | pred_ratings = self._pred() 122 | real_ratings = self.train_dataset.matrix 123 | idx = real_ratings.nonzero() 124 | bias = np.asarray(pred_ratings[idx] - real_ratings[idx]) 125 | return np.sqrt(np.sum(bias ** 2) / real_ratings.count_nonzero()) -------------------------------------------------------------------------------- /other/svd/algorithm/mf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/mf/__init__.py -------------------------------------------------------------------------------- /other/svd/algorithm/mf/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/mf/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/mf/__pycache__/baseline.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/mf/__pycache__/baseline.cpython-37.pyc -------------------------------------------------------------------------------- 
/other/svd/algorithm/mf/__pycache__/explicit_als.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/mf/__pycache__/explicit_als.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/mf/__pycache__/implicit_als.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/mf/__pycache__/implicit_als.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/mf/__pycache__/svd.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/mf/__pycache__/svd.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/mf/__pycache__/svdpp.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/mf/__pycache__/svdpp.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/mf/baseline.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division, print_function 4 | 5 | import numpy as np 6 | from algorithm.estimator import IterationEstimator 7 | 8 | 9 | class Baseline(IterationEstimator): 10 | """ 11 | 虽然是baseline,不过整体表现比itemcf和slopOne还高, 12 | 也可以看出邻居模型的弊端了,缺少优化目标 13 | 14 | 属性 15 | --------- 16 | n_factors : 隐式因子数 17 | n_epochs : 迭代次数 18 | lr : 学习速率 19 | reg : 正则因子 20 | """ 21 | 22 | def __init__(self, n_factors=20, n_epochs=20, lr=0.007, reg=.002): 23 | self.n_factors = n_factors 24 | self.n_epochs = n_epochs 25 | self.lr = lr 26 | self.reg = reg 27 | 28 | def _prepare(self): 29 | self.user_num = self.train_dataset.matrix.shape[0] 30 | self.item_num = self.train_dataset.matrix.shape[1] 31 | 32 | self.global_mean = self.train_dataset.global_mean 33 | 34 | # user bias 35 | self.bu = np.zeros(self.user_num, np.double) 36 | 37 | # item bias 38 | self.bi = np.zeros(self.item_num, np.double) 39 | 40 | def _iteration(self): 41 | for u, i, r in self.train_dataset.all_ratings(): 42 | # 预测值 43 | rp = self.global_mean + self.bu[u] + self.bi[i] 44 | # 误差 45 | e_ui = r - rp 46 | 47 | self.bu[u] += self.lr * (e_ui - self.reg * self.bu[u]) 48 | self.bi[i] += self.lr * (e_ui - self.reg * self.bi[i]) 49 | 50 | def _pred(self): 51 | return self.global_mean + np.repeat(np.asmatrix(self.bu).T, self.item_num, axis=1) \ 52 | + np.repeat(np.asmatrix(self.bi), self.user_num, axis=0) 53 | 54 | def predict(self, u, i): 55 | est = self.global_mean + self.bu[u] + self.bi[i] 56 | return est -------------------------------------------------------------------------------- /other/svd/algorithm/mf/explicit_als.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division, print_function 4 | 5 | import numpy as np 6 | import scipy.sparse as sparse 7 | from algorithm.estimator import IterationEstimator 8 
| 9 | 10 | class ExplicitALS(IterationEstimator): 11 | """ 12 | 显式交替最小二乘,算法表现一般,从它的损失函数也可以看出,是最 13 | 简单的svd。只不过ALS相比SGD速度快一点, 一般10次迭代就能收敛 14 | 15 | 属性 16 | --------- 17 | n_factors : 隐式因子数 18 | n_epochs : 迭代次数 19 | reg : 正则因子 20 | """ 21 | 22 | def __init__(self, n_factors=20, n_epochs=10, reg=0.1): 23 | self.n_factors = n_factors 24 | self.n_epochs = n_epochs 25 | self.reg = reg 26 | 27 | #交替! 28 | def alternative(self, X, Y, is_user): 29 | reg_I = self.reg * sparse.eye(self.n_factors) 30 | uids = self.train_dataset.uids if is_user else self.train_dataset.iids 31 | 32 | for u in uids: 33 | if is_user: 34 | action_idx = self.train_dataset.get_user(u)[0] 35 | else: 36 | action_idx = self.train_dataset.get_item(u)[0] 37 | Y_u = Y[action_idx] 38 | 39 | if is_user: 40 | ru = self.train_dataset.matrix.A[u, action_idx] 41 | else: 42 | ru = self.train_dataset.matrix.A[action_idx, u].T 43 | 44 | X[u] = np.linalg.solve(np.dot(np.transpose(Y_u), Y_u) + reg_I, np.dot(Y_u.T, ru)) 45 | 46 | def _prepare(self): 47 | self.user_num = self.train_dataset.matrix.shape[0] 48 | self.item_num = self.train_dataset.matrix.shape[1] 49 | self.X = np.random.normal(size=(self.user_num, self.n_factors)) 50 | self.Y = np.random.normal(size=(self.item_num, self.n_factors)) 51 | 52 | def _iteration(self): 53 | self.alternative(self.X, self.Y, True) 54 | self.alternative(self.Y, self.X, False) 55 | 56 | def _pred(self): 57 | return np.dot(self.X, self.Y.T) 58 | 59 | def predict(self, u, i): 60 | est = np.dot(self.X[u,:], self.Y[i,:]) 61 | return est -------------------------------------------------------------------------------- /other/svd/algorithm/mf/implicit_als.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division, print_function 4 | 5 | import numpy as np 6 | import scipy.sparse as sparse 7 | from scipy.sparse.linalg import spsolve 8 | from algorithm.estimator import IterationEstimator 9 | 10 | 11 | class ImplicitALS(IterationEstimator): 12 | """ 13 | 隐式交替最小二乘,果然不适合显式数据,表现很离谱 14 | 15 | 属性 16 | --------- 17 | n_factors : 隐式因子数 18 | n_epochs : 迭代次数 19 | reg : 正则因子 20 | alpha : 隐式数据评分系数 21 | """ 22 | 23 | def __init__(self, n_factors=20, n_epochs=10, reg=0.1, alpha=40): 24 | self.n_factors = n_factors 25 | self.n_epochs = n_epochs 26 | self.reg = reg 27 | self.alpha = alpha 28 | 29 | def alternative(self, X, Y, is_user): 30 | reg_I = self.reg * sparse.eye(self.n_factors) 31 | YTY = Y.T.dot(Y) 32 | I = sparse.eye(Y.shape[0]) 33 | 34 | uids = self.train_dataset.uids if is_user else self.train_dataset.iids 35 | for u in uids: 36 | if is_user: 37 | ru = self.train_dataset.matrix.A[u] 38 | else: 39 | ru = self.train_dataset.matrix.A[:, u].T 40 | 41 | CuI = sparse.diags(ru * self.alpha, 0) 42 | Cu = CuI + I 43 | 44 | pu = ru.copy() 45 | pu[ru != 0] = 1.0 46 | 47 | YT_CuI_Y = Y.T.dot(CuI).dot(Y) 48 | YT_CuI_pu = Y.T.dot(Cu).dot(sparse.csr_matrix(pu).T) 49 | 50 | X[u] = spsolve(YTY + YT_CuI_Y + reg_I, YT_CuI_pu) 51 | 52 | def _prepare(self): 53 | self.user_num = self.train_dataset.matrix.shape[0] 54 | self.item_num = self.train_dataset.matrix.shape[1] 55 | self.X = sparse.csr_matrix(np.random.normal(size=(self.user_num, self.n_factors))) 56 | self.Y = sparse.csr_matrix(np.random.normal(size=(self.item_num, self.n_factors))) 57 | 58 | def _iteration(self): 59 | self.alternative(self.X, self.Y, True) 60 | self.alternative(self.Y, self.X, False) 61 | 62 | def _pred(self): 63 | return np.dot(self.X, self.Y.T) 64 | 65 | def 
predict(self, u, i): 66 | est = self.X[u].dot(self.Y[i].T)[0,0] 67 | return est -------------------------------------------------------------------------------- /other/svd/algorithm/mf/svd.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division, print_function 4 | 5 | import numpy as np 6 | from algorithm.estimator import IterationEstimator 7 | 8 | 9 | class SVD(IterationEstimator): 10 | """ 11 | 属性 12 | --------- 13 | n_factors : 隐式因子数 14 | n_epochs : 迭代次数 15 | lr : 学习速率 16 | reg : 正则因子 17 | """ 18 | 19 | def __init__(self, n_factors=20, n_epochs=20, lr=0.007, reg=.002): 20 | self.n_factors = n_factors 21 | self.n_epochs = n_epochs 22 | self.lr = lr 23 | self.reg = reg 24 | 25 | def _prepare(self): 26 | self.train_dataset = self.train_dataset 27 | self.user_num = self.train_dataset.matrix.shape[0] 28 | self.item_num = self.train_dataset.matrix.shape[1] 29 | 30 | self.global_mean = self.train_dataset.global_mean 31 | # user bias 32 | self.bu = np.zeros(self.user_num, np.double) 33 | 34 | # item bias 35 | self.bi = np.zeros(self.item_num, np.double) 36 | 37 | # user factor 38 | self.p = np.zeros((self.user_num, self.n_factors), np.double) + .1 39 | 40 | # item factor 41 | self.q = np.zeros((self.item_num, self.n_factors), np.double) + .1 42 | 43 | def _iteration(self): 44 | for u, i, r in self.train_dataset.all_ratings(): 45 | # 预测值 46 | rp = self.global_mean + self.bu[u] + self.bi[i] + np.dot(self.q[i], self.p[u]) 47 | # 误差 48 | e_ui = r - rp 49 | 50 | self.bu[u] += self.lr * (e_ui - self.reg * self.bu[u]) 51 | self.bi[i] += self.lr * (e_ui - self.reg * self.bi[i]) 52 | self.p[u] += self.lr * (e_ui * self.q[i] - self.reg * self.p[u]) 53 | self.q[i] += self.lr * (e_ui * self.p[u] - self.reg * self.q[i]) 54 | 55 | def _pred(self): 56 | return self.global_mean + np.repeat(np.asmatrix(self.bu).T, self.item_num, axis=1) \ 57 | + np.repeat(np.asmatrix(self.bi), self.user_num, axis=0) \ 58 | + np.dot(self.p, self.q.T) 59 | 60 | def predict(self, u, i): 61 | est = self.global_mean + self.bu[u] + self.bi[i] + np.dot(self.q[i], self.p[u]) 62 | return est 63 | 64 | -------------------------------------------------------------------------------- /other/svd/algorithm/mf/svdpp.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division, print_function 4 | 5 | import numpy as np 6 | from algorithm.estimator import Estimator 7 | 8 | 9 | class SVDPlusPlus(Estimator): 10 | """ 11 | 属性 12 | --------- 13 | n_factors : 隐式因子数 14 | n_epochs : 迭代次数 15 | lr : 学习速率 16 | reg : 正则因子 17 | """ 18 | 19 | def __init__(self, n_factors=20, n_epochs=20, lr=0.007, reg=.002): 20 | self.n_factors = n_factors 21 | self.n_epochs = n_epochs 22 | self.lr = lr 23 | self.reg = reg 24 | 25 | def train(self, train_dataset): 26 | user_num = train_dataset.matrix.shape[0] 27 | item_num = train_dataset.matrix.shape[1] 28 | self.train_dataset = train_dataset 29 | 30 | #global mean 31 | self.global_mean = train_dataset.global_mean 32 | 33 | #user bias 34 | self.bu = np.zeros(user_num, np.double) 35 | 36 | #item bias 37 | self.bi = np.zeros(item_num, np.double) 38 | 39 | #user factor 40 | self.p = np.zeros((user_num, self.n_factors), np.double) + .1 41 | 42 | #item factor 43 | self.q = np.zeros((item_num, self.n_factors), np.double) + .1 44 | 45 | #item preference facotor 46 | self.y = np.zeros((item_num, self.n_factors), np.double) + .1 47 | 48 | for 
current_epoch in range(self.n_epochs): 49 | print(" processing epoch {}".format(current_epoch)) 50 | for u, i, r in train_dataset.all_ratings(): 51 | #用户u点评的item集 52 | Nu = train_dataset.get_user(u)[0] 53 | I_Nu = len(Nu) 54 | sqrt_N_u = np.sqrt(I_Nu) 55 | 56 | #基于用户u点评的item集推测u的implicit偏好 57 | y_u = np.sum(self.y[Nu], axis=0) 58 | 59 | u_impl_prf = y_u / sqrt_N_u 60 | 61 | #预测值 62 | rp = self.global_mean + self.bu[u] + self.bi[i] + np.dot(self.q[i], self.p[u] + u_impl_prf) 63 | 64 | #误差 65 | e_ui = r - rp 66 | 67 | #sgd 68 | self.bu[u] += self.lr * (e_ui - self.reg * self.bu[u]) 69 | self.bi[i] += self.lr * (e_ui - self.reg * self.bi[i]) 70 | self.p[u] += self.lr * (e_ui * self.q[i] - self.reg * self.p[u]) 71 | self.q[i] += self.lr * (e_ui * (self.p[u] + u_impl_prf) - self.reg * self.q[i]) 72 | for j in Nu: 73 | self.y[j] += self.lr * (e_ui * self.q[j] / sqrt_N_u - self.reg * self.y[j]) 74 | 75 | def predict(self, u, i): 76 | Nu = self.train_dataset.get_user(u)[0] 77 | I_Nu = len(Nu) 78 | sqrt_N_u = np.sqrt(I_Nu) 79 | y_u = np.sum(self.y[Nu], axis=0) / sqrt_N_u 80 | 81 | est = self.global_mean + self.bu[u] + self.bi[i] + np.dot(self.q[i], self.p[u] + y_u) 82 | return est 83 | 84 | 85 | -------------------------------------------------------------------------------- /other/svd/algorithm/neighborhood/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__init__.py -------------------------------------------------------------------------------- /other/svd/algorithm/neighborhood/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__init__.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/neighborhood/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/neighborhood/__pycache__/itemcf.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__pycache__/itemcf.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/neighborhood/__pycache__/slop_one.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__pycache__/slop_one.cpython-37.pyc -------------------------------------------------------------------------------- /other/svd/algorithm/neighborhood/itemcf.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division, print_function 4 | 5 | import numpy as np 6 | from scipy.sparse import lil_matrix 7 | from algorithm.estimator import Estimator 8 | from algorithm.mf.baseline import 
/other/svd/algorithm/neighborhood/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__init__.py
--------------------------------------------------------------------------------
/other/svd/algorithm/neighborhood/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__init__.pyc
--------------------------------------------------------------------------------
/other/svd/algorithm/neighborhood/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/other/svd/algorithm/neighborhood/__pycache__/itemcf.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__pycache__/itemcf.cpython-37.pyc
--------------------------------------------------------------------------------
/other/svd/algorithm/neighborhood/__pycache__/slop_one.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/__pycache__/slop_one.cpython-37.pyc
--------------------------------------------------------------------------------
/other/svd/algorithm/neighborhood/itemcf.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

from __future__ import division, print_function

import numpy as np
from scipy.sparse import lil_matrix
from algorithm.estimator import Estimator
from algorithm.mf.baseline import Baseline


class Itemcf(Estimator):
    """
    Attributes
    ---------
    min : minimum number of co-ratings for an item pair to count
    topk : top-k cutoff for the similarity matrix
    use_baseline : whether to embed a Baseline model for the bias term
    """

    def __init__(self, min=2, topk=20, use_baseline=True):
        self.min = min
        self.topk = topk
        self.use_baseline = use_baseline

    def compute_cosine_similarity(self, user_num, item_num, users_ratings):
        sim = lil_matrix((item_num, item_num), dtype=np.double)

        # dot products
        dot = lil_matrix((item_num, item_num), dtype=np.double)

        # sum of squares of the left vector
        sql = lil_matrix((item_num, item_num), dtype=np.double)

        # sum of squares of the right vector
        sqr = lil_matrix((item_num, item_num), dtype=np.double)

        # co-occurrence counts
        coo = lil_matrix((item_num, item_num), dtype=np.double)

        cur = 1
        for u, (ii, rr) in users_ratings:
            cur = cur + 1
            # note: only adjacent pairs in each user's (id-sorted) item list are
            # accumulated here, not every co-rated pair
            for k in range(len(ii) - 1):
                k1, k2 = k, k + 1
                i1, i2 = ii[k1], ii[k2]
                if i1 > i2:
                    i1, i2 = i2, i1
                    k1, k2 = k2, k1
                dot[i1, i2] += rr[k1] * rr[k2]
                sql[i1, i2] += rr[k1] ** 2
                sqr[i1, i2] += rr[k2] ** 2
                coo[i1, i2] += 1
            self.progress(cur, user_num, 50)

        # lil_matrix is not suited to matrix arithmetic; convert to csc format
        dot = dot.tocsc()
        sql = sql.tocsc()
        sqr = sqr.tocsc()
        coo = coo.tocsc()

        # zero out every pair whose interaction count is below the limit
        dot.data[coo.data < self.min] = 0

        # product of the left and right sums of squares
        sql.data *= sqr.data

        # only nonzero dot products need to be considered
        row, col = dot.nonzero()

        # cosine similarity matrix
        sim[row, col] = dot[row, col] / np.sqrt(sql[row, col])
        sim[col, row] = sim[row, col]

        return sim.A

    def _train(self):
        if self.use_baseline:
            self.baseline = Baseline()
            self.baseline.train(self.train_dataset)

        user_num = self.train_dataset.matrix.shape[0]
        item_num = self.train_dataset.matrix.shape[1]
        self.sim = self.compute_cosine_similarity(user_num, item_num, self.train_dataset.get_users())
        self.item_means = self.train_dataset.get_item_means()
        self.user_means = self.train_dataset.get_user_means()

    def predict(self, u, i):
        ll, rr = self.train_dataset.get_user(u)
        neighbors = [(sim_i, self.sim[i, sim_i], sim_r) for sim_i, sim_r in zip(ll, rr)]

        neighbors = sorted(neighbors, key=lambda tple: tple[1], reverse=True)[0:self.topk]
        est = self.baseline.predict(u, i) if self.use_baseline else self.item_means[i]
        sum = 0
        divisor = 0

        for sim_i, sim, sim_r in neighbors:
            if not self.use_baseline:
                bias = sim_r - self.item_means[sim_i]
            else:
                bias = sim_r - self.baseline.predict(u, sim_i)

            sum += sim * bias
            divisor += sim

        if divisor != 0:
            est += sum / divisor
        return est
--------------------------------------------------------------------------------
/other/svd/algorithm/neighborhood/itemcf.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/itemcf.pyc
--------------------------------------------------------------------------------
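For reference, the same cosine-over-co-rated-users quantity can be written densely in a few lines. The sketch below uses a made-up 3x3 rating matrix and, unlike the sparse code above, sums over every item pair rather than only adjacent ones:

import numpy as np

# toy user-item rating matrix, 0 = unrated (illustrative only)
R = np.array([[5., 3., 0.],
              [4., 0., 1.],
              [1., 2., 5.]])
mask = R > 0

n_items = R.shape[1]
sim = np.zeros((n_items, n_items))
for i in range(n_items):
    for j in range(i + 1, n_items):
        both = mask[:, i] & mask[:, j]     # users who rated both items
        if both.sum() < 2:                 # same "min interactions" cutoff
            continue
        num = R[both, i].dot(R[both, j])
        den = np.sqrt((R[both, i] ** 2).sum() * (R[both, j] ** 2).sum())
        sim[i, j] = sim[j, i] = num / den
print(sim)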
/other/svd/algorithm/neighborhood/slop_one.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

from __future__ import division, print_function

import numpy as np
from scipy.sparse import lil_matrix
from algorithm.estimator import Estimator


class SlopOne(Estimator):
    """
    Attributes
    ---------
    is_weighted : plain Slope One or weighted Slope One
    """

    def __init__(self, is_weighted=False):
        self.is_weighted = is_weighted

    def _train(self):
        item_num = self.train_dataset.matrix.shape[1]

        # per-pair co-rating counts and accumulated rating deviations
        # (int32 counter: an int8 would overflow past 127 co-ratings)
        self.freq = lil_matrix((item_num, item_num), dtype=np.int32)
        self.dev = lil_matrix((item_num, item_num), dtype=np.double)
        user_num = self.train_dataset.matrix.shape[0]
        cur = 0
        for u, (ii, rr) in self.train_dataset.get_users():
            cur += 1
            # as in Itemcf, only adjacent pairs of each user's item list are counted
            for k in range(len(ii) - 1):
                k1, k2 = k, k + 1
                i1, i2 = ii[k1], ii[k2]
                if i1 > i2:
                    i1, i2 = i2, i1
                    k1, k2 = k2, k1
                self.freq[i1, i2] += 1
                self.dev[i1, i2] += rr[k1] - rr[k2]
            self.progress(cur, user_num, 50)

        # average deviation, mirrored so that dev[j, i] = -dev[i, j]
        nonzero_indices = self.freq.nonzero()
        self.dev[nonzero_indices] /= self.freq[nonzero_indices]

        self.dev[(nonzero_indices[1], nonzero_indices[0])] = -self.dev[nonzero_indices]
        self.freq[(nonzero_indices[1], nonzero_indices[0])] = self.freq[nonzero_indices]

        self.dev = self.dev.A
        self.freq = self.freq.A
        self.user_means = self.train_dataset.get_user_means()
        self.ratings = self.train_dataset.matrix.A

    def predict(self, u, i):
        N = [j for j in self.train_dataset.get_user(u)[0] if self.freq[i, j] > 0]
        est = self.user_means[u]

        if N:
            if self.is_weighted:
                est = sum([(self.ratings[u, j] + self.dev[i, j]) * self.freq[i, j] for j in N]) / \
                      sum([self.freq[i, j] for j in N])
            else:
                est += np.mean([self.dev[i, j] for j in N])
        return est
--------------------------------------------------------------------------------
/other/svd/algorithm/neighborhood/slop_one.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/algorithm/neighborhood/slop_one.pyc
--------------------------------------------------------------------------------
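Slope One is simple enough to check by hand. A worked toy example of the deviation and the prediction it produces (the matrix below is made up; NaN marks an unrated cell):

import numpy as np

# toy ratings, NaN = unrated (illustrative)
R = np.array([[5., 3., 2.],
              [3., 4., np.nan],
              [np.nan, 2., 5.]])

# average deviation of item 0 over item 1, from users who rated both
both = ~np.isnan(R[:, 0]) & ~np.isnan(R[:, 1])
dev_01 = np.mean(R[both, 0] - R[both, 1])   # ((5-3) + (3-4)) / 2 = 0.5

# Slope One prediction of user 2's rating for item 0, via item 1
print(R[2, 1] + dev_01)                     # 2 + 0.5 = 2.5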
/other/svd/main.py:
--------------------------------------------------------------------------------
import os

from algorithm.mf.baseline import Baseline
from util.databuilder import DataBuilder

from algorithm.dnn.neumf import NeuMF

from algorithm.mf.explicit_als import ExplicitALS
from algorithm.mf.svd import SVD
from algorithm.mf.svdpp import SVDPlusPlus
from algorithm.mf.implicit_als import ImplicitALS

from algorithm.neighborhood.slop_one import SlopOne
from algorithm.neighborhood.itemcf import Itemcf

file_name = os.path.abspath("data/ml-100k/u.data")
data_builder = DataBuilder(file_name, just_test_one=True)


data_builder.eval(NeuMF(epochs=2))

data_builder.eval(Itemcf())

data_builder.eval(SlopOne())

data_builder.eval(Baseline())

data_builder.eval(SVD())

data_builder.eval(SVDPlusPlus())

data_builder.eval(ExplicitALS())

data_builder.eval(ImplicitALS())
--------------------------------------------------------------------------------
/other/svd/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/tests/__init__.py
--------------------------------------------------------------------------------
/other/svd/tests/algorithm_test.py:
--------------------------------------------------------------------------------
import os

from algorithm.mf.baseline import Baseline
from util.databuilder import DataBuilder

from algorithm.dnn.neumf import NeuMF

from algorithm.mf.explicit_als import ExplicitALS
from algorithm.mf.svd import SVD
from algorithm.mf.svdpp import SVDPlusPlus
from algorithm.mf.implicit_als import ImplicitALS

from algorithm.neighborhood.slop_one import SlopOne
from algorithm.neighborhood.itemcf import Itemcf

file_name = os.path.abspath("data/ml-100k/u.data")
data_builder = DataBuilder(file_name, just_test_one=True)


def test_neumf():
    data_builder.eval(NeuMF(epochs=2))


def test_itemcf():
    data_builder.eval(Itemcf())


def test_slopOne():
    data_builder.eval(SlopOne())


def test_baseline():
    data_builder.eval(Baseline())


def test_svd():
    data_builder.eval(SVD())


def test_svdpp():
    data_builder.eval(SVDPlusPlus())


def test_explicit_als():
    data_builder.eval(ExplicitALS())


def test_implicit_als():
    data_builder.eval(ImplicitALS())
--------------------------------------------------------------------------------
/other/svd/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/util/__init__.py
--------------------------------------------------------------------------------
/other/svd/util/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/util/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/other/svd/util/__pycache__/databuilder.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/util/__pycache__/databuilder.cpython-37.pyc
--------------------------------------------------------------------------------
/other/svd/util/__pycache__/matrix.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/util/__pycache__/matrix.cpython-37.pyc
--------------------------------------------------------------------------------
/other/svd/util/__pycache__/measure.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/util/__pycache__/measure.cpython-37.pyc
--------------------------------------------------------------------------------
/other/svd/util/__pycache__/tools.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/other/svd/util/__pycache__/tools.cpython-37.pyc
--------------------------------------------------------------------------------
/other/svd/util/databuilder.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

import itertools
import os

import numpy as np
from scipy.sparse import csr_matrix
from util.matrix import Matrix
import util.tools as tl


class DataBuilder(object):
    """
    Builds the data model.

    Parameters
    ----------
    file_name : path to the data file; the GroupLens dataset is used here
    shuffle : whether to shuffle the data
    just_test_one : k-fold cross validation runs k times; run only one fold,
                    which is convenient for checking program correctness
    """

    def __init__(self, file_name, shuffle=True, just_test_one=False):
        self.file_name = file_name
        self.shuffle = shuffle
        self.just_test_one = just_test_one

    def read_ratings(self):
        """
        Read the raw ratings.
        """

        with open(os.path.expanduser(self.file_name)) as f:
            raw_ratings = [self.parse_line(line) for line in itertools.islice(f, 0, None)]
        return raw_ratings

    def parse_line(self, line):
        line = line.split("\t")
        uid, iid, r, timestamp = (line[i].strip() for i in range(4))
        return uid, iid, float(r), timestamp

    def cv(self, k_folds):
        raw_ratings = self.read_ratings()

        if self.shuffle:
            np.random.shuffle(raw_ratings)

        stop = 0
        raw_len = len(raw_ratings)
        offset = raw_len // k_folds
        left = raw_len % k_folds
        for fold_i in range(k_folds):
            print("current fold {}".format(fold_i + 1))
            start = stop
            stop += offset
            if fold_i < left:
                stop += 1

            # yield from a generator for efficiency
            yield self.mapping(raw_ratings[:start] + raw_ratings[stop:]), raw_ratings[start:stop]

    def mapping(self, raw_train_ratings):
        uid_dict = {}
        iid_dict = {}
        current_u_index = 0
        current_i_index = 0

        row = []
        col = []
        data = []
        for urid, irid, r, timestamp in raw_train_ratings:
            try:
                uid = uid_dict[urid]
            except KeyError:
                uid = current_u_index
                uid_dict[urid] = current_u_index
                current_u_index += 1
            try:
                iid = iid_dict[irid]
            except KeyError:
                iid = current_i_index
                iid_dict[irid] = current_i_index
                current_i_index += 1

            row.append(uid)
            col.append(iid)
            data.append(r)

        sparse_matrix = csr_matrix((data, (row, col)))

        return Matrix(sparse_matrix, uid_dict, iid_dict)

    def eval(self, algorithm, measures=["rmse", "mae"], k_folds=5):
        eval_results = []

        for train_dataset, test_dataset in self.cv(k_folds):
            algorithm.train(train_dataset)
            eval_results.append(algorithm.estimate(test_dataset, measures))
            if self.just_test_one:
                break

        tl.print_pretty(measures, eval_results)
--------------------------------------------------------------------------------
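The `mapping` step above re-indexes raw string ids into a dense 0-based index space before building the sparse matrix. The same idea in isolation, on made-up triples (the ids and ratings are illustrative):

from scipy.sparse import csr_matrix

raw = [("u9", "i7", 4.0), ("u3", "i7", 5.0), ("u9", "i2", 1.0)]  # toy (user, item, rating)
uid, iid = {}, {}
# setdefault assigns the next free index the first time an id is seen
row = [uid.setdefault(u, len(uid)) for u, _, _ in raw]
col = [iid.setdefault(i, len(iid)) for _, i, _ in raw]
m = csr_matrix(([r for _, _, r in raw], (row, col)))
print(m.toarray())  # 2x2 matrix in dense index space, not raw id space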
/other/svd/util/dnn_util.py:
--------------------------------------------------------------------------------
from keras.callbacks import Callback
from sklearn.metrics import mean_squared_error
import numpy as np


class RMSEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(RMSEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0, batch_size=2000)
            score = np.sqrt(mean_squared_error(self.y_val, y_pred))
            print("\n RMSE - epoch: %d - score: %.6f \n" % (epoch + 1, score))
--------------------------------------------------------------------------------
/other/svd/util/matrix.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

from __future__ import division, print_function

import numpy as np
import itertools


class Matrix(object):

    def __init__(self, sparse_matrix, uid_dict=None, iid_dict=None):
        self.matrix = sparse_matrix.tocsc()
        self._global_mean = None
        coo_matrix = sparse_matrix.tocoo()
        self.uids = set(coo_matrix.row)
        self.iids = set(coo_matrix.col)
        self.uid_dict = uid_dict
        self.iid_dict = iid_dict

    def get_item(self, i):
        """
        (i, (us, rs))
        """

        ratings = self.matrix.getcol(i).tocoo()
        return ratings.row, ratings.data

    def get_user(self, u):
        """
        (u, (is, rs))
        """

        ratings = self.matrix.getrow(u).tocoo()
        return ratings.col, ratings.data

    def get_users(self):
        """
        iterator(u, (is, rs))
        """

        for u in self.get_uids():
            yield u, self.get_user(u)

    def get_user_means(self):
        """
        dict of each user's mean rating
        """

        users_mean = {}
        for u in self.get_uids():
            users_mean[u] = np.mean(self.get_user(u)[1])
        return users_mean

    def get_item_means(self):
        """
        dict of each item's mean rating
        """

        item_means = {}
        for i in self.get_iids():
            item_means[i] = np.mean(self.get_item(i)[1])
        return item_means

    def all_ratings(self, axis=1):
        """
        row(u, i, r)
        or
        col(u, i, r)
        """
        coo_matrix = self.matrix.tocoo()

        if axis == 1:
            return zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
        else:
            return coo_matrix.row, coo_matrix.col, coo_matrix.data

    def get_uids(self):
        """
        set of all user ids
        """

        return np.unique(self.matrix.tocoo().row)

    def get_iids(self):
        """
        set of all item ids
        """
        return np.unique(self.matrix.tocoo().col)

    def has_user(self, u):
        """
        whether user u exists
        """

        return u in self.uids

    def has_item(self, i):
        """
        whether item i exists
        """

        return i in self.iids

    @property
    def global_mean(self):
        """
        global mean rating
        """

        if self._global_mean is None:
            self._global_mean = np.sum(self.matrix.data) / self.matrix.size
        return self._global_mean
--------------------------------------------------------------------------------
/other/svd/util/measure.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

from __future__ import division, print_function

import numpy as np


def rmse(errors):
    return np.sqrt(np.mean(np.power(errors, 2)))


def mae(errors):
    return np.mean(np.abs(errors))
--------------------------------------------------------------------------------
/other/svd/util/tools.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

from __future__ import division, print_function

import time
import numpy as np


def print_pretty(measures, eval_results):
    """
    Pretty-print the evaluation results.
    """

    pad = '{:<9}' * (len(measures) + 1)

    print(pad.format('', *measures))

    keep = lambda eval_result: ['{:.4f}'.format(single_eval) \
                                for single_eval in eval_result]
    for i, eval_result in enumerate(eval_results):
        print(pad.format('fold {}'.format(i), *keep(eval_result)))
    print(pad.format('avg', *keep(np.mean(eval_results, axis=0))))


class Timer(object):
    """
    time util
    """
    def __enter__(self):
        # time.clock() was removed in Python 3.8; perf_counter is its replacement
        self.start = time.perf_counter()
        return self

    def __exit__(self, *args):
        self.end = time.perf_counter()
        self.interval = self.end - self.start
--------------------------------------------------------------------------------
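A quick numeric check of the two measures in measure.py above, on a toy error vector (values are illustrative):

import numpy as np

errors = np.array([1.0, -2.0, 0.5])            # toy prediction errors
print(np.sqrt(np.mean(np.power(errors, 2))))   # rmse: sqrt(5.25/3) = 1.3229...
print(np.mean(np.abs(errors)))                 # mae: 3.5/3 = 1.1667...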
/svd/svd.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/svd/svd.gif
--------------------------------------------------------------------------------
/svd/untitled1.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8
__author__ = 'Scarlett'
# Matrix factorization is a mature, widely used technique in rating prediction systems
# from pylab import *
import matplotlib.pyplot as plt
from math import pow
import numpy


def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    Q = Q.T  # .T is the matrix transpose
    result = []
    for step in range(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - numpy.dot(P[i, :], Q[:, j])  # numpy.dot is the inner product
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        e = 0
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - numpy.dot(P[i, :], Q[:, j]), 2)
                    for k in range(K):
                        e = e + (beta / 2) * (pow(P[i][k], 2) + pow(Q[k][j], 2))
        result.append(e)
        if e < 0.001:
            break
    return P, Q.T, result

if __name__ == '__main__':
    # R = [
    #     [5, 3, 0, 1],
    #     [4, 0, 0, 1],
    #     [1, 1, 0, 5],
    #     [1, 0, 0, 4],
    #     [0, 1, 5, 4]
    # ]

    R = [[4., 3., 0., 5., 0.],
         [5., 0., 4., 4., 0.],
         [4., 0., 5., 0., 3.],
         [2., 3., 0., 1., 0.],
         [0., 4., 2., 0., 5.]]

    R = numpy.array(R)

    N = len(R)
    M = len(R[0])
    K = 2

    P = numpy.random.rand(N, K)  # random N x K matrix
    Q = numpy.random.rand(M, K)  # random M x K matrix

    nP, nQ, result = matrix_factorization(R, P, Q, K)
    print("The original rating matrix R:\n", R)
    R_MF = numpy.dot(nP, nQ.T)
    print("The rating matrix R_MF with the zero entries filled in by MF:\n", R_MF)

    # ------------- convergence curve of the loss function ---------------

    n = len(result)
    x = range(n)
    plt.plot(x, result, color='r', linewidth=3)
    plt.title("Convergence curve")
    plt.xlabel("generation")
    plt.ylabel("loss")
    plt.show()
--------------------------------------------------------------------------------
/svd/满秩.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/svd/满秩.gif
--------------------------------------------------------------------------------
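untitled1.py above minimizes the masked squared error with a per-entry triple loop, which is easy to follow but slow. The same objective can be handled with vectorized full-gradient steps; a sketch on the same R, where the step size, iteration count, and seed are illustrative choices rather than values from the script:

import numpy as np

R = np.array([[4., 3., 0., 5., 0.],
              [5., 0., 4., 4., 0.],
              [4., 0., 5., 0., 3.],
              [2., 3., 0., 1., 0.],
              [0., 4., 2., 0., 5.]])
mask = R > 0
K, alpha, beta = 2, 0.0002, 0.02

rng = np.random.default_rng(0)
P = rng.random((R.shape[0], K))
Q = rng.random((R.shape[1], K))

for step in range(5000):
    E = mask * (R - P @ Q.T)               # errors on observed entries only
    P += alpha * (2 * E @ Q - beta * P)    # full-gradient update for P
    Q += alpha * (2 * E.T @ P - beta * Q)  # full-gradient update for Q

print(np.round(P @ Q.T, 2))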
/wide-and-deep-learning-keras/README.md:
--------------------------------------------------------------------------------
# Wide and Deep Learning implemented with Keras

            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
                    Version 2, December 2004

 Copyright (C) 2004 Sam Hocevar

 Everyone is permitted to copy and distribute verbatim or modified
 copies of this license document, and changing it is allowed as long
 as the name is changed.

            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

  0. You just DO WHAT THE FUCK YOU WANT TO.

### Requirements
* Python >= 3.6
* TensorFlow >= 1.6
* Keras >= 2.0.0

### Model Plot
![Model Plot](https://github.com/kaitolucifer/wide-and-deep-learning-keras/blob/master/model.png)
The model is based on [Heng-Tze Cheng, *et al.* Wide & Deep Learning for Recommender Systems (2016)](https://arxiv.org/abs/1606.07792)
I used the [UCI Machine Learning Repository: Adult Data Set](https://archive.ics.uci.edu/ml/datasets/adult) as example data.
There are 8 categorical features, so I put each of them into its own embedding layer.
The remaining 5 continuous features go through a single dense layer, whose output is concatenated with all of the embedding layers.
A few more dense layers follow, and just before the sigmoid layer the deep output is concatenated with the input of the logistic (wide) part.
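In Keras terms, that description amounts to concatenating the deep tower's last hidden layer with the raw crossed-feature input right before the sigmoid. A minimal sketch of just that wiring; the vocabulary size, layer widths, and the 37-dim crossed vector are made-up placeholders, not values from this repo:

```python
from keras.layers import Input, Embedding, Flatten, Dense, concatenate
from keras.models import Model

cat_inputs = [Input(shape=(1,), dtype='int32') for _ in range(8)]
embeds = [Flatten()(Embedding(100, 4)(x)) for x in cat_inputs]  # toy vocab=100
conti = Input(shape=(5,))                                       # continuous features
deep = Dense(128, activation='relu')(concatenate([Dense(32)(conti)] + embeds))
wide = Input(shape=(37,))                                       # crossed categorical features
out = Dense(1, activation='sigmoid')(concatenate([deep, wide]))
model = Model(inputs=[conti] + cat_inputs + [wide], outputs=out)
model.compile(optimizer='adam', loss='binary_crossentropy')
```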

### Best Test Set Accuracy
85.8%
--------------------------------------------------------------------------------
/wide-and-deep-learning-keras/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/wide-and-deep-learning-keras/model.png
--------------------------------------------------------------------------------
/wide-and-deep-learning-keras/wide_and_deep.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/wide-and-deep-learning-keras/wide_and_deep.h5
--------------------------------------------------------------------------------
/wide-and-deep-learning-keras/wide_and_deep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luogantt/recommend_sys/10cb88732a7049197220c48d4fecd11c98e779ba/wide-and-deep-learning-keras/wide_and_deep.png
--------------------------------------------------------------------------------
/wide-and-deep-learning-keras/wide_and_deep.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from keras.layers import Input, Embedding, Dense, Flatten, Dropout, SpatialDropout1D, Activation, concatenate
from keras.optimizers import Adam, SGD
from keras.layers.advanced_activations import ReLU, PReLU, LeakyReLU, ELU
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Model
from tensorflow.keras.utils import plot_model


COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num", "marital_status",
    "occupation", "relationship", "race", "gender", "capital_gain", "capital_loss",
    "hours_per_week", "native_country", "income_bracket"
]

LABEL_COLUMN = "label"

CATEGORICAL_COLUMNS = [
    "workclass", "education", "marital_status", "occupation", "relationship",
    "race", "gender", "native_country"
]

CONTINUOUS_COLUMNS = [
    "age", "education_num", "capital_gain", "capital_loss", "hours_per_week"
]


def preprocessing():
    train_data = pd.read_csv('./adult.data', names=COLUMNS)
    train_data = train_data.dropna(how='any', axis=0)  # dropna returns a copy; assign it back
    test_data = pd.read_csv('./adult.test', skiprows=1, names=COLUMNS)
    test_data = test_data.dropna(how='any', axis=0)
    all_data = pd.concat([train_data, test_data])
    # binarize the label
    all_data[LABEL_COLUMN] = all_data['income_bracket'].apply(lambda x: ">50K" in x).astype(int)
    all_data.pop('income_bracket')
    y = all_data[LABEL_COLUMN].values
    all_data.pop(LABEL_COLUMN)
    for c in CATEGORICAL_COLUMNS:
        le = LabelEncoder()
        all_data[c] = le.fit_transform(all_data[c])
    train_size = len(train_data)
    x_train = all_data.iloc[:train_size]
    y_train = y[:train_size]
    x_test = all_data.iloc[train_size:]
    y_test = y[train_size:]
    x_train_categ = np.array(x_train[CATEGORICAL_COLUMNS])  # categorical data
    x_test_categ = np.array(x_test[CATEGORICAL_COLUMNS])
    x_train_conti = np.array(x_train[CONTINUOUS_COLUMNS], dtype='float64')  # continuous data
    x_test_conti = np.array(x_test[CONTINUOUS_COLUMNS], dtype='float64')
    scaler = StandardScaler()
    x_train_conti = scaler.fit_transform(x_train_conti)  # standardize with the training set's mean and std
    x_test_conti = scaler.transform(x_test_conti)
    return [x_train, y_train, x_test, y_test, x_train_categ, x_test_categ, x_train_conti, x_test_conti, all_data]


class Wide_and_Deep:
    def __init__(self, mode='wide and deep'):
        self.mode = mode
        x_train, y_train, x_test, y_test, x_train_categ, x_test_categ, x_train_conti, x_test_conti, all_data \
            = preprocessing()
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.x_train_categ = x_train_categ  # categorical data of the training set
        self.x_test_categ = x_test_categ  # categorical data of the test set
        self.x_train_conti = x_train_conti  # continuous data of the training set
        self.x_test_conti = x_test_conti  # continuous data of the test set
        self.all_data = all_data
        self.poly = PolynomialFeatures(degree=2, interaction_only=True)
        # cross-product transformation of the categorical data
        self.x_train_categ_poly = self.poly.fit_transform(x_train_categ)
        self.x_test_categ_poly = self.poly.transform(x_test_categ)
        self.categ_inputs = None
        self.conti_input = None
        self.deep_component_outlayer = None
        self.logistic_input = None
        self.model = None

    def deep_component(self):
        categ_inputs = []
        categ_embeds = []
        # one Input layer and one Embedding layer per categorical feature
        for i in range(len(CATEGORICAL_COLUMNS)):
            input_i = Input(shape=(1,), dtype='int32')
            dim = len(np.unique(self.all_data[CATEGORICAL_COLUMNS[i]]))
            embed_dim = int(np.ceil(dim ** 0.25))  # embedding size = 4th root of the category count
            embed_i = Embedding(dim, embed_dim, input_length=1)(input_i)
            flatten_i = Flatten()(embed_i)
            categ_inputs.append(input_i)
            categ_embeds.append(flatten_i)
        # the continuous features enter together through one dense layer
        conti_input = Input(shape=(len(CONTINUOUS_COLUMNS),))
        conti_dense = Dense(256, use_bias=False)(conti_input)
        # concatenate the dense output with every embedding output
        concat_embeds = concatenate([conti_dense] + categ_embeds)
        concat_embeds = Activation('relu')(concat_embeds)
        bn_concat = BatchNormalization()(concat_embeds)
        # stack three more fully connected layers
        fc1 = Dense(512, use_bias=False)(bn_concat)
        ac1 = ReLU()(fc1)
        bn1 = BatchNormalization()(ac1)
        fc2 = Dense(256, use_bias=False)(bn1)
        ac2 = ReLU()(fc2)
        bn2 = BatchNormalization()(ac2)
        fc3 = Dense(128)(bn2)
        ac3 = ReLU()(fc3)

        # keep the input layers and the last layer as members (used to build the model)
        self.categ_inputs = categ_inputs
        self.conti_input = conti_input
        self.deep_component_outlayer = ac3

    def wide_component(self):
        # only the categorical data goes into the linear model
        dim = self.x_train_categ_poly.shape[1]
        self.logistic_input = Input(shape=(dim,))

    def create_model(self):
        self.deep_component()
        self.wide_component()
        if self.mode == 'wide and deep':
            out_layer = concatenate([self.deep_component_outlayer, self.logistic_input])
            inputs = [self.conti_input] + self.categ_inputs + [self.logistic_input]
        elif self.mode == 'deep':
            out_layer = self.deep_component_outlayer
            inputs = [self.conti_input] + self.categ_inputs
        else:
            print('wrong mode')
            return

        output = Dense(1, activation='sigmoid')(out_layer)
        self.model = Model(inputs=inputs, outputs=output)

    def train_model(self, epochs=15, optimizer='adam', batch_size=128):
        if not self.model:
            print('You have to create model first')
            return

        if self.mode == 'wide and deep':
            input_data = [self.x_train_conti] +\
                         [self.x_train_categ[:, i] for i in range(self.x_train_categ.shape[1])] +\
                         [self.x_train_categ_poly]
        elif self.mode == 'deep':
            input_data = [self.x_train_conti] +\
                         [self.x_train_categ[:, i] for i in range(self.x_train_categ.shape[1])]
        else:
            print('wrong mode')
            return

        self.model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
        self.model.fit(input_data, self.y_train, epochs=epochs, batch_size=batch_size)

    def evaluate_model(self):
        if not self.model:
            print('You have to create model first')
            return

        if self.mode == 'wide and deep':
            input_data = [self.x_test_conti] +\
                         [self.x_test_categ[:, i] for i in range(self.x_test_categ.shape[1])] +\
                         [self.x_test_categ_poly]
        elif self.mode == 'deep':
            input_data = [self.x_test_conti] +\
                         [self.x_test_categ[:, i] for i in range(self.x_test_categ.shape[1])]
        else:
            print('wrong mode')
            return

        loss, acc = self.model.evaluate(input_data, self.y_test)
        print(f'test_loss: {loss} - test_acc: {acc}')

    def save_model(self, filename='wide_and_deep.h5'):
        self.model.save(filename)


if __name__ == '__main__':
    wide_deep_net = Wide_and_Deep()
    wide_deep_net.create_model()
    wide_deep_net.train_model()
    wide_deep_net.evaluate_model()
    wide_deep_net.save_model()
    plot_model(wide_deep_net.model, to_file='model.png', show_shapes=True, show_layer_names=False)
--------------------------------------------------------------------------------
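For comparison with the 85.8% wide-and-deep figure in the README, the script also supports a deep-only ablation through its mode switch; a short usage sketch, assuming the Wide_and_Deep class defined above:

# deep-only baseline, reusing the class above (illustrative usage)
deep_net = Wide_and_Deep(mode='deep')
deep_net.create_model()
deep_net.train_model(epochs=15)
deep_net.evaluate_model()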