├── .idea
│   └── vcs.xml
├── AFM
│   └── AFM.py
├── CollaborativeFiltering.ipynb
├── DCN
│   ├── DCN-keras.ipynb
│   ├── DCN-tf2.0.ipynb
│   ├── DCN-tf2.0.py
│   └── DCN.ipynb
├── GBDT_LR.ipynb
├── MLR.ipynb
├── NFM
│   └── NFM.py
├── PNN
│   ├── PNN-tf2.0.ipynb
│   └── PNN.py
├── README.md
├── Wide-Deep
│   ├── Wide-Deep.ipynb
│   ├── Wide-Deep.py
│   └── data_process.py
├── data
│   ├── Criteo
│   │   ├── data_process.py
│   │   └── train.txt
│   └── Driver
│       └── train.csv
├── embedding.ipynb
└── util
    └── train_model.py
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 |   </component>
6 | </project>
--------------------------------------------------------------------------------
/AFM/AFM.py:
--------------------------------------------------------------------------------
1 |
2 | """
3 | TensorFlow 2.0 implementation of AFM
4 | Reference:
5 | https://www.jianshu.com/p/83d3b2a1e55d
6 | Attentional Factorization Machines:
7 | Learning the Weight of Feature Interactions via Attention Networks
8 | """
9 | import tensorflow as tf
10 |
11 | import pickle
12 | from util.train_model import train_test_model_demo
13 |
14 |
15 | class AttentionNet(tf.keras.layers.Layer):
16 | def __init__(self, embedding_size=10,attention_size=3, **kwargs):
17 | self.embedding_size = embedding_size
18 | self.attention_size = attention_size
19 | super(AttentionNet, self).__init__(**kwargs)
20 |
21 | def build(self, input_shape):
22 | input_dim = input_shape[2]
23 |
24 | self.linearlayer = tf.keras.layers.Dense(input_dim, activation='relu', use_bias=True) # note: defined here but never used in call()
25 | self.attention_w = self.add_weight(shape=(self.embedding_size,self.attention_size),
26 | initializer='random_normal',trainable=True)
27 | self.attention_b = self.add_weight(shape=(self.attention_size,),
28 | initializer='random_normal',trainable=True)
29 | self.attention_h = self.add_weight(shape=(self.attention_size,),
30 | initializer='random_normal',trainable=True)
31 | self.attention_p = self.add_weight(shape=(self.embedding_size,1),
32 | initializer='ones',trainable=True)
33 |
34 | def call(self, input):
35 | # element_wise
36 | num_feat = input.shape[1]
37 | element_wise_product_list = []
38 | for i in range(num_feat):
39 | for j in range(i+1,num_feat):
40 | element_wise_product_list.append(tf.multiply(input[:,i,:],input[:,j,:])) # None * embedding_size
41 | self.element_wise_product = tf.stack(element_wise_product_list) # (F*(F-1)/2) * None * embedding_size
42 | self.element_wise_product = tf.transpose(self.element_wise_product,perm=[1,0,2],name='element_wise_product') # None * (F*(F-1)/2) * embedding_size
43 | print("element_wise_product",self.element_wise_product.get_shape())
44 | # attention part
45 | num_interaction = int(num_feat*(num_feat-1)/2)
46 | # wx+b->relu(wx+b)->h*relu(wx+b)
47 | self.attention_wx_plus_b = tf.reshape(tf.add(tf.matmul(tf.reshape(self.element_wise_product,shape=(-1,self.embedding_size)),
48 | self.attention_w),self.attention_b),shape = [-1,num_interaction,self.attention_size]) # N * (F*(F-1)/2) * attention_size
49 | self.attention_exp = tf.exp(tf.reduce_sum(tf.multiply(tf.nn.relu(self.attention_wx_plus_b),
50 | self.attention_h),axis=2)) # N * (F*(F-1)/2)
51 |
52 | self.attention_exp_sum = tf.reshape(tf.reduce_sum(self.attention_exp,axis=1),shape=(-1,1)) # N * 1
53 |
54 | self.attention_out = tf.divide(self.attention_exp,self.attention_exp_sum,name='attention_out') # N * (F*(F-1)/2)
55 | self.attention_x_product = tf.reduce_sum(tf.einsum('bn,bnm->bnm',self.attention_out,self.element_wise_product),axis=1,name='afm') # N * embedding_size
56 | self.attention_part_sum = tf.matmul(self.attention_x_product,self.attention_p) # N * 1
57 |
58 | return self.attention_part_sum
59 |
60 | class AFM(tf.keras.Model):
61 | def __init__(self, num_feat,embedding_size=10,attention_size=3):
62 | super().__init__()
63 | self.num_feat = num_feat # F: total number of distinct feature values (vocabulary size)
64 | self.embedding_size = embedding_size
65 | self.attention_size = attention_size
66 | # Embedding layer: the table has shape F * M, where F is the number of distinct feature values and M is the embedding dimension
67 | feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size,
68 | embeddings_initializer='uniform') # N * embedding_size
69 | self.feat_embeddings = feat_embeddings
70 | self.attentionlayer = AttentionNet(self.embedding_size,self.attention_size)
71 | # linear part
72 | self.linearlayer = tf.keras.layers.Dense(1, activation='relu', use_bias=True)
73 |
74 | def call(self, feat_index, feat_value):
75 | # call receives the input tensors
76 | # embedding part: feat_index is the input; feat_embeddings is an Embedding layer
77 | feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * F * embedding_size
78 | feat_embedding = tf.einsum('bnm,bn->bnm', feat_embedding_0, feat_value) # Batch * F * embedding_size
79 | feat_embedding_1 = tf.transpose(feat_embedding,perm=[0,2,1])
80 | y_deep = self.attentionlayer(feat_embedding)
81 |
82 | y_linear = tf.reduce_sum(self.linearlayer(feat_embedding_1),axis=1)
83 | output = y_deep + y_linear
84 | return output
85 | if __name__ == '__main__':
86 | AID_DATA_DIR = "../data/Criteo/"
87 | feat_dict_ = pickle.load(open(AID_DATA_DIR + '/feat_dict_10.pkl2', 'rb'))
88 |
89 | afm = AFM(num_feat=len(feat_dict_) + 1,embedding_size=10,attention_size=3)
90 |
91 | train_label_path = AID_DATA_DIR + 'train_label'
92 | train_idx_path = AID_DATA_DIR + 'train_idx'
93 | train_value_path = AID_DATA_DIR + 'train_value'
94 |
95 | test_label_path = AID_DATA_DIR + 'test_label'
96 | test_idx_path = AID_DATA_DIR + 'test_idx'
97 | test_value_path = AID_DATA_DIR + 'test_value'
98 |
99 | train_test_model_demo(afm,train_label_path, train_idx_path, train_value_path)
100 |
--------------------------------------------------------------------------------
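The AttentionNet layer above scores each pairwise interaction of field embeddings with a small network (relu(Wx + b) projected onto h, then softmax-normalized) and pools the interactions with those weights. A minimal usage sketch on random embeddings, assuming TensorFlow 2.x is installed and that the class can be imported from AFM/AFM.py (the import path below is an assumption):

import tensorflow as tf
from AFM.AFM import AttentionNet  # assumed import path for the layer defined above

# 4 samples, 5 feature fields, embedding size 10 -> 5*4/2 = 10 pairwise interactions
dummy_embeddings = tf.random.normal((4, 5, 10))
attention = AttentionNet(embedding_size=10, attention_size=3)
pooled = attention(dummy_embeddings)   # attention-weighted pairwise interactions projected to a scalar
print(pooled.shape)                    # expected: (4, 1)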
/CollaborativeFiltering.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 2,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "movies = pd.read_csv(r\"F:\\baidudownload\\ml-20m\\ml-20m\\movies.csv\")"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 4,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "ratings = pd.read_csv(r'F:\\baidudownload\\ml-20m\\ml-20m\\ratings.csv')"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 5,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/html": [
38 | "
\n",
39 | "\n",
52 | "
\n",
53 | " \n",
54 | " \n",
55 | " | \n",
56 | " movieId | \n",
57 | " title | \n",
58 | " genres | \n",
59 | "
\n",
60 | " \n",
61 | " \n",
62 | " \n",
63 | " 0 | \n",
64 | " 1 | \n",
65 | " Toy Story (1995) | \n",
66 | " Adventure|Animation|Children|Comedy|Fantasy | \n",
67 | "
\n",
68 | " \n",
69 | " 1 | \n",
70 | " 2 | \n",
71 | " Jumanji (1995) | \n",
72 | " Adventure|Children|Fantasy | \n",
73 | "
\n",
74 | " \n",
75 | " 2 | \n",
76 | " 3 | \n",
77 | " Grumpier Old Men (1995) | \n",
78 | " Comedy|Romance | \n",
79 | "
\n",
80 | " \n",
81 | " 3 | \n",
82 | " 4 | \n",
83 | " Waiting to Exhale (1995) | \n",
84 | " Comedy|Drama|Romance | \n",
85 | "
\n",
86 | " \n",
87 | " 4 | \n",
88 | " 5 | \n",
89 | " Father of the Bride Part II (1995) | \n",
90 | " Comedy | \n",
91 | "
\n",
92 | " \n",
93 | "
\n",
94 | "
"
95 | ],
96 | "text/plain": [
97 | " movieId title \\\n",
98 | "0 1 Toy Story (1995) \n",
99 | "1 2 Jumanji (1995) \n",
100 | "2 3 Grumpier Old Men (1995) \n",
101 | "3 4 Waiting to Exhale (1995) \n",
102 | "4 5 Father of the Bride Part II (1995) \n",
103 | "\n",
104 | " genres \n",
105 | "0 Adventure|Animation|Children|Comedy|Fantasy \n",
106 | "1 Adventure|Children|Fantasy \n",
107 | "2 Comedy|Romance \n",
108 | "3 Comedy|Drama|Romance \n",
109 | "4 Comedy "
110 | ]
111 | },
112 | "execution_count": 5,
113 | "metadata": {},
114 | "output_type": "execute_result"
115 | }
116 | ],
117 | "source": [
118 | "movies.head()"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 6,
124 | "metadata": {},
125 | "outputs": [
126 | {
127 | "data": {
128 | "text/html": [
129 | "\n",
130 | "\n",
143 | "
\n",
144 | " \n",
145 | " \n",
146 | " | \n",
147 | " userId | \n",
148 | " movieId | \n",
149 | " rating | \n",
150 | " timestamp | \n",
151 | "
\n",
152 | " \n",
153 | " \n",
154 | " \n",
155 | " 0 | \n",
156 | " 1 | \n",
157 | " 2 | \n",
158 | " 3.5 | \n",
159 | " 1112486027 | \n",
160 | "
\n",
161 | " \n",
162 | " 1 | \n",
163 | " 1 | \n",
164 | " 29 | \n",
165 | " 3.5 | \n",
166 | " 1112484676 | \n",
167 | "
\n",
168 | " \n",
169 | " 2 | \n",
170 | " 1 | \n",
171 | " 32 | \n",
172 | " 3.5 | \n",
173 | " 1112484819 | \n",
174 | "
\n",
175 | " \n",
176 | " 3 | \n",
177 | " 1 | \n",
178 | " 47 | \n",
179 | " 3.5 | \n",
180 | " 1112484727 | \n",
181 | "
\n",
182 | " \n",
183 | " 4 | \n",
184 | " 1 | \n",
185 | " 50 | \n",
186 | " 3.5 | \n",
187 | " 1112484580 | \n",
188 | "
\n",
189 | " \n",
190 | "
\n",
191 | "
"
192 | ],
193 | "text/plain": [
194 | " userId movieId rating timestamp\n",
195 | "0 1 2 3.5 1112486027\n",
196 | "1 1 29 3.5 1112484676\n",
197 | "2 1 32 3.5 1112484819\n",
198 | "3 1 47 3.5 1112484727\n",
199 | "4 1 50 3.5 1112484580"
200 | ]
201 | },
202 | "execution_count": 6,
203 | "metadata": {},
204 | "output_type": "execute_result"
205 | }
206 | ],
207 | "source": [
208 | "ratings.head()"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 10,
214 | "metadata": {},
215 | "outputs": [],
216 | "source": [
217 | "data = pd.merge(movies,ratings,on='movieId',how='left')"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 12,
223 | "metadata": {},
224 | "outputs": [],
225 | "source": [
226 | "data[['userId','rating','movieId','title']].sort_values('userId').to_csv(r\"F:\\baidudownload\\ml-20m\\ml-20m\\data.csv\",index=False)"
227 | ]
228 | },
229 | {
230 | "cell_type": "markdown",
231 | "metadata": {},
232 | "source": [
233 | "### 采用python字典来表示每位用户评论的电影和评分"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 25,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "files = open(r\"F:\\baidudownload\\ml-20m\\ml-20m\\data.csv\",'r',encoding=\"UTF-8\")\n",
243 | "# 读取data文件中每行中除了名字的数据\n",
244 | "data = {} ## 存放每个用户评论的电影和评分\n",
245 | "for line in files.readlines():\n",
246 | " line = line.strip().split(',')\n",
247 | " # 如果字典中没有某位用户,则使用用户ID来创建这位用户\n",
248 | " if not line[0] in data.keys():\n",
249 | " data[line[0]] = {line[3]:line[1]} # 子字典\n",
250 | " else:\n",
251 | " data[line[0]][line[3]] = line[1]"
252 | ]
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "metadata": {},
257 | "source": [
258 | "## 计算任何两位用户之间的相似度,由于每位用户评论的电影不完全一样,所以兽先要找到两位用户共同评论过的电影然后计算两者之间的欧式距离,最后算出两者之间的相似度"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": 16,
264 | "metadata": {},
265 | "outputs": [],
266 | "source": [
267 | "from math import *"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 38,
273 | "metadata": {},
274 | "outputs": [],
275 | "source": [
276 | "def Euclidean(user1,user2):\n",
277 | " # 取出两位用户评论过的电影和评分\n",
278 | " user1_data = data[user1]\n",
279 | " user2_data = data[user2]\n",
280 | " \n",
281 | " # 找到两位用户都评论过的电影,并计算两者的欧式距离\n",
282 | " for key in user1_data.keys():\n",
283 | " if key in user2_data.keys():\n",
284 | "# print(user1_data[key],user2_data[key])\n",
285 | " try:\n",
286 | " distance +=pow((float(user1_data[key])-float(user2_data[key])),2)\n",
287 | " except:\n",
288 | " print(\"error:\",user2_data[key])\n",
289 | " return 1/(1+sqrt(distance)) # 计算返回值越小,相似度越大"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": 39,
295 | "metadata": {},
296 | "outputs": [
297 | {
298 | "name": "stdout",
299 | "output_type": "stream",
300 | "text": [
301 | "error: \n",
302 | "[('17602.0', 0.037535053785096986), ('67346.0', 0.03923924660549805), ('116900.0', 0.03938151824124737), ('130390.0', 0.042373278587501804)]\n"
303 | ]
304 | }
305 | ],
306 | "source": [
307 | "# 计算某个用户与其他用的相似度\n",
308 | "def top10_simliar(userID):\n",
309 | " res = []\n",
310 | " for userid in data.keys():\n",
311 | " # 排除自己计算相似度\n",
312 | " if not userid == userID:\n",
313 | " simliar = Euclidean(userID,userid)\n",
314 | " res.append((userid,simliar))\n",
315 | " res.sort(key=lambda val:val[1])# 按照相似度最大顺序排序\n",
316 | " return res[:4]\n",
317 | "RES = top10_simliar('1.0')\n",
318 | "print(RES)"
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "metadata": {},
324 | "source": [
325 | "## 根据相似度来推荐用户"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 45,
331 | "metadata": {},
332 | "outputs": [
333 | {
334 | "name": "stdout",
335 | "output_type": "stream",
336 | "text": [
337 | "error: \n",
338 | "[('Good Will Hunting (1997)', '5.0'), ('Horton Hears a Who! (2008)', '5.0'), ('Billy Madison (1995)', '5.0'), ('Julie & Julia (2009)', '5.0'), ('Chocolat (2000)', '5.0'), ('Harry Potter and the Order of the Phoenix (2007)', '5.0'), ('\"Sisterhood of the Traveling Pants', '5.0'), ('\"Secret Life of Bees', '5.0'), ('Happy Gilmore (1996)', '5.0'), ('Big Daddy (1999)', '5.0')]\n"
339 | ]
340 | }
341 | ],
342 | "source": [
343 | "def recommend(user):\n",
344 | " # 相似度最高用户\n",
345 | " top_sim_user = top10_simliar(user)[0][0]\n",
346 | " # 相似度最高用户的观影记录\n",
347 | " items = data[top_sim_user]\n",
348 | " recommendations = []\n",
349 | " # 筛选出该用户未观看的电影病添加到列表中\n",
350 | " for item in items.keys():\n",
351 | " if item not in data[user].keys():\n",
352 | " recommendations.append((item,items[item]))\n",
353 | " recommendations.sort(key=lambda val :val[1],reverse=True) # 按照评分排序\n",
354 | " # 返回评分最高的10部电影\n",
355 | " return recommendations[:10]\n",
356 | "Recommend = recommend('1.0')\n",
357 | "print(Recommend)"
358 | ]
359 | },
360 | {
361 | "cell_type": "markdown",
362 | "metadata": {},
363 | "source": [
364 | "#### \n",
365 | "但有时我们会碰到因为两个用户之间数据由于数据膨胀,一方数据大,一方数据小,但是两者称明显的线性关系\n",
366 | "\n",
367 | "我们引入Pearson相关系数来衡量两个变量之间的线性相关性。\n",
368 | "\n",
369 | "Pearson:-1~1 -1:完全负相关 1:完全正相关 0:不相关 \n",
370 | "\n",
371 | "相关系数 0.8-1.0 极强相关\n",
372 | "\n",
373 | "0.6-0.8 强相关\n",
374 | "\n",
375 | "0.4-0.6 中等程度相关\n",
376 | "\n",
377 | "0.2-0.4 弱相关\n",
378 | "\n",
379 | "0.0-0.2 极弱相关或无相关\n",
380 | "\n",
381 | "公式:"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": 61,
387 | "metadata": {},
388 | "outputs": [
389 | {
390 | "name": "stdout",
391 | "output_type": "stream",
392 | "text": [
393 | "0.22531203182281434\n"
394 | ]
395 | }
396 | ],
397 | "source": [
398 | "########################################################################\n",
399 | "##计算两用户之间的Pearson相关系数\n",
400 | "def pearson_sim(user1,user2):\n",
401 | " # 取出两位用户评论过的电影和评分\n",
402 | " user1_data = data[user1]\n",
403 | " user2_data = data[user2]\n",
404 | " distance = 0\n",
405 | " common = {}\n",
406 | " \n",
407 | " # 找到两位用户都评论过的电影\n",
408 | " for key in user1_data.keys():\n",
409 | " if key in user2_data.keys():\n",
410 | " common[key] = 1\n",
411 | " if len(common) == 0:\n",
412 | " return 0#如果没有共同评论过的电影,则返回0\n",
413 | " n = len(common)#共同电影数目\n",
414 | "# print(n,common)\n",
415 | " \n",
416 | " ##计算评分和\n",
417 | " try:\n",
418 | " sum1 = sum([float(user1_data[movie]) for movie in common])\n",
419 | " sum2 = sum([float(user2_data[movie]) for movie in common])\n",
420 | "\n",
421 | " ##计算评分平方和\n",
422 | " sum1Sq = sum([pow(float(user1_data[movie]),2) for movie in common])\n",
423 | " sum2Sq = sum([pow(float(user2_data[movie]),2) for movie in common])\n",
424 | "\n",
425 | " ##计算乘积和\n",
426 | " PSum = sum([float(user1_data[it])*float(user2_data[it]) for it in common])\n",
427 | " \n",
428 | " ##计算相关系数\n",
429 | " num = PSum - (sum1*sum2/n)\n",
430 | " den = sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))\n",
431 | " except:\n",
432 | " den = 999\n",
433 | " num = 0\n",
434 | " print('error:') \n",
435 | " if den == 0:\n",
436 | " return 0\n",
437 | " r = num/den\n",
438 | " return r\n",
439 | " \n",
440 | "R = pearson_sim('1.0','3.0')\n",
441 | "print(R)"
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": 63,
447 | "metadata": {},
448 | "outputs": [],
449 | "source": [
450 | "# 计算某个用户与其他用的相似度\n",
451 | "def top10_simliar(userID):\n",
452 | " res = []\n",
453 | " for userid in data.keys():\n",
454 | " # 排除自己计算相似度\n",
455 | " if not userid == userID:\n",
456 | " simliar = pearson_sim(userID,userid)\n",
457 | " res.append((userid,simliar))\n",
458 | " res.sort(key=lambda val:val[1])# 按照相似度最大顺序排序\n",
459 | " return res[-4:]"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 64,
465 | "metadata": {},
466 | "outputs": [
467 | {
468 | "name": "stdout",
469 | "output_type": "stream",
470 | "text": [
471 | "error:\n",
472 | "[('79721.0', 1.000000000000017), ('60581.0', 1.0000000000000187), ('83906.0', 1.0000000000000213), ('103682.0', 1.0000000000000255)]\n"
473 | ]
474 | }
475 | ],
476 | "source": [
477 | "RES = top10_simliar('1.0')\n",
478 | "print(RES)"
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": 65,
484 | "metadata": {},
485 | "outputs": [
486 | {
487 | "name": "stdout",
488 | "output_type": "stream",
489 | "text": [
490 | "error:\n",
491 | "[('\"Italian Job', '5.0'), ('\"Clockwork Orange', '5.0'), ('RocknRolla (2008)', '5.0'), ('No Country for Old Men (2007)', '5.0'), ('21 Grams (2003)', '5.0'), ('Layer Cake (2004)', '5.0'), ('Seven Pounds (2008)', '5.0'), ('Trainspotting (1996)', '5.0'), (\"Carlito's Way (1993)\", '5.0'), ('Crash (2004)', '5.0')]\n"
492 | ]
493 | }
494 | ],
495 | "source": [
496 | "def recommend(user):\n",
497 | " # 相似度最高用户\n",
498 | " top_sim_user = top10_simliar(user)[0][0]\n",
499 | " # 相似度最高用户的观影记录\n",
500 | " items = data[top_sim_user]\n",
501 | " recommendations = []\n",
502 | " # 筛选出该用户未观看的电影病添加到列表中\n",
503 | " for item in items.keys():\n",
504 | " if item not in data[user].keys():\n",
505 | " recommendations.append((item,items[item]))\n",
506 | " recommendations.sort(key=lambda val :val[1],reverse=True) # 按照评分排序\n",
507 | " # 返回评分最高的10部电影\n",
508 | " return recommendations[:10]\n",
509 | "Recommend = recommend('1.0')\n",
510 | "print(Recommend)"
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": null,
516 | "metadata": {},
517 | "outputs": [],
518 | "source": []
519 | }
520 | ],
521 | "metadata": {
522 | "kernelspec": {
523 | "display_name": "Python 3",
524 | "language": "python",
525 | "name": "python3"
526 | },
527 | "language_info": {
528 | "codemirror_mode": {
529 | "name": "ipython",
530 | "version": 3
531 | },
532 | "file_extension": ".py",
533 | "mimetype": "text/x-python",
534 | "name": "python",
535 | "nbconvert_exporter": "python",
536 | "pygments_lexer": "ipython3",
537 | "version": "3.6.5"
538 | }
539 | },
540 | "nbformat": 4,
541 | "nbformat_minor": 1
542 | }
543 |
--------------------------------------------------------------------------------
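As a quick, self-contained check of the Pearson similarity used in the notebook above, the sketch below repeats pearson_sim's arithmetic on three co-rated movies with made-up ratings (no MovieLens files needed):

from math import sqrt

u1 = [5.0, 3.0, 4.0]   # hypothetical ratings of user 1 on the common movies
u2 = [4.0, 2.0, 5.0]   # hypothetical ratings of user 2 on the same movies
n = len(u1)

sum1, sum2 = sum(u1), sum(u2)
sum1_sq, sum2_sq = sum(x * x for x in u1), sum(x * x for x in u2)
p_sum = sum(x * y for x, y in zip(u1, u2))

num = p_sum - sum1 * sum2 / n
den = sqrt((sum1_sq - sum1 ** 2 / n) * (sum2_sq - sum2 ** 2 / n))
print(num / den)   # Pearson correlation of the two rating vectors, about 0.65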
/DCN/DCN-tf2.0.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "D:\\anaconda3\\julianxu\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
13 | " from ._conv import register_converters as _register_converters\n"
14 | ]
15 | }
16 | ],
17 | "source": [
18 | "import numpy as np\n",
19 | "import pandas as pd\n",
20 | "import tensorflow as tf"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "from collections import Counter"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "class CrossLayer(tf.keras.layers.Layer):\n",
39 | " def __init__(self,output_dim,num_layer,**kwargs):\n",
40 | " self.output_dim = output_dim\n",
41 | " self.num_layer = num_layer\n",
42 | " super(CrossLayer,self).__init__(**kwargs)\n",
43 | " \n",
44 | " def build(self,input_shape):\n",
45 | " self.input_dim = input_shape[1]\n",
46 | " # print(self.input_dim)\n",
47 | " self.W = []\n",
48 | " self.bias = []\n",
49 | " for i in range(self.num_layer):\n",
50 | " self.W.append(self.add_weight(shape=[self.input_dim,1],initializer = 'glorot_uniform',name='w_{}'.format(i),trainable=True))\n",
51 | " self.bias.append(self.add_weight(shape=[self.input_dim,1],initializer = 'zeros',name='b_{}'.format(i),trainable=True))\n",
52 | " self.built = True\n",
53 | " def call(self,input):\n",
54 | "\n",
55 | " x0 = tf.einsum('bij->bji',input) # output[j][i] = m[i][j]\n",
56 | " # print(\"x0_shape\",x0.get_shape())\n",
57 | " x1 = tf.einsum('bmn,bnk->bmk',input,x0)\n",
58 | " cross = tf.einsum('bmn,nk->bmk',x1,self.W[0]) + self.bias[0] + input\n",
59 | " \n",
60 | " for i in range(1,self.num_layer):\n",
61 | " x0 = tf.einsum('bij->bji',cross) # output[j][i] = m[i][j]\n",
62 | " x1 = tf.einsum('bmn,bnk->bmk',input,x0)\n",
63 | " cross = tf.einsum('bmn,nk->bmk',x1,self.W[i]) + self.bias[i] + cross\n",
64 | " return cross\n",
65 | " \n",
66 | "class Deep(tf.keras.layers.Layer):\n",
67 | " def __init__(self,dropout_deep,deep_layer_sizes):\n",
68 | " # input_dim = num_size + embed_size = input_size\n",
69 | " super(Deep, self).__init__()\n",
70 | " self.dropout_deep = dropout_deep\n",
71 | " # fc layer\n",
72 | " self.deep_layer_sizes = deep_layer_sizes\n",
73 | " # 神经网络方面的参数\n",
74 | " for i in range(len(deep_layer_sizes)):\n",
75 | " setattr(self, 'dense_' + str(i),tf.keras.layers.Dense(deep_layer_sizes[i]))\n",
76 | " setattr(self, 'batchNorm_' + str(i),tf.keras.layers.BatchNormalization())\n",
77 | " setattr(self, 'activation_' + str(i),tf.keras.layers.Activation('relu'))\n",
78 | " setattr(self, 'dropout_' + str(i),tf.keras.layers.Dropout(dropout_deep[i]))\n",
79 | " # last layer\n",
80 | " self.fc = tf.keras.layers.Dense(128,activation=None,use_bias=True)\n",
81 | " \n",
82 | " def call(self,input):\n",
83 | " y_deep = getattr(self,'dense_' + str(0))(input)\n",
84 | " y_deep = getattr(self,'batchNorm_' + str(0))(y_deep)\n",
85 | " y_deep = getattr(self,'activation_' + str(0))(y_deep)\n",
86 | " y_deep = getattr(self,'dropout_' + str(0))(y_deep)\n",
87 | " \n",
88 | " for i in range(1,len(self.deep_layer_sizes)):\n",
89 | " y_deep = getattr(self,'dense_' + str(i))(y_deep)\n",
90 | " y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)\n",
91 | " y_deep = getattr(self,'activation_' + str(i))(y_deep)\n",
92 | " y_deep = getattr(self,'dropout_' + str(i))(y_deep)\n",
93 | " \n",
94 | " output = self.fc(y_deep)\n",
95 | " return output\n",
96 | " \n",
97 | "class DCN(tf.keras.Model):\n",
98 | " def __init__(self,num_feat,num_field,dropout_deep,deep_layer_sizes,embedding_size=10):\n",
99 | " super().__init__()\n",
100 | " self.num_feat = num_feat # F =features nums\n",
101 | " self.num_field = num_field # N =fields of a feature \n",
102 | " self.dropout_deep = dropout_deep\n",
103 | " \n",
104 | " # Embedding 这里采用embeddings层因此大小为F* M F为特征数量,M为embedding的维度\n",
105 | " feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size, embeddings_initializer='uniform') # F * M \n",
106 | " self.feat_embeddings = feat_embeddings\n",
107 | " \n",
108 | " self.crosslayer = CrossLayer(output_dim = 128,num_layer=8)\n",
109 | " \n",
110 | " self.deep = Deep(dropout_deep,deep_layer_sizes)\n",
111 | " self.fc = tf.keras.layers.Dense(1,activation='sigmoid',use_bias=True)\n",
112 | " \n",
113 | " def call(self,feat_index,feat_value):\n",
114 | " \n",
115 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
116 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
117 | "# print(feat_value.get_shape())\n",
118 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
119 | " # print(\"feat_embedding:\",feat_embedding.get_shape()) # 32 * 39 * 10\n",
120 | " stack_input = tf.keras.layers.Reshape((1,-1))(feat_embedding)\n",
121 | " # print(\"stack_input:\",stack_input.get_shape()) # 32 * 1 * 390\n",
122 | " \n",
123 | " x1 = self.crosslayer(stack_input)\n",
124 | " x2 = self.deep(stack_input)\n",
125 | " \n",
126 | " x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)\n",
127 | " output = self.fc(x3)\n",
128 | " return output"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": 4,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "train = pd.read_table('../data/Criteo/train.txt')\n",
138 | "train.columns=['label','I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9',\n",
139 | " 'I10', 'I11', 'I12', 'I13','C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7',\n",
140 | " 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17',\n",
141 | " 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26']"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 5,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "cont_features=['I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9',\n",
151 | " 'I10', 'I11', 'I12', 'I13']\n",
152 | "dist_features = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7',\n",
153 | " 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17',\n",
154 | " 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26']"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 6,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "freq_ = 10\n",
164 | "# dir_feat_dict_ = 'feat_dict_' + str(freq_) + '.pkl2'\n",
165 | "continuous_range_ = range(1, 14)\n",
166 | "categorical_range_ = range(14, 40)\n",
167 | "\n",
168 | "# 统计离散特征每个离散值出现的次数组成字典\n",
169 | "feat_cnt = Counter()\n",
170 | "with open('../data/Criteo/train.txt', 'r') as fin:\n",
171 | " for line_idx, line in enumerate(fin):\n",
172 | " features = line.rstrip('\\n').split('\\t')\n",
173 | " for idx in categorical_range_:\n",
174 | " if features[idx] == '': continue\n",
175 | " feat_cnt.update([features[idx]])\n",
176 | "# Only retain discrete features with high frequency\n",
177 | "dis_feat_set = set() # 高频段的离散字符\n",
178 | "for feat, ot in feat_cnt.items():\n",
179 | " if ot >= freq_:\n",
180 | " dis_feat_set.add(feat)"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 7,
186 | "metadata": {},
187 | "outputs": [],
188 | "source": [
189 | "# Create a dictionary for continuous and discrete features\n",
190 | "feat_dict = {}\n",
191 | "tc = 1\n",
192 | "# Continuous features\n",
193 | "for idx in continuous_range_:\n",
194 | " feat_dict[idx] = tc\n",
195 | " tc += 1 # 代表占据一列\n",
196 | "\n",
197 | "# Discrete features\n",
198 | "cnt_feat_set = set()\n",
199 | "with open('../data/Criteo/train.txt', 'r') as fin:\n",
200 | " for line_idx, line in enumerate(fin):\n",
201 | " features = line.rstrip('\\n').split('\\t')\n",
202 | " for idx in categorical_range_:\n",
203 | " # 排除空字符和低频离散字符\n",
204 | " if features[idx] == '' or features[idx] not in dis_feat_set:\n",
205 | " continue\n",
206 | " # 排除连续性数值\n",
207 | " if features[idx] not in cnt_feat_set:\n",
208 | " cnt_feat_set.add(features[idx])\n",
209 | " # 获取种类数\n",
210 | " feat_dict[features[idx]] = tc\n",
211 | " tc += 1"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 8,
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "train_label = []\n",
221 | "train_value = []\n",
222 | "train_idx = []\n",
223 | "\n",
224 | "continuous_range_ = range(1, 14)\n",
225 | "categorical_range_ = range(14, 40)\n",
226 | "cont_max_=[]\n",
227 | "cont_min_=[]\n",
228 | "for cf in cont_features:\n",
229 | " cont_max_.append(max(train[cf]))\n",
230 | " cont_min_.append(min(train[cf]))\n",
231 | "cont_diff_ = [cont_max_[i] - cont_min_[i] for i in range(len(cont_min_))]\n",
232 | "\n",
233 | "def process_line_(line):\n",
234 | " features = line.rstrip('\\n').split('\\t')\n",
235 | " feat_idx, feat_value, label = [], [], []\n",
236 | "\n",
237 | " # MinMax Normalization\n",
238 | " for idx in continuous_range_:\n",
239 | " if features[idx] == '':\n",
240 | " feat_idx.append(0)\n",
241 | " feat_value.append(0.0)\n",
242 | " else:\n",
243 | " feat_idx.append(feat_dict[idx])\n",
244 | " # 归一化\n",
245 | " feat_value.append(round((float(features[idx]) - cont_min_[idx - 1]) / cont_diff_[idx - 1], 6))\n",
246 | "\n",
247 | " # 处理离散型数据\n",
248 | " for idx in categorical_range_:\n",
249 | " if features[idx] == '' or features[idx] not in feat_dict:\n",
250 | " feat_idx.append(0)\n",
251 | " feat_value.append(0.0)\n",
252 | " else:\n",
253 | " feat_idx.append(feat_dict[features[idx]])\n",
254 | " feat_value.append(1.0)\n",
255 | " return feat_idx, feat_value, [int(features[0])]\n",
256 | "\n",
257 | "with open('../data/Criteo/train.txt', 'r') as fin:\n",
258 | " for line_idx, line in enumerate(fin):\n",
259 | "\n",
260 | " feat_idx, feat_value, label = process_line_(line)\n",
261 | " train_label.append(label)\n",
262 | " train_idx.append(feat_idx)\n",
263 | " train_value.append(feat_value)"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 9,
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "dcn= DCN(num_feat=len(feat_dict) + 1, num_field=39, dropout_deep=[0.5, 0.5, 0.5],\n",
273 | " deep_layer_sizes=[400, 400])"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 10,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "train_ds = tf.data.Dataset.from_tensor_slices(\n",
283 | " (train_label,train_idx,train_value)).shuffle(10000).batch(32)"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": 11,
289 | "metadata": {},
290 | "outputs": [],
291 | "source": [
292 | "@tf.function\n",
293 | "def train_one_step(model, optimizer, idx, value, label):\n",
294 | " with tf.GradientTape() as tape:\n",
295 | " output = model(idx,value)\n",
296 | " loss = loss_object(y_true=label, y_pred=output)\n",
297 | " grads = tape.gradient(loss, model.trainable_variables)\n",
298 | " grads = [tf.clip_by_norm(g, 100) for g in grads]\n",
299 | " optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))\n",
300 | " \n",
301 | " train_loss(loss)\n",
302 | " train_accuracy(label,output)"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 12,
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "train_loss = tf.keras.metrics.Mean(name='train_loss')\n",
312 | "train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_acc')\n",
313 | "\n",
314 | "loss_object = tf.keras.losses.BinaryCrossentropy()\n",
315 | "\n",
316 | "optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 13,
322 | "metadata": {
323 | "scrolled": true
324 | },
325 | "outputs": [
326 | {
327 | "name": "stdout",
328 | "output_type": "stream",
329 | "text": [
330 | "WARNING:tensorflow:Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
331 | " def call(self,feat_index,feat_value):\n",
332 | " \n",
333 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
334 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
335 | "# print(feat_value.get_shape())\n",
336 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
337 | " # print(\"feat_embedding:\",feat_embedding.get_shape()) # 32 * 39 * 10\n",
338 | " stack_input = tf.keras.layers.Reshape((1,-1))(feat_embedding)\n",
339 | " # print(\"stack_input:\",stack_input.get_shape()) # 32 * 1 * 390\n",
340 | " \n",
341 | " x1 = self.crosslayer(stack_input)\n",
342 | " x2 = self.deep(stack_input)\n",
343 | " \n",
344 | " x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)\n",
345 | " output = self.fc(x3)\n",
346 | " return output\n",
347 | "\n",
348 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
349 | "WARNING: Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
350 | " def call(self,feat_index,feat_value):\n",
351 | " \n",
352 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
353 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
354 | "# print(feat_value.get_shape())\n",
355 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
356 | " # print(\"feat_embedding:\",feat_embedding.get_shape()) # 32 * 39 * 10\n",
357 | " stack_input = tf.keras.layers.Reshape((1,-1))(feat_embedding)\n",
358 | " # print(\"stack_input:\",stack_input.get_shape()) # 32 * 1 * 390\n",
359 | " \n",
360 | " x1 = self.crosslayer(stack_input)\n",
361 | " x2 = self.deep(stack_input)\n",
362 | " \n",
363 | " x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)\n",
364 | " output = self.fc(x3)\n",
365 | " return output\n",
366 | "\n",
367 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
368 | "WARNING:tensorflow:Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
369 | " def call(self,feat_index,feat_value):\n",
370 | " \n",
371 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
372 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
373 | "# print(feat_value.get_shape())\n",
374 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
375 | " # print(\"feat_embedding:\",feat_embedding.get_shape()) # 32 * 39 * 10\n",
376 | " stack_input = tf.keras.layers.Reshape((1,-1))(feat_embedding)\n",
377 | " # print(\"stack_input:\",stack_input.get_shape()) # 32 * 1 * 390\n",
378 | " \n",
379 | " x1 = self.crosslayer(stack_input)\n",
380 | " x2 = self.deep(stack_input)\n",
381 | " \n",
382 | " x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)\n",
383 | " output = self.fc(x3)\n",
384 | " return output\n",
385 | "\n",
386 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
387 | "WARNING: Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
388 | " def call(self,feat_index,feat_value):\n",
389 | " \n",
390 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
391 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
392 | "# print(feat_value.get_shape())\n",
393 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
394 | " # print(\"feat_embedding:\",feat_embedding.get_shape()) # 32 * 39 * 10\n",
395 | " stack_input = tf.keras.layers.Reshape((1,-1))(feat_embedding)\n",
396 | " # print(\"stack_input:\",stack_input.get_shape()) # 32 * 1 * 390\n",
397 | " \n",
398 | " x1 = self.crosslayer(stack_input)\n",
399 | " x2 = self.deep(stack_input)\n",
400 | " \n",
401 | " x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)\n",
402 | " output = self.fc(x3)\n",
403 | " return output\n",
404 | "\n",
405 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
406 | "WARNING:tensorflow:Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
407 | " def call(self,feat_index,feat_value):\n",
408 | " \n",
409 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
410 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
411 | "# print(feat_value.get_shape())\n",
412 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
413 | " # print(\"feat_embedding:\",feat_embedding.get_shape()) # 32 * 39 * 10\n",
414 | " stack_input = tf.keras.layers.Reshape((1,-1))(feat_embedding)\n",
415 | " # print(\"stack_input:\",stack_input.get_shape()) # 32 * 1 * 390\n",
416 | " \n",
417 | " x1 = self.crosslayer(stack_input)\n",
418 | " x2 = self.deep(stack_input)\n",
419 | " \n",
420 | " x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)\n",
421 | " output = self.fc(x3)\n",
422 | " return output\n",
423 | "\n",
424 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
425 | "WARNING: Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
426 | " def call(self,feat_index,feat_value):\n",
427 | " \n",
428 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
429 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
430 | "# print(feat_value.get_shape())\n",
431 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
432 | " # print(\"feat_embedding:\",feat_embedding.get_shape()) # 32 * 39 * 10\n",
433 | " stack_input = tf.keras.layers.Reshape((1,-1))(feat_embedding)\n",
434 | " # print(\"stack_input:\",stack_input.get_shape()) # 32 * 1 * 390\n",
435 | " \n",
436 | " x1 = self.crosslayer(stack_input)\n",
437 | " x2 = self.deep(stack_input)\n",
438 | " \n",
439 | " x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)\n",
440 | " output = self.fc(x3)\n",
441 | " return output\n",
442 | "\n",
443 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
444 | "Epoch 1, Loss: 0.565358579158783, Accuracy: 0.790395200252533\n",
445 | "Epoch 2, Loss: 0.5333142280578613, Accuracy: 0.7906453013420105\n",
446 | "Epoch 3, Loss: 0.5188921093940735, Accuracy: 0.7907286882400513\n",
447 | "Epoch 4, Loss: 0.5085805654525757, Accuracy: 0.790770411491394\n",
448 | "Epoch 5, Loss: 0.5001382231712341, Accuracy: 0.7907953858375549\n",
449 | "Epoch 6, Loss: 0.49196508526802063, Accuracy: 0.790812075138092\n",
450 | "Epoch 7, Loss: 0.4845847487449646, Accuracy: 0.791538655757904\n",
451 | "Epoch 8, Loss: 0.4777772128582001, Accuracy: 0.7933967113494873\n",
452 | "Epoch 9, Loss: 0.4712851643562317, Accuracy: 0.7953976988792419\n"
453 | ]
454 | },
455 | {
456 | "name": "stdout",
457 | "output_type": "stream",
458 | "text": [
459 | "Epoch 10, Loss: 0.46522337198257446, Accuracy: 0.797548770904541\n",
460 | "Epoch 11, Loss: 0.4593830108642578, Accuracy: 0.799308717250824\n",
461 | "Epoch 12, Loss: 0.4535185396671295, Accuracy: 0.8014007210731506\n",
462 | "Epoch 13, Loss: 0.4476926326751709, Accuracy: 0.8034401535987854\n",
463 | "Epoch 14, Loss: 0.4420176148414612, Accuracy: 0.8057957291603088\n",
464 | "Epoch 15, Loss: 0.43604835867881775, Accuracy: 0.8078039288520813\n",
465 | "Epoch 16, Loss: 0.430029958486557, Accuracy: 0.8101238012313843\n",
466 | "Epoch 17, Loss: 0.4236184060573578, Accuracy: 0.8130241632461548\n",
467 | "Epoch 18, Loss: 0.41711094975471497, Accuracy: 0.8157690167427063\n",
468 | "Epoch 19, Loss: 0.410213828086853, Accuracy: 0.8188567757606506\n",
469 | "Epoch 20, Loss: 0.40275657176971436, Accuracy: 0.8226613402366638\n",
470 | "Epoch 21, Loss: 0.3947707712650299, Accuracy: 0.8265085220336914\n",
471 | "Epoch 22, Loss: 0.3864079415798187, Accuracy: 0.8308699727058411\n",
472 | "Epoch 23, Loss: 0.37755030393600464, Accuracy: 0.8352872133255005\n",
473 | "Epoch 24, Loss: 0.3682657480239868, Accuracy: 0.8399407863616943\n",
474 | "Epoch 25, Loss: 0.3589519262313843, Accuracy: 0.8447423577308655\n",
475 | "Epoch 26, Loss: 0.3493313491344452, Accuracy: 0.8495401740074158\n",
476 | "Epoch 27, Loss: 0.33972665667533875, Accuracy: 0.8542419075965881\n",
477 | "Epoch 28, Loss: 0.33029282093048096, Accuracy: 0.8588579893112183\n",
478 | "Epoch 29, Loss: 0.3210965692996979, Accuracy: 0.8632591962814331\n",
479 | "Epoch 30, Loss: 0.3121466338634491, Accuracy: 0.8674670457839966\n",
480 | "Epoch 31, Loss: 0.3034890294075012, Accuracy: 0.8714196085929871\n",
481 | "Epoch 32, Loss: 0.2950327396392822, Accuracy: 0.8753126859664917\n",
482 | "Epoch 33, Loss: 0.2869029939174652, Accuracy: 0.8790152668952942\n",
483 | "Epoch 34, Loss: 0.27917614579200745, Accuracy: 0.8824853897094727\n",
484 | "Epoch 35, Loss: 0.27175894379615784, Accuracy: 0.8858000636100769\n",
485 | "Epoch 36, Loss: 0.2646080255508423, Accuracy: 0.8889583945274353\n",
486 | "Epoch 37, Loss: 0.2577793300151825, Accuracy: 0.8919594883918762\n",
487 | "Epoch 38, Loss: 0.2512573003768921, Accuracy: 0.8948026895523071\n",
488 | "Epoch 39, Loss: 0.24505917727947235, Accuracy: 0.897487223148346\n",
489 | "Epoch 40, Loss: 0.23911045491695404, Accuracy: 0.9000500440597534\n",
490 | "Epoch 41, Loss: 0.23342998325824738, Accuracy: 0.9024878144264221\n",
491 | "Epoch 42, Loss: 0.22800736129283905, Accuracy: 0.9048095345497131\n",
492 | "Epoch 43, Loss: 0.22281348705291748, Accuracy: 0.9070232510566711\n",
493 | "Epoch 44, Loss: 0.21784667670726776, Accuracy: 0.9091364145278931\n",
494 | "Epoch 45, Loss: 0.2130877673625946, Accuracy: 0.9111555814743042\n",
495 | "Epoch 46, Loss: 0.20853079855442047, Accuracy: 0.9130869507789612\n",
496 | "Epoch 47, Loss: 0.2041596919298172, Accuracy: 0.9149361848831177\n",
497 | "Epoch 48, Loss: 0.19996346533298492, Accuracy: 0.9167083501815796\n",
498 | "Epoch 49, Loss: 0.19593490660190582, Accuracy: 0.9184081554412842\n",
499 | "Epoch 50, Loss: 0.19206039607524872, Accuracy: 0.9200400114059448\n"
500 | ]
501 | }
502 | ],
503 | "source": [
504 | "EPOCHS = 50\n",
505 | "for epoch in range(EPOCHS):\n",
506 | " for label, idx, value in train_ds:\n",
507 | " train_one_step(dcn,optimizer,idx, value,label)\n",
508 | " template = 'Epoch {}, Loss: {}, Accuracy: {}'\n",
509 | " print (template.format(epoch+1,\n",
510 | " train_loss.result(),train_accuracy.result()))"
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": null,
516 | "metadata": {},
517 | "outputs": [],
518 | "source": []
519 | }
520 | ],
521 | "metadata": {
522 | "kernelspec": {
523 | "display_name": "Python 3",
524 | "language": "python",
525 | "name": "python3"
526 | },
527 | "language_info": {
528 | "codemirror_mode": {
529 | "name": "ipython",
530 | "version": 3
531 | },
532 | "file_extension": ".py",
533 | "mimetype": "text/x-python",
534 | "name": "python",
535 | "nbconvert_exporter": "python",
536 | "pygments_lexer": "ipython3",
537 | "version": "3.6.5"
538 | }
539 | },
540 | "nbformat": 4,
541 | "nbformat_minor": 2
542 | }
543 |
--------------------------------------------------------------------------------
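Both the AFM and DCN models above combine a looked-up embedding with the per-field feat_value via tf.einsum('bnm,bn->bnm', ...): categorical fields carry a value of 1.0, continuous fields carry their min-max-scaled value, and a missing field carries 0.0, which zeroes out that field's embedding. A small hedged illustration with random tensors (TensorFlow 2.x assumed):

import tensorflow as tf

emb = tf.random.normal((2, 3, 4))            # 2 samples, 3 fields, embedding size 4
val = tf.constant([[1.0, 0.5, 0.0],          # categorical = 1.0, continuous = scaled value, missing = 0.0
                   [0.2, 1.0, 1.0]])
scaled = tf.einsum('bnm,bn->bnm', emb, val)  # each field's embedding scaled by its value
print(scaled.shape)                          # (2, 3, 4)
print(tf.reduce_all(scaled[0, 2] == 0.0))    # True: a zero value wipes out that field's embedding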
/DCN/DCN-tf2.0.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import numpy as np
3 | import pandas as pd
4 | import tensorflow as tf
5 | from collections import Counter
6 | import pickle
7 | from util.train_model import train_test_model_demo
8 |
9 |
10 | class CrossLayer(tf.keras.layers.Layer):
11 | def __init__(self,output_dim,num_layer,**kwargs):
12 | self.output_dim = output_dim
13 | self.num_layer = num_layer
14 | super(CrossLayer,self).__init__(**kwargs)
15 |
16 | def build(self,input_shape):
17 | self.input_dim = input_shape[2]
18 | # print(self.input_dim)
19 | self.W = []
20 | self.bias = []
21 | for i in range(self.num_layer):
22 | self.W.append(self.add_weight(shape=[1,self.input_dim],initializer = 'glorot_uniform',name='w_{}'.format(i),trainable=True))
23 | self.bias.append(self.add_weight(shape=[1,self.input_dim],initializer = 'zeros',name='b_{}'.format(i),trainable=True))
24 | self.built = True
25 |
26 | def call(self,input):
27 | # version that follows the paper's formula literally (kept commented out for reference)
28 | # x0 = tf.einsum('bij->bji',input) # output[j][i] = m[i][j]
29 | # print("x0_shape",x0.get_shape())# (9, 390, 1)
30 | # x1 = tf.einsum('bmn,bkm->bnk', input, x0)
31 | # print("x1_shape", x1.get_shape()) # (9, 390, 390)
32 | # print("self.W[0]_shape", self.W[0].get_shape())
33 | # cross = tf.einsum('bmn,kn->bkm',x1,self.W[0]) + self.bias[0] + input
34 | # print("cross0", cross.get_shape())# (9, 1, 390)
35 | # for i in range(1,self.num_layer):
36 | # x0 = tf.einsum('bij->bji',cross) # output[j][i] = m[i][j]
37 | # x1 = tf.einsum('bmn,bkm->bnk',input,x0)
38 | # cross = tf.einsum('bmn,kn->bkm',x1,self.W[i]) + self.bias[i] + cross
39 |
40 | # optimized form of the paper's formula: re-associating the product avoids materializing the d x d outer product (a numerical check of this identity follows this file listing)
41 | x0 = tf.einsum('bij->bji',input) # output[j][i] = m[i][j]
42 | x1 = tf.einsum('bmn,km->bnk', x0, self.W[0])
43 | cross = tf.einsum('bkm,bnk->bnm',input,x1) + self.bias[0] + input
44 | for i in range(1,self.num_layer):
45 | x0 = tf.einsum('bij->bji',cross) # output[j][i] = m[i][j]
46 | x1 = tf.einsum('bmn,km->bnk', x0, self.W[i])
47 | cross = tf.einsum('bkm,bnk->bnm', cross,x1) + self.bias[i] + cross
48 | return cross
49 |
50 | class Deep(tf.keras.layers.Layer):
51 | def __init__(self,dropout_deep,deep_layer_sizes):
52 | # input_dim = num_size + embed_size = input_size
53 | super(Deep, self).__init__()
54 | self.dropout_deep = dropout_deep
55 | # fc layer
56 | self.deep_layer_sizes = deep_layer_sizes
57 | # layers of the deep (MLP) part
58 | for i in range(len(deep_layer_sizes)):
59 | setattr(self, 'dense_' + str(i),tf.keras.layers.Dense(deep_layer_sizes[i]))
60 | setattr(self, 'batchNorm_' + str(i),tf.keras.layers.BatchNormalization())
61 | setattr(self, 'activation_' + str(i),tf.keras.layers.Activation('relu'))
62 | setattr(self, 'dropout_' + str(i),tf.keras.layers.Dropout(dropout_deep[i]))
63 | # last layer
64 | self.fc = tf.keras.layers.Dense(128,activation=None,use_bias=True)
65 |
66 | def call(self,input):
67 | y_deep = getattr(self,'dense_' + str(0))(input)
68 | y_deep = getattr(self,'batchNorm_' + str(0))(y_deep)
69 | y_deep = getattr(self,'activation_' + str(0))(y_deep)
70 | y_deep = getattr(self,'dropout_' + str(0))(y_deep)
71 |
72 | for i in range(1,len(self.deep_layer_sizes)):
73 | y_deep = getattr(self,'dense_' + str(i))(y_deep)
74 | y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)
75 | y_deep = getattr(self,'activation_' + str(i))(y_deep)
76 | y_deep = getattr(self,'dropout_' + str(i))(y_deep)
77 |
78 | output = self.fc(y_deep)
79 | return output
80 |
81 | class DCN(tf.keras.Model):
82 | def __init__(self,num_feat,num_field,dropout_deep,deep_layer_sizes,embedding_size=10):
83 | super().__init__()
84 | self.num_feat = num_feat # F =features nums
85 | self.num_field = num_field # N =fields of a feature
86 | self.dropout_deep = dropout_deep
87 |
88 | # Embedding table of shape F * M, where F is the number of distinct feature values and M is the embedding dimension
89 | feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size, embeddings_initializer='uniform') # F * M
90 | self.feat_embeddings = feat_embeddings
91 |
92 | self.crosslayer = CrossLayer(output_dim = 128,num_layer=8)
93 |
94 | self.deep = Deep(dropout_deep,deep_layer_sizes)
95 | self.fc = tf.keras.layers.Dense(1,activation='sigmoid',use_bias=True)
96 |
97 | def call(self,feat_index,feat_value):
98 |
99 | # embedding part: feat_index is the input; feat_embeddings is an Embedding layer
100 | feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M
101 | # print(feat_value.get_shape())
102 | feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)
103 | # print("feat_embedding:",feat_embedding.get_shape()) # 32 * 39 * 10
104 | stack_input = tf.keras.layers.Reshape((1,-1))(feat_embedding)
105 | # print("stack_input:",stack_input.get_shape()) # 32 * 1 * 390
106 |
107 | x1 = self.crosslayer(stack_input)
108 | x2 = self.deep(stack_input)
109 |
110 | x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)
111 | output = self.fc(x3)
112 | return output
113 |
114 | if __name__ == '__main__':
115 | AID_DATA_DIR = "../data/Criteo/"
116 | feat_dict_ = pickle.load(open(AID_DATA_DIR + '/feat_dict_10.pkl2', 'rb'))
117 |
118 | dcn = DCN(num_feat=len(feat_dict_) + 1, num_field=39, dropout_deep=[0.5, 0.5, 0.5],
119 | deep_layer_sizes=[400, 400])
120 |
121 | train_label_path = AID_DATA_DIR + 'train_label'
122 | train_idx_path = AID_DATA_DIR + 'train_idx'
123 | train_value_path = AID_DATA_DIR + 'train_value'
124 |
125 | train_test_model_demo(dcn,train_label_path, train_idx_path, train_value_path)
--------------------------------------------------------------------------------
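The active branch of CrossLayer.call (commented above as the optimized form of the paper's formula) re-associates the cross term x_0 (x_l^T w) so that the d x d outer product x_0 x_l^T from the DCN paper is never materialized. A small numerical check of that identity on random vectors, a hedged sketch using NumPy only and independent of the classes above:

import numpy as np

d = 6
x0 = np.random.randn(d)   # x_0, the stacked embedding vector
xl = np.random.randn(d)   # x_l, output of the previous cross layer
w = np.random.randn(d)    # cross weight w_l

naive = np.outer(x0, xl) @ w   # paper's form: build the d x d outer product, then multiply by w
cheap = x0 * (xl @ w)          # re-associated form: scale x_0 by the scalar x_l . w
print(np.allclose(naive, cheap))   # True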
/GBDT_LR.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "## GBDT+LR代码分析"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# Scikit-learn实现"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "import lightgbm as lgb\n",
28 | "import numpy as np\n",
29 | "import pandas as pd\n",
30 | "from sklearn.metrics import mean_squared_error\n",
31 | "from sklearn.linear_model import LogisticRegression"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 10,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "from sklearn.preprocessing import OneHotEncoder\n",
41 | "from sklearn.ensemble import GradientBoostingClassifier"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 2,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "df_train = pd.read_csv(r'F:\\Data\\recsys-data\\gbdt+lr/train.csv')\n",
51 | "df_test = pd.read_csv(r'F:\\Data\\recsys-data\\gbdt+lr/test.csv')"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 3,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "NUMERIC_COLS = [\n",
61 | " \"ps_reg_01\", \"ps_reg_02\", \"ps_reg_03\",\n",
62 | " \"ps_car_12\", \"ps_car_13\", \"ps_car_14\", \"ps_car_15\",\n",
63 | "]"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 11,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "gbdt = GradientBoostingClassifier(n_estimators=50,random_state=10,subsample = 0.6,max_depth=7,min_samples_split=900)"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 4,
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "data": {
82 | "text/html": [
83 | "\n",
84 | "\n",
97 | "
\n",
98 | " \n",
99 | " \n",
100 | " | \n",
101 | " Unnamed: 0 | \n",
102 | " id | \n",
103 | " target | \n",
104 | " ps_ind_01 | \n",
105 | " ps_ind_02_cat | \n",
106 | " ps_ind_03 | \n",
107 | " ps_ind_04_cat | \n",
108 | " ps_ind_05_cat | \n",
109 | " ps_ind_06_bin | \n",
110 | " ps_ind_07_bin | \n",
111 | " ... | \n",
112 | " ps_calc_11 | \n",
113 | " ps_calc_12 | \n",
114 | " ps_calc_13 | \n",
115 | " ps_calc_14 | \n",
116 | " ps_calc_15_bin | \n",
117 | " ps_calc_16_bin | \n",
118 | " ps_calc_17_bin | \n",
119 | " ps_calc_18_bin | \n",
120 | " ps_calc_19_bin | \n",
121 | " ps_calc_20_bin | \n",
122 | "
\n",
123 | " \n",
124 | " \n",
125 | " \n",
126 | " 0 | \n",
127 | " 8000 | \n",
128 | " 20227 | \n",
129 | " 1 | \n",
130 | " 7 | \n",
131 | " 1 | \n",
132 | " 5 | \n",
133 | " 1 | \n",
134 | " 0 | \n",
135 | " 0 | \n",
136 | " 1 | \n",
137 | " ... | \n",
138 | " 4 | \n",
139 | " 2 | \n",
140 | " 6 | \n",
141 | " 5 | \n",
142 | " 0 | \n",
143 | " 0 | \n",
144 | " 1 | \n",
145 | " 1 | \n",
146 | " 1 | \n",
147 | " 0 | \n",
148 | "
\n",
149 | " \n",
150 | " 1 | \n",
151 | " 8001 | \n",
152 | " 20228 | \n",
153 | " 1 | \n",
154 | " 0 | \n",
155 | " 1 | \n",
156 | " 6 | \n",
157 | " 1 | \n",
158 | " 0 | \n",
159 | " 1 | \n",
160 | " 0 | \n",
161 | " ... | \n",
162 | " 5 | \n",
163 | " 2 | \n",
164 | " 4 | \n",
165 | " 10 | \n",
166 | " 0 | \n",
167 | " 0 | \n",
168 | " 0 | \n",
169 | " 0 | \n",
170 | " 0 | \n",
171 | " 1 | \n",
172 | "
\n",
173 | " \n",
174 | " 2 | \n",
175 | " 8002 | \n",
176 | " 20229 | \n",
177 | " 0 | \n",
178 | " 3 | \n",
179 | " 1 | \n",
180 | " 8 | \n",
181 | " 0 | \n",
182 | " 0 | \n",
183 | " 0 | \n",
184 | " 0 | \n",
185 | " ... | \n",
186 | " 10 | \n",
187 | " 1 | \n",
188 | " 3 | \n",
189 | " 5 | \n",
190 | " 0 | \n",
191 | " 0 | \n",
192 | " 1 | \n",
193 | " 1 | \n",
194 | " 1 | \n",
195 | " 0 | \n",
196 | "
\n",
197 | " \n",
198 | " 3 | \n",
199 | " 8003 | \n",
200 | " 20235 | \n",
201 | " 0 | \n",
202 | " 2 | \n",
203 | " 1 | \n",
204 | " 8 | \n",
205 | " 0 | \n",
206 | " 0 | \n",
207 | " 0 | \n",
208 | " 0 | \n",
209 | " ... | \n",
210 | " 2 | \n",
211 | " 2 | \n",
212 | " 2 | \n",
213 | " 9 | \n",
214 | " 0 | \n",
215 | " 0 | \n",
216 | " 0 | \n",
217 | " 1 | \n",
218 | " 1 | \n",
219 | " 0 | \n",
220 | "
\n",
221 | " \n",
222 | " 4 | \n",
223 | " 8004 | \n",
224 | " 20236 | \n",
225 | " 0 | \n",
226 | " 0 | \n",
227 | " 1 | \n",
228 | " 2 | \n",
229 | " 1 | \n",
230 | " 0 | \n",
231 | " 0 | \n",
232 | " 0 | \n",
233 | " ... | \n",
234 | " 3 | \n",
235 | " 2 | \n",
236 | " 5 | \n",
237 | " 5 | \n",
238 | " 0 | \n",
239 | " 0 | \n",
240 | " 1 | \n",
241 | " 0 | \n",
242 | " 1 | \n",
243 | " 0 | \n",
244 | "
\n",
245 | " \n",
246 | "
\n",
247 | "
5 rows × 60 columns
\n",
248 | "
"
249 | ],
250 | "text/plain": [
251 | " Unnamed: 0 id target ps_ind_01 ps_ind_02_cat ps_ind_03 \\\n",
252 | "0 8000 20227 1 7 1 5 \n",
253 | "1 8001 20228 1 0 1 6 \n",
254 | "2 8002 20229 0 3 1 8 \n",
255 | "3 8003 20235 0 2 1 8 \n",
256 | "4 8004 20236 0 0 1 2 \n",
257 | "\n",
258 | " ps_ind_04_cat ps_ind_05_cat ps_ind_06_bin ps_ind_07_bin ... \\\n",
259 | "0 1 0 0 1 ... \n",
260 | "1 1 0 1 0 ... \n",
261 | "2 0 0 0 0 ... \n",
262 | "3 0 0 0 0 ... \n",
263 | "4 1 0 0 0 ... \n",
264 | "\n",
265 | " ps_calc_11 ps_calc_12 ps_calc_13 ps_calc_14 ps_calc_15_bin \\\n",
266 | "0 4 2 6 5 0 \n",
267 | "1 5 2 4 10 0 \n",
268 | "2 10 1 3 5 0 \n",
269 | "3 2 2 2 9 0 \n",
270 | "4 3 2 5 5 0 \n",
271 | "\n",
272 | " ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin \\\n",
273 | "0 0 1 1 1 \n",
274 | "1 0 0 0 0 \n",
275 | "2 0 1 1 1 \n",
276 | "3 0 0 1 1 \n",
277 | "4 0 1 0 1 \n",
278 | "\n",
279 | " ps_calc_20_bin \n",
280 | "0 0 \n",
281 | "1 1 \n",
282 | "2 0 \n",
283 | "3 0 \n",
284 | "4 0 \n",
285 | "\n",
286 | "[5 rows x 60 columns]"
287 | ]
288 | },
289 | "execution_count": 4,
290 | "metadata": {},
291 | "output_type": "execute_result"
292 | }
293 | ],
294 | "source": [
295 | "df_test.head()"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": 100,
301 | "metadata": {},
302 | "outputs": [],
303 | "source": [
304 | "y_train = df_train['target']\n",
305 | "y_test = df_test['target']\n",
306 | "X_train = df_train[NUMERIC_COLS]\n",
307 | "X_test = df_test[NUMERIC_COLS]"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 6,
313 | "metadata": {},
314 | "outputs": [
315 | {
316 | "data": {
317 | "text/plain": [
318 | "(8001, 7)"
319 | ]
320 | },
321 | "execution_count": 6,
322 | "metadata": {},
323 | "output_type": "execute_result"
324 | }
325 | ],
326 | "source": [
327 | "X_train.shape"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": 23,
333 | "metadata": {},
334 | "outputs": [],
335 | "source": [
336 | "lgb_train = lgb.Dataset(X_train,y_train)\n",
337 | "lgb_eval = lgb.Dataset(X_test,y_test,reference=lgb_train)"
338 | ]
339 | },
340 | {
341 | "cell_type": "markdown",
342 | "metadata": {},
343 | "source": [
344 | "## 设置子树为100颗,每颗树包含64支叶子的树模型。那么形成的中间特征向量为100*64"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": 24,
350 | "metadata": {},
351 | "outputs": [],
352 | "source": [
353 | "params = {\n",
354 | " 'task': 'train',\n",
355 | " 'boosting_type': 'gbdt',\n",
356 | " 'objective': 'binary',\n",
357 | " 'metric': {'binary_logloss'},\n",
358 | " 'num_leaves': 64,\n",
359 | " 'num_trees': 100,\n",
360 | " 'learning_rate': 0.01,\n",
361 | " 'feature_fraction': 0.9,\n",
362 | " 'bagging_fraction': 0.8,\n",
363 | " 'bagging_freq': 5,\n",
364 | " 'verbose': 0\n",
365 | "}"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": 25,
371 | "metadata": {},
372 | "outputs": [],
373 | "source": [
374 | "# 叶子节点数,用来进行特征转换使用\n",
375 | "num_leaf = 64"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": 26,
381 | "metadata": {
382 | "scrolled": true
383 | },
384 | "outputs": [
385 | {
386 | "name": "stderr",
387 | "output_type": "stream",
388 | "text": [
389 | "D:\\anaconda3\\julianxu\\lib\\site-packages\\lightgbm\\engine.py:148: UserWarning: Found `num_trees` in params. Will use it instead of argument\n",
390 | " warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
391 | ]
392 | },
393 | {
394 | "name": "stdout",
395 | "output_type": "stream",
396 | "text": [
397 | "[1]\ttraining's binary_logloss: 0.155602\n",
398 | "[2]\ttraining's binary_logloss: 0.155022\n",
399 | "[3]\ttraining's binary_logloss: 0.15441\n",
400 | "[4]\ttraining's binary_logloss: 0.153819\n",
401 | "[5]\ttraining's binary_logloss: 0.153267\n",
402 | "[6]\ttraining's binary_logloss: 0.152685\n",
403 | "[7]\ttraining's binary_logloss: 0.152144\n",
404 | "[8]\ttraining's binary_logloss: 0.151545\n",
405 | "[9]\ttraining's binary_logloss: 0.151029\n",
406 | "[10]\ttraining's binary_logloss: 0.15049\n",
407 | "[11]\ttraining's binary_logloss: 0.150069\n",
408 | "[12]\ttraining's binary_logloss: 0.149553\n",
409 | "[13]\ttraining's binary_logloss: 0.149064\n",
410 | "[14]\ttraining's binary_logloss: 0.148592\n",
411 | "[15]\ttraining's binary_logloss: 0.148111\n",
412 | "[16]\ttraining's binary_logloss: 0.147618\n",
413 | "[17]\ttraining's binary_logloss: 0.147086\n",
414 | "[18]\ttraining's binary_logloss: 0.146624\n",
415 | "[19]\ttraining's binary_logloss: 0.146184\n",
416 | "[20]\ttraining's binary_logloss: 0.145696\n",
417 | "[21]\ttraining's binary_logloss: 0.145182\n",
418 | "[22]\ttraining's binary_logloss: 0.144704\n",
419 | "[23]\ttraining's binary_logloss: 0.144244\n",
420 | "[24]\ttraining's binary_logloss: 0.143804\n",
421 | "[25]\ttraining's binary_logloss: 0.14335\n",
422 | "[26]\ttraining's binary_logloss: 0.142893\n",
423 | "[27]\ttraining's binary_logloss: 0.142461\n",
424 | "[28]\ttraining's binary_logloss: 0.141992\n",
425 | "[29]\ttraining's binary_logloss: 0.14154\n",
426 | "[30]\ttraining's binary_logloss: 0.141097\n",
427 | "[31]\ttraining's binary_logloss: 0.14065\n",
428 | "[32]\ttraining's binary_logloss: 0.14021\n",
429 | "[33]\ttraining's binary_logloss: 0.139826\n",
430 | "[34]\ttraining's binary_logloss: 0.139455\n",
431 | "[35]\ttraining's binary_logloss: 0.139101\n",
432 | "[36]\ttraining's binary_logloss: 0.138699\n",
433 | "[37]\ttraining's binary_logloss: 0.138313\n",
434 | "[38]\ttraining's binary_logloss: 0.137922\n",
435 | "[39]\ttraining's binary_logloss: 0.13748\n",
436 | "[40]\ttraining's binary_logloss: 0.13711\n",
437 | "[41]\ttraining's binary_logloss: 0.136669\n",
438 | "[42]\ttraining's binary_logloss: 0.136245\n",
439 | "[43]\ttraining's binary_logloss: 0.135825\n",
440 | "[44]\ttraining's binary_logloss: 0.135446\n",
441 | "[45]\ttraining's binary_logloss: 0.135044\n",
442 | "[46]\ttraining's binary_logloss: 0.134611\n",
443 | "[47]\ttraining's binary_logloss: 0.134199\n",
444 | "[48]\ttraining's binary_logloss: 0.133789\n",
445 | "[49]\ttraining's binary_logloss: 0.133391\n",
446 | "[50]\ttraining's binary_logloss: 0.133004\n",
447 | "[51]\ttraining's binary_logloss: 0.132586\n",
448 | "[52]\ttraining's binary_logloss: 0.132205\n",
449 | "[53]\ttraining's binary_logloss: 0.131787\n",
450 | "[54]\ttraining's binary_logloss: 0.131378\n",
451 | "[55]\ttraining's binary_logloss: 0.131014\n",
452 | "[56]\ttraining's binary_logloss: 0.130628\n",
453 | "[57]\ttraining's binary_logloss: 0.130253\n",
454 | "[58]\ttraining's binary_logloss: 0.129902\n",
455 | "[59]\ttraining's binary_logloss: 0.12956\n",
456 | "[60]\ttraining's binary_logloss: 0.129185\n",
457 | "[61]\ttraining's binary_logloss: 0.128838\n",
458 | "[62]\ttraining's binary_logloss: 0.128492\n",
459 | "[63]\ttraining's binary_logloss: 0.128169\n",
460 | "[64]\ttraining's binary_logloss: 0.127838\n",
461 | "[65]\ttraining's binary_logloss: 0.12748\n",
462 | "[66]\ttraining's binary_logloss: 0.127149\n",
463 | "[67]\ttraining's binary_logloss: 0.126845\n",
464 | "[68]\ttraining's binary_logloss: 0.126493\n",
465 | "[69]\ttraining's binary_logloss: 0.126139\n",
466 | "[70]\ttraining's binary_logloss: 0.125797\n",
467 | "[71]\ttraining's binary_logloss: 0.125492\n",
468 | "[72]\ttraining's binary_logloss: 0.125175\n",
469 | "[73]\ttraining's binary_logloss: 0.12489\n",
470 | "[74]\ttraining's binary_logloss: 0.124602\n",
471 | "[75]\ttraining's binary_logloss: 0.124281\n",
472 | "[76]\ttraining's binary_logloss: 0.123981\n",
473 | "[77]\ttraining's binary_logloss: 0.123696\n",
474 | "[78]\ttraining's binary_logloss: 0.123414\n",
475 | "[79]\ttraining's binary_logloss: 0.123113\n",
476 | "[80]\ttraining's binary_logloss: 0.122799\n",
477 | "[81]\ttraining's binary_logloss: 0.122486\n",
478 | "[82]\ttraining's binary_logloss: 0.122147\n",
479 | "[83]\ttraining's binary_logloss: 0.121818\n",
480 | "[84]\ttraining's binary_logloss: 0.121483\n",
481 | "[85]\ttraining's binary_logloss: 0.12115\n",
482 | "[86]\ttraining's binary_logloss: 0.120842\n",
483 | "[87]\ttraining's binary_logloss: 0.120546\n",
484 | "[88]\ttraining's binary_logloss: 0.12025\n",
485 | "[89]\ttraining's binary_logloss: 0.119959\n",
486 | "[90]\ttraining's binary_logloss: 0.119682\n",
487 | "[91]\ttraining's binary_logloss: 0.11935\n",
488 | "[92]\ttraining's binary_logloss: 0.119037\n",
489 | "[93]\ttraining's binary_logloss: 0.118712\n",
490 | "[94]\ttraining's binary_logloss: 0.118397\n",
491 | "[95]\ttraining's binary_logloss: 0.118085\n",
492 | "[96]\ttraining's binary_logloss: 0.117773\n",
493 | "[97]\ttraining's binary_logloss: 0.117491\n",
494 | "[98]\ttraining's binary_logloss: 0.117192\n",
495 | "[99]\ttraining's binary_logloss: 0.116892\n",
496 | "[100]\ttraining's binary_logloss: 0.116629\n"
497 | ]
498 | }
499 | ],
500 | "source": [
501 | "# train\n",
502 | "gbm = lgb.train(params,\n",
503 | " lgb_train,\n",
504 | " num_boost_round=100,\n",
505 | " valid_sets=lgb_train)"
506 | ]
507 | },
508 | {
509 | "cell_type": "code",
510 | "execution_count": 27,
511 | "metadata": {},
512 | "outputs": [
513 | {
514 | "name": "stdout",
515 | "output_type": "stream",
516 | "text": [
517 | "Save model...\n"
518 | ]
519 | },
520 | {
521 | "data": {
522 | "text/plain": [
523 | ""
524 | ]
525 | },
526 | "execution_count": 27,
527 | "metadata": {},
528 | "output_type": "execute_result"
529 | }
530 | ],
531 | "source": [
532 | "print('Save model...')\n",
533 | "# save model to file\n",
534 | "gbm.save_model(r'F:\\Data\\recsys-data\\gbdt+lr/model.txt')"
535 | ]
536 | },
537 | {
538 | "cell_type": "code",
539 | "execution_count": 54,
540 | "metadata": {},
541 | "outputs": [
542 | {
543 | "name": "stdout",
544 | "output_type": "stream",
545 | "text": [
546 | "Start predicting...\n"
547 | ]
548 | }
549 | ],
550 | "source": [
551 | "print('Start predicting...')\n",
552 | "# predict and get data on leaves, training data\n",
553 | "y_pred = gbm.predict(X_train, pred_leaf=True)"
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": 56,
559 | "metadata": {},
560 | "outputs": [
561 | {
562 | "data": {
563 | "text/plain": [
564 | "(8001, 7)"
565 | ]
566 | },
567 | "execution_count": 56,
568 | "metadata": {},
569 | "output_type": "execute_result"
570 | }
571 | ],
572 | "source": [
573 | "X_train.shape"
574 | ]
575 | },
576 | {
577 | "cell_type": "code",
578 | "execution_count": 55,
579 | "metadata": {},
580 | "outputs": [
581 | {
582 | "data": {
583 | "text/plain": [
584 | "array([[17, 0, 55, ..., 4, 63, 63],\n",
585 | " [62, 8, 58, ..., 47, 9, 57],\n",
586 | " [44, 0, 58, ..., 34, 62, 45],\n",
587 | " ...,\n",
588 | " [51, 19, 16, ..., 23, 33, 56],\n",
589 | " [61, 28, 58, ..., 53, 28, 18],\n",
590 | " [53, 29, 54, ..., 4, 63, 63]])"
591 | ]
592 | },
593 | "execution_count": 55,
594 | "metadata": {},
595 | "output_type": "execute_result"
596 | }
597 | ],
598 | "source": [
599 | "y_pred"
600 | ]
601 | },
602 | {
603 | "cell_type": "code",
604 | "execution_count": 29,
605 | "metadata": {},
606 | "outputs": [
607 | {
608 | "data": {
609 | "text/plain": [
610 | "(8001, 100)"
611 | ]
612 | },
613 | "execution_count": 29,
614 | "metadata": {},
615 | "output_type": "execute_result"
616 | }
617 | ],
618 | "source": [
619 | "np.array(y_pred).shape"
620 | ]
621 | },
622 | {
623 | "cell_type": "code",
624 | "execution_count": 32,
625 | "metadata": {},
626 | "outputs": [
627 | {
628 | "data": {
629 | "text/plain": [
630 | "array([17, 0, 55, 44, 47, 8, 8, 39, 8, 8, 0, 0, 0, 0, 0, 0, 38,\n",
631 | " 36, 36, 26, 15, 13, 38, 18, 41, 54, 45, 51, 55, 59, 15, 20, 2, 2,\n",
632 | " 2, 63, 56, 26, 7, 25, 46, 58, 62, 26, 19, 48, 6, 51, 5, 45, 44,\n",
633 | " 1, 44, 14, 33, 41, 10, 39, 49, 63, 51, 63, 20, 48, 52, 47, 8, 36,\n",
634 | " 8, 8, 50, 0, 32, 21, 8, 23, 48, 48, 17, 49, 46, 10, 28, 12, 59,\n",
635 | " 22, 12, 51, 34, 32, 15, 15, 53, 29, 29, 59, 59, 4, 63, 63])"
636 | ]
637 | },
638 | "execution_count": 32,
639 | "metadata": {},
640 | "output_type": "execute_result"
641 | }
642 | ],
643 | "source": [
644 | "y_pred[0]\n",
645 | "# 17,0每个数字代表每颗树的叶子节点索引"
646 | ]
647 | },
648 | {
649 | "cell_type": "code",
650 | "execution_count": 36,
651 | "metadata": {},
652 | "outputs": [],
653 | "source": [
654 | "transform_training_matrix = np.zeros([len(y_pred),len(y_pred[0])*num_leaf],dtype=np.int64) # N**num_tress*num_leaf"
655 | ]
656 | },
657 | {
658 | "cell_type": "code",
659 | "execution_count": 47,
660 | "metadata": {},
661 | "outputs": [],
662 | "source": [
663 | "for i in range(0,len(y_pred)):\n",
664 | " temp = np.arange(len(y_pred[0]))*num_leaf + np.array(y_pred[i]) # 以64为一个周期,然后加上相应的节点位置\n",
665 | " transform_training_matrix[i][temp] += 1 # 找出索引对应的值,然后加1"
666 | ]
667 | },
668 | {
669 | "cell_type": "code",
670 | "execution_count": 83,
671 | "metadata": {},
672 | "outputs": [
673 | {
674 | "data": {
675 | "text/plain": [
676 | "(8001, 6400)"
677 | ]
678 | },
679 | "execution_count": 83,
680 | "metadata": {},
681 | "output_type": "execute_result"
682 | }
683 | ],
684 | "source": [
685 | "transform_training_matrix.shape"
686 | ]
687 | },
688 | {
689 | "cell_type": "code",
690 | "execution_count": 95,
691 | "metadata": {},
692 | "outputs": [],
693 | "source": [
694 | "y_test_lgb = gbm.predict(X_test,pred_leaf=True)"
695 | ]
696 | },
697 | {
698 | "cell_type": "code",
699 | "execution_count": 65,
700 | "metadata": {},
701 | "outputs": [],
702 | "source": [
703 | "# 将预测集进行onehot转换"
704 | ]
705 | },
706 | {
707 | "cell_type": "code",
708 | "execution_count": 86,
709 | "metadata": {},
710 | "outputs": [
711 | {
712 | "data": {
713 | "text/plain": [
714 | "2000"
715 | ]
716 | },
717 | "execution_count": 86,
718 | "metadata": {},
719 | "output_type": "execute_result"
720 | }
721 | ],
722 | "source": [
723 | "len(y_test)"
724 | ]
725 | },
726 | {
727 | "cell_type": "code",
728 | "execution_count": 96,
729 | "metadata": {},
730 | "outputs": [],
731 | "source": [
732 | "transform_test_matrix = np.zeros([len(y_test_lgb),len(y_test_lgb[0])*num_leaf],dtype=np.int64)"
733 | ]
734 | },
735 | {
736 | "cell_type": "code",
737 | "execution_count": 88,
738 | "metadata": {},
739 | "outputs": [
740 | {
741 | "data": {
742 | "text/plain": [
743 | "(2000, 6400)"
744 | ]
745 | },
746 | "execution_count": 88,
747 | "metadata": {},
748 | "output_type": "execute_result"
749 | }
750 | ],
751 | "source": [
752 | "transform_test_matrix.shape"
753 | ]
754 | },
755 | {
756 | "cell_type": "code",
757 | "execution_count": 97,
758 | "metadata": {},
759 | "outputs": [],
760 | "source": [
761 | "for i in range(len(y_test_lgb)):\n",
762 | " temp = np.arange(len(y_test[0]))*num_leaf + np.array(y_test_lgb[i])\n",
763 | " transform_test_matrix[i][temp] += 1"
764 | ]
765 | },
766 | {
767 | "cell_type": "code",
768 | "execution_count": 98,
769 | "metadata": {},
770 | "outputs": [],
771 | "source": [
772 | "lm = LogisticRegression(penalty='l2',C=0.05)\n",
773 | "lm.fit(transform_training_matrix,y_train)\n",
774 | "y_pred_test = lm.predict_proba(transform_test_matrix)"
775 | ]
776 | },
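Besides the normalized-entropy number computed two cells below, a quick sanity check of the stacked GBDT→LR model is its held-out AUC. This is a hedged addition, not in the original notebook; it only reuses names already defined above (y_test, y_pred_test).

# Optional check (not in the original notebook): held-out AUC of the GBDT->LR stack,
# using the positive-class probability column from predict_proba.
from sklearn.metrics import roc_auc_score
print('test AUC:', roc_auc_score(y_test, y_pred_test[:, 1]))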
777 | {
778 | "cell_type": "code",
779 | "execution_count": 99,
780 | "metadata": {},
781 | "outputs": [
782 | {
783 | "data": {
784 | "text/plain": [
785 | "(2000, 2)"
786 | ]
787 | },
788 | "execution_count": 99,
789 | "metadata": {},
790 | "output_type": "execute_result"
791 | }
792 | ],
793 | "source": [
794 | "y_pred_test.shape"
795 | ]
796 | },
797 | {
798 | "cell_type": "code",
799 | "execution_count": 101,
800 | "metadata": {},
801 | "outputs": [
802 | {
803 | "name": "stdout",
804 | "output_type": "stream",
805 | "text": [
806 | "Normalized Cross Entropy 2.213280152050503\n"
807 | ]
808 | }
809 | ],
810 | "source": [
811 | "NE = (-1) / len(y_pred_test) * sum(((1+y_test)/2 * np.log(y_pred_test[:,1]) + (1-y_test)/2 * np.log(1 - y_pred_test[:,1])))\n",
812 | "print(\"Normalized Cross Entropy \" + str(NE))"
813 | ]
814 | },
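The formula in the cell above follows the normalized-entropy definition from the Facebook GBDT+LR paper, which assumes labels in {-1, +1} and divides by the entropy of the empirical positive rate; here y_test holds 0/1 labels and no such division is done, which is why the printed value exceeds 1. A hedged 0/1-label variant is sketched below; normalized_entropy is a hypothetical helper, not part of the notebook.

# Minimal sketch, assuming 0/1 labels: average log loss divided by the entropy
# of the empirical positive rate.
import numpy as np
from sklearn.metrics import log_loss

def normalized_entropy(y_true, p_pos):
    p = np.clip(np.mean(y_true), 1e-12, 1 - 1e-12)      # empirical positive rate
    base = -(p * np.log(p) + (1 - p) * np.log(1 - p))   # entropy of the base rate
    return log_loss(y_true, p_pos) / base

# e.g. normalized_entropy(y_test, y_pred_test[:, 1])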
815 | {
816 | "cell_type": "code",
817 | "execution_count": null,
818 | "metadata": {},
819 | "outputs": [],
820 | "source": []
821 | }
822 | ],
823 | "metadata": {
824 | "kernelspec": {
825 | "display_name": "Python 3",
826 | "language": "python",
827 | "name": "python3"
828 | },
829 | "language_info": {
830 | "codemirror_mode": {
831 | "name": "ipython",
832 | "version": 3
833 | },
834 | "file_extension": ".py",
835 | "mimetype": "text/x-python",
836 | "name": "python",
837 | "nbconvert_exporter": "python",
838 | "pygments_lexer": "ipython3",
839 | "version": "3.6.5"
840 | }
841 | },
842 | "nbformat": 4,
843 | "nbformat_minor": 1
844 | }
845 |
--------------------------------------------------------------------------------
/MLR.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 1.处理数据过程"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 3,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pandas as pd\n",
17 | "from sklearn.preprocessing import StandardScaler"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 81,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "# get_data\n",
27 | "train_data = pd.read_table(r'F:\\Data\\recsys-data\\mlr\\adult.data',header=None,delimiter=',')"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "test_data = pd.read_table(r'F:\\Data\\recsys-data\\mlr\\adult.test',header=None,delimiter=',')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "data": {
46 | "text/html": [
47 | "\n",
48 | "\n",
61 | "
\n",
62 | " \n",
63 | " \n",
64 | " | \n",
65 | " 0 | \n",
66 | " 1 | \n",
67 | " 2 | \n",
68 | " 3 | \n",
69 | " 4 | \n",
70 | " 5 | \n",
71 | " 6 | \n",
72 | " 7 | \n",
73 | " 8 | \n",
74 | " 9 | \n",
75 | " 10 | \n",
76 | " 11 | \n",
77 | " 12 | \n",
78 | " 13 | \n",
79 | " 14 | \n",
80 | "
\n",
81 | " \n",
82 | " \n",
83 | " \n",
84 | " 0 | \n",
85 | " 39 | \n",
86 | " State-gov | \n",
87 | " 77516 | \n",
88 | " Bachelors | \n",
89 | " 13 | \n",
90 | " Never-married | \n",
91 | " Adm-clerical | \n",
92 | " Not-in-family | \n",
93 | " White | \n",
94 | " Male | \n",
95 | " 2174 | \n",
96 | " 0 | \n",
97 | " 40 | \n",
98 | " United-States | \n",
99 | " <=50K | \n",
100 | "
\n",
101 | " \n",
102 | " 1 | \n",
103 | " 50 | \n",
104 | " Self-emp-not-inc | \n",
105 | " 83311 | \n",
106 | " Bachelors | \n",
107 | " 13 | \n",
108 | " Married-civ-spouse | \n",
109 | " Exec-managerial | \n",
110 | " Husband | \n",
111 | " White | \n",
112 | " Male | \n",
113 | " 0 | \n",
114 | " 0 | \n",
115 | " 13 | \n",
116 | " United-States | \n",
117 | " <=50K | \n",
118 | "
\n",
119 | " \n",
120 | " 2 | \n",
121 | " 38 | \n",
122 | " Private | \n",
123 | " 215646 | \n",
124 | " HS-grad | \n",
125 | " 9 | \n",
126 | " Divorced | \n",
127 | " Handlers-cleaners | \n",
128 | " Not-in-family | \n",
129 | " White | \n",
130 | " Male | \n",
131 | " 0 | \n",
132 | " 0 | \n",
133 | " 40 | \n",
134 | " United-States | \n",
135 | " <=50K | \n",
136 | "
\n",
137 | " \n",
138 | " 3 | \n",
139 | " 53 | \n",
140 | " Private | \n",
141 | " 234721 | \n",
142 | " 11th | \n",
143 | " 7 | \n",
144 | " Married-civ-spouse | \n",
145 | " Handlers-cleaners | \n",
146 | " Husband | \n",
147 | " Black | \n",
148 | " Male | \n",
149 | " 0 | \n",
150 | " 0 | \n",
151 | " 40 | \n",
152 | " United-States | \n",
153 | " <=50K | \n",
154 | "
\n",
155 | " \n",
156 | " 4 | \n",
157 | " 28 | \n",
158 | " Private | \n",
159 | " 338409 | \n",
160 | " Bachelors | \n",
161 | " 13 | \n",
162 | " Married-civ-spouse | \n",
163 | " Prof-specialty | \n",
164 | " Wife | \n",
165 | " Black | \n",
166 | " Female | \n",
167 | " 0 | \n",
168 | " 0 | \n",
169 | " 40 | \n",
170 | " Cuba | \n",
171 | " <=50K | \n",
172 | "
\n",
173 | " \n",
174 | "
\n",
175 | "
"
176 | ],
177 | "text/plain": [
178 | " 0 1 2 3 4 5 \\\n",
179 | "0 39 State-gov 77516 Bachelors 13 Never-married \n",
180 | "1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse \n",
181 | "2 38 Private 215646 HS-grad 9 Divorced \n",
182 | "3 53 Private 234721 11th 7 Married-civ-spouse \n",
183 | "4 28 Private 338409 Bachelors 13 Married-civ-spouse \n",
184 | "\n",
185 | " 6 7 8 9 10 11 12 \\\n",
186 | "0 Adm-clerical Not-in-family White Male 2174 0 40 \n",
187 | "1 Exec-managerial Husband White Male 0 0 13 \n",
188 | "2 Handlers-cleaners Not-in-family White Male 0 0 40 \n",
189 | "3 Handlers-cleaners Husband Black Male 0 0 40 \n",
190 | "4 Prof-specialty Wife Black Female 0 0 40 \n",
191 | "\n",
192 | " 13 14 \n",
193 | "0 United-States <=50K \n",
194 | "1 United-States <=50K \n",
195 | "2 United-States <=50K \n",
196 | "3 United-States <=50K \n",
197 | "4 Cuba <=50K "
198 | ]
199 | },
200 | "execution_count": 83,
201 | "metadata": {},
202 | "output_type": "execute_result"
203 | }
204 | ],
205 | "source": [
206 | "train_data.head()"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 98,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "test_data[14] = test_data[14].apply(lambda x: x[:-1])"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 85,
221 | "metadata": {},
222 | "outputs": [
223 | {
224 | "name": "stdout",
225 | "output_type": "stream",
226 | "text": [
227 | "\n",
228 | "RangeIndex: 32561 entries, 0 to 32560\n",
229 | "Data columns (total 15 columns):\n",
230 | "0 32561 non-null int64\n",
231 | "1 32561 non-null object\n",
232 | "2 32561 non-null int64\n",
233 | "3 32561 non-null object\n",
234 | "4 32561 non-null int64\n",
235 | "5 32561 non-null object\n",
236 | "6 32561 non-null object\n",
237 | "7 32561 non-null object\n",
238 | "8 32561 non-null object\n",
239 | "9 32561 non-null object\n",
240 | "10 32561 non-null int64\n",
241 | "11 32561 non-null int64\n",
242 | "12 32561 non-null int64\n",
243 | "13 32561 non-null object\n",
244 | "14 32561 non-null object\n",
245 | "dtypes: int64(6), object(9)\n",
246 | "memory usage: 3.7+ MB\n"
247 | ]
248 | }
249 | ],
250 | "source": [
251 | "train_data.info()"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": 86,
257 | "metadata": {},
258 | "outputs": [],
259 | "source": [
260 | "all_columns = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','label','type']\n",
261 | "continus_columns =['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']\n",
262 | "dummy_columns = ['workclass','education','marital-status','occupation','relationship','race','sex','native-country']"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 87,
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "train_data['type'] = 1\n",
272 | "test_data['type'] = 2"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 88,
278 | "metadata": {},
279 | "outputs": [
280 | {
281 | "data": {
282 | "text/html": [
283 | "\n",
284 | "\n",
297 | "
\n",
298 | " \n",
299 | " \n",
300 | " | \n",
301 | " 0 | \n",
302 | " 1 | \n",
303 | " 2 | \n",
304 | " 3 | \n",
305 | " 4 | \n",
306 | " 5 | \n",
307 | " 6 | \n",
308 | " 7 | \n",
309 | " 8 | \n",
310 | " 9 | \n",
311 | " 10 | \n",
312 | " 11 | \n",
313 | " 12 | \n",
314 | " 13 | \n",
315 | " 14 | \n",
316 | " type | \n",
317 | "
\n",
318 | " \n",
319 | " \n",
320 | " \n",
321 | " 0 | \n",
322 | " 39 | \n",
323 | " State-gov | \n",
324 | " 77516 | \n",
325 | " Bachelors | \n",
326 | " 13 | \n",
327 | " Never-married | \n",
328 | " Adm-clerical | \n",
329 | " Not-in-family | \n",
330 | " White | \n",
331 | " Male | \n",
332 | " 2174 | \n",
333 | " 0 | \n",
334 | " 40 | \n",
335 | " United-States | \n",
336 | " <=50K | \n",
337 | " 1 | \n",
338 | "
\n",
339 | " \n",
340 | " 1 | \n",
341 | " 50 | \n",
342 | " Self-emp-not-inc | \n",
343 | " 83311 | \n",
344 | " Bachelors | \n",
345 | " 13 | \n",
346 | " Married-civ-spouse | \n",
347 | " Exec-managerial | \n",
348 | " Husband | \n",
349 | " White | \n",
350 | " Male | \n",
351 | " 0 | \n",
352 | " 0 | \n",
353 | " 13 | \n",
354 | " United-States | \n",
355 | " <=50K | \n",
356 | " 1 | \n",
357 | "
\n",
358 | " \n",
359 | " 2 | \n",
360 | " 38 | \n",
361 | " Private | \n",
362 | " 215646 | \n",
363 | " HS-grad | \n",
364 | " 9 | \n",
365 | " Divorced | \n",
366 | " Handlers-cleaners | \n",
367 | " Not-in-family | \n",
368 | " White | \n",
369 | " Male | \n",
370 | " 0 | \n",
371 | " 0 | \n",
372 | " 40 | \n",
373 | " United-States | \n",
374 | " <=50K | \n",
375 | " 1 | \n",
376 | "
\n",
377 | " \n",
378 | " 3 | \n",
379 | " 53 | \n",
380 | " Private | \n",
381 | " 234721 | \n",
382 | " 11th | \n",
383 | " 7 | \n",
384 | " Married-civ-spouse | \n",
385 | " Handlers-cleaners | \n",
386 | " Husband | \n",
387 | " Black | \n",
388 | " Male | \n",
389 | " 0 | \n",
390 | " 0 | \n",
391 | " 40 | \n",
392 | " United-States | \n",
393 | " <=50K | \n",
394 | " 1 | \n",
395 | "
\n",
396 | " \n",
397 | " 4 | \n",
398 | " 28 | \n",
399 | " Private | \n",
400 | " 338409 | \n",
401 | " Bachelors | \n",
402 | " 13 | \n",
403 | " Married-civ-spouse | \n",
404 | " Prof-specialty | \n",
405 | " Wife | \n",
406 | " Black | \n",
407 | " Female | \n",
408 | " 0 | \n",
409 | " 0 | \n",
410 | " 40 | \n",
411 | " Cuba | \n",
412 | " <=50K | \n",
413 | " 1 | \n",
414 | "
\n",
415 | " \n",
416 | "
\n",
417 | "
"
418 | ],
419 | "text/plain": [
420 | " 0 1 2 3 4 5 \\\n",
421 | "0 39 State-gov 77516 Bachelors 13 Never-married \n",
422 | "1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse \n",
423 | "2 38 Private 215646 HS-grad 9 Divorced \n",
424 | "3 53 Private 234721 11th 7 Married-civ-spouse \n",
425 | "4 28 Private 338409 Bachelors 13 Married-civ-spouse \n",
426 | "\n",
427 | " 6 7 8 9 10 11 12 \\\n",
428 | "0 Adm-clerical Not-in-family White Male 2174 0 40 \n",
429 | "1 Exec-managerial Husband White Male 0 0 13 \n",
430 | "2 Handlers-cleaners Not-in-family White Male 0 0 40 \n",
431 | "3 Handlers-cleaners Husband Black Male 0 0 40 \n",
432 | "4 Prof-specialty Wife Black Female 0 0 40 \n",
433 | "\n",
434 | " 13 14 type \n",
435 | "0 United-States <=50K 1 \n",
436 | "1 United-States <=50K 1 \n",
437 | "2 United-States <=50K 1 \n",
438 | "3 United-States <=50K 1 \n",
439 | "4 Cuba <=50K 1 "
440 | ]
441 | },
442 | "execution_count": 88,
443 | "metadata": {},
444 | "output_type": "execute_result"
445 | }
446 | ],
447 | "source": [
448 | "train_data.head()"
449 | ]
450 | },
451 | {
452 | "cell_type": "code",
453 | "execution_count": 89,
454 | "metadata": {},
455 | "outputs": [
456 | {
457 | "data": {
458 | "text/html": [
459 | "\n",
460 | "\n",
473 | "
\n",
474 | " \n",
475 | " \n",
476 | " | \n",
477 | " 0 | \n",
478 | " 1 | \n",
479 | " 2 | \n",
480 | " 3 | \n",
481 | " 4 | \n",
482 | " 5 | \n",
483 | " 6 | \n",
484 | " 7 | \n",
485 | " 8 | \n",
486 | " 9 | \n",
487 | " 10 | \n",
488 | " 11 | \n",
489 | " 12 | \n",
490 | " 13 | \n",
491 | " 14 | \n",
492 | " type | \n",
493 | "
\n",
494 | " \n",
495 | " \n",
496 | " \n",
497 | " 0 | \n",
498 | " 25 | \n",
499 | " Private | \n",
500 | " 226802 | \n",
501 | " 11th | \n",
502 | " 7 | \n",
503 | " Never-married | \n",
504 | " Machine-op-inspct | \n",
505 | " Own-child | \n",
506 | " Black | \n",
507 | " Male | \n",
508 | " 0 | \n",
509 | " 0 | \n",
510 | " 40 | \n",
511 | " United-States | \n",
512 | " <=50K. | \n",
513 | " 2 | \n",
514 | "
\n",
515 | " \n",
516 | " 1 | \n",
517 | " 38 | \n",
518 | " Private | \n",
519 | " 89814 | \n",
520 | " HS-grad | \n",
521 | " 9 | \n",
522 | " Married-civ-spouse | \n",
523 | " Farming-fishing | \n",
524 | " Husband | \n",
525 | " White | \n",
526 | " Male | \n",
527 | " 0 | \n",
528 | " 0 | \n",
529 | " 50 | \n",
530 | " United-States | \n",
531 | " <=50K. | \n",
532 | " 2 | \n",
533 | "
\n",
534 | " \n",
535 | " 2 | \n",
536 | " 28 | \n",
537 | " Local-gov | \n",
538 | " 336951 | \n",
539 | " Assoc-acdm | \n",
540 | " 12 | \n",
541 | " Married-civ-spouse | \n",
542 | " Protective-serv | \n",
543 | " Husband | \n",
544 | " White | \n",
545 | " Male | \n",
546 | " 0 | \n",
547 | " 0 | \n",
548 | " 40 | \n",
549 | " United-States | \n",
550 | " >50K. | \n",
551 | " 2 | \n",
552 | "
\n",
553 | " \n",
554 | " 3 | \n",
555 | " 44 | \n",
556 | " Private | \n",
557 | " 160323 | \n",
558 | " Some-college | \n",
559 | " 10 | \n",
560 | " Married-civ-spouse | \n",
561 | " Machine-op-inspct | \n",
562 | " Husband | \n",
563 | " Black | \n",
564 | " Male | \n",
565 | " 7688 | \n",
566 | " 0 | \n",
567 | " 40 | \n",
568 | " United-States | \n",
569 | " >50K. | \n",
570 | " 2 | \n",
571 | "
\n",
572 | " \n",
573 | " 4 | \n",
574 | " 18 | \n",
575 | " ? | \n",
576 | " 103497 | \n",
577 | " Some-college | \n",
578 | " 10 | \n",
579 | " Never-married | \n",
580 | " ? | \n",
581 | " Own-child | \n",
582 | " White | \n",
583 | " Female | \n",
584 | " 0 | \n",
585 | " 0 | \n",
586 | " 30 | \n",
587 | " United-States | \n",
588 | " <=50K. | \n",
589 | " 2 | \n",
590 | "
\n",
591 | " \n",
592 | "
\n",
593 | "
"
594 | ],
595 | "text/plain": [
596 | " 0 1 2 3 4 5 \\\n",
597 | "0 25 Private 226802 11th 7 Never-married \n",
598 | "1 38 Private 89814 HS-grad 9 Married-civ-spouse \n",
599 | "2 28 Local-gov 336951 Assoc-acdm 12 Married-civ-spouse \n",
600 | "3 44 Private 160323 Some-college 10 Married-civ-spouse \n",
601 | "4 18 ? 103497 Some-college 10 Never-married \n",
602 | "\n",
603 | " 6 7 8 9 10 11 12 \\\n",
604 | "0 Machine-op-inspct Own-child Black Male 0 0 40 \n",
605 | "1 Farming-fishing Husband White Male 0 0 50 \n",
606 | "2 Protective-serv Husband White Male 0 0 40 \n",
607 | "3 Machine-op-inspct Husband Black Male 7688 0 40 \n",
608 | "4 ? Own-child White Female 0 0 30 \n",
609 | "\n",
610 | " 13 14 type \n",
611 | "0 United-States <=50K. 2 \n",
612 | "1 United-States <=50K. 2 \n",
613 | "2 United-States >50K. 2 \n",
614 | "3 United-States >50K. 2 \n",
615 | "4 United-States <=50K. 2 "
616 | ]
617 | },
618 | "execution_count": 89,
619 | "metadata": {},
620 | "output_type": "execute_result"
621 | }
622 | ],
623 | "source": [
624 | "test_data.head()"
625 | ]
626 | },
627 | {
628 | "cell_type": "code",
629 | "execution_count": 99,
630 | "metadata": {},
631 | "outputs": [],
632 | "source": [
633 | "all_data = pd.concat([train_data,test_data],axis = 0)\n",
634 | "all_data.columns = all_columns"
635 | ]
636 | },
637 | {
638 | "cell_type": "code",
639 | "execution_count": 101,
640 | "metadata": {},
641 | "outputs": [],
642 | "source": [
643 | "all_data = pd.get_dummies(all_data,columns=dummy_columns)"
644 | ]
645 | },
646 | {
647 | "cell_type": "code",
648 | "execution_count": 102,
649 | "metadata": {},
650 | "outputs": [],
651 | "source": [
652 | "all_data['label'] = all_data['label'].map(lambda x:1 if x.strip()=='>50K' else 0)"
653 | ]
654 | },
655 | {
656 | "cell_type": "code",
657 | "execution_count": 103,
658 | "metadata": {},
659 | "outputs": [],
660 | "source": [
661 | "for col in continus_columns:\n",
662 | " ss = StandardScaler()\n",
663 | " all_data[col] = ss.fit_transform(all_data[[col]])"
664 | ]
665 | },
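Note that the loop above fits StandardScaler on the concatenated train+test frame, so test-set statistics leak into the scaling. A leak-free variant (hedged sketch, reusing the already-defined continus_columns and the type marker) fits the scaler on the training rows only and applies the same transform to both splits.

# Leak-free alternative (sketch, not what the notebook runs): fit on training rows only.
from sklearn.preprocessing import StandardScaler

train_mask = all_data['type'] == 1
ss = StandardScaler().fit(all_data.loc[train_mask, continus_columns])
all_data.loc[train_mask, continus_columns] = ss.transform(all_data.loc[train_mask, continus_columns])
all_data.loc[~train_mask, continus_columns] = ss.transform(all_data.loc[~train_mask, continus_columns])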
666 | {
667 | "cell_type": "code",
668 | "execution_count": 104,
669 | "metadata": {},
670 | "outputs": [],
671 | "source": [
672 | "test_data = all_data[all_data['type']==2].drop(['type'],axis=1)\n",
673 | "train_data = all_data[all_data['type']==1].drop(['type'],axis=1)"
674 | ]
675 | },
676 | {
677 | "cell_type": "code",
678 | "execution_count": 105,
679 | "metadata": {},
680 | "outputs": [
681 | {
682 | "data": {
683 | "text/html": [
684 | "\n",
685 | "\n",
698 | "
\n",
699 | " \n",
700 | " \n",
701 | " | \n",
702 | " age | \n",
703 | " fnlwgt | \n",
704 | " education-num | \n",
705 | " capital-gain | \n",
706 | " capital-loss | \n",
707 | " hours-per-week | \n",
708 | " label | \n",
709 | " workclass_ ? | \n",
710 | " workclass_ Federal-gov | \n",
711 | " workclass_ Local-gov | \n",
712 | " ... | \n",
713 | " native-country_ Portugal | \n",
714 | " native-country_ Puerto-Rico | \n",
715 | " native-country_ Scotland | \n",
716 | " native-country_ South | \n",
717 | " native-country_ Taiwan | \n",
718 | " native-country_ Thailand | \n",
719 | " native-country_ Trinadad&Tobago | \n",
720 | " native-country_ United-States | \n",
721 | " native-country_ Vietnam | \n",
722 | " native-country_ Yugoslavia | \n",
723 | "
\n",
724 | " \n",
725 | " \n",
726 | " \n",
727 | " 0 | \n",
728 | " 0.025996 | \n",
729 | " -1.061979 | \n",
730 | " 1.136512 | \n",
731 | " 0.146932 | \n",
732 | " -0.217127 | \n",
733 | " -0.034087 | \n",
734 | " 0 | \n",
735 | " 0 | \n",
736 | " 0 | \n",
737 | " 0 | \n",
738 | " ... | \n",
739 | " 0 | \n",
740 | " 0 | \n",
741 | " 0 | \n",
742 | " 0 | \n",
743 | " 0 | \n",
744 | " 0 | \n",
745 | " 0 | \n",
746 | " 1 | \n",
747 | " 0 | \n",
748 | " 0 | \n",
749 | "
\n",
750 | " \n",
751 | " 1 | \n",
752 | " 0.828308 | \n",
753 | " -1.007104 | \n",
754 | " 1.136512 | \n",
755 | " -0.144804 | \n",
756 | " -0.217127 | \n",
757 | " -2.213032 | \n",
758 | " 0 | \n",
759 | " 0 | \n",
760 | " 0 | \n",
761 | " 0 | \n",
762 | " ... | \n",
763 | " 0 | \n",
764 | " 0 | \n",
765 | " 0 | \n",
766 | " 0 | \n",
767 | " 0 | \n",
768 | " 0 | \n",
769 | " 0 | \n",
770 | " 1 | \n",
771 | " 0 | \n",
772 | " 0 | \n",
773 | "
\n",
774 | " \n",
775 | " 2 | \n",
776 | " -0.046942 | \n",
777 | " 0.246034 | \n",
778 | " -0.419335 | \n",
779 | " -0.144804 | \n",
780 | " -0.217127 | \n",
781 | " -0.034087 | \n",
782 | " 0 | \n",
783 | " 0 | \n",
784 | " 0 | \n",
785 | " 0 | \n",
786 | " ... | \n",
787 | " 0 | \n",
788 | " 0 | \n",
789 | " 0 | \n",
790 | " 0 | \n",
791 | " 0 | \n",
792 | " 0 | \n",
793 | " 0 | \n",
794 | " 1 | \n",
795 | " 0 | \n",
796 | " 0 | \n",
797 | "
\n",
798 | " \n",
799 | " 3 | \n",
800 | " 1.047121 | \n",
801 | " 0.426663 | \n",
802 | " -1.197259 | \n",
803 | " -0.144804 | \n",
804 | " -0.217127 | \n",
805 | " -0.034087 | \n",
806 | " 0 | \n",
807 | " 0 | \n",
808 | " 0 | \n",
809 | " 0 | \n",
810 | " ... | \n",
811 | " 0 | \n",
812 | " 0 | \n",
813 | " 0 | \n",
814 | " 0 | \n",
815 | " 0 | \n",
816 | " 0 | \n",
817 | " 0 | \n",
818 | " 1 | \n",
819 | " 0 | \n",
820 | " 0 | \n",
821 | "
\n",
822 | " \n",
823 | " 4 | \n",
824 | " -0.776316 | \n",
825 | " 1.408530 | \n",
826 | " 1.136512 | \n",
827 | " -0.144804 | \n",
828 | " -0.217127 | \n",
829 | " -0.034087 | \n",
830 | " 0 | \n",
831 | " 0 | \n",
832 | " 0 | \n",
833 | " 0 | \n",
834 | " ... | \n",
835 | " 0 | \n",
836 | " 0 | \n",
837 | " 0 | \n",
838 | " 0 | \n",
839 | " 0 | \n",
840 | " 0 | \n",
841 | " 0 | \n",
842 | " 0 | \n",
843 | " 0 | \n",
844 | " 0 | \n",
845 | "
\n",
846 | " \n",
847 | "
\n",
848 | "
5 rows × 109 columns
\n",
849 | "
"
850 | ],
851 | "text/plain": [
852 | " age fnlwgt education-num capital-gain capital-loss \\\n",
853 | "0 0.025996 -1.061979 1.136512 0.146932 -0.217127 \n",
854 | "1 0.828308 -1.007104 1.136512 -0.144804 -0.217127 \n",
855 | "2 -0.046942 0.246034 -0.419335 -0.144804 -0.217127 \n",
856 | "3 1.047121 0.426663 -1.197259 -0.144804 -0.217127 \n",
857 | "4 -0.776316 1.408530 1.136512 -0.144804 -0.217127 \n",
858 | "\n",
859 | " hours-per-week label workclass_ ? workclass_ Federal-gov \\\n",
860 | "0 -0.034087 0 0 0 \n",
861 | "1 -2.213032 0 0 0 \n",
862 | "2 -0.034087 0 0 0 \n",
863 | "3 -0.034087 0 0 0 \n",
864 | "4 -0.034087 0 0 0 \n",
865 | "\n",
866 | " workclass_ Local-gov ... native-country_ Portugal \\\n",
867 | "0 0 ... 0 \n",
868 | "1 0 ... 0 \n",
869 | "2 0 ... 0 \n",
870 | "3 0 ... 0 \n",
871 | "4 0 ... 0 \n",
872 | "\n",
873 | " native-country_ Puerto-Rico native-country_ Scotland \\\n",
874 | "0 0 0 \n",
875 | "1 0 0 \n",
876 | "2 0 0 \n",
877 | "3 0 0 \n",
878 | "4 0 0 \n",
879 | "\n",
880 | " native-country_ South native-country_ Taiwan native-country_ Thailand \\\n",
881 | "0 0 0 0 \n",
882 | "1 0 0 0 \n",
883 | "2 0 0 0 \n",
884 | "3 0 0 0 \n",
885 | "4 0 0 0 \n",
886 | "\n",
887 | " native-country_ Trinadad&Tobago native-country_ United-States \\\n",
888 | "0 0 1 \n",
889 | "1 0 1 \n",
890 | "2 0 1 \n",
891 | "3 0 1 \n",
892 | "4 0 0 \n",
893 | "\n",
894 | " native-country_ Vietnam native-country_ Yugoslavia \n",
895 | "0 0 0 \n",
896 | "1 0 0 \n",
897 | "2 0 0 \n",
898 | "3 0 0 \n",
899 | "4 0 0 \n",
900 | "\n",
901 | "[5 rows x 109 columns]"
902 | ]
903 | },
904 | "execution_count": 105,
905 | "metadata": {},
906 | "output_type": "execute_result"
907 | }
908 | ],
909 | "source": [
910 | "train_data.head()"
911 | ]
912 | },
913 | {
914 | "cell_type": "code",
915 | "execution_count": 106,
916 | "metadata": {},
917 | "outputs": [],
918 | "source": [
919 | "train_y = train_data['label']\n",
920 | "train_x = train_data.drop(['label'],axis = 1)\n",
921 | "test_y = test_data['label']\n",
922 | "test_x = test_data.drop(['label'],axis = 1)"
923 | ]
924 | },
925 | {
926 | "cell_type": "code",
927 | "execution_count": 107,
928 | "metadata": {},
929 | "outputs": [
930 | {
931 | "data": {
932 | "text/plain": [
933 | "((24720, 109), (7841, 109))"
934 | ]
935 | },
936 | "execution_count": 107,
937 | "metadata": {},
938 | "output_type": "execute_result"
939 | }
940 | ],
941 | "source": [
942 | "train_data[train_data['label']==0].shape,train_data[train_data['label']==1].shape"
943 | ]
944 | },
945 | {
946 | "cell_type": "code",
947 | "execution_count": 108,
948 | "metadata": {},
949 | "outputs": [
950 | {
951 | "data": {
952 | "text/plain": [
953 | "((12435, 109), (3846, 109))"
954 | ]
955 | },
956 | "execution_count": 108,
957 | "metadata": {},
958 | "output_type": "execute_result"
959 | }
960 | ],
961 | "source": [
962 | "test_data[test_data['label']==0].shape,test_data[test_data['label']==1].shape"
963 | ]
964 | },
965 | {
966 | "cell_type": "markdown",
967 | "metadata": {},
968 | "source": [
969 | "# 数据处理完后,特征的维度是108维。"
970 | ]
971 | },
972 | {
973 | "cell_type": "code",
974 | "execution_count": 36,
975 | "metadata": {},
976 | "outputs": [],
977 | "source": [
978 | "import tensorflow as tf\n",
979 | "import time\n",
980 | "from sklearn.metrics import roc_auc_score"
981 | ]
982 | },
983 | {
984 | "cell_type": "code",
985 | "execution_count": 37,
986 | "metadata": {},
987 | "outputs": [],
988 | "source": [
989 | "x = tf.placeholder(tf.float32,shape=[None,108])\n",
990 | "y = tf.placeholder(tf.float32,shape=[None])"
991 | ]
992 | },
993 | {
994 | "cell_type": "code",
995 | "execution_count": 38,
996 | "metadata": {},
997 | "outputs": [],
998 | "source": [
999 | "m = 2\n",
1000 | "learning_rate = 0.3\n",
1001 | "# 聚类参数\n",
1002 | "u = tf.Variable(tf.random_normal([108,m],0.0,0.5),name='u')\n",
1003 | "w = tf.Variable(tf.random_normal([108,m],0.0,0.5),name='w')"
1004 | ]
1005 | },
1006 | {
1007 | "cell_type": "code",
1008 | "execution_count": 39,
1009 | "metadata": {},
1010 | "outputs": [],
1011 | "source": [
1012 | "U = tf.matmul(x,u)\n",
1013 | "p1 = tf.nn.softmax(U)"
1014 | ]
1015 | },
1016 | {
1017 | "cell_type": "code",
1018 | "execution_count": 40,
1019 | "metadata": {},
1020 | "outputs": [],
1021 | "source": [
1022 | "W = tf.matmul(x,w)\n",
1023 | "p2 = tf.nn.softmax(W)"
1024 | ]
1025 | },
1026 | {
1027 | "cell_type": "code",
1028 | "execution_count": 43,
1029 | "metadata": {},
1030 | "outputs": [],
1031 | "source": [
1032 | "pred = tf.reduce_sum(tf.multiply(p1,p2),1)"
1033 | ]
1034 | },
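The cells above implement MLR (LS-PLM) with m = 2 regions: p1 = softmax(x·u) plays the role of the region gate and p2 the per-region response, and the prediction is their elementwise product summed over regions. In the original LS-PLM formulation the per-region response is a sigmoid rather than a second softmax; a NumPy sketch of that standard form is given here for reference (hedged, not what this notebook runs).

# Standard MLR / LS-PLM prediction (sketch): softmax gate over m regions times a
# per-region sigmoid response.
import numpy as np

def mlr_predict(X, u, w):
    # X: (N, d); u, w: (d, m)
    gate_logits = X @ u
    gate = np.exp(gate_logits - gate_logits.max(axis=1, keepdims=True))
    gate /= gate.sum(axis=1, keepdims=True)          # softmax over regions
    response = 1.0 / (1.0 + np.exp(-(X @ w)))        # per-region sigmoid
    return (gate * response).sum(axis=1)             # (N,) predicted probabilities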
1035 | {
1036 | "cell_type": "code",
1037 | "execution_count": 111,
1038 | "metadata": {},
1039 | "outputs": [
1040 | {
1041 | "name": "stdout",
1042 | "output_type": "stream",
1043 | "text": [
1044 | "0 0 cost:11242.442383,train_auc:0.712778,test_auc:0.820132\n",
1045 | "100 6 cost:10882.122070,train_auc:0.899455,test_auc:0.896667\n",
1046 | "200 13 cost:10876.142578,train_auc:0.899216,test_auc:0.896111\n",
1047 | "300 20 cost:10874.057617,train_auc:0.898944,test_auc:0.895544\n",
1048 | "400 27 cost:10873.227539,train_auc:0.898751,test_auc:0.895171\n",
1049 | "500 34 cost:10872.964844,train_auc:0.898600,test_auc:0.894889\n",
1050 | "600 41 cost:10872.932617,train_auc:0.898473,test_auc:0.894680\n",
1051 | "700 47 cost:10873.029297,train_auc:0.898341,test_auc:0.894479\n",
1052 | "800 54 cost:10873.178711,train_auc:0.898198,test_auc:0.894281\n",
1053 | "900 61 cost:10873.346680,train_auc:0.898057,test_auc:0.894087\n",
1054 | "1000 67 cost:10873.514648,train_auc:0.897925,test_auc:0.893923\n",
1055 | "1100 74 cost:10873.672852,train_auc:0.897806,test_auc:0.893769\n",
1056 | "1200 81 cost:10873.840820,train_auc:0.897707,test_auc:0.893632\n",
1057 | "1300 88 cost:10874.041992,train_auc:0.897628,test_auc:0.893517\n",
1058 | "1400 94 cost:10874.262695,train_auc:0.897528,test_auc:0.893393\n",
1059 | "1500 101 cost:10874.436523,train_auc:0.897415,test_auc:0.893272\n",
1060 | "1600 108 cost:10874.587891,train_auc:0.897307,test_auc:0.893159\n",
1061 | "1700 114 cost:10874.765625,train_auc:0.897202,test_auc:0.893054\n",
1062 | "1800 121 cost:10874.981445,train_auc:0.897095,test_auc:0.892930\n",
1063 | "1900 128 cost:10875.167969,train_auc:0.896978,test_auc:0.892802\n",
1064 | "2000 134 cost:10875.333008,train_auc:0.896860,test_auc:0.892688\n",
1065 | "2100 141 cost:10875.524414,train_auc:0.896738,test_auc:0.892574\n",
1066 | "2200 148 cost:10875.757812,train_auc:0.896615,test_auc:0.892461\n",
1067 | "2300 155 cost:10875.997070,train_auc:0.896485,test_auc:0.892339\n",
1068 | "2400 161 cost:10876.208984,train_auc:0.896360,test_auc:0.892227\n",
1069 | "2500 168 cost:10876.394531,train_auc:0.896243,test_auc:0.892121\n",
1070 | "2600 175 cost:10876.570312,train_auc:0.896136,test_auc:0.892023\n",
1071 | "2700 181 cost:10876.739258,train_auc:0.896034,test_auc:0.891935\n",
1072 | "2800 188 cost:10876.905273,train_auc:0.895939,test_auc:0.891854\n",
1073 | "2900 195 cost:10877.067383,train_auc:0.895846,test_auc:0.891770\n",
1074 | "3000 202 cost:10877.227539,train_auc:0.895758,test_auc:0.891693\n",
1075 | "3100 208 cost:10877.381836,train_auc:0.895672,test_auc:0.891614\n",
1076 | "3200 215 cost:10877.538086,train_auc:0.895587,test_auc:0.891543\n",
1077 | "3300 222 cost:10877.686523,train_auc:0.895506,test_auc:0.891467\n",
1078 | "3400 228 cost:10877.834961,train_auc:0.895428,test_auc:0.891393\n",
1079 | "3500 235 cost:10877.979492,train_auc:0.895351,test_auc:0.891326\n",
1080 | "3600 242 cost:10878.118164,train_auc:0.895276,test_auc:0.891256\n",
1081 | "3700 249 cost:10878.254883,train_auc:0.895202,test_auc:0.891195\n",
1082 | "3800 255 cost:10878.383789,train_auc:0.895131,test_auc:0.891134\n",
1083 | "3900 262 cost:10878.516602,train_auc:0.895061,test_auc:0.891071\n",
1084 | "4000 269 cost:10878.642578,train_auc:0.894996,test_auc:0.891007\n",
1085 | "4100 275 cost:10878.762695,train_auc:0.894933,test_auc:0.890941\n",
1086 | "4200 282 cost:10878.880859,train_auc:0.894870,test_auc:0.890885\n",
1087 | "4300 289 cost:10878.995117,train_auc:0.894808,test_auc:0.890830\n",
1088 | "4400 296 cost:10879.104492,train_auc:0.894749,test_auc:0.890773\n",
1089 | "4500 302 cost:10879.211914,train_auc:0.894694,test_auc:0.890719\n",
1090 | "4600 309 cost:10879.315430,train_auc:0.894638,test_auc:0.890661\n",
1091 | "4700 316 cost:10879.413086,train_auc:0.894587,test_auc:0.890599\n",
1092 | "4800 322 cost:10879.512695,train_auc:0.894538,test_auc:0.890540\n",
1093 | "4900 329 cost:10879.610352,train_auc:0.894489,test_auc:0.890475\n",
1094 | "5000 336 cost:10879.701172,train_auc:0.894441,test_auc:0.890410\n",
1095 | "5100 343 cost:10879.782227,train_auc:0.894394,test_auc:0.890351\n",
1096 | "5200 349 cost:10879.862305,train_auc:0.894349,test_auc:0.890298\n",
1097 | "5300 356 cost:10879.940430,train_auc:0.894302,test_auc:0.890250\n",
1098 | "5400 363 cost:10880.012695,train_auc:0.894259,test_auc:0.890202\n",
1099 | "5500 369 cost:10880.083984,train_auc:0.894218,test_auc:0.890157\n",
1100 | "5600 376 cost:10880.151367,train_auc:0.894178,test_auc:0.890111\n",
1101 | "5700 383 cost:10880.219727,train_auc:0.894140,test_auc:0.890069\n",
1102 | "5800 390 cost:10880.284180,train_auc:0.894102,test_auc:0.890030\n",
1103 | "5900 396 cost:10880.351562,train_auc:0.894066,test_auc:0.889992\n",
1104 | "6000 403 cost:10880.413086,train_auc:0.894032,test_auc:0.889956\n",
1105 | "6100 410 cost:10880.476562,train_auc:0.893997,test_auc:0.889925\n",
1106 | "6200 416 cost:10880.538086,train_auc:0.893965,test_auc:0.889892\n",
1107 | "6300 423 cost:10880.598633,train_auc:0.893930,test_auc:0.889859\n",
1108 | "6400 430 cost:10880.655273,train_auc:0.893898,test_auc:0.889828\n",
1109 | "6500 436 cost:10880.713867,train_auc:0.893868,test_auc:0.889799\n",
1110 | "6600 443 cost:10880.770508,train_auc:0.893839,test_auc:0.889774\n",
1111 | "6700 450 cost:10880.827148,train_auc:0.893811,test_auc:0.889746\n",
1112 | "6800 456 cost:10880.886719,train_auc:0.893784,test_auc:0.889719\n",
1113 | "6900 463 cost:10880.939453,train_auc:0.893758,test_auc:0.889696\n",
1114 | "7000 470 cost:10880.992188,train_auc:0.893733,test_auc:0.889676\n",
1115 | "7100 476 cost:10881.047852,train_auc:0.893709,test_auc:0.889654\n",
1116 | "7200 483 cost:10881.102539,train_auc:0.893686,test_auc:0.889627\n",
1117 | "7300 490 cost:10881.153320,train_auc:0.893661,test_auc:0.889606\n",
1118 | "7400 496 cost:10881.208008,train_auc:0.893640,test_auc:0.889589\n",
1119 | "7500 503 cost:10881.259766,train_auc:0.893618,test_auc:0.889570\n",
1120 | "7600 510 cost:10881.309570,train_auc:0.893597,test_auc:0.889553\n",
1121 | "7700 516 cost:10881.361328,train_auc:0.893579,test_auc:0.889539\n",
1122 | "7800 523 cost:10881.413086,train_auc:0.893559,test_auc:0.889522\n",
1123 | "7900 530 cost:10881.461914,train_auc:0.893539,test_auc:0.889508\n",
1124 | "8000 536 cost:10881.512695,train_auc:0.893519,test_auc:0.889491\n",
1125 | "8100 543 cost:10881.560547,train_auc:0.893500,test_auc:0.889474\n",
1126 | "8200 550 cost:10881.610352,train_auc:0.893482,test_auc:0.889456\n",
1127 | "8300 556 cost:10881.657227,train_auc:0.893465,test_auc:0.889440\n",
1128 | "8400 563 cost:10881.704102,train_auc:0.893447,test_auc:0.889421\n",
1129 | "8500 570 cost:10881.750977,train_auc:0.893431,test_auc:0.889410\n",
1130 | "8600 576 cost:10881.797852,train_auc:0.893416,test_auc:0.889400\n",
1131 | "8700 583 cost:10881.842773,train_auc:0.893401,test_auc:0.889388\n",
1132 | "8800 590 cost:10881.889648,train_auc:0.893389,test_auc:0.889375\n",
1133 | "8900 596 cost:10881.931641,train_auc:0.893376,test_auc:0.889363\n",
1134 | "9000 603 cost:10881.979492,train_auc:0.893363,test_auc:0.889353\n",
1135 | "9100 610 cost:10882.022461,train_auc:0.893352,test_auc:0.889344\n",
1136 | "9200 616 cost:10882.066406,train_auc:0.893340,test_auc:0.889334\n",
1137 | "9300 623 cost:10882.108398,train_auc:0.893328,test_auc:0.889323\n",
1138 | "9400 629 cost:10882.151367,train_auc:0.893317,test_auc:0.889311\n",
1139 | "9500 636 cost:10882.194336,train_auc:0.893307,test_auc:0.889305\n",
1140 | "9600 643 cost:10882.236328,train_auc:0.893295,test_auc:0.889294\n",
1141 | "9700 649 cost:10882.278320,train_auc:0.893284,test_auc:0.889286\n",
1142 | "9800 656 cost:10882.318359,train_auc:0.893269,test_auc:0.889278\n",
1143 | "9900 663 cost:10882.359375,train_auc:0.893260,test_auc:0.889270\n"
1144 | ]
1145 | }
1146 | ],
1147 | "source": [
1148 | "cost1 = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=pred,labels=y))\n",
1149 | "cost = tf.add_n([cost1])\n",
1150 | "train_op = tf.train.FtrlOptimizer(learning_rate).minimize(cost)\n",
1151 | "time_s = time.time()\n",
1152 | "result=[]\n",
1153 | "with tf.Session() as sess:\n",
1154 | " sess.run(tf.global_variables_initializer())# 初始化\n",
1155 | " for epoch in range(0,10000):\n",
1156 | " f_dict ={x:train_x,y:train_y}\n",
1157 | " \n",
1158 | " _,cost_,predict_ = sess.run([train_op,cost,pred],feed_dict=f_dict)\n",
1159 | " \n",
1160 | " auc = roc_auc_score(train_y,predict_)\n",
1161 | " time_t =time.time()\n",
1162 | " # 测试集\n",
1163 | " if epoch % 100 ==0:\n",
1164 | " f_dict ={x:test_x,y:test_y}\n",
1165 | " _,cost_,predict_test =sess.run([train_op,cost,pred],feed_dict=f_dict)\n",
1166 | " test_auc = roc_auc_score(test_y,predict_test)\n",
1167 | " print(\"%d %1d cost:%f,train_auc:%f,test_auc:%f\"%(epoch,(time_t-time_s),cost_,auc,test_auc))\n",
1168 | " result.append([epoch,(time_t-time_s),auc,test_auc])"
1169 | ]
1170 | },
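One caveat about the training cell above: the test-set branch also passes train_op to sess.run, so the FTRL optimizer takes an update step on the test batch every 100 epochs. A leak-free evaluation step would run only cost and pred on the test feed, roughly as sketched here (hedged, shown for reference only).

# Leak-free evaluation step (sketch): omit train_op from the run list for the test feed.
if epoch % 100 == 0:
    cost_test, predict_test = sess.run([cost, pred], feed_dict={x: test_x, y: test_y})
    test_auc = roc_auc_score(test_y, predict_test)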
1171 | {
1172 | "cell_type": "code",
1173 | "execution_count": null,
1174 | "metadata": {},
1175 | "outputs": [],
1176 | "source": []
1177 | }
1178 | ],
1179 | "metadata": {
1180 | "kernelspec": {
1181 | "display_name": "Python 3",
1182 | "language": "python",
1183 | "name": "python3"
1184 | },
1185 | "language_info": {
1186 | "codemirror_mode": {
1187 | "name": "ipython",
1188 | "version": 3
1189 | },
1190 | "file_extension": ".py",
1191 | "mimetype": "text/x-python",
1192 | "name": "python",
1193 | "nbconvert_exporter": "python",
1194 | "pygments_lexer": "ipython3",
1195 | "version": "3.6.5"
1196 | }
1197 | },
1198 | "nbformat": 4,
1199 | "nbformat_minor": 2
1200 | }
1201 |
--------------------------------------------------------------------------------
/NFM/NFM.py:
--------------------------------------------------------------------------------
1 |
2 | """
3 | TensorFlow 2.0 implementation of NFM (Neural Factorization Machines)
4 | Reference:
5 | https://zhuanlan.zhihu.com/p/37522285
6 | Neural Factorization Machines for Sparse Predictive Analytics
7 | """
8 | import tensorflow as tf
9 |
10 | import pickle
11 | from util.train_model import train_test_model_demo
12 |
13 |
14 | class BiInteraction(tf.keras.layers.Layer):
15 | def __init__(self, Units=1, **kwargs):
16 | self.units = Units
17 | super(BiInteraction, self).__init__(**kwargs)
18 |
19 | def build(self, input_shape):
20 | input_dim = input_shape[2]
21 | # self.W = self.add_weight(shape=(input_dim, self.units), initializer='random_normal', trainable=True)
22 | # self.b = self.add_weight(shape=(input_dim, self.units), initializer='random_normal', trainable=True)
23 | self.linearlayer = tf.keras.layers.Dense(input_dim, activation='relu', use_bias=True)
24 |
25 | def call(self, input):
26 | # sum-square-part
27 | self.summed_features_emb = tf.reduce_sum(input,1) # None * K
28 | # print("self.summed_features_emb:",self.summed_features_emb.get_shape())
29 | self.summed_features_emb_square = tf.square(self.summed_features_emb) # None * K
30 | # square-sum-part
31 | self.squared_features_emb = tf.square(input)
32 | self.squared_sum_features_emb = tf.reduce_sum(self.squared_features_emb,1) # None * K
33 |
34 | # second order
35 | self.y_second_order = 0.5 * tf.subtract(self.summed_features_emb_square,self.squared_sum_features_emb) # None * K
36 | print("y_second_order:",self.y_second_order.get_shape()) # 128 * 10
37 | output = self.linearlayer(self.y_second_order)
38 | return output
39 |
40 | class NFM(tf.keras.Model):
41 | def __init__(self, num_feat, num_field, dropout_deep, deep_layer_sizes, embedding_size=10):
42 | super().__init__()
43 | self.num_feat = num_feat # F = total number of distinct features
44 | self.num_field = num_field # N = number of fields per sample
45 | self.dropout_deep = dropout_deep
46 |
47 | # Embedding layer of size F * M, where F is the number of features and M the embedding dimension
48 | feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size,
49 | embeddings_initializer='uniform') # F * M
50 | self.feat_embeddings = feat_embeddings
51 |
52 | # fc layer
53 | self.deep_layer_sizes = deep_layer_sizes
54 | # layers of the deep component
55 | for i in range(len(deep_layer_sizes)):
56 | setattr(self, 'dense_' + str(i), tf.keras.layers.Dense(deep_layer_sizes[i]))
57 | setattr(self, 'batchNorm_' + str(i), tf.keras.layers.BatchNormalization())
58 | setattr(self, 'activation_' + str(i), tf.keras.layers.Activation('relu'))
59 | setattr(self, 'dropout_' + str(i), tf.keras.layers.Dropout(dropout_deep[i]))
60 | self.bilayer = BiInteraction(1)
61 | # last layer
62 | self.fc = tf.keras.layers.Dense(1, activation=None, use_bias=True)
63 |
64 | self.linearlayer = tf.keras.layers.Dense(deep_layer_sizes[-1], activation='relu', use_bias=True)
65 |
66 | def call(self, feat_index, feat_value):
67 | # call() receives the input tensors
68 | # embedding part: feat_index is the input, feat_embeddings is the embedding layer
69 | feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M
70 | # print(feat_value.get_shape())
71 | feat_embedding = tf.einsum('bnm,bn->bnm', feat_embedding_0, feat_value)
72 |
73 | y_deep = self.bilayer(feat_embedding)
74 | y_linear = self.linearlayer(tf.reduce_sum(feat_embedding,1))
75 |
76 | for i in range(len(self.deep_layer_sizes)):
77 | y_deep = getattr(self, 'dense_' + str(i))(y_deep)
78 | y_deep = getattr(self, 'batchNorm_' + str(i))(y_deep)
79 | y_deep = getattr(self, 'activation_' + str(i))(y_deep)
80 | y_deep = getattr(self, 'dropout_' + str(i))(y_deep)
81 | y = y_deep + y_linear
82 | output = self.fc(y)
83 |
84 | return output
85 | if __name__ == '__main__':
86 | AID_DATA_DIR = "../data/Criteo/"
87 | feat_dict_ = pickle.load(open(AID_DATA_DIR + '/feat_dict_10.pkl2', 'rb'))
88 |
89 | nfm = NFM(num_feat=len(feat_dict_) + 1, num_field=39, dropout_deep=[0.5, 0.5, 0.5],
90 | deep_layer_sizes=[400, 400], embedding_size=10)
91 |
92 | train_label_path = AID_DATA_DIR + 'train_label'
93 | train_idx_path = AID_DATA_DIR + 'train_idx'
94 | train_value_path = AID_DATA_DIR + 'train_value'
95 |
96 | test_label_path = AID_DATA_DIR + 'test_label'
97 | test_idx_path = AID_DATA_DIR + 'test_idx'
98 | test_value_path = AID_DATA_DIR + 'test_value'
99 |
100 | train_test_model_demo(nfm,train_label_path, train_idx_path, train_value_path)
101 |
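The BiInteraction layer above relies on the FM identity that the pairwise interaction pooling sum_{i<j} v_i ⊙ v_j equals 0.5 * ((sum_i v_i)^2 - sum_i v_i^2), computed elementwise, which is exactly the sum-square / square-sum pair in call(). A quick numerical check of that identity (illustration only, not part of NFM.py):

# Numerical check of the bi-interaction identity used in BiInteraction.call().
import numpy as np

rng = np.random.default_rng(0)
v = rng.normal(size=(5, 8))                                  # 5 fields, embedding size 8

pairwise = sum(v[i] * v[j] for i in range(5) for j in range(i + 1, 5))
pooled = 0.5 * (v.sum(axis=0) ** 2 - (v ** 2).sum(axis=0))
assert np.allclose(pairwise, pooled)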
--------------------------------------------------------------------------------
/PNN/PNN-tf2.0.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "D:\\anaconda3\\julianxu\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
13 | " from ._conv import register_converters as _register_converters\n",
14 | "Using TensorFlow backend.\n"
15 | ]
16 | }
17 | ],
18 | "source": [
19 | "import numpy as np\n",
20 | "import pandas as pd\n",
21 | "# from sklearn.preprocessing import OneHotEncoder,StandarScaler\n",
22 | "from sklearn.metrics import accuracy_score\n",
23 | "import random\n",
24 | "from keras.utils import to_categorical\n",
25 | "from sklearn.preprocessing import LabelEncoder\n",
26 | "\n",
27 | "from sklearn.metrics import roc_auc_score\n",
28 | "\n",
29 | "import tensorflow as tf\n",
30 | "\n",
31 | "from collections import Counter\n",
32 | "\n",
33 | "import math"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 23,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "class PNN(tf.keras.Model):\n",
43 | " def __init__(self,num_feat,num_field,dropout_deep,deep_layer_sizes,product_layer_dim=10,reg_l1=0.01,reg_l2=1e-5,embedding_size=10,product_type='outer'):\n",
44 | " super().__init__()\n",
45 | " self.reg_l1 = reg_l1\n",
46 | " self.reg_l2 = reg_l2\n",
47 | " self.num_feat = num_feat # F =features nums\n",
48 | " self.num_field = num_field # N =fields of a feature \n",
49 | " self.product_layer_dim = product_layer_dim # D1 pnn dim\n",
50 | " self.dropout_deep = dropout_deep\n",
51 | " \n",
52 | " # Embedding 这里采用embeddings层因此大小为F* M F为特征数量,M为embedding的维度\n",
53 | " feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size, embeddings_initializer='uniform') # F * M \n",
54 | " self.feat_embeddings = feat_embeddings\n",
55 | " \n",
56 | " # 定义随机初始化\n",
57 | " initializer = tf.initializers.GlorotUniform()\n",
58 | " \n",
59 | " # linear part 线性层就是embedding层的复制,因此线性信号权重大小是D1 * N * M,为什么因此是线性层维度为 D1,embedding层维度为N* M\n",
60 | " # 因此权重大小为D1 * N *M\n",
61 | " self.linear_weights = tf.Variable(initializer(shape=(product_layer_dim,num_field,embedding_size))) # D1 * N * M\n",
62 | " \n",
63 | " # quadratic part \n",
64 | " self.product_type = product_type\n",
65 | " if product_type == 'inner':\n",
66 | " self.theta = tf.Variable(initializer(shape=(product_layer_dim,num_field))) # D1 * N\n",
67 | "\n",
68 | " else:\n",
69 | " self.quadratic_weights = tf.Variable(initializer(shape=(product_layer_dim,embedding_size, embedding_size)))# D1 * M * M\n",
70 | " \n",
71 | " # fc layer\n",
72 | " self.deep_layer_sizes = deep_layer_sizes\n",
73 | " #神经网络方面的参数\n",
74 | " for i in range(len(deep_layer_sizes)):\n",
75 | " setattr(self, 'dense_' + str(i),tf.keras.layers.Dense(deep_layer_sizes[i]))\n",
76 | " setattr(self, 'batchNorm_' + str(i),tf.keras.layers.BatchNormalization())\n",
77 | " setattr(self, 'activation_' + str(i),tf.keras.layers.Activation('relu'))\n",
78 | " setattr(self, 'dropout_' + str(i),tf.keras.layers.Dropout(dropout_deep[i]))\n",
79 | " \n",
80 | " # last layer\n",
81 | " self.fc = tf.keras.layers.Dense(1,activation=None,use_bias=True)\n",
82 | " \n",
83 | " def call(self,feat_index,feat_value):\n",
84 | " # call函数接收输入变量\n",
85 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
86 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
87 | "# print(feat_value.get_shape())\n",
88 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
89 | " # linear part \n",
90 | " lz = tf.einsum('bnm,dnm->bd',feat_embedding,self.linear_weights) # Batch * D1\n",
91 | " \n",
92 | " # quadratic part\n",
93 | " if self.product_type == 'inner':\n",
94 | " theta = tf.einsum('bnm,dn->bdnm',feat_embedding,self.theta) # Batch * D1 * N * M \n",
95 | " lp = tf.einsum('bdnm,bdnm->bd',theta,theta) # Batch * D1\n",
96 | " else:\n",
97 | " embed_sum = tf.reduce_sum(feat_embedding,axis=1) # Batch * M\n",
98 | " p = tf.einsum('bm,bn->bmn',embed_sum,embed_sum)\n",
99 | " lp = tf.einsum('bmn,dmn->bd',p,self.quadratic_weights) # Batch * D1\n",
100 | " \n",
101 | " y_deep = tf.concat((lz,lp),axis=1)\n",
102 | " y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)\n",
103 | " \n",
104 | " for i in range(len(self.deep_layer_sizes)):\n",
105 | " y_deep = getattr(self,'dense_' + str(i))(y_deep)\n",
106 | " y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)\n",
107 | " y_deep = getattr(self,'activation_' + str(i))(y_deep)\n",
108 | " y_deep = getattr(self,'dropout_' + str(i))(y_deep)\n",
109 | " \n",
110 | " output = self.fc(y_deep)\n",
111 | " \n",
112 | " return output "
113 | ]
114 | },
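In the class above, lz is the linear signal and lp the product signal. For product_type='outer' the code uses the common sum-pooling approximation p = (Σ_n f_n)(Σ_n f_n)^T instead of materializing all N² pairwise outer products, which is why the quadratic weights have shape D1 × M × M. A short shape walk-through of that branch (hedged, illustrative sizes only):

# Shape walk-through of the outer-product branch (illustrative sizes only).
import tensorflow as tf

B, N, M, D1 = 4, 39, 10, 10                                  # batch, fields, embedding dim, product dim
feat_embedding = tf.random.normal((B, N, M))
quadratic_weights = tf.random.normal((D1, M, M))

embed_sum = tf.reduce_sum(feat_embedding, axis=1)            # (B, M) sum-pooled embedding
p = tf.einsum('bm,bn->bmn', embed_sum, embed_sum)            # (B, M, M) outer product of the pooled embedding
lp = tf.einsum('bmn,dmn->bd', p, quadratic_weights)          # (B, D1) product signal
print(lp.shape)                                              # (4, 10)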
115 | {
116 | "cell_type": "code",
117 | "execution_count": 7,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "train = pd.read_csv(r'F:\\baidudownload\\kaggle-2014-criteo-master\\kaggle-2014-criteo-master\\train.tiny.csv')\n",
122 | "\n",
123 | "train = train.fillna(0)\n",
124 | "\n",
125 | "traindrop = train.drop(columns = ['Id'])\n",
126 | "\n",
127 | "traindrop.to_csv(r'F:\\baidudownload\\kaggle-2014-criteo-master\\kaggle-2014-criteo-master\\train.txt',sep='\\t', index=False,header=None)"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 11,
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "freq_ = 10\n",
137 | "# dir_feat_dict_ = 'feat_dict_' + str(freq_) + '.pkl2'\n",
138 | "continuous_range_ = range(1, 14)\n",
139 | "categorical_range_ = range(14, 40)\n",
140 | "\n",
141 | "# 统计离散特征每个离散值出现的次数组成字典\n",
142 | "feat_cnt = Counter()\n",
143 | "with open(r'F:\\baidudownload\\kaggle-2014-criteo-master\\kaggle-2014-criteo-master\\train.txt', 'r') as fin:\n",
144 | " for line_idx, line in enumerate(fin):\n",
145 | " features = line.rstrip('\\n').split('\\t')\n",
146 | " for idx in categorical_range_:\n",
147 | " if features[idx] == '': continue\n",
148 | " feat_cnt.update([features[idx]])"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 13,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "# Only retain discrete features with high frequency\n",
158 | "dis_feat_set = set() # 高频段的离散字符\n",
159 | "for feat, ot in feat_cnt.items():\n",
160 | " if ot >= freq_:\n",
161 | " dis_feat_set.add(feat)"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": 14,
167 | "metadata": {},
168 | "outputs": [],
169 | "source": [
170 | "# Create a dictionary for continuous and discrete features\n",
171 | "feat_dict = {}\n",
172 | "tc = 1\n",
173 | "# Continuous features\n",
174 | "for idx in continuous_range_:\n",
175 | " feat_dict[idx] = tc\n",
176 | " tc += 1 # 代表占据一列\n",
177 | "\n",
178 | "# Discrete features\n",
179 | "cnt_feat_set = set()\n",
180 | "with open(r'F:\\baidudownload\\kaggle-2014-criteo-master\\kaggle-2014-criteo-master\\train.txt', 'r') as fin:\n",
181 | " for line_idx, line in enumerate(fin):\n",
182 | " features = line.rstrip('\\n').split('\\t')\n",
183 | " for idx in categorical_range_:\n",
184 | " # 排除空字符和低频离散字符\n",
185 | " if features[idx] == '' or features[idx] not in dis_feat_set:\n",
186 | " continue\n",
187 | " # 排除连续性数值\n",
188 | " if features[idx] not in cnt_feat_set:\n",
189 | " cnt_feat_set.add(features[idx])\n",
190 | " # 获取种类数\n",
191 | " feat_dict[features[idx]] = tc\n",
192 | " tc += 1"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 16,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "file_path = \"F:\\\\baidudownload\\\\kaggle-2014-criteo-master\\\\kaggle-2014-criteo-master\\\\\""
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": 18,
207 | "metadata": {},
208 | "outputs": [],
209 | "source": [
210 | "cont_features=['I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9',\n",
211 | " 'I10', 'I11', 'I12', 'I13']\n",
212 | "dist_features = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7',\n",
213 | " 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17',\n",
214 | " 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26']"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 21,
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "train_label = []\n",
224 | "train_value = []\n",
225 | "train_idx = []\n",
226 | "test_label = []\n",
227 | "test_value = []\n",
228 | "test_idx = []\n",
229 | "\n",
230 | "continuous_range_ = range(1, 14)\n",
231 | "categorical_range_ = range(14, 40)\n",
232 | "cont_max_=[]\n",
233 | "cont_min_=[]\n",
234 | "for cf in cont_features:\n",
235 | " cont_max_.append(max(train[cf]))\n",
236 | " cont_min_.append(min(train[cf]))\n",
237 | "cont_diff_ = [cont_max_[i] - cont_min_[i] for i in range(len(cont_min_))]\n",
238 | "\n",
239 | "def process_line_(line):\n",
240 | " features = line.rstrip('\\n').split('\\t')\n",
241 | " feat_idx, feat_value, label = [], [], []\n",
242 | "\n",
243 | " # MinMax Normalization\n",
244 | " for idx in continuous_range_:\n",
245 | " if features[idx] == '':\n",
246 | " feat_idx.append(0)\n",
247 | " feat_value.append(0.0)\n",
248 | " else:\n",
249 | " feat_idx.append(feat_dict[idx])\n",
250 | " # 归一化\n",
251 | " feat_value.append(round((float(features[idx]) - cont_min_[idx - 1]) / cont_diff_[idx - 1], 6))\n",
252 | "\n",
253 | " # 处理离散型数据\n",
254 | " for idx in categorical_range_:\n",
255 | " if features[idx] == '' or features[idx] not in feat_dict:\n",
256 | " feat_idx.append(0)\n",
257 | " feat_value.append(0.0)\n",
258 | " else:\n",
259 | " feat_idx.append(feat_dict[features[idx]])\n",
260 | " feat_value.append(1.0)\n",
261 | " return feat_idx, feat_value, [int(features[0])]\n",
262 | "split_ratio = 0.9\n",
263 | "with open(file_path + 'train.txt', 'r') as fin:\n",
264 | " for line_idx, line in enumerate(fin):\n",
265 | "\n",
266 | " feat_idx, feat_value, label = process_line_(line)\n",
267 | " if np.random.random() <= split_ratio:\n",
268 | " train_label.append(label)\n",
269 | " train_idx.append(feat_idx)\n",
270 | " train_value.append(feat_value)\n",
271 | " else:\n",
272 | " test_label.append(label)\n",
273 | " test_idx.append(feat_idx)\n",
274 | " test_value.append(feat_value)"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": 24,
280 | "metadata": {},
281 | "outputs": [],
282 | "source": [
283 | "pnn = PNN(num_feat=len(feat_dict) + 1, num_field=39, dropout_deep=[0.5, 0.5, 0.5],\n",
284 | " deep_layer_sizes=[400, 400], product_layer_dim=10,\n",
285 | " reg_l1=0.01, reg_l2=1e-5, embedding_size=10, product_type='outer')"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 25,
291 | "metadata": {},
292 | "outputs": [],
293 | "source": [
294 | "train_ds = tf.data.Dataset.from_tensor_slices(\n",
295 | " (train_label,train_idx,train_value)).shuffle(10000).batch(32)"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": 26,
301 | "metadata": {},
302 | "outputs": [],
303 | "source": [
304 | "@tf.function\n",
305 | "def train_one_step(model, optimizer, idx, value, label):\n",
306 | " with tf.GradientTape() as tape:\n",
307 | " output = model(idx,value)\n",
308 | " loss = loss_object(y_true=label, y_pred=output)\n",
309 | " grads = tape.gradient(loss, model.trainable_variables)\n",
310 | " grads = [tf.clip_by_norm(g, 100) for g in grads]\n",
311 | " optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))\n",
312 | " \n",
313 | " train_loss(loss)\n",
314 | " train_accuracy(label,output)"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": 27,
320 | "metadata": {},
321 | "outputs": [],
322 | "source": [
323 | "train_loss = tf.keras.metrics.Mean(name='train_loss')\n",
324 | "train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_acc')\n",
325 | "\n",
326 | "loss_object = tf.keras.losses.BinaryCrossentropy()\n",
327 | "\n",
328 | "optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": 35,
334 | "metadata": {
335 | "scrolled": true
336 | },
337 | "outputs": [
338 | {
339 | "name": "stdout",
340 | "output_type": "stream",
341 | "text": [
342 | "WARNING:tensorflow:Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
343 | " def call(self,feat_index,feat_value):\n",
344 | " # call函数接收输入变量\n",
345 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
346 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
347 | "# print(feat_value.get_shape())\n",
348 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
349 | " # linear part \n",
350 | " lz = tf.einsum('bnm,dnm->bd',feat_embedding,self.linear_weights) # Batch * D1\n",
351 | " \n",
352 | " # quadratic part\n",
353 | " if self.product_type == 'inner':\n",
354 | " theta = tf.einsum('bnm,dn->bdnm',feat_embedding,self.theta) # Batch * D1 * N * M \n",
355 | " lp = tf.einsum('bdnm,bdnm->bd',theta,theta) # Batch * D1\n",
356 | " else:\n",
357 | " embed_sum = tf.reduce_sum(feat_embedding,axis=1) # Batch * M\n",
358 | " p = tf.einsum('bm,bn->bmn',embed_sum,embed_sum)\n",
359 | " lp = tf.einsum('bmn,dmn->bd',p,self.quadratic_weights) # Batch * D1\n",
360 | " \n",
361 | " y_deep = tf.concat((lz,lp),axis=1)\n",
362 | " y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)\n",
363 | " \n",
364 | " for i in range(len(self.deep_layer_sizes)):\n",
365 | " y_deep = getattr(self,'dense_' + str(i))(y_deep)\n",
366 | " y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)\n",
367 | " y_deep = getattr(self,'activation_' + str(i))(y_deep)\n",
368 | " y_deep = getattr(self,'dropout_' + str(i))(y_deep)\n",
369 | " \n",
370 | " output = self.fc(y_deep)\n",
371 | " \n",
372 | " return output \n",
373 | "\n",
374 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
375 | "WARNING: Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
376 | " def call(self,feat_index,feat_value):\n",
377 | " # call函数接收输入变量\n",
378 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
379 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
380 | "# print(feat_value.get_shape())\n",
381 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
382 | " # linear part \n",
383 | " lz = tf.einsum('bnm,dnm->bd',feat_embedding,self.linear_weights) # Batch * D1\n",
384 | " \n",
385 | " # quadratic part\n",
386 | " if self.product_type == 'inner':\n",
387 | " theta = tf.einsum('bnm,dn->bdnm',feat_embedding,self.theta) # Batch * D1 * N * M \n",
388 | " lp = tf.einsum('bdnm,bdnm->bd',theta,theta) # Batch * D1\n",
389 | " else:\n",
390 | " embed_sum = tf.reduce_sum(feat_embedding,axis=1) # Batch * M\n",
391 | " p = tf.einsum('bm,bn->bmn',embed_sum,embed_sum)\n",
392 | " lp = tf.einsum('bmn,dmn->bd',p,self.quadratic_weights) # Batch * D1\n",
393 | " \n",
394 | " y_deep = tf.concat((lz,lp),axis=1)\n",
395 | " y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)\n",
396 | " \n",
397 | " for i in range(len(self.deep_layer_sizes)):\n",
398 | " y_deep = getattr(self,'dense_' + str(i))(y_deep)\n",
399 | " y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)\n",
400 | " y_deep = getattr(self,'activation_' + str(i))(y_deep)\n",
401 | " y_deep = getattr(self,'dropout_' + str(i))(y_deep)\n",
402 | " \n",
403 | " output = self.fc(y_deep)\n",
404 | " \n",
405 | " return output \n",
406 | "\n",
407 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
408 | "WARNING:tensorflow:Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
409 | " def call(self,feat_index,feat_value):\n",
410 | " # call函数接收输入变量\n",
411 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
412 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
413 | "# print(feat_value.get_shape())\n",
414 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
415 | " # linear part \n",
416 | " lz = tf.einsum('bnm,dnm->bd',feat_embedding,self.linear_weights) # Batch * D1\n",
417 | " \n",
418 | " # quadratic part\n",
419 | " if self.product_type == 'inner':\n",
420 | " theta = tf.einsum('bnm,dn->bdnm',feat_embedding,self.theta) # Batch * D1 * N * M \n",
421 | " lp = tf.einsum('bdnm,bdnm->bd',theta,theta) # Batch * D1\n",
422 | " else:\n",
423 | " embed_sum = tf.reduce_sum(feat_embedding,axis=1) # Batch * M\n",
424 | " p = tf.einsum('bm,bn->bmn',embed_sum,embed_sum)\n",
425 | " lp = tf.einsum('bmn,dmn->bd',p,self.quadratic_weights) # Batch * D1\n",
426 | " \n",
427 | " y_deep = tf.concat((lz,lp),axis=1)\n",
428 | " y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)\n",
429 | " \n",
430 | " for i in range(len(self.deep_layer_sizes)):\n",
431 | " y_deep = getattr(self,'dense_' + str(i))(y_deep)\n",
432 | " y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)\n",
433 | " y_deep = getattr(self,'activation_' + str(i))(y_deep)\n",
434 | " y_deep = getattr(self,'dropout_' + str(i))(y_deep)\n",
435 | " \n",
436 | " output = self.fc(y_deep)\n",
437 | " \n",
438 | " return output \n",
439 | "\n",
440 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
441 | "WARNING: Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
442 | " def call(self,feat_index,feat_value):\n",
443 | " # call函数接收输入变量\n",
444 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
445 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
446 | "# print(feat_value.get_shape())\n",
447 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
448 | " # linear part \n",
449 | " lz = tf.einsum('bnm,dnm->bd',feat_embedding,self.linear_weights) # Batch * D1\n",
450 | " \n",
451 | " # quadratic part\n",
452 | " if self.product_type == 'inner':\n",
453 | " theta = tf.einsum('bnm,dn->bdnm',feat_embedding,self.theta) # Batch * D1 * N * M \n",
454 | " lp = tf.einsum('bdnm,bdnm->bd',theta,theta) # Batch * D1\n",
455 | " else:\n",
456 | " embed_sum = tf.reduce_sum(feat_embedding,axis=1) # Batch * M\n",
457 | " p = tf.einsum('bm,bn->bmn',embed_sum,embed_sum)\n",
458 | " lp = tf.einsum('bmn,dmn->bd',p,self.quadratic_weights) # Batch * D1\n",
459 | " \n",
460 | " y_deep = tf.concat((lz,lp),axis=1)\n",
461 | " y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)\n",
462 | " \n",
463 | " for i in range(len(self.deep_layer_sizes)):\n",
464 | " y_deep = getattr(self,'dense_' + str(i))(y_deep)\n",
465 | " y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)\n",
466 | " y_deep = getattr(self,'activation_' + str(i))(y_deep)\n",
467 | " y_deep = getattr(self,'dropout_' + str(i))(y_deep)\n",
468 | " \n",
469 | " output = self.fc(y_deep)\n",
470 | " \n",
471 | " return output \n",
472 | "\n",
473 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
474 | "WARNING:tensorflow:Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
475 | " def call(self,feat_index,feat_value):\n",
476 | " # call函数接收输入变量\n",
477 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
478 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
479 | "# print(feat_value.get_shape())\n",
480 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
481 | " # linear part \n",
482 | " lz = tf.einsum('bnm,dnm->bd',feat_embedding,self.linear_weights) # Batch * D1\n",
483 | " \n",
484 | " # quadratic part\n",
485 | " if self.product_type == 'inner':\n",
486 | " theta = tf.einsum('bnm,dn->bdnm',feat_embedding,self.theta) # Batch * D1 * N * M \n",
487 | " lp = tf.einsum('bdnm,bdnm->bd',theta,theta) # Batch * D1\n",
488 | " else:\n",
489 | " embed_sum = tf.reduce_sum(feat_embedding,axis=1) # Batch * M\n",
490 | " p = tf.einsum('bm,bn->bmn',embed_sum,embed_sum)\n",
491 | " lp = tf.einsum('bmn,dmn->bd',p,self.quadratic_weights) # Batch * D1\n",
492 | " \n",
493 | " y_deep = tf.concat((lz,lp),axis=1)\n",
494 | " y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)\n",
495 | " \n",
496 | " for i in range(len(self.deep_layer_sizes)):\n",
497 | " y_deep = getattr(self,'dense_' + str(i))(y_deep)\n",
498 | " y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)\n",
499 | " y_deep = getattr(self,'activation_' + str(i))(y_deep)\n",
500 | " y_deep = getattr(self,'dropout_' + str(i))(y_deep)\n",
501 | " \n",
502 | " output = self.fc(y_deep)\n",
503 | " \n",
504 | " return output \n",
505 | "\n",
506 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n"
507 | ]
508 | },
509 | {
510 | "name": "stdout",
511 | "output_type": "stream",
512 | "text": [
513 | "WARNING: Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:\n",
514 | " def call(self,feat_index,feat_value):\n",
515 | " # call函数接收输入变量\n",
516 | " # embedding part feat_index = inputs为输入 feat_embeddings为一个layer。\n",
517 | " feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M \n",
518 | "# print(feat_value.get_shape())\n",
519 | " feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)\n",
520 | " # linear part \n",
521 | " lz = tf.einsum('bnm,dnm->bd',feat_embedding,self.linear_weights) # Batch * D1\n",
522 | " \n",
523 | " # quadratic part\n",
524 | " if self.product_type == 'inner':\n",
525 | " theta = tf.einsum('bnm,dn->bdnm',feat_embedding,self.theta) # Batch * D1 * N * M \n",
526 | " lp = tf.einsum('bdnm,bdnm->bd',theta,theta) # Batch * D1\n",
527 | " else:\n",
528 | " embed_sum = tf.reduce_sum(feat_embedding,axis=1) # Batch * M\n",
529 | " p = tf.einsum('bm,bn->bmn',embed_sum,embed_sum)\n",
530 | " lp = tf.einsum('bmn,dmn->bd',p,self.quadratic_weights) # Batch * D1\n",
531 | " \n",
532 | " y_deep = tf.concat((lz,lp),axis=1)\n",
533 | " y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)\n",
534 | " \n",
535 | " for i in range(len(self.deep_layer_sizes)):\n",
536 | " y_deep = getattr(self,'dense_' + str(i))(y_deep)\n",
537 | " y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)\n",
538 | " y_deep = getattr(self,'activation_' + str(i))(y_deep)\n",
539 | " y_deep = getattr(self,'dropout_' + str(i))(y_deep)\n",
540 | " \n",
541 | " output = self.fc(y_deep)\n",
542 | " \n",
543 | " return output \n",
544 | "\n",
545 | "This may be caused by multiline strings or comments not indented at the same level as the code.\n",
546 | "Epoch 1, Loss: 0.6576068997383118, Accuracy: 0.7935805320739746\n",
547 | "Epoch 2, Loss: 0.5885103344917297, Accuracy: 0.7927504181861877\n",
548 | "Epoch 3, Loss: 0.5613061785697937, Accuracy: 0.7932115793228149\n",
549 | "Epoch 4, Loss: 0.5463097095489502, Accuracy: 0.7933038473129272\n",
550 | "Epoch 5, Loss: 0.5362721681594849, Accuracy: 0.7933591604232788\n",
551 | "Epoch 6, Loss: 0.5272665023803711, Accuracy: 0.7933960556983948\n",
552 | "Epoch 7, Loss: 0.519040048122406, Accuracy: 0.7934224009513855\n",
553 | "Epoch 8, Loss: 0.5109833478927612, Accuracy: 0.7934421896934509\n",
554 | "Epoch 9, Loss: 0.5033180713653564, Accuracy: 0.7937650084495544\n",
555 | "Epoch 10, Loss: 0.4961019456386566, Accuracy: 0.7946319580078125\n",
556 | "Epoch 11, Loss: 0.4890766143798828, Accuracy: 0.7957438230514526\n",
557 | "Epoch 12, Loss: 0.4827176034450531, Accuracy: 0.7968548536300659\n",
558 | "Epoch 13, Loss: 0.47674980759620667, Accuracy: 0.797752320766449\n",
559 | "Epoch 14, Loss: 0.4711345434188843, Accuracy: 0.7988378405570984\n",
560 | "Epoch 15, Loss: 0.4657707214355469, Accuracy: 0.800000011920929\n",
561 | "Epoch 16, Loss: 0.46048682928085327, Accuracy: 0.8010168671607971\n",
562 | "Epoch 17, Loss: 0.45555245876312256, Accuracy: 0.8017839193344116\n",
563 | "Epoch 18, Loss: 0.45082414150238037, Accuracy: 0.8027116656303406\n",
564 | "Epoch 19, Loss: 0.44618847966194153, Accuracy: 0.8038330674171448\n",
565 | "Epoch 20, Loss: 0.4417554438114166, Accuracy: 0.8047869205474854\n",
566 | "Epoch 21, Loss: 0.4372897744178772, Accuracy: 0.8057553768157959\n",
567 | "Epoch 22, Loss: 0.4330126941204071, Accuracy: 0.8065603375434875\n",
568 | "Epoch 23, Loss: 0.4288385808467865, Accuracy: 0.8073434233665466\n",
569 | "Epoch 24, Loss: 0.4247806966304779, Accuracy: 0.8084993362426758\n",
570 | "Epoch 25, Loss: 0.4208265244960785, Accuracy: 0.8095406889915466\n",
571 | "Epoch 26, Loss: 0.4169451892375946, Accuracy: 0.8103954792022705\n",
572 | "Epoch 27, Loss: 0.4130288362503052, Accuracy: 0.8114328980445862\n",
573 | "Epoch 28, Loss: 0.40920114517211914, Accuracy: 0.8125543594360352\n",
574 | "Epoch 29, Loss: 0.40538835525512695, Accuracy: 0.8135412335395813\n",
575 | "Epoch 30, Loss: 0.4015306830406189, Accuracy: 0.8148680925369263\n",
576 | "Epoch 31, Loss: 0.3976829946041107, Accuracy: 0.8161808252334595\n",
577 | "Epoch 32, Loss: 0.3940720558166504, Accuracy: 0.8173941373825073\n",
578 | "Epoch 33, Loss: 0.3902757167816162, Accuracy: 0.8186178207397461\n",
579 | "Epoch 34, Loss: 0.3865205645561218, Accuracy: 0.8198021054267883\n",
580 | "Epoch 35, Loss: 0.3826711177825928, Accuracy: 0.8213613629341125\n",
581 | "Epoch 36, Loss: 0.3788437247276306, Accuracy: 0.8228186964988708\n",
582 | "Epoch 37, Loss: 0.37512916326522827, Accuracy: 0.8244065642356873\n",
583 | "Epoch 38, Loss: 0.37138548493385315, Accuracy: 0.8259109258651733\n",
584 | "Epoch 39, Loss: 0.3679004907608032, Accuracy: 0.8273097276687622\n",
585 | "Epoch 40, Loss: 0.36420953273773193, Accuracy: 0.8287769556045532\n",
586 | "Epoch 41, Loss: 0.3605599105358124, Accuracy: 0.8302671313285828\n",
587 | "Epoch 42, Loss: 0.3569217622280121, Accuracy: 0.8318312168121338\n",
588 | "Epoch 43, Loss: 0.3537658751010895, Accuracy: 0.8332582712173462\n",
589 | "Epoch 44, Loss: 0.35020676255226135, Accuracy: 0.8348342180252075\n",
590 | "Epoch 45, Loss: 0.3465138077735901, Accuracy: 0.8364631533622742\n",
591 | "Epoch 46, Loss: 0.34285783767700195, Accuracy: 0.8380813598632812\n",
592 | "Epoch 47, Loss: 0.3392927646636963, Accuracy: 0.8396543264389038\n",
593 | "Epoch 48, Loss: 0.33572396636009216, Accuracy: 0.8413115739822388\n",
594 | "Epoch 49, Loss: 0.33254799246788025, Accuracy: 0.8427882790565491\n",
595 | "Epoch 50, Loss: 0.3292645514011383, Accuracy: 0.8442501425743103\n"
596 | ]
597 | }
598 | ],
599 | "source": [
600 | "EPOCHS = 50\n",
601 | "for epoch in range(EPOCHS):\n",
602 | " for label, idx, value in train_ds:\n",
603 | " train_one_step(pnn,optimizer,idx, value,label)\n",
604 | " template = 'Epoch {}, Loss: {}, Accuracy: {}'\n",
605 | " print (template.format(epoch+1,\n",
606 | " train_loss.result(),train_accuracy.result()))"
607 | ]
608 | },
609 | {
610 | "cell_type": "code",
611 | "execution_count": null,
612 | "metadata": {},
613 | "outputs": [],
614 | "source": []
615 | }
616 | ],
617 | "metadata": {
618 | "kernelspec": {
619 | "display_name": "Python 3",
620 | "language": "python",
621 | "name": "python3"
622 | },
623 | "language_info": {
624 | "codemirror_mode": {
625 | "name": "ipython",
626 | "version": 3
627 | },
628 | "file_extension": ".py",
629 | "mimetype": "text/x-python",
630 | "name": "python",
631 | "nbconvert_exporter": "python",
632 | "pygments_lexer": "ipython3",
633 | "version": "3.6.5"
634 | }
635 | },
636 | "nbformat": 4,
637 | "nbformat_minor": 2
638 | }
639 |
--------------------------------------------------------------------------------
/PNN/PNN.py:
--------------------------------------------------------------------------------
1 |
2 | """
3 | TensorFlow 2.0 implementation of Product-based Neural Network[1]
4 | Reference:
5 | [1] Product-based Neural Networks for User Response Prediction,
6 | Yanru Qu, Han Cai, Kan Ren, Weinan Zhang, Yong Yu, Ying Wen, Jun Wang
7 | [2] Tensorflow implementation of PNN
8 | https://github.com/Snail110/Awesome-RecSystem-Models/blob/master/Model/PNN_TensorFlow.py
9 | """
10 | import tensorflow as tf
11 |
12 | import pickle
13 | from util.train_model import train_test_model_demo
14 | class PNN(tf.keras.Model):
15 | def __init__(self, num_feat, num_field, dropout_deep, deep_layer_sizes, product_layer_dim=10, reg_l1=0.01,
16 | reg_l2=1e-5, embedding_size=10, product_type='outer'):
17 | super().__init__()
18 | self.reg_l1 = reg_l1
19 | self.reg_l2 = reg_l2
20 |         self.num_feat = num_feat  # F = number of features
21 |         self.num_field = num_field  # N = number of fields per sample
22 |         self.product_layer_dim = product_layer_dim  # D1 = product layer dimension
23 | self.dropout_deep = dropout_deep
24 |
25 |         # Embedding layer of size F * M (F = number of features, M = embedding dimension)
26 | feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size,
27 | embeddings_initializer='uniform') # F * M
28 | self.feat_embeddings = feat_embeddings
29 |
30 |         # Glorot-uniform random initializer
31 | initializer = tf.initializers.GlorotUniform()
32 |
33 |         # linear part: the linear signal works directly on the embedding output (N * M)
34 |         # and is projected to D1, so the weight tensor has size D1 * N * M
35 | self.linear_weights = tf.Variable(
36 | initializer(shape=(product_layer_dim, num_field, embedding_size))) # D1 * N * M
37 |
38 | # quadratic part
39 | self.product_type = product_type
40 | if product_type == 'inner':
41 | self.theta = tf.Variable(initializer(shape=(product_layer_dim, num_field))) # D1 * N
42 |
43 | else:
44 | self.quadratic_weights = tf.Variable(
45 | initializer(shape=(product_layer_dim, embedding_size, embedding_size))) # D1 * M * M
46 |
47 | # fc layer
48 | self.deep_layer_sizes = deep_layer_sizes
49 |         # parameters of the deep (MLP) part
50 | for i in range(len(deep_layer_sizes)):
51 | setattr(self, 'dense_' + str(i), tf.keras.layers.Dense(deep_layer_sizes[i]))
52 | setattr(self, 'batchNorm_' + str(i), tf.keras.layers.BatchNormalization())
53 | setattr(self, 'activation_' + str(i), tf.keras.layers.Activation('relu'))
54 | setattr(self, 'dropout_' + str(i), tf.keras.layers.Dropout(dropout_deep[i]))
55 |
56 | # last layer
57 | self.fc = tf.keras.layers.Dense(1, activation=None, use_bias=True)
58 |
59 | def call(self, feat_index, feat_value):
60 |         # call receives feat_index and feat_value as inputs
61 |         # embedding part: feat_index is the input; feat_embeddings is an Embedding layer
62 | feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M
63 | # print(feat_value.get_shape())
64 | feat_embedding = tf.einsum('bnm,bn->bnm', feat_embedding_0, feat_value)
65 | # linear part
66 | lz = tf.einsum('bnm,dnm->bd', feat_embedding, self.linear_weights) # Batch * D1
67 |
68 | # quadratic part
69 | if self.product_type == 'inner':
70 | theta = tf.einsum('bnm,dn->bdnm', feat_embedding, self.theta) # Batch * D1 * N * M
71 | lp = tf.einsum('bdnm,bdnm->bd', theta, theta) # Batch * D1
72 | else:
73 | embed_sum = tf.reduce_sum(feat_embedding, axis=1) # Batch * M
74 | p = tf.einsum('bm,bn->bmn', embed_sum, embed_sum)
75 | lp = tf.einsum('bmn,dmn->bd', p, self.quadratic_weights) # Batch * D1
76 |
77 | y_deep = tf.concat((lz, lp), axis=1)
78 | y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)
79 |
80 | for i in range(len(self.deep_layer_sizes)):
81 | y_deep = getattr(self, 'dense_' + str(i))(y_deep)
82 | y_deep = getattr(self, 'batchNorm_' + str(i))(y_deep)
83 | y_deep = getattr(self, 'activation_' + str(i))(y_deep)
84 | y_deep = getattr(self, 'dropout_' + str(i))(y_deep)
85 |
86 | output = self.fc(y_deep)
87 |
88 | return output
89 | if __name__ == '__main__':
90 | AID_DATA_DIR = "../data/Criteo/"
91 | feat_dict_ = pickle.load(open(AID_DATA_DIR + '/feat_dict_10.pkl2', 'rb'))
92 |
93 | pnn = PNN(num_feat=len(feat_dict_) + 1, num_field=39, dropout_deep=[0.5, 0.5, 0.5],
94 | deep_layer_sizes=[400, 400], product_layer_dim=10,
95 | reg_l1=0.01, reg_l2=1e-5, embedding_size=10, product_type='outer')
96 |
97 | train_label_path = AID_DATA_DIR + 'train_label'
98 | train_idx_path = AID_DATA_DIR + 'train_idx'
99 | train_value_path = AID_DATA_DIR + 'train_value'
100 |
101 | test_label_path = AID_DATA_DIR + 'test_label'
102 | test_idx_path = AID_DATA_DIR + 'test_idx'
103 | test_value_path = AID_DATA_DIR + 'test_value'
104 |
105 | train_test_model_demo(pnn,train_label_path, train_idx_path, train_value_path)
106 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # recsys
2 |
3 | ## 1. Requirements
4 | Note that the models here are built mainly with the TensorFlow 2.0 API.
5 |
6 | TensorFlow 2.0, Keras, Python 3.6, NumPy, scikit-learn, Pandas
7 |
8 | ## 2.Datasets
9 |
10 | ### 2.1 Criteo
11 |
12 | This dataset contains about 45 million records. There are 13 features taking integer values (mostly count features) and 26 categorical features.
13 | The dataset is available at http://labs.criteo.com/2014/02/download-kaggle-display-advertising-challenge-dataset/
14 |
15 | Only a small sample of the data is used for model training here: data = ../data/Criteo/train.txt (a quick way to inspect it is sketched below).
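
A minimal sketch of peeking at the sampled file, assuming pandas; the column names I1-I13 / C1-C26 are just the conventional Criteo names (the raw file has no header):

```python
import pandas as pd

# label + 13 integer features + 26 categorical features, tab-separated, no header
cols = ['label'] + [f'I{i}' for i in range(1, 14)] + [f'C{i}' for i in range(1, 27)]
df = pd.read_csv('data/Criteo/train.txt', sep='\t', names=cols)
print(df.shape)
print(df.head())
```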
16 |
17 | ### 2.2 Seguro-safe-driver
18 |
19 | In the train and test data, features that belong to similar groupings are
20 | tagged as such in the feature names (e.g., ind, reg, car, calc). In addition,
21 | feature names include the postfix bin to indicate binary features and
22 | cat to indicate categorical features. Features without these designations
23 | are either continuous or ordinal. Values of -1 indicate that the feature was
24 | missing from the observation. The target column signifies whether or not a
25 | claim was filed for that policy holder.
26 |
27 | The dataset is available at https://www.kaggle.com/c/porto-seguro-safe-driver-prediction
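
A minimal sketch of how this naming convention can be used, assuming pandas and the local copy at data/Driver/train.csv (the _bin / _cat suffixes and the target column are taken from the description above):

```python
import numpy as np
import pandas as pd

df = pd.read_csv('data/Driver/train.csv')
bin_cols = [c for c in df.columns if c.endswith('_bin')]   # binary features
cat_cols = [c for c in df.columns if c.endswith('_cat')]   # categorical features
df = df.replace(-1, np.nan)                                # -1 marks a missing value
print(len(bin_cols), 'binary columns,', len(cat_cols), 'categorical columns')
print('positive rate:', df['target'].mean())
```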
28 |
29 | ## 3. Recommender Systems in Practice
30 |
31 | 
32 | Source: https://zhuanlan.zhihu.com/p/69050253
33 |
34 | 
35 | Source: https://zhuanlan.zhihu.com/p/53231955
36 | ### 3.1 Chapter 1: Collaborative Filtering
37 |
38 | ### 3.2 Chapter 2: GBDT+LR
39 |
40 | Essentially, GBDT+LR is a binary classifier built on the stacking idea, so it is used for binary classification problems. The method comes from Facebook's 2014 paper Practical Lessons from Predicting Clicks on Ads at Facebook. A minimal sketch of the idea is shown below.
41 | https://zhuanlan.zhihu.com/p/29053940
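
A minimal, hypothetical sketch of the stacking idea with scikit-learn (illustrative only, not the code in GBDT_LR.ipynb): each GBDT tree maps a sample to a leaf, the leaf indices are one-hot encoded, and LR is trained on those encodings.

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=2000, n_features=20, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. fit the GBDT; apply() returns the leaf index of every sample in every tree
gbdt = GradientBoostingClassifier(n_estimators=50, max_depth=3, random_state=42)
gbdt.fit(X_tr, y_tr)
leaves_tr = gbdt.apply(X_tr)[:, :, 0]   # shape (n_samples, n_estimators)
leaves_te = gbdt.apply(X_te)[:, :, 0]

# 2. one-hot encode the leaf indices and train LR on them
enc = OneHotEncoder(handle_unknown='ignore')
lr = LogisticRegression(max_iter=1000)
lr.fit(enc.fit_transform(leaves_tr), y_tr)
print('test accuracy:', lr.score(enc.transform(leaves_te), y_te))
```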
42 |
43 | ### 3.3 Chapter 3: MLR
44 |
45 | A simple implementation: we only provide a basic TensorFlow version of the MLR model here (a sketch of the idea follows below).
46 | https://www.jianshu.com/p/627fc0d755b2
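
For reference, a minimal TF2 sketch of the MLR (LS-PLM) idea (an illustration only, not the code in MLR.ipynb): m softmax "dividers" softly assign a sample to regions, m sigmoid "fitters" give a per-region probability, and the two are mixed.

```python
import tensorflow as tf

class MLRSketch(tf.keras.Model):
    def __init__(self, m=4):
        super().__init__()
        self.divider = tf.keras.layers.Dense(m, use_bias=False)  # region weights u_1..u_m
        self.fitter = tf.keras.layers.Dense(m, use_bias=False)   # per-region LR weights w_1..w_m

    def call(self, x):
        gate = tf.nn.softmax(self.divider(x), axis=1)  # Batch * m, soft region assignment
        prob = tf.nn.sigmoid(self.fitter(x))           # Batch * m, per-region probability
        return tf.reduce_sum(gate * prob, axis=1, keepdims=True)  # Batch * 1

model = MLRSketch(m=4)
print(model(tf.random.normal([32, 10])).shape)  # (32, 1)
```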
47 |
48 | ### 3.4 Chapter 4: DCN
49 |
50 | The Deep & Cross Network (DCN) model
51 |
52 | https://www.jianshu.com/p/77719fc252fa
53 |
54 | https://github.com/Nirvanada/Deep-and-Cross-Keras
55 |
56 | https://blog.csdn.net/roguesir/article/details/797632
57 |
58 | https://arxiv.org/abs/1708.05123
59 |
60 | ### 3.5 Chapter 5: PNN
61 |
62 | https://github.com/JianzhouZhan/Awesome-RecSystem-Models
63 |
64 | https://github.com/Snail110/tensorflow_practice/blob/master/recommendation/Basic-PNN-Demo/PNN.py
65 |
66 | https://www.jianshu.com/p/be784ab4abc2
67 |
68 |
69 | ### 3.6 Chapter 6: Wide-Deep
70 |
71 | https://zhuanlan.zhihu.com/p/92279796
72 |
73 | https://github.com/busesese/Wide_Deep_Model
74 |
75 | ### 3.7 Chapter 7: NFM
76 |
77 | https://zhuanlan.zhihu.com/p/37522285
78 | Neural Factorization Machines for Sparse Predictive Analytics
--------------------------------------------------------------------------------
/Wide-Deep/Wide-Deep.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | import pickle
4 | from util.train_model import train_test_model_demo
5 |
6 |
7 | class Wide(tf.keras.layers.Layer):
8 | def __init__(self,units=1):
9 | # input_dim = num_size + embed_size = input_size
10 | super(Wide, self).__init__()
11 | # self.units = units
12 | self.linear = tf.keras.layers.Dense(units=units,activation='relu')
13 | def call(self, inputs):
14 | output = self.linear(inputs)
15 | return output
16 |
17 | class Deep(tf.keras.layers.Layer):
18 | def __init__(self,num_feat,num_field,dropout_deep,deep_layer_sizes,embedding_size=10):
19 | # input_dim = num_size + embed_size = input_size
20 | super(Deep, self).__init__()
21 |         self.num_feat = num_feat  # F = number of features
22 |         self.num_field = num_field  # N = number of fields per sample
23 | self.dropout_deep = dropout_deep
24 |
25 |         # Embedding layer of size F * M (F = number of features, M = embedding dimension)
26 | feat_embeddings = tf.keras.layers.Embedding(num_feat, embedding_size, embeddings_initializer='uniform') # F * M
27 | self.feat_embeddings = feat_embeddings
28 |
29 | # fc layer
30 | self.deep_layer_sizes = deep_layer_sizes
31 |         # parameters of the deep (MLP) part
32 | for i in range(len(deep_layer_sizes)):
33 | setattr(self, 'dense_' + str(i),tf.keras.layers.Dense(deep_layer_sizes[i]))
34 | setattr(self, 'batchNorm_' + str(i),tf.keras.layers.BatchNormalization())
35 | setattr(self, 'activation_' + str(i),tf.keras.layers.Activation('relu'))
36 | setattr(self, 'dropout_' + str(i),tf.keras.layers.Dropout(dropout_deep[i]))
37 | # last layer
38 | self.fc = tf.keras.layers.Dense(1,activation=None,use_bias=True)
39 |
40 | def call(self,feat_index,feat_value):
41 |         # embedding part: feat_index is the input; feat_embeddings is an Embedding layer
42 | feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M
43 | # print(feat_value.get_shape())
44 | feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)
45 |
46 | y_deep = tf.keras.layers.Flatten()(feat_embedding)
47 | for i in range(len(self.deep_layer_sizes)):
48 | y_deep = getattr(self,'dense_' + str(i))(y_deep)
49 | y_deep = getattr(self,'batchNorm_' + str(i))(y_deep)
50 | y_deep = getattr(self,'activation_' + str(i))(y_deep)
51 | y_deep = getattr(self,'dropout_' + str(i))(y_deep)
52 |
53 | output = self.fc(y_deep)
54 | return output
55 |
56 | class WideDeep(tf.keras.Model):
57 | def __init__(self,num_feat,num_field,dropout_deep,deep_layer_sizes,embedding_size=10):
58 | super().__init__()
59 |         self.num_feat = num_feat  # F = number of features
60 |         self.num_field = num_field  # N = number of fields per sample
61 | self.dropout_deep = dropout_deep
62 |
63 | self.wide = Wide(units=1)
64 | self.deep = Deep(num_feat,num_field,dropout_deep,deep_layer_sizes)
65 | self.fc = tf.keras.layers.Dense(1,activation=None,use_bias=True)
66 |
67 | def call(self,num_input,feat_index,feat_value):
68 | x1 = self.wide(num_input)
69 | x2 = self.deep(feat_index,feat_value)
70 |
71 | x3 = tf.keras.layers.concatenate([x1,x2],axis=-1)
72 | output = self.fc(x3)
73 | return output
74 |
75 |
76 | if __name__ == '__main__':
77 | AID_DATA_DIR = "../data/Criteo/"
78 | feat_dict_ = pickle.load(open(AID_DATA_DIR + '/cross_feat_dict_10.pkl2', 'rb'))
79 |
80 | widedeep = WideDeep(num_feat=len(feat_dict_) + 1, num_field=52, dropout_deep=[0.5, 0.5, 0.5],
81 | deep_layer_sizes=[400, 400],embedding_size=10)
82 |
83 | train_label_path = AID_DATA_DIR + 'traincross_label'
84 | train_idx_path = AID_DATA_DIR + 'traincross_idx'
85 | train_value_path = AID_DATA_DIR + 'traincross_value'
86 | train_num_path = AID_DATA_DIR + 'traincross_num'
87 |
88 |     # Reading the data with TextLineDataset streams large files from disk, which saves memory and keeps training efficient
89 | def get_batch_dataset(label_path, idx_path, value_path,num_path):
90 | label = tf.data.TextLineDataset(label_path)
91 | idx = tf.data.TextLineDataset(idx_path)
92 | value = tf.data.TextLineDataset(value_path)
93 | num = tf.data.TextLineDataset(num_path)
94 |
95 | label = label.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12)
96 | idx = idx.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12)
97 | value = value.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12)
98 | num = num.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12)
99 |
100 | batch_dataset = tf.data.Dataset.zip((num,label, idx, value))
101 | batch_dataset = batch_dataset.shuffle(buffer_size=128)
102 | batch_dataset = batch_dataset.batch(128)
103 | batch_dataset = batch_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
104 | return batch_dataset
105 | train_batch_dataset = get_batch_dataset(train_label_path, train_idx_path, train_value_path,train_num_path)
106 |
107 | train_loss = tf.keras.metrics.Mean(name='train_loss')
108 | train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_acc')
109 | loss_object = tf.keras.losses.BinaryCrossentropy()
110 | optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
111 |
112 |
113 | @tf.function
114 | def train_one_step(model, optimizer, idx, value, label, num):
115 | with tf.GradientTape() as tape:
116 | output = model(num, idx, value)
117 | loss = loss_object(y_true=label, y_pred=output)
118 | grads = tape.gradient(loss, model.trainable_variables)
119 | grads = [tf.clip_by_norm(g, 100) for g in grads]
120 | optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))
121 |
122 | train_loss(loss)
123 | train_accuracy(label, output)
124 |
125 | EPOCHS = 50
126 | for epoch in range(EPOCHS):
127 | for num, label, idx, value in train_batch_dataset:
128 | train_one_step(widedeep, optimizer, idx, value, label,num)
129 | template = 'Epoch {}, Loss: {}, Accuracy: {}'
130 | print(template.format(epoch + 1,
131 | train_loss.result(), train_accuracy.result()))
132 |
133 |
--------------------------------------------------------------------------------
/Wide-Deep/data_process.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import pickle
4 | from collections import Counter
5 |
6 | """
7 | Data Process for Wide-Deep network
8 | https://github.com/busesese/Wide_Deep_Model
9 | https://github.com/aviraj-sinha/ML5/blob/master/10.%20Keras%20Wide%20and%20Deep.ipynb
10 | """
11 | def get_train_test_file(file_path, feat_dict_, split_ratio=0.9):
12 |     # output files for the (crossed) training set
13 | train_label_fout = open(file_path+'traincross_label', 'w')
14 | train_value_fout = open(file_path+'traincross_value', 'w')
15 | train_idx_fout = open(file_path+'traincross_idx', 'w')
16 | train_num_fout = open(file_path + 'traincross_num', 'w')
17 |
18 | continuous_range_ = range(1, 14)
19 | categorical_range_ = range(14, 52)
20 |
21 | def process_line_(line):
22 | features = line.rstrip('\n').split('\t')
23 | feat_idx, feat_value, label= [], [], []
24 |         # per-column min / max values, collected offline from the data
25 | cont_min_ = [0.0, -2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
26 | cont_max_ = [95.0,7864,8457.0,87.0,1015215.0,4638.0,1658.0,547.0,5637.0,4.0,37.0,98.0,770.0]
27 | cont_diff_ = [cont_max_[i] - cont_min_[i] for i in range(len(cont_min_))]
28 | # MinMax Normalization
29 | for idx in continuous_range_:
30 | if features[idx] == '':
31 | feat_idx.append(0)
32 | feat_value.append(0.0)
33 | else:
34 | feat_idx.append(feat_dict_[idx])
35 | feat_value.append(round((float(features[idx]) - cont_min_[idx - 1]) / cont_diff_[idx - 1], 6))
36 |         # keep the raw numerical feature values separately
37 | num = feat_value[:]
38 |         # process the categorical features
39 | for idx in categorical_range_:
40 | if features[idx] == '' or features[idx] not in feat_dict_:
41 | feat_idx.append(0)
42 | feat_value.append(0.0)
43 | else:
44 | feat_idx.append(feat_dict_[features[idx]])
45 | feat_value.append(1.0)
46 | return feat_idx, feat_value, [int(features[0])], num
47 |
48 | with open(file_path+'traincross.txt', 'r') as fin:
49 | for line_idx, line in enumerate(fin):
50 | feat_idx, feat_value, label, num = process_line_(line)
51 |
52 | feat_value = '\t'.join([str(v) for v in feat_value]) + '\n'
53 | feat_idx = '\t'.join([str(idx) for idx in feat_idx]) + '\n'
54 | label = '\t'.join([str(idx) for idx in label]) + '\n'
55 | feat_num = '\t'.join([str(idx) for idx in num]) + '\n'
56 |
57 | train_label_fout.write(label)
58 | train_idx_fout.write(feat_idx)
59 | train_value_fout.write(feat_value)
60 | train_num_fout.write(feat_num)
61 |
62 | fin.close()
63 |
64 | train_label_fout.close()
65 | train_idx_fout.close()
66 | train_value_fout.close()
67 | train_num_fout.close()
68 |
69 |
70 | def cross_feature(file_path,cross_range):
71 |     # build the dataset with crossed features
72 | traincross = open(file_path+'traincross.txt', 'w')
73 | with open(file_path+'train.txt', 'r') as fin:
74 | for line_idx, line in enumerate(fin):
75 | features = line.rstrip('\n').split('\t')
76 | for i in cross_range:
77 | features.append('_'.join([features[i[0]], features[i[1]]]))
78 | string_features = '\t'.join(features) + '\n'
79 | traincross.write(string_features)
80 | fin.close()
81 | traincross.close()
82 |
83 | def get_feat_dict(file_path):
84 |
85 | freq_ = 10
86 |     # the .pkl2 file stores the wide-deep feature dictionary as a pickle
87 | dir_feat_dict_ = file_path+'cross_feat_dict_' + str(freq_) + '.pkl2'
88 | continuous_range_ = range(1, 14)
89 | categorical_range_ = range(14, 52)
90 |
91 | if os.path.exists(dir_feat_dict_):
92 | feat_dict = pickle.load(open(dir_feat_dict_, 'rb'))
93 | else:
94 | # print('generate a feature dict')
95 | # Count the number of occurrences of discrete features
96 | feat_cnt = Counter()
97 | with open(file_path+'traincross.txt', 'r') as fin:
98 | for line_idx, line in enumerate(fin):
99 | features = line.rstrip('\n').split('\t')
100 | for idx in categorical_range_:
101 | if features[idx] == '': continue
102 | feat_cnt.update([features[idx]])
103 | fin.close()
104 | # Only retain discrete features with high frequency
105 | dis_feat_set = set()
106 | for feat, ot in feat_cnt.items():
107 | if ot >= freq_:
108 | dis_feat_set.add(feat)
109 |
110 | # Create a dictionary for continuous and discrete features
111 | feat_dict = {}
112 | tc = 1
113 | # Continuous features
114 | for idx in continuous_range_:
115 | feat_dict[idx] = tc
116 | tc += 1
117 | # Discrete features
118 | cnt_feat_set = set()
119 | with open(file_path+'traincross.txt', 'r') as fin:
120 | for line_idx, line in enumerate(fin):
121 | features = line.rstrip('\n').split('\t')
122 |
123 | for idx in categorical_range_:
124 | if features[idx] == '' or features[idx] not in dis_feat_set:
125 | continue
126 | if features[idx] not in cnt_feat_set:
127 | cnt_feat_set.add(features[idx])
128 | feat_dict[features[idx]] = tc
129 | tc += 1
130 | # Save dictionary
131 | fin.close()
132 | with open(dir_feat_dict_, 'wb') as fout:
133 | pickle.dump(feat_dict, fout)
134 | print('args.num_feat ', len(feat_dict) + 1)
135 | return feat_dict
136 |
137 |
138 | if __name__ == '__main__':
139 | file_path = '../data/Criteo/'
140 | # 交叉特征
141 | cross_range = [[14, 15], [16, 17], [18, 19], [20, 21], [22, 23], [24, 25], [26, 27], [28, 29], [30, 31],
142 | [32, 33], [34, 35], [36, 37], [38, 39]]
143 | cross_feature(file_path,cross_range)
144 | feat_dict = get_feat_dict(file_path)
145 | get_train_test_file(file_path, feat_dict)
146 | print('Done!')
--------------------------------------------------------------------------------
/data/Criteo/data_process.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import pickle
4 | from collections import Counter
5 |
6 | """
7 | Data Process for FM, PNN, and DeepFM.
8 | [1] PaddlePaddle implementation of DeepFM for CTR prediction
9 | https://github.com/Snail110/Awesome-RecSystem-Models/blob/master/data/Criteo/forOtherModels/dataPreprocess_TensorFlow.py
10 | """
11 | def get_train_test_file(file_path, feat_dict_, split_ratio=0.9):
12 |     # output files for the training and test sets
13 | train_label_fout = open('train_label', 'w')
14 | train_value_fout = open('train_value', 'w')
15 | train_idx_fout = open('train_idx', 'w')
16 | test_label_fout = open('test_label', 'w')
17 | test_value_fout = open('test_value', 'w')
18 | test_idx_fout = open('test_idx', 'w')
19 |
20 | continuous_range_ = range(1, 14)
21 | categorical_range_ = range(14, 40)
22 |
23 | def process_line_(line):
24 | features = line.rstrip('\n').split('\t')
25 | feat_idx, feat_value, label = [], [], []
26 |         # per-column min / max values, collected offline from the data
27 | cont_min_ = [0.0, -2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
28 | cont_max_ = [95.0,7864,8457.0,87.0,1015215.0,4638.0,1658.0,547.0,5637.0,4.0,37.0,98.0,770.0]
29 | cont_diff_ = [cont_max_[i] - cont_min_[i] for i in range(len(cont_min_))]
30 | # MinMax Normalization
31 | for idx in continuous_range_:
32 | if features[idx] == '':
33 | feat_idx.append(0)
34 | feat_value.append(0.0)
35 | else:
36 | feat_idx.append(feat_dict_[idx])
37 | feat_value.append(round((float(features[idx]) - cont_min_[idx - 1]) / cont_diff_[idx - 1], 6))
38 |
39 |         # process the categorical features
40 | for idx in categorical_range_:
41 | if features[idx] == '' or features[idx] not in feat_dict_:
42 | feat_idx.append(0)
43 | feat_value.append(0.0)
44 | else:
45 | feat_idx.append(feat_dict_[features[idx]])
46 | feat_value.append(1.0)
47 | return feat_idx, feat_value, [int(features[0])]
48 |
49 | with open(file_path, 'r') as fin:
50 | for line_idx, line in enumerate(fin):
51 | feat_idx, feat_value, label = process_line_(line)
52 |
53 | feat_value = '\t'.join([str(v) for v in feat_value]) + '\n'
54 | feat_idx = '\t'.join([str(idx) for idx in feat_idx]) + '\n'
55 | label = '\t'.join([str(idx) for idx in label]) + '\n'
56 |
57 | if np.random.random() <= split_ratio:
58 | train_label_fout.write(label)
59 | train_idx_fout.write(feat_idx)
60 | train_value_fout.write(feat_value)
61 | else:
62 | test_label_fout.write(label)
63 | test_idx_fout.write(feat_idx)
64 | test_value_fout.write(feat_value)
65 |
66 | fin.close()
67 |
68 | train_label_fout.close()
69 | train_idx_fout.close()
70 | train_value_fout.close()
71 | test_label_fout.close()
72 | test_idx_fout.close()
73 | test_value_fout.close()
74 |
75 |
76 | def get_feat_dict(file_path):
77 | freq_ = 10
78 |     # the .pkl2 file stores the feature dictionary as a pickle
79 | dir_feat_dict_ = 'feat_dict_' + str(freq_) + '.pkl2'
80 | continuous_range_ = range(1, 14)
81 | categorical_range_ = range(14, 40)
82 |
83 | if os.path.exists(dir_feat_dict_):
84 | feat_dict = pickle.load(open(dir_feat_dict_, 'rb'))
85 | else:
86 | # print('generate a feature dict')
87 | # Count the number of occurrences of discrete features
88 | feat_cnt = Counter()
89 | with open(file_path, 'r') as fin:
90 | for line_idx, line in enumerate(fin):
91 | features = line.rstrip('\n').split('\t')
92 | for idx in categorical_range_:
93 | if features[idx] == '': continue
94 | feat_cnt.update([features[idx]])
95 |
96 | # Only retain discrete features with high frequency
97 | dis_feat_set = set()
98 | for feat, ot in feat_cnt.items():
99 | if ot >= freq_:
100 | dis_feat_set.add(feat)
101 |
102 | # Create a dictionary for continuous and discrete features
103 | feat_dict = {}
104 | tc = 1
105 | # Continuous features
106 | for idx in continuous_range_:
107 | feat_dict[idx] = tc
108 | tc += 1
109 | # Discrete features
110 | cnt_feat_set = set()
111 | with open(file_path, 'r') as fin:
112 | for line_idx, line in enumerate(fin):
113 | features = line.rstrip('\n').split('\t')
114 | for idx in categorical_range_:
115 | if features[idx] == '' or features[idx] not in dis_feat_set:
116 | continue
117 | if features[idx] not in cnt_feat_set:
118 | cnt_feat_set.add(features[idx])
119 | feat_dict[features[idx]] = tc
120 | tc += 1
121 |
122 | # Save dictionary
123 | with open(dir_feat_dict_, 'wb') as fout:
124 | pickle.dump(feat_dict, fout)
125 | print('args.num_feat ', len(feat_dict) + 1)
126 |
127 | return feat_dict
128 |
129 |
130 | if __name__ == '__main__':
131 | file_path = './train.txt'
132 | feat_dict = get_feat_dict(file_path)
133 | get_train_test_file(file_path, feat_dict)
134 | print('Done!')
--------------------------------------------------------------------------------
/embedding.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "D:\\anaconda3\\julianxu\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
13 | " from ._conv import register_converters as _register_converters\n"
14 | ]
15 | }
16 | ],
17 | "source": [
18 | "import tensorflow as tf"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 2,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "csv = [\n",
28 | " \"1,harden|james|curry\",\n",
29 | " \"2,wrestbrook|harden|durant\",\n",
30 | " \"3,|paul|towns\",\n",
31 | "]"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 3,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "TAG_SET = [\"harden\", \"james\", \"curry\", \"durant\", \"paul\",\"towns\",\"wrestbrook\"]"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 4,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "# 处理得到SpareTensor\n",
50 | "ids,post_tags_str = tf.decode_csv(csv,[[-1],[\"\"]])"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 7,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "table = tf.contrib.lookup.index_table_from_tensor(\n",
60 | "mapping=TAG_SET,default_value=-1) # 构造一个查找表"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 9,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "split_tags = tf.string_split(post_tags_str,\"|\")"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 12,
75 | "metadata": {},
76 | "outputs": [
77 | {
78 | "data": {
79 | "text/plain": [
80 | ""
81 | ]
82 | },
83 | "execution_count": 12,
84 | "metadata": {},
85 | "output_type": "execute_result"
86 | }
87 | ],
88 | "source": [
89 | "split_tags.indices"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 14,
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "tags = tf.SparseTensor(\n",
99 | "indices = split_tags.indices,\n",
100 | "values = table.lookup(split_tags.values),\n",
101 | " dense_shape=split_tags.dense_shape)"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 15,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "# 定义embedding变量\n",
111 | "# 大小为3 因为 只有7个类型\n",
112 | "TAG_EMBEDDING_DIM = 3\n",
113 | "embedding_params = tf.Variable(tf.truncated_normal([len(TAG_SET),TAG_EMBEDDING_DIM]))"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 16,
119 | "metadata": {},
120 | "outputs": [
121 | {
122 | "name": "stdout",
123 | "output_type": "stream",
124 | "text": [
125 | "WARNING:tensorflow:The default value of combiner will change from \"mean\" to \"sqrtn\" after 2016/11/01.\n"
126 | ]
127 | }
128 | ],
129 | "source": [
130 | "embedding_tags = tf.nn.embedding_lookup_sparse(embedding_params,sp_ids=tags,sp_weights=None)\n",
131 | "# sp_ids就是我们刚刚得到的SparseTensor,而sp_weights=None代表的每一个取值的权重,如果是None的话,所有权重都是1,也就是相当于取了平均\n",
132 | "# 如果不是None的话,我们需要同样传入一个SparseTensor,代表不同球员的喜欢权重。大家感兴趣可以自己去尝试"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 18,
138 | "metadata": {},
139 | "outputs": [
140 | {
141 | "name": "stdout",
142 | "output_type": "stream",
143 | "text": [
144 | "[SparseTensorValue(indices=array([[0, 0],\n",
145 | " [0, 1],\n",
146 | " [0, 2],\n",
147 | " [1, 0],\n",
148 | " [1, 1],\n",
149 | " [1, 2],\n",
150 | " [2, 0],\n",
151 | " [2, 1]], dtype=int64), values=array([0, 1, 2, 6, 0, 3, 4, 5], dtype=int64), dense_shape=array([3, 3], dtype=int64)), array([[ 0.06023904, 1.0575624 , -0.9093878 ],\n",
152 | " [-0.42566654, 0.26845995, -0.6602178 ],\n",
153 | " [-0.6277443 , 0.28916246, -0.15512544]], dtype=float32), array([b'harden|james|curry', b'wrestbrook|harden|durant', b'|paul|towns'],\n",
154 | " dtype=object)]\n"
155 | ]
156 | }
157 | ],
158 | "source": [
159 | "with tf.Session() as s:\n",
160 | " s.run([tf.global_variables_initializer(),tf.tables_initializer()])\n",
161 | " print(s.run([tags,embedding_tags,post_tags_str]))"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {},
168 | "outputs": [],
169 | "source": []
170 | }
171 | ],
172 | "metadata": {
173 | "kernelspec": {
174 | "display_name": "Python 3",
175 | "language": "python",
176 | "name": "python3"
177 | },
178 | "language_info": {
179 | "codemirror_mode": {
180 | "name": "ipython",
181 | "version": 3
182 | },
183 | "file_extension": ".py",
184 | "mimetype": "text/x-python",
185 | "name": "python",
186 | "nbconvert_exporter": "python",
187 | "pygments_lexer": "ipython3",
188 | "version": "3.6.5"
189 | }
190 | },
191 | "nbformat": 4,
192 | "nbformat_minor": 2
193 | }
194 |
--------------------------------------------------------------------------------
/util/train_model.py:
--------------------------------------------------------------------------------
1 | """
2 | Training helpers: build tf.data pipelines from the preprocessed label / idx / value text files and run a simple training loop (BinaryCrossentropy loss, BinaryAccuracy metric, gradient clipping).
3 | """
4 | import tensorflow as tf
5 |
6 | def train_test_model_demo(model,train_label_path, train_idx_path, train_value_path):
7 |     # Reading the data with TextLineDataset streams large files from disk, which saves memory and keeps training efficient
8 | def get_batch_dataset(label_path, idx_path, value_path):
9 | label = tf.data.TextLineDataset(label_path)
10 | idx = tf.data.TextLineDataset(idx_path)
11 | value = tf.data.TextLineDataset(value_path)
12 |
13 | label = label.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12)
14 | idx = idx.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12)
15 | value = value.map(lambda x: tf.strings.to_number(tf.strings.split(x, sep='\t')), num_parallel_calls=12)
16 |
17 | batch_dataset = tf.data.Dataset.zip((label, idx, value))
18 | batch_dataset = batch_dataset.shuffle(buffer_size=128)
19 | batch_dataset = batch_dataset.batch(128)
20 | batch_dataset = batch_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
21 | return batch_dataset
22 | train_batch_dataset = get_batch_dataset(train_label_path, train_idx_path, train_value_path)
23 |
24 | train_loss = tf.keras.metrics.Mean(name='train_loss')
25 | train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_acc')
26 | loss_object = tf.keras.losses.BinaryCrossentropy()
27 | optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
28 |
29 |
30 | @tf.function
31 | def train_one_step(model, optimizer, idx, value, label):
32 | with tf.GradientTape() as tape:
33 | output = model(idx, value)
34 | loss = loss_object(y_true=label, y_pred=output)
35 | grads = tape.gradient(loss, model.trainable_variables)
36 | grads = [tf.clip_by_norm(g, 100) for g in grads]
37 | optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))
38 |
39 | train_loss(loss)
40 | train_accuracy(label, output)
41 |
42 | EPOCHS = 50
43 | for epoch in range(EPOCHS):
44 | for label, idx, value in train_batch_dataset:
45 | train_one_step(model, optimizer, idx, value, label)
46 | template = 'Epoch {}, Loss: {}, Accuracy: {}'
47 | print(template.format(epoch + 1,
48 | train_loss.result(), train_accuracy.result()))
49 |
50 | def train_test_model_demo_1(model,train_label, train_idx, train_value):
51 |     # tf.data.Dataset.from_tensor_slices is convenient for small files, e.g. for debugging the model before training on the full data.
52 | def get_dataset(train_label, train_idx, train_value):
53 | train_ds = tf.data.Dataset.from_tensor_slices(
54 | (train_label, train_idx, train_value)).shuffle(10000).batch(32)
55 | return train_ds
56 | train_batch_dataset = get_dataset(train_label, train_idx, train_value)
57 |
58 | train_loss = tf.keras.metrics.Mean(name='train_loss')
59 | train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_acc')
60 |     # binary classification
61 | loss_object = tf.keras.losses.BinaryCrossentropy()
62 | optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
63 |
64 | @tf.function
65 | def train_one_step(model, optimizer, idx, value, label):
66 | with tf.GradientTape() as tape:
67 | output = model(idx, value)
68 | loss = loss_object(y_true=label, y_pred=output)
69 | grads = tape.gradient(loss, model.trainable_variables)
70 | grads = [tf.clip_by_norm(g, 100) for g in grads]
71 | optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))
72 |
73 | train_loss(loss)
74 | train_accuracy(label, output)
75 |
76 | EPOCHS = 50
77 | for epoch in range(EPOCHS):
78 | for label, idx, value in train_batch_dataset:
79 | train_one_step(model, optimizer, idx, value, label)
80 | template = 'Epoch {}, Loss: {}, Accuracy: {}'
81 | print(template.format(epoch + 1,
82 | train_loss.result(), train_accuracy.result()))
--------------------------------------------------------------------------------