├── code
│   ├── requirement.txt
│   ├── README.md
│   ├── utils.py
│   ├── generator.py
│   ├── fl_training.py
│   ├── Example.ipynb
│   ├── preprecoess.py
│   └── models.py
├── README.md
└── LICENSE

/code/requirement.txt:
--------------------------------------------------------------------------------
1 | sklearn
2 | tensorflow-gpu == 1.13.0
3 | keras == 2.2.4
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # FedNewsRec-EMNLP-Findings-2020
2 | Data and code of our paper "Privacy-Preserving News Recommendation Model Learning"
--------------------------------------------------------------------------------
/code/README.md:
--------------------------------------------------------------------------------
1 | # FedNewsRec-EMNLP-Findings-2020
2 | - Code of our paper "Privacy-Preserving News Recommendation Model Learning"
3 | 
4 | # Data Preparation
5 | - To run this project, first download the MIND-small dataset from https://msnews.github.io/index.html
6 | - Let root\_data\_path denote the root directory of the dataset
7 | - Files of the training set should be placed in root\_data\_path/train
8 | - Files of the validation set should be placed in root\_data\_path/val
9 | - We use the glove.840B.300d embedding vectors from https://nlp.stanford.edu/projects/glove/
10 | - The embedding file should be placed in embedding\_path/glove.840B.300d.txt
11 | 
12 | # Code Files
13 | - preprecoess.py: functions to preprocess the datasets
14 | - utils.py: utility functions, such as the evaluation metrics
15 | - generator.py: data generators for model evaluation
16 | - models.py: implementation of the base model of FedRec
17 | - fl\_training.py: federated model training
18 | - Example.ipynb: model training and evaluation (a minimal end-to-end sketch is also given below)
19 | 
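20 | # Quick Start (illustrative)
21 | - The sketch below is not part of the original code; it only shows how the files above fit together, mirroring Example.ipynb. root\_data\_path and embedding\_path are placeholders that you must point at your own copies of the dataset and the embeddings.
22 | 
23 | ```python
24 | from preprecoess import *
25 | from models import *
26 | from fl_training import *
27 | 
28 | root_data_path = '/path/to/MIND-small'   # contains the train/ and val/ folders
29 | embedding_path = '/path/to/glove'        # contains glove.840B.300d.txt
30 | 
31 | # Preprocess the news and the click histories
32 | news, news_index, category_dict, subcategory_dict, word_dict = read_news(root_data_path, ['train', 'val'])
33 | news_title, news_vert, news_subvert = get_doc_input(news, news_index, category_dict, subcategory_dict, word_dict)
34 | title_word_embedding_matrix, have_word = load_matrix(embedding_path, word_dict)
35 | 
36 | train_session, train_uid_click, train_uid_table = read_clickhistory(root_data_path, 'train')
37 | train_user = parse_user(train_session, news_index)
38 | train_sess, train_user_id, train_label, train_user_id_sample = get_train_input(train_session, train_uid_click, news_index)
39 | get_user_data = GetUserDataFunc(news_title, train_user_id_sample, train_user, train_sess, train_label, train_user_id)
40 | 
41 | # Build the model and run federated rounds (lr=0.3, delta=0.05, 6 users per round, no LDP noise, as in Example.ipynb)
42 | model, doc_encoder, user_encoder, news_encoder = get_model(0.3, 0.05, title_word_embedding_matrix)
43 | for _ in range(100):
44 |     loss = fed_single_update(model, doc_encoder, user_encoder, 6, 0, get_user_data, train_uid_table)
45 | ```
46 | 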
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 JulySinceAndrew
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/code/utils.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import roc_auc_score
2 | import numpy as np
3 | 
4 | def dcg_score(y_true, y_score, k=10):
5 |     # Discounted cumulative gain of the top-k ranked items.
6 |     order = np.argsort(y_score)[::-1]
7 |     y_true = np.take(y_true, order[:k])
8 |     gains = 2 ** y_true - 1
9 |     discounts = np.log2(np.arange(len(y_true)) + 2)
10 |     return np.sum(gains / discounts)
11 | 
12 | 
13 | def ndcg_score(y_true, y_score, k=10):
14 |     # Normalized DCG: DCG of the predicted ranking divided by the ideal DCG.
15 |     best = dcg_score(y_true, y_true, k)
16 |     actual = dcg_score(y_true, y_score, k)
17 |     return actual / best
18 | 
19 | 
20 | def mrr_score(y_true, y_score):
21 |     # Mean reciprocal rank of the positive items.
22 |     order = np.argsort(y_score)[::-1]
23 |     y_true = np.take(y_true, order)
24 |     rr_score = y_true / (np.arange(len(y_true)) + 1)
25 |     return np.sum(rr_score) / np.sum(y_true)
26 | 
27 | def evaluate(user_scorings,news_scorings,Impressions):
28 |     # Scores every impression by the dot product of the user vector and the
29 |     # candidate news vectors, then reports average AUC, MRR, nDCG@5 and nDCG@10.
30 |     AUC = []
31 |     MRR = []
32 |     nDCG5 = []
33 |     nDCG10 =[]
34 |     for i in range(len(Impressions)):
35 |         docids = Impressions[i]['docs']
36 |         labels = Impressions[i]['labels']
37 |         uv = user_scorings[i]
38 | 
39 |         docids = np.array(docids,dtype='int32')
40 |         nv = news_scorings[docids]
41 |         score = np.dot(nv,uv)
42 |         auc = roc_auc_score(labels,score)
43 |         mrr = mrr_score(labels,score)
44 |         ndcg5 = ndcg_score(labels,score,k=5)
45 |         ndcg10 = ndcg_score(labels,score,k=10)
46 | 
47 |         AUC.append(auc)
48 |         MRR.append(mrr)
49 |         nDCG5.append(ndcg5)
50 |         nDCG10.append(ndcg10)
51 |     AUC = np.array(AUC)
52 |     MRR = np.array(MRR)
53 |     nDCG5 = np.array(nDCG5)
54 |     nDCG10 = np.array(nDCG10)
55 | 
56 |     AUC = AUC.mean()
57 |     MRR = MRR.mean()
58 |     nDCG5 = nDCG5.mean()
59 |     nDCG10 = nDCG10.mean()
60 | 
61 |     return AUC, MRR, nDCG5, nDCG10
--------------------------------------------------------------------------------
/code/generator.py:
--------------------------------------------------------------------------------
1 | from keras.utils import Sequence
2 | import numpy as np
3 | 
4 | class get_hir_train_generator(Sequence):
5 |     # Yields ([candidate titles, clicked titles], label) batches for training.
6 |     def __init__(self,news_title, clicked_news,user_id, news_id, label, batch_size):
7 |         self.title = news_title
8 | 
9 |         self.clicked_news = clicked_news
10 | 
11 |         self.user_id = user_id
12 |         self.doc_id = news_id
13 |         self.label = label
14 | 
15 |         self.batch_size = batch_size
16 |         self.ImpNum = self.label.shape[0]
17 | 
18 |     def __len__(self):
19 |         return int(np.ceil(self.ImpNum / float(self.batch_size)))
20 | 
21 |     def __get_news(self,docids):
22 |         title = self.title[docids]
23 | 
24 |         return title
25 | 
26 | 
27 |     def __getitem__(self, idx):
28 |         start = idx*self.batch_size
29 |         ed = (idx+1)*self.batch_size
30 |         if ed > self.ImpNum:
31 |             ed = self.ImpNum
32 | 
33 |         doc_ids = self.doc_id[start:ed]
34 |         title = self.__get_news(doc_ids)
35 | 
36 |         user_ids = self.user_id[start:ed]
37 |         clicked_ids = self.clicked_news[user_ids]
38 | 
39 |         user_title = self.__get_news(clicked_ids)
40 | 
41 |         label = self.label[start:ed]
42 | 
43 |         return ([title, user_title,],[label])
44 | 
45 | 
46 | class get_hir_user_generator(Sequence):
47 |     # Yields batches of clicked-news vectors for computing user representations.
48 |     def __init__(self,news_scoring, clicked_news,batch_size):
49 |         self.news_scoring = news_scoring
50 |         self.clicked_news = clicked_news
51 | 
52 |         self.batch_size = batch_size
53 |         self.ImpNum = self.clicked_news.shape[0]
54 | 
55 |     def __len__(self):
56 |         return int(np.ceil(self.ImpNum / float(self.batch_size)))
57 | 
58 |     def __get_news(self,docids):
59 |         news_scoring = self.news_scoring[docids]
60 | 
61 |         return news_scoring
62 | 
63 | 
64 |     def __getitem__(self, idx):
65 |         start = idx*self.batch_size
66 |         ed = (idx+1)*self.batch_size
67 |         if ed > self.ImpNum:
68 |             ed = self.ImpNum
69 |         clicked_ids = self.clicked_news[start:ed]
70 | 
71 |         news_scoring = self.__get_news(clicked_ids)
72 | 
73 |         return news_scoring
--------------------------------------------------------------------------------
/code/fl_training.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | def GetUserDataFunc(news_title,train_user_id_sample,train_user,train_sess,train_label,train_user_id):
4 |     # Returns a closure that assembles the local training data of a single user:
5 |     # the titles of the user's clicked news, the candidate-news titles of each
6 |     # training sample, and the corresponding labels.
7 |     def _get_user_data(uid):
8 |         click = []
9 |         sample = []
10 |         label = []
11 |         for sid in train_user_id_sample[uid]:
12 |             click.append(train_user['click'][train_user_id[sid]])
13 |             sample.append(train_sess[sid])
14 |             label.append(train_label[sid])
15 |         click = np.array(click)
16 |         sample = np.array(sample)
17 |         label = np.array(label)
18 |         click = news_title[click]
19 |         sample = news_title[sample]
20 |         return click,sample,label
21 |     return _get_user_data
22 | 
23 | 
24 | def add_noise(weights,lambd):
25 |     # Local differential privacy: element-wise Laplace noise on every weight array.
26 |     for i in range(len(weights)):
27 |         weights[i] += np.random.laplace(scale = lambd,size=weights[i].shape)
28 |     return weights
29 | 
30 | def fed_single_update(model,doc_encoder,user_encoder,num,lambd,get_user_data,train_uid_table):
31 |     # One round of federated training: `num` randomly selected users each start from
32 |     # the current global weights, perform a local update on their own data, optionally
33 |     # perturb the updated weights with Laplace noise, and the server aggregates the
34 |     # updates with a weighted average over the users' sample counts.
35 |     random_index = np.random.permutation(len(train_uid_table))[:num]
36 | 
37 |     all_news_weights = []
38 |     all_user_weights = []
39 |     old_news_weight = doc_encoder.get_weights()
40 |     old_user_weight = user_encoder.get_weights()
41 | 
42 |     sample_nums = []
43 | 
44 |     loss = []
45 | 
46 |     for uinx in random_index:
47 |         # Reset to the current global model before the local update.
48 |         doc_encoder.set_weights(old_news_weight)
49 |         user_encoder.set_weights(old_user_weight)
50 | 
51 |         uid = train_uid_table[uinx]
52 |         click,sample,label = get_user_data(uid)
53 |         g = model.fit([sample,click],label,batch_size = label.shape[0],verbose=False)
54 |         loss.append(g.history['loss'][0])
55 |         news_weight = doc_encoder.get_weights()
56 |         user_weight = user_encoder.get_weights()
57 |         if lambd>0:
58 |             news_weight = add_noise(news_weight,lambd)
59 |             user_weight = add_noise(user_weight,lambd)
60 |         all_news_weights.append(news_weight)
61 |         all_user_weights.append(user_weight)
62 |         sample_nums.append(label.shape[0])
63 | 
64 |     sample_nums = np.array(sample_nums)
65 |     sample_nums = sample_nums/sample_nums.sum()
66 | 
67 |     # Sample-size-weighted federated averaging, layer by layer.
68 |     doc_weights = [np.average(weights, axis=0,weights=sample_nums) for weights in zip(*all_news_weights)]
69 |     user_weights = [np.average(weights, axis=0,weights=sample_nums) for weights in zip(*all_user_weights)]
70 | 
71 |     doc_encoder.set_weights(doc_weights)
72 |     user_encoder.set_weights(user_weights)
73 |     loss = np.array(loss).mean()
74 |     return loss
--------------------------------------------------------------------------------
/code/Example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 2,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "import os\nimport json"
10 |    ]
11 |   },
12 |   {
13 |    "cell_type": "code",
14 |    "execution_count": null,
15 |    "metadata": {},
16 |    "outputs": [],
17 |    "source": []
18 |   },
19 |   {
20 |    "cell_type": "code",
21 |    "execution_count": 4,
22 |    "metadata": {},
23 |    "outputs": [
24 |     {
25 |      "name": "stderr",
26 |      "output_type": "stream",
27 |      "text": [
28 |       "Using TensorFlow backend.\n"
29 |      ]
30 |     }
31 |    ],
32 |    "source": [
33 |     "import tensorflow as tf\n",
34 | 
"import keras.backend.tensorflow_backend as KTF\n", 35 | " \n", 36 | "config = tf.ConfigProto() \n", 37 | "config.gpu_options.allow_growth=True \n", 38 | "session = tf.Session(config=config)\n", 39 | " \n", 40 | "KTF.set_session(session)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 5, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "from utils import *\n", 50 | "from preprecoess import *\n", 51 | "from generator import *\n", 52 | "from models import *\n", 53 | "from fl_training import *" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 6, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "root_data_path = None # MIND-Dataset Path\n", 70 | "embedding_path = None # Word Embedding Path" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 7, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "# Read News\n", 87 | "news,news_index,category_dict,subcategory_dict,word_dict = read_news(root_data_path,['train','val'])\n", 88 | "news_title,news_vert,news_subvert=get_doc_input(news,news_index,category_dict,subcategory_dict,word_dict)\n", 89 | "title_word_embedding_matrix, have_word = load_matrix(embedding_path,word_dict)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 8, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "#Parse User\n", 106 | "train_session, train_uid_click, train_uid_table = read_clickhistory(root_data_path,'train')\n", 107 | "test_session, test_uid_click,test_uid_table = read_clickhistory(root_data_path,'val')\n", 108 | "train_user = parse_user(train_session,news_index)\n", 109 | "test_user = parse_user(test_session,news_index)\n", 110 | "train_sess, train_user_id, train_label, train_user_id_sample = get_train_input(train_session,train_uid_click,news_index)\n", 111 | "test_impressions, test_userids = get_test_input(test_session,news_index)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 9, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "get_user_data = GetUserDataFunc(news_title,train_user_id_sample,train_user,train_sess,train_label,train_user_id)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "scrolled": true 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "lr = 0.3\n", 146 | "delta = 0.05\n", 147 | "lambd = 0\n", 148 | "num = 6\n", 149 | "\n", 150 | "model, doc_encoder, user_encoder, news_encoder = get_model(lr,delta,title_word_embedding_matrix)\n", 151 | "Res = []\n", 152 | "Loss = []\n", 153 | "count = 0\n", 154 | "while True:\n", 155 | " loss = fed_single_update(model,doc_encoder,user_encoder,num,lambd,get_user_data,train_uid_table)\n", 156 | " Loss.append(loss)\n", 157 | " if count % 25 == 0:\n", 158 | " news_scoring = 
news_encoder.predict(news_title,verbose=0)\n", 159 | " user_generator = get_hir_user_generator(news_scoring,test_user['click'],64)\n", 160 | " user_scoring = user_encoder.predict_generator(user_generator,verbose=0),\n", 161 | " user_scoring = user_scoring[0]\n", 162 | " g = evaluate(user_scoring,news_scoring,test_impressions)\n", 163 | " Res.append(g)\n", 164 | " print(g)\n", 165 | " with open('FedRec-woLDP-1.json','a') as f:\n", 166 | " s = json.dumps(g) + '\\n'\n", 167 | " f.write(s)\n", 168 | " count += 1" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [] 184 | } 185 | ], 186 | "metadata": { 187 | "kernelspec": { 188 | "display_name": "Python 3", 189 | "language": "python", 190 | "name": "python3" 191 | }, 192 | "language_info": { 193 | "codemirror_mode": { 194 | "name": "ipython", 195 | "version": 3 196 | }, 197 | "file_extension": ".py", 198 | "mimetype": "text/x-python", 199 | "name": "python", 200 | "nbconvert_exporter": "python", 201 | "pygments_lexer": "ipython3", 202 | "version": "3.7.6" 203 | } 204 | }, 205 | "nbformat": 4, 206 | "nbformat_minor": 2 207 | } 208 | -------------------------------------------------------------------------------- /code/preprecoess.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import word_tokenize 3 | import csv 4 | import datetime 5 | import time 6 | import json 7 | import itertools 8 | import random 9 | import os 10 | import numpy as np 11 | 12 | 13 | MAX_SENTENCE = 30 14 | MAX_ALL = 50 15 | npratio = 4 16 | 17 | def newsample(nnn,ratio): 18 | if ratio >len(nnn): 19 | return random.sample(nnn*(ratio//len(nnn)+1),ratio) 20 | else: 21 | return random.sample(nnn,ratio) 22 | 23 | def read_news(root_data_path,modes): 24 | news={} 25 | category=[] 26 | subcategory=[] 27 | news_index={} 28 | index=1 29 | word_dict={} 30 | word_index=1 31 | 32 | for mode in modes: 33 | with open(os.path.join(root_data_path,mode,'news.tsv')) as f: 34 | lines = f.readlines() 35 | for line in lines: 36 | splited = line.strip('\n').split('\t') 37 | doc_id,vert,subvert,title= splited[0:4] 38 | if doc_id in news_index: 39 | continue 40 | news_index[doc_id]=index 41 | index+=1 42 | category.append(vert) 43 | subcategory.append(subvert) 44 | title = title.lower() 45 | title=word_tokenize(title) 46 | news[doc_id]=[vert,subvert,title] 47 | for word in title: 48 | word = word.lower() 49 | if not(word in word_dict): 50 | word_dict[word]=word_index 51 | word_index+=1 52 | category=list(set(category)) 53 | subcategory=list(set(subcategory)) 54 | category_dict={} 55 | index=1 56 | for c in category: 57 | category_dict[c]=index 58 | index+=1 59 | subcategory_dict={} 60 | index=1 61 | for c in subcategory: 62 | subcategory_dict[c]=index 63 | index+=1 64 | return news,news_index,category_dict,subcategory_dict,word_dict 65 | 66 | def get_doc_input(news,news_index,category,subcategory,word_dict): 67 | news_num=len(news)+1 68 | news_title=np.zeros((news_num,MAX_SENTENCE),dtype='int32') 69 | news_vert=np.zeros((news_num,),dtype='int32') 70 | news_subvert=np.zeros((news_num,),dtype='int32') 71 | for key in news: 72 | vert,subvert,title=news[key] 73 | doc_index=news_index[key] 74 | news_vert[doc_index]=category[vert] 75 | news_subvert[doc_index]=subcategory[subvert] 76 | for word_id in 
range(min(MAX_SENTENCE,len(title))): 77 | news_title[doc_index,word_id]=word_dict[title[word_id].lower()] 78 | 79 | return news_title,news_vert,news_subvert 80 | 81 | 82 | def load_matrix(embedding_path,word_dict): 83 | embedding_matrix = np.zeros((len(word_dict)+1,300)) 84 | have_word=[] 85 | with open(os.path.join(embedding_path,'glove.840B.300d.txt'),'rb') as f: 86 | while True: 87 | l=f.readline() 88 | if len(l)==0: 89 | break 90 | l=l.split() 91 | word = l[0].decode() 92 | if word in word_dict: 93 | index = word_dict[word] 94 | tp = [float(x) for x in l[1:]] 95 | embedding_matrix[index]=np.array(tp) 96 | have_word.append(word) 97 | return embedding_matrix,have_word 98 | 99 | 100 | def read_clickhistory(root_data_path,mode): 101 | 102 | lines = [] 103 | userids = {} 104 | uid_table = {} 105 | with open(os.path.join(root_data_path,mode,'behaviors.tsv')) as f: 106 | lines = f.readlines() 107 | 108 | sessions = [] 109 | for i in range(len(lines)): 110 | _,uid,_,click,imp = lines[i].strip().split('\t') 111 | true_click = click.split() 112 | assert not '' in true_click 113 | if not uid in userids: 114 | uid_table[len(userids)] = uid 115 | userids[uid] = [] 116 | userids[uid].append(i) 117 | imp = imp.split() 118 | pos = [] 119 | neg = [] 120 | for beh in imp: 121 | nid, label = beh.split('-') 122 | if label == '0': 123 | neg.append(nid) 124 | else: 125 | pos.append(nid) 126 | sessions.append([true_click,pos,neg]) 127 | return sessions,userids,uid_table 128 | 129 | def parse_user(session,news_index): 130 | user_num = len(session) 131 | user={'click': np.zeros((user_num,MAX_ALL),dtype='int32'),} 132 | for user_id in range(len(session)): 133 | tclick = [] 134 | click, pos, neg =session[user_id] 135 | for i in range(len(click)): 136 | tclick.append(news_index[click[i]]) 137 | click = tclick 138 | 139 | if len(click) >MAX_ALL: 140 | click = click[-MAX_ALL:] 141 | else: 142 | click=[0]*(MAX_ALL-len(click)) + click 143 | 144 | user['click'][user_id] = np.array(click) 145 | return user 146 | 147 | def get_train_input(session,uid_click_talbe,news_index): 148 | inv_table = {} 149 | user_id_session = {} 150 | 151 | for uid in uid_click_talbe: 152 | user_id_session[uid] = [] 153 | for v in uid_click_talbe[uid]: 154 | inv_table[v] = uid 155 | 156 | sess_pos = [] 157 | sess_neg = [] 158 | user_id = [] 159 | for sess_id in range(len(session)): 160 | sess = session[sess_id] 161 | _, poss, negs=sess 162 | for i in range(len(poss)): 163 | pos = poss[i] 164 | neg=newsample(negs,npratio) 165 | sess_pos.append(pos) 166 | sess_neg.append(neg) 167 | user_id.append(sess_id) 168 | user_id_session[inv_table[sess_id]].append(len(sess_pos)-1) 169 | 170 | sess_all = np.zeros((len(sess_pos),1+npratio),dtype='int32') 171 | label = np.zeros((len(sess_pos),1+npratio)) 172 | for sess_id in range(sess_all.shape[0]): 173 | pos = sess_pos[sess_id] 174 | negs = sess_neg[sess_id] 175 | sess_all[sess_id,0] = news_index[pos] 176 | index = 1 177 | for neg in negs: 178 | sess_all[sess_id,index] = news_index[neg] 179 | index+=1 180 | #index = np.random.randint(1+npratio) 181 | label[sess_id,0]=1 182 | user_id = np.array(user_id, dtype='int32') 183 | 184 | return sess_all, user_id, label, user_id_session 185 | 186 | def get_test_input(session,news_index): 187 | 188 | Impressions = [] 189 | userid = [] 190 | for sess_id in range(len(session)): 191 | _, poss, negs = session[sess_id] 192 | imp = {'labels':[], 193 | 'docs':[]} 194 | userid.append(sess_id) 195 | for i in range(len(poss)): 196 | docid = news_index[poss[i]] 197 | 
            imp['docs'].append(docid)
198 |             imp['labels'].append(1)
199 |         for i in range(len(negs)):
200 |             docid = news_index[negs[i]]
201 |             imp['docs'].append(docid)
202 |             imp['labels'].append(0)
203 |         Impressions.append(imp)
204 | 
205 |     userid = np.array(userid,dtype='int32')
206 | 
207 |     return Impressions, userid,
--------------------------------------------------------------------------------
/code/models.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | import keras
3 | from keras.utils.np_utils import *
4 | from keras.preprocessing.text import Tokenizer, text_to_word_sequence
5 | from keras.preprocessing.sequence import pad_sequences
6 | from keras.utils.np_utils import to_categorical
7 | 
8 | from keras.layers import Embedding, concatenate
9 | from keras.layers import Dense, Input, Flatten, average,Lambda
10 | 
11 | from keras.layers import *
12 | from keras.models import Model, load_model
13 | from keras.callbacks import EarlyStopping, ModelCheckpoint
14 | 
15 | from keras import backend as K
16 | from keras.engine.topology import Layer, InputSpec
17 | from keras import initializers #keras2
18 | from keras.utils import plot_model
19 | import numpy as np
20 | from sklearn.metrics import accuracy_score, classification_report
21 | from keras.optimizers import *
22 | 
23 | npratio = 4
24 | 
25 | class Attention(Layer):
26 | 
27 |     def __init__(self, nb_head, size_per_head, **kwargs):
28 |         self.nb_head = nb_head
29 |         self.size_per_head = size_per_head
30 |         self.output_dim = nb_head*size_per_head
31 |         super(Attention, self).__init__(**kwargs)
32 | 
33 |     def build(self, input_shape):
34 |         self.WQ = self.add_weight(name='WQ',
35 |                                   shape=(input_shape[0][-1], self.output_dim),
36 |                                   initializer='glorot_uniform',
37 |                                   trainable=True)
38 |         self.WK = self.add_weight(name='WK',
39 |                                   shape=(input_shape[1][-1], self.output_dim),
40 |                                   initializer='glorot_uniform',
41 |                                   trainable=True)
42 |         self.WV = self.add_weight(name='WV',
43 |                                   shape=(input_shape[2][-1], self.output_dim),
44 |                                   initializer='glorot_uniform',
45 |                                   trainable=True)
46 |         super(Attention, self).build(input_shape)
47 | 
48 |     def Mask(self, inputs, seq_len, mode='mul'):
49 |         if seq_len == None:
50 |             return inputs
51 |         else:
52 |             mask = K.one_hot(seq_len[:,0], K.shape(inputs)[1])
53 |             mask = 1 - K.cumsum(mask, 1)
54 |             for _ in range(len(inputs.shape)-2):
55 |                 mask = K.expand_dims(mask, 2)
56 |             if mode == 'mul':
57 |                 return inputs * mask
58 |             if mode == 'add':
59 |                 return inputs - (1 - mask) * 1e12
60 | 
61 |     def call(self, x):
62 |         # If only Q_seq, K_seq, V_seq are passed in, no mask is applied
63 |         # If Q_seq, K_seq, V_seq, Q_len, V_len are passed in, the padded positions are masked
64 |         if len(x) == 3:
65 |             Q_seq,K_seq,V_seq = x
66 |             Q_len,V_len = None,None
67 |         elif len(x) == 5:
68 |             Q_seq,K_seq,V_seq,Q_len,V_len = x
69 |         # Linear projections of Q, K and V
70 |         Q_seq = K.dot(Q_seq, self.WQ)
71 |         Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
72 |         Q_seq = K.permute_dimensions(Q_seq, (0,2,1,3))
73 |         K_seq = K.dot(K_seq, self.WK)
74 |         K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
75 |         K_seq = K.permute_dimensions(K_seq, (0,2,1,3))
76 |         V_seq = K.dot(V_seq, self.WV)
77 |         V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
78 |         V_seq = K.permute_dimensions(V_seq, (0,2,1,3))
79 |         # Scaled dot-product attention scores, then mask, then softmax
80 |         A = K.batch_dot(Q_seq, K_seq, axes=[3,3]) / self.size_per_head**0.5
81 |         A = K.permute_dimensions(A, (0,3,2,1))
82 |         A = self.Mask(A, V_len, 'add')
83 |         A = K.permute_dimensions(A, (0,3,2,1))
84 |         A = K.softmax(A)
85 |         # Weighted sum of the values, then mask the output
86 |         O_seq = K.batch_dot(A, V_seq, axes=[3,2])
87 |         O_seq = K.permute_dimensions(O_seq, (0,2,1,3))
88 |         O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
89 |         O_seq = self.Mask(O_seq, Q_len, 'mul')
90 |         return O_seq
91 | 
92 |     def compute_output_shape(self, input_shape):
93 |         return (input_shape[0][0], input_shape[0][1], self.output_dim)
94 | 
95 | 
96 | def AttentivePooling(dim1,dim2):
97 |     vecs_input = Input(shape=(dim1,dim2),dtype='float32')
98 |     user_vecs = Dropout(0.2)(vecs_input)
99 |     user_att = Dense(200,activation='tanh')(user_vecs)
100 |     user_att = keras.layers.Flatten()(Dense(1)(user_att))
101 |     user_att = Activation('softmax')(user_att)
102 |     user_vec = keras.layers.Dot((1,1))([user_vecs,user_att])
103 |     model = Model(vecs_input,user_vec)
104 |     return model
105 | 
106 | 
107 | def get_doc_encoder():
108 |     sentence_input = Input(shape=(30,300), dtype='float32')
109 |     droped_vecs = Dropout(0.2)(sentence_input)
110 | 
111 |     l_cnnt = Conv1D(400,3,activation='relu')(droped_vecs)
112 |     l_cnnt = Dropout(0.2)(l_cnnt)
113 |     l_cnnt = Attention(20,20)([l_cnnt,l_cnnt,l_cnnt])
114 |     l_cnnt = keras.layers.Activation('relu')(l_cnnt)
115 | 
116 |     droped_rep = Dropout(0.2)(l_cnnt)
117 |     title_vec = AttentivePooling(30,400)(droped_rep)
118 |     sentEncodert = Model(sentence_input, title_vec)
119 |     return sentEncodert
120 | 
121 | def get_user_encoder():
122 |     news_vecs_input = Input(shape=(50,400), dtype='float32')
123 | 
124 |     news_vecs = Dropout(0.2)(news_vecs_input)
125 |     gru_input = keras.layers.Lambda(lambda x:x[:,-15:,:])(news_vecs)
126 |     vec1 = GRU(400)(gru_input)
127 |     vecs2 = Attention(20,20)([news_vecs]*3)
128 |     vec2 = AttentivePooling(50,400)(vecs2)
129 | 
130 |     user_vecs2 = Attention(20,20)([news_vecs_input]*3)
131 |     user_vecs2 = Dropout(0.2)(user_vecs2)
132 |     user_vec2 = AttentivePooling(50,400)(user_vecs2)
133 |     user_vec2 = keras.layers.Reshape((1,400))(user_vec2)
134 | 
135 |     user_vecs1 = Lambda(lambda x:x[:,-20:,:])(news_vecs_input)
136 |     user_vec1 = GRU(400)(user_vecs1)
137 |     user_vec1 = keras.layers.Reshape((1,400))(user_vec1)
138 | 
139 |     user_vecs = keras.layers.Concatenate(axis=-2)([user_vec1,user_vec2])
140 |     vec = AttentivePooling(2,400)(user_vecs)
141 | 
142 |     sentEncodert = Model(news_vecs_input, vec)
143 |     return sentEncodert
144 | 
145 | 
146 | def get_model(lr,delta,title_word_embedding_matrix):
147 |     doc_encoder = get_doc_encoder()
148 |     user_encoder = get_user_encoder()
149 | 
150 |     title_word_embedding_layer = Embedding(title_word_embedding_matrix.shape[0], 300, weights=[title_word_embedding_matrix],trainable=False)
151 | 
152 |     click_title = Input(shape=(50,30),dtype='int32')
153 |     can_title = Input(shape=(1+npratio,30),dtype='int32')
154 | 
155 |     click_word_vecs = title_word_embedding_layer(click_title)
156 |     can_word_vecs = title_word_embedding_layer(can_title)
157 | 
158 |     click_vecs = TimeDistributed(doc_encoder)(click_word_vecs)
159 |     can_vecs = TimeDistributed(doc_encoder)(can_word_vecs)
160 | 
161 |     user_vec = user_encoder(click_vecs)
162 | 
163 |     scores = keras.layers.Dot(axes=-1)([user_vec,can_vecs]) # (batch_size, 1+npratio)
164 |     logits = keras.layers.Activation(keras.activations.softmax,name = 'recommend')(scores)
165 | 
166 |     model = Model([can_title,click_title],logits) # max prob_click_positive
167 |     model.compile(loss=['categorical_crossentropy'],
168 |                   optimizer=SGD(lr=lr,clipvalue = delta),
169 |                   metrics=['acc'])
170 | 
171 |     news_input = Input(shape=(30,),dtype='int32')
172 |     news_word_vecs = title_word_embedding_layer(news_input)
173 |     news_vec = doc_encoder(news_word_vecs)
174 |     news_encoder = Model(news_input,news_vec)
175 | 
176 |     return model, doc_encoder, user_encoder, news_encoder
--------------------------------------------------------------------------------
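
For illustration only (this snippet is not part of the repository): the core of fl_training.py is a sample-size-weighted federated average of the per-user weight updates, optionally perturbed with Laplace noise for local differential privacy before they leave the user. The toy NumPy sketch below reproduces just that aggregation step on made-up weights; the names client_weights and sample_nums and all numeric values are illustrative.

import numpy as np

def add_noise(weights, lambd):
    # Same idea as add_noise in fl_training.py: element-wise Laplace noise on every weight array.
    return [w + np.random.laplace(scale=lambd, size=w.shape) for w in weights]

# Two toy "clients", each holding a list of weight arrays (like Keras model.get_weights()).
client_weights = [
    [np.ones((2, 2)), np.zeros(3)],        # client 1
    [np.full((2, 2), 3.0), np.ones(3)],    # client 2
]

# Client 1 trained on 1 sample, client 2 on 3 -> aggregation weights [0.25, 0.75],
# the same normalisation applied to sample_nums in fed_single_update.
sample_nums = np.array([1.0, 3.0])
sample_nums = sample_nums / sample_nums.sum()

# Optional local differential privacy before the updates are shared (lambd = 0 disables it).
lambd = 0.0
if lambd > 0:
    client_weights = [add_noise(w, lambd) for w in client_weights]

# Weighted average layer by layer, mirroring the zip(*...) aggregation in fed_single_update.
aggregated = [np.average(layer, axis=0, weights=sample_nums) for layer in zip(*client_weights)]
print(aggregated[0])   # -> 2.5 in every entry (0.25*1 + 0.75*3)
print(aggregated[1])   # -> [0.75 0.75 0.75]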