├── code
│   ├── requirement.txt
│   ├── README.md
│   ├── utils.py
│   ├── generator.py
│   ├── fl_training.py
│   ├── Example.ipynb
│   ├── preprecoess.py
│   └── models.py
├── README.md
└── LICENSE

/code/requirement.txt:
--------------------------------------------------------------------------------
1 | sklearn
2 | tensorflow-gpu == 1.13.0
3 | keras == 2.2.4
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # FedNewsRec-EMNLP-Findings-2020
2 | Data and code of our paper "Privacy-Preserving News Recommendation Model Learning"
--------------------------------------------------------------------------------
/code/README.md:
--------------------------------------------------------------------------------
1 | # FedNewsRec-EMNLP-Findings-2020
2 | - Code of our paper "Privacy-Preserving News Recommendation Model Learning"
3 | 
4 | # Data Preparation
5 | - To run this project, first download the MIND-small dataset from https://msnews.github.io/index.html
6 | - Let root\_data\_path denote the root directory of the dataset
7 | - Files of the training set should be placed in root\_data\_path/train
8 | - Files of the validation set should be placed in root\_data\_path/val
9 | - We use the glove.840B.300d embedding vectors from https://nlp.stanford.edu/projects/glove/
10 | - The embedding file should be placed in embedding\_path/glove.840B.300d.txt
11 | 
12 | # Code Files
13 | - preprecoess.py: functions to preprocess the datasets
14 | - utils.py: utility functions, such as the evaluation metrics
15 | - generator.py: data generators for model evaluation
16 | - models.py: implementation of the base model of FedRec
17 | - fl\_training.py: federated model training
18 | - Example.ipynb: model training and evaluation (a minimal end-to-end sketch is also given below)
19 | 
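20 | # Quick Start (illustrative)
21 | - The sketch below is not part of the original code; it only shows how the files above fit together, mirroring Example.ipynb. root\_data\_path and embedding\_path are placeholders that you must point at your own copies of the dataset and the embeddings.
22 | 
23 | ```python
24 | from preprecoess import *
25 | from models import *
26 | from fl_training import *
27 | 
28 | root_data_path = '/path/to/MIND-small'   # contains the train/ and val/ folders
29 | embedding_path = '/path/to/glove'        # contains glove.840B.300d.txt
30 | 
31 | # Preprocess the news and the click histories
32 | news, news_index, category_dict, subcategory_dict, word_dict = read_news(root_data_path, ['train', 'val'])
33 | news_title, news_vert, news_subvert = get_doc_input(news, news_index, category_dict, subcategory_dict, word_dict)
34 | title_word_embedding_matrix, have_word = load_matrix(embedding_path, word_dict)
35 | 
36 | train_session, train_uid_click, train_uid_table = read_clickhistory(root_data_path, 'train')
37 | train_user = parse_user(train_session, news_index)
38 | train_sess, train_user_id, train_label, train_user_id_sample = get_train_input(train_session, train_uid_click, news_index)
39 | get_user_data = GetUserDataFunc(news_title, train_user_id_sample, train_user, train_sess, train_label, train_user_id)
40 | 
41 | # Build the model and run federated rounds (lr=0.3, delta=0.05, 6 users per round, no LDP noise, as in Example.ipynb)
42 | model, doc_encoder, user_encoder, news_encoder = get_model(0.3, 0.05, title_word_embedding_matrix)
43 | for _ in range(100):
44 |     loss = fed_single_update(model, doc_encoder, user_encoder, 6, 0, get_user_data, train_uid_table)
45 | ```
46 | 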
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 JulySinceAndrew
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/code/utils.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import roc_auc_score
2 | import numpy as np
3 | 
4 | def dcg_score(y_true, y_score, k=10):
5 |     # Discounted cumulative gain of the top-k ranked items.
6 |     order = np.argsort(y_score)[::-1]
7 |     y_true = np.take(y_true, order[:k])
8 |     gains = 2 ** y_true - 1
9 |     discounts = np.log2(np.arange(len(y_true)) + 2)
10 |     return np.sum(gains / discounts)
11 | 
12 | 
13 | def ndcg_score(y_true, y_score, k=10):
14 |     # Normalized DCG: DCG of the predicted ranking divided by the ideal DCG.
15 |     best = dcg_score(y_true, y_true, k)
16 |     actual = dcg_score(y_true, y_score, k)
17 |     return actual / best
18 | 
19 | 
20 | def mrr_score(y_true, y_score):
21 |     # Mean reciprocal rank of the positive items.
22 |     order = np.argsort(y_score)[::-1]
23 |     y_true = np.take(y_true, order)
24 |     rr_score = y_true / (np.arange(len(y_true)) + 1)
25 |     return np.sum(rr_score) / np.sum(y_true)
26 | 
27 | def evaluate(user_scorings,news_scorings,Impressions):
28 |     # Scores every impression by the dot product of the user vector and the
29 |     # candidate news vectors, then reports average AUC, MRR, nDCG@5 and nDCG@10.
30 |     AUC = []
31 |     MRR = []
32 |     nDCG5 = []
33 |     nDCG10 =[]
34 |     for i in range(len(Impressions)):
35 |         docids = Impressions[i]['docs']
36 |         labels = Impressions[i]['labels']
37 |         uv = user_scorings[i]
38 | 
39 |         docids = np.array(docids,dtype='int32')
40 |         nv = news_scorings[docids]
41 |         score = np.dot(nv,uv)
42 |         auc = roc_auc_score(labels,score)
43 |         mrr = mrr_score(labels,score)
44 |         ndcg5 = ndcg_score(labels,score,k=5)
45 |         ndcg10 = ndcg_score(labels,score,k=10)
46 | 
47 |         AUC.append(auc)
48 |         MRR.append(mrr)
49 |         nDCG5.append(ndcg5)
50 |         nDCG10.append(ndcg10)
51 |     AUC = np.array(AUC)
52 |     MRR = np.array(MRR)
53 |     nDCG5 = np.array(nDCG5)
54 |     nDCG10 = np.array(nDCG10)
55 | 
56 |     AUC = AUC.mean()
57 |     MRR = MRR.mean()
58 |     nDCG5 = nDCG5.mean()
59 |     nDCG10 = nDCG10.mean()
60 | 
61 |     return AUC, MRR, nDCG5, nDCG10
--------------------------------------------------------------------------------
/code/generator.py:
--------------------------------------------------------------------------------
1 | from keras.utils import Sequence
2 | import numpy as np
3 | 
4 | class get_hir_train_generator(Sequence):
5 |     # Yields ([candidate titles, clicked titles], label) batches for training.
6 |     def __init__(self,news_title, clicked_news,user_id, news_id, label, batch_size):
7 |         self.title = news_title
8 | 
9 |         self.clicked_news = clicked_news
10 | 
11 |         self.user_id = user_id
12 |         self.doc_id = news_id
13 |         self.label = label
14 | 
15 |         self.batch_size = batch_size
16 |         self.ImpNum = self.label.shape[0]
17 | 
18 |     def __len__(self):
19 |         return int(np.ceil(self.ImpNum / float(self.batch_size)))
20 | 
21 |     def __get_news(self,docids):
22 |         title = self.title[docids]
23 | 
24 |         return title
25 | 
26 | 
27 |     def __getitem__(self, idx):
28 |         start = idx*self.batch_size
29 |         ed = (idx+1)*self.batch_size
30 |         if ed > self.ImpNum:
31 |             ed = self.ImpNum
32 | 
33 |         doc_ids = self.doc_id[start:ed]
34 |         title = self.__get_news(doc_ids)
35 | 
36 |         user_ids = self.user_id[start:ed]
37 |         clicked_ids = self.clicked_news[user_ids]
38 | 
39 |         user_title = self.__get_news(clicked_ids)
40 | 
41 |         label = self.label[start:ed]
42 | 
43 |         return ([title, user_title,],[label])
44 | 
45 | 
46 | class get_hir_user_generator(Sequence):
47 |     # Yields batches of clicked-news vectors for computing user representations.
48 |     def __init__(self,news_scoring, clicked_news,batch_size):
49 |         self.news_scoring = news_scoring
50 |         self.clicked_news = clicked_news
51 | 
52 |         self.batch_size = batch_size
53 |         self.ImpNum = self.clicked_news.shape[0]
54 | 
55 |     def __len__(self):
56 |         return int(np.ceil(self.ImpNum / float(self.batch_size)))
57 | 
58 |     def __get_news(self,docids):
59 |         news_scoring = self.news_scoring[docids]
60 | 
61 |         return news_scoring
62 | 
63 | 
64 |     def __getitem__(self, idx):
65 |         start = idx*self.batch_size
66 |         ed = (idx+1)*self.batch_size
67 |         if ed > self.ImpNum:
68 |             ed = self.ImpNum
69 |         clicked_ids = self.clicked_news[start:ed]
70 | 
71 |         news_scoring = self.__get_news(clicked_ids)
72 | 
73 |         return news_scoring
--------------------------------------------------------------------------------
/code/fl_training.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | def GetUserDataFunc(news_title,train_user_id_sample,train_user,train_sess,train_label,train_user_id):
4 |     # Returns a closure that assembles the local training data of a single user:
5 |     # the titles of the user's clicked news, the candidate-news titles of each
6 |     # training sample, and the corresponding labels.
7 |     def _get_user_data(uid):
8 |         click = []
9 |         sample = []
10 |         label = []
11 |         for sid in train_user_id_sample[uid]:
12 |             click.append(train_user['click'][train_user_id[sid]])
13 |             sample.append(train_sess[sid])
14 |             label.append(train_label[sid])
15 |         click = np.array(click)
16 |         sample = np.array(sample)
17 |         label = np.array(label)
18 |         click = news_title[click]
19 |         sample = news_title[sample]
20 |         return click,sample,label
21 |     return _get_user_data
22 | 
23 | 
24 | def add_noise(weights,lambd):
25 |     # Local differential privacy: element-wise Laplace noise on every weight array.
26 |     for i in range(len(weights)):
27 |         weights[i] += np.random.laplace(scale = lambd,size=weights[i].shape)
28 |     return weights
29 | 
30 | def fed_single_update(model,doc_encoder,user_encoder,num,lambd,get_user_data,train_uid_table):
31 |     # One round of federated training: `num` randomly selected users each start from
32 |     # the current global weights, perform a local update on their own data, optionally
33 |     # perturb the updated weights with Laplace noise, and the server aggregates the
34 |     # updates with a weighted average over the users' sample counts.
35 |     random_index = np.random.permutation(len(train_uid_table))[:num]
36 | 
37 |     all_news_weights = []
38 |     all_user_weights = []
39 |     old_news_weight = doc_encoder.get_weights()
40 |     old_user_weight = user_encoder.get_weights()
41 | 
42 |     sample_nums = []
43 | 
44 |     loss = []
45 | 
46 |     for uinx in random_index:
47 |         # Reset to the current global model before the local update.
48 |         doc_encoder.set_weights(old_news_weight)
49 |         user_encoder.set_weights(old_user_weight)
50 | 
51 |         uid = train_uid_table[uinx]
52 |         click,sample,label = get_user_data(uid)
53 |         g = model.fit([sample,click],label,batch_size = label.shape[0],verbose=False)
54 |         loss.append(g.history['loss'][0])
55 |         news_weight = doc_encoder.get_weights()
56 |         user_weight = user_encoder.get_weights()
57 |         if lambd>0:
58 |             news_weight = add_noise(news_weight,lambd)
59 |             user_weight = add_noise(user_weight,lambd)
60 |         all_news_weights.append(news_weight)
61 |         all_user_weights.append(user_weight)
62 |         sample_nums.append(label.shape[0])
63 | 
64 |     sample_nums = np.array(sample_nums)
65 |     sample_nums = sample_nums/sample_nums.sum()
66 | 
67 |     # Sample-size-weighted federated averaging, layer by layer.
68 |     doc_weights = [np.average(weights, axis=0,weights=sample_nums) for weights in zip(*all_news_weights)]
69 |     user_weights = [np.average(weights, axis=0,weights=sample_nums) for weights in zip(*all_user_weights)]
70 | 
71 |     doc_encoder.set_weights(doc_weights)
72 |     user_encoder.set_weights(user_weights)
73 |     loss = np.array(loss).mean()
74 |     return loss
--------------------------------------------------------------------------------
/code/Example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 2,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "import os\nimport json"
10 |    ]
11 |   },
12 |   {
13 |    "cell_type": "code",
14 |    "execution_count": null,
15 |    "metadata": {},
16 |    "outputs": [],
17 |    "source": []
18 |   },
19 |   {
20 |    "cell_type": "code",
21 |    "execution_count": 4,
22 |    "metadata": {},
23 |    "outputs": [
24 |     {
25 |      "name": "stderr",
26 |      "output_type": "stream",
27 |      "text": [
28 |       "Using TensorFlow backend.\n"
29 |      ]
30 |     }
31 |    ],
32 |    "source": [
33 |     "import tensorflow as tf\n",
34 | 
"import keras.backend.tensorflow_backend as KTF\n", 35 | " \n", 36 | "config = tf.ConfigProto() \n", 37 | "config.gpu_options.allow_growth=True \n", 38 | "session = tf.Session(config=config)\n", 39 | " \n", 40 | "KTF.set_session(session)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 5, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "from utils import *\n", 50 | "from preprecoess import *\n", 51 | "from generator import *\n", 52 | "from models import *\n", 53 | "from fl_training import *" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 6, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "root_data_path = None # MIND-Dataset Path\n", 70 | "embedding_path = None # Word Embedding Path" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 7, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "# Read News\n", 87 | "news,news_index,category_dict,subcategory_dict,word_dict = read_news(root_data_path,['train','val'])\n", 88 | "news_title,news_vert,news_subvert=get_doc_input(news,news_index,category_dict,subcategory_dict,word_dict)\n", 89 | "title_word_embedding_matrix, have_word = load_matrix(embedding_path,word_dict)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 8, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "#Parse User\n", 106 | "train_session, train_uid_click, train_uid_table = read_clickhistory(root_data_path,'train')\n", 107 | "test_session, test_uid_click,test_uid_table = read_clickhistory(root_data_path,'val')\n", 108 | "train_user = parse_user(train_session,news_index)\n", 109 | "test_user = parse_user(test_session,news_index)\n", 110 | "train_sess, train_user_id, train_label, train_user_id_sample = get_train_input(train_session,train_uid_click,news_index)\n", 111 | "test_impressions, test_userids = get_test_input(test_session,news_index)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 9, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "get_user_data = GetUserDataFunc(news_title,train_user_id_sample,train_user,train_sess,train_label,train_user_id)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "scrolled": true 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "lr = 0.3\n", 146 | "delta = 0.05\n", 147 | "lambd = 0\n", 148 | "num = 6\n", 149 | "\n", 150 | "model, doc_encoder, user_encoder, news_encoder = get_model(lr,delta,title_word_embedding_matrix)\n", 151 | "Res = []\n", 152 | "Loss = []\n", 153 | "count = 0\n", 154 | "while True:\n", 155 | " loss = fed_single_update(model,doc_encoder,user_encoder,num,lambd,get_user_data,train_uid_table)\n", 156 | " Loss.append(loss)\n", 157 | " if count % 25 == 0:\n", 158 | " news_scoring = 
news_encoder.predict(news_title,verbose=0)\n", 159 | " user_generator = get_hir_user_generator(news_scoring,test_user['click'],64)\n", 160 | " user_scoring = user_encoder.predict_generator(user_generator,verbose=0),\n", 161 | " user_scoring = user_scoring[0]\n", 162 | " g = evaluate(user_scoring,news_scoring,test_impressions)\n", 163 | " Res.append(g)\n", 164 | " print(g)\n", 165 | " with open('FedRec-woLDP-1.json','a') as f:\n", 166 | " s = json.dumps(g) + '\\n'\n", 167 | " f.write(s)\n", 168 | " count += 1" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [] 184 | } 185 | ], 186 | "metadata": { 187 | "kernelspec": { 188 | "display_name": "Python 3", 189 | "language": "python", 190 | "name": "python3" 191 | }, 192 | "language_info": { 193 | "codemirror_mode": { 194 | "name": "ipython", 195 | "version": 3 196 | }, 197 | "file_extension": ".py", 198 | "mimetype": "text/x-python", 199 | "name": "python", 200 | "nbconvert_exporter": "python", 201 | "pygments_lexer": "ipython3", 202 | "version": "3.7.6" 203 | } 204 | }, 205 | "nbformat": 4, 206 | "nbformat_minor": 2 207 | } 208 | -------------------------------------------------------------------------------- /code/preprecoess.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import word_tokenize 3 | import csv 4 | import datetime 5 | import time 6 | import json 7 | import itertools 8 | import random 9 | import os 10 | import numpy as np 11 | 12 | 13 | MAX_SENTENCE = 30 14 | MAX_ALL = 50 15 | npratio = 4 16 | 17 | def newsample(nnn,ratio): 18 | if ratio >len(nnn): 19 | return random.sample(nnn*(ratio//len(nnn)+1),ratio) 20 | else: 21 | return random.sample(nnn,ratio) 22 | 23 | def read_news(root_data_path,modes): 24 | news={} 25 | category=[] 26 | subcategory=[] 27 | news_index={} 28 | index=1 29 | word_dict={} 30 | word_index=1 31 | 32 | for mode in modes: 33 | with open(os.path.join(root_data_path,mode,'news.tsv')) as f: 34 | lines = f.readlines() 35 | for line in lines: 36 | splited = line.strip('\n').split('\t') 37 | doc_id,vert,subvert,title= splited[0:4] 38 | if doc_id in news_index: 39 | continue 40 | news_index[doc_id]=index 41 | index+=1 42 | category.append(vert) 43 | subcategory.append(subvert) 44 | title = title.lower() 45 | title=word_tokenize(title) 46 | news[doc_id]=[vert,subvert,title] 47 | for word in title: 48 | word = word.lower() 49 | if not(word in word_dict): 50 | word_dict[word]=word_index 51 | word_index+=1 52 | category=list(set(category)) 53 | subcategory=list(set(subcategory)) 54 | category_dict={} 55 | index=1 56 | for c in category: 57 | category_dict[c]=index 58 | index+=1 59 | subcategory_dict={} 60 | index=1 61 | for c in subcategory: 62 | subcategory_dict[c]=index 63 | index+=1 64 | return news,news_index,category_dict,subcategory_dict,word_dict 65 | 66 | def get_doc_input(news,news_index,category,subcategory,word_dict): 67 | news_num=len(news)+1 68 | news_title=np.zeros((news_num,MAX_SENTENCE),dtype='int32') 69 | news_vert=np.zeros((news_num,),dtype='int32') 70 | news_subvert=np.zeros((news_num,),dtype='int32') 71 | for key in news: 72 | vert,subvert,title=news[key] 73 | doc_index=news_index[key] 74 | news_vert[doc_index]=category[vert] 75 | news_subvert[doc_index]=subcategory[subvert] 76 | for word_id in 
range(min(MAX_SENTENCE,len(title))): 77 | news_title[doc_index,word_id]=word_dict[title[word_id].lower()] 78 | 79 | return news_title,news_vert,news_subvert 80 | 81 | 82 | def load_matrix(embedding_path,word_dict): 83 | embedding_matrix = np.zeros((len(word_dict)+1,300)) 84 | have_word=[] 85 | with open(os.path.join(embedding_path,'glove.840B.300d.txt'),'rb') as f: 86 | while True: 87 | l=f.readline() 88 | if len(l)==0: 89 | break 90 | l=l.split() 91 | word = l[0].decode() 92 | if word in word_dict: 93 | index = word_dict[word] 94 | tp = [float(x) for x in l[1:]] 95 | embedding_matrix[index]=np.array(tp) 96 | have_word.append(word) 97 | return embedding_matrix,have_word 98 | 99 | 100 | def read_clickhistory(root_data_path,mode): 101 | 102 | lines = [] 103 | userids = {} 104 | uid_table = {} 105 | with open(os.path.join(root_data_path,mode,'behaviors.tsv')) as f: 106 | lines = f.readlines() 107 | 108 | sessions = [] 109 | for i in range(len(lines)): 110 | _,uid,_,click,imp = lines[i].strip().split('\t') 111 | true_click = click.split() 112 | assert not '' in true_click 113 | if not uid in userids: 114 | uid_table[len(userids)] = uid 115 | userids[uid] = [] 116 | userids[uid].append(i) 117 | imp = imp.split() 118 | pos = [] 119 | neg = [] 120 | for beh in imp: 121 | nid, label = beh.split('-') 122 | if label == '0': 123 | neg.append(nid) 124 | else: 125 | pos.append(nid) 126 | sessions.append([true_click,pos,neg]) 127 | return sessions,userids,uid_table 128 | 129 | def parse_user(session,news_index): 130 | user_num = len(session) 131 | user={'click': np.zeros((user_num,MAX_ALL),dtype='int32'),} 132 | for user_id in range(len(session)): 133 | tclick = [] 134 | click, pos, neg =session[user_id] 135 | for i in range(len(click)): 136 | tclick.append(news_index[click[i]]) 137 | click = tclick 138 | 139 | if len(click) >MAX_ALL: 140 | click = click[-MAX_ALL:] 141 | else: 142 | click=[0]*(MAX_ALL-len(click)) + click 143 | 144 | user['click'][user_id] = np.array(click) 145 | return user 146 | 147 | def get_train_input(session,uid_click_talbe,news_index): 148 | inv_table = {} 149 | user_id_session = {} 150 | 151 | for uid in uid_click_talbe: 152 | user_id_session[uid] = [] 153 | for v in uid_click_talbe[uid]: 154 | inv_table[v] = uid 155 | 156 | sess_pos = [] 157 | sess_neg = [] 158 | user_id = [] 159 | for sess_id in range(len(session)): 160 | sess = session[sess_id] 161 | _, poss, negs=sess 162 | for i in range(len(poss)): 163 | pos = poss[i] 164 | neg=newsample(negs,npratio) 165 | sess_pos.append(pos) 166 | sess_neg.append(neg) 167 | user_id.append(sess_id) 168 | user_id_session[inv_table[sess_id]].append(len(sess_pos)-1) 169 | 170 | sess_all = np.zeros((len(sess_pos),1+npratio),dtype='int32') 171 | label = np.zeros((len(sess_pos),1+npratio)) 172 | for sess_id in range(sess_all.shape[0]): 173 | pos = sess_pos[sess_id] 174 | negs = sess_neg[sess_id] 175 | sess_all[sess_id,0] = news_index[pos] 176 | index = 1 177 | for neg in negs: 178 | sess_all[sess_id,index] = news_index[neg] 179 | index+=1 180 | #index = np.random.randint(1+npratio) 181 | label[sess_id,0]=1 182 | user_id = np.array(user_id, dtype='int32') 183 | 184 | return sess_all, user_id, label, user_id_session 185 | 186 | def get_test_input(session,news_index): 187 | 188 | Impressions = [] 189 | userid = [] 190 | for sess_id in range(len(session)): 191 | _, poss, negs = session[sess_id] 192 | imp = {'labels':[], 193 | 'docs':[]} 194 | userid.append(sess_id) 195 | for i in range(len(poss)): 196 | docid = news_index[poss[i]] 197 | 
            imp['docs'].append(docid)
198 |             imp['labels'].append(1)
199 |         for i in range(len(negs)):
200 |             docid = news_index[negs[i]]
201 |             imp['docs'].append(docid)
202 |             imp['labels'].append(0)
203 |         Impressions.append(imp)
204 | 
205 |     userid = np.array(userid,dtype='int32')
206 | 
207 |     return Impressions, userid,
--------------------------------------------------------------------------------
/code/models.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | import keras
3 | from keras.utils.np_utils import *
4 | from keras.preprocessing.text import Tokenizer, text_to_word_sequence
5 | from keras.preprocessing.sequence import pad_sequences
6 | from keras.utils.np_utils import to_categorical
7 | 
8 | from keras.layers import Embedding, concatenate
9 | from keras.layers import Dense, Input, Flatten, average,Lambda
10 | 
11 | from keras.layers import *
12 | from keras.models import Model, load_model
13 | from keras.callbacks import EarlyStopping, ModelCheckpoint
14 | 
15 | from keras import backend as K
16 | from keras.engine.topology import Layer, InputSpec
17 | from keras import initializers #keras2
18 | from keras.utils import plot_model
19 | import numpy as np
20 | from sklearn.metrics import accuracy_score, classification_report
21 | from keras.optimizers import *
22 | 
23 | npratio = 4
24 | 
25 | class Attention(Layer):
26 | 
27 |     def __init__(self, nb_head, size_per_head, **kwargs):
28 |         self.nb_head = nb_head
29 |         self.size_per_head = size_per_head
30 |         self.output_dim = nb_head*size_per_head
31 |         super(Attention, self).__init__(**kwargs)
32 | 
33 |     def build(self, input_shape):
34 |         self.WQ = self.add_weight(name='WQ',
35 |                                   shape=(input_shape[0][-1], self.output_dim),
36 |                                   initializer='glorot_uniform',
37 |                                   trainable=True)
38 |         self.WK = self.add_weight(name='WK',
39 |                                   shape=(input_shape[1][-1], self.output_dim),
40 |                                   initializer='glorot_uniform',
41 |                                   trainable=True)
42 |         self.WV = self.add_weight(name='WV',
43 |                                   shape=(input_shape[2][-1], self.output_dim),
44 |                                   initializer='glorot_uniform',
45 |                                   trainable=True)
46 |         super(Attention, self).build(input_shape)
47 | 
48 |     def Mask(self, inputs, seq_len, mode='mul'):
49 |         if seq_len == None:
50 |             return inputs
51 |         else:
52 |             mask = K.one_hot(seq_len[:,0], K.shape(inputs)[1])
53 |             mask = 1 - K.cumsum(mask, 1)
54 |             for _ in range(len(inputs.shape)-2):
55 |                 mask = K.expand_dims(mask, 2)
56 |             if mode == 'mul':
57 |                 return inputs * mask
58 |             if mode == 'add':
59 |                 return inputs - (1 - mask) * 1e12
60 | 
61 |     def call(self, x):
62 |         # If only Q_seq, K_seq, V_seq are passed in, no mask is applied
63 |         # If Q_seq, K_seq, V_seq, Q_len, V_len are passed in, the padded positions are masked
64 |         if len(x) == 3:
65 |             Q_seq,K_seq,V_seq = x
66 |             Q_len,V_len = None,None
67 |         elif len(x) == 5:
68 |             Q_seq,K_seq,V_seq,Q_len,V_len = x
69 |         # Linear projections of Q, K and V
70 |         Q_seq = K.dot(Q_seq, self.WQ)
71 |         Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
72 |         Q_seq = K.permute_dimensions(Q_seq, (0,2,1,3))
73 |         K_seq = K.dot(K_seq, self.WK)
74 |         K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
75 |         K_seq = K.permute_dimensions(K_seq, (0,2,1,3))
76 |         V_seq = K.dot(V_seq, self.WV)
77 |         V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
78 |         V_seq = K.permute_dimensions(V_seq, (0,2,1,3))
79 |         # Scaled dot-product attention scores, then mask, then softmax
80 |         A = K.batch_dot(Q_seq, K_seq, axes=[3,3]) / self.size_per_head**0.5
81 |         A = K.permute_dimensions(A, (0,3,2,1))
82 |         A = self.Mask(A, V_len, 'add')
83 |         A = K.permute_dimensions(A, (0,3,2,1))
84 |         A = K.softmax(A)
85 |         # Weighted sum of the values, then mask the output
86 |         O_seq = K.batch_dot(A, V_seq, axes=[3,2])
87 |         O_seq = K.permute_dimensions(O_seq, (0,2,1,3))
88 |         O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
89 |         O_seq = self.Mask(O_seq, Q_len, 'mul')
90 |         return O_seq
91 | 
92 |     def compute_output_shape(self, input_shape):
93 |         return (input_shape[0][0], input_shape[0][1], self.output_dim)
94 | 
95 | 
96 | def AttentivePooling(dim1,dim2):
97 |     vecs_input = Input(shape=(dim1,dim2),dtype='float32')
98 |     user_vecs = Dropout(0.2)(vecs_input)
99 |     user_att = Dense(200,activation='tanh')(user_vecs)
100 |     user_att = keras.layers.Flatten()(Dense(1)(user_att))
101 |     user_att = Activation('softmax')(user_att)
102 |     user_vec = keras.layers.Dot((1,1))([user_vecs,user_att])
103 |     model = Model(vecs_input,user_vec)
104 |     return model
105 | 
106 | 
107 | def get_doc_encoder():
108 |     sentence_input = Input(shape=(30,300), dtype='float32')
109 |     droped_vecs = Dropout(0.2)(sentence_input)
110 | 
111 |     l_cnnt = Conv1D(400,3,activation='relu')(droped_vecs)
112 |     l_cnnt = Dropout(0.2)(l_cnnt)
113 |     l_cnnt = Attention(20,20)([l_cnnt,l_cnnt,l_cnnt])
114 |     l_cnnt = keras.layers.Activation('relu')(l_cnnt)
115 | 
116 |     droped_rep = Dropout(0.2)(l_cnnt)
117 |     title_vec = AttentivePooling(30,400)(droped_rep)
118 |     sentEncodert = Model(sentence_input, title_vec)
119 |     return sentEncodert
120 | 
121 | def get_user_encoder():
122 |     news_vecs_input = Input(shape=(50,400), dtype='float32')
123 | 
124 |     news_vecs = Dropout(0.2)(news_vecs_input)
125 |     gru_input = keras.layers.Lambda(lambda x:x[:,-15:,:])(news_vecs)
126 |     vec1 = GRU(400)(gru_input)
127 |     vecs2 = Attention(20,20)([news_vecs]*3)
128 |     vec2 = AttentivePooling(50,400)(vecs2)
129 | 
130 |     user_vecs2 = Attention(20,20)([news_vecs_input]*3)
131 |     user_vecs2 = Dropout(0.2)(user_vecs2)
132 |     user_vec2 = AttentivePooling(50,400)(user_vecs2)
133 |     user_vec2 = keras.layers.Reshape((1,400))(user_vec2)
134 | 
135 |     user_vecs1 = Lambda(lambda x:x[:,-20:,:])(news_vecs_input)
136 |     user_vec1 = GRU(400)(user_vecs1)
137 |     user_vec1 = keras.layers.Reshape((1,400))(user_vec1)
138 | 
139 |     user_vecs = keras.layers.Concatenate(axis=-2)([user_vec1,user_vec2])
140 |     vec = AttentivePooling(2,400)(user_vecs)
141 | 
142 |     sentEncodert = Model(news_vecs_input, vec)
143 |     return sentEncodert
144 | 
145 | 
146 | def get_model(lr,delta,title_word_embedding_matrix):
147 |     doc_encoder = get_doc_encoder()
148 |     user_encoder = get_user_encoder()
149 | 
150 |     title_word_embedding_layer = Embedding(title_word_embedding_matrix.shape[0], 300, weights=[title_word_embedding_matrix],trainable=False)
151 | 
152 |     click_title = Input(shape=(50,30),dtype='int32')
153 |     can_title = Input(shape=(1+npratio,30),dtype='int32')
154 | 
155 |     click_word_vecs = title_word_embedding_layer(click_title)
156 |     can_word_vecs = title_word_embedding_layer(can_title)
157 | 
158 |     click_vecs = TimeDistributed(doc_encoder)(click_word_vecs)
159 |     can_vecs = TimeDistributed(doc_encoder)(can_word_vecs)
160 | 
161 |     user_vec = user_encoder(click_vecs)
162 | 
163 |     scores = keras.layers.Dot(axes=-1)([user_vec,can_vecs]) # (batch_size, 1+npratio)
164 |     logits = keras.layers.Activation(keras.activations.softmax,name = 'recommend')(scores)
165 | 
166 |     model = Model([can_title,click_title],logits) # max prob_click_positive
167 |     model.compile(loss=['categorical_crossentropy'],
168 |                   optimizer=SGD(lr=lr,clipvalue = delta),
169 |                   metrics=['acc'])
170 | 
171 |     news_input = Input(shape=(30,),dtype='int32')
172 |     news_word_vecs = title_word_embedding_layer(news_input)
173 |     news_vec = doc_encoder(news_word_vecs)
174 |     news_encoder = Model(news_input,news_vec)
175 | 
176 |     return model, doc_encoder, user_encoder, news_encoder
--------------------------------------------------------------------------------
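
For illustration only (this snippet is not part of the repository): the core of fl_training.py is a sample-size-weighted federated average of the per-user weight updates, optionally perturbed with Laplace noise for local differential privacy before they leave the user. The toy NumPy sketch below reproduces just that aggregation step on made-up weights; the names client_weights and sample_nums and all numeric values are illustrative.

import numpy as np

def add_noise(weights, lambd):
    # Same idea as add_noise in fl_training.py: element-wise Laplace noise on every weight array.
    return [w + np.random.laplace(scale=lambd, size=w.shape) for w in weights]

# Two toy "clients", each holding a list of weight arrays (like Keras model.get_weights()).
client_weights = [
    [np.ones((2, 2)), np.zeros(3)],        # client 1
    [np.full((2, 2), 3.0), np.ones(3)],    # client 2
]

# Client 1 trained on 1 sample, client 2 on 3 -> aggregation weights [0.25, 0.75],
# the same normalisation applied to sample_nums in fed_single_update.
sample_nums = np.array([1.0, 3.0])
sample_nums = sample_nums / sample_nums.sum()

# Optional local differential privacy before the updates are shared (lambd = 0 disables it).
lambd = 0.0
if lambd > 0:
    client_weights = [add_noise(w, lambd) for w in client_weights]

# Weighted average layer by layer, mirroring the zip(*...) aggregation in fed_single_update.
aggregated = [np.average(layer, axis=0, weights=sample_nums) for layer in zip(*client_weights)]
print(aggregated[0])   # -> 2.5 in every entry (0.25*1 + 0.75*3)
print(aggregated[1])   # -> [0.75 0.75 0.75]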