├── ALS_optimize.py
├── ALS_optimize_origin.py
├── README.md
├── compute_grad.py
├── dataset.py
├── evaluation.py
└── main.py

/ALS_optimize.py:
--------------------------------------------------------------------------------
import numpy as np
from six.moves import xrange
from numpy.linalg import inv
from dataset import build_user_item_matrix

def _update_user_feature(n_user, ratings_csr_, n_feature, lamda_u, mean_rating_, user_features_, item_features_):
    '''
    n_u           : number of items rated by user i
    item_features : n_u x n_feature matrix with the features of those items
    A_i = sum_j(v_j' * v_j) + lamda_u * n_u * I(n_feature)
    V_i = sum_j(M_ij * v_j)          (ratings are mean-centred)
    u_i = inv(A_i) * V_i
    '''
    for i in xrange(n_user):
        _, item_idx = ratings_csr_[i, :].nonzero()
        n_u = item_idx.shape[0]
        if n_u == 0:
            continue
        item_features = item_features_.take(item_idx, axis=0)

        ratings = ratings_csr_[i, :].data - mean_rating_
        A_i = (np.dot(item_features.T, item_features) +
               lamda_u * n_u * np.eye(n_feature))
        V_i = np.dot(item_features.T, ratings)
        user_features_[i, :] = np.dot(inv(A_i), V_i)

def _update_mal_feature(mal_user, mal_ratings_csr_, n_feature, lamda_u, mal_mean_rating_, mal_user_features_, item_features_):
    '''Same update as _update_user_feature, applied to the malicious users' ratings.'''
    for m in xrange(mal_user):
        _, item_idx = mal_ratings_csr_[m, :].nonzero()
        n_m = item_idx.shape[0]
        if n_m == 0:
            continue
        item_features = item_features_.take(item_idx, axis=0)

        ratings = mal_ratings_csr_[m, :].data - mal_mean_rating_
        A_i = (np.dot(item_features.T, item_features) +
               lamda_u * n_m * np.eye(n_feature))
        V_i = np.dot(item_features.T, ratings)
        mal_user_features_[m, :] = np.dot(inv(A_i), V_i)

def _update_item_feature(n_item, ratings_csc_, mal_ratings_csc_, n_feature, lamda_v, mean_rating_, \
                         mal_mean_rating_, user_features_, mal_user_features_, item_features_):
    '''
    n_i : number of normal users who rated item j
    m_i : number of malicious users who rated item j
    Items rated by neither group are skipped; a group with no ratings for item j
    simply contributes nothing to A_j and V_j.
    '''
    for j in xrange(n_item):
        user_idx, _ = ratings_csc_[:, j].nonzero()
        n_i = user_idx.shape[0]
        mal_user_idx, _ = mal_ratings_csc_[:, j].nonzero()
        m_i = mal_user_idx.shape[0]
        if n_i + m_i == 0:
            continue
        user_features = user_features_.take(user_idx, axis=0)
        ratings = ratings_csc_[:, j].data - mean_rating_
        mal_user_features = mal_user_features_.take(mal_user_idx, axis=0)
        mal_ratings = mal_ratings_csc_[:, j].data - mal_mean_rating_

        A_j = (np.dot(user_features.T, user_features) + np.dot(mal_user_features.T, mal_user_features) \
               + lamda_v * (n_i + m_i) * np.eye(n_feature))
        V_j = np.dot(user_features.T, ratings) + np.dot(mal_user_features.T, mal_ratings)
        item_features_[j, :] = np.dot(inv(A_j), V_j)

def ALS(n_user, n_item, n_feature, mal_user, ratings, mean_rating_, mal_mean_rating_, mal_ratings, lamda_u, lamda_v, \
        user_features_, mal_user_features_, item_features_):
    ratings_csr_ = build_user_item_matrix(n_user, n_item, ratings)
    ratings_csc_ = ratings_csr_.tocsc()
    mal_ratings_csr_ = build_user_item_matrix(mal_user, n_item, mal_ratings)
    mal_ratings_csc_ = mal_ratings_csr_.tocsc()

    _update_user_feature(n_user, ratings_csr_, n_feature, lamda_u, mean_rating_, user_features_, item_features_)
    _update_mal_feature(mal_user, mal_ratings_csr_, n_feature, lamda_u, mal_mean_rating_, mal_user_features_, item_features_)
    _update_item_feature(n_item, ratings_csc_, mal_ratings_csc_, n_feature, lamda_v, mean_rating_, \
                         mal_mean_rating_, user_features_, mal_user_features_, item_features_)
--------------------------------------------------------------------------------
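Each per-user solve above is a small ridge regression in the n_feature unknowns of u_i. A minimal sanity check, not part of the repository, that the closed form used in _update_user_feature matches an equivalent augmented least-squares formulation on toy data (the sizes and the lamda_u value here are arbitrary):

import numpy as np

rng = np.random.RandomState(0)
n_u, n_feature, lamda_u = 12, 8, 5e-2

item_features = rng.rand(n_u, n_feature)   # V: features of the n_u items user i has rated
ratings = rng.rand(n_u)                    # mean-centred ratings of user i for those items

# closed form, exactly as in _update_user_feature
A_i = np.dot(item_features.T, item_features) + lamda_u * n_u * np.eye(n_feature)
V_i = np.dot(item_features.T, ratings)
u_closed = np.linalg.solve(A_i, V_i)

# the same ridge problem written as an augmented least-squares system:
# [V; sqrt(lamda_u * n_u) * I] u ~= [ratings; 0]
aug_A = np.vstack([item_features, np.sqrt(lamda_u * n_u) * np.eye(n_feature)])
aug_b = np.concatenate([ratings, np.zeros(n_feature)])
u_lstsq = np.linalg.lstsq(aug_A, aug_b, rcond=None)[0]

print(np.allclose(u_closed, u_lstsq))      # expected: True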
/ALS_optimize_origin.py:
--------------------------------------------------------------------------------
import numpy as np
from six.moves import xrange
from numpy.linalg import inv
from dataset import build_user_item_matrix

def _update_user_feature(n_user, ratings_csr_, n_feature, lamda_u, mean_rating_, user_features_, item_features_):
    '''
    n_u           : number of items rated by user i
    item_features : n_u x n_feature matrix with the features of those items
    A_i = sum_j(v_j' * v_j) + lamda_u * n_u * I(n_feature)
    V_i = sum_j(M_ij * v_j)          (ratings are mean-centred)
    u_i = inv(A_i) * V_i
    '''
    for i in xrange(n_user):
        _, item_idx = ratings_csr_[i, :].nonzero()
        n_u = item_idx.shape[0]
        if n_u == 0:
            continue
        item_features = item_features_.take(item_idx, axis=0)

        ratings = ratings_csr_[i, :].data - mean_rating_
        A_i = (np.dot(item_features.T, item_features) +
               lamda_u * n_u * np.eye(n_feature))
        V_i = np.dot(item_features.T, ratings)
        user_features_[i, :] = np.dot(inv(A_i), V_i)


def _update_item_feature(n_item, ratings_csc_, n_feature, lamda_v, mean_rating_, user_features_, item_features_):
    '''
    n_i : number of users who rated item j
    '''
    for j in xrange(n_item):
        user_idx, _ = ratings_csc_[:, j].nonzero()
        n_i = user_idx.shape[0]
        if n_i == 0:
            continue
        user_features = user_features_.take(user_idx, axis=0)
        ratings = ratings_csc_[:, j].data - mean_rating_

        A_j = (np.dot(user_features.T, user_features) + lamda_v * n_i * np.eye(n_feature))
        V_j = np.dot(user_features.T, ratings)
        item_features_[j, :] = np.dot(inv(A_j), V_j)

def ALS_origin(n_user, n_item, n_feature, ratings, mean_rating_, lamda_u, lamda_v, user_features_, item_features_):
    ratings_csr_ = build_user_item_matrix(n_user, n_item, ratings)
    ratings_csc_ = ratings_csr_.tocsc()
    _update_user_feature(n_user, ratings_csr_, n_feature, lamda_u, mean_rating_, user_features_, item_features_)
    _update_item_feature(n_item, ratings_csc_, n_feature, lamda_v, mean_rating_, user_features_, item_features_)
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# Data-poisoning-attacks-on-factorization-based-collaborative-filtering
# Reference
* "Data Poisoning Attacks on Factorization-Based Collaborative Filtering", NIPS 2016
* GitHub repository: "recommend"
* GitHub repository: "movielens"
# Illustration
* ALS_optimize.py: computes the ALS solution using both malicious and normal ratings.
* ALS_optimize_origin.py: computes the ALS solution using normal ratings only (a usage sketch follows below).
* compute_grad.py: computes the two parts of the utility gradient using the formulations in the paper.
* dataset.py: functions to load the dataset and build the user-item matrix.
* evaluation.py: predicts ratings and computes the RMSE.
* main.py: the implementation of the projected gradient ascent (PGA) attack.
--------------------------------------------------------------------------------
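A hedged quick-start sketch (not part of the repository) showing how the attack-free pieces fit together. It uses synthetic zero-based (user_id, item_id, rating) triples instead of the MovieLens CSV; the hyper-parameter values are simply the ones from main.py, and all imported functions are the ones defined in this repository:

import numpy as np
from numpy.random import RandomState

from ALS_optimize_origin import ALS_origin
from evaluation import predict, RMSE

rng = RandomState(0)
n_user, n_item, n_feature = 50, 40, 8
lamda_u = lamda_v = 5e-2

# synthetic (user_id, item_id, rating) triples with zero-based ids,
# the same layout load_movielens_ratings() in dataset.py produces
mask = rng.rand(n_user, n_item) < 0.3          # each user rates roughly 30% of the items
users, items = np.nonzero(mask)
ratings = np.column_stack([users, items, rng.randint(1, 6, users.shape[0])])
mean_rating_ = np.mean(ratings[:, 2])

user_features_ = 0.1 * rng.rand(n_user, n_feature)
item_features_ = 0.1 * rng.rand(n_item, n_feature)

for it in range(10):
    ALS_origin(n_user, n_item, n_feature, ratings, mean_rating_,
               lamda_u, lamda_v, user_features_, item_features_)
    preds = predict(ratings[:, :2], user_features_, item_features_, mean_rating_,
                    max_rating=5, min_rating=1)
    print('iteration %d, train RMSE %.4f' % (it + 1, RMSE(preds, ratings[:, 2])))

main.py follows the same pattern on the real data and additionally alternates the poisoned ALS updates with the PGA step on the malicious ratings.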
/compute_grad.py:
--------------------------------------------------------------------------------
import random
import numpy as np
from six.moves import xrange
from dataset import build_user_item_matrix
from numpy.linalg import inv

# compute the gradient of the hybrid utility function with respect to the predicted ratings
def compute_utility_grad(n_user, n_item, train, user_features_, item_features_, user_features_origin_, item_features_origin_, \
                         w_j0=0.8, u1=0.5, u2=0.5):
    ratings_csr_ = build_user_item_matrix(n_user, n_item, train)
    # availability part: 2 * (poisoned predictions - clean predictions), restricted to unobserved entries
    grad_av = 2 * (np.dot(user_features_, item_features_.T) - np.dot(user_features_origin_, item_features_origin_.T))
    for i in xrange(n_user):
        _, item_idx = ratings_csr_[i, :].nonzero()
        grad_av[i, item_idx] = 0
    # integrity part: push up one randomly chosen well-rated target item J0 with weight w_j0
    avg_rating = np.mean(np.dot(user_features_, item_features_.T), axis=0)
    prefer_index = np.where(avg_rating > 0.03)
    J0 = random.sample(list(prefer_index[0]), 1)
    grad_in = np.zeros([n_user, n_item])
    grad_in[:, J0] = w_j0
    grad_hy = u1 * grad_av + u2 * grad_in
    return grad_hy

def compute_grad(n_user, n_item, mal_user, mal_ratings, train, user_features_, mal_user_features_, \
                 item_features_, lamda_v, n_feature, user_features_origin_, item_features_origin_):
    '''
    A          : lamda_v * I_k + sum(u' * u) over the (normal and malicious) users who rated item j   (k x k)
    u~_i       : 1 x k feature vector of malicious user i
    grad_model : d(u_m . v_j) / d(M~_ij) = u_m * inv(A) * u~_i'   for every normal user m
    '''
    grad_R = compute_utility_grad(n_user, n_item, train, user_features_, \
                                  item_features_, user_features_origin_, item_features_origin_)
    ratings_csr_ = build_user_item_matrix(n_user, n_item, train)
    ratings_csc_ = ratings_csr_.tocsc()
    mal_ratings_csr_ = build_user_item_matrix(mal_user, n_item, mal_ratings)
    mal_ratings_csc_ = mal_ratings_csr_.tocsc()
    grad_total = np.zeros([mal_user, n_item])
    for i in xrange(mal_user):
        for j in xrange(n_item):
            if j % 100 == 0:
                print('Computing malicious user %d, item %d (total users: %d, total items: %d)' % (i, j, n_user, n_item))
            user_idx, _ = ratings_csc_[:, j].nonzero()
            mal_user_idx, _ = mal_ratings_csc_[:, j].nonzero()
            user_features = user_features_.take(user_idx, axis=0)
            mal_user_features = mal_user_features_.take(mal_user_idx, axis=0)
            U = np.vstack((user_features, mal_user_features))
            A = np.dot(U.T, U) + lamda_v * np.eye(n_feature)
            # d(v_j)/d(M~_ij) = inv(A) * u~_i, so d(u_m . v_j)/d(M~_ij) = u_m . inv(A) . u~_i
            # for every normal user m; the chain rule with grad_R gives the (i, j) entry of grad_total
            mal_u_i = mal_user_features_.take(i, axis=0)
            grad_model_col = np.dot(user_features_, np.dot(inv(A), mal_u_i.T))   # length n_user
            grad_total[i, j] = np.dot(grad_model_col, grad_R[:, j])
    return grad_total
--------------------------------------------------------------------------------
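For reference, the gradient assembled by compute_utility_grad above can be written compactly as follows. This is a reconstruction from the code, with \mu_1, \mu_2, w_{j_0} standing for the u1, u2, w_j0 arguments and \Omega for the set of observed (user, item) pairs in train:

\hat{R} = U V^\top, \qquad \hat{R}^{(0)} = U^{(0)} {V^{(0)}}^\top

R_{av} = \sum_{(i,j) \notin \Omega} \big(\hat{R}_{ij} - \hat{R}^{(0)}_{ij}\big)^2, \qquad R_{in} = \sum_{i} w_{j_0}\, \hat{R}_{i j_0}

\frac{\partial R_{hybrid}}{\partial \hat{R}_{ij}} = \mu_1 \cdot 2\big(\hat{R}_{ij} - \hat{R}^{(0)}_{ij}\big)\,[(i,j) \notin \Omega] \;+\; \mu_2 \cdot w_{j_0}\,[j = j_0]

where j_0 is one randomly sampled item whose average predicted rating exceeds the 0.03 threshold. compute_grad then chains this gradient with the approximate derivative of the item factors with respect to the malicious ratings.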
/dataset.py:
--------------------------------------------------------------------------------
import numpy as np
import scipy.sparse as sparse
#from itertools import islice

def load_movielens_ratings(ratings_file):
    # expects comma-separated lines of "user_id,item_id,rating" with integer, zero-based
    # ids and no header row (use the commented islice variant to skip a header line)
    with open(ratings_file) as f:
        ratings = []
        for line in f:
        #for line in islice(f, 1, None):
            line = line.split(",")[:3]
            line = [int(l) for l in line]
            ratings.append(line)
        ratings = np.array(ratings)
    return ratings

def build_user_item_matrix(n_user, n_item, ratings):
    """Build the user-item rating matrix

    Returns
    -------
    scipy.sparse.csr_matrix with shape (n_user, n_item)
    """
    data = ratings[:, 2]
    row_ind = ratings[:, 0]
    col_ind = ratings[:, 1]
    shape = (n_user, n_item)
    return sparse.csr_matrix((data, (row_ind, col_ind)), shape=shape)
--------------------------------------------------------------------------------

/evaluation.py:
--------------------------------------------------------------------------------
import numpy as np

def predict(data, user_features_, item_features_, mean_rating_, max_rating=1, min_rating=-1):
    data = data.astype(int)
    u_features = user_features_.take(data.take(0, axis=1), axis=0)
    i_features = item_features_.take(data.take(1, axis=1), axis=0)
    preds = np.sum(u_features * i_features, 1) + mean_rating_
    if max_rating:
        preds[preds > max_rating] = max_rating
    if min_rating:
        preds[preds < min_rating] = min_rating
    return preds

def RMSE(estimation, truth):
    """Root Mean Square Error"""
    estimation = np.float64(estimation)
    truth = np.float64(truth)
    num_sample = estimation.shape[0]

    # sum of squared errors
    sse = np.sum(np.square(truth - estimation))
    return np.sqrt(np.divide(sse, num_sample - 1))
--------------------------------------------------------------------------------
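A small toy check, not part of the repository, of the dataset.py and evaluation.py helpers above; the ids, ratings, and feature values are made up:

import numpy as np
from dataset import build_user_item_matrix
from evaluation import predict, RMSE

# (user_id, item_id, rating) triples with zero-based ids,
# the same layout load_movielens_ratings() returns
ratings = np.array([[0, 0, 4],
                    [0, 2, 1],
                    [1, 1, 5],
                    [2, 0, 3]])

R = build_user_item_matrix(n_user=3, n_item=3, ratings=ratings)
print(R.toarray())          # dense 3 x 3 view, zeros where no rating exists

mean_rating_ = np.mean(ratings[:, 2])
user_features_ = 0.1 * np.ones((3, 2))
item_features_ = 0.1 * np.ones((3, 2))

preds = predict(ratings[:, :2], user_features_, item_features_, mean_rating_,
                max_rating=5, min_rating=1)
print(RMSE(preds, ratings[:, 2]))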
/main.py:
--------------------------------------------------------------------------------
import random
import time

from six.moves import xrange
import numpy as np
from numpy.random import RandomState
from numpy.linalg import inv

from dataset import load_movielens_ratings
from dataset import build_user_item_matrix
from ALS_optimize import ALS
from ALS_optimize_origin import ALS_origin
from evaluation import predict
from evaluation import RMSE
from compute_grad import compute_utility_grad
from compute_grad import compute_grad

#ratings_file = '/media/wangfuyi15/F/Adversarial machine learning/ml-20m/ratings.csv'
ratings_file = 'F:/Adversarial machine learning/movielens-master/code/ratings-ml.csv'
ratings = load_movielens_ratings(ratings_file)
rand_state = RandomState(0)

max_rating = max(ratings[:, 2])
min_rating = min(ratings[:, 2])
'''
parameters:
lamda_u: regularization parameter for the user features
lamda_v: regularization parameter for the item features
alpha: proportion of malicious users added to the data
B: number of items each malicious user rates
n_iters: number of ALS iterations
converge: stop when the RMSE changes by less than this between two iterations
train_pct: proportion of the data used for training
'''
lamda_u = 5e-2
lamda_v = 5e-2
alpha = 0.2
B = 25
n_iters = 10
n_feature = 8
seed = None
last_rmse = None
converge = 1e-5
mal_item = B
# split data into training & testing sets
train_pct = 0.9
rand_state.shuffle(ratings)
train_size = int(train_pct * ratings.shape[0])
train = ratings[:train_size]
validation = ratings[train_size:]

n_user = max(train[:, 0]) + 1
n_item = max(train[:, 1]) + 1
mal_user = int(alpha * n_user)

# add malicious users' data: each malicious user rates mal_item random items with +1 or -1
mal_ratings = []
for u in xrange(mal_user):
    mal_user_idx = u
    mal_item_idx = random.sample(range(n_item), mal_item)
    for i in xrange(mal_item):
        mal_movie_idx = mal_item_idx[i]
        mal_rating = 2 * (RandomState(seed).rand() > 0.5) - 1
        mal_ratings.append([mal_user_idx, mal_movie_idx, mal_rating])
mal_ratings = np.array(mal_ratings)

# initialize the matrices U, U~ and V
user_features_ = 0.1 * RandomState(seed).rand(n_user, n_feature)
mal_user_features_ = 0.1 * RandomState(seed).rand(mal_user, n_feature)
item_features_ = 0.1 * RandomState(seed).rand(n_item, n_feature)
mean_rating_ = np.mean(train.take(2, axis=1))
mal_mean_rating_ = np.mean(mal_ratings.take(2, axis=1))
user_features_origin_ = 0.1 * RandomState(seed).rand(n_user, n_feature)
item_features_origin_ = 0.1 * RandomState(seed).rand(n_item, n_feature)

# train the original (clean) model
def optimize_model_origin():
    print("Start training model without data poisoning attacks!")
    last_rmse = None
    for iteration in xrange(n_iters):
        t1 = time.time()
        ALS_origin(n_user, n_item, n_feature, train, mean_rating_, lamda_u, lamda_v, user_features_origin_, item_features_origin_)
        train_preds = predict(train.take([0, 1], axis=1), user_features_origin_, item_features_origin_, mean_rating_)
        train_rmse = RMSE(train_preds, train.take(2, axis=1))
        t2 = time.time()
        print("The %dth iteration \t time: %ds \t RMSE: %f " % (iteration + 1, t2 - t1, train_rmse))
        # stop when converged
        if last_rmse and abs(train_rmse - last_rmse) < converge:
            break
        else:
            last_rmse = train_rmse
    return last_rmse

# train the model with the attack data added
def optimize_model():
    print("Start training model with data poisoning attacks!")
    last_rmse = None
    for iteration in xrange(n_iters):
        t1 = time.time()
        ALS(n_user, n_item, n_feature, mal_user, train, mean_rating_, mal_mean_rating_, mal_ratings, lamda_u, lamda_v, \
            user_features_, mal_user_features_, item_features_)
        train_preds = predict(train.take([0, 1], axis=1), user_features_, item_features_, mean_rating_)
        train_rmse = RMSE(train_preds, train.take(2, axis=1))
        t2 = time.time()
        print("The %dth iteration \t time: %ds \t RMSE: %f " % (iteration + 1, t2 - t1, train_rmse))
        # stop when converged
        if last_rmse and abs(train_rmse - last_rmse) < converge:
            break
        else:
            last_rmse = train_rmse
    return last_rmse

# use the projected gradient ascent (PGA) algorithm to optimize the utility function
'''
m_iters: number of PGA iterations
s_t: step sizes
Lamda: the malicious ratings are projected onto [-Lamda, Lamda]
'''
m_iters = 10
s_t = 0.2 * np.ones([m_iters])
converge = 1e-5
Lamda = 1
last_rmse = None
#optimize_model_origin()
for t in xrange(m_iters):
    t1 = time.time()
    #optimize_model()
    grad_total = compute_grad(n_user, n_item, mal_user, mal_ratings, train, user_features_, mal_user_features_, \
                              item_features_, lamda_v, n_feature, user_features_origin_, item_features_origin_)
    mal_data = np.dot(mal_user_features_, item_features_.T)   # current malicious rating matrix M~
    temp = mal_data.copy()                                    # keep a copy to measure the change made this step
    mal_data += grad_total * s_t[t]
    # project back onto the feasible set [-Lamda, Lamda]
    mal_data[mal_data > Lamda] = Lamda
    mal_data[mal_data < -Lamda] = -Lamda
    rmse = RMSE(mal_data, temp)
    t2 = time.time()
    print("The %dth iteration \t time: %ds \t RMSE: %f " % (t + 1, t2 - t1, rmse))
    if last_rmse and abs(rmse - last_rmse) < converge:
        break
    else:
        last_rmse = rmse
--------------------------------------------------------------------------------
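Each pass of the PGA loop above implements the projected ascent step below (a reconstruction of what the code does, with \tilde{M} denoting mal_data, s_t the step size, and \Lambda the projection bound), stopping once the RMSE between consecutive iterates falls below converge:

\tilde{M} \leftarrow \Pi_{[-\Lambda,\, \Lambda]}\Big(\tilde{M} + s_t\, \nabla_{\tilde{M}} R_{hybrid}\Big)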