├── README.md
├── data
│   ├── .DS_Store
│   └── lastfm
│       ├── test.1
│       └── train.1
├── lrec.ipynb
├── lrec
│   ├── __init__.py
│   ├── evaluate
│   │   ├── __init__.py
│   │   ├── cy_ranking_metric.pyx
│   │   ├── eval_ranking_metric.py
│   │   ├── py_ranking_metric.py
│   │   └── ranking_metric.py
│   ├── parallel
│   │   ├── __init__.py
│   │   └── ipythonParallelLinear.py
│   ├── recommender
│   │   ├── LRec
│   │   │   ├── LRec.py
│   │   │   ├── __init__.py
│   │   │   └── base.py
│   │   ├── __init__.py
│   │   └── modelArgs.py
│   └── utils
│       ├── __init__.py
│       ├── data_utils
│       │   ├── __init__.py
│       │   ├── data.py
│       │   ├── data_cython_helpers.c
│       │   ├── data_cython_helpers.pyx
│       │   ├── data_helpers.py
│       │   └── lineParser.py
│       ├── general_utils.py
│       └── train_test_utils
│           └── random_split_generator.sh
├── requirements.txt
└── setup.py
/README.md:
--------------------------------------------------------------------------------
1 | # LRec
2 | 
3 | # Dependencies
4 | 
5 | + envoy
6 | + progressbar
7 | + sklearn 0.17
8 | + Cython
9 | 
10 | # Installation
11 | 
12 | python setup.py install
13 | 
14 | # Data Input
15 | 
16 | The data format is a tab-separated user-item-rating triple:
17 | `<user>\t<item>\t<rating>`
18 | For one-class collaborative filtering, the rating is 1.
19 | A different delimiter can also be used by defining a custom parser or by passing delim to UserItemRatingParser.
20 | 
21 | # Running the code
22 | 
23 | Please refer to lrec.ipynb for a step-by-step guide to running the code. The sample dataset (lastFM) is included in the data folder.
24 | 
25 | 
26 | 
27 | 
--------------------------------------------------------------------------------
/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mesuvash/LRec/077ebacf2cea454e3d30ea1588a29851f837c8eb/data/.DS_Store
--------------------------------------------------------------------------------
/lrec.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "Code Dependencies\n",
8 |     " - envoy\n",
9 |     " - progressbar\n",
10 |     " - sklearn 0.17\n",
11 |     " - Cython\n",
12 |     " "
13 |    ]
14 |   },
15 |   {
16 |    "cell_type": "code",
17 |    "execution_count": 58,
18 |    "metadata": {
19 |     "collapsed": false
20 |    },
21 |    "outputs": [],
22 |    "source": [
23 |     "from lrec.utils.data_utils.data import Data, loadDataset\n",
24 |     "from lrec.utils.data_utils.lineParser import UserItemRatingParser"
25 |    ]
26 |   },
27 |   {
28 |    "cell_type": "code",
29 |    "execution_count": null,
30 |    "metadata": {
31 |     "collapsed": true
32 |    },
33 |    "outputs": [],
34 |    "source": [
35 |     "train_path = TRAIN_DATA_PATH\n",
36 |     "test_path = TEST_DATA_PATH"
37 |    ]
38 |   },
39 |   {
40 |    "cell_type": "markdown",
41 |    "metadata": {},
42 |    "source": [
43 |     "<
Load Data
" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "#### Input data format:\n", 51 | " \\t\\t\n", 52 | " Note: \"rating\" is 1 for one class collaborative filtering" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 84, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "#Define the dataset parser\n", 64 | "parser = UserItemRatingParser(delim=\"\\t\")\n", 65 | "d = Data()\n", 66 | "d.import_data(train_path, parser)\n", 67 | "train = d.R\n", 68 | "test, _ = loadDataset(test_path, d.users, d.items, parser)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "
Learn LRec Model
" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "
Learn model in parallel
\n", 83 | "Start ipython parallel engines.\n", 84 | "\n", 85 | "For eg: start 4 ipython engines\n", 86 | "\n", 87 | "ipcluster start -n 4" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "Loss Functions\n", 95 | " - squared : squared loss \n", 96 | " - logistic: logistic loss" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 82, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "from lrec.recommender.LRec.LRec import LRec\n", 108 | "from lrec.recommender.modelArgs import LRecArgs\n", 109 | "from lrec.evaluate.eval_ranking_metric import evalMetricsParallelMiniBatch\n", 110 | "\n", 111 | "#number of cpu to use\n", 112 | "n = 4\n", 113 | "#set loss function (squared or logisitc)\n", 114 | "loss = \"logistic\"\n", 115 | "#set regularization strength\n", 116 | "c = 0.0001\n", 117 | "arg = LRecArgs(c, loss)\n", 118 | "model = LRec(arg)\n", 119 | "indices, sim = model.fit_parallel(train,num_procs=n)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "
Evaluate the model
" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 83, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "from lrec.evaluate.eval_ranking_metric import evalMetricsParallelMiniBatch\n", 138 | "evalMetricsParallelMiniBatch(train, train, test, model, mapk=100, ks=[3, 5, 10, 20])" 139 | ] 140 | } 141 | ], 142 | "metadata": { 143 | "anaconda-cloud": {}, 144 | "kernelspec": { 145 | "display_name": "Python [default]", 146 | "language": "python", 147 | "name": "python2" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 2 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": "ipython2", 159 | "version": "2.7.12" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 0 164 | } 165 | -------------------------------------------------------------------------------- /lrec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesuvash/LRec/077ebacf2cea454e3d30ea1588a29851f837c8eb/lrec/__init__.py -------------------------------------------------------------------------------- /lrec/evaluate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesuvash/LRec/077ebacf2cea454e3d30ea1588a29851f837c8eb/lrec/evaluate/__init__.py -------------------------------------------------------------------------------- /lrec/evaluate/cy_ranking_metric.pyx: -------------------------------------------------------------------------------- 1 | cpdef get_intersection(int[:] purchased, int[:] recommended, int k): 2 | cdef: 3 | int i, m 4 | double hit 5 | set recos 6 | recos = set() 7 | for i in range(k): 8 | recos.add(recommended[i]) 9 | m = len(purchased) 10 | hit = 0.0 11 | for i in range(m): 12 | if purchased[i] in recos: 13 | hit += 1 14 | return hit 15 | 16 | cpdef double recall(int[:] purchased, int[:] recommended, int k): 17 | cdef: 18 | double hit 19 | int m 20 | m = len(purchased) 21 | hit = get_intersection(purchased, recommended, k) 22 | return hit / m 23 | 24 | cpdef double prec(int[:] purchased, int[:] recommended, int k): 25 | cdef: 26 | double hit 27 | hit = get_intersection(purchased, recommended, k) 28 | return hit / k 29 | 30 | cpdef double apk(int[:] purchased, int[:] recommended, int k): 31 | cdef: 32 | double score 33 | int i, m, item 34 | set actual_set 35 | actual_set = set(purchased) 36 | score = 0.0 37 | m = len(purchased) 38 | for i in range(1, k + 1): 39 | item = recommended[i-1] 40 | if item in actual_set: 41 | score += prec(purchased, recommended, i) 42 | return score / min(m, k) 43 | -------------------------------------------------------------------------------- /lrec/evaluate/eval_ranking_metric.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from lrec.evaluate.ranking_metric import * 3 | import multiprocessing as mp 4 | import time 5 | import scipy.sparse 6 | 7 | 8 | def getTestUsers(train, test, cond=(1, float("inf"))): 9 | low = cond[0] 10 | high = cond[1] 11 | test_users = np.ravel(np.where(np.ravel(test.sum(axis=1)) > 0)[0]) 12 | train_user_stat = np.ravel(train.sum(axis=1)) 13 | logical_cond = np.logical_and( 14 | train_user_stat >= low, train_user_stat <= high) 15 | train_users = np.ravel(np.where(logical_cond)[0]) 16 | train_users_set = 
set(train_users.tolist()) 17 | final_test_users = [] 18 | for user in test_users: 19 | if user in train_users_set: 20 | final_test_users.append(user) 21 | return final_test_users 22 | 23 | 24 | def getTopK(scores, k): 25 | topk_items = np.argsort(scores)[::-1] 26 | topk_items = topk_items.astype(np.int32) 27 | if k > 0: 28 | return topk_items[:k] 29 | else: 30 | return topk_items 31 | 32 | 33 | def generateBatches(lst, batch_size): 34 | batch = [] 35 | for i in range(0, len(lst), batch_size): 36 | end = min(i + batch_size, len(lst)) 37 | batch.append(lst[i: end]) 38 | return batch 39 | 40 | 41 | def getUserPurchased(mat, user): 42 | return mat.indices[range(mat.indptr[user], mat.indptr[user + 1])] 43 | 44 | 45 | def evalMetrics(train, test, recos, mapk=100, ks=[3, 5, 10, 20]): 46 | # Input: 47 | # train: training data 48 | # test: test csr data matrix 49 | # recos: recommendation matrix (csr) 50 | # mapk : cutoff for map evaluation 51 | # ks : @k values for precision and recall 52 | # Output: 53 | # (map@mapk , precision@ks, recall@ks) 54 | 55 | # IMPORTANT: Use this method if you can fit whole test user recommendation 56 | # in the memory 57 | testusers = getTestUsers(train, test) 58 | 59 | maps = [] 60 | precs = [] 61 | recalls = [] 62 | for user in testusers: 63 | if scipy.sparse.issparse(recos): 64 | reco_score = recos.getrow(user).todense() 65 | else: 66 | reco_score = recos[user, :] 67 | 68 | reco_score = np.ravel(reco_score) 69 | # reco_score = np.ravel(reco_score) 70 | history = train.getrow(user) 71 | history_index = history.indices[ 72 | range(history.indptr[0], history.indptr[1])] 73 | reco_score[history_index] = float("-inf") 74 | recommended = getTopK(reco_score, mapk) 75 | user_purchased = getUserPurchased(test, user) 76 | 77 | _apk = apk(user_purchased, recommended, mapk) 78 | _recalls = [] 79 | _precs = [] 80 | for k in ks: 81 | _prec = prec(user_purchased, recommended, k) 82 | _rec = recall(user_purchased, recommended, k) 83 | _recalls.append(_rec) 84 | _precs.append(_prec) 85 | recalls.append(_recalls) 86 | precs.append(_precs) 87 | maps.append(_apk) 88 | return np.mean(maps), np.mean(np.array(precs), axis=0), np.mean(np.array(recalls), axis=0), len(testusers) 89 | 90 | 91 | def evalMetricsParallelMiniBatch(train_input, train_target, test, model, 92 | mapk=100, ks=[3, 5, 10, 20], batch_size=1000, nprocs=1): 93 | # Input: 94 | # train: training data 95 | # test: test csr data matrix 96 | # recos: recommendation matrix (csr) 97 | # mapk : cutoff for map evaluation 98 | # ks : @k values for precision and recall 99 | # Output: 100 | # (map@mapk , precision@ks, recall@ks) 101 | 102 | # IMPORTANT: Use this method if you can fit whole test user recommendation 103 | # in the memory 104 | 105 | class resultCollector: 106 | 107 | def __init__(self): 108 | self.results = [] 109 | self.running = 0 110 | 111 | def collect(self, x): 112 | self.results.append(x) 113 | self.running -= 1 114 | 115 | def getResult(self): 116 | maps = 0 117 | precs = [] 118 | recalls = [] 119 | total_users = 0.0 120 | for result in self.results: 121 | _map, _prec, _rec, nuser = result 122 | maps += _map * nuser 123 | precs.append(_prec * nuser) 124 | recalls.append(_rec * nuser) 125 | total_users += nuser 126 | _map_score = np.array(maps).sum() / total_users 127 | prec_score = np.array(precs).sum(axis=0) / total_users 128 | recall_score = np.array(recalls).sum(axis=0) / total_users 129 | 130 | return _map_score, prec_score, recall_score, int(total_users) 131 | 132 | testusers = getTestUsers(train_target, 
test)
133 |     testusers_batch = generateBatches(testusers, batch_size)
134 |     collector = resultCollector()
135 |     nprocs = min(nprocs, len(testusers_batch))
136 |     pool = mp.Pool(nprocs)
137 |     for batch_users in testusers_batch:
138 |         reco_score = model.recommend(batch_users, train_input)
139 |         train_batch = train_target[batch_users, :]
140 |         test_batch = test[batch_users, :]
141 |         collector.running += 1
142 |         arg = (train_batch, test_batch, reco_score, mapk, ks)
143 |         pool.apply_async(evalMetrics, args=arg, callback=collector.collect)
144 |         while(collector.running >= nprocs):
145 |             time.sleep(1)
146 |     pool.close()
147 |     pool.join()
148 |     return collector.getResult()
149 | 
150 | 
151 | def evalMetricsIterative(train_input, train_target, test, model, mapk=100, ks=[3, 5, 10, 20], cond=(1, float("inf"))):
152 |     # Input:
153 |     #     train_input: training input for model
154 |     #     train_target : user-item purchase data (csr matrix)
155 |     #     test: test csr data matrix
156 |     #     model
157 |     #     mapk : cutoff for map evaluation
158 |     #     ks : @k values for precision and recall
159 |     # Output:
160 |     #     (map@mapk , precision@ks, recall@ks)
161 | 
162 |     # IMPORTANT: Use this method to evaluate model on the fly
163 | 
164 |     testusers = getTestUsers(train_target, test, cond)
165 |     maps = []
166 |     precs = []
167 |     recalls = []
168 |     for user in testusers:
169 |         reco_score = np.ravel(model.recommend(user, train_input))
170 |         history = train_target.getrow(user)
171 |         history_index = history.indices[
172 |             range(history.indptr[0], history.indptr[1])]
173 |         reco_score[history_index] = float("-inf")
174 |         recommended = getTopK(reco_score, mapk)
175 |         user_purchased = getUserPurchased(test, user)
176 | 
177 |         _apk = apk(user_purchased, recommended, mapk)
178 |         _recalls = []
179 |         _precs = []
180 |         for k in ks:
181 |             _prec = prec(user_purchased, recommended, k)
182 |             _rec = recall(user_purchased, recommended, k)
183 |             _recalls.append(_rec)
184 |             _precs.append(_prec)
185 |         recalls.append(_recalls)
186 |         precs.append(_precs)
187 |         maps.append(_apk)
188 |     return np.mean(maps), np.mean(np.array(precs), axis=0), np.mean(np.array(recalls), axis=0), len(testusers)
189 | 
--------------------------------------------------------------------------------
/lrec/evaluate/py_ranking_metric.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | def recall(purchased, recommended, k):
4 |     return len(np.intersect1d(purchased, recommended[:k])) / float(len(purchased))
5 |     # return len(purchased.intersection(recommended[:k])) /
6 |     # float(len(purchased))
7 | 
8 | 
9 | def prec(purchased, recommended, k):
10 |     return len(np.intersect1d(purchased, recommended[:k])) / float(k)
11 |     # return len(purchased.intersection(recommended[:k])) / float(k)
12 | 
13 | 
14 | def apk(actual, predicted, k=10):
15 |     """
16 |     Computes the average precision at k.
17 |     This function computes the average precision at k between two lists of
18 |     items.
19 | Parameters 20 | ---------- 21 | actual : list 22 | A list of elements that are to be predicted (order doesn't matter) 23 | predicted : list 24 | A list of predicted elements (order does matter) 25 | k : int, optional 26 | The maximum number of predicted elements 27 | Returns 28 | ------- 29 | score : double 30 | The average precision at k over the input lists 31 | """ 32 | if len(predicted) > k: 33 | predicted = predicted[:k] 34 | 35 | score = 0.0 36 | num_hits = 0.0 37 | 38 | for i, p in enumerate(predicted): 39 | if p in actual and p not in predicted[:i]: 40 | num_hits += 1.0 41 | score += num_hits / (i + 1.0) 42 | 43 | if not actual.any(): 44 | return 1.0 45 | 46 | return score / min(len(actual), k) 47 | 48 | 49 | def mapk(actual, predicted, k=10): 50 | """ 51 | Computes the mean average precision at k. 52 | This function computes the mean average prescision at k between two lists 53 | of lists of items. 54 | Parameters 55 | ---------- 56 | actual : list 57 | A list of lists of elements that are to be predicted 58 | (order doesn't matter in the lists) 59 | predicted : list 60 | A list of lists of predicted elements 61 | (order matters in the lists) 62 | k : int, optional 63 | The maximum number of predicted elements 64 | Returns 65 | ------- 66 | score : double 67 | The mean average precision at k over the input lists 68 | """ 69 | return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)]) -------------------------------------------------------------------------------- /lrec/evaluate/ranking_metric.py: -------------------------------------------------------------------------------- 1 | try: 2 | from lrec.evaluate.cy_ranking_metric import * 3 | except: 4 | from lrec.evaluate.py_ranking_metric import * 5 | -------------------------------------------------------------------------------- /lrec/parallel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesuvash/LRec/077ebacf2cea454e3d30ea1588a29851f837c8eb/lrec/parallel/__init__.py -------------------------------------------------------------------------------- /lrec/parallel/ipythonParallelLinear.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse 3 | from copy import deepcopy 4 | import time 5 | import multiprocessing as mp 6 | 7 | 8 | def generateBatches(lst, batch_size): 9 | batch = [] 10 | for i in range(0, len(lst), batch_size): 11 | end = min(i + batch_size, len(lst)) 12 | batch.append(lst[i: end]) 13 | return batch 14 | 15 | 16 | def parallelRunnerHelper(model, train_input, batch): 17 | import numpy as np 18 | import scipy.sparse 19 | return model.fit(train_input, batch) 20 | 21 | 22 | def argsort(seq): 23 | return [x for x, y in sorted(enumerate(seq), key=lambda x: x[1])] 24 | 25 | 26 | class ResultCollector: 27 | 28 | def __init__(self): 29 | self.results = [] 30 | self.running = 0 31 | 32 | def collect(self, x): 33 | self.results.append(x) 34 | self.running -= 1 35 | 36 | def getResult(self): 37 | indices = [] 38 | sims = [] 39 | sorted_indices = argsort(map(lambda x: x[0][0], self.results)) 40 | for index in sorted_indices: 41 | _indices, _sims = self.results[index] 42 | indices.extend(_indices) 43 | sims.append(_sims) 44 | return indices, np.hstack(sims).T 45 | # scipy.sparse.hstack(sims, format="csr").T 46 | 47 | 48 | class ParallelRunner(object): 49 | 50 | def __init__(self, model, nprocs=5, batch_size=1000): 51 | super(ParallelRunner, self).__init__() 52 | 
self.model = model
53 |         self.nprocs = nprocs
54 |         self.batch_size = batch_size
55 | 
56 |     def fit(self, train_input, target_indices=None):
57 |         nprocs = self.nprocs
58 |         if target_indices is None:
59 |             num = train_input.shape[0]
60 |             batch_indices = generateBatches(range(num), self.batch_size)
61 |         else:
62 |             batch_indices = generateBatches(target_indices, self.batch_size)
63 |         print "Model Learning Started"
64 |         collector = ResultCollector()
65 |         nprocs = min(nprocs, len(batch_indices))
66 |         pool = mp.Pool(nprocs)
67 |         for batch_users in batch_indices:
68 |             args = (deepcopy(self.model), train_input,
69 |                     batch_users)
70 |             pool.apply_async(parallelRunnerHelper, args=args,
71 |                              callback=collector.collect)
72 |             while(collector.running >= nprocs):
73 |                 time.sleep(1)
74 |         pool.close()
75 |         pool.join()
76 |         indices, sim = collector.getResult()
77 |         print "Model Learning Ended"
78 |         self.model.sim = sim
79 |         return indices, sim
80 | 
--------------------------------------------------------------------------------
/lrec/recommender/LRec/LRec.py:
--------------------------------------------------------------------------------
1 | from sklearn.linear_model import LogisticRegression, Ridge
2 | import scipy.sparse
3 | import numpy as np
4 | from lrec.recommender.LRec.base import BaseLinear
5 | from lrec.parallel.ipythonParallelLinear import ParallelRunner
6 | 
7 | 
8 | class LRec(BaseLinear):
9 | 
10 |     def __init__(self, arg):
11 |         super(LRec, self).__init__()
12 |         self.arg = arg
13 |         self.target = None
14 |         self.__initargs()
15 | 
16 |     def __initargs(self):
17 |         self.l2 = self.arg.l2
18 |         self.loss = self.arg.loss
19 | 
20 |     def __getLearner(self):
21 |         if self.arg.loss == "squared":
22 |             return Ridge(alpha=self.l2, fit_intercept=True)
23 |         elif self.arg.loss == "logistic":
24 |             return LogisticRegression(C=self.l2, class_weight="balanced", fit_intercept=True)
25 |         else:
26 |             raise NotImplementedError(
27 |                 "Model %s not implemented" % (self.arg.loss))
28 | 
29 |     def fit(self, train_input, target_indices=None):
30 |         import numpy as np
31 |         import scipy.sparse
32 |         models = []
33 |         train_target = train_input.T
34 |         train_input = train_input.T
35 |         if target_indices is not None:
36 |             train_target = train_target[:, target_indices]
37 |         else:
38 |             target_indices = range(train_target.shape[1])
39 |         # for fast column access
40 |         train_target = train_target.tocsc()
41 |         for i, index in enumerate(target_indices):
42 |             learner = self.__getLearner()
43 |             y = np.ravel(train_target.getcol(i).todense())
44 |             learner.fit(train_input, y)
45 |             models.append(learner.coef_)
46 |         self.sim = np.vstack(models).T
47 |         return target_indices, self.sim
48 | 
49 |     def fit_parallel(self, train_input,
50 |                      target_indices=None, num_procs=4,
51 |                      batch_size=1000):
52 |         if self.arg.loss == "squared_analytical":
53 |             sim = (np.linalg.inv((train_input * train_input.T + self.l2 *
54 |                    scipy.sparse.identity(train_input.shape[0])).todense()) * train_input * train_input.T)
55 |             indices = range(train_input.shape[0])
56 |             self.sim = sim
57 |         else:
58 |             prunner = ParallelRunner(self, num_procs, batch_size)
59 |             indices, sim = prunner.fit(train_input)
60 |             self.sim = sim
61 |         return indices, self.sim
62 | 
63 |     def recommend_all(self, train_input):
64 |         score = self.sim.T * train_input  # W^T R, same product as recommend() over all users
65 |         return score
66 | 
67 |     def recommend(self, users, train_input):
68 |         reco = (self.sim[:, users].T * train_input)
69 |         if scipy.sparse.issparse(reco):
70 |             return reco.todense()
71 |         else:
72 |             return reco
73 | 
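LRec.fit above reduces recommendation to one supervised problem per user: the design matrix is the transposed interaction matrix (items as rows, users as columns), the target for user u is u's own column, and the stacked coefficient vectors form a user-user weight matrix whose product with the original interaction matrix gives the recommendation scores. The stand-alone sketch below replays that loop on a tiny dense matrix with scikit-learn; the toy matrix and the C value are made up for illustration and are not part of the repository.

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    # Toy one-class interaction matrix: rows = users, columns = items, 1 = observed interaction.
    R = np.array([[1, 0, 1, 0, 1],
                  [0, 1, 1, 0, 0],
                  [1, 1, 0, 1, 0],
                  [0, 0, 1, 1, 1]], dtype=float)

    X = R.T                          # items x users: each item is described by who consumed it
    coefs = []
    for u in range(R.shape[0]):
        y = R.T[:, u]                # labels: did user u consume each item?
        clf = LogisticRegression(C=1.0, class_weight="balanced", fit_intercept=True)
        clf.fit(X, y)
        coefs.append(clf.coef_)
    W = np.vstack(coefs).T           # users x users weight matrix, analogous to LRec's sim

    scores = W.T.dot(R)              # users x items scores, as in recommend_all / recommend
    print(scores.round(2))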
-------------------------------------------------------------------------------- /lrec/recommender/LRec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesuvash/LRec/077ebacf2cea454e3d30ea1588a29851f837c8eb/lrec/recommender/LRec/__init__.py -------------------------------------------------------------------------------- /lrec/recommender/LRec/base.py: -------------------------------------------------------------------------------- 1 | import scipy.sparse 2 | 3 | 4 | class BaseLinear(object): 5 | """docstring for BaseLinear""" 6 | 7 | def __init__(self): 8 | super(BaseLinear, self).__init__() 9 | 10 | def get_sim(self): 11 | return self.sim 12 | 13 | def recommend_all(self, train_input): 14 | return train_input * self.sim 15 | 16 | def recommend(self, user, train_input): 17 | result = train_input[user, :] * self.sim 18 | if scipy.sparse.issparse(result): 19 | result = result.todense() 20 | return result 21 | -------------------------------------------------------------------------------- /lrec/recommender/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesuvash/LRec/077ebacf2cea454e3d30ea1588a29851f837c8eb/lrec/recommender/__init__.py -------------------------------------------------------------------------------- /lrec/recommender/modelArgs.py: -------------------------------------------------------------------------------- 1 | 2 | class args(object): 3 | """docstring for args""" 4 | 5 | def __init__(self): 6 | super(args, self).__init__() 7 | 8 | def __str__(self): 9 | fields = [] 10 | for key, value in self.__dict__.items(): 11 | fields.append("%s : %s" % (str(key), str(value))) 12 | return "\n".join(fields) 13 | 14 | 15 | class LRecArgs(args): 16 | 17 | def __init__(self, l2, loss="logistic"): 18 | self.l2 = l2 19 | self.loss = loss 20 | -------------------------------------------------------------------------------- /lrec/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesuvash/LRec/077ebacf2cea454e3d30ea1588a29851f837c8eb/lrec/utils/__init__.py -------------------------------------------------------------------------------- /lrec/utils/data_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesuvash/LRec/077ebacf2cea454e3d30ea1588a29851f837c8eb/lrec/utils/data_utils/__init__.py -------------------------------------------------------------------------------- /lrec/utils/data_utils/data.py: -------------------------------------------------------------------------------- 1 | import scipy.sparse 2 | import numpy as np 3 | import envoy 4 | import progressbar 5 | import sys 6 | from lrec.utils.data_utils.data_helpers import coo_tocsr 7 | 8 | 9 | class Data(object): 10 | 11 | def __init__(self): 12 | self.users = {} 13 | self.items = {} 14 | self.nusers = 0 15 | self.nitems = 0 16 | self.include_time = False 17 | 18 | def update_user_item(self, user, item): 19 | if user not in self.users: 20 | self.users[user] = self.nusers 21 | self.nusers += 1 22 | if item not in self.items: 23 | self.items[item] = self.nitems 24 | self.nitems += 1 25 | 26 | def import_data(self, filename, parser, shape=None, num_headers=0, debug=False): 27 | r = envoy.run('wc -l {}'.format(filename)) 28 | num_lines = int(r.std_out.strip().partition(' ')[0]) 29 | bar = 
progressbar.ProgressBar(maxval=num_lines, widgets=["Loading data: ", 30 | progressbar.Bar( 31 | '=', '[', ']'), 32 | ' ', progressbar.Percentage(), 33 | 34 | ' ', progressbar.ETA()]).start() 35 | I, J, V = [], [], [] 36 | with open(filename) as f: 37 | for i in range(num_headers): 38 | f.readline() 39 | for i, line in enumerate(f): 40 | if (i % 1000) == 0: 41 | bar.update(i % bar.maxval) 42 | try: 43 | userid, itemid, rating = parser.parse(line) 44 | self.update_user_item(userid, itemid) 45 | uid = self.users[userid] 46 | iid = self.items[itemid] 47 | I.append(uid) 48 | J.append(iid) 49 | V.append(float(rating)) 50 | except: 51 | if debug: 52 | print "Ignoring Input: ", line, 53 | continue 54 | bar.finish() 55 | if shape is not None: 56 | _shape = (self.nusers if shape[0] is None else shape[0], 57 | self.nitems if shape[1] is None else shape[1]) 58 | R = scipy.sparse.coo_matrix( 59 | (V, (I, J)), shape=_shape) 60 | else: 61 | R = scipy.sparse.coo_matrix( 62 | (V, (I, J)), shape=(self.nusers, self.nitems)) 63 | self.R = coo_tocsr(R) 64 | sys.stdout.flush() 65 | return self.R 66 | 67 | def filter(self, n_users=5, n_items=5, iscount=False): 68 | while True: 69 | if iscount: 70 | Rcp = self.R.copy() 71 | Rcp.data[:] = 1.0 72 | else: 73 | Rcp = self.R 74 | user_stats = Rcp.sum(axis=1) 75 | item_stats = Rcp.sum(axis=0) 76 | filter_user = np.ravel((user_stats < n_users) * 1) 77 | filter_user_cum = np.cumsum(filter_user) 78 | filter_item = np.ravel((item_stats < n_items) * 1) 79 | filter_item_cum = np.cumsum(filter_item) 80 | if (filter_user_cum[-1] == 0) and (filter_item_cum[-1] == 0): 81 | break 82 | 83 | m, n = self.R.shape 84 | 85 | # filter User item 86 | I, J, V = [], [], [] 87 | data, ri, rptr = self.R.data, self.R.indices, self.R.indptr 88 | for i in xrange(m): 89 | indices = range(rptr[i], rptr[i + 1]) 90 | items = ri[indices] 91 | ratings = data[indices] 92 | for j, item in enumerate(items): 93 | if (filter_user[i] == 0) and (filter_item[item] == 0): 94 | I.append(i - filter_user_cum[i]) 95 | J.append(item - filter_item_cum[item]) 96 | V.append(ratings[j]) 97 | R = scipy.sparse.coo_matrix((V, (I, J)), 98 | shape=(m - filter_user_cum[-1], 99 | n - filter_item_cum[-1])) 100 | self.R = R.tocsr() 101 | # self.R = coo_tocsr(R) 102 | 103 | inv_users = {v: k for k, v in self.users.items()} 104 | inv_items = {v: k for k, v in self.items.items()} 105 | 106 | for i in range(m): 107 | if filter_user[i] == 1: 108 | del self.users[inv_users[i]] 109 | else: 110 | self.users[inv_users[i]] -= filter_user_cum[i] 111 | 112 | for i in range(n): 113 | if filter_item[i] == 1: 114 | del self.items[inv_items[i]] 115 | else: 116 | self.items[inv_items[i]] -= filter_item_cum[i] 117 | 118 | 119 | def loadDataset(filename, usermap, itemmap, parser, shape=None): 120 | r = envoy.run('wc -l {}'.format(filename)) 121 | num_lines = int(r.std_out.strip().partition(' ')[0]) 122 | bar = progressbar.ProgressBar(maxval=num_lines, widgets=["Loading data: ", 123 | progressbar.Bar( 124 | '=', '[', ']'), 125 | ' ', progressbar.Percentage(), 126 | 127 | ' ', progressbar.ETA()]).start() 128 | I, J, V = [], [], [] 129 | cold = [] 130 | with open(filename) as f: 131 | for i, line in enumerate(f): 132 | if (i % 1000) == 0: 133 | bar.update(i % bar.maxval) 134 | userid, itemid, rating = parser.parse(line) 135 | if userid not in usermap or itemid not in itemmap: 136 | cold.append((userid, itemid, rating)) 137 | continue 138 | uid = usermap[userid] 139 | iid = itemmap[itemid] 140 | I.append(uid) 141 | J.append(iid) 142 | 
V.append(float(rating)) 143 | bar.finish() 144 | if shape is not None: 145 | R = scipy.sparse.coo_matrix((V, (I, J)), shape=shape) 146 | else: 147 | R = scipy.sparse.coo_matrix( 148 | (V, (I, J)), shape=(len(usermap), len(itemmap))) 149 | R = coo_tocsr(R) 150 | 151 | return R, cold 152 | 153 | 154 | def loadSideInfo(filename, targetmap, parser, shape=None): 155 | r = envoy.run('wc -l {}'.format(filename)) 156 | num_lines = int(r.std_out.strip().partition(' ')[0]) 157 | bar = progressbar.ProgressBar(maxval=num_lines, widgets=["Loading data: ", 158 | progressbar.Bar( 159 | '=', '[', ']'), 160 | ' ', progressbar.Percentage(), 161 | 162 | ' ', progressbar.ETA()]).start() 163 | I, J, V = [], [], [] 164 | cold = [] 165 | counter = 0 166 | feature_map = {} 167 | with open(filename) as f: 168 | for i, line in enumerate(f): 169 | if (i % 1000) == 0: 170 | bar.update(i % bar.maxval) 171 | keyid, featureid = parser.parse(line) 172 | if keyid not in targetmap: 173 | continue 174 | if featureid not in feature_map: 175 | feature_map[featureid] = counter 176 | counter += 1 177 | kid = targetmap[keyid] 178 | fid = feature_map[featureid] 179 | I.append(kid) 180 | J.append(fid) 181 | V.append(1.0) 182 | bar.finish() 183 | if shape is not None: 184 | R = scipy.sparse.coo_matrix((V, (I, J)), shape=shape) 185 | else: 186 | R = scipy.sparse.coo_matrix( 187 | (V, (I, J)), shape=(len(targetmap), len(feature_map))) 188 | R = coo_tocsr(R) 189 | 190 | return R, feature_map 191 | -------------------------------------------------------------------------------- /lrec/utils/data_utils/data_cython_helpers.pyx: -------------------------------------------------------------------------------- 1 | cpdef csr_max_duplicates( 2 | int n_row, 3 | int n_col, 4 | int [:] rptr, 5 | int[:] rindices, 6 | double [:] data): 7 | 8 | cdef: 9 | int nnz, row_end, jj, j, i 10 | double x 11 | nnz = 0 12 | row_end = 0 13 | for i in range(n_row): 14 | jj = row_end 15 | row_end = rptr[i+1] 16 | while( jj < row_end ): 17 | j = rindices[jj] 18 | x = data[jj] 19 | jj += 1 20 | while( (jj < row_end) and (rindices[jj] == j)): 21 | if x < data[jj]: 22 | x = data[jj] 23 | jj += 1 24 | rindices[nnz] = j 25 | data[nnz] = x 26 | nnz += 1 27 | rptr[i+1] = nnz 28 | 29 | cpdef transformDayFromLastPurchase(int[:]rptr, int[:] ri, 30 | double[:] data, double[:] latest_purchased_date, 31 | int nrows): 32 | 33 | cdef: 34 | int i, j 35 | list indices 36 | double row_max 37 | for i in xrange(nrows): 38 | row_max = latest_purchased_date[i] 39 | for j in range(rptr[i], rptr[i+1]): 40 | data[j] = row_max - data[j] + 1 41 | -------------------------------------------------------------------------------- /lrec/utils/data_utils/data_helpers.py: -------------------------------------------------------------------------------- 1 | from scipy.sparse._sparsetools import coo_tocsr as scipy_coo_tocsr 2 | from scipy.sparse.sputils import get_index_dtype, upcast 3 | import scipy.sparse 4 | from lrec.utils.data_utils.data_cython_helpers import csr_max_duplicates 5 | import numpy as np 6 | 7 | def coo_tocsr(coo_mat): 8 | M, N = coo_mat.shape 9 | idx_dtype = get_index_dtype((coo_mat.row, coo_mat.col), 10 | maxval=max(coo_mat.nnz, N)) 11 | indptr = np.empty(M + 1, dtype=idx_dtype) 12 | indices = np.empty(coo_mat.nnz, dtype=idx_dtype) 13 | data = np.empty(coo_mat.nnz, dtype=upcast(coo_mat.dtype)) 14 | 15 | scipy_coo_tocsr(M, N, coo_mat.nnz, 16 | coo_mat.row.astype(idx_dtype), 17 | coo_mat.col.astype(idx_dtype), 18 | coo_mat.data, 19 | indptr, indices, data) 20 | A = 
scipy.sparse.csr_matrix((data, indices, indptr), shape=coo_mat.shape) 21 | A.sort_indices() 22 | csr_max_duplicates(M, N, A.indptr, A.indices, A.data) 23 | A.prune() 24 | A.has_canonical_format = True 25 | return A 26 | 27 | 28 | def transformDayFromLastPurchase(X): 29 | offset = 1 30 | latest_purchased_date = np.ravel(X.max(axis=1).todense()) 31 | data, rptr, ri = X.data, X.indptr, X.indices 32 | result = np.array([]) 33 | for i in xrange(X.shape[0]): 34 | indices = range(rptr[i], rptr[i + 1]) 35 | result = np.append(result, latest_purchased_date[ 36 | i] - data[indices] + offset) 37 | X.data = result 38 | -------------------------------------------------------------------------------- /lrec/utils/data_utils/lineParser.py: -------------------------------------------------------------------------------- 1 | class UserItemRatingParser(object): 2 | """docstring for userItemRatingParser""" 3 | 4 | def __init__(self, delim="\t"): 5 | super(UserItemRatingParser, self).__init__() 6 | self.delim = delim 7 | 8 | def parse(self, line): 9 | user, item, rating = line.strip().split(self.delim)[:3] 10 | return (user, item, rating) 11 | 12 | -------------------------------------------------------------------------------- /lrec/utils/general_utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | def force_print(x): 4 | sys.stdout.write(x+"\n") 5 | sys.stdout.flush() 6 | -------------------------------------------------------------------------------- /lrec/utils/train_test_utils/random_split_generator.sh: -------------------------------------------------------------------------------- 1 | if [ $# -ne 4 ]; then 2 | echo "Random train-test generator" 3 | echo "Usage: 'foldGenerator.sh inputfile no_of_folds percentage_split outputfolder' " 4 | exit 5 | fi 6 | 7 | for(( i=1; i<=$2; i++ )) 8 | do 9 | shuf $1 > temp 10 | split -l $(expr $(cat temp | wc -l) \* $3 / 100) temp 11 | if [ ! -d "$4" ]; then 12 | echo "Creating folder: $4" 13 | mkdir $4 14 | fi 15 | mv xaa "$4/train.$i" 16 | mv xab "$4/test.$i" 17 | rm temp 18 | done 19 | 20 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Install by running pip install -r requirements.txt 2 | envoy 3 | progressbar33 4 | sklearn 5 | Cython 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # import os 4 | from setuptools import find_packages 5 | 6 | from distutils.core import setup 7 | from distutils.extension import Extension 8 | from Cython.Build import cythonize 9 | 10 | 11 | def read(fname): 12 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 13 | 14 | ext_modules = [] 15 | ext_modules += [ 16 | Extension("lrec.evaluate.cy_ranking_metric", [ 17 | "lrec/evaluate/cy_ranking_metric.pyx"]), 18 | Extension("lrec.utils.data_utils.data_cython_helpers", [ 19 | "lrec/utils/data_utils/data_cython_helpers.pyx"]), 20 | ] 21 | 22 | setup( 23 | name="LRec", 24 | version="0.1", 25 | author="Suvash Sedhain", 26 | author_email="mesuvash@gmail.com", 27 | packages=find_packages(), 28 | include_package_data=True, 29 | zip_safe=False, 30 | package_dir={'': '.'}, 31 | ext_modules=cythonize(ext_modules), 32 | ) 33 | --------------------------------------------------------------------------------
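setup.py compiles the two Cython extensions; the pure-Python ranking metrics are used as a fallback if they are not built. For reference, a condensed driver covering the same steps as lrec.ipynb might look as follows. It is a sketch, not part of the repository: it assumes it is run from the repository root so the relative paths to the bundled lastFM split resolve, and it copies the hyperparameter values from the notebook. Note that fit_parallel and the evaluator fan work out over multiprocessing.

    from lrec.utils.data_utils.data import Data, loadDataset
    from lrec.utils.data_utils.lineParser import UserItemRatingParser
    from lrec.recommender.LRec.LRec import LRec
    from lrec.recommender.modelArgs import LRecArgs
    from lrec.evaluate.eval_ranking_metric import evalMetricsParallelMiniBatch

    # Each input line is "<user>\t<item>\t<rating>"; the rating is 1 for one-class data.
    parser = UserItemRatingParser(delim="\t")

    d = Data()
    d.import_data("data/lastfm/train.1", parser)      # builds the user/item id maps and the CSR matrix d.R
    train = d.R
    test, cold = loadDataset("data/lastfm/test.1", d.users, d.items, parser)

    model = LRec(LRecArgs(0.0001, "logistic"))        # regularization constant and loss, as in the notebook
    indices, sim = model.fit_parallel(train, num_procs=4)

    # MAP@100 plus precision@k and recall@k over the test users (and the number of users evaluated)
    print(evalMetricsParallelMiniBatch(train, train, test, model,
                                       mapk=100, ks=[3, 5, 10, 20]))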