├── FMonSpark_demo_a9a.ipynb ├── LICENSE ├── README.md ├── fm ├── fm_parallel_sgd.py └── fm_parallel_sgd.scala └── img └── parallel_sgd.PNG /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FM on Spark with parallel SGD 2 | 3 | Implementation of Factorization Machines on Spark using parallel stochastic gradient descent (Python and Scala) 4 | 5 | Factorization Machines (FMs) are a general-purpose predictor introduced by Rendle in [2010](http://www.ismll.uni-hildesheim.de/pub/pdfs/Rendle2010FM.pdf) that can capture all single and pairwise interactions in a dataset. They can be applied to any real-valued feature vector and also perform well on highly sparse data. An extension of FMs, Field-aware Factorization Machines, proved to be a successful method for predicting advertisement clicks in the [Display Advertising Challenge on Kaggle](https://www.kaggle.com/c/criteo-display-ad-challenge/forums/t/10555/3-idiots-solution-libffm). 6 | 7 | I built a custom Spark implementation to use in Python and Scala. 8 | To make optimal use of parallel computing in Spark, I implemented Parallel Stochastic Gradient Descent to train the FMs. This is an alternative to Mini-batch SGD, which is currently available in MLlib to train Logistic Regression models. 9 | 10 | ![parallel-sgd](img/parallel_sgd.PNG) 11 | 12 | 13 | 14 | This implementation shows impressive results in terms of speed and effectiveness. 15 | 16 | 17 | 18 | I worked on this project during my summer internship at ING Netherlands in 2015. ING has strong teams of data scientists, and I thank them for their help during this project. I was also able to use a powerful cluster to test my code and train my models. 19 | 20 | 21 | ## Tutorial 22 | Here's a short tutorial on how to use the implementation in pyspark. (Note: the procedure is much the same in Scala; see below.) 23 | You may prefer to try it directly using the IPython notebook tutorial *FMonSpark_demo_a9a.ipynb*. You will need to download the [a9a dataset](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#a9a) first; a minimal loading sketch is shown below. 24 | 25 | 26 |
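The following snippet is only a sketch of that loading step, not part of the repository: the local path `data/a9a`, the split ratio and the seed are illustrative. a9a is distributed in LIBSVM format and, to my knowledge, already uses -1/+1 labels, so `fm.transform_data` (described in step 2 below) should only be needed for 0/1-labelled data.

```python
from pyspark.mllib.util import MLUtils

# assumes a running SparkContext `sc` (e.g. the pyspark shell) and a local copy of a9a
raw = MLUtils.loadLibSVMFile(sc, "data/a9a")   # LIBSVM format -> LabeledPoints with SparseVector features
# a9a labels are already -1/+1; for 0/1-labelled data convert with fm.transform_data(raw)
train, test = raw.randomSplit([0.8, 0.2], seed=42)
```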
27 | ### Pyspark 28 | 29 | 1. Import the script **fm_parallel_sgd.py**. You can do this by adding the following lines to your code: 30 | 31 | **sc.addPyFile("spark-FM-parallelSGD/fm/fm_parallel_sgd.py")** 32 | 33 | **import fm_parallel_sgd as fm** 34 | 35 | or by shipping the file directly when starting pyspark: 36 | 37 | **pyspark --py-files spark-FM-parallelSGD/fm/fm_parallel_sgd.py** 38 | 39 | 40 | 2. Preprocess your data so that: 41 | 42 | a) It is divided into test and train sets 43 | 44 | b) It is an RDD of labeled points 45 | - Labels should be -1 or 1. If your data has 0/1 labels, transform them with the function *fm.transform_data(data)* 46 | - Features should be either *SparseVector* or *DenseVector* from the *mllib.linalg* library. 47 | 48 | 3. If you think it makes sense, take a (stratified) sample of your data using *RDD.sample()*. This is not done as part of the FM procedure. 49 | 4. Check how many partitions your data has. The performance of the parallel SGD is best with as few partitions as possible. Coalesce your data into 1 or 2 partitions per executor by using *coalesce(nrPartitions)* or *repartition(nrPartitions)* on your RDD. 50 | 5. Call the function **fm.trainFM_parallel_sgd(sc, train, params...)**. You can specify the following parameters: 51 | - *iterations* : Nr of iterations of parallel SGD. default=50 52 | - *iter_sgd* : Nr of iterations of SGD in each partition. default=5 (between 1 and 10 works best) 53 | - *alpha* : Learning rate of SGD. default=0.01 54 | - *regParam* : Regularization parameter. default=0.01 55 | - *factorLength* : Length of the weight vectors of the FMs. default=4 56 | - *verbose* : Whether to output evaluations on the train and validation sets after each iteration. (The code splits your dataset into train (80%) and validation (20%) sets.) 57 | - *savingFilename* : If provided, the model is saved after each iteration as a pickle file in your current folder. 58 | - *evalTraining* : An instance of the *evaluation* class, useful to plot the evolution of the evaluation during training. 59 | - Create the instance before calling fm.trainFM_parallel_sgd! 60 | - You can set a modulo to evaluate the model only every *modulo* iterations with *'instance'.modulo* 61 | 62 | 6. This returns a weight matrix **w**. If you want to store it for future use, you can use the function *fm.saveModel(w, "path/to/store/model")* 63 | 7. To evaluate the performance of the model on the test set, call **fm.evaluate(test, w)**. This returns the area under the precision/recall curve, the area under the ROC curve (AUC), the average logloss, the MSE and the accuracy. 64 | 8. To calculate the probabilities according to the model for a test set, call *fm.predictFM(data, w)*. This returns an RDD with probability scores. 65 | 9. To load a model that you saved, you can use the function **fm.loadModel("path/to/store/model")** 66 | 67 | ##### Plot 68 | 1. You can plot the error (rtv_pr_auc, rtv_auc, logl, MSE) as a function of different learning rates by using **fm.plotAlpha(sc, data, alpha_list, params…)**. *alpha_list* is a list of the learning rates you want to test. The training is on 80% of the data, the evaluation is on the remaining 20%. 69 | 2. You can do the same for the regularization parameter and the factor length with **fm.plotRegParam(sc, data, regParam_list, params…)** and **fm.plotFactorLength(sc, data, factorLength_list, params…)** 70 | 3. You can plot a color map of the logloss for learning rate/regParam combinations using **fm.plotAlpha_RegParam(sc, data, alpha_list, regParam_list, params…)**. The brighter the square, the lower the logloss.
The training is on 80% of the data, the evaluation is on the remaining 20%. 71 | 72 | 73 | ### Scala 74 | 75 | 1. Load the file **fm_parallel_sgd.scala**. You can do this by running the following command in the spark-shell: 76 | 77 | **:load spark-FM-parallelSGD/fm/fm_parallel_sgd.scala** 78 | 79 | or by loading the file directly when starting the shell: 80 | 81 | **spark-shell -i spark-FM-parallelSGD/fm/fm_parallel_sgd.scala** 82 | 83 | 84 | 2. Preprocess your data so that: 85 | 86 | a) It is divided into test and train sets 87 | 88 | b) It is an RDD of labeled points 89 | - Labels should be -1 or 1. 90 | - Features should be a *Vector* from *mllib.linalg*. 91 | 92 | 3. If you think it makes sense, take a (stratified) sample of your data using *RDD.sample()*. This is not done as part of the FM procedure. 93 | 4. Check how many partitions your data has. The performance of the parallel SGD is best with as few partitions as possible. Coalesce your data into 1 or 2 partitions per executor by using *coalesce(nrPartitions)* or *repartition(nrPartitions)* on your RDD. 94 | 5. Call the function **fm.trainFM_parallel_sgd(train, params...)**. You can specify the following parameters: 95 | - *iterations* : Nr of iterations of parallel SGD. default=50 96 | - *iter_sgd* : Nr of iterations of SGD in each partition. default=5 (between 1 and 10 works best) 97 | - *alpha* : Learning rate of SGD. default=0.01 98 | - *regParam* : Regularization parameter. default=0.01 99 | - *factorLength* : Length of the weight vectors of the FMs. default=4 100 | - *verbose* : Whether to output evaluations on the train and validation sets after each iteration. (The code splits your dataset into train (80%) and validation (20%) sets.) 101 | 102 | 6. This returns a weight matrix **w**. 103 | 7. To evaluate the performance of the model on the test set, call **fm.evaluate(test, w)**. This returns the average logloss. 104 | 8. To calculate the probabilities according to the model for a test set, call **fm.predictFM(data, w)**. This returns an RDD with probability scores. A minimal end-to-end pyspark example is sketched below.
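For reference, here is a minimal end-to-end sketch of the pyspark workflow described above. It is only a sketch: the RDD names `train`/`test`, the number of partitions and the model file name are illustrative, and it assumes the script has already been imported as `fm` (step 1) and that the RDDs contain LabeledPoints with -1/1 labels (step 2).

```python
# keep few partitions: parallel SGD runs one SGD pass per partition and averages the results
train = train.coalesce(2)

w = fm.trainFM_parallel_sgd(sc, train,
                            iterations=50, iter_sgd=5,
                            alpha=0.01, regParam=0.01,
                            factorLength=4, verbose=True)

pr_auc, roc_auc, logl, mse, acc = fm.evaluate(test, w)   # metrics on the held-out test set
probs = fm.predictFM(test, w)                            # RDD of probability scores
fm.saveModel(w, "fm_model.pkl")                          # pickle file in the current folder
w_loaded = fm.loadModel("fm_model.pkl")
```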
105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /fm/fm_parallel_sgd.py: -------------------------------------------------------------------------------- 1 | from pyspark.mllib.regression import LabeledPoint 2 | from pyspark import SparkContext, SparkConf 3 | from pyspark.mllib.util import MLUtils 4 | from pyspark.storagelevel import * 5 | import pyspark.mllib.linalg 6 | import numpy as np 7 | from sklearn.metrics import auc, roc_curve, average_precision_score, log_loss, mean_squared_error 8 | import time 9 | import pickle 10 | import matplotlib.pyplot as plt 11 | from matplotlib.colors import LinearSegmentedColormap 12 | 13 | #------------------------------------------------------------------------------- 14 | # Factorization machines 15 | 16 | 17 | def fm_get_p(x, w): 18 | """ 19 | Computes the probability of an instance given a model 20 | """ 21 | # use the compress trick if x is a sparse vector 22 | # The compress trick allows to upload the weight matrix for the rows corresponding to the indices of the non-zeros X values 23 | if type(x) == pyspark.mllib.linalg.SparseVector : 24 | W = w[x.indices] 25 | X = x.values 26 | elif type(x) == pyspark.mllib.linalg.DenseVector : 27 | W=w 28 | X=x 29 | else : 30 | return 'data type error' 31 | 32 | xa = np.array([X]) 33 | VX = xa.dot(W) 34 | VX_square = (xa*xa).dot(W*W) 35 | phi = 0.5*(VX*VX - VX_square).sum() 36 | 37 | return 1.0/(1.0 + np.exp(-phi)) 38 | 39 | 40 | 41 | def fm_get_p_old(X, W): 42 | """ 43 | Computes the probability of an instance given a model 44 | """ 45 | w_triu = np.triu(np.dot(W, W.T), 1) 46 | xa = np.array([X]) 47 | x_triu = np.triu(np.dot(xa.T, xa), 1) 48 | phi = np.sum(x_triu* w_triu) 49 | return 1.0/(1.0 + np.exp(-phi)) 50 | 51 | 52 | def fm_gradient_sgd_trick(X, y, W, regParam): 53 | """ 54 | Computes the gradient for one instance using Rendle FM paper (2010) trick (linear time computation) 55 | """ 56 | xa = np.array([X]) 57 | x_matrix = xa.T.dot(xa) 58 | 59 | VX = xa.dot(W) 60 | VX_square = (xa*xa).dot(W*W) 61 | phi = 0.5*(VX*VX - VX_square).sum() 62 | 63 | expnyt = np.exp(-y*phi) 64 | np.fill_diagonal(x_matrix,0) 65 | result = (-y*expnyt)/(1+expnyt)* (np.dot(x_matrix, W)) 66 | 67 | return regParam*W + result 68 | 69 | 70 | def fm_gradient_sgd(X, y, dim, W, regParam): 71 | """ 72 | Computes the gradient for one instance 73 | """ 74 | w_matrix = np.dot(W, W.T) 75 | w_triu = np.triu(w_matrix, 1) 76 | xa = np.array([X]) 77 | x_matrix = np.dot(xa.T, xa) 78 | x_triu = np.triu(x_matrix, 1) 79 | phi = np.sum(x_triu*w_triu) 80 | expnyt = np.exp(-y*phi) 81 | x_matrix_negeye = (1-np.eye(dim))*x_matrix 82 | return regParam*W + (-y*expnyt)/(1+expnyt)* (np.dot(x_matrix_negeye, W)) 83 | 84 | 85 | def predictFM(data, w) : 86 | """ 87 | Computes the probabilities given a model for the complete data set 88 | """ 89 | return data.map(lambda row: fm_get_p(row.features, w)) 90 | 91 | def logloss(X, w, y): 92 | """ 93 | Computes the logloss of the model for one instance 94 | """ 95 | #p = max(min(phi, 1.0 - 10e-12), 10e-12) 96 | phi = get_phi(X, w) 97 | #y01 = 1 if y==1 else 0 98 | 99 | return np.log(1+np.exp(-y*phi)) 100 | 101 | def logloss2(y_pred, y_true) : 102 | """ 103 | Computes the logloss given the true label and the predictions 104 | """ 105 | #avoid NaN value 106 | y_pred[y_pred == 0] = 1e-12 107 | y_pred[y_pred == 1] = 1 - 1e-12 108 | 109 | losses = -y_true * np.log(y_pred) - (1-y_true)*np.log(1-y_pred) 110 | return np.mean(losses) 111 | 112 | def get_phi(X, W): 113 | """ 
114 | Computes the phi-value for an instance given a model 115 | """ 116 | xa = np.array([X]) 117 | VX = xa.dot(W) 118 | VX_square = (xa*xa).dot(W*W) 119 | phi = 0.5*(VX*VX - VX_square).sum() 120 | return phi 121 | 122 | 123 | 124 | #----------------------------------------------------------------------- 125 | # Train with parallel sgd 126 | 127 | def trainFM_parallel_sgd (sc, data, iterations=50, iter_sgd= 5, alpha=0.01, regParam=0.01, factorLength=4,\ 128 | verbose=False, savingFilename = None, evalTraining=None) : 129 | 130 | """ 131 | Train a Factorization Machine model using parallel stochastic gradient descent. 132 | 133 | Parameters: 134 | data : RDD of LabeledPoints 135 | Training data. Labels should be -1 and 1 136 | Features should be either SparseVector or DenseVector from mllib.linalg library 137 | iterations : numeric 138 | Nr of iterations of parallel SGD. default=50 139 | iter_sgd : numeric 140 | Nr of iteration of sgd in each partition. default = 5 141 | alpha : numeric 142 | Learning rate of SGD. default=0.01 143 | regParam : numeric 144 | Regularization parameter. default=0.01 145 | factorLength : numeric 146 | Length of the weight vectors of the FMs. default=4 147 | verbose: boolean 148 | Whether to ouptut iteration numbers, time, logloss for train and validation sets 149 | savingFilename: String 150 | Whether to save the model after each iteration 151 | evalTraining : instance of the class evaluation 152 | Plot the evaluation during the training (on a train and a validation set) 153 | The instance should be created before using trainFM_parallel_sgd 154 | 155 | returns: w 156 | numpy matrix holding the model weights 157 | """ 158 | 159 | # split the data in train and validation sets if evalTraining or verbose 160 | if evalTraining: 161 | verbose2 = True 162 | else: 163 | verbose2 = False 164 | if verbose or verbose2: 165 | train, val = data.randomSplit([0.8,0.2]) 166 | train.persist(StorageLevel.MEMORY_ONLY_SER) 167 | val.persist(StorageLevel.MEMORY_ONLY_SER) 168 | #train.cache() 169 | #val.cache() 170 | else: 171 | train = data.persist(StorageLevel.MEMORY_ONLY_SER) 172 | #train= data.cahe() 173 | 174 | # glom() allows to treat a partition as an array rather as a single row at time 175 | train_Y = train.map(lambda row: row.label).glom() 176 | train_X = train.map(lambda row: row.features).glom() 177 | train_XY = train_X.zip(train_Y).persist(StorageLevel.MEMORY_ONLY_SER) 178 | #train_XY = train_X.zip(train_Y).cache() 179 | 180 | #Initialize weight vectors 181 | nrFeat = len(train_XY.first()[0][0]) 182 | np.random.seed(int(time.time())) 183 | w = np.random.ranf((nrFeat, factorLength)) 184 | w = w / np.sqrt((w*w).sum()) 185 | 186 | 187 | if evalTraining: 188 | evalValidation = evaluation(val) 189 | evalValidation.modulo = evalTraining.modulo 190 | evalValidation.evaluate(w) 191 | evalTraining = evaluation(train) 192 | evalTraining.evaluate(w) 193 | evalTraining.modulo = evalValidation.modulo 194 | if verbose: 195 | print 'iter \ttime \ttrain_logl \tval_logl' 196 | #compute original logloss (0 iteration) 197 | if evalTraining: 198 | print '%d \t%d \t%5f \t%5f' %(0, 0, evalTraining.logl[-1], evalValidation.logl[-1]) 199 | else : 200 | print '%d \t%d \t%5f \t%5f' %(0, 0, evaluate(train, w)[2], evaluate(val, w)[2]) 201 | start = time.time() 202 | 203 | 204 | for i in xrange(iterations): 205 | wb = sc.broadcast(w) 206 | wsub = train_XY.map(lambda (X, y): sgd_subset(X, y, wb.value, iter_sgd, alpha, regParam)) 207 | w = wsub.mean() 208 | 209 | # evaluate and store the evaluation 
figures each 'evalTraining.modulo' iteration 210 | if evalTraining and i%evalTraining.modulo ==0: 211 | evalTraining.evaluate(w) 212 | evalValidation.evaluate(w) 213 | if verbose: 214 | if evalTraining : 215 | if i%evalTraining.modulo ==0: 216 | print '%d \t%d \t%5f \t%5f' %(i+1, time.time() - start, evalTraining.logl[-1], evalValidation.logl[-1]) 217 | else: 218 | print '%d \t%d \t%5f \t%5f' %(i+1, time.time() - start, evaluate(train, w)[2], evaluate(val, w)[2]) 219 | if savingFilename: 220 | saveModel(w, savingFilename+'_iteration_'+str(i+1)) 221 | 222 | if evalTraining: 223 | p = plt.figure() 224 | evalTraining.plotTraining(p) 225 | evalValidation.plotTraining(p), plt.legend(["train","validation"]) 226 | print 'Train set: '; evalTraining.display() 227 | print 'Validation set: '; evalValidation.display() 228 | elif verbose: 229 | print 'Train set: '; print '(rtv_pr_auc, rtv_auc, logl, mse, accuracy)'; print evaluate(train, w) 230 | print 'Validation set:'; print evaluate(val, w) 231 | 232 | train_XY.unpersist() 233 | 234 | return w 235 | 236 | 237 | def sgd_subset(train_X, train_Y, w, iter_sgd, alpha, regParam): 238 | """ 239 | Computes stochastic gradient descent for a partition (in memory) 240 | Automatically detects which vector representation is used (dense or sparse) 241 | Parameter: 242 | train_X : list of pyspark.mllib.linalg dense or sparse vectors 243 | train_Y : list of labels 244 | w : numpy matrix holding the model weights 245 | iter_sgd : numeric 246 | Nr of iteration of sgd in each partition. 247 | alpha : numeric 248 | Learning rate of SGD. 249 | regParam : numeric 250 | Regularization parameter. 251 | 252 | return: 253 | numpy matrix holding the model weights for this partition 254 | """ 255 | if type(train_X[0])==pyspark.mllib.linalg.DenseVector: 256 | return sgd_subset_dense(train_X, train_Y, w, iter_sgd, alpha, regParam) 257 | elif type(train_X[0]) == pyspark.mllib.linalg.SparseVector : 258 | return sgd_subset_sparse(train_X, train_Y, w, iter_sgd, alpha, regParam) 259 | else : 260 | return 'data type error' 261 | 262 | 263 | 264 | def sgd_subset_dense(train_X, train_Y, w, iter_sgd, alpha, regParam) : 265 | """ 266 | Computes stochastic gradient descent for a partition (in memory) 267 | Parameter: 268 | train_X : list of pyspark.mllib.linalg dense or sparse vectors 269 | train_Y : list of labels 270 | w : numpy matrix holding the model weights 271 | iter_sgd : numeric 272 | Nr of iteration of sgd in each partition. 273 | alpha : numeric 274 | Learning rate of SGD. 275 | regParam : numeric 276 | Regularization parameter. 
277 | 278 | return: 279 | wsub: numpy matrix holding the model weights for this partition 280 | """ 281 | N = len(train_X) 282 | wsub = w.copy() # work on a copy so the shared broadcast weights are not mutated in place 283 | G=np.ones(w.shape) 284 | for i in xrange(iter_sgd): 285 | np.random.seed(int(time.time())) 286 | random_idx_list = np.random.permutation(N) 287 | for j in xrange(N): 288 | idx = random_idx_list[j] 289 | X = train_X[idx] 290 | y = train_Y[idx] 291 | grads = fm_gradient_sgd_trick(X, y, wsub, regParam) 292 | G += grads * grads 293 | wsub -= alpha * grads / np.sqrt(G) 294 | 295 | return wsub 296 | 297 | 298 | def sgd_subset_sparse(train_X, train_Y, w, iter_sgd, alpha, regParam) : 299 | """ 300 | Computes stochastic gradient descent for a partition (in memory) 301 | The compress trick loads only the rows of the weight matrix corresponding to the indices of the non-zero X values 302 | Parameter: 303 | train_X : list of pyspark.mllib.linalg dense or sparse vectors 304 | train_Y : list of labels 305 | w : numpy matrix holding the model weights 306 | iter_sgd : numeric 307 | Nr of iterations of SGD in each partition. 308 | alpha : numeric 309 | Learning rate of SGD. 310 | regParam : numeric 311 | Regularization parameter. 312 | 313 | return: 314 | wsub: numpy matrix holding the model weights for this partition 315 | """ 316 | N = len(train_X) 317 | wsub = w.copy() # work on a copy so the shared broadcast weights are not mutated in place 318 | G=np.ones(w.shape) 319 | for i in xrange(iter_sgd): 320 | np.random.seed(int(time.time())) 321 | random_idx_list = np.random.permutation(N) 322 | for j in xrange(N): 323 | 324 | idx = random_idx_list[j] 325 | X = train_X[idx] 326 | y = train_Y[idx] 327 | grads_compress = fm_gradient_sgd_trick(X.values, y, wsub[X.indices], regParam) 328 | G[X.indices] += grads_compress * grads_compress 329 | wsub[X.indices] -= alpha * grads_compress / np.sqrt(G[X.indices]) 330 | 331 | return wsub 332 | 333 | 334 | 335 | 336 | #----------------------------------------------------------------------- 337 | # Train with non-parallel sgd 338 | def trainFM_sgd (data, iterations=300, alpha=0.01, regParam=0.01, factorLength=4) : 339 | """ 340 | Train a Factorization Machine model using stochastic gradient descent, non-parallel. 341 | 342 | Parameters: 343 | data : RDD of LabeledPoints 344 | Training data. Labels should be -1 and 1 345 | iterations : numeric 346 | Nr of iterations of SGD. default=300 347 | alpha : numeric 348 | Learning rate of SGD. default=0.01 349 | regParam : numeric 350 | Regularization parameter. default=0.01 351 | factorLength : numeric 352 | Length of the weight vectors of the FMs. default=4 353 | 354 | returns: w 355 | numpy matrix holding the model weights 356 | """ 357 | # data is labeledPoint RDD 358 | train_Y = np.array(data.map(lambda row: row.label).collect()) 359 | train_X = np.array(data.map(lambda row: row.features).collect()) 360 | (N, dim) = train_X.shape 361 | w = np.random.ranf((dim, factorLength)) 362 | w = w / np.sqrt((w*w).sum()) 363 | G=np.ones(w.shape) 364 | for i in xrange(iterations): 365 | np.random.seed(int(time.time())) 366 | random_idx_list = np.random.permutation(N) 367 | for j in xrange(N): 368 | idx = random_idx_list[j] 369 | X = train_X[idx] 370 | y = train_Y[idx] 371 | grads = fm_gradient_sgd_trick(X, y, w, regParam) 372 | G += grads * grads 373 | w -= alpha * grads / np.sqrt(G) 374 | 375 | return w 376 | 377 | #----------------------------------------------------------------------- 378 | def evaluate(data, w) : 379 | """ 380 | Evaluate a Factorization Machine model on a data set. 381 | 382 | Parameters: 383 | data : RDD of LabeledPoints 384 | Evaluation data.
Labels should be -1 and 1 385 | w : numpy matrix 386 | FM model, result from trainFM_sgd or trainFM_parallel_sgd 387 | 388 | returns : (rtv_pr_auc, rtv_auc, logl, mse, accuracy) 389 | rtv_pr_auc : Area under the curve of the Recall/Precision graph (average precision score) 390 | rtv_auc : Area under the curve of the ROC-curve 391 | logl : average logloss 392 | MSE : mean square error 393 | accuracy 394 | """ 395 | #data.cache() 396 | data.persist(StorageLevel.MEMORY_ONLY_SER) 397 | y_true_rdd = data.map(lambda lp: 1 if lp.label == 1 else 0) 398 | y_true = y_true_rdd.collect() 399 | y_pred_rdd = predictFM(data, w) 400 | y_pred = y_pred_rdd.collect() 401 | 402 | logl = logloss2(np.array(y_pred), np.array(y_true)) 403 | 404 | #rtv_pr_auc and rtv_auc 405 | y_pair = np.column_stack((y_pred,y_true)) 406 | sort_y_pair = y_pair[y_pair[:,0].argsort()[::-1]] 407 | 408 | fpr, tpr, _ = roc_curve(sort_y_pair[:,1], sort_y_pair[:,0]) 409 | 410 | if np.isnan(tpr[0]) : 411 | rtv_pr_auc = 0 412 | rtv_auc = 0 413 | print 'cannot compute AUC' 414 | else: 415 | rtv_auc = auc(fpr, tpr) 416 | rtv_pr_auc = average_precision_score(sort_y_pair[:,1], sort_y_pair[:,0]) 417 | 418 | #mse 419 | mse = mean_squared_error(sort_y_pair[:,1], sort_y_pair[:,0]) 420 | 421 | # accuracy 422 | y_pred_label = np.zeros(len(y_pred)) 423 | y_pred_label[np.array(y_pred) > 0.5] = 1 424 | truePred = ((y_pred_label - y_true) == 0).sum() 425 | accuracy = float(truePred) / len(y_true) 426 | 427 | return rtv_pr_auc, rtv_auc, logl, mse, accuracy 428 | 429 | 430 | def saveModel(w, fileName): 431 | """ 432 | Saves the model in a pickle file 433 | """ 434 | #with open('model/'+fileName, 'wb') as handle : 435 | with open(fileName, 'wb') as handle : 436 | pickle.dump(w, handle) 437 | 438 | def loadModel(fileName): 439 | """ 440 | Loads the model from a pickle file 441 | """ 442 | #with open('model/'+fileName, 'rb') as handle : 443 | with open(fileName, 'rb') as handle : 444 | return pickle.load(handle) 445 | 446 | 447 | def transform_data(data_01_label) : 448 | """ 449 | Transforms LabeledPoint RDDs that have 0/1 labels to -1/1 labels (as is needed for the FM models) 450 | """ 451 | return data_01_label.map(lambda row: LabeledPoint(-1 if row.label == 0 else 1, row.features)) 452 | 453 | 454 | #----------------------------------------------------------------------- 455 | # Plot the error 456 | 457 | class evaluation (object): 458 | """ Store the evaluation figures (rtv_pr_auc, rtv_auc, logl, mse, accuracy) in lists 459 | Print the final error 460 | Plot the evolution of the error as a function of the number of iterations 461 | """ 462 | 463 | def __init__(self, data): 464 | self.data = data 465 | self.rtv_pr_auc = [] 466 | self.rtv_auc = [] 467 | self.logl = [] 468 | self.mse = [] 469 | self.accuracy = [] 470 | #choose the modulo of the iterations to compute the evaluation 471 | self.modulo = 1 472 | 473 | def evaluate(self, w): 474 | eval = evaluate(self.data, w) 475 | self.rtv_pr_auc.append(eval[0]) 476 | self.rtv_auc.append(eval[1]) 477 | self.logl.append(eval[2]) 478 | self.mse.append(eval[3]) 479 | self.accuracy.append(eval[4]) 480 | 481 | def display(self): 482 | """ print the evaluation figures (mse, logl, rtv_pr_auc, rtv_auc, accuracy) (last element of the corresponding evaluation list) 483 | """ 484 | print 'MSE: {0:3f} \nlogl: {1:3f} \nrtv_pr_auc: {2:3f} \nrtv_auc: {3:3f} \nAccuracy: {4:3f}\n'\ 485 | .format(self.mse[-1],self.logl[-1],self.rtv_pr_auc[-1],self.rtv_auc[-1],self.accuracy[-1]) 486 | 487 | def plotTraining(self, p): 488 | """ Plot the
error (rtv_pr_auc, rtv_auc, logl, mse) function of the number of iterations 489 | The error lists need to be not empty. 490 | 491 | Parameter: 492 | p: matplotlib.figure 493 | Contains all the plot elements. 494 | """ 495 | #check if at least the logl list is not empty 496 | if self.logl: 497 | # create a list to set xlabel=numIter 498 | x= [i*self.modulo+1 for i in range(len(self.mse)-1)] 499 | x.insert(0,0) 500 | 501 | p.add_subplot(221), plt.plot(x, self.rtv_pr_auc, marker='o'), plt.title("rtv_pr_auc"),plt.xlabel("iterations") 502 | p.add_subplot(222), plt.plot(x, self.rtv_auc, marker='o'), plt.title("rtv_auc"),plt.xlabel("iterations") 503 | p.add_subplot(223), plt.plot(x, self.logl, marker='o'), plt.title("logl"),plt.xlabel("iterations") 504 | p.add_subplot(224), plt.plot(x, self.mse, marker='o'), plt.title("mse"), plt.xlabel("iterations") 505 | #show = plt.show() 506 | 507 | 508 | 509 | def plotAlpha(sc, data, alpha_list = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03],\ 510 | iterations=50, iter_sgd=10, regParam=0., factorLength=4): 511 | 512 | """ Plot the error (rtv_pr_auc, rtv_auc, logl, mse) function of the learning rates in the alpha_list 513 | The training is on 80% of the data. 514 | The evaluation is on the remaining 20%. 515 | 516 | Parameters: 517 | sc: SparkContext 518 | data : RDD of LabeledPoints 519 | Training data. Labels should be -1 and 1 520 | Features should be either SparseVector or DenseVector from mllib/linalg library 521 | alpha_list: List 522 | learning rates we want to test. 523 | iterations : numeric 524 | Nr of iterations of parallel SGD. default=50 525 | iter_sgd : numeric 526 | Nr of iteration of sgd in each partition. default=5 527 | regParam : numeric 528 | Regularization parameter. default=0.01 529 | factorLength : numeric 530 | Length of the weight vectors of the FMs. default=4 531 | 532 | returns: model 533 | numpy matrix holding the model weights with the best (lower) logloss regarding the different learning rates. 
534 | 535 | """ 536 | 537 | bestLogl = 1e10 538 | train, val = data.randomSplit([0.8,0.2]) 539 | errorTrain = evaluation(train) 540 | errorVal = evaluation(val) 541 | 542 | for alpha in alpha_list : 543 | w = trainFM_parallel_sgd(sc,train, iterations, iter_sgd, alpha, regParam, factorLength) 544 | errorTrain.evaluate(w) 545 | errorVal.evaluate(w) 546 | 547 | if(errorVal.logl[-1] < bestLogl): 548 | bestModel = w 549 | bestLogl = errorVal.logl[-1] 550 | bestAlpha = alpha 551 | 552 | p = plt.figure() 553 | p.add_subplot(221), plt.plot(alpha_list, errorTrain.rtv_pr_auc, label='Train', marker='o'),plt.plot(alpha_list, errorVal.rtv_pr_auc, label='Validation set', marker='o'), plt.ylabel("rtv_pr_auc"),plt.xlabel("alpha") , plt.xscale('log'), plt.legend() 554 | p.add_subplot(222), plt.plot(alpha_list, errorTrain.rtv_auc, label='Train', marker='o'),plt.plot(alpha_list, errorVal.rtv_auc, label='Validation set', marker='o'), plt.ylabel("rtv_auc"),plt.xlabel("alpha") , plt.xscale('log'), plt.legend() 555 | p.add_subplot(223), plt.plot(alpha_list, errorTrain.logl, label='Train', marker='o'),plt.plot(alpha_list, errorVal.logl, label='Validation', marker='o'), plt.ylabel("logl"),plt.xlabel("alpha") , plt.xscale('log'), plt.legend() 556 | p.add_subplot(224), plt.plot(alpha_list, errorTrain.mse, label='Train', marker='o'), plt.plot(alpha_list, errorVal.mse, label='Validation', marker='o'), plt.ylabel("mse"), plt.xlabel("alpha") , plt.xscale('log'), plt.legend() 557 | plt.show() 558 | 559 | print 'best alpha : {0: 3f}'.format(bestAlpha) 560 | print 'best logloss : {0: 3f}'.format(bestLogl) 561 | 562 | return bestModel 563 | 564 | 565 | def plotRegParam(sc, data, regParam_list = [0,0.001, 0.003, 0.006, 0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28, 2.56, 5.12, 10.24],\ 566 | iterations=50, iter_sgd=10, alpha=0.01, factorLength=4): 567 | 568 | """ Plot the error (rtv_pr_auc, rtv_auc, logl, mse) function of the regularization parameters in the regParam_list 569 | The training is on 80% of the data. 570 | The evaluation is on the remaining 20%. 571 | 572 | Parameters: 573 | sc: SparkContext 574 | data : RDD of LabeledPoints 575 | Training data. Labels should be -1 and 1 576 | Features should be either SparseVector or DenseVector from mllib/linalg library 577 | regParam_list: List 578 | regularization parameters we want to test. 579 | iterations : numeric 580 | Nr of iterations of parallel SGD. default=50 581 | iter_sgd : numeric 582 | Nr of iteration of sgd in each partition. default=5 583 | alpha : numeric 584 | Learning rate of SGD. default=0.01 585 | factorLength : numeric 586 | Length of the weight vectors of the FMs. default=4 587 | 588 | returns: model 589 | numpy matrix holding the model weights with the best (lower) logloss regarding the different regularization parameters. 
590 | 591 | """ 592 | 593 | bestLogl = 1e10 594 | train, val = data.randomSplit([0.8,0.2]) 595 | errorTrain = evaluation(train) 596 | errorVal = evaluation(val) 597 | 598 | for regParam in regParam_list : 599 | w = trainFM_parallel_sgd(sc,train, iterations, iter_sgd, alpha, regParam, factorLength) 600 | errorTrain.evaluate(w) 601 | errorVal.evaluate(w) 602 | 603 | if(errorVal.logl[-1] < bestLogl): 604 | bestModel = w 605 | bestLogl = errorVal.logl[-1] 606 | bestRegParam = regParam 607 | 608 | p = plt.figure() 609 | p.add_subplot(221), plt.plot(regParam_list, errorTrain.rtv_pr_auc, label='Train', marker='o'),plt.plot(regParam_list, errorVal.rtv_pr_auc, label='Validation', marker='o'), plt.ylabel("rtv_pr_auc"),plt.xlabel("regParam") , plt.xscale('log'), plt.legend() 610 | p.add_subplot(222), plt.plot(regParam_list, errorTrain.rtv_auc, label='Train', marker='o'),plt.plot(regParam_list, errorVal.rtv_auc, label='Validation', marker='o'), plt.ylabel("rtv_auc"),plt.xlabel("regParam") , plt.xscale('log'), plt.legend() 611 | p.add_subplot(223), plt.plot(regParam_list, errorTrain.logl, label='Train', marker='o'),plt.plot(regParam_list, errorVal.logl, label='Validation', marker='o'), plt.ylabel("logl"),plt.xlabel("regParam") , plt.xscale('log'), plt.legend() 612 | p.add_subplot(224), plt.plot(regParam_list, errorTrain.mse, label='Train', marker='o'), plt.plot(regParam_list, errorVal.mse, label='Validation', marker='o'), plt.ylabel("mse"), plt.xlabel("regParam") , plt.xscale('log'), plt.legend() 613 | plt.show() 614 | 615 | print 'best Regularization Parameter : {0: 3f}'.format(bestRegParam) 616 | print 'best logloss : {0: 3f}'.format(bestLogl) 617 | 618 | return bestModel 619 | 620 | 621 | def plotFactorLength(sc, data, factorLength_list = [1,2,3,4,5,6,7,8,9,10],\ 622 | iterations=50, iter_sgd=10, alpha=0.01, regParam=0.): 623 | 624 | """ Plot the error (rtv_pr_auc, rtv_auc, logl, mse) function of the factor length parameters in the factorLength_list 625 | The training is on 80% of the data. 626 | The evaluation is on the remaining 20%. 627 | 628 | Parameters: 629 | sc: SparkContext 630 | data : RDD of LabeledPoints 631 | Training data. Labels should be -1 and 1 632 | Features should be either SparseVector or DenseVector from mllib/linalg library 633 | factorLength_list: List 634 | Factor length we want to test. 635 | iterations : numeric 636 | Nr of iterations of parallel SGD. default=50 637 | iter_sgd : numeric 638 | Nr of iteration of sgd in each partition. default=5 639 | alpha : numeric 640 | Learning rate of SGD. default=0.01 641 | regParam : numeric 642 | Regularization parameter. default=0.01 643 | 644 | returns: model 645 | numpy matrix holding the model weights with the best (lower) logloss regarding the different factor length values. 
646 | 647 | """ 648 | 649 | bestLogl = 1e10 650 | train, val = data.randomSplit([0.8,0.2]) 651 | errorTrain = evaluation(train) 652 | errorVal = evaluation(val) 653 | 654 | for factorLength in factorLength_list : 655 | w = trainFM_parallel_sgd(sc,train, iterations, iter_sgd, alpha, regParam, factorLength) 656 | errorTrain.evaluate(w) 657 | errorVal.evaluate(w) 658 | 659 | if(errorVal.logl[-1] < bestLogl): 660 | bestModel = w 661 | bestLogl = errorVal.logl[-1] 662 | bestFL = factorLength 663 | 664 | 665 | p = plt.figure() 666 | p.add_subplot(221), plt.plot(factorLength_list, errorTrain.rtv_pr_auc, label='Train', marker='o'),plt.plot(factorLength_list, errorVal.rtv_pr_auc, label='Validation', marker='o'), plt.ylabel("rtv_pr_auc"),plt.xlabel("factorLength") , plt.legend() 667 | p.add_subplot(222), plt.plot(factorLength_list, errorTrain.rtv_auc, label='Train', marker='o'),plt.plot(factorLength_list, errorVal.rtv_auc, label='Validation', marker='o'), plt.ylabel("rtv_auc"),plt.xlabel("factorLength") , plt.legend() 668 | p.add_subplot(223), plt.plot(factorLength_list, errorTrain.logl, label='Train', marker='o'),plt.plot(factorLength_list, errorVal.logl, label='Validation', marker='o'), plt.ylabel("logl"),plt.xlabel("factorLength") , plt.legend() 669 | p.add_subplot(224), plt.plot(factorLength_list, errorTrain.mse, label='Train', marker='o'), plt.plot(factorLength_list, errorVal.mse, label='Validation', marker='o'), plt.ylabel("mse"), plt.xlabel("factorLength") , plt.legend() 670 | plt.show() 671 | 672 | print 'best factor length : {0: 3f}'.format(bestFL) 673 | print 'best logloss : {0: 3f}'.format(bestLogl) 674 | 675 | return bestModel 676 | 677 | 678 | 679 | def plotAlpha_RegParam(sc, data, alpha_list = [0.001, 0.003, 0.006, 0.01, 0.03],\ 680 | regParam_list = [0, 0.01, 0.05, 0.1, 0.5, 1],\ 681 | iterations=20, iter_sgd=10, factorLength=4): 682 | """ Color map of the logloss function for each alpha / regParam combination. 683 | The brigther square is the lower logloss. 684 | The training is on 80% of the data. 685 | The evaluation is on the remaining 20%. 686 | 687 | Parameters: 688 | sc: SparkContext 689 | data : RDD of LabeledPoints 690 | Training data. Labels should be -1 and 1 691 | Features should be either SparseVector or DenseVector from mllib/linalg library 692 | alpha_list: List 693 | Learning rates we want to test. 694 | regParam_list: List 695 | regularization parameters we want to test. 696 | iterations : numeric 697 | Nr of iterations of parallel SGD. default=50 698 | iter_sgd : numeric 699 | Nr of iteration of sgd in each partition. default=5 700 | factorLength : numeric 701 | Length of the weight vectors of the FMs. default=4 702 | 703 | returns: model 704 | numpy matrix holding the model weights with the best (lower) logloss regarding each alpha / regParam combination. 
705 | 706 | """ 707 | 708 | bestLogl = 1e10 709 | train, val = data.randomSplit([0.8,0.2]) 710 | errorTrain = evaluation(train) 711 | errorVal = evaluation(val) 712 | 713 | for alpha in alpha_list : 714 | for regParam in regParam_list : 715 | w = trainFM_parallel_sgd(sc,train, iterations, iter_sgd, alpha, regParam, factorLength) 716 | #errorTrain.evaluate(w) 717 | errorVal.evaluate(w) 718 | 719 | if(errorVal.logl[-1] < bestLogl): 720 | bestModel = w 721 | bestLogl = errorVal.logl[-1] 722 | bestAlpha = alpha 723 | bestRegParam = regParam 724 | 725 | logl= np.array(errorVal.logl) 726 | numRows, numCols = len(alpha_list), len(regParam_list) 727 | logl.shape = (numRows,numCols) 728 | 729 | print 'LOGL :'; print logl 730 | print 'best alpha : {0: 3f}'.format(bestAlpha) 731 | print 'best Regularization Parameter : {0: 3f}'.format(bestRegParam) 732 | print 'best logloss : {0: 3f}'.format(bestLogl) 733 | 734 | 735 | fig, ax = plt.subplots() 736 | ax.set_xticklabels([]), ax.set_yticklabels([]) 737 | 738 | colors = LinearSegmentedColormap.from_list('blue', ['#0022ff', '#000055'], gamma=.2) 739 | image = plt.imshow(logl, interpolation='nearest', aspect='auto', cmap=colors) 740 | ax.set_xlabel('Regularization Parameter '+str(regParam_list)), ax.set_ylabel('Step size '+str(alpha_list[::-1])) 741 | 742 | return bestModel 743 | 744 | 745 | -------------------------------------------------------------------------------- /fm/fm_parallel_sgd.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.SparkContext 2 | import org.apache.spark.SparkContext._ 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.mllib.util.MLUtils 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.mllib.linalg.{Vectors, Vector => SparkV, SparseVector => SparkSV, DenseVector => SparkDV, Matrix => SparkM } 8 | import breeze.linalg.{Vector => BV, SparseVector => BSV, DenseVector => BDV, DenseMatrix => BDM, _} 9 | import breeze.numerics._ 10 | import scala.math._ 11 | import util.Random.shuffle 12 | 13 | 14 | implicit def toBreeze(v: SparkV) : BV[Double]= { 15 | /** Convert a spark.mllib.linalg vector into breeze.linalg.vector 16 | * We use Breeze library for any type of matrix operations because mllib local linear algebra package doesn't have any support for it (in Spark 1.4.1) 17 | * mllib toBreeze function already exists but is private to mllib scope 18 | * 19 | * Automatically choose the representation (dense or sparse) which use the less memory to be store 20 | */ 21 | val nnz = v.numNonzeros 22 | if (1.5 * (nnz +1.0) < v.size) { 23 | new BSV(v.toSparse.indices, v.toSparse.values, v.size) 24 | } else { 25 | BV(v.toArray) 26 | } 27 | 28 | } 29 | 30 | def fm_get_p (X: SparkV, W : BDM[Double]) : Double = { 31 | val nnz = X.numNonzeros 32 | var x:BV[Double] = BV(0.) 33 | var w:BDM[Double] = BDM(0.) 
34 | /** Computes the probability of an instance given a model */ 35 | 36 | // convert Spark Vector to Breeze Vector 37 | if (1.5 * (nnz +1.0) < X.size) { 38 | val xsp = X.toSparse 39 | val xind = xsp.indices.toSeq 40 | x = BV(xsp.values) 41 | w = W(xind,::).toDenseMatrix 42 | } else { 43 | x = X:BV[Double] 44 | w = W 45 | } 46 | 47 | val xa = x.toDenseVector.asDenseMatrix 48 | val VX = xa * w 49 | val VX_square = (xa :* xa) * (w :* w) 50 | 51 | val phi = 0.5*(VX:*VX - VX_square).sum 52 | return 1/(1 + exp(-phi)) 53 | 54 | } 55 | 56 | def predictFM (data : RDD[LabeledPoint], W : BDM[Double]) : RDD[Double] = { 57 | /** Computes the probabilities given a model for the complete data set */ 58 | return data.map(row => fm_get_p(row.features, W)) 59 | } 60 | 61 | def logloss(y_pred : Array[Double], y_true : Array[Int]) : Double = { 62 | /* Computes the logloss given the true label and the predictions */ 63 | val losses = BDV(y_true.map(v => v.toDouble) :* (y_pred.map(log))) + BDV(y_true.map(v => (1-v).toDouble) :* (y_pred.map(v => 1-v).map(log))) 64 | return -losses.sum / losses.size 65 | } 66 | 67 | def evaluate (data : RDD[LabeledPoint], w : BDM[Double]) : Double = { 68 | /* Evaluate a Factorization Machine model on a data set. 69 | * 70 | * Parameters: 71 | * data : RDD of LabeledPoints 72 | * Evaluation data. Labels should be -1 and 1 73 | * w : Breeze dense matrix 74 | * FM model, result from trainFM_sgd or trainFM_parallel_sgd 75 | * return: 76 | logl: average logloss 77 | */ 78 | val y_true_rdd = data.map(lp => if(lp.label == 1){1} else {0}) 79 | val y_true = y_true_rdd.collect() 80 | val y_pred_rdd = predictFM(data, w) 81 | val y_pred = y_pred_rdd.collect() 82 | 83 | val logl = logloss(y_pred, y_true) 84 | 85 | return logl 86 | } 87 | 88 | def fm_gradient_sgd_trick (X: BV[Double], y: Double , W: BDM[Double], regParam: Double): BDM[Double] = { 89 | /* Computes the gradient for one instance using Rendle FM paper (2010) trick (linear time computation) */ 90 | val nrFeat = X.size 91 | val xa = X.toDenseVector.asDenseMatrix 92 | val x_matrix = xa.t * xa 93 | val VX = xa * W 94 | val VX_square = (xa :* xa) * (W :* W) 95 | 96 | val phi = 0.5*(VX:*VX - VX_square).sum 97 | val expnyt = exp(-y*phi) 98 | var i = 0 99 | while (i < nrFeat) { 100 | x_matrix.update(i, i, 0.) 
101 | i += 1 102 | } 103 | 104 | var result = x_matrix * W :*(-y*expnyt)/(1+expnyt) 105 | 106 | result+= W:*regParam 107 | 108 | return result 109 | } 110 | 111 | def sgd_subset(train_X : Array[SparkV], train_Y : Array[Double], W : BDM[Double], iter_sgd : Int, alpha : Double, regParam : Double) : BDM[Double] = { 112 | /* Computes stochastic gradient descent for a partition (in memory) */ 113 | 114 | val N = train_X.length 115 | var wsub : BDM[Double] = BDM.zeros(W.rows,W.cols) 116 | wsub += W 117 | var G = BDM.ones[Double](W.rows,W.cols) 118 | 119 | for (i <- 1 to iter_sgd) { 120 | var random_idx_list = shuffle(0 to N-1) 121 | for (j <- 0 to N-1) { 122 | val idx = random_idx_list(j) 123 | val X = train_X(idx) 124 | val y = train_Y(idx) 125 | val nnz = X.numNonzeros 126 | if (1.5 * (nnz +1.0) < X.size) { 127 | val xsp = X.toSparse 128 | val xind = xsp.indices.toSeq 129 | val grads_compress = fm_gradient_sgd_trick(BV(xsp.values), y, wsub(xind,::).toDenseMatrix, regParam) 130 | G(xind,::) := (G(xind,::).toDenseMatrix + (grads_compress :* grads_compress)) 131 | wsub(xind,::) := wsub(xind,::).toDenseMatrix - (alpha :* (grads_compress :/ (G(xind,::).toDenseMatrix.map(sqrt(_))))) 132 | 133 | } else { 134 | val grads = fm_gradient_sgd_trick(X, y, wsub, regParam) 135 | 136 | G += grads :* grads 137 | wsub -= alpha * grads :/ (G.map(sqrt(_))) 138 | 139 | } 140 | } 141 | } 142 | return wsub 143 | } 144 | 145 | def trainFM_parallel_sgd (data : RDD[LabeledPoint], iterations: Int = 50, iter_sgd : Int =5, alpha : Double =0.01, regParam : Double = 0., factorLength : Int = 4, verbose: Boolean =false) : BDM[Double] = { 146 | /* 147 | * Train a Factorization Machine model using parallel stochastic gradient descent. 148 | * 149 | * Parameters: 150 | * data : RDD of LabeledPoints 151 | * Training data. Labels should be -1 and 1 152 | * Features should be Vector from mllib.linalg library 153 | * iterations : Int 154 | * Nr of iterations of parallel SGD. default=50 155 | * iter_sgd : Int 156 | * Nr of iteration of sgd in each partition. default = 5 157 | * alpha : Double 158 | * Learning rate of SGD. default=0.01 159 | * regParam : Double 160 | * Regularization parameter. default=0.01 161 | * factorLength : Int 162 | * Length of the weight vectors of the FMs. 
default=4 163 | * verbose: Boolean 164 | * Whether to output iteration numbers, time, logloss for train and validation sets 165 | * returns: W 166 | * Breeze dense matrix holding the model weights 167 | */ 168 | val Array(train, valid) = if (verbose) data.randomSplit(Array(0.8, 0.2)) else Array(data, data) // hold out 20% as a validation set only when verbose 169 | if (verbose) { 170 | valid.cache() 171 | } 172 | 173 | 174 | train.cache() 175 | 176 | val train_X = train.map(xy => xy.features).glom() 177 | val train_Y = train.map(xy => xy.label).glom() 178 | val train_XY = train_X.zip(train_Y) 179 | train_XY.cache() 180 | 181 | val nrFeat = train_XY.first()._1(0).size 182 | var W = BDM.rand(nrFeat,factorLength) 183 | W :*= 1 / sqrt(sum(W:*W)) 184 | 185 | if (verbose) { 186 | println("iter train_logl valid_logl") 187 | println("%d %.5f %.5f".format(0, evaluate(train, W), evaluate(valid, W))) 188 | 189 | } 190 | val nrPartitions = train_XY.partitions.size // number of partitions, used to average the per-partition weight matrices 191 | for (i <- 1 to iterations) { 192 | val wb = sc.broadcast(W) 193 | val wsub = train_XY.map(xy => sgd_subset(xy._1, xy._2, wb.value, iter_sgd, alpha, regParam)) 194 | W = wsub.map(w => w.map(_ / nrPartitions)).reduce(_+_) 195 | if (verbose) { 196 | println("%d %.5f %.5f".format(i, evaluate(train, W), evaluate(valid, W))) 197 | } 198 | } 199 | 200 | train_XY.unpersist() 201 | 202 | return W 203 | } 204 | 205 | -------------------------------------------------------------------------------- /img/parallel_sgd.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blebreton/spark-FM-parallelSGD/165b4b2a8beb6d6c00c59c9a9909ccc799e3caa4/img/parallel_sgd.PNG --------------------------------------------------------------------------------