├── LICENSE ├── README.md ├── pom.xml ├── src ├── main │ └── scala │ │ └── io │ │ └── github │ │ └── qf6101 │ │ └── mfm │ │ ├── baseframe │ │ ├── Coefficients.scala │ │ ├── MLLearner.scala │ │ ├── MLModel.scala │ │ ├── ModelParam.scala │ │ ├── binomial │ │ │ ├── BinLearner.scala │ │ │ ├── BinModel.scala │ │ │ └── BinModelParam.scala │ │ └── mutinomial │ │ │ ├── MultiLearner.scala │ │ │ ├── MultiModel.scala │ │ │ └── MultiModelParam.scala │ │ ├── factorization │ │ ├── binomial │ │ │ ├── FmCoefficients.scala │ │ │ ├── FmGradient.scala │ │ │ ├── FmLearnSGD.scala │ │ │ ├── FmModel.scala │ │ │ └── FmModelParam.scala │ │ └── multinomial │ │ │ ├── MfmCoefficients.scala │ │ │ ├── MfmGradient.scala │ │ │ ├── MfmLearnSGD.scala │ │ │ ├── MfmModel.scala │ │ │ └── MfmModelParam.scala │ │ ├── logisticregression │ │ ├── LogisticGradient.scala │ │ ├── LrLearnLBFGS.scala │ │ ├── LrLearnSGD.scala │ │ ├── LrModel.scala │ │ ├── LrModelParam.scala │ │ └── VectorCoefficients.scala │ │ ├── optimization │ │ ├── DecreasingStrategy.scala │ │ ├── Gradient.scala │ │ ├── GradientDescent.scala │ │ ├── LBFGS.scala │ │ ├── LBFGSParam.scala │ │ ├── Optimizer.scala │ │ ├── SGDParam.scala │ │ └── Updater.scala │ │ ├── tuning │ │ ├── BinCrossValidation.scala │ │ ├── BinParamGridBuilder.scala │ │ ├── BinaryClassificationMetrics.scala │ │ └── RegressionMetrics.scala │ │ └── util │ │ ├── GaussianRandom.scala │ │ ├── HDFSUtil.scala │ │ ├── LoadDSUtil.scala │ │ ├── Logging.scala │ │ ├── NumericParser.scala │ │ ├── ParamUtil.scala │ │ └── VectorConverter.scala └── test │ ├── resources │ └── log4j.properties │ └── scala │ └── io │ └── github │ └── qf6101 │ └── mfm │ ├── factorization │ ├── binomial │ │ └── FmSuite.scala │ └── multinomial │ │ ├── MfmCoefficientsSuite.scala │ │ └── MfmSuite.scala │ ├── optimization │ ├── GradientDescentSuite.scala │ └── LBFGSSuite.scala │ └── util │ ├── MfmTestSparkSession.scala │ ├── ParamSuite.scala │ ├── ParquetIOTest.scala │ └── TestingUtils.scala └── test_data └── input ├── README.txt ├── a1a ├── a1a └── a1a.t └── mnist └── .gitkeep /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multinomial Factorization Machines 2 | 3 | ## Brief Description 4 | 5 | This project implements binomial and multinomial factorization machines. Factorization machines are a generic approach that combines the generality of feature engineering with the superiority of factorization models in estimating interactions between categorical variables of large domains. Please refer to [\[Steffen Rendle (2010)\]](http://www.inf.uni-konstanz.de/~rendle/pdf/Rendle2010FM.pdf) for more detail. 6 | 7 | This implementation is based on Spark 2.0.0, in contrast to the well-known standalone implementation, [libfm](http://www.libfm.org/). Some auxiliary code (e.g., the optimization and Logging utilities) was adapted from Spark's private internals. 8 | 9 | ## Binomial Factorization Machines (FM) 10 | 11 | FM is designed for binary classification problems, like the standard [libfm](http://www.libfm.org/). Please refer to [src/test/scala/io/github/qf6101/mfm/factorization/binomial/FmSuite.scala](src/test/scala/io/github/qf6101/mfm/factorization/binomial/FmSuite.scala 12 | ) for detailed usage. 13 | 14 | > Note: The implementation takes the labels as +1/-1. 15 | 16 | ## Multinomial Factorization Machines (MFM) 17 | 18 | MFM is designed for multi-class classification problems and uses softmax as its hypothesis. Please refer to [src/test/scala/io/github/qf6101/mfm/factorization/multinomial/MfmSuite.scala 19 | ](src/test/scala/io/github/qf6101/mfm/factorization/multinomial/MfmSuite.scala 20 | ) for detailed usage. 21 | 22 | > Note: The implementation takes the labels as 0, 1, 2, etc. -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | io.github.qf6101 8 | multinomial-factorization-machines 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 2.11.7 13 | 2.11 14 | 2.0.0 15 | 16 | 17 | 18 | 19 | org.apache.parquet 20 | parquet-tools 21 | 1.7.0 22 | 23 | 24 | com.github.pathikrit 25 | better-files_${scala.binary.version} 26 | 2.16.0 27 | 28 | 29 | com.github.scopt 30 | scopt_${scala.binary.version} 31 | 3.5.0 32 | 33 | 34 | joda-time 35 | joda-time 36 | 2.7 37 | 38 | 39 | org.joda 40 | joda-convert 41 | 1.7 42 | 43 | 44 | org.apache.spark 45 | spark-mllib_${scala.binary.version} 46 | ${spark.version} 47 | provided 48 | 49 | 50 | org.scalatest 51 | scalatest_${scala.binary.version} 52 | 3.0.0 53 | test 54 | 55 | 56 | 57 | 58 | 59 | 60 | org.apache.maven.plugins 61 | maven-compiler-plugin 62 | 2.0.2 63 | 64 | 1.6 65 | 1.6 66 | UTF-8 67 | 68 | 69 | 70 | org.apache.maven.plugins 71 | maven-jar-plugin 72 | 2.3.1 73 | 74 | 75 | maven-assembly-plugin 76 | 77 | 78 | jar-with-dependencies 79 | 80 | 81 | 82 | 83 | make-assembly 84 | package 85 | 86 | single 87 | 88 | 89 | 90 | 91 | 92 | net.alchim31.maven 93 | scala-maven-plugin 94 | 3.2.2 95 | 96 | 97 | 98 | compile 99 | testCompile 100 | 101 | 102 | 103 | 104 | 105 | -Xms64m 106 | -Xmx1024m 107 | 108 | ${scala.binary.version} 109 | ${scala.version} 110 | 111 | 112 | 113 | org.apache.maven.plugins 114 | maven-surefire-plugin 115 | 2.7 116 | 117 | true 118 | 119 | 120 | 121 | org.scalatest 122 | scalatest-maven-plugin 123 | 1.0 124 | 125 | ${project.build.directory}/surefire-reports 126 | .
127 | WDF TestSuite.txt 128 | 129 | 130 | 131 | test 132 | 133 | test 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/Coefficients.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.apache.parquet.hadoop.ParquetReader 5 | import org.apache.parquet.tools.read.{SimpleReadSupport, SimpleRecord} 6 | 7 | import scala.collection.mutable.ListBuffer 8 | 9 | /** 10 | * Created by qfeng on 15-3-12. 11 | */ 12 | 13 | /** 14 | * 模型系数,抽象基类 15 | */ 16 | abstract class Coefficients extends Serializable { 17 | /** 18 | * 只复制this的结构(比如参数个数),不复制内容 19 | * 20 | * @return 复制的拷贝 21 | */ 22 | def copyEmpty(): Coefficients 23 | 24 | /** 25 | * 同时复制this的结构和内容 26 | * 27 | * @return 复制的拷贝 28 | */ 29 | def copy: Coefficients 30 | 31 | /** 32 | * 对应系数加法,加至this上 33 | * 34 | * @param other 加数 35 | * @return this 36 | */ 37 | def +=(other: Coefficients): Coefficients 38 | 39 | /** 40 | * 对应系数减法,减至this上 41 | * 42 | * @param other 减数 43 | * @return this 44 | */ 45 | def -=(other: Coefficients): Coefficients 46 | 47 | /** 48 | * 49 | * 对应系数加法,加至复制this的类上 50 | * 51 | * @param other 加数 52 | * @return 加法结果(拷贝) 53 | */ 54 | def +(other: Coefficients): Coefficients = { 55 | val result = this.copy 56 | result += other 57 | result 58 | } 59 | 60 | /** 61 | * 对应系数加上同一实数,加至复制this的类上 62 | * 63 | * @param addend 加数 64 | * @return 加法结果(拷贝) 65 | */ 66 | def +(addend: Double): Coefficients 67 | 68 | /** 69 | * 对应系数减上同一实数,减至复制this的类上 70 | * 71 | * @param minuend 减数 72 | * @return 减法结果(拷贝) 73 | */ 74 | def -(minuend: Double): Coefficients = { 75 | this.copy + (-minuend) 76 | } 77 | 78 | /** 79 | * 对应系数除上同一实数,加至复制this的类上 80 | * 81 | * @param dividend 除数 82 | * @return 除法结果 83 | */ 84 | def /(dividend: Double): Coefficients 85 | 86 | /** 87 | * 对应系数乘上同一实数,加至复制this的类上 88 | * 89 | * @param multiplier 乘数 90 | * @return 乘法结果 91 | */ 92 | def *(multiplier: Double): Coefficients 93 | 94 | /** 95 | * 计算L2的正则值 96 | * 97 | * @param reg 正则参数 98 | * @return 参数加权后的L2正则值 99 | */ 100 | def L2RegValue(reg: Array[Double]): Double 101 | 102 | /** 103 | * 计算L2的正则梯度值 104 | * 105 | * @param reg 正则参数 106 | * @return 参数加权后的L2正则梯度值 107 | */ 108 | def L2RegGradient(reg: Array[Double]): Coefficients 109 | 110 | /** 111 | * 用L1稀疏化系数 112 | * 113 | * @param regParam 正则参数值 114 | * @param stepSize 学习率 115 | * @return 稀疏化后的系数 116 | */ 117 | def L1Shrink(regParam: Array[Double], stepSize: Double): Coefficients 118 | 119 | /** 120 | * 计算L1的正则值 121 | * 122 | * @param regParam 正则参数 123 | * @return 参数绝对值加权后的L1正则值 124 | */ 125 | def L1RegValue(regParam: Array[Double]): Double 126 | 127 | /** 128 | * 计算系数的2范数 129 | * sum(abs(A).^p)^(1/p) where p=2 130 | * 131 | * @return 系数的2范数 132 | */ 133 | def norm: Double 134 | 135 | /** 136 | * 计算两组系数差异的2范数 137 | * 138 | * @param other 另一组系数 139 | * @return 差异的2范数值 140 | */ 141 | def normDiff(other: Coefficients): Double = { 142 | (this - other).norm 143 | } 144 | 145 | /** 146 | * 对应系数减法,减至复制this的类上 147 | * 148 | * @param other 减数 149 | * @return 减法结果(拷贝) 150 | */ 151 | def -(other: Coefficients): Coefficients = { 152 | val result = this.copy 153 | result -= other 154 | result 155 | } 156 | 157 | /** 158 | * 保存系数至文件 159 | * 160 | * @param location 文件位置 161 | */ 162 | def save(location: String): Unit = { 163 | saveMeta(location + "/" + Coefficients.namingMetaFile) 164 | 
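// The coefficients are persisted in two parts: "<location>/coeff_meta" holds the JSON metadata
// written by saveMeta above, and "<location>/coeff_data" holds the numeric values written by
// saveData below (the concrete layout is up to the subclass, e.g. FmCoefficients writes parquet).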
saveData(location + "/" + Coefficients.namingDataFile) 165 | } 166 | 167 | /** 168 | * 保存元数据至文件 169 | * 170 | * @param location 文件位置 171 | */ 172 | def saveMeta(location: String): Unit 173 | 174 | /** 175 | * 保存数据至文件 176 | * 177 | * @param location 文件位置 178 | */ 179 | def saveData(location: String): Unit 180 | 181 | /** 182 | * 与另一个系数是否相等 183 | * 184 | * @param other 另一个系数 185 | * @return 是否相等 186 | */ 187 | def equals(other: Coefficients): Boolean 188 | } 189 | 190 | /** 191 | * 静态系数对象 192 | */ 193 | object Coefficients { 194 | val namingCoeffType: String = "coeff_type" 195 | val namingMetaFile: String = "coeff_meta" 196 | val namingDataFile: String = "coeff_data" 197 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/MLLearner.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe 2 | 3 | import org.apache.spark.ml.param.ParamMap 4 | 5 | /** 6 | * Created by qfeng on 16-9-9. 7 | */ 8 | 9 | /** 10 | * 学习器基类 11 | * 12 | * @param params 参数池 13 | */ 14 | abstract class MLLearner(val params: ParamMap) extends Serializable { 15 | /** 16 | * 更新参数池 17 | * 18 | * @param updatingParams 更新参数 19 | */ 20 | def updateParams(updatingParams: ParamMap): Unit = { 21 | params ++= updatingParams 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/MLModel.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe 2 | 3 | import org.apache.spark.ml.param.ParamMap 4 | 5 | /** 6 | * Created by qfeng on 16-9-8. 7 | */ 8 | 9 | /** 10 | * 机器学习模型基类 11 | * 12 | * @param paramMeta 模型参赛 13 | * @param coeffs 模型系数 14 | * @param params 参数池(保存参数的值) 15 | */ 16 | abstract class MLModel(val paramMeta: ModelParam, 17 | val coeffs: Coefficients, 18 | val params: ParamMap) extends Serializable { 19 | /** 20 | * 保存模型文件 21 | * 22 | * @param location 模型文件的位置 23 | */ 24 | def save(location: String): Unit = { 25 | //保存模型系数 26 | coeffs.save(location + "/" + MLModel.namingCoeffFile) 27 | //保存模型参数 28 | paramMeta.save(location + "/" + MLModel.namingParamFile, params) 29 | } 30 | 31 | /** 32 | * 模型内容是否相同 33 | * 34 | * @param other 另一个模型 35 | * @return 内容是否相同 36 | */ 37 | def equals(other: MLModel): Boolean 38 | } 39 | 40 | /** 41 | * 静态模型对象 42 | */ 43 | object MLModel { 44 | val namingCoeffFile = "coefficient" 45 | val namingParamFile = "params" 46 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/ModelParam.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe 2 | 3 | import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} 4 | import org.apache.spark.sql.SparkSession 5 | import org.json4s.JsonAST 6 | import org.json4s.JsonDSL._ 7 | import org.json4s.jackson.JsonMethods._ 8 | 9 | /** 10 | * Created by qfeng on 15-4-2. 
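 *
 * The trait below serializes its parameter values with toJSON and writes them as a single-line
 * JSON text file via save(). For the two base parameters this yields, with illustrative values:
 * {{{
 *   {"initMean": 0.0, "initStdev": 0.01}
 * }}}
 * Subclasses (e.g. FmModelParam) extend this JSON object with their own fields.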
11 | */ 12 | 13 | /** 14 | * 模型参数 15 | */ 16 | trait ModelParam extends Serializable { 17 | val initMean: Param[Double] = new Param("ModelParam", "initMean", "使用高斯分布,初始化参数值,均值", ParamValidators.inRange(0, 1)) 18 | val initStdev: Param[Double] = new Param("ModelParam", "initStdev", "使用高斯分布,初始化参数值,标准差值", ParamValidators.inRange(0, 1)) 19 | 20 | 21 | /** 22 | * 将模型参数值保存至文件 23 | * 24 | * @param location 保存位置 25 | * @param params 参数池 26 | */ 27 | def save(location: String, params: ParamMap): Unit = { 28 | SparkSession.builder().getOrCreate().sparkContext. 29 | makeRDD(List(compact(render(this.toJSON(params))))).repartition(1).saveAsTextFile(location) 30 | } 31 | 32 | /** 33 | * Transform parameters to json object 34 | * 35 | * @return parameters in json format 36 | */ 37 | def toJSON(params: ParamMap): JsonAST.JObject = { 38 | (initMean.name -> params(initMean)) ~ (initStdev.name -> params(initStdev)) 39 | } 40 | } 41 | 42 | /** 43 | * 静态模型参数对象 44 | */ 45 | object ModelParam { 46 | val namingParamType = "param_type" 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/binomial/BinLearner.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe.binomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.MLLearner 5 | import org.apache.spark.ml.param.ParamMap 6 | import org.apache.spark.rdd.RDD 7 | 8 | /** 9 | * Created by qfeng on 15-3-27. 10 | */ 11 | 12 | /** 13 | * 二分学习器基类 14 | * 15 | * @param params 参数池 16 | */ 17 | abstract class BinLearner(override val params: ParamMap) extends MLLearner(params) { 18 | /** 19 | * 训练二分模型 20 | * 21 | * @param dataset 训练集 22 | * @return 二分模型 23 | */ 24 | def train(dataset: RDD[(Double, SparseVector[Double])]): BinModel 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/binomial/BinModel.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe.binomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.{Coefficients, MLModel} 5 | import io.github.qf6101.mfm.tuning.BinaryClassificationMetrics 6 | import io.github.qf6101.mfm.util.Logging 7 | import org.apache.spark.ml.param.ParamMap 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.storage.StorageLevel 10 | 11 | import scala.util.control.Breaks 12 | 13 | /** 14 | * Created by qfeng on 15-3-27. 
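 *
 * The binomial model below maps each sample to a score in (0, 1); selectThreshold then sweeps
 * candidate thresholds from 0.05 to 0.95 in steps of 0.05, stops early once the F1-score becomes
 * NaN, keeps the threshold with the highest F1-score, and stores it in binaryThreshold.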
15 | */ 16 | 17 | /** 18 | * 二分模型基类 19 | * 20 | * @param paramMeta 模型参赛 21 | * @param coeffs 模型系数 22 | * @param params 参数池(保存参数的值) 23 | */ 24 | abstract class BinModel(override val paramMeta: BinModelParam, 25 | override val coeffs: Coefficients, 26 | override val params: ParamMap) 27 | extends MLModel(paramMeta, coeffs, params) with Logging with Serializable { 28 | //设置默认的阈值为0.5 29 | params.put(paramMeta.binaryThreshold, 0.5) 30 | 31 | /** 32 | * 对输入数据进行预测 33 | * 34 | * @param data 输入数据 35 | * @return 预测值(0~1) 36 | */ 37 | def predict(data: SparseVector[Double]): Double 38 | 39 | /** 40 | * 对输入数据集进行预测 41 | * 42 | * @param dataSet 输入数据集 43 | * @return 预测值集合(0~1) 44 | */ 45 | def predict(dataSet: RDD[SparseVector[Double]]): RDD[Double] = { 46 | dataSet.map(predict) 47 | } 48 | 49 | /** 50 | * 选择二分分离器的阈值(固定AUC,选择F1-score最大的阈值) 51 | * 52 | * @param dataSet 数据集合 53 | */ 54 | def selectThreshold(dataSet: RDD[(Double, SparseVector[Double])]): Array[BinaryClassificationMetrics] = { 55 | //生成对数据集的预测结果并持久化 56 | val scoreAndLabels = dataSet.map { case (label, data) => 57 | (predict(data), label) 58 | }.persist(StorageLevel.MEMORY_AND_DISK_SER) 59 | //以0.05为间隔,尝试每个threshold,选择F1_score最大的threshold 60 | //直至遇到F1_score为NaN,停止尝试 61 | var maxF1Score = Double.MinValue 62 | var selectedThreshold = 0.5 63 | val loop = new Breaks 64 | loop.breakable { 65 | for (tryThreshold <- 0.05 until 1.0 by 0.05) { 66 | val metrics = new BinaryClassificationMetrics(scoreAndLabels, tryThreshold) 67 | logDebug(s"threshold selection => f1-score: ${"%1.4f".format(metrics.f1_scores._1)}, threshold: ${"%1.2f".format(tryThreshold)}") 68 | if (metrics.f1_scores._1.isNaN) { 69 | loop.break() 70 | } else if (metrics.f1_scores._1 > maxF1Score) { 71 | maxF1Score = metrics.f1_scores._1 72 | selectedThreshold = tryThreshold 73 | } 74 | } 75 | } 76 | //设置选择得到的threshold 77 | params.put(paramMeta.binaryThreshold, selectedThreshold) 78 | //计算最终的度量指标 79 | val finalMetrics = new BinaryClassificationMetrics(scoreAndLabels, selectedThreshold) 80 | logInfo(s"selected threshold: $selectedThreshold, metrics: ${finalMetrics.toString}}") 81 | //解除持久化 82 | scoreAndLabels.unpersist() 83 | Array(finalMetrics) 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/binomial/BinModelParam.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe.binomial 2 | 3 | import io.github.qf6101.mfm.baseframe.ModelParam 4 | import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} 5 | import org.json4s.JsonAST 6 | import org.json4s.JsonDSL._ 7 | 8 | /** 9 | * Created by qfeng on 16-9-8. 
10 | */ 11 | trait BinModelParam extends ModelParam { 12 | //default value: 0.5 13 | val binaryThreshold: Param[Double] = new Param("BinModelParam", "binaryThreshold", "threshold for binary classification", ParamValidators.inRange(0, 1, false, false)) 14 | 15 | /** 16 | * Transform parameters to json object 17 | * 18 | * @return parameters in json format 19 | */ 20 | override def toJSON(params: ParamMap): JsonAST.JObject = { 21 | super.toJSON(params) ~ (binaryThreshold.name -> params(binaryThreshold)) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/mutinomial/MultiLearner.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe.mutinomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.MLLearner 5 | import org.apache.spark.ml.param.ParamMap 6 | import org.apache.spark.rdd.RDD 7 | 8 | /** 9 | * Created by qfeng on 16-9-9. 10 | */ 11 | 12 | /** 13 | * 多分类学习器基类 14 | * 15 | * @param params 参数池 16 | */ 17 | abstract class MultiLearner(override val params: ParamMap) extends MLLearner(params) { 18 | /** 19 | * 训练对应模型 20 | * 21 | * @param dataset 训练集 22 | * @return 模型 23 | */ 24 | def train(dataset: RDD[(Double, SparseVector[Double])]): MultiModel 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/mutinomial/MultiModel.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe.mutinomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.{Coefficients, MLModel} 5 | import io.github.qf6101.mfm.util.Logging 6 | import org.apache.spark.ml.param.ParamMap 7 | import org.apache.spark.rdd.RDD 8 | 9 | /** 10 | * Created by qfeng on 16-9-9. 11 | */ 12 | 13 | /** 14 | * 多分类模型基类 15 | * 16 | * @param paramMeta 模型参赛 17 | * @param coeffs 模型系数 18 | * @param params 参数池(保存参数的值) 19 | */ 20 | abstract class MultiModel(override val paramMeta: MultiModelParam, 21 | override val coeffs: Coefficients, 22 | override val params: ParamMap) 23 | extends MLModel(paramMeta, coeffs, params) with Logging with Serializable { 24 | /** 25 | * 对输入数据进行预测 26 | * 27 | * @param data 输入数据 28 | * @return 预测值向量(0~1) 29 | */ 30 | def predict(data: SparseVector[Double]): Array[Double] 31 | 32 | /** 33 | * 对输入数据集进行预测 34 | * 35 | * @param dataSet 输入数据集 36 | * @return 预测值集合(0~1) 37 | */ 38 | def predict(dataSet: RDD[SparseVector[Double]]): RDD[Array[Double]] = { 39 | dataSet.map(predict) 40 | } 41 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/mutinomial/MultiModelParam.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe.mutinomial 2 | 3 | import io.github.qf6101.mfm.baseframe.ModelParam 4 | 5 | /** 6 | * Created by qfeng on 16-9-9. 
7 | */ 8 | 9 | /** 10 | * 多分类模型参数 11 | */ 12 | trait MultiModelParam extends ModelParam 13 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/binomial/FmCoefficients.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.binomial 2 | 3 | import better.files.File 4 | import breeze.linalg.{DenseMatrix, DenseVector} 5 | import io.github.qf6101.mfm.baseframe.Coefficients 6 | import io.github.qf6101.mfm.util.GaussianRandom 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.parquet.hadoop.ParquetReader 9 | import org.apache.parquet.tools.read.{SimpleReadSupport, SimpleRecord} 10 | import org.apache.spark.sql.SparkSession 11 | import org.json4s.DefaultFormats 12 | import org.json4s.JsonDSL._ 13 | import org.json4s.jackson.JsonMethods._ 14 | 15 | import scala.collection.mutable.ListBuffer 16 | import scala.math._ 17 | 18 | /** 19 | * Created by qfeng on 15-3-12. 20 | */ 21 | 22 | /** 23 | * Factorization Machine模型系数 24 | * 25 | * @param initMean 随机初始值均值 26 | * @param initStdev 随机初始值标准差 27 | * @param numFeatures 特征个数 28 | * @param numFactors 因子个数 29 | * @param k0 是否需要处理截距 30 | * @param k1 是否需要处理一阶参数 31 | * @param k2 是否需要处理二阶参数 32 | */ 33 | class FmCoefficients(val initMean: Double, 34 | val initStdev: Double, 35 | var numFeatures: Int, 36 | var numInteractFeatures: Int, 37 | var numFactors: Int, 38 | val k0: Boolean, 39 | val k1: Boolean, 40 | val k2: Boolean) extends Coefficients { 41 | var w0 = 0.0 42 | var w = DenseVector.zeros[Double](numFeatures) 43 | var v = GaussianRandom.randDenseMatrix(initMean, initStdev, numInteractFeatures, numFactors) 44 | 45 | /** 46 | * 用breeze稀疏向量和CSC稀疏矩阵初始化模型系数 47 | * 48 | * @param w0 0阶系数 49 | * @param w 1阶系数 50 | * @param v 2阶系数 51 | * @param k0 是否需要处理截距 52 | * @param k1 是否需要处理一阶参数 53 | * @param k2 是否需要处理二阶参数 54 | */ 55 | def this(w0: Double, w: DenseVector[Double], v: DenseMatrix[Double], k0: Boolean, k1: Boolean, k2: Boolean) { 56 | this(0.0, 0.0, w.length, v.rows, v.cols, k0, k1, k2) 57 | this.w0 = w0 58 | this.w = w.copy 59 | this.v = v.copy 60 | } 61 | 62 | /** 63 | * 只复制this的结构(比如参数个数),不复制内容 64 | * 65 | * @return 复制的拷贝 66 | */ 67 | override def copyEmpty(): Coefficients = new FmCoefficients(this.initMean, this.initMean, 68 | this.numFeatures, this.numInteractFeatures, this.numFactors, this.k0, this.k1, this.k2) 69 | 70 | /** 71 | * 对应系数加法,加至this上 72 | * 73 | * @param other 加数 74 | * @return this 75 | */ 76 | override def +=(other: Coefficients): Coefficients = { 77 | val otherCoeffs = other.asInstanceOf[FmCoefficients] 78 | if (k0) this.w0 += otherCoeffs.w0 79 | if (k1) this.w += otherCoeffs.w 80 | if (k2) this.v += otherCoeffs.v 81 | this 82 | } 83 | 84 | /** 85 | * 对应系数减法,减至this上 86 | * 87 | * @param other 减数 88 | * @return this 89 | */ 90 | override def -=(other: Coefficients): Coefficients = { 91 | val otherCoeffs = other.asInstanceOf[FmCoefficients] 92 | if (k0) this.w0 -= otherCoeffs.w0 93 | if (k1) this.w -= otherCoeffs.w 94 | if (k2) this.v -= otherCoeffs.v 95 | this 96 | } 97 | 98 | /** 99 | * 对应系数加上同一实数,加至复制this的类上 100 | * 101 | * @param addend 加数 102 | * @return 加法结果(拷贝) 103 | */ 104 | override def +(addend: Double): Coefficients = { 105 | val result = this.copy.asInstanceOf[FmCoefficients] 106 | if (k0) result.w0 += addend 107 | if (k1) result.w += addend 108 | if (k2) result.v += addend 109 | result 110 | } 111 | 112 | /** 113 | * 对应系数乘上同一实数,加至复制this的类上 114 | * 115 | * @param 
multiplier 乘数 116 | * @return 乘法结果 117 | */ 118 | override def *(multiplier: Double): Coefficients = { 119 | val result = this.copy.asInstanceOf[FmCoefficients] 120 | if (k0) result.w0 *= multiplier 121 | if (k1) result.w *= multiplier 122 | if (k2) result.v *= multiplier 123 | result 124 | } 125 | 126 | /** 127 | * 同时复制this的结构和内容 128 | * 129 | * @return 复制的拷贝 130 | */ 131 | override def copy: Coefficients = { 132 | //从效率出发,参数设为0 133 | val coeffs = new FmCoefficients(this.initMean, this.initStdev, 0, 0, 0, this.k0, this.k1, this.k2) 134 | coeffs.numFeatures = this.numFeatures 135 | coeffs.numInteractFeatures = this.numInteractFeatures 136 | coeffs.numFactors = this.numFactors 137 | coeffs.w0 = this.w0 138 | coeffs.w = this.w.copy 139 | coeffs.v = this.v.copy 140 | coeffs 141 | } 142 | 143 | /** 144 | * 对应系数除上同一实数,加至复制this的类上 145 | * 146 | * @param dividend 除数 147 | * @return 除法结果 148 | */ 149 | override def /(dividend: Double): Coefficients = { 150 | val result = this.copy.asInstanceOf[FmCoefficients] 151 | if (k0) result.w0 /= dividend 152 | if (k1) result.w /= dividend 153 | if (k2) result.v /= dividend 154 | result 155 | } 156 | 157 | /** 158 | * 计算L2的正则值 159 | * 160 | * @param reg 正则参数 161 | * @return 参数加权后的L2正则值 162 | */ 163 | override def L2RegValue(reg: Array[Double]): Double = { 164 | val zeroRegValue = if (k0) w0 * w0 * reg(0) else 0.0 165 | val firstRegValue = if (k1 && w.activeSize > 0) w.activeValuesIterator.reduce(_ + Math.pow(_, 2)) * reg(1) else 0.0 166 | val secondRegValue = if (k2 && v.activeSize > 0) v.activeValuesIterator.reduce(_ + Math.pow(_, 2)) * reg(2) else 0.0 167 | 0.5 * (zeroRegValue + firstRegValue + secondRegValue) 168 | } 169 | 170 | /** 171 | * 计算L2的正则梯度值 172 | * 173 | * @param reg 正则参数 174 | * @return 参数加权后的L2正则梯度值 175 | */ 176 | override def L2RegGradient(reg: Array[Double]): Coefficients = { 177 | val result = this.copy.asInstanceOf[FmCoefficients] 178 | if (k0) result.w0 *= reg(0) 179 | if (k1) result.w *= reg(1) 180 | if (k2) result.v *= reg(2) 181 | result 182 | } 183 | 184 | /** 185 | * 用L1稀疏化系数 186 | * 187 | * @param regParam 正则参数值 188 | * @param stepSize 学习率 189 | * @return 稀疏化后的系数 190 | */ 191 | override def L1Shrink(regParam: Array[Double], stepSize: Double): Coefficients = { 192 | //0阶参数 193 | if (k0) { 194 | val zeroShrinkageVal = regParam(0) * stepSize 195 | w0 = signum(w0) * max(0.0, abs(w0) - zeroShrinkageVal) 196 | } 197 | //1阶参数 198 | if (k1) { 199 | val firstShrinkageVal = regParam(1) * stepSize 200 | val newW = DenseVector.zeros[Double](w.length) 201 | w.activeIterator.foreach { case (index, weight) => 202 | val newWeight = signum(weight) * max(0.0, abs(weight) - firstShrinkageVal) 203 | if (newWeight == 0) { 204 | Nil 205 | } else { 206 | newW.update(index, newWeight) 207 | } 208 | } 209 | w = newW 210 | } 211 | //2阶参数 212 | if (k2) { 213 | val secondShrinkageVal = regParam(2) * stepSize / numFactors 214 | val newV = DenseMatrix.zeros[Double](v.rows, v.cols) 215 | v.activeIterator.foreach { case ((rowIndex, colIndex), weight) => 216 | val newWeight = signum(weight) * max(0.0, abs(weight) - secondShrinkageVal) 217 | if (newWeight == 0) { 218 | Nil 219 | } else { 220 | newV.update(rowIndex, colIndex, newWeight) 221 | } 222 | } 223 | v = newV 224 | } 225 | //全部更新完后,返回结果 226 | this 227 | } 228 | 229 | /** 230 | * 计算L1的正则值 231 | * 232 | * @param reg 正则参数 233 | * @return 参数绝对值加权后的L1正则值 234 | */ 235 | override def L1RegValue(reg: Array[Double]): Double = { 236 | val zeroRegValue = if (k0) abs(w0) * reg(0) else 0.0 237 | val firstRegValue = if 
(k1 && w.activeSize > 0) w.activeIterator.foldLeft(0.0) { case (absSum, (_, weight)) => 238 | absSum + abs(weight) 239 | } * reg(1) 240 | else 0.0 241 | val secondRegValue = if (k2 && v.activeSize > 0) v.activeIterator.foldLeft(0.0) { case (absSum, (_, weight)) => 242 | absSum + abs(weight) 243 | } * reg(2) 244 | else 0.0 245 | zeroRegValue + firstRegValue + secondRegValue 246 | } 247 | 248 | /** 249 | * 计算系数的2范数 250 | * sum(abs(A).^p)^(1/p) where p=2 251 | * 252 | * @return 系数的2范数 253 | */ 254 | override def norm: Double = { 255 | val zeroSum = if (k0) w0 * w0 else 0.0 256 | val firstSum = if (k1 && w.activeSize > 0) w.activeIterator.foldLeft(0.0) { case (sum: Double, (_, value: Double)) => 257 | sum + value * value 258 | } else 0.0 259 | val secondSum = if (k2 && v.activeSize > 0) v.activeIterator.foldLeft(0.0) { case (sum: Double, (_, value: Double)) => 260 | sum + value * value 261 | } else 0.0 262 | math.sqrt(zeroSum + firstSum + secondSum) 263 | } 264 | 265 | /** 266 | * 保存元数据至文件 267 | * 268 | * @param location 文件位置 269 | */ 270 | override def saveMeta(location: String): Unit = { 271 | val json = (Coefficients.namingCoeffType -> FmCoefficients.getClass.toString) ~ 272 | (FmCoefficients.namingIntercept -> w0) ~ 273 | (FmCoefficients.namingWSize -> w.size) ~ 274 | (FmCoefficients.namingVRows -> v.rows) ~ 275 | (FmCoefficients.namingVCols -> v.cols) ~ 276 | (FmCoefficients.namingK0 -> k0) ~ 277 | (FmCoefficients.namingK1 -> k1) ~ 278 | (FmCoefficients.namingK2 -> k2) 279 | SparkSession.builder().getOrCreate().sparkContext. 280 | makeRDD(List(compact(render(json)))).repartition(1).saveAsTextFile(location) 281 | } 282 | 283 | /** 284 | * 保存数据至文件 285 | * 286 | * @param location 文件位置 287 | */ 288 | override def saveData(location: String): Unit = { 289 | val spark = SparkSession.builder().getOrCreate() 290 | spark.createDataFrame(w.data.map(Tuple1(_))).repartition(1).toDF("value").write.parquet(location + "/w") 291 | spark.createDataFrame(v.data.map(Tuple1(_))).repartition(1).toDF("value").write.parquet(location + "/v") 292 | } 293 | 294 | /** 295 | * 与另一个系数是否相等 296 | * 297 | * @param other 另一个系数 298 | * @return 是否相等 299 | */ 300 | override def equals(other: Coefficients): Boolean = { 301 | other match { 302 | case otherCoeffs: FmCoefficients => 303 | if (w0 == otherCoeffs.w0 && w.equals(otherCoeffs.w) && v.equals(otherCoeffs.v)) true else false 304 | case _ => false 305 | } 306 | } 307 | } 308 | 309 | /** 310 | * 静态FM系数对象 311 | */ 312 | object FmCoefficients { 313 | val namingIntercept = "intercept" 314 | val namingWSize = "w_size" 315 | val namingVRows = "v_rows" 316 | val namingVCols = "v_cols" 317 | val namingK0 = "k0" 318 | val namingK1 = "k1" 319 | val namingK2 = "k2" 320 | 321 | /** 322 | * 系数文件构造分解机系数 323 | * 324 | * @param location 系数文件 325 | * @return 分解机系数 326 | */ 327 | def apply(location: String): FmCoefficients = { 328 | //初始化spark session 329 | val spark = SparkSession.builder().getOrCreate() 330 | import spark.implicits._ 331 | //读取元数据 332 | val meta = spark.read.json(location + "/" + Coefficients.namingMetaFile).first() 333 | val w0 = meta.getAs[Double](namingIntercept) 334 | val vRows = meta.getAs[Long](namingVRows).toInt 335 | val vCols = meta.getAs[Long](namingVCols).toInt 336 | val k0 = meta.getAs[Boolean](namingK0) 337 | val k1 = meta.getAs[Boolean](namingK1) 338 | val k2 = meta.getAs[Boolean](namingK2) 339 | // 读取w向量 340 | val w = DenseVector(spark.read.parquet(location + "/" + Coefficients.namingDataFile + "/w").map { row => 341 | row.getAs[Double]("value") 342 | 
}.collect()) 343 | // 读取v向量 344 | val v = DenseMatrix.create(vRows, vCols, spark.read.parquet(location + "/" + Coefficients.namingDataFile + "/v") 345 | .map { row => 346 | row.getAs[Double]("value") 347 | }.collect()) 348 | //返回结果 349 | new FmCoefficients(w0, w, v, k0, k1, k2) 350 | } 351 | 352 | /** 353 | * 从本地文件载入系数 354 | * 355 | * @param location 本地文件 356 | * @return FM系数对象 357 | */ 358 | def fromLocal(location: String): FmCoefficients = { 359 | //读取元数据 360 | implicit val formats = DefaultFormats 361 | val meta = parse(File(location + "/" + Coefficients.namingMetaFile + "/part-00000").contentAsString) 362 | val w0 = (meta \ namingIntercept).extract[Double] 363 | val vRows = (meta \ namingVRows).extract[Int] 364 | val vCols = (meta \ namingVCols).extract[Int] 365 | val k0 = (meta \ namingK0).extract[Boolean] 366 | val k1 = (meta \ namingK1).extract[Boolean] 367 | val k2 = (meta \ namingK2).extract[Boolean] 368 | // 读取w和v向量 369 | val w = DenseVector(readValues(location + "/" + Coefficients.namingDataFile + "/w")) 370 | val v = DenseMatrix.create(vRows, vCols, readValues(location + "/" + Coefficients.namingDataFile + "/v")) 371 | //返回结果 372 | new FmCoefficients(w0, w, v, k0, k1, k2) 373 | } 374 | 375 | /** 376 | * 从本地文件读取系数内容 377 | * 378 | * @param location 本地文件 379 | * @return 系数对象 380 | */ 381 | def readValues(location: String): Array[Double] = { 382 | val reader = ParquetReader.builder[SimpleRecord](new SimpleReadSupport(), new Path(location)).build() 383 | val values = ListBuffer[Double]() 384 | var elem = reader.read() 385 | while (elem != null) { 386 | values += elem.getValues.get(0).getValue.asInstanceOf[Double] 387 | elem = reader.read() 388 | } 389 | reader.close() 390 | values.toArray 391 | } 392 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/binomial/FmGradient.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.binomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.Coefficients 5 | import io.github.qf6101.mfm.optimization.Gradient 6 | import io.github.qf6101.mfm.util.Logging 7 | import org.apache.spark.ml.param.ParamMap 8 | 9 | /** 10 | * Created by qfeng on 15-3-11. 11 | */ 12 | 13 | /** 14 | * FM梯度 15 | * 16 | * @param paramMeta 参数元数据 17 | * @param params 参数池(保存参数值) 18 | */ 19 | class FmGradient(paramMeta: FmModelParam, params: ParamMap) extends Gradient with Logging { 20 | /** 21 | * Compute the gradient and loss given the features of a single data point, 22 | * add the gradient to a provided vector to avoid creating new objects, and return loss. 
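 * For labels y in {+1, -1} and linear score s = FmModel.linearScore(x), the loss used here is
 * the logistic loss log(1 + exp(-y * s)); its derivative w.r.t. s is the `multiplier` computed
 * below, -y * (1 - 1 / (1 + exp(-y * s))). The per-coefficient gradient contributions are then
 * multiplier for w0, multiplier * x_i for w_i, and
 * multiplier * (x_i * sum_j(v_jf * x_j) - v_if * x_i^2) for each factor entry v_if.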
23 | * 24 | * @param data features for one data point 25 | * @param label label for this data point 26 | * @param coeffs weights/coefficients corresponding to features 27 | * @param cumGradient the computed gradient will be added to this vector 28 | * @return loss 29 | */ 30 | override def compute(data: SparseVector[Double], 31 | label: Double, 32 | coeffs: Coefficients, 33 | cumGradient: Coefficients): 34 | Double = { 35 | val fmcoeffs = coeffs.asInstanceOf[FmCoefficients] 36 | val fmCumGradient = cumGradient.asInstanceOf[FmCoefficients] 37 | val linearScore = FmModel.linearScore(data, paramMeta, params, fmcoeffs) 38 | val expComponent = 1 + math.exp(-label * linearScore) 39 | val loss = math.log(expComponent) 40 | val multiplier = -label * (1 - 1 / expComponent) 41 | //参与2阶项的最大维度 42 | val maxInteractFeatures = params(paramMeta.maxInteractFeatures) 43 | //0阶梯度 44 | if (params(paramMeta.k0)) { 45 | fmCumGradient.w0 += multiplier 46 | } 47 | //1阶梯度 48 | if (params(paramMeta.k1)) { 49 | data.activeIterator.foreach { case (index, value) => 50 | fmCumGradient.w(index) += multiplier * value 51 | } 52 | } 53 | //2阶梯度 54 | if (params(paramMeta.k2)) { 55 | for (factorIndex <- 0 until params(paramMeta.numFactors)) { 56 | //提前计算(因为求和中每一项都会用到)firstMoment = \sum_j^n {v_jf*x_j} (固定f) 57 | val firstMoment = data.activeIterator.foldLeft(0.0) { case (sum, (index, value)) => 58 | if (index < maxInteractFeatures) { 59 | sum + fmcoeffs.v(index, factorIndex) * value 60 | } else sum 61 | } 62 | //计算2阶梯度 63 | data.activeIterator.foreach { case (index, value) => 64 | if (index < maxInteractFeatures) { 65 | val twoWayCumCoeff = fmCumGradient.v(index, factorIndex) 66 | val twoWayCoeff = fmcoeffs.v(index, factorIndex) 67 | val incrementGradient = twoWayCumCoeff + multiplier * ((value * firstMoment) - (twoWayCoeff * value * value)) 68 | fmCumGradient.v.update(index, factorIndex, incrementGradient) 69 | } 70 | } 71 | } 72 | } 73 | loss 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/binomial/FmLearnSGD.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.binomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.MLModel 5 | import io.github.qf6101.mfm.baseframe.binomial.{BinLearner, BinModel} 6 | import io.github.qf6101.mfm.optimization.{GradientDescent, Updater} 7 | import org.apache.spark.ml.param.ParamMap 8 | import org.apache.spark.rdd.RDD 9 | 10 | /** 11 | * Created by qfeng on 15-3-27. 
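 *
 * A minimal training sketch (illustrative parameter values; `updater` is assumed to be an
 * Updater from io.github.qf6101.mfm.optimization, `trainingSet` an RDD[(Double, SparseVector[Double])]
 * with labels in {+1, -1}, and any SGD-specific parameters expected by GradientDescent, see
 * SGDParam, also need to be set):
 * {{{
 *   val params = new ParamMap()
 *   val learner = new FmLearnSGD(params, updater)
 *   params.put(learner.numFeatures, 123)            // illustrative feature count
 *   params.put(learner.maxInteractFeatures, 123)    // features taking part in 2-way terms
 *   params.put(learner.numFactors, 8)
 *   params.put(learner.k0, true)
 *   params.put(learner.k1, true)
 *   params.put(learner.k2, true)
 *   params.put(learner.initMean, 0.0)
 *   params.put(learner.initStdev, 0.01)
 *   params.put(learner.reg0, 0.0)
 *   params.put(learner.reg1, 0.01)
 *   params.put(learner.reg2, 0.01)
 *   val model = learner.train(trainingSet)
 *   model.save("target/fm_model")                   // hypothetical output path
 * }}}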
12 | */ 13 | 14 | /** 15 | * FM模型的SGD学习器 16 | * 17 | * @param params 参数池 18 | * @param updater 模型参数更新器 19 | */ 20 | class FmLearnSGD(override val params: ParamMap, 21 | val updater: Updater) 22 | extends BinLearner(params) with FmModelParam { 23 | val lg = new FmGradient(this, params) 24 | val gd = new GradientDescent(lg, updater, params) 25 | 26 | /** 27 | * 训练模型 28 | * 29 | * @param dataset 训练集 30 | * @return 模型 31 | */ 32 | override def train(dataset: RDD[(Double, SparseVector[Double])]): BinModel = { 33 | val initialCoeffs = new FmCoefficients(params(initMean), params(initStdev), 34 | params(numFeatures), params(maxInteractFeatures), params(numFactors), params(k0), params(k1), params(k2)) 35 | val regs = Array(params(reg0), params(reg1), params(reg2)) 36 | val coeffs = gd.optimize(dataset, initialCoeffs, regs) 37 | new FmModel(this, coeffs.asInstanceOf[FmCoefficients], params) 38 | } 39 | } 40 | 41 | /** 42 | * FM模型的SGD学习器实例 43 | */ 44 | object FmLearnSGD { 45 | def train(dataset: RDD[(Double, SparseVector[Double])], 46 | params: ParamMap, 47 | updater: Updater): MLModel = { 48 | new FmLearnSGD(params, updater).train(dataset) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/binomial/FmModel.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.binomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.MLModel 5 | import io.github.qf6101.mfm.baseframe.binomial.BinModel 6 | import io.github.qf6101.mfm.util.Logging 7 | import org.apache.spark.ml.param.ParamMap 8 | 9 | 10 | /** 11 | * Created by qfeng on 15-1-26. 12 | */ 13 | 14 | /** 15 | * Factorization Machine模型 16 | * 17 | * @param paramMeta 分解机模型参数 18 | * @param params 参数池 19 | */ 20 | class FmModel(override val paramMeta: FmModelParam, 21 | override val coeffs: FmCoefficients, 22 | override val params: ParamMap) 23 | extends BinModel(paramMeta, coeffs, params) { 24 | /** 25 | * 对输入数据进行预测 26 | * 27 | * @param data 输入数据 28 | * @return 预测值 29 | */ 30 | override def predict(data: SparseVector[Double]): Double = { 31 | val score = FmModel.linearScore(data, paramMeta, params, coeffs) 32 | 1.0 / (1.0 + math.exp(-score)) 33 | } 34 | 35 | /** 36 | * 模型内容是否相同 37 | * 38 | * @param other 另一个模型 39 | * @return 内容是否相同 40 | */ 41 | override def equals(other: MLModel): Boolean = { 42 | other match { 43 | case otherModel: FmModel => 44 | if (paramMeta.toJSON(params).equals(otherModel.paramMeta.toJSON(otherModel.params)) 45 | && coeffs.equals(otherModel.coeffs)) true 46 | else false 47 | case _ => false 48 | } 49 | } 50 | } 51 | 52 | object FmModel extends Logging { 53 | /** 54 | * 计算样本的线性得分值 55 | * 56 | * @param data 样本 57 | * @param paramMeta 参数元数据 58 | * @param params 参数池 59 | * @param coeffs FM系数 60 | * @return 输入样本的线性得分值 61 | */ 62 | def linearScore(data: SparseVector[Double], 63 | paramMeta: FmModelParam, 64 | params: ParamMap, 65 | coeffs: FmCoefficients): Double = { 66 | //初始化各阶预测值为0 67 | var zeroWayPredict = 0.0 68 | var oneWayPredict = 0.0 69 | var twoWayPredict = 0.0 70 | //参与2阶项的最大维度 71 | val maxInteractAttr = params(paramMeta.maxInteractFeatures) 72 | //0阶预测值 73 | if (params(paramMeta.k0)) { 74 | zeroWayPredict += coeffs.w0 75 | } 76 | //1阶预测值 77 | if (params(paramMeta.k1)) { 78 | data.activeIterator.foreach { case (index, value) => 79 | oneWayPredict += coeffs.w(index) * value 80 | } 81 | } 82 | //2阶预测值 83 | if 
(params(paramMeta.k2)) { 84 | for (factorIndex <- 0 until params(paramMeta.numFactors)) { 85 | var firstMoment = 0.0 86 | var secondMoment = 0.0 87 | data.activeIterator.foreach { case (index, value) => 88 | if (index < maxInteractAttr) { 89 | firstMoment += coeffs.v(index, factorIndex) * value 90 | secondMoment += math.pow(coeffs.v(index, factorIndex) * value, 2) 91 | } 92 | } 93 | twoWayPredict += firstMoment * firstMoment - secondMoment 94 | } 95 | } 96 | zeroWayPredict + oneWayPredict + 0.5 * twoWayPredict 97 | } 98 | 99 | /** 100 | * 从文件载入分解机模型 101 | * 102 | * @param location 包含分解机型信息的文件 103 | * @return 分解机模型 104 | */ 105 | def apply(location: String): FmModel = { 106 | val params = new ParamMap() 107 | val paramMeta = FmModelParam(location + "/" + MLModel.namingParamFile, params) 108 | val coefficients = FmCoefficients(location + "/" + MLModel.namingCoeffFile) 109 | new FmModel(paramMeta, coefficients, params) 110 | } 111 | 112 | /** 113 | * 从本地文件载入分解机模型 114 | * 115 | * @param location 包含分解机型信息的本地文件 116 | * @return 分解机模型 117 | */ 118 | def fromLocal(location: String): FmModel = { 119 | val params = new ParamMap() 120 | val paramMeta = FmModelParam.fromLocal(location + "/" + MLModel.namingParamFile + "/part-00000", params) 121 | val coefficients = FmCoefficients.fromLocal(location + "/" + MLModel.namingCoeffFile) 122 | new FmModel(paramMeta, coefficients, params) 123 | } 124 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/binomial/FmModelParam.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.binomial 2 | 3 | import better.files.File 4 | import io.github.qf6101.mfm.baseframe.ModelParam 5 | import io.github.qf6101.mfm.baseframe.binomial.BinModelParam 6 | import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} 7 | import org.apache.spark.sql.SparkSession 8 | import org.json4s.{DefaultFormats, JsonAST} 9 | import org.json4s.JsonDSL._ 10 | import org.json4s.jackson.JsonMethods._ 11 | 12 | /** 13 | * Created by qfeng on 15-1-26. 
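 *
 * The parameter trait below is persisted as a single JSON object (see ModelParam.save), and
 * apply()/fromLocal() read the same fields back. With illustrative values the saved file looks
 * roughly like:
 * {{{
 *   {"initMean": 0.0, "initStdev": 0.01, "binaryThreshold": 0.5, "param_type": "<FmModelParam class name>",
 *    "reg0": 0.0, "reg1": 0.01, "reg2": 0.01, "numFeatures": 123, "numFactors": 8,
 *    "k0": true, "k1": true, "k2": true, "maxInteractFeatures": 123}
 * }}}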
14 | */ 15 | 16 | /** 17 | * Factorization Machine模型参数 18 | */ 19 | trait FmModelParam extends BinModelParam { 20 | val numFeatures: Param[Int] = new Param("FmModelParam", "numFeatures", "样本维度数", ParamValidators.gt(0)) 21 | val numFactors: Param[Int] = new Param("FmModelParam", "numFactors", "2阶分解维度数", ParamValidators.gt(0)) 22 | val k0: Param[Boolean] = new Param("FmModelParam", "k0", "是否考虑0阶", ParamValidators.inArray(Array(true, false))) 23 | val k1: Param[Boolean] = new Param("FmModelParam", "k1", "是否考虑1阶", ParamValidators.inArray(Array(true, false))) 24 | val k2: Param[Boolean] = new Param("FmModelParam", "k2", "是否考虑2阶", ParamValidators.inArray(Array(true, false))) 25 | val reg0: Param[Double] = new Param("FmModelParam", "reg0", "正则参数") 26 | val reg1: Param[Double] = new Param("FmModelParam", "reg1", "正则参数") 27 | val reg2: Param[Double] = new Param("FmModelParam", "reg2", "正则参数") 28 | val maxInteractFeatures: Param[Int] = new Param("FmModelParam", "maxInteractFeatures", "参与2阶项的最大特征维度(不包含)", ParamValidators.gt(0)) 29 | 30 | /** 31 | * Transform parameters to json object 32 | * 33 | * @return parameters in json format 34 | */ 35 | override def toJSON(params: ParamMap): JsonAST.JObject = { 36 | super.toJSON(params) ~ 37 | (ModelParam.namingParamType -> FmModelParam.getClass.toString) ~ 38 | (reg0.name -> params(reg0)) ~ 39 | (reg1.name -> params(reg1)) ~ 40 | (reg2.name -> params(reg2)) ~ 41 | (numFeatures.name -> params(numFeatures)) ~ 42 | (numFactors.name -> params(numFactors)) ~ 43 | (k0.name -> params(k0)) ~ 44 | (k1.name -> params(k1)) ~ 45 | (k2.name -> params(k2)) ~ 46 | (maxInteractFeatures.name -> params(maxInteractFeatures)) 47 | } 48 | } 49 | 50 | object FmModelParam { 51 | /** 52 | * 从参数文件构造分解机模型参数 53 | * 54 | * @param location 参数文件位置 55 | * @param params 参数池 56 | * @return 分解机型参数 57 | */ 58 | def apply(location: String, params: ParamMap): FmModelParam = { 59 | // 初始化参数对象和spark session 60 | val fmModelParam = new FmModelParam {} 61 | val spark = SparkSession.builder().getOrCreate() 62 | // 读取参数值 63 | val paramValues = spark.read.json(location).first() 64 | val binaryThreshold = paramValues.getAs[Double](fmModelParam.binaryThreshold.name) 65 | val reg0 = paramValues.getAs[Double](fmModelParam.reg0.name) 66 | val reg1 = paramValues.getAs[Double](fmModelParam.reg1.name) 67 | val reg2 = paramValues.getAs[Double](fmModelParam.reg2.name) 68 | val numFeatures = paramValues.getAs[Long](fmModelParam.numFeatures.name).toInt 69 | val numFactors = paramValues.getAs[Long](fmModelParam.numFactors.name).toInt 70 | val k0 = paramValues.getAs[Boolean](fmModelParam.k0.name) 71 | val k1 = paramValues.getAs[Boolean](fmModelParam.k1.name) 72 | val k2 = paramValues.getAs[Boolean](fmModelParam.k2.name) 73 | val initMean = paramValues.getAs[Double](fmModelParam.initMean.name) 74 | val initStdev = paramValues.getAs[Double](fmModelParam.initStdev.name) 75 | val maxInteractFeatures = paramValues.getAs[Long](fmModelParam.maxInteractFeatures.name).toInt 76 | // 设置参数值 77 | params.put(fmModelParam.binaryThreshold, binaryThreshold) 78 | params.put(fmModelParam.reg0, reg0) 79 | params.put(fmModelParam.reg1, reg1) 80 | params.put(fmModelParam.reg2, reg2) 81 | params.put(fmModelParam.numFeatures, numFeatures) 82 | params.put(fmModelParam.numFactors, numFactors) 83 | params.put(fmModelParam.k0, k0) 84 | params.put(fmModelParam.k1, k1) 85 | params.put(fmModelParam.k2, k2) 86 | params.put(fmModelParam.initMean, initMean) 87 | params.put(fmModelParam.initStdev, initStdev) 88 | 
params.put(fmModelParam.maxInteractFeatures, maxInteractFeatures) 89 | // 返回FM参数 90 | fmModelParam 91 | } 92 | 93 | /** 94 | * 从本地文件载入参数 95 | * 96 | * @param location 本地文件位置 97 | * @param params 参数池 98 | * @return 分解机参数 99 | */ 100 | def fromLocal(location: String, params: ParamMap): FmModelParam = { 101 | // 初始化参数对象 102 | val fmModelParam = new FmModelParam {} 103 | implicit val formats = DefaultFormats 104 | // 读取参数值 105 | val paramValues = parse(File(location).contentAsString) 106 | val binaryThreshold = (paramValues \ fmModelParam.binaryThreshold.name).extract[Double] 107 | val reg0 = (paramValues \ fmModelParam.reg0.name).extract[Double] 108 | val reg1 = (paramValues \ fmModelParam.reg1.name).extract[Double] 109 | val reg2 = (paramValues \ fmModelParam.reg2.name).extract[Double] 110 | val numFeatures = (paramValues \ fmModelParam.numFeatures.name).extract[Int] 111 | val numFactors = (paramValues \ fmModelParam.numFactors.name).extract[Int] 112 | val k0 = (paramValues \ fmModelParam.k0.name).extract[Boolean] 113 | val k1 = (paramValues \ fmModelParam.k1.name).extract[Boolean] 114 | val k2 = (paramValues \ fmModelParam.k2.name).extract[Boolean] 115 | val initMean = (paramValues \ fmModelParam.initMean.name).extract[Double] 116 | val initStdev = (paramValues \ fmModelParam.initStdev.name).extract[Double] 117 | val maxInteractFeatures = (paramValues \ fmModelParam.maxInteractFeatures.name).extract[Int] 118 | // 设置参数值 119 | params.put(fmModelParam.binaryThreshold, binaryThreshold) 120 | params.put(fmModelParam.reg0, reg0) 121 | params.put(fmModelParam.reg1, reg1) 122 | params.put(fmModelParam.reg2, reg2) 123 | params.put(fmModelParam.numFeatures, numFeatures) 124 | params.put(fmModelParam.numFactors, numFactors) 125 | params.put(fmModelParam.k0, k0) 126 | params.put(fmModelParam.k1, k1) 127 | params.put(fmModelParam.k2, k2) 128 | params.put(fmModelParam.initMean, initMean) 129 | params.put(fmModelParam.initStdev, initStdev) 130 | params.put(fmModelParam.maxInteractFeatures, maxInteractFeatures) 131 | // 返回FM参数 132 | fmModelParam 133 | } 134 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/multinomial/MfmCoefficients.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.multinomial 2 | 3 | import better.files.File 4 | import io.github.qf6101.mfm.baseframe.Coefficients 5 | import io.github.qf6101.mfm.factorization.binomial.FmCoefficients 6 | import org.apache.spark.sql.SparkSession 7 | import org.json4s.DefaultFormats 8 | import org.json4s.JsonDSL._ 9 | import org.json4s.jackson.JsonMethods._ 10 | 11 | /** 12 | * Created by qfeng on 16-9-7. 
13 | */ 14 | 15 | /** 16 | * Multinomial Factorization Machine模型系数 17 | * 18 | * @param initMean 随机初始值均值 19 | * @param initStdev 随机初始值标准差 20 | * @param numFeatures 特征个数 21 | * @param numFactors 因子个数 22 | * @param k0 是否需要处理截距 23 | * @param k1 是否需要处理一阶参数 24 | * @param k2 是否需要处理二阶参数 25 | * @param numClasses 标签个数 26 | */ 27 | class MfmCoefficients(val initMean: Double, 28 | val initStdev: Double, 29 | var numFeatures: Int, 30 | var numInteractFeatures: Int, 31 | var numFactors: Int, 32 | val k0: Boolean, 33 | val k1: Boolean, 34 | val k2: Boolean, 35 | val numClasses: Int) extends Coefficients { 36 | // 每个标签对应一个FM系数 37 | var thetas = Array.fill[FmCoefficients](numClasses)(new FmCoefficients( 38 | initMean, initStdev, numFeatures, numInteractFeatures, numFactors, k0, k1, k2)) 39 | 40 | /** 41 | * 从FM系数数组构造多分类模型系数 42 | * 43 | * @param thetas FM系数数组 44 | */ 45 | def this(thetas: Array[FmCoefficients]) { 46 | this(thetas(0).initMean, thetas(0).initStdev, thetas(0).numFeatures, thetas(0).numInteractFeatures, 47 | thetas(0).numFactors, thetas(0).k0, thetas(0).k1, thetas(0).k2, thetas.length) 48 | this.thetas = thetas 49 | } 50 | 51 | /** 52 | * 只复制this的结构(比如参数个数),不复制内容 53 | * 54 | * @return 复制的拷贝 55 | */ 56 | override def copyEmpty(): Coefficients = new MfmCoefficients(this.initMean, this.initStdev, 57 | this.numFeatures, this.numInteractFeatures, this.numFactors, this.k0, this.k1, this.k2, this.numClasses) 58 | 59 | /** 60 | * 对应系数加法,加至this上 61 | * 62 | * @param other 加数 63 | * @return this 64 | */ 65 | override def +=(other: Coefficients): Coefficients = { 66 | val otherCoeffs = other.asInstanceOf[MfmCoefficients] 67 | (this.thetas zip otherCoeffs.thetas).foreach { case (me, he) => 68 | me += he 69 | } 70 | this 71 | } 72 | 73 | /** 74 | * 对应系数减法,减至this上 75 | * 76 | * @param other 减数 77 | * @return this 78 | */ 79 | override def -=(other: Coefficients): Coefficients = { 80 | val otherCoeffs = other.asInstanceOf[MfmCoefficients] 81 | (this.thetas zip otherCoeffs.thetas).foreach { case (me, he) => 82 | me -= he 83 | } 84 | this 85 | } 86 | 87 | /** 88 | * 对应系数加上同一实数,加至复制this的类上 89 | * 90 | * @param addend 加数 91 | * @return 加法结果(拷贝) 92 | */ 93 | override def +(addend: Double): Coefficients = { 94 | val me = this.copy.asInstanceOf[MfmCoefficients] 95 | val result = me.thetas.map { theta => 96 | (theta + addend).asInstanceOf[FmCoefficients] 97 | } 98 | new MfmCoefficients(result) 99 | } 100 | 101 | /** 102 | * 对应系数乘上同一实数,加至复制this的类上 103 | * 104 | * @param multiplier 乘数 105 | * @return 乘法结果 106 | */ 107 | override def *(multiplier: Double): Coefficients = { 108 | val me = this.copy.asInstanceOf[MfmCoefficients] 109 | val result = me.thetas.map { theta => 110 | (theta * multiplier).asInstanceOf[FmCoefficients] 111 | } 112 | new MfmCoefficients(result) 113 | } 114 | 115 | /** 116 | * 对应系数除上同一实数,加至复制this的类上 117 | * 118 | * @param dividend 除数 119 | * @return 除法结果 120 | */ 121 | override def /(dividend: Double): Coefficients = { 122 | val me = this.copy.asInstanceOf[MfmCoefficients] 123 | val result = me.thetas.map { theta => 124 | (theta / dividend).asInstanceOf[FmCoefficients] 125 | } 126 | new MfmCoefficients(result) 127 | } 128 | 129 | /** 130 | * 计算L1的正则值 131 | * 132 | * @param regParam 正则参数 133 | * @return 参数绝对值加权后的L1正则值 134 | */ 135 | override def L1RegValue(regParam: Array[Double]): Double = { 136 | thetas.map { theta => 137 | theta.L1RegValue(regParam) 138 | }.sum 139 | } 140 | 141 | 142 | /** 143 | * 计算系数的2范数 144 | * sum(abs(A).^p)^(1/p) where p=2 145 | * 146 | * @return 系数的2范数 147 | */ 148 | 
override def norm: Double = { 149 | this.thetas.map(_.norm).sum / this.thetas.length 150 | } 151 | 152 | /** 153 | * 用L1稀疏化系数 154 | * 155 | * @param regParam 正则参数值 156 | * @param stepSize 学习率 157 | * @return 稀疏化后的系数 158 | */ 159 | override def L1Shrink(regParam: Array[Double], stepSize: Double): Coefficients = { 160 | thetas.foreach { theta => 161 | theta.L1Shrink(regParam, stepSize) 162 | } 163 | this 164 | } 165 | 166 | 167 | /** 168 | * 同时复制this的结构和内容 169 | * 170 | * @return 复制的拷贝 171 | */ 172 | override def copy: Coefficients = { 173 | new MfmCoefficients(this.thetas.map(_.copy.asInstanceOf[FmCoefficients])) 174 | } 175 | 176 | /** 177 | * 计算L2的正则值 178 | * 179 | * @param reg 正则参数 180 | * @return 参数加权后的L2正则值 181 | */ 182 | override def L2RegValue(reg: Array[Double]): Double = { 183 | thetas.map { theta => 184 | theta.L2RegValue(reg) 185 | }.sum 186 | } 187 | 188 | /** 189 | * 计算L2的正则梯度值 190 | * 191 | * @param reg 正则参数 192 | * @return 参数加权后的L2正则梯度值 193 | */ 194 | override def L2RegGradient(reg: Array[Double]): Coefficients = { 195 | val me = this.copy.asInstanceOf[MfmCoefficients] 196 | val result = me.thetas.map { theta => 197 | theta.L2RegGradient(reg).asInstanceOf[FmCoefficients] 198 | } 199 | new MfmCoefficients(result) 200 | } 201 | 202 | /** 203 | * 保存元数据至文件 204 | * 205 | * @param location 文件位置 206 | */ 207 | override def saveMeta(location: String): Unit = { 208 | val json = (Coefficients.namingCoeffType -> MfmCoefficients.getClass.toString) ~ 209 | (MfmCoefficients.namingNumClasses -> numClasses) 210 | SparkSession.builder().getOrCreate().sparkContext. 211 | makeRDD(List(compact(render(json)))).repartition(1).saveAsTextFile(location) 212 | } 213 | 214 | /** 215 | * 保存数据至文件 216 | * 217 | * @param location 文件位置 218 | */ 219 | override def saveData(location: String): Unit = { 220 | thetas.zipWithIndex.foreach { case (theta, index) => 221 | theta.saveMeta(location + "/" + index + "/" + Coefficients.namingMetaFile) 222 | theta.saveData(location + "/" + index + "/" + Coefficients.namingDataFile) 223 | } 224 | } 225 | 226 | /** 227 | * 与另一个系数是否相等 228 | * 229 | * @param other 另一个系数 230 | * @return 是否相等 231 | */ 232 | override def equals(other: Coefficients): Boolean = { 233 | other match { 234 | case otherCoeffs: MfmCoefficients => 235 | (thetas zip otherCoeffs.thetas).foldLeft(true) { case (eq, (me, he)) => 236 | eq && me.equals(he) 237 | } 238 | case _ => false 239 | } 240 | } 241 | } 242 | 243 | /** 244 | * 多分类FM系数对象 245 | */ 246 | object MfmCoefficients { 247 | val namingNumClasses = "num_classes" 248 | 249 | /** 250 | * 从文件构造多分类FM系数对象 251 | * 252 | * @param location 文件位置 253 | * @return 多分类FM系数对象 254 | */ 255 | def apply(location: String): MfmCoefficients = { 256 | // 初始化spark session 257 | val spark = SparkSession.builder().getOrCreate() 258 | // 读取元数据 259 | val meta = spark.read.json(location + "/" + Coefficients.namingMetaFile + "/part-00000").first() 260 | val numClasses = meta.getAs[Long](namingNumClasses).toInt 261 | // 读取系数 262 | val thetas = Array.fill[FmCoefficients](numClasses)(null) 263 | for (index <- 0 until numClasses) { 264 | thetas(index) = FmCoefficients(location + "/" + Coefficients.namingDataFile + "/" + index) 265 | } 266 | // 返回结果 267 | new MfmCoefficients(thetas) 268 | } 269 | 270 | /** 271 | * 从本地文件载入系数 272 | * 273 | * @param location 本地文件 274 | * @return MFM系数对象 275 | */ 276 | def fromLocal(location: String): MfmCoefficients = { 277 | //读取元数据 278 | implicit val formats = DefaultFormats 279 | val meta = parse(File(location + "/" + Coefficients.namingMetaFile + 
"/part-00000").contentAsString) 280 | val numClasses = (meta \ namingNumClasses).extract[Int] 281 | // 读取系数 282 | val thetas = Array.fill[FmCoefficients](numClasses)(null) 283 | for (index <- 0 until numClasses) { 284 | thetas(index) = FmCoefficients.fromLocal(location + "/" + Coefficients.namingDataFile + "/" + index) 285 | } 286 | // 返回结果 287 | new MfmCoefficients(thetas) 288 | } 289 | } 290 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/multinomial/MfmGradient.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.multinomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.Coefficients 5 | import io.github.qf6101.mfm.optimization.Gradient 6 | import io.github.qf6101.mfm.util.Logging 7 | import org.apache.spark.ml.param.ParamMap 8 | 9 | /** 10 | * Created by qfeng on 16-9-7. 11 | */ 12 | 13 | /** 14 | * 多分类FM梯度 15 | * 16 | * @param paramMeta 多分类FM参数 17 | * @param params 参数池 18 | */ 19 | class MfmGradient(paramMeta: MfmModelParam, params: ParamMap) extends Gradient with Logging { 20 | /** 21 | * Compute the gradient and loss given the features of a single data point, 22 | * add the gradient to a provided vector to avoid creating new objects, and return loss. 23 | * 24 | * @param data features for one data point 25 | * @param label label for this data point 26 | * @param coeffs weights/coefficients corresponding to features 27 | * @param cumGradient the computed gradient will be added to this vector 28 | * @return loss 29 | */ 30 | override def compute(data: SparseVector[Double], 31 | label: Double, 32 | coeffs: Coefficients, 33 | cumGradient: Coefficients): Double = { 34 | val mfmCoeff = coeffs.asInstanceOf[MfmCoefficients] 35 | val mfmCumGradient = cumGradient.asInstanceOf[MfmCoefficients] 36 | val scores = MfmModel.predict(data, paramMeta, params, mfmCoeff) 37 | val multipliers = scores.zipWithIndex.map { case (score, index) => 38 | if (label.toInt == index) score - 1.0 else score 39 | } 40 | //参与2阶项的最大维度 41 | val maxInteractFeatures = params(paramMeta.maxInteractFeatures) 42 | val loss = -math.log(scores(label.toInt)) 43 | (mfmCoeff.thetas zip mfmCumGradient.thetas zip multipliers).foreach { case ((fmCoeff, fmCumGradient), multiplier) => 44 | //0阶梯度 45 | if (params(paramMeta.k0)) { 46 | fmCumGradient.w0 += multiplier 47 | } 48 | //1阶梯度 49 | if (params(paramMeta.k1)) { 50 | data.activeIterator.foreach { case (index, value) => 51 | fmCumGradient.w(index) += multiplier * value 52 | } 53 | } 54 | //2阶梯度 55 | if (params(paramMeta.k2)) { 56 | for (factorIndex <- 0 until params(paramMeta.numFactors)) { 57 | //提前计算(因为求和中每一项都会用到)firstMoment = \sum_j^n {v_jf*x_j} (固定f) 58 | val firstMoment = data.activeIterator.foldLeft(0.0) { case (sum, (index, value)) => 59 | if (index < maxInteractFeatures) { 60 | sum + fmCoeff.v(index, factorIndex) * value 61 | } else sum 62 | } 63 | //计算2阶梯度 64 | data.activeIterator.foreach { case (index, value) => 65 | if (index < maxInteractFeatures) { 66 | val twoWayCumCoeff = fmCumGradient.v(index, factorIndex) 67 | val twoWayCoeff = fmCoeff.v(index, factorIndex) 68 | val incrementGradient = twoWayCumCoeff + multiplier * ((value * firstMoment) - (twoWayCoeff * value * value)) 69 | fmCumGradient.v.update(index, factorIndex, incrementGradient) 70 | } 71 | } 72 | } 73 | } 74 | } 75 | loss 76 | } 77 | } 78 | 
-------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/multinomial/MfmLearnSGD.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.multinomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.mutinomial.{MultiLearner, MultiModel} 5 | import io.github.qf6101.mfm.optimization.{GradientDescent, Updater} 6 | import org.apache.spark.ml.param.ParamMap 7 | import org.apache.spark.rdd.RDD 8 | 9 | /** 10 | * Created by qfeng on 16-9-7. 11 | */ 12 | class MfmLearnSGD(override val params: ParamMap, 13 | val updater: Updater) extends MultiLearner(params) with MfmModelParam { 14 | val lg = new MfmGradient(this, params) 15 | val gd = new GradientDescent(lg, updater, params) 16 | 17 | /** 18 | * 训练对应模型 19 | * 20 | * @param dataset 训练集 21 | * @return 模型 22 | */ 23 | override def train(dataset: RDD[(Double, SparseVector[Double])]): MultiModel = { 24 | val initialCoeffs = new MfmCoefficients(params(initMean), params(initStdev), params(numFeatures), 25 | params(maxInteractFeatures), params(numFactors), params(k0), params(k1), params(k2), params(numClasses)) 26 | val regs = Array(params(reg0), params(reg1), params(reg2)) 27 | val coeffs = gd.optimize(dataset, initialCoeffs, regs) 28 | new MfmModel(this, coeffs.asInstanceOf[MfmCoefficients], params) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/multinomial/MfmModel.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.multinomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.MLModel 5 | import io.github.qf6101.mfm.baseframe.mutinomial.MultiModel 6 | import io.github.qf6101.mfm.factorization.binomial.FmModel 7 | import org.apache.spark.ml.param.ParamMap 8 | 9 | /** 10 | * Created by qfeng on 16-9-7. 
11 | */ 12 | 13 | /** 14 | * 多分类FM模型 15 | * 16 | * @param paramMeta 模型参赛 17 | * @param coeffs 模型系数 18 | * @param params 参数池(保存参数的值) 19 | */ 20 | class MfmModel(override val paramMeta: MfmModelParam, 21 | override val coeffs: MfmCoefficients, 22 | override val params: ParamMap) extends MultiModel(paramMeta, coeffs, params) { 23 | //dump, 设置默认的阈值为0.5 24 | params.put(paramMeta.binaryThreshold, 0.5) 25 | 26 | /** 27 | * 对输入数据进行预测 28 | * 29 | * @param data 输入数据 30 | * @return 预测值向量(0~1) 31 | */ 32 | override def predict(data: SparseVector[Double]): Array[Double] = { 33 | MfmModel.predict(data, paramMeta, params, coeffs) 34 | } 35 | 36 | /** 37 | * 模型内容是否相同 38 | * 39 | * @param other 另一个模型 40 | * @return 内容是否相同 41 | */ 42 | override def equals(other: MLModel): Boolean = { 43 | other match { 44 | case otherModel: MfmModel => 45 | if (paramMeta.toJSON(params).equals(otherModel.paramMeta.toJSON(otherModel.params)) 46 | && coeffs.equals(otherModel.coeffs)) true 47 | else false 48 | case _ => false 49 | } 50 | } 51 | } 52 | 53 | /** 54 | * 多分类FM模型对象 55 | */ 56 | object MfmModel { 57 | /** 58 | * 对输入样本进行预测 59 | * 60 | * @param data 样本数据 61 | * @param paramMeta 多分类FM参数 62 | * @param params 参数池 63 | * @param coeffs 多分类FM系数 64 | * @return 预测值 65 | */ 66 | def predict(data: SparseVector[Double], 67 | paramMeta: MfmModelParam, 68 | params: ParamMap, 69 | coeffs: MfmCoefficients): Array[Double] = { 70 | // 计算线性得分 71 | val scores = coeffs.thetas.map { theta => 72 | FmModel.linearScore(data, paramMeta, params, theta) 73 | } 74 | // 为了防止溢出,对分子分母都除以maxScore,得到adjustedScores 75 | val maxScore = scores.max 76 | val adjustedScores = scores.map { score => 77 | math.exp(score - maxScore) 78 | } 79 | // 计算归一化后的得分 80 | val sumAdjustedScores = adjustedScores.sum 81 | adjustedScores.map(_ / sumAdjustedScores) 82 | } 83 | 84 | /** 85 | * 从文件载入分解机模型 86 | * 87 | * @param location 包含分解机型信息的文件 88 | * @return 分解机模型 89 | */ 90 | def apply(location: String): MfmModel = { 91 | val params = new ParamMap() 92 | val paramMeta = MfmModelParam(location + "/" + MLModel.namingParamFile, params) 93 | val coefficients = MfmCoefficients(location + "/" + MLModel.namingCoeffFile) 94 | new MfmModel(paramMeta, coefficients, params) 95 | } 96 | 97 | /** 98 | * 从本地文件载入分解机模型 99 | * 100 | * @param location 包含分解机型信息的本地文件 101 | * @return 分解机模型 102 | */ 103 | def fromLocal(location: String): MfmModel = { 104 | val params = new ParamMap() 105 | val paramMeta = MfmModelParam.fromLocal(location + "/" + MLModel.namingParamFile + "/part-00000", params) 106 | val coefficients = MfmCoefficients.fromLocal(location + "/" + MLModel.namingCoeffFile) 107 | new MfmModel(paramMeta, coefficients, params) 108 | } 109 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/multinomial/MfmModelParam.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.multinomial 2 | 3 | import better.files.File 4 | import io.github.qf6101.mfm.baseframe.ModelParam 5 | import io.github.qf6101.mfm.baseframe.mutinomial.MultiModelParam 6 | import io.github.qf6101.mfm.factorization.binomial.FmModelParam 7 | import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} 8 | import org.apache.spark.sql.SparkSession 9 | import org.json4s.JsonAST.JField 10 | import org.json4s.JsonDSL._ 11 | import org.json4s.jackson.JsonMethods._ 12 | import org.json4s.{DefaultFormats, JObject, JsonAST} 13 | 14 | 15 | /** 16 | * Created by qfeng 
on 16-9-7. 17 | */ 18 | 19 | /** 20 | * 多分类FM模型参数 21 | */ 22 | trait MfmModelParam extends FmModelParam with MultiModelParam { 23 | val numClasses: Param[Int] = new Param("MfmModelParam", "numClasses", "标签数目", ParamValidators.gt(0)) 24 | 25 | /** 26 | * Transform parameters to json object 27 | * 28 | * @return parameters in json format 29 | */ 30 | override def toJSON(params: ParamMap): JsonAST.JObject = { 31 | val json = super.toJSON(params) removeField { 32 | case JField(ModelParam.namingParamType, _) => true 33 | case _ => false 34 | } 35 | json.asInstanceOf[JObject] ~ 36 | (numClasses.name -> params(numClasses)) ~ 37 | (ModelParam.namingParamType -> MfmModelParam.getClass.toString) 38 | } 39 | } 40 | 41 | object MfmModelParam { 42 | /** 43 | * 参数文件构造分解机模型参数 44 | * 45 | * @param location 文件位置 46 | * @param params 参数池 47 | * @return 分解机型参数 48 | */ 49 | def apply(location: String, params: ParamMap): MfmModelParam = { 50 | // 初始化参数对象和spark session 51 | val mfmModelParam = new MfmModelParam {} 52 | val spark = SparkSession.builder().getOrCreate() 53 | // 读取参数值 54 | val paramValues = spark.read.json(location).first() 55 | val binaryThreshold = paramValues.getAs[Double](mfmModelParam.binaryThreshold.name) 56 | val reg0 = paramValues.getAs[Double](mfmModelParam.reg0.name) 57 | val reg1 = paramValues.getAs[Double](mfmModelParam.reg1.name) 58 | val reg2 = paramValues.getAs[Double](mfmModelParam.reg2.name) 59 | val numFeatures = paramValues.getAs[Long](mfmModelParam.numFeatures.name).toInt 60 | val numFactors = paramValues.getAs[Long](mfmModelParam.numFactors.name).toInt 61 | val k0 = paramValues.getAs[Boolean](mfmModelParam.k0.name) 62 | val k1 = paramValues.getAs[Boolean](mfmModelParam.k1.name) 63 | val k2 = paramValues.getAs[Boolean](mfmModelParam.k2.name) 64 | val initMean = paramValues.getAs[Double](mfmModelParam.initMean.name) 65 | val initStdev = paramValues.getAs[Double](mfmModelParam.initStdev.name) 66 | val maxInteractFeatures = paramValues.getAs[Long](mfmModelParam.maxInteractFeatures.name).toInt 67 | val numClasses = paramValues.getAs[Long](mfmModelParam.numClasses.name).toInt 68 | // 设置参数值 69 | params.put(mfmModelParam.binaryThreshold, binaryThreshold) 70 | params.put(mfmModelParam.reg0, reg0) 71 | params.put(mfmModelParam.reg1, reg1) 72 | params.put(mfmModelParam.reg2, reg2) 73 | params.put(mfmModelParam.numFeatures, numFeatures) 74 | params.put(mfmModelParam.numFactors, numFactors) 75 | params.put(mfmModelParam.k0, k0) 76 | params.put(mfmModelParam.k1, k1) 77 | params.put(mfmModelParam.k2, k2) 78 | params.put(mfmModelParam.initMean, initMean) 79 | params.put(mfmModelParam.initStdev, initStdev) 80 | params.put(mfmModelParam.maxInteractFeatures, maxInteractFeatures) 81 | params.put(mfmModelParam.numClasses, numClasses) 82 | // 返回MFM参数 83 | mfmModelParam 84 | } 85 | 86 | /** 87 | * 从本地文件载入参数 88 | * 89 | * @param location 本地文件位置 90 | * @param params 参数池 91 | * @return 分解机参数 92 | */ 93 | def fromLocal(location: String, params: ParamMap): MfmModelParam = { 94 | // 初始化参数对象 95 | val mfmModelParam = new MfmModelParam {} 96 | implicit val formats = DefaultFormats 97 | // 读取参数值 98 | val paramValues = parse(File(location).contentAsString) 99 | val binaryThreshold = (paramValues \ mfmModelParam.binaryThreshold.name).extract[Double] 100 | val reg0 = (paramValues \ mfmModelParam.reg0.name).extract[Double] 101 | val reg1 = (paramValues \ mfmModelParam.reg1.name).extract[Double] 102 | val reg2 = (paramValues \ mfmModelParam.reg2.name).extract[Double] 103 | val numFeatures = (paramValues \ 
mfmModelParam.numFeatures.name).extract[Int] 104 | val numFactors = (paramValues \ mfmModelParam.numFactors.name).extract[Int] 105 | val k0 = (paramValues \ mfmModelParam.k0.name).extract[Boolean] 106 | val k1 = (paramValues \ mfmModelParam.k1.name).extract[Boolean] 107 | val k2 = (paramValues \ mfmModelParam.k2.name).extract[Boolean] 108 | val initMean = (paramValues \ mfmModelParam.initMean.name).extract[Double] 109 | val initStdev = (paramValues \ mfmModelParam.initStdev.name).extract[Double] 110 | val maxInteractFeatures = (paramValues \ mfmModelParam.maxInteractFeatures.name).extract[Int] 111 | val numClasses = (paramValues \ mfmModelParam.numClasses.name).extract[Int] 112 | // 设置参数值 113 | params.put(mfmModelParam.binaryThreshold, binaryThreshold) 114 | params.put(mfmModelParam.reg0, reg0) 115 | params.put(mfmModelParam.reg1, reg1) 116 | params.put(mfmModelParam.reg2, reg2) 117 | params.put(mfmModelParam.numFeatures, numFeatures) 118 | params.put(mfmModelParam.numFactors, numFactors) 119 | params.put(mfmModelParam.k0, k0) 120 | params.put(mfmModelParam.k1, k1) 121 | params.put(mfmModelParam.k2, k2) 122 | params.put(mfmModelParam.initMean, initMean) 123 | params.put(mfmModelParam.initStdev, initStdev) 124 | params.put(mfmModelParam.maxInteractFeatures, maxInteractFeatures) 125 | params.put(mfmModelParam.numClasses, numClasses) 126 | // 返回FM参数 127 | mfmModelParam 128 | } 129 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/logisticregression/LogisticGradient.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.logisticregression 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.Coefficients 5 | import io.github.qf6101.mfm.optimization.Gradient 6 | import org.apache.spark.ml.param.ParamMap 7 | 8 | /** 9 | * Created by qfeng on 15-3-13. 10 | */ 11 | 12 | 13 | /** 14 | * Compute gradient and loss for a logistic loss function, as used in binary classification. 15 | * See also the documentation for the precise formulation. 16 | */ 17 | class LogisticGradient(params: ParamMap) extends Gradient { 18 | /** 19 | * Compute the gradient and loss given the features of a single data point, 20 | * add the gradient to a provided vector to avoid creating new objects, and return loss. 
21 | * 22 | * @param data features for one data point 23 | * @param label label for this data point 24 | * @param coeffs weights/coefficients corresponding to features 25 | * @param cumGradient the computed gradient will be added to this vector 26 | * @return loss 27 | */ 28 | override def compute(data: SparseVector[Double], 29 | label: Double, 30 | coeffs: Coefficients, 31 | cumGradient: Coefficients): 32 | Double = { 33 | val vecCoeffs = coeffs.asInstanceOf[VectorCoefficients] 34 | val vecCumGradient = cumGradient.asInstanceOf[VectorCoefficients] 35 | val hypotheses = 1 / (1 + math.exp(-1.0 * vecCoeffs.dot(data))) 36 | val multiplier = hypotheses - label 37 | 38 | vecCumGradient +=(multiplier, data * multiplier) 39 | if (label > 0) -math.log(hypotheses) else -math.log(1 - hypotheses) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/logisticregression/LrLearnLBFGS.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.logisticregression 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.MLModel 5 | import io.github.qf6101.mfm.baseframe.binomial.{BinLearner, BinModel} 6 | import io.github.qf6101.mfm.optimization.{LBFGS, Updater} 7 | import org.apache.spark.ml.param.ParamMap 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.storage.StorageLevel 10 | 11 | /** 12 | * Created by qfeng on 15-4-7. 13 | */ 14 | 15 | /** 16 | * 逻辑斯蒂模型的LBFGS学习器 17 | * 18 | * @param params 参数池* 19 | * @param updater 参数更新器 20 | * @param initialCoeffs 初始参数 21 | */ 22 | class LrLearnLBFGS(override val params: ParamMap, 23 | val updater: Updater, 24 | val initialCoeffs: Option[VectorCoefficients] = None) 25 | extends BinLearner(params) with LrModelParam { 26 | val lg = new LogisticGradient(params) 27 | val lbfgs = new LBFGS(lg, updater, params) 28 | 29 | /** 30 | * 训练逻辑斯蒂模型 31 | * 32 | * @param dataSet 训练集 33 | * @return 逻辑斯蒂模型 34 | */ 35 | override def train(dataSet: RDD[(Double, SparseVector[Double])]): BinModel = { 36 | dataSet.persist(StorageLevel.MEMORY_AND_DISK_SER) 37 | val inputCoeffs = initialCoeffs match { 38 | case Some(value) => value 39 | case None => new VectorCoefficients(dataSet.first()._2.length) 40 | } 41 | val coeffs = lbfgs.optimize(dataSet, inputCoeffs, params(reg)) 42 | dataSet.unpersist() 43 | new LrModel(this, coeffs.asInstanceOf[VectorCoefficients], params) 44 | } 45 | } 46 | 47 | /** 48 | * 逻辑斯蒂模型的LBFGS学习器实例 49 | */ 50 | object LrLearnLBFGS { 51 | 52 | /** 53 | * 训练逻辑斯蒂模型 54 | * 55 | * @param dataset 数据集 56 | * @param params 参数池* 57 | * @param updater 参数更新器 58 | * @param initialCoeffs 初始参数 59 | * @return 逻辑斯蒂模型 60 | */ 61 | def train(dataset: RDD[(Double, SparseVector[Double])], 62 | params: ParamMap, 63 | updater: Updater, 64 | initialCoeffs: Option[VectorCoefficients] = None): MLModel = { 65 | new LrLearnLBFGS(params, updater, initialCoeffs).train(dataset) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/logisticregression/LrLearnSGD.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.logisticregression 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.MLModel 5 | import io.github.qf6101.mfm.baseframe.binomial.{BinLearner, BinModel} 6 | import io.github.qf6101.mfm.optimization.{GradientDescent, Updater} 7 | 
import org.apache.spark.ml.param.ParamMap 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.storage.StorageLevel 10 | 11 | /** 12 | * Created by qfeng on 15-3-17. 13 | */ 14 | 15 | /** 16 | * Train a classification model for Logistic Regression using Stochastic Gradient Descent. By 17 | * default L2 regularization is used, which can be changed via 18 | * [[LrLearnSGD]]. 19 | */ 20 | 21 | /** 22 | * 逻辑斯蒂模型的SGD学习器 23 | * 24 | * @param params 参数池* 25 | * @param updater 参数更新器 26 | * @param initialCoeffs 初始参数 27 | */ 28 | class LrLearnSGD(override val params: ParamMap, 29 | val updater: Updater, 30 | val initialCoeffs: Option[VectorCoefficients] = None) 31 | extends BinLearner(params) with LrModelParam { 32 | val lg = new LogisticGradient(params) 33 | val gd = new GradientDescent(lg, updater, params) 34 | 35 | /** 36 | * 训练逻辑斯蒂模型 37 | * 38 | * @param dataSet 训练集 39 | * @return 逻辑斯蒂模型 40 | */ 41 | override def train(dataSet: RDD[(Double, SparseVector[Double])]): BinModel = { 42 | dataSet.persist(StorageLevel.MEMORY_AND_DISK_SER_2) 43 | val inputCoeffs = initialCoeffs match { 44 | case Some(value) => value 45 | case None => new VectorCoefficients(dataSet.first()._2.length) 46 | } 47 | val coeffs = gd.optimize(dataSet, inputCoeffs, params(reg)) 48 | dataSet.unpersist() 49 | new LrModel(this, coeffs.asInstanceOf[VectorCoefficients], params) 50 | } 51 | 52 | 53 | } 54 | 55 | /** 56 | * 逻辑斯蒂模型的SGD学习器实例 57 | */ 58 | object LrLearnSGD { 59 | /** 60 | * 训练逻辑斯蒂模型 61 | * 62 | * @param dataset 数据集 63 | * @param params 参数池* 64 | * @param updater 参数更新器 65 | * @param initialCoeffs 初始参数 66 | * @return 逻辑斯蒂模型 67 | */ 68 | def train(dataset: RDD[(Double, SparseVector[Double])], 69 | params: ParamMap, 70 | updater: Updater, 71 | initialCoeffs: Option[VectorCoefficients] = None): MLModel = { 72 | new LrLearnSGD(params, updater, initialCoeffs).train(dataset) 73 | } 74 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/logisticregression/LrModel.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.logisticregression 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.MLModel 5 | import io.github.qf6101.mfm.baseframe.binomial.BinModel 6 | import io.github.qf6101.mfm.util.Logging 7 | import org.apache.spark.ml.param.ParamMap 8 | 9 | /** 10 | * Created by qfeng on 15-3-16. 
11 | */ 12 | 13 | /** 14 | * 逻辑斯蒂回归模型 15 | * 16 | * @param coeffs 模型系数 17 | * @param paramMeta 逻辑斯蒂参数 18 | * @param params 参数池 19 | */ 20 | class LrModel(override val paramMeta: LrModelParam, 21 | override val coeffs: VectorCoefficients, 22 | override val params: ParamMap) 23 | extends BinModel(paramMeta, coeffs, params) with Logging { 24 | /** 25 | * 对输入数据进行预测(使用内置系数) 26 | * 27 | * @param data 输入数据 28 | * @return 预测值(0~1) 29 | */ 30 | override def predict(data: SparseVector[Double]): Double = { 31 | predict(data, this.coeffs) 32 | } 33 | 34 | /** 35 | * 对输入数据进行预测 36 | * 37 | * @param data 输入数据 38 | * @param coeffs 系数 39 | * @return 预测值(0~1) 40 | */ 41 | def predict(data: SparseVector[Double], coeffs: VectorCoefficients = this.coeffs): Double = { 42 | val margin = -1.0 * coeffs.dot(data) 43 | 1.0 / (1.0 + math.exp(margin)) 44 | } 45 | 46 | override def equals(other: MLModel): Boolean = { 47 | other match { 48 | case otherModel: LrModel => 49 | if (paramMeta.toJSON(params).equals(otherModel.paramMeta.toJSON(otherModel.params)) 50 | && coeffs.equals(otherModel.coeffs)) true 51 | else false 52 | case _ => false 53 | } 54 | } 55 | } 56 | 57 | object LrModel extends Logging { 58 | /** 59 | * 从模型文件载入逻辑斯蒂模型 60 | * 61 | * @param location 模型文件 62 | * @return 逻辑斯蒂模型 63 | */ 64 | def apply(location: String): LrModel = { 65 | val params = new ParamMap() 66 | val paramMeta = LrModelParam(location + "/" + MLModel.namingParamFile, params) 67 | val coefficients = VectorCoefficients(location + "/" + MLModel.namingCoeffFile) 68 | new LrModel(paramMeta, coefficients, params) 69 | } 70 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/logisticregression/LrModelParam.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.logisticregression 2 | 3 | import io.github.qf6101.mfm.baseframe.ModelParam 4 | import io.github.qf6101.mfm.baseframe.binomial.BinModelParam 5 | import org.apache.spark.ml.param.{Param, ParamMap} 6 | import org.apache.spark.sql.SparkSession 7 | import org.json4s.JsonAST 8 | import org.json4s.JsonDSL._ 9 | 10 | 11 | /** 12 | * Created by qfeng on 15-3-18. 
13 | */ 14 | 15 | /** 16 | * 逻辑斯蒂模型的参数 17 | */ 18 | trait LrModelParam extends BinModelParam { 19 | val reg: Param[Array[Double]] = new Param("LrModelParam", "reg", "正则参数") 20 | 21 | /** 22 | * Transform parameters to json object 23 | * 24 | * @return parameters in json format 25 | */ 26 | override def toJSON(params: ParamMap): JsonAST.JObject = { 27 | super.toJSON(params) ~ 28 | (ModelParam.namingParamType -> LrModelParam.getClass.toString) ~ 29 | (reg.name -> params(reg).mkString(", ")) 30 | } 31 | } 32 | 33 | object LrModelParam { 34 | /** 35 | * 根据字符串数组构造逻辑斯蒂模型参数 36 | * 37 | * @param location 文件位置 38 | * @param params 参数池 39 | * @return 逻辑斯蒂模型参数 40 | */ 41 | def apply(location: String, params: ParamMap): LrModelParam = { 42 | // 初始化参数对象和spark session 43 | val lrModelParam = new LrModelParam {} 44 | val spark = SparkSession.builder().getOrCreate() 45 | // 读取参数值 46 | val paramValues = spark.read.json(location).first() 47 | val binaryThreshold = paramValues.getAs[Double](lrModelParam.binaryThreshold.name) 48 | val reg = paramValues.getAs[String](lrModelParam.reg.name).split(",").map(_.trim.toDouble) 49 | val initMean = paramValues.getAs[Double](lrModelParam.initMean.name) 50 | val initStdev = paramValues.getAs[Double](lrModelParam.initStdev.name) 51 | // 设置参数值 52 | params.put(lrModelParam.binaryThreshold, binaryThreshold) 53 | params.put(lrModelParam.reg, reg) 54 | params.put(lrModelParam.initMean, initMean) 55 | params.put(lrModelParam.initStdev, initStdev) 56 | // 返回LR模型参数 57 | lrModelParam 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/logisticregression/VectorCoefficients.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.logisticregression 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.Coefficients 5 | import org.apache.spark.sql.SparkSession 6 | import org.json4s.JsonDSL._ 7 | import org.json4s.jackson.JsonMethods._ 8 | 9 | import scala.collection.mutable 10 | import scala.math._ 11 | 12 | /** 13 | * Created by qfeng on 15-6-11. 
14 | */ 15 | class VectorCoefficients(val size: Int) extends Coefficients { 16 | var w0 = 0.0 17 | var w = mutable.HashMap[Int, Double]() 18 | 19 | /** 20 | * 同时复制this的结构和内容 21 | * 22 | * @return 复制的拷贝 23 | */ 24 | override def copy: Coefficients = { 25 | new VectorCoefficients(this.size, this.w0, this.w) 26 | } 27 | 28 | /** 29 | * 用Map稀疏向量初始化 30 | * 31 | * @param w0 截距 32 | * @param w Map稀疏向量表示的参数 33 | */ 34 | def this(size: Int, w0: Double, w: mutable.HashMap[Int, Double]) { 35 | this(size) 36 | this.w0 = w0 37 | this.w ++= w 38 | } 39 | 40 | /** 41 | * 只复制this的结构(比如参数个数),不复制内容 42 | * 43 | * @return 复制的拷贝 44 | */ 45 | override def copyEmpty(): Coefficients = new VectorCoefficients(this.size) 46 | 47 | /** 48 | * 对应系数加法,加至this上 49 | * 50 | * @param otherW0 截距加数 51 | * @param otherW 一阶系数加数 52 | * @return this 53 | */ 54 | def +=(otherW0: Double, otherW: SparseVector[Double]): VectorCoefficients = { 55 | this.w0 += otherW0 56 | otherW.activeIterator.foreach { case (index, value) => 57 | val originalValue = this.w.getOrElse(index, 0.0) 58 | this.w.update(index, originalValue + value) 59 | } 60 | this 61 | } 62 | 63 | /** 64 | * 对应系数加法,加至this上 65 | * 66 | * @param other 加数 67 | * @return this 68 | */ 69 | override def +=(other: Coefficients): Coefficients = { 70 | val otherCoeffs = other.asInstanceOf[VectorCoefficients] 71 | this.w0 += otherCoeffs.w0 72 | otherCoeffs.w.foreach { case (index, value) => 73 | val originalValue = this.w.getOrElse(index, 0.0) 74 | this.w.update(index, originalValue + value) 75 | } 76 | this 77 | } 78 | 79 | /** 80 | * 对应系数减法,减至this上 81 | * 82 | * @param other 减数 83 | * @return this 84 | */ 85 | override def -=(other: Coefficients): Coefficients = { 86 | val otherCoeffs = other.asInstanceOf[VectorCoefficients] 87 | this.w0 -= otherCoeffs.w0 88 | otherCoeffs.w.foreach { case (index, value) => 89 | val originalValue = this.w.getOrElse(index, 0.0) 90 | this.w.update(index, originalValue - value) 91 | } 92 | this 93 | } 94 | 95 | /** 96 | * 97 | * 对应系数加上同一实数,加至复制this的类上 98 | * 99 | * @param addend 加数 100 | * @return 加法结果(拷贝) 101 | */ 102 | override def +(addend: Double): Coefficients = { 103 | val result = new VectorCoefficients(this.size) 104 | result.w0 = this.w0 + addend 105 | result.w = this.w.map { case (index, value) => index -> (value + addend) } 106 | result 107 | } 108 | 109 | /** 110 | * 对应系数除上同一实数,加至复制this的类上 111 | * 112 | * @param dividend 除数 113 | * @return 除法结果 114 | */ 115 | override def /(dividend: Double): Coefficients = { 116 | val result = new VectorCoefficients(this.size) 117 | result.w0 = this.w0 / dividend 118 | result.w = this.w.map { case (index, value) => index -> (value / dividend) } 119 | result 120 | } 121 | 122 | /** 123 | * 计算L2的正则值 124 | * 125 | * @param reg 正则参数 126 | * @return 参数加权后的L2正则值 127 | */ 128 | override def L2RegValue(reg: Array[Double]): Double = { 129 | var squaredCoeffSum = w0 * w0 130 | this.w.foreach { case (index, value) => 131 | squaredCoeffSum += value * value 132 | } 133 | 0.5 * reg(0) * squaredCoeffSum 134 | } 135 | 136 | /** 137 | * 计算L2的正则梯度值 138 | * 139 | * @param reg 正则参数 140 | * @return 参数加权后的L2正则梯度值 141 | */ 142 | override def L2RegGradient(reg: Array[Double]): Coefficients = { 143 | this * reg(0) 144 | } 145 | 146 | /** 147 | * 对应系数乘上同一实数,加至复制this的类上 148 | * 149 | * @param multiplier 乘数 150 | * @return 乘法结果 151 | */ 152 | override def *(multiplier: Double): Coefficients = { 153 | val result = new VectorCoefficients(this.size) 154 | result.w0 = this.w0 * multiplier 155 | result.w = this.w.map { case (index, 
value) => index -> (value * multiplier) } 156 | result 157 | } 158 | 159 | /** 160 | * 用L1稀疏化系数 161 | * 162 | * @param regParam 正则参数值 163 | * @param stepSize 学习率 164 | * @return 稀疏化后的系数 165 | */ 166 | override def L1Shrink(regParam: Array[Double], stepSize: Double): Coefficients = { 167 | //收缩值 168 | val shrinkageVal = regParam(0) * stepSize 169 | w0 = signum(w0) * max(0.0, abs(w0) - shrinkageVal) 170 | w = w.flatMap { case (index, weight) => 171 | val newWeight = signum(weight) * max(0.0, abs(weight) - shrinkageVal) 172 | if (newWeight == 0) { 173 | Nil 174 | } else { 175 | List(index -> newWeight) 176 | } 177 | } 178 | this 179 | } 180 | 181 | /** 182 | * 计算L1的正则值 183 | * 184 | * @param regParam 正则参数 185 | * @return 参数绝对值加权后的L1正则值 186 | */ 187 | override def L1RegValue(regParam: Array[Double]): Double = { 188 | val zeroRegValue = abs(w0) 189 | val firstRegValue = this.w.foldLeft(0.0) { case (absSum, element) => 190 | absSum + abs(element._2) 191 | } 192 | (zeroRegValue + firstRegValue) * regParam(0) 193 | } 194 | 195 | /** 196 | * 系数与稀疏向量点乘 197 | * 198 | * @param otherW 稀疏向量 199 | * @return 点乘的结果 200 | */ 201 | def dot(otherW: SparseVector[Double]): Double = { 202 | var result = w0 203 | otherW.activeIterator.foreach { case (index, value) => 204 | val originalValue = this.w.getOrElse(index, 0.0) 205 | result += originalValue * value 206 | } 207 | result 208 | } 209 | 210 | /** 211 | * 计算系数的2范数 212 | * sum(abs(A).^p)^(1/p) where p=2 213 | * 214 | * @return 系数的2范数 215 | */ 216 | override def norm: Double = { 217 | math.sqrt(w.foldLeft(0.0) { case (sum: Double, (_, value: Double)) => 218 | sum + value * value 219 | } + w0 * w0) 220 | } 221 | 222 | /** 223 | * 保存元数据至文件 224 | * 225 | * @param location 文件位置 226 | */ 227 | override def saveMeta(location: String): Unit = { 228 | val json = (Coefficients.namingCoeffType -> VectorCoefficients.getClass.toString) ~ 229 | (VectorCoefficients.namingFeatureSize -> size) ~ 230 | (VectorCoefficients.namingIntercept -> w0) ~ 231 | (VectorCoefficients.namingWSize -> w.size) 232 | SparkSession.builder().getOrCreate().sparkContext. 
233 | makeRDD(List(compact(render(json)))).repartition(1).saveAsTextFile(location) 234 | } 235 | 236 | /** 237 | * 保存数据至文件 238 | * 239 | * @param location 文件位置 240 | */ 241 | override def saveData(location: String): Unit = { 242 | SparkSession.builder().getOrCreate().createDataFrame(w.toSeq).toDF("index", "value").write.parquet(location) 243 | } 244 | 245 | /** 246 | * 与另一个系数是否相等 247 | * 248 | * @param other 另一个系数 249 | * @return 是否相等 250 | */ 251 | override def equals(other: Coefficients): Boolean = { 252 | other match { 253 | case otherCoeffs: VectorCoefficients => 254 | if (w0 == otherCoeffs.w0 && w.equals(otherCoeffs.w)) true else false 255 | case _ => false 256 | } 257 | } 258 | } 259 | 260 | /** 261 | * 向量化系数对象 262 | */ 263 | object VectorCoefficients { 264 | val namingIntercept = "intercept" 265 | val namingFeatureSize = "feature_size" 266 | val namingWSize = "w_size" 267 | 268 | /** 269 | * 根据字符串数组构造向量系数 270 | * 271 | * @param location 系数文件位置 272 | * @return 向量系数 273 | */ 274 | def apply(location: String): VectorCoefficients = { 275 | val spark = SparkSession.builder().getOrCreate() 276 | import spark.implicits._ 277 | val meta = spark.read.json(location + "/" + Coefficients.namingMetaFile).first() 278 | val size = meta.getAs[Long](namingFeatureSize).toInt 279 | val w0 = meta.getAs[Double](namingIntercept) 280 | val w = spark.read.parquet(location + "/" + Coefficients.namingDataFile).map { row => 281 | (row.getAs[Long]("index").toInt, row.getAs[Double]("value")) 282 | }.collect() 283 | new VectorCoefficients(size, w0, mutable.HashMap[Int, Double](w.toSeq: _*)) 284 | } 285 | } 286 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/optimization/DecreasingStrategy.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.optimization 2 | 3 | /** 4 | * User: qfeng 5 | * Date: 15-12-29 下午4:49 6 | * Usage: SGD学习步长衰减类 7 | */ 8 | trait DecreasingStrategy extends Serializable { 9 | /** 10 | * 根据当前的迭代次数计算学习步长衰减的分母 11 | * 12 | * @param iter 迭代次数 13 | * @return 学习步长衰减的分母 14 | */ 15 | def decrease(iter: Int): Double 16 | } 17 | 18 | class Log10DecreasingStrategy extends DecreasingStrategy { 19 | /** 20 | * 根据当前的迭代次数计算学习步长衰减的分母 21 | * 按照log10进行衰减,第91次迭代衰减为一半 22 | * 23 | * @param iter 迭代次数 24 | * @return 学习步长衰减的分母 25 | */ 26 | def decrease(iter: Int): Double = { 27 | math.log10(9 + iter) 28 | } 29 | } 30 | 31 | class LogXDecreasingStrategy(X: Int) extends DecreasingStrategy { 32 | /** 33 | * 根据当前的迭代次数计算学习步长衰减的分母 34 | * 按照logX进行衰减 35 | * 36 | * @param iter 迭代次数 37 | * @return 学习步长衰减的分母 38 | */ 39 | def decrease(iter: Int): Double = { 40 | math.log(X - 1 + iter) / math.log(X) 41 | } 42 | } 43 | 44 | class ConstantDecreasingStrategy(stepSize: Double) extends DecreasingStrategy { 45 | /** 46 | * 不衰减 47 | * 48 | * @param iter 迭代次数 49 | * @return 常数学习率(不衰减学习率) 50 | */ 51 | def decrease(iter: Int): Double = { 52 | stepSize 53 | } 54 | } 55 | 56 | class sqrtDecreasingStrategy extends DecreasingStrategy { 57 | /** 58 | * 根据当前的迭代次数计算学习步长衰减的分母 59 | * 按照开方进行衰减,第5次迭代衰减为一半 60 | * 61 | * @param iter 迭代次数 62 | * @return 学习步长衰减的分母 63 | */ 64 | def decrease(iter: Int): Double = { 65 | Math.sqrt(iter) 66 | } 67 | } 68 | 69 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/optimization/Gradient.scala: -------------------------------------------------------------------------------- 1 | package 
io.github.qf6101.mfm.optimization 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.Coefficients 5 | 6 | 7 | /** 8 | * Created by qfeng on 15-3-11. 9 | */ 10 | trait Gradient extends Serializable { 11 | /** 12 | * Compute the gradient and loss given the features of a single data point. 13 | * 14 | * @param data features for one data point 15 | * @param label label for this data point 16 | * @param coeffs weights/coefficients corresponding to features 17 | * @return (gradient: Coefficients, loss: Double) 18 | */ 19 | def compute(data: SparseVector[Double], 20 | label: Double, 21 | coeffs: Coefficients): (Coefficients, Double) = { 22 | val gradient = coeffs.copyEmpty() 23 | val loss = compute(data, label, coeffs, gradient) 24 | (gradient, loss) 25 | } 26 | 27 | /** 28 | * Compute the gradient and loss given the features of a single data point, 29 | * add the gradient to a provided vector to avoid creating new objects, and return loss. 30 | * 31 | * @param data features for one data point 32 | * @param label label for this data point 33 | * @param coeffs weights/coefficients corresponding to features 34 | * @param cumGradient the computed gradient will be added to this vector 35 | * @return loss 36 | */ 37 | def compute(data: SparseVector[Double], 38 | label: Double, 39 | coeffs: Coefficients, 40 | cumGradient: Coefficients): Double 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/optimization/GradientDescent.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.optimization 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.Coefficients 5 | import io.github.qf6101.mfm.util.Logging 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.ml.param.ParamMap 8 | import org.apache.spark.rdd.RDD 9 | 10 | import scala.collection.mutable.ArrayBuffer 11 | 12 | /** 13 | * Created by qfeng on 15-3-11. 
14 | */ 15 | 16 | /** 17 | * 随机梯度下降器 18 | * 19 | * @param gradient 梯度逻辑 20 | * @param updater 更新逻辑 21 | * @param params 参数池 22 | */ 23 | class GradientDescent(private var gradient: Gradient, private var updater: Updater, private var params: ParamMap) 24 | extends Optimizer with SGDParam with Logging { 25 | 26 | /** 27 | * 最优化函数 28 | * 29 | * @param data 样本数据集 30 | * @param initialCoeffs 初始化系数值 31 | * @param regParam 正则参数值 32 | * @return 学习后的参数 33 | */ 34 | override def optimize(data: RDD[(Double, SparseVector[Double])], 35 | initialCoeffs: Coefficients, 36 | regParam: Array[Double]): Coefficients = { 37 | val (coeffs, _) = optimizeWithHistory(data, initialCoeffs, regParam) 38 | coeffs 39 | } 40 | 41 | /** 42 | * 最优化函数 43 | * 44 | * @param data 样本数据集 45 | * @param initialCoeffs 初始化系数值 46 | * @param regParam 正则参数值 47 | * @return 学习后的参数 48 | */ 49 | def optimizeWithHistory(data: RDD[(Double, SparseVector[Double])], 50 | initialCoeffs: Coefficients, 51 | regParam: Array[Double]): (Coefficients, Array[Double]) = { 52 | //获取参数 53 | val numIterationsValue = params(numIterations) 54 | val miniBatchFractionValue = params(miniBatchFraction) 55 | val stepSizeValue = params(stepSize) 56 | //初始化系数、正则值 57 | var coeffs = initialCoeffs.copy 58 | var regVal = updater.compute(coeffs, coeffs.copyEmpty(), 0, 1, regParam)._2 59 | val lossHistory = new ArrayBuffer[Double](numIterationsValue) 60 | //初始化临时变量:迭代次数、是否收敛、上次损失值 61 | var i = 0 62 | var reachStopCondition = false 63 | //开始迭代训练 64 | while (!reachStopCondition && i < numIterationsValue) { 65 | i += 1 66 | val bcCoeffs = SparkContext.getOrCreate.broadcast(coeffs) 67 | val (gradientSum, lossSum, miniBatchSize) = data.sample(withReplacement = false, miniBatchFractionValue, 42 + i) 68 | .treeAggregate(initialCoeffs.copyEmpty(), 0.0, 0L)( 69 | seqOp = (c, v) => { 70 | // c: (grad, loss, count), v: (label, features) 71 | val l = gradient.compute(v._2, v._1, bcCoeffs.value, c._1) 72 | (c._1, c._2 + l, c._3 + 1) 73 | }, 74 | combOp = (c1, c2) => { 75 | // c: (grad, loss, count) 76 | (c1._1 += c2._1, c1._2 + c2._2, c1._3 + c2._3) 77 | }) 78 | 79 | if (miniBatchSize > 0) { 80 | //计算损失值、新的系数、正则值 81 | lossHistory.append(lossSum / miniBatchSize + regVal) 82 | val update = updater.compute(coeffs, gradientSum / miniBatchSize.toDouble, stepSizeValue, i, regParam) 83 | //判断是否达到收敛条件 84 | val (converged, solutionDiff) = isConverged(update._1, coeffs) 85 | reachStopCondition = converged 86 | //更新系数和正则值 87 | coeffs = update._1 88 | regVal = update._2 89 | //打印调试信息:损失值 90 | logInfo(s"Iteration ($i/$numIterationsValue) loss: ${lossSum / miniBatchSize} and $regVal, solutionDiff: $solutionDiff") 91 | } else { 92 | logWarning(s"Iteration ($i/$numIterationsValue}). 
The size of sampled batch is zero") 93 | } 94 | } 95 | (coeffs, lossHistory.toArray) 96 | } 97 | 98 | /** 99 | * 判断是否达到收敛条件 100 | * 101 | * @param newCoeffs 更新后的系数 102 | * @param oldCoeffs 更新前的系数 103 | * @return 是否达到收敛条件 104 | */ 105 | private def isConverged(newCoeffs: Coefficients, oldCoeffs: Coefficients): (Boolean, Double) = { 106 | val solutionDiff = newCoeffs.normDiff(oldCoeffs) 107 | (solutionDiff < params(convergenceTol) * Math.max(newCoeffs.norm, 1.0), solutionDiff) 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/optimization/LBFGS.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.optimization 2 | 3 | import breeze.linalg.SparseVector 4 | import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS} 5 | import io.github.qf6101.mfm.baseframe.Coefficients 6 | import io.github.qf6101.mfm.logisticregression.VectorCoefficients 7 | import io.github.qf6101.mfm.util.Logging 8 | import org.apache.spark.ml.param.ParamMap 9 | import org.apache.spark.rdd.RDD 10 | 11 | import scala.collection.mutable 12 | import scala.language.implicitConversions 13 | 14 | /** 15 | * Created by qfeng on 15-4-7. 16 | */ 17 | 18 | /** 19 | * LBFGS优化器 20 | * 21 | * @param gradient 梯度逻辑 22 | * @param updater 更新逻辑 23 | * @param params 参数池 24 | */ 25 | class LBFGS(private var gradient: Gradient, private var updater: Updater, private var params: ParamMap) extends 26 | Optimizer with LBFGSParam with Logging { 27 | 28 | /** 29 | * 最优化函数 30 | * 31 | * @param data 样本数据集 32 | * @param initialCoeffs 初始化系数值 33 | * @param reg 正则参数值 34 | * @return 学习后的参数 35 | */ 36 | override def optimize(data: RDD[(Double, SparseVector[Double])], 37 | initialCoeffs: Coefficients, 38 | reg: Array[Double]): 39 | Coefficients = { 40 | val (coeffs, _) = optimizeWithHistory(data, initialCoeffs, reg) 41 | coeffs 42 | } 43 | 44 | /** 45 | * 最优化函数 46 | * 47 | * @param data 样本数据集 48 | * @param initialCoeffs 初始化系数值 49 | * @param reg 正则参数值 50 | * @return 学习后的参数 51 | */ 52 | def optimizeWithHistory(data: RDD[(Double, SparseVector[Double])], 53 | initialCoeffs: Coefficients, 54 | reg: Array[Double], 55 | negativePenalty: Double = 1.0): 56 | (Coefficients, Array[Double]) = { 57 | //获取参数 58 | val numIterationsValue = params(numIterations) 59 | val numCorrectionsValue = params(numCorrections) 60 | val convergenceTolValue = params(convergenceTol) 61 | //初始化损失值数组、数据集大小 62 | val lossHistory = new mutable.ArrayBuffer[Double](numIterationsValue) 63 | val numExamples = data.count() 64 | //初始化系数、损失函数形式 65 | val vecInitialCoeffs = initialCoeffs.asInstanceOf[VectorCoefficients] 66 | val costFun = new CostFun(data, gradient, updater, reg, numExamples, negativePenalty) 67 | val lbfgs = new BreezeLBFGS[SparseVector[Double]](numIterationsValue, numCorrectionsValue, convergenceTolValue) 68 | //创建LBFGS状态序列 69 | val states = lbfgs.iterations(new CachedDiffFunction(costFun), VCToBSV(vecInitialCoeffs)) 70 | //执行迭代 71 | var i = 0 72 | var state = states.next() 73 | while (states.hasNext) { 74 | i += 1 75 | logDebug(s"Iteration ($i/$numIterationsValue) loss: ${state.value}") 76 | lossHistory.append(state.value) 77 | state = states.next() 78 | } 79 | lossHistory.append(state.value) 80 | //返回结果 81 | (state.x, lossHistory.toArray) 82 | } 83 | 84 | /** 85 | * 向量系数转成breeze的稀疏向量 86 | * 87 | * @param in 向量系数 88 | * @return breeze的稀疏向量 89 | */ 90 | implicit def VCToBSV(in: VectorCoefficients): 
SparseVector[Double] = { 91 | val out = SparseVector.zeros[Double](in.size + 1) 92 | out.update(0, in.w0) 93 | in.w.foreach { case (index, value) => 94 | out.update(index + 1, value) 95 | } 96 | out 97 | } 98 | 99 | /** 100 | * breeze的稀疏向量转成向量系数 101 | * 102 | * @param in breeze的稀疏向量 103 | * @return 向量系数 104 | */ 105 | implicit def BSVToVC(in: SparseVector[Double]): VectorCoefficients = { 106 | val w0 = in(0) 107 | val w = mutable.HashMap[Int, Double]() 108 | in.activeIterator.foreach { case (index, value) => 109 | if (index != 0) { 110 | w += (index - 1) -> value 111 | } 112 | } 113 | new VectorCoefficients(in.length - 1, w0, w) 114 | } 115 | 116 | /** 117 | * CostFun implements Breeze's DiffFunction[T], which returns the loss and gradient 118 | * at a particular point (weights). It's used in Breeze's convex optimization routines. 119 | */ 120 | private class CostFun(data: RDD[(Double, SparseVector[Double])], 121 | gradient: Gradient, 122 | updater: Updater, 123 | reg: Array[Double], 124 | numExamples: Long, 125 | negativePenalty: Double) extends DiffFunction[SparseVector[Double]] with Serializable { 126 | 127 | override def calculate(weights: SparseVector[Double]): (Double, SparseVector[Double]) = { 128 | // Have a local copy to avoid the serialization of CostFun object which is not serializable. 129 | val w = weights.copy 130 | val n = weights.length 131 | val bcW = data.context.broadcast(w) 132 | val localGradient = gradient 133 | 134 | val (gradientSum, lossSum) = data.treeAggregate((new VectorCoefficients(n - 1), 0.0))( 135 | seqOp = (c, v) => (c, v) match { 136 | case ((grad, loss), (label, features)) => 137 | val l = localGradient.compute(features, label, bcW.value, grad) 138 | (grad, loss + l) 139 | }, 140 | combOp = (c1, c2) => (c1, c2) match { 141 | case ((grad1, loss1), (grad2, loss2)) => 142 | grad1 += grad2 143 | (grad1, loss1 + loss2) 144 | }) 145 | 146 | /** 147 | * regVal is sum of weight squares if it's L2 updater; 148 | * for other updater, the same logic is followed. 149 | */ 150 | val regVal = updater.compute(w, new VectorCoefficients(n - 1), 0, 1, reg)._2 151 | val outputLoss = lossSum / numExamples + regVal 152 | /** 153 | * It will return the gradient part of regularization using updater. 154 | * 155 | * Given the input parameters, the updater basically does the following, 156 | * 157 | * w' = w - thisIterStepSize * (gradient + regGradient(w)) 158 | * Note that regGradient is function of w 159 | * 160 | * If we set gradient = 0, thisIterStepSize = 1, then 161 | * 162 | * regGradient(w) = w - w' 163 | * 164 | * TODO: We need to clean it up by separating the logic of regularization out 165 | * from updater to regularizer. 166 | */ 167 | // The following gradientTotal is actually the regularization part of gradient. 168 | // Will add the gradientSum computed from the data with weights in the next step. 169 | val gradientTotal = BSVToVC(w) 170 | gradientTotal -= updater.compute(w, new VectorCoefficients(n - 1), 1, 1, reg)._1.asInstanceOf[VectorCoefficients] 171 | gradientTotal += gradientSum * (1.0 / numExamples) 172 | 173 | (outputLoss, gradientTotal) 174 | } 175 | } 176 | 177 | } 178 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/optimization/LBFGSParam.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.optimization 2 | 3 | import org.apache.spark.ml.param.{Param, ParamValidators} 4 | 5 | /** 6 | * Created by qfeng on 15-4-7. 
7 | */ 8 | 9 | /** 10 | * LGBGS的参数 11 | */ 12 | trait LBFGSParam extends Serializable { 13 | //default value: 10 14 | val numCorrections: Param[Int] = new Param("LBFGSParam", "numCorrections", "number of corrections used in the LBFGS " + 15 | "update", ParamValidators.gt(0)) 16 | //default value:1E-4 17 | val convergenceTol: Param[Double] = new Param("LBFGSParam", "convergenceTol", "convergence tolerance of iterations for LBFGS", 18 | ParamValidators.gt(0)) 19 | val numIterations: Param[Int] = new Param("LBFGSParam", "numIterations", "number of iterations that SGD should be run", 20 | ParamValidators.gt(0)) 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/optimization/Optimizer.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.optimization 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.Coefficients 5 | import org.apache.spark.rdd.RDD 6 | 7 | /** 8 | * Created by qfeng on 15-3-11. 9 | */ 10 | 11 | /** 12 | * 优化器接口,实现包括SGD, LBFGS等 13 | */ 14 | trait Optimizer extends Serializable { 15 | /** 16 | * 最优化函数 17 | * 18 | * @param data 样本数据集 19 | * @param initialCoeffs 初始化系数值 20 | * @param regParam 正则参数值 21 | * @return 学习后的参数 22 | */ 23 | def optimize(data: RDD[(Double, SparseVector[Double])], 24 | initialCoeffs: Coefficients, 25 | regParam: Array[Double]): Coefficients 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/optimization/SGDParam.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.optimization 2 | 3 | import org.apache.spark.ml.param.{Param, ParamValidators} 4 | 5 | 6 | /** 7 | * Created by qfeng on 15-3-18. 8 | */ 9 | 10 | 11 | /** 12 | * SGD的参数 13 | */ 14 | trait SGDParam extends Serializable { 15 | //default value: 1.0 16 | val stepSize: Param[Double] = new Param("SGDParam", "stepSize", "initial step size for the first step", 17 | ParamValidators.gt(0)) 18 | val numIterations: Param[Int] = new Param("SGDParam", "numIterations", "number of iterations that SGD should be run", 19 | ParamValidators.gt(0)) 20 | //default value: 1.0 21 | val miniBatchFraction: Param[Double] = new Param("SGDParam", "miniBatchFraction", "fraction of the input data set " + 22 | "that should be used for one iteration of SGD", ParamValidators.inRange(0, 1, false, true)) 23 | //default value:1E-4 24 | val convergenceTol: Param[Double] = new Param("SGDParam", "convergenceTol", "convergence tolerance of iterations for SGD", 25 | ParamValidators.gt(0)) 26 | } 27 | 28 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/optimization/Updater.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.optimization 2 | 3 | import io.github.qf6101.mfm.baseframe.Coefficients 4 | import io.github.qf6101.mfm.util.Logging 5 | 6 | /** 7 | * Created by qfeng on 15-3-11. 8 | */ 9 | abstract class Updater(private val decreasingStrategy: DecreasingStrategy) 10 | extends Logging with Serializable { 11 | /** 12 | * Compute an updated value for weights given the gradient, stepSize, iteration number and 13 | * regularization parameter. Also returns the regularization value regParam * R(w) 14 | * computed using the *updated* weights. 
15 | * 16 | * @param coeffOld - Old coefficients. 17 | * @param gradient - Average batch gradient. 18 | * @param stepSize - step size across iterations 19 | * @param iter - Iteration number 20 | * @param regParam - Regularization parameter 21 | * @return A tuple of 2 elements. The first element is a coefficient structure containing updated weights, 22 | * and the second element is the regularization value computed using updated weights. 23 | */ 24 | def compute(coeffOld: Coefficients, 25 | gradient: Coefficients, 26 | stepSize: Double, 27 | iter: Int, 28 | regParam: Array[Double]): (Coefficients, Double) 29 | } 30 | 31 | /** 32 | * A simple updater for gradient descent *without* any regularization. 33 | * Uses a step-size decreasing with the square root of the number of iterations. 34 | */ 35 | class SimpleUpdater(private val decreasingStrategy: DecreasingStrategy = new Log10DecreasingStrategy()) 36 | extends Updater(decreasingStrategy) { 37 | /** 38 | * Compute an updated value for weights given the gradient, stepSize, iteration number and 39 | * regularization parameter. Also returns the regularization value regParam * R(w) 40 | * computed using the *updated* weights. 41 | * 42 | * @param coeffOld - Old coefficients. 43 | * @param gradient - Average batch gradient. 44 | * @param stepSize - step size across iterations 45 | * @param iter - Iteration number 46 | * @param regParam - Regularization parameter 47 | * @return A tuple of 2 elements. The first element is a coefficient structure containing updated weights, 48 | * and the second element is the regularization value computed using updated weights. 49 | */ 50 | override def compute(coeffOld: Coefficients, 51 | gradient: Coefficients, 52 | stepSize: Double, 53 | iter: Int, 54 | regParam: Array[Double]): (Coefficients, Double) = { 55 | val thisIterStepSize = stepSize / decreasingStrategy.decrease(iter) 56 | val coeffNew = coeffOld + gradient * (-thisIterStepSize) 57 | (coeffNew, 0.0) 58 | } 59 | } 60 | 61 | /** 62 | * Updater for L2 regularized problems. 63 | * R(w) = 1/2 ||w||2 64 | * Uses a step-size decreasing with the square root of the number of iterations. 65 | **/ 66 | class SquaredL2Updater(private val decreasingStrategy: DecreasingStrategy = new Log10DecreasingStrategy()) 67 | extends Updater(decreasingStrategy) { 68 | /** 69 | * Compute an updated value for weights given the gradient, stepSize, iteration number and 70 | * regularization parameter. Also returns the regularization value regParam * R(w) 71 | * computed using the *updated* weights. 72 | * 73 | * @param coeffOld - Old coefficients. 74 | * @param gradient - Average batch gradient. 75 | * @param stepSize - step size across iterations 76 | * @param iter - Iteration number 77 | * @param regParam - Regularization parameter 78 | * @return A tuple of 2 elements. The first element is a coefficient structure containing updated weights, 79 | * and the second element is the regularization value computed using updated weights. 
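A scalar stand-in for what SimpleUpdater.compute above does: a plain gradient step whose size shrinks with the iteration count, returned together with a regularization value of 0.0. The decrease function here is an assumed placeholder; the project's actual DecreasingStrategy implementations (e.g. Log10DecreasingStrategy) live in DecreasingStrategy.scala, which is not shown in this listing.

object SimpleUpdaterStepSketch {
  def main(args: Array[String]): Unit = {
    def decrease(iter: Int): Double = math.sqrt(iter)  // assumed stand-in for a DecreasingStrategy
    // Returns (updated weight, regularization value); no regularizer, so the second element is 0.0.
    def step(w: Double, grad: Double, stepSize: Double, iter: Int): (Double, Double) = {
      val thisIterStepSize = stepSize / decrease(iter)
      (w - thisIterStepSize * grad, 0.0)
    }
    println(step(w = 1.0, grad = 0.4, stepSize = 0.5, iter = 4))  // ~ (0.9, 0.0)
  }
}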
80 | */ 81 | override def compute(coeffOld: Coefficients, 82 | gradient: Coefficients, 83 | stepSize: Double, 84 | iter: Int, 85 | regParam: Array[Double]): (Coefficients, Double) = { 86 | // add up both updates from the gradient of the loss (= step) as well as 87 | // the gradient of the regularizer (= regParam * weightsOld) 88 | // w' = w - thisIterStepSize * (gradient + regParam * w) 89 | val thisIterStepSize = stepSize / decreasingStrategy.decrease(iter) 90 | logInfo("step size: " + thisIterStepSize) 91 | val coeffNew = coeffOld + ((gradient + coeffOld.L2RegGradient(regParam)) * (-thisIterStepSize)) 92 | (coeffNew, coeffNew.L2RegValue(regParam)) 93 | } 94 | } 95 | 96 | /** 97 | * :: DeveloperApi :: 98 | * Updater for L1 regularized problems. 99 | * R(w) = ||w||_1 100 | * Uses a step-size decreasing with the square root of the number of iterations. 101 | * 102 | * Instead of subgradient of the regularizer, the proximal operator for the 103 | * L1 regularization is applied after the gradient step. This is known to 104 | * result in better sparsity of the intermediate solution. 105 | * 106 | * The corresponding proximal operator for the L1 norm is the soft-thresholding 107 | * function. That is, each weight component is shrunk towards 0 by shrinkageVal. 108 | * 109 | * If w > shrinkageVal, set weight component to w-shrinkageVal. 110 | * If w < -shrinkageVal, set weight component to w+shrinkageVal. 111 | * If -shrinkageVal < w < shrinkageVal, set weight component to 0. 112 | * 113 | * Equivalently, set weight component to signum(w) * max(0.0, abs(w) - shrinkageVal) 114 | */ 115 | class L1Updater(private val decreasingStrategy: DecreasingStrategy = new Log10DecreasingStrategy()) 116 | extends Updater(decreasingStrategy) { 117 | /** 118 | * Compute an updated value for weights given the gradient, stepSize, iteration number and 119 | * regularization parameter. Also returns the regularization value regParam * R(w) 120 | * computed using the *updated* weights. 121 | * 122 | * @param coeffOld - Old coefficients. 123 | * @param gradient - Average batch gradient. 124 | * @param stepSize - step size across iterations 125 | * @param iter - Iteration number 126 | * @param regParam - Regularization parameter 127 | * @return A tuple of 2 elements. The first element is a coefficient structure containing updated weights, 128 | * and the second element is the regularization value computed using updated weights. 
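A standalone scalar sketch of the soft-thresholding (proximal) operator described in the L1Updater comment above; shrinkageVal plays the role of regParam * thisIterStepSize, and the values are chosen only for illustration.

object SoftThresholdingSketch {
  def main(args: Array[String]): Unit = {
    // signum(w) * max(0, |w| - shrinkageVal): shrink towards zero, clip to zero inside the band.
    def softThreshold(w: Double, shrinkageVal: Double): Double =
      math.signum(w) * math.max(0.0, math.abs(w) - shrinkageVal)

    val shrinkageVal = 0.125
    println(softThreshold(0.375, shrinkageVal))   //  0.25
    println(softThreshold(0.0625, shrinkageVal))  //  0.0  (inside the band, zeroed out -> sparsity)
    println(softThreshold(-0.625, shrinkageVal))  // -0.5
  }
}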
129 | */ 130 | override def compute(coeffOld: Coefficients, 131 | gradient: Coefficients, 132 | stepSize: Double, 133 | iter: Int, 134 | regParam: Array[Double]): (Coefficients, Double) = { 135 | val thisIterStepSize = stepSize / decreasingStrategy.decrease(iter) 136 | val coeffNew = coeffOld + (gradient * (-thisIterStepSize)) 137 | // Apply proximal operator (soft thresholding) 138 | coeffNew.L1Shrink(regParam, thisIterStepSize) 139 | (coeffNew, coeffNew.L1RegValue(regParam)) 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/tuning/BinCrossValidation.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.tuning 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.binomial.{BinLearner, BinModel} 5 | import io.github.qf6101.mfm.util.{Logging, ParamUtil} 6 | import org.apache.spark.ml.param.{Param, ParamMap} 7 | import org.apache.spark.mllib.util.MLUtils 8 | import org.apache.spark.rdd.RDD 9 | 10 | import scala.util.Random 11 | 12 | /** 13 | * User: qfeng 14 | * Date: 15-8-24 上午10:25 15 | * Usage: 快速模型选择,固定其他参数,尝试某一参数的各个值,取最大AUC的参数值 16 | */ 17 | 18 | /** 19 | * 二分类交叉检验 20 | * 21 | * @param learner 二分类学习器 22 | * @param paramGridBuilder 候选参数构造器 23 | * @param numFolds 交叉份数 24 | * @param baseParamMinAUC 基准AUC 25 | */ 26 | class BinCrossValidation(val learner: BinLearner, 27 | val paramGridBuilder: BinParamGridBuilder, 28 | val numFolds: Int = 5, 29 | val baseParamMinAUC: Double = 0.0) extends Logging with Serializable { 30 | 31 | /** 32 | * 分类问题的模型选择 33 | * 34 | * @param dataset 数据集 35 | * @return 训练得到的模型及其评估值 36 | */ 37 | def selectParamsForClassif(dataset: RDD[(Double, SparseVector[Double])]): (BinModel, BinaryClassificationMetrics) 38 | = { 39 | //选择得到的参数集合,即返回值 40 | val selectedParamMap = new ParamMap 41 | //数据分块,用于交叉验证 42 | val splits = MLUtils.kFold(dataset, numFolds, Random.nextInt()) 43 | //随机选择一组参数作为基准参数 44 | val baseParamMap = selectBaseParams(splits(0)._1, baseParamMinAUC) 45 | //对于每个参数,都尝试它的每个参数值,选择AUC最大的那个作为最终的参数值(其他参数采用基准参数值) 46 | paramGridBuilder.paramGrid.foreach { case (param, paramValues) => 47 | //每个参数值都对应数组中的一个元素 48 | val AUCs = new Array[Double](paramValues.size) 49 | val models = new Array[BinModel](paramValues.size) 50 | val candidateParamValues = new Array[Any](paramValues.size) 51 | 52 | //对于每个参数值,都基于交叉检验训练模型,计算AUC均值 53 | paramValues.zipWithIndex.foreach { case (paramValue, paramValueIndex) => 54 | //组装出一组参数,用于训练模型 55 | val paramMap = baseParamMap.copy.put(param.asInstanceOf[Param[Any]], paramValue) 56 | candidateParamValues(paramValueIndex) = paramValue 57 | learner.updateParams(paramMap) 58 | //采用交叉检验训练模型计算AUC值 59 | splits.zipWithIndex.foreach { case ((training, testing), splitIndex) => 60 | models(paramValueIndex) = learner.train(training) 61 | val validating = testing.map { case (label, features) => 62 | (models(paramValueIndex).predict(features), label) 63 | } 64 | val metrics = new BinaryClassificationMetrics(validating) 65 | val AUC = metrics.AUC 66 | AUCs(paramValueIndex) += AUC 67 | logInfo(s"split $splitIndex >>>>> AUC: ${metrics.AUC}") 68 | } 69 | //计算AUC均值 70 | AUCs(paramValueIndex) /= splits.length 71 | logInfo(s"selected parameters: ${ParamUtil.paramsToString(paramMap)}; >>>>> AUC: ${AUCs(paramValueIndex).formatted("%1.4f")}") 72 | } 73 | //挑选出AUC最大的参数值 74 | val (_, bestIndex) = AUCs.zipWithIndex.maxBy(_._1) 75 | selectedParamMap.put(param.asInstanceOf[Param[Any]], 
candidateParamValues(bestIndex)) 76 | } 77 | //使用挑选出的那组参数值,基于整个数据集训练模型,并计算评估值 78 | learner.updateParams(selectedParamMap) 79 | val fullModel = learner.train(dataset) 80 | val fullValidating = dataset.map { case (label, features) => 81 | (fullModel.predict(features), label) 82 | } 83 | val fullMetrics = new BinaryClassificationMetrics(fullValidating) 84 | (fullModel, fullMetrics) 85 | } 86 | 87 | /** 88 | * 随机选择一组参数作为基准参数 89 | * 90 | * @param dataset 数据集 91 | * @param baseParamMinAUC 基准参数的AUC阈值(基准参数描述的模型AUC不能小于等于该阈值) 92 | * @return 基准参数 93 | */ 94 | private def selectBaseParams(dataset: RDD[(Double, SparseVector[Double])], 95 | baseParamMinAUC: Double): ParamMap = { 96 | var selected = false 97 | var baseParamMap: ParamMap = null 98 | var tryTime: Int = 0 99 | 100 | while (!selected) { 101 | //尝试5次,如果AUC都是0则抛出异常 102 | tryTime = tryTime + 1 103 | if (tryTime > 5) { 104 | throw new Exception("try time exceeds 5 for base parameters selection.") 105 | } 106 | //随机选择一组参数,并计算AUC值 107 | baseParamMap = paramGridBuilder.sampleParams() 108 | learner.updateParams(baseParamMap) 109 | val model = learner.train(dataset) 110 | val validating = dataset.map { case (label, features) => 111 | (model.predict(features), label) 112 | } 113 | val metrics = new BinaryClassificationMetrics(validating) 114 | //AUC值大于阈值,则返回 115 | if (metrics.AUC > baseParamMinAUC) { 116 | selected = true 117 | } 118 | } 119 | baseParamMap 120 | } 121 | 122 | } 123 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/tuning/BinParamGridBuilder.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.tuning 2 | 3 | import org.apache.spark.ml.param._ 4 | 5 | import scala.collection.mutable 6 | import scala.util.Random 7 | 8 | /** 9 | * User: qfeng 10 | * Date: 15-8-24 上午10:32 11 | * Usage: 快速模型选择时,参数的构建工具类 12 | */ 13 | 14 | /** 15 | * 快速模型选择时,参数的构建工具类(针对二分模型) 16 | */ 17 | class BinParamGridBuilder extends Serializable { 18 | val paramGrid = mutable.Map.empty[Param[_], Iterable[_]] 19 | 20 | def copy(): BinParamGridBuilder = { 21 | val result = new BinParamGridBuilder 22 | result.paramGrid ++= paramGrid 23 | result 24 | } 25 | 26 | /** 27 | * 对于参数集合,随机选择一组参数值 28 | * 29 | * @return 参数及对应参数值集合 30 | */ 31 | def sampleParams(): ParamMap = { 32 | val paramMap = new ParamMap 33 | paramGrid.foreach { case (param, values) => 34 | val valueList = values.toList 35 | val value = valueList(Random.nextInt(valueList.length)) 36 | paramMap.put(param.asInstanceOf[Param[Any]], value) 37 | } 38 | paramMap 39 | } 40 | 41 | /** 42 | * Adds a double param with multiple values. 43 | */ 44 | def addGrid(param: DoubleParam, values: Array[Double]): this.type = { 45 | addGrid[Double](param, values) 46 | } 47 | 48 | // specialized versions of addGrid for Java. 49 | 50 | /** 51 | * Adds a int param with multiple values. 52 | */ 53 | def addGrid(param: IntParam, values: Array[Int]): this.type = { 54 | addGrid[Int](param, values) 55 | } 56 | 57 | /** 58 | * Adds a float param with multiple values. 59 | */ 60 | def addGrid(param: FloatParam, values: Array[Float]): this.type = { 61 | addGrid[Float](param, values) 62 | } 63 | 64 | /** 65 | * Adds a param with multiple values (overwrites if the input param exists). 66 | */ 67 | def addGrid[T](param: Param[T], values: Iterable[T]): this.type = { 68 | paramGrid.put(param, values) 69 | this 70 | } 71 | 72 | /** 73 | * Adds a long param with multiple values. 
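A sketch of how BinParamGridBuilder and BinCrossValidation above fit together, using the binomial FM learner from this repository. It assumes an active SparkSession, that FmLearnSGD is the BinLearner being tuned, and that the remaining FM hyper-parameters are filled into the ParamMap as in FmSuite further below; the grid values are illustrative.

import io.github.qf6101.mfm.factorization.binomial.FmLearnSGD
import io.github.qf6101.mfm.optimization.SquaredL2Updater
import io.github.qf6101.mfm.tuning.{BinCrossValidation, BinParamGridBuilder}
import io.github.qf6101.mfm.util.LoadDSUtil
import org.apache.spark.ml.param.ParamMap

object BinCrossValidationSketch {
  def main(args: Array[String]): Unit = {
    // Same a1a training data the FmSuite test uses.
    val (training, numFeatures) = LoadDSUtil.loadLibSVMDataSet("test_data/input/a1a/a1a")
    val params = new ParamMap()
    val fmLearn = new FmLearnSGD(params, new SquaredL2Updater())
    params.put(fmLearn.numFeatures, numFeatures)
    // ... remaining FM parameters set as in FmSuite ...
    // Candidate values for the parameters to be tuned.
    val grid = new BinParamGridBuilder()
      .addGrid(fmLearn.numFactors, Seq(2, 5, 8))
      .addGrid(fmLearn.reg1, Seq(0.0, 0.01, 0.1))
    // 5-fold search: each parameter is varied in turn against a sampled baseline,
    // and the value with the highest mean AUC is kept.
    val cv = new BinCrossValidation(fmLearn, grid, numFolds = 5)
    val (bestModel, metrics) = cv.selectParamsForClassif(training)
    println(metrics)
  }
}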
74 | */ 75 | def addGrid(param: LongParam, values: Array[Long]): this.type = { 76 | addGrid[Long](param, values) 77 | } 78 | 79 | /** 80 | * Adds a boolean param with true and false. 81 | */ 82 | def addGrid(param: BooleanParam): this.type = { 83 | addGrid[Boolean](param, Array(true, false)) 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/tuning/BinaryClassificationMetrics.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.tuning 2 | 3 | import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics => BCM} 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.storage.StorageLevel 6 | 7 | /** 8 | * User: qfeng 9 | * Date: 15-8-10 上午11:10 10 | * Usage: Binary classification evaluation, See https://en.wikipedia.org/wiki/Receiver_operating_characteristic 11 | */ 12 | 13 | /** 14 | * 二分类指标 15 | * 16 | * @param rawScoreAndLabels 预测值和标签 17 | * @param threshold 二分类阈值(默认0.5) 18 | */ 19 | class BinaryClassificationMetrics(private val rawScoreAndLabels: RDD[(Double, Double)], 20 | val threshold: Double = 0.5) extends Serializable { 21 | // 假如标签为1/-1,将其转换为1/0 22 | private val scoreAndLabels = rawScoreAndLabels.map { case (score, label) => 23 | if (label <= 0) (score, 0.0) else (score, 1.0) 24 | }.persist(StorageLevel.MEMORY_AND_DISK_SER) 25 | 26 | private val metrics = computeMetrics 27 | val accuracy = metrics._1 28 | val precisions = (metrics._2, metrics._3) 29 | val recalls = (metrics._4, metrics._5) 30 | val f1_scores = (metrics._6, metrics._7) 31 | val AUC = AUCValue 32 | private lazy val AUCValue = computeAUC(metrics._8) 33 | 34 | /** 35 | * 将各个度量指标转成字符串形式(保留4位小数) 36 | * 37 | * @return (AUC, accuracy, precisions, recalls, f1_scores) 38 | */ 39 | override def toString: String = { 40 | val result = new StringBuilder 41 | result.append("AUC: ") 42 | result.append("%1.4f".format(AUC)) 43 | result.append(", accuracy: ") 44 | result.append("%1.4f".format(accuracy)) 45 | result.append(", precisions: ") 46 | result.append(mkTupleString(precisions)) 47 | result.append(", recalls: ") 48 | result.append(mkTupleString(recalls)) 49 | result.append(", f1_scores: ") 50 | result.append(mkTupleString(f1_scores)) 51 | result.toString() 52 | } 53 | 54 | /** 55 | * double类型的元组转成字符串(转成4位小数) 56 | * 57 | * @param t 元组 58 | * @return 4位小数表示的字符串 59 | */ 60 | private def mkTupleString(t: (Double, Double)): String = { 61 | val result = new StringBuilder 62 | result.append("(") 63 | result.append("%1.4f".format(t._1)) 64 | result.append(", ") 65 | result.append("%1.4f".format(t._2)) 66 | result.append(")") 67 | result.toString() 68 | } 69 | 70 | private def computeAUC(numData: Int): Double = { 71 | var auc: Double = 0.0 72 | if (numData > 300000) { 73 | auc = new BCM(scoreAndLabels, 100000).areaUnderROC() 74 | } else { 75 | auc = new BCM(scoreAndLabels).areaUnderROC() 76 | } 77 | if (scoreAndLabels.getStorageLevel == StorageLevel.MEMORY_AND_DISK_SER) { 78 | scoreAndLabels.unpersist() 79 | } 80 | auc 81 | } 82 | 83 | /** 84 | * 计算各种衡量二分类模型的度量指标 85 | * 86 | * @return 指标依次为:(accuracy, positive precision, negative precision, positive recall, negative recall, positive f1_scores, negative f1_score) 87 | */ 88 | private def computeMetrics: (Double, Double, Double, Double, Double, Double, Double, Int) = { 89 | val sc = scoreAndLabels.context 90 | val totalAccum = sc.longAccumulator 91 | val testPositiveAccum = sc.longAccumulator 92 | val 
condPositiveAccum = sc.longAccumulator 93 | val truePositiveAccum = sc.longAccumulator 94 | val trueNegativeAccum = sc.longAccumulator 95 | 96 | scoreAndLabels.foreach { case (score, label) => 97 | totalAccum.add(1) 98 | if (score > threshold) { 99 | testPositiveAccum.add(1) 100 | } 101 | if (label == 1.0) { 102 | condPositiveAccum.add(1) 103 | } 104 | if (score >= threshold && label == 1.0) { 105 | truePositiveAccum.add(1) 106 | } 107 | if (score < threshold && label == 0.0) { 108 | trueNegativeAccum.add(1) 109 | } 110 | } 111 | 112 | val totalNum = totalAccum.value.toDouble 113 | val testPositiveNum = testPositiveAccum.value.toDouble 114 | val testNegativeNum = totalNum - testPositiveNum 115 | val condPositiveNum = condPositiveAccum.value.toDouble 116 | val condNegativeNum = totalNum - condPositiveNum 117 | val truePositiveNum = truePositiveAccum.value.toDouble 118 | val trueNegativeNum = trueNegativeAccum.value.toDouble 119 | 120 | //accuracy 121 | val ACC = (truePositiveNum + trueNegativeNum) / totalNum 122 | //positive predictive value (positive precision) 123 | val PPV = truePositiveNum / testPositiveNum 124 | //negative predictive value (negative precision) 125 | val NPV = trueNegativeNum / testNegativeNum 126 | //true positive rate (sensitivity, positive recall) 127 | val TPR = truePositiveNum / condPositiveNum 128 | //true negative rate (specificity, negative recall) 129 | val TNR = trueNegativeNum / condNegativeNum 130 | //positive f1 score 131 | val F1P = (2 * PPV * TPR) / (PPV + TPR) 132 | //negative f1 score 133 | val F1N = (2 * NPV * TNR) / (NPV + TNR) 134 | 135 | (ACC, PPV, NPV, TPR, TNR, F1P, F1N, totalNum.toInt) 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/tuning/RegressionMetrics.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.tuning 2 | 3 | import org.apache.spark.mllib.evaluation.{RegressionMetrics => RM} 4 | import org.apache.spark.rdd.RDD 5 | 6 | /** 7 | * User: qfeng 8 | * Date: 15-8-11 下午4:03 9 | */ 10 | 11 | /** 12 | * 回归模型指标 13 | * 14 | * @param scoreAndLabels 预测值和实际值 15 | */ 16 | class RegressionMetrics(val scoreAndLabels: RDD[(Double, Double)]) { 17 | private val rm = new RM(scoreAndLabels) 18 | 19 | /** 20 | * 将各个度量指标转成字符串形式 21 | * 22 | * @return MSE 23 | */ 24 | override def toString: String = { 25 | val result = new StringBuilder 26 | result.append("MSE: ") 27 | result.append(MSE) 28 | result.toString() 29 | } 30 | 31 | def MSE: Double = { 32 | rm.meanSquaredError 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/util/GaussianRandom.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import java.util.Random 4 | 5 | import breeze.linalg.{DenseMatrix, DenseVector} 6 | 7 | 8 | /** 9 | * Created by qfeng on 15-1-26. 
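A tiny hand-checkable version of the counting logic in BinaryClassificationMetrics.computeMetrics above, using plain Scala collections instead of RDDs and accumulators (it mirrors the original's mix of > and >= threshold comparisons):

object BinaryMetricsByHandSketch {
  def main(args: Array[String]): Unit = {
    // (score, label) pairs with labels already mapped to 1.0 / 0.0, threshold 0.5.
    val scoreAndLabels = Seq(
      (0.9, 1.0), (0.8, 1.0), (0.7, 0.0), (0.4, 1.0),
      (0.3, 0.0), (0.2, 0.0), (0.6, 1.0), (0.1, 0.0))
    val threshold = 0.5
    val total        = scoreAndLabels.size.toDouble                                                // 8
    val testPositive = scoreAndLabels.count { case (s, _) => s > threshold }.toDouble              // 4
    val condPositive = scoreAndLabels.count { case (_, l) => l == 1.0 }.toDouble                   // 4
    val truePositive = scoreAndLabels.count { case (s, l) => s >= threshold && l == 1.0 }.toDouble // 3
    val trueNegative = scoreAndLabels.count { case (s, l) => s < threshold && l == 0.0 }.toDouble  // 3
    val acc = (truePositive + trueNegative) / total  // accuracy: 6/8 = 0.75
    val ppv = truePositive / testPositive            // positive precision: 3/4
    val tpr = truePositive / condPositive            // positive recall: 3/4
    val f1p = 2 * ppv * tpr / (ppv + tpr)            // positive F1: 0.75
    println(f"ACC=$acc%1.4f PPV=$ppv%1.4f TPR=$tpr%1.4f F1+=$f1p%1.4f")
  }
}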
10 | */ 11 | 12 | /** 13 | * 高斯随机数生成器实例 14 | */ 15 | object GaussianRandom { 16 | /** 17 | * 生成高斯随机数密向量 18 | * 19 | * @param mean 高斯分布的均值 20 | * @param stdev 高斯分布的标准差 21 | * @param length 向量长度 22 | * @return 高斯随机数密向量 23 | */ 24 | def randDenseVector(mean: Double, stdev: Double, length: Int): DenseVector[Double] = { 25 | val results = DenseVector.zeros[Double](length) 26 | for (i <- 0 until length) { 27 | results.update(i, rand(mean, stdev)) 28 | } 29 | results 30 | } 31 | 32 | /** 33 | * 生成告诉随机数密矩阵 34 | * 35 | * @param mean 高斯分布的均值 36 | * @param stdev 高斯分布的标准差 37 | * @param numRows 矩阵行数 38 | * @param numCols 矩阵列数 39 | * @return 高斯随机数密矩阵 40 | */ 41 | def randDenseMatrix(mean: Double, stdev: Double, numRows: Int, numCols: Int): DenseMatrix[Double] = { 42 | val results = DenseMatrix.zeros[Double](numRows, numCols) 43 | for (i <- 0 until numRows) 44 | for (j <- 0 until numCols) 45 | results.update(i, j, rand(mean, stdev)) 46 | results 47 | } 48 | 49 | /** 50 | * 生成高斯随机数 51 | * 52 | * @param mean 高斯分布的均值 53 | * @param stdev 高斯分布的标准差 54 | * @return 高斯随机数 55 | */ 56 | def rand(mean: Double, stdev: Double): Double = { 57 | val random = new Random() 58 | val genValue = random.nextGaussian() 59 | mean + stdev * genValue 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/util/HDFSUtil.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import org.apache.hadoop.fs.{FileSystem, Path} 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.SparkSession 6 | 7 | /** 8 | * Created by qfeng on 16-3-3. 9 | */ 10 | 11 | /** 12 | * HDFS文件操作工具类 13 | */ 14 | object HDFSUtil { 15 | /** 16 | * 如果文件存在则删除它 17 | * 18 | * @param file 文件 19 | */ 20 | def deleteIfExists(file: String): Unit = { 21 | val spark = SparkSession.builder().getOrCreate() 22 | val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) 23 | if (fs.exists(new Path(file))) { 24 | fs.delete(new Path(file), true) 25 | } 26 | } 27 | 28 | /** 29 | * 文件是否存在 30 | * 31 | * @param file 文件 32 | */ 33 | def exists(file: String): Boolean = { 34 | val spark = SparkSession.builder().getOrCreate() 35 | val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) 36 | fs.exists(new Path(file)) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/util/LoadDSUtil.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import breeze.linalg.SparseVector 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.storage.StorageLevel 7 | 8 | /** 9 | * Created by qfeng on 15-3-18. 10 | */ 11 | object LoadDSUtil { 12 | // Convenient methods for `loadLibSVMFile`. 13 | 14 | /** 15 | * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint], with the default number of 16 | * partitions. 17 | */ 18 | def loadLibSVMDataSet(path: String, 19 | numFeatures: Int = -1): (RDD[(Double, SparseVector[Double])], Int) = { 20 | val sc = SparkContext.getOrCreate() 21 | val dataSet = sc.textFile(path, sc.defaultMinPartitions) 22 | toLibSVMDataSet(dataSet, numFeatures) 23 | } 24 | 25 | /** 26 | * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint]. 27 | * The LIBSVM format is a text-based format used by LIBSVM and LIBLINEAR. 
28 | * Each line represents a labeled sparse feature vector using the following format: 29 | * {{{label index1:value1 index2:value2 ...}}} 30 | * where the indices are one-based and in ascending order. 31 | * This method parses each line into a [[org.apache.spark.mllib.regression.LabeledPoint]], 32 | * where the feature indices are converted to zero-based. 33 | * 34 | * @param dataSet 数据集 35 | * @param numFeatures number of features, which will be determined from the input data if a 36 | * nonpositive value is given. This is useful when the dataset is already split 37 | * into multiple files and you want to load them separately, because some 38 | * features may not present in certain files, which leads to inconsistent 39 | * feature dimensions. 40 | * @return labeled data stored as an RDD[LabeledPoint] 41 | */ 42 | def toLibSVMDataSet(dataSet: RDD[String], 43 | numFeatures: Int = -1): (RDD[(Double, SparseVector[Double])], Int) = { 44 | val parsed = dataSet.map(_.trim) 45 | .filter(line => !(line.isEmpty || line.startsWith("#"))) 46 | .map { line => 47 | val items = line.split(' ') 48 | val label = items.head.toDouble 49 | val (indices, values) = items.tail.filter(_.nonEmpty).map { item => 50 | val indexAndValue = item.split(':') 51 | val index = indexAndValue(0).toInt - 1 // Convert 1-based indices to 0-based. 52 | val value = indexAndValue(1).toDouble 53 | (index, value) 54 | }.unzip 55 | (label, indices, values) 56 | } 57 | 58 | // Determine number of features. 59 | val d = if (numFeatures > 0) { 60 | numFeatures 61 | } else { 62 | parsed.persist(StorageLevel.MEMORY_AND_DISK_SER) 63 | parsed.map { case (label, indices, values) => 64 | indices.lastOption.getOrElse(0) 65 | }.reduce(math.max) + 1 66 | } 67 | 68 | (parsed.map { case (label, indices, values) => 69 | (label, new SparseVector[Double](indices, values, d)) 70 | }, d) 71 | } 72 | 73 | } 74 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/util/Logging.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import org.slf4j.{Logger, LoggerFactory} 4 | 5 | /** 6 | * Created by qfeng on 16-8-30. 
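A standalone sketch of the per-line parsing that LoadDSUtil.toLibSVMDataSet above performs, on one made-up LIBSVM line with an assumed feature dimension of 8:

import breeze.linalg.SparseVector

object LibSVMLineSketch {
  def main(args: Array[String]): Unit = {
    // Label first, then one-based index:value pairs.
    val line = "1 3:0.5 7:1.2"
    val items = line.trim.split(' ')
    val label = items.head.toDouble
    val (indices, values) = items.tail.filter(_.nonEmpty).map { item =>
      val Array(i, v) = item.split(':')
      (i.toInt - 1, v.toDouble)  // convert 1-based indices to 0-based
    }.unzip
    // Becomes (1.0, vector with index 2 -> 0.5 and index 6 -> 1.2).
    val features = new SparseVector[Double](indices, values, 8)
    println((label, features))
  }
}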
7 | */ 8 | 9 | /** 10 | * 日志工具接口 11 | */ 12 | trait Logging { 13 | @transient private var log_ : Logger = null 14 | 15 | protected def logName = { 16 | this.getClass.getName.stripSuffix("$") 17 | } 18 | 19 | protected def log: Logger = { 20 | if (log_ == null) { 21 | log_ = LoggerFactory.getLogger(logName) 22 | } 23 | log_ 24 | } 25 | 26 | protected def logInfo(msg: => String) { 27 | if (log.isInfoEnabled) log.info(msg) 28 | } 29 | 30 | protected def logDebug(msg: => String) { 31 | if (log.isDebugEnabled) log.debug(msg) 32 | } 33 | 34 | protected def logTrace(msg: => String) { 35 | if (log.isTraceEnabled) log.trace(msg) 36 | } 37 | 38 | protected def logWarning(msg: => String) { 39 | if (log.isWarnEnabled) log.warn(msg) 40 | } 41 | 42 | protected def logError(msg: => String) { 43 | if (log.isErrorEnabled) log.error(msg) 44 | } 45 | 46 | protected def logInfo(msg: => String, throwable: Throwable) { 47 | if (log.isInfoEnabled) log.info(msg, throwable) 48 | } 49 | 50 | protected def logDebug(msg: => String, throwable: Throwable) { 51 | if (log.isDebugEnabled) log.debug(msg, throwable) 52 | } 53 | 54 | protected def logTrace(msg: => String, throwable: Throwable) { 55 | if (log.isTraceEnabled) log.trace(msg, throwable) 56 | } 57 | 58 | protected def logWarning(msg: => String, throwable: Throwable) { 59 | if (log.isWarnEnabled) log.warn(msg, throwable) 60 | } 61 | 62 | protected def logError(msg: => String, throwable: Throwable) { 63 | if (log.isErrorEnabled) log.error(msg, throwable) 64 | } 65 | 66 | protected def isTraceEnabled: Boolean = { 67 | log.isTraceEnabled 68 | } 69 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/util/NumericParser.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import java.util.StringTokenizer 4 | 5 | import org.apache.spark.SparkException 6 | 7 | import scala.collection.mutable.{ArrayBuffer, ListBuffer} 8 | 9 | /** 10 | * Created by qfeng on 15-3-13. 11 | */ 12 | 13 | /** 14 | * Simple parser for a numeric structure consisting of three types: 15 | * 16 | * - number: a double in Java's floating number format 17 | * - array: an array of numbers stored as `[v0,v1,...,vn]` 18 | * - tuple: a list of numbers, arrays, or tuples stored as `(...)` 19 | */ 20 | object NumericParser { 21 | 22 | /** Parses a string into a Double, an Array[Double], or a Seq[Any]. 
*/ 23 | def parse(s: String): Any = { 24 | val tokenizer = new StringTokenizer(s, "()[],", true) 25 | if (tokenizer.hasMoreTokens()) { 26 | val token = tokenizer.nextToken() 27 | if (token == "(") { 28 | parseTuple(tokenizer) 29 | } else if (token == "[") { 30 | parseArray(tokenizer) 31 | } else { 32 | // expecting a number 33 | parseDouble(token) 34 | } 35 | } else { 36 | throw new SparkException(s"Cannot find any token from the input string.") 37 | } 38 | } 39 | 40 | private def parseArray(tokenizer: StringTokenizer): Array[Double] = { 41 | val values = ArrayBuffer.empty[Double] 42 | var parsing = true 43 | var allowComma = false 44 | var token: String = null 45 | while (parsing && tokenizer.hasMoreTokens()) { 46 | token = tokenizer.nextToken() 47 | if (token == "]") { 48 | parsing = false 49 | } else if (token == ",") { 50 | if (allowComma) { 51 | allowComma = false 52 | } else { 53 | throw new SparkException("Found a ',' at a wrong position.") 54 | } 55 | } else { 56 | // expecting a number 57 | values.append(parseDouble(token)) 58 | allowComma = true 59 | } 60 | } 61 | if (parsing) { 62 | throw new SparkException(s"An array must end with ']'.") 63 | } 64 | values.toArray 65 | } 66 | 67 | private def parseTuple(tokenizer: StringTokenizer): Seq[_] = { 68 | val items = ListBuffer.empty[Any] 69 | var parsing = true 70 | var allowComma = false 71 | var token: String = null 72 | while (parsing && tokenizer.hasMoreTokens()) { 73 | token = tokenizer.nextToken() 74 | if (token == "(") { 75 | items.append(parseTuple(tokenizer)) 76 | allowComma = true 77 | } else if (token == "[") { 78 | items.append(parseArray(tokenizer)) 79 | allowComma = true 80 | } else if (token == ",") { 81 | if (allowComma) { 82 | allowComma = false 83 | } else { 84 | throw new SparkException("Found a ',' at a wrong position.") 85 | } 86 | } else if (token == ")") { 87 | parsing = false 88 | } else { 89 | // expecting a number 90 | items.append(parseDouble(token)) 91 | allowComma = true 92 | } 93 | } 94 | if (parsing) { 95 | throw new SparkException(s"A tuple must end with ')'.") 96 | } 97 | items 98 | } 99 | 100 | private def parseDouble(s: String): Double = { 101 | try { 102 | java.lang.Double.parseDouble(s) 103 | } catch { 104 | case e: Throwable => 105 | throw new SparkException(s"Cannot parse a double from: $s", e) 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/util/ParamUtil.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import org.apache.spark.ml.param.ParamMap 4 | 5 | /** 6 | * Created by qfeng on 15-3-31. 
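A short usage sketch of NumericParser above, showing the three accepted forms (the literals are chosen only for illustration):

import io.github.qf6101.mfm.util.NumericParser

object NumericParserSketch {
  def main(args: Array[String]): Unit = {
    // A bare number parses to a Double.
    println(NumericParser.parse("1.5"))  // 1.5
    // Square brackets parse to an Array[Double].
    println(NumericParser.parse("[1.0,2.0,3.0]")
      .asInstanceOf[Array[Double]].mkString(","))  // 1.0,2.0,3.0
    // Parentheses parse to a Seq whose elements may be numbers, arrays or nested tuples.
    val tuple = NumericParser.parse("(1.0,[2.0,3.0])").asInstanceOf[Seq[Any]]
    println(tuple.head)  // 1.0
  }
}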
7 | */ 8 | 9 | /** 10 | * 参数工具类实例 11 | */ 12 | object ParamUtil { 13 | 14 | /** 15 | * 参数池转成字符串 16 | * 17 | * @param params 参数池 18 | * @return 字符串 19 | */ 20 | def paramsToString(params: ParamMap): String = { 21 | params.toSeq.map { paramPair => paramPair.value match { 22 | case v: Array[_] => s"${paramPair.param.name}:${v.mkString(",")}" 23 | case _ => s"${paramPair.param.name}:${paramPair.value}" 24 | } 25 | }.mkString(", ") 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/util/VectorConverter.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import breeze.linalg.SparseVector 4 | import org.apache.spark.mllib.linalg.Vector 5 | 6 | /** 7 | * Created by qfeng on 15-3-17. 8 | */ 9 | object VectorConverter { 10 | /** 11 | * spark的向量转成breeze的稀疏向量 12 | * 13 | * @param input spark向量 14 | * @return breeze的稀疏向量 15 | */ 16 | def SparkVector2SV(input: Vector): SparseVector[Double] = { 17 | val result = SparseVector.zeros[Double](input.size) 18 | 19 | for (i <- 0 until input.size) { 20 | if (input(i) != 0.0) { 21 | result.update(i, input(i)) 22 | } 23 | } 24 | 25 | result 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootCategory=DEBUG, console, file 2 | # Set everything to be logged to the console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{100}: %m%n 7 | # Set everything to be logged to the file core/target/unit-tests.log 8 | log4j.appender.file=org.apache.log4j.FileAppender 9 | log4j.appender.file.append=false 10 | log4j.appender.file.file=target/unit-tests.log 11 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 12 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 13 | #ignore specific log information 14 | log4j.logger.org.eclipse.jetty=OFF 15 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=OFF 16 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=OFF 17 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=OFF 18 | log4j.logger.org.apache.spark=OFF 19 | log4j.logger.Remoting=OFF 20 | log4j.logger.org.spark-project.jetty=OFF 21 | log4j.logger.org.apache.hadoop=OFF 22 | log4j.logger.io.netty=OFF 23 | log4j.logger.akka=OFF 24 | log4j.logger.breeze=OFF 25 | log4j.logger.org.spark_project.jetty=OFF -------------------------------------------------------------------------------- /src/test/scala/io/github/qf6101/mfm/factorization/binomial/FmSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.binomial 2 | 3 | import io.github.qf6101.mfm.optimization.{LogXDecreasingStrategy, SquaredL2Updater} 4 | import io.github.qf6101.mfm.tuning.BinaryClassificationMetrics 5 | import io.github.qf6101.mfm.util.TestingUtils._ 6 | import io.github.qf6101.mfm.util.{HDFSUtil, LoadDSUtil, MfmTestSparkSession} 7 | import org.apache.spark.ml.param.ParamMap 8 | import org.scalatest.FunSuite 9 | 10 | /** 11 | * User: qfeng 12 | * Date: 15-12-8 下午4:58 13 | */ 14 | class FmSuite extends FunSuite with 
MfmTestSparkSession { 15 | test("test binomial factorization machines") { 16 | // Load training and testing data sets 17 | val (training, _) = LoadDSUtil.loadLibSVMDataSet("test_data/input/a1a/a1a") 18 | val (testing, numFeatures) = LoadDSUtil.loadLibSVMDataSet("test_data/input/a1a/a1a.t") 19 | // Construct factorization machines learner with parameters 20 | val params = new ParamMap() 21 | val updater = new SquaredL2Updater(decreasingStrategy = new LogXDecreasingStrategy(100)) 22 | val fmLearn = new FmLearnSGD(params, updater) 23 | params.put(fmLearn.gd.numIterations, 100) 24 | params.put(fmLearn.gd.stepSize, 0.1) 25 | params.put(fmLearn.gd.miniBatchFraction, 1.0) 26 | params.put(fmLearn.gd.convergenceTol, 1E-5) 27 | params.put(fmLearn.numFeatures, numFeatures) 28 | params.put(fmLearn.numFactors, 5) 29 | params.put(fmLearn.k0, false) 30 | params.put(fmLearn.k1, true) 31 | params.put(fmLearn.k2, false) 32 | params.put(fmLearn.maxInteractFeatures, numFeatures) 33 | params.put(fmLearn.initMean, 0.0) 34 | params.put(fmLearn.initStdev, 0.0001) 35 | params.put(fmLearn.reg0, 0.0) 36 | params.put(fmLearn.reg1, 0.0) 37 | params.put(fmLearn.reg2, 0.0) 38 | // Train FM model 39 | val model = fmLearn.train(training) 40 | // Use testing data set to evaluate the model 41 | val eval = testing.map { case (label, features) => 42 | (model.predict(features), label) 43 | } 44 | val metrics = new BinaryClassificationMetrics(eval) 45 | // Save model to file 46 | HDFSUtil.deleteIfExists("test_data/output/a1a") 47 | model.save("test_data/output/a1a") 48 | 49 | //// Firstly test spark reloading 50 | // Reload model from file and test if it is equal to the original model 51 | val sparkReloadModel = FmModel("test_data/output/a1a") 52 | assert(model.equals(sparkReloadModel)) 53 | // Evaluate the reloaded model 54 | val sparkReloadEval = testing.map { case (label, features) => 55 | (sparkReloadModel.predict(features), label) 56 | } 57 | // Test if the reloaded model has the same result on the testing data set 58 | val sparkReloadMetrics = new BinaryClassificationMetrics(sparkReloadEval) 59 | assert(sparkReloadMetrics.AUC ~= metrics.AUC absTol 1E-5) 60 | 61 | //// Secondly test local reloading 62 | // Reload model from file and test if it is equal to the original model 63 | val localReloadModel = FmModel.fromLocal("test_data/output/a1a") 64 | assert(model.equals(localReloadModel)) 65 | // Evaluate the reloaded model 66 | val localReloadEval = testing.map { case (label, features) => 67 | (localReloadModel.predict(features), label) 68 | } 69 | // Test if the reloaded model has the same result on the testing data set 70 | val localReloadMetrics = new BinaryClassificationMetrics(localReloadEval) 71 | assert(localReloadMetrics.AUC ~= metrics.AUC absTol 1E-5) 72 | // print the AUC 73 | println("AUC: " + metrics.AUC) 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/test/scala/io/github/qf6101/mfm/factorization/multinomial/MfmCoefficientsSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.multinomial 2 | 3 | import io.github.qf6101.mfm.util.MfmTestSparkSession 4 | import org.scalatest.FunSuite 5 | 6 | /** 7 | * Created by qfeng on 16-9-18. 
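A sketch of scoring a single example with the binomial model that the FmSuite test above saves. It assumes the saved model exists under test_data/output/a1a; the active feature indices and the dimension 123 are illustrative assumptions, and the Spark-based FmModel(...) loader (as opposed to fromLocal) additionally needs an active SparkSession.

import breeze.linalg.SparseVector
import io.github.qf6101.mfm.factorization.binomial.FmModel

object ScoreOneExampleSketch {
  def main(args: Array[String]): Unit = {
    // Driver-local reload of the model written by FmSuite.
    val model = FmModel.fromLocal("test_data/output/a1a")
    // One sparse feature vector (indices/values made up for illustration).
    val features = new SparseVector[Double](Array(3, 15, 72), Array(1.0, 1.0, 1.0), 123)
    // For the binomial model, predict returns a single score for the positive class.
    val score = model.predict(features)
    println(s"predicted score = $score")
  }
}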
8 | */ 9 | class MfmCoefficientsSuite extends FunSuite with MfmTestSparkSession { 10 | test("test MfmCoefficients' += operation") { 11 | val left = new MfmCoefficients(0.0, 0.01, 1, 1, 1, false, true, false, 2) 12 | val right = new MfmCoefficients(0.0, 0.01, 1, 1, 1, false, true, false, 2) 13 | 14 | val leftSample = left.thetas(0).w(0) 15 | val rightSample = right.thetas(0).w(0) 16 | println(leftSample) 17 | println(rightSample) 18 | left += right 19 | println(left.thetas(0).w(0)) 20 | assert(left.thetas(0).w(0) == leftSample + rightSample) 21 | } 22 | 23 | test("test MfmCoefficients' + operation") { 24 | val left = new MfmCoefficients(0.0, 0.01, 1, 1, 1, false, true, false, 2) 25 | val right = new MfmCoefficients(0.0, 0.01, 1, 1, 1, false, true, false, 2) 26 | 27 | val leftSample = left.thetas(0).w(0) 28 | val rightSample = right.thetas(0).w(0) 29 | println(leftSample) 30 | println(rightSample) 31 | val sum = left + right 32 | println(sum.asInstanceOf[MfmCoefficients].thetas(0).w(0)) 33 | assert(sum.asInstanceOf[MfmCoefficients].thetas(0).w(0) == leftSample + rightSample) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/test/scala/io/github/qf6101/mfm/factorization/multinomial/MfmSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.multinomial 2 | 3 | import breeze.linalg.argmax 4 | import io.github.qf6101.mfm.optimization._ 5 | import io.github.qf6101.mfm.util.TestingUtils._ 6 | import io.github.qf6101.mfm.util.{HDFSUtil, LoadDSUtil, MfmTestSparkSession} 7 | import org.apache.spark.ml.param.ParamMap 8 | import org.apache.spark.mllib.evaluation.MulticlassMetrics 9 | import org.scalatest.FunSuite 10 | 11 | /** 12 | * User: qfeng 13 | * Date: 15-12-8 下午4:58 14 | */ 15 | class MfmSuite extends FunSuite with MfmTestSparkSession { 16 | test("test binomial factorization machines") { 17 | // Load training and testing data sets 18 | val (training, numFeatures) = LoadDSUtil.loadLibSVMDataSet("test_data/input/mnist/mnist.scale") 19 | val (testing, _) = LoadDSUtil.loadLibSVMDataSet("test_data/input/mnist/mnist.scale.t") 20 | // Construct multinomial factorization machines learner with parameters 21 | val params = new ParamMap() 22 | val updater = new SquaredL2Updater(decreasingStrategy = new LogXDecreasingStrategy(20)) 23 | val mfmLearn = new MfmLearnSGD(params, updater) 24 | params.put(mfmLearn.gd.numIterations, 10) 25 | params.put(mfmLearn.gd.stepSize, 0.1) 26 | params.put(mfmLearn.gd.miniBatchFraction, 1.0) 27 | params.put(mfmLearn.gd.convergenceTol, 1E-5) 28 | params.put(mfmLearn.numFeatures, numFeatures) 29 | params.put(mfmLearn.numFactors, 5) 30 | params.put(mfmLearn.k0, false) 31 | params.put(mfmLearn.k1, true) 32 | params.put(mfmLearn.k2, false) 33 | params.put(mfmLearn.maxInteractFeatures, numFeatures) 34 | params.put(mfmLearn.initMean, 0.0) 35 | params.put(mfmLearn.initStdev, 0.01) 36 | params.put(mfmLearn.reg0, 0.0001) 37 | params.put(mfmLearn.reg1, 0.0001) 38 | params.put(mfmLearn.reg2, 0.001) 39 | params.put(mfmLearn.numClasses, 10) 40 | // Train MFM model 41 | val model = mfmLearn.train(training) 42 | // Use testing data set to evaluate the model 43 | val eval = testing.map { case (label, features) => 44 | argmax(model.predict(features)).toDouble -> label 45 | } 46 | val metrics = new MulticlassMetrics(eval) 47 | // Save model to file 48 | HDFSUtil.deleteIfExists("test_data/output/mnist") 49 | model.save("test_data/output/mnist") 50 | 51 | //// 
Firstly test spark reloading 52 | // Reload model from file and test if it is equal to the original model 53 | val sparkReloadModel = MfmModel("test_data/output/mnist") 54 | assert(model.equals(sparkReloadModel)) 55 | // Evaluate the reloaded model 56 | val sparkReloadEval = testing.map { case (label, features) => 57 | argmax(sparkReloadModel.predict(features)).toDouble -> label 58 | } 59 | // Test if the reloaded model has the same result on the testing data set 60 | val sparkReloadMetrics = new MulticlassMetrics(sparkReloadEval) 61 | assert(sparkReloadMetrics.accuracy ~= metrics.accuracy absTol 1E-5) 62 | assert(sparkReloadMetrics.weightedPrecision ~= metrics.weightedPrecision absTol 1E-5) 63 | assert(sparkReloadMetrics.weightedRecall ~= metrics.weightedRecall absTol 1E-5) 64 | assert(sparkReloadMetrics.weightedFMeasure ~= metrics.weightedFMeasure absTol 1E-5) 65 | 66 | //// Secondly test local reloading 67 | // Reload model from file and test if it is equal to the original model 68 | val localReloadModel = MfmModel.fromLocal("test_data/output/mnist") 69 | assert(model.equals(localReloadModel)) 70 | // Evaluate the reloaded model 71 | val localReloadEval = testing.map { case (label, features) => 72 | argmax(localReloadModel.predict(features)).toDouble -> label 73 | } 74 | // Test if the reloaded model has the same result on the testing data set 75 | val localReloadMetrics = new MulticlassMetrics(localReloadEval) 76 | assert(localReloadMetrics.accuracy ~= metrics.accuracy absTol 1E-5) 77 | assert(localReloadMetrics.weightedPrecision ~= metrics.weightedPrecision absTol 1E-5) 78 | assert(localReloadMetrics.weightedRecall ~= metrics.weightedRecall absTol 1E-5) 79 | assert(localReloadMetrics.weightedFMeasure ~= metrics.weightedFMeasure absTol 1E-5) 80 | // print the metrics 81 | println("accuracy: " + metrics.accuracy) 82 | println("weighted precision: " + metrics.weightedPrecision) 83 | println("weighted recall: " + metrics.weightedRecall) 84 | println("weighted f-measure: " + metrics.weightedFMeasure) 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/test/scala/io/github/qf6101/mfm/optimization/GradientDescentSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.optimization 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.logisticregression.{LogisticGradient, LrLearnSGD, VectorCoefficients} 5 | import io.github.qf6101.mfm.util.MfmTestSparkSession 6 | import io.github.qf6101.mfm.util.TestingUtils._ 7 | import org.apache.spark.ml.param.ParamMap 8 | import org.scalatest.{FunSuite, Matchers} 9 | 10 | import scala.collection.JavaConversions._ 11 | import scala.util.Random 12 | 13 | /** 14 | * Created by qfeng on 15-3-13. 
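The multinomial counterpart: unlike the binomial FmModel, whose predict returns a single score, MfmModel.predict returns one score per class and the predicted class is taken with argmax, exactly as the MfmSuite evaluation above does. The sketch assumes the model saved by that test exists and that 780 is the mnist.scale feature dimension; the feature indices and values are illustrative.

import breeze.linalg.{argmax, SparseVector}
import io.github.qf6101.mfm.factorization.multinomial.MfmModel

object MultinomialScoreSketch {
  def main(args: Array[String]): Unit = {
    val model = MfmModel.fromLocal("test_data/output/mnist")
    val features = new SparseVector[Double](Array(100, 200, 300), Array(0.5, 0.8, 0.2), 780)
    // predict returns a vector of per-class scores; argmax picks the predicted class index.
    val classScores = model.predict(features)
    println(s"predicted class = ${argmax(classScores)}")
  }
}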
15 | */ 16 | 17 | object GradientDescentSuite { 18 | 19 | def generateLogisticInputAsList( 20 | offset: Double, 21 | scale: Double, 22 | nPoints: Int, 23 | seed: Int): java.util.List[(Double, SparseVector[Double])] = { 24 | seqAsJavaList(generateGDInput(offset, scale, nPoints, seed)) 25 | } 26 | 27 | // Generate input of the form Y = logistic(offset + scale * X) 28 | def generateGDInput(offset: Double, 29 | scale: Double, 30 | nPoints: Int, 31 | seed: Int): Seq[(Double, SparseVector[Double])] = { 32 | val rnd = new Random(seed) 33 | val x1 = Array.fill[Double](nPoints)(rnd.nextGaussian()) 34 | 35 | val unifRand = new Random(45) 36 | val rLogis = (0 until nPoints).map { i => 37 | val u = unifRand.nextDouble() 38 | math.log(u) - math.log(1.0 - u) 39 | } 40 | 41 | val y: Seq[Double] = (0 until nPoints).map { i => 42 | val yVal = offset + scale * x1(i) + rLogis(i) 43 | if (yVal > 0) 1.0 else 0.0 44 | } 45 | 46 | (0 until nPoints).map(i => (y(i), new SparseVector[Double](Array(0, 1), Array(1.0, x1(i)), 2))) 47 | } 48 | } 49 | 50 | class GradientDescentSuite extends FunSuite with MfmTestSparkSession with Matchers { 51 | test("Assert the loss is decreasing.") { 52 | val nPoints = 1000 53 | val A = 2.0 54 | val B = -1.5 55 | 56 | val params = new ParamMap() 57 | val gradient = new LogisticGradient(params) 58 | val updater = new SimpleUpdater() 59 | val lrf = new LrLearnSGD(params, null) 60 | val gd = new GradientDescent(gradient, updater, params) 61 | 62 | // Add a extra variable consisting of all 1.0's for the intercept. 63 | val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42) 64 | val dataRDD = spark.sparkContext.parallelize(testData, 2).cache() 65 | val initialWeightsWithIntercept = new VectorCoefficients(2) 66 | initialWeightsWithIntercept.w.update(0, 1.0) 67 | initialWeightsWithIntercept.w.update(1, -1.0) 68 | 69 | params.put(gd.numIterations, 10) 70 | params.put(gd.miniBatchFraction, 1.0) 71 | params.put(gd.stepSize, 1.0) 72 | params.put(gd.convergenceTol, 1E-4) 73 | params.put(lrf.reg, Array(0.0)) 74 | 75 | val (_, loss) = gd.optimizeWithHistory( 76 | dataRDD, 77 | initialWeightsWithIntercept, 78 | params(lrf.reg)) 79 | 80 | assert(loss.last - loss.head < 0, "loss isn't decreasing.") 81 | 82 | val lossDiff = loss.init.zip(loss.tail).map { case (lhs, rhs) => lhs - rhs } 83 | assert(lossDiff.count(_ > 0).toDouble / lossDiff.size > 0.8) 84 | } 85 | 86 | 87 | test("Test the loss and gradient of first iteration with regularization.") { 88 | val params = new ParamMap() 89 | val gradient = new LogisticGradient(params) 90 | val updater = new SquaredL2Updater() 91 | val lrf = new LrLearnSGD(params, null) 92 | val gd = new GradientDescent(gradient, updater, params) 93 | 94 | // Add a extra variable consisting of all 1.0's for the intercept. 
95 | val testData = GradientDescentSuite.generateGDInput(2.0, -1.5, 1000, 42) 96 | val dataRDD = spark.sparkContext.parallelize(testData, 2).cache() 97 | 98 | // Prepare non-zero weights 99 | val initialWeightsWithIntercept = new VectorCoefficients(2) 100 | initialWeightsWithIntercept.w.update(0, 1.0) 101 | initialWeightsWithIntercept.w.update(1, 0.5) 102 | 103 | params.put(gd.numIterations, 1) 104 | params.put(gd.miniBatchFraction, 1.0) 105 | params.put(gd.stepSize, 1.0) 106 | params.put(gd.convergenceTol, 1E-4) 107 | params.put(lrf.reg, Array(0.0)) 108 | 109 | val (newWeights0, loss0) = gd.optimizeWithHistory( 110 | dataRDD, initialWeightsWithIntercept, params(lrf.reg)) 111 | 112 | params.put(gd.numIterations, 1) 113 | params.put(lrf.reg, Array(1.0)) 114 | 115 | val (newWeights1, loss1) = gd.optimizeWithHistory( 116 | dataRDD, initialWeightsWithIntercept, params(lrf.reg)) 117 | 118 | assert( 119 | loss1(0) ~= (loss0(0) + (math.pow(initialWeightsWithIntercept.w(0), 2) + 120 | math.pow(initialWeightsWithIntercept.w(1), 2)) / 2) absTol 1E-5, 121 | """For non-zero weights, the regVal should be \frac{1}{2}\sum_i w_i^2.""") 122 | 123 | assert( 124 | (newWeights1.asInstanceOf[VectorCoefficients].w(0) ~= (newWeights0.asInstanceOf[VectorCoefficients].w(0) - 125 | initialWeightsWithIntercept.w(0)) 126 | absTol 1E-5) && 127 | (newWeights1.asInstanceOf[VectorCoefficients].w(1) ~= (newWeights0.asInstanceOf[VectorCoefficients].w(1) - 128 | initialWeightsWithIntercept.w(1)) absTol 1E-5), 129 | "The different between newWeights with/without regularization " + 130 | "should be initialWeightsWithIntercept.") 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /src/test/scala/io/github/qf6101/mfm/optimization/LBFGSSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.optimization 2 | 3 | import io.github.qf6101.mfm.logisticregression.{LogisticGradient, LrLearnLBFGS, VectorCoefficients} 4 | import io.github.qf6101.mfm.util.MfmTestSparkSession 5 | import io.github.qf6101.mfm.util.TestingUtils._ 6 | import org.apache.spark.ml.param.ParamMap 7 | import org.scalatest.FunSuite 8 | 9 | 10 | /** 11 | * Created by qfeng on 15-4-7. 12 | */ 13 | 14 | 15 | class LBFGSSuite extends FunSuite with MfmTestSparkSession { 16 | lazy val dataRDD = spark.sparkContext.parallelize(testData, 2).cache() 17 | val nPoints = 1000 18 | val A = 2.0 19 | val B = -1.5 20 | // Add a extra variable consisting of all 1.0's for the intercept. 
21 | val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42) 22 | val simpleUpdater = new SimpleUpdater() 23 | val squaredL2Updater = new SquaredL2Updater() 24 | 25 | test("LBFGS loss should be decreasing and match the result of Gradient Descent.") { 26 | val initialWeightsWithIntercept = new VectorCoefficients(2) 27 | initialWeightsWithIntercept.w.update(0, 1.0) 28 | initialWeightsWithIntercept.w.update(1, -1.0) 29 | 30 | val lbfgsParamPool = new ParamMap() 31 | val lbfgsGradient = new LogisticGradient(lbfgsParamPool) 32 | val lbfgsLrf = new LrLearnLBFGS(lbfgsParamPool, null) 33 | val lbfgs = new LBFGS(lbfgsGradient, simpleUpdater, lbfgsParamPool) 34 | 35 | lbfgsParamPool.put(lbfgs.numIterations, 10) 36 | lbfgsParamPool.put(lbfgsLrf.reg, Array(0.0)) 37 | lbfgsParamPool.put(lbfgs.convergenceTol, 1e-12) 38 | lbfgsParamPool.put(lbfgs.numCorrections, 10) 39 | 40 | val (_, lossLBFGS) = lbfgs.optimizeWithHistory( 41 | dataRDD, 42 | initialWeightsWithIntercept, 43 | lbfgsParamPool(lbfgsLrf.reg)) 44 | 45 | // Since the cost function is convex, the loss is guaranteed to be monotonically decreasing 46 | // with L-BFGS optimizer. 47 | // (SGD doesn't guarantee this, and the loss will be fluctuating in the optimization process.) 48 | assert((lossLBFGS, lossLBFGS.tail).zipped.forall(_ > _), "loss should be monotonically decreasing.") 49 | 50 | val gdParamPool = new ParamMap() 51 | val gdGradient = new LogisticGradient(gdParamPool) 52 | val gdLrf = new LrLearnLBFGS(lbfgsParamPool, null) 53 | val gd = new GradientDescent(gdGradient, simpleUpdater, gdParamPool) 54 | 55 | gdParamPool.put(gd.stepSize, 1.0) 56 | gdParamPool.put(gd.numIterations, 50) 57 | gdParamPool.put(gdLrf.reg, Array(0.0)) 58 | gdParamPool.put(gd.miniBatchFraction, 1.0) 59 | gdParamPool.put(gd.convergenceTol, 1E-12) 60 | 61 | val (_, lossGD) = gd.optimizeWithHistory( 62 | dataRDD, 63 | initialWeightsWithIntercept, 64 | gdParamPool(gdLrf.reg)) 65 | 66 | // GD converges a way slower than L-BFGS. To achieve 1% difference, 67 | // it requires 90 iterations in GD. No matter how hard we increase 68 | // the number of iterations in GD here, the lossGD will be always 69 | // larger than lossLBFGS. This is based on observation, no theoretically guaranteed 70 | assert(Math.abs((lossGD.last - lossLBFGS.last) / lossLBFGS.last) < 0.02, 71 | "LBFGS should match GD result within 2% difference.") 72 | } 73 | 74 | test("LBFGS and Gradient Descent with L2 regularization should get the same result.") { 75 | val initialWeightsWithIntercept = new VectorCoefficients(2) 76 | initialWeightsWithIntercept.w.update(0, 0.3) 77 | initialWeightsWithIntercept.w.update(1, 0.12) 78 | 79 | val lbfgsParamPool = new ParamMap() 80 | val lbfgsGradient = new LogisticGradient(lbfgsParamPool) 81 | val lbfgsLrf = new LrLearnLBFGS(lbfgsParamPool, null) 82 | val lbfgs = new LBFGS(lbfgsGradient, squaredL2Updater, lbfgsParamPool) 83 | 84 | lbfgsParamPool.put(lbfgs.numIterations, 10) 85 | lbfgsParamPool.put(lbfgsLrf.reg, Array(0.2)) 86 | lbfgsParamPool.put(lbfgs.convergenceTol, 1e-12) 87 | lbfgsParamPool.put(lbfgs.numCorrections, 10) 88 | 89 | val (weightLBFGS, lossLBFGS) = lbfgs.optimizeWithHistory( 90 | dataRDD, 91 | initialWeightsWithIntercept, 92 | lbfgsParamPool(lbfgsLrf.reg)) 93 | 94 | // Since the cost function is convex, the loss is guaranteed to be monotonically decreasing 95 | // with L-BFGS optimizer. 96 | // (SGD doesn't guarantee this, and the loss will be fluctuating in the optimization process.) 
97 | assert((lossLBFGS, lossLBFGS.tail).zipped.forall(_ > _), "loss should be monotonically decreasing.") 98 | 99 | val gdParamPool = new ParamMap() 100 | val gdGradient = new LogisticGradient(gdParamPool) 101 | val gdLrf = new LrLearnLBFGS(lbfgsParamPool, null) 102 | val gd = new GradientDescent(gdGradient, squaredL2Updater, gdParamPool) 103 | 104 | gdParamPool.put(gd.stepSize, 1.0) 105 | gdParamPool.put(gd.numIterations, 50) 106 | gdParamPool.put(gdLrf.reg, Array(0.2)) 107 | gdParamPool.put(gd.miniBatchFraction, 1.0) 108 | gdParamPool.put(gd.convergenceTol, 1E-12) 109 | 110 | val (weightGD, lossGD) = gd.optimizeWithHistory( 111 | dataRDD, 112 | initialWeightsWithIntercept, 113 | gdParamPool(gdLrf.reg)) 114 | 115 | assert(lossGD(0) ~= lossLBFGS(0) absTol 1E-5, 116 | "The first losses of LBFGS and GD should be the same.") 117 | 118 | // The 2% difference here is based on observation, but is not theoretically guaranteed. 119 | assert(lossGD.last ~= lossLBFGS.last relTol 0.03, 120 | "The last losses of LBFGS and GD should be within 3% difference.") 121 | 122 | assert( 123 | (weightLBFGS.asInstanceOf[VectorCoefficients].w0 ~= weightGD.asInstanceOf[VectorCoefficients].w0 relTol 0.03) 124 | && (weightLBFGS.asInstanceOf[VectorCoefficients].w(0) ~= weightGD.asInstanceOf[VectorCoefficients].w(0) relTol 0.03) 125 | && (weightLBFGS.asInstanceOf[VectorCoefficients].w(1) ~= weightGD.asInstanceOf[VectorCoefficients].w(1) relTol 0.03), 126 | "The weight differences between LBFGS and GD should be within 3%.") 127 | } 128 | 129 | 130 | test("The convergence criteria should work as we expect.") { 131 | val initialWeightsWithIntercept = new VectorCoefficients(2) 132 | initialWeightsWithIntercept.w.update(0, 0.0) 133 | initialWeightsWithIntercept.w.update(1, 0.0) 134 | 135 | val lbfgsParamPool = new ParamMap() 136 | val lbfgsGradient = new LogisticGradient(lbfgsParamPool) 137 | val lbfgsLrf = new LrLearnLBFGS(lbfgsParamPool, null) 138 | val lbfgs = new LBFGS(lbfgsGradient, squaredL2Updater, lbfgsParamPool) 139 | 140 | lbfgsParamPool.put(lbfgs.numIterations, 8) 141 | lbfgsParamPool.put(lbfgsLrf.reg, Array(0.0)) 142 | lbfgsParamPool.put(lbfgs.convergenceTol, 1E-12) 143 | lbfgsParamPool.put(lbfgs.numCorrections, 10) 144 | 145 | val (_, lossLBFGS1) = lbfgs.optimizeWithHistory( 146 | dataRDD, 147 | initialWeightsWithIntercept, 148 | lbfgsParamPool(lbfgsLrf.reg)) 149 | 150 | // Note that the first loss is computed with initial weights, 151 | // so the total numbers of loss will be numbers of iterations + 1 152 | assert(lossLBFGS1.length == 9) 153 | 154 | lbfgsParamPool.put(lbfgs.convergenceTol, 0.1) 155 | 156 | val (_, lossLBFGS2) = lbfgs.optimizeWithHistory( 157 | dataRDD, 158 | initialWeightsWithIntercept, 159 | lbfgsParamPool(lbfgsLrf.reg)) 160 | 161 | // Based on observation, lossLBFGS2 runs 3 iterations, no theoretically guaranteed. 162 | assert(lossLBFGS2.length == 4) 163 | assert((lossLBFGS2(2) - lossLBFGS2(3)) / lossLBFGS2(2) < 0.1) 164 | 165 | lbfgsParamPool.put(lbfgs.convergenceTol, 0.01) 166 | 167 | val (_, lossLBFGS3) = lbfgs.optimizeWithHistory( 168 | dataRDD, 169 | initialWeightsWithIntercept, 170 | lbfgsParamPool(lbfgsLrf.reg)) 171 | 172 | // With smaller convergenceTol, it takes more steps. 173 | assert(lossLBFGS3.length > lossLBFGS2.length) 174 | 175 | // Based on observation, lossLBFGS2 runs 5 iterations, no theoretically guaranteed. 
176 | assert(lossLBFGS3.length == 6) 177 | assert((lossLBFGS3(4) - lossLBFGS3(5)) / lossLBFGS3(4) < 0.01) 178 | } 179 | 180 | } 181 | -------------------------------------------------------------------------------- /src/test/scala/io/github/qf6101/mfm/util/MfmTestSparkSession.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.scalatest.{BeforeAndAfterAll, Suite} 5 | 6 | /** 7 | * Created by qfeng on 15-3-13. 8 | */ 9 | trait MfmTestSparkSession extends BeforeAndAfterAll { 10 | self: Suite => 11 | @transient var spark: SparkSession = _ 12 | 13 | override def beforeAll() { 14 | super.beforeAll() 15 | spark = SparkSession.builder() 16 | .master("local[2]").appName(this.getClass.toString) 17 | .getOrCreate() 18 | } 19 | 20 | override def afterAll() { 21 | if (spark != null) { 22 | spark.stop() 23 | } 24 | super.afterAll() 25 | } 26 | } -------------------------------------------------------------------------------- /src/test/scala/io/github/qf6101/mfm/util/ParamSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import org.apache.spark.ml.param.{Param, ParamMap} 4 | import org.scalatest.FunSuite 5 | 6 | /** 7 | * User: qfeng 8 | * Date: 15-8-11 下午4:44 9 | * Usage: 10 | */ 11 | class ParamSuite extends FunSuite { 12 | test("add two parameter sets") { 13 | val params = new ParamMap() 14 | val otherParams = new ParamMap() 15 | 16 | val param1: Param[Double] = new Param("ParamTest", "param1", "param1") 17 | val param2: Param[Double] = new Param("ParamTest", "param2", "param2") 18 | val param3: Param[Double] = new Param("ParamTest", "param3", "param3") 19 | val param4: Param[Double] = new Param("ParamTest", "param4", "param4") 20 | 21 | params.put[Double](param1, 1.0) 22 | params.put[Double](param1, 2.0) 23 | params.put[Double](param2, 5.0) 24 | 25 | otherParams.put[Double](param2, 10.1) 26 | otherParams.put[Double](param3, 7.0) 27 | otherParams.put[Double](param4, 8.0) 28 | 29 | params ++= otherParams 30 | 31 | assert(params(param1) == 2.0) 32 | //overwrite by other parameters 33 | assert(params(param2) == 10.1) 34 | assert(params(param3) == 7.0) 35 | assert(params(param4) == 8.0) 36 | //print parameters 37 | println(ParamUtil.paramsToString(params)) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/test/scala/io/github/qf6101/mfm/util/ParquetIOTest.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.apache.parquet.hadoop.ParquetReader 5 | import org.apache.parquet.tools.read.{SimpleReadSupport, SimpleRecord} 6 | 7 | /** 8 | * Created by qfeng on 16-9-23. 
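The convergence assertions in LBFGSSuite above all check the same relative-improvement pattern: iteration stops once (loss(i) - loss(i+1)) / loss(i) drops below convergenceTol, so the recorded loss history has one more entry than the number of iterations and a tighter tolerance needs more steps. A standalone restatement on made-up loss values (the real stopping logic lives in the Breeze-backed LBFGS optimizer, not here):

object ConvergenceTolSketch {
  def main(args: Array[String]): Unit = {
    val lossHistory = Seq(0.70, 0.40, 0.25, 0.20, 0.185, 0.1849)
    // Number of iterations until the relative improvement first falls below tol.
    def iterationsUntilConverged(losses: Seq[Double], tol: Double): Int =
      losses.sliding(2).indexWhere { case Seq(prev, cur) => (prev - cur) / prev < tol } + 1
    println(iterationsUntilConverged(lossHistory, 0.1))   // 4 iterations
    println(iterationsUntilConverged(lossHistory, 0.01))  // 5 iterations: tighter tolerance, more steps
  }
}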
9 |   */
10 | object ParquetIOTest {
11 |   def main(args: Array[String]) {
12 |     val reader = ParquetReader.builder[SimpleRecord](new SimpleReadSupport(),
13 |       new Path("test_data/output/mnist/coefficient/coeff_data/0/coeff_data/w"))
14 |       .build()
15 |     var value = reader.read()
16 |     while (value != null) {
17 |       println(value.getValues.get(0).getValue.asInstanceOf[Double])
18 |       value = reader.read()
19 |     }
20 |   }
21 | }
22 | 
--------------------------------------------------------------------------------
/src/test/scala/io/github/qf6101/mfm/util/TestingUtils.scala:
--------------------------------------------------------------------------------
1 | package io.github.qf6101.mfm.util
2 | 
3 | /**
4 |   * Created by qfeng on 15-3-13.
5 |   */
6 | 
7 | import org.apache.spark.mllib.linalg.{Matrix, Vector}
8 | import org.scalatest.exceptions.TestFailedException
9 | 
10 | object TestingUtils {
11 | 
12 |   val ABS_TOL_MSG = " using absolute tolerance"
13 |   val REL_TOL_MSG = " using relative tolerance"
14 | 
15 |   /**
16 |     * Private helper function for comparing two values using relative tolerance.
17 |     * Note that if x or y is extremely close to zero, i.e., smaller than Double.MinPositiveValue,
18 |     * the relative tolerance is meaningless, so an exception will be raised to warn users.
19 |     */
20 |   private def RelativeErrorComparison(x: Double, y: Double, eps: Double): Boolean = {
21 |     val absX = math.abs(x)
22 |     val absY = math.abs(y)
23 |     val diff = math.abs(x - y)
24 |     if (x == y) {
25 |       true
26 |     } else if (absX < Double.MinPositiveValue || absY < Double.MinPositiveValue) {
27 |       throw new TestFailedException(
28 |         s"$x or $y is extremely close to zero, so the relative tolerance is meaningless.", 0)
29 |     } else {
30 |       diff < eps * math.min(absX, absY)
31 |     }
32 |   }
33 | 
34 |   /**
35 |     * Private helper function for comparing two values using absolute tolerance.
36 |     */
37 |   private def AbsoluteErrorComparison(x: Double, y: Double, eps: Double): Boolean = {
38 |     math.abs(x - y) < eps
39 |   }
40 | 
41 |   case class CompareDoubleRightSide(
42 |     fun: (Double, Double, Double) => Boolean, y: Double, eps: Double, method: String)
43 | 
44 |   /**
45 |     * Implicit class for comparing two double values using relative tolerance or absolute tolerance.
46 |     */
47 |   implicit class DoubleWithAlmostEquals(val x: Double) {
48 | 
49 |     /**
50 |       * When the difference of the two values is within eps, returns true; otherwise, returns false.
51 |       */
52 |     def ~=(r: CompareDoubleRightSide): Boolean = r.fun(x, r.y, r.eps)
53 | 
54 |     /**
55 |       * When the difference of the two values is within eps, returns false; otherwise, returns true.
56 |       */
57 |     def !~=(r: CompareDoubleRightSide): Boolean = !r.fun(x, r.y, r.eps)
58 | 
59 |     /**
60 |       * Throws an exception when the difference of the two values is NOT within eps;
61 |       * otherwise, returns true.
62 |       */
63 |     def ~==(r: CompareDoubleRightSide): Boolean = {
64 |       if (!r.fun(x, r.y, r.eps)) {
65 |         throw new TestFailedException(
66 |           s"Expected $x and ${r.y} to be within ${r.eps}${r.method}.", 0)
67 |       }
68 |       true
69 |     }
70 | 
71 |     /**
72 |       * Throws an exception when the difference of the two values is within eps; otherwise, returns true.
73 |       */
74 |     def !~==(r: CompareDoubleRightSide): Boolean = {
75 |       if (r.fun(x, r.y, r.eps)) {
76 |         throw new TestFailedException(
77 |           s"Did not expect $x and ${r.y} to be within ${r.eps}${r.method}.", 0)
78 |       }
79 |       true
80 |     }
81 | 
82 |     /**
83 |       * Comparison using absolute tolerance.
84 |       */
85 |     def absTol(eps: Double): CompareDoubleRightSide = CompareDoubleRightSide(AbsoluteErrorComparison,
86 |       x, eps, ABS_TOL_MSG)
87 | 
88 |     /**
89 |       * Comparison using relative tolerance.
90 |       */
91 |     def relTol(eps: Double): CompareDoubleRightSide = CompareDoubleRightSide(RelativeErrorComparison,
92 |       x, eps, REL_TOL_MSG)
93 | 
94 |     override def toString = x.toString
95 |   }
96 | 
97 |   case class CompareVectorRightSide(
98 |     fun: (Vector, Vector, Double) => Boolean, y: Vector, eps: Double, method: String)
99 | 
100 |   /**
101 |     * Implicit class for comparing two vectors using relative tolerance or absolute tolerance.
102 |     */
103 |   implicit class VectorWithAlmostEquals(val x: Vector) {
104 | 
105 |     /**
106 |       * When the difference of the two vectors is within eps, returns true; otherwise, returns false.
107 |       */
108 |     def ~=(r: CompareVectorRightSide): Boolean = r.fun(x, r.y, r.eps)
109 | 
110 |     /**
111 |       * When the difference of the two vectors is within eps, returns false; otherwise, returns true.
112 |       */
113 |     def !~=(r: CompareVectorRightSide): Boolean = !r.fun(x, r.y, r.eps)
114 | 
115 |     /**
116 |       * Throws an exception when the difference of the two vectors is NOT within eps;
117 |       * otherwise, returns true.
118 |       */
119 |     def ~==(r: CompareVectorRightSide): Boolean = {
120 |       if (!r.fun(x, r.y, r.eps)) {
121 |         throw new TestFailedException(
122 |           s"Expected $x and ${r.y} to be within ${r.eps}${r.method} for all elements.", 0)
123 |       }
124 |       true
125 |     }
126 | 
127 |     /**
128 |       * Throws an exception when the difference of the two vectors is within eps; otherwise, returns true.
129 |       */
130 |     def !~==(r: CompareVectorRightSide): Boolean = {
131 |       if (r.fun(x, r.y, r.eps)) {
132 |         throw new TestFailedException(
133 |           s"Did not expect $x and ${r.y} to be within ${r.eps}${r.method} for all elements.", 0)
134 |       }
135 |       true
136 |     }
137 | 
138 |     /**
139 |       * Comparison using absolute tolerance.
140 |       */
141 |     def absTol(eps: Double): CompareVectorRightSide = CompareVectorRightSide(
142 |       (x: Vector, y: Vector, eps: Double) => {
143 |         x.toArray.zip(y.toArray).forall(x => x._1 ~= x._2 absTol eps)
144 |       }, x, eps, ABS_TOL_MSG)
145 | 
146 |     /**
147 |       * Comparison using relative tolerance. Note that comparing against a sparse vector
148 |       * with zero-valued elements will raise an exception, because it involves
149 |       * comparing against zero.
150 |       */
151 |     def relTol(eps: Double): CompareVectorRightSide = CompareVectorRightSide(
152 |       (x: Vector, y: Vector, eps: Double) => {
153 |         x.toArray.zip(y.toArray).forall(x => x._1 ~= x._2 relTol eps)
154 |       }, x, eps, REL_TOL_MSG)
155 | 
156 |     override def toString = x.toString
157 |   }
158 | 
159 |   case class CompareMatrixRightSide(
160 |     fun: (Matrix, Matrix, Double) => Boolean, y: Matrix, eps: Double, method: String)
161 | 
162 |   /**
163 |     * Implicit class for comparing two matrices using relative tolerance or absolute tolerance.
164 |     */
165 |   implicit class MatrixWithAlmostEquals(val x: Matrix) {
166 | 
167 |     /**
168 |       * When the difference of the two matrices is within eps, returns true; otherwise, returns false.
169 |       */
170 |     def ~=(r: CompareMatrixRightSide): Boolean = r.fun(x, r.y, r.eps)
171 | 
172 |     /**
173 |       * When the difference of the two matrices is within eps, returns false; otherwise, returns true.
174 |       */
175 |     def !~=(r: CompareMatrixRightSide): Boolean = !r.fun(x, r.y, r.eps)
176 | 
177 |     /**
178 |       * Throws an exception when the difference of the two matrices is NOT within eps;
179 |       * otherwise, returns true.
180 |       */
181 |     def ~==(r: CompareMatrixRightSide): Boolean = {
182 |       if (!r.fun(x, r.y, r.eps)) {
183 |         throw new TestFailedException(
184 |           s"Expected \n$x\n and \n${r.y}\n to be within ${r.eps}${r.method} for all elements.", 0)
185 |       }
186 |       true
187 |     }
188 | 
189 |     /**
190 |       * Throws an exception when the difference of the two matrices is within eps; otherwise, returns true.
191 |       */
192 |     def !~==(r: CompareMatrixRightSide): Boolean = {
193 |       if (r.fun(x, r.y, r.eps)) {
194 |         throw new TestFailedException(
195 |           s"Did not expect \n$x\n and \n${r.y}\n to be within " +
196 |             s"${r.eps}${r.method} for all elements.", 0)
197 |       }
198 |       true
199 |     }
200 | 
201 |     /**
202 |       * Comparison using absolute tolerance.
203 |       */
204 |     def absTol(eps: Double): CompareMatrixRightSide = CompareMatrixRightSide(
205 |       (x: Matrix, y: Matrix, eps: Double) => {
206 |         x.toArray.zip(y.toArray).forall(x => x._1 ~= x._2 absTol eps)
207 |       }, x, eps, ABS_TOL_MSG)
208 | 
209 |     /**
210 |       * Comparison using relative tolerance. Note that comparing against a sparse matrix
211 |       * with zero-valued elements will raise an exception, because it involves
212 |       * comparing against zero.
213 |       */
214 |     def relTol(eps: Double): CompareMatrixRightSide = CompareMatrixRightSide(
215 |       (x: Matrix, y: Matrix, eps: Double) => {
216 |         x.toArray.zip(y.toArray).forall(x => x._1 ~= x._2 relTol eps)
217 |       }, x, eps, REL_TOL_MSG)
218 | 
219 |     override def toString = x.toString
220 |   }
221 | 
222 | }
--------------------------------------------------------------------------------
/test_data/input/README.txt:
--------------------------------------------------------------------------------
1 | Download the datasets from the LIBSVM website: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/
--------------------------------------------------------------------------------
/test_data/input/mnist/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qf6101/multinomial-factorization-machines/405c0c1c4c7a676226cebcfc7ed682627948c01c/test_data/input/mnist/.gitkeep
--------------------------------------------------------------------------------
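
Usage note (illustrative sketch, not a file in the repository): the tolerance DSL in TestingUtils is what the optimization suites above rely on for assertions such as `lossGD(0) ~= lossLBFGS(0) absTol 1E-5`. The sketch below, with a hypothetical suite name, shows how the implicit comparison operators and the MfmTestSparkSession trait would typically be combined in a new test; it assumes only the classes defined in this repository.

package io.github.qf6101.mfm.util

import org.scalatest.FunSuite
import io.github.qf6101.mfm.util.TestingUtils._

// Hypothetical suite, for illustration only.
class ToleranceUsageSketch extends FunSuite with MfmTestSparkSession {
  test("absolute and relative tolerance comparisons") {
    // absTol: |1.0 - 1.000001| < 1E-5, so the comparison holds.
    assert(1.0 ~= 1.000001 absTol 1E-5)
    // relTol (throwing variant ~==): relative difference is 0.5 / 100.0 = 0.005 < 0.01.
    assert(100.0 ~== 100.5 relTol 0.01)
    // !~= : the two values differ by more than the tolerance.
    assert(1.0 !~= 1.1 absTol 1E-5)
    // The SparkSession provided by MfmTestSparkSession can be used alongside the DSL.
    assert(spark.sparkContext.parallelize(Seq(1.0, 2.0)).sum() ~= 3.0 absTol 1E-9)
  }
}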