├── LICENSE ├── README.md ├── pom.xml ├── src ├── main │ └── scala │ │ └── io │ │ └── github │ │ └── qf6101 │ │ └── mfm │ │ ├── baseframe │ │ ├── Coefficients.scala │ │ ├── MLLearner.scala │ │ ├── MLModel.scala │ │ ├── ModelParam.scala │ │ ├── binomial │ │ │ ├── BinLearner.scala │ │ │ ├── BinModel.scala │ │ │ └── BinModelParam.scala │ │ └── mutinomial │ │ │ ├── MultiLearner.scala │ │ │ ├── MultiModel.scala │ │ │ └── MultiModelParam.scala │ │ ├── factorization │ │ ├── binomial │ │ │ ├── FmCoefficients.scala │ │ │ ├── FmGradient.scala │ │ │ ├── FmLearnSGD.scala │ │ │ ├── FmModel.scala │ │ │ └── FmModelParam.scala │ │ └── multinomial │ │ │ ├── MfmCoefficients.scala │ │ │ ├── MfmGradient.scala │ │ │ ├── MfmLearnSGD.scala │ │ │ ├── MfmModel.scala │ │ │ └── MfmModelParam.scala │ │ ├── logisticregression │ │ ├── LogisticGradient.scala │ │ ├── LrLearnLBFGS.scala │ │ ├── LrLearnSGD.scala │ │ ├── LrModel.scala │ │ ├── LrModelParam.scala │ │ └── VectorCoefficients.scala │ │ ├── optimization │ │ ├── DecreasingStrategy.scala │ │ ├── Gradient.scala │ │ ├── GradientDescent.scala │ │ ├── LBFGS.scala │ │ ├── LBFGSParam.scala │ │ ├── Optimizer.scala │ │ ├── SGDParam.scala │ │ └── Updater.scala │ │ ├── tuning │ │ ├── BinCrossValidation.scala │ │ ├── BinParamGridBuilder.scala │ │ ├── BinaryClassificationMetrics.scala │ │ └── RegressionMetrics.scala │ │ └── util │ │ ├── GaussianRandom.scala │ │ ├── HDFSUtil.scala │ │ ├── LoadDSUtil.scala │ │ ├── Logging.scala │ │ ├── NumericParser.scala │ │ ├── ParamUtil.scala │ │ └── VectorConverter.scala └── test │ ├── resources │ └── log4j.properties │ └── scala │ └── io │ └── github │ └── qf6101 │ └── mfm │ ├── factorization │ ├── binomial │ │ └── FmSuite.scala │ └── multinomial │ │ ├── MfmCoefficientsSuite.scala │ │ └── MfmSuite.scala │ ├── optimization │ ├── GradientDescentSuite.scala │ └── LBFGSSuite.scala │ └── util │ ├── MfmTestSparkSession.scala │ ├── ParamSuite.scala │ ├── ParquetIOTest.scala │ └── TestingUtils.scala └── test_data └── input ├── README.txt ├── a1a ├── a1a └── a1a.t └── mnist └── .gitkeep /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multinomial Factorization Machines 2 | 3 | ## Brief Description 4 | 5 | This project implements binomial and multinomial factorization machines. Factorization machines are a generic approach that combines the generality of feature engineering with the superiority of factorization models in estimating interactions between categorical variables of large domains. Please refer to [\[Steffen Rendle (2010)\]](http://www.inf.uni-konstanz.de/~rendle/pdf/Rendle2010FM.pdf) for more detail. 6 | 7 | This implementation is based on Spark 2.0.0, in contrast to the well-known standalone implementation, [libfm](http://www.libfm.org/). Some auxiliary code (e.g., the optimization and Logging utilities) was adapted from Spark's private internals. 8 | 9 | ## Binomial Factorization Machines (FM) 10 | 11 | FM is designed for binary classification problems, like the standard [libfm](http://www.libfm.org/). Please refer to [src/test/scala/io/github/qf6101/mfm/factorization/binomial/FmSuite.scala](src/test/scala/io/github/qf6101/mfm/factorization/binomial/FmSuite.scala 12 | ) for detailed usage. 13 | 14 | > Note: The implementation takes the labels as +1/-1. 15 | 16 | ## Multinomial Factorization Machines (MFM) 17 | 18 | MFM is designed for multi-class classification problems and uses softmax as its hypothesis. Please refer to [src/test/scala/io/github/qf6101/mfm/factorization/multinomial/MfmSuite.scala 19 | ](src/test/scala/io/github/qf6101/mfm/factorization/multinomial/MfmSuite.scala 20 | ) for detailed usage. 21 | 22 | > Note: The implementation takes the labels as 0, 1, 2, etc. -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | io.github.qf6101 8 | multinomial-factorization-machines 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 2.11.7 13 | 2.11 14 | 2.0.0 15 | 16 | 17 | 18 | 19 | org.apache.parquet 20 | parquet-tools 21 | 1.7.0 22 | 23 | 24 | com.github.pathikrit 25 | better-files_${scala.binary.version} 26 | 2.16.0 27 | 28 | 29 | com.github.scopt 30 | scopt_${scala.binary.version} 31 | 3.5.0 32 | 33 | 34 | joda-time 35 | joda-time 36 | 2.7 37 | 38 | 39 | org.joda 40 | joda-convert 41 | 1.7 42 | 43 | 44 | org.apache.spark 45 | spark-mllib_${scala.binary.version} 46 | ${spark.version} 47 | provided 48 | 49 | 50 | org.scalatest 51 | scalatest_${scala.binary.version} 52 | 3.0.0 53 | test 54 | 55 | 56 | 57 | 58 | 59 | 60 | org.apache.maven.plugins 61 | maven-compiler-plugin 62 | 2.0.2 63 | 64 | 1.6 65 | 1.6 66 | UTF-8 67 | 68 | 69 | 70 | org.apache.maven.plugins 71 | maven-jar-plugin 72 | 2.3.1 73 | 74 | 75 | maven-assembly-plugin 76 | 77 | 78 | jar-with-dependencies 79 | 80 | 81 | 82 | 83 | make-assembly 84 | package 85 | 86 | single 87 | 88 | 89 | 90 | 91 | 92 | net.alchim31.maven 93 | scala-maven-plugin 94 | 3.2.2 95 | 96 | 97 | 98 | compile 99 | testCompile 100 | 101 | 102 | 103 | 104 | 105 | -Xms64m 106 | -Xmx1024m 107 | 108 | ${scala.binary.version} 109 | ${scala.version} 110 | 111 | 112 | 113 | org.apache.maven.plugins 114 | maven-surefire-plugin 115 | 2.7 116 | 117 | true 118 | 119 | 120 | 121 | org.scalatest 122 | scalatest-maven-plugin 123 | 1.0 124 | 125 | ${project.build.directory}/surefire-reports 126 | .
127 | WDF TestSuite.txt 128 | 129 | 130 | 131 | test 132 | 133 | test 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/Coefficients.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.apache.parquet.hadoop.ParquetReader 5 | import org.apache.parquet.tools.read.{SimpleReadSupport, SimpleRecord} 6 | 7 | import scala.collection.mutable.ListBuffer 8 | 9 | /** 10 | * Created by qfeng on 15-3-12. 11 | */ 12 | 13 | /** 14 | * 模型系数,抽象基类 15 | */ 16 | abstract class Coefficients extends Serializable { 17 | /** 18 | * 只复制this的结构(比如参数个数),不复制内容 19 | * 20 | * @return 复制的拷贝 21 | */ 22 | def copyEmpty(): Coefficients 23 | 24 | /** 25 | * 同时复制this的结构和内容 26 | * 27 | * @return 复制的拷贝 28 | */ 29 | def copy: Coefficients 30 | 31 | /** 32 | * 对应系数加法,加至this上 33 | * 34 | * @param other 加数 35 | * @return this 36 | */ 37 | def +=(other: Coefficients): Coefficients 38 | 39 | /** 40 | * 对应系数减法,减至this上 41 | * 42 | * @param other 减数 43 | * @return this 44 | */ 45 | def -=(other: Coefficients): Coefficients 46 | 47 | /** 48 | * 49 | * 对应系数加法,加至复制this的类上 50 | * 51 | * @param other 加数 52 | * @return 加法结果(拷贝) 53 | */ 54 | def +(other: Coefficients): Coefficients = { 55 | val result = this.copy 56 | result += other 57 | result 58 | } 59 | 60 | /** 61 | * 对应系数加上同一实数,加至复制this的类上 62 | * 63 | * @param addend 加数 64 | * @return 加法结果(拷贝) 65 | */ 66 | def +(addend: Double): Coefficients 67 | 68 | /** 69 | * 对应系数减上同一实数,减至复制this的类上 70 | * 71 | * @param minuend 减数 72 | * @return 减法结果(拷贝) 73 | */ 74 | def -(minuend: Double): Coefficients = { 75 | this.copy + (-minuend) 76 | } 77 | 78 | /** 79 | * 对应系数除上同一实数,加至复制this的类上 80 | * 81 | * @param dividend 除数 82 | * @return 除法结果 83 | */ 84 | def /(dividend: Double): Coefficients 85 | 86 | /** 87 | * 对应系数乘上同一实数,加至复制this的类上 88 | * 89 | * @param multiplier 乘数 90 | * @return 乘法结果 91 | */ 92 | def *(multiplier: Double): Coefficients 93 | 94 | /** 95 | * 计算L2的正则值 96 | * 97 | * @param reg 正则参数 98 | * @return 参数加权后的L2正则值 99 | */ 100 | def L2RegValue(reg: Array[Double]): Double 101 | 102 | /** 103 | * 计算L2的正则梯度值 104 | * 105 | * @param reg 正则参数 106 | * @return 参数加权后的L2正则梯度值 107 | */ 108 | def L2RegGradient(reg: Array[Double]): Coefficients 109 | 110 | /** 111 | * 用L1稀疏化系数 112 | * 113 | * @param regParam 正则参数值 114 | * @param stepSize 学习率 115 | * @return 稀疏化后的系数 116 | */ 117 | def L1Shrink(regParam: Array[Double], stepSize: Double): Coefficients 118 | 119 | /** 120 | * 计算L1的正则值 121 | * 122 | * @param regParam 正则参数 123 | * @return 参数绝对值加权后的L1正则值 124 | */ 125 | def L1RegValue(regParam: Array[Double]): Double 126 | 127 | /** 128 | * 计算系数的2范数 129 | * sum(abs(A).^p)^(1/p) where p=2 130 | * 131 | * @return 系数的2范数 132 | */ 133 | def norm: Double 134 | 135 | /** 136 | * 计算两组系数差异的2范数 137 | * 138 | * @param other 另一组系数 139 | * @return 差异的2范数值 140 | */ 141 | def normDiff(other: Coefficients): Double = { 142 | (this - other).norm 143 | } 144 | 145 | /** 146 | * 对应系数减法,减至复制this的类上 147 | * 148 | * @param other 减数 149 | * @return 减法结果(拷贝) 150 | */ 151 | def -(other: Coefficients): Coefficients = { 152 | val result = this.copy 153 | result -= other 154 | result 155 | } 156 | 157 | /** 158 | * 保存系数至文件 159 | * 160 | * @param location 文件位置 161 | */ 162 | def save(location: String): Unit = { 163 | saveMeta(location + "/" + Coefficients.namingMetaFile) 164 | 
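// The coefficients are persisted in two parts: "<location>/coeff_meta" holds the JSON metadata
// written by saveMeta above, and "<location>/coeff_data" holds the numeric values written by
// saveData below (the concrete layout is up to the subclass, e.g. FmCoefficients writes parquet).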
saveData(location + "/" + Coefficients.namingDataFile) 165 | } 166 | 167 | /** 168 | * 保存元数据至文件 169 | * 170 | * @param location 文件位置 171 | */ 172 | def saveMeta(location: String): Unit 173 | 174 | /** 175 | * 保存数据至文件 176 | * 177 | * @param location 文件位置 178 | */ 179 | def saveData(location: String): Unit 180 | 181 | /** 182 | * 与另一个系数是否相等 183 | * 184 | * @param other 另一个系数 185 | * @return 是否相等 186 | */ 187 | def equals(other: Coefficients): Boolean 188 | } 189 | 190 | /** 191 | * 静态系数对象 192 | */ 193 | object Coefficients { 194 | val namingCoeffType: String = "coeff_type" 195 | val namingMetaFile: String = "coeff_meta" 196 | val namingDataFile: String = "coeff_data" 197 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/MLLearner.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe 2 | 3 | import org.apache.spark.ml.param.ParamMap 4 | 5 | /** 6 | * Created by qfeng on 16-9-9. 7 | */ 8 | 9 | /** 10 | * 学习器基类 11 | * 12 | * @param params 参数池 13 | */ 14 | abstract class MLLearner(val params: ParamMap) extends Serializable { 15 | /** 16 | * 更新参数池 17 | * 18 | * @param updatingParams 更新参数 19 | */ 20 | def updateParams(updatingParams: ParamMap): Unit = { 21 | params ++= updatingParams 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/MLModel.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe 2 | 3 | import org.apache.spark.ml.param.ParamMap 4 | 5 | /** 6 | * Created by qfeng on 16-9-8. 7 | */ 8 | 9 | /** 10 | * 机器学习模型基类 11 | * 12 | * @param paramMeta 模型参赛 13 | * @param coeffs 模型系数 14 | * @param params 参数池(保存参数的值) 15 | */ 16 | abstract class MLModel(val paramMeta: ModelParam, 17 | val coeffs: Coefficients, 18 | val params: ParamMap) extends Serializable { 19 | /** 20 | * 保存模型文件 21 | * 22 | * @param location 模型文件的位置 23 | */ 24 | def save(location: String): Unit = { 25 | //保存模型系数 26 | coeffs.save(location + "/" + MLModel.namingCoeffFile) 27 | //保存模型参数 28 | paramMeta.save(location + "/" + MLModel.namingParamFile, params) 29 | } 30 | 31 | /** 32 | * 模型内容是否相同 33 | * 34 | * @param other 另一个模型 35 | * @return 内容是否相同 36 | */ 37 | def equals(other: MLModel): Boolean 38 | } 39 | 40 | /** 41 | * 静态模型对象 42 | */ 43 | object MLModel { 44 | val namingCoeffFile = "coefficient" 45 | val namingParamFile = "params" 46 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/ModelParam.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe 2 | 3 | import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} 4 | import org.apache.spark.sql.SparkSession 5 | import org.json4s.JsonAST 6 | import org.json4s.JsonDSL._ 7 | import org.json4s.jackson.JsonMethods._ 8 | 9 | /** 10 | * Created by qfeng on 15-4-2. 
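 *
 * The trait below serializes its parameter values with toJSON and writes them as a single-line
 * JSON text file via save(). For the two base parameters this yields, with illustrative values:
 * {{{
 *   {"initMean": 0.0, "initStdev": 0.01}
 * }}}
 * Subclasses (e.g. FmModelParam) extend this JSON object with their own fields.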
11 | */ 12 | 13 | /** 14 | * 模型参数 15 | */ 16 | trait ModelParam extends Serializable { 17 | val initMean: Param[Double] = new Param("ModelParam", "initMean", "使用高斯分布,初始化参数值,均值", ParamValidators.inRange(0, 1)) 18 | val initStdev: Param[Double] = new Param("ModelParam", "initStdev", "使用高斯分布,初始化参数值,标准差值", ParamValidators.inRange(0, 1)) 19 | 20 | 21 | /** 22 | * 将模型参数值保存至文件 23 | * 24 | * @param location 保存位置 25 | * @param params 参数池 26 | */ 27 | def save(location: String, params: ParamMap): Unit = { 28 | SparkSession.builder().getOrCreate().sparkContext. 29 | makeRDD(List(compact(render(this.toJSON(params))))).repartition(1).saveAsTextFile(location) 30 | } 31 | 32 | /** 33 | * Transform parameters to json object 34 | * 35 | * @return parameters in json format 36 | */ 37 | def toJSON(params: ParamMap): JsonAST.JObject = { 38 | (initMean.name -> params(initMean)) ~ (initStdev.name -> params(initStdev)) 39 | } 40 | } 41 | 42 | /** 43 | * 静态模型参数对象 44 | */ 45 | object ModelParam { 46 | val namingParamType = "param_type" 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/binomial/BinLearner.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe.binomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.MLLearner 5 | import org.apache.spark.ml.param.ParamMap 6 | import org.apache.spark.rdd.RDD 7 | 8 | /** 9 | * Created by qfeng on 15-3-27. 10 | */ 11 | 12 | /** 13 | * 二分学习器基类 14 | * 15 | * @param params 参数池 16 | */ 17 | abstract class BinLearner(override val params: ParamMap) extends MLLearner(params) { 18 | /** 19 | * 训练二分模型 20 | * 21 | * @param dataset 训练集 22 | * @return 二分模型 23 | */ 24 | def train(dataset: RDD[(Double, SparseVector[Double])]): BinModel 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/binomial/BinModel.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe.binomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.{Coefficients, MLModel} 5 | import io.github.qf6101.mfm.tuning.BinaryClassificationMetrics 6 | import io.github.qf6101.mfm.util.Logging 7 | import org.apache.spark.ml.param.ParamMap 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.storage.StorageLevel 10 | 11 | import scala.util.control.Breaks 12 | 13 | /** 14 | * Created by qfeng on 15-3-27. 
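 *
 * The binomial model below maps each sample to a score in (0, 1); selectThreshold then sweeps
 * candidate thresholds from 0.05 to 0.95 in steps of 0.05, stops early once the F1-score becomes
 * NaN, keeps the threshold with the highest F1-score, and stores it in binaryThreshold.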
15 | */ 16 | 17 | /** 18 | * 二分模型基类 19 | * 20 | * @param paramMeta 模型参赛 21 | * @param coeffs 模型系数 22 | * @param params 参数池(保存参数的值) 23 | */ 24 | abstract class BinModel(override val paramMeta: BinModelParam, 25 | override val coeffs: Coefficients, 26 | override val params: ParamMap) 27 | extends MLModel(paramMeta, coeffs, params) with Logging with Serializable { 28 | //设置默认的阈值为0.5 29 | params.put(paramMeta.binaryThreshold, 0.5) 30 | 31 | /** 32 | * 对输入数据进行预测 33 | * 34 | * @param data 输入数据 35 | * @return 预测值(0~1) 36 | */ 37 | def predict(data: SparseVector[Double]): Double 38 | 39 | /** 40 | * 对输入数据集进行预测 41 | * 42 | * @param dataSet 输入数据集 43 | * @return 预测值集合(0~1) 44 | */ 45 | def predict(dataSet: RDD[SparseVector[Double]]): RDD[Double] = { 46 | dataSet.map(predict) 47 | } 48 | 49 | /** 50 | * 选择二分分离器的阈值(固定AUC,选择F1-score最大的阈值) 51 | * 52 | * @param dataSet 数据集合 53 | */ 54 | def selectThreshold(dataSet: RDD[(Double, SparseVector[Double])]): Array[BinaryClassificationMetrics] = { 55 | //生成对数据集的预测结果并持久化 56 | val scoreAndLabels = dataSet.map { case (label, data) => 57 | (predict(data), label) 58 | }.persist(StorageLevel.MEMORY_AND_DISK_SER) 59 | //以0.05为间隔,尝试每个threshold,选择F1_score最大的threshold 60 | //直至遇到F1_score为NaN,停止尝试 61 | var maxF1Score = Double.MinValue 62 | var selectedThreshold = 0.5 63 | val loop = new Breaks 64 | loop.breakable { 65 | for (tryThreshold <- 0.05 until 1.0 by 0.05) { 66 | val metrics = new BinaryClassificationMetrics(scoreAndLabels, tryThreshold) 67 | logDebug(s"threshold selection => f1-score: ${"%1.4f".format(metrics.f1_scores._1)}, threshold: ${"%1.2f".format(tryThreshold)}") 68 | if (metrics.f1_scores._1.isNaN) { 69 | loop.break() 70 | } else if (metrics.f1_scores._1 > maxF1Score) { 71 | maxF1Score = metrics.f1_scores._1 72 | selectedThreshold = tryThreshold 73 | } 74 | } 75 | } 76 | //设置选择得到的threshold 77 | params.put(paramMeta.binaryThreshold, selectedThreshold) 78 | //计算最终的度量指标 79 | val finalMetrics = new BinaryClassificationMetrics(scoreAndLabels, selectedThreshold) 80 | logInfo(s"selected threshold: $selectedThreshold, metrics: ${finalMetrics.toString}}") 81 | //解除持久化 82 | scoreAndLabels.unpersist() 83 | Array(finalMetrics) 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/binomial/BinModelParam.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe.binomial 2 | 3 | import io.github.qf6101.mfm.baseframe.ModelParam 4 | import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} 5 | import org.json4s.JsonAST 6 | import org.json4s.JsonDSL._ 7 | 8 | /** 9 | * Created by qfeng on 16-9-8. 
10 | */ 11 | trait BinModelParam extends ModelParam { 12 | //default value: 0.5 13 | val binaryThreshold: Param[Double] = new Param("BinModelParam", "binaryThreshold", "threshold for binary classification", ParamValidators.inRange(0, 1, false, false)) 14 | 15 | /** 16 | * Transform parameters to json object 17 | * 18 | * @return parameters in json format 19 | */ 20 | override def toJSON(params: ParamMap): JsonAST.JObject = { 21 | super.toJSON(params) ~ (binaryThreshold.name -> params(binaryThreshold)) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/mutinomial/MultiLearner.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe.mutinomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.MLLearner 5 | import org.apache.spark.ml.param.ParamMap 6 | import org.apache.spark.rdd.RDD 7 | 8 | /** 9 | * Created by qfeng on 16-9-9. 10 | */ 11 | 12 | /** 13 | * 多分类学习器基类 14 | * 15 | * @param params 参数池 16 | */ 17 | abstract class MultiLearner(override val params: ParamMap) extends MLLearner(params) { 18 | /** 19 | * 训练对应模型 20 | * 21 | * @param dataset 训练集 22 | * @return 模型 23 | */ 24 | def train(dataset: RDD[(Double, SparseVector[Double])]): MultiModel 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/mutinomial/MultiModel.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe.mutinomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.{Coefficients, MLModel} 5 | import io.github.qf6101.mfm.util.Logging 6 | import org.apache.spark.ml.param.ParamMap 7 | import org.apache.spark.rdd.RDD 8 | 9 | /** 10 | * Created by qfeng on 16-9-9. 11 | */ 12 | 13 | /** 14 | * 多分类模型基类 15 | * 16 | * @param paramMeta 模型参赛 17 | * @param coeffs 模型系数 18 | * @param params 参数池(保存参数的值) 19 | */ 20 | abstract class MultiModel(override val paramMeta: MultiModelParam, 21 | override val coeffs: Coefficients, 22 | override val params: ParamMap) 23 | extends MLModel(paramMeta, coeffs, params) with Logging with Serializable { 24 | /** 25 | * 对输入数据进行预测 26 | * 27 | * @param data 输入数据 28 | * @return 预测值向量(0~1) 29 | */ 30 | def predict(data: SparseVector[Double]): Array[Double] 31 | 32 | /** 33 | * 对输入数据集进行预测 34 | * 35 | * @param dataSet 输入数据集 36 | * @return 预测值集合(0~1) 37 | */ 38 | def predict(dataSet: RDD[SparseVector[Double]]): RDD[Array[Double]] = { 39 | dataSet.map(predict) 40 | } 41 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/baseframe/mutinomial/MultiModelParam.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.baseframe.mutinomial 2 | 3 | import io.github.qf6101.mfm.baseframe.ModelParam 4 | 5 | /** 6 | * Created by qfeng on 16-9-9. 
7 | */ 8 | 9 | /** 10 | * 多分类模型参数 11 | */ 12 | trait MultiModelParam extends ModelParam 13 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/binomial/FmCoefficients.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.binomial 2 | 3 | import better.files.File 4 | import breeze.linalg.{DenseMatrix, DenseVector} 5 | import io.github.qf6101.mfm.baseframe.Coefficients 6 | import io.github.qf6101.mfm.util.GaussianRandom 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.parquet.hadoop.ParquetReader 9 | import org.apache.parquet.tools.read.{SimpleReadSupport, SimpleRecord} 10 | import org.apache.spark.sql.SparkSession 11 | import org.json4s.DefaultFormats 12 | import org.json4s.JsonDSL._ 13 | import org.json4s.jackson.JsonMethods._ 14 | 15 | import scala.collection.mutable.ListBuffer 16 | import scala.math._ 17 | 18 | /** 19 | * Created by qfeng on 15-3-12. 20 | */ 21 | 22 | /** 23 | * Factorization Machine模型系数 24 | * 25 | * @param initMean 随机初始值均值 26 | * @param initStdev 随机初始值标准差 27 | * @param numFeatures 特征个数 28 | * @param numFactors 因子个数 29 | * @param k0 是否需要处理截距 30 | * @param k1 是否需要处理一阶参数 31 | * @param k2 是否需要处理二阶参数 32 | */ 33 | class FmCoefficients(val initMean: Double, 34 | val initStdev: Double, 35 | var numFeatures: Int, 36 | var numInteractFeatures: Int, 37 | var numFactors: Int, 38 | val k0: Boolean, 39 | val k1: Boolean, 40 | val k2: Boolean) extends Coefficients { 41 | var w0 = 0.0 42 | var w = DenseVector.zeros[Double](numFeatures) 43 | var v = GaussianRandom.randDenseMatrix(initMean, initStdev, numInteractFeatures, numFactors) 44 | 45 | /** 46 | * 用breeze稀疏向量和CSC稀疏矩阵初始化模型系数 47 | * 48 | * @param w0 0阶系数 49 | * @param w 1阶系数 50 | * @param v 2阶系数 51 | * @param k0 是否需要处理截距 52 | * @param k1 是否需要处理一阶参数 53 | * @param k2 是否需要处理二阶参数 54 | */ 55 | def this(w0: Double, w: DenseVector[Double], v: DenseMatrix[Double], k0: Boolean, k1: Boolean, k2: Boolean) { 56 | this(0.0, 0.0, w.length, v.rows, v.cols, k0, k1, k2) 57 | this.w0 = w0 58 | this.w = w.copy 59 | this.v = v.copy 60 | } 61 | 62 | /** 63 | * 只复制this的结构(比如参数个数),不复制内容 64 | * 65 | * @return 复制的拷贝 66 | */ 67 | override def copyEmpty(): Coefficients = new FmCoefficients(this.initMean, this.initMean, 68 | this.numFeatures, this.numInteractFeatures, this.numFactors, this.k0, this.k1, this.k2) 69 | 70 | /** 71 | * 对应系数加法,加至this上 72 | * 73 | * @param other 加数 74 | * @return this 75 | */ 76 | override def +=(other: Coefficients): Coefficients = { 77 | val otherCoeffs = other.asInstanceOf[FmCoefficients] 78 | if (k0) this.w0 += otherCoeffs.w0 79 | if (k1) this.w += otherCoeffs.w 80 | if (k2) this.v += otherCoeffs.v 81 | this 82 | } 83 | 84 | /** 85 | * 对应系数减法,减至this上 86 | * 87 | * @param other 减数 88 | * @return this 89 | */ 90 | override def -=(other: Coefficients): Coefficients = { 91 | val otherCoeffs = other.asInstanceOf[FmCoefficients] 92 | if (k0) this.w0 -= otherCoeffs.w0 93 | if (k1) this.w -= otherCoeffs.w 94 | if (k2) this.v -= otherCoeffs.v 95 | this 96 | } 97 | 98 | /** 99 | * 对应系数加上同一实数,加至复制this的类上 100 | * 101 | * @param addend 加数 102 | * @return 加法结果(拷贝) 103 | */ 104 | override def +(addend: Double): Coefficients = { 105 | val result = this.copy.asInstanceOf[FmCoefficients] 106 | if (k0) result.w0 += addend 107 | if (k1) result.w += addend 108 | if (k2) result.v += addend 109 | result 110 | } 111 | 112 | /** 113 | * 对应系数乘上同一实数,加至复制this的类上 114 | * 115 | * @param 
multiplier 乘数 116 | * @return 乘法结果 117 | */ 118 | override def *(multiplier: Double): Coefficients = { 119 | val result = this.copy.asInstanceOf[FmCoefficients] 120 | if (k0) result.w0 *= multiplier 121 | if (k1) result.w *= multiplier 122 | if (k2) result.v *= multiplier 123 | result 124 | } 125 | 126 | /** 127 | * 同时复制this的结构和内容 128 | * 129 | * @return 复制的拷贝 130 | */ 131 | override def copy: Coefficients = { 132 | //从效率出发,参数设为0 133 | val coeffs = new FmCoefficients(this.initMean, this.initStdev, 0, 0, 0, this.k0, this.k1, this.k2) 134 | coeffs.numFeatures = this.numFeatures 135 | coeffs.numInteractFeatures = this.numInteractFeatures 136 | coeffs.numFactors = this.numFactors 137 | coeffs.w0 = this.w0 138 | coeffs.w = this.w.copy 139 | coeffs.v = this.v.copy 140 | coeffs 141 | } 142 | 143 | /** 144 | * 对应系数除上同一实数,加至复制this的类上 145 | * 146 | * @param dividend 除数 147 | * @return 除法结果 148 | */ 149 | override def /(dividend: Double): Coefficients = { 150 | val result = this.copy.asInstanceOf[FmCoefficients] 151 | if (k0) result.w0 /= dividend 152 | if (k1) result.w /= dividend 153 | if (k2) result.v /= dividend 154 | result 155 | } 156 | 157 | /** 158 | * 计算L2的正则值 159 | * 160 | * @param reg 正则参数 161 | * @return 参数加权后的L2正则值 162 | */ 163 | override def L2RegValue(reg: Array[Double]): Double = { 164 | val zeroRegValue = if (k0) w0 * w0 * reg(0) else 0.0 165 | val firstRegValue = if (k1 && w.activeSize > 0) w.activeValuesIterator.reduce(_ + Math.pow(_, 2)) * reg(1) else 0.0 166 | val secondRegValue = if (k2 && v.activeSize > 0) v.activeValuesIterator.reduce(_ + Math.pow(_, 2)) * reg(2) else 0.0 167 | 0.5 * (zeroRegValue + firstRegValue + secondRegValue) 168 | } 169 | 170 | /** 171 | * 计算L2的正则梯度值 172 | * 173 | * @param reg 正则参数 174 | * @return 参数加权后的L2正则梯度值 175 | */ 176 | override def L2RegGradient(reg: Array[Double]): Coefficients = { 177 | val result = this.copy.asInstanceOf[FmCoefficients] 178 | if (k0) result.w0 *= reg(0) 179 | if (k1) result.w *= reg(1) 180 | if (k2) result.v *= reg(2) 181 | result 182 | } 183 | 184 | /** 185 | * 用L1稀疏化系数 186 | * 187 | * @param regParam 正则参数值 188 | * @param stepSize 学习率 189 | * @return 稀疏化后的系数 190 | */ 191 | override def L1Shrink(regParam: Array[Double], stepSize: Double): Coefficients = { 192 | //0阶参数 193 | if (k0) { 194 | val zeroShrinkageVal = regParam(0) * stepSize 195 | w0 = signum(w0) * max(0.0, abs(w0) - zeroShrinkageVal) 196 | } 197 | //1阶参数 198 | if (k1) { 199 | val firstShrinkageVal = regParam(1) * stepSize 200 | val newW = DenseVector.zeros[Double](w.length) 201 | w.activeIterator.foreach { case (index, weight) => 202 | val newWeight = signum(weight) * max(0.0, abs(weight) - firstShrinkageVal) 203 | if (newWeight == 0) { 204 | Nil 205 | } else { 206 | newW.update(index, newWeight) 207 | } 208 | } 209 | w = newW 210 | } 211 | //2阶参数 212 | if (k2) { 213 | val secondShrinkageVal = regParam(2) * stepSize / numFactors 214 | val newV = DenseMatrix.zeros[Double](v.rows, v.cols) 215 | v.activeIterator.foreach { case ((rowIndex, colIndex), weight) => 216 | val newWeight = signum(weight) * max(0.0, abs(weight) - secondShrinkageVal) 217 | if (newWeight == 0) { 218 | Nil 219 | } else { 220 | newV.update(rowIndex, colIndex, newWeight) 221 | } 222 | } 223 | v = newV 224 | } 225 | //全部更新完后,返回结果 226 | this 227 | } 228 | 229 | /** 230 | * 计算L1的正则值 231 | * 232 | * @param reg 正则参数 233 | * @return 参数绝对值加权后的L1正则值 234 | */ 235 | override def L1RegValue(reg: Array[Double]): Double = { 236 | val zeroRegValue = if (k0) abs(w0) * reg(0) else 0.0 237 | val firstRegValue = if 
(k1 && w.activeSize > 0) w.activeIterator.foldLeft(0.0) { case (absSum, (_, weight)) => 238 | absSum + abs(weight) 239 | } * reg(1) 240 | else 0.0 241 | val secondRegValue = if (k2 && v.activeSize > 0) v.activeIterator.foldLeft(0.0) { case (absSum, (_, weight)) => 242 | absSum + abs(weight) 243 | } * reg(2) 244 | else 0.0 245 | zeroRegValue + firstRegValue + secondRegValue 246 | } 247 | 248 | /** 249 | * 计算系数的2范数 250 | * sum(abs(A).^p)^(1/p) where p=2 251 | * 252 | * @return 系数的2范数 253 | */ 254 | override def norm: Double = { 255 | val zeroSum = if (k0) w0 * w0 else 0.0 256 | val firstSum = if (k1 && w.activeSize > 0) w.activeIterator.foldLeft(0.0) { case (sum: Double, (_, value: Double)) => 257 | sum + value * value 258 | } else 0.0 259 | val secondSum = if (k2 && v.activeSize > 0) v.activeIterator.foldLeft(0.0) { case (sum: Double, (_, value: Double)) => 260 | sum + value * value 261 | } else 0.0 262 | math.sqrt(zeroSum + firstSum + secondSum) 263 | } 264 | 265 | /** 266 | * 保存元数据至文件 267 | * 268 | * @param location 文件位置 269 | */ 270 | override def saveMeta(location: String): Unit = { 271 | val json = (Coefficients.namingCoeffType -> FmCoefficients.getClass.toString) ~ 272 | (FmCoefficients.namingIntercept -> w0) ~ 273 | (FmCoefficients.namingWSize -> w.size) ~ 274 | (FmCoefficients.namingVRows -> v.rows) ~ 275 | (FmCoefficients.namingVCols -> v.cols) ~ 276 | (FmCoefficients.namingK0 -> k0) ~ 277 | (FmCoefficients.namingK1 -> k1) ~ 278 | (FmCoefficients.namingK2 -> k2) 279 | SparkSession.builder().getOrCreate().sparkContext. 280 | makeRDD(List(compact(render(json)))).repartition(1).saveAsTextFile(location) 281 | } 282 | 283 | /** 284 | * 保存数据至文件 285 | * 286 | * @param location 文件位置 287 | */ 288 | override def saveData(location: String): Unit = { 289 | val spark = SparkSession.builder().getOrCreate() 290 | spark.createDataFrame(w.data.map(Tuple1(_))).repartition(1).toDF("value").write.parquet(location + "/w") 291 | spark.createDataFrame(v.data.map(Tuple1(_))).repartition(1).toDF("value").write.parquet(location + "/v") 292 | } 293 | 294 | /** 295 | * 与另一个系数是否相等 296 | * 297 | * @param other 另一个系数 298 | * @return 是否相等 299 | */ 300 | override def equals(other: Coefficients): Boolean = { 301 | other match { 302 | case otherCoeffs: FmCoefficients => 303 | if (w0 == otherCoeffs.w0 && w.equals(otherCoeffs.w) && v.equals(otherCoeffs.v)) true else false 304 | case _ => false 305 | } 306 | } 307 | } 308 | 309 | /** 310 | * 静态FM系数对象 311 | */ 312 | object FmCoefficients { 313 | val namingIntercept = "intercept" 314 | val namingWSize = "w_size" 315 | val namingVRows = "v_rows" 316 | val namingVCols = "v_cols" 317 | val namingK0 = "k0" 318 | val namingK1 = "k1" 319 | val namingK2 = "k2" 320 | 321 | /** 322 | * 系数文件构造分解机系数 323 | * 324 | * @param location 系数文件 325 | * @return 分解机系数 326 | */ 327 | def apply(location: String): FmCoefficients = { 328 | //初始化spark session 329 | val spark = SparkSession.builder().getOrCreate() 330 | import spark.implicits._ 331 | //读取元数据 332 | val meta = spark.read.json(location + "/" + Coefficients.namingMetaFile).first() 333 | val w0 = meta.getAs[Double](namingIntercept) 334 | val vRows = meta.getAs[Long](namingVRows).toInt 335 | val vCols = meta.getAs[Long](namingVCols).toInt 336 | val k0 = meta.getAs[Boolean](namingK0) 337 | val k1 = meta.getAs[Boolean](namingK1) 338 | val k2 = meta.getAs[Boolean](namingK2) 339 | // 读取w向量 340 | val w = DenseVector(spark.read.parquet(location + "/" + Coefficients.namingDataFile + "/w").map { row => 341 | row.getAs[Double]("value") 342 | 
}.collect()) 343 | // 读取v向量 344 | val v = DenseMatrix.create(vRows, vCols, spark.read.parquet(location + "/" + Coefficients.namingDataFile + "/v") 345 | .map { row => 346 | row.getAs[Double]("value") 347 | }.collect()) 348 | //返回结果 349 | new FmCoefficients(w0, w, v, k0, k1, k2) 350 | } 351 | 352 | /** 353 | * 从本地文件载入系数 354 | * 355 | * @param location 本地文件 356 | * @return FM系数对象 357 | */ 358 | def fromLocal(location: String): FmCoefficients = { 359 | //读取元数据 360 | implicit val formats = DefaultFormats 361 | val meta = parse(File(location + "/" + Coefficients.namingMetaFile + "/part-00000").contentAsString) 362 | val w0 = (meta \ namingIntercept).extract[Double] 363 | val vRows = (meta \ namingVRows).extract[Int] 364 | val vCols = (meta \ namingVCols).extract[Int] 365 | val k0 = (meta \ namingK0).extract[Boolean] 366 | val k1 = (meta \ namingK1).extract[Boolean] 367 | val k2 = (meta \ namingK2).extract[Boolean] 368 | // 读取w和v向量 369 | val w = DenseVector(readValues(location + "/" + Coefficients.namingDataFile + "/w")) 370 | val v = DenseMatrix.create(vRows, vCols, readValues(location + "/" + Coefficients.namingDataFile + "/v")) 371 | //返回结果 372 | new FmCoefficients(w0, w, v, k0, k1, k2) 373 | } 374 | 375 | /** 376 | * 从本地文件读取系数内容 377 | * 378 | * @param location 本地文件 379 | * @return 系数对象 380 | */ 381 | def readValues(location: String): Array[Double] = { 382 | val reader = ParquetReader.builder[SimpleRecord](new SimpleReadSupport(), new Path(location)).build() 383 | val values = ListBuffer[Double]() 384 | var elem = reader.read() 385 | while (elem != null) { 386 | values += elem.getValues.get(0).getValue.asInstanceOf[Double] 387 | elem = reader.read() 388 | } 389 | reader.close() 390 | values.toArray 391 | } 392 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/binomial/FmGradient.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.binomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.Coefficients 5 | import io.github.qf6101.mfm.optimization.Gradient 6 | import io.github.qf6101.mfm.util.Logging 7 | import org.apache.spark.ml.param.ParamMap 8 | 9 | /** 10 | * Created by qfeng on 15-3-11. 11 | */ 12 | 13 | /** 14 | * FM梯度 15 | * 16 | * @param paramMeta 参数元数据 17 | * @param params 参数池(保存参数值) 18 | */ 19 | class FmGradient(paramMeta: FmModelParam, params: ParamMap) extends Gradient with Logging { 20 | /** 21 | * Compute the gradient and loss given the features of a single data point, 22 | * add the gradient to a provided vector to avoid creating new objects, and return loss. 
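 * For labels y in {+1, -1} and linear score s = FmModel.linearScore(x), the loss used here is
 * the logistic loss log(1 + exp(-y * s)); its derivative w.r.t. s is the `multiplier` computed
 * below, -y * (1 - 1 / (1 + exp(-y * s))). The per-coefficient gradient contributions are then
 * multiplier for w0, multiplier * x_i for w_i, and
 * multiplier * (x_i * sum_j(v_jf * x_j) - v_if * x_i^2) for each factor entry v_if.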
23 | * 24 | * @param data features for one data point 25 | * @param label label for this data point 26 | * @param coeffs weights/coefficients corresponding to features 27 | * @param cumGradient the computed gradient will be added to this vector 28 | * @return loss 29 | */ 30 | override def compute(data: SparseVector[Double], 31 | label: Double, 32 | coeffs: Coefficients, 33 | cumGradient: Coefficients): 34 | Double = { 35 | val fmcoeffs = coeffs.asInstanceOf[FmCoefficients] 36 | val fmCumGradient = cumGradient.asInstanceOf[FmCoefficients] 37 | val linearScore = FmModel.linearScore(data, paramMeta, params, fmcoeffs) 38 | val expComponent = 1 + math.exp(-label * linearScore) 39 | val loss = math.log(expComponent) 40 | val multiplier = -label * (1 - 1 / expComponent) 41 | //参与2阶项的最大维度 42 | val maxInteractFeatures = params(paramMeta.maxInteractFeatures) 43 | //0阶梯度 44 | if (params(paramMeta.k0)) { 45 | fmCumGradient.w0 += multiplier 46 | } 47 | //1阶梯度 48 | if (params(paramMeta.k1)) { 49 | data.activeIterator.foreach { case (index, value) => 50 | fmCumGradient.w(index) += multiplier * value 51 | } 52 | } 53 | //2阶梯度 54 | if (params(paramMeta.k2)) { 55 | for (factorIndex <- 0 until params(paramMeta.numFactors)) { 56 | //提前计算(因为求和中每一项都会用到)firstMoment = \sum_j^n {v_jf*x_j} (固定f) 57 | val firstMoment = data.activeIterator.foldLeft(0.0) { case (sum, (index, value)) => 58 | if (index < maxInteractFeatures) { 59 | sum + fmcoeffs.v(index, factorIndex) * value 60 | } else sum 61 | } 62 | //计算2阶梯度 63 | data.activeIterator.foreach { case (index, value) => 64 | if (index < maxInteractFeatures) { 65 | val twoWayCumCoeff = fmCumGradient.v(index, factorIndex) 66 | val twoWayCoeff = fmcoeffs.v(index, factorIndex) 67 | val incrementGradient = twoWayCumCoeff + multiplier * ((value * firstMoment) - (twoWayCoeff * value * value)) 68 | fmCumGradient.v.update(index, factorIndex, incrementGradient) 69 | } 70 | } 71 | } 72 | } 73 | loss 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/binomial/FmLearnSGD.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.binomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.MLModel 5 | import io.github.qf6101.mfm.baseframe.binomial.{BinLearner, BinModel} 6 | import io.github.qf6101.mfm.optimization.{GradientDescent, Updater} 7 | import org.apache.spark.ml.param.ParamMap 8 | import org.apache.spark.rdd.RDD 9 | 10 | /** 11 | * Created by qfeng on 15-3-27. 
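 *
 * A minimal training sketch (illustrative parameter values; `updater` is assumed to be an
 * Updater from io.github.qf6101.mfm.optimization, `trainingSet` an RDD[(Double, SparseVector[Double])]
 * with labels in {+1, -1}, and any SGD-specific parameters expected by GradientDescent, see
 * SGDParam, also need to be set):
 * {{{
 *   val params = new ParamMap()
 *   val learner = new FmLearnSGD(params, updater)
 *   params.put(learner.numFeatures, 123)            // illustrative feature count
 *   params.put(learner.maxInteractFeatures, 123)    // features taking part in 2-way terms
 *   params.put(learner.numFactors, 8)
 *   params.put(learner.k0, true)
 *   params.put(learner.k1, true)
 *   params.put(learner.k2, true)
 *   params.put(learner.initMean, 0.0)
 *   params.put(learner.initStdev, 0.01)
 *   params.put(learner.reg0, 0.0)
 *   params.put(learner.reg1, 0.01)
 *   params.put(learner.reg2, 0.01)
 *   val model = learner.train(trainingSet)
 *   model.save("target/fm_model")                   // hypothetical output path
 * }}}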
12 | */ 13 | 14 | /** 15 | * FM模型的SGD学习器 16 | * 17 | * @param params 参数池 18 | * @param updater 模型参数更新器 19 | */ 20 | class FmLearnSGD(override val params: ParamMap, 21 | val updater: Updater) 22 | extends BinLearner(params) with FmModelParam { 23 | val lg = new FmGradient(this, params) 24 | val gd = new GradientDescent(lg, updater, params) 25 | 26 | /** 27 | * 训练模型 28 | * 29 | * @param dataset 训练集 30 | * @return 模型 31 | */ 32 | override def train(dataset: RDD[(Double, SparseVector[Double])]): BinModel = { 33 | val initialCoeffs = new FmCoefficients(params(initMean), params(initStdev), 34 | params(numFeatures), params(maxInteractFeatures), params(numFactors), params(k0), params(k1), params(k2)) 35 | val regs = Array(params(reg0), params(reg1), params(reg2)) 36 | val coeffs = gd.optimize(dataset, initialCoeffs, regs) 37 | new FmModel(this, coeffs.asInstanceOf[FmCoefficients], params) 38 | } 39 | } 40 | 41 | /** 42 | * FM模型的SGD学习器实例 43 | */ 44 | object FmLearnSGD { 45 | def train(dataset: RDD[(Double, SparseVector[Double])], 46 | params: ParamMap, 47 | updater: Updater): MLModel = { 48 | new FmLearnSGD(params, updater).train(dataset) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/binomial/FmModel.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.binomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.MLModel 5 | import io.github.qf6101.mfm.baseframe.binomial.BinModel 6 | import io.github.qf6101.mfm.util.Logging 7 | import org.apache.spark.ml.param.ParamMap 8 | 9 | 10 | /** 11 | * Created by qfeng on 15-1-26. 12 | */ 13 | 14 | /** 15 | * Factorization Machine模型 16 | * 17 | * @param paramMeta 分解机模型参数 18 | * @param params 参数池 19 | */ 20 | class FmModel(override val paramMeta: FmModelParam, 21 | override val coeffs: FmCoefficients, 22 | override val params: ParamMap) 23 | extends BinModel(paramMeta, coeffs, params) { 24 | /** 25 | * 对输入数据进行预测 26 | * 27 | * @param data 输入数据 28 | * @return 预测值 29 | */ 30 | override def predict(data: SparseVector[Double]): Double = { 31 | val score = FmModel.linearScore(data, paramMeta, params, coeffs) 32 | 1.0 / (1.0 + math.exp(-score)) 33 | } 34 | 35 | /** 36 | * 模型内容是否相同 37 | * 38 | * @param other 另一个模型 39 | * @return 内容是否相同 40 | */ 41 | override def equals(other: MLModel): Boolean = { 42 | other match { 43 | case otherModel: FmModel => 44 | if (paramMeta.toJSON(params).equals(otherModel.paramMeta.toJSON(otherModel.params)) 45 | && coeffs.equals(otherModel.coeffs)) true 46 | else false 47 | case _ => false 48 | } 49 | } 50 | } 51 | 52 | object FmModel extends Logging { 53 | /** 54 | * 计算样本的线性得分值 55 | * 56 | * @param data 样本 57 | * @param paramMeta 参数元数据 58 | * @param params 参数池 59 | * @param coeffs FM系数 60 | * @return 输入样本的线性得分值 61 | */ 62 | def linearScore(data: SparseVector[Double], 63 | paramMeta: FmModelParam, 64 | params: ParamMap, 65 | coeffs: FmCoefficients): Double = { 66 | //初始化各阶预测值为0 67 | var zeroWayPredict = 0.0 68 | var oneWayPredict = 0.0 69 | var twoWayPredict = 0.0 70 | //参与2阶项的最大维度 71 | val maxInteractAttr = params(paramMeta.maxInteractFeatures) 72 | //0阶预测值 73 | if (params(paramMeta.k0)) { 74 | zeroWayPredict += coeffs.w0 75 | } 76 | //1阶预测值 77 | if (params(paramMeta.k1)) { 78 | data.activeIterator.foreach { case (index, value) => 79 | oneWayPredict += coeffs.w(index) * value 80 | } 81 | } 82 | //2阶预测值 83 | if 
(params(paramMeta.k2)) { 84 | for (factorIndex <- 0 until params(paramMeta.numFactors)) { 85 | var firstMoment = 0.0 86 | var secondMoment = 0.0 87 | data.activeIterator.foreach { case (index, value) => 88 | if (index < maxInteractAttr) { 89 | firstMoment += coeffs.v(index, factorIndex) * value 90 | secondMoment += math.pow(coeffs.v(index, factorIndex) * value, 2) 91 | } 92 | } 93 | twoWayPredict += firstMoment * firstMoment - secondMoment 94 | } 95 | } 96 | zeroWayPredict + oneWayPredict + 0.5 * twoWayPredict 97 | } 98 | 99 | /** 100 | * 从文件载入分解机模型 101 | * 102 | * @param location 包含分解机型信息的文件 103 | * @return 分解机模型 104 | */ 105 | def apply(location: String): FmModel = { 106 | val params = new ParamMap() 107 | val paramMeta = FmModelParam(location + "/" + MLModel.namingParamFile, params) 108 | val coefficients = FmCoefficients(location + "/" + MLModel.namingCoeffFile) 109 | new FmModel(paramMeta, coefficients, params) 110 | } 111 | 112 | /** 113 | * 从本地文件载入分解机模型 114 | * 115 | * @param location 包含分解机型信息的本地文件 116 | * @return 分解机模型 117 | */ 118 | def fromLocal(location: String): FmModel = { 119 | val params = new ParamMap() 120 | val paramMeta = FmModelParam.fromLocal(location + "/" + MLModel.namingParamFile + "/part-00000", params) 121 | val coefficients = FmCoefficients.fromLocal(location + "/" + MLModel.namingCoeffFile) 122 | new FmModel(paramMeta, coefficients, params) 123 | } 124 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/binomial/FmModelParam.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.binomial 2 | 3 | import better.files.File 4 | import io.github.qf6101.mfm.baseframe.ModelParam 5 | import io.github.qf6101.mfm.baseframe.binomial.BinModelParam 6 | import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} 7 | import org.apache.spark.sql.SparkSession 8 | import org.json4s.{DefaultFormats, JsonAST} 9 | import org.json4s.JsonDSL._ 10 | import org.json4s.jackson.JsonMethods._ 11 | 12 | /** 13 | * Created by qfeng on 15-1-26. 
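 *
 * The parameter trait below is persisted as a single JSON object (see ModelParam.save), and
 * apply()/fromLocal() read the same fields back. With illustrative values the saved file looks
 * roughly like:
 * {{{
 *   {"initMean": 0.0, "initStdev": 0.01, "binaryThreshold": 0.5, "param_type": "<FmModelParam class name>",
 *    "reg0": 0.0, "reg1": 0.01, "reg2": 0.01, "numFeatures": 123, "numFactors": 8,
 *    "k0": true, "k1": true, "k2": true, "maxInteractFeatures": 123}
 * }}}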
14 | */ 15 | 16 | /** 17 | * Factorization Machine模型参数 18 | */ 19 | trait FmModelParam extends BinModelParam { 20 | val numFeatures: Param[Int] = new Param("FmModelParam", "numFeatures", "样本维度数", ParamValidators.gt(0)) 21 | val numFactors: Param[Int] = new Param("FmModelParam", "numFactors", "2阶分解维度数", ParamValidators.gt(0)) 22 | val k0: Param[Boolean] = new Param("FmModelParam", "k0", "是否考虑0阶", ParamValidators.inArray(Array(true, false))) 23 | val k1: Param[Boolean] = new Param("FmModelParam", "k1", "是否考虑1阶", ParamValidators.inArray(Array(true, false))) 24 | val k2: Param[Boolean] = new Param("FmModelParam", "k2", "是否考虑2阶", ParamValidators.inArray(Array(true, false))) 25 | val reg0: Param[Double] = new Param("FmModelParam", "reg0", "正则参数") 26 | val reg1: Param[Double] = new Param("FmModelParam", "reg1", "正则参数") 27 | val reg2: Param[Double] = new Param("FmModelParam", "reg2", "正则参数") 28 | val maxInteractFeatures: Param[Int] = new Param("FmModelParam", "maxInteractFeatures", "参与2阶项的最大特征维度(不包含)", ParamValidators.gt(0)) 29 | 30 | /** 31 | * Transform parameters to json object 32 | * 33 | * @return parameters in json format 34 | */ 35 | override def toJSON(params: ParamMap): JsonAST.JObject = { 36 | super.toJSON(params) ~ 37 | (ModelParam.namingParamType -> FmModelParam.getClass.toString) ~ 38 | (reg0.name -> params(reg0)) ~ 39 | (reg1.name -> params(reg1)) ~ 40 | (reg2.name -> params(reg2)) ~ 41 | (numFeatures.name -> params(numFeatures)) ~ 42 | (numFactors.name -> params(numFactors)) ~ 43 | (k0.name -> params(k0)) ~ 44 | (k1.name -> params(k1)) ~ 45 | (k2.name -> params(k2)) ~ 46 | (maxInteractFeatures.name -> params(maxInteractFeatures)) 47 | } 48 | } 49 | 50 | object FmModelParam { 51 | /** 52 | * 从参数文件构造分解机模型参数 53 | * 54 | * @param location 参数文件位置 55 | * @param params 参数池 56 | * @return 分解机型参数 57 | */ 58 | def apply(location: String, params: ParamMap): FmModelParam = { 59 | // 初始化参数对象和spark session 60 | val fmModelParam = new FmModelParam {} 61 | val spark = SparkSession.builder().getOrCreate() 62 | // 读取参数值 63 | val paramValues = spark.read.json(location).first() 64 | val binaryThreshold = paramValues.getAs[Double](fmModelParam.binaryThreshold.name) 65 | val reg0 = paramValues.getAs[Double](fmModelParam.reg0.name) 66 | val reg1 = paramValues.getAs[Double](fmModelParam.reg1.name) 67 | val reg2 = paramValues.getAs[Double](fmModelParam.reg2.name) 68 | val numFeatures = paramValues.getAs[Long](fmModelParam.numFeatures.name).toInt 69 | val numFactors = paramValues.getAs[Long](fmModelParam.numFactors.name).toInt 70 | val k0 = paramValues.getAs[Boolean](fmModelParam.k0.name) 71 | val k1 = paramValues.getAs[Boolean](fmModelParam.k1.name) 72 | val k2 = paramValues.getAs[Boolean](fmModelParam.k2.name) 73 | val initMean = paramValues.getAs[Double](fmModelParam.initMean.name) 74 | val initStdev = paramValues.getAs[Double](fmModelParam.initStdev.name) 75 | val maxInteractFeatures = paramValues.getAs[Long](fmModelParam.maxInteractFeatures.name).toInt 76 | // 设置参数值 77 | params.put(fmModelParam.binaryThreshold, binaryThreshold) 78 | params.put(fmModelParam.reg0, reg0) 79 | params.put(fmModelParam.reg1, reg1) 80 | params.put(fmModelParam.reg2, reg2) 81 | params.put(fmModelParam.numFeatures, numFeatures) 82 | params.put(fmModelParam.numFactors, numFactors) 83 | params.put(fmModelParam.k0, k0) 84 | params.put(fmModelParam.k1, k1) 85 | params.put(fmModelParam.k2, k2) 86 | params.put(fmModelParam.initMean, initMean) 87 | params.put(fmModelParam.initStdev, initStdev) 88 | 
params.put(fmModelParam.maxInteractFeatures, maxInteractFeatures) 89 | // 返回FM参数 90 | fmModelParam 91 | } 92 | 93 | /** 94 | * 从本地文件载入参数 95 | * 96 | * @param location 本地文件位置 97 | * @param params 参数池 98 | * @return 分解机参数 99 | */ 100 | def fromLocal(location: String, params: ParamMap): FmModelParam = { 101 | // 初始化参数对象 102 | val fmModelParam = new FmModelParam {} 103 | implicit val formats = DefaultFormats 104 | // 读取参数值 105 | val paramValues = parse(File(location).contentAsString) 106 | val binaryThreshold = (paramValues \ fmModelParam.binaryThreshold.name).extract[Double] 107 | val reg0 = (paramValues \ fmModelParam.reg0.name).extract[Double] 108 | val reg1 = (paramValues \ fmModelParam.reg1.name).extract[Double] 109 | val reg2 = (paramValues \ fmModelParam.reg2.name).extract[Double] 110 | val numFeatures = (paramValues \ fmModelParam.numFeatures.name).extract[Int] 111 | val numFactors = (paramValues \ fmModelParam.numFactors.name).extract[Int] 112 | val k0 = (paramValues \ fmModelParam.k0.name).extract[Boolean] 113 | val k1 = (paramValues \ fmModelParam.k1.name).extract[Boolean] 114 | val k2 = (paramValues \ fmModelParam.k2.name).extract[Boolean] 115 | val initMean = (paramValues \ fmModelParam.initMean.name).extract[Double] 116 | val initStdev = (paramValues \ fmModelParam.initStdev.name).extract[Double] 117 | val maxInteractFeatures = (paramValues \ fmModelParam.maxInteractFeatures.name).extract[Int] 118 | // 设置参数值 119 | params.put(fmModelParam.binaryThreshold, binaryThreshold) 120 | params.put(fmModelParam.reg0, reg0) 121 | params.put(fmModelParam.reg1, reg1) 122 | params.put(fmModelParam.reg2, reg2) 123 | params.put(fmModelParam.numFeatures, numFeatures) 124 | params.put(fmModelParam.numFactors, numFactors) 125 | params.put(fmModelParam.k0, k0) 126 | params.put(fmModelParam.k1, k1) 127 | params.put(fmModelParam.k2, k2) 128 | params.put(fmModelParam.initMean, initMean) 129 | params.put(fmModelParam.initStdev, initStdev) 130 | params.put(fmModelParam.maxInteractFeatures, maxInteractFeatures) 131 | // 返回FM参数 132 | fmModelParam 133 | } 134 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/multinomial/MfmCoefficients.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.multinomial 2 | 3 | import better.files.File 4 | import io.github.qf6101.mfm.baseframe.Coefficients 5 | import io.github.qf6101.mfm.factorization.binomial.FmCoefficients 6 | import org.apache.spark.sql.SparkSession 7 | import org.json4s.DefaultFormats 8 | import org.json4s.JsonDSL._ 9 | import org.json4s.jackson.JsonMethods._ 10 | 11 | /** 12 | * Created by qfeng on 16-9-7. 
13 | */ 14 | 15 | /** 16 | * Multinomial Factorization Machine模型系数 17 | * 18 | * @param initMean 随机初始值均值 19 | * @param initStdev 随机初始值标准差 20 | * @param numFeatures 特征个数 21 | * @param numFactors 因子个数 22 | * @param k0 是否需要处理截距 23 | * @param k1 是否需要处理一阶参数 24 | * @param k2 是否需要处理二阶参数 25 | * @param numClasses 标签个数 26 | */ 27 | class MfmCoefficients(val initMean: Double, 28 | val initStdev: Double, 29 | var numFeatures: Int, 30 | var numInteractFeatures: Int, 31 | var numFactors: Int, 32 | val k0: Boolean, 33 | val k1: Boolean, 34 | val k2: Boolean, 35 | val numClasses: Int) extends Coefficients { 36 | // 每个标签对应一个FM系数 37 | var thetas = Array.fill[FmCoefficients](numClasses)(new FmCoefficients( 38 | initMean, initStdev, numFeatures, numInteractFeatures, numFactors, k0, k1, k2)) 39 | 40 | /** 41 | * 从FM系数数组构造多分类模型系数 42 | * 43 | * @param thetas FM系数数组 44 | */ 45 | def this(thetas: Array[FmCoefficients]) { 46 | this(thetas(0).initMean, thetas(0).initStdev, thetas(0).numFeatures, thetas(0).numInteractFeatures, 47 | thetas(0).numFactors, thetas(0).k0, thetas(0).k1, thetas(0).k2, thetas.length) 48 | this.thetas = thetas 49 | } 50 | 51 | /** 52 | * 只复制this的结构(比如参数个数),不复制内容 53 | * 54 | * @return 复制的拷贝 55 | */ 56 | override def copyEmpty(): Coefficients = new MfmCoefficients(this.initMean, this.initStdev, 57 | this.numFeatures, this.numInteractFeatures, this.numFactors, this.k0, this.k1, this.k2, this.numClasses) 58 | 59 | /** 60 | * 对应系数加法,加至this上 61 | * 62 | * @param other 加数 63 | * @return this 64 | */ 65 | override def +=(other: Coefficients): Coefficients = { 66 | val otherCoeffs = other.asInstanceOf[MfmCoefficients] 67 | (this.thetas zip otherCoeffs.thetas).foreach { case (me, he) => 68 | me += he 69 | } 70 | this 71 | } 72 | 73 | /** 74 | * 对应系数减法,减至this上 75 | * 76 | * @param other 减数 77 | * @return this 78 | */ 79 | override def -=(other: Coefficients): Coefficients = { 80 | val otherCoeffs = other.asInstanceOf[MfmCoefficients] 81 | (this.thetas zip otherCoeffs.thetas).foreach { case (me, he) => 82 | me -= he 83 | } 84 | this 85 | } 86 | 87 | /** 88 | * 对应系数加上同一实数,加至复制this的类上 89 | * 90 | * @param addend 加数 91 | * @return 加法结果(拷贝) 92 | */ 93 | override def +(addend: Double): Coefficients = { 94 | val me = this.copy.asInstanceOf[MfmCoefficients] 95 | val result = me.thetas.map { theta => 96 | (theta + addend).asInstanceOf[FmCoefficients] 97 | } 98 | new MfmCoefficients(result) 99 | } 100 | 101 | /** 102 | * 对应系数乘上同一实数,加至复制this的类上 103 | * 104 | * @param multiplier 乘数 105 | * @return 乘法结果 106 | */ 107 | override def *(multiplier: Double): Coefficients = { 108 | val me = this.copy.asInstanceOf[MfmCoefficients] 109 | val result = me.thetas.map { theta => 110 | (theta * multiplier).asInstanceOf[FmCoefficients] 111 | } 112 | new MfmCoefficients(result) 113 | } 114 | 115 | /** 116 | * 对应系数除上同一实数,加至复制this的类上 117 | * 118 | * @param dividend 除数 119 | * @return 除法结果 120 | */ 121 | override def /(dividend: Double): Coefficients = { 122 | val me = this.copy.asInstanceOf[MfmCoefficients] 123 | val result = me.thetas.map { theta => 124 | (theta / dividend).asInstanceOf[FmCoefficients] 125 | } 126 | new MfmCoefficients(result) 127 | } 128 | 129 | /** 130 | * 计算L1的正则值 131 | * 132 | * @param regParam 正则参数 133 | * @return 参数绝对值加权后的L1正则值 134 | */ 135 | override def L1RegValue(regParam: Array[Double]): Double = { 136 | thetas.map { theta => 137 | theta.L1RegValue(regParam) 138 | }.sum 139 | } 140 | 141 | 142 | /** 143 | * 计算系数的2范数 144 | * sum(abs(A).^p)^(1/p) where p=2 145 | * 146 | * @return 系数的2范数 147 | */ 148 | 
override def norm: Double = { 149 | this.thetas.map(_.norm).sum / this.thetas.length 150 | } 151 | 152 | /** 153 | * 用L1稀疏化系数 154 | * 155 | * @param regParam 正则参数值 156 | * @param stepSize 学习率 157 | * @return 稀疏化后的系数 158 | */ 159 | override def L1Shrink(regParam: Array[Double], stepSize: Double): Coefficients = { 160 | thetas.foreach { theta => 161 | theta.L1Shrink(regParam, stepSize) 162 | } 163 | this 164 | } 165 | 166 | 167 | /** 168 | * 同时复制this的结构和内容 169 | * 170 | * @return 复制的拷贝 171 | */ 172 | override def copy: Coefficients = { 173 | new MfmCoefficients(this.thetas.map(_.copy.asInstanceOf[FmCoefficients])) 174 | } 175 | 176 | /** 177 | * 计算L2的正则值 178 | * 179 | * @param reg 正则参数 180 | * @return 参数加权后的L2正则值 181 | */ 182 | override def L2RegValue(reg: Array[Double]): Double = { 183 | thetas.map { theta => 184 | theta.L2RegValue(reg) 185 | }.sum 186 | } 187 | 188 | /** 189 | * 计算L2的正则梯度值 190 | * 191 | * @param reg 正则参数 192 | * @return 参数加权后的L2正则梯度值 193 | */ 194 | override def L2RegGradient(reg: Array[Double]): Coefficients = { 195 | val me = this.copy.asInstanceOf[MfmCoefficients] 196 | val result = me.thetas.map { theta => 197 | theta.L2RegGradient(reg).asInstanceOf[FmCoefficients] 198 | } 199 | new MfmCoefficients(result) 200 | } 201 | 202 | /** 203 | * 保存元数据至文件 204 | * 205 | * @param location 文件位置 206 | */ 207 | override def saveMeta(location: String): Unit = { 208 | val json = (Coefficients.namingCoeffType -> MfmCoefficients.getClass.toString) ~ 209 | (MfmCoefficients.namingNumClasses -> numClasses) 210 | SparkSession.builder().getOrCreate().sparkContext. 211 | makeRDD(List(compact(render(json)))).repartition(1).saveAsTextFile(location) 212 | } 213 | 214 | /** 215 | * 保存数据至文件 216 | * 217 | * @param location 文件位置 218 | */ 219 | override def saveData(location: String): Unit = { 220 | thetas.zipWithIndex.foreach { case (theta, index) => 221 | theta.saveMeta(location + "/" + index + "/" + Coefficients.namingMetaFile) 222 | theta.saveData(location + "/" + index + "/" + Coefficients.namingDataFile) 223 | } 224 | } 225 | 226 | /** 227 | * 与另一个系数是否相等 228 | * 229 | * @param other 另一个系数 230 | * @return 是否相等 231 | */ 232 | override def equals(other: Coefficients): Boolean = { 233 | other match { 234 | case otherCoeffs: MfmCoefficients => 235 | (thetas zip otherCoeffs.thetas).foldLeft(true) { case (eq, (me, he)) => 236 | eq && me.equals(he) 237 | } 238 | case _ => false 239 | } 240 | } 241 | } 242 | 243 | /** 244 | * 多分类FM系数对象 245 | */ 246 | object MfmCoefficients { 247 | val namingNumClasses = "num_classes" 248 | 249 | /** 250 | * 从文件构造多分类FM系数对象 251 | * 252 | * @param location 文件位置 253 | * @return 多分类FM系数对象 254 | */ 255 | def apply(location: String): MfmCoefficients = { 256 | // 初始化spark session 257 | val spark = SparkSession.builder().getOrCreate() 258 | // 读取元数据 259 | val meta = spark.read.json(location + "/" + Coefficients.namingMetaFile + "/part-00000").first() 260 | val numClasses = meta.getAs[Long](namingNumClasses).toInt 261 | // 读取系数 262 | val thetas = Array.fill[FmCoefficients](numClasses)(null) 263 | for (index <- 0 until numClasses) { 264 | thetas(index) = FmCoefficients(location + "/" + Coefficients.namingDataFile + "/" + index) 265 | } 266 | // 返回结果 267 | new MfmCoefficients(thetas) 268 | } 269 | 270 | /** 271 | * 从本地文件载入系数 272 | * 273 | * @param location 本地文件 274 | * @return MFM系数对象 275 | */ 276 | def fromLocal(location: String): MfmCoefficients = { 277 | //读取元数据 278 | implicit val formats = DefaultFormats 279 | val meta = parse(File(location + "/" + Coefficients.namingMetaFile + 
"/part-00000").contentAsString) 280 | val numClasses = (meta \ namingNumClasses).extract[Int] 281 | // 读取系数 282 | val thetas = Array.fill[FmCoefficients](numClasses)(null) 283 | for (index <- 0 until numClasses) { 284 | thetas(index) = FmCoefficients.fromLocal(location + "/" + Coefficients.namingDataFile + "/" + index) 285 | } 286 | // 返回结果 287 | new MfmCoefficients(thetas) 288 | } 289 | } 290 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/multinomial/MfmGradient.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.multinomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.Coefficients 5 | import io.github.qf6101.mfm.optimization.Gradient 6 | import io.github.qf6101.mfm.util.Logging 7 | import org.apache.spark.ml.param.ParamMap 8 | 9 | /** 10 | * Created by qfeng on 16-9-7. 11 | */ 12 | 13 | /** 14 | * 多分类FM梯度 15 | * 16 | * @param paramMeta 多分类FM参数 17 | * @param params 参数池 18 | */ 19 | class MfmGradient(paramMeta: MfmModelParam, params: ParamMap) extends Gradient with Logging { 20 | /** 21 | * Compute the gradient and loss given the features of a single data point, 22 | * add the gradient to a provided vector to avoid creating new objects, and return loss. 23 | * 24 | * @param data features for one data point 25 | * @param label label for this data point 26 | * @param coeffs weights/coefficients corresponding to features 27 | * @param cumGradient the computed gradient will be added to this vector 28 | * @return loss 29 | */ 30 | override def compute(data: SparseVector[Double], 31 | label: Double, 32 | coeffs: Coefficients, 33 | cumGradient: Coefficients): Double = { 34 | val mfmCoeff = coeffs.asInstanceOf[MfmCoefficients] 35 | val mfmCumGradient = cumGradient.asInstanceOf[MfmCoefficients] 36 | val scores = MfmModel.predict(data, paramMeta, params, mfmCoeff) 37 | val multipliers = scores.zipWithIndex.map { case (score, index) => 38 | if (label.toInt == index) score - 1.0 else score 39 | } 40 | //参与2阶项的最大维度 41 | val maxInteractFeatures = params(paramMeta.maxInteractFeatures) 42 | val loss = -math.log(scores(label.toInt)) 43 | (mfmCoeff.thetas zip mfmCumGradient.thetas zip multipliers).foreach { case ((fmCoeff, fmCumGradient), multiplier) => 44 | //0阶梯度 45 | if (params(paramMeta.k0)) { 46 | fmCumGradient.w0 += multiplier 47 | } 48 | //1阶梯度 49 | if (params(paramMeta.k1)) { 50 | data.activeIterator.foreach { case (index, value) => 51 | fmCumGradient.w(index) += multiplier * value 52 | } 53 | } 54 | //2阶梯度 55 | if (params(paramMeta.k2)) { 56 | for (factorIndex <- 0 until params(paramMeta.numFactors)) { 57 | //提前计算(因为求和中每一项都会用到)firstMoment = \sum_j^n {v_jf*x_j} (固定f) 58 | val firstMoment = data.activeIterator.foldLeft(0.0) { case (sum, (index, value)) => 59 | if (index < maxInteractFeatures) { 60 | sum + fmCoeff.v(index, factorIndex) * value 61 | } else sum 62 | } 63 | //计算2阶梯度 64 | data.activeIterator.foreach { case (index, value) => 65 | if (index < maxInteractFeatures) { 66 | val twoWayCumCoeff = fmCumGradient.v(index, factorIndex) 67 | val twoWayCoeff = fmCoeff.v(index, factorIndex) 68 | val incrementGradient = twoWayCumCoeff + multiplier * ((value * firstMoment) - (twoWayCoeff * value * value)) 69 | fmCumGradient.v.update(index, factorIndex, incrementGradient) 70 | } 71 | } 72 | } 73 | } 74 | } 75 | loss 76 | } 77 | } 78 | 
-------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/multinomial/MfmLearnSGD.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.multinomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.mutinomial.{MultiLearner, MultiModel} 5 | import io.github.qf6101.mfm.optimization.{GradientDescent, Updater} 6 | import org.apache.spark.ml.param.ParamMap 7 | import org.apache.spark.rdd.RDD 8 | 9 | /** 10 | * Created by qfeng on 16-9-7. 11 | */ 12 | class MfmLearnSGD(override val params: ParamMap, 13 | val updater: Updater) extends MultiLearner(params) with MfmModelParam { 14 | val lg = new MfmGradient(this, params) 15 | val gd = new GradientDescent(lg, updater, params) 16 | 17 | /** 18 | * 训练对应模型 19 | * 20 | * @param dataset 训练集 21 | * @return 模型 22 | */ 23 | override def train(dataset: RDD[(Double, SparseVector[Double])]): MultiModel = { 24 | val initialCoeffs = new MfmCoefficients(params(initMean), params(initStdev), params(numFeatures), 25 | params(maxInteractFeatures), params(numFactors), params(k0), params(k1), params(k2), params(numClasses)) 26 | val regs = Array(params(reg0), params(reg1), params(reg2)) 27 | val coeffs = gd.optimize(dataset, initialCoeffs, regs) 28 | new MfmModel(this, coeffs.asInstanceOf[MfmCoefficients], params) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/multinomial/MfmModel.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.multinomial 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.MLModel 5 | import io.github.qf6101.mfm.baseframe.mutinomial.MultiModel 6 | import io.github.qf6101.mfm.factorization.binomial.FmModel 7 | import org.apache.spark.ml.param.ParamMap 8 | 9 | /** 10 | * Created by qfeng on 16-9-7. 
11 | */ 12 | 13 | /** 14 | * 多分类FM模型 15 | * 16 | * @param paramMeta 模型参赛 17 | * @param coeffs 模型系数 18 | * @param params 参数池(保存参数的值) 19 | */ 20 | class MfmModel(override val paramMeta: MfmModelParam, 21 | override val coeffs: MfmCoefficients, 22 | override val params: ParamMap) extends MultiModel(paramMeta, coeffs, params) { 23 | //dump, 设置默认的阈值为0.5 24 | params.put(paramMeta.binaryThreshold, 0.5) 25 | 26 | /** 27 | * 对输入数据进行预测 28 | * 29 | * @param data 输入数据 30 | * @return 预测值向量(0~1) 31 | */ 32 | override def predict(data: SparseVector[Double]): Array[Double] = { 33 | MfmModel.predict(data, paramMeta, params, coeffs) 34 | } 35 | 36 | /** 37 | * 模型内容是否相同 38 | * 39 | * @param other 另一个模型 40 | * @return 内容是否相同 41 | */ 42 | override def equals(other: MLModel): Boolean = { 43 | other match { 44 | case otherModel: MfmModel => 45 | if (paramMeta.toJSON(params).equals(otherModel.paramMeta.toJSON(otherModel.params)) 46 | && coeffs.equals(otherModel.coeffs)) true 47 | else false 48 | case _ => false 49 | } 50 | } 51 | } 52 | 53 | /** 54 | * 多分类FM模型对象 55 | */ 56 | object MfmModel { 57 | /** 58 | * 对输入样本进行预测 59 | * 60 | * @param data 样本数据 61 | * @param paramMeta 多分类FM参数 62 | * @param params 参数池 63 | * @param coeffs 多分类FM系数 64 | * @return 预测值 65 | */ 66 | def predict(data: SparseVector[Double], 67 | paramMeta: MfmModelParam, 68 | params: ParamMap, 69 | coeffs: MfmCoefficients): Array[Double] = { 70 | // 计算线性得分 71 | val scores = coeffs.thetas.map { theta => 72 | FmModel.linearScore(data, paramMeta, params, theta) 73 | } 74 | // 为了防止溢出,对分子分母都除以maxScore,得到adjustedScores 75 | val maxScore = scores.max 76 | val adjustedScores = scores.map { score => 77 | math.exp(score - maxScore) 78 | } 79 | // 计算归一化后的得分 80 | val sumAdjustedScores = adjustedScores.sum 81 | adjustedScores.map(_ / sumAdjustedScores) 82 | } 83 | 84 | /** 85 | * 从文件载入分解机模型 86 | * 87 | * @param location 包含分解机型信息的文件 88 | * @return 分解机模型 89 | */ 90 | def apply(location: String): MfmModel = { 91 | val params = new ParamMap() 92 | val paramMeta = MfmModelParam(location + "/" + MLModel.namingParamFile, params) 93 | val coefficients = MfmCoefficients(location + "/" + MLModel.namingCoeffFile) 94 | new MfmModel(paramMeta, coefficients, params) 95 | } 96 | 97 | /** 98 | * 从本地文件载入分解机模型 99 | * 100 | * @param location 包含分解机型信息的本地文件 101 | * @return 分解机模型 102 | */ 103 | def fromLocal(location: String): MfmModel = { 104 | val params = new ParamMap() 105 | val paramMeta = MfmModelParam.fromLocal(location + "/" + MLModel.namingParamFile + "/part-00000", params) 106 | val coefficients = MfmCoefficients.fromLocal(location + "/" + MLModel.namingCoeffFile) 107 | new MfmModel(paramMeta, coefficients, params) 108 | } 109 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/factorization/multinomial/MfmModelParam.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.multinomial 2 | 3 | import better.files.File 4 | import io.github.qf6101.mfm.baseframe.ModelParam 5 | import io.github.qf6101.mfm.baseframe.mutinomial.MultiModelParam 6 | import io.github.qf6101.mfm.factorization.binomial.FmModelParam 7 | import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} 8 | import org.apache.spark.sql.SparkSession 9 | import org.json4s.JsonAST.JField 10 | import org.json4s.JsonDSL._ 11 | import org.json4s.jackson.JsonMethods._ 12 | import org.json4s.{DefaultFormats, JObject, JsonAST} 13 | 14 | 15 | /** 16 | * Created by qfeng 
on 16-9-7. 17 | */ 18 | 19 | /** 20 | * 多分类FM模型参数 21 | */ 22 | trait MfmModelParam extends FmModelParam with MultiModelParam { 23 | val numClasses: Param[Int] = new Param("MfmModelParam", "numClasses", "标签数目", ParamValidators.gt(0)) 24 | 25 | /** 26 | * Transform parameters to json object 27 | * 28 | * @return parameters in json format 29 | */ 30 | override def toJSON(params: ParamMap): JsonAST.JObject = { 31 | val json = super.toJSON(params) removeField { 32 | case JField(ModelParam.namingParamType, _) => true 33 | case _ => false 34 | } 35 | json.asInstanceOf[JObject] ~ 36 | (numClasses.name -> params(numClasses)) ~ 37 | (ModelParam.namingParamType -> MfmModelParam.getClass.toString) 38 | } 39 | } 40 | 41 | object MfmModelParam { 42 | /** 43 | * 参数文件构造分解机模型参数 44 | * 45 | * @param location 文件位置 46 | * @param params 参数池 47 | * @return 分解机型参数 48 | */ 49 | def apply(location: String, params: ParamMap): MfmModelParam = { 50 | // 初始化参数对象和spark session 51 | val mfmModelParam = new MfmModelParam {} 52 | val spark = SparkSession.builder().getOrCreate() 53 | // 读取参数值 54 | val paramValues = spark.read.json(location).first() 55 | val binaryThreshold = paramValues.getAs[Double](mfmModelParam.binaryThreshold.name) 56 | val reg0 = paramValues.getAs[Double](mfmModelParam.reg0.name) 57 | val reg1 = paramValues.getAs[Double](mfmModelParam.reg1.name) 58 | val reg2 = paramValues.getAs[Double](mfmModelParam.reg2.name) 59 | val numFeatures = paramValues.getAs[Long](mfmModelParam.numFeatures.name).toInt 60 | val numFactors = paramValues.getAs[Long](mfmModelParam.numFactors.name).toInt 61 | val k0 = paramValues.getAs[Boolean](mfmModelParam.k0.name) 62 | val k1 = paramValues.getAs[Boolean](mfmModelParam.k1.name) 63 | val k2 = paramValues.getAs[Boolean](mfmModelParam.k2.name) 64 | val initMean = paramValues.getAs[Double](mfmModelParam.initMean.name) 65 | val initStdev = paramValues.getAs[Double](mfmModelParam.initStdev.name) 66 | val maxInteractFeatures = paramValues.getAs[Long](mfmModelParam.maxInteractFeatures.name).toInt 67 | val numClasses = paramValues.getAs[Long](mfmModelParam.numClasses.name).toInt 68 | // 设置参数值 69 | params.put(mfmModelParam.binaryThreshold, binaryThreshold) 70 | params.put(mfmModelParam.reg0, reg0) 71 | params.put(mfmModelParam.reg1, reg1) 72 | params.put(mfmModelParam.reg2, reg2) 73 | params.put(mfmModelParam.numFeatures, numFeatures) 74 | params.put(mfmModelParam.numFactors, numFactors) 75 | params.put(mfmModelParam.k0, k0) 76 | params.put(mfmModelParam.k1, k1) 77 | params.put(mfmModelParam.k2, k2) 78 | params.put(mfmModelParam.initMean, initMean) 79 | params.put(mfmModelParam.initStdev, initStdev) 80 | params.put(mfmModelParam.maxInteractFeatures, maxInteractFeatures) 81 | params.put(mfmModelParam.numClasses, numClasses) 82 | // 返回MFM参数 83 | mfmModelParam 84 | } 85 | 86 | /** 87 | * 从本地文件载入参数 88 | * 89 | * @param location 本地文件位置 90 | * @param params 参数池 91 | * @return 分解机参数 92 | */ 93 | def fromLocal(location: String, params: ParamMap): MfmModelParam = { 94 | // 初始化参数对象 95 | val mfmModelParam = new MfmModelParam {} 96 | implicit val formats = DefaultFormats 97 | // 读取参数值 98 | val paramValues = parse(File(location).contentAsString) 99 | val binaryThreshold = (paramValues \ mfmModelParam.binaryThreshold.name).extract[Double] 100 | val reg0 = (paramValues \ mfmModelParam.reg0.name).extract[Double] 101 | val reg1 = (paramValues \ mfmModelParam.reg1.name).extract[Double] 102 | val reg2 = (paramValues \ mfmModelParam.reg2.name).extract[Double] 103 | val numFeatures = (paramValues \ 
mfmModelParam.numFeatures.name).extract[Int] 104 | val numFactors = (paramValues \ mfmModelParam.numFactors.name).extract[Int] 105 | val k0 = (paramValues \ mfmModelParam.k0.name).extract[Boolean] 106 | val k1 = (paramValues \ mfmModelParam.k1.name).extract[Boolean] 107 | val k2 = (paramValues \ mfmModelParam.k2.name).extract[Boolean] 108 | val initMean = (paramValues \ mfmModelParam.initMean.name).extract[Double] 109 | val initStdev = (paramValues \ mfmModelParam.initStdev.name).extract[Double] 110 | val maxInteractFeatures = (paramValues \ mfmModelParam.maxInteractFeatures.name).extract[Int] 111 | val numClasses = (paramValues \ mfmModelParam.numClasses.name).extract[Int] 112 | // 设置参数值 113 | params.put(mfmModelParam.binaryThreshold, binaryThreshold) 114 | params.put(mfmModelParam.reg0, reg0) 115 | params.put(mfmModelParam.reg1, reg1) 116 | params.put(mfmModelParam.reg2, reg2) 117 | params.put(mfmModelParam.numFeatures, numFeatures) 118 | params.put(mfmModelParam.numFactors, numFactors) 119 | params.put(mfmModelParam.k0, k0) 120 | params.put(mfmModelParam.k1, k1) 121 | params.put(mfmModelParam.k2, k2) 122 | params.put(mfmModelParam.initMean, initMean) 123 | params.put(mfmModelParam.initStdev, initStdev) 124 | params.put(mfmModelParam.maxInteractFeatures, maxInteractFeatures) 125 | params.put(mfmModelParam.numClasses, numClasses) 126 | // 返回FM参数 127 | mfmModelParam 128 | } 129 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/logisticregression/LogisticGradient.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.logisticregression 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.Coefficients 5 | import io.github.qf6101.mfm.optimization.Gradient 6 | import org.apache.spark.ml.param.ParamMap 7 | 8 | /** 9 | * Created by qfeng on 15-3-13. 10 | */ 11 | 12 | 13 | /** 14 | * Compute gradient and loss for a logistic loss function, as used in binary classification. 15 | * See also the documentation for the precise formulation. 16 | */ 17 | class LogisticGradient(params: ParamMap) extends Gradient { 18 | /** 19 | * Compute the gradient and loss given the features of a single data point, 20 | * add the gradient to a provided vector to avoid creating new objects, and return loss. 
21 | * 22 | * @param data features for one data point 23 | * @param label label for this data point 24 | * @param coeffs weights/coefficients corresponding to features 25 | * @param cumGradient the computed gradient will be added to this vector 26 | * @return loss 27 | */ 28 | override def compute(data: SparseVector[Double], 29 | label: Double, 30 | coeffs: Coefficients, 31 | cumGradient: Coefficients): 32 | Double = { 33 | val vecCoeffs = coeffs.asInstanceOf[VectorCoefficients] 34 | val vecCumGradient = cumGradient.asInstanceOf[VectorCoefficients] 35 | val hypotheses = 1 / (1 + math.exp(-1.0 * vecCoeffs.dot(data))) 36 | val multiplier = hypotheses - label 37 | 38 | vecCumGradient +=(multiplier, data * multiplier) 39 | if (label > 0) -math.log(hypotheses) else -math.log(1 - hypotheses) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/logisticregression/LrLearnLBFGS.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.logisticregression 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.MLModel 5 | import io.github.qf6101.mfm.baseframe.binomial.{BinLearner, BinModel} 6 | import io.github.qf6101.mfm.optimization.{LBFGS, Updater} 7 | import org.apache.spark.ml.param.ParamMap 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.storage.StorageLevel 10 | 11 | /** 12 | * Created by qfeng on 15-4-7. 13 | */ 14 | 15 | /** 16 | * 逻辑斯蒂模型的LBFGS学习器 17 | * 18 | * @param params 参数池* 19 | * @param updater 参数更新器 20 | * @param initialCoeffs 初始参数 21 | */ 22 | class LrLearnLBFGS(override val params: ParamMap, 23 | val updater: Updater, 24 | val initialCoeffs: Option[VectorCoefficients] = None) 25 | extends BinLearner(params) with LrModelParam { 26 | val lg = new LogisticGradient(params) 27 | val lbfgs = new LBFGS(lg, updater, params) 28 | 29 | /** 30 | * 训练逻辑斯蒂模型 31 | * 32 | * @param dataSet 训练集 33 | * @return 逻辑斯蒂模型 34 | */ 35 | override def train(dataSet: RDD[(Double, SparseVector[Double])]): BinModel = { 36 | dataSet.persist(StorageLevel.MEMORY_AND_DISK_SER) 37 | val inputCoeffs = initialCoeffs match { 38 | case Some(value) => value 39 | case None => new VectorCoefficients(dataSet.first()._2.length) 40 | } 41 | val coeffs = lbfgs.optimize(dataSet, inputCoeffs, params(reg)) 42 | dataSet.unpersist() 43 | new LrModel(this, coeffs.asInstanceOf[VectorCoefficients], params) 44 | } 45 | } 46 | 47 | /** 48 | * 逻辑斯蒂模型的LBFGS学习器实例 49 | */ 50 | object LrLearnLBFGS { 51 | 52 | /** 53 | * 训练逻辑斯蒂模型 54 | * 55 | * @param dataset 数据集 56 | * @param params 参数池* 57 | * @param updater 参数更新器 58 | * @param initialCoeffs 初始参数 59 | * @return 逻辑斯蒂模型 60 | */ 61 | def train(dataset: RDD[(Double, SparseVector[Double])], 62 | params: ParamMap, 63 | updater: Updater, 64 | initialCoeffs: Option[VectorCoefficients] = None): MLModel = { 65 | new LrLearnLBFGS(params, updater, initialCoeffs).train(dataset) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/logisticregression/LrLearnSGD.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.logisticregression 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.MLModel 5 | import io.github.qf6101.mfm.baseframe.binomial.{BinLearner, BinModel} 6 | import io.github.qf6101.mfm.optimization.{GradientDescent, Updater} 7 | 
import org.apache.spark.ml.param.ParamMap 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.storage.StorageLevel 10 | 11 | /** 12 | * Created by qfeng on 15-3-17. 13 | */ 14 | 15 | /** 16 | * Train a classification model for Logistic Regression using Stochastic Gradient Descent. By 17 | * default L2 regularization is used, which can be changed via 18 | * [[LrLearnSGD]]. 19 | */ 20 | 21 | /** 22 | * 逻辑斯蒂模型的SGD学习器 23 | * 24 | * @param params 参数池* 25 | * @param updater 参数更新器 26 | * @param initialCoeffs 初始参数 27 | */ 28 | class LrLearnSGD(override val params: ParamMap, 29 | val updater: Updater, 30 | val initialCoeffs: Option[VectorCoefficients] = None) 31 | extends BinLearner(params) with LrModelParam { 32 | val lg = new LogisticGradient(params) 33 | val gd = new GradientDescent(lg, updater, params) 34 | 35 | /** 36 | * 训练逻辑斯蒂模型 37 | * 38 | * @param dataSet 训练集 39 | * @return 逻辑斯蒂模型 40 | */ 41 | override def train(dataSet: RDD[(Double, SparseVector[Double])]): BinModel = { 42 | dataSet.persist(StorageLevel.MEMORY_AND_DISK_SER_2) 43 | val inputCoeffs = initialCoeffs match { 44 | case Some(value) => value 45 | case None => new VectorCoefficients(dataSet.first()._2.length) 46 | } 47 | val coeffs = gd.optimize(dataSet, inputCoeffs, params(reg)) 48 | dataSet.unpersist() 49 | new LrModel(this, coeffs.asInstanceOf[VectorCoefficients], params) 50 | } 51 | 52 | 53 | } 54 | 55 | /** 56 | * 逻辑斯蒂模型的SGD学习器实例 57 | */ 58 | object LrLearnSGD { 59 | /** 60 | * 训练逻辑斯蒂模型 61 | * 62 | * @param dataset 数据集 63 | * @param params 参数池* 64 | * @param updater 参数更新器 65 | * @param initialCoeffs 初始参数 66 | * @return 逻辑斯蒂模型 67 | */ 68 | def train(dataset: RDD[(Double, SparseVector[Double])], 69 | params: ParamMap, 70 | updater: Updater, 71 | initialCoeffs: Option[VectorCoefficients] = None): MLModel = { 72 | new LrLearnSGD(params, updater, initialCoeffs).train(dataset) 73 | } 74 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/logisticregression/LrModel.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.logisticregression 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.MLModel 5 | import io.github.qf6101.mfm.baseframe.binomial.BinModel 6 | import io.github.qf6101.mfm.util.Logging 7 | import org.apache.spark.ml.param.ParamMap 8 | 9 | /** 10 | * Created by qfeng on 15-3-16. 
11 | */ 12 | 13 | /** 14 | * 逻辑斯蒂回归模型 15 | * 16 | * @param coeffs 模型系数 17 | * @param paramMeta 逻辑斯蒂参数 18 | * @param params 参数池 19 | */ 20 | class LrModel(override val paramMeta: LrModelParam, 21 | override val coeffs: VectorCoefficients, 22 | override val params: ParamMap) 23 | extends BinModel(paramMeta, coeffs, params) with Logging { 24 | /** 25 | * 对输入数据进行预测(使用内置系数) 26 | * 27 | * @param data 输入数据 28 | * @return 预测值(0~1) 29 | */ 30 | override def predict(data: SparseVector[Double]): Double = { 31 | predict(data, this.coeffs) 32 | } 33 | 34 | /** 35 | * 对输入数据进行预测 36 | * 37 | * @param data 输入数据 38 | * @param coeffs 系数 39 | * @return 预测值(0~1) 40 | */ 41 | def predict(data: SparseVector[Double], coeffs: VectorCoefficients = this.coeffs): Double = { 42 | val margin = -1.0 * coeffs.dot(data) 43 | 1.0 / (1.0 + math.exp(margin)) 44 | } 45 | 46 | override def equals(other: MLModel): Boolean = { 47 | other match { 48 | case otherModel: LrModel => 49 | if (paramMeta.toJSON(params).equals(otherModel.paramMeta.toJSON(otherModel.params)) 50 | && coeffs.equals(otherModel.coeffs)) true 51 | else false 52 | case _ => false 53 | } 54 | } 55 | } 56 | 57 | object LrModel extends Logging { 58 | /** 59 | * 从模型文件载入逻辑斯蒂模型 60 | * 61 | * @param location 模型文件 62 | * @return 逻辑斯蒂模型 63 | */ 64 | def apply(location: String): LrModel = { 65 | val params = new ParamMap() 66 | val paramMeta = LrModelParam(location + "/" + MLModel.namingParamFile, params) 67 | val coefficients = VectorCoefficients(location + "/" + MLModel.namingCoeffFile) 68 | new LrModel(paramMeta, coefficients, params) 69 | } 70 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/logisticregression/LrModelParam.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.logisticregression 2 | 3 | import io.github.qf6101.mfm.baseframe.ModelParam 4 | import io.github.qf6101.mfm.baseframe.binomial.BinModelParam 5 | import org.apache.spark.ml.param.{Param, ParamMap} 6 | import org.apache.spark.sql.SparkSession 7 | import org.json4s.JsonAST 8 | import org.json4s.JsonDSL._ 9 | 10 | 11 | /** 12 | * Created by qfeng on 15-3-18. 
13 | */ 14 | 15 | /** 16 | * 逻辑斯蒂模型的参数 17 | */ 18 | trait LrModelParam extends BinModelParam { 19 | val reg: Param[Array[Double]] = new Param("LrModelParam", "reg", "正则参数") 20 | 21 | /** 22 | * Transform parameters to json object 23 | * 24 | * @return parameters in json format 25 | */ 26 | override def toJSON(params: ParamMap): JsonAST.JObject = { 27 | super.toJSON(params) ~ 28 | (ModelParam.namingParamType -> LrModelParam.getClass.toString) ~ 29 | (reg.name -> params(reg).mkString(", ")) 30 | } 31 | } 32 | 33 | object LrModelParam { 34 | /** 35 | * 根据字符串数组构造逻辑斯蒂模型参数 36 | * 37 | * @param location 文件位置 38 | * @param params 参数池 39 | * @return 逻辑斯蒂模型参数 40 | */ 41 | def apply(location: String, params: ParamMap): LrModelParam = { 42 | // 初始化参数对象和spark session 43 | val lrModelParam = new LrModelParam {} 44 | val spark = SparkSession.builder().getOrCreate() 45 | // 读取参数值 46 | val paramValues = spark.read.json(location).first() 47 | val binaryThreshold = paramValues.getAs[Double](lrModelParam.binaryThreshold.name) 48 | val reg = paramValues.getAs[String](lrModelParam.reg.name).split(",").map(_.trim.toDouble) 49 | val initMean = paramValues.getAs[Double](lrModelParam.initMean.name) 50 | val initStdev = paramValues.getAs[Double](lrModelParam.initStdev.name) 51 | // 设置参数值 52 | params.put(lrModelParam.binaryThreshold, binaryThreshold) 53 | params.put(lrModelParam.reg, reg) 54 | params.put(lrModelParam.initMean, initMean) 55 | params.put(lrModelParam.initStdev, initStdev) 56 | // 返回LR模型参数 57 | lrModelParam 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/logisticregression/VectorCoefficients.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.logisticregression 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.Coefficients 5 | import org.apache.spark.sql.SparkSession 6 | import org.json4s.JsonDSL._ 7 | import org.json4s.jackson.JsonMethods._ 8 | 9 | import scala.collection.mutable 10 | import scala.math._ 11 | 12 | /** 13 | * Created by qfeng on 15-6-11. 
14 | */ 15 | class VectorCoefficients(val size: Int) extends Coefficients { 16 | var w0 = 0.0 17 | var w = mutable.HashMap[Int, Double]() 18 | 19 | /** 20 | * 同时复制this的结构和内容 21 | * 22 | * @return 复制的拷贝 23 | */ 24 | override def copy: Coefficients = { 25 | new VectorCoefficients(this.size, this.w0, this.w) 26 | } 27 | 28 | /** 29 | * 用Map稀疏向量初始化 30 | * 31 | * @param w0 截距 32 | * @param w Map稀疏向量表示的参数 33 | */ 34 | def this(size: Int, w0: Double, w: mutable.HashMap[Int, Double]) { 35 | this(size) 36 | this.w0 = w0 37 | this.w ++= w 38 | } 39 | 40 | /** 41 | * 只复制this的结构(比如参数个数),不复制内容 42 | * 43 | * @return 复制的拷贝 44 | */ 45 | override def copyEmpty(): Coefficients = new VectorCoefficients(this.size) 46 | 47 | /** 48 | * 对应系数加法,加至this上 49 | * 50 | * @param otherW0 截距加数 51 | * @param otherW 一阶系数加数 52 | * @return this 53 | */ 54 | def +=(otherW0: Double, otherW: SparseVector[Double]): VectorCoefficients = { 55 | this.w0 += otherW0 56 | otherW.activeIterator.foreach { case (index, value) => 57 | val originalValue = this.w.getOrElse(index, 0.0) 58 | this.w.update(index, originalValue + value) 59 | } 60 | this 61 | } 62 | 63 | /** 64 | * 对应系数加法,加至this上 65 | * 66 | * @param other 加数 67 | * @return this 68 | */ 69 | override def +=(other: Coefficients): Coefficients = { 70 | val otherCoeffs = other.asInstanceOf[VectorCoefficients] 71 | this.w0 += otherCoeffs.w0 72 | otherCoeffs.w.foreach { case (index, value) => 73 | val originalValue = this.w.getOrElse(index, 0.0) 74 | this.w.update(index, originalValue + value) 75 | } 76 | this 77 | } 78 | 79 | /** 80 | * 对应系数减法,减至this上 81 | * 82 | * @param other 减数 83 | * @return this 84 | */ 85 | override def -=(other: Coefficients): Coefficients = { 86 | val otherCoeffs = other.asInstanceOf[VectorCoefficients] 87 | this.w0 -= otherCoeffs.w0 88 | otherCoeffs.w.foreach { case (index, value) => 89 | val originalValue = this.w.getOrElse(index, 0.0) 90 | this.w.update(index, originalValue - value) 91 | } 92 | this 93 | } 94 | 95 | /** 96 | * 97 | * 对应系数加上同一实数,加至复制this的类上 98 | * 99 | * @param addend 加数 100 | * @return 加法结果(拷贝) 101 | */ 102 | override def +(addend: Double): Coefficients = { 103 | val result = new VectorCoefficients(this.size) 104 | result.w0 = this.w0 + addend 105 | result.w = this.w.map { case (index, value) => index -> (value + addend) } 106 | result 107 | } 108 | 109 | /** 110 | * 对应系数除上同一实数,加至复制this的类上 111 | * 112 | * @param dividend 除数 113 | * @return 除法结果 114 | */ 115 | override def /(dividend: Double): Coefficients = { 116 | val result = new VectorCoefficients(this.size) 117 | result.w0 = this.w0 / dividend 118 | result.w = this.w.map { case (index, value) => index -> (value / dividend) } 119 | result 120 | } 121 | 122 | /** 123 | * 计算L2的正则值 124 | * 125 | * @param reg 正则参数 126 | * @return 参数加权后的L2正则值 127 | */ 128 | override def L2RegValue(reg: Array[Double]): Double = { 129 | var squaredCoeffSum = w0 * w0 130 | this.w.foreach { case (index, value) => 131 | squaredCoeffSum += value * value 132 | } 133 | 0.5 * reg(0) * squaredCoeffSum 134 | } 135 | 136 | /** 137 | * 计算L2的正则梯度值 138 | * 139 | * @param reg 正则参数 140 | * @return 参数加权后的L2正则梯度值 141 | */ 142 | override def L2RegGradient(reg: Array[Double]): Coefficients = { 143 | this * reg(0) 144 | } 145 | 146 | /** 147 | * 对应系数乘上同一实数,加至复制this的类上 148 | * 149 | * @param multiplier 乘数 150 | * @return 乘法结果 151 | */ 152 | override def *(multiplier: Double): Coefficients = { 153 | val result = new VectorCoefficients(this.size) 154 | result.w0 = this.w0 * multiplier 155 | result.w = this.w.map { case (index, 
value) => index -> (value * multiplier) } 156 | result 157 | } 158 | 159 | /** 160 | * 用L1稀疏化系数 161 | * 162 | * @param regParam 正则参数值 163 | * @param stepSize 学习率 164 | * @return 稀疏化后的系数 165 | */ 166 | override def L1Shrink(regParam: Array[Double], stepSize: Double): Coefficients = { 167 | //收缩值 168 | val shrinkageVal = regParam(0) * stepSize 169 | w0 = signum(w0) * max(0.0, abs(w0) - shrinkageVal) 170 | w = w.flatMap { case (index, weight) => 171 | val newWeight = signum(weight) * max(0.0, abs(weight) - shrinkageVal) 172 | if (newWeight == 0) { 173 | Nil 174 | } else { 175 | List(index -> newWeight) 176 | } 177 | } 178 | this 179 | } 180 | 181 | /** 182 | * 计算L1的正则值 183 | * 184 | * @param regParam 正则参数 185 | * @return 参数绝对值加权后的L1正则值 186 | */ 187 | override def L1RegValue(regParam: Array[Double]): Double = { 188 | val zeroRegValue = abs(w0) 189 | val firstRegValue = this.w.foldLeft(0.0) { case (absSum, element) => 190 | absSum + abs(element._2) 191 | } 192 | (zeroRegValue + firstRegValue) * regParam(0) 193 | } 194 | 195 | /** 196 | * 系数与稀疏向量点乘 197 | * 198 | * @param otherW 稀疏向量 199 | * @return 点乘的结果 200 | */ 201 | def dot(otherW: SparseVector[Double]): Double = { 202 | var result = w0 203 | otherW.activeIterator.foreach { case (index, value) => 204 | val originalValue = this.w.getOrElse(index, 0.0) 205 | result += originalValue * value 206 | } 207 | result 208 | } 209 | 210 | /** 211 | * 计算系数的2范数 212 | * sum(abs(A).^p)^(1/p) where p=2 213 | * 214 | * @return 系数的2范数 215 | */ 216 | override def norm: Double = { 217 | math.sqrt(w.foldLeft(0.0) { case (sum: Double, (_, value: Double)) => 218 | sum + value * value 219 | } + w0 * w0) 220 | } 221 | 222 | /** 223 | * 保存元数据至文件 224 | * 225 | * @param location 文件位置 226 | */ 227 | override def saveMeta(location: String): Unit = { 228 | val json = (Coefficients.namingCoeffType -> VectorCoefficients.getClass.toString) ~ 229 | (VectorCoefficients.namingFeatureSize -> size) ~ 230 | (VectorCoefficients.namingIntercept -> w0) ~ 231 | (VectorCoefficients.namingWSize -> w.size) 232 | SparkSession.builder().getOrCreate().sparkContext. 
233 | makeRDD(List(compact(render(json)))).repartition(1).saveAsTextFile(location) 234 | } 235 | 236 | /** 237 | * 保存数据至文件 238 | * 239 | * @param location 文件位置 240 | */ 241 | override def saveData(location: String): Unit = { 242 | SparkSession.builder().getOrCreate().createDataFrame(w.toSeq).toDF("index", "value").write.parquet(location) 243 | } 244 | 245 | /** 246 | * 与另一个系数是否相等 247 | * 248 | * @param other 另一个系数 249 | * @return 是否相等 250 | */ 251 | override def equals(other: Coefficients): Boolean = { 252 | other match { 253 | case otherCoeffs: VectorCoefficients => 254 | if (w0 == otherCoeffs.w0 && w.equals(otherCoeffs.w)) true else false 255 | case _ => false 256 | } 257 | } 258 | } 259 | 260 | /** 261 | * 向量化系数对象 262 | */ 263 | object VectorCoefficients { 264 | val namingIntercept = "intercept" 265 | val namingFeatureSize = "feature_size" 266 | val namingWSize = "w_size" 267 | 268 | /** 269 | * 根据字符串数组构造向量系数 270 | * 271 | * @param location 系数文件位置 272 | * @return 向量系数 273 | */ 274 | def apply(location: String): VectorCoefficients = { 275 | val spark = SparkSession.builder().getOrCreate() 276 | import spark.implicits._ 277 | val meta = spark.read.json(location + "/" + Coefficients.namingMetaFile).first() 278 | val size = meta.getAs[Long](namingFeatureSize).toInt 279 | val w0 = meta.getAs[Double](namingIntercept) 280 | val w = spark.read.parquet(location + "/" + Coefficients.namingDataFile).map { row => 281 | (row.getAs[Long]("index").toInt, row.getAs[Double]("value")) 282 | }.collect() 283 | new VectorCoefficients(size, w0, mutable.HashMap[Int, Double](w.toSeq: _*)) 284 | } 285 | } 286 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/optimization/DecreasingStrategy.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.optimization 2 | 3 | /** 4 | * User: qfeng 5 | * Date: 15-12-29 下午4:49 6 | * Usage: SGD学习步长衰减类 7 | */ 8 | trait DecreasingStrategy extends Serializable { 9 | /** 10 | * 根据当前的迭代次数计算学习步长衰减的分母 11 | * 12 | * @param iter 迭代次数 13 | * @return 学习步长衰减的分母 14 | */ 15 | def decrease(iter: Int): Double 16 | } 17 | 18 | class Log10DecreasingStrategy extends DecreasingStrategy { 19 | /** 20 | * 根据当前的迭代次数计算学习步长衰减的分母 21 | * 按照log10进行衰减,第91次迭代衰减为一半 22 | * 23 | * @param iter 迭代次数 24 | * @return 学习步长衰减的分母 25 | */ 26 | def decrease(iter: Int): Double = { 27 | math.log10(9 + iter) 28 | } 29 | } 30 | 31 | class LogXDecreasingStrategy(X: Int) extends DecreasingStrategy { 32 | /** 33 | * 根据当前的迭代次数计算学习步长衰减的分母 34 | * 按照logX进行衰减 35 | * 36 | * @param iter 迭代次数 37 | * @return 学习步长衰减的分母 38 | */ 39 | def decrease(iter: Int): Double = { 40 | math.log(X - 1 + iter) / math.log(X) 41 | } 42 | } 43 | 44 | class ConstantDecreasingStrategy(stepSize: Double) extends DecreasingStrategy { 45 | /** 46 | * 不衰减 47 | * 48 | * @param iter 迭代次数 49 | * @return 常数学习率(不衰减学习率) 50 | */ 51 | def decrease(iter: Int): Double = { 52 | stepSize 53 | } 54 | } 55 | 56 | class sqrtDecreasingStrategy extends DecreasingStrategy { 57 | /** 58 | * 根据当前的迭代次数计算学习步长衰减的分母 59 | * 按照开方进行衰减,第5次迭代衰减为一半 60 | * 61 | * @param iter 迭代次数 62 | * @return 学习步长衰减的分母 63 | */ 64 | def decrease(iter: Int): Double = { 65 | Math.sqrt(iter) 66 | } 67 | } 68 | 69 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/optimization/Gradient.scala: -------------------------------------------------------------------------------- 1 | package 
io.github.qf6101.mfm.optimization 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.Coefficients 5 | 6 | 7 | /** 8 | * Created by qfeng on 15-3-11. 9 | */ 10 | trait Gradient extends Serializable { 11 | /** 12 | * Compute the gradient and loss given the features of a single data point. 13 | * 14 | * @param data features for one data point 15 | * @param label label for this data point 16 | * @param coeffs weights/coefficients corresponding to features 17 | * @return (gradient: Coefficients, loss: Double) 18 | */ 19 | def compute(data: SparseVector[Double], 20 | label: Double, 21 | coeffs: Coefficients): (Coefficients, Double) = { 22 | val gradient = coeffs.copyEmpty() 23 | val loss = compute(data, label, coeffs, gradient) 24 | (gradient, loss) 25 | } 26 | 27 | /** 28 | * Compute the gradient and loss given the features of a single data point, 29 | * add the gradient to a provided vector to avoid creating new objects, and return loss. 30 | * 31 | * @param data features for one data point 32 | * @param label label for this data point 33 | * @param coeffs weights/coefficients corresponding to features 34 | * @param cumGradient the computed gradient will be added to this vector 35 | * @return loss 36 | */ 37 | def compute(data: SparseVector[Double], 38 | label: Double, 39 | coeffs: Coefficients, 40 | cumGradient: Coefficients): Double 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/optimization/GradientDescent.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.optimization 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.Coefficients 5 | import io.github.qf6101.mfm.util.Logging 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.ml.param.ParamMap 8 | import org.apache.spark.rdd.RDD 9 | 10 | import scala.collection.mutable.ArrayBuffer 11 | 12 | /** 13 | * Created by qfeng on 15-3-11. 
14 | */ 15 | 16 | /** 17 | * 随机梯度下降器 18 | * 19 | * @param gradient 梯度逻辑 20 | * @param updater 更新逻辑 21 | * @param params 参数池 22 | */ 23 | class GradientDescent(private var gradient: Gradient, private var updater: Updater, private var params: ParamMap) 24 | extends Optimizer with SGDParam with Logging { 25 | 26 | /** 27 | * 最优化函数 28 | * 29 | * @param data 样本数据集 30 | * @param initialCoeffs 初始化系数值 31 | * @param regParam 正则参数值 32 | * @return 学习后的参数 33 | */ 34 | override def optimize(data: RDD[(Double, SparseVector[Double])], 35 | initialCoeffs: Coefficients, 36 | regParam: Array[Double]): Coefficients = { 37 | val (coeffs, _) = optimizeWithHistory(data, initialCoeffs, regParam) 38 | coeffs 39 | } 40 | 41 | /** 42 | * 最优化函数 43 | * 44 | * @param data 样本数据集 45 | * @param initialCoeffs 初始化系数值 46 | * @param regParam 正则参数值 47 | * @return 学习后的参数 48 | */ 49 | def optimizeWithHistory(data: RDD[(Double, SparseVector[Double])], 50 | initialCoeffs: Coefficients, 51 | regParam: Array[Double]): (Coefficients, Array[Double]) = { 52 | //获取参数 53 | val numIterationsValue = params(numIterations) 54 | val miniBatchFractionValue = params(miniBatchFraction) 55 | val stepSizeValue = params(stepSize) 56 | //初始化系数、正则值 57 | var coeffs = initialCoeffs.copy 58 | var regVal = updater.compute(coeffs, coeffs.copyEmpty(), 0, 1, regParam)._2 59 | val lossHistory = new ArrayBuffer[Double](numIterationsValue) 60 | //初始化临时变量:迭代次数、是否收敛、上次损失值 61 | var i = 0 62 | var reachStopCondition = false 63 | //开始迭代训练 64 | while (!reachStopCondition && i < numIterationsValue) { 65 | i += 1 66 | val bcCoeffs = SparkContext.getOrCreate.broadcast(coeffs) 67 | val (gradientSum, lossSum, miniBatchSize) = data.sample(withReplacement = false, miniBatchFractionValue, 42 + i) 68 | .treeAggregate(initialCoeffs.copyEmpty(), 0.0, 0L)( 69 | seqOp = (c, v) => { 70 | // c: (grad, loss, count), v: (label, features) 71 | val l = gradient.compute(v._2, v._1, bcCoeffs.value, c._1) 72 | (c._1, c._2 + l, c._3 + 1) 73 | }, 74 | combOp = (c1, c2) => { 75 | // c: (grad, loss, count) 76 | (c1._1 += c2._1, c1._2 + c2._2, c1._3 + c2._3) 77 | }) 78 | 79 | if (miniBatchSize > 0) { 80 | //计算损失值、新的系数、正则值 81 | lossHistory.append(lossSum / miniBatchSize + regVal) 82 | val update = updater.compute(coeffs, gradientSum / miniBatchSize.toDouble, stepSizeValue, i, regParam) 83 | //判断是否达到收敛条件 84 | val (converged, solutionDiff) = isConverged(update._1, coeffs) 85 | reachStopCondition = converged 86 | //更新系数和正则值 87 | coeffs = update._1 88 | regVal = update._2 89 | //打印调试信息:损失值 90 | logInfo(s"Iteration ($i/$numIterationsValue) loss: ${lossSum / miniBatchSize} and $regVal, solutionDiff: $solutionDiff") 91 | } else { 92 | logWarning(s"Iteration ($i/$numIterationsValue}). 
The size of sampled batch is zero") 93 | } 94 | } 95 | (coeffs, lossHistory.toArray) 96 | } 97 | 98 | /** 99 | * 判断是否达到收敛条件 100 | * 101 | * @param newCoeffs 更新后的系数 102 | * @param oldCoeffs 更新前的系数 103 | * @return 是否达到收敛条件 104 | */ 105 | private def isConverged(newCoeffs: Coefficients, oldCoeffs: Coefficients): (Boolean, Double) = { 106 | val solutionDiff = newCoeffs.normDiff(oldCoeffs) 107 | (solutionDiff < params(convergenceTol) * Math.max(newCoeffs.norm, 1.0), solutionDiff) 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/optimization/LBFGS.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.optimization 2 | 3 | import breeze.linalg.SparseVector 4 | import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS} 5 | import io.github.qf6101.mfm.baseframe.Coefficients 6 | import io.github.qf6101.mfm.logisticregression.VectorCoefficients 7 | import io.github.qf6101.mfm.util.Logging 8 | import org.apache.spark.ml.param.ParamMap 9 | import org.apache.spark.rdd.RDD 10 | 11 | import scala.collection.mutable 12 | import scala.language.implicitConversions 13 | 14 | /** 15 | * Created by qfeng on 15-4-7. 16 | */ 17 | 18 | /** 19 | * LBFGS优化器 20 | * 21 | * @param gradient 梯度逻辑 22 | * @param updater 更新逻辑 23 | * @param params 参数池 24 | */ 25 | class LBFGS(private var gradient: Gradient, private var updater: Updater, private var params: ParamMap) extends 26 | Optimizer with LBFGSParam with Logging { 27 | 28 | /** 29 | * 最优化函数 30 | * 31 | * @param data 样本数据集 32 | * @param initialCoeffs 初始化系数值 33 | * @param reg 正则参数值 34 | * @return 学习后的参数 35 | */ 36 | override def optimize(data: RDD[(Double, SparseVector[Double])], 37 | initialCoeffs: Coefficients, 38 | reg: Array[Double]): 39 | Coefficients = { 40 | val (coeffs, _) = optimizeWithHistory(data, initialCoeffs, reg) 41 | coeffs 42 | } 43 | 44 | /** 45 | * 最优化函数 46 | * 47 | * @param data 样本数据集 48 | * @param initialCoeffs 初始化系数值 49 | * @param reg 正则参数值 50 | * @return 学习后的参数 51 | */ 52 | def optimizeWithHistory(data: RDD[(Double, SparseVector[Double])], 53 | initialCoeffs: Coefficients, 54 | reg: Array[Double], 55 | negativePenalty: Double = 1.0): 56 | (Coefficients, Array[Double]) = { 57 | //获取参数 58 | val numIterationsValue = params(numIterations) 59 | val numCorrectionsValue = params(numCorrections) 60 | val convergenceTolValue = params(convergenceTol) 61 | //初始化损失值数组、数据集大小 62 | val lossHistory = new mutable.ArrayBuffer[Double](numIterationsValue) 63 | val numExamples = data.count() 64 | //初始化系数、损失函数形式 65 | val vecInitialCoeffs = initialCoeffs.asInstanceOf[VectorCoefficients] 66 | val costFun = new CostFun(data, gradient, updater, reg, numExamples, negativePenalty) 67 | val lbfgs = new BreezeLBFGS[SparseVector[Double]](numIterationsValue, numCorrectionsValue, convergenceTolValue) 68 | //创建LBFGS状态序列 69 | val states = lbfgs.iterations(new CachedDiffFunction(costFun), VCToBSV(vecInitialCoeffs)) 70 | //执行迭代 71 | var i = 0 72 | var state = states.next() 73 | while (states.hasNext) { 74 | i += 1 75 | logDebug(s"Iteration ($i/$numIterationsValue) loss: ${state.value}") 76 | lossHistory.append(state.value) 77 | state = states.next() 78 | } 79 | lossHistory.append(state.value) 80 | //返回结果 81 | (state.x, lossHistory.toArray) 82 | } 83 | 84 | /** 85 | * 向量系数转成breeze的稀疏向量 86 | * 87 | * @param in 向量系数 88 | * @return breeze的稀疏向量 89 | */ 90 | implicit def VCToBSV(in: VectorCoefficients): 
SparseVector[Double] = { 91 | val out = SparseVector.zeros[Double](in.size + 1) 92 | out.update(0, in.w0) 93 | in.w.foreach { case (index, value) => 94 | out.update(index + 1, value) 95 | } 96 | out 97 | } 98 | 99 | /** 100 | * breeze的稀疏向量转成向量系数 101 | * 102 | * @param in breeze的稀疏向量 103 | * @return 向量系数 104 | */ 105 | implicit def BSVToVC(in: SparseVector[Double]): VectorCoefficients = { 106 | val w0 = in(0) 107 | val w = mutable.HashMap[Int, Double]() 108 | in.activeIterator.foreach { case (index, value) => 109 | if (index != 0) { 110 | w += (index - 1) -> value 111 | } 112 | } 113 | new VectorCoefficients(in.length - 1, w0, w) 114 | } 115 | 116 | /** 117 | * CostFun implements Breeze's DiffFunction[T], which returns the loss and gradient 118 | * at a particular point (weights). It's used in Breeze's convex optimization routines. 119 | */ 120 | private class CostFun(data: RDD[(Double, SparseVector[Double])], 121 | gradient: Gradient, 122 | updater: Updater, 123 | reg: Array[Double], 124 | numExamples: Long, 125 | negativePenalty: Double) extends DiffFunction[SparseVector[Double]] with Serializable { 126 | 127 | override def calculate(weights: SparseVector[Double]): (Double, SparseVector[Double]) = { 128 | // Have a local copy to avoid the serialization of CostFun object which is not serializable. 129 | val w = weights.copy 130 | val n = weights.length 131 | val bcW = data.context.broadcast(w) 132 | val localGradient = gradient 133 | 134 | val (gradientSum, lossSum) = data.treeAggregate((new VectorCoefficients(n - 1), 0.0))( 135 | seqOp = (c, v) => (c, v) match { 136 | case ((grad, loss), (label, features)) => 137 | val l = localGradient.compute(features, label, bcW.value, grad) 138 | (grad, loss + l) 139 | }, 140 | combOp = (c1, c2) => (c1, c2) match { 141 | case ((grad1, loss1), (grad2, loss2)) => 142 | grad1 += grad2 143 | (grad1, loss1 + loss2) 144 | }) 145 | 146 | /** 147 | * regVal is sum of weight squares if it's L2 updater; 148 | * for other updater, the same logic is followed. 149 | */ 150 | val regVal = updater.compute(w, new VectorCoefficients(n - 1), 0, 1, reg)._2 151 | val outputLoss = lossSum / numExamples + regVal 152 | /** 153 | * It will return the gradient part of regularization using updater. 154 | * 155 | * Given the input parameters, the updater basically does the following, 156 | * 157 | * w' = w - thisIterStepSize * (gradient + regGradient(w)) 158 | * Note that regGradient is function of w 159 | * 160 | * If we set gradient = 0, thisIterStepSize = 1, then 161 | * 162 | * regGradient(w) = w - w' 163 | * 164 | * TODO: We need to clean it up by separating the logic of regularization out 165 | * from updater to regularizer. 166 | */ 167 | // The following gradientTotal is actually the regularization part of gradient. 168 | // Will add the gradientSum computed from the data with weights in the next step. 169 | val gradientTotal = BSVToVC(w) 170 | gradientTotal -= updater.compute(w, new VectorCoefficients(n - 1), 1, 1, reg)._1.asInstanceOf[VectorCoefficients] 171 | gradientTotal += gradientSum * (1.0 / numExamples) 172 | 173 | (outputLoss, gradientTotal) 174 | } 175 | } 176 | 177 | } 178 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/optimization/LBFGSParam.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.optimization 2 | 3 | import org.apache.spark.ml.param.{Param, ParamValidators} 4 | 5 | /** 6 | * Created by qfeng on 15-4-7. 
7 | */ 8 | 9 | /** 10 | * LGBGS的参数 11 | */ 12 | trait LBFGSParam extends Serializable { 13 | //default value: 10 14 | val numCorrections: Param[Int] = new Param("LBFGSParam", "numCorrections", "number of corrections used in the LBFGS " + 15 | "update", ParamValidators.gt(0)) 16 | //default value:1E-4 17 | val convergenceTol: Param[Double] = new Param("LBFGSParam", "convergenceTol", "convergence tolerance of iterations for LBFGS", 18 | ParamValidators.gt(0)) 19 | val numIterations: Param[Int] = new Param("LBFGSParam", "numIterations", "number of iterations that SGD should be run", 20 | ParamValidators.gt(0)) 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/optimization/Optimizer.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.optimization 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.Coefficients 5 | import org.apache.spark.rdd.RDD 6 | 7 | /** 8 | * Created by qfeng on 15-3-11. 9 | */ 10 | 11 | /** 12 | * 优化器接口,实现包括SGD, LBFGS等 13 | */ 14 | trait Optimizer extends Serializable { 15 | /** 16 | * 最优化函数 17 | * 18 | * @param data 样本数据集 19 | * @param initialCoeffs 初始化系数值 20 | * @param regParam 正则参数值 21 | * @return 学习后的参数 22 | */ 23 | def optimize(data: RDD[(Double, SparseVector[Double])], 24 | initialCoeffs: Coefficients, 25 | regParam: Array[Double]): Coefficients 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/optimization/SGDParam.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.optimization 2 | 3 | import org.apache.spark.ml.param.{Param, ParamValidators} 4 | 5 | 6 | /** 7 | * Created by qfeng on 15-3-18. 8 | */ 9 | 10 | 11 | /** 12 | * SGD的参数 13 | */ 14 | trait SGDParam extends Serializable { 15 | //default value: 1.0 16 | val stepSize: Param[Double] = new Param("SGDParam", "stepSize", "initial step size for the first step", 17 | ParamValidators.gt(0)) 18 | val numIterations: Param[Int] = new Param("SGDParam", "numIterations", "number of iterations that SGD should be run", 19 | ParamValidators.gt(0)) 20 | //default value: 1.0 21 | val miniBatchFraction: Param[Double] = new Param("SGDParam", "miniBatchFraction", "fraction of the input data set " + 22 | "that should be used for one iteration of SGD", ParamValidators.inRange(0, 1, false, true)) 23 | //default value:1E-4 24 | val convergenceTol: Param[Double] = new Param("SGDParam", "convergenceTol", "convergence tolerance of iterations for SGD", 25 | ParamValidators.gt(0)) 26 | } 27 | 28 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/optimization/Updater.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.optimization 2 | 3 | import io.github.qf6101.mfm.baseframe.Coefficients 4 | import io.github.qf6101.mfm.util.Logging 5 | 6 | /** 7 | * Created by qfeng on 15-3-11. 8 | */ 9 | abstract class Updater(private val decreasingStrategy: DecreasingStrategy) 10 | extends Logging with Serializable { 11 | /** 12 | * Compute an updated value for weights given the gradient, stepSize, iteration number and 13 | * regularization parameter. Also returns the regularization value regParam * R(w) 14 | * computed using the *updated* weights. 
15 | * 16 | * @param coeffOld - Old coefficients. 17 | * @param gradient - Average batch gradient. 18 | * @param stepSize - step size across iterations 19 | * @param iter - Iteration number 20 | * @param regParam - Regularization parameter 21 | * @return A tuple of 2 elements. The first element is a coefficient structure containing updated weights, 22 | * and the second element is the regularization value computed using updated weights. 23 | */ 24 | def compute(coeffOld: Coefficients, 25 | gradient: Coefficients, 26 | stepSize: Double, 27 | iter: Int, 28 | regParam: Array[Double]): (Coefficients, Double) 29 | } 30 | 31 | /** 32 | * A simple updater for gradient descent *without* any regularization. 33 | * Uses a step-size decreasing with the square root of the number of iterations. 34 | */ 35 | class SimpleUpdater(private val decreasingStrategy: DecreasingStrategy = new Log10DecreasingStrategy()) 36 | extends Updater(decreasingStrategy) { 37 | /** 38 | * Compute an updated value for weights given the gradient, stepSize, iteration number and 39 | * regularization parameter. Also returns the regularization value regParam * R(w) 40 | * computed using the *updated* weights. 41 | * 42 | * @param coeffOld - Old coefficients. 43 | * @param gradient - Average batch gradient. 44 | * @param stepSize - step size across iterations 45 | * @param iter - Iteration number 46 | * @param regParam - Regularization parameter 47 | * @return A tuple of 2 elements. The first element is a coefficient structure containing updated weights, 48 | * and the second element is the regularization value computed using updated weights. 49 | */ 50 | override def compute(coeffOld: Coefficients, 51 | gradient: Coefficients, 52 | stepSize: Double, 53 | iter: Int, 54 | regParam: Array[Double]): (Coefficients, Double) = { 55 | val thisIterStepSize = stepSize / decreasingStrategy.decrease(iter) 56 | val coeffNew = coeffOld + gradient * (-thisIterStepSize) 57 | (coeffNew, 0.0) 58 | } 59 | } 60 | 61 | /** 62 | * Updater for L2 regularized problems. 63 | * R(w) = 1/2 ||w||2 64 | * Uses a step-size decreasing with the square root of the number of iterations. 65 | **/ 66 | class SquaredL2Updater(private val decreasingStrategy: DecreasingStrategy = new Log10DecreasingStrategy()) 67 | extends Updater(decreasingStrategy) { 68 | /** 69 | * Compute an updated value for weights given the gradient, stepSize, iteration number and 70 | * regularization parameter. Also returns the regularization value regParam * R(w) 71 | * computed using the *updated* weights. 72 | * 73 | * @param coeffOld - Old coefficients. 74 | * @param gradient - Average batch gradient. 75 | * @param stepSize - step size across iterations 76 | * @param iter - Iteration number 77 | * @param regParam - Regularization parameter 78 | * @return A tuple of 2 elements. The first element is a coefficient structure containing updated weights, 79 | * and the second element is the regularization value computed using updated weights. 
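A scalar stand-in for what SimpleUpdater.compute above does: a plain gradient step whose size shrinks with the iteration count, returned together with a regularization value of 0.0. The decrease function here is an assumed placeholder; the project's actual DecreasingStrategy implementations (e.g. Log10DecreasingStrategy) live in DecreasingStrategy.scala, which is not shown in this listing.

object SimpleUpdaterStepSketch {
  def main(args: Array[String]): Unit = {
    def decrease(iter: Int): Double = math.sqrt(iter)  // assumed stand-in for a DecreasingStrategy
    // Returns (updated weight, regularization value); no regularizer, so the second element is 0.0.
    def step(w: Double, grad: Double, stepSize: Double, iter: Int): (Double, Double) = {
      val thisIterStepSize = stepSize / decrease(iter)
      (w - thisIterStepSize * grad, 0.0)
    }
    println(step(w = 1.0, grad = 0.4, stepSize = 0.5, iter = 4))  // ~ (0.9, 0.0)
  }
}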
80 | */ 81 | override def compute(coeffOld: Coefficients, 82 | gradient: Coefficients, 83 | stepSize: Double, 84 | iter: Int, 85 | regParam: Array[Double]): (Coefficients, Double) = { 86 | // add up both updates from the gradient of the loss (= step) as well as 87 | // the gradient of the regularizer (= regParam * weightsOld) 88 | // w' = w - thisIterStepSize * (gradient + regParam * w) 89 | val thisIterStepSize = stepSize / decreasingStrategy.decrease(iter) 90 | logInfo("step size: " + thisIterStepSize) 91 | val coeffNew = coeffOld + ((gradient + coeffOld.L2RegGradient(regParam)) * (-thisIterStepSize)) 92 | (coeffNew, coeffNew.L2RegValue(regParam)) 93 | } 94 | } 95 | 96 | /** 97 | * :: DeveloperApi :: 98 | * Updater for L1 regularized problems. 99 | * R(w) = ||w||_1 100 | * Uses a step-size decreasing with the square root of the number of iterations. 101 | * 102 | * Instead of subgradient of the regularizer, the proximal operator for the 103 | * L1 regularization is applied after the gradient step. This is known to 104 | * result in better sparsity of the intermediate solution. 105 | * 106 | * The corresponding proximal operator for the L1 norm is the soft-thresholding 107 | * function. That is, each weight component is shrunk towards 0 by shrinkageVal. 108 | * 109 | * If w > shrinkageVal, set weight component to w-shrinkageVal. 110 | * If w < -shrinkageVal, set weight component to w+shrinkageVal. 111 | * If -shrinkageVal < w < shrinkageVal, set weight component to 0. 112 | * 113 | * Equivalently, set weight component to signum(w) * max(0.0, abs(w) - shrinkageVal) 114 | */ 115 | class L1Updater(private val decreasingStrategy: DecreasingStrategy = new Log10DecreasingStrategy()) 116 | extends Updater(decreasingStrategy) { 117 | /** 118 | * Compute an updated value for weights given the gradient, stepSize, iteration number and 119 | * regularization parameter. Also returns the regularization value regParam * R(w) 120 | * computed using the *updated* weights. 121 | * 122 | * @param coeffOld - Old coefficients. 123 | * @param gradient - Average batch gradient. 124 | * @param stepSize - step size across iterations 125 | * @param iter - Iteration number 126 | * @param regParam - Regularization parameter 127 | * @return A tuple of 2 elements. The first element is a coefficient structure containing updated weights, 128 | * and the second element is the regularization value computed using updated weights. 
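A standalone scalar sketch of the soft-thresholding (proximal) operator described in the L1Updater comment above; shrinkageVal plays the role of regParam * thisIterStepSize, and the values are chosen only for illustration.

object SoftThresholdingSketch {
  def main(args: Array[String]): Unit = {
    // signum(w) * max(0, |w| - shrinkageVal): shrink towards zero, clip to zero inside the band.
    def softThreshold(w: Double, shrinkageVal: Double): Double =
      math.signum(w) * math.max(0.0, math.abs(w) - shrinkageVal)

    val shrinkageVal = 0.125
    println(softThreshold(0.375, shrinkageVal))   //  0.25
    println(softThreshold(0.0625, shrinkageVal))  //  0.0  (inside the band, zeroed out -> sparsity)
    println(softThreshold(-0.625, shrinkageVal))  // -0.5
  }
}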
129 | */ 130 | override def compute(coeffOld: Coefficients, 131 | gradient: Coefficients, 132 | stepSize: Double, 133 | iter: Int, 134 | regParam: Array[Double]): (Coefficients, Double) = { 135 | val thisIterStepSize = stepSize / decreasingStrategy.decrease(iter) 136 | val coeffNew = coeffOld + (gradient * (-thisIterStepSize)) 137 | // Apply proximal operator (soft thresholding) 138 | coeffNew.L1Shrink(regParam, thisIterStepSize) 139 | (coeffNew, coeffNew.L1RegValue(regParam)) 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/tuning/BinCrossValidation.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.tuning 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.baseframe.binomial.{BinLearner, BinModel} 5 | import io.github.qf6101.mfm.util.{Logging, ParamUtil} 6 | import org.apache.spark.ml.param.{Param, ParamMap} 7 | import org.apache.spark.mllib.util.MLUtils 8 | import org.apache.spark.rdd.RDD 9 | 10 | import scala.util.Random 11 | 12 | /** 13 | * User: qfeng 14 | * Date: 15-8-24 上午10:25 15 | * Usage: 快速模型选择,固定其他参数,尝试某一参数的各个值,取最大AUC的参数值 16 | */ 17 | 18 | /** 19 | * 二分类交叉检验 20 | * 21 | * @param learner 二分类学习器 22 | * @param paramGridBuilder 候选参数构造器 23 | * @param numFolds 交叉份数 24 | * @param baseParamMinAUC 基准AUC 25 | */ 26 | class BinCrossValidation(val learner: BinLearner, 27 | val paramGridBuilder: BinParamGridBuilder, 28 | val numFolds: Int = 5, 29 | val baseParamMinAUC: Double = 0.0) extends Logging with Serializable { 30 | 31 | /** 32 | * 分类问题的模型选择 33 | * 34 | * @param dataset 数据集 35 | * @return 训练得到的模型及其评估值 36 | */ 37 | def selectParamsForClassif(dataset: RDD[(Double, SparseVector[Double])]): (BinModel, BinaryClassificationMetrics) 38 | = { 39 | //选择得到的参数集合,即返回值 40 | val selectedParamMap = new ParamMap 41 | //数据分块,用于交叉验证 42 | val splits = MLUtils.kFold(dataset, numFolds, Random.nextInt()) 43 | //随机选择一组参数作为基准参数 44 | val baseParamMap = selectBaseParams(splits(0)._1, baseParamMinAUC) 45 | //对于每个参数,都尝试它的每个参数值,选择AUC最大的那个作为最终的参数值(其他参数采用基准参数值) 46 | paramGridBuilder.paramGrid.foreach { case (param, paramValues) => 47 | //每个参数值都对应数组中的一个元素 48 | val AUCs = new Array[Double](paramValues.size) 49 | val models = new Array[BinModel](paramValues.size) 50 | val candidateParamValues = new Array[Any](paramValues.size) 51 | 52 | //对于每个参数值,都基于交叉检验训练模型,计算AUC均值 53 | paramValues.zipWithIndex.foreach { case (paramValue, paramValueIndex) => 54 | //组装出一组参数,用于训练模型 55 | val paramMap = baseParamMap.copy.put(param.asInstanceOf[Param[Any]], paramValue) 56 | candidateParamValues(paramValueIndex) = paramValue 57 | learner.updateParams(paramMap) 58 | //采用交叉检验训练模型计算AUC值 59 | splits.zipWithIndex.foreach { case ((training, testing), splitIndex) => 60 | models(paramValueIndex) = learner.train(training) 61 | val validating = testing.map { case (label, features) => 62 | (models(paramValueIndex).predict(features), label) 63 | } 64 | val metrics = new BinaryClassificationMetrics(validating) 65 | val AUC = metrics.AUC 66 | AUCs(paramValueIndex) += AUC 67 | logInfo(s"split $splitIndex >>>>> AUC: ${metrics.AUC}") 68 | } 69 | //计算AUC均值 70 | AUCs(paramValueIndex) /= splits.length 71 | logInfo(s"selected parameters: ${ParamUtil.paramsToString(paramMap)}; >>>>> AUC: ${AUCs(paramValueIndex).formatted("%1.4f")}") 72 | } 73 | //挑选出AUC最大的参数值 74 | val (_, bestIndex) = AUCs.zipWithIndex.maxBy(_._1) 75 | selectedParamMap.put(param.asInstanceOf[Param[Any]], 
candidateParamValues(bestIndex)) 76 | } 77 | //使用挑选出的那组参数值,基于整个数据集训练模型,并计算评估值 78 | learner.updateParams(selectedParamMap) 79 | val fullModel = learner.train(dataset) 80 | val fullValidating = dataset.map { case (label, features) => 81 | (fullModel.predict(features), label) 82 | } 83 | val fullMetrics = new BinaryClassificationMetrics(fullValidating) 84 | (fullModel, fullMetrics) 85 | } 86 | 87 | /** 88 | * 随机选择一组参数作为基准参数 89 | * 90 | * @param dataset 数据集 91 | * @param baseParamMinAUC 基准参数的AUC阈值(基准参数描述的模型AUC不能小于等于该阈值) 92 | * @return 基准参数 93 | */ 94 | private def selectBaseParams(dataset: RDD[(Double, SparseVector[Double])], 95 | baseParamMinAUC: Double): ParamMap = { 96 | var selected = false 97 | var baseParamMap: ParamMap = null 98 | var tryTime: Int = 0 99 | 100 | while (!selected) { 101 | //尝试5次,如果AUC都是0则抛出异常 102 | tryTime = tryTime + 1 103 | if (tryTime > 5) { 104 | throw new Exception("try time exceeds 5 for base parameters selection.") 105 | } 106 | //随机选择一组参数,并计算AUC值 107 | baseParamMap = paramGridBuilder.sampleParams() 108 | learner.updateParams(baseParamMap) 109 | val model = learner.train(dataset) 110 | val validating = dataset.map { case (label, features) => 111 | (model.predict(features), label) 112 | } 113 | val metrics = new BinaryClassificationMetrics(validating) 114 | //AUC值大于阈值,则返回 115 | if (metrics.AUC > baseParamMinAUC) { 116 | selected = true 117 | } 118 | } 119 | baseParamMap 120 | } 121 | 122 | } 123 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/tuning/BinParamGridBuilder.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.tuning 2 | 3 | import org.apache.spark.ml.param._ 4 | 5 | import scala.collection.mutable 6 | import scala.util.Random 7 | 8 | /** 9 | * User: qfeng 10 | * Date: 15-8-24 上午10:32 11 | * Usage: 快速模型选择时,参数的构建工具类 12 | */ 13 | 14 | /** 15 | * 快速模型选择时,参数的构建工具类(针对二分模型) 16 | */ 17 | class BinParamGridBuilder extends Serializable { 18 | val paramGrid = mutable.Map.empty[Param[_], Iterable[_]] 19 | 20 | def copy(): BinParamGridBuilder = { 21 | val result = new BinParamGridBuilder 22 | result.paramGrid ++= paramGrid 23 | result 24 | } 25 | 26 | /** 27 | * 对于参数集合,随机选择一组参数值 28 | * 29 | * @return 参数及对应参数值集合 30 | */ 31 | def sampleParams(): ParamMap = { 32 | val paramMap = new ParamMap 33 | paramGrid.foreach { case (param, values) => 34 | val valueList = values.toList 35 | val value = valueList(Random.nextInt(valueList.length)) 36 | paramMap.put(param.asInstanceOf[Param[Any]], value) 37 | } 38 | paramMap 39 | } 40 | 41 | /** 42 | * Adds a double param with multiple values. 43 | */ 44 | def addGrid(param: DoubleParam, values: Array[Double]): this.type = { 45 | addGrid[Double](param, values) 46 | } 47 | 48 | // specialized versions of addGrid for Java. 49 | 50 | /** 51 | * Adds a int param with multiple values. 52 | */ 53 | def addGrid(param: IntParam, values: Array[Int]): this.type = { 54 | addGrid[Int](param, values) 55 | } 56 | 57 | /** 58 | * Adds a float param with multiple values. 59 | */ 60 | def addGrid(param: FloatParam, values: Array[Float]): this.type = { 61 | addGrid[Float](param, values) 62 | } 63 | 64 | /** 65 | * Adds a param with multiple values (overwrites if the input param exists). 66 | */ 67 | def addGrid[T](param: Param[T], values: Iterable[T]): this.type = { 68 | paramGrid.put(param, values) 69 | this 70 | } 71 | 72 | /** 73 | * Adds a long param with multiple values. 
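A sketch of how BinParamGridBuilder and BinCrossValidation above fit together, using the binomial FM learner from this repository. It assumes an active SparkSession, that FmLearnSGD is the BinLearner being tuned, and that the remaining FM hyper-parameters are filled into the ParamMap as in FmSuite further below; the grid values are illustrative.

import io.github.qf6101.mfm.factorization.binomial.FmLearnSGD
import io.github.qf6101.mfm.optimization.SquaredL2Updater
import io.github.qf6101.mfm.tuning.{BinCrossValidation, BinParamGridBuilder}
import io.github.qf6101.mfm.util.LoadDSUtil
import org.apache.spark.ml.param.ParamMap

object BinCrossValidationSketch {
  def main(args: Array[String]): Unit = {
    // Same a1a training data the FmSuite test uses.
    val (training, numFeatures) = LoadDSUtil.loadLibSVMDataSet("test_data/input/a1a/a1a")
    val params = new ParamMap()
    val fmLearn = new FmLearnSGD(params, new SquaredL2Updater())
    params.put(fmLearn.numFeatures, numFeatures)
    // ... remaining FM parameters set as in FmSuite ...
    // Candidate values for the parameters to be tuned.
    val grid = new BinParamGridBuilder()
      .addGrid(fmLearn.numFactors, Seq(2, 5, 8))
      .addGrid(fmLearn.reg1, Seq(0.0, 0.01, 0.1))
    // 5-fold search: each parameter is varied in turn against a sampled baseline,
    // and the value with the highest mean AUC is kept.
    val cv = new BinCrossValidation(fmLearn, grid, numFolds = 5)
    val (bestModel, metrics) = cv.selectParamsForClassif(training)
    println(metrics)
  }
}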
74 | */ 75 | def addGrid(param: LongParam, values: Array[Long]): this.type = { 76 | addGrid[Long](param, values) 77 | } 78 | 79 | /** 80 | * Adds a boolean param with true and false. 81 | */ 82 | def addGrid(param: BooleanParam): this.type = { 83 | addGrid[Boolean](param, Array(true, false)) 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/tuning/BinaryClassificationMetrics.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.tuning 2 | 3 | import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics => BCM} 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.storage.StorageLevel 6 | 7 | /** 8 | * User: qfeng 9 | * Date: 15-8-10 上午11:10 10 | * Usage: Binary classification evaluation, See https://en.wikipedia.org/wiki/Receiver_operating_characteristic 11 | */ 12 | 13 | /** 14 | * 二分类指标 15 | * 16 | * @param rawScoreAndLabels 预测值和标签 17 | * @param threshold 二分类阈值(默认0.5) 18 | */ 19 | class BinaryClassificationMetrics(private val rawScoreAndLabels: RDD[(Double, Double)], 20 | val threshold: Double = 0.5) extends Serializable { 21 | // 假如标签为1/-1,将其转换为1/0 22 | private val scoreAndLabels = rawScoreAndLabels.map { case (score, label) => 23 | if (label <= 0) (score, 0.0) else (score, 1.0) 24 | }.persist(StorageLevel.MEMORY_AND_DISK_SER) 25 | 26 | private val metrics = computeMetrics 27 | val accuracy = metrics._1 28 | val precisions = (metrics._2, metrics._3) 29 | val recalls = (metrics._4, metrics._5) 30 | val f1_scores = (metrics._6, metrics._7) 31 | val AUC = AUCValue 32 | private lazy val AUCValue = computeAUC(metrics._8) 33 | 34 | /** 35 | * 将各个度量指标转成字符串形式(保留4位小数) 36 | * 37 | * @return (AUC, accuracy, precisions, recalls, f1_scores) 38 | */ 39 | override def toString: String = { 40 | val result = new StringBuilder 41 | result.append("AUC: ") 42 | result.append("%1.4f".format(AUC)) 43 | result.append(", accuracy: ") 44 | result.append("%1.4f".format(accuracy)) 45 | result.append(", precisions: ") 46 | result.append(mkTupleString(precisions)) 47 | result.append(", recalls: ") 48 | result.append(mkTupleString(recalls)) 49 | result.append(", f1_scores: ") 50 | result.append(mkTupleString(f1_scores)) 51 | result.toString() 52 | } 53 | 54 | /** 55 | * double类型的元组转成字符串(转成4位小数) 56 | * 57 | * @param t 元组 58 | * @return 4位小数表示的字符串 59 | */ 60 | private def mkTupleString(t: (Double, Double)): String = { 61 | val result = new StringBuilder 62 | result.append("(") 63 | result.append("%1.4f".format(t._1)) 64 | result.append(", ") 65 | result.append("%1.4f".format(t._2)) 66 | result.append(")") 67 | result.toString() 68 | } 69 | 70 | private def computeAUC(numData: Int): Double = { 71 | var auc: Double = 0.0 72 | if (numData > 300000) { 73 | auc = new BCM(scoreAndLabels, 100000).areaUnderROC() 74 | } else { 75 | auc = new BCM(scoreAndLabels).areaUnderROC() 76 | } 77 | if (scoreAndLabels.getStorageLevel == StorageLevel.MEMORY_AND_DISK_SER) { 78 | scoreAndLabels.unpersist() 79 | } 80 | auc 81 | } 82 | 83 | /** 84 | * 计算各种衡量二分类模型的度量指标 85 | * 86 | * @return 指标依次为:(accuracy, positive precision, negative precision, positive recall, negative recall, positive f1_scores, negative f1_score) 87 | */ 88 | private def computeMetrics: (Double, Double, Double, Double, Double, Double, Double, Int) = { 89 | val sc = scoreAndLabels.context 90 | val totalAccum = sc.longAccumulator 91 | val testPositiveAccum = sc.longAccumulator 92 | val 
condPositiveAccum = sc.longAccumulator 93 | val truePositiveAccum = sc.longAccumulator 94 | val trueNegativeAccum = sc.longAccumulator 95 | 96 | scoreAndLabels.foreach { case (score, label) => 97 | totalAccum.add(1) 98 | if (score > threshold) { 99 | testPositiveAccum.add(1) 100 | } 101 | if (label == 1.0) { 102 | condPositiveAccum.add(1) 103 | } 104 | if (score >= threshold && label == 1.0) { 105 | truePositiveAccum.add(1) 106 | } 107 | if (score < threshold && label == 0.0) { 108 | trueNegativeAccum.add(1) 109 | } 110 | } 111 | 112 | val totalNum = totalAccum.value.toDouble 113 | val testPositiveNum = testPositiveAccum.value.toDouble 114 | val testNegativeNum = totalNum - testPositiveNum 115 | val condPositiveNum = condPositiveAccum.value.toDouble 116 | val condNegativeNum = totalNum - condPositiveNum 117 | val truePositiveNum = truePositiveAccum.value.toDouble 118 | val trueNegativeNum = trueNegativeAccum.value.toDouble 119 | 120 | //accuracy 121 | val ACC = (truePositiveNum + trueNegativeNum) / totalNum 122 | //positive predictive value (positive precision) 123 | val PPV = truePositiveNum / testPositiveNum 124 | //negative predictive value (negative precision) 125 | val NPV = trueNegativeNum / testNegativeNum 126 | //true positive rate (sensitivity, positive recall) 127 | val TPR = truePositiveNum / condPositiveNum 128 | //true negative rate (specificity, negative recall) 129 | val TNR = trueNegativeNum / condNegativeNum 130 | //positive f1 score 131 | val F1P = (2 * PPV * TPR) / (PPV + TPR) 132 | //negative f1 score 133 | val F1N = (2 * NPV * TNR) / (NPV + TNR) 134 | 135 | (ACC, PPV, NPV, TPR, TNR, F1P, F1N, totalNum.toInt) 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/tuning/RegressionMetrics.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.tuning 2 | 3 | import org.apache.spark.mllib.evaluation.{RegressionMetrics => RM} 4 | import org.apache.spark.rdd.RDD 5 | 6 | /** 7 | * User: qfeng 8 | * Date: 15-8-11 下午4:03 9 | */ 10 | 11 | /** 12 | * 回归模型指标 13 | * 14 | * @param scoreAndLabels 预测值和实际值 15 | */ 16 | class RegressionMetrics(val scoreAndLabels: RDD[(Double, Double)]) { 17 | private val rm = new RM(scoreAndLabels) 18 | 19 | /** 20 | * 将各个度量指标转成字符串形式 21 | * 22 | * @return MSE 23 | */ 24 | override def toString: String = { 25 | val result = new StringBuilder 26 | result.append("MSE: ") 27 | result.append(MSE) 28 | result.toString() 29 | } 30 | 31 | def MSE: Double = { 32 | rm.meanSquaredError 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/util/GaussianRandom.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import java.util.Random 4 | 5 | import breeze.linalg.{DenseMatrix, DenseVector} 6 | 7 | 8 | /** 9 | * Created by qfeng on 15-1-26. 
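A tiny hand-checkable version of the counting logic in BinaryClassificationMetrics.computeMetrics above, using plain Scala collections instead of RDDs and accumulators (it mirrors the original's mix of > and >= threshold comparisons):

object BinaryMetricsByHandSketch {
  def main(args: Array[String]): Unit = {
    // (score, label) pairs with labels already mapped to 1.0 / 0.0, threshold 0.5.
    val scoreAndLabels = Seq(
      (0.9, 1.0), (0.8, 1.0), (0.7, 0.0), (0.4, 1.0),
      (0.3, 0.0), (0.2, 0.0), (0.6, 1.0), (0.1, 0.0))
    val threshold = 0.5
    val total        = scoreAndLabels.size.toDouble                                                // 8
    val testPositive = scoreAndLabels.count { case (s, _) => s > threshold }.toDouble              // 4
    val condPositive = scoreAndLabels.count { case (_, l) => l == 1.0 }.toDouble                   // 4
    val truePositive = scoreAndLabels.count { case (s, l) => s >= threshold && l == 1.0 }.toDouble // 3
    val trueNegative = scoreAndLabels.count { case (s, l) => s < threshold && l == 0.0 }.toDouble  // 3
    val acc = (truePositive + trueNegative) / total  // accuracy: 6/8 = 0.75
    val ppv = truePositive / testPositive            // positive precision: 3/4
    val tpr = truePositive / condPositive            // positive recall: 3/4
    val f1p = 2 * ppv * tpr / (ppv + tpr)            // positive F1: 0.75
    println(f"ACC=$acc%1.4f PPV=$ppv%1.4f TPR=$tpr%1.4f F1+=$f1p%1.4f")
  }
}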
10 | */ 11 | 12 | /** 13 | * 高斯随机数生成器实例 14 | */ 15 | object GaussianRandom { 16 | /** 17 | * 生成高斯随机数密向量 18 | * 19 | * @param mean 高斯分布的均值 20 | * @param stdev 高斯分布的标准差 21 | * @param length 向量长度 22 | * @return 高斯随机数密向量 23 | */ 24 | def randDenseVector(mean: Double, stdev: Double, length: Int): DenseVector[Double] = { 25 | val results = DenseVector.zeros[Double](length) 26 | for (i <- 0 until length) { 27 | results.update(i, rand(mean, stdev)) 28 | } 29 | results 30 | } 31 | 32 | /** 33 | * 生成告诉随机数密矩阵 34 | * 35 | * @param mean 高斯分布的均值 36 | * @param stdev 高斯分布的标准差 37 | * @param numRows 矩阵行数 38 | * @param numCols 矩阵列数 39 | * @return 高斯随机数密矩阵 40 | */ 41 | def randDenseMatrix(mean: Double, stdev: Double, numRows: Int, numCols: Int): DenseMatrix[Double] = { 42 | val results = DenseMatrix.zeros[Double](numRows, numCols) 43 | for (i <- 0 until numRows) 44 | for (j <- 0 until numCols) 45 | results.update(i, j, rand(mean, stdev)) 46 | results 47 | } 48 | 49 | /** 50 | * 生成高斯随机数 51 | * 52 | * @param mean 高斯分布的均值 53 | * @param stdev 高斯分布的标准差 54 | * @return 高斯随机数 55 | */ 56 | def rand(mean: Double, stdev: Double): Double = { 57 | val random = new Random() 58 | val genValue = random.nextGaussian() 59 | mean + stdev * genValue 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/util/HDFSUtil.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import org.apache.hadoop.fs.{FileSystem, Path} 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.SparkSession 6 | 7 | /** 8 | * Created by qfeng on 16-3-3. 9 | */ 10 | 11 | /** 12 | * HDFS文件操作工具类 13 | */ 14 | object HDFSUtil { 15 | /** 16 | * 如果文件存在则删除它 17 | * 18 | * @param file 文件 19 | */ 20 | def deleteIfExists(file: String): Unit = { 21 | val spark = SparkSession.builder().getOrCreate() 22 | val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) 23 | if (fs.exists(new Path(file))) { 24 | fs.delete(new Path(file), true) 25 | } 26 | } 27 | 28 | /** 29 | * 文件是否存在 30 | * 31 | * @param file 文件 32 | */ 33 | def exists(file: String): Boolean = { 34 | val spark = SparkSession.builder().getOrCreate() 35 | val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) 36 | fs.exists(new Path(file)) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/util/LoadDSUtil.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import breeze.linalg.SparseVector 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.storage.StorageLevel 7 | 8 | /** 9 | * Created by qfeng on 15-3-18. 10 | */ 11 | object LoadDSUtil { 12 | // Convenient methods for `loadLibSVMFile`. 13 | 14 | /** 15 | * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint], with the default number of 16 | * partitions. 17 | */ 18 | def loadLibSVMDataSet(path: String, 19 | numFeatures: Int = -1): (RDD[(Double, SparseVector[Double])], Int) = { 20 | val sc = SparkContext.getOrCreate() 21 | val dataSet = sc.textFile(path, sc.defaultMinPartitions) 22 | toLibSVMDataSet(dataSet, numFeatures) 23 | } 24 | 25 | /** 26 | * Loads labeled data in the LIBSVM format into an RDD[LabeledPoint]. 27 | * The LIBSVM format is a text-based format used by LIBSVM and LIBLINEAR. 
28 | * Each line represents a labeled sparse feature vector using the following format: 29 | * {{{label index1:value1 index2:value2 ...}}} 30 | * where the indices are one-based and in ascending order. 31 | * This method parses each line into a [[org.apache.spark.mllib.regression.LabeledPoint]], 32 | * where the feature indices are converted to zero-based. 33 | * 34 | * @param dataSet 数据集 35 | * @param numFeatures number of features, which will be determined from the input data if a 36 | * nonpositive value is given. This is useful when the dataset is already split 37 | * into multiple files and you want to load them separately, because some 38 | * features may not present in certain files, which leads to inconsistent 39 | * feature dimensions. 40 | * @return labeled data stored as an RDD[LabeledPoint] 41 | */ 42 | def toLibSVMDataSet(dataSet: RDD[String], 43 | numFeatures: Int = -1): (RDD[(Double, SparseVector[Double])], Int) = { 44 | val parsed = dataSet.map(_.trim) 45 | .filter(line => !(line.isEmpty || line.startsWith("#"))) 46 | .map { line => 47 | val items = line.split(' ') 48 | val label = items.head.toDouble 49 | val (indices, values) = items.tail.filter(_.nonEmpty).map { item => 50 | val indexAndValue = item.split(':') 51 | val index = indexAndValue(0).toInt - 1 // Convert 1-based indices to 0-based. 52 | val value = indexAndValue(1).toDouble 53 | (index, value) 54 | }.unzip 55 | (label, indices, values) 56 | } 57 | 58 | // Determine number of features. 59 | val d = if (numFeatures > 0) { 60 | numFeatures 61 | } else { 62 | parsed.persist(StorageLevel.MEMORY_AND_DISK_SER) 63 | parsed.map { case (label, indices, values) => 64 | indices.lastOption.getOrElse(0) 65 | }.reduce(math.max) + 1 66 | } 67 | 68 | (parsed.map { case (label, indices, values) => 69 | (label, new SparseVector[Double](indices, values, d)) 70 | }, d) 71 | } 72 | 73 | } 74 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/util/Logging.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import org.slf4j.{Logger, LoggerFactory} 4 | 5 | /** 6 | * Created by qfeng on 16-8-30. 
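A standalone sketch of the per-line parsing that LoadDSUtil.toLibSVMDataSet above performs, on one made-up LIBSVM line with an assumed feature dimension of 8:

import breeze.linalg.SparseVector

object LibSVMLineSketch {
  def main(args: Array[String]): Unit = {
    // Label first, then one-based index:value pairs.
    val line = "1 3:0.5 7:1.2"
    val items = line.trim.split(' ')
    val label = items.head.toDouble
    val (indices, values) = items.tail.filter(_.nonEmpty).map { item =>
      val Array(i, v) = item.split(':')
      (i.toInt - 1, v.toDouble)  // convert 1-based indices to 0-based
    }.unzip
    // Becomes (1.0, vector with index 2 -> 0.5 and index 6 -> 1.2).
    val features = new SparseVector[Double](indices, values, 8)
    println((label, features))
  }
}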
7 | */ 8 | 9 | /** 10 | * 日志工具接口 11 | */ 12 | trait Logging { 13 | @transient private var log_ : Logger = null 14 | 15 | protected def logName = { 16 | this.getClass.getName.stripSuffix("$") 17 | } 18 | 19 | protected def log: Logger = { 20 | if (log_ == null) { 21 | log_ = LoggerFactory.getLogger(logName) 22 | } 23 | log_ 24 | } 25 | 26 | protected def logInfo(msg: => String) { 27 | if (log.isInfoEnabled) log.info(msg) 28 | } 29 | 30 | protected def logDebug(msg: => String) { 31 | if (log.isDebugEnabled) log.debug(msg) 32 | } 33 | 34 | protected def logTrace(msg: => String) { 35 | if (log.isTraceEnabled) log.trace(msg) 36 | } 37 | 38 | protected def logWarning(msg: => String) { 39 | if (log.isWarnEnabled) log.warn(msg) 40 | } 41 | 42 | protected def logError(msg: => String) { 43 | if (log.isErrorEnabled) log.error(msg) 44 | } 45 | 46 | protected def logInfo(msg: => String, throwable: Throwable) { 47 | if (log.isInfoEnabled) log.info(msg, throwable) 48 | } 49 | 50 | protected def logDebug(msg: => String, throwable: Throwable) { 51 | if (log.isDebugEnabled) log.debug(msg, throwable) 52 | } 53 | 54 | protected def logTrace(msg: => String, throwable: Throwable) { 55 | if (log.isTraceEnabled) log.trace(msg, throwable) 56 | } 57 | 58 | protected def logWarning(msg: => String, throwable: Throwable) { 59 | if (log.isWarnEnabled) log.warn(msg, throwable) 60 | } 61 | 62 | protected def logError(msg: => String, throwable: Throwable) { 63 | if (log.isErrorEnabled) log.error(msg, throwable) 64 | } 65 | 66 | protected def isTraceEnabled: Boolean = { 67 | log.isTraceEnabled 68 | } 69 | } -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/util/NumericParser.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import java.util.StringTokenizer 4 | 5 | import org.apache.spark.SparkException 6 | 7 | import scala.collection.mutable.{ArrayBuffer, ListBuffer} 8 | 9 | /** 10 | * Created by qfeng on 15-3-13. 11 | */ 12 | 13 | /** 14 | * Simple parser for a numeric structure consisting of three types: 15 | * 16 | * - number: a double in Java's floating number format 17 | * - array: an array of numbers stored as `[v0,v1,...,vn]` 18 | * - tuple: a list of numbers, arrays, or tuples stored as `(...)` 19 | */ 20 | object NumericParser { 21 | 22 | /** Parses a string into a Double, an Array[Double], or a Seq[Any]. 
*/ 23 | def parse(s: String): Any = { 24 | val tokenizer = new StringTokenizer(s, "()[],", true) 25 | if (tokenizer.hasMoreTokens()) { 26 | val token = tokenizer.nextToken() 27 | if (token == "(") { 28 | parseTuple(tokenizer) 29 | } else if (token == "[") { 30 | parseArray(tokenizer) 31 | } else { 32 | // expecting a number 33 | parseDouble(token) 34 | } 35 | } else { 36 | throw new SparkException(s"Cannot find any token from the input string.") 37 | } 38 | } 39 | 40 | private def parseArray(tokenizer: StringTokenizer): Array[Double] = { 41 | val values = ArrayBuffer.empty[Double] 42 | var parsing = true 43 | var allowComma = false 44 | var token: String = null 45 | while (parsing && tokenizer.hasMoreTokens()) { 46 | token = tokenizer.nextToken() 47 | if (token == "]") { 48 | parsing = false 49 | } else if (token == ",") { 50 | if (allowComma) { 51 | allowComma = false 52 | } else { 53 | throw new SparkException("Found a ',' at a wrong position.") 54 | } 55 | } else { 56 | // expecting a number 57 | values.append(parseDouble(token)) 58 | allowComma = true 59 | } 60 | } 61 | if (parsing) { 62 | throw new SparkException(s"An array must end with ']'.") 63 | } 64 | values.toArray 65 | } 66 | 67 | private def parseTuple(tokenizer: StringTokenizer): Seq[_] = { 68 | val items = ListBuffer.empty[Any] 69 | var parsing = true 70 | var allowComma = false 71 | var token: String = null 72 | while (parsing && tokenizer.hasMoreTokens()) { 73 | token = tokenizer.nextToken() 74 | if (token == "(") { 75 | items.append(parseTuple(tokenizer)) 76 | allowComma = true 77 | } else if (token == "[") { 78 | items.append(parseArray(tokenizer)) 79 | allowComma = true 80 | } else if (token == ",") { 81 | if (allowComma) { 82 | allowComma = false 83 | } else { 84 | throw new SparkException("Found a ',' at a wrong position.") 85 | } 86 | } else if (token == ")") { 87 | parsing = false 88 | } else { 89 | // expecting a number 90 | items.append(parseDouble(token)) 91 | allowComma = true 92 | } 93 | } 94 | if (parsing) { 95 | throw new SparkException(s"A tuple must end with ')'.") 96 | } 97 | items 98 | } 99 | 100 | private def parseDouble(s: String): Double = { 101 | try { 102 | java.lang.Double.parseDouble(s) 103 | } catch { 104 | case e: Throwable => 105 | throw new SparkException(s"Cannot parse a double from: $s", e) 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/util/ParamUtil.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import org.apache.spark.ml.param.ParamMap 4 | 5 | /** 6 | * Created by qfeng on 15-3-31. 
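A short usage sketch of NumericParser above, showing the three accepted forms (the literals are chosen only for illustration):

import io.github.qf6101.mfm.util.NumericParser

object NumericParserSketch {
  def main(args: Array[String]): Unit = {
    // A bare number parses to a Double.
    println(NumericParser.parse("1.5"))  // 1.5
    // Square brackets parse to an Array[Double].
    println(NumericParser.parse("[1.0,2.0,3.0]")
      .asInstanceOf[Array[Double]].mkString(","))  // 1.0,2.0,3.0
    // Parentheses parse to a Seq whose elements may be numbers, arrays or nested tuples.
    val tuple = NumericParser.parse("(1.0,[2.0,3.0])").asInstanceOf[Seq[Any]]
    println(tuple.head)  // 1.0
  }
}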
7 | */ 8 | 9 | /** 10 | * 参数工具类实例 11 | */ 12 | object ParamUtil { 13 | 14 | /** 15 | * 参数池转成字符串 16 | * 17 | * @param params 参数池 18 | * @return 字符串 19 | */ 20 | def paramsToString(params: ParamMap): String = { 21 | params.toSeq.map { paramPair => paramPair.value match { 22 | case v: Array[_] => s"${paramPair.param.name}:${v.mkString(",")}" 23 | case _ => s"${paramPair.param.name}:${paramPair.value}" 24 | } 25 | }.mkString(", ") 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/io/github/qf6101/mfm/util/VectorConverter.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import breeze.linalg.SparseVector 4 | import org.apache.spark.mllib.linalg.Vector 5 | 6 | /** 7 | * Created by qfeng on 15-3-17. 8 | */ 9 | object VectorConverter { 10 | /** 11 | * spark的向量转成breeze的稀疏向量 12 | * 13 | * @param input spark向量 14 | * @return breeze的稀疏向量 15 | */ 16 | def SparkVector2SV(input: Vector): SparseVector[Double] = { 17 | val result = SparseVector.zeros[Double](input.size) 18 | 19 | for (i <- 0 until input.size) { 20 | if (input(i) != 0.0) { 21 | result.update(i, input(i)) 22 | } 23 | } 24 | 25 | result 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootCategory=DEBUG, console, file 2 | # Set everything to be logged to the console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{100}: %m%n 7 | # Set everything to be logged to the file core/target/unit-tests.log 8 | log4j.appender.file=org.apache.log4j.FileAppender 9 | log4j.appender.file.append=false 10 | log4j.appender.file.file=target/unit-tests.log 11 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 12 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 13 | #ignore specific log information 14 | log4j.logger.org.eclipse.jetty=OFF 15 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=OFF 16 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=OFF 17 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=OFF 18 | log4j.logger.org.apache.spark=OFF 19 | log4j.logger.Remoting=OFF 20 | log4j.logger.org.spark-project.jetty=OFF 21 | log4j.logger.org.apache.hadoop=OFF 22 | log4j.logger.io.netty=OFF 23 | log4j.logger.akka=OFF 24 | log4j.logger.breeze=OFF 25 | log4j.logger.org.spark_project.jetty=OFF -------------------------------------------------------------------------------- /src/test/scala/io/github/qf6101/mfm/factorization/binomial/FmSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.binomial 2 | 3 | import io.github.qf6101.mfm.optimization.{LogXDecreasingStrategy, SquaredL2Updater} 4 | import io.github.qf6101.mfm.tuning.BinaryClassificationMetrics 5 | import io.github.qf6101.mfm.util.TestingUtils._ 6 | import io.github.qf6101.mfm.util.{HDFSUtil, LoadDSUtil, MfmTestSparkSession} 7 | import org.apache.spark.ml.param.ParamMap 8 | import org.scalatest.FunSuite 9 | 10 | /** 11 | * User: qfeng 12 | * Date: 15-12-8 下午4:58 13 | */ 14 | class FmSuite extends FunSuite with 
MfmTestSparkSession { 15 | test("test binomial factorization machines") { 16 | // Load training and testing data sets 17 | val (training, _) = LoadDSUtil.loadLibSVMDataSet("test_data/input/a1a/a1a") 18 | val (testing, numFeatures) = LoadDSUtil.loadLibSVMDataSet("test_data/input/a1a/a1a.t") 19 | // Construct factorization machines learner with parameters 20 | val params = new ParamMap() 21 | val updater = new SquaredL2Updater(decreasingStrategy = new LogXDecreasingStrategy(100)) 22 | val fmLearn = new FmLearnSGD(params, updater) 23 | params.put(fmLearn.gd.numIterations, 100) 24 | params.put(fmLearn.gd.stepSize, 0.1) 25 | params.put(fmLearn.gd.miniBatchFraction, 1.0) 26 | params.put(fmLearn.gd.convergenceTol, 1E-5) 27 | params.put(fmLearn.numFeatures, numFeatures) 28 | params.put(fmLearn.numFactors, 5) 29 | params.put(fmLearn.k0, false) 30 | params.put(fmLearn.k1, true) 31 | params.put(fmLearn.k2, false) 32 | params.put(fmLearn.maxInteractFeatures, numFeatures) 33 | params.put(fmLearn.initMean, 0.0) 34 | params.put(fmLearn.initStdev, 0.0001) 35 | params.put(fmLearn.reg0, 0.0) 36 | params.put(fmLearn.reg1, 0.0) 37 | params.put(fmLearn.reg2, 0.0) 38 | // Train FM model 39 | val model = fmLearn.train(training) 40 | // Use testing data set to evaluate the model 41 | val eval = testing.map { case (label, features) => 42 | (model.predict(features), label) 43 | } 44 | val metrics = new BinaryClassificationMetrics(eval) 45 | // Save model to file 46 | HDFSUtil.deleteIfExists("test_data/output/a1a") 47 | model.save("test_data/output/a1a") 48 | 49 | //// Firstly test spark reloading 50 | // Reload model from file and test if it is equal to the original model 51 | val sparkReloadModel = FmModel("test_data/output/a1a") 52 | assert(model.equals(sparkReloadModel)) 53 | // Evaluate the reloaded model 54 | val sparkReloadEval = testing.map { case (label, features) => 55 | (sparkReloadModel.predict(features), label) 56 | } 57 | // Test if the reloaded model has the same result on the testing data set 58 | val sparkReloadMetrics = new BinaryClassificationMetrics(sparkReloadEval) 59 | assert(sparkReloadMetrics.AUC ~= metrics.AUC absTol 1E-5) 60 | 61 | //// Secondly test local reloading 62 | // Reload model from file and test if it is equal to the original model 63 | val localReloadModel = FmModel.fromLocal("test_data/output/a1a") 64 | assert(model.equals(localReloadModel)) 65 | // Evaluate the reloaded model 66 | val localReloadEval = testing.map { case (label, features) => 67 | (localReloadModel.predict(features), label) 68 | } 69 | // Test if the reloaded model has the same result on the testing data set 70 | val localReloadMetrics = new BinaryClassificationMetrics(localReloadEval) 71 | assert(localReloadMetrics.AUC ~= metrics.AUC absTol 1E-5) 72 | // print the AUC 73 | println("AUC: " + metrics.AUC) 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/test/scala/io/github/qf6101/mfm/factorization/multinomial/MfmCoefficientsSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.multinomial 2 | 3 | import io.github.qf6101.mfm.util.MfmTestSparkSession 4 | import org.scalatest.FunSuite 5 | 6 | /** 7 | * Created by qfeng on 16-9-18. 
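A sketch of scoring a single example with the binomial model that the FmSuite test above saves. It assumes the saved model exists under test_data/output/a1a; the active feature indices and the dimension 123 are illustrative assumptions, and the Spark-based FmModel(...) loader (as opposed to fromLocal) additionally needs an active SparkSession.

import breeze.linalg.SparseVector
import io.github.qf6101.mfm.factorization.binomial.FmModel

object ScoreOneExampleSketch {
  def main(args: Array[String]): Unit = {
    // Driver-local reload of the model written by FmSuite.
    val model = FmModel.fromLocal("test_data/output/a1a")
    // One sparse feature vector (indices/values made up for illustration).
    val features = new SparseVector[Double](Array(3, 15, 72), Array(1.0, 1.0, 1.0), 123)
    // For the binomial model, predict returns a single score for the positive class.
    val score = model.predict(features)
    println(s"predicted score = $score")
  }
}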
8 | */ 9 | class MfmCoefficientsSuite extends FunSuite with MfmTestSparkSession { 10 | test("test MfmCoefficients' += operation") { 11 | val left = new MfmCoefficients(0.0, 0.01, 1, 1, 1, false, true, false, 2) 12 | val right = new MfmCoefficients(0.0, 0.01, 1, 1, 1, false, true, false, 2) 13 | 14 | val leftSample = left.thetas(0).w(0) 15 | val rightSample = right.thetas(0).w(0) 16 | println(leftSample) 17 | println(rightSample) 18 | left += right 19 | println(left.thetas(0).w(0)) 20 | assert(left.thetas(0).w(0) == leftSample + rightSample) 21 | } 22 | 23 | test("test MfmCoefficients' + operation") { 24 | val left = new MfmCoefficients(0.0, 0.01, 1, 1, 1, false, true, false, 2) 25 | val right = new MfmCoefficients(0.0, 0.01, 1, 1, 1, false, true, false, 2) 26 | 27 | val leftSample = left.thetas(0).w(0) 28 | val rightSample = right.thetas(0).w(0) 29 | println(leftSample) 30 | println(rightSample) 31 | val sum = left + right 32 | println(sum.asInstanceOf[MfmCoefficients].thetas(0).w(0)) 33 | assert(sum.asInstanceOf[MfmCoefficients].thetas(0).w(0) == leftSample + rightSample) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/test/scala/io/github/qf6101/mfm/factorization/multinomial/MfmSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.factorization.multinomial 2 | 3 | import breeze.linalg.argmax 4 | import io.github.qf6101.mfm.optimization._ 5 | import io.github.qf6101.mfm.util.TestingUtils._ 6 | import io.github.qf6101.mfm.util.{HDFSUtil, LoadDSUtil, MfmTestSparkSession} 7 | import org.apache.spark.ml.param.ParamMap 8 | import org.apache.spark.mllib.evaluation.MulticlassMetrics 9 | import org.scalatest.FunSuite 10 | 11 | /** 12 | * User: qfeng 13 | * Date: 15-12-8 下午4:58 14 | */ 15 | class MfmSuite extends FunSuite with MfmTestSparkSession { 16 | test("test binomial factorization machines") { 17 | // Load training and testing data sets 18 | val (training, numFeatures) = LoadDSUtil.loadLibSVMDataSet("test_data/input/mnist/mnist.scale") 19 | val (testing, _) = LoadDSUtil.loadLibSVMDataSet("test_data/input/mnist/mnist.scale.t") 20 | // Construct multinomial factorization machines learner with parameters 21 | val params = new ParamMap() 22 | val updater = new SquaredL2Updater(decreasingStrategy = new LogXDecreasingStrategy(20)) 23 | val mfmLearn = new MfmLearnSGD(params, updater) 24 | params.put(mfmLearn.gd.numIterations, 10) 25 | params.put(mfmLearn.gd.stepSize, 0.1) 26 | params.put(mfmLearn.gd.miniBatchFraction, 1.0) 27 | params.put(mfmLearn.gd.convergenceTol, 1E-5) 28 | params.put(mfmLearn.numFeatures, numFeatures) 29 | params.put(mfmLearn.numFactors, 5) 30 | params.put(mfmLearn.k0, false) 31 | params.put(mfmLearn.k1, true) 32 | params.put(mfmLearn.k2, false) 33 | params.put(mfmLearn.maxInteractFeatures, numFeatures) 34 | params.put(mfmLearn.initMean, 0.0) 35 | params.put(mfmLearn.initStdev, 0.01) 36 | params.put(mfmLearn.reg0, 0.0001) 37 | params.put(mfmLearn.reg1, 0.0001) 38 | params.put(mfmLearn.reg2, 0.001) 39 | params.put(mfmLearn.numClasses, 10) 40 | // Train MFM model 41 | val model = mfmLearn.train(training) 42 | // Use testing data set to evaluate the model 43 | val eval = testing.map { case (label, features) => 44 | argmax(model.predict(features)).toDouble -> label 45 | } 46 | val metrics = new MulticlassMetrics(eval) 47 | // Save model to file 48 | HDFSUtil.deleteIfExists("test_data/output/mnist") 49 | model.save("test_data/output/mnist") 50 | 51 | //// 
Firstly test spark reloading 52 | // Reload model from file and test if it is equal to the original model 53 | val sparkReloadModel = MfmModel("test_data/output/mnist") 54 | assert(model.equals(sparkReloadModel)) 55 | // Evaluate the reloaded model 56 | val sparkReloadEval = testing.map { case (label, features) => 57 | argmax(sparkReloadModel.predict(features)).toDouble -> label 58 | } 59 | // Test if the reloaded model has the same result on the testing data set 60 | val sparkReloadMetrics = new MulticlassMetrics(sparkReloadEval) 61 | assert(sparkReloadMetrics.accuracy ~= metrics.accuracy absTol 1E-5) 62 | assert(sparkReloadMetrics.weightedPrecision ~= metrics.weightedPrecision absTol 1E-5) 63 | assert(sparkReloadMetrics.weightedRecall ~= metrics.weightedRecall absTol 1E-5) 64 | assert(sparkReloadMetrics.weightedFMeasure ~= metrics.weightedFMeasure absTol 1E-5) 65 | 66 | //// Secondly test local reloading 67 | // Reload model from file and test if it is equal to the original model 68 | val localReloadModel = MfmModel.fromLocal("test_data/output/mnist") 69 | assert(model.equals(localReloadModel)) 70 | // Evaluate the reloaded model 71 | val localReloadEval = testing.map { case (label, features) => 72 | argmax(localReloadModel.predict(features)).toDouble -> label 73 | } 74 | // Test if the reloaded model has the same result on the testing data set 75 | val localReloadMetrics = new MulticlassMetrics(localReloadEval) 76 | assert(localReloadMetrics.accuracy ~= metrics.accuracy absTol 1E-5) 77 | assert(localReloadMetrics.weightedPrecision ~= metrics.weightedPrecision absTol 1E-5) 78 | assert(localReloadMetrics.weightedRecall ~= metrics.weightedRecall absTol 1E-5) 79 | assert(localReloadMetrics.weightedFMeasure ~= metrics.weightedFMeasure absTol 1E-5) 80 | // print the metrics 81 | println("accuracy: " + metrics.accuracy) 82 | println("weighted precision: " + metrics.weightedPrecision) 83 | println("weighted recall: " + metrics.weightedRecall) 84 | println("weighted f-measure: " + metrics.weightedFMeasure) 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/test/scala/io/github/qf6101/mfm/optimization/GradientDescentSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.optimization 2 | 3 | import breeze.linalg.SparseVector 4 | import io.github.qf6101.mfm.logisticregression.{LogisticGradient, LrLearnSGD, VectorCoefficients} 5 | import io.github.qf6101.mfm.util.MfmTestSparkSession 6 | import io.github.qf6101.mfm.util.TestingUtils._ 7 | import org.apache.spark.ml.param.ParamMap 8 | import org.scalatest.{FunSuite, Matchers} 9 | 10 | import scala.collection.JavaConversions._ 11 | import scala.util.Random 12 | 13 | /** 14 | * Created by qfeng on 15-3-13. 
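The multinomial counterpart: unlike the binomial FmModel, whose predict returns a single score, MfmModel.predict returns one score per class and the predicted class is taken with argmax, exactly as the MfmSuite evaluation above does. The sketch assumes the model saved by that test exists and that 780 is the mnist.scale feature dimension; the feature indices and values are illustrative.

import breeze.linalg.{argmax, SparseVector}
import io.github.qf6101.mfm.factorization.multinomial.MfmModel

object MultinomialScoreSketch {
  def main(args: Array[String]): Unit = {
    val model = MfmModel.fromLocal("test_data/output/mnist")
    val features = new SparseVector[Double](Array(100, 200, 300), Array(0.5, 0.8, 0.2), 780)
    // predict returns a vector of per-class scores; argmax picks the predicted class index.
    val classScores = model.predict(features)
    println(s"predicted class = ${argmax(classScores)}")
  }
}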
15 | */ 16 | 17 | object GradientDescentSuite { 18 | 19 | def generateLogisticInputAsList( 20 | offset: Double, 21 | scale: Double, 22 | nPoints: Int, 23 | seed: Int): java.util.List[(Double, SparseVector[Double])] = { 24 | seqAsJavaList(generateGDInput(offset, scale, nPoints, seed)) 25 | } 26 | 27 | // Generate input of the form Y = logistic(offset + scale * X) 28 | def generateGDInput(offset: Double, 29 | scale: Double, 30 | nPoints: Int, 31 | seed: Int): Seq[(Double, SparseVector[Double])] = { 32 | val rnd = new Random(seed) 33 | val x1 = Array.fill[Double](nPoints)(rnd.nextGaussian()) 34 | 35 | val unifRand = new Random(45) 36 | val rLogis = (0 until nPoints).map { i => 37 | val u = unifRand.nextDouble() 38 | math.log(u) - math.log(1.0 - u) 39 | } 40 | 41 | val y: Seq[Double] = (0 until nPoints).map { i => 42 | val yVal = offset + scale * x1(i) + rLogis(i) 43 | if (yVal > 0) 1.0 else 0.0 44 | } 45 | 46 | (0 until nPoints).map(i => (y(i), new SparseVector[Double](Array(0, 1), Array(1.0, x1(i)), 2))) 47 | } 48 | } 49 | 50 | class GradientDescentSuite extends FunSuite with MfmTestSparkSession with Matchers { 51 | test("Assert the loss is decreasing.") { 52 | val nPoints = 1000 53 | val A = 2.0 54 | val B = -1.5 55 | 56 | val params = new ParamMap() 57 | val gradient = new LogisticGradient(params) 58 | val updater = new SimpleUpdater() 59 | val lrf = new LrLearnSGD(params, null) 60 | val gd = new GradientDescent(gradient, updater, params) 61 | 62 | // Add a extra variable consisting of all 1.0's for the intercept. 63 | val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42) 64 | val dataRDD = spark.sparkContext.parallelize(testData, 2).cache() 65 | val initialWeightsWithIntercept = new VectorCoefficients(2) 66 | initialWeightsWithIntercept.w.update(0, 1.0) 67 | initialWeightsWithIntercept.w.update(1, -1.0) 68 | 69 | params.put(gd.numIterations, 10) 70 | params.put(gd.miniBatchFraction, 1.0) 71 | params.put(gd.stepSize, 1.0) 72 | params.put(gd.convergenceTol, 1E-4) 73 | params.put(lrf.reg, Array(0.0)) 74 | 75 | val (_, loss) = gd.optimizeWithHistory( 76 | dataRDD, 77 | initialWeightsWithIntercept, 78 | params(lrf.reg)) 79 | 80 | assert(loss.last - loss.head < 0, "loss isn't decreasing.") 81 | 82 | val lossDiff = loss.init.zip(loss.tail).map { case (lhs, rhs) => lhs - rhs } 83 | assert(lossDiff.count(_ > 0).toDouble / lossDiff.size > 0.8) 84 | } 85 | 86 | 87 | test("Test the loss and gradient of first iteration with regularization.") { 88 | val params = new ParamMap() 89 | val gradient = new LogisticGradient(params) 90 | val updater = new SquaredL2Updater() 91 | val lrf = new LrLearnSGD(params, null) 92 | val gd = new GradientDescent(gradient, updater, params) 93 | 94 | // Add a extra variable consisting of all 1.0's for the intercept. 
95 | val testData = GradientDescentSuite.generateGDInput(2.0, -1.5, 1000, 42) 96 | val dataRDD = spark.sparkContext.parallelize(testData, 2).cache() 97 | 98 | // Prepare non-zero weights 99 | val initialWeightsWithIntercept = new VectorCoefficients(2) 100 | initialWeightsWithIntercept.w.update(0, 1.0) 101 | initialWeightsWithIntercept.w.update(1, 0.5) 102 | 103 | params.put(gd.numIterations, 1) 104 | params.put(gd.miniBatchFraction, 1.0) 105 | params.put(gd.stepSize, 1.0) 106 | params.put(gd.convergenceTol, 1E-4) 107 | params.put(lrf.reg, Array(0.0)) 108 | 109 | val (newWeights0, loss0) = gd.optimizeWithHistory( 110 | dataRDD, initialWeightsWithIntercept, params(lrf.reg)) 111 | 112 | params.put(gd.numIterations, 1) 113 | params.put(lrf.reg, Array(1.0)) 114 | 115 | val (newWeights1, loss1) = gd.optimizeWithHistory( 116 | dataRDD, initialWeightsWithIntercept, params(lrf.reg)) 117 | 118 | assert( 119 | loss1(0) ~= (loss0(0) + (math.pow(initialWeightsWithIntercept.w(0), 2) + 120 | math.pow(initialWeightsWithIntercept.w(1), 2)) / 2) absTol 1E-5, 121 | """For non-zero weights, the regVal should be \frac{1}{2}\sum_i w_i^2.""") 122 | 123 | assert( 124 | (newWeights1.asInstanceOf[VectorCoefficients].w(0) ~= (newWeights0.asInstanceOf[VectorCoefficients].w(0) - 125 | initialWeightsWithIntercept.w(0)) 126 | absTol 1E-5) && 127 | (newWeights1.asInstanceOf[VectorCoefficients].w(1) ~= (newWeights0.asInstanceOf[VectorCoefficients].w(1) - 128 | initialWeightsWithIntercept.w(1)) absTol 1E-5), 129 | "The different between newWeights with/without regularization " + 130 | "should be initialWeightsWithIntercept.") 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /src/test/scala/io/github/qf6101/mfm/optimization/LBFGSSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.optimization 2 | 3 | import io.github.qf6101.mfm.logisticregression.{LogisticGradient, LrLearnLBFGS, VectorCoefficients} 4 | import io.github.qf6101.mfm.util.MfmTestSparkSession 5 | import io.github.qf6101.mfm.util.TestingUtils._ 6 | import org.apache.spark.ml.param.ParamMap 7 | import org.scalatest.FunSuite 8 | 9 | 10 | /** 11 | * Created by qfeng on 15-4-7. 12 | */ 13 | 14 | 15 | class LBFGSSuite extends FunSuite with MfmTestSparkSession { 16 | lazy val dataRDD = spark.sparkContext.parallelize(testData, 2).cache() 17 | val nPoints = 1000 18 | val A = 2.0 19 | val B = -1.5 20 | // Add a extra variable consisting of all 1.0's for the intercept. 
21 | val testData = GradientDescentSuite.generateGDInput(A, B, nPoints, 42) 22 | val simpleUpdater = new SimpleUpdater() 23 | val squaredL2Updater = new SquaredL2Updater() 24 | 25 | test("LBFGS loss should be decreasing and match the result of Gradient Descent.") { 26 | val initialWeightsWithIntercept = new VectorCoefficients(2) 27 | initialWeightsWithIntercept.w.update(0, 1.0) 28 | initialWeightsWithIntercept.w.update(1, -1.0) 29 | 30 | val lbfgsParamPool = new ParamMap() 31 | val lbfgsGradient = new LogisticGradient(lbfgsParamPool) 32 | val lbfgsLrf = new LrLearnLBFGS(lbfgsParamPool, null) 33 | val lbfgs = new LBFGS(lbfgsGradient, simpleUpdater, lbfgsParamPool) 34 | 35 | lbfgsParamPool.put(lbfgs.numIterations, 10) 36 | lbfgsParamPool.put(lbfgsLrf.reg, Array(0.0)) 37 | lbfgsParamPool.put(lbfgs.convergenceTol, 1e-12) 38 | lbfgsParamPool.put(lbfgs.numCorrections, 10) 39 | 40 | val (_, lossLBFGS) = lbfgs.optimizeWithHistory( 41 | dataRDD, 42 | initialWeightsWithIntercept, 43 | lbfgsParamPool(lbfgsLrf.reg)) 44 | 45 | // Since the cost function is convex, the loss is guaranteed to be monotonically decreasing 46 | // with L-BFGS optimizer. 47 | // (SGD doesn't guarantee this, and the loss will be fluctuating in the optimization process.) 48 | assert((lossLBFGS, lossLBFGS.tail).zipped.forall(_ > _), "loss should be monotonically decreasing.") 49 | 50 | val gdParamPool = new ParamMap() 51 | val gdGradient = new LogisticGradient(gdParamPool) 52 | val gdLrf = new LrLearnLBFGS(lbfgsParamPool, null) 53 | val gd = new GradientDescent(gdGradient, simpleUpdater, gdParamPool) 54 | 55 | gdParamPool.put(gd.stepSize, 1.0) 56 | gdParamPool.put(gd.numIterations, 50) 57 | gdParamPool.put(gdLrf.reg, Array(0.0)) 58 | gdParamPool.put(gd.miniBatchFraction, 1.0) 59 | gdParamPool.put(gd.convergenceTol, 1E-12) 60 | 61 | val (_, lossGD) = gd.optimizeWithHistory( 62 | dataRDD, 63 | initialWeightsWithIntercept, 64 | gdParamPool(gdLrf.reg)) 65 | 66 | // GD converges a way slower than L-BFGS. To achieve 1% difference, 67 | // it requires 90 iterations in GD. No matter how hard we increase 68 | // the number of iterations in GD here, the lossGD will be always 69 | // larger than lossLBFGS. This is based on observation, no theoretically guaranteed 70 | assert(Math.abs((lossGD.last - lossLBFGS.last) / lossLBFGS.last) < 0.02, 71 | "LBFGS should match GD result within 2% difference.") 72 | } 73 | 74 | test("LBFGS and Gradient Descent with L2 regularization should get the same result.") { 75 | val initialWeightsWithIntercept = new VectorCoefficients(2) 76 | initialWeightsWithIntercept.w.update(0, 0.3) 77 | initialWeightsWithIntercept.w.update(1, 0.12) 78 | 79 | val lbfgsParamPool = new ParamMap() 80 | val lbfgsGradient = new LogisticGradient(lbfgsParamPool) 81 | val lbfgsLrf = new LrLearnLBFGS(lbfgsParamPool, null) 82 | val lbfgs = new LBFGS(lbfgsGradient, squaredL2Updater, lbfgsParamPool) 83 | 84 | lbfgsParamPool.put(lbfgs.numIterations, 10) 85 | lbfgsParamPool.put(lbfgsLrf.reg, Array(0.2)) 86 | lbfgsParamPool.put(lbfgs.convergenceTol, 1e-12) 87 | lbfgsParamPool.put(lbfgs.numCorrections, 10) 88 | 89 | val (weightLBFGS, lossLBFGS) = lbfgs.optimizeWithHistory( 90 | dataRDD, 91 | initialWeightsWithIntercept, 92 | lbfgsParamPool(lbfgsLrf.reg)) 93 | 94 | // Since the cost function is convex, the loss is guaranteed to be monotonically decreasing 95 | // with L-BFGS optimizer. 96 | // (SGD doesn't guarantee this, and the loss will be fluctuating in the optimization process.) 
97 | assert((lossLBFGS, lossLBFGS.tail).zipped.forall(_ > _), "loss should be monotonically decreasing.") 98 | 99 | val gdParamPool = new ParamMap() 100 | val gdGradient = new LogisticGradient(gdParamPool) 101 | val gdLrf = new LrLearnLBFGS(lbfgsParamPool, null) 102 | val gd = new GradientDescent(gdGradient, squaredL2Updater, gdParamPool) 103 | 104 | gdParamPool.put(gd.stepSize, 1.0) 105 | gdParamPool.put(gd.numIterations, 50) 106 | gdParamPool.put(gdLrf.reg, Array(0.2)) 107 | gdParamPool.put(gd.miniBatchFraction, 1.0) 108 | gdParamPool.put(gd.convergenceTol, 1E-12) 109 | 110 | val (weightGD, lossGD) = gd.optimizeWithHistory( 111 | dataRDD, 112 | initialWeightsWithIntercept, 113 | gdParamPool(gdLrf.reg)) 114 | 115 | assert(lossGD(0) ~= lossLBFGS(0) absTol 1E-5, 116 | "The first losses of LBFGS and GD should be the same.") 117 | 118 | // The 2% difference here is based on observation, but is not theoretically guaranteed. 119 | assert(lossGD.last ~= lossLBFGS.last relTol 0.03, 120 | "The last losses of LBFGS and GD should be within 3% difference.") 121 | 122 | assert( 123 | (weightLBFGS.asInstanceOf[VectorCoefficients].w0 ~= weightGD.asInstanceOf[VectorCoefficients].w0 relTol 0.03) 124 | && (weightLBFGS.asInstanceOf[VectorCoefficients].w(0) ~= weightGD.asInstanceOf[VectorCoefficients].w(0) relTol 0.03) 125 | && (weightLBFGS.asInstanceOf[VectorCoefficients].w(1) ~= weightGD.asInstanceOf[VectorCoefficients].w(1) relTol 0.03), 126 | "The weight differences between LBFGS and GD should be within 3%.") 127 | } 128 | 129 | 130 | test("The convergence criteria should work as we expect.") { 131 | val initialWeightsWithIntercept = new VectorCoefficients(2) 132 | initialWeightsWithIntercept.w.update(0, 0.0) 133 | initialWeightsWithIntercept.w.update(1, 0.0) 134 | 135 | val lbfgsParamPool = new ParamMap() 136 | val lbfgsGradient = new LogisticGradient(lbfgsParamPool) 137 | val lbfgsLrf = new LrLearnLBFGS(lbfgsParamPool, null) 138 | val lbfgs = new LBFGS(lbfgsGradient, squaredL2Updater, lbfgsParamPool) 139 | 140 | lbfgsParamPool.put(lbfgs.numIterations, 8) 141 | lbfgsParamPool.put(lbfgsLrf.reg, Array(0.0)) 142 | lbfgsParamPool.put(lbfgs.convergenceTol, 1E-12) 143 | lbfgsParamPool.put(lbfgs.numCorrections, 10) 144 | 145 | val (_, lossLBFGS1) = lbfgs.optimizeWithHistory( 146 | dataRDD, 147 | initialWeightsWithIntercept, 148 | lbfgsParamPool(lbfgsLrf.reg)) 149 | 150 | // Note that the first loss is computed with initial weights, 151 | // so the total numbers of loss will be numbers of iterations + 1 152 | assert(lossLBFGS1.length == 9) 153 | 154 | lbfgsParamPool.put(lbfgs.convergenceTol, 0.1) 155 | 156 | val (_, lossLBFGS2) = lbfgs.optimizeWithHistory( 157 | dataRDD, 158 | initialWeightsWithIntercept, 159 | lbfgsParamPool(lbfgsLrf.reg)) 160 | 161 | // Based on observation, lossLBFGS2 runs 3 iterations, no theoretically guaranteed. 162 | assert(lossLBFGS2.length == 4) 163 | assert((lossLBFGS2(2) - lossLBFGS2(3)) / lossLBFGS2(2) < 0.1) 164 | 165 | lbfgsParamPool.put(lbfgs.convergenceTol, 0.01) 166 | 167 | val (_, lossLBFGS3) = lbfgs.optimizeWithHistory( 168 | dataRDD, 169 | initialWeightsWithIntercept, 170 | lbfgsParamPool(lbfgsLrf.reg)) 171 | 172 | // With smaller convergenceTol, it takes more steps. 173 | assert(lossLBFGS3.length > lossLBFGS2.length) 174 | 175 | // Based on observation, lossLBFGS2 runs 5 iterations, no theoretically guaranteed. 
176 | assert(lossLBFGS3.length == 6) 177 | assert((lossLBFGS3(4) - lossLBFGS3(5)) / lossLBFGS3(4) < 0.01) 178 | } 179 | 180 | } 181 | -------------------------------------------------------------------------------- /src/test/scala/io/github/qf6101/mfm/util/MfmTestSparkSession.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.scalatest.{BeforeAndAfterAll, Suite} 5 | 6 | /** 7 | * Created by qfeng on 15-3-13. 8 | */ 9 | trait MfmTestSparkSession extends BeforeAndAfterAll { 10 | self: Suite => 11 | @transient var spark: SparkSession = _ 12 | 13 | override def beforeAll() { 14 | super.beforeAll() 15 | spark = SparkSession.builder() 16 | .master("local[2]").appName(this.getClass.toString) 17 | .getOrCreate() 18 | } 19 | 20 | override def afterAll() { 21 | if (spark != null) { 22 | spark.stop() 23 | } 24 | super.afterAll() 25 | } 26 | } -------------------------------------------------------------------------------- /src/test/scala/io/github/qf6101/mfm/util/ParamSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import org.apache.spark.ml.param.{Param, ParamMap} 4 | import org.scalatest.FunSuite 5 | 6 | /** 7 | * User: qfeng 8 | * Date: 15-8-11 下午4:44 9 | * Usage: 10 | */ 11 | class ParamSuite extends FunSuite { 12 | test("add two parameter sets") { 13 | val params = new ParamMap() 14 | val otherParams = new ParamMap() 15 | 16 | val param1: Param[Double] = new Param("ParamTest", "param1", "param1") 17 | val param2: Param[Double] = new Param("ParamTest", "param2", "param2") 18 | val param3: Param[Double] = new Param("ParamTest", "param3", "param3") 19 | val param4: Param[Double] = new Param("ParamTest", "param4", "param4") 20 | 21 | params.put[Double](param1, 1.0) 22 | params.put[Double](param1, 2.0) 23 | params.put[Double](param2, 5.0) 24 | 25 | otherParams.put[Double](param2, 10.1) 26 | otherParams.put[Double](param3, 7.0) 27 | otherParams.put[Double](param4, 8.0) 28 | 29 | params ++= otherParams 30 | 31 | assert(params(param1) == 2.0) 32 | //overwrite by other parameters 33 | assert(params(param2) == 10.1) 34 | assert(params(param3) == 7.0) 35 | assert(params(param4) == 8.0) 36 | //print parameters 37 | println(ParamUtil.paramsToString(params)) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/test/scala/io/github/qf6101/mfm/util/ParquetIOTest.scala: -------------------------------------------------------------------------------- 1 | package io.github.qf6101.mfm.util 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.apache.parquet.hadoop.ParquetReader 5 | import org.apache.parquet.tools.read.{SimpleReadSupport, SimpleRecord} 6 | 7 | /** 8 | * Created by qfeng on 16-9-23. 
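The convergence assertions in LBFGSSuite above all check the same relative-improvement pattern: iteration stops once (loss(i) - loss(i+1)) / loss(i) drops below convergenceTol, so the recorded loss history has one more entry than the number of iterations and a tighter tolerance needs more steps. A standalone restatement on made-up loss values (the real stopping logic lives in the Breeze-backed LBFGS optimizer, not here):

object ConvergenceTolSketch {
  def main(args: Array[String]): Unit = {
    val lossHistory = Seq(0.70, 0.40, 0.25, 0.20, 0.185, 0.1849)
    // Number of iterations until the relative improvement first falls below tol.
    def iterationsUntilConverged(losses: Seq[Double], tol: Double): Int =
      losses.sliding(2).indexWhere { case Seq(prev, cur) => (prev - cur) / prev < tol } + 1
    println(iterationsUntilConverged(lossHistory, 0.1))   // 4 iterations
    println(iterationsUntilConverged(lossHistory, 0.01))  // 5 iterations: tighter tolerance, more steps
  }
}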
9 |   */
10 | object ParquetIOTest {
11 |   def main(args: Array[String]) {
12 |     val reader = ParquetReader.builder[SimpleRecord](new SimpleReadSupport(),
13 |       new Path("test_data/output/mnist/coefficient/coeff_data/0/coeff_data/w"))
14 |       .build()
15 |     var value = reader.read()
16 |     while (value != null) {
17 |       println(value.getValues.get(0).getValue.asInstanceOf[Double])
18 |       value = reader.read()
19 |     }
20 |   }
21 | }
22 | 
--------------------------------------------------------------------------------
/src/test/scala/io/github/qf6101/mfm/util/TestingUtils.scala:
--------------------------------------------------------------------------------
1 | package io.github.qf6101.mfm.util
2 | 
3 | /**
4 |   * Created by qfeng on 15-3-13.
5 |   */
6 | 
7 | import org.apache.spark.mllib.linalg.{Matrix, Vector}
8 | import org.scalatest.exceptions.TestFailedException
9 | 
10 | object TestingUtils {
11 | 
12 |   val ABS_TOL_MSG = " using absolute tolerance"
13 |   val REL_TOL_MSG = " using relative tolerance"
14 | 
15 |   /**
16 |     * Private helper function for comparing two values using relative tolerance.
17 |     * Note that if x or y is extremely close to zero, i.e., smaller than Double.MinPositiveValue,
18 |     * the relative tolerance is meaningless, so an exception will be raised to warn users.
19 |     */
20 |   private def RelativeErrorComparison(x: Double, y: Double, eps: Double): Boolean = {
21 |     val absX = math.abs(x)
22 |     val absY = math.abs(y)
23 |     val diff = math.abs(x - y)
24 |     if (x == y) {
25 |       true
26 |     } else if (absX < Double.MinPositiveValue || absY < Double.MinPositiveValue) {
27 |       throw new TestFailedException(
28 |         s"$x or $y is extremely close to zero, so the relative tolerance is meaningless.", 0)
29 |     } else {
30 |       diff < eps * math.min(absX, absY)
31 |     }
32 |   }
33 | 
34 |   /**
35 |     * Private helper function for comparing two values using absolute tolerance.
36 |     */
37 |   private def AbsoluteErrorComparison(x: Double, y: Double, eps: Double): Boolean = {
38 |     math.abs(x - y) < eps
39 |   }
40 | 
41 |   case class CompareDoubleRightSide(
42 |     fun: (Double, Double, Double) => Boolean, y: Double, eps: Double, method: String)
43 | 
44 |   /**
45 |     * Implicit class for comparing two double values using relative tolerance or absolute tolerance.
46 |     */
47 |   implicit class DoubleWithAlmostEquals(val x: Double) {
48 | 
49 |     /**
50 |       * When the difference of the two values is within eps, returns true; otherwise, returns false.
51 |       */
52 |     def ~=(r: CompareDoubleRightSide): Boolean = r.fun(x, r.y, r.eps)
53 | 
54 |     /**
55 |       * When the difference of the two values is within eps, returns false; otherwise, returns true.
56 |       */
57 |     def !~=(r: CompareDoubleRightSide): Boolean = !r.fun(x, r.y, r.eps)
58 | 
59 |     /**
60 |       * Throws an exception when the difference of the two values is NOT within eps;
61 |       * otherwise, returns true.
62 |       */
63 |     def ~==(r: CompareDoubleRightSide): Boolean = {
64 |       if (!r.fun(x, r.y, r.eps)) {
65 |         throw new TestFailedException(
66 |           s"Expected $x and ${r.y} to be within ${r.eps}${r.method}.", 0)
67 |       }
68 |       true
69 |     }
70 | 
71 |     /**
72 |       * Throws an exception when the difference of the two values is within eps; otherwise, returns true.
73 |       */
74 |     def !~==(r: CompareDoubleRightSide): Boolean = {
75 |       if (r.fun(x, r.y, r.eps)) {
76 |         throw new TestFailedException(
77 |           s"Did not expect $x and ${r.y} to be within ${r.eps}${r.method}.", 0)
78 |       }
79 |       true
80 |     }
81 | 
82 |     /**
83 |       * Comparison using absolute tolerance.
84 |       */
85 |     def absTol(eps: Double): CompareDoubleRightSide = CompareDoubleRightSide(AbsoluteErrorComparison,
86 |       x, eps, ABS_TOL_MSG)
87 | 
88 |     /**
89 |       * Comparison using relative tolerance.
90 |       */
91 |     def relTol(eps: Double): CompareDoubleRightSide = CompareDoubleRightSide(RelativeErrorComparison,
92 |       x, eps, REL_TOL_MSG)
93 | 
94 |     override def toString = x.toString
95 |   }
96 | 
97 |   case class CompareVectorRightSide(
98 |     fun: (Vector, Vector, Double) => Boolean, y: Vector, eps: Double, method: String)
99 | 
100 |   /**
101 |     * Implicit class for comparing two vectors using relative tolerance or absolute tolerance.
102 |     */
103 |   implicit class VectorWithAlmostEquals(val x: Vector) {
104 | 
105 |     /**
106 |       * When the difference of the two vectors is within eps, returns true; otherwise, returns false.
107 |       */
108 |     def ~=(r: CompareVectorRightSide): Boolean = r.fun(x, r.y, r.eps)
109 | 
110 |     /**
111 |       * When the difference of the two vectors is within eps, returns false; otherwise, returns true.
112 |       */
113 |     def !~=(r: CompareVectorRightSide): Boolean = !r.fun(x, r.y, r.eps)
114 | 
115 |     /**
116 |       * Throws an exception when the difference of the two vectors is NOT within eps;
117 |       * otherwise, returns true.
118 |       */
119 |     def ~==(r: CompareVectorRightSide): Boolean = {
120 |       if (!r.fun(x, r.y, r.eps)) {
121 |         throw new TestFailedException(
122 |           s"Expected $x and ${r.y} to be within ${r.eps}${r.method} for all elements.", 0)
123 |       }
124 |       true
125 |     }
126 | 
127 |     /**
128 |       * Throws an exception when the difference of the two vectors is within eps; otherwise, returns true.
129 |       */
130 |     def !~==(r: CompareVectorRightSide): Boolean = {
131 |       if (r.fun(x, r.y, r.eps)) {
132 |         throw new TestFailedException(
133 |           s"Did not expect $x and ${r.y} to be within ${r.eps}${r.method} for all elements.", 0)
134 |       }
135 |       true
136 |     }
137 | 
138 |     /**
139 |       * Comparison using absolute tolerance.
140 |       */
141 |     def absTol(eps: Double): CompareVectorRightSide = CompareVectorRightSide(
142 |       (x: Vector, y: Vector, eps: Double) => {
143 |         x.toArray.zip(y.toArray).forall(x => x._1 ~= x._2 absTol eps)
144 |       }, x, eps, ABS_TOL_MSG)
145 | 
146 |     /**
147 |       * Comparison using relative tolerance. Note that comparing against a sparse vector
148 |       * with zero-valued elements will raise an exception, because it involves
149 |       * comparing against zero.
150 |       */
151 |     def relTol(eps: Double): CompareVectorRightSide = CompareVectorRightSide(
152 |       (x: Vector, y: Vector, eps: Double) => {
153 |         x.toArray.zip(y.toArray).forall(x => x._1 ~= x._2 relTol eps)
154 |       }, x, eps, REL_TOL_MSG)
155 | 
156 |     override def toString = x.toString
157 |   }
158 | 
159 |   case class CompareMatrixRightSide(
160 |     fun: (Matrix, Matrix, Double) => Boolean, y: Matrix, eps: Double, method: String)
161 | 
162 |   /**
163 |     * Implicit class for comparing two matrices using relative tolerance or absolute tolerance.
164 |     */
165 |   implicit class MatrixWithAlmostEquals(val x: Matrix) {
166 | 
167 |     /**
168 |       * When the difference of the two matrices is within eps, returns true; otherwise, returns false.
169 |       */
170 |     def ~=(r: CompareMatrixRightSide): Boolean = r.fun(x, r.y, r.eps)
171 | 
172 |     /**
173 |       * When the difference of the two matrices is within eps, returns false; otherwise, returns true.
174 |       */
175 |     def !~=(r: CompareMatrixRightSide): Boolean = !r.fun(x, r.y, r.eps)
176 | 
177 |     /**
178 |       * Throws an exception when the difference of the two matrices is NOT within eps;
179 |       * otherwise, returns true.
180 |       */
181 |     def ~==(r: CompareMatrixRightSide): Boolean = {
182 |       if (!r.fun(x, r.y, r.eps)) {
183 |         throw new TestFailedException(
184 |           s"Expected \n$x\n and \n${r.y}\n to be within ${r.eps}${r.method} for all elements.", 0)
185 |       }
186 |       true
187 |     }
188 | 
189 |     /**
190 |       * Throws an exception when the difference of the two matrices is within eps; otherwise, returns true.
191 |       */
192 |     def !~==(r: CompareMatrixRightSide): Boolean = {
193 |       if (r.fun(x, r.y, r.eps)) {
194 |         throw new TestFailedException(
195 |           s"Did not expect \n$x\n and \n${r.y}\n to be within " +
196 |             s"${r.eps}${r.method} for all elements.", 0)
197 |       }
198 |       true
199 |     }
200 | 
201 |     /**
202 |       * Comparison using absolute tolerance.
203 |       */
204 |     def absTol(eps: Double): CompareMatrixRightSide = CompareMatrixRightSide(
205 |       (x: Matrix, y: Matrix, eps: Double) => {
206 |         x.toArray.zip(y.toArray).forall(x => x._1 ~= x._2 absTol eps)
207 |       }, x, eps, ABS_TOL_MSG)
208 | 
209 |     /**
210 |       * Comparison using relative tolerance. Note that comparing against a sparse matrix
211 |       * with zero-valued elements will raise an exception, because it involves
212 |       * comparing against zero.
213 |       */
214 |     def relTol(eps: Double): CompareMatrixRightSide = CompareMatrixRightSide(
215 |       (x: Matrix, y: Matrix, eps: Double) => {
216 |         x.toArray.zip(y.toArray).forall(x => x._1 ~= x._2 relTol eps)
217 |       }, x, eps, REL_TOL_MSG)
218 | 
219 |     override def toString = x.toString
220 |   }
221 | 
222 | }
--------------------------------------------------------------------------------
/test_data/input/README.txt:
--------------------------------------------------------------------------------
1 | Download the datasets from the LIBSVM website: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/
--------------------------------------------------------------------------------
/test_data/input/mnist/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qf6101/multinomial-factorization-machines/405c0c1c4c7a676226cebcfc7ed682627948c01c/test_data/input/mnist/.gitkeep
--------------------------------------------------------------------------------
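
Usage note (illustrative sketch, not a file in the repository): the tolerance DSL in TestingUtils is what the optimization suites above rely on for assertions such as `lossGD(0) ~= lossLBFGS(0) absTol 1E-5`. The sketch below, with a hypothetical suite name, shows how the implicit comparison operators and the MfmTestSparkSession trait would typically be combined in a new test; it assumes only the classes defined in this repository.

package io.github.qf6101.mfm.util

import org.scalatest.FunSuite
import io.github.qf6101.mfm.util.TestingUtils._

// Hypothetical suite, for illustration only.
class ToleranceUsageSketch extends FunSuite with MfmTestSparkSession {
  test("absolute and relative tolerance comparisons") {
    // absTol: |1.0 - 1.000001| < 1E-5, so the comparison holds.
    assert(1.0 ~= 1.000001 absTol 1E-5)
    // relTol (throwing variant ~==): relative difference is 0.5 / 100.0 = 0.005 < 0.01.
    assert(100.0 ~== 100.5 relTol 0.01)
    // !~= : the two values differ by more than the tolerance.
    assert(1.0 !~= 1.1 absTol 1E-5)
    // The SparkSession provided by MfmTestSparkSession can be used alongside the DSL.
    assert(spark.sparkContext.parallelize(Seq(1.0, 2.0)).sum() ~= 3.0 absTol 1E-9)
  }
}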