├── .gitignore ├── LICENSE ├── README.md ├── pom.xml └── src └── main └── scala ├── TestFM.scala └── org └── apache └── spark └── mllib └── regression ├── FMWithLBFGS.scala ├── FMWithSGD.scala └── FactorizationMachine.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache 6 | .history 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Scala-IDE specific 16 | .scala_dependencies 17 | .worksheet 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. 
You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. 
(Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | 204 | 205 | ======================================================================= 206 | Apache Spark Subcomponents: 207 | 208 | The Apache Spark project contains subcomponents with separate copyright 209 | notices and license terms. Your use of the source code for these 210 | subcomponents is subject to the terms and conditions of the following 211 | licenses. 
212 | 213 | 214 | ======================================================================= 215 | For the Boto EC2 library (ec2/third_party/boto*.zip): 216 | ======================================================================= 217 | 218 | Copyright (c) 2006-2008 Mitch Garnaat http://garnaat.org/ 219 | 220 | Permission is hereby granted, free of charge, to any person obtaining a 221 | copy of this software and associated documentation files (the 222 | "Software"), to deal in the Software without restriction, including 223 | without limitation the rights to use, copy, modify, merge, publish, dis- 224 | tribute, sublicense, and/or sell copies of the Software, and to permit 225 | persons to whom the Software is furnished to do so, subject to the fol- 226 | lowing conditions: 227 | 228 | The above copyright notice and this permission notice shall be included 229 | in all copies or substantial portions of the Software. 230 | 231 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 232 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL- 233 | ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 234 | SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 235 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 236 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 237 | IN THE SOFTWARE. 238 | 239 | 240 | ======================================================================== 241 | For CloudPickle (pyspark/cloudpickle.py): 242 | ======================================================================== 243 | 244 | Copyright (c) 2012, Regents of the University of California. 245 | Copyright (c) 2009 `PiCloud, Inc. <http://www.picloud.com>`_. 246 | All rights reserved. 
247 | 248 | Redistribution and use in source and binary forms, with or without 249 | modification, are permitted provided that the following conditions 250 | are met: 251 | * Redistributions of source code must retain the above copyright 252 | notice, this list of conditions and the following disclaimer. 253 | * Redistributions in binary form must reproduce the above copyright 254 | notice, this list of conditions and the following disclaimer in the 255 | documentation and/or other materials provided with the distribution. 256 | * Neither the name of the University of California, Berkeley nor the 257 | names of its contributors may be used to endorse or promote 258 | products derived from this software without specific prior written 259 | permission. 260 | 261 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 262 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 263 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 264 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 265 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 266 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 267 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 268 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 269 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 270 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 271 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 272 | 273 | 274 | ======================================================================== 275 | For Py4J (python/lib/py4j-0.8.2.1-src.zip) 276 | ======================================================================== 277 | 278 | Copyright (c) 2009-2011, Barthelemy Dagenais All rights reserved. 
279 | 280 | Redistribution and use in source and binary forms, with or without 281 | modification, are permitted provided that the following conditions are met: 282 | 283 | - Redistributions of source code must retain the above copyright notice, this 284 | list of conditions and the following disclaimer. 285 | 286 | - Redistributions in binary form must reproduce the above copyright notice, 287 | this list of conditions and the following disclaimer in the documentation 288 | and/or other materials provided with the distribution. 289 | 290 | - The name of the author may not be used to endorse or promote products 291 | derived from this software without specific prior written permission. 292 | 293 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 294 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 295 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 296 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 297 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 298 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 299 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 300 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 301 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 302 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 303 | POSSIBILITY OF SUCH DAMAGE. 304 | 305 | 306 | ======================================================================== 307 | For DPark join code (python/pyspark/join.py): 308 | ======================================================================== 309 | 310 | Copyright (c) 2011, Douban Inc. 311 | All rights reserved. 
312 | 313 | Redistribution and use in source and binary forms, with or without 314 | modification, are permitted provided that the following conditions are 315 | met: 316 | 317 | * Redistributions of source code must retain the above copyright 318 | notice, this list of conditions and the following disclaimer. 319 | 320 | * Redistributions in binary form must reproduce the above 321 | copyright notice, this list of conditions and the following disclaimer 322 | in the documentation and/or other materials provided with the 323 | distribution. 324 | 325 | * Neither the name of the Douban Inc. nor the names of its 326 | contributors may be used to endorse or promote products derived from 327 | this software without specific prior written permission. 328 | 329 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 330 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 331 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 332 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 333 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 334 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 335 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 336 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 337 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 338 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 339 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 340 | 341 | ======================================================================== 342 | For heapq (pyspark/heapq3.py): 343 | ======================================================================== 344 | 345 | # A. 
HISTORY OF THE SOFTWARE 346 | # ========================== 347 | # 348 | # Python was created in the early 1990s by Guido van Rossum at Stichting 349 | # Mathematisch Centrum (CWI, see http://www.cwi.nl) in the Netherlands 350 | # as a successor of a language called ABC. Guido remains Python's 351 | # principal author, although it includes many contributions from others. 352 | # 353 | # In 1995, Guido continued his work on Python at the Corporation for 354 | # National Research Initiatives (CNRI, see http://www.cnri.reston.va.us) 355 | # in Reston, Virginia where he released several versions of the 356 | # software. 357 | # 358 | # In May 2000, Guido and the Python core development team moved to 359 | # BeOpen.com to form the BeOpen PythonLabs team. In October of the same 360 | # year, the PythonLabs team moved to Digital Creations (now Zope 361 | # Corporation, see http://www.zope.com). In 2001, the Python Software 362 | # Foundation (PSF, see http://www.python.org/psf/) was formed, a 363 | # non-profit organization created specifically to own Python-related 364 | # Intellectual Property. Zope Corporation is a sponsoring member of 365 | # the PSF. 366 | # 367 | # All Python releases are Open Source (see http://www.opensource.org for 368 | # the Open Source Definition). Historically, most, but not all, Python 369 | # releases have also been GPL-compatible; the table below summarizes 370 | # the various releases. 371 | # 372 | # Release Derived Year Owner GPL- 373 | # from compatible? 
(1) 374 | # 375 | # 0.9.0 thru 1.2 1991-1995 CWI yes 376 | # 1.3 thru 1.5.2 1.2 1995-1999 CNRI yes 377 | # 1.6 1.5.2 2000 CNRI no 378 | # 2.0 1.6 2000 BeOpen.com no 379 | # 1.6.1 1.6 2001 CNRI yes (2) 380 | # 2.1 2.0+1.6.1 2001 PSF no 381 | # 2.0.1 2.0+1.6.1 2001 PSF yes 382 | # 2.1.1 2.1+2.0.1 2001 PSF yes 383 | # 2.2 2.1.1 2001 PSF yes 384 | # 2.1.2 2.1.1 2002 PSF yes 385 | # 2.1.3 2.1.2 2002 PSF yes 386 | # 2.2.1 2.2 2002 PSF yes 387 | # 2.2.2 2.2.1 2002 PSF yes 388 | # 2.2.3 2.2.2 2003 PSF yes 389 | # 2.3 2.2.2 2002-2003 PSF yes 390 | # 2.3.1 2.3 2002-2003 PSF yes 391 | # 2.3.2 2.3.1 2002-2003 PSF yes 392 | # 2.3.3 2.3.2 2002-2003 PSF yes 393 | # 2.3.4 2.3.3 2004 PSF yes 394 | # 2.3.5 2.3.4 2005 PSF yes 395 | # 2.4 2.3 2004 PSF yes 396 | # 2.4.1 2.4 2005 PSF yes 397 | # 2.4.2 2.4.1 2005 PSF yes 398 | # 2.4.3 2.4.2 2006 PSF yes 399 | # 2.4.4 2.4.3 2006 PSF yes 400 | # 2.5 2.4 2006 PSF yes 401 | # 2.5.1 2.5 2007 PSF yes 402 | # 2.5.2 2.5.1 2008 PSF yes 403 | # 2.5.3 2.5.2 2008 PSF yes 404 | # 2.6 2.5 2008 PSF yes 405 | # 2.6.1 2.6 2008 PSF yes 406 | # 2.6.2 2.6.1 2009 PSF yes 407 | # 2.6.3 2.6.2 2009 PSF yes 408 | # 2.6.4 2.6.3 2009 PSF yes 409 | # 2.6.5 2.6.4 2010 PSF yes 410 | # 2.7 2.6 2010 PSF yes 411 | # 412 | # Footnotes: 413 | # 414 | # (1) GPL-compatible doesn't mean that we're distributing Python under 415 | # the GPL. All Python licenses, unlike the GPL, let you distribute 416 | # a modified version without making your changes open source. The 417 | # GPL-compatible licenses make it possible to combine Python with 418 | # other software that is released under the GPL; the others don't. 419 | # 420 | # (2) According to Richard Stallman, 1.6.1 is not GPL-compatible, 421 | # because its license has a choice of law clause. According to 422 | # CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1 423 | # is "not incompatible" with the GPL. 
424 | # 425 | # Thanks to the many outside volunteers who have worked under Guido's 426 | # direction to make these releases possible. 427 | # 428 | # 429 | # B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON 430 | # =============================================================== 431 | # 432 | # PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 433 | # -------------------------------------------- 434 | # 435 | # 1. This LICENSE AGREEMENT is between the Python Software Foundation 436 | # ("PSF"), and the Individual or Organization ("Licensee") accessing and 437 | # otherwise using this software ("Python") in source or binary form and 438 | # its associated documentation. 439 | # 440 | # 2. Subject to the terms and conditions of this License Agreement, PSF hereby 441 | # grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, 442 | # analyze, test, perform and/or display publicly, prepare derivative works, 443 | # distribute, and otherwise use Python alone or in any derivative version, 444 | # provided, however, that PSF's License Agreement and PSF's notice of copyright, 445 | # i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 446 | # 2011, 2012, 2013 Python Software Foundation; All Rights Reserved" are retained 447 | # in Python alone or in any derivative version prepared by Licensee. 448 | # 449 | # 3. In the event Licensee prepares a derivative work that is based on 450 | # or incorporates Python or any part thereof, and wants to make 451 | # the derivative work available to others as provided herein, then 452 | # Licensee hereby agrees to include in any such work a brief summary of 453 | # the changes made to Python. 454 | # 455 | # 4. PSF is making Python available to Licensee on an "AS IS" 456 | # basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR 457 | # IMPLIED. 
BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND 458 | # DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS 459 | # FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT 460 | # INFRINGE ANY THIRD PARTY RIGHTS. 461 | # 462 | # 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON 463 | # FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS 464 | # A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, 465 | # OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. 466 | # 467 | # 6. This License Agreement will automatically terminate upon a material 468 | # breach of its terms and conditions. 469 | # 470 | # 7. Nothing in this License Agreement shall be deemed to create any 471 | # relationship of agency, partnership, or joint venture between PSF and 472 | # Licensee. This License Agreement does not grant permission to use PSF 473 | # trademarks or trade name in a trademark sense to endorse or promote 474 | # products or services of Licensee, or any third party. 475 | # 476 | # 8. By copying, installing or otherwise using Python, Licensee 477 | # agrees to be bound by the terms and conditions of this License 478 | # Agreement. 479 | # 480 | # 481 | # BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0 482 | # ------------------------------------------- 483 | # 484 | # BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1 485 | # 486 | # 1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an 487 | # office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the 488 | # Individual or Organization ("Licensee") accessing and otherwise using 489 | # this software in source or binary form and its associated 490 | # documentation ("the Software"). 491 | # 492 | # 2. 
Subject to the terms and conditions of this BeOpen Python License 493 | # Agreement, BeOpen hereby grants Licensee a non-exclusive, 494 | # royalty-free, world-wide license to reproduce, analyze, test, perform 495 | # and/or display publicly, prepare derivative works, distribute, and 496 | # otherwise use the Software alone or in any derivative version, 497 | # provided, however, that the BeOpen Python License is retained in the 498 | # Software, alone or in any derivative version prepared by Licensee. 499 | # 500 | # 3. BeOpen is making the Software available to Licensee on an "AS IS" 501 | # basis. BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR 502 | # IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND 503 | # DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS 504 | # FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT 505 | # INFRINGE ANY THIRD PARTY RIGHTS. 506 | # 507 | # 4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE 508 | # SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS 509 | # AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY 510 | # DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. 511 | # 512 | # 5. This License Agreement will automatically terminate upon a material 513 | # breach of its terms and conditions. 514 | # 515 | # 6. This License Agreement shall be governed by and interpreted in all 516 | # respects by the law of the State of California, excluding conflict of 517 | # law provisions. Nothing in this License Agreement shall be deemed to 518 | # create any relationship of agency, partnership, or joint venture 519 | # between BeOpen and Licensee. This License Agreement does not grant 520 | # permission to use BeOpen trademarks or trade names in a trademark 521 | # sense to endorse or promote products or services of Licensee, or any 522 | # third party. 
As an exception, the "BeOpen Python" logos available at 523 | # http://www.pythonlabs.com/logos.html may be used according to the 524 | # permissions granted on that web page. 525 | # 526 | # 7. By copying, installing or otherwise using the software, Licensee 527 | # agrees to be bound by the terms and conditions of this License 528 | # Agreement. 529 | # 530 | # 531 | # CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1 532 | # --------------------------------------- 533 | # 534 | # 1. This LICENSE AGREEMENT is between the Corporation for National 535 | # Research Initiatives, having an office at 1895 Preston White Drive, 536 | # Reston, VA 20191 ("CNRI"), and the Individual or Organization 537 | # ("Licensee") accessing and otherwise using Python 1.6.1 software in 538 | # source or binary form and its associated documentation. 539 | # 540 | # 2. Subject to the terms and conditions of this License Agreement, CNRI 541 | # hereby grants Licensee a nonexclusive, royalty-free, world-wide 542 | # license to reproduce, analyze, test, perform and/or display publicly, 543 | # prepare derivative works, distribute, and otherwise use Python 1.6.1 544 | # alone or in any derivative version, provided, however, that CNRI's 545 | # License Agreement and CNRI's notice of copyright, i.e., "Copyright (c) 546 | # 1995-2001 Corporation for National Research Initiatives; All Rights 547 | # Reserved" are retained in Python 1.6.1 alone or in any derivative 548 | # version prepared by Licensee. Alternately, in lieu of CNRI's License 549 | # Agreement, Licensee may substitute the following text (omitting the 550 | # quotes): "Python 1.6.1 is made available subject to the terms and 551 | # conditions in CNRI's License Agreement. This Agreement together with 552 | # Python 1.6.1 may be located on the Internet using the following 553 | # unique, persistent identifier (known as a handle): 1895.22/1013. 
This 554 | # Agreement may also be obtained from a proxy server on the Internet 555 | # using the following URL: http://hdl.handle.net/1895.22/1013". 556 | # 557 | # 3. In the event Licensee prepares a derivative work that is based on 558 | # or incorporates Python 1.6.1 or any part thereof, and wants to make 559 | # the derivative work available to others as provided herein, then 560 | # Licensee hereby agrees to include in any such work a brief summary of 561 | # the changes made to Python 1.6.1. 562 | # 563 | # 4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS" 564 | # basis. CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR 565 | # IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND 566 | # DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS 567 | # FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT 568 | # INFRINGE ANY THIRD PARTY RIGHTS. 569 | # 570 | # 5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON 571 | # 1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS 572 | # A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1, 573 | # OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. 574 | # 575 | # 6. This License Agreement will automatically terminate upon a material 576 | # breach of its terms and conditions. 577 | # 578 | # 7. This License Agreement shall be governed by the federal 579 | # intellectual property law of the United States, including without 580 | # limitation the federal copyright law, and, to the extent such 581 | # U.S. federal law does not apply, by the law of the Commonwealth of 582 | # Virginia, excluding Virginia's conflict of law provisions. 
583 | # Notwithstanding the foregoing, with regard to derivative works based 584 | # on Python 1.6.1 that incorporate non-separable material that was 585 | # previously distributed under the GNU General Public License (GPL), the 586 | # law of the Commonwealth of Virginia shall govern this License 587 | # Agreement only as to issues arising under or with respect to 588 | # Paragraphs 4, 5, and 7 of this License Agreement. Nothing in this 589 | # License Agreement shall be deemed to create any relationship of 590 | # agency, partnership, or joint venture between CNRI and Licensee. This 591 | # License Agreement does not grant permission to use CNRI trademarks or 592 | # trade name in a trademark sense to endorse or promote products or 593 | # services of Licensee, or any third party. 594 | # 595 | # 8. By clicking on the "ACCEPT" button where indicated, or by copying, 596 | # installing or otherwise using Python 1.6.1, Licensee agrees to be 597 | # bound by the terms and conditions of this License Agreement. 598 | # 599 | # ACCEPT 600 | # 601 | # 602 | # CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2 603 | # -------------------------------------------------- 604 | # 605 | # Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam, 606 | # The Netherlands. All rights reserved. 607 | # 608 | # Permission to use, copy, modify, and distribute this software and its 609 | # documentation for any purpose and without fee is hereby granted, 610 | # provided that the above copyright notice appear in all copies and that 611 | # both that copyright notice and this permission notice appear in 612 | # supporting documentation, and that the name of Stichting Mathematisch 613 | # Centrum or CWI not be used in advertising or publicity pertaining to 614 | # distribution of the software without specific, written prior 615 | # permission. 
616 | # 617 | # STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO 618 | # THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 619 | # FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE 620 | # FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 621 | # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 622 | # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 623 | # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 624 | 625 | ======================================================================== 626 | For sorttable (core/src/main/resources/org/apache/spark/ui/static/sorttable.js): 627 | ======================================================================== 628 | 629 | Copyright (c) 1997-2007 Stuart Langridge 630 | 631 | Permission is hereby granted, free of charge, to any person obtaining a copy 632 | of this software and associated documentation files (the "Software"), to deal 633 | in the Software without restriction, including without limitation the rights 634 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 635 | copies of the Software, and to permit persons to whom the Software is 636 | furnished to do so, subject to the following conditions: 637 | 638 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 639 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 640 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 641 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 642 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 643 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 644 | THE SOFTWARE. 
645 | 646 | 647 | ======================================================================== 648 | For Scala Interpreter classes (all .scala files in repl/src/main/scala 649 | except for Main.Scala, SparkHelper.scala and ExecutorClassLoader.scala), 650 | and for SerializableMapWrapper in JavaUtils.scala: 651 | ======================================================================== 652 | 653 | Copyright (c) 2002-2013 EPFL 654 | Copyright (c) 2011-2013 Typesafe, Inc. 655 | 656 | All rights reserved. 657 | 658 | Redistribution and use in source and binary forms, with or without 659 | modification, are permitted provided that the following conditions are met: 660 | 661 | - Redistributions of source code must retain the above copyright notice, 662 | this list of conditions and the following disclaimer. 663 | 664 | - Redistributions in binary form must reproduce the above copyright notice, 665 | this list of conditions and the following disclaimer in the documentation 666 | and/or other materials provided with the distribution. 667 | 668 | - Neither the name of the EPFL nor the names of its contributors may be 669 | used to endorse or promote products derived from this software without 670 | specific prior written permission. 671 | 672 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 673 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 674 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 675 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 676 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 677 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 678 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 679 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 680 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 681 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 682 | POSSIBILITY OF SUCH DAMAGE. 683 | 684 | 685 | ======================================================================== 686 | For sbt and sbt-launch-lib.bash in sbt/: 687 | ======================================================================== 688 | 689 | // Generated from http://www.opensource.org/licenses/bsd-license.php 690 | Copyright (c) 2011, Paul Phillips. 691 | All rights reserved. 692 | 693 | Redistribution and use in source and binary forms, with or without 694 | modification, are permitted provided that the following conditions are met: 695 | 696 | * Redistributions of source code must retain the above copyright notice, 697 | this list of conditions and the following disclaimer. 698 | * Redistributions in binary form must reproduce the above copyright notice, 699 | this list of conditions and the following disclaimer in the documentation 700 | and/or other materials provided with the distribution. 701 | * Neither the name of the author nor the names of its contributors may be 702 | used to endorse or promote products derived from this software without 703 | specific prior written permission. 704 | 705 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 706 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 707 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 708 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 709 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 710 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 711 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 712 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 713 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 714 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 715 | 716 | ======================================================================== 717 | For SnapTree: 718 | ======================================================================== 719 | 720 | SNAPTREE LICENSE 721 | 722 | Copyright (c) 2009-2012 Stanford University, unless otherwise specified. 723 | All rights reserved. 724 | 725 | This software was developed by the Pervasive Parallelism Laboratory of 726 | Stanford University, California, USA. 727 | 728 | Permission to use, copy, modify, and distribute this software in source 729 | or binary form for any purpose with or without fee is hereby granted, 730 | provided that the following conditions are met: 731 | 732 | 1. Redistributions of source code must retain the above copyright 733 | notice, this list of conditions and the following disclaimer. 734 | 735 | 2. Redistributions in binary form must reproduce the above copyright 736 | notice, this list of conditions and the following disclaimer in the 737 | documentation and/or other materials provided with the distribution. 738 | 739 | 3. Neither the name of Stanford University nor the names of its 740 | contributors may be used to endorse or promote products derived 741 | from this software without specific prior written permission. 
742 | 743 | 744 | THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 745 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 746 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 747 | ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 748 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 749 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 750 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 751 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 752 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 753 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 754 | SUCH DAMAGE. 755 | 756 | 757 | ======================================================================== 758 | For Timsort (core/src/main/java/org/apache/spark/util/collection/TimSort.java): 759 | ======================================================================== 760 | Copyright (C) 2008 The Android Open Source Project 761 | 762 | Licensed under the Apache License, Version 2.0 (the "License"); 763 | you may not use this file except in compliance with the License. 764 | You may obtain a copy of the License at 765 | 766 | http://www.apache.org/licenses/LICENSE-2.0 767 | 768 | Unless required by applicable law or agreed to in writing, software 769 | distributed under the License is distributed on an "AS IS" BASIS, 770 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 771 | See the License for the specific language governing permissions and 772 | limitations under the License. 
773 | 774 | ======================================================================== 775 | For TestTimSort (core/src/test/java/org/apache/spark/util/collection/TestTimSort.java): 776 | ======================================================================== 777 | Copyright (C) 2015 Stijn de Gouw 778 | 779 | Licensed under the Apache License, Version 2.0 (the "License"); 780 | you may not use this file except in compliance with the License. 781 | You may obtain a copy of the License at 782 | 783 | http://www.apache.org/licenses/LICENSE-2.0 784 | 785 | Unless required by applicable law or agreed to in writing, software 786 | distributed under the License is distributed on an "AS IS" BASIS, 787 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 788 | See the License for the specific language governing permissions and 789 | limitations under the License. 790 | 791 | ======================================================================== 792 | For LimitedInputStream 793 | (network/common/src/main/java/org/apache/spark/network/util/LimitedInputStream.java): 794 | ======================================================================== 795 | Copyright (C) 2007 The Guava Authors 796 | 797 | Licensed under the Apache License, Version 2.0 (the "License"); 798 | you may not use this file except in compliance with the License. 799 | You may obtain a copy of the License at 800 | 801 | http://www.apache.org/licenses/LICENSE-2.0 802 | 803 | Unless required by applicable law or agreed to in writing, software 804 | distributed under the License is distributed on an "AS IS" BASIS, 805 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 806 | See the License for the specific language governing permissions and 807 | limitations under the License. 
808 | 809 | 810 | ======================================================================== 811 | BSD-style licenses 812 | ======================================================================== 813 | 814 | The following components are provided under a BSD-style license. See project link for details. 815 | 816 | (BSD 3 Clause) core (com.github.fommil.netlib:core:1.1.2 - https://github.com/fommil/netlib-java/core) 817 | (BSD 3-clause style license) jblas (org.jblas:jblas:1.2.3 - http://jblas.org/) 818 | (BSD License) AntLR Parser Generator (antlr:antlr:2.7.7 - http://www.antlr.org/) 819 | (BSD License) Javolution (javolution:javolution:5.5.1 - http://javolution.org) 820 | (BSD licence) ANTLR ST4 4.0.4 (org.antlr:ST4:4.0.4 - http://www.stringtemplate.org) 821 | (BSD licence) ANTLR StringTemplate (org.antlr:stringtemplate:3.2.1 - http://www.stringtemplate.org) 822 | (BSD style) Hamcrest Core (org.hamcrest:hamcrest-core:1.1 - no url defined) 823 | (BSD) JLine (jline:jline:0.9.94 - http://jline.sourceforge.net) 824 | (BSD) ParaNamer Core (com.thoughtworks.paranamer:paranamer:2.3 - http://paranamer.codehaus.org/paranamer) 825 | (BSD) ParaNamer Core (com.thoughtworks.paranamer:paranamer:2.6 - http://paranamer.codehaus.org/paranamer) 826 | (BSD-like) (The BSD License) jline (org.scala-lang:jline:2.10.4 - http://www.scala-lang.org/) 827 | (BSD-like) Scala Actors library (org.scala-lang:scala-actors:2.10.4 - http://www.scala-lang.org/) 828 | (BSD-like) Scala Compiler (org.scala-lang:scala-compiler:2.10.4 - http://www.scala-lang.org/) 829 | (BSD-like) Scala Compiler (org.scala-lang:scala-reflect:2.10.4 - http://www.scala-lang.org/) 830 | (BSD-like) Scala Library (org.scala-lang:scala-library:2.10.4 - http://www.scala-lang.org/) 831 | (BSD-like) Scalap (org.scala-lang:scalap:2.10.4 - http://www.scala-lang.org/) 832 | (BSD-style) scalacheck (org.scalacheck:scalacheck_2.10:1.10.0 - http://www.scalacheck.org) 833 | (BSD-style) spire (org.spire-math:spire_2.10:0.7.1 - 
http://spire-math.org) 834 | (BSD-style) spire-macros (org.spire-math:spire-macros_2.10:0.7.1 - http://spire-math.org) 835 | (New BSD License) Kryo (com.esotericsoftware.kryo:kryo:2.21 - http://code.google.com/p/kryo/) 836 | (New BSD License) MinLog (com.esotericsoftware.minlog:minlog:1.2 - http://code.google.com/p/minlog/) 837 | (New BSD License) ReflectASM (com.esotericsoftware.reflectasm:reflectasm:1.07 - http://code.google.com/p/reflectasm/) 838 | (New BSD license) Protocol Buffer Java API (com.google.protobuf:protobuf-java:2.5.0 - http://code.google.com/p/protobuf) 839 | (New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf) 840 | (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net) 841 | (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net) 842 | (The New BSD License) Py4J (net.sf.py4j:py4j:0.8.2.1 - http://py4j.sourceforge.net/) 843 | (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/) 844 | (ISC/BSD License) jbcrypt (org.mindrot:jbcrypt:0.3m - http://www.mindrot.org/) 845 | 846 | ======================================================================== 847 | MIT licenses 848 | ======================================================================== 849 | 850 | The following components are provided under the MIT License. See project link for details. 
851 | 852 | (MIT License) JCL 1.1.1 implemented over SLF4J (org.slf4j:jcl-over-slf4j:1.7.5 - http://www.slf4j.org) 853 | (MIT License) JUL to SLF4J bridge (org.slf4j:jul-to-slf4j:1.7.5 - http://www.slf4j.org) 854 | (MIT License) SLF4J API Module (org.slf4j:slf4j-api:1.7.5 - http://www.slf4j.org) 855 | (MIT License) SLF4J LOG4J-12 Binding (org.slf4j:slf4j-log4j12:1.7.5 - http://www.slf4j.org) 856 | (MIT License) pyrolite (org.spark-project:pyrolite:2.0.1 - http://pythonhosted.org/Pyro4/) 857 | (MIT License) scopt (com.github.scopt:scopt_2.10:3.2.0 - https://github.com/scopt/scopt) 858 | (The MIT License) Mockito (org.mockito:mockito-all:1.8.5 - http://www.mockito.org) 859 | (MIT License) jquery (https://jquery.org/license/) 860 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spark-libFM 2 | An implementation of Factorization Machines (LibFM) 3 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.mininglamp.ml 8 | FM 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 1.0-SNAPSHOT 13 | 1.8 14 | UTF-8 15 | 2.10.6 16 | 2.10 17 | 1.6.1 18 | 19 | 20 | 21 | 22 | org.apache.spark 23 | spark-mllib_2.10 24 | ${spark.version} 25 | 26 | 27 | 28 | 29 | 30 | 31 | org.apache.maven.plugins 32 | maven-compiler-plugin 33 | 3.5.1 34 | 35 | 36 | ${java.version} 37 | ${java.version} 38 | ${project.build.sourceEncoding} 39 | 40 | 41 | 42 | net.alchim31.maven 43 | scala-maven-plugin 44 | 3.1.6 45 | 46 | 47 | compile 48 | 49 | compile 50 | 51 | compile 52 | 53 | 54 | test-compile 55 | 56 | testCompile 57 | 58 | test-compile 59 | 60 | 61 | process-resources 62 | 63 | compile 64 | 65 | 66 | 67 | 68 | ${scala.version} 69 | incremental 70 | true 71 | 72 | -unchecked 73 | -deprecation 74 | -feature 75 | 76 | 77 | -source 78 | 
${java.version} 79 | -target 80 | ${java.version} 81 | 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /src/main/scala/TestFM.scala: -------------------------------------------------------------------------------- 1 | 2 | import org.apache.spark.{SparkConf, SparkContext} 3 | import org.apache.spark.mllib.regression._ 4 | import org.apache.spark.mllib.util.MLUtils 5 | 6 | 7 | /** 8 | * Created by zrf on 4/18/15. 9 | * Example driver: loads a LibSVM-format dataset and trains one Factorization Machine with SGD and one with L-BFGS. */ 10 | 11 | /* A plain object with an explicit main is used instead of `extends App`; App's delayed initialization adds nothing when main is written by hand. */ 12 | object TestFM { 13 | 14 | def main(args: Array[String]): Unit = { 15 | 16 | val sc = new SparkContext(new SparkConf().setAppName("TESTFM")) 17 | 18 | // "hdfs://ns1/whale-tmp/url_combined" 19 | val training = MLUtils.loadLibSVMFile(sc, "hdfs://ns1/whale-tmp/url_combined").cache() 20 | 21 | // val task = args(1).toInt 22 | // val numIterations = args(2).toInt 23 | // val stepSize = args(3).toDouble 24 | // val miniBatchFraction = args(4).toDouble 25 | 26 | val fm1 = FMWithSGD.train(training, task = 1, numIterations = 100, stepSize = 0.15, miniBatchFraction = 1.0, dim = (true, true, 4), regParam = (0, 0, 0), initStd = 0.1) 27 | 28 | 29 | val fm2 = FMWithLBFGS.train(training, task = 1, numIterations = 20, numCorrections = 5, dim = (true, true, 4), regParam = (0, 0, 0), initStd = 0.1) 30 | /* Release the SparkContext once both models have been trained. */ sc.stop() 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/mllib/regression/FMWithLBFGS.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.mllib.regression 2 | 3 | import org.apache.spark.Logging 4 | import org.apache.spark.mllib.linalg.{DenseMatrix, Vectors, Vector} 5 | import org.apache.spark.mllib.optimization.LBFGS 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.storage.StorageLevel 8 | 9 | import scala.util.Random 10 | 11 | /** 12 | * Created by zrf on 4/22/15. 
13 | */ 14 | object FMWithLBFGS { 15 | /** 16 | * Train a Factorization Machine model given an RDD of (label, features) pairs. The model weights are 17 | * optimized with L-BFGS for `numIterations` iterations, keeping `numCorrections` corrections in the 18 | * L-BFGS update. The factorization matrix is initialized from a Gaussian scaled by `initStd`; the 19 | * one-way weights and the intercept start at zero. 20 | * 21 | * @param input RDD of (label, array of features) pairs. Each pair describes a row of the data 22 | * matrix A as well as the corresponding right hand side label y. 23 | * @param task 0 for Regression, and 1 for Binary Classification 24 | * @param numIterations Maximum number of L-BFGS iterations to run (`numCorrections`, passed alongside, is the number of corrections used in the L-BFGS update). 25 | * @param dim A (Boolean,Boolean,Int) 3-Tuple stands for whether the global bias term should be used, whether the 26 | * one-way interactions should be used, and the number of factors that are used for pairwise 27 | * interactions, respectively. 28 | * @param regParam A (Double,Double,Double) 3-Tuple stands for the regularization parameters of intercept, one-way 29 | * interactions and pairwise interactions, respectively. 30 | * @param initStd Standard Deviation used for factorization matrix initialization. 
31 | */ 32 | def train(input: RDD[LabeledPoint], 33 | task: Int, 34 | numIterations: Int, 35 | numCorrections: Int, 36 | dim: (Boolean, Boolean, Int), 37 | regParam: (Double, Double, Double), 38 | initStd: Double): FMModel = { 39 | new FMWithLBFGS(task, numIterations, numCorrections, dim, regParam) 40 | .setInitStd(initStd) 41 | .run(input) 42 | } 43 | 44 | // def train(input: RDD[LabeledPoint], 45 | // task: Int, 46 | // numIterations: Int): FMModel = { 47 | // new FMWithSGD(task, 1.0, numIterations, (true, true, 8), (0, 0.01, 0.01), 1.0) 48 | // .setInitStd(0.01) 49 | // .run(input) 50 | // } 51 | } 52 | 53 | /** Trains a Factorization Machine with L-BFGS and returns it as an FMModel. `task` is 0 for regression and 1 for binary classification; any other value is rejected at construction. */ 54 | class FMWithLBFGS(private var task: Int, 55 | private var numIterations: Int, 56 | private var numCorrections: Int, 57 | private var dim: (Boolean, Boolean, Int), 58 | private var regParam: (Double, Double, Double)) extends Serializable with Logging { 59 | require(task == 0 || task == 1, s"task must be 0 (regression) or 1 (binary classification), got $task") 60 | private var k0: Boolean = dim._1 61 | private var k1: Boolean = dim._2 62 | private var k2: Int = dim._3 63 | 64 | private var r0: Double = regParam._1 65 | private var r1: Double = regParam._2 66 | private var r2: Double = regParam._3 67 | 68 | private var initMean: Double = 0 69 | private var initStd: Double = 0.01 70 | 71 | private var numFeatures: Int = -1 72 | private var minLabel: Double = Double.MaxValue 73 | private var maxLabel: Double = Double.MinValue 74 | 75 | /** 76 | * A (Boolean,Boolean,Int) 3-Tuple stands for whether the global bias term should be used, whether the one-way 77 | * interactions should be used, and the number of factors that are used for pairwise interactions, respectively. 
78 | */ 79 | def setDim(dim: (Boolean, Boolean, Int)): this.type = { 80 | require(dim._3 > 0) 81 | this.k0 = dim._1 82 | this.k1 = dim._2 83 | this.k2 = dim._3 84 | this 85 | } 86 | 87 | /** 88 | * Set the model dimensions from individual values instead of a tuple. 89 | * @param addIntercept determines if the global bias term w0 should be used 90 | * @param add1Way determines if one-way interactions (bias terms for each variable) should be used 91 | * @param numFactors the number of factors that are used for pairwise interactions 92 | */ 93 | def setDim(addIntercept: Boolean = true, add1Way: Boolean = true, numFactors: Int = 8): this.type = { 94 | setDim((addIntercept, add1Way, numFactors)) 95 | } 96 | 97 | 98 | /** 99 | * @param regParams A (Double,Double,Double) 3-Tuple stands for the regularization parameters of intercept, one-way 100 | * interactions and pairwise interactions, respectively. 101 | */ 102 | def setRegParam(regParams: (Double, Double, Double)): this.type = { 103 | require(regParams._1 >= 0 && regParams._2 >= 0 && regParams._3 >= 0) 104 | this.r0 = regParams._1 105 | this.r1 = regParams._2 106 | this.r2 = regParams._3 107 | this 108 | } 109 | 110 | /** 111 | * @param regIntercept intercept regularization 112 | * @param reg1Way one-way interactions regularization 113 | * @param reg2Way pairwise interactions regularization 114 | */ 115 | def setRegParam(regIntercept: Double = 0, reg1Way: Double = 0, reg2Way: Double = 0): this.type = { 116 | setRegParam((regIntercept, reg1Way, reg2Way)) 117 | } 118 | 119 | 120 | /** 121 | * @param initStd Standard Deviation used for factorization matrix initialization. 122 | */ 123 | def setInitStd(initStd: Double): this.type = { 124 | require(initStd > 0) 125 | this.initStd = initStd 126 | this 127 | } 128 | 129 | 130 | /** 131 | * Set the number of L-BFGS iterations. 
132 | */ 133 | def setNumIterations(numIterations: Int): this.type = { 134 | require(numIterations > 0) 135 | this.numIterations = numIterations 136 | this 137 | } 138 | 139 | 140 | /** 141 | * Encode the FMModel to a dense vector, with its first numFeatures * numFactors elements representing the 142 | * factorization matrix v, sequential numFeatures elements representing the one-way interactions weights w if k1 is 143 | * set to true, and the last element representing the intercept w0 if k0 is set to true. 144 | * The factorization matrix v is initialized by Gaussian(0, initStd). 145 | * v : numFeatures * numFactors + w : [numFeatures] + w0 : [1] 146 | */ 147 | private def generateInitWeights(): Vector = { 148 | (k0, k1) match { 149 | case (true, true) => 150 | Vectors.dense(Array.fill(numFeatures * k2)(Random.nextGaussian() * initStd + initMean) ++ 151 | Array.fill(numFeatures + 1)(0.0)) 152 | 153 | case (true, false) => 154 | Vectors.dense(Array.fill(numFeatures * k2)(Random.nextGaussian() * initStd + initMean) ++ 155 | Array(0.0)) 156 | 157 | case (false, true) => 158 | Vectors.dense(Array.fill(numFeatures * k2)(Random.nextGaussian() * initStd + initMean) ++ 159 | Array.fill(numFeatures)(0.0)) 160 | 161 | case (false, false) => 162 | Vectors.dense(Array.fill(numFeatures * k2)(Random.nextGaussian() * initStd + initMean)) 163 | } 164 | } 165 | 166 | 167 | /** 168 | * Create a FMModel from an encoded vector. 169 | */ 170 | private def createModel(weights: Vector): FMModel = { 171 | 172 | val values = weights.toArray 173 | 174 | val v = new DenseMatrix(k2, numFeatures, values.slice(0, numFeatures * k2)) 175 | 176 | val w = if (k1) Some(Vectors.dense(values.slice(numFeatures * k2, numFeatures * k2 + numFeatures))) else None 177 | 178 | val w0 = if (k0) values.last else 0.0 179 | 180 | new FMModel(task, v, w, w0, minLabel, maxLabel) 181 | } 182 | 183 | 184 | /** 185 | * Run the algorithm with the configured parameters on an input RDD 186 | * of LabeledPoint entries. 
187 | */ 188 | def run(input: RDD[LabeledPoint]): FMModel = { 189 | 190 | if (input.getStorageLevel == StorageLevel.NONE) { 191 | logWarning("The input data is not directly cached, which may hurt performance if its" 192 | + " parent RDDs are also uncached.") 193 | } 194 | 195 | this.numFeatures = input.first().features.size 196 | require(numFeatures > 0) 197 | 198 | if (task == 0) { 199 | val (minT, maxT) = input.map(_.label).aggregate[(Double, Double)]((Double.MaxValue, Double.MinValue))({ 200 | case ((min, max), v) => 201 | (Math.min(min, v), Math.max(max, v)) 202 | }, { 203 | case ((min1, max1), (min2, max2)) => 204 | (Math.min(min1, min2), Math.max(max1, max2)) 205 | }) 206 | 207 | this.minLabel = minT 208 | this.maxLabel = maxT 209 | } 210 | 211 | val gradient = new FMGradient(task, k0, k1, k2, numFeatures, minLabel, maxLabel) 212 | 213 | val updater = new FMUpdater(k0, k1, k2, r0, r1, r2, numFeatures) 214 | /* Forward numCorrections explicitly; it was previously accepted but never passed on, so the optimizer kept its built-in default. */ 215 | val optimizer = new LBFGS(gradient, updater) 216 | .setNumIterations(numIterations) 217 | .setNumCorrections(numCorrections) 218 | val data = task match { 219 | case 0 => 220 | input.map(l => (l.label, l.features)).persist() 221 | case 1 => 222 | input.map(l => (if (l.label > 0) 1.0 else -1.0, l.features)).persist() 223 | } 224 | 225 | val initWeights = generateInitWeights() 226 | 227 | val weights = optimizer.optimize(data, initWeights) 228 | 229 | data.unpersist() 230 | 231 | createModel(weights) 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/mllib/regression/FMWithSGD.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.mllib.regression 2 | 3 | import org.apache.spark.Logging 4 | import org.apache.spark.mllib.linalg.{DenseMatrix, Vectors, Vector} 5 | import org.apache.spark.mllib.optimization.GradientDescent 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.storage.StorageLevel 8 | 9 | import scala.util.Random 10 | 11 
| /** 12 | * Created by zrf on 4/24/15. 13 | */ 14 | 15 | object FMWithSGD { 16 | /** 17 | * Train a Factorization Machine model given an RDD of (label, features) pairs. We run a fixed number 18 | * of iterations of gradient descent using the specified step size. Each iteration uses 19 | * `miniBatchFraction` fraction of the data to calculate a stochastic gradient. The standard deviation 20 | * used to initialize the factorization matrix is set to `initStd` before training. 21 | * 22 | * @param input RDD of (label, array of features) pairs. Each pair describes a row of the data 23 | * matrix A as well as the corresponding right hand side label y. 24 | * @param task 0 for Regression, and 1 for Binary Classification 25 | * @param numIterations Number of iterations of gradient descent to run. 26 | * @param stepSize Step size to be used for each iteration of gradient descent. 27 | * @param miniBatchFraction Fraction of data to be used per iteration. 28 | * @param dim A (Boolean,Boolean,Int) 3-Tuple stands for whether the global bias term should be used, whether the 29 | * one-way interactions should be used, and the number of factors that are used for pairwise 30 | * interactions, respectively. 31 | * @param regParam A (Double,Double,Double) 3-Tuple stands for the regularization parameters of intercept, one-way 32 | * interactions and pairwise interactions, respectively. 33 | * @param initStd Standard Deviation used for factorization matrix initialization. 
34 | */ 35 | def train(input: RDD[LabeledPoint], 36 | task: Int, 37 | numIterations: Int, 38 | stepSize: Double, 39 | miniBatchFraction: Double, 40 | dim: (Boolean, Boolean, Int), 41 | regParam: (Double, Double, Double), 42 | initStd: Double): FMModel = { 43 | new FMWithSGD(task, stepSize, numIterations, dim, regParam, miniBatchFraction) 44 | .setInitStd(initStd) 45 | .run(input) 46 | } 47 | /** Train with fixed settings: stepSize 1.0, dim (true, true, 8), regParam (0, 1e-3, 1e-4), miniBatchFraction 1e-5 and initStd 0.01. NOTE(review): a mini-batch fraction of 1e-5 samples very few rows per iteration; confirm this is intended. */ 48 | def train(input: RDD[LabeledPoint], 49 | task: Int, 50 | numIterations: Int): FMModel = { 51 | new FMWithSGD(task, 1.0, numIterations, (true, true, 8), (0, 1e-3, 1e-4), 1e-5) 52 | .setInitStd(0.01) 53 | .run(input) 54 | } 55 | } 56 | 57 | 58 | class FMWithSGD(private var task: Int, 59 | private var stepSize: Double, 60 | private var numIterations: Int, 61 | private var dim: (Boolean, Boolean, Int), 62 | private var regParam: (Double, Double, Double), 63 | private var miniBatchFraction: Double) extends Serializable with Logging { 64 | 65 | 66 | /** 67 | * Construct an object with default parameters: {task: 0, stepSize: 1.0, numIterations: 100, 68 | * dim: (true, true, 8), regParam: (0, 1e-3, 1e-4), miniBatchFraction: 1e-5}. 69 | */ 70 | def this() = this(0, 1.0, 100, (true, true, 8), (0, 1e-3, 1e-4), 1e-5) 71 | 72 | private var k0: Boolean = dim._1 73 | private var k1: Boolean = dim._2 74 | private var k2: Int = dim._3 75 | 76 | private var r0: Double = regParam._1 77 | private var r1: Double = regParam._2 78 | private var r2: Double = regParam._3 79 | 80 | private var initMean: Double = 0 81 | private var initStd: Double = 0.01 82 | 83 | private var numFeatures: Int = -1 84 | private var minLabel: Double = Double.MaxValue 85 | private var maxLabel: Double = Double.MinValue 86 | 87 | /** 88 | * A (Boolean,Boolean,Int) 3-Tuple stands for whether the global bias term should be used, whether the one-way 89 | * interactions should be used, and the number of factors that are used for pairwise interactions, respectively. 
90 | */ 91 | def setDim(dim: (Boolean, Boolean, Int)): this.type = { 92 | require(dim._3 > 0) 93 | this.k0 = dim._1 94 | this.k1 = dim._2 95 | this.k2 = dim._3 96 | this 97 | } 98 | 99 | /** 100 | * 101 | * @param addIntercept determines if the global bias term w0 should be used 102 | * @param add1Way determines if one-way interactions (bias terms for each variable) 103 | * @param numFactors the number of factors that are used for pairwise interactions 104 | */ 105 | def setDim(addIntercept: Boolean = true, add1Way: Boolean = true, numFactors: Int = 8): this.type = { 106 | setDim((addIntercept, add1Way, numFactors)) 107 | } 108 | 109 | 110 | /** 111 | * @param regParams A (Double,Double,Double) 3-Tuple stands for the regularization parameters of intercept, one-way 112 | * interactions and pairwise interactions, respectively. 113 | */ 114 | def setRegParam(regParams: (Double, Double, Double)): this.type = { 115 | require(regParams._1 >= 0 && regParams._2 >= 0 && regParams._3 >= 0) 116 | this.r0 = regParams._1 117 | this.r1 = regParams._2 118 | this.r2 = regParams._3 119 | this 120 | } 121 | 122 | /** 123 | * @param regIntercept intercept regularization 124 | * @param reg1Way one-way interactions regularization 125 | * @param reg2Way pairwise interactions regularization 126 | */ 127 | def setRegParam(regIntercept: Double = 0, reg1Way: Double = 0, reg2Way: Double = 0): this.type = { 128 | setRegParam((regIntercept, reg1Way, reg2Way)) 129 | } 130 | 131 | 132 | /** 133 | * @param initStd Standard Deviation used for factorization matrix initialization. 134 | */ 135 | def setInitStd(initStd: Double): this.type = { 136 | require(initStd > 0) 137 | this.initStd = initStd 138 | this 139 | } 140 | 141 | /** 142 | * Set fraction of data to be used for each SGD iteration. 
143 | */ 144 | def setMiniBatchFraction(miniBatchFraction: Double): this.type = { 145 | require(miniBatchFraction > 0 && miniBatchFraction <= 1) 146 | this.miniBatchFraction = miniBatchFraction 147 | this 148 | } 149 | 150 | /** 151 | * Set the number of iterations for SGD. 152 | */ 153 | def setNumIterations(numIterations: Int): this.type = { 154 | require(numIterations > 0) 155 | this.numIterations = numIterations 156 | this 157 | } 158 | 159 | /** 160 | * Set the initial step size of SGD for the first step. 161 | * In subsequent steps, the step size will decrease with stepSize/sqrt(t) 162 | */ 163 | def setStepSize(stepSize: Double): this.type = { 164 | require(stepSize >= 0) 165 | this.stepSize = stepSize 166 | this 167 | } 168 | 169 | 170 | /** 171 | * Encode the FMModel to a dense vector, with its first numFeatures * numFactors elements representing the 172 | * factorization matrix v, sequential numFeatures elements representing the one-way interactions weights w if k1 is 173 | * set to true, and the last element representing the intercept w0 if k0 is set to true. 174 | * The factorization matrix v is initialized by Gaussinan(0, initStd). 
175 | * v : numFeatures * numFactors + w : [numFeatures] + w0 : [1] 176 | */ 177 | private def generateInitWeights(): Vector = { 178 | (k0, k1) match { 179 | case (true, true) => 180 | Vectors.dense(Array.fill(numFeatures * k2)(Random.nextGaussian() * initStd + initMean) ++ 181 | Array.fill(numFeatures + 1)(0.0)) 182 | 183 | case (true, false) => 184 | Vectors.dense(Array.fill(numFeatures * k2)(Random.nextGaussian() * initStd + initMean) ++ 185 | Array(0.0)) 186 | 187 | case (false, true) => 188 | Vectors.dense(Array.fill(numFeatures * k2)(Random.nextGaussian() * initStd + initMean) ++ 189 | Array.fill(numFeatures)(0.0)) 190 | 191 | case (false, false) => 192 | Vectors.dense(Array.fill(numFeatures * k2)(Random.nextGaussian() * initStd + initMean)) 193 | } 194 | } 195 | 196 | 197 | /** 198 | * Create a FMModel from an encoded vector. 199 | */ 200 | private def createModel(weights: Vector): FMModel = { 201 | 202 | val values = weights.toArray 203 | 204 | val v = new DenseMatrix(k2, numFeatures, values.slice(0, numFeatures * k2)) 205 | 206 | val w = if (k1) Some(Vectors.dense(values.slice(numFeatures * k2, numFeatures * k2 + numFeatures))) else None 207 | 208 | val w0 = if (k0) values.last else 0.0 209 | 210 | new FMModel(task, v, w, w0, minLabel, maxLabel) 211 | } 212 | 213 | 214 | /** 215 | * Run the algorithm with the configured parameters on an input RDD 216 | * of LabeledPoint entries. 
217 | */ 218 | def run(input: RDD[LabeledPoint]): FMModel = { 219 | 220 | this.numFeatures = input.first().features.size 221 | require(numFeatures > 0) 222 | 223 | if (task == 0) { 224 | val (minT, maxT) = input.map(_.label).aggregate[(Double, Double)]((Double.MaxValue, Double.MinValue))({ 225 | case ((min, max), v) => 226 | (Math.min(min, v), Math.max(max, v)) 227 | }, { 228 | case ((min1, max1), (min2, max2)) => 229 | (Math.min(min1, min2), Math.max(max1, max2)) 230 | }) 231 | 232 | this.minLabel = minT 233 | this.maxLabel = maxT 234 | } 235 | 236 | val gradient = new FMGradient(task, k0, k1, k2, numFeatures, minLabel, maxLabel) 237 | 238 | val updater = new FMUpdater(k0, k1, k2, r0, r1, r2, numFeatures) 239 | 240 | val optimizer = new GradientDescent(gradient, updater) 241 | .setStepSize(stepSize) 242 | .setNumIterations(numIterations) 243 | .setMiniBatchFraction(miniBatchFraction) 244 | .setConvergenceTol(Double.MinPositiveValue) 245 | 246 | val data = task match { 247 | case 0 => 248 | input.map(l => (l.label, l.features)).persist() 249 | case 1 => 250 | input.map(l => (if (l.label > 0) 1.0 else -1.0, l.features)).persist() 251 | } 252 | 253 | val initWeights = generateInitWeights() 254 | 255 | val weights = optimizer.optimize(data, initWeights) 256 | 257 | createModel(weights) 258 | } 259 | } 260 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/mllib/regression/FactorizationMachine.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.mllib.regression 2 | 3 | import org.json4s.DefaultFormats 4 | import org.json4s.JsonDSL._ 5 | import org.json4s.jackson.JsonMethods._ 6 | 7 | import scala.util.Random 8 | 9 | import org.apache.spark.{SparkContext, Logging} 10 | import org.apache.spark.mllib.linalg._ 11 | import org.apache.spark.mllib.optimization.{Updater, Gradient} 12 | import org.apache.spark.rdd.RDD 13 | import 
org.apache.spark.storage.StorageLevel
import org.apache.spark.mllib.util.Loader._
import org.apache.spark.mllib.util.{Loader, Saveable}
import org.apache.spark.sql.{DataFrame, SQLContext}

/**
 * Created by zrf on 4/13/15.
 */

/**
 * Factorization Machine model.
 *
 * @param task         0 for regression, 1 for binary classification
 * @param factorMatrix pairwise-interaction factors, numFactors rows by numFeatures columns
 * @param weightVector one-way interaction weights, if the model uses them
 * @param intercept    global bias term
 * @param min          lower clamp applied to regression predictions
 * @param max          upper clamp applied to regression predictions
 */
class FMModel(val task: Int,
              val factorMatrix: Matrix,
              val weightVector: Option[Vector],
              val intercept: Double,
              val min: Double,
              val max: Double) extends Serializable with Saveable {

  val numFeatures: Int = factorMatrix.numCols
  val numFactors: Int = factorMatrix.numRows

  require(numFeatures > 0 && numFactors > 0)
  require(task == 0 || task == 1)

  /**
   * Predict a single example.
   *
   * @return for task 0 the raw score clamped to [min, max]; for task 1 the sigmoid of the raw score
   */
  def predict(testData: Vector): Double = {
    require(testData.size == numFeatures)

    var pred = intercept
    weightVector.foreach { w =>
      testData.foreachActive { (i, v) =>
        pred += w(i) * v
      }
    }

    // Pairwise term per factor: ((sum_i v_fi x_i)^2 - sum_i (v_fi x_i)^2) / 2
    for (f <- 0 until numFactors) {
      var sum = 0.0
      var sumSqr = 0.0
      testData.foreachActive { (i, v) =>
        val d = factorMatrix(f, i) * v
        sum += d
        sumSqr += d * d
      }
      pred += (sum * sum - sumSqr) / 2
    }

    task match {
      case 0 =>
        Math.min(Math.max(pred, min), max)
      case 1 =>
        1.0 / (1.0 + Math.exp(-pred))
    }
  }

  /** Predict every example of an RDD. */
  def predict(testData: RDD[Vector]): RDD[Double] = {
    testData.mapPartitions { examples =>
      examples.map(vec => predict(vec))
    }
  }

  override protected def formatVersion: String = "1.0"

  override def save(sc: SparkContext, path: String): Unit = {
    val data = FMModel.SaveLoadV1_0.Data(factorMatrix, weightVector, intercept, min, max, task)
    FMModel.SaveLoadV1_0.save(sc, path, data)
  }
}

object FMModel extends Loader[FMModel] {

  private object SaveLoadV1_0 {

    def thisFormatVersion = "1.0"

    def thisClassName = "org.apache.spark.mllib.regression.FMModel"

    /** Model data for model import/export */
    case class Data(factorMatrix: Matrix, weightVector: Option[Vector], intercept: Double,
                    min: Double, max: Double, task: Int)

    def save(sc: SparkContext, path: String, data: Data): Unit = {
      val sqlContext = new SQLContext(sc)
      import sqlContext.implicits._
      // Create JSON metadata. The class entry must be thisClassName: FMModel.load compares
      // against it, and this.getClass.getName here would name the nested SaveLoadV1_0 object.
      val metadata = compact(render(
        ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~
          ("numFeatures" -> data.factorMatrix.numCols) ~ ("numFactors" -> data.factorMatrix.numRows) ~
          ("min" -> data.min) ~ ("max" -> data.max) ~ ("task" -> data.task)))
      sc.parallelize(Seq(metadata), 1).saveAsTextFile(metadataPath(path))

      // Create Parquet data.
      val dataRDD: DataFrame = sc.parallelize(Seq(data), 1).toDF()
      dataRDD.write.parquet(dataPath(path))
    }

    def load(sc: SparkContext, path: String): FMModel = {
      val sqlContext = new SQLContext(sc)
      // Load Parquet data.
      val dataRDD = sqlContext.read.parquet(dataPath(path))
      // Check schema explicitly since erasure makes it hard to use match-case for checking.
      checkSchema[Data](dataRDD.schema)
      val dataArray = dataRDD.select("task", "factorMatrix", "weightVector", "intercept", "min", "max").take(1)
      assert(dataArray.length == 1, s"Unable to load FMModel data from: ${dataPath(path)}")
      val data = dataArray(0)
      val task = data.getInt(0)
      val factorMatrix = data.getAs[Matrix](1)
      // The Option field is stored as a nullable column: the row holds a Vector or null,
      // never an Option instance, so wrap it here.
      val weightVector = Option(data.getAs[Vector](2))
      val intercept = data.getDouble(3)
      val min = data.getDouble(4)
      val max = data.getDouble(5)
      new FMModel(task, factorMatrix, weightVector, intercept, min, max)
    }
  }

  override def load(sc: SparkContext, path: String): FMModel = {
    implicit val formats = DefaultFormats

    val (loadedClassName, version, metadata) = loadMetadata(sc, path)
    val classNameV1_0 = SaveLoadV1_0.thisClassName
    // Earlier versions of save wrote the runtime class name of the nested helper object;
    // keep accepting it so models saved that way remain loadable.
    val legacyClassNameV1_0 = SaveLoadV1_0.getClass.getName

    (loadedClassName, version) match {
      case (className, "1.0") if className == classNameV1_0 || className == legacyClassNameV1_0 =>
        val numFeatures = (metadata \ "numFeatures").extract[Int]
        val numFactors = (metadata \ "numFactors").extract[Int]
        val model = SaveLoadV1_0.load(sc, path)
        assert(model.factorMatrix.numCols == numFeatures,
          s"FMModel.load expected $numFeatures features," +
            s" but factorMatrix had columns of size:" +
            s" ${model.factorMatrix.numCols}")
        assert(model.factorMatrix.numRows == numFactors,
          s"FMModel.load expected $numFactors factors," +
            s" but factorMatrix had rows of size:" +
            s" ${model.factorMatrix.numRows}")
        model

      case _ => throw new Exception(
        s"FMModel.load did not recognize model with (className, format version):" +
          s"($loadedClassName, $version). Supported:\n" +
          s" ($classNameV1_0, 1.0)")
    }
  }
}


/**
 * :: DeveloperApi ::
 * Compute the gradient and loss of a factorization machine for regression (task 0)
 * or binary classification (task 1).
* For the detailed mathematical derivation, see the reference at
 * http://doi.acm.org/10.1145/2168752.2168771
 */
class FMGradient(val task: Int, val k0: Boolean, val k1: Boolean, val k2: Int,
                 val numFeatures: Int, val min: Double, val max: Double) extends Gradient {

  /**
   * Score one example with the encoded weights.
   *
   * @return the prediction (clamped to [min, max] when task is 0) together with, for every
   *         factor f, the sum over active features of weights(i * k2 + f) * x_i
   */
  private def predict(data: Vector, weights: Vector): (Double, Array[Double]) = {

    // The intercept, when used, is the last element of the encoded weights.
    var pred = if (k0) weights(weights.size - 1) else 0.0

    if (k1) {
      val linearOffset = numFeatures * k2
      data.foreachActive { (i, x) =>
        pred += weights(linearOffset + i) * x
      }
    }

    // One pass over the active entries, keeping a running sum and sum of squares per factor.
    val sum = Array.fill(k2)(0.0)
    val sumSqr = Array.fill(k2)(0.0)
    data.foreachActive { (i, x) =>
      val base = i * k2
      var f = 0
      while (f < k2) {
        val d = weights(base + f) * x
        sum(f) += d
        sumSqr(f) += d * d
        f += 1
      }
    }

    var f = 0
    while (f < k2) {
      pred += (sum(f) * sum(f) - sumSqr(f)) / 2
      f += 1
    }

    if (task == 0) {
      pred = Math.min(Math.max(pred, min), max)
    }

    (pred, sum)
  }


  /**
   * Add this example's gradient to `cumGrad` in place.
   *
   * @throws IllegalArgumentException if `cumGrad` is not a DenseVector
   */
  private def cumulateGradient(data: Vector, weights: Vector,
                               pred: Double, label: Double,
                               sum: Array[Double], cumGrad: Vector): Unit = {

    // Derivative of the loss with respect to the raw prediction.
    val mult = task match {
      case 0 =>
        pred - label
      case 1 =>
        -label * (1.0 - 1.0 / (1.0 + Math.exp(-label * pred)))
    }

    cumGrad match {
      case dense: DenseVector =>
        val acc = dense.values

        if (k0) {
          acc(acc.length - 1) += mult
        }

        if (k1) {
          val linearOffset = numFeatures * k2
          data.foreachActive { (i, x) =>
            acc(linearOffset + i) += x * mult
          }
        }

        data.foreachActive { (i, x) =>
          val base = i * k2
          var f = 0
          while (f < k2) {
            acc(base + f) += (sum(f) * x - weights(base + f) * x * x) * mult
            f += 1
          }
        }

      case _ =>
        throw new IllegalArgumentException(
          s"cumulateGradient only supports adding to a dense vector but got type ${cumGrad.getClass}.")
    }
  }


  /** Gradient and loss of a single example, with the gradient in a freshly allocated dense vector. */
  override def compute(data: Vector, label: Double, weights: Vector): (Vector, Double) = {
    val gradientAcc = Vectors.dense(Array.fill(weights.size)(0.0))
    val loss = compute(data, label, weights, gradientAcc)
    (gradientAcc, loss)
  }

  /**
   * Accumulate the gradient of a single example into `cumGradient` and return its loss:
   * the squared error for task 0, and `1 - signum(pred * label)` for task 1.
   */
  override def compute(data: Vector, label: Double, weights: Vector, cumGradient: Vector): Double = {
    require(data.size == numFeatures)
    val (pred, sum) = predict(data, weights)
    cumulateGradient(data, weights, pred, label, sum, cumGradient)

    task match {
      case 0 =>
        val err = pred - label
        err * err
      case 1 =>
        1 - Math.signum(pred * label)
    }
  }
}

/**
 * :: DeveloperApi ::
 * Updater for L2 regularized problems, with separate regularization strengths for the
 * intercept (r0), the one-way weights (r1) and the pairwise factors (r2).
 * Uses a step-size decreasing with the square root of the number of iterations.
 */
class FMUpdater(val k0: Boolean, val k1: Boolean, val k2: Int,
                val r0: Double, val r1: Double, val r2: Double,
                val numFeatures: Int) extends Updater {

  /**
   * Take one regularized gradient step.
   *
   * The `regParam` argument is not used; r0, r1 and r2 are applied instead.
   *
   * @return the new weights and half the weighted sum of their squares
   */
  override def compute(weightsOld: Vector, gradient: Vector,
                       stepSize: Double, iter: Int, regParam: Double): (Vector, Double) = {
    val eta = stepSize / math.sqrt(iter)
    val size = weightsOld.size

    val updated = new Array[Double](size)
    var penalty = 0.0

    // Update a single coordinate and add its contribution to the regularization value.
    def step(idx: Int, reg: Double): Unit = {
      val w = weightsOld(idx) - eta * (gradient(idx) + reg * weightsOld(idx))
      updated(idx) = w
      penalty += reg * w * w
    }

    if (k0) {
      step(size - 1, r0)
    }

    val numFactorWeights = numFeatures * k2

    if (k1) {
      (numFactorWeights until numFactorWeights + numFeatures).foreach(step(_, r1))
    }

    (0 until numFactorWeights).foreach(step(_, r2))

    (Vectors.dense(updated), penalty / 2)
  }
}
--------------------------------------------------------------------------------