├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── build.sbt ├── project └── assembly.sbt └── src ├── examples ├── resources │ └── valid_codes └── scala │ └── sparkz │ └── examples │ ├── BinaryClassifierEvaluationExample.scala │ └── DataValidation.scala └── main ├── resources └── stopwords.tsv └── scala ├── sam └── sceval │ ├── AreaUnderCurve.scala │ ├── BinUtils.scala │ ├── BinaryConfusionMatrix.scala │ ├── EvaluationPimps.scala │ ├── XValidator.scala │ └── package.scala └── sparkz ├── classifiers ├── BinaryClassifier.scala ├── BinaryClassifierTrainerWithTransformer.scala ├── DecisionTreeVectorClassifier.scala ├── Features.scala └── RandomBinaryClassifierTrainer.scala ├── evaluation ├── BinaryClassifierEvaluation.scala └── MAP.scala ├── transformers ├── EnsembleTransformer.scala ├── MultiOneHotTransformer.scala ├── OriginalNumericalsTransformer.scala └── TermFrequencyTransformer.scala └── utils ├── AppLogger.scala ├── Pimps.scala ├── StatCounter.scala ├── TopN.scala └── VPTree.scala /.gitignore: -------------------------------------------------------------------------------- 1 | ### Intellij ### 2 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, WebStorm 3 | 4 | ## Directory-based project format 5 | .idea/ 6 | # if you remove the above rule, at least ignore user-specific stuff: 7 | # .idea/workspace.xml 8 | # .idea/tasks.xml 9 | # and these sensitive or high-churn files: 10 | # .idea/dataSources.ids 11 | # .idea/dataSources.xml 12 | # .idea/sqlDataSources.xml 13 | # .idea/dynamic.xml 14 | 15 | ## File-based project format 16 | *.ipr 17 | *.iws 18 | *.iml 19 | 20 | ## Additional for IntelliJ 21 | out/ 22 | gen/ 23 | 24 | # generated by mpeltonen/sbt-idea plugin 25 | .idea_modules/ 26 | 27 | # generated by JIRA plugin 28 | atlassian-ide-plugin.xml 29 | 30 | # generated by Crashlytics plugin (for Android Studio and Intellij) 31 | com_crashlytics_export_strings.xml 32 | 33 | 34 | ### idea-gitignore 35 | 36 | # Release package 37 | idea-gitignore.jar 38 | resources/templates.list 39 | build/ 40 | build.properties 41 | junit*.properties -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | 3 | scala: 4 | - 2.10.4 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sparkz 2 | [![Build Status](https://travis-ci.org/gm-spacagna/sparkz.svg?branch=master)](https://travis-ci.org/gm-spacagna/sparkz) 3 | 4 | A proof-of-concept extension to the amazing Spark framework for better functional programming. 5 | The project aims to extend, and in a few cases re-implement, some of the functionality and classes in the [Apache Spark](spark.apache.org) framework. 6 | 7 | The main motivation is to make the APIs of some Machine Learning components statically typed, to provide the functional structures missing from some classes (Broadcast variables, data validation pipelines, utility classes...) and to work around unnecessary limitations imposed by private fields/methods. 8 | Moreover, the project introduces a collection of utility functions, implicits and tutorials that show the power, conciseness and elegance of the Spark framework when combined with a fully functional design. 9 | 10 | ## Sonatype dependency 11 | Maven: 12 | 13 | <dependency> 14 | <groupId>com.github.gm-spacagna</groupId> 15 | <artifactId>sparkz_2.10</artifactId> 16 | <version>0.1.0</version> 17 | </dependency> 18 | 19 | sbt: 20 | 21 | "com.github.gm-spacagna" % "sparkz_2.10" % "0.1.0" 22 | 23 | ## Current features 24 | 25 | * Functional Data Validation using monads and applicative functors: https://datasciencevademecum.wordpress.com/2016/03/09/functional-data-validation-using-monads-and-applicative-functors/ 26 | * Integration with sceval for a better binary classification evaluation framework: https://github.com/samthebest/sceval 27 | * Immutable StatCounter class with a Monoid instance 28 | * Collection of Pimps (implicit utility classes) for everyday tasks 29 | * Lazy logger for debugging computations 30 | * Transformer -> Trainer -> Model -> Evaluation functional framework for machine learning algorithms (similar to ML pipelines but typed and functional): https://datasciencevademecum.wordpress.com/2016/04/12/robust-and-declarative-machine-learning-pipelines-for-predictive-buyin/ 31 | 32 | ## WIP 33 | * Functor for Spark Broadcast 34 | 35 | 36 | ## Limitations 37 | The original Spark implementations are intentionally not fully functional in order to avoid overloading the garbage collector and to allow more efficient, mutable data structures. This project is only a proof of concept meant to inspire developers, data scientists and engineers to think about their designs in purely functional terms; it does not guarantee better performance. You are strongly encouraged to tailor and tune each component to your specific needs.
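## Quick example

The snippet below is condensed from `src/examples/scala/sparkz/examples/BinaryClassifierEvaluationExample.scala`; `data`, `UserFeatures` and `UserInfo` are that example's own types, with `data` being an `RDD[FeaturesWithBooleanLabel[UserFeatures] with MetaData[UserInfo]]`. It cross-validates each classifier and scores it with sceval's confusion-matrix pimps:

    import sam.sceval.EvaluationPimps._
    import sparkz.classifiers._
    import sparkz.evaluation.BinaryClassifierEvaluation
    import sparkz.transformers._

    val classifiers: List[BinaryClassifierTrainer[UserFeatures]] = List(
      RandomBinaryClassifierTrainer(),
      BinaryClassifierTrainerWithTransformer(
        vec2classifier = DecisionTreeClassifierTrainer(impurity = "gini", maxDepth = 5, maxBins = 32),
        transformer = TermFrequencyTransformer[String, UserFeatures](_.events)
      )
    )

    val aucByClassifier: Map[BinaryClassifierTrainer[UserFeatures], Double] =
      BinaryClassifierEvaluation.crossValidationScores(
        data = data,
        k = 10,
        classifiers = classifiers,
        uniqueId = (_: UserInfo).customerId,
        orderingField = (_: UserInfo).date.toDateTimeAtStartOfDay.getMillis,
        singleInference = true
      ).mapValues(
        _.map { case (example, score) => score -> example.isTrue }
          .confusions().areaUnderROC
      )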
38 | 39 | ## Related projects 40 | * Frameless: https://github.com/adelbertc/frameless 41 | * Exploratory Data Analysis: https://github.com/vicpara/exploratory-data-analysis 42 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | organization := "com.github.gm-spacagna" 2 | 3 | name := "sparkz" 4 | 5 | version := "0.1.0-SNAPSHOT" 6 | 7 | scalaVersion := "2.10.4" 8 | 9 | libraryDependencies ++= Seq( 10 | "joda-time" % "joda-time" % "2.6" withSources() withJavadoc(), 11 | "org.joda" % "joda-convert" % "1.2" withSources() withJavadoc(), 12 | "org.apache.spark" % "spark-core_2.10" % "1.3.0" withSources() withJavadoc(), 13 | "org.apache.spark" % "spark-mllib_2.10" % "1.3.0" withSources() withJavadoc(), 14 | "com.github.scala-incubator.io" %% "scala-io-file" % "0.4.2" withSources() withJavadoc(), 15 | "org.scalaz" %% "scalaz-core" % "7.0.6" withSources() withJavadoc(), 16 | "org.rogach" %% "scallop" % "0.9.5" withSources() withJavadoc(), 17 | "org.scala-lang" % "scalap" % "2.10.4" withSources() withJavadoc(), 18 | "org.scala-lang" % "scala-compiler" % "2.10.4" withSources() withJavadoc(), 19 | "org.specs2" %% "specs2-core" % "2.4.9-scalaz-7.0.6" % "test" withSources() withJavadoc(), 20 | "org.specs2" %% "specs2-scalacheck" % "2.4.9-scalaz-7.0.6" % "test" withSources() withJavadoc() 21 | ) 22 | 23 | resolvers ++= Seq( 24 | "Maven Central" at "https://repo1.maven.org/maven2/" 25 | ) 26 | 27 | mergeStrategy in assembly <<= (mergeStrategy in assembly) ((old) => { 28 | case x if Assembly.isConfigFile(x) => 29 | MergeStrategy.concat 30 | case PathList(ps @ _*) if Assembly.isReadme(ps.last) || Assembly.isLicenseFile(ps.last) => 31 | MergeStrategy.rename 32 | case PathList("META-INF", xs @ _*) => 33 | (xs map {_.toLowerCase}) match { 34 | case ("manifest.mf" :: Nil) | ("index.list" :: Nil) | ("dependencies" :: Nil) => 35 | MergeStrategy.discard 36 | case ps @ (x :: xs) if ps.last.endsWith(".sf") || ps.last.endsWith(".dsa") => 37 | MergeStrategy.discard 38 | case "plexus" :: xs => 39 | MergeStrategy.discard 40 | case "services" :: xs => 41 | MergeStrategy.filterDistinctLines 42 | case ("spring.schemas" :: Nil) | ("spring.handlers" :: Nil) => 43 | MergeStrategy.filterDistinctLines 44 | case _ => MergeStrategy.first // Changed deduplicate to first 45 | } 46 | case PathList(_*) => MergeStrategy.first // added this line 47 | }) 48 | -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") 2 | -------------------------------------------------------------------------------- /src/examples/resources/valid_codes: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 10 | 10 11 | -------------------------------------------------------------------------------- /src/examples/scala/sparkz/examples/BinaryClassifierEvaluationExample.scala: -------------------------------------------------------------------------------- 1 | package sparkz.examples 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.joda.time.LocalDate 5 | import sam.sceval.EvaluationPimps._ 6 | import sparkz.classifiers._ 7 | import sparkz.evaluation.BinaryClassifierEvaluation 8 | import sparkz.transformers._ 9 | 10 | case class UserInfo(customerId: Long, date: 
LocalDate) 11 | case class UserFeatures(events: List[String], attributes: Map[String, String], numericals: Map[String, Double]) 12 | case class UserFeaturesWithBooleanLabel(features: UserFeatures, 13 | metaData: UserInfo, 14 | isTrue: Boolean) extends FeaturesWithBooleanLabel[UserFeatures] with MetaData[UserInfo] 15 | 16 | object BinaryClassifierEvaluationExample { 17 | def auc(data: RDD[(FeaturesWithBooleanLabel[UserFeatures] with MetaData[UserInfo])]): Map[BinaryClassifierTrainer[UserFeatures], Double] = { 18 | 19 | val vecDecisionTreeClassifier: BinaryClassifierVectorTrainer = 20 | DecisionTreeClassifierTrainer(impurity = "gini", maxDepth = 5, maxBins = 32) 21 | 22 | val eventsSubTransformer = TermFrequencyTransformer[String, UserFeatures](_.events) 23 | val categoriesSubTransformer = MultiOneHotTransformer[String, UserFeatures](_.attributes) 24 | val numericalsSubTransformer = OriginalNumericalsTransformer[String, UserFeatures](_.numericals) 25 | 26 | val classifiers: List[BinaryClassifierTrainer[UserFeatures]] = List( 27 | RandomBinaryClassifierTrainer(), 28 | BinaryClassifierTrainerWithTransformer( 29 | vec2classifier = vecDecisionTreeClassifier, 30 | transformer = eventsSubTransformer 31 | ), 32 | BinaryClassifierTrainerWithTransformer( 33 | vec2classifier = vecDecisionTreeClassifier, 34 | transformer = categoriesSubTransformer 35 | ), 36 | BinaryClassifierTrainerWithTransformer( 37 | vec2classifier = vecDecisionTreeClassifier, 38 | transformer = numericalsSubTransformer 39 | ), 40 | BinaryClassifierTrainerWithTransformer( 41 | vec2classifier = vecDecisionTreeClassifier, 42 | transformer = EnsembleTransformer(eventsSubTransformer, categoriesSubTransformer, numericalsSubTransformer) 43 | ) 44 | ) 45 | 46 | BinaryClassifierEvaluation.crossValidationScores( 47 | data = data, 48 | k = 10, 49 | classifiers = classifiers, 50 | uniqueId = (_: UserInfo).customerId, 51 | orderingField = (_: UserInfo).date.toDateTimeAtStartOfDay.getMillis, 52 | singleInference = true 53 | ).mapValues( 54 | _.map { 55 | case (features, score) => score -> features.isTrue 56 | } 57 | .confusions().areaUnderROC 58 | ) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/examples/scala/sparkz/examples/DataValidation.scala: -------------------------------------------------------------------------------- 1 | package sparkz.examples 2 | 3 | import org.apache.spark.broadcast.Broadcast 4 | import org.apache.spark.rdd.RDD 5 | import org.joda.time.{DateTime, Interval, LocalDate} 6 | import sparkz.utils.Pimps._ 7 | 8 | import scalaz.Scalaz._ 9 | import scalaz.ValidationNel 10 | 11 | case class UserEvent(userId: Long, eventCode: Int, timestamp: Long) 12 | 13 | sealed trait InvalidEventCause 14 | case object NonRecognizedEventType extends InvalidEventCause 15 | case object BlackListUser extends InvalidEventCause 16 | case object NonEligibleUser extends InvalidEventCause 17 | case object OutOfGlobalIntervalEvent extends InvalidEventCause 18 | case object FirstDayToConsiderEvent extends InvalidEventCause 19 | 20 | case class InvalidEvent(event: UserEvent, cause: InvalidEventCause) 21 | 22 | case object DataValidation { 23 | def validationFunction(events: RDD[UserEvent], 24 | eligibleUsers: Set[Long], 25 | validEventCodes: Set[Int], 26 | blackListEventCodes: Set[Int], 27 | minDate: String, maxDate: String): UserEvent => ValidationNel[InvalidEvent, UserEvent] = { 28 | val sc = events.context 29 | 30 | val validEventCodesBV: Broadcast[Set[Int]] = sc.broadcast(validEventCodes) 
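    // Each rule below is a PartialFunction[UserEvent, InvalidEventCause] defined only where the
    // event violates that rule; the reference sets are broadcast so executors read one shared
    // copy instead of serialising the set into every task closure.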
31 | val notRecognizedEventCode: PartialFunction[UserEvent, InvalidEventCause] = { 32 | case event if !validEventCodesBV.value.contains(event.eventCode) => NonRecognizedEventType 33 | } 34 | 35 | val eligibleUsersBV: Broadcast[Set[Long]] = sc.broadcast(eligibleUsers) 36 | val customerNotEligible: PartialFunction[UserEvent, InvalidEventCause] = { 37 | case event if !eligibleUsersBV.value.contains(event.userId) => NonEligibleUser 38 | } 39 | 40 | val blackListEventCodesBV: Broadcast[Set[Int]] = sc.broadcast(blackListEventCodes) 41 | // Users for which we observed a black list event 42 | val blackListUsersBV: Broadcast[Set[Long]] = sc.broadcast( 43 | events.filter(event => blackListEventCodesBV.value.contains(event.eventCode)) 44 | .map(_.userId).distinct().collect().toSet 45 | ) 46 | val customerIsInBlackList: PartialFunction[UserEvent, InvalidEventCause] = { 47 | case event if blackListUsersBV.value.contains(event.userId) => BlackListUser 48 | } 49 | 50 | 51 | val eventIsOutOfGlobalInterval: PartialFunction[UserEvent, InvalidEventCause] = { 52 | case event if !new Interval(DateTime.parse(minDate), DateTime.parse(maxDate)).contains(event.timestamp) => 53 | OutOfGlobalIntervalEvent 54 | } 55 | 56 | // max between first date we have ever seen a customer event and the global min date 57 | val customersFirstDayToConsiderBV: Broadcast[Map[Long, LocalDate]] = 58 | sc.broadcast( 59 | events.keyBy(_.userId) 60 | .mapValues(personalEvent => new DateTime(personalEvent.timestamp).toLocalDate) 61 | .reduceByKey((date1, date2) => List(date1, date2).minBy(_.toDateTimeAtStartOfDay.getMillis)) 62 | .mapValues(firstDate => List(firstDate, LocalDate.parse(minDate)).maxBy(_.toDateTimeAtStartOfDay.getMillis)) 63 | .collect().toMap 64 | ) 65 | val eventIsFirstDayToConsider: PartialFunction[UserEvent, InvalidEventCause] = { 66 | case event if customersFirstDayToConsiderBV.value(event.userId).isEqual(event.timestamp.toLocalDate) => 67 | FirstDayToConsiderEvent 68 | } 69 | val validationRules: List[PartialFunction[UserEvent, InvalidEventCause]] = 70 | List(customerNotEligible, notRecognizedEventCode, customerIsInBlackList, 71 | eventIsOutOfGlobalInterval.orElse(eventIsFirstDayToConsider) 72 | ) 73 | 74 | (event: UserEvent) => validationRules.map(_.toFailureNel(event, InvalidEvent(event, _))).reduce(_ |+++| _) 75 | } 76 | 77 | def onlyValidEvents(events: RDD[UserEvent], 78 | validationFunc: UserEvent => ValidationNel[InvalidEvent, UserEvent]): RDD[UserEvent] = 79 | events.map(validationFunc).flatMap(_.toOption) 80 | 81 | def invalidEvents(events: RDD[UserEvent], 82 | validationFunc: UserEvent => ValidationNel[InvalidEvent, UserEvent]): RDD[InvalidEvent] = 83 | events.map(validationFunc).flatMap(_.swap.toOption).flatMap(_.toList) 84 | 85 | def outOfRangeEvents(events: RDD[UserEvent], 86 | validationFunc: UserEvent => ValidationNel[InvalidEvent, UserEvent]): RDD[UserEvent] = 87 | events.map(validationFunc).flatMap(_.swap.toOption).flatMap(_.toSet).flatMap { 88 | case InvalidEvent(event, OutOfGlobalIntervalEvent) => event.some 89 | case _ => Nil 90 | } 91 | 92 | // This method will return something like: 93 | // Map(Set(NonEligibleCustomer, NonRecognizedEventType) -> 36018450, 94 | // Set(NonEligibleUser) -> 9037691, 95 | // Set(NonEligibleUser, BlackListUser, NonRecognizedEventType) -> 137816, 96 | // Set(NonEligibleUser) -> 464694973, 97 | // Set(BeforeFirstDayToConsiderEvent, NonRecognizedEventType) -> 5147475, 98 | // Set(OutOfGlobalIntervalEvent, NonRecognizedEventType) -> 983478) 99 | def 
causeSetToInvalidEventsCount(events: RDD[UserEvent], 100 | validationFunc: UserEvent => ValidationNel[InvalidEvent, UserEvent]): Map[Set[InvalidEventCause], Int] = 101 | events.map(validationFunc) 102 | .map(_.swap).flatMap(_.toOption).map(_.map(_.cause).toSet -> 1) 103 | .reduceByKey(_ + _) 104 | .collect().toMap 105 | 106 | // This method will return something like: 107 | // Map(Set(NonEligibleCustomer, NonRecognizedEventType) -> 1545, 108 | // Set(NonEligibleUser) -> 122, 109 | // Set(NonEligibleUser, BlackListUser, NonRecognizedEventType) -> 3224, 110 | // Set(NonEligibleUser) -> 4, 111 | // Set(BeforeFirstDayToConsiderEvent, NonRecognizedEventType) -> 335, 112 | // Set(OutOfGlobalIntervalEvent, NonRecognizedEventType) -> 33) 113 | def causeSetToUsersLostCount(events: RDD[UserEvent], 114 | validationFunc: UserEvent => ValidationNel[InvalidEvent, UserEvent]): Map[Set[InvalidEventCause], Int] = { 115 | val survivedUsersBV: Broadcast[Set[Long]] = 116 | events.context.broadcast(events.map(validationFunc).flatMap(_.toOption).map(_.userId).distinct().collect().toSet) 117 | 118 | events.map(validationFunc).flatMap(_.swap.toOption) 119 | .keyBy(_.head.event.userId) 120 | .filter(_._1 |> (!survivedUsersBV.value(_))) 121 | .mapValues(_.map(_.cause).toSet) 122 | .mapValues(Set(_)) 123 | .reduceByKey(_ ++ _) 124 | .flatMap(_._2) 125 | .map(_ -> 1) 126 | .reduceByKey(_ + _) 127 | .collect().toMap 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/main/resources/stopwords.tsv: -------------------------------------------------------------------------------- 1 | PETER 2 | PRACTICE 3 | ROAD 4 | ENGINEERING 5 | SOLICITORS 6 | SURGERY 7 | HAIR 8 | ROYAL 9 | DAY 10 | HOLDINGS 11 | SON 12 | CLUB 13 | COMMUNITY 14 | LONDON 15 | THE 16 | PHARMACY 17 | LODGE 18 | SCHOOL 19 | ELECTRICAL 20 | CLINIC 21 | CARE 22 | DESIGN 23 | SHOP 24 | STORES 25 | HOME 26 | NURSERY 27 | DENTAL 28 | AUTO 29 | SOUTH 30 | SERVICES 31 | HALL 32 | OLD 33 | OF 34 | HEATING 35 | ASSOCIATION 36 | CHURCH 37 | GARAGE 38 | SALON 39 | ROBERT 40 | SYSTEMS 41 | LIBRARY 42 | ST 43 | MOTORS 44 | HEALTH 45 | LTD 46 | PARK 47 | GROUP 48 | SERVICE 49 | NEW 50 | COMPANY 51 | SONS 52 | HOTEL 53 | HOUSE 54 | MOTOR 55 | ARMS 56 | UK 57 | THOMAS 58 | FARM 59 | LIMITED 60 | PLUMBING 61 | BEAUTY 62 | CO 63 | RESTAURANT 64 | GARDEN 65 | INN 66 | BAR 67 | CAFE 68 | MANAGEMENT 69 | BUILDING 70 | NORTH 71 | STATION 72 | CATHOLIC 73 | HIRE 74 | COURT 75 | CENTRE 76 | FISH 77 | STUDIO 78 | SPORTS 79 | PAUL 80 | MEDICAL 81 | NEWS 82 | VILLAGE 83 | ASSOCIATES 84 | PRIMARY 85 | OFFICE 86 | POST 87 | INTERNATIONAL 88 | SUPPLIES 89 | -------------------------------------------------------------------------------- /src/main/scala/sam/sceval/AreaUnderCurve.scala: -------------------------------------------------------------------------------- 1 | package sam.sceval 2 | 3 | import org.apache.spark.mllib.rdd.RDDFunctions._ 4 | import org.apache.spark.rdd.RDD 5 | 6 | /** Computes the area under the curve (AUC) using the trapezoidal rule. */ 7 | @deprecated("Don't use meaningless measures, use something that has a direct probabilistic meaning. 
See README.md") 8 | object AreaUnderCurve { 9 | def trapezoid(points: Seq[(Double, Double)]): Double = { 10 | val (x1, x2) :: (y1, y2) :: Nil = points 11 | (y1 - x1) * (y2 + x2) / 2.0 12 | } 13 | 14 | def apply(curve: RDD[(Double, Double)]): Double = curve.sliding(2).aggregate(0.0)( 15 | seqOp = (auc: Double, points: Array[(Double, Double)]) => auc + trapezoid(points.toList), 16 | combOp = _ + _ 17 | ) 18 | 19 | def apply(curve: Iterable[(Double, Double)]): Double = 20 | curve.toIterator.sliding(2).withPartial(false).aggregate(0.0)( 21 | seqop = (auc: Double, points: Seq[(Double, Double)]) => auc + trapezoid(points), 22 | combop = _ + _ 23 | ) 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/sam/sceval/BinUtils.scala: -------------------------------------------------------------------------------- 1 | package sam.sceval 2 | 3 | object BinUtils { 4 | case class BinStats(startBinNumber: Int = 0, offset: Int = 0) 5 | 6 | def binnerFac[Model](partitionLastIndexes: Array[Map[Model, Long]], 7 | numRecodsPerBin: Long): (Model, Long, Int) => Int = { 8 | val modelToBinStats: Map[Model, Array[BinStats]] = 9 | partitionLastIndexes.flatMap(_.keySet).toSet.foldLeft(Map.empty[Model, List[BinStats]])((modelToStats, model) => 10 | partitionLastIndexes.foldLeft(modelToStats)((modelToStats, partition) => 11 | modelToStats + (model -> ((partition.get(model), modelToStats.getOrElse(model, List(BinStats()))) match { 12 | case (Some(lastIndex), cum@(BinStats(startBinNumber, offset) :: _)) => 13 | val newOffset = (lastIndex + 1 + offset) % numRecodsPerBin 14 | BinStats((startBinNumber + (lastIndex + offset) / numRecodsPerBin).toInt + (if (newOffset == 0) 1 else 0), 15 | newOffset.toInt) +: cum 16 | case (None, cum@(binStats :: _)) => binStats +: cum 17 | case _ => ??? 
// default impossible case to remove warning messages 18 | })))) // map identity is a hack around the non-serializability of the Map returned from mapValues 19 | .mapValues(_.reverse.toArray).map(identity) 20 | 21 | (model: Model, index: Long, partitionIndex: Int) => { 22 | val BinStats(startBinNumber, offset) = modelToBinStats(model)(partitionIndex) 23 | (startBinNumber + (index + offset) / numRecodsPerBin).toInt 24 | } 25 | } 26 | 27 | def resultingBinNumber(recordsPerBin: Int, totalRecords: Long): Long = 28 | if (totalRecords % recordsPerBin == 0) totalRecords / recordsPerBin else (totalRecords / recordsPerBin) + 1 29 | 30 | def optimizeRecordsPerBin(totalRecords: Long, desiredBinNum: Int): Long = 31 | (1 to (if (desiredBinNum < totalRecords) 1 + (totalRecords / desiredBinNum) else desiredBinNum).toInt) 32 | .minBy(recordsPerBin => math.abs(resultingBinNumber(recordsPerBin, totalRecords) - desiredBinNum)) 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/sam/sceval/BinaryConfusionMatrix.scala: -------------------------------------------------------------------------------- 1 | package sam.sceval 2 | 3 | case class MutableBinaryLabelCount(var numPositives: Long = 0L, var numNegatives: Long = 0L) { 4 | def +=(label: Boolean): MutableBinaryLabelCount = { 5 | if (label) numPositives += 1L else numNegatives += 1L 6 | this 7 | } 8 | 9 | def +=(other: MutableBinaryLabelCount): MutableBinaryLabelCount = { 10 | numPositives += other.numPositives 11 | numNegatives += other.numNegatives 12 | this 13 | } 14 | 15 | override def clone: MutableBinaryLabelCount = new MutableBinaryLabelCount(numPositives, numNegatives) 16 | 17 | def count: BinaryLabelCount = BinaryLabelCount(numPositives, numNegatives) 18 | def total: Long = numPositives + numNegatives 19 | } 20 | 21 | case class BinaryLabelCount(numPositives: Long = 0L, numNegatives: Long = 0L) { 22 | def total: Long = numPositives + numNegatives 23 | 24 | def +(label: Boolean): BinaryLabelCount = 25 | if (label) copy(numPositives = numPositives + 1L) else copy(numNegatives = numNegatives + 1L) 26 | 27 | def +(other: BinaryLabelCount): BinaryLabelCount = 28 | BinaryLabelCount(numPositives + other.numPositives, numNegatives + other.numNegatives) 29 | } 30 | 31 | case class BinaryConfusionMatrix(tp: Long, fp: Long, tn: Long, fn: Long) { 32 | def total: Long = tp + fp + tn + fn 33 | 34 | def actualPositives: Long = tp + fn 35 | def actualNegatives: Long = fp + tn 36 | 37 | def predictedPositives: Long = tp + fp 38 | def predictedNegatives: Long = tn + fn 39 | 40 | def volume: Double = predictedPositives.toDouble / total 41 | 42 | /** The 'probability' the label is actually True given we predict True */ 43 | def precision: Double = if (predictedPositives == 0) 1.0 else tp.toDouble / predictedPositives 44 | 45 | /** The 'probability' we will predict True given the label is actually True */ 46 | def recall: Double = if (actualPositives == 0) 0.0 else tp.toDouble / actualPositives 47 | 48 | /** Actual probability of True */ 49 | def prior: Double = actualPositives.toDouble / total 50 | 51 | /** The 'probability' we will predict True */ 52 | def predictedPrior: Double = (tp + fp).toDouble / total 53 | 54 | /** How many times the predictor is better than random. One can usually map this number directly to 55 | * savings / profit making it have business meaning, unlike most measures. 
*/ 56 | def uplift: Double = (tp * total).toDouble / ((tp + fp) * (tp + fn)) 57 | 58 | def specificity: Double = tn.toDouble / actualNegatives 59 | def negativePredictiveValue: Double = tn.toDouble / predictedNegatives 60 | def fallOut: Double = fp.toDouble / actualNegatives 61 | def falseDiscoveryRate: Double = fp.toDouble / predictedPositives 62 | def falsePositiveRate: Double = if (actualNegatives == 0) 0.0 else fp.toDouble / actualNegatives 63 | def accuracy: Double = (tp + tn).toDouble / total 64 | 65 | def +(other: BinaryConfusionMatrix): BinaryConfusionMatrix = 66 | BinaryConfusionMatrix(other.tp + tp, other.fp + fp, other.tn + tn, other.fn + fn) 67 | 68 | @deprecated("Don't use meaningless measures, use something that has a direct probabilistic meaning. See README.md") 69 | def f1Measure(beta: Double = 1.0): Double = { 70 | val beta2 = beta * beta 71 | if (precision + recall == 0) 0.0 else (1.0 + beta2) * (precision * recall) / (beta2 * precision + recall) 72 | } 73 | } 74 | 75 | object BinaryConfusionMatrix { 76 | def apply(count: BinaryLabelCount, totalCount: BinaryLabelCount): BinaryConfusionMatrix = BinaryConfusionMatrix( 77 | tp = count.numPositives, 78 | fp = count.numNegatives, 79 | tn = totalCount.numNegatives - count.numNegatives, 80 | fn = totalCount.numPositives - count.numPositives 81 | ) 82 | } 83 | 84 | 85 | -------------------------------------------------------------------------------- /src/main/scala/sam/sceval/EvaluationPimps.scala: -------------------------------------------------------------------------------- 1 | package sam.sceval 2 | 3 | import BinUtils._ 4 | import org.apache.spark.Logging 5 | import org.apache.spark.rdd.{RDD, UnionRDD} 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.storage.StorageLevel.MEMORY_ONLY 8 | 9 | import scala.collection.mutable 10 | import scala.reflect.ClassTag 11 | 12 | // TODO (long term) Abstract out the RDD part so we can have a List version and a ParIterable version. 13 | // I.e. introduce a DistributedDataset type-class (or DD) 14 | 15 | // TODO Another version where the number of models is huge, but there is a long (0.0, Boolean) tail which can be 16 | // preaggregated to then allow for a reduceBy(model). 17 | // Will be useful for evaluating matching algorithms or tiny-cluster clustering problems 18 | 19 | 20 | object EvaluationPimps extends Logging { 21 | implicit class PimpedScoresAndLabelsRDD(scoreAndLabels: RDD[(Double, Boolean)]) { 22 | 23 | import PimpedScoresAndLabelsRDD._ 24 | 25 | def confusions(cacheIntermediate: Option[StorageLevel] = Some(MEMORY_ONLY), 26 | bins: Option[Int] = Some(1000), 27 | recordsPerBin: Option[Long] = None): Seq[BinaryConfusionMatrix] = 28 | if (scoreAndLabels.isEmpty()) 29 | Nil 30 | else 31 | scoreAndLabels.map(0 -> _).map(Map[Int, (Double, Boolean)](_)) 32 | .confusionsByModel(cacheIntermediate, bins, recordsPerBin).map(_._2).collect().head 33 | 34 | /** Will incur 3 spark stages for bins != 0, 2 otherwise. 35 | * This method is `approx` in the sense the bin sizes and bin count can vary quite wildly. Use at own risk. */ 36 | @deprecated("bin sizes & bin counts vary wildly. 
Use `confusions`.") 37 | def scoresAndConfusions(desiredBins: Int = 0): RDD[(Double, BinaryConfusionMatrix)] = { 38 | val binnedCounts: ScoresAndCounts = 39 | downSampleIfRequired(scoreAndLabels.combineByKey( 40 | createCombiner = new MutableBinaryLabelCount(0L, 0L) += (_: Boolean), 41 | mergeValue = (_: MutableBinaryLabelCount) += (_: Boolean), 42 | mergeCombiners = (_: MutableBinaryLabelCount) += (_: MutableBinaryLabelCount) 43 | ).sortByKey(ascending = false), desiredBins) 44 | 45 | val partitionwiseCumCounts: Array[MutableBinaryLabelCount] = partitionwiseCumulativeCounts(binnedCounts) 46 | val totalCount = partitionwiseCumCounts.last 47 | 48 | logInfo(s"Total counts: $totalCount") 49 | 50 | binnedCounts.mapPartitionsWithIndex( 51 | (index, partition) => partition.map { 52 | case (score, c) => (score, (partitionwiseCumCounts(index) += c).count) 53 | }, 54 | preservesPartitioning = true) 55 | .map { 56 | case (score, cumCount) => (score, BinaryConfusionMatrix(cumCount, totalCount.count)) 57 | } 58 | } 59 | } 60 | 61 | /** Model should extend AnyVal or Equals so that it makes sense to use this as a key. 62 | * Should scale reasonably well in the number of models. We return RDD because BCMs are computed in parallel for 63 | * each model, which ought to be a bit faster than a local computation for a large number of models. 64 | * This will ensure all bins except potentially the last bin are of equal size, consequently certain `bins` arguments 65 | * may be impossible to respect, in such cases the bin number that is closest to the desired bins is chosen. 66 | * 67 | * The first element of each Array[BinaryLabelCount] from `binaryLabelCounts` will be the total. 68 | * 69 | * This algorithm costs 4 stages and causes a job that runs the first 2. 70 | * 71 | * @param scoreAndLabelsByModel an RDD of Maps from the `Model` to a score-label pair. It's assumed that for each 72 | * `Model` the total number of score-label pairs in the RDD is equal, if not then 73 | * the behaviour is unspecified and a warning is printed. 
*/ 74 | implicit class PimpedModelOutputsRDD[Model: ClassTag](scoreAndLabelsByModel: RDD[Map[Model, (Double, Boolean)]]) { 75 | 76 | import PimpedModelOutputsRDD._ 77 | 78 | def confusionsByModel(cacheIntermediate: Option[StorageLevel] = Some(MEMORY_ONLY), 79 | bins: Option[Int] = Some(1000), 80 | recordsPerBin: Option[Long] = None): RDD[(Model, Array[BinaryConfusionMatrix])] = 81 | binaryLabelCounts(cacheIntermediate, bins, recordsPerBin) 82 | .mapValues(blcs => blcs.map(BinaryConfusionMatrix(_, blcs.head))) 83 | 84 | def binaryLabelCounts(cacheIntermediate: Option[StorageLevel] = Some(MEMORY_ONLY), 85 | bins: Option[Int] = Some(1000), 86 | recordsPerBin: Option[Long] = None): RDD[(Model, Array[BinaryLabelCount])] = { 87 | checkArgs(bins, recordsPerBin) 88 | 89 | scoreAndLabelsByModel.take(1).headOption.flatMap { _ => 90 | val indexed: RDD[(Double, (Model, Boolean, Long))] = indexInPartition(scoreAndLabelsByModel) 91 | 92 | cacheIntermediate.foreach(indexed.persist) 93 | 94 | val lastIndexes: Array[Map[Model, Long]] = partitionLastIndexes(indexed) 95 | 96 | reindexByBin(indexed, lastIndexes, recordsPerBin, bins) 97 | .map(computeBLCs) 98 | } 99 | .getOrElse(scoreAndLabelsByModel.context.makeRDD[(Model, Array[BinaryLabelCount])](Nil)) 100 | } 101 | } 102 | 103 | def checkArgs(bins: Option[Int] = Some(1000), recordsPerBin: Option[Long] = None): Unit = { 104 | require(bins.isDefined ^ recordsPerBin.isDefined, "Only one of bins or recordsPerBin can be specified") 105 | bins.foreach { b => 106 | require(b > 0, "Doesn't make sense to request zero or less bins: " + b) 107 | require(b != 1, "Requesting 1 bin doesn't make sense. If you want the total use 2 bins and access " + 108 | "totalCount in BinaryLabelCounts") 109 | } 110 | recordsPerBin.foreach(r => require(r >= 0, "Doesn't make sense to request negative records per bin: " + r)) 111 | } 112 | 113 | /** Companion object for `PimpedModelOutputsRDD` and methods only likely to be useful in this class. Methods not 114 | * declared private since this is a functional context. 
*/ 115 | // We use mutability in these methods at the partition level to minimize pressure on GC 116 | object PimpedModelOutputsRDD { 117 | type Indexed[Model] = RDD[(Double, (Model, Boolean, Long))] 118 | 119 | def reindexByBin[Model: ClassTag](indexed: Indexed[Model], 120 | lastIndexes: Array[Map[Model, Long]], 121 | recordsPerBin: Option[Long], 122 | bins: Option[Int]): Option[RDD[(Model, Boolean, Int)]] = { 123 | val totalRecords = 124 | lastIndexes.flatMap(_.keys).toSet.map((model: Model) => 125 | lastIndexes.filter(_.nonEmpty).flatMap(_.get(model).map(_ + 1)).sum) 126 | .toList match { 127 | case totalRecords :: Nil => 128 | totalRecords 129 | case totalRecords :: _ :: _ => 130 | logWarning("Total number of records for each model is not all equal.") 131 | totalRecords 132 | } 133 | 134 | logInfo("Total records: " + totalRecords) 135 | 136 | lastIndexes.find(_.nonEmpty).map { aNonEmptyPartition => 137 | recordsPerBin.foreach(r => require(r < totalRecords, s"Cannot request $r records per bin as not enough " + 138 | s"records in total to make 2 bins: $totalRecords")) 139 | 140 | val numRecordsPerBin: Long = recordsPerBin.getOrElse(optimizeRecordsPerBin(totalRecords, bins.get)) 141 | 142 | logInfo("Bins that will used: " + resultingBinNumber(numRecordsPerBin.toInt, totalRecords) + 143 | ", each with " + numRecordsPerBin + " records") 144 | 145 | reindexWithBinner(indexed, binnerFac(lastIndexes, numRecordsPerBin)) 146 | } 147 | } 148 | 149 | def reindexWithBinner[Model: ClassTag](indexed: Indexed[Model], 150 | binner: (Model, Long, Int) => Int): RDD[(Model, Boolean, Int)] = 151 | indexed.mapPartitionsWithIndex((partitionIndex, partition) => partition.map { 152 | case (_, (model, label, index)) => (model, label, binner(model, index, partitionIndex)) 153 | }) 154 | 155 | /** The last element of each Array[BinaryLabelCount] will be the total */ 156 | def computeBLCs[Model: ClassTag](indexedByBin: RDD[(Model, Boolean, Int)]): RDD[(Model, Array[BinaryLabelCount])] = 157 | indexedByBin.mapPartitions { partition => 158 | val bins: mutable.Map[(Model, Int), MutableBinaryLabelCount] = mutable.Map() 159 | partition.foreach { 160 | case (model, label, bin) => 161 | bins += ((model, bin) -> (bins.getOrElse((model, bin), MutableBinaryLabelCount()) += label)) 162 | } 163 | bins.mapValues(_.count).toList.iterator 164 | } 165 | .reduceByKey(_ + _).map { 166 | case ((model, bin), count) => (model, (bin, count)) 167 | } 168 | .groupByKey() 169 | .mapValues(_.toArray.sortBy(-_._1).map(_._2).scan(BinaryLabelCount())(_ + _).drop(1).reverse) 170 | 171 | def indexInPartition[Model: ClassTag](scoreAndLabelsByModel: RDD[Map[Model, (Double, Boolean)]]): Indexed[Model] = 172 | scoreAndLabelsByModel.flatMap(identity).map { 173 | case (model, (score, label)) => (score, (model, label)) 174 | } 175 | .sortByKey() 176 | .mapPartitions(partition => { 177 | val modelToCount: mutable.Map[Model, Long] = mutable.Map() 178 | partition.map { 179 | case (score, (model, label)) => 180 | val index = modelToCount.getOrElse(model, 0L) 181 | modelToCount += (model -> (index + 1)) 182 | // TODO Determine if keeping the score here is actually necessary - I don't think it makes sense 183 | (score, (model, label, index)) 184 | } 185 | }, preservesPartitioning = true) 186 | 187 | // Uses memory O(models x partitions) 188 | def partitionLastIndexes[Model: ClassTag](indexed: Indexed[Model]): Array[Map[Model, Long]] = 189 | indexed.mapPartitions { partition => 190 | val modelToCount: mutable.Map[Model, Long] = mutable.Map() 191 | 
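      // indexInPartition assigns strictly increasing per-model indexes within each partition, so
      // overwriting the entry on every record leaves the largest (last) index seen for each model.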
partition.foreach { 192 | case (_, (model, _, index)) => modelToCount += model -> index 193 | } 194 | Iterator(modelToCount.toMap) 195 | } 196 | .collect() 197 | } 198 | 199 | implicit class PimpedConfusionsSeq(confusions: Seq[BinaryConfusionMatrix]) { 200 | def roc: Seq[(Double, Double)] = 201 | (0.0, 0.0) +: confusions.map(bcm => (bcm.falsePositiveRate, bcm.recall)) :+(1.0, 1.0) 202 | 203 | def precisionByVolume: Seq[(Double, Double)] = confusions.map(bcm => (bcm.volume, bcm.precision)) 204 | def recallByVolume: Seq[(Double, Double)] = confusions.map(bcm => (bcm.volume, bcm.recall)) 205 | def precisionRecallCurve: Seq[(Double, Double)] = (0.0, 1.0) +: confusions.map(bcm => (bcm.recall, bcm.precision)) 206 | def areaUnderROC: Double = AreaUnderCurve(roc) 207 | def areaUnderPR: Double = AreaUnderCurve(precisionRecallCurve) 208 | } 209 | 210 | /** These are left as RDDs as per the original API, but they will be small data that's best returned to the driver 211 | * for subsequent processing */ 212 | implicit class PimpedScoresAndConfusionsRDD(confusions: RDD[(Double, BinaryConfusionMatrix)]) { 213 | def roc(): RDD[(Double, Double)] = { 214 | val rocCurve = confusions.map(_._2).map(bcm => (bcm.falsePositiveRate, bcm.recall)) 215 | val sc = confusions.context 216 | val first = sc.makeRDD(Seq((0.0, 0.0)), 1) 217 | val last = sc.makeRDD(Seq((1.0, 1.0)), 1) 218 | new UnionRDD[(Double, Double)](sc, Seq(first, rocCurve, last)) 219 | } 220 | 221 | def precisionRecallCurve(): RDD[(Double, Double)] = 222 | confusions.context.makeRDD(Seq((0.0, 1.0)), 1) 223 | .union(confusions.map(_._2).map(bcm => (bcm.recall, bcm.precision))) 224 | 225 | def thresholds(): RDD[Double] = confusions.map(_._1) 226 | def areaUnderROC(): Double = AreaUnderCurve(roc()) 227 | def areaUnderPR(): Double = AreaUnderCurve(precisionRecallCurve()) 228 | def precisionByThreshold(): RDD[(Double, Double)] = confusions.mapValues(_.precision) 229 | def recallByThreshold(): RDD[(Double, Double)] = confusions.mapValues(_.recall) 230 | @deprecated("Don't use meaningless measures, use something that has a direct probabilistic meaning. 
See README.md") 231 | def f1MeasureByThreshold(beta: Double = 1.0): RDD[(Double, Double)] = confusions.mapValues(_.f1Measure(beta)) 232 | } 233 | 234 | object PimpedScoresAndLabelsRDD { 235 | type ScoresAndCounts = RDD[(Double, MutableBinaryLabelCount)] 236 | 237 | // This doesn't scale in the number of bins since it all happens on the driver node 238 | def partitionwiseCumulativeCounts(binnedCounts: ScoresAndCounts): Array[MutableBinaryLabelCount] = 239 | binnedCounts.values.mapPartitions { partition => 240 | val agg = MutableBinaryLabelCount() 241 | partition.foreach(agg +=) 242 | Iterator(agg) 243 | } 244 | .collect() 245 | .scanLeft(MutableBinaryLabelCount())(_.clone += _) 246 | 247 | def downSample(grouping: Int, sortedCounts: ScoresAndCounts): ScoresAndCounts = 248 | sortedCounts.mapPartitions(_.grouped(grouping.toInt).map { 249 | case group@((firstScore, _) :: _) => 250 | val agg = new MutableBinaryLabelCount() 251 | group.map(_._2).foreach(agg +=) 252 | (firstScore, agg) 253 | }) 254 | 255 | def downSampleIfRequired(sortedCounts: ScoresAndCounts, desiredBins: Int): ScoresAndCounts = 256 | if (desiredBins == 0) sortedCounts 257 | else { 258 | val countsSize = sortedCounts.count() 259 | countsSize / desiredBins match { 260 | case g if g < 2 => 261 | logInfo(s"Curve is too small ($countsSize) for $desiredBins bins to be useful") 262 | sortedCounts 263 | case g if g >= Int.MaxValue => 264 | logWarning(s"Curve too large ($countsSize) for $desiredBins bins; capping at ${Int.MaxValue}") 265 | downSample(Int.MaxValue, sortedCounts) 266 | case g => 267 | downSample(g.toInt, sortedCounts) 268 | } 269 | } 270 | } 271 | } 272 | -------------------------------------------------------------------------------- /src/main/scala/sam/sceval/XValidator.scala: -------------------------------------------------------------------------------- 1 | package sam.sceval 2 | 3 | import org.apache.spark.rdd.RDD 4 | import EvaluationPimps._ 5 | import org.apache.spark.storage.StorageLevel 6 | import org.apache.spark.storage.StorageLevel._ 7 | 8 | import scala.util.Random 9 | 10 | /** x-validator that uses near exact same size folds */ 11 | case class XValidator(folds: Int = 10, 12 | evalBins: Option[Int] = Some(1000), 13 | evalCacheIntermediate: Option[StorageLevel] = Some(MEMORY_ONLY), 14 | evalRecordsPerBin: Option[Long] = None) { 15 | 16 | def trainWithExample(fold: Int, modelIndex: Int): Boolean = fold != modelIndex 17 | def scoreWithExample(fold: Int, modelIndex: Int): Boolean = fold == modelIndex 18 | 19 | /** Randomly enumerates all values such that each fold will have the same number of elements + / - 1. 20 | * It is then up to the user to decide how to use this to train their models (user can use the helper methods 21 | * `trainWithExample` and `scoreWithExample` to ensure a consistent approach) 22 | * 23 | * Strictly speaking there are edge cases where this will not generate random splits. Particularly when partitions 24 | * consist of a very small number of examples. 
*/ 25 | def split[Features](featuresAndLabels: RDD[(Features, Boolean)]): RDD[(Int, Features, Boolean)] = { 26 | val upToFolds = featuresAndLabels.take(folds).length 27 | require(upToFolds == folds, s"Not enough records ($upToFolds) for $folds folds") 28 | featuresAndLabels.mapPartitions(new Random().shuffle(_)).zipWithIndex().map { 29 | case ((f, l), i) => ((i % folds).toInt, f, l) 30 | } 31 | } 32 | 33 | def evaluate(scoresAndLabelsByModel: RDD[(Int, Double, Boolean)]): Array[BinaryConfusionMatrix] = 34 | scoresAndLabelsByModel.map(p => Map(p._1 -> (p._2, p._3))) 35 | .confusionsByModel(evalCacheIntermediate, evalBins, evalRecordsPerBin).map(_._2) 36 | .flatMap(_.zipWithIndex.map(_.swap)).reduceByKey(_ + _).collect().sortBy(_._1).map(_._2) 37 | 38 | def xval[Features](trainAndScoreByModel: RDD[(Int, Features, Boolean)] => RDD[(Int, Double, Boolean)], 39 | featuresAndLabel: RDD[(Features, Boolean)]): Array[BinaryConfusionMatrix] = 40 | evaluate(trainAndScoreByModel(split(featuresAndLabel))) 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/sam/sceval/package.scala: -------------------------------------------------------------------------------- 1 | package sam 2 | 3 | // Copied and pasted from https://github.com/samthebest/sceval until sceval is uploaded to maven as jar dependency 4 | package object sceval { 5 | 6 | } 7 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/classifiers/BinaryClassifier.scala: -------------------------------------------------------------------------------- 1 | package sparkz.classifiers 2 | 3 | import org.apache.spark.mllib.linalg.Vector 4 | import org.apache.spark.mllib.regression.LabeledPoint 5 | import org.apache.spark.rdd.RDD 6 | 7 | trait BinaryClassifierTrainedModel[Features] extends Serializable { 8 | def score(featuresWindow: Features): Double 9 | } 10 | 11 | trait BinaryClassifierTrainer[Features] { 12 | def train(trainingData: RDD[_ <:FeaturesWithBooleanLabel[Features]]): BinaryClassifierTrainedModel[Features] 13 | } 14 | 15 | trait BinaryClassifierTrainedVectorModel extends Serializable { 16 | def score(vector: Vector): Double 17 | } 18 | 19 | trait BinaryClassifierVectorTrainer { 20 | def train(trainingData: RDD[LabeledPoint]): BinaryClassifierTrainedVectorModel 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/classifiers/BinaryClassifierTrainerWithTransformer.scala: -------------------------------------------------------------------------------- 1 | package sparkz.classifiers 2 | 3 | import org.apache.spark.mllib.linalg._ 4 | import org.apache.spark.mllib.regression.LabeledPoint 5 | import org.apache.spark.rdd.RDD 6 | import sparkz.transformers.FeaturesTransformer 7 | 8 | import scala.reflect.ClassTag 9 | 10 | case object BinaryClassifierTrainerWithTransformer { 11 | def labeledPoint[Features: ClassTag](featuresWithBooleanLabel: FeaturesWithBooleanLabel[Features], 12 | toVector: Features => Vector): LabeledPoint = featuresWithBooleanLabel match { 13 | case featuresWithLabel => 14 | LabeledPoint(if (featuresWithLabel.isTrue) 1.0 else 0.0, toVector(featuresWithLabel.features)) 15 | } 16 | 17 | def apply[Features: ClassTag](vec2classifier: BinaryClassifierVectorTrainer, 18 | transformer: FeaturesTransformer[Features]): BinaryClassifierTrainer[Features] = 19 | new BinaryClassifierTrainer[Features] { 20 | def train(trainingData: RDD[_ <: FeaturesWithBooleanLabel[Features]]): 
BinaryClassifierTrainedModel[Features] = { 21 | val toVector = transformer.featuresToVector(trainingData.map(_.features)) 22 | 23 | val model = vec2classifier.train(trainingData.map(labeledPoint(_, toVector))) 24 | new BinaryClassifierTrainedModel[Features] { 25 | def score(featuresWindow: Features): Double = model.score(toVector(featuresWindow)) 26 | } 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/classifiers/DecisionTreeVectorClassifier.scala: -------------------------------------------------------------------------------- 1 | package sparkz.classifiers 2 | 3 | import org.apache.spark.mllib.linalg.Vector 4 | import org.apache.spark.mllib.regression.LabeledPoint 5 | import org.apache.spark.mllib.tree.DecisionTree 6 | import org.apache.spark.mllib.tree.configuration.FeatureType 7 | import org.apache.spark.mllib.tree.model.Node 8 | import org.apache.spark.rdd.RDD 9 | 10 | case class DecisionTreeClassifierTrainer(impurity: String, 11 | maxDepth: Int, 12 | maxBins: Int) extends BinaryClassifierVectorTrainer { 13 | def train(trainingData: RDD[LabeledPoint]): BinaryClassifierTrainedVectorModel = { 14 | 15 | val model = DecisionTree.trainClassifier( 16 | input = trainingData, 17 | numClasses = 2, 18 | categoricalFeaturesInfo = Map.empty[Int, Int], 19 | impurity = impurity, 20 | maxDepth = maxDepth, 21 | maxBins = maxBins 22 | ) 23 | 24 | val topNode = model.topNode 25 | 26 | new BinaryClassifierTrainedVectorModel { 27 | def score(vector: Vector): Double = { 28 | DecisionTreeInference.predictProb(topNode)(vector) 29 | } 30 | } 31 | } 32 | } 33 | 34 | case object DecisionTreeInference { 35 | def predictProb(node: Node)(features: Vector): Double = { 36 | if (node.isLeaf) { 37 | if (node.predict.predict == 1.0) node.predict.prob else 1.0 - node.predict.prob 38 | } else { 39 | assert(node.split.get.featureType == FeatureType.Continuous) 40 | if (features(node.split.get.feature) <= node.split.get.threshold) { 41 | predictProb(node.leftNode.get)(features) 42 | } else { 43 | predictProb(node.rightNode.get)(features) 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/classifiers/Features.scala: -------------------------------------------------------------------------------- 1 | package sparkz.classifiers 2 | 3 | trait FeaturesWithBooleanLabel[Features] { 4 | def features: Features 5 | def isTrue: Boolean 6 | } 7 | 8 | trait MetaData[MetaData] { 9 | def metaData: MetaData 10 | } 11 | 12 | object EmptyMetaData extends MetaData[Unit] { 13 | def metaData: Unit = () 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/classifiers/RandomBinaryClassifierTrainer.scala: -------------------------------------------------------------------------------- 1 | package sparkz.classifiers 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import scala.util.Random 6 | 7 | case class RandomBinaryClassifierTrainer[Features](seed: Option[Long] = None) extends BinaryClassifierTrainer[Features] { 8 | def train(trainingData: RDD[_ <: FeaturesWithBooleanLabel[Features]]): BinaryClassifierTrainedModel[Features] = { 9 | val random: Option[Random] = seed.map(new Random(_)) 10 | new BinaryClassifierTrainedModel[Features] { 11 | def score(vector: Features): Double = random.getOrElse(new Random()).nextDouble 12 | } 13 | } 14 | } 15 | 
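Any MLlib learner can be adapted to this framework by implementing `BinaryClassifierVectorTrainer`. The following sketch is not part of the library; it assumes the Spark 1.3 MLlib logistic-regression API (`LogisticRegressionWithLBFGS`, `clearThreshold`) and mirrors the way `DecisionTreeClassifierTrainer` wraps `DecisionTree`:

    import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
    import org.apache.spark.mllib.linalg.Vector
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.rdd.RDD
    import sparkz.classifiers.{BinaryClassifierTrainedVectorModel, BinaryClassifierVectorTrainer}

    // Hypothetical adapter (not shipped with sparkz): plugs MLlib logistic regression into the
    // BinaryClassifierVectorTrainer interface consumed by BinaryClassifierTrainerWithTransformer.
    case class LogisticRegressionClassifierTrainer(numIterations: Int = 100) extends BinaryClassifierVectorTrainer {
      def train(trainingData: RDD[LabeledPoint]): BinaryClassifierTrainedVectorModel = {
        val algorithm = new LogisticRegressionWithLBFGS().setNumClasses(2)
        algorithm.optimizer.setNumIterations(numIterations)
        // clearThreshold() makes predict() return the raw class-1 probability instead of a 0/1 label
        val model = algorithm.run(trainingData).clearThreshold()
        new BinaryClassifierTrainedVectorModel {
          def score(vector: Vector): Double = model.predict(vector)
        }
      }
    }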
-------------------------------------------------------------------------------- /src/main/scala/sparkz/evaluation/BinaryClassifierEvaluation.scala: -------------------------------------------------------------------------------- 1 | package sparkz.evaluation 2 | 3 | import org.apache.spark.rdd.RDD 4 | import sparkz.classifiers.{BinaryClassifierTrainer, FeaturesWithBooleanLabel, MetaData} 5 | import sparkz.utils.AppLogger 6 | 7 | import scala.reflect.ClassTag 8 | import scalaz.Scalaz._ 9 | 10 | object BinaryClassifierEvaluation { 11 | def crossValidationScores[Features, Meta, Order: Ordering : ClassTag, UniqueKey: ClassTag] 12 | (data: RDD[FeaturesWithBooleanLabel[Features] with MetaData[Meta]], 13 | k: Int, 14 | classifiers: List[BinaryClassifierTrainer[Features]], 15 | uniqueId: Meta => UniqueKey, 16 | orderingField: Meta => Order, 17 | singleInference: Boolean = true, 18 | seed: Long = 12345L): Map[BinaryClassifierTrainer[Features], RDD[(FeaturesWithBooleanLabel[Features] with MetaData[Meta], Double)]] = 19 | (for { 20 | i <- 0 until k 21 | otherFolds = data.filter(_.metaData |> uniqueId |> (_.hashCode() % k == i)) 22 | holdoutFold = data.filter(_.metaData |> uniqueId |> (_.hashCode() % k != i)) 23 | 24 | splitMax = holdoutFold 25 | .keyBy(_.metaData |> uniqueId) 26 | .mapValues(_.metaData |> orderingField) 27 | .reduceByKey((record1, record2) => Array(record1, record2).max) 28 | .values.min 29 | 30 | Array(splitPoint) = holdoutFold.map(_.metaData |> orderingField).filter(_ < splitMax) 31 | .takeSample(withReplacement = false, num = 1, seed = seed) 32 | 33 | trainingData = otherFolds.filter(_.metaData |> orderingField |> (_ <= splitPoint)) 34 | holdoutFoldAfterSplit = holdoutFold.filter(_.metaData |> orderingField |> (_ > splitPoint)) 35 | 36 | testData = if (singleInference) 37 | holdoutFoldAfterSplit.keyBy(_.metaData |> uniqueId) 38 | .reduceByKey((record1, record2) => Array(record1, record2).minBy(_.metaData |> orderingField)) 39 | .values 40 | else holdoutFoldAfterSplit 41 | 42 | classifier <- classifiers 43 | model = classifier.train(trainingData) 44 | scores = testData.map(testRecord => testRecord -> model.score(testRecord.features)) 45 | } yield classifier -> scores) 46 | .groupBy(_._1).mapValues(_.map(_._2).reduce(_ ++ _)) 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/evaluation/MAP.scala: -------------------------------------------------------------------------------- 1 | package sparkz.evaluation 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.rdd.RDD._ 5 | 6 | import scalaz.Scalaz._ 7 | 8 | object MAP { 9 | def apply[T](n: Int = 100, recommendations: RDD[(Long, List[T])], evaluation: RDD[(Long, Set[T])]): Double = 10 | recommendations.join(evaluation).values.map { 11 | case (recommendedLikes, trueLikes) => recommendedLikes.take(n).zipWithIndex.foldLeft(0, 0.0) { 12 | case ((accLikes, accPrecision), (postId, k)) if trueLikes(postId) => 13 | (accLikes + 1, accPrecision + ((accLikes + 1).toDouble / (k + 1))) 14 | case ((accLikes, accPrecision), _) => (accLikes, accPrecision) 15 | }._2 / math.min(trueLikes.size, n) 16 | } |> (apn => { 17 | val count = apn.count() 18 | if (count > 0) apn.reduce(_ + _) / count else 0 19 | }) 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/transformers/EnsembleTransformer.scala: -------------------------------------------------------------------------------- 1 | package sparkz.transformers 2 | 
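// Editor's sketch (not part of this file): two quick illustrations of the evaluation
// utilities defined just above, kept as comments; `sc`, `data`, MyFeatures and MyMeta are
// assumptions introduced only for illustration.
//
// MAP, worked through for a single user: with recommendations List("a", "b", "c") and true
// likes Set("a", "c"), the hits are "a" at rank 1 (precision 1/1) and "c" at rank 3
// (precision 2/3), so the average precision is (1.0 + 2.0/3.0) / min(2, n) = 0.8333... for
// any n >= 3; MAP is the mean of that value over every user present in both RDDs.
//
//   val recs = sc.parallelize(Seq(1L -> List("a", "b", "c")))
//   val eval = sc.parallelize(Seq(1L -> Set("a", "c")))
//   MAP(n = 3, recommendations = recs, evaluation = eval)  // ~0.8333
//
// BinaryClassifierEvaluation.crossValidationScores hashes uniqueId(metaData) into k folds;
// for each fold i it samples a split point from orderingField, trains every classifier on the
// fold-i records at or before the split, and scores the out-of-fold records strictly after it
// (one score per uniqueId when singleInference is true), returning a map from classifier to
// its (record, score) RDD.
//
//   BinaryClassifierEvaluation.crossValidationScores(
//     data, k = 3,
//     classifiers = List(RandomBinaryClassifierTrainer[MyFeatures]()),
//     uniqueId = (m: MyMeta) => m.userId,
//     orderingField = (m: MyMeta) => m.timestamp)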
3 | import org.apache.spark.mllib.linalg._ 4 | import org.apache.spark.rdd.RDD 5 | 6 | import scala.reflect.ClassTag 7 | 8 | trait FeaturesTransformer[Features] extends Serializable { 9 | def featuresToVector(trainingData: RDD[Features]): Features => Vector 10 | } 11 | 12 | abstract class SubFeaturesTransformer[SubFeatures: ClassTag, Features] extends FeaturesTransformer[Features] { 13 | def subFeatures(features: Features): SubFeatures 14 | 15 | def subFeaturesToVector(trainingData: RDD[SubFeatures]): SubFeatures => Vector 16 | 17 | def featuresToVector(trainingData: RDD[Features]): Features => Vector = { 18 | val toVector = subFeaturesToVector(trainingData.map(subFeatures)) 19 | 20 | (features: Features) => toVector(subFeatures(features)) 21 | } 22 | } 23 | 24 | case object EnsembleTransformer { 25 | def concatenateVectors(v1: Vector, v2: Vector): Vector = (v1, v2) match { 26 | case (v1: SparseVector, v2: SparseVector) => Vectors.sparse( 27 | size = v1.size + v2.size, 28 | indices = v1.indices ++ v2.indices.map(_ + v1.size), 29 | values = v1.values ++ v2.values 30 | ) 31 | case (v1: SparseVector, v2: DenseVector) => Vectors.sparse( 32 | size = v1.size + v2.size, 33 | indices = v1.indices ++ (v1.size until (v1.size + v2.size)), 34 | values = v1.values ++ v2.values 35 | ) 36 | case (v1: DenseVector, v2: SparseVector) => Vectors.sparse( 37 | size = v1.size + v2.size, 38 | indices = (0 until v1.size).toArray ++ v2.indices.map(_ + v1.size), 39 | values = v1.values ++ v2.values 40 | ) 41 | case (v1: DenseVector, v2: DenseVector) => 42 | Vectors.dense(values = v1.values ++ v2.values) 43 | } 44 | } 45 | 46 | case class EnsembleTransformer[Features](subTransformer1: SubFeaturesTransformer[_, Features], 47 | otherSubTransformers: SubFeaturesTransformer[_, Features]*) extends FeaturesTransformer[Features] { 48 | def featuresToVector(trainingData: RDD[Features]): Features => Vector = { 49 | (features: Features) => 50 | (subTransformer1 +: otherSubTransformers).map(_.featuresToVector(trainingData)) 51 | .map(_.apply(features)).reduce(EnsembleTransformer.concatenateVectors) 52 | } 53 | } -------------------------------------------------------------------------------- /src/main/scala/sparkz/transformers/MultiOneHotTransformer.scala: -------------------------------------------------------------------------------- 1 | package sparkz.transformers 2 | 3 | import org.apache.spark.broadcast.Broadcast 4 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.reflect.ClassTag 8 | 9 | case object MultiOneHotTransformer { 10 | def apply[Key: ClassTag: Ordering, Features](attributes: Features => Map[Key, String]) = 11 | new SubFeaturesTransformer[Map[Key, String], Features] { 12 | def subFeatures(features: Features): Map[Key, String] = attributes(features) 13 | def subFeaturesToVector(trainingData: RDD[Map[Key, String]]): (Map[Key, String]) => Vector = { 14 | val attributeToIndexBV: Broadcast[Map[(Key, String), Int]] = trainingData.context.broadcast( 15 | trainingData.flatMap(identity).distinct().collect().sorted.zipWithIndex.toMap 16 | ) 17 | 18 | (attributes: Map[Key, String]) => Vectors.sparse( 19 | size = attributeToIndexBV.value.size, 20 | elements = attributes.toSeq.flatMap(attribute => 21 | attributeToIndexBV.value.get(attribute).map(_ -> 1.0) 22 | )) 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/transformers/OriginalNumericalsTransformer.scala: 
-------------------------------------------------------------------------------- 1 | package sparkz.transformers 2 | 3 | import org.apache.spark.broadcast.Broadcast 4 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.reflect.ClassTag 8 | 9 | case object OriginalNumericalsTransformer { 10 | def apply[Key: ClassTag: Ordering, Features](numericals: Features => Map[Key, Double]) = 11 | new SubFeaturesTransformer[Map[Key, Double], Features] { 12 | def subFeatures(features: Features): Map[Key, Double] = numericals(features) 13 | def subFeaturesToVector(trainingData: RDD[Map[Key, Double]]): (Map[Key, Double]) => Vector = { 14 | val keyToIndexBV: Broadcast[Map[Key, Int]] = trainingData.context.broadcast( 15 | trainingData.flatMap(_.keySet).distinct().collect().sorted.zipWithIndex.toMap 16 | ) 17 | 18 | (numericals: Map[Key, Double]) => Vectors.sparse( 19 | size = keyToIndexBV.value.size, 20 | elements = numericals.toSeq.flatMap { 21 | case (key, value) => keyToIndexBV.value.get(key).map(_ -> value) 22 | }) 23 | } 24 | } 25 | } 26 | 27 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/transformers/TermFrequencyTransformer.scala: -------------------------------------------------------------------------------- 1 | package sparkz.transformers 2 | 3 | import org.apache.spark.broadcast.Broadcast 4 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.reflect.ClassTag 8 | 9 | case object TermFrequencyTransformer { 10 | def apply[Term: ClassTag: Ordering, Features](terms: Features => List[Term]) = new SubFeaturesTransformer[List[Term], Features] { 11 | def subFeatures(features: Features): List[Term] = terms(features) 12 | 13 | def subFeaturesToVector(trainingData: RDD[List[Term]]): (List[Term]) => Vector = { 14 | val termToIndexBV: Broadcast[Map[Term, Int]] = trainingData.context.broadcast( 15 | trainingData.flatMap(identity).distinct().collect().sorted.zipWithIndex.toMap 16 | ) 17 | 18 | (terms: List[Term]) => 19 | Vectors.sparse(termToIndexBV.value.size, terms.groupBy(identity).mapValues(_.size).flatMap { 20 | case (term, count) => termToIndexBV.value.get(term).map(_ -> count.toDouble) 21 | }.toSeq) 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/utils/AppLogger.scala: -------------------------------------------------------------------------------- 1 | package sparkz.utils 2 | 3 | import org.apache.log4j._ 4 | 5 | case class AppLogger(logger: org.apache.log4j.Logger) { 6 | def info(message: String): Unit = info(() => message) 7 | def info(message: () => String): Unit = 8 | if (Level.INFO.isGreaterOrEqual(logger.getEffectiveLevel)) logger.info(message()) 9 | 10 | def warn(message: String): Unit = warn(() => message) 11 | def warn(message: () => String): Unit = 12 | if (Level.WARN.isGreaterOrEqual(logger.getEffectiveLevel)) logger.warn(message()) 13 | 14 | def debug(message: String): Unit = debug(() => message) 15 | def debug(message: () => String): Unit = 16 | if (Level.DEBUG.isGreaterOrEqual(logger.getEffectiveLevel)) logger.debug(message()) 17 | } 18 | 19 | case object AppLogger { 20 | def getLogger(level: Level): AppLogger = { 21 | val logger = org.apache.log4j.Logger.getLogger("FTB") 22 | logger.setLevel(level) 23 | AppLogger(logger) 24 | } 25 | 26 | def infoLevel(): AppLogger = getLogger(Level.INFO) 27 | def warnLevel(): AppLogger = 
getLogger(Level.WARN) 28 | def debugLevel(): AppLogger = getLogger(Level.DEBUG) 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/utils/Pimps.scala: -------------------------------------------------------------------------------- 1 | package sparkz.utils 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.joda.time.DateTime 5 | 6 | import scala.reflect.ClassTag 7 | import scalaz.Scalaz._ 8 | import scalaz.{Failure, Success, ValidationNel} 9 | 10 | case object Pimps { 11 | 12 | implicit class PimpedString(str: String) { 13 | def nonEmptyOption: Option[String] = str.nonEmpty.option(str) 14 | 15 | def nonEmptyOption[T](f: String => T): Option[T] = str.nonEmpty.option(str).map(f) 16 | } 17 | 18 | implicit def longToDateTime(ts: Long): DateTime = new DateTime(ts) 19 | 20 | implicit class PimpedIterable[A](i: Iterable[A]) { 21 | def maxesBy[B](f: A => B)(implicit cmp: Ordering[B]): Set[A] = 22 | if (i.nonEmpty) i.maxBy(f) |> 23 | ((max: A) => i.filter(x => f(x) == f(max)).toSet) 24 | else Set.empty 25 | } 26 | 27 | implicit class PimpedTupledSet[K, V](set: Set[(K, V)]) { 28 | def toMultiMap: Map[K, Set[V]] = set.groupBy(_._1).mapValues(_.map(_._2)) 29 | } 30 | 31 | implicit class PimpedOptionMap[K, V](m: Map[Option[K], V]) { 32 | def flatMapOptionKeys = new PimpedMap(m).flatMapKeys(Option.option2Iterable) 33 | } 34 | 35 | implicit class PimpedMap[K, V](m: Map[K, V]) { 36 | def flatMapKeys[K2](f: (K => Iterable[K2])): Map[K2, V] = m.flatMap { 37 | case (k1, v) => f(k1).map(_ -> v) 38 | } 39 | 40 | def join[V2](that: Map[K, V2]): Map[K, (V, V2)] = 41 | m.flatMap(kv => that.get(kv._1).map(thatV => (kv._1, (kv._2, thatV)))) 42 | 43 | def maxByValue(implicit cmp: Ordering[V]): (K, V) = m.maxByValue(v => v) 44 | 45 | def maxByValue[B](f: (V => B))(implicit cmp: Ordering[B]): (K, V) = m.maxBy(pair => f(pair._2)) 46 | 47 | def maxesByValue(implicit cmp: Ordering[V]): List[(K, V)] = maxesByValue(v => v) 48 | 49 | def maxesByValue[B](f: (V => B))(implicit cmp: Ordering[B]): List[(K, V)] = 50 | m.maxBy(pair => f(pair._2)) |> ((max: (K, V)) => m.filter(_._2 == max._2).toList) 51 | 52 | def minByValue(implicit cmp: Ordering[V]): (K, V) = m.minByValue(v => v) 53 | 54 | def minByValue[B](f: (V => B))(implicit cmp: Ordering[B]): (K, V) = m.minBy(pair => f(pair._2)) 55 | 56 | def minsByValue(implicit cmp: Ordering[V]): List[(K, V)] = minsByValue(v => v) 57 | 58 | def minsByValue[B](f: (V => B))(implicit cmp: Ordering[B]): List[(K, V)] = 59 | m.minBy(pair => f(pair._2)) |> ((max: (K, V)) => m.filter(_._2 == max._2).toList) 60 | } 61 | 62 | implicit class PimpedPairedList[K, V](l: List[(K, V)]) { 63 | def reduceByKey(func: (V, V) => V): Map[K, V] = l.groupBy(_._1).mapValues(_.map(_._2).reduce(func)) 64 | } 65 | 66 | // TODO DRY with insights-engine code by making another shared project of some kind (could open source it!?) 
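// Editor's sketch (not part of the original file): quick examples of the enrichments defined
// above, assuming `import sparkz.utils.Pimps._` is in scope:
//
//   "".nonEmptyOption                                // None
//   "abc".nonEmptyOption(_.length)                   // Some(3)
//   Map("a" -> 3, "b" -> 7, "c" -> 7).maxesByValue   // List(("b", 7), ("c", 7)), all ties kept
//   Set(1 -> "x", 1 -> "y", 2 -> "z").toMultiMap     // Map(1 -> Set("x", "y"), 2 -> Set("z"))
//   Map("a" -> 1, "b" -> 2).join(Map("b" -> true))   // Map("b" -> (2, true))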
67 | implicit class PimpedTupleIterable[T1, T2](l: Iterable[(T1, T2)]) { 68 | def mapTupled[U](f: (T1, T2) => U): Iterable[U] = l.map(f.tupled) 69 | 70 | def flatMapTupled[U](f: (T1, T2) => TraversableOnce[U]): Iterable[U] = l.flatMap(f.tupled) 71 | } 72 | 73 | implicit class PimpedMapDouble[K](m: Map[K, Double]) { 74 | def normalize: Map[K, Double] = m.values.sum |> (total => m.mapValues(_ / total)) 75 | 76 | def productWithMap(m2: Map[K, Double]): Double = { 77 | (for { 78 | (k1, v1) <- m 79 | if m2.contains(k1) 80 | } yield v1 * m2(k1)).sum 81 | } 82 | } 83 | 84 | implicit class PimpedMapInt[K](m: Map[K, Int]) { 85 | def normalize: Map[K, Double] = m.values.sum |> (total => m.mapValues(_.toDouble / total)) 86 | } 87 | 88 | implicit class PimpedSet[T](s: Set[T]) { 89 | def X[U](other: Set[U]): Set[(T, U)] = |@|(other).tupled.toSet 90 | 91 | def |@|[U](other: Set[U]) = s.toList |@| other.toList 92 | } 93 | 94 | implicit class PimpedRDD[T: ClassTag](rdd: RDD[T]) { 95 | def applyIf(condition: Boolean)(f: RDD[T] => RDD[T]): RDD[T] = if (condition) f(rdd) else rdd 96 | 97 | def thenDo[U](f: RDD[T] => U): RDD[T] = f(rdd) |> (_ => rdd) 98 | } 99 | 100 | implicit class PimpedPartialFunction[X, E](pf: PartialFunction[X, E]) { 101 | def toFailureNel[W](x: X, toW: E => W = identity _): ValidationNel[W, X] = 102 | pf.andThen(e => toW(e).failureNel[X]).applyOrElse(x, (_: X).successNel[W]) 103 | 104 | def toFailureNel(x: X): ValidationNel[E, X] = toFailureNel(x, identity) 105 | } 106 | 107 | implicit class PimpedValidationNel[E, X](x1: ValidationNel[E, X]) { 108 | // Extension of scalaz.Validation.+++ operator, does not require the semigroup defined for X 109 | def |+++|(x2: ValidationNel[E, X]) = x1 match { 110 | case Failure(a1) => x2 match { 111 | case Failure(a2) => Failure(a1 append a2) 112 | case Success(b2) => x1 113 | } 114 | case Success(b1) => x2 match { 115 | case b2@Failure(_) => b2 116 | case Success(b2) if b1 == b2 => Success(b1) 117 | case Success(b2) => throw new IllegalArgumentException(s"$b1 not equals to $b2") 118 | } 119 | } 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/utils/StatCounter.scala: -------------------------------------------------------------------------------- 1 | package sparkz.utils 2 | 3 | import scalaz.Monoid 4 | 5 | case object StatCounterMonoid extends Monoid[StatCounter] { 6 | def zero: StatCounter = ZeroStatCounter 7 | 8 | def append(f1: StatCounter, f2: => StatCounter): StatCounter = f1 merge f2 9 | } 10 | 11 | object ZeroStatCounter extends StatCounter(0, 0.0, 0.0, Double.PositiveInfinity, Double.NegativeInfinity) 12 | 13 | case object StatCounter { 14 | def apply(values: TraversableOnce[Double]): StatCounter = values.foldLeft(ZeroStatCounter: StatCounter)(_ merge _) 15 | 16 | def apply(value: Double): StatCounter = apply(List(value)) 17 | 18 | implicit val monoid: Monoid[StatCounter] = StatCounterMonoid 19 | } 20 | 21 | case class StatCounter(n: Long, sum: Double, sos: Double, min: Double, max: Double) { 22 | def merge(other: StatCounter) = StatCounter(n + other.n, sum + other.sum, sos + other.sos, 23 | math.min(min, other.min), math.max(max, other.max)) 24 | 25 | def merge(value: Double) = StatCounter(n + 1, sum + value, sos + (value * value), 26 | math.min(min, value), math.max(max, value)) 27 | 28 | def merge(values: TraversableOnce[Double]): StatCounter = values.foldLeft(this)(_ merge _) 29 | 30 | def count = n 31 | 32 | def mean = sum / n 33 | 34 | def variance = if (n > 1) 
(sos - n * mean * mean) / (n - 1) else Double.NaN 35 | 36 | def stdev = math.sqrt(variance) 37 | 38 | def stderr = stdev / math.sqrt(n) 39 | 40 | override def toString = "(count: %d, mean: %f, stdev: %f, min: %f, max: %f)".format(count, mean, stdev, max, min) 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/utils/TopN.scala: -------------------------------------------------------------------------------- 1 | package sparkz.utils 2 | 3 | import scala.reflect.ClassTag 4 | 5 | object TopElements { 6 | def topN[T: ClassTag](elems: Iterable[T])(scoreFunc: T => Double, n: Int): List[T] = 7 | elems.foldLeft((Set.empty[(T, Double)], Double.MaxValue)) { 8 | case (accumulator@(topElems, minScore), elem) => 9 | val score = scoreFunc(elem) 10 | if (topElems.size < n) 11 | (topElems + (elem -> score), math.min(minScore, score)) 12 | else if (score > minScore) { 13 | val newTopElems = topElems - topElems.minBy(_._2) + (elem -> score) 14 | (newTopElems, newTopElems.map(_._2).min) 15 | } 16 | else accumulator 17 | } 18 | ._1.toList.sortBy(_._2).reverse.map(_._1) 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/utils/VPTree.scala: -------------------------------------------------------------------------------- 1 | package sparkz.utils 2 | 3 | import VPTree._ 4 | 5 | import scala.reflect.ClassTag 6 | 7 | case class VPTree[T1: ClassTag, T2: ClassTag](root: Tree[T1, T2], distance: Distance[T1]) { 8 | def nearest(t: T1, maxDist: Double) = root.nearN(t, maxDist, distance) 9 | 10 | def approximateNearest(t: T1): (T1, T2) = root.approxNear(t, distance) 11 | 12 | def approximateNearestN(t: T1, n: Int): Array[(T1, T2)] = root.approxNearN(t, n, distance) 13 | } 14 | 15 | // Adapted version of https://github.com/kaja47/sketches/blob/master/VPTree.scala 16 | case object VPTree { 17 | type Distance[T1] = (T1, T1) => Double 18 | 19 | def apply[T1: ClassTag, T2: ClassTag](items: Array[(T1, T2)], distance: Distance[T1], leafSize: Int): VPTree[T1, T2] = 20 | VPTree(mkNode(items, distance, leafSize), distance) 21 | 22 | sealed trait Tree[T1, T2] { 23 | def size: Int 24 | 25 | def toArray: Array[(T1, T2)] 26 | 27 | def nearN(t: T1, maxDist: Double, distance: Distance[T1]): Array[(T1, T2)] 28 | 29 | def approxNear(t: T1, f: Distance[T1]): (T1, T2) 30 | 31 | def approxNearN(t: T1, n: Int, f: Distance[T1]): Array[(T1, T2)] 32 | } 33 | 34 | case class Node[T1, T2](point: (T1, T2), radius: Double, size: Int, in: Tree[T1, T2], out: Tree[T1, T2]) extends Tree[T1, T2] { 35 | def toArray = in.toArray ++ out.toArray 36 | 37 | def nearN(t: T1, maxDist: Double, distance: Distance[T1]): Array[(T1, T2)] = { 38 | val d = distance(t, point._1) 39 | if (d + maxDist < radius) { 40 | in.nearN(t, maxDist, distance) 41 | } else if (d - maxDist >= radius) { 42 | out.nearN(t, maxDist, distance) 43 | } else { 44 | in.nearN(t, maxDist, distance) ++ out.nearN(t, maxDist, distance) 45 | } 46 | } 47 | 48 | def approxNear(t: T1, f: Distance[T1]): (T1, T2) = { 49 | val d = f(point._1, t) 50 | if (d < radius) in.approxNear(t, f) 51 | else out.approxNear(t, f) 52 | } 53 | 54 | def approxNearN(t: T1, n: Int, f: Distance[T1]): Array[(T1, T2)] = 55 | if (n <= 0) Array.empty 56 | else if (n > size) toArray 57 | else { 58 | val d = f(point._1, t) 59 | if (d < radius) { 60 | in.approxNearN(t, n, f) ++ out.approxNearN(t, n - in.size, f) 61 | } else { 62 | out.approxNearN(t, n, f) ++ in.approxNearN(t, n - out.size, f) 63 | } 64 | } 
65 | } 66 | 67 | case class Leaf[T1: ClassTag, T2: ClassTag](points: Array[(T1, T2)]) extends Tree[T1, T2] { 68 | def size = points.length 69 | 70 | def toArray = points 71 | 72 | def approxNear(t: T1, distance: Distance[T1]): (T1, T2) = points.minBy(p => distance(t, p._1)) 73 | 74 | def approxNearN(t: T1, n: Int, distance: Distance[T1]): Array[(T1, T2)] = 75 | if (n <= 0) Array.empty 76 | else if (n >= size) points 77 | else points.sortBy(p => distance(p._1, t)).take(n) 78 | 79 | def nearN(t: T1, maxDist: Double, distance: Distance[T1]): Array[(T1, T2)] = 80 | points.filter(p => distance(t, p._1) <= maxDist) 81 | } 82 | 83 | def mkNode[T1: ClassTag, T2: ClassTag](items: Array[(T1, T2)], distance: Distance[T1], leafSize: Int): Tree[T1, T2] = { 84 | if (items.length <= leafSize) 85 | Leaf[T1, T2](items) 86 | else { 87 | val vp = items(util.Random.nextInt(items.length)) 88 | 89 | val radius = { 90 | val numSamples = math.sqrt(items.length).floor * 2 91 | val distances = pickSample(items, numSamples.toInt).map(i => distance(vp._1, i._1)) 92 | distances.sortBy(identity).apply(distances.length / 2) 93 | } 94 | 95 | val (in, out) = items partition (item => distance(item._1, vp._1) < radius) 96 | 97 | if (in.length == 0) Leaf[T1, T2](out) 98 | else if (out.length == 0) Leaf[T1, T2](in) 99 | else Node(vp, radius, items.length, mkNode(in, distance, leafSize), mkNode(out, distance, leafSize)) 100 | } 101 | } 102 | 103 | def pickSample[T1, T2](items: Array[(T1, T2)], size: Int): Array[(T1, T2)] = 104 | if (items.length <= size) items 105 | else Array.fill(size)(items(util.Random.nextInt(items.length))) 106 | } 107 | --------------------------------------------------------------------------------
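The StatCounter and TopElements utilities a little further up are both single-pass folds: StatCounter is a scalaz Monoid, so partial counters (for example one per partition) merge associatively, and TopElements.topN keeps a bounded candidate set while scanning. A small usage sketch; the object name and input values are illustrative only.

import sparkz.utils.{StatCounter, TopElements}

object UtilsSketch extends App {
  // Fold a collection of doubles into summary statistics
  val stats = StatCounter(List(1.0, 4.0, 4.0, 9.0))
  println((stats.count, stats.mean, stats.stdev, stats.min, stats.max))  // (4, 4.5, ~3.32, 1.0, 9.0)

  // Partial counters merge associatively, e.g. one per partition of an RDD
  val merged = StatCounter(List(1.0, 4.0)) merge StatCounter(List(4.0, 9.0))
  println(merged.mean)  // 4.5

  // Top 2 strings by length, tracked in a single fold over the input
  println(TopElements.topN(List("a", "bbb", "cc", "dddd"))(_.length.toDouble, n = 2))  // List(dddd, bbb)
}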
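The VPTree above indexes points under an arbitrary metric and supports exact range queries (nearest) plus cheap approximate lookups that descend only one side of most splits. A minimal sketch, assuming one-dimensional Double points under the absolute-difference metric; the object name and sample data are illustrative only.

import sparkz.utils.VPTree

object VPTreeSketch extends App {
  // Index the integers 1..100 as Double points; the payload is just a label
  val items: Array[(Double, String)] = (1 to 100).map(i => i.toDouble -> s"point-$i").toArray
  val tree = VPTree(items, (a: Double, b: Double) => math.abs(a - b), 8)  // leafSize = 8

  // Exact: every point within distance 2.5 of the query, i.e. points 40 to 44 (order unspecified)
  val near = tree.nearest(42.0, maxDist = 2.5)

  // Approximate: usually, but not always, the true nearest neighbour and the true top five
  val one  = tree.approximateNearest(42.2)
  val five = tree.approximateNearestN(42.2, n = 5)

  println((near.length, one, five.length))  // (5, (42.0, "point-42") in most runs, 5)
}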