├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── build.sbt ├── project └── assembly.sbt └── src ├── examples ├── resources │ └── valid_codes └── scala │ └── sparkz │ └── examples │ ├── BinaryClassifierEvaluationExample.scala │ └── DataValidation.scala └── main ├── resources └── stopwords.tsv └── scala ├── sam └── sceval │ ├── AreaUnderCurve.scala │ ├── BinUtils.scala │ ├── BinaryConfusionMatrix.scala │ ├── EvaluationPimps.scala │ ├── XValidator.scala │ └── package.scala └── sparkz ├── classifiers ├── BinaryClassifier.scala ├── BinaryClassifierTrainerWithTransformer.scala ├── DecisionTreeVectorClassifier.scala ├── Features.scala └── RandomBinaryClassifierTrainer.scala ├── evaluation ├── BinaryClassifierEvaluation.scala └── MAP.scala ├── transformers ├── EnsembleTransformer.scala ├── MultiOneHotTransformer.scala ├── OriginalNumericalsTransformer.scala └── TermFrequencyTransformer.scala └── utils ├── AppLogger.scala ├── Pimps.scala ├── StatCounter.scala ├── TopN.scala └── VPTree.scala /.gitignore: -------------------------------------------------------------------------------- 1 | ### Intellij ### 2 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, WebStorm 3 | 4 | ## Directory-based project format 5 | .idea/ 6 | # if you remove the above rule, at least ignore user-specific stuff: 7 | # .idea/workspace.xml 8 | # .idea/tasks.xml 9 | # and these sensitive or high-churn files: 10 | # .idea/dataSources.ids 11 | # .idea/dataSources.xml 12 | # .idea/sqlDataSources.xml 13 | # .idea/dynamic.xml 14 | 15 | ## File-based project format 16 | *.ipr 17 | *.iws 18 | *.iml 19 | 20 | ## Additional for IntelliJ 21 | out/ 22 | gen/ 23 | 24 | # generated by mpeltonen/sbt-idea plugin 25 | .idea_modules/ 26 | 27 | # generated by JIRA plugin 28 | atlassian-ide-plugin.xml 29 | 30 | # generated by Crashlytics plugin (for Android Studio and Intellij) 31 | com_crashlytics_export_strings.xml 32 | 33 | 34 | ### idea-gitignore 35 | 36 | # Release package 37 | idea-gitignore.jar 38 | resources/templates.list 39 | build/ 40 | build.properties 41 | junit*.properties -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | 3 | scala: 4 | - 2.10.4 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sparkz 2 | [![Build Status](https://travis-ci.org/gm-spacagna/sparkz.svg?branch=master)](https://travis-ci.org/gm-spacagna/sparkz) 3 | 4 | A proof-of-concept extension to the amazing Spark framework for better functional programming. 5 | The project aims to extend, and in a few cases re-implement, some of the functionality and classes in the [Apache Spark](spark.apache.org) framework. 6 | 7 | The main motivation is to make the APIs of some Machine Learning components statically typed, to provide the functional structures missing from some classes (Broadcast variables, data validation pipelines, utility classes...) and to work around unnecessary limitations imposed by private fields/methods. 8 | Moreover, the project introduces a collection of utility functions, implicits and tutorials that show the power, conciseness and elegance of the Spark framework when combined with a fully functional design. 9 | 10 | ## Sonatype dependency 11 | Maven: 12 | 13 | <dependency> 14 | <groupId>com.github.gm-spacagna</groupId> 15 | <artifactId>sparkz_2.10</artifactId> 16 | <version>0.1.0</version> 17 | </dependency> 18 | 19 | sbt: 20 | 21 | "com.github.gm-spacagna" % "sparkz_2.10" % "0.1.0" 22 | 23 | ## Current features 24 | 25 | * Functional Data Validation using monads and applicative functors: https://datasciencevademecum.wordpress.com/2016/03/09/functional-data-validation-using-monads-and-applicative-functors/ 26 | * Integration with sceval for a better binary classification evaluation framework: https://github.com/samthebest/sceval 27 | * Immutable StatCounter class with a Monoid instance 28 | * Collection of Pimps (implicit utility classes) for everyday tasks 29 | * Lazy logger for debugging computations 30 | * Transformer -> Trainer -> Model -> Evaluation functional framework for machine learning algorithms (similar to ML pipelines but typed and functional): https://datasciencevademecum.wordpress.com/2016/04/12/robust-and-declarative-machine-learning-pipelines-for-predictive-buyin/ 31 | 32 | ## WIP 33 | * Functor for Spark Broadcast 34 | 35 | 36 | ## Limitations 37 | The original Spark implementations are intentionally not fully functional in order to avoid overloading the garbage collector and to allow more efficient, mutable data structures. This project is only a proof of concept meant to inspire developers, data scientists and engineers to think about their designs in purely functional terms; it does not guarantee better performance. You are strongly encouraged to tailor and tune each component to your specific needs.
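## Quick example

The snippet below is condensed from `src/examples/scala/sparkz/examples/BinaryClassifierEvaluationExample.scala`; `data`, `UserFeatures` and `UserInfo` are that example's own types, with `data` being an `RDD[FeaturesWithBooleanLabel[UserFeatures] with MetaData[UserInfo]]`. It cross-validates each classifier and scores it with sceval's confusion-matrix pimps:

    import sam.sceval.EvaluationPimps._
    import sparkz.classifiers._
    import sparkz.evaluation.BinaryClassifierEvaluation
    import sparkz.transformers._

    val classifiers: List[BinaryClassifierTrainer[UserFeatures]] = List(
      RandomBinaryClassifierTrainer(),
      BinaryClassifierTrainerWithTransformer(
        vec2classifier = DecisionTreeClassifierTrainer(impurity = "gini", maxDepth = 5, maxBins = 32),
        transformer = TermFrequencyTransformer[String, UserFeatures](_.events)
      )
    )

    val aucByClassifier: Map[BinaryClassifierTrainer[UserFeatures], Double] =
      BinaryClassifierEvaluation.crossValidationScores(
        data = data,
        k = 10,
        classifiers = classifiers,
        uniqueId = (_: UserInfo).customerId,
        orderingField = (_: UserInfo).date.toDateTimeAtStartOfDay.getMillis,
        singleInference = true
      ).mapValues(
        _.map { case (example, score) => score -> example.isTrue }
          .confusions().areaUnderROC
      )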
38 | 39 | ## Related projects 40 | * Frameless: https://github.com/adelbertc/frameless 41 | * Exploratory Data Analysis: https://github.com/vicpara/exploratory-data-analysis 42 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | organization := "com.github.gm-spacagna" 2 | 3 | name := "sparkz" 4 | 5 | version := "0.1.0-SNAPSHOT" 6 | 7 | scalaVersion := "2.10.4" 8 | 9 | libraryDependencies ++= Seq( 10 | "joda-time" % "joda-time" % "2.6" withSources() withJavadoc(), 11 | "org.joda" % "joda-convert" % "1.2" withSources() withJavadoc(), 12 | "org.apache.spark" % "spark-core_2.10" % "1.3.0" withSources() withJavadoc(), 13 | "org.apache.spark" % "spark-mllib_2.10" % "1.3.0" withSources() withJavadoc(), 14 | "com.github.scala-incubator.io" %% "scala-io-file" % "0.4.2" withSources() withJavadoc(), 15 | "org.scalaz" %% "scalaz-core" % "7.0.6" withSources() withJavadoc(), 16 | "org.rogach" %% "scallop" % "0.9.5" withSources() withJavadoc(), 17 | "org.scala-lang" % "scalap" % "2.10.4" withSources() withJavadoc(), 18 | "org.scala-lang" % "scala-compiler" % "2.10.4" withSources() withJavadoc(), 19 | "org.specs2" %% "specs2-core" % "2.4.9-scalaz-7.0.6" % "test" withSources() withJavadoc(), 20 | "org.specs2" %% "specs2-scalacheck" % "2.4.9-scalaz-7.0.6" % "test" withSources() withJavadoc() 21 | ) 22 | 23 | resolvers ++= Seq( 24 | "Maven Central" at "https://repo1.maven.org/maven2/" 25 | ) 26 | 27 | mergeStrategy in assembly <<= (mergeStrategy in assembly) ((old) => { 28 | case x if Assembly.isConfigFile(x) => 29 | MergeStrategy.concat 30 | case PathList(ps @ _*) if Assembly.isReadme(ps.last) || Assembly.isLicenseFile(ps.last) => 31 | MergeStrategy.rename 32 | case PathList("META-INF", xs @ _*) => 33 | (xs map {_.toLowerCase}) match { 34 | case ("manifest.mf" :: Nil) | ("index.list" :: Nil) | ("dependencies" :: Nil) => 35 | MergeStrategy.discard 36 | case ps @ (x :: xs) if ps.last.endsWith(".sf") || ps.last.endsWith(".dsa") => 37 | MergeStrategy.discard 38 | case "plexus" :: xs => 39 | MergeStrategy.discard 40 | case "services" :: xs => 41 | MergeStrategy.filterDistinctLines 42 | case ("spring.schemas" :: Nil) | ("spring.handlers" :: Nil) => 43 | MergeStrategy.filterDistinctLines 44 | case _ => MergeStrategy.first // Changed deduplicate to first 45 | } 46 | case PathList(_*) => MergeStrategy.first // added this line 47 | }) 48 | -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") 2 | -------------------------------------------------------------------------------- /src/examples/resources/valid_codes: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 10 | 10 11 | -------------------------------------------------------------------------------- /src/examples/scala/sparkz/examples/BinaryClassifierEvaluationExample.scala: -------------------------------------------------------------------------------- 1 | package sparkz.examples 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.joda.time.LocalDate 5 | import sam.sceval.EvaluationPimps._ 6 | import sparkz.classifiers._ 7 | import sparkz.evaluation.BinaryClassifierEvaluation 8 | import sparkz.transformers._ 9 | 10 | case class UserInfo(customerId: Long, date: 
LocalDate) 11 | case class UserFeatures(events: List[String], attributes: Map[String, String], numericals: Map[String, Double]) 12 | case class UserFeaturesWithBooleanLabel(features: UserFeatures, 13 | metaData: UserInfo, 14 | isTrue: Boolean) extends FeaturesWithBooleanLabel[UserFeatures] with MetaData[UserInfo] 15 | 16 | object BinaryClassifierEvaluationExample { 17 | def auc(data: RDD[(FeaturesWithBooleanLabel[UserFeatures] with MetaData[UserInfo])]): Map[BinaryClassifierTrainer[UserFeatures], Double] = { 18 | 19 | val vecDecisionTreeClassifier: BinaryClassifierVectorTrainer = 20 | DecisionTreeClassifierTrainer(impurity = "gini", maxDepth = 5, maxBins = 32) 21 | 22 | val eventsSubTransformer = TermFrequencyTransformer[String, UserFeatures](_.events) 23 | val categoriesSubTransformer = MultiOneHotTransformer[String, UserFeatures](_.attributes) 24 | val numericalsSubTransformer = OriginalNumericalsTransformer[String, UserFeatures](_.numericals) 25 | 26 | val classifiers: List[BinaryClassifierTrainer[UserFeatures]] = List( 27 | RandomBinaryClassifierTrainer(), 28 | BinaryClassifierTrainerWithTransformer( 29 | vec2classifier = vecDecisionTreeClassifier, 30 | transformer = eventsSubTransformer 31 | ), 32 | BinaryClassifierTrainerWithTransformer( 33 | vec2classifier = vecDecisionTreeClassifier, 34 | transformer = categoriesSubTransformer 35 | ), 36 | BinaryClassifierTrainerWithTransformer( 37 | vec2classifier = vecDecisionTreeClassifier, 38 | transformer = numericalsSubTransformer 39 | ), 40 | BinaryClassifierTrainerWithTransformer( 41 | vec2classifier = vecDecisionTreeClassifier, 42 | transformer = EnsembleTransformer(eventsSubTransformer, categoriesSubTransformer, numericalsSubTransformer) 43 | ) 44 | ) 45 | 46 | BinaryClassifierEvaluation.crossValidationScores( 47 | data = data, 48 | k = 10, 49 | classifiers = classifiers, 50 | uniqueId = (_: UserInfo).customerId, 51 | orderingField = (_: UserInfo).date.toDateTimeAtStartOfDay.getMillis, 52 | singleInference = true 53 | ).mapValues( 54 | _.map { 55 | case (features, score) => score -> features.isTrue 56 | } 57 | .confusions().areaUnderROC 58 | ) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/examples/scala/sparkz/examples/DataValidation.scala: -------------------------------------------------------------------------------- 1 | package sparkz.examples 2 | 3 | import org.apache.spark.broadcast.Broadcast 4 | import org.apache.spark.rdd.RDD 5 | import org.joda.time.{DateTime, Interval, LocalDate} 6 | import sparkz.utils.Pimps._ 7 | 8 | import scalaz.Scalaz._ 9 | import scalaz.ValidationNel 10 | 11 | case class UserEvent(userId: Long, eventCode: Int, timestamp: Long) 12 | 13 | sealed trait InvalidEventCause 14 | case object NonRecognizedEventType extends InvalidEventCause 15 | case object BlackListUser extends InvalidEventCause 16 | case object NonEligibleUser extends InvalidEventCause 17 | case object OutOfGlobalIntervalEvent extends InvalidEventCause 18 | case object FirstDayToConsiderEvent extends InvalidEventCause 19 | 20 | case class InvalidEvent(event: UserEvent, cause: InvalidEventCause) 21 | 22 | case object DataValidation { 23 | def validationFunction(events: RDD[UserEvent], 24 | eligibleUsers: Set[Long], 25 | validEventCodes: Set[Int], 26 | blackListEventCodes: Set[Int], 27 | minDate: String, maxDate: String): UserEvent => ValidationNel[InvalidEvent, UserEvent] = { 28 | val sc = events.context 29 | 30 | val validEventCodesBV: Broadcast[Set[Int]] = sc.broadcast(validEventCodes) 
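    // Each rule below is a PartialFunction[UserEvent, InvalidEventCause] defined only where the
    // event violates that rule; the reference sets are broadcast so executors read one shared
    // copy instead of serialising the set into every task closure.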
31 | val notRecognizedEventCode: PartialFunction[UserEvent, InvalidEventCause] = { 32 | case event if !validEventCodesBV.value.contains(event.eventCode) => NonRecognizedEventType 33 | } 34 | 35 | val eligibleUsersBV: Broadcast[Set[Long]] = sc.broadcast(eligibleUsers) 36 | val customerNotEligible: PartialFunction[UserEvent, InvalidEventCause] = { 37 | case event if !eligibleUsersBV.value.contains(event.userId) => NonEligibleUser 38 | } 39 | 40 | val blackListEventCodesBV: Broadcast[Set[Int]] = sc.broadcast(blackListEventCodes) 41 | // Users for which we observed a black list event 42 | val blackListUsersBV: Broadcast[Set[Long]] = sc.broadcast( 43 | events.filter(event => blackListEventCodesBV.value.contains(event.eventCode)) 44 | .map(_.userId).distinct().collect().toSet 45 | ) 46 | val customerIsInBlackList: PartialFunction[UserEvent, InvalidEventCause] = { 47 | case event if blackListUsersBV.value.contains(event.userId) => BlackListUser 48 | } 49 | 50 | 51 | val eventIsOutOfGlobalInterval: PartialFunction[UserEvent, InvalidEventCause] = { 52 | case event if !new Interval(DateTime.parse(minDate), DateTime.parse(maxDate)).contains(event.timestamp) => 53 | OutOfGlobalIntervalEvent 54 | } 55 | 56 | // max between first date we have ever seen a customer event and the global min date 57 | val customersFirstDayToConsiderBV: Broadcast[Map[Long, LocalDate]] = 58 | sc.broadcast( 59 | events.keyBy(_.userId) 60 | .mapValues(personalEvent => new DateTime(personalEvent.timestamp).toLocalDate) 61 | .reduceByKey((date1, date2) => List(date1, date2).minBy(_.toDateTimeAtStartOfDay.getMillis)) 62 | .mapValues(firstDate => List(firstDate, LocalDate.parse(minDate)).maxBy(_.toDateTimeAtStartOfDay.getMillis)) 63 | .collect().toMap 64 | ) 65 | val eventIsFirstDayToConsider: PartialFunction[UserEvent, InvalidEventCause] = { 66 | case event if customersFirstDayToConsiderBV.value(event.userId).isEqual(event.timestamp.toLocalDate) => 67 | FirstDayToConsiderEvent 68 | } 69 | val validationRules: List[PartialFunction[UserEvent, InvalidEventCause]] = 70 | List(customerNotEligible, notRecognizedEventCode, customerIsInBlackList, 71 | eventIsOutOfGlobalInterval.orElse(eventIsFirstDayToConsider) 72 | ) 73 | 74 | (event: UserEvent) => validationRules.map(_.toFailureNel(event, InvalidEvent(event, _))).reduce(_ |+++| _) 75 | } 76 | 77 | def onlyValidEvents(events: RDD[UserEvent], 78 | validationFunc: UserEvent => ValidationNel[InvalidEvent, UserEvent]): RDD[UserEvent] = 79 | events.map(validationFunc).flatMap(_.toOption) 80 | 81 | def invalidEvents(events: RDD[UserEvent], 82 | validationFunc: UserEvent => ValidationNel[InvalidEvent, UserEvent]): RDD[InvalidEvent] = 83 | events.map(validationFunc).flatMap(_.swap.toOption).flatMap(_.toList) 84 | 85 | def outOfRangeEvents(events: RDD[UserEvent], 86 | validationFunc: UserEvent => ValidationNel[InvalidEvent, UserEvent]): RDD[UserEvent] = 87 | events.map(validationFunc).flatMap(_.swap.toOption).flatMap(_.toSet).flatMap { 88 | case InvalidEvent(event, OutOfGlobalIntervalEvent) => event.some 89 | case _ => Nil 90 | } 91 | 92 | // This method will return something like: 93 | // Map(Set(NonEligibleCustomer, NonRecognizedEventType) -> 36018450, 94 | // Set(NonEligibleUser) -> 9037691, 95 | // Set(NonEligibleUser, BlackListUser, NonRecognizedEventType) -> 137816, 96 | // Set(NonEligibleUser) -> 464694973, 97 | // Set(BeforeFirstDayToConsiderEvent, NonRecognizedEventType) -> 5147475, 98 | // Set(OutOfGlobalIntervalEvent, NonRecognizedEventType) -> 983478) 99 | def 
causeSetToInvalidEventsCount(events: RDD[UserEvent], 100 | validationFunc: UserEvent => ValidationNel[InvalidEvent, UserEvent]): Map[Set[InvalidEventCause], Int] = 101 | events.map(validationFunc) 102 | .map(_.swap).flatMap(_.toOption).map(_.map(_.cause).toSet -> 1) 103 | .reduceByKey(_ + _) 104 | .collect().toMap 105 | 106 | // This method will return something like: 107 | // Map(Set(NonEligibleCustomer, NonRecognizedEventType) -> 1545, 108 | // Set(NonEligibleUser) -> 122, 109 | // Set(NonEligibleUser, BlackListUser, NonRecognizedEventType) -> 3224, 110 | // Set(NonEligibleUser) -> 4, 111 | // Set(BeforeFirstDayToConsiderEvent, NonRecognizedEventType) -> 335, 112 | // Set(OutOfGlobalIntervalEvent, NonRecognizedEventType) -> 33) 113 | def causeSetToUsersLostCount(events: RDD[UserEvent], 114 | validationFunc: UserEvent => ValidationNel[InvalidEvent, UserEvent]): Map[Set[InvalidEventCause], Int] = { 115 | val survivedUsersBV: Broadcast[Set[Long]] = 116 | events.context.broadcast(events.map(validationFunc).flatMap(_.toOption).map(_.userId).distinct().collect().toSet) 117 | 118 | events.map(validationFunc).flatMap(_.swap.toOption) 119 | .keyBy(_.head.event.userId) 120 | .filter(_._1 |> (!survivedUsersBV.value(_))) 121 | .mapValues(_.map(_.cause).toSet) 122 | .mapValues(Set(_)) 123 | .reduceByKey(_ ++ _) 124 | .flatMap(_._2) 125 | .map(_ -> 1) 126 | .reduceByKey(_ + _) 127 | .collect().toMap 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/main/resources/stopwords.tsv: -------------------------------------------------------------------------------- 1 | PETER 2 | PRACTICE 3 | ROAD 4 | ENGINEERING 5 | SOLICITORS 6 | SURGERY 7 | HAIR 8 | ROYAL 9 | DAY 10 | HOLDINGS 11 | SON 12 | CLUB 13 | COMMUNITY 14 | LONDON 15 | THE 16 | PHARMACY 17 | LODGE 18 | SCHOOL 19 | ELECTRICAL 20 | CLINIC 21 | CARE 22 | DESIGN 23 | SHOP 24 | STORES 25 | HOME 26 | NURSERY 27 | DENTAL 28 | AUTO 29 | SOUTH 30 | SERVICES 31 | HALL 32 | OLD 33 | OF 34 | HEATING 35 | ASSOCIATION 36 | CHURCH 37 | GARAGE 38 | SALON 39 | ROBERT 40 | SYSTEMS 41 | LIBRARY 42 | ST 43 | MOTORS 44 | HEALTH 45 | LTD 46 | PARK 47 | GROUP 48 | SERVICE 49 | NEW 50 | COMPANY 51 | SONS 52 | HOTEL 53 | HOUSE 54 | MOTOR 55 | ARMS 56 | UK 57 | THOMAS 58 | FARM 59 | LIMITED 60 | PLUMBING 61 | BEAUTY 62 | CO 63 | RESTAURANT 64 | GARDEN 65 | INN 66 | BAR 67 | CAFE 68 | MANAGEMENT 69 | BUILDING 70 | NORTH 71 | STATION 72 | CATHOLIC 73 | HIRE 74 | COURT 75 | CENTRE 76 | FISH 77 | STUDIO 78 | SPORTS 79 | PAUL 80 | MEDICAL 81 | NEWS 82 | VILLAGE 83 | ASSOCIATES 84 | PRIMARY 85 | OFFICE 86 | POST 87 | INTERNATIONAL 88 | SUPPLIES 89 | -------------------------------------------------------------------------------- /src/main/scala/sam/sceval/AreaUnderCurve.scala: -------------------------------------------------------------------------------- 1 | package sam.sceval 2 | 3 | import org.apache.spark.mllib.rdd.RDDFunctions._ 4 | import org.apache.spark.rdd.RDD 5 | 6 | /** Computes the area under the curve (AUC) using the trapezoidal rule. */ 7 | @deprecated("Don't use meaningless measures, use something that has a direct probabilistic meaning. 
See README.md") 8 | object AreaUnderCurve { 9 | def trapezoid(points: Seq[(Double, Double)]): Double = { 10 | val (x1, x2) :: (y1, y2) :: Nil = points 11 | (y1 - x1) * (y2 + x2) / 2.0 12 | } 13 | 14 | def apply(curve: RDD[(Double, Double)]): Double = curve.sliding(2).aggregate(0.0)( 15 | seqOp = (auc: Double, points: Array[(Double, Double)]) => auc + trapezoid(points.toList), 16 | combOp = _ + _ 17 | ) 18 | 19 | def apply(curve: Iterable[(Double, Double)]): Double = 20 | curve.toIterator.sliding(2).withPartial(false).aggregate(0.0)( 21 | seqop = (auc: Double, points: Seq[(Double, Double)]) => auc + trapezoid(points), 22 | combop = _ + _ 23 | ) 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/sam/sceval/BinUtils.scala: -------------------------------------------------------------------------------- 1 | package sam.sceval 2 | 3 | object BinUtils { 4 | case class BinStats(startBinNumber: Int = 0, offset: Int = 0) 5 | 6 | def binnerFac[Model](partitionLastIndexes: Array[Map[Model, Long]], 7 | numRecodsPerBin: Long): (Model, Long, Int) => Int = { 8 | val modelToBinStats: Map[Model, Array[BinStats]] = 9 | partitionLastIndexes.flatMap(_.keySet).toSet.foldLeft(Map.empty[Model, List[BinStats]])((modelToStats, model) => 10 | partitionLastIndexes.foldLeft(modelToStats)((modelToStats, partition) => 11 | modelToStats + (model -> ((partition.get(model), modelToStats.getOrElse(model, List(BinStats()))) match { 12 | case (Some(lastIndex), cum@(BinStats(startBinNumber, offset) :: _)) => 13 | val newOffset = (lastIndex + 1 + offset) % numRecodsPerBin 14 | BinStats((startBinNumber + (lastIndex + offset) / numRecodsPerBin).toInt + (if (newOffset == 0) 1 else 0), 15 | newOffset.toInt) +: cum 16 | case (None, cum@(binStats :: _)) => binStats +: cum 17 | case _ => ??? 
// default impossible case to remove warning messages 18 | })))) // map identity is a hack around the non-serializability of the Map returned from mapValues 19 | .mapValues(_.reverse.toArray).map(identity) 20 | 21 | (model: Model, index: Long, partitionIndex: Int) => { 22 | val BinStats(startBinNumber, offset) = modelToBinStats(model)(partitionIndex) 23 | (startBinNumber + (index + offset) / numRecodsPerBin).toInt 24 | } 25 | } 26 | 27 | def resultingBinNumber(recordsPerBin: Int, totalRecords: Long): Long = 28 | if (totalRecords % recordsPerBin == 0) totalRecords / recordsPerBin else (totalRecords / recordsPerBin) + 1 29 | 30 | def optimizeRecordsPerBin(totalRecords: Long, desiredBinNum: Int): Long = 31 | (1 to (if (desiredBinNum < totalRecords) 1 + (totalRecords / desiredBinNum) else desiredBinNum).toInt) 32 | .minBy(recordsPerBin => math.abs(resultingBinNumber(recordsPerBin, totalRecords) - desiredBinNum)) 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/sam/sceval/BinaryConfusionMatrix.scala: -------------------------------------------------------------------------------- 1 | package sam.sceval 2 | 3 | case class MutableBinaryLabelCount(var numPositives: Long = 0L, var numNegatives: Long = 0L) { 4 | def +=(label: Boolean): MutableBinaryLabelCount = { 5 | if (label) numPositives += 1L else numNegatives += 1L 6 | this 7 | } 8 | 9 | def +=(other: MutableBinaryLabelCount): MutableBinaryLabelCount = { 10 | numPositives += other.numPositives 11 | numNegatives += other.numNegatives 12 | this 13 | } 14 | 15 | override def clone: MutableBinaryLabelCount = new MutableBinaryLabelCount(numPositives, numNegatives) 16 | 17 | def count: BinaryLabelCount = BinaryLabelCount(numPositives, numNegatives) 18 | def total: Long = numPositives + numNegatives 19 | } 20 | 21 | case class BinaryLabelCount(numPositives: Long = 0L, numNegatives: Long = 0L) { 22 | def total: Long = numPositives + numNegatives 23 | 24 | def +(label: Boolean): BinaryLabelCount = 25 | if (label) copy(numPositives = numPositives + 1L) else copy(numNegatives = numNegatives + 1L) 26 | 27 | def +(other: BinaryLabelCount): BinaryLabelCount = 28 | BinaryLabelCount(numPositives + other.numPositives, numNegatives + other.numNegatives) 29 | } 30 | 31 | case class BinaryConfusionMatrix(tp: Long, fp: Long, tn: Long, fn: Long) { 32 | def total: Long = tp + fp + tn + fn 33 | 34 | def actualPositives: Long = tp + fn 35 | def actualNegatives: Long = fp + tn 36 | 37 | def predictedPositives: Long = tp + fp 38 | def predictedNegatives: Long = tn + fn 39 | 40 | def volume: Double = predictedPositives.toDouble / total 41 | 42 | /** The 'probability' the label is actually True given we predict True */ 43 | def precision: Double = if (predictedPositives == 0) 1.0 else tp.toDouble / predictedPositives 44 | 45 | /** The 'probability' we will predict True given the label is actually True */ 46 | def recall: Double = if (actualPositives == 0) 0.0 else tp.toDouble / actualPositives 47 | 48 | /** Actual probability of True */ 49 | def prior: Double = actualPositives.toDouble / total 50 | 51 | /** The 'probability' we will predict True */ 52 | def predictedPrior: Double = (tp + fp).toDouble / total 53 | 54 | /** How many times the predictor is better than random. One can usually map this number directly to 55 | * savings / profit making it have business meaning, unlike most measures. 
*/ 56 | def uplift: Double = (tp * total).toDouble / ((tp + fp) * (tp + fn)) 57 | 58 | def specificity: Double = tn.toDouble / actualNegatives 59 | def negativePredictiveValue: Double = tn.toDouble / predictedNegatives 60 | def fallOut: Double = fp.toDouble / actualNegatives 61 | def falseDiscoveryRate: Double = fp.toDouble / predictedPositives 62 | def falsePositiveRate: Double = if (actualNegatives == 0) 0.0 else fp.toDouble / actualNegatives 63 | def accuracy: Double = (tp + tn).toDouble / total 64 | 65 | def +(other: BinaryConfusionMatrix): BinaryConfusionMatrix = 66 | BinaryConfusionMatrix(other.tp + tp, other.fp + fp, other.tn + tn, other.fn + fn) 67 | 68 | @deprecated("Don't use meaningless measures, use something that has a direct probabilistic meaning. See README.md") 69 | def f1Measure(beta: Double = 1.0): Double = { 70 | val beta2 = beta * beta 71 | if (precision + recall == 0) 0.0 else (1.0 + beta2) * (precision * recall) / (beta2 * precision + recall) 72 | } 73 | } 74 | 75 | object BinaryConfusionMatrix { 76 | def apply(count: BinaryLabelCount, totalCount: BinaryLabelCount): BinaryConfusionMatrix = BinaryConfusionMatrix( 77 | tp = count.numPositives, 78 | fp = count.numNegatives, 79 | tn = totalCount.numNegatives - count.numNegatives, 80 | fn = totalCount.numPositives - count.numPositives 81 | ) 82 | } 83 | 84 | 85 | -------------------------------------------------------------------------------- /src/main/scala/sam/sceval/EvaluationPimps.scala: -------------------------------------------------------------------------------- 1 | package sam.sceval 2 | 3 | import BinUtils._ 4 | import org.apache.spark.Logging 5 | import org.apache.spark.rdd.{RDD, UnionRDD} 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.storage.StorageLevel.MEMORY_ONLY 8 | 9 | import scala.collection.mutable 10 | import scala.reflect.ClassTag 11 | 12 | // TODO (long term) Abstract out the RDD part so we can have a List version and a ParIterable version. 13 | // I.e. introduce a DistributedDataset type-class (or DD) 14 | 15 | // TODO Another version where the number of models is huge, but there is a long (0.0, Boolean) tail which can be 16 | // preaggregated to then allow for a reduceBy(model). 17 | // Will be useful for evaluating matching algorithms or tiny-cluster clustering problems 18 | 19 | 20 | object EvaluationPimps extends Logging { 21 | implicit class PimpedScoresAndLabelsRDD(scoreAndLabels: RDD[(Double, Boolean)]) { 22 | 23 | import PimpedScoresAndLabelsRDD._ 24 | 25 | def confusions(cacheIntermediate: Option[StorageLevel] = Some(MEMORY_ONLY), 26 | bins: Option[Int] = Some(1000), 27 | recordsPerBin: Option[Long] = None): Seq[BinaryConfusionMatrix] = 28 | if (scoreAndLabels.isEmpty()) 29 | Nil 30 | else 31 | scoreAndLabels.map(0 -> _).map(Map[Int, (Double, Boolean)](_)) 32 | .confusionsByModel(cacheIntermediate, bins, recordsPerBin).map(_._2).collect().head 33 | 34 | /** Will incur 3 spark stages for bins != 0, 2 otherwise. 35 | * This method is `approx` in the sense the bin sizes and bin count can vary quite wildly. Use at own risk. */ 36 | @deprecated("bin sizes & bin counts vary wildly. 
Use `confusions`.") 37 | def scoresAndConfusions(desiredBins: Int = 0): RDD[(Double, BinaryConfusionMatrix)] = { 38 | val binnedCounts: ScoresAndCounts = 39 | downSampleIfRequired(scoreAndLabels.combineByKey( 40 | createCombiner = new MutableBinaryLabelCount(0L, 0L) += (_: Boolean), 41 | mergeValue = (_: MutableBinaryLabelCount) += (_: Boolean), 42 | mergeCombiners = (_: MutableBinaryLabelCount) += (_: MutableBinaryLabelCount) 43 | ).sortByKey(ascending = false), desiredBins) 44 | 45 | val partitionwiseCumCounts: Array[MutableBinaryLabelCount] = partitionwiseCumulativeCounts(binnedCounts) 46 | val totalCount = partitionwiseCumCounts.last 47 | 48 | logInfo(s"Total counts: $totalCount") 49 | 50 | binnedCounts.mapPartitionsWithIndex( 51 | (index, partition) => partition.map { 52 | case (score, c) => (score, (partitionwiseCumCounts(index) += c).count) 53 | }, 54 | preservesPartitioning = true) 55 | .map { 56 | case (score, cumCount) => (score, BinaryConfusionMatrix(cumCount, totalCount.count)) 57 | } 58 | } 59 | } 60 | 61 | /** Model should extend AnyVal or Equals so that it makes sense to use this as a key. 62 | * Should scale reasonably well in the number of models. We return RDD because BCMs are computed in parallel for 63 | * each model, which ought to be a bit faster than a local computation for a large number of models. 64 | * This will ensure all bins except potentially the last bin are of equal size, consequently certain `bins` arguments 65 | * may be impossible to respect, in such cases the bin number that is closest to the desired bins is chosen. 66 | * 67 | * The first element of each Array[BinaryLabelCount] from `binaryLabelCounts` will be the total. 68 | * 69 | * This algorithm costs 4 stages and causes a job that runs the first 2. 70 | * 71 | * @param scoreAndLabelsByModel an RDD of Maps from the `Model` to a score-label pair. It's assumed that for each 72 | * `Model` the total number of score-label pairs in the RDD is equal, if not then 73 | * the behaviour is unspecified and a warning is printed. 
*/ 74 | implicit class PimpedModelOutputsRDD[Model: ClassTag](scoreAndLabelsByModel: RDD[Map[Model, (Double, Boolean)]]) { 75 | 76 | import PimpedModelOutputsRDD._ 77 | 78 | def confusionsByModel(cacheIntermediate: Option[StorageLevel] = Some(MEMORY_ONLY), 79 | bins: Option[Int] = Some(1000), 80 | recordsPerBin: Option[Long] = None): RDD[(Model, Array[BinaryConfusionMatrix])] = 81 | binaryLabelCounts(cacheIntermediate, bins, recordsPerBin) 82 | .mapValues(blcs => blcs.map(BinaryConfusionMatrix(_, blcs.head))) 83 | 84 | def binaryLabelCounts(cacheIntermediate: Option[StorageLevel] = Some(MEMORY_ONLY), 85 | bins: Option[Int] = Some(1000), 86 | recordsPerBin: Option[Long] = None): RDD[(Model, Array[BinaryLabelCount])] = { 87 | checkArgs(bins, recordsPerBin) 88 | 89 | scoreAndLabelsByModel.take(1).headOption.flatMap { _ => 90 | val indexed: RDD[(Double, (Model, Boolean, Long))] = indexInPartition(scoreAndLabelsByModel) 91 | 92 | cacheIntermediate.foreach(indexed.persist) 93 | 94 | val lastIndexes: Array[Map[Model, Long]] = partitionLastIndexes(indexed) 95 | 96 | reindexByBin(indexed, lastIndexes, recordsPerBin, bins) 97 | .map(computeBLCs) 98 | } 99 | .getOrElse(scoreAndLabelsByModel.context.makeRDD[(Model, Array[BinaryLabelCount])](Nil)) 100 | } 101 | } 102 | 103 | def checkArgs(bins: Option[Int] = Some(1000), recordsPerBin: Option[Long] = None): Unit = { 104 | require(bins.isDefined ^ recordsPerBin.isDefined, "Only one of bins or recordsPerBin can be specified") 105 | bins.foreach { b => 106 | require(b > 0, "Doesn't make sense to request zero or less bins: " + b) 107 | require(b != 1, "Requesting 1 bin doesn't make sense. If you want the total use 2 bins and access " + 108 | "totalCount in BinaryLabelCounts") 109 | } 110 | recordsPerBin.foreach(r => require(r >= 0, "Doesn't make sense to request negative records per bin: " + r)) 111 | } 112 | 113 | /** Companion object for `PimpedModelOutputsRDD` and methods only likely to be useful in this class. Methods not 114 | * declared private since this is a functional context. 
*/ 115 | // We use mutability in these methods at the partition level to minimize pressure on GC 116 | object PimpedModelOutputsRDD { 117 | type Indexed[Model] = RDD[(Double, (Model, Boolean, Long))] 118 | 119 | def reindexByBin[Model: ClassTag](indexed: Indexed[Model], 120 | lastIndexes: Array[Map[Model, Long]], 121 | recordsPerBin: Option[Long], 122 | bins: Option[Int]): Option[RDD[(Model, Boolean, Int)]] = { 123 | val totalRecords = 124 | lastIndexes.flatMap(_.keys).toSet.map((model: Model) => 125 | lastIndexes.filter(_.nonEmpty).flatMap(_.get(model).map(_ + 1)).sum) 126 | .toList match { 127 | case totalRecords :: Nil => 128 | totalRecords 129 | case totalRecords :: _ :: _ => 130 | logWarning("Total number of records for each model is not all equal.") 131 | totalRecords 132 | } 133 | 134 | logInfo("Total records: " + totalRecords) 135 | 136 | lastIndexes.find(_.nonEmpty).map { aNonEmptyPartition => 137 | recordsPerBin.foreach(r => require(r < totalRecords, s"Cannot request $r records per bin as not enough " + 138 | s"records in total to make 2 bins: $totalRecords")) 139 | 140 | val numRecordsPerBin: Long = recordsPerBin.getOrElse(optimizeRecordsPerBin(totalRecords, bins.get)) 141 | 142 | logInfo("Bins that will used: " + resultingBinNumber(numRecordsPerBin.toInt, totalRecords) + 143 | ", each with " + numRecordsPerBin + " records") 144 | 145 | reindexWithBinner(indexed, binnerFac(lastIndexes, numRecordsPerBin)) 146 | } 147 | } 148 | 149 | def reindexWithBinner[Model: ClassTag](indexed: Indexed[Model], 150 | binner: (Model, Long, Int) => Int): RDD[(Model, Boolean, Int)] = 151 | indexed.mapPartitionsWithIndex((partitionIndex, partition) => partition.map { 152 | case (_, (model, label, index)) => (model, label, binner(model, index, partitionIndex)) 153 | }) 154 | 155 | /** The last element of each Array[BinaryLabelCount] will be the total */ 156 | def computeBLCs[Model: ClassTag](indexedByBin: RDD[(Model, Boolean, Int)]): RDD[(Model, Array[BinaryLabelCount])] = 157 | indexedByBin.mapPartitions { partition => 158 | val bins: mutable.Map[(Model, Int), MutableBinaryLabelCount] = mutable.Map() 159 | partition.foreach { 160 | case (model, label, bin) => 161 | bins += ((model, bin) -> (bins.getOrElse((model, bin), MutableBinaryLabelCount()) += label)) 162 | } 163 | bins.mapValues(_.count).toList.iterator 164 | } 165 | .reduceByKey(_ + _).map { 166 | case ((model, bin), count) => (model, (bin, count)) 167 | } 168 | .groupByKey() 169 | .mapValues(_.toArray.sortBy(-_._1).map(_._2).scan(BinaryLabelCount())(_ + _).drop(1).reverse) 170 | 171 | def indexInPartition[Model: ClassTag](scoreAndLabelsByModel: RDD[Map[Model, (Double, Boolean)]]): Indexed[Model] = 172 | scoreAndLabelsByModel.flatMap(identity).map { 173 | case (model, (score, label)) => (score, (model, label)) 174 | } 175 | .sortByKey() 176 | .mapPartitions(partition => { 177 | val modelToCount: mutable.Map[Model, Long] = mutable.Map() 178 | partition.map { 179 | case (score, (model, label)) => 180 | val index = modelToCount.getOrElse(model, 0L) 181 | modelToCount += (model -> (index + 1)) 182 | // TODO Determine if keeping the score here is actually necessary - I don't think it makes sense 183 | (score, (model, label, index)) 184 | } 185 | }, preservesPartitioning = true) 186 | 187 | // Uses memory O(models x partitions) 188 | def partitionLastIndexes[Model: ClassTag](indexed: Indexed[Model]): Array[Map[Model, Long]] = 189 | indexed.mapPartitions { partition => 190 | val modelToCount: mutable.Map[Model, Long] = mutable.Map() 191 | 
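      // indexInPartition assigns strictly increasing per-model indexes within each partition, so
      // overwriting the entry on every record leaves the largest (last) index seen for each model.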
partition.foreach { 192 | case (_, (model, _, index)) => modelToCount += model -> index 193 | } 194 | Iterator(modelToCount.toMap) 195 | } 196 | .collect() 197 | } 198 | 199 | implicit class PimpedConfusionsSeq(confusions: Seq[BinaryConfusionMatrix]) { 200 | def roc: Seq[(Double, Double)] = 201 | (0.0, 0.0) +: confusions.map(bcm => (bcm.falsePositiveRate, bcm.recall)) :+(1.0, 1.0) 202 | 203 | def precisionByVolume: Seq[(Double, Double)] = confusions.map(bcm => (bcm.volume, bcm.precision)) 204 | def recallByVolume: Seq[(Double, Double)] = confusions.map(bcm => (bcm.volume, bcm.recall)) 205 | def precisionRecallCurve: Seq[(Double, Double)] = (0.0, 1.0) +: confusions.map(bcm => (bcm.recall, bcm.precision)) 206 | def areaUnderROC: Double = AreaUnderCurve(roc) 207 | def areaUnderPR: Double = AreaUnderCurve(precisionRecallCurve) 208 | } 209 | 210 | /** These are left as RDDs as per the original API, but they will be small data that's best returned to the driver 211 | * for subsequent processing */ 212 | implicit class PimpedScoresAndConfusionsRDD(confusions: RDD[(Double, BinaryConfusionMatrix)]) { 213 | def roc(): RDD[(Double, Double)] = { 214 | val rocCurve = confusions.map(_._2).map(bcm => (bcm.falsePositiveRate, bcm.recall)) 215 | val sc = confusions.context 216 | val first = sc.makeRDD(Seq((0.0, 0.0)), 1) 217 | val last = sc.makeRDD(Seq((1.0, 1.0)), 1) 218 | new UnionRDD[(Double, Double)](sc, Seq(first, rocCurve, last)) 219 | } 220 | 221 | def precisionRecallCurve(): RDD[(Double, Double)] = 222 | confusions.context.makeRDD(Seq((0.0, 1.0)), 1) 223 | .union(confusions.map(_._2).map(bcm => (bcm.recall, bcm.precision))) 224 | 225 | def thresholds(): RDD[Double] = confusions.map(_._1) 226 | def areaUnderROC(): Double = AreaUnderCurve(roc()) 227 | def areaUnderPR(): Double = AreaUnderCurve(precisionRecallCurve()) 228 | def precisionByThreshold(): RDD[(Double, Double)] = confusions.mapValues(_.precision) 229 | def recallByThreshold(): RDD[(Double, Double)] = confusions.mapValues(_.recall) 230 | @deprecated("Don't use meaningless measures, use something that has a direct probabilistic meaning. 
See README.md") 231 | def f1MeasureByThreshold(beta: Double = 1.0): RDD[(Double, Double)] = confusions.mapValues(_.f1Measure(beta)) 232 | } 233 | 234 | object PimpedScoresAndLabelsRDD { 235 | type ScoresAndCounts = RDD[(Double, MutableBinaryLabelCount)] 236 | 237 | // This doesn't scale in the number of bins since it all happens on the driver node 238 | def partitionwiseCumulativeCounts(binnedCounts: ScoresAndCounts): Array[MutableBinaryLabelCount] = 239 | binnedCounts.values.mapPartitions { partition => 240 | val agg = MutableBinaryLabelCount() 241 | partition.foreach(agg +=) 242 | Iterator(agg) 243 | } 244 | .collect() 245 | .scanLeft(MutableBinaryLabelCount())(_.clone += _) 246 | 247 | def downSample(grouping: Int, sortedCounts: ScoresAndCounts): ScoresAndCounts = 248 | sortedCounts.mapPartitions(_.grouped(grouping.toInt).map { 249 | case group@((firstScore, _) :: _) => 250 | val agg = new MutableBinaryLabelCount() 251 | group.map(_._2).foreach(agg +=) 252 | (firstScore, agg) 253 | }) 254 | 255 | def downSampleIfRequired(sortedCounts: ScoresAndCounts, desiredBins: Int): ScoresAndCounts = 256 | if (desiredBins == 0) sortedCounts 257 | else { 258 | val countsSize = sortedCounts.count() 259 | countsSize / desiredBins match { 260 | case g if g < 2 => 261 | logInfo(s"Curve is too small ($countsSize) for $desiredBins bins to be useful") 262 | sortedCounts 263 | case g if g >= Int.MaxValue => 264 | logWarning(s"Curve too large ($countsSize) for $desiredBins bins; capping at ${Int.MaxValue}") 265 | downSample(Int.MaxValue, sortedCounts) 266 | case g => 267 | downSample(g.toInt, sortedCounts) 268 | } 269 | } 270 | } 271 | } 272 | -------------------------------------------------------------------------------- /src/main/scala/sam/sceval/XValidator.scala: -------------------------------------------------------------------------------- 1 | package sam.sceval 2 | 3 | import org.apache.spark.rdd.RDD 4 | import EvaluationPimps._ 5 | import org.apache.spark.storage.StorageLevel 6 | import org.apache.spark.storage.StorageLevel._ 7 | 8 | import scala.util.Random 9 | 10 | /** x-validator that uses near exact same size folds */ 11 | case class XValidator(folds: Int = 10, 12 | evalBins: Option[Int] = Some(1000), 13 | evalCacheIntermediate: Option[StorageLevel] = Some(MEMORY_ONLY), 14 | evalRecordsPerBin: Option[Long] = None) { 15 | 16 | def trainWithExample(fold: Int, modelIndex: Int): Boolean = fold != modelIndex 17 | def scoreWithExample(fold: Int, modelIndex: Int): Boolean = fold == modelIndex 18 | 19 | /** Randomly enumerates all values such that each fold will have the same number of elements + / - 1. 20 | * It is then up to the user to decide how to use this to train their models (user can use the helper methods 21 | * `trainWithExample` and `scoreWithExample` to ensure a consistent approach) 22 | * 23 | * Strictly speaking there are edge cases where this will not generate random splits. Particularly when partitions 24 | * consist of a very small number of examples. 
*/ 25 | def split[Features](featuresAndLabels: RDD[(Features, Boolean)]): RDD[(Int, Features, Boolean)] = { 26 | val upToFolds = featuresAndLabels.take(folds).length 27 | require(upToFolds == folds, s"Not enough records ($upToFolds) for $folds folds") 28 | featuresAndLabels.mapPartitions(new Random().shuffle(_)).zipWithIndex().map { 29 | case ((f, l), i) => ((i % folds).toInt, f, l) 30 | } 31 | } 32 | 33 | def evaluate(scoresAndLabelsByModel: RDD[(Int, Double, Boolean)]): Array[BinaryConfusionMatrix] = 34 | scoresAndLabelsByModel.map(p => Map(p._1 -> (p._2, p._3))) 35 | .confusionsByModel(evalCacheIntermediate, evalBins, evalRecordsPerBin).map(_._2) 36 | .flatMap(_.zipWithIndex.map(_.swap)).reduceByKey(_ + _).collect().sortBy(_._1).map(_._2) 37 | 38 | def xval[Features](trainAndScoreByModel: RDD[(Int, Features, Boolean)] => RDD[(Int, Double, Boolean)], 39 | featuresAndLabel: RDD[(Features, Boolean)]): Array[BinaryConfusionMatrix] = 40 | evaluate(trainAndScoreByModel(split(featuresAndLabel))) 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/sam/sceval/package.scala: -------------------------------------------------------------------------------- 1 | package sam 2 | 3 | // Copied and pasted from https://github.com/samthebest/sceval until sceval is uploaded to maven as jar dependency 4 | package object sceval { 5 | 6 | } 7 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/classifiers/BinaryClassifier.scala: -------------------------------------------------------------------------------- 1 | package sparkz.classifiers 2 | 3 | import org.apache.spark.mllib.linalg.Vector 4 | import org.apache.spark.mllib.regression.LabeledPoint 5 | import org.apache.spark.rdd.RDD 6 | 7 | trait BinaryClassifierTrainedModel[Features] extends Serializable { 8 | def score(featuresWindow: Features): Double 9 | } 10 | 11 | trait BinaryClassifierTrainer[Features] { 12 | def train(trainingData: RDD[_ <:FeaturesWithBooleanLabel[Features]]): BinaryClassifierTrainedModel[Features] 13 | } 14 | 15 | trait BinaryClassifierTrainedVectorModel extends Serializable { 16 | def score(vector: Vector): Double 17 | } 18 | 19 | trait BinaryClassifierVectorTrainer { 20 | def train(trainingData: RDD[LabeledPoint]): BinaryClassifierTrainedVectorModel 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/classifiers/BinaryClassifierTrainerWithTransformer.scala: -------------------------------------------------------------------------------- 1 | package sparkz.classifiers 2 | 3 | import org.apache.spark.mllib.linalg._ 4 | import org.apache.spark.mllib.regression.LabeledPoint 5 | import org.apache.spark.rdd.RDD 6 | import sparkz.transformers.FeaturesTransformer 7 | 8 | import scala.reflect.ClassTag 9 | 10 | case object BinaryClassifierTrainerWithTransformer { 11 | def labeledPoint[Features: ClassTag](featuresWithBooleanLabel: FeaturesWithBooleanLabel[Features], 12 | toVector: Features => Vector): LabeledPoint = featuresWithBooleanLabel match { 13 | case featuresWithLabel => 14 | LabeledPoint(if (featuresWithLabel.isTrue) 1.0 else 0.0, toVector(featuresWithLabel.features)) 15 | } 16 | 17 | def apply[Features: ClassTag](vec2classifier: BinaryClassifierVectorTrainer, 18 | transformer: FeaturesTransformer[Features]): BinaryClassifierTrainer[Features] = 19 | new BinaryClassifierTrainer[Features] { 20 | def train(trainingData: RDD[_ <: FeaturesWithBooleanLabel[Features]]): 
BinaryClassifierTrainedModel[Features] = { 21 | val toVector = transformer.featuresToVector(trainingData.map(_.features)) 22 | 23 | val model = vec2classifier.train(trainingData.map(labeledPoint(_, toVector))) 24 | new BinaryClassifierTrainedModel[Features] { 25 | def score(featuresWindow: Features): Double = model.score(toVector(featuresWindow)) 26 | } 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/classifiers/DecisionTreeVectorClassifier.scala: -------------------------------------------------------------------------------- 1 | package sparkz.classifiers 2 | 3 | import org.apache.spark.mllib.linalg.Vector 4 | import org.apache.spark.mllib.regression.LabeledPoint 5 | import org.apache.spark.mllib.tree.DecisionTree 6 | import org.apache.spark.mllib.tree.configuration.FeatureType 7 | import org.apache.spark.mllib.tree.model.Node 8 | import org.apache.spark.rdd.RDD 9 | 10 | case class DecisionTreeClassifierTrainer(impurity: String, 11 | maxDepth: Int, 12 | maxBins: Int) extends BinaryClassifierVectorTrainer { 13 | def train(trainingData: RDD[LabeledPoint]): BinaryClassifierTrainedVectorModel = { 14 | 15 | val model = DecisionTree.trainClassifier( 16 | input = trainingData, 17 | numClasses = 2, 18 | categoricalFeaturesInfo = Map.empty[Int, Int], 19 | impurity = impurity, 20 | maxDepth = maxDepth, 21 | maxBins = maxBins 22 | ) 23 | 24 | val topNode = model.topNode 25 | 26 | new BinaryClassifierTrainedVectorModel { 27 | def score(vector: Vector): Double = { 28 | DecisionTreeInference.predictProb(topNode)(vector) 29 | } 30 | } 31 | } 32 | } 33 | 34 | case object DecisionTreeInference { 35 | def predictProb(node: Node)(features: Vector): Double = { 36 | if (node.isLeaf) { 37 | if (node.predict.predict == 1.0) node.predict.prob else 1.0 - node.predict.prob 38 | } else { 39 | assert(node.split.get.featureType == FeatureType.Continuous) 40 | if (features(node.split.get.feature) <= node.split.get.threshold) { 41 | predictProb(node.leftNode.get)(features) 42 | } else { 43 | predictProb(node.rightNode.get)(features) 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/classifiers/Features.scala: -------------------------------------------------------------------------------- 1 | package sparkz.classifiers 2 | 3 | trait FeaturesWithBooleanLabel[Features] { 4 | def features: Features 5 | def isTrue: Boolean 6 | } 7 | 8 | trait MetaData[MetaData] { 9 | def metaData: MetaData 10 | } 11 | 12 | object EmptyMetaData extends MetaData[Unit] { 13 | def metaData: Unit = () 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/classifiers/RandomBinaryClassifierTrainer.scala: -------------------------------------------------------------------------------- 1 | package sparkz.classifiers 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import scala.util.Random 6 | 7 | case class RandomBinaryClassifierTrainer[Features](seed: Option[Long] = None) extends BinaryClassifierTrainer[Features] { 8 | def train(trainingData: RDD[_ <: FeaturesWithBooleanLabel[Features]]): BinaryClassifierTrainedModel[Features] = { 9 | val random: Option[Random] = seed.map(new Random(_)) 10 | new BinaryClassifierTrainedModel[Features] { 11 | def score(vector: Features): Double = random.getOrElse(new Random()).nextDouble 12 | } 13 | } 14 | } 15 | 
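Any MLlib learner can be adapted to this framework by implementing `BinaryClassifierVectorTrainer`. The following sketch is not part of the library; it assumes the Spark 1.3 MLlib logistic-regression API (`LogisticRegressionWithLBFGS`, `clearThreshold`) and mirrors the way `DecisionTreeClassifierTrainer` wraps `DecisionTree`:

    import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
    import org.apache.spark.mllib.linalg.Vector
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.rdd.RDD
    import sparkz.classifiers.{BinaryClassifierTrainedVectorModel, BinaryClassifierVectorTrainer}

    // Hypothetical adapter (not shipped with sparkz): plugs MLlib logistic regression into the
    // BinaryClassifierVectorTrainer interface consumed by BinaryClassifierTrainerWithTransformer.
    case class LogisticRegressionClassifierTrainer(numIterations: Int = 100) extends BinaryClassifierVectorTrainer {
      def train(trainingData: RDD[LabeledPoint]): BinaryClassifierTrainedVectorModel = {
        val algorithm = new LogisticRegressionWithLBFGS().setNumClasses(2)
        algorithm.optimizer.setNumIterations(numIterations)
        // clearThreshold() makes predict() return the raw class-1 probability instead of a 0/1 label
        val model = algorithm.run(trainingData).clearThreshold()
        new BinaryClassifierTrainedVectorModel {
          def score(vector: Vector): Double = model.predict(vector)
        }
      }
    }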
-------------------------------------------------------------------------------- /src/main/scala/sparkz/evaluation/BinaryClassifierEvaluation.scala: -------------------------------------------------------------------------------- 1 | package sparkz.evaluation 2 | 3 | import org.apache.spark.rdd.RDD 4 | import sparkz.classifiers.{BinaryClassifierTrainer, FeaturesWithBooleanLabel, MetaData} 5 | import sparkz.utils.AppLogger 6 | 7 | import scala.reflect.ClassTag 8 | import scalaz.Scalaz._ 9 | 10 | object BinaryClassifierEvaluation { 11 | def crossValidationScores[Features, Meta, Order: Ordering : ClassTag, UniqueKey: ClassTag] 12 | (data: RDD[FeaturesWithBooleanLabel[Features] with MetaData[Meta]], 13 | k: Int, 14 | classifiers: List[BinaryClassifierTrainer[Features]], 15 | uniqueId: Meta => UniqueKey, 16 | orderingField: Meta => Order, 17 | singleInference: Boolean = true, 18 | seed: Long = 12345L): Map[BinaryClassifierTrainer[Features], RDD[(FeaturesWithBooleanLabel[Features] with MetaData[Meta], Double)]] = 19 | (for { 20 | i <- 0 until k 21 | otherFolds = data.filter(_.metaData |> uniqueId |> (_.hashCode() % k == i)) 22 | holdoutFold = data.filter(_.metaData |> uniqueId |> (_.hashCode() % k != i)) 23 | 24 | splitMax = holdoutFold 25 | .keyBy(_.metaData |> uniqueId) 26 | .mapValues(_.metaData |> orderingField) 27 | .reduceByKey((record1, record2) => Array(record1, record2).max) 28 | .values.min 29 | 30 | Array(splitPoint) = holdoutFold.map(_.metaData |> orderingField).filter(_ < splitMax) 31 | .takeSample(withReplacement = false, num = 1, seed = seed) 32 | 33 | trainingData = otherFolds.filter(_.metaData |> orderingField |> (_ <= splitPoint)) 34 | holdoutFoldAfterSplit = holdoutFold.filter(_.metaData |> orderingField |> (_ > splitPoint)) 35 | 36 | testData = if (singleInference) 37 | holdoutFoldAfterSplit.keyBy(_.metaData |> uniqueId) 38 | .reduceByKey((record1, record2) => Array(record1, record2).minBy(_.metaData |> orderingField)) 39 | .values 40 | else holdoutFoldAfterSplit 41 | 42 | classifier <- classifiers 43 | model = classifier.train(trainingData) 44 | scores = testData.map(testRecord => testRecord -> model.score(testRecord.features)) 45 | } yield classifier -> scores) 46 | .groupBy(_._1).mapValues(_.map(_._2).reduce(_ ++ _)) 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/evaluation/MAP.scala: -------------------------------------------------------------------------------- 1 | package sparkz.evaluation 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.rdd.RDD._ 5 | 6 | import scalaz.Scalaz._ 7 | 8 | object MAP { 9 | def apply[T](n: Int = 100, recommendations: RDD[(Long, List[T])], evaluation: RDD[(Long, Set[T])]): Double = 10 | recommendations.join(evaluation).values.map { 11 | case (recommendedLikes, trueLikes) => recommendedLikes.take(n).zipWithIndex.foldLeft(0, 0.0) { 12 | case ((accLikes, accPrecision), (postId, k)) if trueLikes(postId) => 13 | (accLikes + 1, accPrecision + ((accLikes + 1).toDouble / (k + 1))) 14 | case ((accLikes, accPrecision), _) => (accLikes, accPrecision) 15 | }._2 / math.min(trueLikes.size, n) 16 | } |> (apn => { 17 | val count = apn.count() 18 | if (count > 0) apn.reduce(_ + _) / count else 0 19 | }) 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/transformers/EnsembleTransformer.scala: -------------------------------------------------------------------------------- 1 | package sparkz.transformers 2 | 
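// Editor's sketch (not part of this file): two quick illustrations of the evaluation
// utilities defined just above, kept as comments; `sc`, `data`, MyFeatures and MyMeta are
// assumptions introduced only for illustration.
//
// MAP, worked through for a single user: with recommendations List("a", "b", "c") and true
// likes Set("a", "c"), the hits are "a" at rank 1 (precision 1/1) and "c" at rank 3
// (precision 2/3), so the average precision is (1.0 + 2.0/3.0) / min(2, n) = 0.8333... for
// any n >= 3; MAP is the mean of that value over every user present in both RDDs.
//
//   val recs = sc.parallelize(Seq(1L -> List("a", "b", "c")))
//   val eval = sc.parallelize(Seq(1L -> Set("a", "c")))
//   MAP(n = 3, recommendations = recs, evaluation = eval)  // ~0.8333
//
// BinaryClassifierEvaluation.crossValidationScores hashes uniqueId(metaData) into k folds;
// for each fold i it samples a split point from orderingField, trains every classifier on the
// fold-i records at or before the split, and scores the out-of-fold records strictly after it
// (one score per uniqueId when singleInference is true), returning a map from classifier to
// its (record, score) RDD.
//
//   BinaryClassifierEvaluation.crossValidationScores(
//     data, k = 3,
//     classifiers = List(RandomBinaryClassifierTrainer[MyFeatures]()),
//     uniqueId = (m: MyMeta) => m.userId,
//     orderingField = (m: MyMeta) => m.timestamp)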
3 | import org.apache.spark.mllib.linalg._ 4 | import org.apache.spark.rdd.RDD 5 | 6 | import scala.reflect.ClassTag 7 | 8 | trait FeaturesTransformer[Features] extends Serializable { 9 | def featuresToVector(trainingData: RDD[Features]): Features => Vector 10 | } 11 | 12 | abstract class SubFeaturesTransformer[SubFeatures: ClassTag, Features] extends FeaturesTransformer[Features] { 13 | def subFeatures(features: Features): SubFeatures 14 | 15 | def subFeaturesToVector(trainingData: RDD[SubFeatures]): SubFeatures => Vector 16 | 17 | def featuresToVector(trainingData: RDD[Features]): Features => Vector = { 18 | val toVector = subFeaturesToVector(trainingData.map(subFeatures)) 19 | 20 | (features: Features) => toVector(subFeatures(features)) 21 | } 22 | } 23 | 24 | case object EnsembleTransformer { 25 | def concatenateVectors(v1: Vector, v2: Vector): Vector = (v1, v2) match { 26 | case (v1: SparseVector, v2: SparseVector) => Vectors.sparse( 27 | size = v1.size + v2.size, 28 | indices = v1.indices ++ v2.indices.map(_ + v1.size), 29 | values = v1.values ++ v2.values 30 | ) 31 | case (v1: SparseVector, v2: DenseVector) => Vectors.sparse( 32 | size = v1.size + v2.size, 33 | indices = v1.indices ++ (v1.size until (v1.size + v2.size)), 34 | values = v1.values ++ v2.values 35 | ) 36 | case (v1: DenseVector, v2: SparseVector) => Vectors.sparse( 37 | size = v1.size + v2.size, 38 | indices = (0 until v1.size).toArray ++ v2.indices.map(_ + v1.size), 39 | values = v1.values ++ v2.values 40 | ) 41 | case (v1: DenseVector, v2: DenseVector) => 42 | Vectors.dense(values = v1.values ++ v2.values) 43 | } 44 | } 45 | 46 | case class EnsembleTransformer[Features](subTransformer1: SubFeaturesTransformer[_, Features], 47 | otherSubTransformers: SubFeaturesTransformer[_, Features]*) extends FeaturesTransformer[Features] { 48 | def featuresToVector(trainingData: RDD[Features]): Features => Vector = { 49 | (features: Features) => 50 | (subTransformer1 +: otherSubTransformers).map(_.featuresToVector(trainingData)) 51 | .map(_.apply(features)).reduce(EnsembleTransformer.concatenateVectors) 52 | } 53 | } -------------------------------------------------------------------------------- /src/main/scala/sparkz/transformers/MultiOneHotTransformer.scala: -------------------------------------------------------------------------------- 1 | package sparkz.transformers 2 | 3 | import org.apache.spark.broadcast.Broadcast 4 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.reflect.ClassTag 8 | 9 | case object MultiOneHotTransformer { 10 | def apply[Key: ClassTag: Ordering, Features](attributes: Features => Map[Key, String]) = 11 | new SubFeaturesTransformer[Map[Key, String], Features] { 12 | def subFeatures(features: Features): Map[Key, String] = attributes(features) 13 | def subFeaturesToVector(trainingData: RDD[Map[Key, String]]): (Map[Key, String]) => Vector = { 14 | val attributeToIndexBV: Broadcast[Map[(Key, String), Int]] = trainingData.context.broadcast( 15 | trainingData.flatMap(identity).distinct().collect().sorted.zipWithIndex.toMap 16 | ) 17 | 18 | (attributes: Map[Key, String]) => Vectors.sparse( 19 | size = attributeToIndexBV.value.size, 20 | elements = attributes.toSeq.flatMap(attribute => 21 | attributeToIndexBV.value.get(attribute).map(_ -> 1.0) 22 | )) 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/transformers/OriginalNumericalsTransformer.scala: 
-------------------------------------------------------------------------------- 1 | package sparkz.transformers 2 | 3 | import org.apache.spark.broadcast.Broadcast 4 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.reflect.ClassTag 8 | 9 | case object OriginalNumericalsTransformer { 10 | def apply[Key: ClassTag: Ordering, Features](numericals: Features => Map[Key, Double]) = 11 | new SubFeaturesTransformer[Map[Key, Double], Features] { 12 | def subFeatures(features: Features): Map[Key, Double] = numericals(features) 13 | def subFeaturesToVector(trainingData: RDD[Map[Key, Double]]): (Map[Key, Double]) => Vector = { 14 | val keyToIndexBV: Broadcast[Map[Key, Int]] = trainingData.context.broadcast( 15 | trainingData.flatMap(_.keySet).distinct().collect().sorted.zipWithIndex.toMap 16 | ) 17 | 18 | (numericals: Map[Key, Double]) => Vectors.sparse( 19 | size = keyToIndexBV.value.size, 20 | elements = numericals.toSeq.flatMap { 21 | case (key, value) => keyToIndexBV.value.get(key).map(_ -> value) 22 | }) 23 | } 24 | } 25 | } 26 | 27 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/transformers/TermFrequencyTransformer.scala: -------------------------------------------------------------------------------- 1 | package sparkz.transformers 2 | 3 | import org.apache.spark.broadcast.Broadcast 4 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.reflect.ClassTag 8 | 9 | case object TermFrequencyTransformer { 10 | def apply[Term: ClassTag: Ordering, Features](terms: Features => List[Term]) = new SubFeaturesTransformer[List[Term], Features] { 11 | def subFeatures(features: Features): List[Term] = terms(features) 12 | 13 | def subFeaturesToVector(trainingData: RDD[List[Term]]): (List[Term]) => Vector = { 14 | val termToIndexBV: Broadcast[Map[Term, Int]] = trainingData.context.broadcast( 15 | trainingData.flatMap(identity).distinct().collect().sorted.zipWithIndex.toMap 16 | ) 17 | 18 | (terms: List[Term]) => 19 | Vectors.sparse(termToIndexBV.value.size, terms.groupBy(identity).mapValues(_.size).flatMap { 20 | case (term, count) => termToIndexBV.value.get(term).map(_ -> count.toDouble) 21 | }.toSeq) 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/utils/AppLogger.scala: -------------------------------------------------------------------------------- 1 | package sparkz.utils 2 | 3 | import org.apache.log4j._ 4 | 5 | case class AppLogger(logger: org.apache.log4j.Logger) { 6 | def info(message: String): Unit = info(() => message) 7 | def info(message: () => String): Unit = 8 | if (Level.INFO.isGreaterOrEqual(logger.getEffectiveLevel)) logger.info(message()) 9 | 10 | def warn(message: String): Unit = warn(() => message) 11 | def warn(message: () => String): Unit = 12 | if (Level.WARN.isGreaterOrEqual(logger.getEffectiveLevel)) logger.warn(message()) 13 | 14 | def debug(message: String): Unit = debug(() => message) 15 | def debug(message: () => String): Unit = 16 | if (Level.DEBUG.isGreaterOrEqual(logger.getEffectiveLevel)) logger.debug(message()) 17 | } 18 | 19 | case object AppLogger { 20 | def getLogger(level: Level): AppLogger = { 21 | val logger = org.apache.log4j.Logger.getLogger("FTB") 22 | logger.setLevel(level) 23 | AppLogger(logger) 24 | } 25 | 26 | def infoLevel(): AppLogger = getLogger(Level.INFO) 27 | def warnLevel(): AppLogger = 
getLogger(Level.WARN) 28 | def debugLevel(): AppLogger = getLogger(Level.DEBUG) 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/utils/Pimps.scala: -------------------------------------------------------------------------------- 1 | package sparkz.utils 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.joda.time.DateTime 5 | 6 | import scala.reflect.ClassTag 7 | import scalaz.Scalaz._ 8 | import scalaz.{Failure, Success, ValidationNel} 9 | 10 | case object Pimps { 11 | 12 | implicit class PimpedString(str: String) { 13 | def nonEmptyOption: Option[String] = str.nonEmpty.option(str) 14 | 15 | def nonEmptyOption[T](f: String => T): Option[T] = str.nonEmpty.option(str).map(f) 16 | } 17 | 18 | implicit def longToDateTime(ts: Long): DateTime = new DateTime(ts) 19 | 20 | implicit class PimpedIterable[A](i: Iterable[A]) { 21 | def maxesBy[B](f: A => B)(implicit cmp: Ordering[B]): Set[A] = 22 | if (i.nonEmpty) i.maxBy(f) |> 23 | ((max: A) => i.filter(x => f(x) == f(max)).toSet) 24 | else Set.empty 25 | } 26 | 27 | implicit class PimpedTupledSet[K, V](set: Set[(K, V)]) { 28 | def toMultiMap: Map[K, Set[V]] = set.groupBy(_._1).mapValues(_.map(_._2)) 29 | } 30 | 31 | implicit class PimpedOptionMap[K, V](m: Map[Option[K], V]) { 32 | def flatMapOptionKeys = new PimpedMap(m).flatMapKeys(Option.option2Iterable) 33 | } 34 | 35 | implicit class PimpedMap[K, V](m: Map[K, V]) { 36 | def flatMapKeys[K2](f: (K => Iterable[K2])): Map[K2, V] = m.flatMap { 37 | case (k1, v) => f(k1).map(_ -> v) 38 | } 39 | 40 | def join[V2](that: Map[K, V2]): Map[K, (V, V2)] = 41 | m.flatMap(kv => that.get(kv._1).map(thatV => (kv._1, (kv._2, thatV)))) 42 | 43 | def maxByValue(implicit cmp: Ordering[V]): (K, V) = m.maxByValue(v => v) 44 | 45 | def maxByValue[B](f: (V => B))(implicit cmp: Ordering[B]): (K, V) = m.maxBy(pair => f(pair._2)) 46 | 47 | def maxesByValue(implicit cmp: Ordering[V]): List[(K, V)] = maxesByValue(v => v) 48 | 49 | def maxesByValue[B](f: (V => B))(implicit cmp: Ordering[B]): List[(K, V)] = 50 | m.maxBy(pair => f(pair._2)) |> ((max: (K, V)) => m.filter(_._2 == max._2).toList) 51 | 52 | def minByValue(implicit cmp: Ordering[V]): (K, V) = m.minByValue(v => v) 53 | 54 | def minByValue[B](f: (V => B))(implicit cmp: Ordering[B]): (K, V) = m.minBy(pair => f(pair._2)) 55 | 56 | def minsByValue(implicit cmp: Ordering[V]): List[(K, V)] = minsByValue(v => v) 57 | 58 | def minsByValue[B](f: (V => B))(implicit cmp: Ordering[B]): List[(K, V)] = 59 | m.minBy(pair => f(pair._2)) |> ((max: (K, V)) => m.filter(_._2 == max._2).toList) 60 | } 61 | 62 | implicit class PimpedPairedList[K, V](l: List[(K, V)]) { 63 | def reduceByKey(func: (V, V) => V): Map[K, V] = l.groupBy(_._1).mapValues(_.map(_._2).reduce(func)) 64 | } 65 | 66 | // TODO DRY with insights-engine code by making another shared project of some kind (could open source it!?) 
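// Editor's sketch (not part of the original file): quick examples of the enrichments defined
// above, assuming `import sparkz.utils.Pimps._` is in scope:
//
//   "".nonEmptyOption                                // None
//   "abc".nonEmptyOption(_.length)                   // Some(3)
//   Map("a" -> 3, "b" -> 7, "c" -> 7).maxesByValue   // List(("b", 7), ("c", 7)), all ties kept
//   Set(1 -> "x", 1 -> "y", 2 -> "z").toMultiMap     // Map(1 -> Set("x", "y"), 2 -> Set("z"))
//   Map("a" -> 1, "b" -> 2).join(Map("b" -> true))   // Map("b" -> (2, true))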
67 | implicit class PimpedTupleIterable[T1, T2](l: Iterable[(T1, T2)]) { 68 | def mapTupled[U](f: (T1, T2) => U): Iterable[U] = l.map(f.tupled) 69 | 70 | def flatMapTupled[U](f: (T1, T2) => TraversableOnce[U]): Iterable[U] = l.flatMap(f.tupled) 71 | } 72 | 73 | implicit class PimpedMapDouble[K](m: Map[K, Double]) { 74 | def normalize: Map[K, Double] = m.values.sum |> (total => m.mapValues(_ / total)) 75 | 76 | def productWithMap(m2: Map[K, Double]): Double = { 77 | (for { 78 | (k1, v1) <- m 79 | if m2.contains(k1) 80 | } yield v1 * m2(k1)).sum 81 | } 82 | } 83 | 84 | implicit class PimpedMapInt[K](m: Map[K, Int]) { 85 | def normalize: Map[K, Double] = m.values.sum |> (total => m.mapValues(_.toDouble / total)) 86 | } 87 | 88 | implicit class PimpedSet[T](s: Set[T]) { 89 | def X[U](other: Set[U]): Set[(T, U)] = |@|(other).tupled.toSet 90 | 91 | def |@|[U](other: Set[U]) = s.toList |@| other.toList 92 | } 93 | 94 | implicit class PimpedRDD[T: ClassTag](rdd: RDD[T]) { 95 | def applyIf(condition: Boolean)(f: RDD[T] => RDD[T]): RDD[T] = if (condition) f(rdd) else rdd 96 | 97 | def thenDo[U](f: RDD[T] => U): RDD[T] = f(rdd) |> (_ => rdd) 98 | } 99 | 100 | implicit class PimpedPartialFunction[X, E](pf: PartialFunction[X, E]) { 101 | def toFailureNel[W](x: X, toW: E => W = identity _): ValidationNel[W, X] = 102 | pf.andThen(e => toW(e).failureNel[X]).applyOrElse(x, (_: X).successNel[W]) 103 | 104 | def toFailureNel(x: X): ValidationNel[E, X] = toFailureNel(x, identity) 105 | } 106 | 107 | implicit class PimpedValidationNel[E, X](x1: ValidationNel[E, X]) { 108 | // Extension of scalaz.Validation.+++ operator, does not require the semigroup defined for X 109 | def |+++|(x2: ValidationNel[E, X]) = x1 match { 110 | case Failure(a1) => x2 match { 111 | case Failure(a2) => Failure(a1 append a2) 112 | case Success(b2) => x1 113 | } 114 | case Success(b1) => x2 match { 115 | case b2@Failure(_) => b2 116 | case Success(b2) if b1 == b2 => Success(b1) 117 | case Success(b2) => throw new IllegalArgumentException(s"$b1 not equals to $b2") 118 | } 119 | } 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/utils/StatCounter.scala: -------------------------------------------------------------------------------- 1 | package sparkz.utils 2 | 3 | import scalaz.Monoid 4 | 5 | case object StatCounterMonoid extends Monoid[StatCounter] { 6 | def zero: StatCounter = ZeroStatCounter 7 | 8 | def append(f1: StatCounter, f2: => StatCounter): StatCounter = f1 merge f2 9 | } 10 | 11 | object ZeroStatCounter extends StatCounter(0, 0.0, 0.0, Double.PositiveInfinity, Double.NegativeInfinity) 12 | 13 | case object StatCounter { 14 | def apply(values: TraversableOnce[Double]): StatCounter = values.foldLeft(ZeroStatCounter: StatCounter)(_ merge _) 15 | 16 | def apply(value: Double): StatCounter = apply(List(value)) 17 | 18 | implicit val monoid: Monoid[StatCounter] = StatCounterMonoid 19 | } 20 | 21 | case class StatCounter(n: Long, sum: Double, sos: Double, min: Double, max: Double) { 22 | def merge(other: StatCounter) = StatCounter(n + other.n, sum + other.sum, sos + other.sos, 23 | math.min(min, other.min), math.max(max, other.max)) 24 | 25 | def merge(value: Double) = StatCounter(n + 1, sum + value, sos + (value * value), 26 | math.min(min, value), math.max(max, value)) 27 | 28 | def merge(values: TraversableOnce[Double]): StatCounter = values.foldLeft(this)(_ merge _) 29 | 30 | def count = n 31 | 32 | def mean = sum / n 33 | 34 | def variance = if (n > 1) 
(sos - n * mean * mean) / (n - 1) else Double.NaN 35 | 36 | def stdev = math.sqrt(variance) 37 | 38 | def stderr = stdev / math.sqrt(n) 39 | 40 | override def toString = "(count: %d, mean: %f, stdev: %f, min: %f, max: %f)".format(count, mean, stdev, max, min) 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/utils/TopN.scala: -------------------------------------------------------------------------------- 1 | package sparkz.utils 2 | 3 | import scala.reflect.ClassTag 4 | 5 | object TopElements { 6 | def topN[T: ClassTag](elems: Iterable[T])(scoreFunc: T => Double, n: Int): List[T] = 7 | elems.foldLeft((Set.empty[(T, Double)], Double.MaxValue)) { 8 | case (accumulator@(topElems, minScore), elem) => 9 | val score = scoreFunc(elem) 10 | if (topElems.size < n) 11 | (topElems + (elem -> score), math.min(minScore, score)) 12 | else if (score > minScore) { 13 | val newTopElems = topElems - topElems.minBy(_._2) + (elem -> score) 14 | (newTopElems, newTopElems.map(_._2).min) 15 | } 16 | else accumulator 17 | } 18 | ._1.toList.sortBy(_._2).reverse.map(_._1) 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/sparkz/utils/VPTree.scala: -------------------------------------------------------------------------------- 1 | package sparkz.utils 2 | 3 | import VPTree._ 4 | 5 | import scala.reflect.ClassTag 6 | 7 | case class VPTree[T1: ClassTag, T2: ClassTag](root: Tree[T1, T2], distance: Distance[T1]) { 8 | def nearest(t: T1, maxDist: Double) = root.nearN(t, maxDist, distance) 9 | 10 | def approximateNearest(t: T1): (T1, T2) = root.approxNear(t, distance) 11 | 12 | def approximateNearestN(t: T1, n: Int): Array[(T1, T2)] = root.approxNearN(t, n, distance) 13 | } 14 | 15 | // Adapted version of https://github.com/kaja47/sketches/blob/master/VPTree.scala 16 | case object VPTree { 17 | type Distance[T1] = (T1, T1) => Double 18 | 19 | def apply[T1: ClassTag, T2: ClassTag](items: Array[(T1, T2)], distance: Distance[T1], leafSize: Int): VPTree[T1, T2] = 20 | VPTree(mkNode(items, distance, leafSize), distance) 21 | 22 | sealed trait Tree[T1, T2] { 23 | def size: Int 24 | 25 | def toArray: Array[(T1, T2)] 26 | 27 | def nearN(t: T1, maxDist: Double, distance: Distance[T1]): Array[(T1, T2)] 28 | 29 | def approxNear(t: T1, f: Distance[T1]): (T1, T2) 30 | 31 | def approxNearN(t: T1, n: Int, f: Distance[T1]): Array[(T1, T2)] 32 | } 33 | 34 | case class Node[T1, T2](point: (T1, T2), radius: Double, size: Int, in: Tree[T1, T2], out: Tree[T1, T2]) extends Tree[T1, T2] { 35 | def toArray = in.toArray ++ out.toArray 36 | 37 | def nearN(t: T1, maxDist: Double, distance: Distance[T1]): Array[(T1, T2)] = { 38 | val d = distance(t, point._1) 39 | if (d + maxDist < radius) { 40 | in.nearN(t, maxDist, distance) 41 | } else if (d - maxDist >= radius) { 42 | out.nearN(t, maxDist, distance) 43 | } else { 44 | in.nearN(t, maxDist, distance) ++ out.nearN(t, maxDist, distance) 45 | } 46 | } 47 | 48 | def approxNear(t: T1, f: Distance[T1]): (T1, T2) = { 49 | val d = f(point._1, t) 50 | if (d < radius) in.approxNear(t, f) 51 | else out.approxNear(t, f) 52 | } 53 | 54 | def approxNearN(t: T1, n: Int, f: Distance[T1]): Array[(T1, T2)] = 55 | if (n <= 0) Array.empty 56 | else if (n > size) toArray 57 | else { 58 | val d = f(point._1, t) 59 | if (d < radius) { 60 | in.approxNearN(t, n, f) ++ out.approxNearN(t, n - in.size, f) 61 | } else { 62 | out.approxNearN(t, n, f) ++ in.approxNearN(t, n - out.size, f) 63 | } 64 | } 
65 | } 66 | 67 | case class Leaf[T1: ClassTag, T2: ClassTag](points: Array[(T1, T2)]) extends Tree[T1, T2] { 68 | def size = points.length 69 | 70 | def toArray = points 71 | 72 | def approxNear(t: T1, distance: Distance[T1]): (T1, T2) = points.minBy(p => distance(t, p._1)) 73 | 74 | def approxNearN(t: T1, n: Int, distance: Distance[T1]): Array[(T1, T2)] = 75 | if (n <= 0) Array.empty 76 | else if (n >= size) points 77 | else points.sortBy(p => distance(p._1, t)).take(n) 78 | 79 | def nearN(t: T1, maxDist: Double, distance: Distance[T1]): Array[(T1, T2)] = 80 | points.filter(p => distance(t, p._1) <= maxDist) 81 | } 82 | 83 | def mkNode[T1: ClassTag, T2: ClassTag](items: Array[(T1, T2)], distance: Distance[T1], leafSize: Int): Tree[T1, T2] = { 84 | if (items.length <= leafSize) 85 | Leaf[T1, T2](items) 86 | else { 87 | val vp = items(util.Random.nextInt(items.length)) 88 | 89 | val radius = { 90 | val numSamples = math.sqrt(items.length).floor * 2 91 | val distances = pickSample(items, numSamples.toInt).map(i => distance(vp._1, i._1)) 92 | distances.sortBy(identity).apply(distances.length / 2) 93 | } 94 | 95 | val (in, out) = items partition (item => distance(item._1, vp._1) < radius) 96 | 97 | if (in.length == 0) Leaf[T1, T2](out) 98 | else if (out.length == 0) Leaf[T1, T2](in) 99 | else Node(vp, radius, items.length, mkNode(in, distance, leafSize), mkNode(out, distance, leafSize)) 100 | } 101 | } 102 | 103 | def pickSample[T1, T2](items: Array[(T1, T2)], size: Int): Array[(T1, T2)] = 104 | if (items.length <= size) items 105 | else Array.fill(size)(items(util.Random.nextInt(items.length))) 106 | } 107 | --------------------------------------------------------------------------------
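The StatCounter and TopElements utilities a little further up are both single-pass folds: StatCounter is a scalaz Monoid, so partial counters (for example one per partition) merge associatively, and TopElements.topN keeps a bounded candidate set while scanning. A small usage sketch; the object name and input values are illustrative only.

import sparkz.utils.{StatCounter, TopElements}

object UtilsSketch extends App {
  // Fold a collection of doubles into summary statistics
  val stats = StatCounter(List(1.0, 4.0, 4.0, 9.0))
  println((stats.count, stats.mean, stats.stdev, stats.min, stats.max))  // (4, 4.5, ~3.32, 1.0, 9.0)

  // Partial counters merge associatively, e.g. one per partition of an RDD
  val merged = StatCounter(List(1.0, 4.0)) merge StatCounter(List(4.0, 9.0))
  println(merged.mean)  // 4.5

  // Top 2 strings by length, tracked in a single fold over the input
  println(TopElements.topN(List("a", "bbb", "cc", "dddd"))(_.length.toDouble, n = 2))  // List(dddd, bbb)
}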
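The VPTree above indexes points under an arbitrary metric and supports exact range queries (nearest) plus cheap approximate lookups that descend only one side of most splits. A minimal sketch, assuming one-dimensional Double points under the absolute-difference metric; the object name and sample data are illustrative only.

import sparkz.utils.VPTree

object VPTreeSketch extends App {
  // Index the integers 1..100 as Double points; the payload is just a label
  val items: Array[(Double, String)] = (1 to 100).map(i => i.toDouble -> s"point-$i").toArray
  val tree = VPTree(items, (a: Double, b: Double) => math.abs(a - b), 8)  // leafSize = 8

  // Exact: every point within distance 2.5 of the query, i.e. points 40 to 44 (order unspecified)
  val near = tree.nearest(42.0, maxDist = 2.5)

  // Approximate: usually, but not always, the true nearest neighbour and the true top five
  val one  = tree.approximateNearest(42.2)
  val five = tree.approximateNearestN(42.2, n = 5)

  println((near.length, one, five.length))  // (5, (42.0, "point-42") in most runs, 5)
}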