├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── build.sbt ├── fp4ml-main ├── README.md ├── build.sbt └── src │ ├── main │ └── scala │ │ └── mlbigbook │ │ ├── app │ │ └── Exp20NG.scala │ │ ├── math │ │ ├── Argmax.scala │ │ ├── Argmin.scala │ │ ├── BaseMathVecOps.scala │ │ ├── Dense.scala │ │ ├── MathVectorOps.scala │ │ ├── NumericConversion.scala │ │ ├── RandoMut.scala │ │ ├── Sparse.scala │ │ ├── Val.scala │ │ ├── VectorOps.scala │ │ └── package.scala │ │ ├── ml │ │ ├── ClassificationModule.scala │ │ ├── ClusteringConf.scala │ │ ├── ClusteringModule.scala │ │ ├── CustomHashMap.scala │ │ ├── Hashable.scala │ │ ├── ItemNumVecModule.scala │ │ ├── Kmeans.scala │ │ ├── KnnClassifier.scala │ │ ├── NearestNeighbors.scala │ │ ├── OLD_KnnClassifier.scala │ │ └── RankingModule.scala │ │ └── util │ │ └── package.scala │ └── test │ ├── resources │ └── log4j.properties │ └── scala │ └── mlbigbook │ ├── math │ ├── AbstractMathVectorOpsT.scala │ ├── AbstractMvoFractionalT.scala │ ├── MathVectorOpsDenseDoubleTest.scala │ ├── MathVectorOpsDenseFloatTest.scala │ ├── MathVectorOpsDenseIntTest.scala │ ├── MathVectorOpsDenseLongTest.scala │ ├── MathVectorOpsSparseDoubleTest.scala │ ├── MathVectorOpsSparseFloatTest.scala │ ├── MathVectorOpsSparseIntTest.scala │ └── MathVectorOpsSparseLongTest.scala │ └── ml │ ├── AddressData.scala │ ├── KmeansTest.scala │ ├── KnnClassifierTest.scala │ ├── KnnLshClassifierTest.scala │ ├── NearestNeighborsLSHTest.scala │ ├── NearestNeighborsTest.scala │ └── OLDKMeansTest.scala ├── fp4ml-spark ├── README.md ├── build.sbt └── src │ └── main │ └── scala │ └── TODO └── project ├── SharedBuild.scala ├── build.properties └── plugins.sbt /.gitignore: -------------------------------------------------------------------------------- 1 | index-*.html 2 | src/main/resources/tessdata 3 | LOG_* 4 | OUTPUT_* 5 | 6 | *.class 7 | *.log 8 | *.swp 9 | project/target 10 | .idea* 11 | *.DS_Store 12 | 13 | # sbt specific 14 | .cache/ 15 | .history/ 16 | .lib/ 17 | 
dist/* 18 | target/ 19 | lib_managed/ 20 | src_managed/ 21 | project/boot/ 22 | project/plugins/project/ 23 | 24 | # Scala-IDE specific 25 | .scala_dependencies 26 | .worksheet 27 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.11.8 4 | jdk: 5 | - oraclejdk8 6 | script: "sbt test" 7 | after_success: "sbt coverage test coveralls" 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright [2016] [Malcolm W. Greaves] 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License.Copyright [yyyy] [name of copyright owner] 14 | 15 | Licensed under the Apache License, Version 2.0 (the "License"); 16 | you may not use this file except in compliance with the License. 17 | You may obtain a copy of the License at 18 | 19 | http://www.apache.org/licenses/LICENSE-2.0 20 | 21 | Unless required by applicable law or agreed to in writing, software 22 | distributed under the License is distributed on an "AS IS" BASIS, 23 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 24 | See the License for the specific language governing permissions and 25 | limitations under the License. 
26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fp4ml 2 | [![Build Status](https://travis-ci.org/malcolmgreaves/fp4ml.svg?branch=master)](https://travis-ci.org/malcolmgreaves/fp4ml) [![Coverage Status](https://coveralls.io/repos/malcolmgreaves/fp4ml/badge.svg?branch=master&service=github)](https://coveralls.io/github/malcolmgreaves/fp4ml?branch=master) 3 | [![Codacy Badge](http://api.codacy.com:80/project/badge/7a4fbaf2cbe6449993224d6eb4df0f13)](https://www.codacy.com/app/greavesmalcolm/fp4ml) [![Stories in Ready](https://badge.waffle.io/malcolmgreaves/fp4ml.png?label=ready&title=Ready)](https://waffle.io/malcolmgreaves/fp4ml) [![Join the chat at https://gitter.im/malcolmgreaves/fp4ml](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/malcolmgreaves/fp4ml?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![Maven Central](https://maven-badges.herokuapp.com/maven-central/io.malcolmgreaves/fp4ml-scala_2.11/badge.svg?style=plastic)](https://maven-badges.herokuapp.com/maven-central/io.malcolmgreaves/fp4ml-scala_2.11) 4 | 5 | Machine learning for functional programmers. 6 | 7 | # Project Structure 8 | 9 | This repository is split into subprojects: 10 | 11 | * [fp4ml-main](https://github.com/malcolmgreaves/fp4ml/tree/master/fp4ml-core) 12 | * The meat and potatoes of the fp4ml project. Includes: 13 | * learning algorithms 14 | * abstractions 15 | * data structures 16 | * experiment frameworks 17 | * evaluation metrics 18 | * model definitions, formats 19 | * Depends on 3rd party libraries including: 20 | * [`data-tc`](https://github.com/malcolmgreaves/data-tc) 21 | * `shapeless` 22 | * `spire` 23 | 24 | * [fp4ml-spark](https://github.com/malcolmgreaves/fp4ml/tree/master/fp4ml-spark) 25 | * An extension of `fp4ml-main` to use elements from the Apache Spark ecosystem. 
26 | 27 | # Legal 28 | 29 | The original author retains copyright over all material contained within this repository. Use of this code is governed under the terms of the Apache 2.0 open source software license. See the [LICENSE](./LICENSE) file for more details. 30 | 31 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "fp4ml" 2 | organization in ThisBuild := "io.malcolmgreaves" 3 | version in ThisBuild := { 4 | val major: Int = 0 5 | val minor: Int = 0 6 | val patch: Int = 0 7 | s"$major.$minor.$patch" 8 | } 9 | 10 | import SharedBuild._ 11 | 12 | lazy val root = project 13 | .in(file(".")) 14 | .aggregate( 15 | `fp4ml-main`, 16 | `fp4ml-spark` 17 | ) 18 | .settings { 19 | publishArtifact := false 20 | publishLocal := {} 21 | publish := {} 22 | } 23 | 24 | lazy val `fp4ml-main` = project.in(file("fp4ml-main")).settings { 25 | publishArtifact := true 26 | } 27 | 28 | lazy val `fp4ml-spark` = 29 | project.in(file("fp4ml-spark")).dependsOn(`fp4ml-main`).settings { 30 | publishArtifact := true 31 | } 32 | 33 | lazy val subprojects: Seq[ProjectReference] = root.aggregate 34 | lazy val publishTasks = subprojects.map { publish.in } 35 | 36 | resolvers in ThisBuild := Seq( 37 | // sonatype, maven central 38 | "Sonatype Releases" at "https://oss.sonatype.org/content/repositories/releases/", 39 | "Sonatype Snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/", 40 | // bintray 41 | "Scalaz Bintray" at "http://dl.bintray.com/scalaz/releases", 42 | Resolver.bintrayRepo("mfglabs", "maven"), 43 | Resolver.bintrayRepo("dwhjames", "maven"), 44 | // etc. 
45 | "Confluent" at "http://packages.confluent.io/maven/" 46 | ) 47 | 48 | lazy val javaV = "1.8" 49 | scalaVersion in ThisBuild := "2.11.8" 50 | scalacOptions in ThisBuild := Seq( 51 | "-optimize", 52 | "-deprecation", 53 | "-feature", 54 | "-unchecked", 55 | s"-target:jvm-$javaV", 56 | "-encoding", 57 | "utf8", 58 | "-language:postfixOps", 59 | "-language:existentials", 60 | "-language:higherKinds", 61 | "-language:implicitConversions", 62 | "-language:experimental.macros", 63 | "-language:reflectiveCalls", 64 | "-Yno-adapted-args", 65 | "-Ywarn-value-discard", 66 | "-Yinline-warnings", 67 | "-Xlint", 68 | "-Xfuture", 69 | "-Ywarn-dead-code", 70 | "-Xfatal-warnings" // Every warning is esclated to an error. 71 | ) 72 | javacOptions in ThisBuild := Seq("-source", javaV, "-target", javaV) 73 | javaOptions in ThisBuild := Seq( 74 | "-server", 75 | "-XX:+AggressiveOpts", 76 | "-XX:+TieredCompilation", 77 | "-XX:CompileThreshold=100", 78 | "-Xmx3000M", 79 | "-XX:+UseG1GC" 80 | ) 81 | 82 | publishArtifact := false 83 | -------------------------------------------------------------------------------- /fp4ml-main/README.md: -------------------------------------------------------------------------------- 1 | # fp4ml-main 2 | [Maven Central](https://maven-badges.herokuapp.com/maven-central/io.malcolmgreaves/fp4ml-main_2.11/badge.svg?style=plastic)](https://maven-badges.herokuapp.com/maven-central/io.malcolmgreaves/fp4ml-main_2.11) 3 | Machine learning for functional programmers. 4 | A library of machine learning algorithms. Implemented from the get-go using principles of functional programming, all algorithm APIs are referentially transparent and stateless. Additionally, algoritghms are implemented in a maximally general way, with the objective to "write once, run anywhere". 
-------------------------------------------------------------------------------- /fp4ml-main/build.sbt: -------------------------------------------------------------------------------- 1 | name := "fp4ml-main" 2 | 3 | import SharedBuild._ 4 | 5 | addCompilerPlugin(scalaMacros) 6 | 7 | libraryDependencies ++= 8 | fp4mlMainDeps ++ 9 | testDeps 10 | 11 | // 12 | // test, runtime settings 13 | // 14 | fork in run := true 15 | fork in Test := true 16 | parallelExecution in Test := true 17 | 18 | pomExtra := pomExtraInfo 19 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/app/Exp20NG.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.app 2 | 3 | import java.io.{FileReader, BufferedReader, File} 4 | import java.nio.charset.Charset 5 | import java.nio.file.Files 6 | 7 | import breeze.linalg.{SparseVector, DenseVector} 8 | import mlbigbook.math.MathVectorOps 9 | import mlbigbook.ml.{ImplicitHashable, KnnClassifier} 10 | 11 | import scala.io.Source 12 | import scala.util.Random 13 | 14 | object Exp20NG extends App { 15 | 16 | lazy val normalizeLine: String => String = 17 | s => s.trim.toLowerCase 18 | 19 | lazy val filterLine: String => Boolean = 20 | s => 21 | s.nonEmpty && 22 | headerPrefixes.forall { !s.startsWith(_) } && 23 | headerSuffixes.forall { !s.endsWith(_) } 24 | 25 | lazy val labelTransform: String => String = 26 | label => { 27 | val i = label.indexOf(".") 28 | if (i >= 0) 29 | label.substring(0, i) 30 | else 31 | label 32 | } 33 | 34 | lazy val headerPrefixes: Seq[String] = 35 | """ 36 | |Xref: 37 | |Path: 38 | |From: 39 | |Newsgroups: 40 | |Subject: 41 | |Summary: 42 | |Keywords: 43 | |Message-ID: 44 | |Date: 45 | |Expires: 46 | |Followup-To: 47 | |Distribution: 48 | |Organization: 49 | |Approved: 50 | |Supersedes: 51 | |Lines: 52 | |Archive-name: 53 | |Alt-atheism-archive-name: 54 | |Last-modified: 55 | |Version: 56 | |-----BEGIN 
PGP SIGNED MESSAGE----- 57 | |In article 58 | |From article 59 | |> 60 | |>> 61 | |References: 62 | |Email: 63 | |Sender: 64 | |NNTP-posting-host 65 | |NNTP-posting-user 66 | |-- 67 | |: >: 68 | |: > 69 | | > 70 | |: 71 | |< 72 | """.stripMargin.trim.toLowerCase.split { "\n" }.toSeq 73 | 74 | lazy val headerSuffixes: Seq[String] = 75 | """ 76 | |writes: 77 | |.com 78 | """.stripMargin.trim.toLowerCase.split { "\n" }.toSeq 79 | 80 | lazy val ngDirectory = new File("./20_newsgroups") 81 | println( 82 | s"Loading 20 Newsgroup Data from:\n${ngDirectory.getCanonicalPath}\n") 83 | 84 | import scala.collection.JavaConverters._ 85 | lazy val loadNgFi: File => Seq[String] = 86 | fi => 87 | if (fi isFile) { 88 | val br = new BufferedReader(new FileReader(fi)) 89 | val buf = new scala.collection.mutable.ArrayBuffer[String](420) 90 | var line: String = br.readLine() 91 | while (line != null) { 92 | buf.append(line) 93 | line = br.readLine() 94 | } 95 | buf.toSeq 96 | }.map { normalizeLine }.filter { filterLine } else 97 | Seq.empty 98 | 99 | lazy val loadNgData: File => Seq[(File, Seq[String])] = 100 | f => { 101 | if (f.isDirectory) { 102 | Option(f.listFiles()).map { _.toSeq }.getOrElse { Seq.empty }.flatMap { 103 | loadNgData 104 | } 105 | 106 | } else if (f.isFile) 107 | Seq((f, loadNgFi(f))) 108 | else 109 | Seq.empty 110 | } 111 | 112 | // // // // // // // // // // // // // // // // // // // // // // // // // // 113 | // 114 | // S C R I P T 115 | // 116 | // // // // // // // // // // // // // // // // // // // // // // // // // // 117 | 118 | lazy val ng20 = ngDirectory.listFiles.filter(_ != null).toSeq 119 | println(s"There are ${ng20.size} newsgroup directories") 120 | 121 | val newsgroup2fileandcontent = 122 | ng20.map { ngDir => 123 | println(s"loading data from the ${ngDir.getName} newsgroup ... 
") 124 | val bothFiLines = loadNgData(ngDir) 125 | (ngDir.getName, bothFiLines) 126 | }.toMap 127 | 128 | type Document = String 129 | 130 | import ImplicitHashable._ 131 | import fif.ImplicitCollectionsData._ 132 | lazy val knn = KnnClassifier[Document, String, Float, SparseVector]( 133 | MathVectorOps.Implicits.FloatSparseVot, 134 | representsNoLabel = "" 135 | ) 136 | 137 | val stringVectorizer: knn.Vectorizer = new { 138 | 139 | val word2index: Map[String, Int] = { 140 | val words = { 141 | for { 142 | (_, data) <- newsgroup2fileandcontent 143 | (_, lines) <- data 144 | line <- lines 145 | word <- line.split(" ") 146 | } yield word 147 | }.toSet 148 | 149 | println(s"There are ${words.size} unique words") 150 | 151 | words.zipWithIndex.toMap 152 | } 153 | 154 | lazy val vectorize = (s: Document) => 155 | SparseVector[Float](word2index.size)({ 156 | val bothIndexValue = s.split(" ").foldLeft(Map.empty[Int, Float]) { 157 | case (accum, word) => 158 | if (word2index contains word) { 159 | val index = word2index(word) 160 | if (accum.contains(index)) 161 | (accum - index) + (index -> (accum(index) + 1.0f)) 162 | else 163 | accum + (index -> 1.0f) 164 | 165 | } else 166 | accum 167 | } 168 | 169 | bothIndexValue.map { 170 | case (index, count) => (index, math.log(count).toFloat) 171 | }.toSeq 172 | }: _*) 173 | 174 | lazy val nDimensions = word2index.size 175 | } 176 | 177 | val distance: knn.Distance = (v1, v2) => { 178 | val r = knn.vops.subV(v1, v2) 179 | knn.vops.dot(r, r) 180 | } 181 | 182 | val allLabeledData: Seq[(Document, String)] = for { 183 | (ng, bothFiAndData) <- newsgroup2fileandcontent.toSeq 184 | (_, lines) <- bothFiAndData 185 | } yield (lines.mkString("\n"), labelTransform(ng)) 186 | 187 | println(s"total labeled data size: ${allLabeledData.size}") 188 | 189 | val (train, test): (Seq[(Document, String)], Seq[(Document, String)]) = { 190 | 191 | val shuffled: Seq[(Document, String)] = allLabeledData.map { x => 192 | (x, math.random) 193 | }.sortBy { 
case (_, rando) => rando }.map { case (x, _) => x } 194 | 195 | val si = (shuffled.size * .9).toInt 196 | 197 | ( 198 | shuffled.slice(0, si), 199 | shuffled.slice(si + 1, shuffled.size) 200 | ) 201 | } 202 | 203 | println(s"building kNN on ${train.size} examples") 204 | val classifier = knn.train((5, distance), stringVectorizer)(train) 205 | 206 | val nTake = 25 207 | println(s"grabbing $nTake random test example (from ${test.size} documents)") 208 | 209 | var nCorrect = 0 210 | test.take(nTake).foreach { 211 | case (testDoc, testLabel) => 212 | val predicted = classifier(testDoc) 213 | println(s"predicted: $predicted actual: $testLabel") 214 | if (predicted == testLabel) 215 | nCorrect += 1 216 | } 217 | println( 218 | s"\n\nAccuracy: $nCorrect / $nTake = ${(nCorrect.toFloat / nTake.toFloat) * 100.0f} %") 219 | } 220 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/math/Argmax.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.math 2 | 3 | import fif.Data 4 | 5 | import scala.reflect.ClassTag 6 | 7 | /** 8 | * Generic algorithm for finding the maximal argument. Uses the `Val` 9 | * type class as evidence of an argument's value. 10 | */ 11 | object Argmax { 12 | 13 | import Data.ops._ 14 | 15 | /** 16 | * Finds the maximal argument of `elements` in linear time. Uses the `Val` 17 | * type class as evidence of an argument's value. 18 | * 19 | * throws IllegalArgumentException Iff `elements` is empty. 
20 | */ 21 | def apply[T: Val: ClassTag, D[_]: Data](elements: D[T]): Option[T] = 22 | if (elements isEmpty) 23 | None 24 | else 25 | Some(applyUnsafe(elements)) 26 | 27 | def applyUnsafe[T: Val: ClassTag, D[_]: Data](elements: D[T]): T = { 28 | val v = Val[T] 29 | elements.reduce { 30 | case (a, b) => 31 | if (v.n.lt(v.valueOf(a), v.valueOf(b))) 32 | a 33 | else 34 | b 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/math/Argmin.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.math 2 | 3 | import fif.Data 4 | 5 | import scala.reflect.ClassTag 6 | 7 | object Argmin { 8 | 9 | def apply[T: Val: ClassTag, D[_]: Data](elements: D[T]): Option[T] = 10 | Argmax(elements)( 11 | Val.inverse, 12 | implicitly[ClassTag[T]], 13 | implicitly[Data[D]] 14 | ) 15 | 16 | def applyUnsafe[T: Val: ClassTag, D[_]: Data](elements: D[T]): T = 17 | Argmax.applyUnsafe(elements)( 18 | Val.inverse, 19 | implicitly[ClassTag[T]], 20 | implicitly[Data[D]] 21 | ) 22 | } 23 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/math/BaseMathVecOps.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.math 2 | 3 | import breeze.math.Semiring 4 | import breeze.storage.Zero 5 | 6 | import scala.language.higherKinds 7 | 8 | private[math] abstract class BaseMathVecOps[Num, V[_]]( 9 | implicit no: Fractional[Num], 10 | zo: Zero[Num], 11 | so: Semiring[Num] 12 | ) extends MathVectorOps[V] { 13 | 14 | final override type N = Num 15 | 16 | override final implicit lazy val n = no 17 | override final implicit lazy val z = zo 18 | override final implicit lazy val s = so 19 | 20 | } 21 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/math/Dense.scala: 
-------------------------------------------------------------------------------- 1 | package mlbigbook.math 2 | 3 | import breeze.math.Semiring 4 | import breeze.linalg.{Vector, DenseVector} 5 | import breeze.linalg.operators._ 6 | import breeze.storage.Zero 7 | import mlbigbook.util 8 | import spire.syntax.cfor._ 9 | 10 | import scala.language.{higherKinds, implicitConversions} 11 | import scala.reflect.ClassTag 12 | 13 | /** 14 | * Base partial implementation for DenseVectors. Implements the MathVectorOps 15 | * methods for the DenseVector type. Also defines the zeros, ones methds 16 | * of MathVectorOps. 17 | */ 18 | protected abstract class Dense[ 19 | @specialized Num: Fractional: Zero: Semiring: ClassTag] 20 | extends BaseMathVecOps[Num, DenseVector] { 21 | 22 | override final def foreach[A](v: DenseVector[A])(f: A => Any): Unit = 23 | v.foreach(f) 24 | 25 | override final def zeros(size: Int): DenseVector[N] = 26 | DenseVector.zeros[N](size) 27 | 28 | override final def ones(size: Int): DenseVector[N] = 29 | DenseVector.ones[N](size)( 30 | implicitly[ClassTag[N]], 31 | implicitly[Semiring[N]] 32 | ) 33 | 34 | override final def fill[A: ClassTag: Zero](size: Int)(value: => A) = 35 | DenseVector.fill(size)(value) 36 | 37 | override final def toSeq[A: ClassTag](v: DenseVector[A]): Seq[A] = 38 | util.copyToSeq(v.toArray) 39 | 40 | override final def size(v: DenseVector[_]): Int = 41 | v.length 42 | 43 | override final def apply[A](v: DenseVector[A])(index: Int): A = 44 | v(index) 45 | 46 | override final def map[B: ClassTag: Fractional: Zero](v: DenseVector[N])( 47 | f: N => B): DenseVector[B] = 48 | v.map(f) 49 | 50 | override final def reduce[B >: N: ClassTag](v: DenseVector[N])(r: (B, 51 | B) => B) = 52 | v.reduceLeft(r) 53 | 54 | override final def fold[B: ClassTag](v: DenseVector[N])(zero: B)( 55 | combine: (B, N) => B) = 56 | v.valuesIterator.foldLeft(zero)(combine) 57 | 58 | override final def copy(v: DenseVector[N]) = { 59 | val src = v.toArray 60 | val 
size = src.length 61 | val cpy = new Array[N](size) 62 | System.arraycopy(src, 0, cpy, 0, size) 63 | DenseVector(cpy) 64 | } 65 | } 66 | 67 | /** 68 | * Implementation for DenseVector[Double]. 69 | */ 70 | object DoubleDenseMathVector extends Dense[Double] { 71 | override val addV = DenseVector.dv_dv_Op_Double_OpAdd 72 | override val addS = DenseVector.dv_s_Op_Double_OpAdd 73 | override val subV = DenseVector.dv_dv_Op_Double_OpSub 74 | override val subS = DenseVector.dv_s_Op_Double_OpSub 75 | override val dot = 76 | new OpMulInner.Impl2[DenseVector[Double], DenseVector[Double], Double] { 77 | 78 | def apply(a: DenseVector[Double], b: DenseVector[Double]) = { 79 | require(b.length == a.length, "Vectors must be the same length!") 80 | // val boff = 81 | // if (b.stride >= 0) b.offset 82 | // else b.offset + b.stride * (b.length - 1) 83 | // val aoff = 84 | // if (a.stride >= 0) a.offset 85 | // else a.offset + a.stride * (a.length - 1) 86 | // BLAS.getInstance().sdot( 87 | // a.length, b.data, boff, b.stride, a.data, aoff, a.stride 88 | // ) 89 | // TODO : Do we need to take into consideration ({a,b}.{stride,offset}) 90 | // into account here? 91 | var agg = 0.0 92 | cfor(0)(_ < a.length, _ + 1) { i => 93 | agg += a(i) * b(i) 94 | } 95 | agg 96 | } 97 | 98 | implicitly[BinaryRegistry[Vector[Double], 99 | Vector[Double], 100 | OpMulInner.type, 101 | Double]].register(this) 102 | } 103 | 104 | override val divS = DenseVector.dv_s_Op_Double_OpDiv 105 | override val mulS = DenseVector.dv_s_Op_Double_OpMulScalar 106 | override val divV = DenseVector.dv_dv_Op_Double_OpDiv 107 | override val mulV = DenseVector.dv_dv_Op_Double_OpMulScalar 108 | } 109 | 110 | /** 111 | * Implementation for DenseVector[Float]. 
112 | */ 113 | object FloatDenseMathVector extends Dense[Float] { 114 | override val addV = DenseVector.dv_dv_Op_Float_OpAdd 115 | override val addS = DenseVector.dv_s_Op_Float_OpAdd 116 | override val subV = DenseVector.dv_dv_Op_Float_OpSub 117 | override val subS = DenseVector.dv_s_Op_Float_OpSub 118 | override val dot = DenseVector.canDot_DV_DV_Float 119 | override val divS = DenseVector.dv_s_Op_Float_OpDiv 120 | override val mulS = DenseVector.dv_s_Op_Float_OpMulScalar 121 | override val divV = DenseVector.dv_dv_Op_Float_OpDiv 122 | override val mulV = DenseVector.dv_dv_Op_Float_OpMulScalar 123 | } 124 | 125 | /** 126 | * Implementation for DenseVector[Long]. 127 | */ 128 | object LongDenseMathVector extends Dense[Long] { 129 | override val addV = DenseVector.dv_dv_Op_Long_OpAdd 130 | override val addS = DenseVector.dv_s_Op_Long_OpAdd 131 | override val subV = DenseVector.dv_dv_Op_Long_OpSub 132 | override val subS = DenseVector.dv_s_Op_Long_OpSub 133 | override val dot = DenseVector.canDot_DV_DV_Long 134 | override val divS = DenseVector.dv_s_Op_Long_OpDiv 135 | override val mulS = DenseVector.dv_s_Op_Long_OpMulScalar 136 | override val divV = DenseVector.dv_dv_Op_Long_OpDiv 137 | override val mulV = DenseVector.dv_dv_Op_Long_OpMulScalar 138 | } 139 | 140 | /** 141 | * Implementation for DenseVector[Int]. 
142 | */ 143 | object IntDenseMathVector extends Dense[Int] { 144 | override val addV = DenseVector.dv_dv_Op_Int_OpAdd 145 | override val addS = DenseVector.dv_s_Op_Int_OpAdd 146 | override val subV = DenseVector.dv_dv_Op_Int_OpSub 147 | override val subS = DenseVector.dv_s_Op_Int_OpSub 148 | override val dot = DenseVector.canDot_DV_DV_Int 149 | override val divS = DenseVector.dv_s_Op_Int_OpDiv 150 | override val mulS = DenseVector.dv_s_Op_Int_OpMulScalar 151 | override val divV = DenseVector.dv_dv_Op_Int_OpDiv 152 | override val mulV = DenseVector.dv_dv_Op_Int_OpMulScalar 153 | } 154 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/math/MathVectorOps.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.math 2 | 3 | import breeze.math.Semiring 4 | import breeze.linalg.operators._ 5 | import breeze.storage.Zero 6 | 7 | import scala.language.{higherKinds, implicitConversions} 8 | import scala.reflect.ClassTag 9 | 10 | /** 11 | * An abstraction specifying operations one may perform using vectors and 12 | * scalar values. These operations include element-wise & scalar 13 | * multiplication, division, addition, and subtraction. Support for the dot 14 | * product of two vectors is also included. As well as methods to construct new 15 | * vector instances. 16 | */ 17 | trait MathVectorOps[V[_]] extends VectorOps[V] { 18 | 19 | type N 20 | implicit val n: Fractional[N] 21 | implicit val z: Zero[N] 22 | implicit val s: Semiring[N] 23 | 24 | /** 25 | * Creates a new vector of the input size where each element has value 0. 26 | */ 27 | def zeros(size: Int): V[N] 28 | 29 | /** 30 | * Creates a new vector of the input size where each element has value 1. 
31 | */ 32 | def ones(size: Int): V[N] 33 | 34 | protected lazy val zero = implicitly[Fractional[N]].zero 35 | protected lazy val one = implicitly[Fractional[N]].one 36 | 37 | /** 38 | * Change every element of a vector V using the function f. 39 | * No side effects. 40 | */ 41 | def map[B: ClassTag: Fractional: Zero](v: V[N])(f: N => B): V[B] 42 | 43 | /** 44 | * Apply a binary combination operator, r, to pairs of elements from the 45 | * input vector, v. Note that the output of r shall be applied to both 46 | * vector elements as well as other, previous outputs from r. The order of 47 | * execution is not guaranteed. Therefore, it is important that r is 48 | * associative and communiative. 49 | */ 50 | def reduce[A1 >: N: ClassTag](v: V[N])(r: (A1, A1) => A1): A1 51 | 52 | /** 53 | * From the starting value, zero, applies the function combine to elements 54 | * of the input vector v. This method evaluates to the final accumulated 55 | * value of this operation across all elements of the vector. Execution 56 | * order is not guaranteed, so combine must be side-effect free, 57 | * associative, and communicative. 58 | */ 59 | def fold[B: ClassTag](v: V[N])(zero: B)(combine: (B, N) => B): B 60 | 61 | /** 62 | * Create a new vector of the input size where each element has the value v. 63 | */ 64 | def fill[A: ClassTag: Zero](size: Int)(v: => A): V[A] 65 | 66 | /** 67 | * Performs a shallow copy of the vector's contents. Each element is copied 68 | * to a newly allocated vector of type V[N]. If N is a primitive or other 69 | * value type, then this will be a deep copy. Otherwise, the reference will 70 | * be copied. 71 | */ 72 | def copy(v: V[N]): V[N] 73 | 74 | /** 75 | * Performs element-wise addition of two vectors. 76 | */ 77 | val addV: OpAdd.Impl2[V[N], V[N], V[N]] 78 | 79 | /** 80 | * Adds a scalar to each element of a vector. 81 | */ 82 | val addS: OpAdd.Impl2[V[N], N, V[N]] 83 | 84 | /** 85 | * Performs element-wise subtraction of two vectors. 
86 | */ 87 | val subV: OpSub.Impl2[V[N], V[N], V[N]] 88 | 89 | /** 90 | * Subtracts a scalar from each element of a vector. 91 | */ 92 | val subS: OpSub.Impl2[V[N], N, V[N]] 93 | 94 | /** 95 | * Performs a dot product operation between two vectors, 96 | * which results in a scalar. 97 | */ 98 | val dot: OpMulInner.Impl2[V[N], V[N], N] 99 | 100 | /** 101 | * Performs element-wise multiplication between two vectors. 102 | */ 103 | val mulV: OpMulScalar.Impl2[V[N], V[N], V[N]] 104 | 105 | /** 106 | * Multiplies each vector element by a scalar. 107 | */ 108 | val mulS: OpMulScalar.Impl2[V[N], N, V[N]] 109 | 110 | /** 111 | * Performs element-wise division between two vectors. 112 | */ 113 | val divV: OpDiv.Impl2[V[N], V[N], V[N]] 114 | 115 | /** 116 | * Divides each vector element by a scalar. 117 | */ 118 | val divS: OpDiv.Impl2[V[N], N, V[N]] 119 | 120 | } 121 | 122 | object MathVectorOps { 123 | 124 | type Type[Num, Vec[_]] = MathVectorOps[Vec] { 125 | type N = Num 126 | } 127 | 128 | object Implicits { 129 | // dense operations 130 | implicit val DoubleDenseVot = DoubleDenseMathVector 131 | implicit val FloatDenseVot = FloatDenseMathVector 132 | implicit val LongDenseVot = LongDenseMathVector 133 | implicit val IntDenseVot = IntDenseMathVector 134 | // sparse operations 135 | implicit val DoubleSparseVot = DoubleSparseMathVector 136 | implicit val FloatSparseVot = FloatSparseMathVector 137 | implicit val LongSparseVot = LongSparseMathVector 138 | implicit val IntSparseVot = IntSparseMathVector 139 | } 140 | 141 | } 142 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/math/NumericConversion.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.math 2 | 3 | import scala.reflect.ClassTag 4 | 5 | /** 6 | * Typeclass supporting conversions between primitive types, with the 7 | * constraint that the primitive has Numeric evidence. 
8 | */ 9 | sealed abstract class NumericConversion[@specialized N] { 10 | 11 | def fromInt(i: Int): N 12 | 13 | def fromLong(l: Long): N 14 | 15 | def fromDouble(d: Double): N 16 | 17 | def fromByte(b: Byte): N 18 | 19 | def fromShort(s: Short): N 20 | 21 | def fromFloat(f: Float): N 22 | 23 | implicit def ct: ClassTag[N] 24 | } 25 | 26 | object NumericConversion { 27 | 28 | def apply[N: NumericConversion]: NumericConversion[N] = 29 | implicitly[NumericConversion[N]] 30 | 31 | /** 32 | * Implicit NumericConversion instances for every primitive numeric type: 33 | * float, long, double, int, short, byte 34 | */ 35 | object Implicits { 36 | 37 | implicit case object FloatC extends NumericConversion[Float] { 38 | override implicit val ct: ClassTag[Float] = ClassTag(classOf[Float]) 39 | override def fromInt(l: Int) = l.toFloat 40 | override def fromLong(l: Long) = l.toFloat 41 | override def fromShort(s: Short) = s.toFloat 42 | override def fromByte(b: Byte) = b.toFloat 43 | override def fromDouble(d: Double) = d.toFloat 44 | override def fromFloat(f: Float) = f 45 | } 46 | 47 | implicit case object LongC extends NumericConversion[Long] { 48 | override implicit val ct: ClassTag[Long] = ClassTag(classOf[Long]) 49 | override def fromInt(l: Int) = l.toLong 50 | override def fromLong(l: Long) = l 51 | override def fromShort(s: Short) = s.toLong 52 | override def fromByte(b: Byte) = b.toLong 53 | override def fromDouble(d: Double) = d.toLong 54 | override def fromFloat(f: Float) = f.toLong 55 | } 56 | 57 | implicit case object DoubleC extends NumericConversion[Double] { 58 | override implicit val ct: ClassTag[Double] = ClassTag(classOf[Double]) 59 | override def fromInt(l: Int) = l.toDouble 60 | override def fromLong(l: Long): Double = l.toDouble 61 | override def fromShort(s: Short): Double = s.toDouble 62 | override def fromByte(b: Byte): Double = b.toDouble 63 | override def fromDouble(d: Double): Double = d 64 | override def fromFloat(f: Float): Double = f.toDouble 65 | 
} 66 | 67 | implicit case object IntC extends NumericConversion[Int] { 68 | override implicit val ct: ClassTag[Int] = ClassTag(classOf[Int]) 69 | override def fromInt(l: Int) = l.toInt 70 | override def fromLong(l: Long) = l.toInt 71 | override def fromShort(s: Short) = s.toInt 72 | override def fromByte(b: Byte) = b.toInt 73 | override def fromDouble(d: Double) = d.toInt 74 | override def fromFloat(f: Float) = f.toInt 75 | } 76 | 77 | implicit case object ShortC extends NumericConversion[Short] { 78 | override implicit val ct: ClassTag[Short] = ClassTag(classOf[Short]) 79 | override def fromInt(l: Int) = l.toShort 80 | override def fromLong(l: Long) = l.toShort 81 | override def fromShort(s: Short) = s 82 | override def fromByte(b: Byte) = b.toShort 83 | override def fromDouble(d: Double) = d.toShort 84 | override def fromFloat(f: Float) = f.toShort 85 | } 86 | 87 | implicit case object ByteC extends NumericConversion[Byte] { 88 | override implicit val ct: ClassTag[Byte] = ClassTag(classOf[Byte]) 89 | override def fromInt(l: Int) = l.toByte 90 | override def fromLong(l: Long) = l.toByte 91 | override def fromShort(s: Short) = s.toByte 92 | override def fromByte(b: Byte) = b 93 | override def fromDouble(d: Double) = d.toByte 94 | override def fromFloat(f: Float) = f.toByte 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/math/RandoMut.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.math 2 | 3 | import scala.util.Random 4 | 5 | abstract class RandoMut[N: Fractional] { 6 | def next(): N 7 | } 8 | 9 | object RandoMut { 10 | 11 | def newSeedPerCall[N: Fractional: NumericConversion]: () => RandoMut[N] = 12 | () => fromSeed(Random.nextLong()) 13 | 14 | def fromSeed[N: Fractional: NumericConversion](seed: Long): RandoMut[N] = { 15 | val r = new Random(seed) 16 | new RandoMut[N] { 17 | override def next() = 
NumericConversion[N].fromDouble(r.nextDouble())
    }
  }

}
--------------------------------------------------------------------------------
/fp4ml-main/src/main/scala/mlbigbook/math/Sparse.scala:
--------------------------------------------------------------------------------
package mlbigbook.math

import breeze.linalg.SparseVector
import breeze.linalg.operators._
import breeze.math.Semiring
import breeze.storage.Zero
import spire.syntax.cfor._

import scala.language.higherKinds
import scala.reflect.ClassTag

/**
 * Base partial implementation for SparseVectors. Implements the MathVectorOps
 * methods for the SparseVector type. Also defines the zeros, ones, and fill
 * methods of MathVectorOps.
 */
protected abstract class Sparse[
    @specialized Num: Fractional: Zero: Semiring: ClassTag]
    extends BaseMathVecOps[Num, SparseVector] {

  override final def foreach[A](v: SparseVector[A])(f: A => Any) =
    v.foreach(f)

  override final def zeros(size: Int) =
    SparseVector.zeros[N](size)

  override final def ones(size: Int) =
    SparseVector.fill(size)(one)

  override final def fill[A: ClassTag: Zero](size: Int)(value: => A) =
    SparseVector.fill(size)(value)

  // NOTE: densifies the vector — materializes every logical element,
  // including the implicit zeros, into a fresh array.
  override final def toSeq[A: ClassTag](v: SparseVector[A]) = {
    val values = new Array[A](v.length)
    cfor(0)(_ < values.length, _ + 1) { i =>
      values(i) = v(i)
    }
    values.toSeq
  }

  override final def size(v: SparseVector[_]): Int =
    v.length

  override final def apply[A](v: SparseVector[A])(index: Int) =
    v(index)

  import SparseVector._

  override final def map[B: ClassTag: Fractional: Zero](v: SparseVector[N])(
      f: N => B) =
    v.map(f)

  override final def reduce[A1 >: N: ClassTag](v: SparseVector[N])(
      r: (A1, A1) => A1) =
    v.reduceLeft(r)

  // Folds over stored (active) values only — implicit zeros are not visited.
  override final def fold[B: ClassTag](v: SparseVector[N])(zero: B)(
      combine: (B, N) => B) =
    v.valuesIterator.foldLeft(zero)(combine)

  override final def copy(v: SparseVector[N]) =
    v.copy
}

/**
 * Implementation for SparseVector[Double].
 */
object DoubleSparseMathVector extends Sparse[Double] {
  override val subS =
    new OpSub.Impl2[SparseVector[Double], Double, SparseVector[Double]] {
      override def apply(v: SparseVector[Double], v2: Double) = v.map {
        _ - v2
      }
    }
  override val subV = SparseVector.implOps_SVT_SVT_eq_SVT_Double_OpSub

  override val addS =
    new OpAdd.Impl2[SparseVector[Double], Double, SparseVector[Double]] {
      override def apply(v: SparseVector[Double], v2: Double) = v.map {
        _ + v2
      }
    }
  override val addV = SparseVector.implOps_SVT_SVT_eq_SVT_Double_OpAdd

  override val dot =
    new OpMulInner.Impl2[SparseVector[Double], SparseVector[Double], Double] {
      override def apply(v: SparseVector[Double], v2: SparseVector[Double]) =
        v.dot(v2)
    }

  override val divS =
    new OpDiv.Impl2[SparseVector[Double], Double, SparseVector[Double]] {
      override def apply(v: SparseVector[Double], v2: Double) = v.map {
        _ / v2
      }
    }
  override val divV = SparseVector.implOps_SVT_SVT_eq_SVT_Double_OpDiv

  override val mulS =
    new OpMulScalar.Impl2[SparseVector[Double], Double, SparseVector[Double]] {
      override def apply(v: SparseVector[Double], v2: Double) = v.map {
        _ * v2
      }
    }
  override val mulV = SparseVector.implOpMulScalar_SVT_SVT_eq_SVT_Double
}

/**
 * Implementation for SparseVector[Float].
110 | */ 111 | object FloatSparseMathVector extends Sparse[Float] { 112 | override val subS = 113 | new OpSub.Impl2[SparseVector[Float], Float, SparseVector[Float]] { 114 | override def apply(v: SparseVector[Float], v2: Float) = v.map { _ - v2 } 115 | } 116 | override val subV = SparseVector.implOps_SVT_SVT_eq_SVT_Float_OpSub 117 | 118 | override val addS = 119 | new OpAdd.Impl2[SparseVector[Float], Float, SparseVector[Float]] { 120 | override def apply(v: SparseVector[Float], v2: Float) = v.map { _ + v2 } 121 | } 122 | override val addV = SparseVector.implOps_SVT_SVT_eq_SVT_Float_OpAdd 123 | 124 | override val dot = 125 | new OpMulInner.Impl2[SparseVector[Float], SparseVector[Float], Float] { 126 | override def apply(v: SparseVector[Float], v2: SparseVector[Float]) = 127 | v.dot(v2) 128 | } 129 | 130 | override val divS = 131 | new OpDiv.Impl2[SparseVector[Float], Float, SparseVector[Float]] { 132 | override def apply(v: SparseVector[Float], v2: Float) = v.map { _ / v2 } 133 | } 134 | override val divV = SparseVector.implOps_SVT_SVT_eq_SVT_Float_OpDiv 135 | 136 | override val mulS = 137 | new OpMulScalar.Impl2[SparseVector[Float], Float, SparseVector[Float]] { 138 | override def apply(v: SparseVector[Float], v2: Float) = v.map { _ * v2 } 139 | } 140 | override val mulV = SparseVector.implOpMulScalar_SVT_SVT_eq_SVT_Float 141 | } 142 | 143 | /** 144 | * Implementation for SparseVector[Long]. 
145 | */ 146 | object LongSparseMathVector extends Sparse[Long] { 147 | override val subS = 148 | new OpSub.Impl2[SparseVector[Long], Long, SparseVector[Long]] { 149 | override def apply(v: SparseVector[Long], v2: Long) = v.map { _ - v2 } 150 | } 151 | override val subV = SparseVector.implOps_SVT_SVT_eq_SVT_Long_OpSub 152 | 153 | override val addS = 154 | new OpAdd.Impl2[SparseVector[Long], Long, SparseVector[Long]] { 155 | override def apply(v: SparseVector[Long], v2: Long) = v.map { _ + v2 } 156 | } 157 | override val addV = SparseVector.implOps_SVT_SVT_eq_SVT_Long_OpAdd 158 | 159 | override val dot = 160 | new OpMulInner.Impl2[SparseVector[Long], SparseVector[Long], Long] { 161 | override def apply(v: SparseVector[Long], v2: SparseVector[Long]) = 162 | v.dot(v2) 163 | } 164 | 165 | override val divS = 166 | new OpDiv.Impl2[SparseVector[Long], Long, SparseVector[Long]] { 167 | override def apply(v: SparseVector[Long], v2: Long) = v.map { _ / v2 } 168 | } 169 | override val divV = SparseVector.implOps_SVT_SVT_eq_SVT_Long_OpDiv 170 | 171 | override val mulS = 172 | new OpMulScalar.Impl2[SparseVector[Long], Long, SparseVector[Long]] { 173 | override def apply(v: SparseVector[Long], v2: Long) = v.map { _ * v2 } 174 | } 175 | override val mulV = SparseVector.implOpMulScalar_SVT_SVT_eq_SVT_Long 176 | } 177 | 178 | /** 179 | * Implementation for SparseVector[Int]. 
180 | */ 181 | object IntSparseMathVector extends Sparse[Int] { 182 | override val subS = 183 | new OpSub.Impl2[SparseVector[Int], Int, SparseVector[Int]] { 184 | override def apply(v: SparseVector[Int], v2: Int) = v.map { _ - v2 } 185 | } 186 | override val subV = SparseVector.implOps_SVT_SVT_eq_SVT_Int_OpSub 187 | 188 | override val addS = 189 | new OpAdd.Impl2[SparseVector[Int], Int, SparseVector[Int]] { 190 | override def apply(v: SparseVector[Int], v2: Int) = v.map { _ + v2 } 191 | } 192 | override val addV = SparseVector.implOps_SVT_SVT_eq_SVT_Int_OpAdd 193 | 194 | override val dot = 195 | new OpMulInner.Impl2[SparseVector[Int], SparseVector[Int], Int] { 196 | override def apply(v: SparseVector[Int], v2: SparseVector[Int]) = 197 | v.dot(v2) 198 | } 199 | 200 | override val divS = 201 | new OpDiv.Impl2[SparseVector[Int], Int, SparseVector[Int]] { 202 | override def apply(v: SparseVector[Int], v2: Int) = v.map { _ / v2 } 203 | } 204 | override val divV = SparseVector.implOps_SVT_SVT_eq_SVT_Int_OpDiv 205 | 206 | override val mulS = 207 | new OpMulScalar.Impl2[SparseVector[Int], Int, SparseVector[Int]] { 208 | override def apply(v: SparseVector[Int], v2: Int) = v.map { _ * v2 } 209 | } 210 | override val mulV = SparseVector.implOpMulScalar_SVT_SVT_eq_SVT_Int 211 | } 212 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/math/Val.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.math 2 | 3 | /** Type class for giving a value to a type `X`. 
*/ 4 | trait Val[-X] { 5 | 6 | type N 7 | implicit def n: Numeric[N] 8 | 9 | def valueOf(a: X): N 10 | } 11 | 12 | object Val { 13 | 14 | def apply[V: Val]: Val[V] = implicitly[Val[V]] 15 | 16 | def inverse[V: Val]: Val[V] = { 17 | val original = Val[V] 18 | new Val[V] { 19 | override type N = original.N 20 | override lazy val n = original.n 21 | @inline override def valueOf(a: V) = 22 | original.n.minus(original.n.zero, original.valueOf(a)) 23 | } 24 | } 25 | 26 | object Implicits { 27 | 28 | implicit def identityVal[X: Numeric] = { 29 | val evidence = implicitly[Numeric[X]] 30 | new Val[X] { 31 | override type N = X 32 | override implicit lazy val n = evidence 33 | @inline override def valueOf(a: X) = a 34 | } 35 | } 36 | 37 | implicit def tupleValIn1st[Num: Numeric, X] = { 38 | val evidence = implicitly[Numeric[Num]] 39 | new Val[(Num, X)] { 40 | override type N = Num 41 | override implicit lazy val n = evidence 42 | @inline override def valueOf(a: (Num, X)) = a._1 43 | } 44 | } 45 | 46 | implicit def tupleValIn2nd[X, Num: Numeric] = { 47 | val evidence = implicitly[Numeric[Num]] 48 | new Val[(X, Num)] { 49 | override type N = Num 50 | override implicit lazy val n = evidence 51 | @inline override def valueOf(a: (X, Num)) = a._2 52 | } 53 | } 54 | 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/math/VectorOps.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.math 2 | 3 | import simulacrum.typeclass 4 | 5 | import scala.language.higherKinds 6 | import scala.reflect.ClassTag 7 | 8 | /** 9 | * An abstraction specifying operations one may perform using vectors and 10 | * scalar values. These operations include element-wise & scalar 11 | * multiplication, division, addition, and subtraction. Support for the dot 12 | * product of two vectors is also included. As well as methods to construct new 13 | * vector instances. 
14 | */ 15 | @typeclass 16 | trait VectorOps[V[_]] { 17 | 18 | def apply[A](v: V[A])(index: Int): A 19 | 20 | def toSeq[A: ClassTag](v: V[A]): Seq[A] 21 | 22 | def size(v: V[_]): Int 23 | 24 | def foreach[A](v: V[A])(f: A => Any): Unit 25 | 26 | } 27 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/math/package.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook 2 | 3 | package object math { 4 | 5 | implicit val intIsFractional = new Fractional[Int] { 6 | @inline override def div(x: Int, y: Int) = x / y 7 | @inline override def toDouble(x: Int) = x.toDouble 8 | @inline override def plus(x: Int, y: Int) = x + y 9 | @inline override def toFloat(x: Int) = x.toFloat 10 | @inline override def toInt(x: Int) = x 11 | @inline override def negate(x: Int) = -x 12 | @inline override def fromInt(x: Int) = x 13 | @inline override def toLong(x: Int) = x.toLong 14 | @inline override def times(x: Int, y: Int) = x * y 15 | @inline override def minus(x: Int, y: Int) = x - y 16 | @inline override def compare(x: Int, y: Int) = x - y 17 | } 18 | 19 | implicit val longIsFractional = new Fractional[Long] { 20 | @inline override def div(x: Long, y: Long) = x / y 21 | @inline override def toDouble(x: Long) = x.toDouble 22 | @inline override def plus(x: Long, y: Long) = x + y 23 | @inline override def toFloat(x: Long) = x.toFloat 24 | @inline override def toInt(x: Long) = x.toInt 25 | @inline override def negate(x: Long) = -x 26 | @inline override def fromInt(x: Int) = x.toLong 27 | @inline override def toLong(x: Long) = x 28 | @inline override def times(x: Long, y: Long) = x * y 29 | @inline override def minus(x: Long, y: Long) = x - y 30 | @inline override def compare(x: Long, y: Long) = (x - y).toInt 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- 
/fp4ml-main/src/main/scala/mlbigbook/ml/ClassificationModule.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.ml 2 | 3 | import fif.Data 4 | 5 | import scala.annotation.tailrec 6 | import scala.language.{higherKinds, postfixOps, reflectiveCalls} 7 | import scala.reflect.ClassTag 8 | 9 | trait ClassificationModule extends ItemNumVecModule { 10 | 11 | type Label 12 | val emptyLabel: Label 13 | 14 | type Classifier = Item => Label 15 | 16 | type Vectorizer = { 17 | val vectorize: Item => V[N] 18 | val nDimensions: Int 19 | } 20 | 21 | type Conf 22 | 23 | import Data.ops._ 24 | 25 | final def train[D[_]: Data]( 26 | c: Conf, 27 | mkVectorizer: D[(Item, Label)] => Vectorizer 28 | )(data: D[(Item, Label)]): Classifier = 29 | train( 30 | c, 31 | mkVectorizer { data } 32 | )(data) 33 | 34 | def train[D[_]: Data]( 35 | c: Conf, 36 | toVec: Vectorizer 37 | )(data: D[(Item, Label)]): Classifier 38 | 39 | } 40 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/ml/ClusteringConf.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.ml 2 | 3 | case class ClusteringConf( 4 | nClusters: Int, 5 | tolerance: Double, 6 | maxIterations: Int 7 | ) 8 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/ml/ClusteringModule.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.ml 2 | 3 | import fif.Data 4 | 5 | import scala.language.{postfixOps, higherKinds, reflectiveCalls} 6 | 7 | trait ClusteringModule extends ItemNumVecModule { 8 | 9 | type Vectorizer = { 10 | val vectorize: Item => V[N] 11 | val nDimensions: Int 12 | } 13 | 14 | type Distance = (V[N], V[N]) => N 15 | 16 | case class Center(id: String, mean: V[N]) 17 | 18 | final def cluster[D[_]: Data]( 19 | conf: ClusteringConf, 
20 | dist: Distance, 21 | mkVectorizer: D[Item] => Vectorizer 22 | )(data: D[Item]): Seq[Center] = 23 | cluster(conf, dist, mkVectorizer(data))(data) 24 | 25 | def cluster[D[_]: Data]( 26 | conf: ClusteringConf, 27 | dist: Distance, 28 | toVec: Vectorizer 29 | )(data: D[Item]): Seq[Center] 30 | 31 | import Data.ops._ 32 | 33 | final def assign[D[_]: Data]( 34 | centers: Seq[Center], 35 | distance: Distance, 36 | vectorizer: Vectorizer 37 | )( 38 | data: D[Item] 39 | ): D[String] = 40 | assign(centers, distance)( 41 | data map { vectorizer.vectorize } 42 | ) 43 | 44 | final def assign[D[_]: Data]( 45 | centers: Seq[Center], 46 | distance: Distance 47 | )( 48 | data: D[V[N]] 49 | ): D[String] = 50 | if (centers isEmpty) 51 | data map { _ => 52 | "" 53 | } else if (centers.size == 1) { 54 | val label = centers.head.id 55 | data map { _ => 56 | label 57 | } 58 | 59 | } else { 60 | 61 | val lessThan = implicitly[Numeric[N]].lt _ 62 | val restCents = centers.slice(1, centers.size) 63 | 64 | data map { v => 65 | val (nearestLabel, _) = 66 | restCents.foldLeft(centers.head.id, distance(centers.head.mean, v)) { 67 | 68 | case (currChampion @ (minLabel, minDistance), center) => 69 | val distToCenter = distance(center.mean, v) 70 | if (lessThan(distToCenter, minDistance)) 71 | (center.id, distToCenter) 72 | else 73 | currChampion 74 | } 75 | 76 | nearestLabel 77 | } 78 | } 79 | 80 | } 81 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/ml/CustomHashMap.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.ml 2 | 3 | import scala.annotation.tailrec 4 | 5 | final class CustomHashMap[K: Hashable, V]( 6 | private[this] val hashedKey2val: Map[Int, V], 7 | private[this] val hashedKeys: List[K] 8 | ) extends Map[K, V] { 9 | 10 | override def +[B1 >: V](kv: (K, B1)): Map[K, B1] = { 11 | val (key, value) = kv 12 | val id = implicitly[Hashable[K]].hash(key) 13 
| 14 | if (hashedKey2val contains id) 15 | new CustomHashMap( 16 | (hashedKey2val - id) + (id -> value), 17 | hashedKeys 18 | ) 19 | else 20 | new CustomHashMap( 21 | hashedKey2val + (id -> value), 22 | hashedKeys :+ key 23 | ) 24 | } 25 | 26 | override def get(key: K): Option[V] = { 27 | val id = implicitly[Hashable[K]].hash(key) 28 | hashedKey2val.get(id) 29 | } 30 | 31 | override def iterator: Iterator[(K, V)] = 32 | hashedKeys.toIterator.map { key => 33 | val id = implicitly[Hashable[K]].hash(key) 34 | (key, hashedKey2val(id)) 35 | } 36 | 37 | override def -(key: K): Map[K, V] = { 38 | val id = implicitly[Hashable[K]].hash(key) 39 | if (hashedKey2val contains id) 40 | new CustomHashMap( 41 | hashedKey2val - id, 42 | remove(id, hashedKeys, Nil) 43 | ) 44 | else 45 | this 46 | } 47 | 48 | @tailrec 49 | private[this] def remove( 50 | idOfKeyToRemove: Int, 51 | before: List[K], 52 | remaining: List[K] 53 | ): List[K] = 54 | remaining match { 55 | 56 | case anotherKey :: restOfList => 57 | val idOfAnother = implicitly[Hashable[K]].hash(anotherKey) 58 | if (idOfAnother == idOfKeyToRemove) 59 | before ++ restOfList 60 | else 61 | remove(idOfKeyToRemove, before :+ anotherKey, restOfList) 62 | 63 | case Nil => 64 | before 65 | } 66 | 67 | } 68 | 69 | object CustomHashMap { 70 | 71 | def empty[K: Hashable, V]: Map[K, V] = 72 | new CustomHashMap[K, V](Map.empty[Int, V], Nil) 73 | 74 | } 75 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/ml/Hashable.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.ml 2 | 3 | import simulacrum._ 4 | 5 | @typeclass 6 | trait Hashable[T] { 7 | def hash(t: T): Int 8 | } 9 | 10 | object ImplicitHashable { 11 | 12 | implicit val bIsH: Hashable[Boolean] = new Hashable[Boolean] { 13 | @inline override def hash(t: Boolean) = if (t) 1 else 0 14 | } 15 | 16 | implicit val iIsH: Hashable[Int] = new Hashable[Int] { 17 | 
@inline override def hash(t: Int) = t 18 | } 19 | 20 | implicit val sIsH: Hashable[String] = new Hashable[String] { 21 | @inline override def hash(t: String) = t.hashCode 22 | } 23 | 24 | implicit def optIsH[T: Hashable]: Hashable[Option[T]] = 25 | new Hashable[Option[T]] { 26 | import Hashable.ops._ 27 | 28 | @inline override def hash(maybeT: Option[T]) = maybeT match { 29 | case Some(t) => t.hash 30 | case None => 0 31 | } 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/ml/ItemNumVecModule.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.ml 2 | 3 | import breeze.math.Semiring 4 | import breeze.storage.Zero 5 | import mlbigbook.math.MathVectorOps 6 | 7 | import scala.language.{higherKinds, postfixOps, reflectiveCalls} 8 | import scala.reflect.ClassTag 9 | 10 | trait ItemNumVecModule { 11 | 12 | type Item 13 | type N 14 | type V[_] 15 | 16 | // vops serves as a type class for numeric vector operations 17 | // having an instance of type MathVectorOps[N,V] implies constraints on N and V 18 | val vops: MathVectorOps.Type[N, V] 19 | 20 | // we can get these type classes for N 21 | implicit lazy final val nFrac: Fractional[N] = vops.n 22 | implicit lazy final val nSr: Semiring[N] = vops.s 23 | implicit lazy final val nZero: Zero[N] = vops.z 24 | 25 | // Class tag support for abstract types 26 | implicit val ctN: ClassTag[N] 27 | implicit val ctI: ClassTag[Item] 28 | // support for the numerical vector type 29 | implicit val ctVn: ClassTag[V[N]] 30 | 31 | } 32 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/ml/Kmeans.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.ml 2 | 3 | import fif.Data 4 | import mlbigbook.math.{MathVectorOps, RandoMut} 5 | 6 | import scala.annotation.tailrec 7 | 
import scala.language.{higherKinds, reflectiveCalls} 8 | import scala.reflect.ClassTag 9 | 10 | trait Kmeans extends ClusteringModule { 11 | 12 | /** Creates a pseudo-random number generator for the type N. */ 13 | val mkRandomNumGen: () => RandoMut[N] 14 | 15 | // Brings in the Data type class operations as methods "accessible" using 16 | // familiar object dot notation. 17 | // i.e. `data.map` instead of `implicitly[Data[D]].map(data)` 18 | import Data.ops._ 19 | 20 | override final def cluster[D[_]: Data]( 21 | conf: ClusteringConf, 22 | dist: Distance, 23 | toVec: Vectorizer 24 | )(data: D[Item]): Seq[Center] = 25 | cluster_h( 26 | conf, 27 | dist, 28 | toVec, 29 | 0, 30 | data map { toVec.vectorize }, 31 | initialize(conf.nClusters, toVec.nDimensions) 32 | ) 33 | 34 | final def initialize( 35 | nClusters: Int, 36 | nDimensions: Int 37 | ): Seq[Center] = { 38 | val r = mkRandomNumGen() 39 | (0 until nClusters).map { id => 40 | Center( 41 | id = id.toString, 42 | mean = vops.map(vops.ones(nDimensions)) { one => 43 | vops.n.times(one, r.next()) 44 | } 45 | ) 46 | }.toSeq 47 | } 48 | 49 | @tailrec 50 | private[this] final def cluster_h[D[_]: Data]( 51 | conf: ClusteringConf, 52 | dist: Distance, 53 | toVec: Vectorizer, 54 | currIter: Int, 55 | data: D[V[N]], 56 | currCenters: Seq[Center] 57 | ): Seq[Center] = 58 | if (currIter >= conf.maxIterations) 59 | currCenters 60 | else { 61 | 62 | val updatedCenters = updateCenters(dist, toVec, currCenters, data) 63 | 64 | println( 65 | s"""[center check: currIter=$currIter] 66 | |[ORIGINAL # ${currCenters.size}] ${currCenters.mkString("\t")} 67 | |[UPDATED # ${updatedCenters.size}] ${updatedCenters.mkString( 68 | "\t")} 69 | """.stripMargin 70 | ) 71 | 72 | val sumSquaredChangeInMeansBetweenIters = 73 | currCenters.zip(updatedCenters).foldLeft(0.0) { 74 | case (accum, (existing, updated)) => 75 | val d = math.abs( 76 | implicitly[Numeric[N]].toDouble( 77 | dist(existing.mean, updated.mean) 78 | ) 79 | ) 80 | accum + d 81 | 
} 82 | 83 | if (sumSquaredChangeInMeansBetweenIters < conf.tolerance) 84 | updatedCenters 85 | else 86 | cluster_h( 87 | conf, 88 | dist, 89 | toVec, 90 | currIter + 1, 91 | data, 92 | updatedCenters 93 | ) 94 | } 95 | 96 | def updateCenters[D[_]: Data]( 97 | dist: Distance, 98 | toVec: Vectorizer, 99 | centers: Seq[Center], 100 | data: D[V[N]] 101 | ): Seq[Center] = 102 | data 103 | .zip(assign(centers, dist)(data)) 104 | .groupBy { case (_, assignment) => assignment } 105 | .map { 106 | case (label, bothDataAndLabel) => 107 | val summed = 108 | bothDataAndLabel.foldLeft(vops.zeros(toVec.nDimensions)) { 109 | case (summing, (vector, _)) => 110 | vops.addV(summing, vector) 111 | } 112 | 113 | val newMean = 114 | vops.divS( 115 | summed, 116 | implicitly[Numeric[N]].fromInt(bothDataAndLabel.size) 117 | ) 118 | 119 | Center( 120 | id = label, 121 | mean = newMean 122 | ) 123 | } 124 | .toSeq 125 | 126 | } 127 | 128 | object Kmeans { 129 | 130 | type Type[ItemToCluster, Num, Vec[_]] = Kmeans { 131 | type Item = ItemToCluster 132 | type N = Num 133 | type V[_] = Vec[_] 134 | } 135 | 136 | def apply[ItemToCluster, Num, Vec[_]]( 137 | mathVectorOps: MathVectorOps.Type[Num, Vec], 138 | mkRando: () => RandoMut[Num] 139 | )( 140 | implicit ctForI: ClassTag[ItemToCluster], 141 | ctForN: ClassTag[Num], 142 | ctForVn: ClassTag[Vec[Num]] 143 | ): Type[ItemToCluster, Num, Vec] = { 144 | 145 | // val okVops: MathVectorOps.Type[Type[ItemToCluster, Num, Vec]#N, Type[ItemToCluster, Num, Vec]#V] = 146 | // mathVectorOps 147 | // mathVectorOps.asInstanceOf[MathVectorOps.Type[Type[ItemToCluster, Num, Vec]#N, Type[ItemToCluster, Num, Vec]#V]] 148 | 149 | // val okCtVn: ClassTag[Type[ItemToCluster, Num, Vec]#V[Type[ItemToCluster, Num, Vec]#N]] = 150 | // ctForVn 151 | // ctForVn.asInstanceOf[ClassTag[Type[ItemToCluster, Num, Vec]#V[Type[ItemToCluster, Num, Vec]#N]]] 152 | 153 | new Kmeans { 154 | 155 | override type Item = ItemToCluster 156 | override type N = Num 157 | override type 
V[_] = Vec[_] 158 | 159 | override lazy val mkRandomNumGen = mkRando 160 | override lazy val vops = 161 | mathVectorOps.asInstanceOf[MathVectorOps.Type[N, V]] 162 | 163 | override implicit lazy val ctI = ctForI 164 | override implicit lazy val ctN = ctForN 165 | override implicit lazy val ctVn = ctForVn.asInstanceOf[ClassTag[V[N]]] 166 | } 167 | } 168 | 169 | } 170 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/ml/KnnClassifier.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.ml 2 | 3 | import fif.Data 4 | import mlbigbook.math.{MathVectorOps, Val, Argmax} 5 | 6 | import scala.language.{higherKinds, postfixOps, reflectiveCalls} 7 | import scala.reflect.ClassTag 8 | 9 | trait KnnClassifier extends ClassificationModule { 10 | 11 | final type NeighborhoodSize = Int 12 | 13 | type Distance = (V[N], V[N]) => N 14 | override final type Conf = (NeighborhoodSize, Distance) 15 | 16 | implicit val labelHash: Hashable[Label] 17 | 18 | private[this] lazy val nnRankMod = 19 | NearestNeighbors[(Item, Label), N, V](vops) 20 | 21 | def train[D[_]: Data]( 22 | c: (NeighborhoodSize, Distance), 23 | toVec: Vectorizer 24 | )(data: D[(Item, Label)]): Classifier = { 25 | 26 | val (nSize, dist) = c 27 | 28 | val nnRanker = nnRankMod.mkRanker[D]( 29 | dist.asInstanceOf[nnRankMod.Distance], 30 | new { 31 | val vectorize: ((Item, Label)) => V[N] = { 32 | case (item, _) => toVec.vectorize(item) 33 | } 34 | val nDimensions = toVec.nDimensions 35 | } 36 | )(data) 37 | 38 | itemToClassify => 39 | { 40 | 41 | val neighborhood = nnRanker(nSize)((itemToClassify, emptyLabel)) 42 | 43 | val votesForNeighborhood: Map[Label, Int] = 44 | neighborhood.foldLeft(CustomHashMap.empty[Label, Int]) { 45 | case (label2count, (item, label)) => 46 | if (label2count contains label) 47 | (label2count - label) + (label -> (label2count(label) + 1)) 48 | else 49 | label2count + (label 
-> 1) 50 | } 51 | 52 | import fif.ImplicitCollectionsData.seqIsData 53 | import Val.Implicits.tupleValIn2nd 54 | 55 | Argmax(votesForNeighborhood.toSeq).fold { emptyLabel } { 56 | case (majorityLabel, _) => majorityLabel 57 | } 58 | } 59 | } 60 | 61 | } 62 | 63 | object KnnClassifier { 64 | 65 | type Type[Input, L, Num, Vec[_]] = KnnClassifier { 66 | type Item = Input 67 | type Label = L 68 | type N = Num 69 | type V[_] = Vec[_] 70 | } 71 | 72 | def apply[Input, L, Num, Vec[_]]( 73 | mathVecOps: MathVectorOps.Type[Num, Vec], 74 | representsNoLabel: L 75 | )( 76 | implicit ctForI: ClassTag[Input], 77 | ctForN: ClassTag[Num], 78 | ctForVn: ClassTag[Vec[Num]], 79 | lh: Hashable[L] 80 | ): Type[Input, L, Num, Vec] = 81 | new KnnClassifier { 82 | override type Item = Input 83 | override type N = Num 84 | override type V[_] = Vec[_] 85 | override type Label = L 86 | 87 | override lazy val emptyLabel = representsNoLabel 88 | override lazy val vops = 89 | mathVecOps.asInstanceOf[MathVectorOps.Type[N, V]] 90 | 91 | override implicit lazy val labelHash = lh 92 | 93 | override implicit lazy val ctI = ctForI 94 | override implicit lazy val ctN = ctForN 95 | override implicit lazy val ctVn = ctForVn.asInstanceOf[ClassTag[V[N]]] 96 | } 97 | 98 | } 99 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/ml/NearestNeighbors.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.ml 2 | 3 | import fif.Data 4 | import mlbigbook.math.MathVectorOps 5 | 6 | import scala.language.{higherKinds, postfixOps, reflectiveCalls} 7 | import scala.reflect.ClassTag 8 | 9 | trait NearestNeighbors extends RankingModule { 10 | 11 | import Data.ops._ 12 | 13 | override type Conf = Distance 14 | 15 | override def mkRanker[D[_]: Data]( 16 | dist: Distance, 17 | toVec: Vectorizer 18 | )( 19 | data: D[Item] 20 | ): Ranker = { 21 | val bothItemVec = data.map { item => 22 | (item, 
toVec.vectorize(item)) 23 | } 24 | limit => itemToRank => 25 | { 26 | val vecItemToRank = toVec.vectorize(itemToRank) 27 | bothItemVec.sortBy { case (item, vec) => dist(vec, vecItemToRank) } 28 | .take(limit) 29 | .map { case (item, _) => item } 30 | .toSeq 31 | } 32 | } 33 | 34 | } 35 | 36 | object NearestNeighbors { 37 | 38 | type Type[ItemToRank, Num, Vec[_]] = NearestNeighbors { 39 | type Item = ItemToRank 40 | type N = Num 41 | type V[_] = Vec[_] 42 | } 43 | 44 | def apply[ItemToRank, Num, Vec[_]]( 45 | mathVecOps: MathVectorOps.Type[Num, Vec] 46 | )( 47 | implicit ctForI: ClassTag[ItemToRank], 48 | ctForN: ClassTag[Num], 49 | ctForVn: ClassTag[Vec[Num]] 50 | ): Type[ItemToRank, Num, Vec] = 51 | new NearestNeighbors { 52 | override type Item = ItemToRank 53 | override type N = Num 54 | override type V[_] = Vec[_] 55 | 56 | override lazy val vops = 57 | mathVecOps.asInstanceOf[MathVectorOps.Type[N, V]] 58 | 59 | override implicit lazy val ctI = ctForI 60 | override implicit lazy val ctN = ctForN 61 | override implicit lazy val ctVn = ctForVn.asInstanceOf[ClassTag[V[N]]] 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/ml/OLD_KnnClassifier.scala: -------------------------------------------------------------------------------- 1 | //package mlbigbook.ml 2 | // 3 | //import mlbigbook.data._ 4 | // 5 | //import scala.reflect.ClassTag 6 | // 7 | //object KnnClassifier { 8 | // 9 | // /** 10 | // * Creates a k-Nearest Neighbors classifier. 11 | // * 12 | // * Uses NnRanker.apply underneath to perform the nearest neighbors search. 
13 | // */ 14 | // def apply[T: ClassTag](n: NearNeighIn)(vdata: VectorDataIn[LabeledData[T]]): Learning[T, Labeled]#Classifier = 15 | // apply(NnRanker(n)(vdata)) 16 | // 17 | // def apply[T: ClassTag](nearestNeighborsRanker: Ranker[LabeledData[T]]): Learning[T, Labeled]#Classifier = 18 | // (input: T) => { 19 | // val neighborhood = 20 | // nearestNeighborsRanker(UnlabeledData(input)) 21 | // .map(_._1.label) 22 | // 23 | // Labeled(takeLargest(countVotes(neighborhood))) 24 | // } 25 | // 26 | // /** 27 | // * Counts the number of times each element occurs in neighborhood. 28 | // * Returns this information as a mapping. 29 | // */ 30 | // def countVotes(neighborhood: Traversable[String]): Map[String, Int] = 31 | // neighborhood.foldLeft(Map.empty[String, Int])( 32 | // (m, label) => 33 | // if (m.contains(label)) { 34 | // val newCount = m(label) + 1 35 | // (m - label) + (label -> newCount) 36 | // } else { 37 | // m + (label -> 1) 38 | // } 39 | // ) 40 | // 41 | // /** 42 | // * Evaluates to the String associated with the largest value (of Numeric type N). If the input 43 | // * elements is empty, evaluates to the empty string (""). 44 | // */ 45 | // @inline def takeLargest[N](elements: Map[String, N])(implicit n: Fractional[N]): String = 46 | // takeLargest(elements.toIndexedSeq) 47 | // 48 | // /** 49 | // * Evaluates to the String associated with the largest value (of Numeric type N). If the input 50 | // * elements is empty, evaluates to the empty string (""). 
51 | // * 52 | // */ 53 | // def takeLargest[N](elements: IndexedSeq[(String, N)])(implicit n: Fractional[N]): String = 54 | // elements.size match { 55 | // 56 | // case 0 => 57 | // "" 58 | // 59 | // case 1 => 60 | // elements.head._1 61 | // 62 | // case _ => 63 | // elements.slice(1, elements.size) 64 | // .foldLeft(elements.head)({ 65 | // case ((maxLabel, maxValue), (label, value)) => 66 | // if (n.gt(value, maxValue)) 67 | // (label, value) 68 | // else 69 | // (maxLabel, maxValue) 70 | // })._1 71 | // 72 | // } 73 | // 74 | //} 75 | 76 | // RANKING 77 | 78 | ///** 79 | // * Evaluates to a Traversable containing the elements that have the largest associated values in the input. The 80 | // * returned Traversable has at most limit items. 81 | // */ 82 | //def takeTopK[T, N](limit: Int, elements: DataClass[(T, N)])( 83 | // implicit 84 | // n: Fractional[N], c: ClassTag[N] 85 | //): Traversable[(T, N)] = 86 | // elements 87 | // .sortBy(_._2)(c, n.reverse) 88 | // .take(limit) 89 | -------------------------------------------------------------------------------- /fp4ml-main/src/main/scala/mlbigbook/ml/RankingModule.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.ml 2 | 3 | import fif.Data 4 | 5 | import scala.language.{higherKinds, postfixOps, reflectiveCalls} 6 | 7 | trait RankingModule extends ItemNumVecModule { 8 | 9 | type Vectorizer = { 10 | val vectorize: Item => V[N] 11 | val nDimensions: Int 12 | } 13 | 14 | type Distance = (V[N], V[N]) => N 15 | 16 | type Ranker = Int => Item => Seq[Item] 17 | 18 | type Conf 19 | 20 | final def mkRanker[D[_]: Data]( 21 | c: Conf, 22 | mkVectorizer: D[Item] => Vectorizer 23 | )(data: D[Item]): Ranker = 24 | mkRanker(c, mkVectorizer(data))(data) 25 | 26 | def mkRanker[D[_]: Data]( 27 | c: Conf, 28 | toVec: Vectorizer 29 | )( 30 | data: D[Item] 31 | ): Ranker 32 | 33 | } 34 | -------------------------------------------------------------------------------- 
/fp4ml-main/src/main/scala/mlbigbook/util/package.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook 2 | 3 | import scala.reflect.ClassTag 4 | 5 | package object util { 6 | 7 | @inline 8 | def copyToSeq[@specialized A: ClassTag](src: Array[A]): Seq[A] = 9 | if (src == null || src.isEmpty) 10 | Seq.empty[A] 11 | else { 12 | val s = new Array[A](src.length) 13 | System.arraycopy(src, 0, s, 0, src.length) 14 | s.toSeq 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /fp4ml-main/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=WARN, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR 11 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=WARN 12 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=WARN 13 | -------------------------------------------------------------------------------- /fp4ml-main/src/test/scala/mlbigbook/math/AbstractMathVectorOpsT.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.math 2 | 3 | import org.scalatest.{FunSpec, Matchers} 4 | 5 | import scala.language.higherKinds 6 | 7 | abstract class AbstractMathVectorOpsT[N, V[_]] extends FunSpec with Matchers { 8 | 9 | val vops: MathVectorOps.Type[N, V] 10 | implicit lazy val nIsNumeric: Numeric[N] = vops.n 11 | 12 | def vals2vec(vs: N*): V[N] 13 | 14 | def dbl2num(d: Double): N 
15 | def int2num(i: Int): N 16 | 17 | implicit class DblAsN(d: Double) { 18 | val n: N = dbl2num(d) 19 | } 20 | 21 | implicit class IntAsN(i: Int) { 22 | val n: N = int2num(i) 23 | } 24 | 25 | val tolerance: N 26 | 27 | describe("vector operations") { 28 | 29 | it("zeros") { 30 | vops.foreach { vops.zeros(5) } { x => 31 | x should be(0.n +- tolerance) 32 | } 33 | } 34 | 35 | it("ones") { 36 | vops.foreach(vops.ones(5)) { x => 37 | x should be(1.n +- tolerance) 38 | } 39 | } 40 | 41 | it("add vector") { 42 | val v1 = vals2vec(1.n, 2.n, 40.n) 43 | val v2 = vals2vec((-1).n, 2.n, 100.n) 44 | val r = vops.addV(v1, v2) 45 | 46 | vops(r)(0) should be(0.n +- tolerance) 47 | vops(r)(1) should be(4.n +- tolerance) 48 | vops(r)(2) should be(140.n +- tolerance) 49 | vops.size(r) should be(3) 50 | } 51 | 52 | it("add scalar") { 53 | val v = vals2vec(1.n, 2.n, 40.n) 54 | val s = 10.n 55 | val r = vops.addS(v, s) 56 | 57 | vops(r)(0) should be(11.n +- tolerance) 58 | vops(r)(1) should be(12.n +- tolerance) 59 | vops(r)(2) should be(50.n +- tolerance) 60 | vops.size(r) should be(3) 61 | } 62 | 63 | it("subtract vector") { 64 | val v1 = vals2vec(1.n, 2.n, 40.n) 65 | val v2 = vals2vec((-1).n, 2.n, 100.n) 66 | val r = vops.subV(v1, v2) 67 | 68 | vops(r)(0) should be(2.n +- tolerance) 69 | vops(r)(1) should be(0.n +- tolerance) 70 | vops(r)(2) should be((-60).n +- tolerance) 71 | vops.size(r) should be(3) 72 | } 73 | 74 | it("subtract scalar") { 75 | val v = vals2vec(1.n, 2.n, 40.n) 76 | val s = 10.n 77 | val r = vops.subS(v, s) 78 | 79 | vops(r)(0) should be((-9).n +- tolerance) 80 | vops(r)(1) should be((-8).n +- tolerance) 81 | vops(r)(2) should be(30.n +- tolerance) 82 | vops.size(r) should be(3) 83 | } 84 | 85 | it("dot product") { 86 | val v1 = vals2vec(1.n, 2.n, 40.n) 87 | val v2 = vals2vec((-1).n, 2.n, 100.n) 88 | val r = vops.dot(v1, v2) 89 | 90 | r should be(4003.n +- tolerance) 91 | } 92 | 93 | it("multiply vector") { 94 | val v1 = vals2vec(1.n, 2.n, 40.n) 95 | val v2 = 
vals2vec((-1).n, 2.n, 100.n) 96 | val r = vops.mulV(v1, v2) 97 | 98 | vops(r)(0) should be((-1).n +- tolerance) 99 | vops(r)(1) should be(4.n +- tolerance) 100 | vops(r)(2) should be(4000.n +- tolerance) 101 | vops.size(r) should be(3) 102 | } 103 | 104 | it("multiply scalar") { 105 | val v = vals2vec(1.n, 2.n, 40.n) 106 | val s = 10.n 107 | val r = vops.mulS(v, s) 108 | 109 | vops(r)(0) should be(10.n +- tolerance) 110 | vops(r)(1) should be(20.n +- tolerance) 111 | vops(r)(2) should be(400.n +- tolerance) 112 | vops.size(r) should be(3) 113 | } 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /fp4ml-main/src/test/scala/mlbigbook/math/AbstractMvoFractionalT.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.math 2 | 3 | import scala.language.higherKinds 4 | 5 | trait AbstractMvoFractionalT[N, V[_]] extends AbstractMathVectorOpsT[N, V] { 6 | 7 | it("divide vector") { 8 | val v1 = vals2vec(1.n, 2.n, 40.n) 9 | val v2 = vals2vec((-1).n, 2.n, 100.n) 10 | val r = vops.divV(v1, v2) 11 | 12 | vops(r)(0) should be((-1).n +- tolerance) 13 | vops(r)(1) should be(1.n +- tolerance) 14 | vops(r)(2) should be((2.0 / 5.0).n +- tolerance) 15 | vops.size(r) should be(3) 16 | } 17 | 18 | it("divide scalar") { 19 | val v = vals2vec(1.n, 2.n, 40.n) 20 | val s = 10.n 21 | val r = vops.divS(v, s) 22 | 23 | vops(r)(0) should be(0.1.n +- tolerance) 24 | vops(r)(1) should be(0.2.n +- tolerance) 25 | vops(r)(2) should be(4.n +- tolerance) 26 | vops.size(r) should be(3) 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /fp4ml-main/src/test/scala/mlbigbook/math/MathVectorOpsDenseDoubleTest.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.math 2 | 3 | import breeze.linalg.DenseVector 4 | 5 | class MathVectorOpsDenseDoubleTest 6 | extends AbstractMvoFractionalT[Double, 
DenseVector] { 7 | 8 | import MathVectorOps.Implicits._ 9 | override val vops = implicitly[MathVectorOps.Type[Double, DenseVector]] 10 | override def int2num(i: Int) = i.toDouble 11 | override def dbl2num(d: Double) = d 12 | override def vals2vec(vs: Double*) = DenseVector(vs: _*) 13 | override val tolerance = 1e-6 14 | } 15 | -------------------------------------------------------------------------------- /fp4ml-main/src/test/scala/mlbigbook/math/MathVectorOpsDenseFloatTest.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.math 2 | 3 | import breeze.linalg.DenseVector 4 | 5 | class MathVectorOpsDenseFloatTest 6 | extends AbstractMvoFractionalT[Float, DenseVector] { 7 | 8 | import MathVectorOps.Implicits._ 9 | override val vops = implicitly[MathVectorOps.Type[Float, DenseVector]] 10 | override def int2num(i: Int) = i.toFloat 11 | override def dbl2num(d: Double) = d.toFloat 12 | override def vals2vec(vs: Float*) = DenseVector(vs: _*) 13 | override val tolerance = 1e-6f 14 | } 15 | -------------------------------------------------------------------------------- /fp4ml-main/src/test/scala/mlbigbook/math/MathVectorOpsDenseIntTest.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.math 2 | 3 | import breeze.linalg.DenseVector 4 | 5 | class MathVectorOpsDenseIntTest 6 | extends AbstractMathVectorOpsT[Int, DenseVector] { 7 | 8 | import MathVectorOps.Implicits._ 9 | override val vops = implicitly[MathVectorOps.Type[Int, DenseVector]] 10 | override def int2num(i: Int) = i 11 | override def dbl2num(d: Double) = d.toInt 12 | override def vals2vec(vs: Int*) = DenseVector(vs: _*) 13 | override val tolerance = 1 14 | } 15 | -------------------------------------------------------------------------------- /fp4ml-main/src/test/scala/mlbigbook/math/MathVectorOpsDenseLongTest.scala: -------------------------------------------------------------------------------- 1 | 
package mlbigbook.math 2 | 3 | import breeze.linalg.DenseVector 4 | 5 | class MathVectorOpsDenseLongTest 6 | extends AbstractMathVectorOpsT[Long, DenseVector] { 7 | 8 | import MathVectorOps.Implicits._ 9 | override val vops = implicitly[MathVectorOps.Type[Long, DenseVector]] 10 | override def int2num(i: Int) = i.toLong 11 | override def dbl2num(d: Double) = d.toLong 12 | override def vals2vec(vs: Long*) = DenseVector(vs: _*) 13 | override val tolerance = 1l 14 | } 15 | -------------------------------------------------------------------------------- /fp4ml-main/src/test/scala/mlbigbook/math/MathVectorOpsSparseDoubleTest.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.math 2 | 3 | import breeze.linalg.SparseVector 4 | 5 | class MathVectorOpsSparseDoubleTest 6 | extends AbstractMvoFractionalT[Double, SparseVector] { 7 | 8 | import MathVectorOps.Implicits._ 9 | override val vops = implicitly[MathVectorOps.Type[Double, SparseVector]] 10 | override def int2num(i: Int) = i.toDouble 11 | override def dbl2num(d: Double) = d.toDouble 12 | override def vals2vec(vs: Double*) = SparseVector(vs: _*) 13 | override val tolerance = 1e-6 14 | } 15 | -------------------------------------------------------------------------------- /fp4ml-main/src/test/scala/mlbigbook/math/MathVectorOpsSparseFloatTest.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.math 2 | 3 | import breeze.linalg.SparseVector 4 | 5 | class MathVectorOpsSparseFloatTest 6 | extends AbstractMvoFractionalT[Float, SparseVector] { 7 | 8 | import MathVectorOps.Implicits._ 9 | override val vops = implicitly[MathVectorOps.Type[Float, SparseVector]] 10 | override def int2num(i: Int) = i.toFloat 11 | override def dbl2num(d: Double) = d.toFloat 12 | override def vals2vec(vs: Float*) = SparseVector(vs: _*) 13 | override val tolerance = 1e-6f 14 | } 15 | 
-------------------------------------------------------------------------------- /fp4ml-main/src/test/scala/mlbigbook/math/MathVectorOpsSparseIntTest.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.math 2 | 3 | import breeze.linalg.SparseVector 4 | 5 | class MathVectorOpsSparseIntTest 6 | extends AbstractMathVectorOpsT[Int, SparseVector] { 7 | 8 | import MathVectorOps.Implicits._ 9 | override val vops = implicitly[MathVectorOps.Type[Int, SparseVector]] 10 | override def int2num(i: Int) = i 11 | override def dbl2num(d: Double) = d.toInt 12 | override def vals2vec(vs: Int*) = SparseVector(vs: _*) 13 | override val tolerance = 1 14 | } 15 | -------------------------------------------------------------------------------- /fp4ml-main/src/test/scala/mlbigbook/math/MathVectorOpsSparseLongTest.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.math 2 | 3 | import breeze.linalg.SparseVector 4 | 5 | class MathVectorOpsSparseLongTest 6 | extends AbstractMathVectorOpsT[Long, SparseVector] { 7 | 8 | import MathVectorOps.Implicits._ 9 | override val vops = implicitly[MathVectorOps.Type[Long, SparseVector]] 10 | override def int2num(i: Int) = i.toLong 11 | override def dbl2num(d: Double) = d.toLong 12 | override def vals2vec(vs: Long*) = SparseVector(vs: _*) 13 | override val tolerance = 1l 14 | } 15 | -------------------------------------------------------------------------------- /fp4ml-main/src/test/scala/mlbigbook/ml/AddressData.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.ml 2 | 3 | object AddressData { 4 | 5 | case class Location[@specialized(Int, Double) N: Fractional](x: N, y: N) 6 | 7 | case class Address[@specialized(Int, Double) N: Fractional]( 8 | loc: Location[N], 9 | name: Option[String] = None, 10 | number: Option[Int] = None, 11 | street: Option[String] = None, 12 | zip: 
Option[Short] = None, 13 | city: Option[String] = None, 14 | state: Option[String] = None 15 | ) 16 | 17 | } 18 | -------------------------------------------------------------------------------- /fp4ml-main/src/test/scala/mlbigbook/ml/KmeansTest.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.ml 2 | 3 | import breeze.linalg.DenseVector 4 | import mlbigbook.math.{MathVectorOps, NumericConversion, RandoMut} 5 | import org.scalatest.FunSuite 6 | 7 | import scala.language.reflectiveCalls 8 | 9 | class KmeansTest extends FunSuite { 10 | 11 | import KmeansTest._ 12 | import fif.ImplicitCollectionsData._ 13 | 14 | test("Simple run") { 15 | 16 | val initial = 17 | kmeans.initialize(conf.nClusters, stringVectorizer.nDimensions) 18 | println( 19 | s"""INITIAL with nClusters= ${conf.nClusters} & nDimensions= ${stringVectorizer.nDimensions} 20 | |# of clusters FROM INITIAL: ${initial.size} 21 | | 22 | |${initial.mkString("\n")} 23 | | 24 | """.stripMargin 25 | ) 26 | 27 | val centers = kmeans.cluster(conf, distance, stringVectorizer)(data) 28 | 29 | centers foreach println 30 | } 31 | 32 | } 33 | 34 | object KmeansTest { 35 | 36 | val conf = ClusteringConf( 37 | nClusters = 2, 38 | tolerance = 0.001, 39 | maxIterations = 25 40 | ) 41 | 42 | val kmeans: Kmeans.Type[String, Float, DenseVector] = { 43 | import NumericConversion.Implicits._ 44 | Kmeans[String, Float, DenseVector]( 45 | MathVectorOps.Implicits.FloatDenseVot, 46 | RandoMut.newSeedPerCall[Float] 47 | ) 48 | } 49 | 50 | val data = Seq( 51 | "hello world", 52 | "hello hello", 53 | "how world", 54 | "hello how world world hello" 55 | ) 56 | 57 | val words = "hello world how are you doing today fine great".split(" ").toSeq 58 | 59 | val word2index = words.zipWithIndex.toMap 60 | 61 | val initial = word2index.map { case (_, index) => (index, 0.0f) } 62 | 63 | val stringVectorizer: kmeans.Vectorizer = new { 64 | 65 | lazy val vectorize = (s: String) => 66 | 
DenseVector { 67 | val bothIndexValue = s.split(" ").foldLeft(initial) { 68 | case (accum, word) => 69 | val index = word2index(word) 70 | (accum - index) + (index -> (accum(index) + 1.0f)) 71 | } 72 | 73 | (0 until nDimensions).map { index => 74 | bothIndexValue.getOrElse(index, 0.0f) 75 | }.toArray 76 | } 77 | 78 | lazy val nDimensions = words.size 79 | } 80 | 81 | val distance: kmeans.Distance = (v1, v2) => { 82 | val r = kmeans.vops.subV(v1, v2) 83 | kmeans.vops.dot(r, r) 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /fp4ml-main/src/test/scala/mlbigbook/ml/KnnClassifierTest.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.ml 2 | 3 | import breeze.linalg.DenseVector 4 | import mlbigbook.math.MathVectorOps 5 | import org.scalatest.FunSuite 6 | 7 | import scala.language.reflectiveCalls 8 | 9 | class KnnClassifierTest extends FunSuite { 10 | 11 | import KnnClassifierTest._ 12 | import fif.ImplicitCollectionsData._ 13 | 14 | test("Sanity Check: 1-NN Classification on train set is perfect") { 15 | 16 | val classify = knn.train((1, distance), stringVectorizer)(data) 17 | 18 | data foreach { 19 | case (item, label) => 20 | val predicted = classify(item) 21 | assert(predicted === label) 22 | } 23 | } 24 | 25 | } 26 | 27 | object KnnClassifierTest { 28 | 29 | import ImplicitHashable._ 30 | 31 | val knn = KnnClassifier[String, Boolean, Float, DenseVector]( 32 | MathVectorOps.Implicits.FloatDenseVot, 33 | representsNoLabel = false 34 | ) 35 | 36 | val data = Seq( 37 | ("becky wow", true), 38 | ("oh my lord", true), 39 | ("where is that", true), 40 | ("how now", false), 41 | ("how now brown cow", false), 42 | ("how how how how do you do it", false), 43 | ("how", false) 44 | ) 45 | 46 | val words = data.flatMap { case (ws, _) => ws.split(" ") }.toSet 47 | 48 | val word2index = words.zipWithIndex.toMap 49 | 50 | val initial = word2index.map { case (_, index) 
=> (index, 0.0f) } 51 | 52 | val stringVectorizer: knn.Vectorizer = new { 53 | 54 | lazy val vectorize = (s: String) => 55 | DenseVector { 56 | val bothIndexValue = s.split(" ").foldLeft(initial) { 57 | case (accum, word) => 58 | val index = word2index(word) 59 | (accum - index) + (index -> (accum(index) + 1.0f)) 60 | } 61 | 62 | (0 until nDimensions).map { index => 63 | bothIndexValue.getOrElse(index, 0.0f) 64 | }.toArray 65 | } 66 | 67 | lazy val nDimensions = words.size 68 | } 69 | 70 | val distance: knn.Distance = (v1, v2) => { 71 | val r = knn.vops.subV(v1, v2) 72 | knn.vops.dot(r, r) 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /fp4ml-main/src/test/scala/mlbigbook/ml/KnnLshClassifierTest.scala: -------------------------------------------------------------------------------- 1 | //package mlbigbook.ml 2 | // 3 | //import mlbigbook.data.Labeled 4 | //import mlbigbook.wordcount.LocalSparkContext 5 | //import org.scalatest.FunSuite 6 | // 7 | //class KnnLshClassifierTest extends FunSuite with LocalSparkContext { 8 | // 9 | // import KnnLshClassifierTest._ 10 | // 11 | // ignore("classify simple addresses") { 12 | // fail("unimplemented") 13 | // } 14 | // 15 | //} 16 | // 17 | //object KnnLshClassifierTest { 18 | // 19 | // import NearestNeighborsLSHTest._ 20 | // 21 | // def classificationTest[T](c: Learning[T, Labeled]#Classifier, input: T, expected: Labeled): Err = { 22 | // val actual = c(input) 23 | // if (actual.label != expected.label) 24 | // Some(s"Expected and actual labels don't match. Expecting: ${expected.label} . 
Actual: ${actual.label}") 25 | // else 26 | // None 27 | // } 28 | // 29 | //} 30 | // 31 | -------------------------------------------------------------------------------- /fp4ml-main/src/test/scala/mlbigbook/ml/NearestNeighborsLSHTest.scala: -------------------------------------------------------------------------------- 1 | //package mlbigbook.ml 2 | // 3 | //import mlbigbook.data.mut.DenseVector 4 | //import mlbigbook.data.{ OLD_VectorizerMaker, OLD_Vectorizer, OLD_VectorizerMaker$, OLD_Vectorizer$ } 5 | //import mlbigbook.wordcount.LocalSparkContext 6 | //import org.scalatest.FunSuite 7 | // 8 | //class NearestNeighborsLSHTest extends FunSuite with LocalSparkContext { 9 | // 10 | // import NearestNeighborsLSHTest._ 11 | // 12 | // ignore("nearest neighbors addresses, k=3") { 13 | // fail("unimplemented") 14 | // } 15 | // 16 | // ignore("LSH modified NN addresses, k=3, nBins=5") { 17 | // fail("unimplemented") 18 | // } 19 | // 20 | //} 21 | // 22 | //object NearestNeighborsLSHTest { 23 | // 24 | // type Err = Option[String] 25 | // 26 | // def nnTest[T](nn: Ranker[T], input: T, expected: Traversable[(T, Double)]): Err = { 27 | // 28 | // val actual = nn(input) 29 | // 30 | // val errors = 31 | // expected.toSeq.zip(actual.toIndexedSeq).foldLeft(List.empty[String])({ 32 | // case (sum, (e, a)) => 33 | // if (e != a) 34 | // sum :+ s"""Expecting: $e | Actual: $a""" 35 | // else 36 | // sum 37 | // }) 38 | // 39 | // if (errors.nonEmpty) 40 | // Some(s"""Found ${errors.length} differences: ${errors.mkString("\n")}""") 41 | // else 42 | // None 43 | // } 44 | // 45 | // lazy val nnConfig = NearNeighIn(Manhattan, 3) 46 | // 47 | // lazy val nLshFuncs = 5 48 | // 49 | // def lshConfig: LshIn = ???
50 | // //LshIn(???, nLshFuncs) 51 | // 52 | // import AddressData._ 53 | // 54 | // val apartments = Seq( 55 | // Address(Location(1, 2), Some("apartment A")), 56 | // Address(Location(0, 0), Some("apartment B")), 57 | // Address(Location(3, 2), Some("apartment C")), 58 | // Address(Location(5, 2), Some("apartment D")), 59 | // Address(Location(4, 5), Some("apartment E")), 60 | // Address(Location(0, 5), Some("apartment F")), 61 | // Address(Location(4, 4), Some("apartment G")), 62 | // Address(Location(3, 2), Some("apartment H")), 63 | // Address(Location(2, 1), Some("apartment I")), 64 | // Address(Location(5, 3), Some("apartment J")) 65 | // ) 66 | // 67 | // import OLD_Vectorizer._ 68 | // 69 | // def addressVectorizer[N: Fractional]: OLD_Vectorizer[Address[N]] = 70 | // (a: Address[N]) => 71 | // DenseVector(Array(implicitly[Numeric[N]].toDouble(a.loc.x), implicitly[Numeric[N]].toDouble(a.loc.y))) 72 | // 73 | // def mkAddressVectorizer[N](implicit n: Fractional[N]): OLD_VectorizerMaker[Address[N]] = 74 | // ??? 
75 | // 76 | //} 77 | // 78 | -------------------------------------------------------------------------------- /fp4ml-main/src/test/scala/mlbigbook/ml/NearestNeighborsTest.scala: -------------------------------------------------------------------------------- 1 | package mlbigbook.ml 2 | 3 | import breeze.linalg.DenseVector 4 | import mlbigbook.math.MathVectorOps 5 | import org.scalatest.FunSuite 6 | 7 | import scala.language.reflectiveCalls 8 | 9 | class NearestNeighborsTest extends FunSuite { 10 | 11 | import NearestNeighborsTest._ 12 | import fif.ImplicitCollectionsData._ 13 | 14 | test("Sanity check: 1-NN on train set evaluates to input item") { 15 | val rank = nn.mkRanker(distance, stringVectorizer)(data) 16 | data foreach { item => 17 | val retrieved = rank(1)(item) 18 | assert(retrieved.size === 1) 19 | assert(retrieved.head === item) 20 | } 21 | } 22 | 23 | } 24 | 25 | object NearestNeighborsTest { 26 | val nn = NearestNeighbors[String, Float, DenseVector]( 27 | MathVectorOps.Implicits.FloatDenseVot 28 | ) 29 | val data = KnnClassifierTest.data.map { case (ws, _) => ws } 30 | val stringVectorizer: nn.Vectorizer = KnnClassifierTest.stringVectorizer 31 | val distance: nn.Distance = KnnClassifierTest.distance 32 | } 33 | -------------------------------------------------------------------------------- /fp4ml-main/src/test/scala/mlbigbook/ml/OLDKMeansTest.scala: -------------------------------------------------------------------------------- 1 | //package mlbigbook.ml 2 | // 3 | //import mlbigbook.wordcount.LocalSparkContext 4 | //import org.scalatest.FunSuite 5 | // 6 | //class OLDKMeansTest extends FunSuite with LocalSparkContext { 7 | // 8 | // ignore("classify simple addresses") { 9 | // fail("unimplemented") 10 | // } 11 | // 12 | //} 13 | // 14 | //object OLDKMeansTest { 15 | // 16 | // import NearestNeighborsLSHTest._ 17 | // 18 | // def softClusterTest[T](sc: SoftCluster[T], input: T, expected: IndexedSeq[(OLD_Center, Double)]): Err = { 19 | // val 
actual = sc(input) 20 | // 21 | // val errors = 22 | // expected.zip(actual).foldLeft(List.empty[String])({ 23 | // case (sum, (e, a)) => 24 | // if (e != a) 25 | // sum :+ s"""Expecting: $e | Actual: $a""" 26 | // else 27 | // sum 28 | // }) 29 | // 30 | // if (errors.nonEmpty) 31 | // Some(s"""Found ${errors.length} differences: ${errors.mkString("\n")}""") 32 | // else 33 | // None 34 | // } 35 | // 36 | //} 37 | // 38 | -------------------------------------------------------------------------------- /fp4ml-spark/README.md: -------------------------------------------------------------------------------- 1 | # fp4ml-spark 2 | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/io.malcolmgreaves/fp4ml-spark_2.11/badge.svg?style=plastic)](https://maven-badges.herokuapp.com/maven-central/io.malcolmgreaves/fp4ml-spark_2.11) 3 | Machine learning for functional programmers. 4 | Extensions to `fp4ml-main` to use the Spark ecosystem. -------------------------------------------------------------------------------- /fp4ml-spark/build.sbt: -------------------------------------------------------------------------------- 1 | name := "fp4ml-spark" 2 | 3 | import SharedBuild._ 4 | 5 | addCompilerPlugin(scalaMacros) 6 | 7 | libraryDependencies ++= 8 | fp4mlSparkDeps ++ 9 | testDeps 10 | 11 | fork in run := false 12 | 13 | pomExtra := pomExtraInfo 14 | -------------------------------------------------------------------------------- /fp4ml-spark/src/main/scala/TODO: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malcolmgreaves/fp4ml/69958a463ce40e508c2a6103599e1af35c9f2845/fp4ml-spark/src/main/scala/TODO -------------------------------------------------------------------------------- /project/SharedBuild.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | import Keys._ 3 | 4 | object SharedBuild { 5 | 6 | // // // // // // // // 7 | // //
Versions // // 8 | // // // // // // // // 9 | 10 | lazy val breezeV = "0.12" 11 | lazy val nakV = "1.3" 12 | lazy val dataTcV = "0.0.0" 13 | lazy val scalaMacrosV = "2.1.0" 14 | lazy val avroCgV = "0.3.4" 15 | lazy val shapelessV = "2.2.5" 16 | lazy val wispV = "0.0.4" 17 | lazy val argonautV = "6.1" 18 | lazy val scalajV = "2.2.1" 19 | 20 | // // // // // // // // // // 21 | // // Dependencies // // 22 | // // // // // // // // // // 23 | 24 | lazy val scalaMacros = 25 | "org.scalamacros" % "paradise" % scalaMacrosV cross CrossVersion.full 26 | 27 | lazy val fp4mlMainDeps = Seq( 28 | "org.scalanlp" %% "breeze" % breezeV, 29 | "org.scalanlp" %% "breeze-natives" % breezeV, 30 | "org.scalanlp" %% "nak" % nakV, 31 | "com.quantifind" %% "wisp" % wispV, 32 | // [B] necessary? 33 | "io.argonaut" %% "argonaut" % argonautV, 34 | "org.scalaj" %% "scalaj-http" % scalajV, 35 | // [E] necessary? 36 | "com.chuusai" %% "shapeless" % shapelessV, 37 | "com.gonitro" %% "avro-codegen-runtime" % avroCgV, 38 | "io.malcolmgreaves" %% "data-tc-extra" % dataTcV, 39 | "io.malcolmgreaves" %% "data-tc-scala" % dataTcV 40 | ) 41 | 42 | lazy val fp4mlSparkDeps = Seq( 43 | "io.malcolmgreaves" %% "data-tc-spark" % dataTcV 44 | ) 45 | 46 | lazy val testDeps = Seq( 47 | "org.scalatest" %% "scalatest" % "2.2.6" % Test 48 | ) 49 | 50 | lazy val pomExtraInfo = { 51 | https://github.com/malcolmgreaves/fp4ml 52 | 53 | 54 | Apache 2.0 55 | https://www.apache.org/licenses/LICENSE-2.0.txt 56 | repo 57 | 58 | 59 | 60 | git@github.com:malcolmgreaves/fp4ml.git 61 | scm:git@github.com:malcolmgreaves/fp4ml.git 62 | 63 | 64 | 65 | malcolmgreaves 66 | Malcolm Greaves 67 | greaves.malcolm@gmail.com 68 | https://malcolmgreaves.io/ 69 | 70 | 71 | } 72 | 73 | } 74 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.8 2 | 
-------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn 2 | 3 | addSbtPlugin("com.geirsson" % "sbt-scalafmt" % "0.4.10") 4 | 5 | addSbtPlugin("com.gonitro" % "avro-codegen-compiler" % "0.3.4") 6 | 7 | //addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.0.4") 8 | //addSbtPlugin("org.scoverage" % "sbt-coveralls" % "1.0.0") 9 | --------------------------------------------------------------------------------