├── .gitignore ├── README.md ├── build.sbt ├── project ├── build.properties └── plugins.sbt └── src └── main ├── java └── META-INF │ └── MANIFEST.MF └── scala ├── Experiment.scala ├── Main.scala ├── Main2.scala └── qlearn ├── Types.scala ├── dataset ├── Binary.scala ├── Labeled.scala ├── MultiLabeled.scala ├── Nominal.scala ├── NominalBasic.scala ├── NominalFull.scala ├── Numerical.scala ├── SingleLabeled.scala ├── Unlabeled.scala ├── loaders │ ├── ArffLoader.scala │ ├── Loader.scala │ └── TabLoader.scala └── schema │ ├── BinaryColumn.scala │ ├── Column.scala │ ├── NominalColumn.scala │ └── NumericalColumn.scala ├── loss ├── Loss.scala ├── binary │ ├── F1.scala │ ├── HingeLoss.scala │ ├── LogisticLoss.scala │ ├── Precision.scala │ └── Recall.scala ├── nominal │ ├── CrossEntropyLoss.scala │ ├── FractionOfIncorrect.scala │ └── MatrixLoss.scala └── numerical │ ├── MeanAbsoluteLoss.scala │ ├── MeanSquaredLoss.scala │ └── distance │ ├── ChebyshevDistance.scala │ ├── CosineSimilarity.scala │ ├── Distance.scala │ ├── EuclideanDistance.scala │ ├── ManhattanDistance.scala │ ├── NormDistance.scala │ ├── PolyKernel.scala │ └── RBFKernel.scala ├── ml ├── Clusterer.scala ├── FittedModel.scala ├── FittedModelMulti.scala ├── Model.scala ├── ModelForTwo.scala ├── RandomizedClusterer.scala ├── RandomizedModel.scala ├── classify │ ├── FittedNeuralNetwork.scala │ ├── FittedRandomTree.scala │ ├── FittedSameDistribution.scala │ ├── FittedSimpleKNN.scala │ ├── NeuralNetwork.scala │ ├── RandomForest.scala │ ├── RandomTree.scala │ ├── SameDistribution.scala │ ├── SimpleKNN.scala │ └── weka │ │ ├── LogisticRegression.scala │ │ └── REPTree.scala ├── cluster │ ├── KMeans.scala │ └── weka │ │ ├── CobWeb.scala │ │ ├── ExpectationMaximization.scala │ │ └── KMeans.scala ├── meta │ ├── Bagging.scala │ ├── FittedBagging.scala │ ├── FittedOneVsAll.scala │ └── OneVsAll.scala └── regress │ ├── FittedRidgeRegression.scala │ ├── FittedRidgeRegressionMulti.scala │ ├── RidgeRegression.scala │ └── weka │ └── RidgeRegression.scala ├── strategies ├── NoRecentImprovement.scala └── Stopping.scala ├── util ├── Util.scala ├── decisionStump │ └── DecisionStump.scala └── nnet │ ├── Layer.scala │ └── activations │ ├── ActivationFunction.scala │ ├── Eliott.scala │ ├── EliottSym.scala │ ├── Sigmoid.scala │ └── Tanh.scala ├── validation ├── CrossValidation.scala ├── LeaveOneOut.scala ├── PercentageSplit.scala ├── SameDatasetValidation.scala └── Validation.scala └── wekas ├── WekaClusterer.scala ├── WekaDistance.scala ├── WekaFittedModel.scala └── WekaModel.scala /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | 3 | *.class 4 | *.log 5 | 6 | # sbt specific 7 | .cache/ 8 | .history/ 9 | .lib/ 10 | dist/* 11 | target/ 12 | lib_managed/ 13 | src_managed/ 14 | project/boot/ 15 | project/plugins/project/ 16 | -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------

QuantumLearn
===

I have used Weka for quite some time now, and several things about it bothered me. The dataset management was clumsy at best, but I could deal with that. What I couldn't get around was the general slowness of the library; this is not only my complaint, as a basic Google search will confirm. The reason for it? Weka doesn't use BLAS (and neither does any other Java machine learning library). On top of that, unnecessary copies of the data are often made.

So, here is the manifesto of the QuantumLearn library:

* Where possible, the **BLAS & LAPACK methods** are used. This brings up to a 10x speedup compared to Weka.
* Created with support for **multilabel datasets** from the start. This enables much faster learning for algorithms that support them (neural networks, linear / logistic regression); otherwise it simply falls back to training a separate model for each label.
* Created around the idea of **immutable data structures**. This eases multithreading. Example: when you call *.fit* on *LinearRegression*, you get a *FittedLinearRegression* back, as sketched below.
* Leverage the abilities of the type system. Scala helps a great deal here. This means we can catch obvious errors at compile time instead of failing with an exception, such as passing a nominal dataset to a regressor.
* Algorithms report their **progress status** during the training process. This is really important, since training is usually a long-running activity. It's nice to know how much time you have left to wait.
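To make the immutability point concrete, here is a minimal sketch of the fit/predict round trip. It uses the `RidgeRegression` learner that ships with this repository and reuses the `unlabeled` and `age` datasets constructed in the examples below.

```scala
// The learner itself is an immutable configuration; fitting never mutates it.
val model = RidgeRegression(ridge = 0.5)

// fit() returns a separate immutable FittedRidgeRegression,
// so the same unfitted learner can be reused freely across threads.
val fitted = model.fit(age)

// predict() produces a brand new dataset; nothing existing is modified.
fitted.predict(unlabeled).report
```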
Beware: this library is in a really early alpha stage. Do not use it in production.

Dealing with the lack of algorithms
---
Until an algorithm is implemented in efficient Scala code, the corresponding algorithm from Weka is wrapped. For now, this causes a dataset copy (but it is done only once).

Dealing with datasets
===
Let's first create an unlabeled (unsupervised) dataset. We create a matrix and name the features.

```scala
val unlabeled = Unlabeled(DenseMatrix(
  (16.0,2.0,3.0),
  (3.0,11.0,5.5),
  (4.0,8.0,10.0),
  (5.0,100.0,7.0)
), Vector('x, 'y, 'z))
```

This dataset might be used for clustering or to make predictions on. However, if we want to learn from it, we have to label (supervise) it. Along with the dataset, you can specify a custom cost function, as shown below by specifying hinge loss for the binary dataset.

```scala
val isMale = Binary('isMale, unlabeled, Vector(true, false, true, true), loss = HingeLoss)
val age = Numerical('age, unlabeled, Vector(20.3, 56.8, 10.3, 11.8))
val major = Nominal('major, unlabeled, Vector("ML", "literature", "ML", "art"))
```
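A custom loss is just an implementation of the `Loss` trait from `qlearn.loss`, with a `range` (best score to worst score) and an `apply` method. As a minimal sketch, a made-up binary loss that penalizes false negatives twice as hard as false positives could look like this:

```scala
import qlearn.dataset.Binary
import qlearn.loss.Loss

object AsymmetricLoss extends Loss[Binary] {
  // scores run from best (0.0) to worst (infinity)
  val range = 0.0 -> Double.PositiveInfinity

  def apply(actual: Binary, predicted: Binary) = {
    val fp = (!actual.yt :& predicted.yt).activeSize  // false positives
    val fn = (actual.yt :& !predicted.yt).activeSize  // false negatives
    (fp + 2.0 * fn) / actual.recordCount
  }
}
```

It would then be passed exactly like `HingeLoss` above, e.g. `Binary('isMale, unlabeled, Vector(true, false, true, true), loss = AsymmetricLoss)`.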
Multi label datasets
---

You can then group many of those single-labeled datasets into a multi-labeled one.

```scala
val labeled = MultiLabeled(isMale, age, major)
```

Finally, to check that everything is fine so far, we call `labeled.report` and get this on the standard output:

```
        x         y         z          isMale       age  major=ML  major=literature  major=art
 16.00000  2.000000  3.000000   ->   1.000000  20.30000  1.000000  0.00000000000000  0.0000000
 3.000000  11.00000  5.500000   ->   0.000000  56.80000  0.000000  1.00000000000000  0.0000000
 4.000000  8.000000  10.00000   ->   1.000000  10.30000  1.000000  0.00000000000000  0.0000000
 5.000000  100.0000  7.000000   ->   1.000000  11.80000  0.000000  0.00000000000000  1.0000000
```

Image datasets
---
This is currently just a proposal. In the future, it should be possible to instantiate an unlabeled dataset containing image data. This dataset would behave no differently from the one described above.

Let's create a collection of images, each of which is resized to 40x30 pixels. Moreover, each image is stored five times: the original, translated 1 and 5 pixels to the right, and 1 and 3 pixels to the left. Additionally, each image has a 30% chance of appearing rotated by 1, 3 or 5 degrees.

```scala
val unlabeledImages = ImageDataset(
  Seq("images/bird1.jpg", "images/bird2.jpg", "images/bird3.png"),
  height = 40, width = 30,
  translations = Seq(1, -1, -3, 5),
  rotations = Seq(1.0, 3.0, 5.0), rotationProbability = 0.3
)
```

Transforming records, features and labels
===
Say we want to add two new features to the dataset. We can use arbitrary data or reuse the existing features. Here's how it's done:

```scala
val augmented = FeatureAdder(
  'isFemale -> (row => !row('isMale)),
  'ageInMonths -> (row => row('age) * 12)
).transform(labeled)
```


Wrapping Weka learners
===

Everything Weka-connected resides in the `qlearn.algorithms.weka` package. Some of the Weka algorithms are already nicely wrapped. The ones that are not, you can wrap yourself:

```scala
// simple example
WekaWrapper(new J48)

// complex example
WekaWrapper({
  val tmp = new J48
  tmp.setMinNumObj(10)
  tmp.setUseLaplace(true)
  tmp
})
```

Once this is done, you can use Weka learners in the same manner as the native ones. It's that simple.
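For example, a hypothetical round trip with the wrapper sketched above (assuming `WekaWrapper` implements the same `Model` interface as the native learners, with `J48` imported from Weka) would be:

```scala
// Fit the wrapped J48 on the nominal dataset from earlier; the result is an
// immutable fitted model, just as with the native learners.
val fitted = WekaWrapper(new J48).fit(major)
fitted.predict(unlabeled).report
```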
Future improvements
===
This section presents future ideas for optimization / improvement. We are not in a hurry; first, let's just make sure everything works correctly.

Memory optimizations
---
* Think about how we could adapt the learners to avoid copying the dataset K times on K-fold cross-validation. Ideas: a bit-masking vector, two-phase learning (to splice out the test fold), ... There is no free lunch: this causes additional CPU costs and complicates the code and class design. Therefore, debate whether this is reasonable; RAM is cheap nowadays.
* Learners should drop the reference to the learning dataset as soon as they are done learning. For example, with linear regression, we just remember the coefficients and allow the original dataset to be garbage collected.
* Could sparse datasets easily be supported? What is the performance cost, since BLAS is not used? How could more advanced optimizations in the Breeze library help? See also: [Breeze bug report](https://github.com/scalanlp/breeze/issues/360)

CPU & computational optimizations
---
* Which learners are capable of parallelism (or distributed computing - Hadoop, Akka, Spark)? This is not a huge concern, since things such as parameter selection (via cross-validation) and ensembles (bagging, stacking) are embarrassingly parallel.
* Since loss functions usually have unchanging target predictions, would preprocessing help?

No nominal attributes on the X dataset
---
For efficiency (simplicity, memory and computation) reasons, the X dataset is represented as a simple matrix. This suffices most of the time, by simply binarizing the nominal attributes. Some algorithms, such as Naive Bayes, require a higher level of knowledge. This could be solved in one of these ways:

* Make `=` a special symbol in the attribute name. Therefore, if you have attributes named `hairColor=grey`, `hairColor=black`, `hairColor=blonde`, the classifier can deduce the category memberships. However, such storage is inefficient for attributes with many possible values. Moreover, requiring attribute names to adhere to our standard is just a call for problems.
* Represent the ordinal value as a double in a single attribute (`grey` = 1.0, `black` = 2.0, `blonde` = 3.0), and then accept the information about which attributes are stored this way. Example:

```scala
val unlabeled = Unlabeled(DenseMatrix(
  (1.0,2.0,3.0),
  (2.0,11.0,5.5),
  (1.0,8.0,10.0),
  (3.0,100.0,7.0)
), Vector('hairColor, 'y, 'z), nominal = Seq('hairColor))
```

Algorithm improvements
---
* Do learning algorithms benefit from knowing the unsupervised future test data upfront? See: [CrossValidated question](https://stats.stackexchange.com/questions/156085/which-supervised-algorithms-benefit-from-knowing-future-inputs-upfront)

Misc
---
* Java interop. Make it possible to use this library from Java without any extra effort.
-------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "QuantumLearn" 2 | 3 | version := "0.1.5" 4 | 5 | scalaVersion := "2.11.7" 6 | 7 | libraryDependencies ++= Seq( 8 | "org.scalanlp" %% "breeze" % "0.12-SNAPSHOT", 9 | 10 | // native libraries greatly improve performance, but increase jar sizes. 11 | "org.scalanlp" %% "breeze-natives" % "0.12-SNAPSHOT", 12 | 13 | // weka 14 | "nz.ac.waikato.cms.weka" % "weka-dev" % "3.7.13" 15 | ) 16 | 17 | resolvers ++= Seq( 18 | "Sonatype Snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/", 19 | "Sonatype Releases" at "https://oss.sonatype.org/content/repositories/releases/" 20 | ) -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 0.13.9 -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn -------------------------------------------------------------------------------- /src/main/java/META-INF/MANIFEST.MF: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | Main-Class: Main 3 | 4 | -------------------------------------------------------------------------------- /src/main/scala/Experiment.scala: -------------------------------------------------------------------------------- 1 | import breeze.linalg.{pinv, sum} 2 | import qlearn.Types.Mat 3 | 4 | object Experiment extends App { 5 | def memory { 6 | System.gc 7 | System.gc 8 | System.gc 9 | System.gc 10 | System.gc 11 | println(s"Mem ${Runtime.getRuntime.freeMemory}") 12 | } 13 | 14 | 15 | 16 | 17 | 18 | { 19 | val n = 10000 20 | val a = Mat.rand[Double](n, n) 21 | 22 | var start = System.nanoTime 23 | val b = a(0 to -2, 0 to -2 by 2) 24 | println((System.nanoTime - start) / 1000000000.0) 25 | } 26 | 27 | 28 | 29 | println("START") 30 | memory 31 | val a = Mat.rand[Double](1300, 1300) 32 | println(sum(a)) 33 | memory 34 | 35 | 36 | 37 | { 38 | val start = System.nanoTime 39 | pinv(a) 40 | println((System.nanoTime - start) / 1000000000.0) 41 | } 42 | memory 43 | 44 | println("A") 45 | 46 | { 47 | var start = System.nanoTime 48 | val b = a(0 to -2, 0 to -2) 49 | println((System.nanoTime - start) / 1000000000.0) 50 | 51 | memory 52 | 53 | start = System.nanoTime 54 | pinv(b) 55 | println((System.nanoTime - start) / 1000000000.0) 56 | } 57 | memory 58 |
println("B") 60 | 61 | { 62 | var start = System.nanoTime 63 | val b = a(0 to -2, 0 to -2 by 2) 64 | println((System.nanoTime - start) / 1000000000.0) 65 | 66 | memory 67 | 68 | start = System.nanoTime 69 | pinv(b) 70 | println((System.nanoTime - start) / 1000000000.0) 71 | } 72 | memory 73 | 74 | println("C") 75 | 76 | { 77 | var start = System.nanoTime 78 | val bx = a((0 until a.rows).filter( _ => math.random < 0.5), ::) 79 | println((System.nanoTime - start) / 1000000000.0) 80 | 81 | memory 82 | 83 | start = System.nanoTime 84 | val b = bx.toDenseMatrix 85 | println((System.nanoTime - start) / 1000000000.0) 86 | 87 | memory 88 | 89 | start = System.nanoTime 90 | pinv(b) 91 | println((System.nanoTime - start) / 1000000000.0) 92 | } 93 | memory 94 | } 95 | -------------------------------------------------------------------------------- /src/main/scala/Main.scala: -------------------------------------------------------------------------------- 1 | import _root_.weka.clusterers.SimpleKMeans 2 | import qlearn.dataset._ 3 | 4 | import qlearn.Types._ 5 | import qlearn.loss.numerical.distance.EuclideanDistance 6 | import qlearn.ml 7 | import qlearn.ml.classify.weka.{LogisticRegression, REPTree} 8 | import qlearn.ml.classify.{RandomTree, SimpleKNN, FittedSimpleKNN} 9 | import qlearn.ml.cluster.{weka, KMeans} 10 | import qlearn.ml.meta.Bagging 11 | import qlearn.ml.regress.RidgeRegression 12 | import qlearn.validation.{SameDatasetValidation, CrossValidation} 13 | import qlearn.wekas.WekaClusterer 14 | 15 | object Main extends App { 16 | val ul = Unlabeled(Mat.rand(8, 10).t, Vector('a, 'b, 'c, 'd, 'e, 'f, 'g, 'h)) 17 | val l1 = Nominal('wheel, ul, Vector.fill(4)("axxxxxx") ++ Vector.fill(3)("b") ++ Vector.fill(2)("c") ++ Vector.fill(1)("d")) 18 | val l2 = Nominal('tire, ul, Vector.fill(1)("axyza") ++ Vector.fill(2)("b") ++ Vector.fill(3)("c") ++ Vector.fill(4)("d")) 19 | val lb = Binary('sold, ul, Vector.fill(6)(true) ++ Vector.fill(4)(false)) 20 | val ln = MultiLabeled( 21 | Numerical('num, ul, (1 to 10).map(_.toDouble), EuclideanDistance), 22 | Numerical('num2, ul, (11 to 20).map(_.toDouble)) 23 | ) 24 | 25 | l1.report 26 | FittedSimpleKNN(l1, k = 3).predict(ul).report 27 | 28 | 29 | //val l3 = MultiLabeled(l1, l2, lb) 30 | //l3.report 31 | 32 | //LogisticRegression().fit(l3).predict(ul).report 33 | //REPTree().fit(l3).predict(ul).report 34 | RandomTree(minParent = 5).fit(l1).predict(ul).report 35 | 36 | val mdls = Seq.fill(300)(RandomTree(minParent = 1)) 37 | Bagging(mdls).fit(l1).predict(ul).report 38 | 39 | //RidgeRegression(ridge = 0.0).fit(ln).predict(ul).report 40 | //ml.regress.weka.RidgeRegression(ridge = 0.0).fit(ln).predict(ul).report 41 | 42 | println(CrossValidation[Nominal](l1).validate(LogisticRegression())) 43 | println(SameDatasetValidation[Nominal](l1).validate(SimpleKNN(5))) 44 | 45 | 46 | 47 | val ulc = Unlabeled(Mat.rand(8, 50000).t, Vector('a, 'b, 'c, 'd, 'e, 'f, 'g, 'h)) 48 | println("a") 49 | KMeans(k = 50).cluster(ulc) 50 | println("b") 51 | weka.KMeans(k = 50) 52 | println("c") 53 | new WekaClusterer({ 54 | val c = new SimpleKMeans 55 | c.setNumClusters(50) 56 | c 57 | }).cluster(ulc) 58 | println("x") 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | val rul = Unlabeled(Mat.rand(8, 9).t, Vector('a, 'b, 'c, 'd, 'e, 'f, 'g, 'h)) 70 | val nm = Numerical('num, rul, (1 to 9).map(_.toDouble)) 71 | println(nm.wekaDataset) 72 | RidgeRegression().fit(nm).predict(rul).report 73 | ml.regress.weka.RidgeRegression().fit(nm).predict(rul).report 74 | } 
-------------------------------------------------------------------------------- /src/main/scala/Main2.scala: -------------------------------------------------------------------------------- 1 | import qlearn.dataset.loaders.ArffLoader 2 | import qlearn.ml.cluster.KMeans 3 | 4 | object Main2 extends App { 5 | val data = ArffLoader.unlabeled("datasets/arff/regression/autoPrice.arff") 6 | //val data = ArffLoader.unlabeled("datasets/arff/classification9/anneal.ORIG.arff") 7 | data.report 8 | 9 | KMeans(k = 30).cluster(data).report 10 | } 11 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/Types.scala: -------------------------------------------------------------------------------- 1 | package qlearn 2 | 3 | import breeze.linalg.{*, DenseMatrix, Matrix, DenseVector} 4 | 5 | object Types { 6 | type Vec = DenseVector[Double] 7 | type Mat = DenseMatrix[Double] 8 | 9 | val Vec = DenseVector 10 | val Mat = DenseMatrix 11 | 12 | type IntVec = DenseVector[Int] 13 | type BinVec = DenseVector[Boolean] 14 | 15 | implicit class MatWrapper(m: Mat) { 16 | def r = m(*, ::) 17 | 18 | def c = m(::, *) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/dataset/Binary.scala: -------------------------------------------------------------------------------- 1 | package qlearn.dataset 2 | 3 | import qlearn.Types.{BinVec, Mat, Vec} 4 | import qlearn.dataset.schema.BinaryColumn 5 | import qlearn.loss.Loss 6 | import qlearn.loss.binary.LogisticLoss 7 | import qlearn.loss.nominal.CrossEntropyLoss 8 | import qlearn.util.Util 9 | 10 | case class Binary(x: Unlabeled, yb: Vec, schema: BinaryColumn) extends Nominal { 11 | 12 | val values = Vector("no", "yes") 13 | 14 | lazy val ymat = 15 | Mat(yb.map( v => 16 | Seq(1-v, v) 17 | ).toScalaVector: _*) 18 | 19 | val yt = yb :>= 0.5 20 | 21 | override val y = 22 | yb.map( v => 23 | if (v >= 0.5) 1 else 0 24 | ) 25 | 26 | def updated(xnew: Unlabeled, ynew: Mat): Binary = schema.populate(xnew, ynew) 27 | 28 | /* 29 | Function that writes the dataset to stdout 30 | */ 31 | 32 | val reportHeader = Seq("%9s" format name.name) 33 | 34 | def reportLine(line: Int) = 35 | Seq(Util.printDoubleNicely(yb(line), reportHeader.head.size)) 36 | } 37 | 38 | object Binary { 39 | 40 | def apply(name: Symbol, x: Unlabeled, seq: Seq[Boolean], loss: Loss[Binary] = LogisticLoss): Binary = { 41 | Binary(x, Vec(seq.map { v => 42 | if (v) 1.0 else 0.0 43 | }: _*), BinaryColumn(name, loss)) 44 | } 45 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/dataset/Labeled.scala: -------------------------------------------------------------------------------- 1 | package qlearn.dataset 2 | 3 | import qlearn.Types.Mat 4 | import qlearn.dataset.schema.Column 5 | import qlearn.loss.Loss 6 | 7 | import scala.util.Random 8 | 9 | abstract class Labeled[+T <: Labeled[T]] { 10 | def x: Unlabeled 11 | def xmat = x.xmat 12 | val schema: Column 13 | 14 | def recordCount = x.recordCount 15 | def featureCount = x.featureCount 16 | 17 | def ymat: Mat 18 | 19 | // TODO 20 | //require(ymat.rows == xmat.rows, "Both X and Y have to have the same number of rows") 21 | 22 | def width: Int 23 | 24 | def indices = 0 until recordCount 25 | 26 | def apply(range: Range): T = updated(x(range), ymat(range, ::)) 27 | def apply(index: Int): T = apply(index to index) 28 | def pick(indices: Seq[Int]): T = updated(x.pick(indices), ymat(indices, 
::).toDenseMatrix) 29 | 30 | def updated(xnew: Unlabeled, ynew: Mat): T 31 | 32 | def ++[Q <: Labeled[Q]](that: Q): T = updated(x ++ that.x, Mat.vertcat(ymat, that.ymat)) 33 | 34 | def duplicate: T = updated(x.duplicate, ymat.copy) 35 | 36 | def shuffle: T = 37 | pick( Random.shuffle(0 to recordCount - 1) ) 38 | 39 | /* 40 | Function that writes the dataset to stdout 41 | */ 42 | 43 | def reportHeader: Seq[String] 44 | def reportLine(line: Int): Seq[String] 45 | 46 | def report { 47 | println(x.reportHeader ++ Vector(" ") ++ reportHeader mkString " ") 48 | 49 | indices.foreach { i => 50 | println(x.reportLine(i) ++ Vector(" -> ") ++ reportLine(i) mkString " ") 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/dataset/MultiLabeled.scala: -------------------------------------------------------------------------------- 1 | package qlearn.dataset 2 | 3 | import qlearn.Types.Mat 4 | import qlearn.dataset.schema.Column 5 | import qlearn.loss.Loss 6 | import qlearn.util.Util 7 | 8 | case class MultiLabeled[T <: SingleLabeled[T]](ys: T*) extends Labeled[MultiLabeled[T]] { 9 | require(ys.nonEmpty, "Specify at least one dataset.") 10 | 11 | val x = ys.head.x 12 | 13 | lazy val width = ys.map(_.width).sum 14 | 15 | lazy val ymat = Mat.horzcat(ys.map(_.ymat): _*) 16 | 17 | type Tmp = T 18 | val schema = new Column { 19 | type T = MultiLabeled[Tmp] 20 | 21 | val name = 'MultiToBeRenamed 22 | 23 | val loss = new Loss[T] { 24 | val range = 1.0 -> 2.0 25 | 26 | def apply(a: T, b: T) = 1.0 27 | /*(a.ys, b.ys).zipped.map( 28 | (as, bs) => as.schema.loss(as, bs) 29 | ).sum / a.ys.size*/ 30 | } 31 | 32 | def populate(x: Unlabeled, y: Mat) = 33 | MultiLabeled(ys.zipWithIndex.map { case (y2, i) => 34 | val start = widths(i) 35 | val stop = widths(i + 1) 36 | val slice = y(::, start until stop) 37 | y2.updated(x, slice) 38 | }: _*) 39 | } 40 | 41 | 42 | private val widths = ys.scanLeft(0)(_ + _.width) 43 | 44 | def updated(xnew: Unlabeled, ynew: Mat) = schema.populate(xnew, ynew) 45 | 46 | override def toString = { 47 | val labels = ys.map(_.name.name) mkString ", " 48 | s"Multi(${x.labelString}, $labels)" 49 | } 50 | 51 | /* 52 | Function that writes the dataset to stdout 53 | */ 54 | 55 | lazy val reportHeader = ys.flatMap(_.reportHeader) 56 | 57 | def reportLine(line: Int) = ys.flatMap(_.reportLine(line)) 58 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/dataset/Nominal.scala: -------------------------------------------------------------------------------- 1 | package qlearn.dataset 2 | 3 | import qlearn.Types._ 4 | import qlearn.dataset.schema.NominalColumn 5 | import qlearn.loss.Loss 6 | import qlearn.loss.nominal.CrossEntropyLoss 7 | import weka.core.{Attribute, Instances} 8 | 9 | abstract class Nominal extends SingleLabeled[Nominal] with Product with Serializable { 10 | val x: Unlabeled 11 | val ymat: Mat 12 | def values: Vector[String] 13 | 14 | def width = values.size 15 | 16 | val y: IntVec = 17 | Vec.tabulate(recordCount) { i => 18 | val inner = ymat(i, ::).inner.toArray 19 | val max = inner.max 20 | inner.indexOf(max) 21 | } 22 | 23 | /* 24 | Produce the weka dataset (Instances). 
25 | */ 26 | 27 | lazy val wekaDataset = { 28 | val data = new Instances(x.wekaDataset) 29 | val pos = data.numAttributes 30 | 31 | import collection.JavaConversions._ 32 | data.insertAttributeAt(new Attribute("output", values), pos) 33 | data.setClassIndex(pos) 34 | 35 | indices.foreach { i => 36 | data.instance(i).setValue(pos, values(y(i))) 37 | } 38 | 39 | data 40 | } 41 | } 42 | 43 | object Nominal { 44 | def apply(name: Symbol, x: Unlabeled, y: Seq[String], loss: Loss[Nominal] = CrossEntropyLoss()): NominalBasic = { 45 | val values = y.distinct.toVector 46 | val lookup = values.zipWithIndex.toMap 47 | 48 | NominalBasic(x, Vec.tabulate(y.size)(y andThen lookup), NominalColumn(name, values, loss)) 49 | } 50 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/dataset/NominalBasic.scala: -------------------------------------------------------------------------------- 1 | package qlearn.dataset 2 | 3 | import breeze.linalg.argmax 4 | import qlearn.Types._ 5 | import qlearn.dataset.schema.NominalColumn 6 | 7 | case class NominalBasic(x: Unlabeled, override val y: IntVec, schema: NominalColumn) extends Nominal { 8 | 9 | def values = schema.values 10 | 11 | lazy val ymat = 12 | Mat.tabulate(y.size, values.size) { (r, c) => 13 | if (y(r) == c) 1.0 else 0.0 14 | } 15 | 16 | def updated(xnew: Unlabeled, ynew: Mat) = schema.populate(xnew, ynew) 17 | 18 | def updatedSame(xnew: Unlabeled, ynew: Mat): NominalBasic = { 19 | assert(ynew.cols == values.size) 20 | copy(x = xnew, y = argmax(ynew.r)) 21 | } 22 | 23 | /* 24 | Function that writes the dataset to stdout 25 | */ 26 | 27 | lazy val reportHeader = { 28 | val len = values.map(_.size).max 29 | Seq(s"%${len}s" format name.name) 30 | } 31 | 32 | def reportLine(line: Int) = 33 | Seq(values(y(line)).padTo(reportHeader.head.size, ' ')) 34 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/dataset/NominalFull.scala: -------------------------------------------------------------------------------- 1 | package qlearn.dataset 2 | 3 | import qlearn.Types._ 4 | import qlearn.dataset.schema.NominalColumn 5 | import qlearn.util.Util 6 | 7 | case class NominalFull(x: Unlabeled, ymat: Mat, schema: NominalColumn) extends Nominal { 8 | 9 | def values = schema.values 10 | 11 | def updated(xnew: Unlabeled, ynew: Mat) = schema.populate(xnew, ynew) 12 | 13 | 14 | /* 15 | Function that writes the dataset to stdout 16 | */ 17 | 18 | lazy val reportHeader = 19 | values.map(x => 20 | "%9s" format s"${name.name}=$x" 21 | ) 22 | 23 | def reportLine(line: Int) = { 24 | val vec = ymat(line, ::).inner.toScalaVector 25 | vec.zipWithIndex.map { case (v, i) => 26 | Util.printDoubleNicely(v, reportHeader(i).size) 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/dataset/Numerical.scala: -------------------------------------------------------------------------------- 1 | package qlearn.dataset 2 | 3 | import qlearn.Types._ 4 | import qlearn.dataset.schema.NumericalColumn 5 | import qlearn.loss.Loss 6 | import qlearn.loss.numerical.MeanSquaredLoss 7 | import qlearn.util.Util 8 | import weka.core.{Attribute, Instances} 9 | 10 | case class Numerical(x: Unlabeled, y: Vec, schema: NumericalColumn) extends SingleLabeled[Numerical] { 11 | lazy val ymat = y.toDenseMatrix.t 12 | 13 | val width = 1 14 | 15 | def updated(xnew: Unlabeled, ynew: Mat) = schema.populate(xnew, ynew) 16 | 17 | 18 | /* 19 | 
Function that writes the dataset to stdout 20 | */ 21 | 22 | val reportHeader = Seq("%9s" format name.name) 23 | 24 | def reportLine(line: Int) = 25 | Seq(Util.printDoubleNicely(y(line), reportHeader.head.size)) 26 | 27 | 28 | lazy val wekaDataset = { 29 | val data = new Instances(x.wekaDataset) 30 | val pos = data.numAttributes 31 | data.insertAttributeAt(new Attribute("output"), pos) 32 | data.setClassIndex(pos) 33 | 34 | (0 until y.length).foreach { i => 35 | data.instance(i).setValue(pos, y(i)) 36 | } 37 | 38 | data 39 | } 40 | } 41 | 42 | object Numerical { 43 | def apply(name: Symbol, x: Unlabeled, y: Seq[Double], loss: Loss[Numerical] = MeanSquaredLoss): Numerical = 44 | Numerical(x, Vec(y: _*), NumericalColumn(name, loss)) 45 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/dataset/SingleLabeled.scala: -------------------------------------------------------------------------------- 1 | package qlearn.dataset 2 | 3 | import weka.core.Instances 4 | 5 | abstract class SingleLabeled[+T <: SingleLabeled[T]] extends Labeled[T] { 6 | val name: Symbol = schema.name 7 | 8 | override def toString = s"Single(${x.labelString}, ${name.name})" 9 | 10 | def wekaDataset: Instances 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/dataset/Unlabeled.scala: -------------------------------------------------------------------------------- 1 | package qlearn.dataset 2 | 3 | import qlearn.Types.Mat 4 | import qlearn.util.Util 5 | import weka.core.{DenseInstance, Attribute, Instances} 6 | 7 | case class Unlabeled(xmat: Mat, names: Vector[Symbol]) { 8 | val recordCount = xmat.rows 9 | val featureCount = xmat.cols 10 | 11 | require(recordCount > 0, "Dataset must have at least one record.") 12 | require(names.size == names.distinct.size, "The names have to be unique.") 13 | require(names.size == featureCount, "You have to name every feature.") 14 | 15 | 16 | 17 | def indices = 0 until recordCount 18 | 19 | def apply(range: Range): Unlabeled = copy(xmat = xmat(range, ::)) 20 | def apply(index: Int): Unlabeled = apply(index to index) 21 | def pick(indices: Seq[Int]): Unlabeled = copy(xmat = xmat(indices, ::).toDenseMatrix) 22 | 23 | def ++(that: Unlabeled) = copy(xmat = Mat.vertcat(xmat, that.xmat)) 24 | 25 | def duplicate = copy(xmat = xmat.copy) 26 | 27 | def labelString = s"${recordCount} records, ${featureCount} features" 28 | override def toString = s"Data($labelString)" 29 | 30 | /* 31 | Function that writes the dataset to stdout 32 | */ 33 | 34 | lazy val reportHeader = 35 | names.map("%9s" format _.name) 36 | 37 | def reportLine(line: Int) = { 38 | val vec = xmat(line, ::).inner.toScalaVector 39 | vec.zipWithIndex.map { case (v, i) => 40 | Util.printDoubleNicely(v, reportHeader(i).size) 41 | } 42 | } 43 | 44 | def report { 45 | println(reportHeader mkString " ") 46 | indices.foreach { i => 47 | println(reportLine(i) mkString " ") 48 | } 49 | } 50 | 51 | 52 | 53 | lazy val wekaDataset = { 54 | val attributes = new java.util.ArrayList[Attribute]() 55 | names.foreach { name => 56 | attributes.add(new Attribute(name.name)) 57 | } 58 | 59 | val data = new Instances("Dataset", attributes, 0) 60 | 61 | indices.foreach { i => 62 | data.add(new DenseInstance(1.0, xmat(i, ::).inner.toArray)) 63 | } 64 | 65 | data 66 | } 67 | } 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- 
/src/main/scala/qlearn/dataset/loaders/ArffLoader.scala: -------------------------------------------------------------------------------- 1 | package qlearn.dataset.loaders 2 | 3 | import qlearn.Types.{Vec, Mat} 4 | import qlearn.dataset.schema.{Column, NominalColumn, NumericalColumn} 5 | import qlearn.dataset.{Numerical, Unlabeled} 6 | import qlearn.loss.numerical.MeanSquaredLoss 7 | 8 | import scala.collection.mutable.ArrayBuffer 9 | 10 | /* 11 | ARFF data file loader 12 | 13 | Missing features: 14 | * support for labeled datasets (figure out types) 15 | * support for string and date columns 16 | * support for sparse rows 17 | * support for instance weights (this will probably never be implemented) 18 | 19 | */ 20 | 21 | object ArffLoader extends Loader { 22 | 23 | def removeComment(line: String) = 24 | line.takeWhile(_ != '%') 25 | 26 | def trimLine(line: String) = line.trim 27 | 28 | def isEmptyLine(line: String) = line.isEmpty 29 | 30 | 31 | 32 | object Regex { 33 | // since the structure is really simple, there's no need to bother with parsers 34 | 35 | private val literal = "'?(.*?)'?" 36 | 37 | private val nominal = raw"\{\s*(.*?)\s*\}" 38 | 39 | private val kind = raw"(real|numeric|integer|string|date|relational|$nominal)" 40 | 41 | val name = raw"(?i)@relation\s+$literal\s*".r 42 | 43 | val attribute = raw"(?i)@attribute\s+$literal\s+$kind".r 44 | } 45 | 46 | 47 | 48 | def clean(data: Iterator[String]) = 49 | data.map(removeComment).map(trimLine).filterNot(isEmptyLine) 50 | 51 | def commaSplit(str: String) = str.split(raw"\s*,\s*").toVector 52 | 53 | def parseHeader(data: Iterator[String]) = { 54 | val name = data.next match { 55 | case Regex.name(name) => name 56 | case line => throw ParseError(s"The dataset has to start with @relation, got instead: $line") 57 | } 58 | 59 | val attributes = Stream.continually(data.next).takeWhile(_.toLowerCase != "@data").map { 60 | case Regex.attribute(name, "real" | "numeric" | "integer", _) => NumericalColumn(Symbol(name)) 61 | 62 | case Regex.attribute(name, kind, null) => 63 | throw ParseError(s"An attribute $name of type $kind is currently unsupported") 64 | 65 | case Regex.attribute(name, _, kind) => NominalColumn(Symbol(name), commaSplit(kind)) 66 | 67 | case line => 68 | throw ParseError(s"Proper attribute declaration expected, got instead: $line") 69 | }.toVector 70 | 71 | (name, attributes) 72 | } 73 | 74 | def parseLine(types: Vector[Column])(line: String) = 75 | (commaSplit(line), types).zipped.map { 76 | case ("?", _) => Double.NaN 77 | case (value, col: NumericalColumn) => value.toDouble 78 | case (value, col: NominalColumn) => 79 | col.lookup.get(value) match { 80 | case Some(pos) => pos.toDouble 81 | case _ => throw ParseError(s"Undeclared nominal value: $value") 82 | } 83 | } 84 | 85 | def buildUnlabeled(it: Iterator[Double], names: Vector[Symbol]) = { 86 | val array = it.toArray 87 | val cols = names.size 88 | val rows = array.size / cols 89 | val matrix = new Mat(rows, cols, array, 0, cols, true) 90 | Unlabeled(matrix, names) 91 | } 92 | 93 | 94 | 95 | def unlabeled(data: Iterator[String]) = { 96 | val cleaned = clean(data) 97 | val (name, columns) = parseHeader(cleaned) 98 | val names = columns.map(_.name) 99 | 100 | val it = cleaned.flatMap(parseLine(columns)) 101 | buildUnlabeled(it, names) 102 | } 103 | 104 | def labeled(data: Iterator[String], attribute: Symbol) = { 105 | val cleaned = clean(data) 106 | val (name, columns) = parseHeader(cleaned) 107 | val names = columns.map(_.name) 108 | 109 | val index = 
names.indexOf(attribute) 110 | if (index == -1) 111 | throw ParseError(s"The attribute $attribute was not found in the dataset") 112 | 113 | val y = new ArrayBuffer[Double] 114 | val it = cleaned.flatMap { line => 115 | val data = parseLine(columns)(line) 116 | y.append(data(index)) 117 | data.patch(index, Nil, 1) 118 | } 119 | val x = buildUnlabeled(it, names.patch(index, Nil, 1)) 120 | 121 | Numerical(x, new Vec(y.toArray), NumericalColumn(attribute)) 122 | } 123 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/dataset/loaders/Loader.scala: -------------------------------------------------------------------------------- 1 | package qlearn.dataset.loaders 2 | 3 | import scala.io.Source.fromFile 4 | 5 | import qlearn.dataset.{Numerical, Labeled, Unlabeled} 6 | 7 | abstract class Loader { 8 | 9 | case class ParseError(error: String) extends Exception(error) 10 | 11 | 12 | def unlabeled(data: Iterator[String]): Unlabeled 13 | 14 | def unlabeled(file: String): Unlabeled = unlabeled(fromFile(file).getLines) 15 | 16 | 17 | def labeled(data: Iterator[String], attribute: Symbol): Numerical 18 | 19 | def labeled(file: String, attribute: Symbol): Numerical = labeled(fromFile(file).getLines, attribute) 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/dataset/loaders/TabLoader.scala: -------------------------------------------------------------------------------- 1 | package qlearn.dataset.loaders 2 | 3 | object TabLoader extends Loader { 4 | def unlabeled(data: Iterator[String]) = { 5 | val names = data.next.split("\t") 6 | val columns = names.size 7 | 8 | val types = data.next.split("\t", -1).map(_.trim).map { 9 | case "continuous" | "c" => 1 10 | case "discrete" | "d" => 2 11 | case _ => throw new Exception("Only discrete and continuous variables are supported.") 12 | } 13 | 14 | val flags = data.next.split("\t", -1).map(_.trim) 15 | 16 | require(columns == types.size, "Number of type columns doesn't match.") 17 | require(columns == flags.size, "Number of flag columns doesn't match.") 18 | 19 | var classNames = Vector.fill(columns)(Vector.empty[String]) 20 | 21 | /*val rows = data.map { line => 22 | (line.split("\t"), types, Iterator.from(0)).zipped.map { 23 | case ("?", _, _) => Double.NaN 24 | case (str, 1, _) => str.toDouble 25 | case (str, 2, i) => 26 | val names = classNames(i) 27 | 28 | names.indexOf(str) match { 29 | case -1 => 30 | classNames = classNames.updated(i, names :+ str) 31 | names.size.toDouble 32 | 33 | case j => j.toDouble 34 | } 35 | } 36 | }.toArray*/ 37 | 38 | ??? 39 | } 40 | 41 | def labeled(data: Iterator[String], attribute: Symbol) = ???
42 | } 43 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/dataset/schema/BinaryColumn.scala: -------------------------------------------------------------------------------- 1 | package qlearn.dataset.schema 2 | 3 | import qlearn.Types.Mat 4 | import qlearn.dataset.{Binary, Unlabeled} 5 | import qlearn.loss.Loss 6 | import qlearn.loss.binary.LogisticLoss 7 | 8 | case class BinaryColumn(name: Symbol, loss: Loss[Binary] = LogisticLoss) extends Column { 9 | type T = Binary 10 | 11 | def populate(x: Unlabeled, y: Mat) = { 12 | assert(y.cols == 2) 13 | Binary(x, y(::, 1), this) 14 | } 15 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/dataset/schema/Column.scala: -------------------------------------------------------------------------------- 1 | package qlearn.dataset.schema 2 | 3 | import qlearn.Types.Mat 4 | import qlearn.dataset.{SingleLabeled, Unlabeled} 5 | import qlearn.loss.Loss 6 | 7 | abstract class Column { 8 | type T// <: SingleLabeled[T] 9 | 10 | val loss: Loss[T] 11 | 12 | val name: Symbol 13 | 14 | def populate(x: Unlabeled, y: Mat): T 15 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/dataset/schema/NominalColumn.scala: -------------------------------------------------------------------------------- 1 | package qlearn.dataset.schema 2 | 3 | import qlearn.Types.Mat 4 | import qlearn.dataset.{NominalFull, Nominal, Unlabeled} 5 | import qlearn.loss.Loss 6 | import qlearn.loss.nominal.CrossEntropyLoss 7 | 8 | case class NominalColumn(name: Symbol, values: Vector[String], loss: Loss[Nominal] = CrossEntropyLoss()) extends Column { 9 | type T = Nominal 10 | 11 | val lookup = values.zipWithIndex.toMap 12 | 13 | def populate(x: Unlabeled, y: Mat) = { 14 | assert(y.cols == values.size) 15 | NominalFull(x, y, this) 16 | } 17 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/dataset/schema/NumericalColumn.scala: -------------------------------------------------------------------------------- 1 | package qlearn.dataset.schema 2 | 3 | import qlearn.Types.Mat 4 | import qlearn.dataset.{Numerical, Unlabeled} 5 | import qlearn.loss.Loss 6 | import qlearn.loss.numerical.MeanSquaredLoss 7 | 8 | case class NumericalColumn(name: Symbol, loss: Loss[Numerical] = MeanSquaredLoss) extends Column { 9 | type T = Numerical 10 | 11 | def populate(x: Unlabeled, y: Mat) = { 12 | assert(y.cols == 1) 13 | Numerical(x, y(::, 0), this) 14 | } 15 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/loss/Loss.scala: -------------------------------------------------------------------------------- 1 | package qlearn.loss 2 | 3 | abstract class Loss[-T] { 4 | 5 | /* 6 | The range of the function's scores. 7 | 8 | It always goes from the better score to the worse. 9 | */ 10 | 11 | def range: (Double, Double) 12 | 13 | /* 14 | The principal method to be defined.
15 | */ 16 | 17 | def apply(actual: T, predicted: T): Double 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/loss/binary/F1.scala: -------------------------------------------------------------------------------- 1 | package qlearn.loss.binary 2 | 3 | import qlearn.dataset.Binary 4 | import qlearn.loss.Loss 5 | 6 | object F1 extends Loss[Binary] { 7 | 8 | val range = 1.0 -> 0.0 9 | 10 | def apply(actual: Binary, predicted: Binary) = { 11 | val tp = (actual.yt :& predicted.yt).activeSize.toDouble 12 | val fp = (!actual.yt :& predicted.yt).activeSize 13 | val fn = (actual.yt :& !predicted.yt).activeSize 14 | 15 | val precision = tp / (tp + fp) 16 | val recall = tp / (tp + fn) 17 | 18 | 2 * precision * recall / (precision + recall) 19 | } 20 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/loss/binary/HingeLoss.scala: -------------------------------------------------------------------------------- 1 | package qlearn.loss.binary 2 | 3 | import breeze.stats.mean 4 | import breeze.linalg.max 5 | import qlearn.dataset.Binary 6 | import qlearn.loss.Loss 7 | 8 | object HingeLoss extends Loss[Binary] { 9 | 10 | val range = 0.0 -> Double.PositiveInfinity 11 | 12 | def apply(actual: Binary, predicted: Binary) = { 13 | val m = -actual.ymat :* predicted.ymat + 1.0 14 | mean(max(m, 0.0)) 15 | } 16 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/loss/binary/LogisticLoss.scala: -------------------------------------------------------------------------------- 1 | package qlearn.loss.binary 2 | 3 | import breeze.numerics.{exp, log} 4 | import breeze.stats.mean 5 | import qlearn.dataset.Binary 6 | import qlearn.loss.Loss 7 | 8 | object LogisticLoss extends Loss[Binary] { 9 | 10 | val range = 0.0 -> Double.PositiveInfinity 11 | 12 | def apply(actual: Binary, predicted: Binary) = 13 | mean(log(exp(-actual.ymat :* predicted.ymat) + 1.0)) 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/loss/binary/Precision.scala: -------------------------------------------------------------------------------- 1 | package qlearn.loss.binary 2 | 3 | import qlearn.dataset.Binary 4 | import qlearn.loss.Loss 5 | 6 | object Precision extends Loss[Binary] { 7 | 8 | val range = 1.0 -> 0.0 9 | 10 | def apply(actual: Binary, predicted: Binary) = { 11 | val tp = (actual.yt :& predicted.yt).activeSize.toDouble 12 | val fp = (!actual.yt :& predicted.yt).activeSize 13 | tp / (tp + fp) 14 | } 15 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/loss/binary/Recall.scala: -------------------------------------------------------------------------------- 1 | package qlearn.loss.binary 2 | 3 | import qlearn.dataset.Binary 4 | import qlearn.loss.Loss 5 | 6 | object Recall extends Loss[Binary] { 7 | 8 | val range = 1.0 -> 0.0 9 | 10 | def apply(actual: Binary, predicted: Binary) = { 11 | val tp = (actual.yt :& predicted.yt).activeSize.toDouble 12 | val fn = (actual.yt :& !predicted.yt).activeSize 13 | tp / (tp + fn) 14 | } 15 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/loss/nominal/CrossEntropyLoss.scala: -------------------------------------------------------------------------------- 1 | package qlearn.loss.nominal 2 | 3 | import breeze.linalg.{min, max, sum} 4 | import 
breeze.numerics.log 5 | import qlearn.dataset.Nominal 6 | import qlearn.loss.Loss 7 | 8 | case class CrossEntropyLoss(margin: Double = 1e-15) extends Loss[Nominal] { 9 | 10 | val range = 0.0 -> Double.PositiveInfinity 11 | 12 | def apply(actual: Nominal, predicted: Nominal) = { 13 | val corrected = min(max(predicted.ymat, margin), 1 - margin) 14 | sum(log(corrected) :* actual.ymat) / -actual.recordCount 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/loss/nominal/FractionOfIncorrect.scala: -------------------------------------------------------------------------------- 1 | package qlearn.loss.nominal 2 | 3 | import qlearn.dataset.Nominal 4 | import qlearn.loss.Loss 5 | 6 | object FractionOfIncorrect extends Loss[Nominal] { 7 | 8 | val range = 0.0 -> 1.0 9 | 10 | def apply(actual: Nominal, predicted: Nominal) = 11 | (actual.y :!= predicted.y).activeSize / actual.recordCount.toDouble 12 | } 13 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/loss/nominal/MatrixLoss.scala: -------------------------------------------------------------------------------- 1 | package qlearn.loss.nominal 2 | 3 | import breeze.linalg.sum 4 | import qlearn.dataset.Nominal 5 | import qlearn.loss.Loss 6 | 7 | import qlearn.Types.Mat 8 | 9 | case class MatrixLoss(matrix: (Symbol, Seq[Double])*) extends Loss[Nominal] { 10 | 11 | val range = 0.0 -> Double.PositiveInfinity 12 | 13 | val m = Mat(matrix.sortBy(_._1.name).map(_._2): _*) 14 | 15 | def apply(actual: Nominal, predicted: Nominal) = 16 | sum(actual.ymat * m :* predicted.ymat) / actual.recordCount 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/loss/numerical/MeanAbsoluteLoss.scala: -------------------------------------------------------------------------------- 1 | package qlearn.loss.numerical 2 | 3 | import breeze.stats.mean 4 | import breeze.numerics.abs 5 | import qlearn.dataset.Numerical 6 | import qlearn.loss.Loss 7 | 8 | object MeanAbsoluteLoss extends Loss[Numerical] { 9 | 10 | val range = 0.0 -> Double.PositiveInfinity 11 | 12 | def apply(actual: Numerical, predicted: Numerical) = 13 | mean(abs(actual.y - predicted.y)) 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/loss/numerical/MeanSquaredLoss.scala: -------------------------------------------------------------------------------- 1 | package qlearn.loss.numerical 2 | 3 | import breeze.stats.mean 4 | import qlearn.dataset.Numerical 5 | import qlearn.loss.Loss 6 | 7 | object MeanSquaredLoss extends Loss[Numerical] { 8 | 9 | val range = 0.0 -> Double.PositiveInfinity 10 | 11 | def apply(actual: Numerical, predicted: Numerical) = 12 | mean((actual.y - predicted.y) :^ 2.0) 13 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/loss/numerical/distance/ChebyshevDistance.scala: -------------------------------------------------------------------------------- 1 | package qlearn.loss.numerical.distance 2 | 3 | import breeze.linalg.max 4 | import breeze.numerics.abs 5 | import qlearn.Types._ 6 | 7 | object ChebyshevDistance extends Distance { 8 | 9 | def apply(a: Vec, b: Vec) = max(abs(a - b)) 10 | 11 | override def apply(a: Mat, b: Vec) = 12 | max((a.r - b).r) 13 | 14 | override def apply(a: Mat, b: Mat) = 15 | max(abs(a - b).r) 16 | } 17 | 
-------------------------------------------------------------------------------- /src/main/scala/qlearn/loss/numerical/distance/CosineSimilarity.scala: -------------------------------------------------------------------------------- 1 | package qlearn.loss.numerical.distance 2 | 3 | import breeze.linalg.max 4 | import breeze.numerics.sqrt 5 | import qlearn.Types._ 6 | 7 | object CosineSimilarity extends Distance { 8 | 9 | override val range = 0.0 -> 1.0 10 | 11 | def apply(a: Vec, b: Vec) = { 12 | val sim = dot(a,b) / sqrt(dot(a,a) * dot(b,b)) 13 | 1 - math.max(sim, 0.0) 14 | } 15 | 16 | override def apply(a: Mat, b: Vec) = { 17 | val sim = dot(a,b) / sqrt(dot(a,a) * dot(b,b)) 18 | -max(sim, 0.0) + 1.0 19 | } 20 | 21 | override def apply(a: Mat, b: Mat) = { 22 | val sim = dot(a,b) / sqrt(dot(a,a) :* dot(b,b)) 23 | -max(sim, 0.0) + 1.0 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/loss/numerical/distance/Distance.scala: -------------------------------------------------------------------------------- 1 | package qlearn.loss.numerical.distance 2 | 3 | import breeze.linalg.sum 4 | import qlearn.Types._ 5 | import qlearn.dataset.Numerical 6 | import qlearn.loss.Loss 7 | 8 | abstract class Distance extends Loss[Numerical] { 9 | 10 | def range = 0.0 -> Double.PositiveInfinity 11 | 12 | require(range._1 < range._2, "Smaller distance has to mean better.") 13 | 14 | 15 | def dot(a: Vec, b: Vec) = a.dot(b) 16 | def dot(a: Mat, b: Vec) = a.r.dot(b) 17 | def dot(a: Mat, b: Mat) = sum((a :* b).r) 18 | 19 | 20 | def apply(actual: Numerical, predicted: Numerical) = 21 | apply(actual.y, predicted.y) 22 | 23 | /* 24 | Compute the distance between two vectors. 25 | */ 26 | 27 | def apply(a: Vec, b: Vec): Double 28 | 29 | /* 30 | Compute a vector of distances to all the rows 31 | of the matrix a. 32 | */ 33 | 34 | def apply(a: Mat, b: Vec): Vec = 35 | a.r.map( row => 36 | apply(row, b) 37 | ) 38 | 39 | /* 40 | Compute a vector of distances between the coaligned 41 | rows of both matrices. 42 | */ 43 | 44 | def apply(a: Mat, b: Mat): Vec = 45 | Vec.tabulate(a.rows)( i => 46 | apply(a(i, ::).t, b(i, ::).t) 47 | ) 48 | 49 | /* 50 | For performance reasons, you can also override this method. 51 | It computes the sum of the distances that you obtain by 52 | taking the pairwise rows of both matrices.
53 | */ 54 | 55 | def total(a: Mat, b: Mat): Double = sum(apply(a, b)) 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/loss/numerical/distance/EuclideanDistance.scala: -------------------------------------------------------------------------------- 1 | package qlearn.loss.numerical.distance 2 | 3 | import breeze.linalg.norm 4 | import qlearn.Types._ 5 | 6 | object EuclideanDistance extends Distance { 7 | 8 | def apply(a: Vec, b: Vec) = norm(a - b) 9 | 10 | override def apply(a: Mat, b: Vec) = 11 | norm((a.r - b).r) 12 | 13 | override def apply(a: Mat, b: Mat) = 14 | norm((a - b).r) 15 | } 16 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/loss/numerical/distance/ManhattanDistance.scala: -------------------------------------------------------------------------------- 1 | package qlearn.loss.numerical.distance 2 | 3 | import breeze.linalg.sum 4 | import breeze.numerics.abs 5 | import qlearn.Types._ 6 | 7 | object ManhattanDistance extends Distance { 8 | 9 | def apply(a: Vec, b: Vec) = sum(abs(a - b)) 10 | 11 | override def apply(a: Mat, b: Vec) = sum(abs(a.r - b).r) 12 | 13 | override def apply(a: Mat, b: Mat) = sum(abs(a - b).r) 14 | 15 | override def total(a: Mat, b: Mat) = sum(abs(a - b)) 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/loss/numerical/distance/NormDistance.scala: -------------------------------------------------------------------------------- 1 | package qlearn.loss.numerical.distance 2 | 3 | import breeze.linalg.norm 4 | import qlearn.Types._ 5 | 6 | case class NormDistance(p: Double) extends Distance { 7 | require(p >= 1.0, "P under 1.0 produces a degenerate norm.") 8 | 9 | def apply(a: Vec, b: Vec) = norm(a - b, p) 10 | 11 | override def apply(a: Mat, b: Vec) = 12 | norm((a.r - b).r, p) 13 | 14 | override def apply(a: Mat, b: Mat) = 15 | norm((a - b).r, p) 16 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/loss/numerical/distance/PolyKernel.scala: -------------------------------------------------------------------------------- 1 | package qlearn.loss.numerical.distance 2 | 3 | import breeze.numerics.pow 4 | import qlearn.Types._ 5 | 6 | case class PolyKernel(exponent: Double) extends Distance { 7 | 8 | def apply(a: Vec, b: Vec) = pow(dot(a,b), exponent) 9 | 10 | override def apply(a: Mat, b: Vec) = pow(dot(a,b), exponent) 11 | 12 | override def apply(a: Mat, b: Mat) = pow(dot(a,b), exponent) 13 | } 14 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/loss/numerical/distance/RBFKernel.scala: -------------------------------------------------------------------------------- 1 | package qlearn.loss.numerical.distance 2 | 3 | import breeze.numerics.exp 4 | import qlearn.Types._ 5 | 6 | case class RBFKernel(gamma: Double = 0.01) extends Distance { 7 | 8 | def apply(a: Vec, b: Vec) = { 9 | val dots = dot(a,b)*2.0 - dot(a,a) - dot(b,b) 10 | exp(dots * gamma) 11 | } 12 | 13 | override def apply(a: Mat, b: Vec) = { 14 | val dots = dot(a,b)*2.0 - dot(a,a) - dot(b,b) 15 | exp(dots * gamma) 16 | } 17 | 18 | override def apply(a: Mat, b: Mat) = { 19 | val dots = dot(a,b)*2.0 - dot(a,a) - dot(b,b) 20 | exp(dots * gamma) 21 | } 22 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/Clusterer.scala: 
-------------------------------------------------------------------------------- 1 | package qlearn.ml 2 | 3 | import qlearn.dataset.{NominalBasic, Unlabeled} 4 | 5 | abstract class Clusterer { 6 | 7 | protected def names(k: Int) = (0 until k).map(_.toString).toVector 8 | 9 | def cluster(data: Unlabeled): NominalBasic 10 | } 11 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/FittedModel.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml 2 | 3 | import qlearn.dataset.{Labeled, Unlabeled} 4 | 5 | 6 | abstract class FittedModel[T] { 7 | 8 | /* 9 | We keep the reference to the learning dataset just to know the schema 10 | (e.g. the ordinals of the nominal attributes, etc). 11 | 12 | TODO: get rid of this in the future. The learning dataset should be garbage collected ASAP. 13 | */ 14 | 15 | val schema: T 16 | 17 | /* 18 | The predict() method is the primary one. It returns the resulting dataset. 19 | */ 20 | 21 | def predict(data: Unlabeled): T 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/FittedModelMulti.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml 2 | 3 | import qlearn.dataset.{Unlabeled, SingleLabeled, MultiLabeled} 4 | import qlearn.Types.Mat 5 | 6 | case class FittedModelMulti[T <: SingleLabeled[T]](schema: MultiLabeled[T], models: Seq[FittedModel[T]]) extends FittedModel[MultiLabeled[T]] { 7 | 8 | def predict(data: Unlabeled): MultiLabeled[T] = { 9 | val ynew = Mat.horzcat(models.map(_.predict(data).ymat): _*) 10 | schema.updated(data, ynew) 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/Model.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml 2 | 3 | import qlearn.dataset.{MultiLabeled, SingleLabeled} 4 | 5 | import scala.util.Random 6 | 7 | abstract class Model[T] { 8 | 9 | /* 10 | Models that want to report their learning status back to the user 11 | can let the user supply this function. 12 | */ 13 | 14 | val reporter = { (_: Seq[Double], _: String) => /* no action */ } 15 | 16 | /* 17 | The fit() method is the primary one. It returns the fitted model. 18 | */ 19 | 20 | def fit(data: T): FittedModel[T] 21 | 22 | //def fit(data: MultiLabeled[T]): FittedModel[MultiLabeled[T]] = FittedModelMulti(data, data.ys.map(fit)) 23 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/ModelForTwo.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml 2 | 3 | import qlearn.dataset.Labeled 4 | 5 | 6 | trait ModelForTwo[T <: Labeled[T]] { 7 | 8 | /* 9 | This trait represents the model's ability to learn from the dataset, 10 | which is split in two parts. Of course every model is able to do that 11 | by just concatenating beforehand. However, you should only use this 12 | trait if such a thing can be done effortlessly, without additional 13 | memory or CPU costs. Note, concatenation is an O(n) operation. 14 | 15 | Why is this good? 16 | 17 | For models that implement this, we can avoid copying the dataset K times 18 | on K-fold cross validation.
It is a matter of simply splicing out the current 19 | test fold from the dataset, leaving the training dataset in two separate parts 20 | (well, except for the first and last fold). 21 | */ 22 | 23 | def fit(data1: T, data2: T): FittedModel[T] 24 | 25 | 26 | /*def fit(data: T): FittedModel[T] = 27 | fit(data, data.pick(0 to -1))*/ 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/RandomizedClusterer.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml 2 | 3 | import qlearn.dataset.{Unlabeled, NominalBasic} 4 | 5 | import scala.util.Random 6 | 7 | trait RandomizedClusterer extends Clusterer { 8 | /* 9 | A clusterer should include this trait if its output significantly 10 | depends on the behavior of the random number generator. 11 | */ 12 | 13 | def cluster(data: Unlabeled, seed: Long): NominalBasic 14 | 15 | def cluster(data: Unlabeled) = cluster(data, Random.nextLong) 16 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/RandomizedModel.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml 2 | 3 | import scala.util.Random 4 | 5 | trait RandomizedModel[T] extends Model[T] { 6 | /* 7 | A model should include this trait if its output significantly 8 | depends on the behavior of the random number generator. 9 | */ 10 | 11 | def fit(data: T, seed: Long): FittedModel[T] 12 | 13 | def fit(data: T) = fit(data, Random.nextLong) 14 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/classify/FittedNeuralNetwork.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.classify 2 | 3 | /*import qlearn.dataset.{MultiLabeled, SingleLabeled, Unlabeled} 4 | import qlearn.ml.FittedModel 5 | 6 | case class FittedNeuralNetwork[T <: SingleLabeled[T]](schema: MultiLabeled[T], a: Int) extends FittedModel[MultiLabeled[T]] { 7 | def predict(data: Unlabeled) = schema 8 | }*/ 9 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/classify/FittedRandomTree.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.classify 2 | 3 | import breeze.linalg.sum 4 | import qlearn.Types._ 5 | import qlearn.dataset.{Nominal, Unlabeled} 6 | import qlearn.ml.FittedModel 7 | 8 | case class FittedRandomTree( 9 | schema: Nominal, 10 | tree: BinaryTree 11 | ) extends FittedModel[Nominal] { 12 | 13 | def predict(data: Unlabeled) = { 14 | val newy = data.xmat.r.map { vec => 15 | 16 | def recurse(tree: BinaryTree): Vec = 17 | tree match { 18 | case Leaf(prediction) => prediction / sum(prediction) 19 | case Node(left, right, feature, split) => 20 | recurse( 21 | if (vec(feature) < split) left else right 22 | ) 23 | } 24 | 25 | recurse(tree) 26 | } 27 | schema.updated(data, newy) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/classify/FittedSameDistribution.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.classify 2 | 3 | import qlearn.Types.Vec 4 | import qlearn.dataset.{Unlabeled, Nominal} 5 | import qlearn.ml.FittedModel 6 | 7 | case class FittedSameDistribution(schema: Nominal, prediction: Vec) extends FittedModel[Nominal] { 8 | 9 | def 
predict(data: Unlabeled) = 10 | schema.updated(data, Vec.ones[Double](data.recordCount) * prediction.t) 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/classify/FittedSimpleKNN.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.classify 2 | 3 | import breeze.linalg.sum 4 | import qlearn.Types._ 5 | import qlearn.dataset.{Nominal, Unlabeled} 6 | import qlearn.loss.numerical.distance.{EuclideanDistance, Distance} 7 | import qlearn.ml.FittedModel 8 | import qlearn.util.Util 9 | 10 | case class FittedSimpleKNN( 11 | schema: Nominal, 12 | k: Int, 13 | distance: Distance = EuclideanDistance, 14 | weighting: Double => Double = {_ => 1.0} 15 | ) extends FittedModel[Nominal] { 16 | 17 | /* 18 | This classifier predicts the class distribution probabilities according 19 | to the k nearest records and their actual distances. 20 | 21 | With the weighting function you can specify how much weight each record 22 | receives according to its distance. Some examples: 23 | 24 | {_ => 1.0} : weight all points equally 25 | math.pow(_, -p) : inverse-power dampening for p > 0 26 | ... 27 | */ 28 | 29 | require(k <= schema.recordCount, s"Cannot run $k-NN classifier on a dataset with just ${schema.recordCount} records.") 30 | 31 | def predict(data: Unlabeled) = { 32 | val newy = data.xmat.r.map { record => 33 | val distances = distance(schema.xmat, record) 34 | val smallest = Util.kSmallestIndices(distances, k) 35 | 36 | // select k smallest distances and weight them 37 | val weighted = distances(smallest).map(weighting) 38 | // select k closest corresponding output rows 39 | val chosen = schema.ymat(smallest, ::).toDenseMatrix 40 | // weight those rows with "weighted" vector 41 | val columnSums = sum((chosen.c :* weighted).c).t 42 | // finally, normalize 43 | columnSums / sum(columnSums) 44 | } 45 | schema.updated(data, newy) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/classify/NeuralNetwork.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.classify 2 | 3 | /*import qlearn.util.nnet.Layer 4 | import qlearn.util.nnet.activations.ActivationFunction 5 | import qlearn.dataset.{MultiLabeled, SingleLabeled} 6 | import qlearn.ml.Model 7 | 8 | 9 | case class NeuralNetwork[T <: SingleLabeled[T]]( 10 | layers: Vector[Layer], 11 | lastLevelActivation: ActivationFunction 12 | ) extends Model[MultiLabeled[T]] { 13 | 14 | def fit(data: MultiLabeled[T])= { 15 | FittedNeuralNetwork(data, 10) 16 | } 17 | } 18 | */ -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/classify/RandomForest.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.classify 2 | 3 | import qlearn.dataset.Nominal 4 | import qlearn.ml.{RandomizedModel, Model} 5 | import qlearn.ml.meta.Bagging 6 | 7 | case class RandomForest(tree: RandomTree, count: Int) extends Model[Nominal] with RandomizedModel[Nominal] { 8 | 9 | def fit(data: Nominal, seed: Long) = 10 | Bagging(Seq.fill(count)(tree)).fit(data, seed) 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/classify/RandomTree.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.classify 2 | 3 | import
breeze.linalg.sum 4 | import qlearn.Types._ 5 | import qlearn.dataset.Nominal 6 | import qlearn.ml.{RandomizedModel, Model} 7 | import qlearn.util.Util 8 | 9 | import scala.util.Random 10 | 11 | abstract class BinaryTree 12 | case class Leaf(predict: Vec) extends BinaryTree 13 | case class Node(left: BinaryTree, right: BinaryTree, feature: Int, split: Double) extends BinaryTree 14 | 15 | case class RandomTree( 16 | numFeatures: Option[Int] = None, 17 | maxDepth: Option[Int] = None, 18 | minParent: Int = 1 19 | ) extends Model[Nominal] with RandomizedModel[Nominal] { 20 | 21 | require(minParent >= 1) 22 | 23 | def recurse(data: Nominal, indices: Vector[Int], rand: Random, depth: Int = 0): BinaryTree = { 24 | 25 | /* 26 | Pick a random feature set. 27 | 28 | This is the random portion of this learner, appropriately 29 | named RandomTree. 30 | */ 31 | 32 | def randomFeaturePick = { 33 | val range = 0 until data.featureCount 34 | numFeatures match { 35 | case Some(limit) => Util.randomSubset(range, limit, rand) 36 | case _ => range 37 | } 38 | } 39 | 40 | /* 41 | Compute the entropy of a distribution. 42 | 43 | Unit is nats instead of bits. 44 | */ 45 | 46 | def entropy(v: Vec) = { 47 | val total = sum(v) 48 | math.log(total) - sum(v.map( d => 49 | if (d == 0) 0 else d * math.log(d) 50 | )) / total 51 | } 52 | 53 | /* 54 | What is the best split value for a given feature? 55 | 56 | @returns (best split point, entropy) 57 | */ 58 | 59 | def bestSplitPoint(feature: Int, distribution: Vec): (Double, Double) = { 60 | var bestScore = Double.PositiveInfinity 61 | var best = 0.0 62 | 63 | val sorted = indices.sortBy(data.xmat(_, feature)) 64 | val l = data.ymat(0, ::).t * 0.0 // zero vector with one cell per class 65 | val r = distribution.copy // defensive copy: the in-place -= below must not mutate the caller's vector 66 | sorted.indices.init.foreach { i => 67 | val row = data.ymat(sorted(i), ::).t 68 | l += row 69 | r -= row 70 | 71 | val score = entropy(l) + entropy(r) 72 | if (score < bestScore) { 73 | bestScore = score 74 | best = data.xmat(sorted(i), feature) + data.xmat(sorted(i+1), feature) 75 | } 76 | } 77 | best/2 -> bestScore 78 | } 79 | 80 | /* 81 | Did we reach the stopping condition yet? 82 | 83 | React accordingly.
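A purely illustrative reading of the three conditions checked below: with minParent = 5, a node holding five or fewer records becomes a Leaf; with maxDepth = Some(3), recursion stops once depth reaches 3; and a node whose class-count vector looks like DenseVector(0.0, 7.0, 0.0) has a single non-zero entry, is therefore pure, and becomes a Leaf as well.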
84 | */ 85 | 86 | val distribution = 87 | sum( data.ymat(indices, ::).toDenseMatrix.c ).t 88 | 89 | val shouldStop = 90 | indices.size <= minParent || // no more than minParent records remain 91 | maxDepth.exists(depth >= _) || // the maximum depth has been reached 92 | (distribution :> 0.0).activeSize == 1 // the node is already pure 93 | 94 | if (shouldStop) Leaf(distribution) else { 95 | val (feature, (split, _)) = 96 | randomFeaturePick.map( feature => 97 | feature -> bestSplitPoint(feature, distribution) 98 | ).minBy(_._2._2) 99 | 100 | val (left, right) = indices.partition(data.xmat(_, feature) <= split) 101 | Node( 102 | recurse(data, left, rand, depth + 1), 103 | recurse(data, right, rand, depth + 1), 104 | feature, split 105 | ) 106 | } 107 | } 108 | 109 | def fit(data: Nominal, seed: Long) = FittedRandomTree( 110 | data, recurse(data, data.indices.toVector, new Random(seed)) 111 | ) 112 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/classify/SameDistribution.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.classify 2 | 3 | import breeze.linalg.sum 4 | import qlearn.Types._ 5 | import qlearn.dataset.Nominal 6 | import qlearn.ml.Model 7 | 8 | case class SameDistribution() extends Model[Nominal] { 9 | 10 | def fit(data: Nominal) = { 11 | val columnSums = sum(data.ymat.c).t 12 | FittedSameDistribution(data, columnSums / sum(columnSums)) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/classify/SimpleKNN.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.classify 2 | 3 | import qlearn.dataset.Nominal 4 | import qlearn.loss.numerical.distance.{EuclideanDistance, Distance} 5 | import qlearn.ml.Model 6 | 7 | case class SimpleKNN( 8 | k: Int, 9 | distance: Distance = EuclideanDistance, 10 | weighting: Double => Double = {_ => 1.0} 11 | ) extends Model[Nominal] { 12 | 13 | def fit(data: Nominal) = 14 | FittedSimpleKNN(data, k, distance, weighting) 15 | } 16 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/classify/weka/LogisticRegression.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.classify.weka 2 | 3 | import qlearn.dataset.Nominal 4 | import qlearn.wekas.WekaModel 5 | import weka.classifiers.functions.Logistic 6 | 7 | case class LogisticRegression( 8 | ridge: Double = 1e-8, 9 | maxIterations: Option[Int] = None, 10 | useConjugateGradientDescent: Boolean = false 11 | ) extends WekaModel[Nominal](new Logistic) { 12 | 13 | val m = model.asInstanceOf[Logistic] 14 | 15 | m.setRidge(ridge) 16 | m.setMaxIts(maxIterations.getOrElse(-1)) 17 | m.setUseConjugateGradientDescent(useConjugateGradientDescent) 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/classify/weka/REPTree.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.classify.weka 2 | 3 | import qlearn.dataset.Nominal 4 | import qlearn.ml.RandomizedModel 5 | import qlearn.wekas.WekaModel 6 | import weka.classifiers.trees 7 | 8 | import scala.util.Random 9 | 10 | case class REPTree( 11 | maxDepth: Option[Int] = None, 12 | minInstancesPerLeaf: Int = 2, 13 | pruning: Boolean = true, 14 | seed: Long = Random.nextLong 15 | ) extends
WekaModel[Nominal](new trees.REPTree) { 16 | 17 | val m = model.asInstanceOf[trees.REPTree] 18 | 19 | m.setMaxDepth(maxDepth.getOrElse(-1)) 20 | m.setMinNum(minInstancesPerLeaf) 21 | m.setNoPruning(!pruning) 22 | m.setSeed(seed.toInt) 23 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/cluster/KMeans.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.cluster 2 | 3 | import breeze.linalg.argmin 4 | import qlearn.dataset.schema.NominalColumn 5 | import qlearn.dataset.{NominalBasic, Unlabeled} 6 | import qlearn.Types._ 7 | import qlearn.loss.numerical.distance.{Distance, EuclideanDistance} 8 | import qlearn.strategies.{NoRecentImprovement, Stopping} 9 | import qlearn.util.Util 10 | import qlearn.ml.{RandomizedClusterer, Clusterer} 11 | 12 | import scala.util.Random 13 | 14 | 15 | case class KMeans( 16 | k: Int, 17 | distance: Distance = EuclideanDistance, 18 | strategy: Stopping = NoRecentImprovement(5) 19 | ) extends Clusterer with RandomizedClusterer { 20 | 21 | /* 22 | This is the algorithm that performs k-means clustering via the standard iterative (Lloyd's) approach. 23 | 24 | */ 25 | 26 | 27 | require(k > 1, "Clustering requires at least 2 target clusters.") 28 | 29 | def cluster(data: Unlabeled, seed: Long): NominalBasic = { 30 | require(k <= data.recordCount, "Cannot have more clusters than data points.") 31 | 32 | val mat = data.xmat 33 | var centroids = { 34 | val randomSubset = Util.randomSubset(data.indices, k, new Random(seed)) 35 | mat(randomSubset, ::).toDenseMatrix 36 | } 37 | 38 | def closest(p: Vec): Int = 39 | argmin(distance(centroids, p)) 40 | 41 | val y = strategy.apply { 42 | val updated = Mat.zeros[Double](k, data.featureCount) 43 | val counts = Vec.zeros[Double](k) 44 | 45 | mat.r.foreach { row => 46 | val index = closest(row) 47 | updated(index, ::) += row.t 48 | counts(index) += 1.0 49 | } 50 | 51 | updated.c /= counts 52 | val error = distance.total(centroids, updated) 53 | centroids = updated 54 | 55 | error -> { () => 56 | Vec.tabulate(data.recordCount)( i => closest(mat(i, ::).t) ) 57 | } 58 | } 59 | 60 | NominalBasic(data, y, NominalColumn('cluster, names(k))) 61 | } 62 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/cluster/weka/CobWeb.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.cluster.weka 2 | 3 | import qlearn.ml.RandomizedClusterer 4 | import qlearn.wekas.WekaClusterer 5 | import weka.clusterers.Cobweb 6 | 7 | import scala.util.Random 8 | 9 | case class CobWeb( 10 | acuity: Double = 1.0, 11 | cutoff: Double = 0.002, 12 | seed: Long = Random.nextLong 13 | ) extends WekaClusterer(new Cobweb) { 14 | 15 | val c = clusterer.asInstanceOf[Cobweb] 16 | 17 | c.setAcuity(acuity) 18 | c.setCutoff(cutoff) 19 | c.setSeed(seed.toInt) 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/cluster/weka/ExpectationMaximization.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.cluster.weka 2 | 3 | import qlearn.ml.RandomizedClusterer 4 | import qlearn.wekas.WekaClusterer 5 | import weka.clusterers.EM 6 | 7 | import scala.util.Random 8 | 9 | case class ExpectationMaximization( 10 | k: Option[Int] = None, 11 | folds: Int = 10, 12 | runs: Int = 10, 13 | maxClusters: Option[Int] = None, 14 | maxIterations: Int = 100, 15 | epsForK:
Double = 1e-6, 16 | epsForE: Double = 1e-6, 17 | seed: Long = Random.nextLong 18 | ) extends WekaClusterer(new EM) { 19 | 20 | val c = clusterer.asInstanceOf[EM] 21 | 22 | c.setNumClusters(k.getOrElse(-1)) 23 | c.setNumFolds(folds) 24 | c.setNumKMeansRuns(runs) 25 | c.setMaximumNumberOfClusters(maxClusters.getOrElse(-1)) 26 | c.setMaxIterations(maxIterations) 27 | c.setMinLogLikelihoodImprovementCV(epsForK) 28 | c.setMinLogLikelihoodImprovementIterating(epsForE) 29 | c.setSeed(seed.toInt) 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/cluster/weka/KMeans.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.cluster.weka 2 | 3 | import qlearn.loss.numerical.distance.{Distance, EuclideanDistance} 4 | import qlearn.ml.{RandomizedClusterer, RandomizedModel} 5 | import qlearn.wekas.WekaClusterer 6 | import weka.clusterers.SimpleKMeans 7 | 8 | import scala.util.Random 9 | 10 | case class KMeans( 11 | k: Int, 12 | distance: Distance = EuclideanDistance, 13 | maxIterations: Int = 500, 14 | seed: Long = Random.nextLong 15 | ) extends WekaClusterer(new SimpleKMeans) { 16 | 17 | val c = clusterer.asInstanceOf[SimpleKMeans] 18 | 19 | c.setNumClusters(k) 20 | c.setDistanceFunction(convertDistance(distance)) 21 | c.setMaxIterations(maxIterations) 22 | c.setSeed(seed.toInt) 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/meta/Bagging.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.meta 2 | 3 | import qlearn.dataset.SingleLabeled 4 | import qlearn.ml.{Model, RandomizedModel} 5 | import qlearn.util.Util 6 | 7 | import scala.util.Random 8 | 9 | case class Bagging[T <: SingleLabeled[T]]( 10 | learners: Seq[Model[T]], 11 | bagSizePercentage: Double = 100 12 | ) extends Model[T] with RandomizedModel[T] { 13 | 14 | def fit(data: T, seed: Long) = { 15 | val rand = new Random(seed) 16 | val bagSize = (bagSizePercentage * data.recordCount / 100).round.toInt 17 | 18 | val fitted = learners.map { learner => 19 | val bag = Util.randomWithReplacement(data.indices, bagSize, rand) 20 | learner.fit(data.pick(bag)) 21 | } 22 | 23 | FittedBagging(data, fitted) 24 | } 25 | } 26 | 27 | object Bagging { 28 | def apply[T <: SingleLabeled[T]](learner: Model[T], iterations: Int): Bagging[T] = 29 | Bagging(Seq.fill(iterations)(learner)) 30 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/meta/FittedBagging.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.meta 2 | 3 | import breeze.linalg.sum 4 | import qlearn.dataset.{SingleLabeled, Unlabeled} 5 | import qlearn.ml.FittedModel 6 | 7 | case class FittedBagging[T <: SingleLabeled[T]]( 8 | schema: T, 9 | fitted: Seq[FittedModel[T]] 10 | ) extends FittedModel[T] { 11 | 12 | def predict(data: Unlabeled) = { 13 | val newy = sum(fitted.map(_.predict(data).ymat)) 14 | schema.updated(data, newy / fitted.size.toDouble) 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/meta/FittedOneVsAll.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.meta 2 | 3 | import breeze.linalg.normalize 4 | import qlearn.dataset.{Binary, Nominal, Unlabeled} 5 | import qlearn.ml.FittedModel 6 | import 
qlearn.Types._ 7 | 8 | case class FittedOneVsAll( 9 | schema: Nominal, 10 | fitted: Seq[FittedModel[Binary]] 11 | ) extends FittedModel[Nominal] { 12 | 13 | def predict(data: Unlabeled) = { 14 | val newy = Mat(fitted.map(_.predict(data).yb.toArray): _*).t 15 | schema.updated(data, normalize(newy.r, 1)) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/meta/OneVsAll.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.meta 2 | 3 | import qlearn.dataset.schema.BinaryColumn 4 | import qlearn.dataset.{Binary, Nominal} 5 | import qlearn.loss.binary.LogisticLoss 6 | import qlearn.ml.Model 7 | 8 | case class OneVsAll(learner: Model[Binary]) extends Model[Nominal] { 9 | 10 | private def binarize(data: Nominal) = 11 | (0 until data.ymat.cols).map { klass => // one binary problem per class 12 | val yb = data.y.map { v => 13 | if (v == klass) 1.0 else 0.0 14 | } 15 | Binary(data.x, yb, BinaryColumn(data.name)) 16 | } 17 | 18 | def fit(data: Nominal) = 19 | FittedOneVsAll(data, binarize(data).map(learner.fit)) 20 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/regress/FittedRidgeRegression.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.regress 2 | 3 | import qlearn.Types.Mat 4 | import qlearn.dataset.{MultiLabeled, SingleLabeled, Unlabeled} 5 | import qlearn.ml.FittedModel 6 | 7 | case class FittedRidgeRegression[T <: SingleLabeled[T]](schema: T, coef: Mat) extends FittedModel[T] { 8 | 9 | def predict(data: Unlabeled) = schema.updated(data, data.xmat * coef) 10 | } 11 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/regress/FittedRidgeRegressionMulti.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.regress 2 | 3 | import qlearn.Types.Mat 4 | import qlearn.dataset.{MultiLabeled, SingleLabeled, Unlabeled} 5 | import qlearn.ml.FittedModel 6 | 7 | case class FittedRidgeRegressionMulti[T <: SingleLabeled[T]](schema: MultiLabeled[T], coef: Mat) extends FittedModel[MultiLabeled[T]] { 8 | 9 | def predict(data: Unlabeled) = schema.updated(data, data.xmat * coef) 10 | } 11 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/regress/RidgeRegression.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.regress 2 | 3 | import breeze.linalg.{inv, pinv} 4 | import qlearn.dataset.{Labeled, Numerical, MultiLabeled, SingleLabeled} 5 | import qlearn.Types.Mat 6 | import qlearn.ml.Model 7 | 8 | case class RidgeRegression(ridge: Double = 0) extends Model[Numerical] { 9 | require(ridge >= 0, "The ridge parameter must be non-negative") 10 | 11 | private[this] def inverse[T <: Labeled[T]](data: T) = 12 | if (ridge != 0) { 13 | // less performant case 14 | val x = data.xmat 15 | val xt = x.t 16 | val diagonal = Mat.eye[Double](data.width) * ridge 17 | // TODO: Solve?
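// Ridge normal equations: the coefficients solve (X'X + ridge*I) w = X'y, so
// the factor computed below is (X'X + ridge*I)^-1 * X'. A hedged sketch for
// the TODO above (an assumption, not the current implementation): breeze can
// solve the system directly instead of forming the inverse, which is
// numerically safer:
//   val coef = (xt * x + diagonal) \ (xt * data.ymat)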
18 | inv(xt * x + diagonal) * xt // note: the ridge term is added, not subtracted 19 | } else pinv(data.xmat) 20 | 21 | def fit(data: Numerical) = 22 | FittedRidgeRegression(data, inverse(data) * data.ymat) 23 | 24 | /*override def fit(data: MultiLabeled[Numerical]) = 25 | FittedRidgeRegressionMulti(data, inverse(data) * data.ymat)*/ 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/ml/regress/weka/RidgeRegression.scala: -------------------------------------------------------------------------------- 1 | package qlearn.ml.regress.weka 2 | 3 | import qlearn.dataset.Numerical 4 | import qlearn.wekas.WekaModel 5 | import weka.classifiers.functions.LinearRegression 6 | 7 | case class RidgeRegression( 8 | ridge: Double = 0, 9 | eliminateColinear: Boolean = false, 10 | conserveMemory: Boolean = false 11 | ) extends WekaModel[Numerical](new LinearRegression) { 12 | 13 | val m = model.asInstanceOf[LinearRegression] 14 | 15 | m.setRidge(ridge) 16 | m.setEliminateColinearAttributes(eliminateColinear) 17 | m.setMinimal(conserveMemory) 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/strategies/NoRecentImprovement.scala: -------------------------------------------------------------------------------- 1 | package qlearn.strategies 2 | 3 | case class NoRecentImprovement(n: Int, maxIterations: Int = 500) extends Stopping { 4 | def apply[T](a: => (Double, () => T)) = { 5 | val call = a 6 | 7 | var best = call._2() 8 | var bestError = call._1 9 | var bestI = 1 10 | 11 | var i = 1 12 | while(i < maxIterations && i - bestI < n) { 13 | i += 1 14 | 15 | val (error, thunk) = a 16 | println(s"Error: $error") 17 | if (error < bestError) { 18 | best = thunk() 19 | bestError = error 20 | bestI = i 21 | } else { 22 | println("* result NOT improved") 23 | } 24 | } 25 | 26 | best 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/strategies/Stopping.scala: -------------------------------------------------------------------------------- 1 | package qlearn.strategies 2 | 3 | abstract class Stopping { 4 | def apply[T](a: => (Double, () => T)): T 5 | } 6 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/util/Util.scala: -------------------------------------------------------------------------------- 1 | package qlearn.util 2 | 3 | import qlearn.Types.Vec 4 | 5 | import scala.util.Random 6 | 7 | object Util { 8 | val superScript = Vector('⁰', '¹', '²', '³', '⁴', '⁵', '⁶', '⁷', '⁸', '⁹') 9 | 10 | def toSuperScript(n: Int) = n.toString.map(_.asDigit).map(superScript).mkString 11 | 12 | def printDoubleNicely(num: Double, places: Int = 8) = { 13 | val str = s"%.${places - 1}f" format num take (places + 1) 14 | if (str == "NaN") { 15 | "_" * places 16 | } else if (str.startsWith("0.000") && num != 0 || str.startsWith("-0.000") || !str.contains('.')) { 17 | val Array(mant, rawExp) = s"%.${places}e" format num split "e\\+?"
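// e.g. "%.8e" format 123456.789 yields "1.23456789e+05", which the split above
// turns into mant = "1.23456789" and rawExp = "05"; negative exponents keep their sign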
18 | val exp = rawExp.toInt 19 | 20 | val len = places - exp.toString.size - 1 21 | s"%.${len}se%s" format (mant, exp) 22 | } else str.init 23 | } 24 | 25 | 26 | def randomSubset[T](items: IndexedSeq[T], k: Int, rnd: Random = Random) = { 27 | val n = items.size 28 | require(n >= k, s"Not enough elements: Cannot select $k from $n") 29 | 30 | var res = List.empty[T] 31 | (n - k until n).foreach { i => 32 | val pos = rnd.nextInt(i+1) 33 | val item = items(pos) 34 | res ::= (if (res contains item) items(i) else item) 35 | } 36 | res 37 | } 38 | 39 | def randomWithReplacement[T](items: IndexedSeq[T], k: Int, rnd: Random = Random) = 40 | Vector.fill(k)( 41 | items(rnd.nextInt(items.size)) 42 | ) 43 | 44 | 45 | def kSmallestIndices(items: Vec, k: Int): IndexedSeq[Int] = { 46 | val vec = items.toScalaVector 47 | vec.indices.sortBy(vec).take(k) 48 | } 49 | 50 | def kthSmallestElement(items: Vec, k: Int): Double = { 51 | ??? 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/util/decisionStump/DecisionStump.scala: -------------------------------------------------------------------------------- 1 | package qlearn.util.decisionStump 2 | 3 | class DecisionStump { 4 | 5 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/util/nnet/Layer.scala: -------------------------------------------------------------------------------- 1 | package qlearn.util.nnet 2 | 3 | import qlearn.util.nnet.activations.ActivationFunction 4 | 5 | case class Layer(size: Int, activation: ActivationFunction) { 6 | require(size > 0, "Layer must have at least one neuron.") 7 | } 8 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/util/nnet/activations/ActivationFunction.scala: -------------------------------------------------------------------------------- 1 | package qlearn.util.nnet.activations 2 | 3 | import qlearn.Types.Vec 4 | 5 | abstract class ActivationFunction { 6 | val min: Double 7 | val max: Double 8 | 9 | def compute(x: Vec): Vec 10 | } 11 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/util/nnet/activations/Eliott.scala: -------------------------------------------------------------------------------- 1 | package qlearn.util.nnet.activations 2 | 3 | import breeze.numerics._ 4 | import qlearn.Types.Vec 5 | 6 | // Elliott, D.L. 
"A better activation function for artificial neural networks", 1993 7 | // http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.46.7204&rep=rep1&type=pdf 8 | case class Eliott(s: Double = 1.0) extends ActivationFunction { 9 | val min = 0.0 10 | val max = 1.0 11 | 12 | def compute(x: Vec) = { 13 | val tmp = x :* s 14 | tmp :/ 2.0 :/ (abs(tmp) :+ 1.0) :+ 0.5 15 | } 16 | 17 | def gradient(x: Vec, v: Vec) = { 18 | val tmp = abs(x :* s) :+ 1.0 19 | (tmp :^ -2.0) :* (s/2) 20 | } 21 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/util/nnet/activations/EliottSym.scala: -------------------------------------------------------------------------------- 1 | package qlearn.util.nnet.activations 2 | 3 | import breeze.numerics._ 4 | import qlearn.Types.Vec 5 | 6 | case class EliottSym(s: Double = 1.0) extends ActivationFunction { 7 | val min = -1.0 8 | val max = 1.0 9 | 10 | def compute(x: Vec) = { 11 | val tmp = x :* s 12 | tmp :/ (abs(tmp) + 1.0) 13 | } 14 | 15 | def gradient(x: Vec, v: Vec) = { 16 | val tmp = abs(x :* s) :+ 1.0 17 | (tmp :^ -2.0) :* s 18 | } 19 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/util/nnet/activations/Sigmoid.scala: -------------------------------------------------------------------------------- 1 | package qlearn.util.nnet.activations 2 | 3 | import breeze.numerics._ 4 | import qlearn.Types.Vec 5 | 6 | object Sigmoid extends ActivationFunction { 7 | val min = 0.0 8 | val max = 1.0 9 | 10 | def compute(x: Vec) = (exp(-x) :+ 1.0) :^ -1.0 11 | 12 | def gradient(x: Vec, v: Vec) = v :* (-v :+ 1.0) 13 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/util/nnet/activations/Tanh.scala: -------------------------------------------------------------------------------- 1 | package qlearn.util.nnet.activations 2 | 3 | import breeze.numerics._ 4 | import qlearn.Types.Vec 5 | 6 | object Tanh extends ActivationFunction { 7 | val min = -1.0 8 | val max = 1.0 9 | 10 | def compute(x: Vec) = tanh(x) 11 | 12 | def gradient(x: Vec, v: Vec) = -(v :^ 2.0) :+ 1.0 13 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/validation/CrossValidation.scala: -------------------------------------------------------------------------------- 1 | package qlearn.validation 2 | 3 | import qlearn.dataset.{SingleLabeled, Nominal, Labeled} 4 | import qlearn.ml.Model 5 | 6 | case class CrossValidation[T <: Labeled[T]](data: T, folds: Int = 10, stratify: Boolean = true) extends Validation[T] { 7 | 8 | /* 9 | This is a special, very optimized k-fold cross validation engine. 10 | 11 | The memory usage does not grow linearly with the number of folds, but is a constant. The dataset 12 | is copied at most 2 times, even if you do 100-fold cross validation. 13 | 14 | Space complexity: at most 2 * N 15 | Time complexity: O(N) 16 | */ 17 | 18 | require(folds >= 3, "The number of folds has to be at least three.") 19 | require(folds <= data.recordCount, "The number of folds cannot exceed the number of records.") 20 | 21 | /* 22 | An array that assigns the continuous range to each fold. All folds are the same size, 23 | except if the dataset is not evenly divisible by the number of folds -- in that case, 24 | folds at the begining have one record more. 
25 | */ 26 | 27 | val ranges: Vector[Range] = { 28 | val div = data.recordCount / folds 29 | val mod = data.recordCount % folds 30 | val a = (0 to mod).map(_ * (div+1)) 31 | val b = (mod+1 to folds).map(_ * div + mod) 32 | (a ++ b).sliding(2).map { 33 | case Seq(first, last) => first until last 34 | }.toVector 35 | } 36 | 37 | val rangesTwice = ranges ++ ranges.map { range => 38 | range.start + data.recordCount to range.last + data.recordCount 39 | } 40 | 41 | def multiRange(a: Int, b: Int) = rangesTwice(a).start to rangesTwice(b-1).last 42 | 43 | /* 44 | An intelligent algorithm that ensures the classes in each individual fold are as evenly distributed 45 | as possible. 46 | */ 47 | 48 | def doStratify(data: Nominal) = { 49 | val classOf = data.indices.groupBy(data.y(_)) 50 | val (names, counts) = classOf.mapValues(_.size).toVector.sortBy(_._2).unzip 51 | 52 | val remaining = counts.toArray 53 | 54 | val rangeIndices = ranges.map { range => 55 | var size = range.size 56 | var curn = data.recordCount - range.min 57 | 58 | remaining.indices.flatMap { i => 59 | val chose = (1.0 * remaining(i) * size / curn).round.toInt 60 | size -= chose 61 | curn -= remaining(i) 62 | remaining(i) -= chose 63 | 64 | classOf(names(i)).dropRight( remaining(i) ).takeRight(chose) 65 | } 66 | } 67 | 68 | val indices = (rangeIndices ++ rangeIndices.dropRight(2)).flatten 69 | data.pick(indices) 70 | } 71 | 72 | /* 73 | Main methods. 74 | */ 75 | 76 | val dataset = 77 | (data, stratify) match { 78 | case (d: Nominal, true) => doStratify(d).asInstanceOf[T] 79 | case _ => data ++ data(multiRange(0, folds - 2)) 80 | } 81 | 82 | val learning = 83 | Vector.tabulate(folds) { fold => 84 | dataset(multiRange(fold, fold + folds - 1)) 85 | } 86 | 87 | val testing = ranges.map(dataset.apply) 88 | 89 | def validate(model: Model[T]) = 90 | (0 until folds).map { fold => 91 | val prediction = model.fit(learning(fold)).predict(testing(fold).x) 92 | loss(testing(fold), prediction) 93 | }.sum / folds 94 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/validation/LeaveOneOut.scala: -------------------------------------------------------------------------------- 1 | package qlearn.validation 2 | 3 | import qlearn.dataset.Labeled 4 | import qlearn.ml.Model 5 | 6 | case class LeaveOneOut[T <: Labeled[T]](data: T) extends Validation[T] { 7 | 8 | /* 9 | This validator mutates the dataset; therefore, we have to make a defensive copy 10 | prior to each validation. So, in case you are using multiple threads (or just want 11 | to trigger the garbage collector less often), use the CrossValidation validator with 12 | folds = recordCount. It uses twice as much memory once on initialization, but 13 | then you can make as many validations as you need, even concurrently, without 14 | any additional memory usage.
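A minimal sketch of that alternative, using the CrossValidation class from this package (illustrative only):

  val looError = CrossValidation(data, folds = data.recordCount).validate(model)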
15 | 16 | Space complexity: N * number of threads 17 | Time complexity: O(1) - no preprocessing 18 | */ 19 | 20 | def validate(model: Model[T]) = { 21 | val copy = data.duplicate(data.indices.init) 22 | 23 | def compare(excluded: T) = { 24 | val prediction = model.fit(copy).predict(excluded.x) 25 | loss(excluded, prediction) 26 | } 27 | 28 | val last = compare(data(-1)) 29 | 30 | val init = copy.indices.map { i => 31 | val excluded = data(i) 32 | 33 | copy.xmat(i, ::) := excluded.xmat(0, ::) 34 | copy.ymat(i, ::) := excluded.ymat(0, ::) 35 | 36 | compare(excluded) 37 | }.sum 38 | 39 | (init + last) / data.recordCount 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/validation/PercentageSplit.scala: -------------------------------------------------------------------------------- 1 | package qlearn.validation 2 | 3 | import qlearn.dataset.{Labeled, Nominal} 4 | import qlearn.ml.Model 5 | 6 | case class PercentageSplit[T <: Labeled[T]](data: T, percentage: Double = 70, stratify: Boolean = true) extends Validation[T] { 7 | 8 | /* 9 | Space complexity: N if stratified, O(1) otherwise 10 | */ 11 | 12 | val splitPoint = (data.recordCount / 100.0 * percentage).round.toInt 13 | 14 | require(splitPoint > 0, "There has to be at least one record in the learning dataset.") 15 | require(splitPoint < data.recordCount, "There has to be at least one record in the validation dataset.") 16 | 17 | /* 18 | An intelligent algorithm that ensures the classes in both splits are as evenly distributed 19 | as possible. 20 | */ 21 | 22 | def doStratify(data: Nominal) = { 23 | val classOf = data.indices.groupBy(data.y(_)) 24 | val (names, counts) = classOf.mapValues(_.size).toVector.sortBy(_._2).unzip 25 | 26 | var size = splitPoint 27 | var curn = data.recordCount 28 | 29 | val (a, b) = (counts, names).zipped.map { (count, name) => 30 | val chose = (1.0 * count * size / curn).round.toInt 31 | size -= chose 32 | curn -= count 33 | 34 | classOf(name).splitAt(chose) 35 | }.unzip 36 | 37 | data.pick((a ++ b).flatten) 38 | } 39 | 40 | /* 41 | Main methods. 42 | */ 43 | 44 | val dataset = 45 | (data, stratify) match { 46 | case (d: Nominal, true) => doStratify(d).asInstanceOf[T] 47 | case _ => data 48 | } 49 | 50 | val learning = dataset(0 until splitPoint) 51 | 52 | val testing = dataset(splitPoint to -1) 53 | 54 | def validate(model: Model[T]) = { 55 | val prediction = model.fit(learning).predict(testing.x) 56 | loss(testing, prediction) 57 | } 58 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/validation/SameDatasetValidation.scala: -------------------------------------------------------------------------------- 1 | package qlearn.validation 2 | 3 | import qlearn.dataset.Labeled 4 | import qlearn.ml.Model 5 | 6 | case class SameDatasetValidation[T <: Labeled[T]](data: T) extends Validation[T] { 7 | 8 | /* 9 | Simple validation on the same dataset that the model learned from.
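Note that this estimate is optimistically biased: a 1-nearest-neighbour classifier, for instance, will typically achieve a near-perfect score here, because every record is its own nearest neighbour. An illustrative sketch (assuming a Nominal dataset):

  val trainingError = SameDatasetValidation(data).validate(SimpleKNN(k = 1))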
10 | 11 | Space complexity: O(1) 12 | Time complexity: O(1) 13 | */ 14 | 15 | def validate(model: Model[T]) = { 16 | val prediction = model.fit(data).predict(data.x) 17 | loss(data, prediction) 18 | } 19 | } -------------------------------------------------------------------------------- /src/main/scala/qlearn/validation/Validation.scala: -------------------------------------------------------------------------------- 1 | package qlearn.validation 2 | 3 | import qlearn.dataset._ 4 | import qlearn.ml.Model 5 | 6 | abstract class Validation[T <: Labeled[T]] { 7 | def data: T 8 | 9 | // TODO: ugliest hack in the world 10 | // T should have a loss method, then this can be commented out 11 | // see my stackoverflow questions 12 | def loss(a: T, b: T): Double = data match { 13 | case x: Numerical => x.schema.loss(a.asInstanceOf[Numerical], b.asInstanceOf[Numerical]) 14 | case x: Binary => x.schema.loss(a.asInstanceOf[Binary], b.asInstanceOf[Binary]) 15 | case x: NominalBasic => x.schema.loss(a.asInstanceOf[Nominal], b.asInstanceOf[Nominal]) 16 | case x: NominalFull => x.schema.loss(a.asInstanceOf[Nominal], b.asInstanceOf[Nominal]) 17 | } 18 | 19 | def validate(model: Model[T]): Double 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/wekas/WekaClusterer.scala: -------------------------------------------------------------------------------- 1 | package qlearn.wekas 2 | 3 | import qlearn.dataset.schema.NominalColumn 4 | import qlearn.dataset.{NominalBasic, Unlabeled} 5 | import qlearn.Types.Vec 6 | import qlearn.loss.numerical.distance._ 7 | import qlearn.ml.Clusterer 8 | import weka.clusterers.{Clusterer => ClustererW, AbstractClusterer} 9 | 10 | class WekaClusterer(val clusterer: ClustererW) extends Clusterer { 11 | 12 | /* 13 | If Weka has a native implementation of our distance, 14 | use theirs. Otherwise, wrap ours with the WekaDistance wrapper. 15 | */ 16 | 17 | protected def convertDistance(dist: Distance) = dist match { 18 | case EuclideanDistance => new weka.core.EuclideanDistance 19 | case ManhattanDistance => new weka.core.ManhattanDistance 20 | case ChebyshevDistance => new weka.core.ChebyshevDistance 21 | case NormDistance(p) => 22 | val tmp = new weka.core.MinkowskiDistance 23 | tmp.setOrder(p) 24 | tmp 25 | 26 | case _ => WekaDistance(dist) 27 | } 28 | 29 | /* 30 | The main clustering method. 31 | */ 32 | 33 | def cluster(data: Unlabeled) = { 34 | val copy = AbstractClusterer.makeCopy(clusterer) 35 | val instances = data.wekaDataset 36 | copy.buildClusterer(instances) 37 | 38 | val y = Vec.tabulate(data.recordCount)( i => 39 | copy.clusterInstance( instances.instance(i) ) 40 | ) 41 | 42 | val k = copy.numberOfClusters 43 | NominalBasic(data, y, NominalColumn('cluster, names(k))) 44 | } 45 | 46 | override def toString = s"Weka${clusterer.getClass.getSimpleName}()" 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/wekas/WekaDistance.scala: -------------------------------------------------------------------------------- 1 | package qlearn.wekas 2 | 3 | import qlearn.loss.numerical.distance.Distance 4 | import weka.core.DistanceFunction 5 | 6 | case class WekaDistance(dist: Distance) extends DistanceFunction { 7 | 8 | // TODO: Implement all these crazy Weka methods 9 | 10 | // Members declared in weka.core.DistanceFunction 11 | def clean(): Unit = ???
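// A possible sketch for the essential method among the stubs below (an
// assumption, not yet wired in): convert both instances to dense vectors and
// delegate to our own Distance:
//
//   def distance(a: weka.core.Instance, b: weka.core.Instance): Double =
//     dist(new breeze.linalg.DenseVector(a.toDoubleArray),
//          new breeze.linalg.DenseVector(b.toDoubleArray))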
12 | def distance(x$1: weka.core.Instance,x$2: weka.core.Instance,x$3: Double,x$4: weka.core.neighboursearch.PerformanceStats): Double = ??? 13 | def distance(x$1: weka.core.Instance,x$2: weka.core.Instance,x$3: Double): Double = ??? 14 | def distance(x$1: weka.core.Instance,x$2: weka.core.Instance,x$3: weka.core.neighboursearch.PerformanceStats): Double = ??? 15 | def distance(x$1: weka.core.Instance,x$2: weka.core.Instance): Double = ??? 16 | def getAttributeIndices(): String = ??? 17 | def getInstances(): weka.core.Instances = ??? 18 | def getInvertSelection(): Boolean = ??? 19 | def postProcessDistances(x$1: Array[Double]): Unit = ??? 20 | def setAttributeIndices(x$1: String): Unit = ??? 21 | def setInstances(x$1: weka.core.Instances): Unit = ??? 22 | def setInvertSelection(x$1: Boolean): Unit = ??? 23 | def update(x$1: weka.core.Instance): Unit = ??? 24 | 25 | // Members declared in weka.core.OptionHandler 26 | def getOptions(): Array[String] = ??? 27 | def listOptions(): java.util.Enumeration[weka.core.Option] = ??? 28 | def setOptions(x$1: Array[String]): Unit = ??? 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/wekas/WekaFittedModel.scala: -------------------------------------------------------------------------------- 1 | package qlearn.wekas 2 | 3 | import qlearn.Types.Mat 4 | import qlearn.dataset.{SingleLabeled, Unlabeled, MultiLabeled} 5 | import qlearn.ml.FittedModel 6 | import weka.classifiers.Classifier 7 | 8 | class WekaFittedModel[T <: SingleLabeled[T]](val schema: T, predictor: Classifier) extends FittedModel[T] { 9 | 10 | def predict(data: Unlabeled) = { 11 | val instances = data.wekaDataset 12 | val mat = Mat((0 until instances.numInstances).map { i => 13 | instances.instance(i).setDataset( schema.wekaDataset ) 14 | predictor.distributionForInstance(instances.instance(i)) 15 | }: _*) 16 | schema.updated(data, mat) 17 | } 18 | 19 | 20 | override def toString = s"WekaFitted${predictor.getClass.getSimpleName}()" 21 | 22 | def report { 23 | println(predictor) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/qlearn/wekas/WekaModel.scala: -------------------------------------------------------------------------------- 1 | package qlearn.wekas 2 | 3 | import qlearn.dataset.{MultiLabeled, SingleLabeled} 4 | import qlearn.ml.Model 5 | import weka.classifiers.{AbstractClassifier, Classifier} 6 | 7 | class WekaModel[T <: SingleLabeled[T]](val model: Classifier) extends Model[T] { 8 | 9 | def fit(data: T) = 10 | new WekaFittedModel(data, { 11 | val copy = AbstractClassifier.makeCopy(model) 12 | copy.buildClassifier(data.wekaDataset) 13 | copy 14 | }) 15 | 16 | override def toString = s"Weka${model.getClass.getSimpleName}()" 17 | } 18 | --------------------------------------------------------------------------------