├── .gitignore
├── README.md
├── build.sbt
├── project
    ├── build.properties
    └── plugins.sbt
└── src
    └── main
        ├── java
            └── META-INF
            │   └── MANIFEST.MF
        └── scala
            ├── Experiment.scala
            ├── Main.scala
            ├── Main2.scala
            └── qlearn
                ├── Types.scala
                ├── dataset
                    ├── Binary.scala
                    ├── Labeled.scala
                    ├── MultiLabeled.scala
                    ├── Nominal.scala
                    ├── NominalBasic.scala
                    ├── NominalFull.scala
                    ├── Numerical.scala
                    ├── SingleLabeled.scala
                    ├── Unlabeled.scala
                    ├── loaders
                    │   ├── ArffLoader.scala
                    │   ├── Loader.scala
                    │   └── TabLoader.scala
                    └── schema
                    │   ├── BinaryColumn.scala
                    │   ├── Column.scala
                    │   ├── NominalColumn.scala
                    │   └── NumericalColumn.scala
                ├── loss
                    ├── Loss.scala
                    ├── binary
                    │   ├── F1.scala
                    │   ├── HingeLoss.scala
                    │   ├── LogisticLoss.scala
                    │   ├── Precision.scala
                    │   └── Recall.scala
                    ├── nominal
                    │   ├── CrossEntropyLoss.scala
                    │   ├── FractionOfIncorrect.scala
                    │   └── MatrixLoss.scala
                    └── numerical
                    │   ├── MeanAbsoluteLoss.scala
                    │   ├── MeanSquaredLoss.scala
                    │   └── distance
                    │       ├── ChebyshevDistance.scala
                    │       ├── CosineSimilarity.scala
                    │       ├── Distance.scala
                    │       ├── EuclideanDistance.scala
                    │       ├── ManhattanDistance.scala
                    │       ├── NormDistance.scala
                    │       ├── PolyKernel.scala
                    │       └── RBFKernel.scala
                ├── ml
                    ├── Clusterer.scala
                    ├── FittedModel.scala
                    ├── FittedModelMulti.scala
                    ├── Model.scala
                    ├── ModelForTwo.scala
                    ├── RandomizedClusterer.scala
                    ├── RandomizedModel.scala
                    ├── classify
                    │   ├── FittedNeuralNetwork.scala
                    │   ├── FittedRandomTree.scala
                    │   ├── FittedSameDistribution.scala
                    │   ├── FittedSimpleKNN.scala
                    │   ├── NeuralNetwork.scala
                    │   ├── RandomForest.scala
                    │   ├── RandomTree.scala
                    │   ├── SameDistribution.scala
                    │   ├── SimpleKNN.scala
                    │   └── weka
                    │   │   ├── LogisticRegression.scala
                    │   │   └── REPTree.scala
                    ├── cluster
                    │   ├── KMeans.scala
                    │   └── weka
                    │   │   ├── CobWeb.scala
                    │   │   ├── ExpectationMaximization.scala
                    │   │   └── KMeans.scala
                    ├── meta
                    │   ├── Bagging.scala
                    │   ├── FittedBagging.scala
                    │   ├── FittedOneVsAll.scala
                    │   └── OneVsAll.scala
                    └── regress
                    │   ├── FittedRidgeRegression.scala
                    │   ├── FittedRidgeRegressionMulti.scala
                    │   ├── RidgeRegression.scala
                    │   └── weka
                    │       └── RidgeRegression.scala
                ├── strategies
                    ├── NoRecentImprovement.scala
                    └── Stopping.scala
                ├── util
                    ├── Util.scala
                    ├── decisionStump
                    │   └── DecisionStump.scala
                    └── nnet
                    │   ├── Layer.scala
                    │   └── activations
                    │       ├── ActivationFunction.scala
                    │       ├── Eliott.scala
                    │       ├── EliottSym.scala
                    │       ├── Sigmoid.scala
                    │       └── Tanh.scala
                ├── validation
                    ├── CrossValidation.scala
                    ├── LeaveOneOut.scala
                    ├── PercentageSplit.scala
                    ├── SameDatasetValidation.scala
                    └── Validation.scala
                └── wekas
                    ├── WekaClusterer.scala
                    ├── WekaDistance.scala
                    ├── WekaFittedModel.scala
                    └── WekaModel.scala


/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea
 2 | 
 3 | *.class
 4 | *.log
 5 | 
 6 | # sbt specific
 7 | .cache/
 8 | .history/
 9 | .lib/
10 | dist/*
11 | target/
12 | lib_managed/
13 | src_managed/
14 | project/boot/
15 | project/plugins/project/
16 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | QuantumLearn
  2 | ===
  3 | 
  4 | I have used Weka for quite some time now. There were quite some things that bothered me. The dataset management was clumsy at best, but I could deal with that. What I couldn't get around was the general slowness of the library. This is not only my complaint, just make a basic google search. The reason for that? Weka doesn't use BLAS (and neither does any other Java machine learning library). Not only that, many times unnecessary data copies are made.
  5 | 
  6 | So, here is the manifesto of the QuantumLearn library:
  7 | 
  8 | * Where possible, the **BLAS & LAPACK methods** are used. This brings up to 10x speedup compared to Weka.
  9 | * Created with support for **multilabel datasets** from the start. This enables much faster learning for algorithms that support that (NN, Linear / Logistic regression), otherwise it just fails back to training a separate model for each label.
 10 | * Created around the idea of **immutable data structures**. This eases multithreading. Example: When you call *.fit* on *LinearRegression*, you get *FittedLinearRegression* back.
 11 | * Leverage the abilities of the type system. Scala helps a great deal here. This means we can catch obvious errors at compile times instead of failing with an exception - such as passing a nominal dataset to a regressor.
 12 | * Algorithms are reporting their **progress status** during the training process. This is really important, since the training is usually a really long-lasting activity. It's nice to know how much time you have left to wait.
 13 | 
 14 | Beware, this library is in really early alpha stage. Do not use in production.
 15 | 
 16 | Dealing with the lack of algorithms
 17 | ---
 18 | Until the algorithm is implemented in effective Scala code, the algorithm from Weka is wrapped. For now, this causes a dataset copy (but it is done only once).
 19 | 
 20 | Dealing with datasets
 21 | ===
 22 | Let's first create an unlabeled (unsupervised) dataset. We create a matrix and name the features.
 23 | 
 24 | ```scala
 25 | val unlabeled = Unlabeled(DenseMatrix(
 26 | 	(16.0,2.0,3.0),
 27 | 	(3.0,11.0,5.5),
 28 | 	(4.0,8.0,10.0),
 29 | 	(5.0,100.0,7.0)
 30 | ), Vector('x, 'y, 'z))
 31 | ```
 32 | 
 33 | This dataset might be used for clustering or to make predictions on. However, if we want to learn from it, we have to label (supervize) it. Along with the dataset, you can specify a custom cost function, as is shown below with specifying hinge loss for the binary dataset.
 34 | 
 35 | ```scala
 36 | val isMale = Binary('isMale, unlabeled, Vector(true, false, true, true), loss = HingeLoss)
 37 | val age    = Numerical('age, unlabeled, Vector(20.3, 56.8, 10.3, 11.8))
 38 | val major  = Nominal('major, unlabeled, Vector("ML", "literature", "ML", "art"))
 39 | ```
 40 | 
 41 | Multi label datasets
 42 | ---
 43 | 
 44 | You can then group many of those single-labeled datasets into a multi-labeled one.
 45 | 
 46 | ```scala
 47 | val labeled = MultiLabeled(isMale, age, major)
 48 | ```
 49 | 
 50 | Finally, to check everything is fine so far, we call `labeled.report` and get this on the standard output:
 51 | 
 52 | ```
 53 |        x         y         z          isMale       age  major=ML  major=literature  major=art
 54 | 16.00000  2.000000  3.000000   ->   1.000000  20.30000  1.000000  0.00000000000000  0.0000000
 55 | 3.000000  11.00000  5.500000   ->   0.000000  56.80000  0.000000  1.00000000000000  0.0000000
 56 | 4.000000  8.000000  10.00000   ->   1.000000  10.30000  1.000000  0.00000000000000  0.0000000
 57 | 5.000000  100.0000  7.000000   ->   1.000000  11.80000  0.000000  0.00000000000000  1.0000000
 58 | ```
 59 | 
 60 | Image datasets
 61 | ---
 62 | This is currently just a proposal. In the future, it should be possible to instantiate an unabeled dataset containing image data. This dataset behaves no differently as the one described above.
 63 | 
 64 | Let's create the collection of images, each of which is resized to 40x30 pixels. Moreover, each image is stored five times - the original, translated 1 and 5 pixels to the right and 1 and 3 pixels to the left. Additionally, each images has a 30% chance of appearing rotated by 1, 3 of 5 degrees.
 65 | 
 66 | ```scala
 67 | val unlabeledImages = ImageDataset(
 68 | 	Seq("images/bird1.jpg", "images/bird2.jpg", "images/bird3.png"),
 69 | 	height = 40, width = 30,
 70 | 	translations = Seq(1, -1, -3, 5),
 71 | 	rotations = Seq(1.0, 3.0, 5.0), rotationProbability = 0.3
 72 | )
 73 | ```
 74 | 
 75 | Transforming records, features and labels
 76 | ===
 77 | Say we want to add two new features to the dataset. We can use arbitrary data or reuse existing. Here's how it's done:
 78 | 
 79 | ```scala
 80 | val augmented = FeatureAdder(
 81 | 	'isFemale    -> (row => !row('isMale)),
 82 | 	'ageInMonths -> (row => row('age) * 12)
 83 | ).transform(labeled)
 84 | ```
 85 | 
 86 | 
 87 | Wrapping Weka learners
 88 | ===
 89 | 
 90 | Everything Weka-connected resides in a `qlearn.algorithms.weka` package. Some of the Weka algorithms are already nicely wrapped. The ones that are not, you can wrap yourself:
 91 | 
 92 | ```scala
 93 | // simple example
 94 | WekaWrapper(new J48)
 95 | 
 96 | // complex example
 97 | WekaWrapper({
 98 | 	val tmp = new J48
 99 | 	tmp.setMinNumObj(10)
100 | 	tmp.setUseLaplace(true)
101 | 	tmp
102 | })
103 | ```
104 | 
105 | Once this is done, you can use Weka learners in the same manner as the native ones. It's that simple.
106 | 
107 | Future improvements
108 | ===
109 | This section presents future ideas for optimization / improvement. We are not in a hurry, first just make sure everything works correctly.
110 | 
111 | Memory optimizations
112 | ---
113 | * Think how could we adapt the learners to avoid copying the dataset K times on K-fold cross-validation. Ideas: bit masking vector, two phase learning (to splice out the test fold), ... There is no free luch, and this causes additional CPU costs and complicates the code and class design. Therefore debate whether this is reasonable - RAM is cheap novadays.
114 | * Learners should drop the reference to the learning dataset as soon as they are done learning. For example, with linear regression, we just remember the coefficients and allow the original dataset to be garbage collected.
115 | * Could sparse datasets easily be supported? What is the performance cost, since BLAS is not used? How could more advanced optimizations in Breeze library help? See also: [Breeze bug report](https://github.com/scalanlp/breeze/issues/360)
116 | 
117 | CPU & computational optimizations
118 | ---
119 | * Which learners are capable of parallelism (or distributed computing - Hadoop, Akka, Spark)? This is not a huge concern, since things such as parameter selection (via cross validation), ensembles (bagging, stacking) are hugely parallel.
120 | * Since loss functions usually have unchanging target predictions, would preprocessing help?
121 | 
122 | No nominal attributes on X dataset
123 | ---
124 | For efficiency (simplicity, memory and coputation) reasons, the X dataset is represented as as simple matrix. This suffices most of the time by simply binarizing the nominal attribures. Some algorithms, such as Naive bayes, requires the higher level of knowledge. This could be solved this way:
125 | 
126 | * Make `=` a special symbol in the attribute name. Therefore, if you have attrbutes named `hairColor=grey`, `hairColor=black`, `hairColor=blonde`, the classifier can deduce the category memberships. However, such storage is inefficient with attrbutes with many possible values. Moreover, specifying attribute names to adhere to our standard is just a call for problems.
127 | * Represent the ordinal value as a double in a single attribute (`grey` = 1.0, `black` = 2.0, `blonde` = 3.0), and then accept the information about which attributes are stored this way. Example:
128 | 
129 | ```scala
130 | val unlabeled = Unlabeled(DenseMatrix(
131 | 	(1.0,2.0,3.0),
132 | 	(2.0,11.0,5.5),
133 | 	(1.0,8.0,10.0),
134 | 	(3.0,100.0,7.0)
135 | ), Vector('hairColor, 'y, 'z), nominal = Seq('hairColor))
136 | ```
137 | 
138 | Algorithm improvements
139 | ---
140 | * Do learning algorithms benefit from knowing the unsupervized future test data upfront? See: [CrossValidated question](https://stats.stackexchange.com/questions/156085/which-supervised-algorithms-benefit-from-knowing-future-inputs-upfront)
141 | 
142 | Misc
143 | ---
144 | * Java interop. Make it possible to use this library in Java without any extended effort.
145 | 


--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "QuantumLearn"
 2 | 
 3 | version := "0.1.5"
 4 | 
 5 | scalaVersion := "2.11.7"
 6 | 
 7 | libraryDependencies  ++= Seq(
 8 | 	"org.scalanlp" %% "breeze" % "0.12-SNAPSHOT",
 9 | 
10 | 	// native libraries greatly improve performance, but increase jar sizes.
11 | 	"org.scalanlp" %% "breeze-natives" % "0.12-SNAPSHOT",
12 | 
13 | 	// weka
14 | 	"nz.ac.waikato.cms.weka" % "weka-dev" % "3.7.13"
15 | )
16 | 
17 | resolvers ++= Seq(
18 | 	"Sonatype Snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/",
19 | 	"Sonatype Releases" at "https://oss.sonatype.org/content/repositories/releases/"
20 | )


--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version = 0.13.9


--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | logLevel := Level.Warn


--------------------------------------------------------------------------------
/src/main/java/META-INF/MANIFEST.MF:
--------------------------------------------------------------------------------
1 | Manifest-Version: 1.0
2 | Main-Class: Main
3 | 
4 | 


--------------------------------------------------------------------------------
/src/main/scala/Experiment.scala:
--------------------------------------------------------------------------------
 1 | import breeze.linalg.{pinv, sum}
 2 | import qlearn.Types.Mat
 3 | 
 4 | object Experiment extends App {
 5 | 	def memory {
 6 | 		System.gc
 7 | 		System.gc
 8 | 		System.gc
 9 | 		System.gc
10 | 		System.gc
11 | 		println(s"Mem ${Runtime.getRuntime.freeMemory}")
12 | 	}
13 | 
14 | 
15 | 
16 | 
17 | 
18 | 	{
19 | 		val n = 10000
20 | 		val a = Mat.rand[Double](n, n)
21 | 
22 | 		var start = System.nanoTime
23 | 		val b = a(0 to -2, 0 to -2 by 2)
24 | 		println((System.nanoTime - start) / 1000000000.0)
25 | 	}
26 | 
27 | 
28 | 
29 | 	println("START")
30 | 	memory
31 | 	val a = Mat.rand[Double](1300, 1300)
32 | 	println(sum(a))
33 | 	memory
34 | 
35 | 
36 | 
37 | 	{
38 | 		val start = System.nanoTime
39 | 		pinv(a)
40 | 		println((System.nanoTime - start) / 1000000000.0)
41 | 	}
42 | 	memory
43 | 
44 | 	println("A")
45 | 
46 | 	{
47 | 		var start = System.nanoTime
48 | 		val b = a(0 to -2, 0 to -2)
49 | 		println((System.nanoTime - start) / 1000000000.0)
50 | 
51 | 		memory
52 | 
53 | 		start = System.nanoTime
54 | 		pinv(b)
55 | 		println((System.nanoTime - start) / 1000000000.0)
56 | 	}
57 | 	memory
58 | 
59 | 	println("B")
60 | 
61 | 	{
62 | 		var start = System.nanoTime
63 | 		val b = a(0 to -2, 0 to -2 by 2)
64 | 		println((System.nanoTime - start) / 1000000000.0)
65 | 
66 | 		memory
67 | 
68 | 		start = System.nanoTime
69 | 		pinv(b)
70 | 		println((System.nanoTime - start) / 1000000000.0)
71 | 	}
72 | 	memory
73 | 
74 | 	println("C")
75 | 
76 | 	{
77 | 		var start = System.nanoTime
78 | 		val bx = a((0 until a.rows).filter( _ => math.random < 0.5), ::)
79 | 		println((System.nanoTime - start) / 1000000000.0)
80 | 
81 | 		memory
82 | 
83 | 		start = System.nanoTime
84 | 		val b = bx.toDenseMatrix
85 | 		println((System.nanoTime - start) / 1000000000.0)
86 | 
87 | 		memory
88 | 
89 | 		start = System.nanoTime
90 | 		pinv(b)
91 | 		println((System.nanoTime - start) / 1000000000.0)
92 | 	}
93 | 	memory
94 | }
95 | 


--------------------------------------------------------------------------------
/src/main/scala/Main.scala:
--------------------------------------------------------------------------------
 1 | import _root_.weka.clusterers.SimpleKMeans
 2 | import qlearn.dataset._
 3 | 
 4 | import qlearn.Types._
 5 | import qlearn.loss.numerical.distance.EuclideanDistance
 6 | import qlearn.ml
 7 | import qlearn.ml.classify.weka.{LogisticRegression, REPTree}
 8 | import qlearn.ml.classify.{RandomTree, SimpleKNN, FittedSimpleKNN}
 9 | import qlearn.ml.cluster.{weka, KMeans}
10 | import qlearn.ml.meta.Bagging
11 | import qlearn.ml.regress.RidgeRegression
12 | import qlearn.validation.{SameDatasetValidation, CrossValidation}
13 | import qlearn.wekas.WekaClusterer
14 | 
15 | object Main extends App {
16 | 	val ul = Unlabeled(Mat.rand(8, 10).t, Vector('a, 'b, 'c, 'd, 'e, 'f, 'g, 'h))
17 | 	val l1 = Nominal('wheel, ul, Vector.fill(4)("axxxxxx") ++ Vector.fill(3)("b") ++ Vector.fill(2)("c") ++ Vector.fill(1)("d"))
18 | 	val l2 = Nominal('tire, ul, Vector.fill(1)("axyza") ++ Vector.fill(2)("b") ++ Vector.fill(3)("c") ++ Vector.fill(4)("d"))
19 | 	val lb = Binary('sold, ul, Vector.fill(6)(true) ++ Vector.fill(4)(false))
20 | 	val ln = MultiLabeled(
21 | 		Numerical('num, ul, (1 to 10).map(_.toDouble), EuclideanDistance),
22 | 		Numerical('num2, ul, (11 to 20).map(_.toDouble))
23 | 	)
24 | 
25 | 	l1.report
26 | 	FittedSimpleKNN(l1, k = 3).predict(ul).report
27 | 
28 | 
29 | 	//val l3 = MultiLabeled(l1, l2, lb)
30 | 	//l3.report
31 | 
32 | 	//LogisticRegression().fit(l3).predict(ul).report
33 | 	//REPTree().fit(l3).predict(ul).report
34 | 	RandomTree(minParent = 5).fit(l1).predict(ul).report
35 | 
36 | 	val mdls = Seq.fill(300)(RandomTree(minParent = 1))
37 | 	Bagging(mdls).fit(l1).predict(ul).report
38 | 
39 | 	//RidgeRegression(ridge = 0.0).fit(ln).predict(ul).report
40 | 	//ml.regress.weka.RidgeRegression(ridge = 0.0).fit(ln).predict(ul).report
41 | 
42 | 	println(CrossValidation[Nominal](l1).validate(LogisticRegression()))
43 | 	println(SameDatasetValidation[Nominal](l1).validate(SimpleKNN(5)))
44 | 
45 | 
46 | 
47 | 	val ulc = Unlabeled(Mat.rand(8, 50000).t, Vector('a, 'b, 'c, 'd, 'e, 'f, 'g, 'h))
48 | 	println("a")
49 | 	KMeans(k = 50).cluster(ulc)
50 | 	println("b")
51 | 	weka.KMeans(k = 50)
52 | 	println("c")
53 | 	new WekaClusterer({
54 | 		val c = new SimpleKMeans
55 | 		c.setNumClusters(50)
56 | 		c
57 | 	}).cluster(ulc)
58 | 	println("x")
59 | 
60 | 
61 | 
62 | 
63 | 
64 | 
65 | 
66 | 
67 | 
68 | 
69 | 	val rul = Unlabeled(Mat.rand(8, 9).t, Vector('a, 'b, 'c, 'd, 'e, 'f, 'g, 'h))
70 | 	val nm = Numerical('num, rul, (1 to 9).map(_.toDouble))
71 | 	println(nm.wekaDataset)
72 | 	RidgeRegression().fit(nm).predict(rul).report
73 | 	ml.regress.weka.RidgeRegression().fit(nm).predict(rul).report
74 | }


--------------------------------------------------------------------------------
/src/main/scala/Main2.scala:
--------------------------------------------------------------------------------
 1 | import qlearn.dataset.loaders.ArffLoader
 2 | import qlearn.ml.cluster.KMeans
 3 | 
 4 | object Main2 extends App {
 5 | 	val data = ArffLoader.unlabeled("datasets/arff/regression/autoPrice.arff")
 6 | 	//val data = ArffLoader.unlabeled("datasets/arff/classification9/anneal.ORIG.arff")
 7 | 	data.report
 8 | 
 9 | 	KMeans(k = 30).cluster(data).report
10 | }
11 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/Types.scala:
--------------------------------------------------------------------------------
 1 | package qlearn
 2 | 
 3 | import breeze.linalg.{*, DenseMatrix, Matrix, DenseVector}
 4 | 
 5 | object Types {
 6 | 	type Vec = DenseVector[Double]
 7 | 	type Mat = DenseMatrix[Double]
 8 | 
 9 | 	val Vec = DenseVector
10 | 	val Mat = DenseMatrix
11 | 
12 | 	type IntVec = DenseVector[Int]
13 | 	type BinVec = DenseVector[Boolean]
14 | 
15 | 	implicit class MatWrapper(m: Mat) {
16 | 		def r = m(*, ::)
17 | 
18 | 		def c = m(::, *)
19 | 	}
20 | }
21 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/dataset/Binary.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.dataset
 2 | 
 3 | import qlearn.Types.{BinVec, Mat, Vec}
 4 | import qlearn.dataset.schema.BinaryColumn
 5 | import qlearn.loss.Loss
 6 | import qlearn.loss.binary.LogisticLoss
 7 | import qlearn.loss.nominal.CrossEntropyLoss
 8 | import qlearn.util.Util
 9 | 
10 | case class Binary(x: Unlabeled, yb: Vec, schema: BinaryColumn) extends Nominal {
11 | 
12 | 	val values = Vector("no", "yes")
13 | 
14 | 	lazy val ymat =
15 | 		Mat(yb.map( v =>
16 | 			Seq(1-v, v)
17 | 		).toScalaVector: _*)
18 | 
19 | 	val yt = yb :>= 0.5
20 | 
21 | 	override val y =
22 | 		yb.map( v =>
23 | 			if (v >= 0.5) 1 else 0
24 | 		)
25 | 
26 | 	def updated(xnew: Unlabeled, ynew: Mat): Binary = schema.populate(xnew, ynew)
27 | 
28 | 	/*
29 | 		Function that writes the dataset to stdout
30 | 		*/
31 | 
32 | 	val reportHeader = Seq("%9s" format name.name)
33 | 
34 | 	def reportLine(line: Int) =
35 | 		Seq(Util.printDoubleNicely(yb(line), reportHeader.head.size))
36 | }
37 | 
38 | object Binary {
39 | 
40 | 	def apply(name: Symbol, x: Unlabeled, seq: Seq[Boolean], loss: Loss[Binary] = LogisticLoss): Binary = {
41 | 		Binary(x, Vec(seq.map { v =>
42 | 			if (v) 1.0 else 0.0
43 | 		}: _*), BinaryColumn(name, loss))
44 | 	}
45 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/dataset/Labeled.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.dataset
 2 | 
 3 | import qlearn.Types.Mat
 4 | import qlearn.dataset.schema.Column
 5 | import qlearn.loss.Loss
 6 | 
 7 | import scala.util.Random
 8 | 
 9 | abstract class Labeled[+T <: Labeled[T]] {
10 | 	def x: Unlabeled
11 | 	def xmat = x.xmat
12 | 	val schema: Column
13 | 
14 | 	def recordCount = x.recordCount
15 | 	def featureCount = x.featureCount
16 | 
17 | 	def ymat: Mat
18 | 
19 | 	// TODO
20 | 	//require(ymat.rows == xmat.rows, "Both X and Y have to have the same number of rows")
21 | 
22 | 	def width: Int
23 | 
24 | 	def indices = 0 until recordCount
25 | 
26 | 	def apply(range: Range): T = updated(x(range), ymat(range, ::))
27 | 	def apply(index: Int): T = apply(index to index)
28 | 	def pick(indices: Seq[Int]): T = updated(x.pick(indices), ymat(indices, ::).toDenseMatrix)
29 | 
30 | 	def updated(xnew: Unlabeled, ynew: Mat): T
31 | 
32 | 	def ++[Q <: Labeled[Q]](that: Q): T = updated(x ++ that.x, Mat.vertcat(ymat, that.ymat))
33 | 
34 | 	def duplicate: T = updated(x.duplicate, ymat.copy)
35 | 
36 | 	def shuffle: T =
37 | 		pick( Random.shuffle(0 to recordCount - 1) )
38 | 
39 | 	/*
40 | 		Function that writes the dataset to stdout
41 | 	 */
42 | 
43 | 	def reportHeader: Seq[String]
44 | 	def reportLine(line: Int): Seq[String]
45 | 
46 | 	def report {
47 | 		println(x.reportHeader ++ Vector("    ") ++ reportHeader mkString "  ")
48 | 
49 | 		indices.foreach { i =>
50 | 			println(x.reportLine(i) ++ Vector(" -> ") ++ reportLine(i) mkString "  ")
51 | 		}
52 | 	}
53 | }
54 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/dataset/MultiLabeled.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.dataset
 2 | 
 3 | import qlearn.Types.Mat
 4 | import qlearn.dataset.schema.Column
 5 | import qlearn.loss.Loss
 6 | import qlearn.util.Util
 7 | 
 8 | case class MultiLabeled[T <: SingleLabeled[T]](ys: T*) extends Labeled[MultiLabeled[T]] {
 9 | 	require(ys.nonEmpty, "Specify at least one dataset.")
10 | 
11 | 	val x = ys.head.x
12 | 
13 | 	lazy val width = ys.map(_.width).sum
14 | 
15 | 	lazy val ymat = Mat.horzcat(ys.map(_.ymat): _*)
16 | 
17 | 	type Tmp = T
18 | 	val schema = new Column {
19 | 		type T = MultiLabeled[Tmp]
20 | 
21 | 		val name = 'MultiToBeRenamed
22 | 
23 | 		val loss = new Loss[T] {
24 | 			val range = 1.0 -> 2.0
25 | 
26 | 			def apply(a: T, b: T) = 1.0
27 | 				/*(a.ys, b.ys).zipped.map(
28 | 					(as, bs) => as.schema.loss(as, bs)
29 | 				).sum / a.ys.size*/
30 | 		}
31 | 
32 | 		def populate(x: Unlabeled, y: Mat) =
33 | 			MultiLabeled(ys.zipWithIndex.map { case (y2, i) =>
34 | 				val start = widths(i)
35 | 				val stop = widths(i + 1)
36 | 				val slice = y(::, start until stop)
37 | 				y2.updated(x, slice)
38 | 			}: _*)
39 | 	}
40 | 
41 | 
42 | 	private val widths = ys.scanLeft(0)(_ + _.width)
43 | 
44 | 	def updated(xnew: Unlabeled, ynew: Mat) = schema.populate(xnew, ynew)
45 | 
46 | 	override def toString = {
47 | 		val labels = ys.map(_.name.name) mkString ", "
48 | 		s"Multi(${x.labelString}, $labels)"
49 | 	}
50 | 
51 | 	/*
52 | 		Function that writes the dataset to stdout
53 | 		*/
54 | 
55 | 	lazy val reportHeader = ys.flatMap(_.reportHeader)
56 | 
57 | 	def reportLine(line: Int) = ys.flatMap(_.reportLine(line))
58 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/dataset/Nominal.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.dataset
 2 | 
 3 | import qlearn.Types._
 4 | import qlearn.dataset.schema.NominalColumn
 5 | import qlearn.loss.Loss
 6 | import qlearn.loss.nominal.CrossEntropyLoss
 7 | import weka.core.{Attribute, Instances}
 8 | 
 9 | abstract class Nominal extends SingleLabeled[Nominal] with Product with Serializable {
10 | 	val x: Unlabeled
11 | 	val ymat: Mat
12 | 	def values: Vector[String]
13 | 
14 | 	def width = values.size
15 | 
16 | 	val y: IntVec =
17 | 		Vec.tabulate(recordCount) { i =>
18 | 			val inner = ymat(i, ::).inner.toArray
19 | 			val max = inner.max
20 | 			inner.indexOf(max)
21 | 		}
22 | 
23 | 	/*
24 | 		Produce the weka dataset (Instances).
25 | 	 */
26 | 
27 | 	lazy val wekaDataset = {
28 | 		val data = new Instances(x.wekaDataset)
29 | 		val pos = data.numAttributes
30 | 
31 | 		import collection.JavaConversions._
32 | 		data.insertAttributeAt(new Attribute("output", values), pos)
33 | 		data.setClassIndex(pos)
34 | 
35 | 		indices.foreach { i =>
36 | 			data.instance(i).setValue(pos, values(y(i)))
37 | 		}
38 | 
39 | 		data
40 | 	}
41 | }
42 | 
43 | object Nominal {
44 | 	def apply(name: Symbol, x: Unlabeled, y: Seq[String], loss: Loss[Nominal] = CrossEntropyLoss()): NominalBasic = {
45 | 		val values = y.distinct.toVector
46 | 		val lookup = values.zipWithIndex.toMap
47 | 
48 | 		NominalBasic(x, Vec.tabulate(y.size)(y andThen lookup), NominalColumn(name, values, loss))
49 | 	}
50 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/dataset/NominalBasic.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.dataset
 2 | 
 3 | import breeze.linalg.argmax
 4 | import qlearn.Types._
 5 | import qlearn.dataset.schema.NominalColumn
 6 | 
 7 | case class NominalBasic(x: Unlabeled, override val y: IntVec, schema: NominalColumn) extends Nominal {
 8 | 
 9 | 	def values = schema.values
10 | 
11 | 	lazy val ymat =
12 | 		Mat.tabulate(y.size, values.size) { (r, c) =>
13 | 			if (y(r) == c) 1.0 else 0.0
14 | 		}
15 | 
16 | 	def updated(xnew: Unlabeled, ynew: Mat) = schema.populate(xnew, ynew)
17 | 
18 | 	def updatedSame(xnew: Unlabeled, ynew: Mat): NominalBasic = {
19 | 		assert(ynew.cols == values.size)
20 | 		copy(x = xnew, y = argmax(ynew.r))
21 | 	}
22 | 
23 | 	/*
24 | 		Function that writes the dataset to stdout
25 | 	 */
26 | 
27 | 	lazy val reportHeader = {
28 | 		val len = values.map(_.size).max
29 | 		Seq(s"%${len}s" format name.name)
30 | 	}
31 | 
32 | 	def reportLine(line: Int) =
33 | 		Seq(values(y(line)).padTo(reportHeader.head.size, ' '))
34 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/dataset/NominalFull.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.dataset
 2 | 
 3 | import qlearn.Types._
 4 | import qlearn.dataset.schema.NominalColumn
 5 | import qlearn.util.Util
 6 | 
 7 | case class NominalFull(x: Unlabeled, ymat: Mat, schema: NominalColumn) extends Nominal {
 8 | 
 9 | 	def values = schema.values
10 | 
11 | 	def updated(xnew: Unlabeled, ynew: Mat) = schema.populate(xnew, ynew)
12 | 
13 | 
14 | 	/*
15 | 		Function that writes the dataset to stdout
16 | 		*/
17 | 
18 | 	lazy val reportHeader =
19 | 		values.map(x =>
20 | 			"%9s" format s"${name.name}=$x"
21 | 		)
22 | 
23 | 	def reportLine(line: Int) = {
24 | 		val vec = ymat(line, ::).inner.toScalaVector
25 | 		vec.zipWithIndex.map { case (v, i) =>
26 | 			Util.printDoubleNicely(v, reportHeader(i).size)
27 | 		}
28 | 	}
29 | }
30 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/dataset/Numerical.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.dataset
 2 | 
 3 | import qlearn.Types._
 4 | import qlearn.dataset.schema.NumericalColumn
 5 | import qlearn.loss.Loss
 6 | import qlearn.loss.numerical.MeanSquaredLoss
 7 | import qlearn.util.Util
 8 | import weka.core.{Attribute, Instances}
 9 | 
10 | case class Numerical(x: Unlabeled, y: Vec, schema: NumericalColumn) extends SingleLabeled[Numerical] {
11 | 	lazy val ymat = y.toDenseMatrix.t
12 | 
13 | 	val width = 1
14 | 
15 | 	def updated(xnew: Unlabeled, ynew: Mat) = schema.populate(xnew, ynew)
16 | 
17 | 
18 | 	/*
19 | 		Function that writes the dataset to stdout
20 | 	 */
21 | 
22 | 	val reportHeader = Seq("%9s" format name.name)
23 | 
24 | 	def reportLine(line: Int) =
25 | 		Seq(Util.printDoubleNicely(y(line), reportHeader.head.size))
26 | 
27 | 
28 | 	lazy val wekaDataset = {
29 | 		val data = new Instances(x.wekaDataset)
30 | 		val pos = data.numAttributes
31 | 		data.insertAttributeAt(new Attribute("output"), pos)
32 | 		data.setClassIndex(pos)
33 | 
34 | 		(0 until y.length).foreach { i =>
35 | 			data.instance(i).setValue(pos, y(i))
36 | 		}
37 | 
38 | 		data
39 | 	}
40 | }
41 | 
42 | object Numerical {
43 | 	def apply(name: Symbol, x: Unlabeled, y: Seq[Double], loss: Loss[Numerical] = MeanSquaredLoss): Numerical =
44 | 		Numerical(x, Vec(y: _*), NumericalColumn(name, loss))
45 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/dataset/SingleLabeled.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.dataset
 2 | 
 3 | import weka.core.Instances
 4 | 
 5 | abstract class SingleLabeled[+T <: SingleLabeled[T]] extends Labeled[T] {
 6 | 	val name: Symbol = schema.name
 7 | 
 8 | 	override def toString = s"Single(${x.labelString}, ${name.name})"
 9 | 
10 | 	def wekaDataset: Instances
11 | }
12 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/dataset/Unlabeled.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.dataset
 2 | 
 3 | import qlearn.Types.Mat
 4 | import qlearn.util.Util
 5 | import weka.core.{DenseInstance, Attribute, Instances}
 6 | 
 7 | case class Unlabeled(xmat: Mat, names: Vector[Symbol]) {
 8 | 	val recordCount = xmat.rows
 9 | 	val featureCount = xmat.cols
10 | 
11 | 	require(recordCount > 0, "Dataset must have at least one record.")
12 | 	require(names.size == names.distinct.size, "The names have to be unique.")
13 | 	require(names.size == featureCount, "You have to name every feature.")
14 | 
15 | 
16 | 
17 | 	def indices = 0 until recordCount
18 | 
19 | 	def apply(range: Range): Unlabeled = copy(xmat = xmat(range, ::))
20 | 	def apply(index: Int): Unlabeled = apply(index to index)
21 | 	def pick(indices: Seq[Int]): Unlabeled = copy(xmat = xmat(indices, ::).toDenseMatrix)
22 | 
23 | 	def ++(that: Unlabeled) = copy(xmat = Mat.vertcat(xmat, that.xmat))
24 | 
25 | 	def duplicate = copy(xmat = xmat.copy)
26 | 
27 | 	def labelString = s"${recordCount} records, ${featureCount} features"
28 | 	override def toString = s"Data($labelString)"
29 | 
30 | 	/*
31 | 	Function that writes the dataset to stdout
32 | 	*/
33 | 
34 | 	lazy val reportHeader =
35 | 		names.map("%9s" format _.name)
36 | 
37 | 	def reportLine(line: Int) = {
38 | 		val vec = xmat(line, ::).inner.toScalaVector
39 | 		vec.zipWithIndex.map { case (v, i) =>
40 | 			Util.printDoubleNicely(v, reportHeader(i).size)
41 | 		}
42 | 	}
43 | 
44 | 	def report {
45 | 		println(reportHeader mkString "  ")
46 | 		indices.foreach { i =>
47 | 			println(reportLine(i) mkString "  ")
48 | 		}
49 | 	}
50 | 
51 | 
52 | 
53 | 	lazy val wekaDataset = {
54 | 		val attributes = new java.util.ArrayList[Attribute]()
55 | 		names.foreach { name =>
56 | 			attributes.add(new Attribute(name.name))
57 | 		}
58 | 
59 | 		val data = new Instances("Dataset", attributes, 0)
60 | 
61 | 		indices.foreach { i =>
62 | 			data.add(new DenseInstance(1.0, xmat(i, ::).inner.toArray))
63 | 		}
64 | 
65 | 		data
66 | 	}
67 | }
68 | 
69 | 
70 | 
71 | 
72 | 
73 | 
74 | 
75 | 
76 | 
77 | 
78 | 
79 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/dataset/loaders/ArffLoader.scala:
--------------------------------------------------------------------------------
  1 | package qlearn.dataset.loaders
  2 | 
  3 | import qlearn.Types.{Vec, Mat}
  4 | import qlearn.dataset.schema.{Column, NominalColumn, NumericalColumn}
  5 | import qlearn.dataset.{Numerical, Unlabeled}
  6 | import qlearn.loss.numerical.MeanSquaredLoss
  7 | 
  8 | import scala.collection.mutable.ArrayBuffer
  9 | 
 10 | /*
 11 | 	ARFF data file loader
 12 | 
 13 | 	Missing features:
 14 | 	* support for labeled datasets (figure out types)
 15 | 	* support for string and date columns
 16 | 	* support for sparse rows
 17 | 	* support for instance weights (this will probably never be implemented)
 18 | 
 19 |  */
 20 | 
 21 | object ArffLoader extends Loader {
 22 | 
 23 | 	def removeComment(line: String) =
 24 | 		line.takeWhile(_ != '%')
 25 | 
 26 | 	def trimLine(line: String) = line.trim
 27 | 
 28 | 	def isEmptyLine(line: String) = line.isEmpty
 29 | 
 30 | 
 31 | 
 32 | 	object Regex {
 33 | 		// since the structure is really simple, there's no need to bother with parsers
 34 | 
 35 | 		private val literal = "'?(.*?)'?"
 36 | 
 37 | 		private val nominal = raw"\{\s*(.*?)\s*\}"
 38 | 
 39 | 		private val kind = raw"(real|numeric|integer|string|date|relational|$nominal)"
 40 | 
 41 | 		val name = raw"(?i)@relation\s+$literal\s*".r
 42 | 
 43 | 		val attribute = raw"(?i)@attribute\s+$literal\s+$kind".r
 44 | 	}
 45 | 
 46 | 
 47 | 
 48 | 	def clean(data: Iterator[String]) =
 49 | 		data.map(removeComment).map(trimLine).filterNot(isEmptyLine)
 50 | 
 51 | 	def commaSplit(str: String) = str.split(raw"\s*,\s*").toVector
 52 | 
 53 | 	def parseHeader(data: Iterator[String]) = {
 54 | 		val name = data.next match {
 55 | 			case Regex.name(name) => name
 56 | 			case line => throw ParseError(s"The dataset has to start with @relation, got instead: $line")
 57 | 		}
 58 | 
 59 | 		val attributes = Stream.continually(data.next).takeWhile(_.toLowerCase != "@data").map {
 60 | 			case Regex.attribute(name, "real" | "numeric" | "integer", _) => NumericalColumn(Symbol(name))
 61 | 
 62 | 			case Regex.attribute(name, kind, null) =>
 63 | 				throw ParseError(s"An attribute $name of type $kind is currently unsupported")
 64 | 
 65 | 			case Regex.attribute(name, _, kind) => NominalColumn(Symbol(name), commaSplit(kind))
 66 | 
 67 | 			case line =>
 68 | 				throw ParseError(s"Proper attribute declaration expected, got instead: $line")
 69 | 		}.toVector
 70 | 
 71 | 		(name, attributes)
 72 | 	}
 73 | 
 74 | 	def parseLine(types: Vector[Column])(line: String) =
 75 | 		(commaSplit(line), types).zipped.map {
 76 | 			case ("?", _)   => Double.NaN
 77 | 			case (value, col: NumericalColumn) => value.toDouble
 78 | 			case (value, col: NominalColumn) =>
 79 | 				col.lookup.get(value) match {
 80 | 					case Some(pos) => pos.toDouble
 81 | 					case _ => throw ParseError(s"Undeclared nominal value: $value")
 82 | 				}
 83 | 		}
 84 | 
 85 | 	def buildUnlabeled(it: Iterator[Double], names: Vector[Symbol]) = {
 86 | 		val array = it.toArray
 87 | 		val cols = names.size
 88 | 		val rows = array.size / cols
 89 | 		val matrix = new Mat(rows, cols, array, 0, cols, true)
 90 | 		Unlabeled(matrix, names)
 91 | 	}
 92 | 
 93 | 
 94 | 
 95 | 	def unlabeled(data: Iterator[String]) = {
 96 | 		val cleaned = clean(data)
 97 | 		val (name, columns) = parseHeader(cleaned)
 98 | 		val names = columns.map(_.name)
 99 | 
100 | 		val it = cleaned.flatMap(parseLine(columns))
101 | 		buildUnlabeled(it, names)
102 | 	}
103 | 
104 | 	def labeled(data: Iterator[String], attribute: Symbol) = {
105 | 		val cleaned = clean(data)
106 | 		val (name, columns) = parseHeader(cleaned)
107 | 		val names = columns.map(_.name)
108 | 
109 | 		val index = names.indexOf(attribute)
110 | 		if (index == -1)
111 | 			throw ParseError(s"The attribute $attribute was not found in the dataset")
112 | 
113 | 		val y = new ArrayBuffer[Double]
114 | 		val it = cleaned.flatMap { line =>
115 | 			val data = parseLine(columns)(line)
116 | 			y.append(data(index))
117 | 			data.patch(index, Nil, 1)
118 | 		}
119 | 		val x = buildUnlabeled(it, names.patch(index, Nil, 1))
120 | 
121 | 		Numerical(x, new Vec(y.toArray), NumericalColumn(attribute))
122 | 	}
123 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/dataset/loaders/Loader.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.dataset.loaders
 2 | 
 3 | import scala.io.Source.fromFile
 4 | 
 5 | import qlearn.dataset.{Numerical, Labeled, Unlabeled}
 6 | 
 7 | abstract class Loader {
 8 | 
 9 | 	case class ParseError(error: String) extends Exception(error)
10 | 
11 | 
12 | 	def unlabeled(data: Iterator[String]): Unlabeled
13 | 
14 | 	def unlabeled(file: String): Unlabeled = unlabeled(fromFile(file).getLines)
15 | 
16 | 
17 | 	def labeled(data: Iterator[String], attribute: Symbol): Numerical
18 | 
19 | 	def labeled(file: String, attribute: Symbol): Numerical = labeled(fromFile(file).getLines, attribute)
20 | }
21 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/dataset/loaders/TabLoader.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.dataset.loaders
 2 | 
 3 | object TabLoader extends Loader {
 4 | 	def unlabeled(data: Iterator[String]) = {
 5 | 		val names = data.next.split("\t")
 6 | 		val columns = names.size
 7 | 
 8 | 		val types = data.next.split("\t", -1).map(_.trim).map {
 9 | 			case "continuous" | "c" => 1
10 | 			case "discrete" | "d"   => 1
11 | 			case _ => throw new Exception("Only discrete and continous variables are supported.")
12 | 		}
13 | 
14 | 		val flags = data.next.split("\t", -1).map(_.trim)
15 | 
16 | 		require(columns == types.size, "Number of type columns doesn't match.")
17 | 		require(columns == flags.size, "Number of flag columns doesn't match.")
18 | 
19 | 		var classNames = Vector.fill(columns)(Vector.empty[String])
20 | 
21 | 		/*val rows = data.map { line =>
22 | 			(line.split("\t"), types, Iterator.from(0)).zipped.map {
23 | 				case ("?", _, _) => Double.NaN
24 | 				case (str, 1, _) => str.toDouble
25 | 				case (str, 2, i) =>
26 | 					val names = classNames(i)
27 | 
28 | 					names.indexOf(str) match {
29 | 						case -1 =>
30 | 							classNames = classNames.updated(i, names :+ str)
31 | 							names.size.toDouble
32 | 
33 | 						case j => j.toDouble
34 | 					}
35 | 			}
36 | 		}.toArray*/
37 | 
38 | 		???
39 | 	}
40 | 
41 | 	def labeled[Binary](data: Iterator[String], attribute: String) = ???
42 | }
43 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/dataset/schema/BinaryColumn.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.dataset.schema
 2 | 
 3 | import qlearn.Types.Mat
 4 | import qlearn.dataset.{Binary, Unlabeled}
 5 | import qlearn.loss.Loss
 6 | import qlearn.loss.binary.LogisticLoss
 7 | 
 8 | case class BinaryColumn(name: Symbol, loss: Loss[Binary] = LogisticLoss) extends Column {
 9 | 	type T = Binary
10 | 
11 | 	def populate(x: Unlabeled, y: Mat) = {
12 | 		assert(y.cols == 2)
13 | 		Binary(x, y(::, 1), this)
14 | 	}
15 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/dataset/schema/Column.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.dataset.schema
 2 | 
 3 | import qlearn.Types.Mat
 4 | import qlearn.dataset.{SingleLabeled, Unlabeled}
 5 | import qlearn.loss.Loss
 6 | 
 7 | abstract class Column {
 8 | 	type T// <: SingleLabeled[T]
 9 | 
10 | 	val loss: Loss[T]
11 | 
12 | 	val name: Symbol
13 | 
14 | 	def populate(x: Unlabeled, y: Mat): T
15 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/dataset/schema/NominalColumn.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.dataset.schema
 2 | 
 3 | import qlearn.Types.Mat
 4 | import qlearn.dataset.{NominalFull, Nominal, Unlabeled}
 5 | import qlearn.loss.Loss
 6 | import qlearn.loss.nominal.CrossEntropyLoss
 7 | 
 8 | case class NominalColumn(name: Symbol, values: Vector[String], loss: Loss[Nominal] = CrossEntropyLoss()) extends Column {
 9 | 	type T = Nominal
10 | 
11 | 	val lookup = values.zipWithIndex.toMap
12 | 
13 | 	def populate(x: Unlabeled, y: Mat) = {
14 | 		assert(y.cols == values.size)
15 | 		NominalFull(x, y, this)
16 | 	}
17 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/dataset/schema/NumericalColumn.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.dataset.schema
 2 | 
 3 | import qlearn.Types.Mat
 4 | import qlearn.dataset.{Numerical, Unlabeled}
 5 | import qlearn.loss.Loss
 6 | import qlearn.loss.numerical.MeanSquaredLoss
 7 | 
 8 | case class NumericalColumn(name: Symbol, loss: Loss[Numerical] = MeanSquaredLoss) extends Column {
 9 | 	type T = Numerical
10 | 
11 | 	def populate(x: Unlabeled, y: Mat) = {
12 | 		assert(y.cols == 1)
13 | 		Numerical(x, y(::, 0), this)
14 | 	}
15 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/loss/Loss.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.loss
 2 | 
 3 | abstract class Loss[-T] {
 4 | 
 5 | 	/*
 6 | 		The domain of the function.
 7 | 
 8 | 		It always ranges from the better score to the worse.
 9 | 	 */
10 | 
11 | 	def range: (Double, Double)
12 | 
13 | 	/*
14 | 		The principal method to be defined.
15 | 	 */
16 | 
17 | 	def apply(actual: T, predicted: T): Double
18 | }
19 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/loss/binary/F1.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.loss.binary
 2 | 
 3 | import qlearn.dataset.Binary
 4 | import qlearn.loss.Loss
 5 | 
 6 | object F1 extends Loss[Binary] {
 7 | 
 8 | 	val range = 1.0 -> 0.0
 9 | 
10 | 	def apply(actual: Binary, predicted: Binary) = {
11 | 		val tp = (actual.yt :& predicted.yt).activeSize.toDouble
12 | 		val fp = (!actual.yt :& predicted.yt).activeSize
13 | 		val fn = (actual.yt :& !predicted.yt).activeSize
14 | 
15 | 		val precision = tp / (tp + fp)
16 | 		val recall = tp / (tp + fn)
17 | 
18 | 		2 * precision * recall / (precision + recall)
19 | 	}
20 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/loss/binary/HingeLoss.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.loss.binary
 2 | 
 3 | import breeze.stats.mean
 4 | import breeze.linalg.max
 5 | import qlearn.dataset.Binary
 6 | import qlearn.loss.Loss
 7 | 
 8 | object HingeLoss extends Loss[Binary] {
 9 | 
10 | 	val range = 0.0 -> Double.PositiveInfinity
11 | 
12 | 	def apply(actual: Binary, predicted: Binary) = {
13 | 		val m = -actual.ymat :* predicted.ymat + 1.0
14 | 		mean(max(m, 0.0))
15 | 	}
16 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/loss/binary/LogisticLoss.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.loss.binary
 2 | 
 3 | import breeze.numerics.{exp, log}
 4 | import breeze.stats.mean
 5 | import qlearn.dataset.Binary
 6 | import qlearn.loss.Loss
 7 | 
 8 | object LogisticLoss extends Loss[Binary] {
 9 | 
10 | 	val range = 0.0 -> Double.PositiveInfinity
11 | 
12 | 	def apply(actual: Binary, predicted: Binary) =
13 | 		mean(log(exp(-actual.ymat :* predicted.ymat) + 1.0))
14 | }
15 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/loss/binary/Precision.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.loss.binary
 2 | 
 3 | import qlearn.dataset.Binary
 4 | import qlearn.loss.Loss
 5 | 
 6 | object Precision extends Loss[Binary] {
 7 | 
 8 | 	val range = 1.0 -> 0.0
 9 | 
10 | 	def apply(actual: Binary, predicted: Binary) = {
11 | 		val tp = (actual.yt :& predicted.yt).activeSize.toDouble
12 | 		val fp = (!actual.yt :& predicted.yt).activeSize
13 | 		tp / (tp + fp)
14 | 	}
15 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/loss/binary/Recall.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.loss.binary
 2 | 
 3 | import qlearn.dataset.Binary
 4 | import qlearn.loss.Loss
 5 | 
 6 | object Recall extends Loss[Binary] {
 7 | 
 8 | 	val range = 1.0 -> 0.0
 9 | 
10 | 	def apply(actual: Binary, predicted: Binary) = {
11 | 		val tp = (actual.yt :& predicted.yt).activeSize.toDouble
12 | 		val fn = (actual.yt :& !predicted.yt).activeSize
13 | 		tp / (tp + fn)
14 | 	}
15 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/loss/nominal/CrossEntropyLoss.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.loss.nominal
 2 | 
 3 | import breeze.linalg.{min, max, sum}
 4 | import breeze.numerics.log
 5 | import qlearn.dataset.Nominal
 6 | import qlearn.loss.Loss
 7 | 
 8 | case class CrossEntropyLoss(margin: Double = 1e-15) extends Loss[Nominal] {
 9 | 
10 | 	val range = 0.0 -> Double.PositiveInfinity
11 | 
12 | 	def apply(actual: Nominal, predicted: Nominal) = {
13 | 		val corrected = min(max(predicted.ymat, margin), 1 - margin)
14 | 		sum(log(corrected) :* actual.ymat) / -actual.recordCount
15 | 	}
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/loss/nominal/FractionOfIncorrect.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.loss.nominal
 2 | 
 3 | import qlearn.dataset.Nominal
 4 | import qlearn.loss.Loss
 5 | 
 6 | object FractionOfIncorrect extends Loss[Nominal] {
 7 | 
 8 | 	val range = 0.0 -> 1.0
 9 | 
10 | 	def apply(actual: Nominal, predicted: Nominal) =
11 | 		(actual.y :!= predicted.y).activeSize / actual.recordCount.toDouble
12 | }
13 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/loss/nominal/MatrixLoss.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.loss.nominal
 2 | 
 3 | import breeze.linalg.sum
 4 | import qlearn.dataset.Nominal
 5 | import qlearn.loss.Loss
 6 | 
 7 | import qlearn.Types.Mat
 8 | 
 9 | case class MatrixLoss(matrix: (Symbol, Seq[Double])*) extends Loss[Nominal] {
10 | 
11 | 	val range = 0.0 -> Double.PositiveInfinity
12 | 
13 | 	val m = Mat(matrix.sortBy(_._1.name).map(_._2): _*)
14 | 
15 | 	def apply(actual: Nominal, predicted: Nominal) =
16 | 		sum(actual.ymat * m :* predicted.ymat) / actual.recordCount
17 | }
18 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/loss/numerical/MeanAbsoluteLoss.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.loss.numerical
 2 | 
 3 | import breeze.stats.mean
 4 | import breeze.numerics.abs
 5 | import qlearn.dataset.Numerical
 6 | import qlearn.loss.Loss
 7 | 
 8 | object MeanAbsoluteLoss extends Loss[Numerical] {
 9 | 
10 | 	val range = 0.0 -> Double.PositiveInfinity
11 | 
12 | 	def apply(actual: Numerical, predicted: Numerical) =
13 | 		 mean(abs(actual.y - predicted.y))
14 | }
15 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/loss/numerical/MeanSquaredLoss.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.loss.numerical
 2 | 
 3 | import breeze.stats.mean
 4 | import qlearn.dataset.Numerical
 5 | import qlearn.loss.Loss
 6 | 
 7 | object MeanSquaredLoss extends Loss[Numerical] {
 8 | 
 9 | 	val range = 0.0 -> Double.PositiveInfinity
10 | 
11 | 	def apply(actual: Numerical, predicted: Numerical) =
12 | 		mean((actual.y - predicted.y) :^ 2.0)
13 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/loss/numerical/distance/ChebyshevDistance.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.loss.numerical.distance
 2 | 
 3 | import breeze.linalg.max
 4 | import breeze.numerics.abs
 5 | import qlearn.Types._
 6 | 
 7 | object ChebyshevDistance extends Distance {
 8 | 
 9 | 	def apply(a: Vec, b: Vec) = max(abs(a - b))
10 | 
11 | 	override def apply(a: Mat, b: Vec) =
12 | 		max((a.r - b).r)
13 | 
14 | 	override def apply(a: Mat, b: Mat) =
15 | 		max(abs(a - b).r)
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/loss/numerical/distance/CosineSimilarity.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.loss.numerical.distance
 2 | 
 3 | import breeze.linalg.max
 4 | import breeze.numerics.sqrt
 5 | import qlearn.Types._
 6 | 
 7 | object CosineSimilarity extends Distance {
 8 | 
 9 | 	override val range = 0.0 -> 1.0
10 | 
11 | 	def apply(a: Vec, b: Vec) = {
12 | 		val sim = dot(a,b) / sqrt(dot(a,a) * dot(b,b))
13 | 		1 - math.max(sim, 0.0)
14 | 	}
15 | 
16 | 	override def apply(a: Mat, b: Vec) = {
17 | 		val sim = dot(a,b) / sqrt(dot(a,a) * dot(b,b))
18 | 		-max(sim, 0.0) + 1.0
19 | 	}
20 | 
21 | 	override def apply(a: Mat, b: Mat) = {
22 | 		val sim = dot(a,b) / sqrt(dot(a,a) :* dot(b,b))
23 | 		-max(sim, 0.0) + 1.0
24 | 	}
25 | }
26 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/loss/numerical/distance/Distance.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.loss.numerical.distance
 2 | 
 3 | import breeze.linalg.sum
 4 | import qlearn.Types._
 5 | import qlearn.dataset.Numerical
 6 | import qlearn.loss.Loss
 7 | 
 8 | abstract class Distance extends Loss[Numerical] {
 9 | 
10 | 	def range = 0.0 -> Double.PositiveInfinity
11 | 
12 | 	require(range._1 < range._2, "Smaller distance has to mean better.")
13 | 
14 | 	
15 | 	def dot(a: Vec, b: Vec) = a.dot(b)
16 | 	def dot(a: Mat, b: Vec) = a.r.dot(b)
17 | 	def dot(a: Mat, b: Mat) = sum((a :* b).r)
18 | 
19 | 
20 | 	def apply(actual: Numerical, predicted: Numerical) =
21 | 		apply(actual.y, predicted.y)
22 | 
23 | 	/*
24 | 		Compute the distance between two vectors.
25 | 	 */
26 | 
27 | 	def apply(a: Vec, b: Vec): Double
28 | 
29 | 	/*
30 | 		Compute a vector of distances to all the rows
31 | 		of the matrix a.
32 | 	 */
33 | 
34 | 	def apply(a: Mat, b: Vec): Vec =
35 | 		a.r.map( row =>
36 | 			apply(row, b)
37 | 		)
38 | 
39 | 	/*
40 | 		Compute a vector of distances between the coaligned
41 | 		rows of both matrices.
42 | 	 */
43 | 
44 | 	def apply(a: Mat, b: Mat): Vec =
45 | 		Vec.tabulate(a.rows)( i =>
46 | 			apply(a(i, ::).t, b(i, ::).t)
47 | 		)
48 | 
49 | 	/*
50 | 		For performance reasons, you can also override this method.
51 | 		It computes the sum of the distances that you obtain by
52 | 		taking the pairwise rows of the both matrices.
53 | 	 */
54 | 
55 | 	def total(a: Mat, b: Mat): Double = sum(apply(a, b))
56 | }
57 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/loss/numerical/distance/EuclideanDistance.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.loss.numerical.distance
 2 | 
 3 | import breeze.linalg.norm
 4 | import qlearn.Types._
 5 | 
 6 | object EuclideanDistance extends Distance {
 7 | 
 8 | 	def apply(a: Vec, b: Vec) = norm(a - b)
 9 | 
10 | 	override def apply(a: Mat, b: Vec) =
11 | 		norm((a.r - b).r)
12 | 
13 | 	override def apply(a: Mat, b: Mat) =
14 | 		norm((a - b).r)
15 | }
16 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/loss/numerical/distance/ManhattanDistance.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.loss.numerical.distance
 2 | 
 3 | import breeze.linalg.sum
 4 | import breeze.numerics.abs
 5 | import qlearn.Types._
 6 | 
 7 | object ManhattanDistance extends Distance {
 8 | 
 9 | 	def apply(a: Vec, b: Vec) = sum(abs(a - b))
10 | 
11 | 	override def apply(a: Mat, b: Vec) = sum(abs(a.r - b).r)
12 | 
13 | 	override def apply(a: Mat, b: Mat) = sum(abs(a - b).r)
14 | 
15 | 	override def total(a: Mat, b: Mat) = sum(abs(a - b))
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/loss/numerical/distance/NormDistance.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.loss.numerical.distance
 2 | 
 3 | import breeze.linalg.norm
 4 | import qlearn.Types._
 5 | 
 6 | case class NormDistance(p: Double) extends Distance {
 7 | 	require(p >= 1.0, "P under 1.0 produces a degenerate norm.")
 8 | 
 9 | 	def apply(a: Vec, b: Vec) = norm(a - b, p)
10 | 
11 | 	override def apply(a: Mat, b: Vec) =
12 | 		norm((a.r - b).r, p)
13 | 
14 | 	override def apply(a: Mat, b: Mat) =
15 | 		norm((a - b).r, p)
16 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/loss/numerical/distance/PolyKernel.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.loss.numerical.distance
 2 | 
 3 | import breeze.numerics.pow
 4 | import qlearn.Types._
 5 | 
 6 | case class PolyKernel(exponent: Double) extends Distance {
 7 | 
 8 | 	def apply(a: Vec, b: Vec) = pow(dot(a,b), exponent)
 9 | 
10 | 	override def apply(a: Mat, b: Vec) = pow(dot(a,b), exponent)
11 | 
12 | 	override def apply(a: Mat, b: Mat) = pow(dot(a,b), exponent)
13 | }
14 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/loss/numerical/distance/RBFKernel.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.loss.numerical.distance
 2 | 
 3 | import breeze.numerics.exp
 4 | import qlearn.Types._
 5 | 
 6 | case class RBFKernel(gamma: Double = 0.01) extends Distance {
 7 | 
 8 | 	def apply(a: Vec, b: Vec) = {
 9 | 		val dots = dot(a,b)*2.0 - dot(a,a) - dot(b,b)
10 | 		exp(dots * gamma)
11 | 	}
12 | 
13 | 	override def apply(a: Mat, b: Vec) = {
14 | 		val dots = dot(a,b)*2.0 - dot(a,a) - dot(b,b)
15 | 		exp(dots * gamma)
16 | 	}
17 | 
18 | 	override def apply(a: Mat, b: Mat) = {
19 | 		val dots = dot(a,b)*2.0 - dot(a,a) - dot(b,b)
20 | 		exp(dots * gamma)
21 | 	}
22 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/Clusterer.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml
 2 | 
 3 | import qlearn.dataset.{NominalBasic, Unlabeled}
 4 | 
 5 | abstract class Clusterer {
 6 | 
 7 | 	protected def names(k: Int) = (0 until k).map(_.toString).toVector
 8 | 
 9 | 	def cluster(data: Unlabeled): NominalBasic
10 | }
11 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/FittedModel.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml
 2 | 
 3 | import qlearn.dataset.{Labeled, Unlabeled}
 4 | 
 5 | 
 6 | abstract class FittedModel[T] {
 7 | 
 8 | 	/*
 9 | 		We keep the reference to the learning dataset just to know the schema
10 | 		(e.g. the ordinals of the nominal attributes, etc).
11 | 
12 | 		TODO: get rid of this in the future. The learning dataset should be garbage collected ASAP.
13 | 	 */
14 | 
15 | 	val schema: T
16 | 
17 | 	/*
18 | 		The predict() method is the primary one. It returns the resulting dataset.
19 | 	 */
20 | 
21 | 	def predict(data: Unlabeled): T
22 | }
23 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/FittedModelMulti.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml
 2 | 
 3 | import qlearn.dataset.{Unlabeled, SingleLabeled, MultiLabeled}
 4 | import qlearn.Types.Mat
 5 | 
 6 | case class FittedModelMulti[T <: SingleLabeled[T]](schema: MultiLabeled[T], models: Seq[FittedModel[T]]) extends FittedModel[MultiLabeled[T]] {
 7 | 
 8 | 	def predict(data: Unlabeled): MultiLabeled[T] = {
 9 | 		val ynew = Mat.horzcat(models.map(_.predict(data).ymat): _*)
10 | 		schema.updated(data, ynew)
11 | 	}
12 | }
13 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/Model.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml
 2 | 
 3 | import qlearn.dataset.{MultiLabeled, SingleLabeled}
 4 | 
 5 | import scala.util.Random
 6 | 
 7 | abstract class Model[T] {
 8 | 
 9 | 	/*
10 | 		Models that want to report their learning status back to the user
11 | 		can allow them to specify this function
12 | 	 */
13 | 
14 | 	val reporter = { (_: Seq[Double], _: String) => /* no action */ }
15 | 
16 | 	/*
17 | 		The fit() method is the primary one. It returns the fitted model.
18 | 	 */
19 | 
20 | 	def fit(data: T): FittedModel[T]
21 | 
22 | 	//def fit(data: MultiLabeled[T]): FittedModel[MultiLabeled[T]] = FittedModelMulti(data, data.ys.map(fit))
23 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/ModelForTwo.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml
 2 | 
 3 | import qlearn.dataset.Labeled
 4 | 
 5 | 
 6 | trait ModelForTwo[T <: Labeled[T]] {
 7 | 
 8 | 	/*
 9 | 		This trait represents the model's ability to learn from the dataset,
10 | 		which is split in two parts. Of course every model is able to do that
11 | 		by just concatenating beforehand. However, you should only use this
12 | 		trait if such thing could be done effortlessly, without additional
13 | 		memory or CPU costs. Note, concatenation is an O(n) operation.
14 | 
15 | 		Why is this good?
16 | 
17 | 		For models that implement this, we can avoid copying the dataset K times
18 | 		on K-fold cross validation. It is a matter of simply splicing out the current
19 | 		test fold from the dataset, leaving the training dataset in two separate parts
20 | 		(well, except for the first and last fold).
21 |  */
22 | 
23 | 	def fit(data1: T, data2: T): FittedModel[T]
24 | 
25 | 
26 | 	/*def fit(data: T): FittedModel[T] =
27 | 		fit(data, data.pick(0 to -1))*/
28 | }
29 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/RandomizedClusterer.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml
 2 | 
 3 | import qlearn.dataset.{Unlabeled, NominalBasic}
 4 | 
 5 | import scala.util.Random
 6 | 
 7 | trait RandomizedClusterer extends Clusterer {
 8 | 	/*
 9 | 		A clusterer should include this trait if its output significantly
10 | 		depends on the behavior of the random number generator.
11 | 	 */
12 | 
13 | 	def cluster(data: Unlabeled, seed: Long): NominalBasic
14 | 
15 | 	def cluster(data: Unlabeled) = cluster(data, Random.nextLong)
16 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/RandomizedModel.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml
 2 | 
 3 | import scala.util.Random
 4 | 
 5 | trait RandomizedModel[T] extends Model[T] {
 6 | 	/*
 7 | 		A model should include this trait if its output significantly
 8 | 		depends on the behavior of the random number generator.
 9 | 	 */
10 | 
11 | 	def fit(data: T, seed: Long): FittedModel[T]
12 | 
13 | 	def fit(data: T) = fit(data, Random.nextLong)
14 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/classify/FittedNeuralNetwork.scala:
--------------------------------------------------------------------------------
1 | package qlearn.ml.classify
2 | 
3 | /*import qlearn.dataset.{MultiLabeled, SingleLabeled, Unlabeled}
4 | import qlearn.ml.FittedModel
5 | 
6 | case class FittedNeuralNetwork[T <: SingleLabeled[T]](schema: MultiLabeled[T], a: Int) extends FittedModel[MultiLabeled[T]] {
7 | 	def predict(data: Unlabeled) = schema
8 | }*/
9 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/classify/FittedRandomTree.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.classify
 2 | 
 3 | import breeze.linalg.sum
 4 | import qlearn.Types._
 5 | import qlearn.dataset.{Nominal, Unlabeled}
 6 | import qlearn.ml.FittedModel
 7 | 
 8 | case class FittedRandomTree(
 9 | 	schema: Nominal,
10 | 	tree: BinaryTree
11 | ) extends FittedModel[Nominal] {
12 | 
13 | 	def predict(data: Unlabeled) = {
14 | 		val newy = data.xmat.r.map { vec =>
15 | 
16 | 			def recurse(tree: BinaryTree): Vec =
17 | 				tree match {
18 | 					case Leaf(prediction) => prediction / sum(prediction)
19 | 					case Node(left, right, feature, split) =>
20 | 						recurse(
21 | 							if (vec(feature) < split) left else right
22 | 						)
23 | 				}
24 | 
25 | 			recurse(tree)
26 | 		}
27 | 		schema.updated(data, newy)
28 | 	}
29 | }
30 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/classify/FittedSameDistribution.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.classify
 2 | 
 3 | import qlearn.Types.Vec
 4 | import qlearn.dataset.{Unlabeled, Nominal}
 5 | import qlearn.ml.FittedModel
 6 | 
 7 | case class FittedSameDistribution(schema: Nominal, prediction: Vec) extends FittedModel[Nominal] {
 8 | 
 9 | 	def predict(data: Unlabeled) =
10 | 		schema.updated(data, Vec.ones[Double](data.recordCount) * prediction.t)
11 | }
12 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/classify/FittedSimpleKNN.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.classify
 2 | 
 3 | import breeze.linalg.sum
 4 | import qlearn.Types._
 5 | import qlearn.dataset.{Nominal, Unlabeled}
 6 | import qlearn.loss.numerical.distance.{EuclideanDistance, Distance}
 7 | import qlearn.ml.FittedModel
 8 | import qlearn.util.Util
 9 | 
10 | case class FittedSimpleKNN(
11 | 	schema: Nominal,
12 | 	k: Int,
13 | 	distance: Distance = EuclideanDistance,
14 | 	weighting: Double => Double = {_ => 1.0}
15 | ) extends FittedModel[Nominal] {
16 | 
17 | 	/*
18 | 		This classifier predicts the class distribution probabilities according
19 | 		to the k nearest records and their actual distance.
20 | 
21 | 		With the weighting function you can denote how you want to ponderize
22 | 		the records according to their distance. Some examples:
23 | 
24 | 		{_ => 1.0}      : ponderize the points completely equally
25 | 		math.pow(_, -p) : exponential dampering for p > 0
26 | 		...
27 | 	 */
28 | 
29 | 	require(k <= schema.recordCount, s"Cannot run $k-NN classifier on a dataset with just ${schema.recordCount} records.")
30 | 
31 | 	def predict(data: Unlabeled) = {
32 | 		val newy = data.xmat.r.map { record =>
33 | 			val distances = distance(schema.xmat, record)
34 | 			val smallest = Util.kSmallestIndices(distances, k)
35 | 
36 | 			// select k smallest distances and weight them
37 | 			val weighted = distances(smallest).map(weighting)
38 | 			// select k closest corresponding output rows
39 | 			val chosen = schema.ymat(smallest, ::).toDenseMatrix
40 | 			// weight those rows with "weighted" vector
41 | 			val columnSums = sum((chosen.c :* weighted).c).t
42 | 			// finally, normalize
43 | 			columnSums / sum(columnSums)
44 | 		}
45 | 		schema.updated(data, newy)
46 | 	}
47 | }
48 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/classify/NeuralNetwork.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.classify
 2 | 
 3 | /*import qlearn.util.nnet.Layer
 4 | import qlearn.util.nnet.activations.ActivationFunction
 5 | import qlearn.dataset.{MultiLabeled, SingleLabeled}
 6 | import qlearn.ml.Model
 7 | 
 8 | 
 9 | case class NeuralNetwork[T <: SingleLabeled[T]](
10 | 	layers: Vector[Layer],
11 | 	lastLevelActivation: ActivationFunction
12 | ) extends Model[MultiLabeled[T]] {
13 | 
14 | 	def fit(data: MultiLabeled[T])= {
15 | 		FittedNeuralNetwork(data, 10)
16 | 	}
17 | }
18 | */


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/classify/RandomForest.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.classify
 2 | 
 3 | import qlearn.dataset.Nominal
 4 | import qlearn.ml.{RandomizedModel, Model}
 5 | import qlearn.ml.meta.Bagging
 6 | 
 7 | case class RandomForest(tree: RandomTree, count: Int) extends Model[Nominal] with RandomizedModel[Nominal] {
 8 | 
 9 | 	def fit(data: Nominal, seed: Long) =
10 | 		Bagging(Seq.fill(count)(tree)).fit(data, seed)
11 | }
12 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/classify/RandomTree.scala:
--------------------------------------------------------------------------------
  1 | package qlearn.ml.classify
  2 | 
  3 | import breeze.linalg.sum
  4 | import qlearn.Types._
  5 | import qlearn.dataset.Nominal
  6 | import qlearn.ml.{RandomizedModel, Model}
  7 | import qlearn.util.Util
  8 | 
  9 | import scala.util.Random
 10 | 
 11 | abstract class BinaryTree
 12 | case class Leaf(predict: Vec) extends BinaryTree
 13 | case class Node(left: BinaryTree, right: BinaryTree, feature: Int, split: Double) extends BinaryTree
 14 | 
 15 | case class RandomTree(
 16 | 	numFeatures: Option[Int] = None,
 17 | 	maxDepth: Option[Int] = None,
 18 | 	minParent: Int = 1
 19 | ) extends Model[Nominal] with RandomizedModel[Nominal] {
 20 | 
 21 | 	require(minParent >= 1)
 22 | 
 23 | 	def recurse(data: Nominal, indices: Vector[Int], rand: Random, depth: Int = 0): BinaryTree = {
 24 | 
 25 | 		/*
 26 | 			Pick a random feature set.
 27 | 
 28 | 			This is the random portion of this learner, appropriately
 29 | 			named RandomTree.
 30 | 		 */
 31 | 
 32 | 		def randomFeaturePick = {
 33 | 			val range = 0 until data.featureCount
 34 | 			numFeatures match {
 35 | 				case Some(limit) => Util.randomSubset(range, limit, rand)
 36 | 				case _ => range
 37 | 			}
 38 | 		}
 39 | 
 40 | 		/*
 41 | 			Compute the entropy of a distribution.
 42 | 
 43 | 			Unit is nats instead ob bits.
 44 | 		 */
 45 | 
 46 | 		def entropy(v: Vec) = {
 47 | 			val total = sum(v)
 48 | 			math.log(total) - sum(v.map( d =>
 49 | 				if (d == 0) 0 else d * math.log(d)
 50 | 			)) / total
 51 | 		}
 52 | 
 53 | 		/*
 54 | 			What is the best split value for a given feature?
 55 | 
 56 | 			@returns (best split point, entropy)
 57 | 		 */
 58 | 
 59 | 		def bestSplitPoint(feature: Int, distribution: Vec): (Double, Double) = {
 60 | 			var bestScore = Double.PositiveInfinity
 61 | 			var best = 0.0
 62 | 
 63 | 			val sorted = indices.sortBy(data.xmat(_, feature))
 64 | 			val l = data.ymat(0, ::).t * 0.0
 65 | 			val r = distribution
 66 | 			sorted.indices.init.foreach { i =>
 67 | 				val row = data.ymat(sorted(i), ::).t
 68 | 				l += row
 69 | 				r -= row
 70 | 
 71 | 				val score = entropy(l) + entropy(r)
 72 | 				if (score < bestScore) {
 73 | 					bestScore = score
 74 | 					best = data.xmat(sorted(i), feature) + data.xmat(sorted(i+1), feature)
 75 | 				}
 76 | 			}
 77 | 			best/2 -> bestScore
 78 | 		}
 79 | 
 80 | 		/*
 81 | 			Did we reach the stopping condition yet?
 82 | 
 83 | 			React accordingly.
 84 | 		 */
 85 | 
 86 | 		val distribution =
 87 | 			sum( data.ymat(indices, ::).toDenseMatrix.c ).t
 88 | 
 89 | 		val shouldStop =
 90 | 			indices.size <= minParent  ||          // we have less than minParent remaining
 91 | 			maxDepth.exists(_ > depth) ||          // the depth is already too big
 92 | 			(distribution :> 0.0).activeSize == 1  // the node is already pure
 93 | 
 94 | 		if (shouldStop) Leaf(distribution) else {
 95 | 			val (feature, (split, _)) =
 96 | 				randomFeaturePick.map( feature =>
 97 | 					feature -> bestSplitPoint(feature, distribution)
 98 | 				).minBy(_._2._2)
 99 | 
100 | 			val (left, right) = indices.partition(data.xmat(_, feature) <= split)
101 | 			Node(
102 | 				recurse(data, left, rand, depth + 1),
103 | 				recurse(data, right, rand, depth + 1),
104 | 				feature, split
105 | 			)
106 | 		}
107 | 	}
108 | 
109 | 	def fit(data: Nominal, seed: Long) = FittedRandomTree(
110 | 		data, recurse(data, data.indices.toVector, new Random(seed))
111 | 	)
112 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/classify/SameDistribution.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.classify
 2 | 
 3 | import breeze.linalg.sum
 4 | import qlearn.Types._
 5 | import qlearn.dataset.Nominal
 6 | import qlearn.ml.Model
 7 | 
 8 | case class SameDistribution() extends Model[Nominal] {
 9 | 
10 | 	def fit(data: Nominal) = {
11 | 		val columnSums = sum(data.ymat.c).t
12 | 		FittedSameDistribution(data, columnSums / sum(columnSums))
13 | 	}
14 | }
15 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/classify/SimpleKNN.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.classify
 2 | 
 3 | import qlearn.dataset.Nominal
 4 | import qlearn.loss.numerical.distance.{EuclideanDistance, Distance}
 5 | import qlearn.ml.Model
 6 | 
 7 | case class SimpleKNN(
 8 | 	k: Int,
 9 | 	distance: Distance = EuclideanDistance,
10 | 	weighting: Double => Double = {_ => 1.0}
11 | ) extends Model[Nominal] {
12 | 
13 | 	def fit(data: Nominal) =
14 | 		FittedSimpleKNN(data, k, distance, weighting)
15 | }
16 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/classify/weka/LogisticRegression.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.classify.weka
 2 | 
 3 | import qlearn.dataset.Nominal
 4 | import qlearn.wekas.WekaModel
 5 | import weka.classifiers.functions.Logistic
 6 | 
 7 | case class LogisticRegression(
 8 | 	ridge: Double = 1e-8,
 9 |   maxIterations: Option[Int] = None,
10 |   useConjugateGradientDescent: Boolean = false
11 | ) extends WekaModel[Nominal](new Logistic) {
12 | 
13 | 	val m = model.asInstanceOf[Logistic]
14 | 
15 | 	m.setRidge(ridge)
16 | 	m.setMaxIts(maxIterations.getOrElse(-1))
17 | 	m.setUseConjugateGradientDescent(useConjugateGradientDescent)
18 | }
19 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/classify/weka/REPTree.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.classify.weka
 2 | 
 3 | import qlearn.dataset.Nominal
 4 | import qlearn.ml.RandomizedModel
 5 | import qlearn.wekas.WekaModel
 6 | import weka.classifiers.trees
 7 | 
 8 | import scala.util.Random
 9 | 
10 | case class REPTree(
11 | 	maxDepth: Option[Int] = None,
12 | 	minInstancesPerLeaf: Int = 2,
13 | 	pruning: Boolean = true,
14 | 	seed: Long = Random.nextLong
15 | ) extends WekaModel[Nominal](new trees.REPTree) {
16 | 
17 | 	val m = model.asInstanceOf[trees.REPTree]
18 | 
19 | 	m.setMaxDepth(maxDepth.getOrElse(-1))
20 | 	m.setMinNum(minInstancesPerLeaf)
21 | 	m.setNoPruning(!pruning)
22 | 	m.setSeed(seed.toInt)
23 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/cluster/KMeans.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.cluster
 2 | 
 3 | import breeze.linalg.argmin
 4 | import qlearn.dataset.schema.NominalColumn
 5 | import qlearn.dataset.{NominalBasic, Unlabeled}
 6 | import qlearn.Types._
 7 | import qlearn.loss.numerical.distance.{Distance, EuclideanDistance}
 8 | import qlearn.strategies.{NoRecentImprovement, Stopping}
 9 | import qlearn.util.Util
10 | import qlearn.ml.{RandomizedClusterer, Clusterer}
11 | 
12 | import scala.util.Random
13 | 
14 | 
15 | case class KMeans(
16 | 	k: Int,
17 | 	distance: Distance = EuclideanDistance,
18 | 	strategy: Stopping = NoRecentImprovement(5)
19 | ) extends Clusterer with RandomizedClusterer {
20 | 
21 | 	/*
22 | 		This is the algoritm that performs k-means clustering via the iterative approach.
23 | 
24 | 	 */
25 | 
26 | 
27 | 	require(k > 1, "Clustering requires at least 2 target clusters.")
28 | 
29 | 	def cluster(data: Unlabeled, seed: Long): NominalBasic = {
30 | 		require(k <= data.recordCount, "Cannot have more clusters than data points.")
31 | 
32 | 		val mat = data.xmat
33 | 		var centroids = {
34 | 			val randomSubset = Util.randomSubset(data.indices, k, new Random(seed))
35 | 			mat(randomSubset, ::).toDenseMatrix
36 | 		}
37 | 
38 | 		def closest(p: Vec): Int =
39 | 			argmin(distance(centroids, p))
40 | 
41 | 		val y = strategy.apply {
42 | 			val updated = Mat.zeros[Double](k, data.featureCount)
43 | 			val counts = Vec.zeros[Double](k)
44 | 
45 | 			mat.r.foreach { row =>
46 | 				val index = closest(row)
47 | 				updated(index, ::) += row.t
48 | 				counts(index) += 1.0
49 | 			}
50 | 
51 | 			updated.c /= counts
52 | 			val error = distance.total(centroids, updated)
53 | 			centroids = updated
54 | 
55 | 			error -> { () =>
56 | 				Vec.tabulate(data.recordCount)( i => closest(mat(i, ::).t) )
57 | 			}
58 | 		}
59 | 
60 | 		NominalBasic(data, y, NominalColumn('cluster, names(k)))
61 | 	}
62 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/cluster/weka/CobWeb.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.cluster.weka
 2 | 
 3 | import qlearn.ml.RandomizedClusterer
 4 | import qlearn.wekas.WekaClusterer
 5 | import weka.clusterers.Cobweb
 6 | 
 7 | import scala.util.Random
 8 | 
 9 | case class CobWeb(
10 | 	acuity: Double = 1.0,
11 | 	cutoff: Double = 0.002,
12 | 	seed: Long = Random.nextLong
13 | ) extends WekaClusterer(new Cobweb) {
14 | 
15 | 	val c = clusterer.asInstanceOf[Cobweb]
16 | 
17 | 	c.setAcuity(acuity)
18 | 	c.setCutoff(cutoff)
19 | 	c.setSeed(seed.toInt)
20 | }
21 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/cluster/weka/ExpectationMaximization.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.cluster.weka
 2 | 
 3 | import qlearn.ml.RandomizedClusterer
 4 | import qlearn.wekas.WekaClusterer
 5 | import weka.clusterers.EM
 6 | 
 7 | import scala.util.Random
 8 | 
 9 | case class ExpectationMaximization(
10 | 	k: Option[Int] = None,
11 | 	folds: Int = 10,
12 | 	runs: Int = 10,
13 | 	maxClusters: Option[Int] = None,
14 | 	maxIterations: Int = 100,
15 | 	epsForK: Double = 1e-6,
16 | 	epsForE: Double = 1e-6,
17 | 	seed: Long = Random.nextLong
18 | ) extends WekaClusterer(new EM) {
19 | 
20 | 	val c = clusterer.asInstanceOf[EM]
21 | 
22 | 	c.setNumClusters(k.getOrElse(-1))
23 | 	c.setNumFolds(folds)
24 | 	c.setNumKMeansRuns(runs)
25 | 	c.setMaximumNumberOfClusters(maxClusters.getOrElse(-1))
26 | 	c.setMaxIterations(maxIterations)
27 | 	c.setMinLogLikelihoodImprovementCV(epsForK)
28 | 	c.setMinLogLikelihoodImprovementIterating(epsForE)
29 | 	c.setSeed(seed.toInt)
30 | }
31 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/cluster/weka/KMeans.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.cluster.weka
 2 | 
 3 | import qlearn.loss.numerical.distance.{Distance, EuclideanDistance}
 4 | import qlearn.ml.{RandomizedClusterer, RandomizedModel}
 5 | import qlearn.wekas.WekaClusterer
 6 | import weka.clusterers.SimpleKMeans
 7 | 
 8 | import scala.util.Random
 9 | 
10 | case class KMeans(
11 | 	k: Int,
12 | 	distance: Distance = EuclideanDistance,
13 | 	maxIterations: Int = 500,
14 | 	seed: Long = Random.nextLong
15 | ) extends WekaClusterer(new SimpleKMeans) {
16 | 
17 | 	val c = clusterer.asInstanceOf[SimpleKMeans]
18 | 
19 | 	c.setNumClusters(k)
20 | 	c.setDistanceFunction(convertDistance(distance))
21 | 	c.setMaxIterations(maxIterations)
22 | 	c.setSeed(seed.toInt)
23 | }
24 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/meta/Bagging.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.meta
 2 | 
 3 | import qlearn.dataset.SingleLabeled
 4 | import qlearn.ml.{Model, RandomizedModel}
 5 | import qlearn.util.Util
 6 | 
 7 | import scala.util.Random
 8 | 
 9 | case class Bagging[T <: SingleLabeled[T]](
10 | 	learners: Seq[Model[T]],
11 | 	bagSizePercentage: Double = 100
12 | ) extends Model[T] with RandomizedModel[T] {
13 | 
14 | 	def fit(data: T, seed: Long) = {
15 | 		val rand = new Random(seed)
16 | 		val bagSize = (bagSizePercentage * data.recordCount / 100).round.toInt
17 | 
18 | 		val fitted = learners.map { learner =>
19 | 			val bag = Util.randomWithReplacement(data.indices, bagSize, rand)
20 | 			learner.fit(data.pick(bag))
21 | 		}
22 | 
23 | 		FittedBagging(data, fitted)
24 | 	}
25 | }
26 | 
27 | object Bagging {
28 | 	def apply[T <: SingleLabeled[T]](learner: Model[T], iterations: Int): Bagging[T] =
29 | 		Bagging(Seq.fill(iterations)(learner))
30 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/meta/FittedBagging.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.meta
 2 | 
 3 | import breeze.linalg.sum
 4 | import qlearn.dataset.{SingleLabeled, Unlabeled}
 5 | import qlearn.ml.FittedModel
 6 | 
 7 | case class FittedBagging[T <: SingleLabeled[T]](
 8 | 	schema: T,
 9 | 	fitted: Seq[FittedModel[T]]
10 | ) extends FittedModel[T] {
11 | 
12 | 	def predict(data: Unlabeled) = {
13 | 		val newy = sum(fitted.map(_.predict(data).ymat))
14 | 		schema.updated(data, newy / fitted.size.toDouble)
15 | 	}
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/meta/FittedOneVsAll.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.meta
 2 | 
 3 | import breeze.linalg.normalize
 4 | import qlearn.dataset.{Binary, Nominal, Unlabeled}
 5 | import qlearn.ml.FittedModel
 6 | import qlearn.Types._
 7 | 
 8 | case class FittedOneVsAll(
 9 | 	schema: Nominal,
10 | 	fitted: Seq[FittedModel[Binary]]
11 | ) extends FittedModel[Nominal] {
12 | 
13 | 	def predict(data: Unlabeled) = {
14 | 		val newy = Mat(fitted.map(_.predict(data).yb.toArray): _*).t
15 | 		schema.updated(data, normalize(newy.r, 1))
16 | 	}
17 | }
18 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/meta/OneVsAll.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.meta
 2 | 
 3 | import qlearn.dataset.schema.BinaryColumn
 4 | import qlearn.dataset.{Binary, Nominal}
 5 | import qlearn.loss.binary.LogisticLoss
 6 | import qlearn.ml.Model
 7 | 
 8 | case class OneVsAll(learner: Model[Binary]) extends Model[Nominal] {
 9 | 
10 | 	private def binarize(data: Nominal) =
11 | 		(0 to data.ymat.cols).map { klass =>
12 | 			val yb = data.y.map { v =>
13 | 				if (v == klass) 1.0 else 0.0
14 | 			}
15 | 			Binary(data.x, yb, BinaryColumn(data.name))
16 | 		}
17 | 
18 | 	def fit(data: Nominal) =
19 | 		FittedOneVsAll(data, binarize(data).map(learner.fit))
20 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/regress/FittedRidgeRegression.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.regress
 2 | 
 3 | import qlearn.Types.Mat
 4 | import qlearn.dataset.{MultiLabeled, SingleLabeled, Unlabeled}
 5 | import qlearn.ml.FittedModel
 6 | 
 7 | case class FittedRidgeRegression[T <: SingleLabeled[T]](schema: T, coef: Mat) extends FittedModel[T] {
 8 | 
 9 | 	def predict(data: Unlabeled) = schema.updated(data, data.xmat * coef)
10 | }
11 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/regress/FittedRidgeRegressionMulti.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.regress
 2 | 
 3 | import qlearn.Types.Mat
 4 | import qlearn.dataset.{MultiLabeled, SingleLabeled, Unlabeled}
 5 | import qlearn.ml.FittedModel
 6 | 
 7 | case class FittedRidgeRegressionMulti[T <: SingleLabeled[T]](schema: MultiLabeled[T], coef: Mat) extends FittedModel[MultiLabeled[T]] {
 8 | 
 9 | 	def predict(data: Unlabeled) = schema.updated(data, data.xmat * coef)
10 | }
11 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/regress/RidgeRegression.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.regress
 2 | 
 3 | import breeze.linalg.{inv, pinv}
 4 | import qlearn.dataset.{Labeled, Numerical, MultiLabeled, SingleLabeled}
 5 | import qlearn.Types.Mat
 6 | import qlearn.ml.Model
 7 | 
 8 | case class RidgeRegression(ridge: Double = 0) extends Model[Numerical] {
 9 | 	require(ridge >= 0, "The ridge parameter must be positive")
10 | 
11 | 	private[this] def inverse[T <: Labeled[T]](data: T) =
12 | 		if (ridge != 0) {
13 | 			// less performant case
14 | 			val x = data.xmat
15 | 			val xt = x.t
16 | 			val diagonal = Mat.eye[Double](data.width) * ridge
17 | 			// TODO: Solve?
18 | 			inv(xt * x - diagonal) * xt
19 | 		} else pinv(data.xmat)
20 | 
21 | 	def fit(data: Numerical) =
22 | 		FittedRidgeRegression(data, inverse(data) * data.ymat)
23 | 
24 | 	/*override def fit(data: MultiLabeled[Numerical]) =
25 | 		FittedRidgeRegressionMulti(data, inverse(data) * data.ymat)*/
26 | }
27 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/ml/regress/weka/RidgeRegression.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.ml.regress.weka
 2 | 
 3 | import qlearn.dataset.Numerical
 4 | import qlearn.wekas.WekaModel
 5 | import weka.classifiers.functions.LinearRegression
 6 | 
 7 | case class RidgeRegression(
 8 | 	ridge: Double = 0,
 9 | 	eliminateColinear: Boolean = false,
10 |   conserveMemory: Boolean = false
11 | ) extends WekaModel[Numerical](new LinearRegression) {
12 | 
13 | 	val m = model.asInstanceOf[LinearRegression]
14 | 
15 | 	m.setRidge(ridge)
16 | 	m.setEliminateColinearAttributes(eliminateColinear)
17 | 	m.setMinimal(conserveMemory)
18 | }
19 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/strategies/NoRecentImprovement.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.strategies
 2 | 
 3 | case class NoRecentImprovement(n: Int, maxIterations: Int = 500) extends Stopping {
 4 | 	def apply[T](a: => (Double, () => T)) = {
 5 | 		val call = a
 6 | 
 7 | 		var best = call._2()
 8 | 		var bestError = call._1
 9 | 		var bestI = 1
10 | 
11 | 		var i = 1
12 | 		while(i < maxIterations && i - bestI < n) {
13 | 			i += 1
14 | 
15 | 			val (error, thunk) = a
16 | 			println(s"Error: $error")
17 | 			if (error < bestError) {
18 | 				best = thunk()
19 | 				bestError = error
20 | 				bestI = i
21 | 			} else {
22 | 				println("* result NOT improved")
23 | 			}
24 | 		}
25 | 
26 | 		best
27 | 	}
28 | }
29 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/strategies/Stopping.scala:
--------------------------------------------------------------------------------
1 | package qlearn.strategies
2 | 
3 | abstract class Stopping {
4 | 	def apply[T](a: => (Double, () => T)): T
5 | }
6 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/util/Util.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.util
 2 | 
 3 | import qlearn.Types.Vec
 4 | 
 5 | import scala.util.Random
 6 | 
 7 | object Util {
 8 | 	val superScript = Vector('⁰', '¹', '²', '³', '⁴', '⁵', '⁶', '⁷', '⁸', '⁹')
 9 | 
10 | 	def toSuperScript(n: Int) = n.toString.map(_.asDigit).map(superScript).mkString
11 | 
12 | 	def printDoubleNicely(num: Double, places: Int = 8) = {
13 | 		val str = s"%.${places - 1}f" format num take (places + 1)
14 | 		if (str == "NaN") {
15 | 			"_" * places
16 | 		} else if (str.startsWith("0.000") && num != 0 || str.startsWith("-0.000") || !str.contains('.')) {
17 | 			val Array(mant, rawExp) = s"%.${places}e" format num split "e\\+?"
18 | 			val exp = rawExp.toInt
19 | 
20 | 			val len = places - exp.toString.size - 1
21 | 			s"%.${len}se%s" format (mant, exp)
22 | 		} else str.init
23 | 	}
24 | 
25 | 
26 | 	def randomSubset[T](items: IndexedSeq[T], k: Int, rnd: Random = Random) = {
27 | 		val n = items.size
28 | 		require(n >= k, s"Not enough elements: Cannot select $k from $n")
29 | 
30 | 		var res = List.empty[T]
31 | 		(n - k until n).foreach { i =>
32 | 			val pos = rnd.nextInt(i+1)
33 | 			val item = items(pos)
34 | 			res ::= (if (res contains item) items(i) else item)
35 | 		}
36 | 		res
37 | 	}
38 | 
39 | 	def randomWithReplacement[T](items: IndexedSeq[T], k: Int, rnd: Random = Random) =
40 | 		Vector.fill(k)(
41 | 			items(rnd.nextInt(items.size))
42 | 		)
43 | 
44 | 
45 | 	def kSmallestIndices(items: Vec, k: Int): IndexedSeq[Int] = {
46 | 		val vec = items.toScalaVector
47 | 		vec.indices.sortBy(vec).take(k)
48 | 	}
49 | 
50 | 	def kthSmallestElement(items: Vec, k: Int): Double = {
51 | 		???
52 | 	}
53 | }
54 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/util/decisionStump/DecisionStump.scala:
--------------------------------------------------------------------------------
1 | package qlearn.util.decisionStump
2 | 
3 | class DecisionStump {
4 | 
5 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/util/nnet/Layer.scala:
--------------------------------------------------------------------------------
1 | package qlearn.util.nnet
2 | 
3 | import qlearn.util.nnet.activations.ActivationFunction
4 | 
5 | case class Layer(size: Int, activation: ActivationFunction) {
6 | 	require(size > 0, "Layer must have at least one neuron.")
7 | }
8 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/util/nnet/activations/ActivationFunction.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.util.nnet.activations
 2 | 
 3 | import qlearn.Types.Vec
 4 | 
 5 | abstract class ActivationFunction {
 6 | 	val min: Double
 7 | 	val max: Double
 8 | 
 9 | 	def compute(x: Vec): Vec
10 | }
11 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/util/nnet/activations/Eliott.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.util.nnet.activations
 2 | 
 3 | import breeze.numerics._
 4 | import qlearn.Types.Vec
 5 | 
 6 | // Elliott, D.L. "A better activation function for artificial neural networks", 1993
 7 | // http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.46.7204&rep=rep1&type=pdf
 8 | case class Eliott(s: Double = 1.0) extends ActivationFunction {
 9 | 	val min = 0.0
10 | 	val max = 1.0
11 | 
12 | 	def compute(x: Vec) = {
13 | 		val tmp = x :* s
14 | 		tmp :/ 2.0 :/ (abs(tmp) :+ 1.0) :+ 0.5
15 | 	}
16 | 
17 | 	def gradient(x: Vec, v: Vec) = {
18 | 		val tmp = abs(x :* s) :+ 1.0
19 | 		(tmp :^ -2.0) :* (s/2)
20 | 	}
21 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/util/nnet/activations/EliottSym.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.util.nnet.activations
 2 | 
 3 | import breeze.numerics._
 4 | import qlearn.Types.Vec
 5 | 
 6 | case class EliottSym(s: Double = 1.0) extends ActivationFunction {
 7 | 	val min = -1.0
 8 | 	val max = 1.0
 9 | 
10 | 	def compute(x: Vec) = {
11 | 		val tmp = x :* s
12 | 		tmp :/ (abs(tmp) + 1.0)
13 | 	}
14 | 
15 | 	def gradient(x: Vec, v: Vec) = {
16 | 		val tmp = abs(x :* s) :+ 1.0
17 | 		(tmp :^ -2.0) :* s
18 | 	}
19 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/util/nnet/activations/Sigmoid.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.util.nnet.activations
 2 | 
 3 | import breeze.numerics._
 4 | import qlearn.Types.Vec
 5 | 
 6 | object Sigmoid extends ActivationFunction {
 7 | 	val min = 0.0
 8 | 	val max = 1.0
 9 | 
10 | 	def compute(x: Vec) = (exp(-x) :+ 1.0) :^ -1.0
11 | 
12 | 	def gradient(x: Vec, v: Vec) = v :* (-v :+ 1.0)
13 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/util/nnet/activations/Tanh.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.util.nnet.activations
 2 | 
 3 | import breeze.numerics._
 4 | import qlearn.Types.Vec
 5 | 
 6 | object Tanh extends ActivationFunction {
 7 | 	val min = -1.0
 8 | 	val max = 1.0
 9 | 
10 | 	def compute(x: Vec) = tanh(x)
11 | 
12 | 	def gradient(x: Vec, v: Vec) = -(v :^ 2.0) :+ 1.0
13 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/validation/CrossValidation.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.validation
 2 | 
 3 | import qlearn.dataset.{SingleLabeled, Nominal, Labeled}
 4 | import qlearn.ml.Model
 5 | 
 6 | case class CrossValidation[T <: Labeled[T]](data: T, folds: Int = 10, stratify: Boolean = true) extends Validation[T] {
 7 | 
 8 | 	/*
 9 | 		This is a special, very optimized k-fold cross validation engine.
10 | 
11 | 		The memory usage does not grow linearly with the number of folds, but is a constant. The dataset
12 | 		is copied at most 2 times, even if you do 100-fold cross validation.
13 | 
14 | 		Space complexity: at most 2 * N
15 | 		Time complexity: O(N)
16 | 	 */
17 | 
18 | 	require(folds >= 3, "The number of folds has to be at least three.")
19 | 	require(folds <= data.recordCount, "The number of folds cannot exceed the number of records.")
20 | 
21 | 	/*
22 | 		An array that assigns the continuous range to each fold. All folds are the same size,
23 | 		except if the dataset is not evenly divisible by the number of folds -- in that case,
24 | 		folds at the begining have one record more.
25 | 	 */
26 | 
27 | 	val ranges: Vector[Range] = {
28 | 		val div = data.recordCount / folds
29 | 		val mod = data.recordCount % folds
30 | 		val a = (0 to mod).map(_ * (div+1))
31 | 		val b = (mod+1 to folds).map(_ * div + mod)
32 | 		(a ++ b).sliding(2).map {
33 | 			case Seq(first, last) => first until last
34 | 		}.toVector
35 | 	}
36 | 
37 | 	val rangesTwice = ranges ++ ranges.map { range =>
38 | 		range.start + data.recordCount to range.last + data.recordCount
39 | 	}
40 | 
41 | 	def multiRange(a: Int, b: Int) = rangesTwice(a).start to rangesTwice(b-1).last
42 | 
43 | 	/*
44 | 		An inteligent algoritm that ensures the clases in each individual fold are as evenly distributed
45 | 		as possible.
46 | 	 */
47 | 
48 | 	def doStratify(data: Nominal) = {
49 | 		val classOf = data.indices.groupBy(data.y(_))
50 | 		val (names, counts) = classOf.mapValues(_.size).toVector.sortBy(_._2).unzip
51 | 
52 | 		val remaining = counts.toArray
53 | 
54 | 		val rangeIndices = ranges.map { range =>
55 | 			var size = range.size
56 | 			var curn = data.recordCount - range.min
57 | 
58 | 			remaining.indices.flatMap { i =>
59 | 				val chose = (1.0 * remaining(i) * size / curn).round.toInt
60 | 				size -= chose
61 | 				curn -= remaining(i)
62 | 				remaining(i) -= chose
63 | 
64 | 				classOf(names(i)).dropRight( remaining(i) ).takeRight(chose)
65 | 			}
66 | 		}
67 | 
68 | 		val indices = (rangeIndices ++ rangeIndices.dropRight(2)).flatten
69 | 		data.pick(indices)
70 | 	}
71 | 
72 | 	/*
73 | 		Main methods.
74 | 	 */
75 | 
76 | 	val dataset =
77 | 		(data, stratify) match {
78 | 			case (d: Nominal, true) => doStratify(d).asInstanceOf[T]
79 | 			case _ => data ++ data(multiRange(0, folds - 2))
80 | 		}
81 | 
82 | 	val learning =
83 | 		Vector.tabulate(folds) { fold =>
84 | 			dataset(multiRange(fold, fold + folds - 1))
85 | 		}
86 | 
87 | 	val testing = ranges.map(dataset.apply)
88 | 
89 | 	def validate(model: Model[T]) =
90 | 		(0 until folds).map { fold =>
91 | 			val prediction = model.fit(learning(fold)).predict(testing(fold).x)
92 | 			loss(testing(fold), prediction)
93 | 		}.sum / folds
94 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/validation/LeaveOneOut.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.validation
 2 | 
 3 | import qlearn.dataset.Labeled
 4 | import qlearn.ml.Model
 5 | 
 6 | case class LeaveOneOut[T <: Labeled[T]](data: T) extends Validation[T] {
 7 | 
 8 | 	/*
 9 | 		This validator mutates the dataset, therefore we have to make a defensive copy
10 | 		prior each validation. So, in case you are using multiple threads (or just want
11 | 		to trigger garbage collector less offently), use CrossValidation validator with
12 | 		folds = recordCount. It uses twice as much memory once on initialization, but
13 | 		then you can make as many validations as you need, even concurrently, without
14 | 		any additional memory usage.
15 | 
16 | 		Space complexity: N * number of threads
17 | 		Time complexity: O(1) - no preprocessing
18 | 	 */
19 | 
20 | 	def validate(model: Model[T]) = {
21 | 		val copy = data.duplicate(data.indices.init)
22 | 
23 | 		def compare(excluded: T) = {
24 | 			val prediction = model.fit(copy).predict(excluded.x)
25 | 			loss(excluded, prediction)
26 | 		}
27 | 
28 | 		val last = compare(data(-1))
29 | 
30 | 		val init = copy.indices.map { i =>
31 | 			val excluded = data(i)
32 | 
33 | 			copy.xmat(i, ::) := excluded.xmat(0, ::)
34 | 			copy.ymat(i, ::) := excluded.ymat(0, ::)
35 | 
36 | 			compare(excluded)
37 | 		}.sum
38 | 
39 | 		(init + last) / data.recordCount
40 | 	}
41 | }
42 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/validation/PercentageSplit.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.validation
 2 | 
 3 | import qlearn.dataset.{Labeled, Nominal}
 4 | import qlearn.ml.Model
 5 | 
 6 | case class PercentageSplit[T <: Labeled[T]](data: T, percentage: Double = 70, stratify: Boolean = true) extends Validation[T] {
 7 | 
 8 | 	/*
 9 | 		Space complexity: N if stratified, O(1) otherwise
10 | 	 */
11 | 
12 | 	val splitPoint = (data.recordCount / 100.0 * percentage).round.toInt
13 | 
14 | 	require(splitPoint > 0, "There has to be at least one record in the learning dataset.")
15 | 	require(splitPoint < data.recordCount, "There has to be at least one record in the validation dataset.")
16 | 
17 | 	/*
18 | 		An inteligent algoritm that ensures the clases in each individual fold are as evenly distributed
19 | 		as possible.
20 | 	 */
21 | 
22 | 	def doStratify(data: Nominal) = {
23 | 		val classOf = data.indices.groupBy(data.y(_))
24 | 		val (names, counts) = classOf.mapValues(_.size).toVector.sortBy(_._2).unzip
25 | 
26 | 		var size = splitPoint
27 | 		var curn = data.recordCount
28 | 
29 | 		val (a, b) = (counts, names).zipped.map { (count, name) =>
30 | 			val chose = (1.0 * count * size / curn).round.toInt
31 | 			size -= chose
32 | 			curn -= count
33 | 
34 | 			classOf(name).splitAt(chose)
35 | 		}.unzip
36 | 
37 | 		data.pick((a ++ b).flatten)
38 | 	}
39 | 
40 | 	/*
41 | 		Main methods.
42 | 	 */
43 | 
44 | 	val dataset =
45 | 		(data, stratify) match {
46 | 			case (d: Nominal, true) => doStratify(d).asInstanceOf[T]
47 | 			case _ => data
48 | 		}
49 | 
50 | 	val learning = dataset(0 until splitPoint)
51 | 
52 | 	val testing = dataset(splitPoint to -1)
53 | 
54 | 	def validate(model: Model[T]) = {
55 | 		val prediction = model.fit(learning).predict(testing.x)
56 | 		loss(testing, prediction)
57 | 	}
58 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/validation/SameDatasetValidation.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.validation
 2 | 
 3 | import qlearn.dataset.Labeled
 4 | import qlearn.ml.Model
 5 | 
 6 | case class SameDatasetValidation[T <: Labeled[T]](data: T) extends Validation[T] {
 7 | 
 8 | 	/*
 9 | 		Simple validation on the same dataset which you learned from.
10 | 
11 | 		Space complexity: O(1)
12 | 		Time complexity: O(1)
13 | 	 */
14 | 
15 | 	def validate(model: Model[T]) = {
16 | 		val prediction = model.fit(data).predict(data.x)
17 | 		loss(data, prediction)
18 | 	}
19 | }


--------------------------------------------------------------------------------
/src/main/scala/qlearn/validation/Validation.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.validation
 2 | 
 3 | import qlearn.dataset._
 4 | import qlearn.ml.Model
 5 | 
 6 | abstract class Validation[T <: Labeled[T]] {
 7 | 	def data: T
 8 | 
 9 | 	// TODO: ugliest hack in the world
10 | 	// T should have loss method, comment it out
11 | 	// see my stackoverflow questions
12 | 	def loss(a: T, b: T): Double = data match {
13 | 		case x: Numerical => x.schema.loss(a.asInstanceOf[Numerical], b.asInstanceOf[Numerical])
14 | 		case x: Binary => x.schema.loss(a.asInstanceOf[Binary], b.asInstanceOf[Binary])
15 | 		case x: NominalBasic => x.schema.loss(a.asInstanceOf[Nominal], b.asInstanceOf[Nominal])
16 | 		case x: NominalFull => x.schema.loss(a.asInstanceOf[Nominal], b.asInstanceOf[Nominal])
17 | 	}
18 | 
19 | 	def validate(model: Model[T]): Double
20 | }
21 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/wekas/WekaClusterer.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.wekas
 2 | 
 3 | import qlearn.dataset.schema.NominalColumn
 4 | import qlearn.dataset.{NominalBasic, Unlabeled}
 5 | import qlearn.Types.Vec
 6 | import qlearn.loss.numerical.distance._
 7 | import qlearn.ml.Clusterer
 8 | import weka.clusterers.{Clusterer => ClustererW, AbstractClusterer}
 9 | 
10 | class WekaClusterer(val clusterer: ClustererW) extends Clusterer {
11 | 
12 | 	/*
13 | 		If the weka has our distance implemented, then use theirs,
14 | 		native version. Otherwise, wrap it with WekaDistance wrapper.
15 | 	 */
16 | 
17 | 	protected def convertDistance(dist: Distance) = dist match {
18 | 		case EuclideanDistance => new weka.core.EuclideanDistance
19 | 		case ManhattanDistance => new weka.core.ManhattanDistance
20 | 		case ChebyshevDistance => new weka.core.ChebyshevDistance
21 | 		case NormDistance(p) =>
22 | 			val tmp = new weka.core.MinkowskiDistance
23 | 			tmp.setOrder(p)
24 | 			tmp
25 | 
26 | 		case _ => WekaDistance(dist)
27 | 	}
28 | 
29 | 	/*
30 | 		The main clustering method.
31 | 	 */
32 | 
33 | 	def cluster(data: Unlabeled) = {
34 | 		val copy = AbstractClusterer.makeCopy(clusterer)
35 | 		val instances = data.wekaDataset
36 | 		copy.buildClusterer(instances)
37 | 
38 | 		val y = Vec.tabulate(data.recordCount)( i =>
39 | 			copy.clusterInstance( instances.instance(i) )
40 | 		)
41 | 
42 | 		val k = copy.numberOfClusters
43 | 		NominalBasic(data, y, NominalColumn('cluster, names(k)))
44 | 	}
45 | 
46 | 	override def toString = s"Weka${clusterer.getClass.getSimpleName}()"
47 | }
48 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/wekas/WekaDistance.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.wekas
 2 | 
 3 | import qlearn.loss.numerical.distance.Distance
 4 | import weka.core.DistanceFunction
 5 | 
 6 | case class WekaDistance(dist: Distance) extends DistanceFunction {
 7 | 
 8 | 	// TODO: Implement all these crazy Weka methods
 9 | 
10 | 	// Members declared in weka.core.DistanceFunction
11 | 	def clean(): Unit = ???
12 | 	def distance(x$1: weka.core.Instance,x$2: weka.core.Instance,x$3: Double,x$4: weka.core.neighboursearch.PerformanceStats): Double = ???
13 | 	def distance(x$1: weka.core.Instance,x$2: weka.core.Instance,x$3: Double): Double = ???
14 | 	def distance(x$1: weka.core.Instance,x$2: weka.core.Instance,x$3: weka.core.neighboursearch.PerformanceStats): Double = ???
15 | 	def distance(x$1: weka.core.Instance,x$2: weka.core.Instance): Double = ???
16 | 	def getAttributeIndices(): String = ???
17 | 	def getInstances(): weka.core.Instances = ???
18 | 	def getInvertSelection(): Boolean = ???
19 | 	def postProcessDistances(x$1: Array[Double]): Unit = ???
20 | 	def setAttributeIndices(x$1: String): Unit = ???
21 | 	def setInstances(x$1: weka.core.Instances): Unit = ???
22 | 	def setInvertSelection(x$1: Boolean): Unit = ???
23 | 	def update(x$1: weka.core.Instance): Unit = ???
24 | 
25 | 	// Members declared in weka.core.OptionHandler
26 | 	def getOptions(): Array[String] = ???
27 | 	def listOptions(): java.util.Enumeration[weka.core.Option] = ???
28 | 	def setOptions(x$1: Array[String]): Unit = ???
29 | }
30 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/wekas/WekaFittedModel.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.wekas
 2 | 
 3 | import qlearn.Types.Mat
 4 | import qlearn.dataset.{SingleLabeled, Unlabeled, MultiLabeled}
 5 | import qlearn.ml.FittedModel
 6 | import weka.classifiers.Classifier
 7 | 
 8 | class WekaFittedModel[T <: SingleLabeled[T]](val schema: T, predictor: Classifier) extends FittedModel[T] {
 9 | 
10 | 	def predict(data: Unlabeled) = {
11 | 		val instances = data.wekaDataset
12 | 		val mat = Mat((0 until instances.numInstances).map { i =>
13 | 			instances.instance(i).setDataset( schema.wekaDataset )
14 | 			predictor.distributionForInstance(instances.instance(i))
15 | 		}: _*)
16 | 		schema.updated(data, mat)
17 | 	}
18 | 
19 | 
20 | 	override def toString = s"WekaFitted${predictor.getClass.getSimpleName}()"
21 | 
22 | 	def report {
23 | 		println(predictor)
24 | 	}
25 | }
26 | 


--------------------------------------------------------------------------------
/src/main/scala/qlearn/wekas/WekaModel.scala:
--------------------------------------------------------------------------------
 1 | package qlearn.wekas
 2 | 
 3 | import qlearn.dataset.{MultiLabeled, SingleLabeled}
 4 | import qlearn.ml.Model
 5 | import weka.classifiers.{AbstractClassifier, Classifier}
 6 | 
 7 | class WekaModel[T <: SingleLabeled[T]](val model: Classifier) extends Model[T] {
 8 | 
 9 | 	def fit(data: T) =
10 | 		new WekaFittedModel(data, {
11 | 			val copy = AbstractClassifier.makeCopy(model)
12 | 			copy.buildClassifier(data.wekaDataset)
13 | 			copy
14 | 		})
15 | 
16 | 	override def toString = s"Weka${model.getClass.getSimpleName}()"
17 | }
18 | 


--------------------------------------------------------------------------------