├── .gitignore
├── README.md
├── build.sbt
├── data
│   ├── data.txt
│   └── import_eventserver.py
├── engine.json
├── project
│   ├── assembly.sbt
│   ├── build.properties
│   └── pio-build.sbt
├── src
│   └── main
│       └── scala
│           ├── DataSource.scala
│           ├── Engine.scala
│           ├── Evaluation.scala
│           ├── LDAAlgorithm.scala
│           ├── Preparator.scala
│           └── Serving.scala
└── template.json

/.gitignore:
--------------------------------------------------------------------------------
.idea
manifest.json
pio.log
/pio.sbt
target/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Topic Modeling Template - LDA

This template requires Spark >= 1.5.1.

Input data is plain text in `data/data.txt`, one line per LDA "document".

Create a PIO app:
```
pio app new YOURAPPNAME
```

Import the data using:
```
python data/import_eventserver.py --access_key YOURACCESSKEYHERE
```

Parameters are set in `engine.json`; the most important one to consider is the number of topics (`numTopics`). Make sure the `appName` under `datasource` matches the app you created.

Build, train, and deploy the LDA model:
```
pio build
pio train
pio deploy
```

Prediction query:
```
{"text": "wishing he did not have to go"}
```

The response contains the top topic for this document, as well as the full set of topics for comparison (with the top 10 terms shown for each topic, for reference). You may wish to alter this to return only the top topic.

You can do topic prediction on any document (formerly this was restricted to documents in the training set).
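
For reference, a deployed engine listens on port 8000 by default, so a query can be sent with any HTTP client; a minimal example (assuming the default `pio deploy` settings) might look like:
```
curl -H "Content-Type: application/json" \
  -d '{ "text": "wishing he did not have to go" }' \
  http://localhost:8000/queries.json
```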
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
import AssemblyKeys._

assemblySettings

name := "template-scala-topic-model-LDA"

organization := "io.prediction"

scalaVersion := "2.10.5"

excludeFilter in Runtime in unmanagedResources := "*.html"

resolvers += Resolver.sonatypeRepo("snapshots")

libraryDependencies ++= {
  Seq(
    "io.prediction" %% "core" % "0.9.4" % "provided",
    "org.apache.spark" %% "spark-core" % "1.5.1" % "provided",
    "org.apache.spark" %% "spark-mllib" % "1.5.1" % "provided",
    "org.xerial.snappy" % "snappy-java" % "1.1.1.7")
}

--------------------------------------------------------------------------------
/data/import_eventserver.py:
--------------------------------------------------------------------------------
"""
Import sample text data for the LDA topic modeling engine
"""

import predictionio
import argparse


def import_events(client, file):
    f = open(file, 'r')
    count = 0
    print("Importing data...")
    for line in f:
        text = line.rstrip('\r\n')

        client.create_event(
            event="$set",
            entity_type="user",
            entity_id=str(count),  # use the count num as user ID
            properties={
                "text": text
            }
        )
        count += 1
    f.close()
    print("%s events are imported." % count)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Import sample text data for the LDA topic modeling engine")
    parser.add_argument('--access_key', default='invalid_access_key')
    parser.add_argument('--url', default="http://localhost:7070")
    parser.add_argument('--file', default="./data/data.txt")

    args = parser.parse_args()
    print(args)

    client = predictionio.EventClient(
        access_key=args.access_key,
        url=args.url,
        threads=5,
        qsize=500)
    import_events(client, args.file)

--------------------------------------------------------------------------------
/engine.json:
--------------------------------------------------------------------------------
{
  "id": "default",
  "description": "Default settings",
  "engineFactory": "org.template.classification.ClassificationEngine",
  "datasource": {
    "params": {
      "appName": "LDA"
    }
  },
  "algorithms": [
    {
      "name": "LDA",
      "params": {
        "numTopics": 5,
        "maxIter": 200,
        "docConcentration": -1.0,
        "topicConcentration": -1.0
      }
    }
  ]
}

--------------------------------------------------------------------------------
/project/assembly.sbt:
--------------------------------------------------------------------------------
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2")

--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
sbt.version=0.13.5

--------------------------------------------------------------------------------
/project/pio-build.sbt:
--------------------------------------------------------------------------------
addSbtPlugin("io.prediction" % "pio-build" % "0.9.0")

--------------------------------------------------------------------------------
/src/main/scala/DataSource.scala:
--------------------------------------------------------------------------------
package org.template.classification

import io.prediction.controller.PDataSource
import io.prediction.controller.EmptyEvaluationInfo
import io.prediction.controller.EmptyActualResult
import io.prediction.controller.Params
import io.prediction.data.storage.Event
import io.prediction.data.store.PEventStore

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.Vectors

import grizzled.slf4j.Logger

case class DataSourceParams(
  appName: String,
  evalK: Option[Int]  // define the k-fold parameter.
) extends Params

class DataSource(val dsp: DataSourceParams)
  extends PDataSource[TrainingData,
      EmptyEvaluationInfo, Query, ActualResult] {

  @transient lazy val logger = Logger[this.type]

  override
  def readTraining(sc: SparkContext): TrainingData = {

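    // Each "$set" event imported by data/import_eventserver.py creates a
    // "user" entity whose "text" property holds one line of data.txt; every
    // such entity is read back here and treated as a single LDA document.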
    val textPoints: RDD[TextPoint] = PEventStore.aggregateProperties(
      appName = dsp.appName,
      entityType = "user",
      // only keep entities with these required properties defined
      required = Some(List("text")))(sc)
      // aggregateProperties() returns RDD pair of
      // entity ID and its aggregated properties
      .map { case (entityId, properties) =>
        try {
          new TextPoint(properties.get[String]("text").trim)
        } catch {
          case e: Exception => {
            logger.error(s"Failed to get properties ${properties} of" +
              s" ${entityId}. Exception: ${e}.")
            throw e
          }
        }
      }.cache()

    new TrainingData(textPoints)
  }

}

class TextPoint(val text: String) extends Serializable

class TrainingData(
  val trainingText: RDD[TextPoint]
) extends Serializable

--------------------------------------------------------------------------------
/src/main/scala/Engine.scala:
--------------------------------------------------------------------------------
package org.template.classification

import io.prediction.controller.IEngineFactory
import io.prediction.controller.Engine
import org.apache.spark.mllib.linalg.Vector

class Query(
  val text: String
) extends Serializable

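// The predicted result pairs the (term, weight) list of the single most
// probable topic for the queried document (topTopic) with all topic indices
// and their top 10 terms (topics), as described in the README.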
class PredictedResult(
  val topTopic: Array[(String, Double)],
  val topics: Array[(Int, Array[(String, Double)])]
) extends Serializable

class ActualResult(
  val text: String
) extends Serializable

object ClassificationEngine extends IEngineFactory {
  def apply() = {
    new Engine(
      classOf[DataSource],
      classOf[Preparator],
      Map("LDA" -> classOf[LDAAlgorithm]),
      classOf[Serving])
  }
}

--------------------------------------------------------------------------------
/src/main/scala/Evaluation.scala:
--------------------------------------------------------------------------------
package org.template.classification

import io.prediction.controller.AverageMetric
import io.prediction.controller.EmptyEvaluationInfo
import io.prediction.controller.EngineParams
import io.prediction.controller.EngineParamsGenerator
import io.prediction.controller.Evaluation

case class LDAMetric()
  extends AverageMetric[EmptyEvaluationInfo, Query, PredictedResult, ActualResult] {
  // Not implemented yet: running `pio eval` will fail with NotImplementedError
  // until a real metric is supplied here.
  def calculate(query: Query, predicted: PredictedResult, actual: ActualResult)
  : Double = ???
}

object AccuracyEvaluation extends Evaluation {
  // Define Engine and Metric used in Evaluation
  engineMetric = (ClassificationEngine(), new LDAMetric())
}

object EngineParamsList extends EngineParamsGenerator {
  // Define list of EngineParams used in Evaluation

  // First, we define the base engine params. It specifies the appName from
  // which the data is read, and an evalK parameter used to define the
  // cross-validation.
  private[this] val baseEP = EngineParams(
    dataSourceParams = DataSourceParams(appName = "INVALID_APP_NAME", evalK = Some(5)))

  // Second, we specify the engine params list by explicitly listing all
  // algorithm parameters. In this case, we evaluate 3 engine params, each with
  // a different algorithm params value.
  engineParamsList = Seq(
    baseEP.copy(algorithmParamsList = Seq(("LDA", AlgorithmParams(5, 10, 0.1, 1.0)))),
    baseEP.copy(algorithmParamsList = Seq(("LDA", AlgorithmParams(5, 100, 0.01, 1.0)))),
    baseEP.copy(algorithmParamsList = Seq(("LDA", AlgorithmParams(5, 1, 0.1, 1.0)))))
}

--------------------------------------------------------------------------------
/src/main/scala/LDAAlgorithm.scala:
--------------------------------------------------------------------------------
package org.template.classification

import io.prediction.controller.{PAlgorithm, Params}
import io.prediction.controller.{IPersistentModel, IPersistentModelLoader}

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vector
import grizzled.slf4j.Logger

import java.nio.file.{Files, Paths}

import org.apache.spark.mllib.clustering.{DistributedLDAModel, LDAModel, LDA}
import org.apache.spark.rdd.RDD

import breeze.linalg.{DenseMatrix => BDM, argtopk, max, argmax}
import breeze.linalg.DenseVector

case class LDAModelWithCorpusAndVocab(
  ldaModel: DistributedLDAModel,
  corpus: RDD[(String, (Long, Vector))],
  vocab: Map[String, Int],
  sc: SparkContext
) extends IPersistentModel[AlgorithmParams] with Serializable {
  // Persist the trained model, the training corpus and the vocabulary under /tmp.
  def save(id: String, params: AlgorithmParams,
    sc: SparkContext): Boolean = {
    ldaModel.save(sc, s"/tmp/${id}/ldaModel")
    corpus.saveAsObjectFile(s"/tmp/${id}/ldaCorpus")
    sc.parallelize(Seq(vocab)).saveAsObjectFile(s"/tmp/${id}/ldaVocab")
    true
  }
}

object LDAModelWithCorpusAndVocab
  extends IPersistentModelLoader[AlgorithmParams, LDAModelWithCorpusAndVocab] {
  def apply(id: String, params: AlgorithmParams,
    sc: Option[SparkContext]) = {
    new LDAModelWithCorpusAndVocab(
      DistributedLDAModel.load(sc.get, s"/tmp/${id}/ldaModel"),
      sc.get.objectFile(s"/tmp/${id}/ldaCorpus"),
      sc.get.objectFile[Map[String, Int]](s"/tmp/${id}/ldaVocab").first,
      sc.get
    )
  }
}

case class AlgorithmParams(
  numTopics: Int,
  maxIter: Int,
  docConcentration: Double,
  topicConcentration: Double
) extends Params
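
// These fields correspond one-to-one to the "params" block of the "LDA"
// algorithm in engine.json. Note that train() below currently applies only
// numTopics and maxIter; docConcentration and topicConcentration are read
// from engine.json but not passed on to the MLlib LDA instance.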

// Extends PAlgorithm because the model contains RDDs; the trained model is
// saved and re-loaded through the IPersistentModel / IPersistentModelLoader
// implementations above.
class LDAAlgorithm(val ap: AlgorithmParams)
  extends PAlgorithm[PreparedData, LDAModelWithCorpusAndVocab, Query, PredictedResult] {

  @transient lazy val logger = Logger[this.type]

  def train(sc: SparkContext, data: PreparedData): LDAModelWithCorpusAndVocab = {
    require(!data.points.take(1).isEmpty,
      s"RDD[TextPoint] in PreparedData cannot be empty." +
      " Please check if DataSource generates TrainingData" +
      " and Preparator generates PreparedData correctly.")

    val dataStrings = data.points.map(s => s.text)
    val (corpus, vocab) = makeDocuments(dataStrings)
    val ldaModel = new LDA()
      .setSeed(13457)
      .setK(ap.numTopics)
      .setMaxIterations(ap.maxIter)
      .run(corpus)
      .asInstanceOf[DistributedLDAModel]

    new LDAModelWithCorpusAndVocab(ldaModel, dataStrings zip corpus, vocab, sc)
  }

  def predict(ldaModelAndCorpus: LDAModelWithCorpusAndVocab, query: Query): PredictedResult = {
    val topics = ldaModelAndCorpus.ldaModel.describeTopics(10)
    val topicDists = ldaModelAndCorpus.ldaModel.topicDistributions
    val corpusMap = ldaModelAndCorpus.corpus.collect().toMap

    val maxTopicIndex: Int = getMaxTopicIndex(ldaModelAndCorpus.sc, query, ldaModelAndCorpus.ldaModel)
    val swappedMap = ldaModelAndCorpus.vocab.map(_.swap)
    // For every topic, translate term indices back to words and sort the
    // (term, weight) pairs by descending weight.
    val topicResults = for (((indices, weights), outerIndex) <- topics.zipWithIndex)
      yield { outerIndex -> (indices map (x => swappedMap(x)) zip weights)
        .sortWith((e1, e2) => e1._2 > e2._2) }

    val topTopic = topicResults.toMap.getOrElse(maxTopicIndex,
      throw new scala.Exception("Cannot find topic"))

    new PredictedResult(topTopic, topicResults)
  }

  def getMaxTopicIndex(sc: SparkContext, query: Query, ldaModel: DistributedLDAModel): Int = {

    val text = query.text.trim

    val (corpus, vocab) = makeDocuments(sc.parallelize(Array(text)))

    val actualPredictions = ldaModel.toLocal.topicDistributions(corpus).map { case (id, topics) =>
      // convert results to expectedPredictions format, which only has highest probability topic
      val topicsBz = new DenseVector(topics.toArray)
      (id, (argmax(topicsBz), max(topicsBz)))
    }.sortByKey()
      .values
      .collect()

    actualPredictions.head._1
  }

  // See https://gist.github.com/jkbradley/ab8ae22a8282b2c8ce33
  def makeDocuments(data: RDD[String]): (RDD[(Long, Vector)], Map[String, Int]) = {
    // Split each document into a sequence of terms (words)
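    // (lower-cased, split on whitespace, keeping only purely alphabetic
    // tokens longer than three characters; the same routine is applied to
    // the query text in getMaxTopicIndex)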
    val tokenized: RDD[Seq[String]] =
      data.map(_.toLowerCase.split("\\s")).map(_.filter(_.length > 3)
        .filter(_.forall(java.lang.Character.isLetter)))

    // Choose the vocabulary.
    //   termCounts: Sorted list of (term, termCount) pairs
    val termCounts: Array[(String, Long)] =
      tokenized.flatMap(_.map(_ -> 1L)).reduceByKey(_ + _).collect().sortBy(-_._2)
    //   vocabArray: Chosen vocab (the most frequent ~10% of terms are dropped as stopwords)
    val numStopwords = termCounts.size / 10
    val vocabArray: Array[String] =
      termCounts.takeRight(termCounts.size - numStopwords).map(_._1)
    //   vocab: Map term -> term index
    val vocab: Map[String, Int] = vocabArray.zipWithIndex.toMap

    // Convert documents into term count vectors
    val documents: RDD[(Long, Vector)] =
      tokenized.zipWithIndex.map { case (tokens, id) =>
        val counts = new scala.collection.mutable.HashMap[Int, Double]()
        tokens.foreach { term =>
          if (vocab.contains(term)) {
            val idx = vocab(term)
            counts(idx) = counts.getOrElse(idx, 0.0) + 1.0
          }
        }
        (id, Vectors.sparse(vocab.size, counts.toSeq))
      }
    (documents, vocab)
  }

}

--------------------------------------------------------------------------------
/src/main/scala/Preparator.scala:
--------------------------------------------------------------------------------
package org.template.classification

import io.prediction.controller.PPreparator

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD

class PreparedData(
  val points: RDD[TextPoint]
) extends Serializable

class Preparator extends PPreparator[TrainingData, PreparedData] {

  def prepare(sc: SparkContext, trainingData: TrainingData): PreparedData = {
    new PreparedData(trainingData.trainingText)
  }
}

--------------------------------------------------------------------------------
/src/main/scala/Serving.scala:
--------------------------------------------------------------------------------
package org.template.classification

import io.prediction.controller.LServing

class Serving extends LServing[Query, PredictedResult] {

  override
  def serve(query: Query,
    predictedResults: Seq[PredictedResult]): PredictedResult = {
    predictedResults.head
  }
}

--------------------------------------------------------------------------------
/template.json:
--------------------------------------------------------------------------------
{"pio": {"version": { "min": "0.9.4" }}}

--------------------------------------------------------------------------------