├── .gitignore
├── README.md
├── build.sbt
├── data
│   ├── data.txt
│   └── import_eventserver.py
├── engine.json
├── project
│   ├── assembly.sbt
│   ├── build.properties
│   └── pio-build.sbt
├── src
│   └── main
│       └── scala
│           ├── DataSource.scala
│           ├── Engine.scala
│           ├── Evaluation.scala
│           ├── LDAAlgorithm.scala
│           ├── Preparator.scala
│           └── Serving.scala
└── template.json

/.gitignore:
--------------------------------------------------------------------------------
.idea
manifest.json
pio.log
/pio.sbt
target/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Topic Modeling Template - LDA

This template requires Spark >= 1.5.1.

Input data is plain text in `data/data.txt`, one line per LDA "document".

Create a PIO app:
```
pio app new YOURAPPNAME
```

Import the data using:
```
python data/import_eventserver.py --access_key YOURACCESSKEYHERE
```

Parameters are set in `engine.json`; the most important one to consider is the number of topics (`numTopics`). Make sure the `appName` under `datasource` matches the app you created.

Build, train, and deploy the LDA model:
```
pio build
pio train
pio deploy
```

Prediction query:
```
{"text": "wishing he did not have to go"}
```

The response contains the top topic for this document, as well as the full set of topics for comparison (with the top 10 terms shown for each topic, for reference). You may wish to alter this to return only the top topic.

You can do topic prediction on any document (formerly this was restricted to documents in the training set).
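
For reference, a deployed engine listens on port 8000 by default, so a query can be sent with any HTTP client; a minimal example (assuming the default `pio deploy` settings) might look like:
```
curl -H "Content-Type: application/json" \
  -d '{ "text": "wishing he did not have to go" }' \
  http://localhost:8000/queries.json
```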
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
import AssemblyKeys._

assemblySettings

name := "template-scala-topic-model-LDA"

organization := "io.prediction"

scalaVersion := "2.10.5"

excludeFilter in Runtime in unmanagedResources := "*.html"

resolvers += Resolver.sonatypeRepo("snapshots")

libraryDependencies ++= {
  Seq(
    "io.prediction" %% "core" % "0.9.4" % "provided",
    "org.apache.spark" %% "spark-core" % "1.5.1" % "provided",
    "org.apache.spark" %% "spark-mllib" % "1.5.1" % "provided",
    "org.xerial.snappy" % "snappy-java" % "1.1.1.7")
}

--------------------------------------------------------------------------------
/data/import_eventserver.py:
--------------------------------------------------------------------------------
"""
Import sample text data for the LDA topic modeling engine
"""

import predictionio
import argparse


def import_events(client, file):
    f = open(file, 'r')
    count = 0
    print("Importing data...")
    for line in f:
        text = line.rstrip('\r\n')

        client.create_event(
            event="$set",
            entity_type="user",
            entity_id=str(count),  # use the count num as user ID
            properties={
                "text": text
            }
        )
        count += 1
    f.close()
    print("%s events are imported." % count)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Import sample text data for the LDA topic modeling engine")
    parser.add_argument('--access_key', default='invalid_access_key')
    parser.add_argument('--url', default="http://localhost:7070")
    parser.add_argument('--file', default="./data/data.txt")

    args = parser.parse_args()
    print(args)

    client = predictionio.EventClient(
        access_key=args.access_key,
        url=args.url,
        threads=5,
        qsize=500)
    import_events(client, args.file)

--------------------------------------------------------------------------------
/engine.json:
--------------------------------------------------------------------------------
{
  "id": "default",
  "description": "Default settings",
  "engineFactory": "org.template.classification.ClassificationEngine",
  "datasource": {
    "params": {
      "appName": "LDA"
    }
  },
  "algorithms": [
    {
      "name": "LDA",
      "params": {
        "numTopics": 5,
        "maxIter": 200,
        "docConcentration": -1.0,
        "topicConcentration": -1.0
      }
    }
  ]
}

--------------------------------------------------------------------------------
/project/assembly.sbt:
--------------------------------------------------------------------------------
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2")

--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
sbt.version=0.13.5

--------------------------------------------------------------------------------
/project/pio-build.sbt:
--------------------------------------------------------------------------------
addSbtPlugin("io.prediction" % "pio-build" % "0.9.0")

--------------------------------------------------------------------------------
/src/main/scala/DataSource.scala:
--------------------------------------------------------------------------------
package org.template.classification

import io.prediction.controller.PDataSource
import io.prediction.controller.EmptyEvaluationInfo
import io.prediction.controller.EmptyActualResult
import io.prediction.controller.Params
import io.prediction.data.storage.Event
import io.prediction.data.store.PEventStore

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.Vectors

import grizzled.slf4j.Logger

case class DataSourceParams(
  appName: String,
  evalK: Option[Int]  // define the k-fold parameter.
) extends Params

class DataSource(val dsp: DataSourceParams)
  extends PDataSource[TrainingData,
      EmptyEvaluationInfo, Query, ActualResult] {

  @transient lazy val logger = Logger[this.type]

  override
  def readTraining(sc: SparkContext): TrainingData = {

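    // Each "$set" event imported by data/import_eventserver.py creates a
    // "user" entity whose "text" property holds one line of data.txt; every
    // such entity is read back here and treated as a single LDA document.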
    val textPoints: RDD[TextPoint] = PEventStore.aggregateProperties(
      appName = dsp.appName,
      entityType = "user",
      // only keep entities with these required properties defined
      required = Some(List("text")))(sc)
      // aggregateProperties() returns RDD pair of
      // entity ID and its aggregated properties
      .map { case (entityId, properties) =>
        try {
          new TextPoint(properties.get[String]("text").trim)
        } catch {
          case e: Exception => {
            logger.error(s"Failed to get properties ${properties} of" +
              s" ${entityId}. Exception: ${e}.")
            throw e
          }
        }
      }.cache()

    new TrainingData(textPoints)
  }

}

class TextPoint(val text: String) extends Serializable

class TrainingData(
  val trainingText: RDD[TextPoint]
) extends Serializable

--------------------------------------------------------------------------------
/src/main/scala/Engine.scala:
--------------------------------------------------------------------------------
package org.template.classification

import io.prediction.controller.IEngineFactory
import io.prediction.controller.Engine
import org.apache.spark.mllib.linalg.Vector

class Query(
  val text: String
) extends Serializable

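// The predicted result pairs the (term, weight) list of the single most
// probable topic for the queried document (topTopic) with all topic indices
// and their top 10 terms (topics), as described in the README.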
class PredictedResult(
  val topTopic: Array[(String, Double)],
  val topics: Array[(Int, Array[(String, Double)])]
) extends Serializable

class ActualResult(
  val text: String
) extends Serializable

object ClassificationEngine extends IEngineFactory {
  def apply() = {
    new Engine(
      classOf[DataSource],
      classOf[Preparator],
      Map("LDA" -> classOf[LDAAlgorithm]),
      classOf[Serving])
  }
}

--------------------------------------------------------------------------------
/src/main/scala/Evaluation.scala:
--------------------------------------------------------------------------------
package org.template.classification

import io.prediction.controller.AverageMetric
import io.prediction.controller.EmptyEvaluationInfo
import io.prediction.controller.EngineParams
import io.prediction.controller.EngineParamsGenerator
import io.prediction.controller.Evaluation

case class LDAMetric()
  extends AverageMetric[EmptyEvaluationInfo, Query, PredictedResult, ActualResult] {
  // Not implemented yet: running `pio eval` will fail with NotImplementedError
  // until a real metric is supplied here.
  def calculate(query: Query, predicted: PredictedResult, actual: ActualResult)
  : Double = ???
}

object AccuracyEvaluation extends Evaluation {
  // Define Engine and Metric used in Evaluation
  engineMetric = (ClassificationEngine(), new LDAMetric())
}

object EngineParamsList extends EngineParamsGenerator {
  // Define list of EngineParams used in Evaluation

  // First, we define the base engine params. It specifies the appName from
  // which the data is read, and an evalK parameter used to define the
  // cross-validation.
  private[this] val baseEP = EngineParams(
    dataSourceParams = DataSourceParams(appName = "INVALID_APP_NAME", evalK = Some(5)))

  // Second, we specify the engine params list by explicitly listing all
  // algorithm parameters. In this case, we evaluate 3 engine params, each with
  // a different algorithm params value.
  engineParamsList = Seq(
    baseEP.copy(algorithmParamsList = Seq(("LDA", AlgorithmParams(5, 10, 0.1, 1.0)))),
    baseEP.copy(algorithmParamsList = Seq(("LDA", AlgorithmParams(5, 100, 0.01, 1.0)))),
    baseEP.copy(algorithmParamsList = Seq(("LDA", AlgorithmParams(5, 1, 0.1, 1.0)))))
}

--------------------------------------------------------------------------------
/src/main/scala/LDAAlgorithm.scala:
--------------------------------------------------------------------------------
package org.template.classification

import io.prediction.controller.{PAlgorithm, Params}
import io.prediction.controller.{IPersistentModel, IPersistentModelLoader}

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vector
import grizzled.slf4j.Logger

import java.nio.file.{Files, Paths}

import org.apache.spark.mllib.clustering.{DistributedLDAModel, LDAModel, LDA}
import org.apache.spark.rdd.RDD

import breeze.linalg.{DenseMatrix => BDM, argtopk, max, argmax}
import breeze.linalg.DenseVector

case class LDAModelWithCorpusAndVocab(
  ldaModel: DistributedLDAModel,
  corpus: RDD[(String, (Long, Vector))],
  vocab: Map[String, Int],
  sc: SparkContext
) extends IPersistentModel[AlgorithmParams] with Serializable {
  // Persist the trained model, the training corpus and the vocabulary under /tmp.
  def save(id: String, params: AlgorithmParams,
    sc: SparkContext): Boolean = {
    ldaModel.save(sc, s"/tmp/${id}/ldaModel")
    corpus.saveAsObjectFile(s"/tmp/${id}/ldaCorpus")
    sc.parallelize(Seq(vocab)).saveAsObjectFile(s"/tmp/${id}/ldaVocab")
    true
  }
}

object LDAModelWithCorpusAndVocab
  extends IPersistentModelLoader[AlgorithmParams, LDAModelWithCorpusAndVocab] {
  def apply(id: String, params: AlgorithmParams,
    sc: Option[SparkContext]) = {
    new LDAModelWithCorpusAndVocab(
      DistributedLDAModel.load(sc.get, s"/tmp/${id}/ldaModel"),
      sc.get.objectFile(s"/tmp/${id}/ldaCorpus"),
      sc.get.objectFile[Map[String, Int]](s"/tmp/${id}/ldaVocab").first,
      sc.get
    )
  }
}

case class AlgorithmParams(
  numTopics: Int,
  maxIter: Int,
  docConcentration: Double,
  topicConcentration: Double
) extends Params
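
// These fields correspond one-to-one to the "params" block of the "LDA"
// algorithm in engine.json. Note that train() below currently applies only
// numTopics and maxIter; docConcentration and topicConcentration are read
// from engine.json but not passed on to the MLlib LDA instance.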

// Extends PAlgorithm because the model contains RDDs; the trained model is
// saved and re-loaded through the IPersistentModel / IPersistentModelLoader
// implementations above.
class LDAAlgorithm(val ap: AlgorithmParams)
  extends PAlgorithm[PreparedData, LDAModelWithCorpusAndVocab, Query, PredictedResult] {

  @transient lazy val logger = Logger[this.type]

  def train(sc: SparkContext, data: PreparedData): LDAModelWithCorpusAndVocab = {
    require(!data.points.take(1).isEmpty,
      s"RDD[TextPoint] in PreparedData cannot be empty." +
      " Please check if DataSource generates TrainingData" +
      " and Preparator generates PreparedData correctly.")

    val dataStrings = data.points.map(s => s.text)
    val (corpus, vocab) = makeDocuments(dataStrings)
    val ldaModel = new LDA()
      .setSeed(13457)
      .setK(ap.numTopics)
      .setMaxIterations(ap.maxIter)
      .run(corpus)
      .asInstanceOf[DistributedLDAModel]

    new LDAModelWithCorpusAndVocab(ldaModel, dataStrings zip corpus, vocab, sc)
  }

  def predict(ldaModelAndCorpus: LDAModelWithCorpusAndVocab, query: Query): PredictedResult = {
    val topics = ldaModelAndCorpus.ldaModel.describeTopics(10)
    val topicDists = ldaModelAndCorpus.ldaModel.topicDistributions
    val corpusMap = ldaModelAndCorpus.corpus.collect().toMap

    val maxTopicIndex: Int = getMaxTopicIndex(ldaModelAndCorpus.sc, query, ldaModelAndCorpus.ldaModel)
    val swappedMap = ldaModelAndCorpus.vocab.map(_.swap)
    // For every topic, translate term indices back to words and sort the
    // (term, weight) pairs by descending weight.
    val topicResults = for (((indices, weights), outerIndex) <- topics.zipWithIndex)
      yield { outerIndex -> (indices map (x => swappedMap(x)) zip weights)
        .sortWith((e1, e2) => e1._2 > e2._2) }

    val topTopic = topicResults.toMap.getOrElse(maxTopicIndex,
      throw new scala.Exception("Cannot find topic"))

    new PredictedResult(topTopic, topicResults)
  }

  def getMaxTopicIndex(sc: SparkContext, query: Query, ldaModel: DistributedLDAModel): Int = {

    val text = query.text.trim

    val (corpus, vocab) = makeDocuments(sc.parallelize(Array(text)))

    val actualPredictions = ldaModel.toLocal.topicDistributions(corpus).map { case (id, topics) =>
      // convert results to expectedPredictions format, which only has highest probability topic
      val topicsBz = new DenseVector(topics.toArray)
      (id, (argmax(topicsBz), max(topicsBz)))
    }.sortByKey()
      .values
      .collect()

    actualPredictions.head._1
  }

  // See https://gist.github.com/jkbradley/ab8ae22a8282b2c8ce33
  def makeDocuments(data: RDD[String]): (RDD[(Long, Vector)], Map[String, Int]) = {
    // Split each document into a sequence of terms (words)
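    // (lower-cased, split on whitespace, keeping only purely alphabetic
    // tokens longer than three characters; the same routine is applied to
    // the query text in getMaxTopicIndex)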
    val tokenized: RDD[Seq[String]] =
      data.map(_.toLowerCase.split("\\s")).map(_.filter(_.length > 3)
        .filter(_.forall(java.lang.Character.isLetter)))

    // Choose the vocabulary.
    //   termCounts: Sorted list of (term, termCount) pairs
    val termCounts: Array[(String, Long)] =
      tokenized.flatMap(_.map(_ -> 1L)).reduceByKey(_ + _).collect().sortBy(-_._2)
    //   vocabArray: Chosen vocab (the most frequent ~10% of terms are dropped as stopwords)
    val numStopwords = termCounts.size / 10
    val vocabArray: Array[String] =
      termCounts.takeRight(termCounts.size - numStopwords).map(_._1)
    //   vocab: Map term -> term index
    val vocab: Map[String, Int] = vocabArray.zipWithIndex.toMap

    // Convert documents into term count vectors
    val documents: RDD[(Long, Vector)] =
      tokenized.zipWithIndex.map { case (tokens, id) =>
        val counts = new scala.collection.mutable.HashMap[Int, Double]()
        tokens.foreach { term =>
          if (vocab.contains(term)) {
            val idx = vocab(term)
            counts(idx) = counts.getOrElse(idx, 0.0) + 1.0
          }
        }
        (id, Vectors.sparse(vocab.size, counts.toSeq))
      }
    (documents, vocab)
  }

}

--------------------------------------------------------------------------------
/src/main/scala/Preparator.scala:
--------------------------------------------------------------------------------
package org.template.classification

import io.prediction.controller.PPreparator

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD

class PreparedData(
  val points: RDD[TextPoint]
) extends Serializable

class Preparator extends PPreparator[TrainingData, PreparedData] {

  def prepare(sc: SparkContext, trainingData: TrainingData): PreparedData = {
    new PreparedData(trainingData.trainingText)
  }
}

--------------------------------------------------------------------------------
/src/main/scala/Serving.scala:
--------------------------------------------------------------------------------
package org.template.classification

import io.prediction.controller.LServing

class Serving extends LServing[Query, PredictedResult] {

  override
  def serve(query: Query,
    predictedResults: Seq[PredictedResult]): PredictedResult = {
    predictedResults.head
  }
}

--------------------------------------------------------------------------------
/template.json:
--------------------------------------------------------------------------------
{"pio": {"version": { "min": "0.9.4" }}}

--------------------------------------------------------------------------------