├── .gitignore ├── README.md ├── build.sbt ├── data └── import_eventserver.py ├── engine.json ├── project ├── assembly.sbt └── pio-build.sbt ├── src └── main │ └── scala │ ├── DataSource.scala │ ├── Engine.scala │ ├── Preparator.scala │ ├── RFAlgorithm.scala │ └── Serving.scala └── template.json /.gitignore: -------------------------------------------------------------------------------- 1 | data/*.txt 2 | manifest.json 3 | pio.log 4 | /pio.sbt 5 | target/ 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Lead Scoring Engine Template 2 | 3 | ## Documentation 4 | 5 | Please refer to http://docs.prediction.io/templates/leadscoring/quickstart/ 6 | 7 | ## Versions 8 | 9 | ### v0.3.0 10 | 11 | - update for PredictionIO 0.9.2, including: 12 | 13 | - use new PEventStore API 14 | - use appName in DataSource parameter 15 | 16 | ### v0.2.0 17 | 18 | - update build.sbt and template.json for PredictionIO 0.9.2 19 | 20 | ### v0.1.0 21 | 22 | - initial release (require PredictionIO 0.9.0) 23 | 24 | 25 | ## Development Notes 26 | 27 | ### Sample Query 28 | 29 | ``` 30 | $ curl -H "Content-Type: application/json" \ 31 | -d '{ 32 | "landingPageId" : "example.com/page9", 33 | "referrerId" : "referrer10.com", 34 | "browser": "Firefox" }' \ 35 | http://localhost:8000/queries.json \ 36 | -w %{time_total} 37 | ``` 38 | 39 | ``` 40 | $ curl -H "Content-Type: application/json" \ 41 | -d '{ 42 | "landingPageId" : "example.com/page9", 43 | "referrerId" : "referrer10.com", 44 | "browser": "Chrome" }' \ 45 | http://localhost:8000/queries.json \ 46 | -w %{time_total} 47 | ``` 48 | 49 | ``` 50 | $ curl -H "Content-Type: application/json" \ 51 | -d '{ 52 | "landingPageId" : "x", 53 | "referrerId" : "y", 54 | "browser": "z" }' \ 55 | http://localhost:8000/queries.json \ 56 | -w %{time_total} 57 | ``` 58 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | name := "template-scala-parallel-leadscoring" 6 | 7 | organization := "io.prediction" 8 | 9 | libraryDependencies ++= Seq( 10 | "io.prediction" %% "core" % pioVersion.value % "provided", 11 | "org.apache.spark" %% "spark-core" % "1.3.0" % "provided", 12 | "org.apache.spark" %% "spark-mllib" % "1.3.0" % "provided") 13 | -------------------------------------------------------------------------------- /data/import_eventserver.py: -------------------------------------------------------------------------------- 1 | """ 2 | Import sample data for lead scoring engine 3 | """ 4 | 5 | import predictionio 6 | import argparse 7 | import random 8 | import uuid 9 | 10 | SEED = 3 11 | 12 | def import_events(client): 13 | random.seed(SEED) 14 | count = 0 15 | print client.get_status() 16 | print "Importing data..." 17 | 18 | # generate 10 users, with user ids u1,u2,....,u10 19 | user_ids = ["u%s" % i for i in range(1, 10+1)] 20 | 21 | # generate 50 items, with user ids u1,u2,....,u10 22 | item_ids = ["i%s" % i for i in range(1, 50+1)] 23 | 24 | # generate 20 pageId 25 | page_ids = ["example.com/page%s" % i for i in range(1, 20+1)] 26 | 27 | # generate 10 referrerId 28 | refferal_ids = ["referrer%s.com" % i for i in range(1, 10+1)] 29 | 30 | browsers = [ "Chrome", "Firefox", "Safari", "Internet Explorer" ] 31 | 32 | # for each session 33 | 34 | # simulate user session: 35 | # generate a session ID 36 | for loop in range(0, 50): 37 | session_id = uuid.uuid1().hex 38 | print "session", session_id 39 | referrer_id = random.choice(refferal_ids) 40 | browser = random.choice(browsers) 41 | uid = random.choice(user_ids) 42 | page_id = random.choice(page_ids) 43 | print "User", uid ,"lands on page", page_id, "referrer", referrer_id, \ 44 | "browser", browser 45 | client.create_event( 46 | event = "view", 47 | entity_type = "user", 48 | entity_id = uid, 49 | target_entity_type = "page", 50 | target_entity_id = page_id, 51 | properties = { 52 | "sessionId": session_id, 53 | "referrerId": referrer_id, 54 | "browser": browser 55 | } 56 | ) 57 | count += 1 58 | 59 | # 0 or more page view 60 | for i in range(0, random.randint(0,2)): 61 | page_id = random.choice(page_ids) 62 | print "User", uid ,"views page", page_id 63 | client.create_event( 64 | event = "view", 65 | entity_type = "user", 66 | entity_id = uid, 67 | target_entity_type = "page", 68 | target_entity_id = page_id, 69 | properties = { 70 | "sessionId": session_id 71 | } 72 | ) 73 | count += 1 74 | 75 | if random.choice([True, False]): 76 | # 1 or more buy 77 | for i in range(0, random.randint(1,3)): 78 | item_id = random.choice(item_ids) 79 | print "User", uid ,"buys item", item_id 80 | client.create_event( 81 | event = "buy", 82 | entity_type = "user", 83 | entity_id = uid, 84 | target_entity_type = "item", 85 | target_entity_id = item_id, 86 | properties = { 87 | "sessionId": session_id, 88 | } 89 | ) 90 | count += 1 91 | 92 | # pick a user id 93 | # random start time 94 | # view a random page, random referrer, random browser 95 | # random gap 96 | # more random visit 97 | # random gap 98 | # buy event 99 | # end session 100 | 101 | print "%s events are imported." % count 102 | 103 | if __name__ == '__main__': 104 | parser = argparse.ArgumentParser( 105 | description="Import sample data for similar product engine") 106 | parser.add_argument('--access_key', default='invald_access_key') 107 | parser.add_argument('--url', default="http://localhost:7070") 108 | 109 | args = parser.parse_args() 110 | print args 111 | 112 | client = predictionio.EventClient( 113 | access_key=args.access_key, 114 | url=args.url, 115 | threads=4, 116 | qsize=100) 117 | import_events(client) 118 | -------------------------------------------------------------------------------- /engine.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "default", 3 | "description": "Default settings", 4 | "engineFactory": "org.template.leadscoring.LeadScoringEngine", 5 | "datasource": { 6 | "params": { 7 | "appName": "INVALID_APP_NAME" 8 | } 9 | }, 10 | "algorithms": [ 11 | { 12 | "name": "randomforest", 13 | "params": { 14 | "numClasses": 3, 15 | "numTrees": 5, 16 | "featureSubsetStrategy": "auto", 17 | "impurity": "variance", 18 | "maxDepth": 4, 19 | "maxBins": 100, 20 | "seed" : 12345 21 | } 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /project/pio-build.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("io.prediction" % "pio-build" % "0.9.0") 2 | -------------------------------------------------------------------------------- /src/main/scala/DataSource.scala: -------------------------------------------------------------------------------- 1 | package org.template.leadscoring 2 | 3 | import io.prediction.controller.PDataSource 4 | import io.prediction.controller.EmptyEvaluationInfo 5 | import io.prediction.controller.EmptyActualResult 6 | import io.prediction.controller.Params 7 | import io.prediction.data.storage.Event 8 | import io.prediction.data.store.PEventStore 9 | 10 | import org.apache.spark.SparkContext 11 | import org.apache.spark.SparkContext._ 12 | import org.apache.spark.rdd.RDD 13 | import org.apache.spark.mllib.regression.LabeledPoint 14 | import org.apache.spark.mllib.linalg.Vectors 15 | 16 | import grizzled.slf4j.Logger 17 | 18 | case class DataSourceParams(appName: String) extends Params 19 | 20 | class DataSource(val dsp: DataSourceParams) 21 | extends PDataSource[TrainingData, 22 | EmptyEvaluationInfo, Query, EmptyActualResult] { 23 | 24 | @transient lazy val logger = Logger[this.type] 25 | 26 | override 27 | def readTraining(sc: SparkContext): TrainingData = { 28 | 29 | val viewPage: RDD[(String, Event)] = PEventStore.find( 30 | appName = dsp.appName, 31 | entityType = Some("user"), 32 | eventNames = Some(Seq("view")), 33 | // targetEntityType is optional field of an event. 34 | targetEntityType = Some(Some("page")))(sc) 35 | // eventsDb.find() returns RDD[Event] 36 | .map { event => 37 | val sessionId = try { 38 | event.properties.get[String]("sessionId") 39 | } catch { 40 | case e: Exception => { 41 | logger.error(s"Cannot get sessionId from event ${event}. ${e}.") 42 | throw e 43 | } 44 | } 45 | (sessionId, event) 46 | } 47 | 48 | val buyItem: RDD[(String, Event)] = PEventStore.find( 49 | appName = dsp.appName, 50 | entityType = Some("user"), 51 | eventNames = Some(Seq("buy")), 52 | // targetEntityType is optional field of an event. 53 | targetEntityType = Some(Some("item")))(sc) 54 | // eventsDb.find() returns RDD[Event] 55 | .map { event => 56 | val sessionId = try { 57 | event.properties.get[String]("sessionId") 58 | } catch { 59 | case e: Exception => { 60 | logger.error(s"Cannot get sessionId from event ${event}. ${e}.") 61 | throw e 62 | } 63 | } 64 | (sessionId, event) 65 | } 66 | 67 | val session: RDD[Session] = viewPage.cogroup(buyItem) 68 | .map { case (sessionId, (viewIter, buyIter)) => 69 | // the first view event of the session is the landing event 70 | val landing = viewIter.reduce{ (a, b) => 71 | if (a.eventTime.isBefore(b.eventTime)) a else b 72 | } 73 | // any buy after landing 74 | val buy = buyIter.filter( b => b.eventTime.isAfter(landing.eventTime)) 75 | .nonEmpty 76 | 77 | try { 78 | new Session( 79 | landingPageId = landing.targetEntityId.get, 80 | referrerId = landing.properties.getOrElse[String]("referrerId", ""), 81 | browser = landing.properties.getOrElse[String]("browser", ""), 82 | buy = buy 83 | ) 84 | } catch { 85 | case e: Exception => { 86 | logger.error(s"Cannot create session data from ${landing}. ${e}.") 87 | throw e 88 | } 89 | } 90 | }.cache() 91 | 92 | new TrainingData(session) 93 | } 94 | } 95 | 96 | 97 | case class Session( 98 | landingPageId: String, 99 | referrerId: String, 100 | browser: String, 101 | buy: Boolean // buy or not 102 | ) extends Serializable 103 | 104 | class TrainingData( 105 | val session: RDD[Session] 106 | ) extends Serializable 107 | -------------------------------------------------------------------------------- /src/main/scala/Engine.scala: -------------------------------------------------------------------------------- 1 | package org.template.leadscoring 2 | 3 | import io.prediction.controller.IEngineFactory 4 | import io.prediction.controller.Engine 5 | 6 | case class Query( 7 | landingPageId: String, 8 | referrerId: String, 9 | browser: String 10 | ) extends Serializable 11 | 12 | case class PredictedResult( 13 | score: Double 14 | ) extends Serializable 15 | 16 | object LeadScoringEngine extends IEngineFactory { 17 | def apply() = { 18 | new Engine( 19 | classOf[DataSource], 20 | classOf[Preparator], 21 | Map("randomforest" -> classOf[RFAlgorithm]), 22 | classOf[Serving]) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/Preparator.scala: -------------------------------------------------------------------------------- 1 | package org.template.leadscoring 2 | 3 | import io.prediction.controller.PPreparator 4 | //import io.prediction.data.storage.BiMap 5 | 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.SparkContext._ 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.mllib.regression.LabeledPoint 10 | import org.apache.spark.mllib.linalg.Vectors 11 | 12 | import grizzled.slf4j.Logger 13 | 14 | class PreparedData( 15 | val labeledPoints: RDD[LabeledPoint], 16 | val featureIndex: Map[String, Int], 17 | val featureCategoricalIntMap: Map[String, Map[String, Int]] 18 | ) extends Serializable 19 | 20 | 21 | class Preparator extends PPreparator[TrainingData, PreparedData] { 22 | 23 | @transient lazy val logger = Logger[this.type] 24 | 25 | private def createCategoricalIntMap( 26 | values: Array[String], // categorical values 27 | default: String // default cateegorical value 28 | ): Map[String, Int] = { 29 | val m = values.zipWithIndex.toMap 30 | if (m.contains(default)) 31 | m 32 | else 33 | // add default value if origina values don't have it 34 | m + (default -> m.size) 35 | } 36 | 37 | def prepare(sc: SparkContext, td: TrainingData): PreparedData = { 38 | 39 | // find out all values of the each feature 40 | val landingValues = td.session.map(_.landingPageId).distinct.collect 41 | val referrerValues = td.session.map(_.referrerId).distinct.collect 42 | val browserValues = td.session.map(_.browser).distinct.collect 43 | 44 | // map feature value to integer for each categorical feature 45 | val featureCategoricalIntMap = Map( 46 | "landingPage" -> createCategoricalIntMap(landingValues, ""), 47 | "referrer" -> createCategoricalIntMap(referrerValues, ""), 48 | "browser" -> createCategoricalIntMap(browserValues, "") 49 | ) 50 | // index position of each feature in the vector 51 | val featureIndex = Map( 52 | "landingPage" -> 0, 53 | "referrer" -> 1, 54 | "browser" -> 2 55 | ) 56 | 57 | // inject some default to cover default cases 58 | val defaults = Seq( 59 | new Session( 60 | landingPageId = "", 61 | referrerId = "", 62 | browser = "", 63 | buy = false 64 | ), 65 | new Session( 66 | landingPageId = "", 67 | referrerId = "", 68 | browser = "", 69 | buy = true 70 | )) 71 | 72 | val defaultRDD = sc.parallelize(defaults) 73 | val sessionRDD = td.session.union(defaultRDD) 74 | 75 | val labeledPoints: RDD[LabeledPoint] = sessionRDD.map { session => 76 | logger.debug(s"${session}") 77 | val label = if (session.buy) 1.0 else 0.0 78 | 79 | val feature = new Array[Double](featureIndex.size) 80 | feature(featureIndex("landingPage")) = 81 | featureCategoricalIntMap("landingPage")(session.landingPageId).toDouble 82 | feature(featureIndex("referrer")) = 83 | featureCategoricalIntMap("referrer")(session.referrerId).toDouble 84 | feature(featureIndex("browser")) = 85 | featureCategoricalIntMap("browser")(session.browser).toDouble 86 | 87 | LabeledPoint(label, Vectors.dense(feature)) 88 | }.cache() 89 | 90 | logger.debug(s"labelelPoints count: ${labeledPoints.count()}") 91 | new PreparedData( 92 | labeledPoints = labeledPoints, 93 | featureIndex = featureIndex, 94 | featureCategoricalIntMap = featureCategoricalIntMap) 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/main/scala/RFAlgorithm.scala: -------------------------------------------------------------------------------- 1 | package org.template.leadscoring 2 | 3 | import io.prediction.controller.P2LAlgorithm 4 | import io.prediction.controller.Params 5 | import io.prediction.data.storage.BiMap 6 | 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.mllib.tree.RandomForest 9 | import org.apache.spark.mllib.tree.model.RandomForestModel 10 | import org.apache.spark.mllib.linalg.Vectors 11 | 12 | import grizzled.slf4j.Logger 13 | 14 | case class RFAlgorithmParams( 15 | numTrees: Int, 16 | featureSubsetStrategy: String, 17 | impurity: String, 18 | maxDepth: Int, 19 | maxBins: Int, 20 | seed: Option[Int] 21 | ) extends Params 22 | 23 | class RFModel( 24 | val forest: RandomForestModel, 25 | val featureIndex: Map[String, Int], 26 | val featureCategoricalIntMap: Map[String, Map[String, Int]] 27 | ) extends Serializable { 28 | override def toString = { 29 | s" forest: [${forest}]" + 30 | s" featureIndex: ${featureIndex}" + 31 | s" featureCategoricalIntMap: ${featureCategoricalIntMap}" 32 | } 33 | } 34 | 35 | // extends P2LAlgorithm because the MLlib's RandomForestModel doesn't 36 | // contain RDD. 37 | class RFAlgorithm(val ap: RFAlgorithmParams) 38 | extends P2LAlgorithm[PreparedData, RFModel, Query, PredictedResult] { 39 | 40 | @transient lazy val logger = Logger[this.type] 41 | 42 | def train(sc: SparkContext, pd: PreparedData): RFModel = { 43 | 44 | val categoricalFeaturesInfo = pd.featureCategoricalIntMap 45 | .map { case (f, m) => 46 | (pd.featureIndex(f), m.size) 47 | } 48 | 49 | logger.info(s"categoricalFeaturesInfo: ${categoricalFeaturesInfo}") 50 | 51 | // use random seed if seed is not specified 52 | val seed = ap.seed.getOrElse(scala.util.Random.nextInt()) 53 | 54 | val forestModel: RandomForestModel = RandomForest.trainRegressor( 55 | input = pd.labeledPoints, 56 | categoricalFeaturesInfo = categoricalFeaturesInfo, 57 | numTrees = ap.numTrees, 58 | featureSubsetStrategy = ap.featureSubsetStrategy, 59 | impurity = ap.impurity, 60 | maxDepth = ap.maxDepth, 61 | maxBins = ap.maxBins, 62 | seed = seed) 63 | 64 | new RFModel( 65 | forest = forestModel, 66 | featureIndex = pd.featureIndex, 67 | featureCategoricalIntMap = pd.featureCategoricalIntMap 68 | ) 69 | } 70 | 71 | def predict(model: RFModel, query: Query): PredictedResult = { 72 | 73 | val featureIndex = model.featureIndex 74 | val featureCategoricalIntMap = model.featureCategoricalIntMap 75 | 76 | val landingPageId = query.landingPageId 77 | val referrerId = query.referrerId 78 | val browser = query.browser 79 | 80 | // look up categorical feature Int for landingPageId 81 | val landingFeature = lookupCategoricalInt( 82 | featureCategoricalIntMap = featureCategoricalIntMap, 83 | feature = "landingPage", 84 | value = landingPageId, 85 | default = "" 86 | ).toDouble 87 | 88 | 89 | // look up categorical feature Int for referrerId 90 | val referrerFeature = lookupCategoricalInt( 91 | featureCategoricalIntMap = featureCategoricalIntMap, 92 | feature = "referrer", 93 | value = referrerId, 94 | default = "" 95 | ).toDouble 96 | 97 | // look up categorical feature Int for brwoser 98 | val browserFeature = lookupCategoricalInt( 99 | featureCategoricalIntMap = featureCategoricalIntMap, 100 | feature = "browser", 101 | value = browser, 102 | default = "" 103 | ).toDouble 104 | 105 | // create feature Array 106 | val feature = new Array[Double](model.featureIndex.size) 107 | feature(featureIndex("landingPage")) = landingFeature 108 | feature(featureIndex("referrer")) = referrerFeature 109 | feature(featureIndex("browser")) = browserFeature 110 | 111 | val score = model.forest.predict(Vectors.dense(feature)) 112 | new PredictedResult(score) 113 | } 114 | 115 | private def lookupCategoricalInt( 116 | featureCategoricalIntMap: Map[String, Map[String, Int]], 117 | feature: String, 118 | value: String, 119 | default: String): Int = { 120 | 121 | featureCategoricalIntMap(feature) 122 | .get(value) 123 | .getOrElse { 124 | logger.info(s"Unknown ${feature} ${value}." + 125 | " Default feature value will be used.") 126 | // use default feature value 127 | featureCategoricalIntMap(feature)(default) 128 | } 129 | } 130 | 131 | } 132 | -------------------------------------------------------------------------------- /src/main/scala/Serving.scala: -------------------------------------------------------------------------------- 1 | package org.template.leadscoring 2 | 3 | import io.prediction.controller.LServing 4 | 5 | import grizzled.slf4j.Logger 6 | 7 | class Serving extends LServing[Query, PredictedResult] { 8 | 9 | @transient lazy val logger = Logger[this.type] 10 | 11 | override 12 | def serve(query: Query, 13 | predictedResults: Seq[PredictedResult]): PredictedResult = { 14 | predictedResults.head 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /template.json: -------------------------------------------------------------------------------- 1 | {"pio": {"version": { "min": "0.9.2" }}} 2 | --------------------------------------------------------------------------------