├── README.md
├── build.sbt
├── project
│   └── plugins.sbt
├── src
│   └── main
│       ├── resources
│       │   ├── application.conf
│       │   ├── items.txt
│       │   └── ratings.txt
│       └── scala
│           ├── Test.scala
│           ├── colfilt2
│           │   └── CollaborativeFiltering.scala
│           ├── import1
│           │   └── Import.scala
│           ├── model
│           │   └── Model.scala
│           └── utils
│               ├── Converter.scala
│               ├── CustomObjectInputStream.scala
│               ├── EsClient.scala
│               └── Settings.scala
└── workshop-spark.pdf

/README.md:
--------------------------------------------------------------------------------
spark-elasticsearch-mllib
=========================

ScalaIO 2014 Workshop
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
scalacOptions in (Compile, console) += "-Yrepl-sync"

organization := "com.ebiznext.scalaio"

name := "import"

version := "0.1.0-SNAPSHOT"

scalaVersion := "2.10.4"

resolvers += "Typesafe Releases" at "http://repo.typesafe.com/typesafe/releases/"

resolvers += "Conjars" at "http://conjars.org/repo"

resolvers += "cljars" at "https://clojars.org/repo/"

val jacksonV = "2.4.3"

val elastic4sV = "1.3.2"

val elasticSearchV = "1.3.2"

val sparkV = "1.1.0"

libraryDependencies ++= Seq(
  "com.typesafe" % "config" % "1.0.2",
  "com.fasterxml.jackson.module" %% "jackson-module-scala" % jacksonV,
  "com.fasterxml.jackson.core" % "jackson-annotations" % jacksonV,
  "com.fasterxml.jackson.core" % "jackson-core" % jacksonV,
  "com.fasterxml.jackson.core" % "jackson-databind" % jacksonV,
  "com.sksamuel.elastic4s" %% "elastic4s" % elastic4sV exclude("org.elasticsearch", "elasticsearch"),
  "org.elasticsearch" % "elasticsearch" % elasticSearchV,
  "org.apache.spark" %% "spark-core" % sparkV,
  "org.apache.spark" %% "spark-mllib" % sparkV,
  "org.elasticsearch" % "elasticsearch-hadoop" % "2.1.0.Beta2",
  "org.apache.mesos" % "mesos" % "0.18.1" exclude("com.google.protobuf", "protobuf-java"),
  "org.specs2" %% "specs2" % "2.3.13" % "test"
)

packAutoSettings
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
addSbtPlugin("org.xerial.sbt" % "sbt-pack" % "0.6.2")
--------------------------------------------------------------------------------
/src/main/resources/application.conf:
--------------------------------------------------------------------------------
elasticsearch {
  date.format = "yyyy-MM-dd'T'HH:mm:ss.SSSZZ"
  host = "127.0.0.1"
  http.port = 19200
  port = 19300
  index = "scalaio2014"
  cluster = "scalaiocluster"
}

spark {
  master = "spark://127.0.0.1:19400"
  appName = "scalaio2014"
  ratingsFile = "ratings.txt"
  itemsFile = "items.txt"
  dataPath = "/Users/hayssams/git/public/scalaio-2014/workshop/src/main/resources/"
  jarPath = "/Users/hayssams/git/public/scalaio-2014/workshop/target/pack/lib/"
}
--------------------------------------------------------------------------------
/src/main/resources/items.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebiznext/spark-elasticsearch-mllib/4902f7c28546371ff292923377672676ad33e98b/src/main/resources/items.txt
--------------------------------------------------------------------------------
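application.conf is read through the Typesafe "config" library declared in build.sbt, and Settings.scala further down wraps the keys in typed accessors. Note that spark.dataPath and spark.jarPath are absolute paths on the original author's machine and must be adapted before running. A minimal sanity check of the configuration, where the object name ConfigCheck is hypothetical and not part of the repository:

// Editor's sketch, not part of the repository.
import com.typesafe.config.ConfigFactory

object ConfigCheck extends App {
  // ConfigFactory.load() picks up application.conf from the classpath
  val config = ConfigFactory.load()
  println(config.getString("elasticsearch.host"))   // 127.0.0.1
  println(config.getInt("elasticsearch.http.port")) // 19200
  println(config.getString("spark.master"))         // spark://127.0.0.1:19400
}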
--------------------------------------------------------------------------------
/src/main/scala/Test.scala:
--------------------------------------------------------------------------------
import java.util.Date

import scala.reflect.runtime.universe._

/**
 * Created by hayssams on 16/10/14.
 */
object Test extends App {
}
--------------------------------------------------------------------------------
/src/main/scala/colfilt2/CollaborativeFiltering.scala:
--------------------------------------------------------------------------------
package colfilt2

import java.io.File

import com.sksamuel.elastic4s.ElasticDsl._
import import1.Import
import model.Model.{Rating, User}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel}
import org.apache.spark.rdd.RDD
import utils.{EsClient, Settings}

object CollaborativeFiltering extends App {
  def sparkInit(): SparkContext = {
    import org.apache.spark.{SparkConf, SparkContext}
    val conf = new SparkConf().setAppName(Settings.Spark.AppName).setMaster(Settings.Spark.Master)
    conf.set("es.nodes", "localhost")
    conf.set("es.port", "19200")
    val sparkContext = new SparkContext(conf)

    val libdir = new File(Settings.Spark.JarPath)
    libdir.list().foreach(jar => sparkContext.addJar(Settings.Spark.JarPath + jar))
    sparkContext
  }

  def summary(): Unit = {
    import org.elasticsearch.spark._
    val sparkContext = sparkInit()
    val esType = s"${Settings.ElasticSearch.Index}/${EsClient.esType[Rating]}"
    val ratings = sparkContext.esRDD(esType)
    val users = ratings.map(_._2("userid")).distinct
    val nbUsers = users.count
    val nbItems = ratings.map(_._2("itemid")).distinct.count
    println("vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv")
    println(s"$nbUsers users rated $nbItems items")
    println("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")

    // val res: Seq[String] = rdd.map(_._2("itemid").toString).countByValue().toSeq.sortBy(-_._2).take(50).map(_._1)
    // res.foreach(println)

    sparkContext.stop()
  }

  def als() = {
    import org.elasticsearch.spark._
    val sparkContext = sparkInit()
    val esType = s"${Settings.ElasticSearch.Index}/${EsClient.esType[Rating]}"
    val esWildcardQuery = search in Settings.ElasticSearch.Index -> EsClient.esType[Rating] query { matchall }

    val ratings = sparkContext.esRDD(esType)

    import org.apache.spark.mllib.recommendation.Rating

    val allData = sparkContext.esRDD(esType, esWildcardQuery._builder.toString).cache()

    val allDataCount = allData.count()

    val trainingSet = allData.filter { x =>
      val index = x._2("index").toString.toLong
      index % 10 != 1 && index % 10 != 2
    }.map(x => Rating(x._2("userid").toString.toInt, x._2("itemid").toString.toInt, x._2("rating").toString.toDouble)).cache()

    val validatingSet = allData.filter { x =>
      val index = x._2("index").toString.toLong
      index % 10 == 1
    }.map(x => Rating(x._2("userid").toString.toInt, x._2("itemid").toString.toInt, x._2("rating").toString.toDouble)).cache()

    val testingSet = allData.filter { x =>
      val index = x._2("index").toString.toLong
      index % 10 == 2
    }.map(x => Rating(x._2("userid").toString.toInt, x._2("itemid").toString.toInt, x._2("rating").toString.toDouble)).cache()

    val numTraining = trainingSet.count()
    val numValidation = validatingSet.count()
    val numTest = testingSet.count()

    val ranks = List(12)
    val lambdas = List(0.16)
    val numIters = List(30)
    var bestModel: Option[MatrixFactorizationModel] = None
    var bestValidationRmse = Double.MaxValue
    var bestRank = 0
    var bestLambda = -1.0
    var bestNumIter = -1

    def rmse(model: MatrixFactorizationModel, data: RDD[Rating]) = {
      import org.apache.spark.SparkContext._
      val predictions: RDD[Rating] = model.predict(data.map(x => (x.user, x.product)))
      val predictionsAndRatings = predictions.map(x => ((x.user, x.product), x.rating)).join(data.map(x => ((x.user, x.product), x.rating))).values
      math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).mean())
    }

    for (rank <- ranks; lambda <- lambdas; numIter <- numIters) {
      val model = ALS.train(trainingSet, rank, numIter, lambda)
      val validationRmse = rmse(model, validatingSet)
      println("RMSE (validation) = " + validationRmse + " for the model trained with rank = "
        + rank + ", lambda = " + lambda + ", and numIter = " + numIter + ".")

      println(trainingSet.count() + "////" + testingSet.count() + "////" + validatingSet.count())

      if (validationRmse < bestValidationRmse) {
        bestModel = Some(model)
        bestValidationRmse = validationRmse
        bestRank = rank
        bestLambda = lambda
        bestNumIter = numIter
      }
    }
    val testRmse = rmse(bestModel.get, testingSet)

    val users = trainingSet.map(_.user).distinct.collect()

    //println("================" + users.length)

    val usersRecommendations = users.map { userId =>
      val recommendations = bestModel.get.recommendProducts(userId, 10).map(_.product)
      User(userId, recommendations)
    }

    Import.bulkIndex(usersRecommendations.toList)

    import org.apache.spark.SparkContext._
    val meanRating = trainingSet.union(validatingSet).map(_.rating).mean()

    val baselineRmse =
      math.sqrt(testingSet.map((x: Rating) => (meanRating - x.rating) * (meanRating - x.rating)).mean)
    val improvement = (baselineRmse - testRmse) / baselineRmse * 100
    println("The best model improves the baseline by " + "%1.2f".format(improvement) + "%.")

    println("vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv")
    println("vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv")
    println("vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv")
    println("The best model was trained with rank = " + bestRank + " and lambda = " + bestLambda + ", and numIter = " + bestNumIter + ", and its RMSE on the test set is " + testRmse + ".")
    println("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
    println("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
    println("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")

    sparkContext.stop()

  }

  als()
  Thread.sleep(30000)
}
--------------------------------------------------------------------------------
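CollaborativeFiltering.als() reads the whole Rating type back from Elasticsearch, splits it into training, validation and test sets on index % 10, trains an ALS model, and indexes the top-10 recommendations per user. The MLlib calls can be exercised in isolation; the following self-contained sketch uses a hypothetical object name, toy data, and smaller hyper-parameters than the rank = 12, lambda = 0.16, 30 iterations used above:

// Editor's sketch, not part of the repository.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.{ALS, Rating}

object AlsLocalSketch extends App {
  val sc = new SparkContext(new SparkConf().setAppName("als-sketch").setMaster("local[2]"))

  // Toy ratings: Rating(user, product, rating)
  val ratings = sc.parallelize(Seq(
    Rating(1, 10, 5.0), Rating(1, 20, 1.0),
    Rating(2, 10, 4.0), Rating(2, 30, 2.0),
    Rating(3, 20, 5.0), Rating(3, 30, 4.0)
  ))

  // ALS.train(ratings, rank, iterations, lambda)
  val model = ALS.train(ratings, 5, 10, 0.1)

  println(model.predict(1, 30))                  // predicted rating for an unseen (user, product) pair
  model.recommendProducts(1, 2).foreach(println) // top-2 products for user 1

  sc.stop()
}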
/src/main/scala/import1/Import.scala:
--------------------------------------------------------------------------------
package import1

// imports - will be omitted for other examples

import java.io.File
import java.util.Date

import model.Model.Rating
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.elasticsearch.action.bulk.BulkResponse
import utils.{EsClient, Settings}

import scala.util.control.NonFatal

object Import extends App {

  import com.sksamuel.elastic4s.ElasticDsl._
  import org.elasticsearch.action.admin.indices.delete.DeleteIndexResponse

  def deleteIndex: DeleteIndexResponse = {
    val deleteIndexDefinition = delete index ("scalaio2014")
    EsClient().execute(deleteIndexDefinition).await
  }

  import org.elasticsearch.action.admin.indices.create.CreateIndexResponse

  def createIndex: CreateIndexResponse = {
    import com.sksamuel.elastic4s.mappings.FieldType.{IntegerType, StringType}
    // field names match what Model.Rating.toMap() and Model.User.toMap() produce
    val createIndexDefinition = create index "scalaio2014" mappings(
      "Rating" as(
        "index" typed IntegerType,
        "userid" typed IntegerType,
        "itemid" typed IntegerType,
        "rating" typed IntegerType,
        "timestamp" typed IntegerType
      ),
      "User" as(
        "userid" typed IntegerType,
        "itemids" typed StringType
      )
    )
    EsClient().execute(createIndexDefinition).await
  }

  def bulkIndex[T <: {def toMap(): Map[String, Any]} : Manifest](items: List[T]): BulkResponse = {
    val indexDefinitions = items.map(item => index into s"${Settings.ElasticSearch.Index}/${EsClient.esType[T]}" fields (item.toMap()))
    val bulkDefinition = bulk(
      indexDefinitions: _*
    )
    println(bulkDefinition._builder.requests().toString)
    EsClient().execute(bulkDefinition).await
  }

  def sparkInit(): SparkContext = {
    import org.apache.spark.{SparkConf, SparkContext}
    val conf = new SparkConf().setAppName(Settings.Spark.AppName).setMaster(Settings.Spark.Master)

    val sparkContext = new SparkContext(conf)

    val libdir = new File(Settings.Spark.JarPath)
    libdir.list().foreach(jar => sparkContext.addJar(Settings.Spark.JarPath + jar))
    sparkContext
  }

  def loadFileSpark() = {
    val sparkContext = sparkInit()

    val ratingLines = sparkContext.textFile(Settings.Spark.RatingsFile)

    // note: the counter is captured in the closure, so the generated index is only
    // unique within a Spark partition
    var i = 0
    val ratings: RDD[Rating] = ratingLines.map { ratingLine =>
      val ratingFields = ratingLine.split("\\s+")
      i = i + 1
      Rating(i, ratingFields(0).toInt, ratingFields(1).toInt, ratingFields(2).toInt, ratingFields(3).toLong)
    } cache()

    ratings.foreach {
      rating => bulkIndex(List(rating))
    }

    sparkContext.stop()
  }

  def loadFileLocal(): Unit = {
    val lines = scala.tools.nsc.io.File(Settings.Spark.RatingsFile).lines()
    var i = 0
    val ratings = lines.map { ratingLine =>
      val ratingFields = ratingLine.split("\\s+")
      i = i + 1
      Rating(i, ratingFields(0).toInt, ratingFields(1).toInt, ratingFields(2).toInt, ratingFields(3).toLong)
    }
    ratings.foreach { rating =>
      bulkIndex(List(rating))
    }
  }

  def doItAll(loader: () => Unit): Unit = {
    try {
      deleteIndex
    }
    catch {
      case NonFatal(e) => e.printStackTrace(); println("No existing index to delete")
    }

    createIndex
    println("Index created")
    val start = new Date().getTime
    loader()
    val duration = (new Date().getTime - start) / 1000
    println(s"duration=$duration")
  }

  def doItAllLocal(): Unit = {
    doItAll(loadFileLocal)
  }

  def doItAllSpark(): Unit = {
    doItAll(loadFileSpark)
  }

  doItAllSpark()
}
--------------------------------------------------------------------------------
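loadFileSpark issues one bulk request per rating (bulkIndex(List(rating)) inside foreach), and its var i counter is only unique per partition when run on a real cluster. A hypothetical variant of that method, reusing Import's existing imports and helpers, uses zipWithIndex for a stable index and foreachPartition for chunked bulk requests:

// Editor's sketch, not part of the repository; assumes the sparkInit, bulkIndex
// and Settings members defined in Import above.
def loadFileSparkBatched(): Unit = {
  val sparkContext = sparkInit()
  val ratingLines = sparkContext.textFile(Settings.Spark.RatingsFile)

  val ratings: RDD[Rating] = ratingLines.zipWithIndex().map { case (line, idx) =>
    val f = line.split("\\s+")
    Rating(idx.toInt + 1, f(0).toInt, f(1).toInt, f(2).toInt, f(3).toLong)
  }

  // one bulk request per chunk of 1000 documents instead of one per rating
  ratings.foreachPartition { part =>
    part.grouped(1000).foreach(chunk => bulkIndex(chunk.toList))
  }

  sparkContext.stop()
}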
/src/main/scala/model/Model.scala:
--------------------------------------------------------------------------------
package model

/**
 * Created by hayssams on 20/10/14.
 */
object Model {

  case class Rating(index: Int, userid: Int, itemid: Int, rating: Int, timestamp: Long) {
    def toMap(): Map[String, Any] = Map("index" -> index, "userid" -> userid, "itemid" -> itemid, "rating" -> rating, "timestamp" -> timestamp)
  }

  case class Item(itemid: Int, name: String, timestamp: Long, unknown: Boolean, action: Boolean, adventure: Boolean, animation: Boolean,
                  children: Boolean, comedy: Boolean, crime: Boolean, documentary: Boolean, drama: Boolean, fantasy: Boolean, filmnoir: Boolean, horror: Boolean,
                  musical: Boolean, mystery: Boolean, romance: Boolean, scifi: Boolean, thriller: Boolean, war: Boolean, western: Boolean) {
    def toMap(): Map[String, Any] = Map("itemid" -> itemid, "name" -> name, "timestamp" -> timestamp, "unknown" -> unknown,
      "action" -> action, "adventure" -> adventure, "animation" -> animation, "children" -> children, "comedy" -> comedy, "crime" -> crime, "documentary" -> documentary,
      "drama" -> drama, "fantasy" -> fantasy, "filmnoir" -> filmnoir, "horror" -> horror, "musical" -> musical, "mystery" -> mystery, "romance" -> romance, "scifi" -> scifi,
      "thriller" -> thriller, "war" -> war, "western" -> western)
  }

  case class User(userid: Int, itemids: Array[Int]) {
    def toMap(): Map[String, Any] = Map("userid" -> userid, "itemids" -> itemids.map(_.toString).mkString(","))
  }

}
--------------------------------------------------------------------------------
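The toMap() methods define the exact field names that end up in Elasticsearch (and that CollaborativeFiltering reads back), while EsClient.esType uses the class simple name as the document type. An illustrative REPL-style check, with made-up sample values:

// Editor's sketch, not part of the repository.
import model.Model.{Rating, User}

val r = Rating(index = 1, userid = 196, itemid = 242, rating = 3, timestamp = 881250949L)
println(r.toMap())
// e.g. Map(index -> 1, userid -> 196, itemid -> 242, rating -> 3, timestamp -> 881250949)

val u = User(userid = 196, itemids = Array(242, 302, 377))
println(u.toMap()) // itemids is flattened to the comma-separated string "242,302,377"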
/src/main/scala/utils/Converter.scala:
--------------------------------------------------------------------------------
package utils

import java.io.{BufferedOutputStream, ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import com.fasterxml.jackson.core.`type`.TypeReference
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule

import scala.Array.canBuildFrom

/**
 * Generic object converters.
 * The binary converter is based on the standard Java serializer;
 * a performance improvement would be to rely on https://code.google.com/p/kryo/
 *
 * The JSON converter is based on the Jackson Scala module.
 */
trait Converter[T] {
  def toDomain[T: Manifest](obj: Array[Byte]): T

  def fromDomain[T: Manifest](value: T): Array[Byte]
}


trait BinaryConverter[T] extends Converter[T] {
  def toDomain[T: Manifest](obj: Array[Byte]): T = safeDecode(obj)

  def fromDomain[T: Manifest](value: T): Array[Byte] = {
    val bos = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(new BufferedOutputStream(bos))
    out writeObject (value)
    out close()
    bos toByteArray()
  }

  def safeDecode[T: Manifest](bytes: Array[Byte]) = {
    val cl = Option(this.getClass().getClassLoader())
    val cin = cl match {
      case Some(cls) =>
        new CustomObjectInputStream(new ByteArrayInputStream(bytes), cls)
      case None =>
        new ObjectInputStream(new ByteArrayInputStream(bytes))
    }
    val obj = cin.readObject
    cin.close
    obj.asInstanceOf[T]
  }
}

trait JSONConverter[T] extends Converter[T] {
  def toDomain[T: Manifest](bytes: Array[Byte]): T = {
    JacksonConverter.deserialize[T](new String(bytes))
  }

  def fromDomain[T: Manifest](value: T): Array[Byte] = {
    JacksonConverter.serialize(value) map (_.toChar) toCharArray() map (_.toByte)
  }
}

object JacksonConverter {

  import java.lang.reflect._

  lazy val mapper = new ObjectMapper().registerModule(DefaultScalaModule)

  def serialize(value: Any): String = {
    mapper.writeValueAsString(value)
  }

  def deserialize[T: Manifest](json: String): T = mapper.readValue(json, typeReference[T])

  private[this] def typeReference[T: Manifest] = new TypeReference[T] {
    override def getType: Type = typeFromManifest(manifest[T])
  }

  private[this] def typeFromManifest(m: Manifest[_]): Type = {
    if (m.typeArguments.isEmpty) {
      m.runtimeClass
    }
    else new ParameterizedType {
      def getRawType = m.runtimeClass

      def getActualTypeArguments = m.typeArguments.map(typeFromManifest).toArray

      def getOwnerType = null
    }
  }
}
--------------------------------------------------------------------------------
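JacksonConverter is what EsClient uses to turn documents into JSON and back. A hypothetical round trip, REPL-style, using the Rating model defined above:

// Editor's sketch, not part of the repository.
import model.Model.Rating
import utils.JacksonConverter

val json = JacksonConverter.serialize(Rating(1, 196, 242, 3, 881250949L))
// roughly {"index":1,"userid":196,"itemid":242,"rating":3,"timestamp":881250949}
val back = JacksonConverter.deserialize[Rating](json)
assert(back.itemid == 242)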
/src/main/scala/utils/CustomObjectInputStream.scala:
--------------------------------------------------------------------------------
package utils

/**
 * Created by hayssams on 04/03/14.
 */

import java.io.{InputStream, ObjectInputStream, ObjectStreamClass}

import scala.Array.canBuildFrom

/**
 * Handles class resolution when running through SBT without forking.
 */
class CustomObjectInputStream(in: InputStream, cl: ClassLoader) extends ObjectInputStream(in) {
  override def resolveClass(cd: ObjectStreamClass): Class[_] =
    try {
      cl.loadClass(cd.getName())
    } catch {
      case cnf: ClassNotFoundException =>
        super.resolveClass(cd)
    }

  override def resolveProxyClass(interfaces: Array[String]): Class[_] =
    try {
      val ifaces = interfaces map { iface => cl.loadClass(iface) }
      java.lang.reflect.Proxy.getProxyClass(cl, ifaces: _*)
    } catch {
      case e: ClassNotFoundException =>
        super.resolveProxyClass(interfaces)
    }
}
--------------------------------------------------------------------------------
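CustomObjectInputStream is only exercised through BinaryConverter.safeDecode, which swaps it in whenever a class loader is available (the non-forked SBT case). A hypothetical REPL-style round trip through the binary converter; case classes are Serializable, so plain Java serialization works here:

// Editor's sketch, not part of the repository.
import model.Model.User
import utils.BinaryConverter

object UserBinaryConverter extends BinaryConverter[User]

val bytes = UserBinaryConverter.fromDomain(User(196, Array(242, 302)))
val user  = UserBinaryConverter.toDomain[User](bytes)
assert(user.userid == 196)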
/src/main/scala/utils/EsClient.scala:
--------------------------------------------------------------------------------
package utils

import java.util.Date

import com.sksamuel.elastic4s.ElasticClient
import com.sksamuel.elastic4s.ElasticDsl._
import com.sksamuel.elastic4s.source.DocumentSource
import org.elasticsearch.common.settings.ImmutableSettings
import org.elasticsearch.search.SearchHit

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent._


object EsClient {
  val settings = ImmutableSettings.settingsBuilder().put("cluster.name", Settings.ElasticSearch.Cluster).build()
  val client = ElasticClient.remote(settings, (Settings.ElasticSearch.Host, Settings.ElasticSearch.Port))

  def apply(): ElasticClient = client

  def index[T: Manifest](t: T, refresh: Boolean = true): String = {
    val json = JacksonConverter.serialize(t)
    val res = client.client.prepareIndex(Settings.ElasticSearch.Index, manifest[T].runtimeClass.getSimpleName)
      .setSource(json)
      .setRefresh(refresh)
      .execute()
      .actionGet()
    res.getId
  }

  def load[T: Manifest](uuid: String): Future[Option[T]] = {
    val req = get id uuid from Settings.ElasticSearch.Index -> manifest[T].runtimeClass.getSimpleName
    val res = client.execute(req)
    res map { res =>
      if (res.isExists) Some(JacksonConverter.deserialize[T](res.getSourceAsString)) else None
    }
  }

  def loadWithVersion[T: Manifest](uuid: String): Future[Option[(T, Long)]] = {
    val req = get id uuid from Settings.ElasticSearch.Index -> manifest[T].runtimeClass.getSimpleName
    val res = client.execute(req)
    res map { res =>
      val maybeT = if (res.isExists) Some(JacksonConverter.deserialize[T](res.getSourceAsString)) else None
      maybeT map ((_, res.getVersion))
    }
  }

  def delete[T: Manifest](uuid: String, refresh: Boolean): Future[Boolean] = {
    val req = com.sksamuel.elastic4s.ElasticDsl.delete id uuid from Settings.ElasticSearch.Index -> manifest[T].runtimeClass.getSimpleName refresh refresh
    val res = client.execute(req)
    res map { res =>
      res.isFound
    }
  }

  def update[T: Manifest](uuid: String, t: T, upsert: Boolean, refresh: Boolean): Future[Boolean] = {
    val js = JacksonConverter.serialize(t)
    val req = com.sksamuel.elastic4s.ElasticDsl.update id uuid in Settings.ElasticSearch.Index -> manifest[T].runtimeClass.getSimpleName refresh refresh doc new DocumentSource {
      override def json: String = js
    }
    req.docAsUpsert(upsert)
    val res = client.execute(req)
    res.map { res =>
      res.isCreated || res.getVersion > 1
    }
  }

  def updateWithVersion[T: Manifest](uuid: String, t: T, version: Long) = {
    val js = JacksonConverter.serialize(t)
    val req = com.sksamuel.elastic4s.ElasticDsl.update id uuid in Settings.ElasticSearch.Index -> manifest[T].runtimeClass.getSimpleName version version doc new DocumentSource {
      override def json: String = js
    }
    val res = client.execute(req)
    true
  }


  def searchAll[T: Manifest](req: SearchDefinition): Future[Seq[T]] = {
    val res = client.execute(req.size(Integer.MAX_VALUE))
    res.map { res =>
      res.getHits.getHits.map { hit => JacksonConverter.deserialize[T](hit.getSourceAsString) }
    }
  }

  def search[T: Manifest](req: SearchDefinition): Future[Option[T]] = {
    val res = client.execute(req.size(Integer.MAX_VALUE))
    res.map { res =>
      if (res.getHits.getTotalHits == 0)
        None
      else
        Some(JacksonConverter.deserialize[T](res.getHits.getHits()(0).getSourceAsString))
    }
  }

  def searchAllRaw(req: SearchDefinition): Future[Array[SearchHit]] = {
    val res = client.execute(req.size(Integer.MAX_VALUE))
    res.map { res =>
      res.getHits.getHits
    }
  }

  def searchRaw(req: SearchDefinition): Future[Option[SearchHit]] = {
    val res = client.execute(req.size(Integer.MAX_VALUE))
    res.map { res =>
      if (res.getHits.getTotalHits == 0)
        None
      else
        Some(res.getHits.getHits()(0))
    }
  }

  def esType[T: Manifest]: String = {
    val rt = manifest[T].runtimeClass
    rt.getSimpleName
  }

}
--------------------------------------------------------------------------------
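EsClient wraps a single remote elastic4s client configured from Settings. A hypothetical read-back of the indexed ratings, using the same wildcard query shape as CollaborativeFiltering.als():

// Editor's sketch, not part of the repository.
import com.sksamuel.elastic4s.ElasticDsl._
import model.Model.Rating
import utils.{EsClient, Settings}

import scala.concurrent.Await
import scala.concurrent.duration._

val req = search in Settings.ElasticSearch.Index -> EsClient.esType[Rating] query { matchall }
val ratings: Seq[Rating] = Await.result(EsClient.searchAll[Rating](req), 30.seconds)
println(s"${ratings.size} ratings read back from Elasticsearch")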
/src/main/scala/utils/Settings.scala:
--------------------------------------------------------------------------------
package utils

import com.typesafe.config.ConfigFactory

object Settings {
  private val config = ConfigFactory.load()

  val Env = if (System.getenv.containsKey("PRODUCTION")) {
    Environment.PROD
  } else {
    Environment.DEV
  }

  object ElasticSearch {
    val DateFormat = config.getString("elasticsearch.date.format")
    val Host = config.getString("elasticsearch.host")
    val HttpPort = config.getInt("elasticsearch.http.port")
    val Port = config.getInt("elasticsearch.port")
    val Index = config.getString("elasticsearch.index")
    val Cluster = config.getString("elasticsearch.cluster")
    val FullUrl = Host + ":" + HttpPort
    println("ElasticSearch on " + Host + ":" + Port + ", index->" + Index + ", cluster->" + Cluster)
  }

  object Spark {
    val Master = config.getString("spark.master")
    val AppName = config.getString("spark.appName")
    val DataPath = config.getString("spark.dataPath")
    val RatingsFile = DataPath + config.getString("spark.ratingsFile")
    val ItemsFile = DataPath + config.getString("spark.itemsFile")
    val JarPath = config.getString("spark.jarPath")

  }

}

object Environment extends Enumeration {
  type Environment = Value
  val DEV = Value(1)
  val PROD = Value(2)
}
--------------------------------------------------------------------------------
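Both entry points are App objects: Import ends by calling doItAllSpark() and CollaborativeFiltering ends by calling als(), and they are launched one after the other. Expressed as a hypothetical single driver (not part of the repository), the call order would be:

// Editor's sketch, not part of the repository.
object Workflow extends App {
  import1.Import.doItAllSpark()          // drop/recreate the index and load ratings.txt
  colfilt2.CollaborativeFiltering.als()  // train ALS and index per-user recommendations
}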
/workshop-spark.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebiznext/spark-elasticsearch-mllib/4902f7c28546371ff292923377672676ad33e98b/workshop-spark.pdf
--------------------------------------------------------------------------------